Merge "Removed unused dr from VP8D_COMP"
diff --git a/README b/README
index 0dfb0fe..0475dad 100644
--- a/README
+++ b/README
@@ -1,5 +1,5 @@
 vpx Multi-Format Codec SDK
-README - 19 May 2010
+README - 21 June 2012
 
 Welcome to the WebM VP8 Codec SDK!
 
@@ -15,11 +15,19 @@
     * Building the documentation requires PHP[3] and Doxygen[4]. If you do not
       have these packages, you must pass --disable-install-docs to the
       configure script.
+    * Downloading the data for the unit tests requires curl[5] and sha1sum.
+      sha1sum is provided via the GNU coreutils, installed by default on
+      many *nix platforms, as well as MinGW and Cygwin. If coreutils is not
+      available, a compatible version of sha1sum can be built from
+      source[6]. These requirements are optional if not running the unit
+      tests.
 
     [1]: http://www.tortall.net/projects/yasm
     [2]: http://www.cygwin.com
     [3]: http://php.net
     [4]: http://www.doxygen.org
+    [5]: http://curl.haxx.se
+    [6]: http://www.microbrew.org/tools/md5sha1sum/
 
   2. Out-of-tree builds
   Out of tree builds are a supported method of building the application. For
@@ -94,5 +102,5 @@
 
 SUPPORT
   This library is an open source project supported by its community. Please
-  please email webm-users@webmproject.org for help.
+  email webm-discuss@webmproject.org for help.
 
diff --git a/build/make/Android.mk b/build/make/Android.mk
index 6fcd4ae..d54639a 100644
--- a/build/make/Android.mk
+++ b/build/make/Android.mk
@@ -34,8 +34,15 @@
 # Application.mk in the jni directory that contains:
 # APP_ABI := armeabi-v7a
 #
+# By default libvpx will detect at runtime the existence of NEON extension.
+# For this we import the 'cpufeatures' module from the NDK sources.
+# libvpx can also be configured without this runtime detection method.
+# Configuring with --disable-runtime-cpu-detect will assume presence of NEON.
+# Configuring with --disable-runtime-cpu-detect --disable-neon will remove any
+# NEON dependency.
+
 # To change to building armeabi, run ./libvpx/configure again, but with
-# --target=arm5te-android-gcc and and modify the Application.mk file to
+# --target=arm5te-android-gcc and modify the Application.mk file to
 # set APP_ABI := armeabi
 #
 # Running ndk-build will build libvpx and include it in your project.
@@ -166,7 +173,9 @@
 
 LOCAL_LDLIBS := -llog
 
-LOCAL_STATIC_LIBRARIES := cpufeatures
+ifeq ($(CONFIG_RUNTIME_CPU_DETECT),yes)
+  LOCAL_STATIC_LIBRARIES := cpufeatures
+endif
 
 $(foreach file, $(LOCAL_SRC_FILES), $(LOCAL_PATH)/$(file)): vpx_rtcd.h
 
@@ -196,4 +205,7 @@
     $(LIBVPX_PATH)/vp8/encoder/asm_enc_offsets.c))
 endif
 
+ifeq ($(CONFIG_RUNTIME_CPU_DETECT),yes)
 $(call import-module,cpufeatures)
+endif
+
diff --git a/build/make/Makefile b/build/make/Makefile
index 26ca0be..1088c84 100644
--- a/build/make/Makefile
+++ b/build/make/Makefile
@@ -21,6 +21,7 @@
 clean:: .DEFAULT
 install:: .DEFAULT
 test:: .DEFAULT
+testdata:: .DEFAULT
 
 
 # Note: md5sum is not installed on OS X, but openssl is. Openssl may not be
@@ -99,6 +100,8 @@
 install::
 .PHONY: test
 test::
+.PHONY: testdata
+testdata::
 
 $(BUILD_PFX)%.c.d: %.c
 	$(if $(quiet),@echo "    [DEP] $@")
diff --git a/build/make/configure.sh b/build/make/configure.sh
index f8d69ae..eeb959a 100755
--- a/build/make/configure.sh
+++ b/build/make/configure.sh
@@ -593,6 +593,15 @@
 
         # detect tgt_isa
         case "$gcctarget" in
+            armv6*)
+                tgt_isa=armv6
+                ;;
+            armv7*)
+                tgt_isa=armv7
+                ;;
+            armv5te*)
+                tgt_isa=armv5te
+                ;;
             *x86_64*|*amd64*)
                 tgt_isa=x86_64
                 ;;
@@ -773,17 +782,23 @@
             check_add_asflags --defsym ARCHITECTURE=${arch_int}
             tune_cflags="-mtune="
             if [ ${tgt_isa} == "armv7" ]; then
+                check_add_cflags  -march=armv7-a -mfloat-abi=softfp
+                check_add_asflags -march=armv7-a -mfloat-abi=softfp
+
                 if enabled neon
                 then
                     check_add_cflags -mfpu=neon #-ftree-vectorize
                     check_add_asflags -mfpu=neon
                 fi
-                check_add_cflags -march=armv7-a -mcpu=cortex-a8 -mfloat-abi=softfp
-                check_add_asflags -mcpu=cortex-a8 -mfloat-abi=softfp  #-march=armv7-a
+
+                if [ -z "${tune_cpu}" ]; then
+                    tune_cpu=cortex-a8
+                fi
             else
                 check_add_cflags -march=${tgt_isa}
                 check_add_asflags -march=${tgt_isa}
             fi
+
             enabled debug && add_asflags -g
             asm_conversion_cmd="${source_path}/build/make/ads2gas.pl"
             ;;
@@ -851,12 +866,17 @@
             add_cflags "--sysroot=${alt_libc}"
             add_ldflags "--sysroot=${alt_libc}"
 
-            add_cflags "-I${SDK_PATH}/sources/android/cpufeatures/"
+            # linker flag that routes around a CPU bug in some
+            # Cortex-A8 implementations (NDK Dev Guide)
+            add_ldflags "-Wl,--fix-cortex-a8"
 
             enable pic
             soft_enable realtime_only
             if [ ${tgt_isa} == "armv7" ]; then
-                enable runtime_cpu_detect
+                soft_enable runtime_cpu_detect
+            fi
+            if enabled runtime_cpu_detect; then
+                add_cflags "-I${SDK_PATH}/sources/android/cpufeatures"
             fi
           ;;
 
@@ -932,13 +952,16 @@
         esac
     ;;
     mips*)
-        CROSS=${CROSS:-mipsel-linux-uclibc-}
         link_with_cc=gcc
         setup_gnu_toolchain
         tune_cflags="-mtune="
+        if enabled dspr2; then
+            check_add_cflags -mips32r2 -mdspr2
+            disable fast_unaligned
+        fi
         check_add_cflags -march=${tgt_isa}
-    check_add_asflags -march=${tgt_isa}
-    check_add_asflags -KPIC
+        check_add_asflags -march=${tgt_isa}
+        check_add_asflags -KPIC
     ;;
     ppc*)
         enable ppc
@@ -1008,7 +1031,7 @@
                         tune_cflags="-march="
                     ;;
                 esac
-                ;;
+            ;;
             gcc*)
                 add_cflags -m${bits}
                 add_ldflags -m${bits}
@@ -1017,7 +1040,13 @@
             setup_gnu_toolchain
                 #for 32 bit x86 builds, -O3 did not turn on this flag
                 enabled optimizations && check_add_cflags -fomit-frame-pointer
-                ;;
+            ;;
+            vs*)
+                # When building with Microsoft Visual Studio the assembler is
+                # invoked directly. Checking at configure time is unnecessary.
+                # Skip the check by setting AS arbitrarily
+                AS=msvs
+            ;;
         esac
 
         case "${AS}" in
@@ -1026,7 +1055,7 @@
                 which yasm >/dev/null 2>&1 && AS=yasm
                 [ "${AS}" = auto -o -z "${AS}" ] \
                     && die "Neither yasm nor nasm have been found"
-                ;;
+            ;;
         esac
         log_echo "  using $AS"
         [ "${AS##*/}" = nasm ] && add_asflags -Ox
@@ -1108,7 +1137,7 @@
 
     # Work around longjmp interception on glibc >= 2.11, to improve binary
     # compatibility. See http://code.google.com/p/webm/issues/detail?id=166
-    enabled linux && check_add_cflags -D_FORTIFY_SOURCE=0
+    enabled linux && check_add_cflags -U_FORTIFY_SOURCE -D_FORTIFY_SOURCE=0
 
     # Check for strip utility variant
     ${STRIP} -V 2>/dev/null | grep GNU >/dev/null && enable gnu_strip
@@ -1123,12 +1152,24 @@
     # Almost every platform uses pthreads.
     if enabled multithread; then
         case ${toolchain} in
-            *-win*);;
+            *-win*-vs*);;
             *-android-gcc);;
             *) check_header pthread.h && add_extralibs -lpthread
         esac
     fi
 
+    # only for MIPS platforms
+    case ${toolchain} in
+        mips*)
+            if enabled dspr2; then
+                if enabled big_endian; then
+                    echo "dspr2 optimizations are available only for little endian platforms"
+                    disable dspr2
+                fi
+            fi
+        ;;
+    esac
+
     # for sysconf(3) and friends.
     check_header unistd.h
 
diff --git a/build/make/gen_asm_deps.sh b/build/make/gen_asm_deps.sh
index 717f870..0b4e3aa 100755
--- a/build/make/gen_asm_deps.sh
+++ b/build/make/gen_asm_deps.sh
@@ -42,7 +42,7 @@
 
 [ -n "$srcfile" ] || show_help
 sfx=${sfx:-asm}
-includes=$(LC_ALL=C egrep -i "include +\"?+[a-z0-9_/]+\.${sfx}" $srcfile |
+includes=$(LC_ALL=C egrep -i "include +\"?[a-z0-9_/]+\.${sfx}" $srcfile |
            perl -p -e "s;.*?([a-z0-9_/]+.${sfx}).*;\1;")
 #" restore editor state
 for inc in ${includes}; do
diff --git a/build/make/rtcd.sh b/build/make/rtcd.sh
index 1dffde5..ddf9e09 100755
--- a/build/make/rtcd.sh
+++ b/build/make/rtcd.sh
@@ -211,6 +211,8 @@
 $(process_forward_decls)
 
 $(declare_function_pointers c $ALL_ARCHS)
+
+void ${symbol:-rtcd}(void);
 EOF
 }
 
@@ -231,11 +233,10 @@
 
   cat <<EOF
 $(common_top)
-void ${symbol:-rtcd}(void);
 
 #ifdef RTCD_C
 #include "vpx_ports/x86.h"
-void ${symbol:-rtcd}(void)
+static void setup_rtcd_internal(void)
 {
     int flags = x86_simd_caps();
 
@@ -261,11 +262,9 @@
 $(common_top)
 #include "vpx_config.h"
 
-void ${symbol:-rtcd}(void);
-
 #ifdef RTCD_C
 #include "vpx_ports/arm.h"
-void ${symbol:-rtcd}(void)
+static void setup_rtcd_internal(void)
 {
     int flags = arm_cpu_caps();
 
@@ -279,16 +278,34 @@
 }
 
 
+mips() {
+  determine_indirection c $ALL_ARCHS
+  cat <<EOF
+$(common_top)
+#include "vpx_config.h"
+
+#ifdef RTCD_C
+static void setup_rtcd_internal(void)
+{
+$(set_function_pointers c $ALL_ARCHS)
+#if HAVE_DSPR2
+void dsputil_static_init();
+dsputil_static_init();
+#endif
+}
+#endif
+$(common_bottom)
+EOF
+}
+
 unoptimized() {
   determine_indirection c
   cat <<EOF
 $(common_top)
 #include "vpx_config.h"
 
-void ${symbol:-rtcd}(void);
-
 #ifdef RTCD_C
-void ${symbol:-rtcd}(void)
+static void setup_rtcd_internal(void)
 {
 $(set_function_pointers c)
 }
@@ -312,6 +329,15 @@
     require $(filter $REQUIRES)
     x86
     ;;
+  mips32)
+    ALL_ARCHS=$(filter mips32)
+    dspr2=$([ -f "$config_file" ] && eval echo $(grep HAVE_DSPR2 "$config_file"))
+    HAVE_DSPR2="${dspr2#*=}"
+    if [ "$HAVE_DSPR2" = "yes" ]; then
+        ALL_ARCHS=$(filter mips32 dspr2)
+    fi
+    mips
+    ;;
   armv5te)
     ALL_ARCHS=$(filter edsp)
     arm
diff --git a/configure b/configure
index eed6f97..dde215f 100755
--- a/configure
+++ b/configure
@@ -209,6 +209,7 @@
     neon
 
     mips32
+    dspr2
 
     mmx
     sse
@@ -453,7 +454,13 @@
         # Can only build shared libs on a subset of platforms. Doing this check
         # here rather than at option parse time because the target auto-detect
         # magic happens after the command line has been parsed.
-        enabled linux || die "--enable-shared only supported on ELF for now"
+        if ! enabled linux; then
+            if enabled gnu; then
+                echo "--enable-shared is only supported on ELF; assuming this is OK"
+            else
+                die "--enable-shared only supported on ELF for now"
+            fi
+        fi
     fi
     if [ -z "$CC" ]; then
         echo "Bypassing toolchain for environment detection."
@@ -588,13 +595,18 @@
     fi
 
     # Enable unit tests if we have a working C++ compiler
-    case "$tgt_cc" in
-        vs*)
-            soft_enable unit_tests;;
+    case "$toolchain" in
+        *-vs*)
+            soft_enable unit_tests
+        ;;
+        *-android-*)
+            # GTestLog must be modified to use Android logging utilities.
+        ;;
         *)
             check_cxx "$@" <<EOF && soft_enable unit_tests
 int z;
 EOF
+        ;;
     esac
 }
 
diff --git a/examples/decoder_tmpl.txt b/examples/decoder_tmpl.txt
index 92a2c30..e652a63 100644
--- a/examples/decoder_tmpl.txt
+++ b/examples/decoder_tmpl.txt
@@ -48,8 +48,8 @@
     unsigned char *buf =img->planes[plane];
 
     for(y=0; y < (plane ? (img->d_h + 1) >> 1 : img->d_h); y++) {
-        if(fwrite(buf, 1, (plane ? (img->d_w + 1) >> 1 : img->d_w),
-           outfile));
+        (void) fwrite(buf, 1, (plane ? (img->d_w + 1) >> 1 : img->d_w),
+                      outfile);
         buf += img->stride[plane];
     }
 }
diff --git a/examples/encoder_tmpl.c b/examples/encoder_tmpl.c
index cc70b008..e2b65ec 100644
--- a/examples/encoder_tmpl.c
+++ b/examples/encoder_tmpl.c
@@ -85,7 +85,7 @@
     mem_put_le32(header+24, frame_cnt);           /* length */
     mem_put_le32(header+28, 0);                   /* unused */
 
-    if(fwrite(header, 1, 32, outfile));
+    (void) fwrite(header, 1, 32, outfile);
 }
 
 
@@ -103,7 +103,7 @@
     mem_put_le32(header+4, pts&0xFFFFFFFF);
     mem_put_le32(header+8, pts >> 32);
 
-    if(fwrite(header, 1, 12, outfile));
+    (void) fwrite(header, 1, 12, outfile);
 }
 
 int main(int argc, char **argv) {
diff --git a/examples/encoder_tmpl.txt b/examples/encoder_tmpl.txt
index 0042071..1afbd8b 100644
--- a/examples/encoder_tmpl.txt
+++ b/examples/encoder_tmpl.txt
@@ -61,13 +61,14 @@
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ PROCESS_FRAME
 case VPX_CODEC_CX_FRAME_PKT:
     write_ivf_frame_header(outfile, pkt);
-    if(fwrite(pkt->data.frame.buf, 1, pkt->data.frame.sz,
-              outfile));
+    (void) fwrite(pkt->data.frame.buf, 1, pkt->data.frame.sz,
+                  outfile);
     break;
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ PROCESS_FRAME
 
 
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ DESTROY
+vpx_img_free(&raw);
 if(vpx_codec_destroy(&codec))
     die_codec(&codec, "Failed to destroy codec");
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ DESTROY
diff --git a/examples/twopass_encoder.txt b/examples/twopass_encoder.txt
index 4683bc7..2f81a90 100644
--- a/examples/twopass_encoder.txt
+++ b/examples/twopass_encoder.txt
@@ -71,5 +71,17 @@
 It's sometimes helpful to see when each pass completes.
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ TWOPASS_LOOP_END
     printf("Pass %d complete.\n", pass+1);
+    if(vpx_codec_destroy(&codec))
+        die_codec(&codec, "Failed to destroy codec");
 }
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ TWOPASS_LOOP_END
+
+
+Clean-up
+-----------------------------
+Destruction of the encoder instance must be done on each pass. The
+raw image should be destroyed at the end as usual.
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ DESTROY
+vpx_img_free(&raw);
+free(stats.buf);
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ DESTROY
diff --git a/libmkv/EbmlIDs.h b/libmkv/EbmlIDs.h
index 3418e36..e3ce585 100644
--- a/libmkv/EbmlIDs.h
+++ b/libmkv/EbmlIDs.h
@@ -1,16 +1,16 @@
-// Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-//
-// Use of this source code is governed by a BSD-style license
-// that can be found in the LICENSE file in the root of the source
-// tree. An additional intellectual property rights grant can be found
-// in the file PATENTS.  All contributing project authors may
-// be found in the AUTHORS file in the root of the source tree.
-
-
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
 #ifndef MKV_DEFS_HPP
 #define MKV_DEFS_HPP 1
 
-//Commenting out values not available in webm, but available in matroska
+/* Commenting out values not available in webm, but available in matroska */
 
 enum mkv
 {
@@ -22,7 +22,7 @@
     DocType = 0x4282,
     DocTypeVersion = 0x4287,
     DocTypeReadVersion = 0x4285,
-//  CRC_32 = 0xBF,
+/* CRC_32 = 0xBF, */
     Void = 0xEC,
     SignatureSlot = 0x1B538667,
     SignatureAlgo = 0x7E8A,
@@ -32,61 +32,61 @@
     SignatureElements = 0x7E5B,
     SignatureElementList = 0x7E7B,
     SignedElement = 0x6532,
-    //segment
+    /* segment */
     Segment = 0x18538067,
-    //Meta Seek Information
+    /* Meta Seek Information */
     SeekHead = 0x114D9B74,
     Seek = 0x4DBB,
     SeekID = 0x53AB,
     SeekPosition = 0x53AC,
-    //Segment Information
+    /* Segment Information */
     Info = 0x1549A966,
-//  SegmentUID = 0x73A4,
-//  SegmentFilename = 0x7384,
-//  PrevUID = 0x3CB923,
-//  PrevFilename = 0x3C83AB,
-//  NextUID = 0x3EB923,
-//  NextFilename = 0x3E83BB,
-//  SegmentFamily = 0x4444,
-//  ChapterTranslate = 0x6924,
-//  ChapterTranslateEditionUID = 0x69FC,
-//  ChapterTranslateCodec = 0x69BF,
-//  ChapterTranslateID = 0x69A5,
+/* SegmentUID = 0x73A4, */
+/* SegmentFilename = 0x7384, */
+/* PrevUID = 0x3CB923, */
+/* PrevFilename = 0x3C83AB, */
+/* NextUID = 0x3EB923, */
+/* NextFilename = 0x3E83BB, */
+/* SegmentFamily = 0x4444, */
+/* ChapterTranslate = 0x6924, */
+/* ChapterTranslateEditionUID = 0x69FC, */
+/* ChapterTranslateCodec = 0x69BF, */
+/* ChapterTranslateID = 0x69A5, */
     TimecodeScale = 0x2AD7B1,
     Segment_Duration = 0x4489,
     DateUTC = 0x4461,
-//  Title = 0x7BA9,
+/* Title = 0x7BA9, */
     MuxingApp = 0x4D80,
     WritingApp = 0x5741,
-    //Cluster
+    /* Cluster */
     Cluster = 0x1F43B675,
     Timecode = 0xE7,
-//  SilentTracks = 0x5854,
-//  SilentTrackNumber = 0x58D7,
-//  Position = 0xA7,
+/* SilentTracks = 0x5854, */
+/* SilentTrackNumber = 0x58D7, */
+/* Position = 0xA7, */
     PrevSize = 0xAB,
     BlockGroup = 0xA0,
     Block = 0xA1,
-//  BlockVirtual = 0xA2,
-//  BlockAdditions = 0x75A1,
-//  BlockMore = 0xA6,
-//  BlockAddID = 0xEE,
-//  BlockAdditional = 0xA5,
+/* BlockVirtual = 0xA2, */
+/* BlockAdditions = 0x75A1, */
+/* BlockMore = 0xA6, */
+/* BlockAddID = 0xEE, */
+/* BlockAdditional = 0xA5, */
     BlockDuration = 0x9B,
-//  ReferencePriority = 0xFA,
+/* ReferencePriority = 0xFA, */
     ReferenceBlock = 0xFB,
-//  ReferenceVirtual = 0xFD,
-//  CodecState = 0xA4,
-//  Slices = 0x8E,
-//  TimeSlice = 0xE8,
+/* ReferenceVirtual = 0xFD, */
+/* CodecState = 0xA4, */
+/* Slices = 0x8E, */
+/* TimeSlice = 0xE8, */
     LaceNumber = 0xCC,
-//  FrameNumber = 0xCD,
-//  BlockAdditionID = 0xCB,
-//  MkvDelay = 0xCE,
-//  Cluster_Duration = 0xCF,
+/* FrameNumber = 0xCD, */
+/* BlockAdditionID = 0xCB, */
+/* MkvDelay = 0xCE, */
+/* Cluster_Duration = 0xCF, */
     SimpleBlock = 0xA3,
-//  EncryptedBlock = 0xAF,
-    //Track
+/* EncryptedBlock = 0xAF, */
+    /* Track */
     Tracks = 0x1654AE6B,
     TrackEntry = 0xAE,
     TrackNumber = 0xD7,
@@ -96,28 +96,28 @@
     FlagDefault = 0x88,
     FlagForced = 0x55AA,
     FlagLacing = 0x9C,
-//  MinCache = 0x6DE7,
-//  MaxCache = 0x6DF8,
+/* MinCache = 0x6DE7, */
+/* MaxCache = 0x6DF8, */
     DefaultDuration = 0x23E383,
-//  TrackTimecodeScale = 0x23314F,
-//  TrackOffset = 0x537F,
-//  MaxBlockAdditionID = 0x55EE,
+/* TrackTimecodeScale = 0x23314F, */
+/* TrackOffset = 0x537F, */
+/* MaxBlockAdditionID = 0x55EE, */
     Name = 0x536E,
     Language = 0x22B59C,
     CodecID = 0x86,
     CodecPrivate = 0x63A2,
     CodecName = 0x258688,
-//  AttachmentLink = 0x7446,
-//  CodecSettings = 0x3A9697,
-//  CodecInfoURL = 0x3B4040,
-//  CodecDownloadURL = 0x26B240,
-//  CodecDecodeAll = 0xAA,
-//  TrackOverlay = 0x6FAB,
-//  TrackTranslate = 0x6624,
-//  TrackTranslateEditionUID = 0x66FC,
-//  TrackTranslateCodec = 0x66BF,
-//  TrackTranslateTrackID = 0x66A5,
-    //video
+/* AttachmentLink = 0x7446, */
+/* CodecSettings = 0x3A9697, */
+/* CodecInfoURL = 0x3B4040, */
+/* CodecDownloadURL = 0x26B240, */
+/* CodecDecodeAll = 0xAA, */
+/* TrackOverlay = 0x6FAB, */
+/* TrackTranslate = 0x6624, */
+/* TrackTranslateEditionUID = 0x66FC, */
+/* TrackTranslateCodec = 0x66BF, */
+/* TrackTranslateTrackID = 0x66A5, */
+    /* video */
     Video = 0xE0,
     FlagInterlaced = 0x9A,
     StereoMode = 0x53B8,
@@ -131,101 +131,101 @@
     DisplayHeight = 0x54BA,
     DisplayUnit = 0x54B2,
     AspectRatioType = 0x54B3,
-//  ColourSpace = 0x2EB524,
-//  GammaValue = 0x2FB523,
+/* ColourSpace = 0x2EB524, */
+/* GammaValue = 0x2FB523, */
     FrameRate = 0x2383E3,
-    //end video
-    //audio
+    /* end video */
+    /* audio */
     Audio = 0xE1,
     SamplingFrequency = 0xB5,
     OutputSamplingFrequency = 0x78B5,
     Channels = 0x9F,
-//  ChannelPositions = 0x7D7B,
+/* ChannelPositions = 0x7D7B, */
     BitDepth = 0x6264,
-    //end audio
-    //content encoding
-//  ContentEncodings = 0x6d80,
-//  ContentEncoding = 0x6240,
-//  ContentEncodingOrder = 0x5031,
-//  ContentEncodingScope = 0x5032,
-//  ContentEncodingType = 0x5033,
-//  ContentCompression = 0x5034,
-//  ContentCompAlgo = 0x4254,
-//  ContentCompSettings = 0x4255,
-//  ContentEncryption = 0x5035,
-//  ContentEncAlgo = 0x47e1,
-//  ContentEncKeyID = 0x47e2,
-//  ContentSignature = 0x47e3,
-//  ContentSigKeyID = 0x47e4,
-//  ContentSigAlgo = 0x47e5,
-//  ContentSigHashAlgo = 0x47e6,
-    //end content encoding
-    //Cueing Data
+    /* end audio */
+    /* content encoding */
+/* ContentEncodings = 0x6d80, */
+/* ContentEncoding = 0x6240, */
+/* ContentEncodingOrder = 0x5031, */
+/* ContentEncodingScope = 0x5032, */
+/* ContentEncodingType = 0x5033, */
+/* ContentCompression = 0x5034, */
+/* ContentCompAlgo = 0x4254, */
+/* ContentCompSettings = 0x4255, */
+/* ContentEncryption = 0x5035, */
+/* ContentEncAlgo = 0x47e1, */
+/* ContentEncKeyID = 0x47e2, */
+/* ContentSignature = 0x47e3, */
+/* ContentSigKeyID = 0x47e4, */
+/* ContentSigAlgo = 0x47e5, */
+/* ContentSigHashAlgo = 0x47e6, */
+    /* end content encoding */
+    /* Cueing Data */
     Cues = 0x1C53BB6B,
     CuePoint = 0xBB,
     CueTime = 0xB3,
     CueTrackPositions = 0xB7,
     CueTrack = 0xF7,
     CueClusterPosition = 0xF1,
-    CueBlockNumber = 0x5378,
-//  CueCodecState = 0xEA,
-//  CueReference = 0xDB,
-//  CueRefTime = 0x96,
-//  CueRefCluster = 0x97,
-//  CueRefNumber = 0x535F,
-//  CueRefCodecState = 0xEB,
-    //Attachment
-//  Attachments = 0x1941A469,
-//  AttachedFile = 0x61A7,
-//  FileDescription = 0x467E,
-//  FileName = 0x466E,
-//  FileMimeType = 0x4660,
-//  FileData = 0x465C,
-//  FileUID = 0x46AE,
-//  FileReferral = 0x4675,
-    //Chapters
-//  Chapters = 0x1043A770,
-//  EditionEntry = 0x45B9,
-//  EditionUID = 0x45BC,
-//  EditionFlagHidden = 0x45BD,
-//  EditionFlagDefault = 0x45DB,
-//  EditionFlagOrdered = 0x45DD,
-//  ChapterAtom = 0xB6,
-//  ChapterUID = 0x73C4,
-//  ChapterTimeStart = 0x91,
-//  ChapterTimeEnd = 0x92,
-//  ChapterFlagHidden = 0x98,
-//  ChapterFlagEnabled = 0x4598,
-//  ChapterSegmentUID = 0x6E67,
-//  ChapterSegmentEditionUID = 0x6EBC,
-//  ChapterPhysicalEquiv = 0x63C3,
-//  ChapterTrack = 0x8F,
-//  ChapterTrackNumber = 0x89,
-//  ChapterDisplay = 0x80,
-//  ChapString = 0x85,
-//  ChapLanguage = 0x437C,
-//  ChapCountry = 0x437E,
-//  ChapProcess = 0x6944,
-//  ChapProcessCodecID = 0x6955,
-//  ChapProcessPrivate = 0x450D,
-//  ChapProcessCommand = 0x6911,
-//  ChapProcessTime = 0x6922,
-//  ChapProcessData = 0x6933,
-    //Tagging
-//  Tags = 0x1254C367,
-//  Tag = 0x7373,
-//  Targets = 0x63C0,
-//  TargetTypeValue = 0x68CA,
-//  TargetType = 0x63CA,
-//  Tagging_TrackUID = 0x63C5,
-//  Tagging_EditionUID = 0x63C9,
-//  Tagging_ChapterUID = 0x63C4,
-//  AttachmentUID = 0x63C6,
-//  SimpleTag = 0x67C8,
-//  TagName = 0x45A3,
-//  TagLanguage = 0x447A,
-//  TagDefault = 0x4484,
-//  TagString = 0x4487,
-//  TagBinary = 0x4485,
+    CueBlockNumber = 0x5378
+/* CueCodecState = 0xEA, */
+/* CueReference = 0xDB, */
+/* CueRefTime = 0x96, */
+/* CueRefCluster = 0x97, */
+/* CueRefNumber = 0x535F, */
+/* CueRefCodecState = 0xEB, */
+    /* Attachment */
+/* Attachments = 0x1941A469, */
+/* AttachedFile = 0x61A7, */
+/* FileDescription = 0x467E, */
+/* FileName = 0x466E, */
+/* FileMimeType = 0x4660, */
+/* FileData = 0x465C, */
+/* FileUID = 0x46AE, */
+/* FileReferral = 0x4675, */
+    /* Chapters */
+/* Chapters = 0x1043A770, */
+/* EditionEntry = 0x45B9, */
+/* EditionUID = 0x45BC, */
+/* EditionFlagHidden = 0x45BD, */
+/* EditionFlagDefault = 0x45DB, */
+/* EditionFlagOrdered = 0x45DD, */
+/* ChapterAtom = 0xB6, */
+/* ChapterUID = 0x73C4, */
+/* ChapterTimeStart = 0x91, */
+/* ChapterTimeEnd = 0x92, */
+/* ChapterFlagHidden = 0x98, */
+/* ChapterFlagEnabled = 0x4598, */
+/* ChapterSegmentUID = 0x6E67, */
+/* ChapterSegmentEditionUID = 0x6EBC, */
+/* ChapterPhysicalEquiv = 0x63C3, */
+/* ChapterTrack = 0x8F, */
+/* ChapterTrackNumber = 0x89, */
+/* ChapterDisplay = 0x80, */
+/* ChapString = 0x85, */
+/* ChapLanguage = 0x437C, */
+/* ChapCountry = 0x437E, */
+/* ChapProcess = 0x6944, */
+/* ChapProcessCodecID = 0x6955, */
+/* ChapProcessPrivate = 0x450D, */
+/* ChapProcessCommand = 0x6911, */
+/* ChapProcessTime = 0x6922, */
+/* ChapProcessData = 0x6933, */
+    /* Tagging */
+/* Tags = 0x1254C367, */
+/* Tag = 0x7373, */
+/* Targets = 0x63C0, */
+/* TargetTypeValue = 0x68CA, */
+/* TargetType = 0x63CA, */
+/* Tagging_TrackUID = 0x63C5, */
+/* Tagging_EditionUID = 0x63C9, */
+/* Tagging_ChapterUID = 0x63C4, */
+/* AttachmentUID = 0x63C6, */
+/* SimpleTag = 0x67C8, */
+/* TagName = 0x45A3, */
+/* TagLanguage = 0x447A, */
+/* TagDefault = 0x4484, */
+/* TagString = 0x4487, */
+/* TagBinary = 0x4485, */
 };
 #endif
diff --git a/libmkv/EbmlWriter.c b/libmkv/EbmlWriter.c
index fbf2c66..d70f06e 100644
--- a/libmkv/EbmlWriter.c
+++ b/libmkv/EbmlWriter.c
@@ -1,12 +1,12 @@
-// Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-//
-// Use of this source code is governed by a BSD-style license
-// that can be found in the LICENSE file in the root of the source
-// tree. An additional intellectual property rights grant can be found
-// in the file PATENTS.  All contributing project authors may
-// be found in the AUTHORS file in the root of the source tree.
-
-
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
 #include "EbmlWriter.h"
 #include <stdlib.h>
 #include <wchar.h>
@@ -18,11 +18,13 @@
 #define LITERALU64(n) n##LLU
 #endif
 
-void Ebml_WriteLen(EbmlGlobal *glob, long long val)
+void Ebml_WriteLen(EbmlGlobal *glob, int64_t val)
 {
-    //TODO check and make sure we are not > than 0x0100000000000000LLU
-    unsigned char size = 8; //size in bytes to output
-    unsigned long long minVal = LITERALU64(0x00000000000000ff); //mask to compare for byte size
+    /* TODO check and make sure we are not > than 0x0100000000000000LLU */
+    unsigned char size = 8; /* size in bytes to output */
+
+    /* mask to compare for byte size */
+    int64_t minVal = 0xff;
 
     for (size = 1; size < 8; size ++)
     {
@@ -32,7 +34,7 @@
         minVal = (minVal << 7);
     }
 
-    val |= (LITERALU64(0x000000000000080) << ((size - 1) * 7));
+    val |= (((uint64_t)0x80) << ((size - 1) * 7));
 
     Ebml_Serialize(glob, (void *) &val, sizeof(val), size);
 }
@@ -40,23 +42,25 @@
 void Ebml_WriteString(EbmlGlobal *glob, const char *str)
 {
     const size_t size_ = strlen(str);
-    const unsigned long long  size = size_;
+    const uint64_t  size = size_;
     Ebml_WriteLen(glob, size);
-    //TODO: it's not clear from the spec whether the nul terminator
-    //should be serialized too.  For now we omit the null terminator.
-    Ebml_Write(glob, str, size);
+    /* TODO: it's not clear from the spec whether the nul terminator
+     * should be serialized too.  For now we omit the null terminator.
+     */
+    Ebml_Write(glob, str, (unsigned long)size);
 }
 
 void Ebml_WriteUTF8(EbmlGlobal *glob, const wchar_t *wstr)
 {
     const size_t strlen = wcslen(wstr);
 
-    //TODO: it's not clear from the spec whether the nul terminator
-    //should be serialized too.  For now we include it.
-    const unsigned long long  size = strlen;
+    /* TODO: it's not clear from the spec whether the nul terminator
+     * should be serialized too.  For now we include it.
+     */
+    const uint64_t  size = strlen;
 
     Ebml_WriteLen(glob, size);
-    Ebml_Write(glob, wstr, size);
+    Ebml_Write(glob, wstr, (unsigned long)size);
 }
 
 void Ebml_WriteID(EbmlGlobal *glob, unsigned long class_id)
@@ -85,12 +89,12 @@
 
 void Ebml_SerializeUnsigned(EbmlGlobal *glob, unsigned long class_id, unsigned long ui)
 {
-    unsigned char size = 8; //size in bytes to output
+    unsigned char size = 8; /* size in bytes to output */
     unsigned char sizeSerialized = 0;
     unsigned long minVal;
 
     Ebml_WriteID(glob, class_id);
-    minVal = 0x7fLU; //mask to compare for byte size
+    minVal = 0x7fLU; /* mask to compare for byte size */
 
     for (size = 1; size < 4; size ++)
     {
@@ -106,7 +110,7 @@
     Ebml_Serialize(glob, &sizeSerialized, sizeof(sizeSerialized), 1);
     Ebml_Serialize(glob, &ui, sizeof(ui), size);
 }
-//TODO: perhaps this is a poor name for this id serializer helper function
+/* TODO: perhaps this is a poor name for this id serializer helper function */
 void Ebml_SerializeBinary(EbmlGlobal *glob, unsigned long class_id, unsigned long bin)
 {
     int size;
@@ -168,4 +172,4 @@
     }
 }
 
-//TODO Serialize Date
+/* TODO Serialize Date */
diff --git a/libmkv/EbmlWriter.h b/libmkv/EbmlWriter.h
index 324c9bc..b94f757 100644
--- a/libmkv/EbmlWriter.h
+++ b/libmkv/EbmlWriter.h
@@ -1,26 +1,30 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
 #ifndef EBMLWRITER_HPP
 #define EBMLWRITER_HPP
-
-// Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-//
-// Use of this source code is governed by a BSD-style license
-// that can be found in the LICENSE file in the root of the source
-// tree. An additional intellectual property rights grant can be found
-// in the file PATENTS.  All contributing project authors may
-// be found in the AUTHORS file in the root of the source tree.
-
-//note: you must define write and serialize functions as well as your own EBML_GLOBAL
-//These functions MUST be implemented
 #include <stddef.h>
 #include "vpx/vpx_integer.h"
 
+/* note: you must define write and serialize functions as well as your own
+ * EBML_GLOBAL
+ *
+ * These functions MUST be implemented
+ */
+
 typedef struct EbmlGlobal EbmlGlobal;
 void  Ebml_Serialize(EbmlGlobal *glob, const void *, int, unsigned long);
 void  Ebml_Write(EbmlGlobal *glob, const void *, unsigned long);
-/////
 
+/*****/
 
-void Ebml_WriteLen(EbmlGlobal *glob, long long val);
+void Ebml_WriteLen(EbmlGlobal *glob, int64_t val);
 void Ebml_WriteString(EbmlGlobal *glob, const char *str);
 void Ebml_WriteUTF8(EbmlGlobal *glob, const wchar_t *wstr);
 void Ebml_WriteID(EbmlGlobal *glob, unsigned long class_id);
@@ -28,11 +32,11 @@
 void Ebml_SerializeUnsigned(EbmlGlobal *glob, unsigned long class_id, unsigned long ui);
 void Ebml_SerializeBinary(EbmlGlobal *glob, unsigned long class_id, unsigned long ui);
 void Ebml_SerializeFloat(EbmlGlobal *glob, unsigned long class_id, double d);
-//TODO make this more generic to signed
+/* TODO make this more generic to signed */
 void Ebml_WriteSigned16(EbmlGlobal *glob, short val);
 void Ebml_SerializeString(EbmlGlobal *glob, unsigned long class_id, const char *s);
 void Ebml_SerializeUTF8(EbmlGlobal *glob, unsigned long class_id, wchar_t *s);
 void Ebml_SerializeData(EbmlGlobal *glob, unsigned long class_id, unsigned char *data, unsigned long data_length);
 void Ebml_WriteVoid(EbmlGlobal *glob, unsigned long vSize);
-//TODO need date function
+/* TODO need date function */
 #endif
diff --git a/libs.mk b/libs.mk
index 516e6d8..373c1cd 100644
--- a/libs.mk
+++ b/libs.mk
@@ -147,7 +147,7 @@
 obj_int_extract.vcproj: $(SRC_PATH_BARE)/build/make/obj_int_extract.c
 	@cp $(SRC_PATH_BARE)/build/x86-msvs/obj_int_extract.bat .
 	@echo "    [CREATE] $@"
-	$(SRC_PATH_BARE)/build/make/gen_msvs_proj.sh \
+	$(qexec)$(SRC_PATH_BARE)/build/make/gen_msvs_proj.sh \
     --exe \
     --target=$(TOOLCHAIN) \
     --name=obj_int_extract \
@@ -163,14 +163,14 @@
 
 vpx.def: $(call enabled,CODEC_EXPORTS)
 	@echo "    [CREATE] $@"
-	$(SRC_PATH_BARE)/build/make/gen_msvs_def.sh\
+	$(qexec)$(SRC_PATH_BARE)/build/make/gen_msvs_def.sh\
             --name=vpx\
             --out=$@ $^
 CLEAN-OBJS += vpx.def
 
 vpx.vcproj: $(CODEC_SRCS) vpx.def
 	@echo "    [CREATE] $@"
-	$(SRC_PATH_BARE)/build/make/gen_msvs_proj.sh \
+	$(qexec)$(SRC_PATH_BARE)/build/make/gen_msvs_proj.sh \
 			--lib \
 			--target=$(TOOLCHAIN) \
             $(if $(CONFIG_STATIC_MSVCRT),--static-crt) \
@@ -243,6 +243,7 @@
 	$(qexec)echo 'Requires:' >> $@
 	$(qexec)echo 'Conflicts:' >> $@
 	$(qexec)echo 'Libs: -L$${libdir} -lvpx' >> $@
+	$(qexec)echo 'Libs.private: -lm -lpthread' >> $@
 	$(qexec)echo 'Cflags: -I$${includedir}' >> $@
 INSTALL-LIBS-yes += $(LIBSUBDIR)/pkgconfig/vpx.pc
 INSTALL_MAPS += $(LIBSUBDIR)/pkgconfig/%.pc %.pc
@@ -285,38 +286,44 @@
 
 ifeq ($(filter icc gcc,$(TGT_CC)), $(TGT_CC))
     $(BUILD_PFX)asm_com_offsets.asm: $(BUILD_PFX)$(VP8_PREFIX)common/asm_com_offsets.c.S
-	LC_ALL=C grep $(OFFSET_PATTERN) $< | tr -d '$$\#' $(ADS2GAS) > $@
+	@echo "    [CREATE] $@"
+	$(qexec)LC_ALL=C grep $(OFFSET_PATTERN) $< | tr -d '$$\#' $(ADS2GAS) > $@
     $(BUILD_PFX)$(VP8_PREFIX)common/asm_com_offsets.c.S: $(VP8_PREFIX)common/asm_com_offsets.c
     CLEAN-OBJS += $(BUILD_PFX)asm_com_offsets.asm $(BUILD_PFX)$(VP8_PREFIX)common/asm_com_offsets.c.S
 
     $(BUILD_PFX)asm_enc_offsets.asm: $(BUILD_PFX)$(VP8_PREFIX)encoder/asm_enc_offsets.c.S
-	LC_ALL=C grep $(OFFSET_PATTERN) $< | tr -d '$$\#' $(ADS2GAS) > $@
+	@echo "    [CREATE] $@"
+	$(qexec)LC_ALL=C grep $(OFFSET_PATTERN) $< | tr -d '$$\#' $(ADS2GAS) > $@
     $(BUILD_PFX)$(VP8_PREFIX)encoder/asm_enc_offsets.c.S: $(VP8_PREFIX)encoder/asm_enc_offsets.c
     CLEAN-OBJS += $(BUILD_PFX)asm_enc_offsets.asm $(BUILD_PFX)$(VP8_PREFIX)encoder/asm_enc_offsets.c.S
 
     $(BUILD_PFX)asm_dec_offsets.asm: $(BUILD_PFX)$(VP8_PREFIX)decoder/asm_dec_offsets.c.S
-	LC_ALL=C grep $(OFFSET_PATTERN) $< | tr -d '$$\#' $(ADS2GAS) > $@
+	@echo "    [CREATE] $@"
+	$(qexec)LC_ALL=C grep $(OFFSET_PATTERN) $< | tr -d '$$\#' $(ADS2GAS) > $@
     $(BUILD_PFX)$(VP8_PREFIX)decoder/asm_dec_offsets.c.S: $(VP8_PREFIX)decoder/asm_dec_offsets.c
     CLEAN-OBJS += $(BUILD_PFX)asm_dec_offsets.asm $(BUILD_PFX)$(VP8_PREFIX)decoder/asm_dec_offsets.c.S
 else
   ifeq ($(filter rvct,$(TGT_CC)), $(TGT_CC))
     asm_com_offsets.asm: obj_int_extract
     asm_com_offsets.asm: $(VP8_PREFIX)common/asm_com_offsets.c.o
-	./obj_int_extract rvds $< $(ADS2GAS) > $@
+	@echo "    [CREATE] $@"
+	$(qexec)./obj_int_extract rvds $< $(ADS2GAS) > $@
     OBJS-yes += $(VP8_PREFIX)common/asm_com_offsets.c.o
     CLEAN-OBJS += asm_com_offsets.asm
     $(filter %$(ASM).o,$(OBJS-yes)): $(BUILD_PFX)asm_com_offsets.asm
 
     asm_enc_offsets.asm: obj_int_extract
     asm_enc_offsets.asm: $(VP8_PREFIX)encoder/asm_enc_offsets.c.o
-	./obj_int_extract rvds $< $(ADS2GAS) > $@
+	@echo "    [CREATE] $@"
+	$(qexec)./obj_int_extract rvds $< $(ADS2GAS) > $@
     OBJS-yes += $(VP8_PREFIX)encoder/asm_enc_offsets.c.o
     CLEAN-OBJS += asm_enc_offsets.asm
     $(filter %$(ASM).o,$(OBJS-yes)): $(BUILD_PFX)asm_enc_offsets.asm
 
     asm_dec_offsets.asm: obj_int_extract
     asm_dec_offsets.asm: $(VP8_PREFIX)decoder/asm_dec_offsets.c.o
-	./obj_int_extract rvds $< $(ADS2GAS) > $@
+	@echo "    [CREATE] $@"
+	$(qexec)./obj_int_extract rvds $< $(ADS2GAS) > $@
     OBJS-yes += $(VP8_PREFIX)decoder/asm_dec_offsets.c.o
     CLEAN-OBJS += asm_dec_offsets.asm
     $(filter %$(ASM).o,$(OBJS-yes)): $(BUILD_PFX)asm_dec_offsets.asm
@@ -346,17 +353,39 @@
 ## libvpx test directives
 ##
 ifeq ($(CONFIG_UNIT_TESTS),yes)
+LIBVPX_TEST_DATA_PATH ?= .
 
 include $(SRC_PATH_BARE)/test/test.mk
 LIBVPX_TEST_SRCS=$(addprefix test/,$(call enabled,LIBVPX_TEST_SRCS))
 LIBVPX_TEST_BINS=./test_libvpx
+LIBVPX_TEST_DATA=$(addprefix $(LIBVPX_TEST_DATA_PATH)/,\
+                     $(call enabled,LIBVPX_TEST_DATA))
+libvpx_test_data_url=http://downloads.webmproject.org/test_data/libvpx/$(1)
+
+$(LIBVPX_TEST_DATA):
+	@echo "    [DOWNLOAD] $@"
+	$(qexec)trap 'rm -f $@' INT TERM &&\
+            curl -L -o $@ $(call libvpx_test_data_url,$(@F))
+
+testdata:: $(LIBVPX_TEST_DATA)
+	$(qexec)if [ -x "$$(which sha1sum)" ]; then\
+            echo "Checking test data:";\
+            if [ -n "$(LIBVPX_TEST_DATA)" ]; then\
+                for f in $(call enabled,LIBVPX_TEST_DATA); do\
+                    grep $$f $(SRC_PATH_BARE)/test/test-data.sha1 |\
+                        (cd $(LIBVPX_TEST_DATA_PATH); sha1sum -c);\
+                done; \
+            fi; \
+        else\
+            echo "Skipping test data integrity check, sha1sum not found.";\
+        fi
 
 ifeq ($(CONFIG_EXTERNAL_BUILD),yes)
 ifeq ($(CONFIG_MSVS),yes)
 
 gtest.vcproj: $(SRC_PATH_BARE)/third_party/googletest/src/src/gtest-all.cc
 	@echo "    [CREATE] $@"
-	$(SRC_PATH_BARE)/build/make/gen_msvs_proj.sh \
+	$(qexec)$(SRC_PATH_BARE)/build/make/gen_msvs_proj.sh \
             --lib \
             --target=$(TOOLCHAIN) \
             $(if $(CONFIG_STATIC_MSVCRT),--static-crt) \
@@ -369,27 +398,22 @@
 
 PROJECTS-$(CONFIG_MSVS) += gtest.vcproj
 
-define unit_test_vcproj_template
-$(notdir $(1:.cc=.vcproj)): $(SRC_PATH_BARE)/$(1)
-	@echo "    [vcproj] $$@"
-	$$(SRC_PATH_BARE)/build/make/gen_msvs_proj.sh\
-            --exe\
-            --target=$$(TOOLCHAIN)\
-            --name=$(notdir $(1:.cc=))\
-            --ver=$$(CONFIG_VS_VERSION)\
-            $$(if $$(CONFIG_STATIC_MSVCRT),--static-crt) \
-            --out=$$@ $$(INTERNAL_CFLAGS) $$(CFLAGS) \
+test_libvpx.vcproj: $(LIBVPX_TEST_SRCS)
+	@echo "    [CREATE] $@"
+	$(qexec)$(SRC_PATH_BARE)/build/make/gen_msvs_proj.sh \
+            --exe \
+            --target=$(TOOLCHAIN) \
+            --name=test_libvpx \
+            --proj-guid=CD837F5F-52D8-4314-A370-895D614166A7 \
+            --ver=$(CONFIG_VS_VERSION) \
+            $(if $(CONFIG_STATIC_MSVCRT),--static-crt) \
+            --out=$@ $(INTERNAL_CFLAGS) $(CFLAGS) \
             -I. -I"$(SRC_PATH_BARE)/third_party/googletest/src/include" \
-            -L. -l$(CODEC_LIB) -lwinmm -l$(GTEST_LIB) $$^
-endef
+            -L. -l$(CODEC_LIB) -lwinmm -l$(GTEST_LIB) $^
 
-$(foreach proj,$(LIBVPX_TEST_BINS),\
-    $(eval $(call unit_test_vcproj_template,$(proj))))
+PROJECTS-$(CONFIG_MSVS) += test_libvpx.vcproj
 
-PROJECTS-$(CONFIG_MSVS) += $(foreach proj,$(LIBVPX_TEST_BINS),\
-     $(notdir $(proj:.cc=.vcproj)))
-
-test::
+test:: testdata
 	@set -e; for t in $(addprefix Win32/Release/,$(notdir $(LIBVPX_TEST_BINS:.cc=.exe))); do $$t; done
 endif
 else
@@ -407,21 +431,25 @@
 $(LIBVPX_TEST_OBJS) $(LIBVPX_TEST_OBJS:.o=.d): CXXFLAGS += -I$(SRC_PATH_BARE)/third_party/googletest/src
 $(LIBVPX_TEST_OBJS) $(LIBVPX_TEST_OBJS:.o=.d): CXXFLAGS += -I$(SRC_PATH_BARE)/third_party/googletest/src/include
 OBJS-$(BUILD_LIBVPX) += $(LIBVPX_TEST_OBJS)
+BINS-$(BUILD_LIBVPX) += $(LIBVPX_TEST_BINS)
 
 # Install test sources only if codec source is included
 INSTALL-SRCS-$(CONFIG_CODEC_SRCS) += $(patsubst $(SRC_PATH_BARE)/%,%,\
     $(shell find $(SRC_PATH_BARE)/third_party/googletest -type f))
 INSTALL-SRCS-$(CONFIG_CODEC_SRCS) += $(LIBVPX_TEST_SRCS)
 
+CODEC_LIB=$(if $(CONFIG_DEBUG_LIBS),vpx_g,vpx)
+CODEC_LIB_SUF=$(if $(CONFIG_SHARED),.so,.a)
 $(foreach bin,$(LIBVPX_TEST_BINS),\
-    $(if $(BUILD_LIBVPX),$(eval $(bin): libvpx.a libgtest.a ))\
+    $(if $(BUILD_LIBVPX),$(eval $(bin): \
+        lib$(CODEC_LIB)$(CODEC_LIB_SUF) libgtest.a ))\
     $(if $(BUILD_LIBVPX),$(eval $(call linkerxx_template,$(bin),\
         $(LIBVPX_TEST_OBJS) \
         -L. -lvpx -lgtest -lpthread -lm)\
         )))\
     $(if $(LIPO_LIBS),$(eval $(call lipo_bin_template,$(bin))))\
 
-test:: $(LIBVPX_TEST_BINS)
+test:: $(LIBVPX_TEST_BINS) testdata
 	@set -e; for t in $(LIBVPX_TEST_BINS); do $$t; done
 
 endif
diff --git a/nestegg/src/nestegg.c b/nestegg/src/nestegg.c
index 63a0e83..cc87788 100644
--- a/nestegg/src/nestegg.c
+++ b/nestegg/src/nestegg.c
@@ -1272,7 +1272,7 @@
   if (total > block_size)
     return -1;
 
-  entry = ne_find_track_entry(ctx, track - 1);
+  entry = ne_find_track_entry(ctx, (unsigned int)(track - 1));
   if (!entry)
     return -1;
 
@@ -1291,7 +1291,7 @@
 
   pkt = ne_alloc(sizeof(*pkt));
   pkt->track = track - 1;
-  pkt->timecode = abs_timecode * tc_scale * track_scale;
+  pkt->timecode = (uint64_t)(abs_timecode * tc_scale * track_scale);
 
   ctx->log(ctx, NESTEGG_LOG_DEBUG, "%sblock t %lld pts %f f %llx frames: %llu",
            block_id == ID_BLOCK ? "" : "simple", pkt->track, pkt->timecode / 1e9, flags, frames);
@@ -1774,35 +1774,35 @@
 
   if (ne_get_uint(entry->video.pixel_width, &value) != 0)
     return -1;
-  params->width = value;
+  params->width = (unsigned int)value;
 
   if (ne_get_uint(entry->video.pixel_height, &value) != 0)
     return -1;
-  params->height = value;
+  params->height = (unsigned int)value;
 
   value = 0;
   ne_get_uint(entry->video.pixel_crop_bottom, &value);
-  params->crop_bottom = value;
+  params->crop_bottom = (unsigned int)value;
 
   value = 0;
   ne_get_uint(entry->video.pixel_crop_top, &value);
-  params->crop_top = value;
+  params->crop_top = (unsigned int)value;
 
   value = 0;
   ne_get_uint(entry->video.pixel_crop_left, &value);
-  params->crop_left = value;
+  params->crop_left = (unsigned int)value;
 
   value = 0;
   ne_get_uint(entry->video.pixel_crop_right, &value);
-  params->crop_right = value;
+  params->crop_right = (unsigned int)value;
 
   value = params->width;
   ne_get_uint(entry->video.display_width, &value);
-  params->display_width = value;
+  params->display_width = (unsigned int)value;
 
   value = params->height;
   ne_get_uint(entry->video.display_height, &value);
-  params->display_height = value;
+  params->display_height = (unsigned int)value;
 
   return 0;
 }
@@ -1828,11 +1828,11 @@
 
   value = 1;
   ne_get_uint(entry->audio.channels, &value);
-  params->channels = value;
+  params->channels = (unsigned int)value;
 
   value = 16;
   ne_get_uint(entry->audio.bit_depth, &value);
-  params->depth = value;
+  params->depth = (unsigned int)value;
 
   return 0;
 }
@@ -1888,7 +1888,7 @@
 int
 nestegg_packet_track(nestegg_packet * pkt, unsigned int * track)
 {
-  *track = pkt->track;
+  *track = (unsigned int)pkt->track;
   return 0;
 }
 
diff --git a/solution.mk b/solution.mk
index 2de1d8d..948305f 100644
--- a/solution.mk
+++ b/solution.mk
@@ -8,18 +8,19 @@
 ##  be found in the AUTHORS file in the root of the source tree.
 ##
 
+# libvpx reverse dependencies (targets that depend on libvpx)
+VPX_NONDEPS=$(addsuffix .vcproj,vpx gtest obj_int_extract)
+VPX_RDEPS=$(foreach vcp,\
+              $(filter-out $(VPX_NONDEPS),$^), --dep=$(vcp:.vcproj=):vpx)
 
 vpx.sln: $(wildcard *.vcproj)
 	@echo "    [CREATE] $@"
 	$(SRC_PATH_BARE)/build/make/gen_msvs_sln.sh \
-            $(if $(filter %vpx.vcproj,$^),\
-                $(foreach vcp,$(filter-out %vpx.vcproj %gtest.vcproj %obj_int_extract.vcproj,$^),\
-                  --dep=$(vcp:.vcproj=):vpx) \
-                $(foreach vcp,$(filter %_test.vcproj,$^),\
-                  --dep=$(vcp:.vcproj=):gtest)) \
-                  --dep=vpx:obj_int_extract \
-                  --ver=$(CONFIG_VS_VERSION)\
-                  --out=$@ $^
+            $(if $(filter vpx.vcproj,$^),$(VPX_RDEPS)) \
+            --dep=vpx:obj_int_extract \
+            --dep=test_libvpx:gtest \
+            --ver=$(CONFIG_VS_VERSION)\
+            --out=$@ $^
 vpx.sln.mk: vpx.sln
 	@true
 
diff --git a/test/acm_random.h b/test/acm_random.h
new file mode 100644
index 0000000..514894e
--- /dev/null
+++ b/test/acm_random.h
@@ -0,0 +1,53 @@
+/*
+ *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef LIBVPX_TEST_ACM_RANDOM_H_
+#define LIBVPX_TEST_ACM_RANDOM_H_
+
+#include <stdlib.h>
+
+#include "vpx/vpx_integer.h"
+
+namespace libvpx_test {
+
+class ACMRandom {
+ public:
+  ACMRandom() {
+    Reset(DeterministicSeed());
+  }
+
+  explicit ACMRandom(int seed) {
+    Reset(seed);
+  }
+
+  void Reset(int seed) {
+    srand(seed);
+  }
+
+  uint8_t Rand8(void) {
+    return (rand() >> 8) & 0xff;
+  }
+
+  int PseudoUniform(int range) {
+    return (rand() >> 8) % range;
+  }
+
+  int operator()(int n) {
+    return PseudoUniform(n);
+  }
+
+  static int DeterministicSeed(void) {
+    return 0xbaba;
+  }
+};
+
+}  // namespace libvpx_test
+
+#endif  // LIBVPX_TEST_ACM_RANDOM_H_
diff --git a/test/altref_test.cc b/test/altref_test.cc
new file mode 100644
index 0000000..ca05577
--- /dev/null
+++ b/test/altref_test.cc
@@ -0,0 +1,71 @@
+/*
+ *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+#include "third_party/googletest/src/include/gtest/gtest.h"
+#include "test/encode_test_driver.h"
+#include "test/i420_video_source.h"
+
+namespace {
+
+// lookahead range: [kLookAheadMin, kLookAheadMax).
+const int kLookAheadMin = 5;
+const int kLookAheadMax = 26;
+
+class AltRefTest : public libvpx_test::EncoderTest,
+    public ::testing::TestWithParam<int> {
+ protected:
+  AltRefTest() : altref_count_(0) {}
+  virtual ~AltRefTest() {}
+
+  virtual void SetUp() {
+    InitializeConfig();
+    SetMode(libvpx_test::kTwoPassGood);
+  }
+
+  virtual void BeginPassHook(unsigned int pass) {
+    altref_count_ = 0;
+  }
+
+  virtual bool Continue() const {
+    return !HasFatalFailure() && !abort_;
+  }
+
+  virtual void PreEncodeFrameHook(libvpx_test::VideoSource *video,
+                                  libvpx_test::Encoder *encoder) {
+    if (video->frame() == 1) {
+      encoder->Control(VP8E_SET_ENABLEAUTOALTREF, 1);
+      encoder->Control(VP8E_SET_CPUUSED, 3);
+    }
+  }
+
+  virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) {
+    if (pkt->data.frame.flags & VPX_FRAME_IS_INVISIBLE) ++altref_count_;
+  }
+
+  int altref_count() const { return altref_count_; }
+
+ private:
+  int altref_count_;
+};
+
+TEST_P(AltRefTest, MonotonicTimestamps) {
+  const vpx_rational timebase = { 33333333, 1000000000 };
+  cfg_.g_timebase = timebase;
+  cfg_.rc_target_bitrate = 1000;
+  cfg_.g_lag_in_frames = GetParam();
+
+  libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
+                                     timebase.den, timebase.num, 0, 30);
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  EXPECT_GE(altref_count(), 1);
+}
+
+INSTANTIATE_TEST_CASE_P(NonZeroLag, AltRefTest,
+                        ::testing::Range(kLookAheadMin, kLookAheadMax));
+}  // namespace
diff --git a/test/boolcoder_test.cc b/test/boolcoder_test.cc
index 41c2f7b..4e21be8 100644
--- a/test/boolcoder_test.cc
+++ b/test/boolcoder_test.cc
@@ -20,28 +20,16 @@
 #include <string.h>
 #include <sys/types.h>
 
+#include "test/acm_random.h"
 #include "third_party/googletest/src/include/gtest/gtest.h"
 #include "vpx/vpx_integer.h"
 
 namespace {
 const int num_tests = 10;
-
-class ACMRandom {
- public:
-  explicit ACMRandom(int seed) { Reset(seed); }
-
-  void Reset(int seed) { srand(seed); }
-
-  uint8_t Rand8(void) { return (rand() >> 8) & 0xff; }
-
-  int PseudoUniform(int range) { return (rand() >> 8) % range; }
-
-  int operator()(int n) { return PseudoUniform(n); }
-
-  static int DeterministicSeed(void) { return 0xbaba; }
-};
 }  // namespace
 
+using libvpx_test::ACMRandom;
+
 TEST(VP8, TestBitIO) {
   ACMRandom rnd(ACMRandom::DeterministicSeed());
   for (int n = 0; n < num_tests; ++n) {
diff --git a/test/config_test.cc b/test/config_test.cc
new file mode 100644
index 0000000..c4da46e
--- /dev/null
+++ b/test/config_test.cc
@@ -0,0 +1,61 @@
+/*
+ *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+#include "third_party/googletest/src/include/gtest/gtest.h"
+#include "test/encode_test_driver.h"
+#include "test/video_source.h"
+
+namespace {
+
+class ConfigTest : public ::libvpx_test::EncoderTest,
+    public ::testing::TestWithParam<enum libvpx_test::TestMode> {
+ public:
+  ConfigTest() : frame_count_in_(0), frame_count_out_(0), frame_count_max_(0) {}
+
+ protected:
+  virtual void SetUp() {
+    InitializeConfig();
+    SetMode(GetParam());
+  }
+
+  virtual void BeginPassHook(unsigned int /*pass*/) {
+    frame_count_in_ = 0;
+    frame_count_out_ = 0;
+  }
+
+  virtual void PreEncodeFrameHook(libvpx_test::VideoSource* /*video*/) {
+    ++frame_count_in_;
+    abort_ |= (frame_count_in_ >= frame_count_max_);
+  }
+
+  virtual void FramePktHook(const vpx_codec_cx_pkt_t* /*pkt*/) {
+    ++frame_count_out_;
+  }
+
+  virtual bool Continue() const {
+    return !HasFatalFailure() && !abort_;
+  }
+
+  unsigned int frame_count_in_;
+  unsigned int frame_count_out_;
+  unsigned int frame_count_max_;
+};
+
+TEST_P(ConfigTest, LagIsDisabled) {
+  frame_count_max_ = 2;
+  cfg_.g_lag_in_frames = 15;
+
+  libvpx_test::DummyVideoSource video;
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+
+  EXPECT_EQ(frame_count_in_, frame_count_out_);
+}
+
+INSTANTIATE_TEST_CASE_P(OnePassModes, ConfigTest, ONE_PASS_TEST_MODES);
+}  // namespace
diff --git a/test/encode_test_driver.cc b/test/encode_test_driver.cc
index 0a188fa..ce7e5b3 100644
--- a/test/encode_test_driver.cc
+++ b/test/encode_test_driver.cc
@@ -94,6 +94,8 @@
 
 void EncoderTest::RunLoop(VideoSource *video) {
   for (unsigned int pass = 0; pass < passes_; pass++) {
+    last_pts_ = 0;
+
     if (passes_ == 1)
       cfg_.g_pass = VPX_RC_ONE_PASS;
     else if (pass == 0)
@@ -105,11 +107,11 @@
     Encoder encoder(cfg_, deadline_, &stats_);
 
     bool again;
-
-    for (video->Begin(), again = true; again; video->Next()) {
+    for (again = true, video->Begin(); again; video->Next()) {
       again = video->img() != NULL;
 
       PreEncodeFrameHook(video);
+      PreEncodeFrameHook(video, &encoder);
       encoder.EncodeFrame(video, flags_);
 
       CxDataIterator iter = encoder.GetCxData();
@@ -120,6 +122,8 @@
         if (pkt->kind != VPX_CODEC_CX_FRAME_PKT)
           continue;
 
+        ASSERT_GE(pkt->data.frame.pts, last_pts_);
+        last_pts_ = pkt->data.frame.pts;
         FramePktHook(pkt);
       }
 
diff --git a/test/encode_test_driver.h b/test/encode_test_driver.h
index 2fb627c..d9a79ec 100644
--- a/test/encode_test_driver.h
+++ b/test/encode_test_driver.h
@@ -101,6 +101,11 @@
     EncodeFrame(video, 0);
   }
 
+  void Control(int ctrl_id, int arg) {
+    const vpx_codec_err_t res = vpx_codec_control_(&encoder_, ctrl_id, arg);
+    ASSERT_EQ(VPX_CODEC_OK, res) << EncoderError();
+  }
+
   void set_deadline(unsigned long deadline) {
     deadline_ = deadline;
   }
@@ -133,7 +138,7 @@
 // classes directly, so that tests can be parameterized differently.
 class EncoderTest {
  protected:
-  EncoderTest() : abort_(false), flags_(0) {}
+  EncoderTest() : abort_(false), flags_(0), last_pts_(0) {}
 
   virtual ~EncoderTest() {}
 
@@ -158,6 +163,7 @@
 
   // Hook to be called before encoding a frame.
   virtual void PreEncodeFrameHook(VideoSource *video) {}
+  virtual void PreEncodeFrameHook(VideoSource *video, Encoder *encoder) {}
 
   // Hook to be called on every compressed data packet.
   virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) {}
@@ -171,12 +177,9 @@
   unsigned long        deadline_;
   TwopassStatsStore    stats_;
   unsigned long        flags_;
+  vpx_codec_pts_t      last_pts_;
 };
 
 }  // namespace libvpx_test
 
-// Macros to be used with ::testing::Combine
-#define PARAMS(...) ::testing::TestWithParam< std::tr1::tuple< __VA_ARGS__ > >
-#define GET_PARAM(k) std::tr1::get< k >(GetParam())
-
 #endif  // TEST_ENCODE_TEST_DRIVER_H_
diff --git a/test/fdct4x4_test.cc b/test/fdct4x4_test.cc
new file mode 100644
index 0000000..619b23d
--- /dev/null
+++ b/test/fdct4x4_test.cc
@@ -0,0 +1,169 @@
+/*
+*  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+*
+*  Use of this source code is governed by a BSD-style license
+*  that can be found in the LICENSE file in the root of the source
+*  tree. An additional intellectual property rights grant can be found
+*  in the file PATENTS.  All contributing project authors may
+*  be found in the AUTHORS file in the root of the source tree.
+*/
+
+
+#include <math.h>
+#include <stddef.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/types.h>
+
+
+extern "C" {
+#include "vpx_rtcd.h"
+}
+
+#include "test/acm_random.h"
+#include "third_party/googletest/src/include/gtest/gtest.h"
+#include "vpx/vpx_integer.h"
+
+
+namespace {
+
+const int cospi8sqrt2minus1 = 20091;
+const int sinpi8sqrt2 = 35468;
+
+void reference_idct4x4(const int16_t *input, int16_t *output) {
+  const int16_t *ip = input;
+  int16_t *op = output;
+
+  for (int i = 0; i < 4; ++i) {
+    const int a1 = ip[0] + ip[8];
+    const int b1 = ip[0] - ip[8];
+    const int temp1 = (ip[4] * sinpi8sqrt2) >> 16;
+    const int temp2 = ip[12] + ((ip[12] * cospi8sqrt2minus1) >> 16);
+    const int c1 = temp1 - temp2;
+    const int temp3 = ip[4] + ((ip[4] * cospi8sqrt2minus1) >> 16);
+    const int temp4 = (ip[12] * sinpi8sqrt2) >> 16;
+    const int d1 = temp3 + temp4;
+    op[0] = a1 + d1;
+    op[12] = a1 - d1;
+    op[4] = b1 + c1;
+    op[8] = b1 - c1;
+    ++ip;
+    ++op;
+  }
+  ip = output;
+  op = output;
+  for (int i = 0; i < 4; ++i) {
+    const int a1 = ip[0] + ip[2];
+    const int b1 = ip[0] - ip[2];
+    const int temp1 = (ip[1] * sinpi8sqrt2) >> 16;
+    const int temp2 = ip[3] + ((ip[3] * cospi8sqrt2minus1) >> 16);
+    const int c1 = temp1 - temp2;
+    const int temp3 = ip[1] + ((ip[1] * cospi8sqrt2minus1) >> 16);
+    const int temp4 = (ip[3] * sinpi8sqrt2) >> 16;
+    const int d1 = temp3 + temp4;
+    op[0] = (a1 + d1 + 4) >> 3;
+    op[3] = (a1 - d1 + 4) >> 3;
+    op[1] = (b1 + c1 + 4) >> 3;
+    op[2] = (b1 - c1 + 4) >> 3;
+    ip += 4;
+    op += 4;
+  }
+}
+
+using libvpx_test::ACMRandom;
+
+TEST(Vp8FdctTest, SignBiasCheck) {
+  ACMRandom rnd(ACMRandom::DeterministicSeed());
+  int16_t test_input_block[16];
+  int16_t test_output_block[16];
+  const int pitch = 8;
+  int count_sign_block[16][2];
+  const int count_test_block = 1000000;
+
+  memset(count_sign_block, 0, sizeof(count_sign_block));
+
+  for (int i = 0; i < count_test_block; ++i) {
+    // Initialize a test block with input range [-255, 255].
+    for (int j = 0; j < 16; ++j)
+      test_input_block[j] = rnd.Rand8() - rnd.Rand8();
+
+    vp8_short_fdct4x4_c(test_input_block, test_output_block, pitch);
+
+    for (int j = 0; j < 16; ++j) {
+      if (test_output_block[j] < 0)
+        ++count_sign_block[j][0];
+      else if (test_output_block[j] > 0)
+        ++count_sign_block[j][1];
+    }
+  }
+
+  bool bias_acceptable = true;
+  for (int j = 0; j < 16; ++j)
+    bias_acceptable = bias_acceptable &&
+    (abs(count_sign_block[j][0] - count_sign_block[j][1]) < 10000);
+
+  EXPECT_EQ(true, bias_acceptable)
+    << "Error: 4x4 FDCT has a sign bias > 1% for input range [-255, 255]";
+
+  memset(count_sign_block, 0, sizeof(count_sign_block));
+
+  for (int i = 0; i < count_test_block; ++i) {
+    // Initialize a test block with input range [-15, 15].
+    for (int j = 0; j < 16; ++j)
+      test_input_block[j] = (rnd.Rand8() >> 4) - (rnd.Rand8() >> 4);
+
+    vp8_short_fdct4x4_c(test_input_block, test_output_block, pitch);
+
+    for (int j = 0; j < 16; ++j) {
+      if (test_output_block[j] < 0)
+        ++count_sign_block[j][0];
+      else if (test_output_block[j] > 0)
+        ++count_sign_block[j][1];
+    }
+  }
+
+  bias_acceptable = true;
+  for (int j = 0; j < 16; ++j)
+    bias_acceptable = bias_acceptable &&
+    (abs(count_sign_block[j][0] - count_sign_block[j][1]) < 100000);
+
+  EXPECT_EQ(true, bias_acceptable)
+    << "Error: 4x4 FDCT has a sign bias > 10% for input range [-15, 15]";
+};
+
+TEST(Vp8FdctTest, RoundTripErrorCheck) {
+  ACMRandom rnd(ACMRandom::DeterministicSeed());
+  int max_error = 0;
+  double total_error = 0;
+  const int count_test_block = 1000000;
+  for (int i = 0; i < count_test_block; ++i) {
+    int16_t test_input_block[16];
+    int16_t test_temp_block[16];
+    int16_t test_output_block[16];
+
+    // Initialize a test block with input range [-255, 255].
+    for (int j = 0; j < 16; ++j)
+      test_input_block[j] = rnd.Rand8() - rnd.Rand8();
+
+    const int pitch = 8;
+    vp8_short_fdct4x4_c(test_input_block, test_temp_block, pitch);
+    reference_idct4x4(test_temp_block, test_output_block);
+
+    for (int j = 0; j < 16; ++j) {
+      const int diff = test_input_block[j] - test_output_block[j];
+      const int error = diff * diff;
+      if (max_error < error)
+        max_error = error;
+      total_error += error;
+    }
+  }
+
+  EXPECT_GE(1, max_error )
+    << "Error: FDCT/IDCT has an individual roundtrip error > 1";
+
+  EXPECT_GE(count_test_block, total_error)
+    << "Error: FDCT/IDCT has average roundtrip error > 1 per block";
+};
+
+}  // namespace
diff --git a/test/i420_video_source.h b/test/i420_video_source.h
new file mode 100644
index 0000000..c7362d2
--- /dev/null
+++ b/test/i420_video_source.h
@@ -0,0 +1,120 @@
+/*
+ *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+#ifndef TEST_I420_VIDEO_SOURCE_H_
+#define TEST_I420_VIDEO_SOURCE_H_
+#include <cstdio>
+#include <cstdlib>
+
+#include "test/video_source.h"
+
+namespace libvpx_test {
+
+// This class extends VideoSource to allow parsing of raw yv12
+// so that we can do actual file encodes.
+class I420VideoSource : public VideoSource {
+ public:
+  I420VideoSource(const std::string &file_name,
+                  unsigned int width, unsigned int height,
+                  int rate_numerator, int rate_denominator,
+                  unsigned int start, int limit)
+      : file_name_(file_name),
+        img_(NULL),
+        start_(start),
+        limit_(limit),
+        frame_(0),
+        width_(0),
+        height_(0),
+        framerate_numerator_(rate_numerator),
+        framerate_denominator_(rate_denominator) {
+
+    // This initializes raw_sz_, width_, height_ and allocates an img.
+    SetSize(width, height);
+  }
+
+  virtual ~I420VideoSource() {
+    vpx_img_free(img_);
+    if (input_file_)
+      fclose(input_file_);
+  }
+
+  virtual void Begin() {
+    std::string path_to_source = file_name_;
+    const char *kDataPath = getenv("LIBVPX_TEST_DATA_PATH");
+    if (kDataPath) {
+      path_to_source = kDataPath;
+      path_to_source += "/";
+      path_to_source += file_name_;
+    }
+
+    input_file_ = fopen(path_to_source.c_str(), "rb");
+    ASSERT_TRUE(input_file_) << "File open failed.";
+
+    if (start_) {
+      fseek(input_file_, raw_sz_ * start_, SEEK_SET);
+    }
+
+    frame_ = start_;
+    FillFrame();
+  }
+
+  virtual void Next() {
+    ++frame_;
+    FillFrame();
+  }
+
+  virtual vpx_image_t *img() const { return (frame_ < limit_) ? img_ : NULL;  }
+
+  // Models a stream where Timebase = 1/FPS, so pts == frame.
+  virtual vpx_codec_pts_t pts() const { return frame_; }
+
+  virtual unsigned long duration() const { return 1; }
+
+  virtual vpx_rational_t timebase() const {
+    const vpx_rational_t t = { framerate_denominator_, framerate_numerator_ };
+    return t;
+  }
+
+  virtual unsigned int frame() const { return frame_; }
+
+  void SetSize(unsigned int width, unsigned int height) {
+    if (width != width_ || height != height_) {
+      vpx_img_free(img_);
+      img_ = vpx_img_alloc(NULL, VPX_IMG_FMT_VPXI420, width, height, 1);
+      ASSERT_TRUE(img_ != NULL);
+      width_ = width;
+      height_ = height;
+      raw_sz_ = width * height * 3 / 2;
+    }
+  }
+
+  virtual void FillFrame() {
+    // Read a frame from input_file.
+    if (fread(img_->img_data, raw_sz_, 1, input_file_) == 0) {
+      limit_ = frame_;
+    }
+  }
+
+ protected:
+  std::string file_name_;
+  FILE *input_file_;
+  vpx_image_t *img_;
+  size_t raw_sz_;
+  unsigned int start_;
+  unsigned int limit_;
+  unsigned int frame_;
+  unsigned int width_;
+  unsigned int height_;
+  unsigned int framerate_numerator_;
+  unsigned int framerate_denominator_;
+};
+
+}  // namespace libvpx_test
+
+#endif  // TEST_I420_VIDEO_SOURCE_H_
diff --git a/test/idctllm_test.cc b/test/idctllm_test.cc
old mode 100755
new mode 100644
diff --git a/test/intrapred_test.cc b/test/intrapred_test.cc
new file mode 100644
index 0000000..d741353
--- /dev/null
+++ b/test/intrapred_test.cc
@@ -0,0 +1,317 @@
+/*
+ *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include <string.h>
+#include "test/acm_random.h"
+#include "third_party/googletest/src/include/gtest/gtest.h"
+extern "C" {
+#include "vpx_config.h"
+#include "vpx_rtcd.h"
+#include "vp8/common/blockd.h"
+}
+
+namespace {
+
+using libvpx_test::ACMRandom;
+
+class IntraPredBase {
+ protected:
+  void SetupMacroblock(uint8_t *data, int block_size, int stride,
+                       int num_planes) {
+    memset(&mb_, 0, sizeof(mb_));
+    memset(&mi_, 0, sizeof(mi_));
+    mb_.up_available = 1;
+    mb_.left_available = 1;
+    mb_.mode_info_context = &mi_;
+    stride_ = stride;
+    block_size_ = block_size;
+    num_planes_ = num_planes;
+    for (int p = 0; p < num_planes; p++)
+      data_ptr_[p] = data + stride * (block_size + 1) * p +
+                     stride + block_size;
+  }
+
+  void FillRandom() {
+    // Fill edges with random data
+    ACMRandom rnd(ACMRandom::DeterministicSeed());
+    for (int p = 0; p < num_planes_; p++) {
+      for (int x = -1 ; x <= block_size_; x++)
+        data_ptr_[p][x - stride_] = rnd.Rand8();
+      for (int y = 0; y < block_size_; y++)
+        data_ptr_[p][y * stride_ - 1] = rnd.Rand8();
+    }
+  }
+
+  virtual void Predict(MB_PREDICTION_MODE mode) = 0;
+
+  void SetLeftUnavailable() {
+    mb_.left_available = 0;
+    for (int p = 0; p < num_planes_; p++)
+      for (int i = -1; i < block_size_; ++i)
+        data_ptr_[p][stride_ * i - 1] = 129;
+  }
+
+  void SetTopUnavailable() {
+    mb_.up_available = 0;
+    for (int p = 0; p < num_planes_; p++)
+      memset(&data_ptr_[p][-1 - stride_], 127, block_size_ + 2);
+  }
+
+  void SetTopLeftUnavailable() {
+    SetLeftUnavailable();
+    SetTopUnavailable();
+  }
+
+  int BlockSizeLog2Min1() const {
+    switch (block_size_) {
+      case 16:
+        return 3;
+      case 8:
+        return 2;
+      default:
+        return 0;
+    }
+  }
+
+  // check DC prediction output against a reference
+  void CheckDCPrediction() const {
+    for (int p = 0; p < num_planes_; p++) {
+      // calculate expected DC
+      int expected;
+      if (mb_.up_available || mb_.left_available) {
+        int sum = 0, shift = BlockSizeLog2Min1() + mb_.up_available +
+                             mb_.left_available;
+        if (mb_.up_available)
+          for (int x = 0; x < block_size_; x++)
+            sum += data_ptr_[p][x - stride_];
+        if (mb_.left_available)
+          for (int y = 0; y < block_size_; y++)
+            sum += data_ptr_[p][y * stride_ - 1];
+        expected = (sum + (1 << (shift - 1))) >> shift;
+      } else
+        expected = 0x80;
+
+      // check that all subsequent lines are equal to the first
+      for (int y = 1; y < block_size_; ++y)
+        ASSERT_EQ(0, memcmp(data_ptr_[p], &data_ptr_[p][y * stride_],
+                            block_size_));
+      // within the first line, ensure that each pixel has the same value
+      for (int x = 1; x < block_size_; ++x)
+        ASSERT_EQ(data_ptr_[p][0], data_ptr_[p][x]);
+      // now ensure that that pixel has the expected (DC) value
+      ASSERT_EQ(expected, data_ptr_[p][0]);
+    }
+  }
+
+  // check V prediction output against a reference
+  void CheckVPrediction() const {
+    // check that all lines equal the top border
+    for (int p = 0; p < num_planes_; p++)
+      for (int y = 0; y < block_size_; y++)
+        ASSERT_EQ(0, memcmp(&data_ptr_[p][-stride_],
+                            &data_ptr_[p][y * stride_], block_size_));
+  }
+
+  // check H prediction output against a reference
+  void CheckHPrediction() const {
+    // for each line, ensure that each pixel is equal to the left border
+    for (int p = 0; p < num_planes_; p++)
+      for (int y = 0; y < block_size_; y++)
+        for (int x = 0; x < block_size_; x++)
+          ASSERT_EQ(data_ptr_[p][-1 + y * stride_],
+                    data_ptr_[p][x + y * stride_]);
+  }
+
+  static int ClipByte(int value) {
+    if (value > 255)
+      return 255;
+    else if (value < 0)
+      return 0;
+    return value;
+  }
+
+  // check TM prediction output against a reference
+  void CheckTMPrediction() const {
+    for (int p = 0; p < num_planes_; p++)
+      for (int y = 0; y < block_size_; y++)
+        for (int x = 0; x < block_size_; x++) {
+          const int expected = ClipByte(data_ptr_[p][x - stride_]
+                                      + data_ptr_[p][stride_ * y - 1]
+                                      - data_ptr_[p][-1 - stride_]);
+          ASSERT_EQ(expected, data_ptr_[p][y * stride_ + x]);
+       }
+  }
+
+  // Actual test
+  void RunTest() {
+    {
+      SCOPED_TRACE("DC_PRED");
+      FillRandom();
+      Predict(DC_PRED);
+      CheckDCPrediction();
+    }
+    {
+      SCOPED_TRACE("DC_PRED LEFT");
+      FillRandom();
+      SetLeftUnavailable();
+      Predict(DC_PRED);
+      CheckDCPrediction();
+    }
+    {
+      SCOPED_TRACE("DC_PRED TOP");
+      FillRandom();
+      SetTopUnavailable();
+      Predict(DC_PRED);
+      CheckDCPrediction();
+    }
+    {
+      SCOPED_TRACE("DC_PRED TOP_LEFT");
+      FillRandom();
+      SetTopLeftUnavailable();
+      Predict(DC_PRED);
+      CheckDCPrediction();
+    }
+    {
+      SCOPED_TRACE("H_PRED");
+      FillRandom();
+      Predict(H_PRED);
+      CheckHPrediction();
+    }
+    {
+      SCOPED_TRACE("V_PRED");
+      FillRandom();
+      Predict(V_PRED);
+      CheckVPrediction();
+    }
+    {
+      SCOPED_TRACE("TM_PRED");
+      FillRandom();
+      Predict(TM_PRED);
+      CheckTMPrediction();
+    }
+  }
+
+  MACROBLOCKD mb_;
+  MODE_INFO mi_;
+  uint8_t *data_ptr_[2];  // in the case of Y, only [0] is used
+  int stride_;
+  int block_size_;
+  int num_planes_;
+};
+
+typedef void (*intra_pred_y_fn_t)(MACROBLOCKD *x,
+                                  uint8_t *yabove_row,
+                                  uint8_t *yleft,
+                                  int left_stride,
+                                  uint8_t *ypred_ptr,
+                                  int y_stride);
+
+class IntraPredYTest : public ::testing::TestWithParam<intra_pred_y_fn_t>,
+    protected IntraPredBase {
+ protected:
+  static const int kBlockSize = 16;
+  static const int kStride = kBlockSize * 3;
+
+  virtual void SetUp() {
+    pred_fn_ = GetParam();
+    SetupMacroblock(data_array_, kBlockSize, kStride, 1);
+  }
+
+  virtual void Predict(MB_PREDICTION_MODE mode) {
+    mb_.mode_info_context->mbmi.mode = mode;
+    pred_fn_(&mb_, data_ptr_[0] - kStride, data_ptr_[0] - 1, kStride,
+             data_ptr_[0], kStride);
+  }
+
+  intra_pred_y_fn_t pred_fn_;
+  // We use 48 so that the data pointer of the first pixel in each row of
+  // each macroblock is 16-byte aligned, and this gives us access to the
+  // top-left and top-right corner pixels belonging to the top-left/right
+  // macroblocks.
+  // We use 17 lines so we have one line above us for top-prediction.
+  DECLARE_ALIGNED(16, uint8_t, data_array_[kStride * (kBlockSize + 1)]);
+};
+
+TEST_P(IntraPredYTest, IntraPredTests) {
+  RunTest();
+}
+
+INSTANTIATE_TEST_CASE_P(C, IntraPredYTest,
+                        ::testing::Values(
+                            vp8_build_intra_predictors_mby_s_c));
+#if HAVE_SSE2
+INSTANTIATE_TEST_CASE_P(SSE2, IntraPredYTest,
+                        ::testing::Values(
+                            vp8_build_intra_predictors_mby_s_sse2));
+#endif
+#if HAVE_SSSE3
+INSTANTIATE_TEST_CASE_P(SSSE3, IntraPredYTest,
+                        ::testing::Values(
+                            vp8_build_intra_predictors_mby_s_ssse3));
+#endif
+
+typedef void (*intra_pred_uv_fn_t)(MACROBLOCKD *x,
+                                   uint8_t *uabove_row,
+                                   uint8_t *vabove_row,
+                                   uint8_t *uleft,
+                                   uint8_t *vleft,
+                                   int left_stride,
+                                   uint8_t *upred_ptr,
+                                   uint8_t *vpred_ptr,
+                                   int pred_stride);
+
+class IntraPredUVTest : public ::testing::TestWithParam<intra_pred_uv_fn_t>,
+    protected IntraPredBase {
+ protected:
+  static const int kBlockSize = 8;
+  static const int kStride = kBlockSize * 3;
+
+  virtual void SetUp() {
+    pred_fn_ = GetParam();
+    SetupMacroblock(data_array_, kBlockSize, kStride, 2);
+  }
+
+  virtual void Predict(MB_PREDICTION_MODE mode) {
+    mb_.mode_info_context->mbmi.uv_mode = mode;
+    pred_fn_(&mb_, data_ptr_[0] - kStride, data_ptr_[1] - kStride,
+             data_ptr_[0] - 1, data_ptr_[1] - 1, kStride,
+             data_ptr_[0], data_ptr_[1], kStride);
+  }
+
+  intra_pred_uv_fn_t pred_fn_;
+  // We use 24 so that the data pointer of the first pixel in each row of
+  // each macroblock is 8-byte aligned, and this gives us access to the
+  // top-left and top-right corner pixels belonging to the top-left/right
+  // macroblocks.
+  // We use 9 lines so we have one line above us for top-prediction.
+  // [0] = U, [1] = V
+  DECLARE_ALIGNED(8, uint8_t, data_array_[2 * kStride * (kBlockSize + 1)]);
+};
+
+TEST_P(IntraPredUVTest, IntraPredTests) {
+  RunTest();
+}
+
+INSTANTIATE_TEST_CASE_P(C, IntraPredUVTest,
+                        ::testing::Values(
+                            vp8_build_intra_predictors_mbuv_s_c));
+#if HAVE_SSE2
+INSTANTIATE_TEST_CASE_P(SSE2, IntraPredUVTest,
+                        ::testing::Values(
+                            vp8_build_intra_predictors_mbuv_s_sse2));
+#endif
+#if HAVE_SSSE3
+INSTANTIATE_TEST_CASE_P(SSSE3, IntraPredUVTest,
+                        ::testing::Values(
+                            vp8_build_intra_predictors_mbuv_s_ssse3));
+#endif
+
+}  // namespace
diff --git a/test/keyframe_test.cc b/test/keyframe_test.cc
index db18e5d..19c7152 100644
--- a/test/keyframe_test.cc
+++ b/test/keyframe_test.cc
@@ -10,13 +10,13 @@
 #include <climits>
 #include <vector>
 #include "test/encode_test_driver.h"
-#include "test/video_source.h"
+#include "test/i420_video_source.h"
 #include "third_party/googletest/src/include/gtest/gtest.h"
 
 namespace {
 
 class KeyframeTest : public ::libvpx_test::EncoderTest,
-  public ::testing::TestWithParam<enum libvpx_test::TestMode> {
+    public ::testing::TestWithParam<enum libvpx_test::TestMode> {
  protected:
   virtual void SetUp() {
     InitializeConfig();
@@ -24,15 +24,19 @@
     kf_count_ = 0;
     kf_count_max_ = INT_MAX;
     kf_do_force_kf_ = false;
+    set_cpu_used_ = 0;
   }
 
-  virtual bool Continue() {
+  virtual bool Continue() const {
     return !HasFatalFailure() && !abort_;
   }
 
-  virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video) {
+  virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
+                                  ::libvpx_test::Encoder *encoder) {
     if (kf_do_force_kf_)
       flags_ = (video->frame() % 3) ? 0 : VPX_EFLAG_FORCE_KF;
+    if (set_cpu_used_ && video->frame() == 1)
+      encoder->Control(VP8E_SET_CPUUSED, set_cpu_used_);
   }
 
   virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) {
@@ -46,7 +50,8 @@
   bool kf_do_force_kf_;
   int kf_count_;
   int kf_count_max_;
-  std::vector< vpx_codec_pts_t > kf_pts_list_;
+  std::vector<vpx_codec_pts_t> kf_pts_list_;
+  int set_cpu_used_;
 };
 
 TEST_P(KeyframeTest, TestRandomVideoSource) {
@@ -78,9 +83,8 @@
   ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
 
   // verify that every third frame is a keyframe.
-  for (std::vector<vpx_codec_pts_t>::iterator iter = kf_pts_list_.begin();
-       iter != kf_pts_list_.end();
-       ++iter) {
+  for (std::vector<vpx_codec_pts_t>::const_iterator iter = kf_pts_list_.begin();
+       iter != kf_pts_list_.end(); ++iter) {
     ASSERT_EQ(0, *iter % 3) << "Unexpected keyframe at frame " << *iter;
   }
 }
@@ -92,12 +96,44 @@
   ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
 
   // verify that keyframe interval matches kf_max_dist
-  for (std::vector<vpx_codec_pts_t>::iterator iter = kf_pts_list_.begin();
-       iter != kf_pts_list_.end();
-       iter++) {
+  for (std::vector<vpx_codec_pts_t>::const_iterator iter = kf_pts_list_.begin();
+       iter != kf_pts_list_.end(); ++iter) {
     ASSERT_EQ(0, *iter % 25) << "Unexpected keyframe at frame " << *iter;
   }
 }
 
+TEST_P(KeyframeTest, TestAutoKeyframe) {
+  cfg_.kf_mode = VPX_KF_AUTO;
+  kf_do_force_kf_ = false;
+
+  // Force a deterministic speed step in Real Time mode, as the faster modes
+  // may not produce a keyframe like we expect. This is necessary when running
+  // on very slow environments (like Valgrind). The step -11 was determined
+// experimentally as the fastest mode that still produces the keyframe.
+  if (deadline_ == VPX_DL_REALTIME)
+    set_cpu_used_ = -11;
+
+  // This clip has a cut scene every 30 frames -> Frame 0, 30, 60, 90, 120.
+  // I check only the first 40 frames to make sure there's a keyframe at frame
+  // 0 and 30.
+  ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
+                                       30, 1, 0, 40);
+
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+
+  EXPECT_EQ(2u, kf_pts_list_.size()) << " Not the right number of keyframes ";
+
+  // Verify that the detected keyframes match the cut scenes in the file.
+  for (std::vector<vpx_codec_pts_t>::const_iterator iter = kf_pts_list_.begin();
+       iter != kf_pts_list_.end(); ++iter) {
+
+    if (deadline_ == VPX_DL_REALTIME && *iter > 0)
+      EXPECT_EQ(0, (*iter - 1) % 30) << "Unexpected keyframe at frame "
+        << *iter;
+    else
+      EXPECT_EQ(0, *iter % 30) << "Unexpected keyframe at frame " << *iter;
+  }
+}
+
 INSTANTIATE_TEST_CASE_P(AllModes, KeyframeTest, ALL_TEST_MODES);
 }  // namespace
diff --git a/test/pp_filter_test.cc b/test/pp_filter_test.cc
new file mode 100644
index 0000000..90b28e4
--- /dev/null
+++ b/test/pp_filter_test.cc
@@ -0,0 +1,109 @@
+/*
+ *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+#include "third_party/googletest/src/include/gtest/gtest.h"
+extern "C" {
+#include "vpx_config.h"
+#include "vpx_rtcd.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_mem/vpx_mem.h"
+}
+
+typedef void (*post_proc_func_t)(unsigned char *src_ptr,
+                                 unsigned char *dst_ptr,
+                                 int src_pixels_per_line,
+                                 int dst_pixels_per_line,
+                                 int rows,
+                                 int cols,
+                                 int flimit);
+
+namespace {
+
+class Vp8PostProcessingFilterTest
+    : public ::testing::TestWithParam<post_proc_func_t> {};
+
+// Test routine for the VP8 post-processing function
+// vp8_post_proc_down_and_across_c.
+
+TEST_P(Vp8PostProcessingFilterTest, FilterOutputCheck) {
+  // Size of the underlying data block that will be filtered.
+  const int block_width  = 16;
+  const int block_height = 16;
+
+  // 5-tap filter needs 2 padding rows above and below the block in the input.
+  const int input_width = block_width;
+  const int input_height = block_height + 4;
+  const int input_stride = input_width;
+  const int input_size = input_width * input_height;
+
+  // Filter extends output block by 8 samples at left and right edges.
+  const int output_width = block_width + 16;
+  const int output_height = block_height;
+  const int output_stride = output_width;
+  const int output_size = output_width * output_height;
+
+  uint8_t *const src_image =
+      reinterpret_cast<uint8_t*>(vpx_calloc(input_size, 1));
+  uint8_t *const dst_image =
+      reinterpret_cast<uint8_t*>(vpx_calloc(output_size, 1));
+
+  // Pointers to top-left pixel of block in the input and output images.
+  uint8_t *const src_image_ptr = src_image + (input_stride << 1);
+  uint8_t *const dst_image_ptr = dst_image + 8;
+
+  // Initialize pixels in the input:
+  //   block pixels to value 1,
+  //   border pixels to value 10.
+  (void)vpx_memset(src_image, 10, input_size);
+  uint8_t *pixel_ptr = src_image_ptr;
+  for (int i = 0; i < block_height; ++i) {
+    for (int j = 0; j < block_width; ++j) {
+      pixel_ptr[j] = 1;
+    }
+    pixel_ptr += input_stride;
+  }
+
+  // Initialize pixels in the output to 99.
+  (void)vpx_memset(dst_image, 99, output_size);
+
+  GetParam()(src_image_ptr, dst_image_ptr, input_stride,
+             output_stride, block_height, block_width,
+             255);
+
+  static const uint8_t expected_data[block_height] = {
+    3, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 3
+  };
+
+  pixel_ptr = dst_image;
+  for (int i = 0; i < block_height; ++i) {
+    for (int j = 0; j < block_width; ++j) {
+      EXPECT_EQ(expected_data[i], pixel_ptr[j])
+          << "Vp8PostProcessingFilterTest failed with invalid filter output";
+    }
+    pixel_ptr += output_stride;
+  }
+
+  vpx_free(src_image);
+  vpx_free(dst_image);
+};
+
+INSTANTIATE_TEST_CASE_P(C, Vp8PostProcessingFilterTest,
+                        ::testing::Values(vp8_post_proc_down_and_across_c));
+
+#if HAVE_MMX
+INSTANTIATE_TEST_CASE_P(MMX, Vp8PostProcessingFilterTest,
+                        ::testing::Values(vp8_post_proc_down_and_across_mmx));
+#endif
+
+#if HAVE_SSE2
+INSTANTIATE_TEST_CASE_P(SSE2, Vp8PostProcessingFilterTest,
+                        ::testing::Values(vp8_post_proc_down_and_across_xmm));
+#endif
+
+}  // namespace
diff --git a/test/sad_test.cc b/test/sad_test.cc
new file mode 100644
index 0000000..3e819ea
--- /dev/null
+++ b/test/sad_test.cc
@@ -0,0 +1,228 @@
+/*
+ *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include <string.h>
+#include <limits.h>
+#include <stdio.h>
+
+extern "C" {
+#include "./vpx_config.h"
+#include "./vpx_rtcd.h"
+#include "vp8/common/blockd.h"
+}
+
+#include "test/acm_random.h"
+#include "test/util.h"
+#include "third_party/googletest/src/include/gtest/gtest.h"
+
+
+typedef unsigned int (*sad_m_by_n_fn_t)(const unsigned char *source_ptr,
+                                        int source_stride,
+                                        const unsigned char *reference_ptr,
+                                        int reference_stride,
+                                        unsigned int max_sad);
+
+using libvpx_test::ACMRandom;
+
+namespace {
+class SADTest : public PARAMS(int, int, sad_m_by_n_fn_t) {
+  protected:
+  virtual void SetUp() {
+    sad_fn_ = GET_PARAM(2);
+    height_ = GET_PARAM(1);
+    width_ = GET_PARAM(0);
+    source_stride_ = width_ * 2;
+    reference_stride_ = width_ * 2;
+    rnd_.Reset(ACMRandom::DeterministicSeed());
+  }
+
+  sad_m_by_n_fn_t sad_fn_;
+  virtual unsigned int SAD(unsigned int max_sad) {
+    return sad_fn_(source_data_, source_stride_,
+                   reference_data_, reference_stride_,
+                   max_sad);
+  }
+
+  // Sum of Absolute Differences. Given two blocks, calculate the absolute
+  // difference between two pixels in the same relative location; accumulate.
+  unsigned int ReferenceSAD(unsigned int max_sad) {
+    unsigned int sad = 0;
+
+    for (int h = 0; h < height_; ++h) {
+      for (int w = 0; w < width_; ++w) {
+        sad += abs(source_data_[h * source_stride_ + w]
+               - reference_data_[h * reference_stride_ + w]);
+      }
+      if (sad > max_sad) {
+        break;
+      }
+    }
+    return sad;
+  }
+
+  void FillConstant(uint8_t *data, int stride, uint8_t fill_constant) {
+    for (int h = 0; h < height_; ++h) {
+      for (int w = 0; w < width_; ++w) {
+        data[h * stride + w] = fill_constant;
+      }
+    }
+  }
+
+  void FillRandom(uint8_t *data, int stride) {
+    for (int h = 0; h < height_; ++h) {
+      for (int w = 0; w < width_; ++w) {
+        data[h * stride + w] = rnd_.Rand8();
+      }
+    }
+  }
+
+  void CheckSad(unsigned int max_sad) {
+    unsigned int reference_sad, exp_sad;
+
+    reference_sad = ReferenceSAD(max_sad);
+    exp_sad = SAD(max_sad);
+
+    if (reference_sad <= max_sad) {
+      ASSERT_EQ(exp_sad, reference_sad);
+    } else {
+      // Alternative implementations are not required to check max_sad
+      ASSERT_GE(exp_sad, reference_sad);
+    }
+  }
+
+  // Handle blocks up to 16x16 with stride up to 32
+  int height_, width_;
+  DECLARE_ALIGNED(16, uint8_t, source_data_[16*32]);
+  int source_stride_;
+  DECLARE_ALIGNED(16, uint8_t, reference_data_[16*32]);
+  int reference_stride_;
+
+  ACMRandom rnd_;
+};
+
+TEST_P(SADTest, MaxRef) {
+  FillConstant(source_data_, source_stride_, 0);
+  FillConstant(reference_data_, reference_stride_, 255);
+  CheckSad(UINT_MAX);
+}
+
+TEST_P(SADTest, MaxSrc) {
+  FillConstant(source_data_, source_stride_, 255);
+  FillConstant(reference_data_, reference_stride_, 0);
+  CheckSad(UINT_MAX);
+}
+
+TEST_P(SADTest, ShortRef) {
+  int tmp_stride = reference_stride_;
+  reference_stride_ >>= 1;
+  FillRandom(source_data_, source_stride_);
+  FillRandom(reference_data_, reference_stride_);
+  CheckSad(UINT_MAX);
+  reference_stride_ = tmp_stride;
+}
+
+TEST_P(SADTest, UnalignedRef) {
+  // The reference frame, but not the source frame, may be unaligned for
+  // certain types of searches.
+  int tmp_stride = reference_stride_;
+  reference_stride_ -= 1;
+  FillRandom(source_data_, source_stride_);
+  FillRandom(reference_data_, reference_stride_);
+  CheckSad(UINT_MAX);
+  reference_stride_ = tmp_stride;
+}
+
+TEST_P(SADTest, ShortSrc) {
+  int tmp_stride = source_stride_;
+  source_stride_ >>= 1;
+  FillRandom(source_data_, source_stride_);
+  FillRandom(reference_data_, reference_stride_);
+  CheckSad(UINT_MAX);
+  source_stride_ = tmp_stride;
+}
+
+TEST_P(SADTest, MaxSAD) {
+  // Verify that, when max_sad is set, the implementation does not return a
+  // value lower than the reference.
+  FillConstant(source_data_, source_stride_, 255);
+  FillConstant(reference_data_, reference_stride_, 0);
+  CheckSad(128);
+}
+
+using std::tr1::make_tuple;
+
+const sad_m_by_n_fn_t sad_16x16_c = vp8_sad16x16_c;
+const sad_m_by_n_fn_t sad_8x16_c = vp8_sad8x16_c;
+const sad_m_by_n_fn_t sad_16x8_c = vp8_sad16x8_c;
+const sad_m_by_n_fn_t sad_8x8_c = vp8_sad8x8_c;
+const sad_m_by_n_fn_t sad_4x4_c = vp8_sad4x4_c;
+INSTANTIATE_TEST_CASE_P(C, SADTest, ::testing::Values(
+                        make_tuple(16, 16, sad_16x16_c),
+                        make_tuple(8, 16, sad_8x16_c),
+                        make_tuple(16, 8, sad_16x8_c),
+                        make_tuple(8, 8, sad_8x8_c),
+                        make_tuple(4, 4, sad_4x4_c)));
+
+// ARM tests
+#if HAVE_MEDIA
+const sad_m_by_n_fn_t sad_16x16_armv6 = vp8_sad16x16_armv6;
+INSTANTIATE_TEST_CASE_P(MEDIA, SADTest, ::testing::Values(
+                        make_tuple(16, 16, sad_16x16_armv6)));
+
+#endif
+#if HAVE_NEON
+const sad_m_by_n_fn_t sad_16x16_neon = vp8_sad16x16_neon;
+const sad_m_by_n_fn_t sad_8x16_neon = vp8_sad8x16_neon;
+const sad_m_by_n_fn_t sad_16x8_neon = vp8_sad16x8_neon;
+const sad_m_by_n_fn_t sad_8x8_neon = vp8_sad8x8_neon;
+const sad_m_by_n_fn_t sad_4x4_neon = vp8_sad4x4_neon;
+INSTANTIATE_TEST_CASE_P(NEON, SADTest, ::testing::Values(
+                        make_tuple(16, 16, sad_16x16_neon),
+                        make_tuple(8, 16, sad_8x16_neon),
+                        make_tuple(16, 8, sad_16x8_neon),
+                        make_tuple(8, 8, sad_8x8_neon),
+                        make_tuple(4, 4, sad_4x4_neon)));
+#endif
+
+// X86 tests
+#if HAVE_MMX
+const sad_m_by_n_fn_t sad_16x16_mmx = vp8_sad16x16_mmx;
+const sad_m_by_n_fn_t sad_8x16_mmx = vp8_sad8x16_mmx;
+const sad_m_by_n_fn_t sad_16x8_mmx = vp8_sad16x8_mmx;
+const sad_m_by_n_fn_t sad_8x8_mmx = vp8_sad8x8_mmx;
+const sad_m_by_n_fn_t sad_4x4_mmx = vp8_sad4x4_mmx;
+INSTANTIATE_TEST_CASE_P(MMX, SADTest, ::testing::Values(
+                        make_tuple(16, 16, sad_16x16_mmx),
+                        make_tuple(8, 16, sad_8x16_mmx),
+                        make_tuple(16, 8, sad_16x8_mmx),
+                        make_tuple(8, 8, sad_8x8_mmx),
+                        make_tuple(4, 4, sad_4x4_mmx)));
+#endif
+#if HAVE_SSE2
+const sad_m_by_n_fn_t sad_16x16_wmt = vp8_sad16x16_wmt;
+const sad_m_by_n_fn_t sad_8x16_wmt = vp8_sad8x16_wmt;
+const sad_m_by_n_fn_t sad_16x8_wmt = vp8_sad16x8_wmt;
+const sad_m_by_n_fn_t sad_8x8_wmt = vp8_sad8x8_wmt;
+const sad_m_by_n_fn_t sad_4x4_wmt = vp8_sad4x4_wmt;
+INSTANTIATE_TEST_CASE_P(SSE2, SADTest, ::testing::Values(
+                        make_tuple(16, 16, sad_16x16_wmt),
+                        make_tuple(8, 16, sad_8x16_wmt),
+                        make_tuple(16, 8, sad_16x8_wmt),
+                        make_tuple(8, 8, sad_8x8_wmt),
+                        make_tuple(4, 4, sad_4x4_wmt)));
+#endif
+#if HAVE_SSSE3
+const sad_m_by_n_fn_t sad_16x16_sse3 = vp8_sad16x16_sse3;
+INSTANTIATE_TEST_CASE_P(SSE3, SADTest, ::testing::Values(
+                        make_tuple(16, 16, sad_16x16_sse3)));
+#endif
+
+}  // namespace
diff --git a/test/set_roi.cc b/test/set_roi.cc
new file mode 100644
index 0000000..3b6112e
--- /dev/null
+++ b/test/set_roi.cc
@@ -0,0 +1,182 @@
+/*
+ *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include <math.h>
+#include <stddef.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/types.h>
+
+#include "third_party/googletest/src/include/gtest/gtest.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_mem/vpx_mem.h"
+extern "C" {
+#include "vp8/encoder/onyx_int.h"
+}
+
+namespace {
+
+TEST(Vp8RoiMapTest, ParameterCheck) {
+  int delta_q[MAX_MB_SEGMENTS] = { -2, -25, 0, 31 };
+  int delta_lf[MAX_MB_SEGMENTS] = { -2, -25, 0, 31 };
+  unsigned int threshold[MAX_MB_SEGMENTS] = { 0, 100, 200, 300 };
+
+  const int internalq_trans[] = {
+    0,   1,  2,  3,  4,  5,  7,  8,
+    9,  10, 12, 13, 15, 17, 18, 19,
+    20,  21, 23, 24, 25, 26, 27, 28,
+    29,  30, 31, 33, 35, 37, 39, 41,
+    43,  45, 47, 49, 51, 53, 55, 57,
+    59,  61, 64, 67, 70, 73, 76, 79,
+    82,  85, 88, 91, 94, 97, 100, 103,
+    106, 109, 112, 115, 118, 121, 124, 127,
+  };
+
+  // Initialize elements of cpi with valid defaults.
+  VP8_COMP cpi;
+  cpi.mb.e_mbd.mb_segement_abs_delta = SEGMENT_DELTADATA;
+  cpi.cyclic_refresh_mode_enabled = 0;
+  cpi.mb.e_mbd.segmentation_enabled = 0;
+  cpi.mb.e_mbd.update_mb_segmentation_map = 0;
+  cpi.mb.e_mbd.update_mb_segmentation_data = 0;
+  cpi.common.mb_rows = 240 >> 4;
+  cpi.common.mb_cols = 320 >> 4;
+  const int mbs = (cpi.common.mb_rows * cpi.common.mb_cols);
+  vpx_memset(cpi.segment_feature_data, 0, sizeof(cpi.segment_feature_data));
+
+  // Segment map
+  cpi.segmentation_map = reinterpret_cast<unsigned char *>(vpx_calloc(mbs, 1));
+
+  // Allocate memory for the source memory map.
+  unsigned char *roi_map =
+    reinterpret_cast<unsigned char *>(vpx_calloc(mbs, 1));
+  vpx_memset(&roi_map[mbs >> 2], 1, (mbs >> 2));
+  vpx_memset(&roi_map[mbs >> 1], 2, (mbs >> 2));
+  vpx_memset(&roi_map[mbs -(mbs >> 2)], 3, (mbs >> 2));
+
+  // Do a test call with valid parameters.
+  int roi_retval = vp8_set_roimap(&cpi, roi_map, cpi.common.mb_rows,
+                                  cpi.common.mb_cols, delta_q, delta_lf,
+                                  threshold);
+  EXPECT_EQ(0, roi_retval)
+        << "vp8_set_roimap roi failed with default test parameters";
+
+  // Check that the values in the cpi structure get set as expected.
+  if (roi_retval == 0) {
+    // Check that the segment map got set.
+    const int mapcompare = memcmp(roi_map, cpi.segmentation_map, mbs);
+    EXPECT_EQ(0, mapcompare) << "segment map error";
+
+    // Check the q deltas (note the need to translate into
+    // the internal range of 0-127).
+    for (int i = 0; i < MAX_MB_SEGMENTS; ++i) {
+      const int transq = internalq_trans[abs(delta_q[i])];
+      if (abs(cpi.segment_feature_data[MB_LVL_ALT_Q][i]) != transq) {
+          EXPECT_EQ(transq, cpi.segment_feature_data[MB_LVL_ALT_Q][i])
+                    << "segment delta_q  error";
+          break;
+      }
+    }
+
+    // Check the loop filter deltas
+    for (int i = 0; i < MAX_MB_SEGMENTS; ++i) {
+      if (cpi.segment_feature_data[MB_LVL_ALT_LF][i] != delta_lf[i]) {
+        EXPECT_EQ(delta_lf[i], cpi.segment_feature_data[MB_LVL_ALT_LF][i])
+                  << "segment delta_lf error";
+        break;
+      }
+    }
+
+    // Check the breakout thresholds
+    for (int i = 0; i < MAX_MB_SEGMENTS; ++i) {
+      unsigned int breakout =
+        static_cast<unsigned int>(cpi.segment_encode_breakout[i]);
+
+      if (threshold[i] != breakout) {
+        EXPECT_EQ(threshold[i], breakout)
+                  << "breakout threshold error";
+        break;
+      }
+    }
+
+    // Segmentation and segmentation update flags should be set.
+    EXPECT_EQ(1, cpi.mb.e_mbd.segmentation_enabled)
+              << "segmentation_enabled error";
+    EXPECT_EQ(1, cpi.mb.e_mbd.update_mb_segmentation_map)
+              << "update_mb_segmentation_map error";
+    EXPECT_EQ(1, cpi.mb.e_mbd.update_mb_segmentation_data)
+              << "update_mb_segmentation_data error";
+
+
+    // Try a range of delta q and lf parameters (some legal, some not)
+    for (int i = 0; i < 1000; ++i) {
+      int rand_deltas[4];
+      int deltas_valid;
+      rand_deltas[0] = (rand() % 160) - 80;
+      rand_deltas[1] = (rand() % 160) - 80;
+      rand_deltas[2] = (rand() % 160) - 80;
+      rand_deltas[3] = (rand() % 160) - 80;
+
+      deltas_valid = ((abs(rand_deltas[0]) <= 63) &&
+                      (abs(rand_deltas[1]) <= 63) &&
+                      (abs(rand_deltas[2]) <= 63) &&
+                      (abs(rand_deltas[3]) <= 63)) ? 0 : -1;
+
+      // Test with random delta q values.
+      roi_retval = vp8_set_roimap(&cpi, roi_map, cpi.common.mb_rows,
+                                  cpi.common.mb_cols, rand_deltas,
+                                  delta_lf, threshold);
+      EXPECT_EQ(deltas_valid, roi_retval) << "dq range check error";
+
+      // One delta_q error shown at a time
+      if (deltas_valid != roi_retval)
+        break;
+
+      // Test with random loop filter values.
+      roi_retval = vp8_set_roimap(&cpi, roi_map, cpi.common.mb_rows,
+                                  cpi.common.mb_cols, delta_q,
+                                  rand_deltas, threshold);
+      EXPECT_EQ(deltas_valid, roi_retval) << "dlf range check error";
+
+      // One delta loop filter error shown at a time
+      if (deltas_valid != roi_retval)
+        break;
+    }
+
+    // Test that we report an error if cyclic refresh is enabled.
+    cpi.cyclic_refresh_mode_enabled = 1;
+    roi_retval = vp8_set_roimap(&cpi, roi_map, cpi.common.mb_rows,
+                                cpi.common.mb_cols, delta_q,
+                                delta_lf, threshold);
+    EXPECT_EQ(-1, roi_retval) << "cyclic refresh check error";
+    cpi.cyclic_refresh_mode_enabled = 0;
+
+    // Test invalid number of rows or columns.
+    roi_retval = vp8_set_roimap(&cpi, roi_map, cpi.common.mb_rows + 1,
+                                cpi.common.mb_cols, delta_q,
+                                delta_lf, threshold);
+    EXPECT_EQ(-1, roi_retval) << "MB rows bounds check error";
+
+    roi_retval = vp8_set_roimap(&cpi, roi_map, cpi.common.mb_rows,
+                                cpi.common.mb_cols - 1, delta_q,
+                                delta_lf, threshold);
+    EXPECT_EQ(-1, roi_retval) << "MB cols bounds check error";
+  }
+
+  // Free allocated memory
+  if (cpi.segmentation_map)
+    vpx_free(cpi.segmentation_map);
+  if (roi_map)
+    vpx_free(roi_map);
+};
+
+}  // namespace
diff --git a/test/sixtap_predict_test.cc b/test/sixtap_predict_test.cc
new file mode 100644
index 0000000..84b8988
--- /dev/null
+++ b/test/sixtap_predict_test.cc
@@ -0,0 +1,204 @@
+/*
+*  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+*
+*  Use of this source code is governed by a BSD-style license
+*  that can be found in the LICENSE file in the root of the source
+*  tree. An additional intellectual property rights grant can be found
+*  in the file PATENTS.  All contributing project authors may
+*  be found in the AUTHORS file in the root of the source tree.
+*/
+
+#include <math.h>
+#include <stdlib.h>
+#include <string.h>
+#include "test/acm_random.h"
+#include "test/util.h"
+#include "third_party/googletest/src/include/gtest/gtest.h"
+extern "C" {
+#include "./vpx_config.h"
+#include "./vpx_rtcd.h"
+#include "vpx_ports/mem.h"
+#include "vpx/vpx_integer.h"
+}
+
+namespace {
+
+typedef void (*sixtap_predict_fn_t)(uint8_t *src_ptr,
+                                    int  src_pixels_per_line,
+                                    int  xoffset,
+                                    int  yoffset,
+                                    uint8_t *dst_ptr,
+                                    int  dst_pitch);
+
+class SixtapPredictTest : public PARAMS(int, int, sixtap_predict_fn_t) {
+ protected:
+  // Make test arrays big enough for 16x16 functions. Six-tap filters
+  // need 5 extra pixels outside of the macroblock.
+  static const int kSrcStride = 21;
+  static const int kDstStride = 16;
+
+  virtual void SetUp() {
+    width_ = GET_PARAM(0);
+    height_ = GET_PARAM(1);
+    sixtap_predict_ = GET_PARAM(2);
+    memset(src_, 0, sizeof(src_));
+    memset(dst_, 0, sizeof(dst_));
+    memset(dst_c_, 0, sizeof(dst_c_));
+  }
+
+  int width_;
+  int height_;
+  sixtap_predict_fn_t sixtap_predict_;
+  // The src stores the macroblock we will filter on, and makes it 1 byte larger
+  // in order to test unaligned access. The result is stored in dst and dst_c(c
+  // reference code result).
+  DECLARE_ALIGNED(16, uint8_t, src_[kSrcStride * kSrcStride + 1]);
+  DECLARE_ALIGNED(16, uint8_t, dst_[kDstStride * kDstStride]);
+  DECLARE_ALIGNED(16, uint8_t, dst_c_[kDstStride * kDstStride]);
+};
+
+TEST_P(SixtapPredictTest, TestWithPresetData) {
+  const size_t src_size = sizeof(src_) / sizeof(uint8_t);
+  const size_t dst_size = sizeof(dst_) / sizeof(uint8_t);
+
+  // Test input
+  static const uint8_t test_data[src_size] = {
+    216, 184, 4, 191, 82, 92, 41, 0, 1, 226, 236, 172, 20, 182, 42, 226, 177,
+    79, 94, 77, 179, 203, 206, 198, 22, 192, 19, 75, 17, 192, 44, 233, 120,
+    48, 168, 203, 141, 210, 203, 143, 180, 184, 59, 201, 110, 102, 171, 32,
+    182, 10, 109, 105, 213, 60, 47, 236, 253, 67, 55, 14, 3, 99, 247, 124,
+    148, 159, 71, 34, 114, 19, 177, 38, 203, 237, 239, 58, 83, 155, 91, 10,
+    166, 201, 115, 124, 5, 163, 104, 2, 231, 160, 16, 234, 4, 8, 103, 153,
+    167, 174, 187, 26, 193, 109, 64, 141, 90, 48, 200, 174, 204, 36, 184,
+    114, 237, 43, 238, 242, 207, 86, 245, 182, 247, 6, 161, 251, 14, 8, 148,
+    182, 182, 79, 208, 120, 188, 17, 6, 23, 65, 206, 197, 13, 242, 126, 128,
+    224, 170, 110, 211, 121, 197, 200, 47, 188, 207, 208, 184, 221, 216, 76,
+    148, 143, 156, 100, 8, 89, 117, 14, 112, 183, 221, 54, 197, 208, 180, 69,
+    176, 94, 180, 131, 215, 121, 76, 7, 54, 28, 216, 238, 249, 176, 58, 142,
+    64, 215, 242, 72, 49, 104, 87, 161, 32, 52, 216, 230, 4, 141, 44, 181,
+    235, 224, 57, 195, 89, 134, 203, 144, 162, 163, 126, 156, 84, 185, 42,
+    148, 145, 29, 221, 194, 134, 52, 100, 166, 105, 60, 140, 110, 201, 184,
+    35, 181, 153, 93, 121, 243, 227, 68, 131, 134, 232, 2, 35, 60, 187, 77,
+    209, 76, 106, 174, 15, 241, 227, 115, 151, 77, 175, 36, 187, 121, 221,
+    223, 47, 118, 61, 168, 105, 32, 237, 236, 167, 213, 238, 202, 17, 170,
+    24, 226, 247, 131, 145, 6, 116, 117, 121, 11, 194, 41, 48, 126, 162, 13,
+    93, 209, 131, 154, 122, 237, 187, 103, 217, 99, 60, 200, 45, 78, 115, 69,
+    49, 106, 200, 194, 112, 60, 56, 234, 72, 251, 19, 120, 121, 182, 134, 215,
+    135, 10, 114, 2, 247, 46, 105, 209, 145, 165, 153, 191, 243, 12, 5, 36,
+    119, 206, 231, 231, 11, 32, 209, 83, 27, 229, 204, 149, 155, 83, 109, 35,
+    93, 223, 37, 84, 14, 142, 37, 160, 52, 191, 96, 40, 204, 101, 77, 67, 52,
+    53, 43, 63, 85, 253, 147, 113, 226, 96, 6, 125, 179, 115, 161, 17, 83,
+    198, 101, 98, 85, 139, 3, 137, 75, 99, 178, 23, 201, 255, 91, 253, 52,
+    134, 60, 138, 131, 208, 251, 101, 48, 2, 227, 228, 118, 132, 245, 202,
+    75, 91, 44, 160, 231, 47, 41, 50, 147, 220, 74, 92, 219, 165, 89, 16
+  };
+
+  // Expected result
+  static const uint8_t expected_dst[dst_size] = {
+    117, 102, 74, 135, 42, 98, 175, 206, 70, 73, 222, 197, 50, 24, 39, 49, 38,
+    105, 90, 47, 169, 40, 171, 215, 200, 73, 109, 141, 53, 85, 177, 164, 79,
+    208, 124, 89, 212, 18, 81, 145, 151, 164, 217, 153, 91, 154, 102, 102,
+    159, 75, 164, 152, 136, 51, 213, 219, 186, 116, 193, 224, 186, 36, 231,
+    208, 84, 211, 155, 167, 35, 59, 42, 76, 216, 149, 73, 201, 78, 149, 184,
+    100, 96, 196, 189, 198, 188, 235, 195, 117, 129, 120, 129, 49, 25, 133,
+    113, 69, 221, 114, 70, 143, 99, 157, 108, 189, 140, 78, 6, 55, 65, 240,
+    255, 245, 184, 72, 90, 100, 116, 131, 39, 60, 234, 167, 33, 160, 88, 185,
+    200, 157, 159, 176, 127, 151, 138, 102, 168, 106, 170, 86, 82, 219, 189,
+    76, 33, 115, 197, 106, 96, 198, 136, 97, 141, 237, 151, 98, 137, 191,
+    185, 2, 57, 95, 142, 91, 255, 185, 97, 137, 76, 162, 94, 173, 131, 193,
+    161, 81, 106, 72, 135, 222, 234, 137, 66, 137, 106, 243, 210, 147, 95,
+    15, 137, 110, 85, 66, 16, 96, 167, 147, 150, 173, 203, 140, 118, 196,
+    84, 147, 160, 19, 95, 101, 123, 74, 132, 202, 82, 166, 12, 131, 166,
+    189, 170, 159, 85, 79, 66, 57, 152, 132, 203, 194, 0, 1, 56, 146, 180,
+    224, 156, 28, 83, 181, 79, 76, 80, 46, 160, 175, 59, 106, 43, 87, 75,
+    136, 85, 189, 46, 71, 200, 90
+  };
+
+  uint8_t *src = const_cast<uint8_t*>(test_data);
+
+  sixtap_predict_(&src[kSrcStride * 2 + 2 + 1], kSrcStride,
+                  2, 2, dst_, kDstStride);
+
+  for (int i = 0; i < height_; ++i)
+    for (int j = 0; j < width_; ++j)
+      ASSERT_EQ(expected_dst[i * kDstStride + j], dst_[i * kDstStride + j])
+          << "i==" << (i * width_ + j);
+}
+
+using libvpx_test::ACMRandom;
+
+TEST_P(SixtapPredictTest, TestWithRandomData) {
+  const size_t src_size = sizeof(src_) / sizeof(uint8_t);
+
+  ACMRandom rnd(ACMRandom::DeterministicSeed());
+  for (size_t i = 0; i < src_size; ++i)
+    src_[i] = rnd.Rand8();
+
+  // Run tests for all possible offsets.
+  for (int xoffset = 0; xoffset < 8; ++xoffset) {
+    for (int yoffset = 0; yoffset < 8; ++yoffset) {
+      // Call c reference function.
+      // Move start point to next pixel to test if the function reads
+      // unaligned data correctly.
+      vp8_sixtap_predict16x16_c(&src_[kSrcStride * 2 + 2 + 1], kSrcStride,
+                                xoffset, yoffset, dst_c_, kDstStride);
+
+      // Run test.
+      sixtap_predict_(&src_[kSrcStride * 2 + 2 + 1], kSrcStride,
+                      xoffset, yoffset, dst_, kDstStride);
+
+      for (int i = 0; i < height_; ++i)
+        for (int j = 0; j < width_; ++j)
+          ASSERT_EQ(dst_c_[i * kDstStride + j], dst_[i * kDstStride + j])
+              << "i==" << (i * width_ + j);
+    }
+  }
+}
+
+using std::tr1::make_tuple;
+
+const sixtap_predict_fn_t sixtap_16x16_c = vp8_sixtap_predict16x16_c;
+const sixtap_predict_fn_t sixtap_8x8_c = vp8_sixtap_predict8x8_c;
+const sixtap_predict_fn_t sixtap_8x4_c = vp8_sixtap_predict8x4_c;
+const sixtap_predict_fn_t sixtap_4x4_c = vp8_sixtap_predict4x4_c;
+INSTANTIATE_TEST_CASE_P(
+    C, SixtapPredictTest, ::testing::Values(
+        make_tuple(16, 16, sixtap_16x16_c),
+        make_tuple(8, 8, sixtap_8x8_c),
+        make_tuple(8, 4, sixtap_8x4_c),
+        make_tuple(4, 4, sixtap_4x4_c)));
+#if HAVE_MMX
+const sixtap_predict_fn_t sixtap_16x16_mmx = vp8_sixtap_predict16x16_mmx;
+const sixtap_predict_fn_t sixtap_8x8_mmx = vp8_sixtap_predict8x8_mmx;
+const sixtap_predict_fn_t sixtap_8x4_mmx = vp8_sixtap_predict8x4_mmx;
+const sixtap_predict_fn_t sixtap_4x4_mmx = vp8_sixtap_predict4x4_mmx;
+INSTANTIATE_TEST_CASE_P(
+    MMX, SixtapPredictTest, ::testing::Values(
+        make_tuple(16, 16, sixtap_16x16_mmx),
+        make_tuple(8, 8, sixtap_8x8_mmx),
+        make_tuple(8, 4, sixtap_8x4_mmx),
+        make_tuple(4, 4, sixtap_4x4_mmx)));
+#endif
+#if HAVE_SSE2
+const sixtap_predict_fn_t sixtap_16x16_sse2 = vp8_sixtap_predict16x16_sse2;
+const sixtap_predict_fn_t sixtap_8x8_sse2 = vp8_sixtap_predict8x8_sse2;
+const sixtap_predict_fn_t sixtap_8x4_sse2 = vp8_sixtap_predict8x4_sse2;
+INSTANTIATE_TEST_CASE_P(
+    SSE2, SixtapPredictTest, ::testing::Values(
+        make_tuple(16, 16, sixtap_16x16_sse2),
+        make_tuple(8, 8, sixtap_8x8_sse2),
+        make_tuple(8, 4, sixtap_8x4_sse2)));
+#endif
+#if HAVE_SSSE3
+const sixtap_predict_fn_t sixtap_16x16_ssse3 = vp8_sixtap_predict16x16_ssse3;
+const sixtap_predict_fn_t sixtap_8x8_ssse3 = vp8_sixtap_predict8x8_ssse3;
+const sixtap_predict_fn_t sixtap_8x4_ssse3 = vp8_sixtap_predict8x4_ssse3;
+const sixtap_predict_fn_t sixtap_4x4_ssse3 = vp8_sixtap_predict4x4_ssse3;
+INSTANTIATE_TEST_CASE_P(
+    SSSE3, SixtapPredictTest, ::testing::Values(
+        make_tuple(16, 16, sixtap_16x16_ssse3),
+        make_tuple(8, 8, sixtap_8x8_ssse3),
+        make_tuple(8, 4, sixtap_8x4_ssse3),
+        make_tuple(4, 4, sixtap_4x4_ssse3)));
+#endif
+}  // namespace
diff --git a/test/test-data.sha1 b/test/test-data.sha1
new file mode 100644
index 0000000..8d40242
--- /dev/null
+++ b/test/test-data.sha1
@@ -0,0 +1 @@
+d5dfb0151c9051f8c85999255645d7a23916d3c0  hantro_collage_w352h288.yuv
diff --git a/test/test.mk b/test/test.mk
index ac500fe..9486f14 100644
--- a/test/test.mk
+++ b/test/test.mk
@@ -1,9 +1,47 @@
+LIBVPX_TEST_SRCS-yes += acm_random.h
 LIBVPX_TEST_SRCS-yes += test.mk
-LIBVPX_TEST_SRCS-yes += boolcoder_test.cc
-LIBVPX_TEST_SRCS-yes += encode_test_driver.cc
-LIBVPX_TEST_SRCS-yes += encode_test_driver.h
-LIBVPX_TEST_SRCS-yes += idctllm_test.cc
-LIBVPX_TEST_SRCS-yes += keyframe_test.cc
-LIBVPX_TEST_SRCS-yes += resize_test.cc
 LIBVPX_TEST_SRCS-yes += test_libvpx.cc
-LIBVPX_TEST_SRCS-yes += video_source.h
+LIBVPX_TEST_SRCS-yes += util.h
+
+##
+## BLACK BOX TESTS
+##
+## Black box tests only use the public API.
+##
+LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += altref_test.cc
+LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += config_test.cc
+LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += encode_test_driver.cc
+LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += encode_test_driver.h
+LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += i420_video_source.h
+LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += keyframe_test.cc
+LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += resize_test.cc
+LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += video_source.h
+
+##
+## WHITE BOX TESTS
+##
+## Whitebox tests invoke functions not exposed via the public API. Certain
+## shared library builds don't make these functions accessible.
+##
+ifeq ($(CONFIG_SHARED),)
+
+# These tests require both the encoder and decoder to be built.
+ifeq ($(CONFIG_VP8_ENCODER)$(CONFIG_VP8_DECODER),yesyes)
+LIBVPX_TEST_SRCS-yes                   += boolcoder_test.cc
+endif
+
+LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += fdct4x4_test.cc
+LIBVPX_TEST_SRCS-yes                   += idctllm_test.cc
+LIBVPX_TEST_SRCS-yes                   += intrapred_test.cc
+LIBVPX_TEST_SRCS-$(CONFIG_POSTPROC)    += pp_filter_test.cc
+LIBVPX_TEST_SRCS-yes                   += sad_test.cc
+LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += set_roi.cc
+LIBVPX_TEST_SRCS-yes                   += sixtap_predict_test.cc
+
+endif
+
+
+##
+## TEST DATA
+##
+LIBVPX_TEST_DATA-$(CONFIG_VP8_ENCODER) += hantro_collage_w352h288.yuv
diff --git a/test/test_libvpx.cc b/test/test_libvpx.cc
index 48bfc6c..cfd5d28 100644
--- a/test/test_libvpx.cc
+++ b/test/test_libvpx.cc
@@ -7,8 +7,39 @@
  *  in the file PATENTS.  All contributing project authors may
  *  be found in the AUTHORS file in the root of the source tree.
  */
+#include <string>
+#include "vpx_config.h"
+#if ARCH_X86 || ARCH_X86_64
+extern "C" {
+#include "vpx_ports/x86.h"
+}
+#endif
 #include "third_party/googletest/src/include/gtest/gtest.h"
+
+static void append_gtest_filter(const char *str) {
+  std::string filter = ::testing::FLAGS_gtest_filter;
+  filter += str;
+  ::testing::FLAGS_gtest_filter = filter;
+}
+
 int main(int argc, char **argv) {
   ::testing::InitGoogleTest(&argc, argv);
+
+#if ARCH_X86 || ARCH_X86_64
+  const int simd_caps = x86_simd_caps();
+  if(!(simd_caps & HAS_MMX))
+    append_gtest_filter(":-MMX/*");
+  if(!(simd_caps & HAS_SSE))
+    append_gtest_filter(":-SSE/*");
+  if(!(simd_caps & HAS_SSE2))
+    append_gtest_filter(":-SSE2/*");
+  if(!(simd_caps & HAS_SSE3))
+    append_gtest_filter(":-SSE3/*");
+  if(!(simd_caps & HAS_SSSE3))
+    append_gtest_filter(":-SSSE3/*");
+  if(!(simd_caps & HAS_SSE4_1))
+    append_gtest_filter(":-SSE4_1/*");
+#endif
+
   return RUN_ALL_TESTS();
 }
diff --git a/test/util.h b/test/util.h
new file mode 100644
index 0000000..06a70cc
--- /dev/null
+++ b/test/util.h
@@ -0,0 +1,18 @@
+/*
+ *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef TEST_UTIL_H_
+#define TEST_UTIL_H_
+
+// Macros
+#define PARAMS(...) ::testing::TestWithParam< std::tr1::tuple< __VA_ARGS__ > >
+#define GET_PARAM(k) std::tr1::get< k >(GetParam())
+
+#endif  // TEST_UTIL_H_
diff --git a/test/video_source.h b/test/video_source.h
index 3507ef0..688e185 100644
--- a/test/video_source.h
+++ b/test/video_source.h
@@ -9,6 +9,8 @@
  */
 #ifndef TEST_VIDEO_SOURCE_H_
 #define TEST_VIDEO_SOURCE_H_
+
+#include "test/acm_random.h"
 #include "vpx/vpx_encoder.h"
 
 namespace libvpx_test {
@@ -99,16 +101,31 @@
 
 
 class RandomVideoSource : public DummyVideoSource {
+ public:
+  RandomVideoSource(int seed = ACMRandom::DeterministicSeed())
+      : rnd_(seed),
+        seed_(seed) { }
+
  protected:
+  // Reset the RNG to get a matching stream for the second pass
+  virtual void Begin() {
+    frame_ = 0;
+    rnd_.Reset(seed_);
+    FillFrame();
+  }
+
   // 15 frames of noise, followed by 15 static frames. Reset to 0 rather
   // than holding previous frames to encourage keyframes to be thrown.
   virtual void FillFrame() {
     if (frame_ % 30 < 15)
       for (size_t i = 0; i < raw_sz_; ++i)
-        img_->img_data[i] = rand();
+        img_->img_data[i] = rnd_.Rand8();
     else
       memset(img_->img_data, 0, raw_sz_);
   }
+
+  ACMRandom rnd_;
+  int seed_;
 };
 
 }  // namespace libvpx_test
diff --git a/vp8/common/arm/armv6/intra4x4_predict_v6.asm b/vp8/common/arm/armv6/intra4x4_predict_v6.asm
index a974cd1..c5ec824 100644
--- a/vp8/common/arm/armv6/intra4x4_predict_v6.asm
+++ b/vp8/common/arm/armv6/intra4x4_predict_v6.asm
@@ -18,15 +18,23 @@
     AREA ||.text||, CODE, READONLY, ALIGN=2
 
 
-;void vp8_intra4x4_predict(unsigned char *src, int src_stride, int b_mode,
-;                          unsigned char *dst, int dst_stride)
+;void vp8_intra4x4_predict_armv6(unsigned char *Above, unsigned char *yleft,
+;                                B_PREDICTION_MODE left_stride, int b_mode,
+;                                unsigned char *dst, int dst_stride,
+;                                unsigned char top_left)
 
+; r0: *Above
+; r1: *yleft
+; r2: left_stride
+; r3: b_mode
+; sp + #40: dst
+; sp + #44: dst_stride
+; sp + #48: top_left
 |vp8_intra4x4_predict_armv6| PROC
     push        {r4-r12, lr}
 
-
-    cmp         r2, #10
-    addlt       pc, pc, r2, lsl #2       ; position independent switch
+    cmp         r3, #10
+    addlt       pc, pc, r3, lsl #2       ; position independent switch
     pop         {r4-r12, pc}             ; default
     b           b_dc_pred
     b           b_tm_pred
@@ -41,13 +49,13 @@
 
 b_dc_pred
     ; load values
-    ldr         r8, [r0, -r1]            ; Above
-    ldrb        r4, [r0, #-1]!           ; Left[0]
+    ldr         r8, [r0]                 ; Above
+    ldrb        r4, [r1], r2             ; Left[0]
     mov         r9, #0
-    ldrb        r5, [r0, r1]             ; Left[1]
-    ldrb        r6, [r0, r1, lsl #1]!    ; Left[2]
+    ldrb        r5, [r1], r2             ; Left[1]
+    ldrb        r6, [r1], r2             ; Left[2]
     usad8       r12, r8, r9
-    ldrb        r7, [r0, r1]             ; Left[3]
+    ldrb        r7, [r1]                 ; Left[3]
 
     ; calculate dc
     add         r4, r4, r5
@@ -55,31 +63,30 @@
     add         r4, r4, r7
     add         r4, r4, r12
     add         r4, r4, #4
-    ldr         r0, [sp, #40]           ; load stride
+    ldr         r0, [sp, #44]           ; dst_stride
     mov         r12, r4, asr #3         ; (expected_dc + 4) >> 3
 
     add         r12, r12, r12, lsl #8
-    add         r3, r3, r0
+    ldr         r3, [sp, #40]           ; dst
     add         r12, r12, r12, lsl #16
 
     ; store values
-    str         r12, [r3, -r0]
+    str         r12, [r3], r0
+    str         r12, [r3], r0
+    str         r12, [r3], r0
     str         r12, [r3]
-    str         r12, [r3, r0]
-    str         r12, [r3, r0, lsl #1]
 
     pop        {r4-r12, pc}
 
 b_tm_pred
-    sub         r10, r0, #1             ; Left
-    ldr         r8, [r0, -r1]           ; Above
-    ldrb        r9, [r10, -r1]          ; top_left
-    ldrb        r4, [r0, #-1]!          ; Left[0]
-    ldrb        r5, [r10, r1]!          ; Left[1]
-    ldrb        r6, [r0, r1, lsl #1]    ; Left[2]
-    ldrb        r7, [r10, r1, lsl #1]   ; Left[3]
-    ldr         r0, [sp, #40]           ; load stride
-
+    ldr         r8, [r0]                ; Above
+    ldrb        r9, [sp, #48]           ; top_left
+    ldrb        r4, [r1], r2            ; Left[0]
+    ldrb        r5, [r1], r2            ; Left[1]
+    ldrb        r6, [r1], r2            ; Left[2]
+    ldrb        r7, [r1]                ; Left[3]
+    ldr         r0, [sp, #44]           ; dst_stride
+    ldr         r3, [sp, #40]           ; dst
 
     add         r9, r9, r9, lsl #16     ; [tl|tl]
     uxtb16      r10, r8                 ; a[2|0]
@@ -126,25 +133,26 @@
     str         r12, [r3], r0
 
     add         r12, r4, r5, lsl #8     ; [3|2|1|0]
-    str         r12, [r3], r0
+    str         r12, [r3]
 
     pop        {r4-r12, pc}
 
 b_ve_pred
-    ldr         r8, [r0, -r1]!          ; a[3|2|1|0]
+    ldr         r8, [r0]                ; a[3|2|1|0]
     ldr         r11, c00FF00FF
-    ldrb        r9, [r0, #-1]           ; top_left
+    ldrb        r9, [sp, #48]           ; top_left
     ldrb        r10, [r0, #4]           ; a[4]
 
     ldr         r0, c00020002
 
     uxtb16      r4, r8                  ; a[2|0]
     uxtb16      r5, r8, ror #8          ; a[3|1]
-    ldr         r2, [sp, #40]           ; stride
+    ldr         r2, [sp, #44]           ; dst_stride
     pkhbt       r9, r9, r5, lsl #16     ; a[1|-1]
 
     add         r9, r9, r4, lsl #1      ;[a[1]+2*a[2]       | tl+2*a[0]       ]
     uxtab16     r9, r9, r5              ;[a[1]+2*a[2]+a[3]  | tl+2*a[0]+a[1]  ]
+    ldr         r3, [sp, #40]           ; dst
     uxtab16     r9, r9, r0              ;[a[1]+2*a[2]+a[3]+2| tl+2*a[0]+a[1]+2]
 
     add         r0, r0, r10, lsl #16    ;[a[4]+2            |                 2]
@@ -154,25 +162,23 @@
 
     and         r9, r11, r9, asr #2
     and         r4, r11, r4, asr #2
-    add         r3, r3, r2              ; dst + dst_stride
     add         r9, r9, r4, lsl #8
 
     ; store values
-    str         r9, [r3, -r2]
+    str         r9, [r3], r2
+    str         r9, [r3], r2
+    str         r9, [r3], r2
     str         r9, [r3]
-    str         r9, [r3, r2]
-    str         r9, [r3, r2, lsl #1]
 
     pop        {r4-r12, pc}
 
 
 b_he_pred
-    sub         r10, r0, #1             ; Left
-    ldrb        r4, [r0, #-1]!          ; Left[0]
-    ldrb        r8, [r10, -r1]          ; top_left
-    ldrb        r5, [r10, r1]!          ; Left[1]
-    ldrb        r6, [r0, r1, lsl #1]    ; Left[2]
-    ldrb        r7, [r10, r1, lsl #1]   ; Left[3]
+    ldrb        r4, [r1], r2            ; Left[0]
+    ldrb        r8, [sp, #48]           ; top_left
+    ldrb        r5, [r1], r2            ; Left[1]
+    ldrb        r6, [r1], r2            ; Left[2]
+    ldrb        r7, [r1]                ; Left[3]
 
     add         r8, r8, r4              ; tl   + l[0]
     add         r9, r4, r5              ; l[0] + l[1]
@@ -197,7 +203,8 @@
     pkhtb       r10, r10, r10, asr #16  ; l[-|2|-|2]
     pkhtb       r11, r11, r11, asr #16  ; l[-|3|-|3]
 
-    ldr         r0, [sp, #40]           ; stride
+    ldr         r0, [sp, #44]           ; dst_stride
+    ldr         r3, [sp, #40]           ; dst
 
     add         r8, r8, r8, lsl #8      ; l[0|0|0|0]
     add         r9, r9, r9, lsl #8      ; l[1|1|1|1]
@@ -206,16 +213,16 @@
 
     ; store values
     str         r8, [r3], r0
-    str         r9, [r3]
-    str         r10, [r3, r0]
-    str         r11, [r3, r0, lsl #1]
+    str         r9, [r3], r0
+    str         r10, [r3], r0
+    str         r11, [r3]
 
     pop        {r4-r12, pc}
 
 b_ld_pred
-    ldr         r4, [r0, -r1]!          ; Above
+    ldr         r4, [r0]                ; Above[0-3]
     ldr         r12, c00020002
-    ldr         r5, [r0, #4]
+    ldr         r5, [r0, #4]            ; Above[4-7]
     ldr         lr,  c00FF00FF
 
     uxtb16      r6, r4                  ; a[2|0]
@@ -225,7 +232,6 @@
     pkhtb       r10, r6, r8             ; a[2|4]
     pkhtb       r11, r7, r9             ; a[3|5]
 
-
     add         r4, r6, r7, lsl #1      ; [a2+2*a3      |      a0+2*a1]
     add         r4, r4, r10, ror #16    ; [a2+2*a3+a4   |   a0+2*a1+a2]
     uxtab16     r4, r4, r12             ; [a2+2*a3+a4+2 | a0+2*a1+a2+2]
@@ -244,7 +250,8 @@
     add         r7, r7, r9, asr #16     ; [                 a5+2*a6+a7]
     uxtah       r7, r7, r12             ; [               a5+2*a6+a7+2]
 
-    ldr         r0, [sp, #40]           ; stride
+    ldr         r0, [sp, #44]           ; dst_stride
+    ldr         r3, [sp, #40]           ; dst
 
     ; scale down
     and         r4, lr, r4, asr #2
@@ -266,18 +273,17 @@
     mov         r6, r6, lsr #16
     mov         r11, r10, lsr #8
     add         r11, r11, r6, lsl #24   ; [6|5|4|3]
-    str         r11, [r3], r0
+    str         r11, [r3]
 
     pop        {r4-r12, pc}
 
 b_rd_pred
-    sub         r12, r0, r1             ; Above = src - src_stride
-    ldrb        r7, [r0, #-1]!          ; l[0] = pp[3]
-    ldr         lr, [r12]               ; Above = pp[8|7|6|5]
-    ldrb        r8, [r12, #-1]!         ; tl   = pp[4]
-    ldrb        r6, [r12, r1, lsl #1]   ; l[1] = pp[2]
-    ldrb        r5, [r0, r1, lsl #1]    ; l[2] = pp[1]
-    ldrb        r4, [r12, r1, lsl #2]   ; l[3] = pp[0]
+    ldrb        r7, [r1], r2            ; l[0] = pp[3]
+    ldr         lr, [r0]                ; Above = pp[8|7|6|5]
+    ldrb        r8, [sp, #48]           ; tl   = pp[4]
+    ldrb        r6, [r1], r2            ; l[1] = pp[2]
+    ldrb        r5, [r1], r2            ; l[2] = pp[1]
+    ldrb        r4, [r1], r2            ; l[3] = pp[0]
 
 
     uxtb16      r9, lr                  ; p[7|5]
@@ -307,7 +313,8 @@
     add         r7, r7, r10             ; [p6+2*p7+p8   |   p4+2*p5+p6]
     uxtab16     r7, r7, r12             ; [p6+2*p7+p8+2 | p4+2*p5+p6+2]
 
-    ldr         r0, [sp, #40]           ; stride
+    ldr         r0, [sp, #44]           ; dst_stride
+    ldr         r3, [sp, #40]           ; dst
 
     ; scale down
     and         r7, lr, r7, asr #2
@@ -328,18 +335,17 @@
 
     mov         r11, r10, lsl #8        ; [3|2|1|-]
     uxtab       r11, r11, r4            ; [3|2|1|0]
-    str         r11, [r3], r0
+    str         r11, [r3]
 
     pop        {r4-r12, pc}
 
 b_vr_pred
-    sub         r12, r0, r1             ; Above = src - src_stride
-    ldrb        r7, [r0, #-1]!          ; l[0] = pp[3]
-    ldr         lr, [r12]               ; Above = pp[8|7|6|5]
-    ldrb        r8, [r12, #-1]!         ; tl   = pp[4]
-    ldrb        r6, [r12, r1, lsl #1]   ; l[1] = pp[2]
-    ldrb        r5, [r0, r1, lsl #1]    ; l[2] = pp[1]
-    ldrb        r4, [r12, r1, lsl #2]   ; l[3] = pp[0]
+    ldrb        r7, [r1], r2            ; l[0] = pp[3]
+    ldr         lr, [r0]                ; Above = pp[8|7|6|5]
+    ldrb        r8, [sp, #48]           ; tl   = pp[4]
+    ldrb        r6, [r1], r2            ; l[1] = pp[2]
+    ldrb        r5, [r1], r2            ; l[2] = pp[1]
+    ldrb        r4, [r1]                ; l[3] = pp[0]
 
     add         r5, r5, r7, lsl #16     ; p[3|1]
     add         r6, r6, r8, lsl #16     ; p[4|2]
@@ -376,7 +382,8 @@
     add         r8, r8, r10             ; [p6+2*p7+p8   |   p4+2*p5+p6]
     uxtab16     r8, r8, r12             ; [p6+2*p7+p8+2 | p4+2*p5+p6+2]
 
-    ldr         r0, [sp, #40]           ; stride
+    ldr         r0, [sp, #44]           ; dst_stride
+    ldr         r3, [sp, #40]           ; dst
 
     ; scale down
     and         r5, lr, r5, asr #2      ; [B|A]
@@ -397,14 +404,14 @@
     pkhtb       r10, r7, r5, asr #16    ; [-|H|-|B]
     str         r2, [r3], r0
     add         r12, r12, r10, lsl #8   ; [H|D|B|A]
-    str         r12, [r3], r0
+    str         r12, [r3]
 
     pop        {r4-r12, pc}
 
 b_vl_pred
-    ldr         r4, [r0, -r1]!          ; [3|2|1|0]
+    ldr         r4, [r0]                ; [3|2|1|0] = Above[0-3]
     ldr         r12, c00020002
-    ldr         r5, [r0, #4]            ; [7|6|5|4]
+    ldr         r5, [r0, #4]            ; [7|6|5|4] = Above[4-7]
     ldr         lr,  c00FF00FF
     ldr         r2,  c00010001
 
@@ -441,7 +448,8 @@
     add         r9, r9, r11             ; [p5+2*p6+p7   |   p3+2*p4+p5]
     uxtab16     r9, r9, r12             ; [p5+2*p6+p7+2 | p3+2*p4+p5+2]
 
-    ldr         r0, [sp, #40]           ; stride
+    ldr         r0, [sp, #44]           ; dst_stride
+    ldr         r3, [sp, #40]           ; dst
 
     ; scale down
     and         r5, lr, r5, asr #2      ; [D|C]
@@ -449,7 +457,6 @@
     and         r8, lr, r8, asr #2      ; [I|D]
     and         r9, lr, r9, asr #2      ; [J|H]
 
-
     add         r10, r4, r6, lsl #8     ; [F|B|E|A]
     str         r10, [r3], r0
 
@@ -463,18 +470,17 @@
     str         r12, [r3], r0
 
     add         r10, r7, r10, lsl #8    ; [J|H|D|G]
-    str         r10, [r3], r0
+    str         r10, [r3]
 
     pop        {r4-r12, pc}
 
 b_hd_pred
-    sub         r12, r0, r1             ; Above = src - src_stride
-    ldrb        r7, [r0, #-1]!          ; l[0] = pp[3]
-    ldr         lr, [r12]               ; Above = pp[8|7|6|5]
-    ldrb        r8, [r12, #-1]!         ; tl   = pp[4]
-    ldrb        r6, [r0, r1]            ; l[1] = pp[2]
-    ldrb        r5, [r0, r1, lsl #1]    ; l[2] = pp[1]
-    ldrb        r4, [r12, r1, lsl #2]   ; l[3] = pp[0]
+    ldrb        r7, [r1], r2            ; l[0] = pp[3]
+    ldr         lr, [r0]                ; Above = pp[8|7|6|5]
+    ldrb        r8, [sp, #48]           ; tl   = pp[4]
+    ldrb        r6, [r1], r2            ; l[1] = pp[2]
+    ldrb        r5, [r1], r2            ; l[2] = pp[1]
+    ldrb        r4, [r1]                ; l[3] = pp[0]
 
     uxtb16      r9, lr                  ; p[7|5]
     uxtb16      r10, lr, ror #8         ; p[8|6]
@@ -492,7 +498,6 @@
     pkhtb       r1, r9, r10             ; p[7|6]
     pkhbt       r10, r8, r10, lsl #16   ; p[6|5]
 
-
     uadd16      r11, r4, r5             ; [p1+p2        |        p0+p1]
     uhadd16     r11, r11, r2            ; [(p1+p2+1)>>1 | (p0+p1+1)>>1]
                                         ; [B|A]
@@ -518,7 +523,8 @@
     and         r5, lr, r5, asr #2      ; [H|G]
     and         r6, lr, r6, asr #2      ; [J|I]
 
-    ldr         lr, [sp, #40]           ; stride
+    ldr         lr, [sp, #44]           ; dst_stride
+    ldr         r3, [sp, #40]           ; dst
 
     pkhtb       r2, r0, r6              ; [-|F|-|I]
     pkhtb       r12, r6, r5, asr #16    ; [-|J|-|H]
@@ -527,7 +533,6 @@
     mov         r12, r12, ror #24       ; [J|I|H|F]
     str         r12, [r3], lr
 
-
     mov         r7, r11, asr #16        ; [-|-|-|B]
     str         r2, [r3], lr
     add         r7, r7, r0, lsl #16     ; [-|E|-|B]
@@ -536,21 +541,20 @@
     str         r7, [r3], lr
 
     add         r5, r11, r4, lsl #8     ; [D|B|C|A]
-    str         r5, [r3], lr
+    str         r5, [r3]
 
     pop        {r4-r12, pc}
 
 
 
 b_hu_pred
-    ldrb        r4, [r0, #-1]!          ; Left[0]
+    ldrb        r4, [r1], r2            ; Left[0]
     ldr         r12, c00020002
-    ldrb        r5, [r0, r1]!           ; Left[1]
+    ldrb        r5, [r1], r2            ; Left[1]
     ldr         lr,  c00FF00FF
-    ldrb        r6, [r0, r1]!           ; Left[2]
+    ldrb        r6, [r1], r2            ; Left[2]
     ldr         r2,  c00010001
-    ldrb        r7, [r0, r1]            ; Left[3]
-
+    ldrb        r7, [r1]                ; Left[3]
 
     add         r4, r4, r5, lsl #16     ; [1|0]
     add         r5, r5, r6, lsl #16     ; [2|1]
@@ -563,7 +567,8 @@
     add         r4, r4, r5, lsl #1      ; [p1+2*p2      |      p0+2*p1]
     add         r4, r4, r9              ; [p1+2*p2+p3   |   p0+2*p1+p2]
     uxtab16     r4, r4, r12             ; [p1+2*p2+p3+2 | p0+2*p1+p2+2]
-    ldr         r2, [sp, #40]           ; stride
+    ldr         r2, [sp, #44]           ; dst_stride
+    ldr         r3, [sp, #40]           ; dst
     and         r4, lr, r4, asr #2      ; [D|C]
 
     add         r10, r6, r7             ; [p2+p3]
@@ -587,9 +592,9 @@
 
     add         r10, r11, lsl #8        ; [-|-|F|E]
     add         r10, r10, r9, lsl #16   ; [G|G|F|E]
-    str         r10, [r3]
+    str         r10, [r3], r2
 
-    str         r7, [r3, r2]
+    str         r7, [r3]
 
     pop        {r4-r12, pc}
 
diff --git a/vp8/common/arm/neon/dc_only_idct_add_neon.asm b/vp8/common/arm/neon/dc_only_idct_add_neon.asm
index 65a4680..79ff02c 100644
--- a/vp8/common/arm/neon/dc_only_idct_add_neon.asm
+++ b/vp8/common/arm/neon/dc_only_idct_add_neon.asm
@@ -46,7 +46,7 @@
     vst1.32         {d2[1]}, [r3], r12
     vst1.32         {d4[0]}, [r3], r12
     vst1.32         {d4[1]}, [r3]
- 
+
     bx              lr
 
     ENDP
diff --git a/vp8/common/blockd.h b/vp8/common/blockd.h
index c7206b2..f7ff577 100644
--- a/vp8/common/blockd.h
+++ b/vp8/common/blockd.h
@@ -161,7 +161,7 @@
     uint8_t segment_id;                  /* Which set of segmentation parameters should be used for this MB */
 } MB_MODE_INFO;
 
-typedef struct
+typedef struct modeinfo
 {
     MB_MODE_INFO mbmi;
     union b_mode_info bmi[16];
@@ -174,8 +174,7 @@
     MB_PREDICTION_MODE mode;
     MV_REFERENCE_FRAME ref_frame;
     int_mv mv;
-    //union b_mode_info bmi[16];
-    int dissim;    // dissimilarity level of the macroblock
+    int dissim;    /* dissimilarity level of the macroblock */
 } LOWER_RES_MB_INFO;
 
 /* The frame-level information needed to be stored for higher-resolution
@@ -183,6 +182,9 @@
 typedef struct
 {
     FRAME_TYPE frame_type;
+    int is_frame_dropped;
+    /* The frame number of each reference frames */
+    unsigned int low_res_ref_frames[MAX_REF_FRAMES];
     LOWER_RES_MB_INFO *mb_info;
 } LOWER_RES_FRAME_INFO;
 #endif
diff --git a/vp8/common/entropy.c b/vp8/common/entropy.c
index a95a923..8c046a4 100644
--- a/vp8/common/entropy.c
+++ b/vp8/common/entropy.c
@@ -101,7 +101,7 @@
 /* vp8_coef_encodings generated with:
     vp8_tokens_from_tree(vp8_coef_encodings, vp8_coef_tree);
 */
-const vp8_token vp8_coef_encodings[MAX_ENTROPY_TOKENS] =
+vp8_token vp8_coef_encodings[MAX_ENTROPY_TOKENS] =
 {
     {2, 2},
     {6, 3},
diff --git a/vp8/common/entropymode.h b/vp8/common/entropymode.h
index 3a2fa84..1df0f64 100644
--- a/vp8/common/entropymode.h
+++ b/vp8/common/entropymode.h
@@ -24,11 +24,11 @@
     SUBMVREF_LEFT_ABOVE_ZED
 } sumvfref_t;
 
-typedef const int vp8_mbsplit[16];
+typedef int vp8_mbsplit[16];
 
 #define VP8_NUMMBSPLITS 4
 
-extern vp8_mbsplit vp8_mbsplits [VP8_NUMMBSPLITS];
+extern const vp8_mbsplit vp8_mbsplits [VP8_NUMMBSPLITS];
 
 extern const int vp8_mbsplit_count [VP8_NUMMBSPLITS];    /* # of subsets */
 
diff --git a/vp8/common/extend.c b/vp8/common/extend.c
index 9089e16..c9bdd21 100644
--- a/vp8/common/extend.c
+++ b/vp8/common/extend.c
@@ -116,7 +116,7 @@
     int src_uv_offset = ((srcy * src->uv_stride) >> 1) + (srcx >> 1);
     int dst_uv_offset = ((srcy * dst->uv_stride) >> 1) + (srcx >> 1);
 
-    // If the side is not touching the bounder then don't extend.
+    /* If the side is not touching the bounder then don't extend. */
     if (srcy)
       et = 0;
     if (srcx)
@@ -157,7 +157,10 @@
 
 
 /* note the extension is only for the last row, for intra prediction purpose */
-void vp8_extend_mb_row(YV12_BUFFER_CONFIG *ybf, unsigned char *YPtr, unsigned char *UPtr, unsigned char *VPtr)
+void vp8_extend_mb_row(YV12_BUFFER_CONFIG *ybf,
+                       unsigned char *YPtr,
+                       unsigned char *UPtr,
+                       unsigned char *VPtr)
 {
     int i;
 
diff --git a/vp8/common/filter.h b/vp8/common/filter.h
index 0f225c2..b7591f2 100644
--- a/vp8/common/filter.h
+++ b/vp8/common/filter.h
@@ -19,4 +19,4 @@
 extern const short vp8_bilinear_filters[8][2];
 extern const short vp8_sub_pel_filters[8][6];
 
-#endif //FILTER_H
+#endif
diff --git a/vp8/common/generic/systemdependent.c b/vp8/common/generic/systemdependent.c
index 2a30166..5a6ac7b 100644
--- a/vp8/common/generic/systemdependent.c
+++ b/vp8/common/generic/systemdependent.c
@@ -83,57 +83,6 @@
 #endif
 
 
-#if HAVE_PTHREAD_H
-#include <pthread.h>
-static void once(void (*func)(void))
-{
-    static pthread_once_t lock = PTHREAD_ONCE_INIT;
-    pthread_once(&lock, func);
-}
-
-
-#elif defined(_WIN32)
-static void once(void (*func)(void))
-{
-    /* Using a static initializer here rather than InitializeCriticalSection()
-     * since there's no race-free context in which to execute it. Protecting
-     * it with an atomic op like InterlockedCompareExchangePointer introduces
-     * an x86 dependency, and InitOnceExecuteOnce requires Vista.
-     */
-    static CRITICAL_SECTION lock = {(void *)-1, -1, 0, 0, 0, 0};
-    static int done;
-
-    EnterCriticalSection(&lock);
-
-    if (!done)
-    {
-        func();
-        done = 1;
-    }
-
-    LeaveCriticalSection(&lock);
-}
-
-
-#else
-/* No-op version that performs no synchronization. vpx_rtcd() is idempotent,
- * so as long as your platform provides atomic loads/stores of pointers
- * no synchronization is strictly necessary.
- */
-
-static void once(void (*func)(void))
-{
-    static int done;
-
-    if(!done)
-    {
-        func();
-        done = 1;
-    }
-}
-#endif
-
-
 void vp8_machine_specific_config(VP8_COMMON *ctx)
 {
 #if CONFIG_MULTITHREAD
@@ -145,6 +94,4 @@
 #elif ARCH_X86 || ARCH_X86_64
     ctx->cpu_caps = x86_simd_caps();
 #endif
-
-    once(vpx_rtcd);
 }
diff --git a/vp8/common/loopfilter.c b/vp8/common/loopfilter.c
index b9ac0ff..41b4f12 100644
--- a/vp8/common/loopfilter.c
+++ b/vp8/common/loopfilter.c
@@ -196,6 +196,114 @@
     }
 }
 
+
+void vp8_loop_filter_row_normal(VP8_COMMON *cm, MODE_INFO *mode_info_context,
+                         int mb_row, int post_ystride, int post_uvstride,
+                         unsigned char *y_ptr, unsigned char *u_ptr,
+                         unsigned char *v_ptr)
+{
+    int mb_col;
+    int filter_level;
+    loop_filter_info_n *lfi_n = &cm->lf_info;
+    loop_filter_info lfi;
+    FRAME_TYPE frame_type = cm->frame_type;
+
+    for (mb_col = 0; mb_col < cm->mb_cols; mb_col++)
+    {
+        int skip_lf = (mode_info_context->mbmi.mode != B_PRED &&
+                        mode_info_context->mbmi.mode != SPLITMV &&
+                        mode_info_context->mbmi.mb_skip_coeff);
+
+        const int mode_index = lfi_n->mode_lf_lut[mode_info_context->mbmi.mode];
+        const int seg = mode_info_context->mbmi.segment_id;
+        const int ref_frame = mode_info_context->mbmi.ref_frame;
+
+        filter_level = lfi_n->lvl[seg][ref_frame][mode_index];
+
+        if (filter_level)
+        {
+            const int hev_index = lfi_n->hev_thr_lut[frame_type][filter_level];
+            lfi.mblim = lfi_n->mblim[filter_level];
+            lfi.blim = lfi_n->blim[filter_level];
+            lfi.lim = lfi_n->lim[filter_level];
+            lfi.hev_thr = lfi_n->hev_thr[hev_index];
+
+            if (mb_col > 0)
+                vp8_loop_filter_mbv
+                (y_ptr, u_ptr, v_ptr, post_ystride, post_uvstride, &lfi);
+
+            if (!skip_lf)
+                vp8_loop_filter_bv
+                (y_ptr, u_ptr, v_ptr, post_ystride, post_uvstride, &lfi);
+
+            /* don't apply across umv border */
+            if (mb_row > 0)
+                vp8_loop_filter_mbh
+                (y_ptr, u_ptr, v_ptr, post_ystride, post_uvstride, &lfi);
+
+            if (!skip_lf)
+                vp8_loop_filter_bh
+                (y_ptr, u_ptr, v_ptr, post_ystride, post_uvstride, &lfi);
+        }
+
+        y_ptr += 16;
+        u_ptr += 8;
+        v_ptr += 8;
+
+        mode_info_context++;     /* step to next MB */
+    }
+
+}
+
+void vp8_loop_filter_row_simple(VP8_COMMON *cm, MODE_INFO *mode_info_context,
+                         int mb_row, int post_ystride, int post_uvstride,
+                         unsigned char *y_ptr, unsigned char *u_ptr,
+                         unsigned char *v_ptr)
+{
+    int mb_col;
+    int filter_level;
+    loop_filter_info_n *lfi_n = &cm->lf_info;
+
+    for (mb_col = 0; mb_col < cm->mb_cols; mb_col++)
+    {
+        int skip_lf = (mode_info_context->mbmi.mode != B_PRED &&
+                        mode_info_context->mbmi.mode != SPLITMV &&
+                        mode_info_context->mbmi.mb_skip_coeff);
+
+        const int mode_index = lfi_n->mode_lf_lut[mode_info_context->mbmi.mode];
+        const int seg = mode_info_context->mbmi.segment_id;
+        const int ref_frame = mode_info_context->mbmi.ref_frame;
+
+        filter_level = lfi_n->lvl[seg][ref_frame][mode_index];
+
+        if (filter_level)
+        {
+            if (mb_col > 0)
+                vp8_loop_filter_simple_mbv
+                (y_ptr, post_ystride, lfi_n->mblim[filter_level]);
+
+            if (!skip_lf)
+                vp8_loop_filter_simple_bv
+                (y_ptr, post_ystride, lfi_n->blim[filter_level]);
+
+            /* don't apply across umv border */
+            if (mb_row > 0)
+                vp8_loop_filter_simple_mbh
+                (y_ptr, post_ystride, lfi_n->mblim[filter_level]);
+
+            if (!skip_lf)
+                vp8_loop_filter_simple_bh
+                (y_ptr, post_ystride, lfi_n->blim[filter_level]);
+        }
+
+        y_ptr += 16;
+        u_ptr += 8;
+        v_ptr += 8;
+
+        mode_info_context++;     /* step to next MB */
+    }
+
+}
 void vp8_loop_filter_frame(VP8_COMMON *cm,
                            MACROBLOCKD *mbd,
                            int frame_type)
diff --git a/vp8/common/loopfilter.h b/vp8/common/loopfilter.h
index 0497271..b3af2d6 100644
--- a/vp8/common/loopfilter.h
+++ b/vp8/common/loopfilter.h
@@ -69,6 +69,7 @@
 /* assorted loopfilter functions which get used elsewhere */
 struct VP8Common;
 struct macroblockd;
+struct modeinfo;
 
 void vp8_loop_filter_init(struct VP8Common *cm);
 
@@ -90,4 +91,15 @@
 void vp8_loop_filter_update_sharpness(loop_filter_info_n *lfi,
                                       int sharpness_lvl);
 
+void vp8_loop_filter_row_normal(struct VP8Common *cm,
+                                struct modeinfo *mode_info_context,
+                                int mb_row, int post_ystride, int post_uvstride,
+                                unsigned char *y_ptr, unsigned char *u_ptr,
+                                unsigned char *v_ptr);
+
+void vp8_loop_filter_row_simple(struct VP8Common *cm,
+                                struct modeinfo *mode_info_context,
+                                int mb_row, int post_ystride, int post_uvstride,
+                                unsigned char *y_ptr, unsigned char *u_ptr,
+                                unsigned char *v_ptr);
 #endif
diff --git a/vp8/common/mfqe.c b/vp8/common/mfqe.c
index ca67e91..3fe5bca 100644
--- a/vp8/common/mfqe.c
+++ b/vp8/common/mfqe.c
@@ -160,9 +160,9 @@
         vsad = (vp8_variance8x8(v, uv_stride, vd, uvd_stride, &sse));
         vsad = (sse + 32)>>6;
 #else
-        sad = (vp8_sad16x16(y, y_stride, yd, yd_stride, INT_MAX)+128)>>8;
-        usad = (vp8_sad8x8(u, uv_stride, ud, uvd_stride, INT_MAX)+32)>>6;
-        vsad = (vp8_sad8x8(v, uv_stride, vd, uvd_stride, INT_MAX)+32)>>6;
+        sad = (vp8_sad16x16(y, y_stride, yd, yd_stride, UINT_MAX) + 128) >> 8;
+        usad = (vp8_sad8x8(u, uv_stride, ud, uvd_stride, UINT_MAX) + 32) >> 6;
+        vsad = (vp8_sad8x8(v, uv_stride, vd, uvd_stride, UINT_MAX)+ 32) >> 6;
 #endif
     }
     else /* if (blksize == 8) */
@@ -177,9 +177,9 @@
         vsad = (vp8_variance4x4(v, uv_stride, vd, uvd_stride, &sse));
         vsad = (sse + 8)>>4;
 #else
-        sad = (vp8_sad8x8(y, y_stride, yd, yd_stride, INT_MAX)+32)>>6;
-        usad = (vp8_sad4x4(u, uv_stride, ud, uvd_stride, INT_MAX)+8)>>4;
-        vsad = (vp8_sad4x4(v, uv_stride, vd, uvd_stride, INT_MAX)+8)>>4;
+        sad = (vp8_sad8x8(y, y_stride, yd, yd_stride, UINT_MAX) + 32) >> 6;
+        usad = (vp8_sad4x4(u, uv_stride, ud, uvd_stride, UINT_MAX) + 8) >> 4;
+        vsad = (vp8_sad4x4(v, uv_stride, vd, uvd_stride, UINT_MAX) + 8) >> 4;
 #endif
     }
 
diff --git a/vp8/common/mips/dspr2/dequantize_dspr2.c b/vp8/common/mips/dspr2/dequantize_dspr2.c
new file mode 100644
index 0000000..6823325
--- /dev/null
+++ b/vp8/common/mips/dspr2/dequantize_dspr2.c
@@ -0,0 +1,33 @@
+/*
+ *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vpx_config.h"
+#include "vpx_rtcd.h"
+#include "vpx_mem/vpx_mem.h"
+
+#if HAVE_DSPR2
+void vp8_dequant_idct_add_dspr2(short *input, short *dq,
+                                unsigned char *dest, int stride)
+{
+    int i;
+
+    for (i = 0; i < 16; i++)
+    {
+        input[i] = dq[i] * input[i];
+    }
+
+    vp8_short_idct4x4llm_dspr2(input, dest, stride, dest, stride);
+
+    vpx_memset(input, 0, 32);
+
+}
+
+#endif
diff --git a/vp8/common/mips/dspr2/filter_dspr2.c b/vp8/common/mips/dspr2/filter_dspr2.c
new file mode 100644
index 0000000..71fdcd7
--- /dev/null
+++ b/vp8/common/mips/dspr2/filter_dspr2.c
@@ -0,0 +1,2823 @@
+/*
+ *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include <stdlib.h>
+#include "vpx_rtcd.h"
+#include "vpx_ports/mem.h"
+
+#if HAVE_DSPR2
+#define CROP_WIDTH 256
+unsigned char ff_cropTbl[256 + 2 * CROP_WIDTH];
+
+static const unsigned short sub_pel_filterss[8][3] =
+{
+    {      0,      0,      0},
+    {      0, 0x0601, 0x7b0c},
+    { 0x0201, 0x0b08, 0x6c24},
+    {      0, 0x0906, 0x5d32},
+    { 0x0303, 0x1010, 0x4d4d},
+    {      0, 0x0609, 0x325d},
+    { 0x0102, 0x080b, 0x246c},
+    {      0, 0x0106, 0x0c7b},
+};
+
+
+static const int sub_pel_filters_int[8][3] =
+{
+    {          0,          0,          0},
+    { 0x0000fffa, 0x007b000c, 0xffff0000},
+    { 0x0002fff5, 0x006c0024, 0xfff80001},
+    { 0x0000fff7, 0x005d0032, 0xfffa0000},
+    { 0x0003fff0, 0x004d004d, 0xfff00003},
+    { 0x0000fffa, 0x0032005d, 0xfff70000},
+    { 0x0001fff8, 0x0024006c, 0xfff50002},
+    { 0x0000ffff, 0x000c007b, 0xfffa0000},
+};
+
+
+static const int sub_pel_filters_inv[8][3] =
+{
+    {          0,          0,          0},
+    { 0xfffa0000, 0x000c007b, 0x0000ffff},
+    { 0xfff50002, 0x0024006c, 0x0001fff8},
+    { 0xfff70000, 0x0032005d, 0x0000fffa},
+    { 0xfff00003, 0x004d004d, 0x0003fff0},
+    { 0xfffa0000, 0x005d0032, 0x0000fff7},
+    { 0xfff80001, 0x006c0024, 0x0002fff5},
+    { 0xffff0000, 0x007b000c, 0x0000fffa},
+};
+
+
+static const int sub_pel_filters_int_tap_4[8][2] =
+{
+    {          0,          0},
+    { 0xfffa007b, 0x000cffff},
+    {          0,          0},
+    { 0xfff7005d, 0x0032fffa},
+    {          0,          0},
+    { 0xfffa0032, 0x005dfff7},
+    {          0,          0},
+    { 0xffff000c, 0x007bfffa},
+};
+
+
+static const int sub_pel_filters_inv_tap_4[8][2] =
+{
+    {          0,          0},
+    { 0x007bfffa, 0xffff000c},
+    {          0,          0},
+    { 0x005dfff7, 0xfffa0032},
+    {          0,          0},
+    { 0x0032fffa, 0xfff7005d},
+    {          0,          0},
+    { 0x000cffff, 0xfffa007b},
+};
+
+inline void prefetch_load(unsigned char *src)
+{
+    __asm__ __volatile__ (
+        "pref   0,  0(%[src])   \n\t"
+        :
+        : [src] "r" (src)
+    );
+}
+
+
+inline void prefetch_store(unsigned char *dst)
+{
+    __asm__ __volatile__ (
+        "pref   1,  0(%[dst])   \n\t"
+        :
+        : [dst] "r" (dst)
+    );
+}
+
+void dsputil_static_init(void)
+{
+    int i;
+
+    for (i = 0; i < 256; i++) ff_cropTbl[i + CROP_WIDTH] = i;
+
+    for (i = 0; i < CROP_WIDTH; i++)
+    {
+        ff_cropTbl[i] = 0;
+        ff_cropTbl[i + CROP_WIDTH + 256] = 255;
+    }
+}
+
+void vp8_filter_block2d_first_pass_4
+(
+    unsigned char *RESTRICT src_ptr,
+    unsigned char *RESTRICT dst_ptr,
+    unsigned int src_pixels_per_line,
+    unsigned int output_height,
+    int xoffset,
+    int pitch
+)
+{
+    unsigned int i;
+    int Temp1, Temp2, Temp3, Temp4;
+
+    unsigned int vector4a = 64;
+    int vector1b, vector2b, vector3b;
+    unsigned int tp1, tp2, tn1, tn2;
+    unsigned int p1, p2, p3;
+    unsigned int n1, n2, n3;
+    unsigned char *cm = ff_cropTbl + CROP_WIDTH;
+
+    vector3b = sub_pel_filters_inv[xoffset][2];
+
+    /* if (xoffset == 0) we don't need any filtering */
+    if (vector3b == 0)
+    {
+        for (i = 0; i < output_height; i++)
+        {
+            /* prefetch src_ptr data to cache memory */
+            prefetch_load(src_ptr + src_pixels_per_line);
+            dst_ptr[0] = src_ptr[0];
+            dst_ptr[1] = src_ptr[1];
+            dst_ptr[2] = src_ptr[2];
+            dst_ptr[3] = src_ptr[3];
+
+            /* next row... */
+            src_ptr += src_pixels_per_line;
+            dst_ptr += 4;
+        }
+    }
+    else
+    {
+        if (vector3b > 65536)
+        {
+            /* 6 tap filter */
+
+            vector1b = sub_pel_filters_inv[xoffset][0];
+            vector2b = sub_pel_filters_inv[xoffset][1];
+
+            /* prefetch src_ptr data to cache memory */
+            prefetch_load(src_ptr + src_pixels_per_line);
+
+            for (i = output_height; i--;)
+            {
+                /* apply filter with vectors pairs */
+                __asm__ __volatile__ (
+                    "ulw              %[tp1],      -2(%[src_ptr])                 \n\t"
+                    "ulw              %[tp2],      2(%[src_ptr])                  \n\t"
+
+                    /* even 1. pixel */
+                    "mtlo             %[vector4a], $ac3                           \n\t"
+                    "preceu.ph.qbr    %[p1],       %[tp1]                         \n\t"
+                    "preceu.ph.qbl    %[p2],       %[tp1]                         \n\t"
+                    "preceu.ph.qbr    %[p3],       %[tp2]                         \n\t"
+                    "dpa.w.ph         $ac3,        %[p1],          %[vector1b]    \n\t"
+                    "dpa.w.ph         $ac3,        %[p2],          %[vector2b]    \n\t"
+                    "dpa.w.ph         $ac3,        %[p3],          %[vector3b]    \n\t"
+
+                    /* even 2. pixel */
+                    "mtlo             %[vector4a], $ac2                           \n\t"
+                    "preceu.ph.qbl    %[p1],       %[tp2]                         \n\t"
+                    "balign           %[tp2],      %[tp1],         3              \n\t"
+                    "extp             %[Temp1],    $ac3,           9              \n\t"
+                    "dpa.w.ph         $ac2,        %[p2],          %[vector1b]    \n\t"
+                    "dpa.w.ph         $ac2,        %[p3],          %[vector2b]    \n\t"
+                    "dpa.w.ph         $ac2,        %[p1],          %[vector3b]    \n\t"
+
+                    /* odd 1. pixel */
+                    "ulw              %[tn2],      3(%[src_ptr])                  \n\t"
+                    "mtlo             %[vector4a], $ac3                           \n\t"
+                    "preceu.ph.qbr    %[n1],       %[tp2]                         \n\t"
+                    "preceu.ph.qbl    %[n2],       %[tp2]                         \n\t"
+                    "preceu.ph.qbr    %[n3],       %[tn2]                         \n\t"
+                    "extp             %[Temp3],    $ac2,           9              \n\t"
+                    "dpa.w.ph         $ac3,        %[n1],          %[vector1b]    \n\t"
+                    "dpa.w.ph         $ac3,        %[n2],          %[vector2b]    \n\t"
+                    "dpa.w.ph         $ac3,        %[n3],          %[vector3b]    \n\t"
+
+                    /* even 2. pixel */
+                    "mtlo             %[vector4a], $ac2                           \n\t"
+                    "preceu.ph.qbl    %[n1],       %[tn2]                         \n\t"
+                    "extp             %[Temp2],    $ac3,           9              \n\t"
+                    "dpa.w.ph         $ac2,        %[n2],          %[vector1b]    \n\t"
+                    "dpa.w.ph         $ac2,        %[n3],          %[vector2b]    \n\t"
+                    "dpa.w.ph         $ac2,        %[n1],          %[vector3b]    \n\t"
+                    "extp             %[Temp4],    $ac2,           9              \n\t"
+
+                    /* clamp */
+                    "lbux             %[tp1],      %[Temp1](%[cm])                \n\t"
+                    "lbux             %[tn1],      %[Temp2](%[cm])                \n\t"
+                    "lbux             %[tp2],      %[Temp3](%[cm])                \n\t"
+                    "lbux             %[n2],       %[Temp4](%[cm])                \n\t"
+
+                    /* store bytes */
+                    "sb               %[tp1],      0(%[dst_ptr])                  \n\t"
+                    "sb               %[tn1],      1(%[dst_ptr])                  \n\t"
+                    "sb               %[tp2],      2(%[dst_ptr])                  \n\t"
+                    "sb               %[n2],       3(%[dst_ptr])                  \n\t"
+
+                    : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2), [tn1] "=&r" (tn1),
+                      [tn2] "=&r" (tn2), [p1] "=&r" (p1), [p2] "=&r" (p2),
+                      [p3] "=&r" (p3), [n1] "=&r" (n1), [n2] "=&r" (n2),
+                      [n3] "=&r" (n3), [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
+                      [Temp3] "=&r" (Temp3), [Temp4] "=&r" (Temp4)
+                    : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
+                      [vector4a] "r" (vector4a), [cm] "r" (cm), [dst_ptr] "r" (dst_ptr),
+                      [vector3b] "r" (vector3b), [src_ptr] "r" (src_ptr)
+                );
+
+                /* Next row... */
+                src_ptr += src_pixels_per_line;
+                dst_ptr += pitch;
+            }
+        }
+        else
+        {
+            /* 4 tap filter */
+
+            vector1b = sub_pel_filters_inv_tap_4[xoffset][0];
+            vector2b = sub_pel_filters_inv_tap_4[xoffset][1];
+
+            for (i = output_height; i--;)
+            {
+                /* apply filter with vectors pairs */
+                __asm__ __volatile__ (
+                    "ulw              %[tp1],      -1(%[src_ptr])                 \n\t"
+                    "ulw              %[tp2],      3(%[src_ptr])                  \n\t"
+
+                    /* even 1. pixel */
+                    "mtlo             %[vector4a], $ac3                           \n\t"
+                    "preceu.ph.qbr    %[p1],       %[tp1]                         \n\t"
+                    "preceu.ph.qbl    %[p2],       %[tp1]                         \n\t"
+                    "preceu.ph.qbr    %[p3],       %[tp2]                         \n\t"
+                    "dpa.w.ph         $ac3,        %[p1],          %[vector1b]    \n\t"
+                    "dpa.w.ph         $ac3,        %[p2],          %[vector2b]    \n\t"
+
+                    /* even 2. pixel */
+                    "mtlo             %[vector4a], $ac2                           \n\t"
+                    "dpa.w.ph         $ac2,        %[p2],          %[vector1b]    \n\t"
+                    "dpa.w.ph         $ac2,        %[p3],          %[vector2b]    \n\t"
+                    "extp             %[Temp1],    $ac3,           9              \n\t"
+
+                    /* odd 1. pixel */
+                    "srl              %[tn1],      %[tp2],         8              \n\t"
+                    "balign           %[tp2],      %[tp1],         3              \n\t"
+                    "mtlo             %[vector4a], $ac3                           \n\t"
+                    "preceu.ph.qbr    %[n1],       %[tp2]                         \n\t"
+                    "preceu.ph.qbl    %[n2],       %[tp2]                         \n\t"
+                    "preceu.ph.qbr    %[n3],       %[tn1]                         \n\t"
+                    "extp             %[Temp3],    $ac2,           9              \n\t"
+                    "dpa.w.ph         $ac3,        %[n1],          %[vector1b]    \n\t"
+                    "dpa.w.ph         $ac3,        %[n2],          %[vector2b]    \n\t"
+
+                    /* odd 2. pixel */
+                    "mtlo             %[vector4a], $ac2                           \n\t"
+                    "extp             %[Temp2],    $ac3,           9              \n\t"
+                    "dpa.w.ph         $ac2,        %[n2],          %[vector1b]    \n\t"
+                    "dpa.w.ph         $ac2,        %[n3],          %[vector2b]    \n\t"
+                    "extp             %[Temp4],    $ac2,           9              \n\t"
+
+                    /* clamp and store results */
+                    "lbux             %[tp1],      %[Temp1](%[cm])                \n\t"
+                    "lbux             %[tn1],      %[Temp2](%[cm])                \n\t"
+                    "lbux             %[tp2],      %[Temp3](%[cm])                \n\t"
+                    "sb               %[tp1],      0(%[dst_ptr])                  \n\t"
+                    "sb               %[tn1],      1(%[dst_ptr])                  \n\t"
+                    "lbux             %[n2],       %[Temp4](%[cm])                \n\t"
+                    "sb               %[tp2],      2(%[dst_ptr])                  \n\t"
+                    "sb               %[n2],       3(%[dst_ptr])                  \n\t"
+
+                    : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2), [tn1] "=&r" (tn1),
+                      [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3),
+                      [n1] "=&r" (n1), [n2] "=&r" (n2), [n3] "=&r" (n3),
+                      [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
+                      [Temp3] "=&r" (Temp3), [Temp4] "=&r" (Temp4)
+                    : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
+                      [vector4a] "r" (vector4a), [cm] "r" (cm), [dst_ptr] "r" (dst_ptr),
+                      [src_ptr] "r" (src_ptr)
+                );
+                /*  Next row... */
+                src_ptr += src_pixels_per_line;
+                dst_ptr += pitch;
+            }
+        }
+    }
+}
+
+void vp8_filter_block2d_first_pass_8_all
+(
+    unsigned char *RESTRICT src_ptr,
+    unsigned char *RESTRICT dst_ptr,
+    unsigned int src_pixels_per_line,
+    unsigned int output_height,
+    int xoffset,
+    int pitch
+)
+{
+    unsigned int i;
+    int Temp1, Temp2, Temp3, Temp4;
+
+    unsigned int vector4a = 64;
+    unsigned int vector1b, vector2b, vector3b;
+    unsigned int tp1, tp2, tn1, tn2;
+    unsigned int p1, p2, p3, p4;
+    unsigned int n1, n2, n3, n4;
+
+    unsigned char *cm = ff_cropTbl + CROP_WIDTH;
+
+    /* if (xoffset == 0) we don't need any filtering */
+    if (xoffset == 0)
+    {
+        for (i = 0; i < output_height; i++)
+        {
+            /* prefetch src_ptr data to cache memory */
+            prefetch_load(src_ptr + src_pixels_per_line);
+
+            dst_ptr[0] = src_ptr[0];
+            dst_ptr[1] = src_ptr[1];
+            dst_ptr[2] = src_ptr[2];
+            dst_ptr[3] = src_ptr[3];
+            dst_ptr[4] = src_ptr[4];
+            dst_ptr[5] = src_ptr[5];
+            dst_ptr[6] = src_ptr[6];
+            dst_ptr[7] = src_ptr[7];
+
+            /* next row... */
+            src_ptr += src_pixels_per_line;
+            dst_ptr += 8;
+        }
+    }
+    else
+    {
+        vector3b = sub_pel_filters_inv[xoffset][2];
+
+        if (vector3b > 65536)
+        {
+            /* 6 tap filter */
+
+            vector1b = sub_pel_filters_inv[xoffset][0];
+            vector2b = sub_pel_filters_inv[xoffset][1];
+
+            for (i = output_height; i--;)
+            {
+                /* prefetch src_ptr data to cache memory */
+                prefetch_load(src_ptr + src_pixels_per_line);
+
+                /* apply filter with vectors pairs */
+                __asm__ __volatile__ (
+                    "ulw              %[tp1],      -2(%[src_ptr])                 \n\t"
+                    "ulw              %[tp2],      2(%[src_ptr])                  \n\t"
+
+                    /* even 1. pixel */
+                    "mtlo             %[vector4a], $ac3                           \n\t"
+                    "preceu.ph.qbr    %[p1],       %[tp1]                         \n\t"
+                    "preceu.ph.qbl    %[p2],       %[tp1]                         \n\t"
+                    "preceu.ph.qbr    %[p3],       %[tp2]                         \n\t"
+                    "dpa.w.ph         $ac3,        %[p1],          %[vector1b]    \n\t"
+                    "dpa.w.ph         $ac3,        %[p2],          %[vector2b]    \n\t"
+                    "dpa.w.ph         $ac3,        %[p3],          %[vector3b]    \n\t"
+
+                    /* even 2. pixel */
+                    "mtlo             %[vector4a], $ac2                           \n\t"
+                    "preceu.ph.qbl    %[p1],       %[tp2]                         \n\t"
+                    "dpa.w.ph         $ac2,        %[p2],          %[vector1b]    \n\t"
+                    "dpa.w.ph         $ac2,        %[p3],          %[vector2b]    \n\t"
+                    "dpa.w.ph         $ac2,        %[p1],          %[vector3b]    \n\t"
+
+                    "balign           %[tp2],      %[tp1],         3              \n\t"
+                    "extp             %[Temp1],    $ac3,           9              \n\t"
+                    "ulw              %[tn2],      3(%[src_ptr])                  \n\t"
+
+                    /* odd 1. pixel */
+                    "mtlo             %[vector4a], $ac3                           \n\t"
+                    "preceu.ph.qbr    %[n1],       %[tp2]                         \n\t"
+                    "preceu.ph.qbl    %[n2],       %[tp2]                         \n\t"
+                    "preceu.ph.qbr    %[n3],       %[tn2]                         \n\t"
+                    "extp             %[Temp3],    $ac2,           9              \n\t"
+                    "dpa.w.ph         $ac3,        %[n1],          %[vector1b]    \n\t"
+                    "dpa.w.ph         $ac3,        %[n2],          %[vector2b]    \n\t"
+                    "dpa.w.ph         $ac3,        %[n3],          %[vector3b]    \n\t"
+
+                    /* odd 2. pixel */
+                    "mtlo             %[vector4a], $ac2                           \n\t"
+                    "preceu.ph.qbl    %[n1],       %[tn2]                         \n\t"
+                    "dpa.w.ph         $ac2,        %[n2],          %[vector1b]    \n\t"
+                    "dpa.w.ph         $ac2,        %[n3],          %[vector2b]    \n\t"
+                    "dpa.w.ph         $ac2,        %[n1],          %[vector3b]    \n\t"
+                    "ulw              %[tp1],      6(%[src_ptr])                  \n\t"
+                    "extp             %[Temp2],    $ac3,           9              \n\t"
+                    "mtlo             %[vector4a], $ac3                           \n\t"
+                    "preceu.ph.qbr    %[p2],       %[tp1]                         \n\t"
+                    "extp             %[Temp4],    $ac2,           9              \n\t"
+
+                    : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2), [tn2] "=&r" (tn2),
+                      [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3),
+                      [n1] "=&r" (n1), [n2] "=&r" (n2), [n3] "=&r" (n3),
+                      [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
+                      [Temp3] "=&r" (Temp3), [Temp4] "=r" (Temp4)
+                    : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
+                      [vector4a] "r" (vector4a), [vector3b] "r" (vector3b),
+                      [src_ptr] "r" (src_ptr)
+                );
+
+                /* clamp and store results */
+                dst_ptr[0] = cm[Temp1];
+                dst_ptr[1] = cm[Temp2];
+                dst_ptr[2] = cm[Temp3];
+                dst_ptr[3] = cm[Temp4];
+
+                /* next 4 pixels */
+                __asm__ __volatile__ (
+                    /* even 3. pixel */
+                    "dpa.w.ph         $ac3,        %[p3],          %[vector1b]    \n\t"
+                    "dpa.w.ph         $ac3,        %[p1],          %[vector2b]    \n\t"
+                    "dpa.w.ph         $ac3,        %[p2],          %[vector3b]    \n\t"
+
+                    /* even 4. pixel */
+                    "mtlo             %[vector4a], $ac2                           \n\t"
+                    "preceu.ph.qbl    %[p4],       %[tp1]                         \n\t"
+                    "dpa.w.ph         $ac2,        %[p1],          %[vector1b]    \n\t"
+                    "dpa.w.ph         $ac2,        %[p2],          %[vector2b]    \n\t"
+                    "dpa.w.ph         $ac2,        %[p4],          %[vector3b]    \n\t"
+
+                    "ulw              %[tn1],      7(%[src_ptr])                  \n\t"
+                    "extp             %[Temp1],    $ac3,           9              \n\t"
+
+                    /* odd 3. pixel */
+                    "mtlo             %[vector4a], $ac3                           \n\t"
+                    "preceu.ph.qbr    %[n2],       %[tn1]                         \n\t"
+                    "dpa.w.ph         $ac3,        %[n3],          %[vector1b]    \n\t"
+                    "dpa.w.ph         $ac3,        %[n1],          %[vector2b]    \n\t"
+                    "dpa.w.ph         $ac3,        %[n2],          %[vector3b]    \n\t"
+                    "extp             %[Temp3],    $ac2,           9              \n\t"
+
+                    /* odd 4. pixel */
+                    "mtlo             %[vector4a], $ac2                           \n\t"
+                    "preceu.ph.qbl    %[n4],       %[tn1]                         \n\t"
+                    "dpa.w.ph         $ac2,        %[n1],          %[vector1b]    \n\t"
+                    "dpa.w.ph         $ac2,        %[n2],          %[vector2b]    \n\t"
+                    "dpa.w.ph         $ac2,        %[n4],          %[vector3b]    \n\t"
+                    "extp             %[Temp2],    $ac3,           9              \n\t"
+                    "extp             %[Temp4],    $ac2,           9              \n\t"
+
+                    : [tn1] "=&r" (tn1), [n2] "=&r" (n2),
+                      [p4] "=&r" (p4), [n4] "=&r" (n4),
+                      [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
+                      [Temp3] "=&r" (Temp3), [Temp4] "=r" (Temp4)
+                    : [tp1] "r" (tp1), [vector1b] "r" (vector1b), [p2] "r" (p2),
+                      [vector2b] "r" (vector2b), [n1] "r" (n1), [p1] "r" (p1),
+                      [vector4a] "r" (vector4a), [vector3b] "r" (vector3b),
+                      [p3] "r" (p3), [n3] "r" (n3), [src_ptr] "r" (src_ptr)
+                );
+
+                /* clamp and store results */
+                dst_ptr[4] = cm[Temp1];
+                dst_ptr[5] = cm[Temp2];
+                dst_ptr[6] = cm[Temp3];
+                dst_ptr[7] = cm[Temp4];
+
+                src_ptr += src_pixels_per_line;
+                dst_ptr += pitch;
+            }
+        }
+        else
+        {
+            /* 4 tap filter */
+
+            vector1b = sub_pel_filters_inv_tap_4[xoffset][0];
+            vector2b = sub_pel_filters_inv_tap_4[xoffset][1];
+
+            for (i = output_height; i--;)
+            {
+                /* prefetch src_ptr data to cache memory */
+                prefetch_load(src_ptr + src_pixels_per_line);
+
+                /* apply filter with vectors pairs */
+                __asm__ __volatile__ (
+                    "ulw              %[tp1],      -1(%[src_ptr])                 \n\t"
+
+                    /* even 1. pixel */
+                    "mtlo             %[vector4a], $ac3                           \n\t"
+                    "preceu.ph.qbr    %[p1],       %[tp1]                         \n\t"
+                    "preceu.ph.qbl    %[p2],       %[tp1]                         \n\t"
+                    "dpa.w.ph         $ac3,        %[p1],          %[vector1b]    \n\t"
+                    "dpa.w.ph         $ac3,        %[p2],          %[vector2b]    \n\t"
+
+                    "ulw              %[tp2],      3(%[src_ptr])                  \n\t"
+
+                    /* even 2. pixel  */
+                    "mtlo             %[vector4a], $ac2                           \n\t"
+                    "preceu.ph.qbr    %[p3],       %[tp2]                         \n\t"
+                    "preceu.ph.qbl    %[p4],       %[tp2]                         \n\t"
+                    "dpa.w.ph         $ac2,        %[p2],          %[vector1b]    \n\t"
+                    "dpa.w.ph         $ac2,        %[p3],          %[vector2b]    \n\t"
+                    "extp             %[Temp1],    $ac3,           9              \n\t"
+
+                    "balign           %[tp2],      %[tp1],         3              \n\t"
+
+                    /* odd 1. pixel */
+                    "mtlo             %[vector4a], $ac3                           \n\t"
+                    "preceu.ph.qbr    %[n1],       %[tp2]                         \n\t"
+                    "preceu.ph.qbl    %[n2],       %[tp2]                         \n\t"
+                    "dpa.w.ph         $ac3,        %[n1],          %[vector1b]    \n\t"
+                    "dpa.w.ph         $ac3,        %[n2],          %[vector2b]    \n\t"
+                    "extp             %[Temp3],    $ac2,           9              \n\t"
+
+                    "ulw              %[tn2],      4(%[src_ptr])                  \n\t"
+
+                    /* odd 2. pixel */
+                    "mtlo             %[vector4a], $ac2                           \n\t"
+                    "preceu.ph.qbr    %[n3],       %[tn2]                         \n\t"
+                    "preceu.ph.qbl    %[n4],       %[tn2]                         \n\t"
+                    "dpa.w.ph         $ac2,        %[n2],          %[vector1b]    \n\t"
+                    "dpa.w.ph         $ac2,        %[n3],          %[vector2b]    \n\t"
+                    "ulw              %[tp1],      7(%[src_ptr])                  \n\t"
+                    "extp             %[Temp2],    $ac3,           9              \n\t"
+                    "mtlo             %[vector4a], $ac3                           \n\t"
+                    "extp             %[Temp4],    $ac2,           9              \n\t"
+
+                    : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2),
+                      [tn2] "=&r" (tn2), [p1] "=&r" (p1), [p2] "=&r" (p2),
+                      [p3] "=&r" (p3), [p4] "=&r" (p4), [n1] "=&r" (n1),
+                      [n2] "=&r" (n2), [n3] "=&r" (n3), [n4] "=&r" (n4),
+                      [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
+                      [Temp3] "=&r" (Temp3), [Temp4] "=r" (Temp4)
+                    : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
+                      [vector4a] "r" (vector4a), [src_ptr] "r" (src_ptr)
+                );
+
+                /* clamp and store results */
+                dst_ptr[0] = cm[Temp1];
+                dst_ptr[1] = cm[Temp2];
+                dst_ptr[2] = cm[Temp3];
+                dst_ptr[3] = cm[Temp4];
+
+                /* next 4 pixels */
+                __asm__ __volatile__ (
+                    /* even 3. pixel */
+                    "dpa.w.ph         $ac3,        %[p3],          %[vector1b]    \n\t"
+                    "dpa.w.ph         $ac3,        %[p4],          %[vector2b]    \n\t"
+
+                    /* even 4. pixel */
+                    "mtlo             %[vector4a], $ac2                           \n\t"
+                    "preceu.ph.qbr    %[p2],       %[tp1]                         \n\t"
+                    "dpa.w.ph         $ac2,        %[p4],          %[vector1b]    \n\t"
+                    "dpa.w.ph         $ac2,        %[p2],          %[vector2b]    \n\t"
+                    "extp             %[Temp1],    $ac3,           9              \n\t"
+
+                    /* odd 3. pixel */
+                    "mtlo             %[vector4a], $ac3                           \n\t"
+                    "dpa.w.ph         $ac3,        %[n3],          %[vector1b]    \n\t"
+                    "dpa.w.ph         $ac3,        %[n4],          %[vector2b]    \n\t"
+                    "ulw              %[tn1],      8(%[src_ptr])                  \n\t"
+                    "extp             %[Temp3],    $ac2,           9              \n\t"
+
+                    /* odd 4. pixel */
+                    "mtlo             %[vector4a], $ac2                           \n\t"
+                    "preceu.ph.qbr    %[n2],       %[tn1]                         \n\t"
+                    "dpa.w.ph         $ac2,        %[n4],          %[vector1b]    \n\t"
+                    "dpa.w.ph         $ac2,        %[n2],          %[vector2b]    \n\t"
+                    "extp             %[Temp2],    $ac3,           9              \n\t"
+                    "extp             %[Temp4],    $ac2,           9              \n\t"
+
+                    : [tn1] "=&r" (tn1), [p2] "=&r" (p2), [n2] "=&r" (n2),
+                      [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
+                      [Temp3] "=&r" (Temp3), [Temp4] "=r" (Temp4)
+                    : [tp1] "r" (tp1), [p3] "r" (p3), [p4] "r" (p4),
+                      [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
+                      [vector4a] "r" (vector4a), [src_ptr] "r" (src_ptr),
+                      [n3] "r" (n3), [n4] "r" (n4)
+                );
+
+                /* clamp and store results */
+                dst_ptr[4] = cm[Temp1];
+                dst_ptr[5] = cm[Temp2];
+                dst_ptr[6] = cm[Temp3];
+                dst_ptr[7] = cm[Temp4];
+
+                /* next row... */
+                src_ptr += src_pixels_per_line;
+                dst_ptr += pitch;
+            }
+        }
+    }
+}
+
+
+/*
+ * First-pass (horizontal) 6-tap subpixel filter for a 16-pixel-wide
+ * block, implemented with MIPS DSP ASE inline assembly.
+ *
+ * For each of output_height rows, 16 filtered pixels are produced from
+ * source bytes starting at src_ptr - 2, accumulated against the three
+ * tap-pair vectors from sub_pel_filters_inv[xoffset], extracted with
+ * extp, clamped through the crop table and written to dst_ptr[0..15].
+ *
+ *   src_ptr             - source pixels (unaligned reads via ulw)
+ *   dst_ptr             - destination, one row of 16 bytes per iteration
+ *   src_pixels_per_line - source stride in bytes
+ *   output_height       - number of rows to filter
+ *   xoffset             - horizontal subpixel offset (tap-set selector)
+ *   pitch               - destination stride in bytes
+ *
+ * Each row is computed by four consecutive asm statements (4 pixels
+ * each).  DSP accumulator state ($ac2/$ac3) and several pixel registers
+ * are deliberately carried across the statement boundaries (each
+ * statement ends with mtlo/preceu pre-loading state for the next one),
+ * so the statements must stay adjacent and in order.
+ *
+ * Fixed: operands that are read early and overwritten late inside one
+ * asm statement (p1 in the 2nd, p4 in the 3rd, tp1 in the 4th) were
+ * declared as plain inputs; writing an input operand is undefined
+ * behavior per the GCC extended-asm rules.  They are now "+r"
+ * read-write operands, and Temp4 is earlyclobber ("=&r") like
+ * Temp1..Temp3 because the last statement writes it before reading the
+ * cm/dst_ptr inputs.
+ */
+void vp8_filter_block2d_first_pass16_6tap
+(
+    unsigned char *RESTRICT src_ptr,
+    unsigned char *RESTRICT dst_ptr,
+    unsigned int src_pixels_per_line,
+    unsigned int output_height,
+    int xoffset,
+    int pitch
+)
+{
+    unsigned int i;
+    int Temp1, Temp2, Temp3, Temp4;
+
+    unsigned int vector4a;
+    unsigned int vector1b, vector2b, vector3b;
+    unsigned int tp1, tp2, tn1, tn2;
+    unsigned int p1, p2, p3, p4;
+    unsigned int n1, n2, n3, n4;
+    unsigned char *cm = ff_cropTbl + CROP_WIDTH;   /* clamp (crop) table */
+
+    vector1b = sub_pel_filters_inv[xoffset][0];
+    vector2b = sub_pel_filters_inv[xoffset][1];
+    vector3b = sub_pel_filters_inv[xoffset][2];
+    vector4a = 64;   /* rounding term, pre-loaded into each accumulator via mtlo */
+
+    for (i = output_height; i--;)
+    {
+        /* prefetch src_ptr data to cache memory */
+        prefetch_load(src_ptr + src_pixels_per_line);
+
+        /* apply filter with vectors pairs */
+        __asm__ __volatile__ (
+            "ulw                %[tp1],      -2(%[src_ptr])                 \n\t"
+            "ulw                %[tp2],      2(%[src_ptr])                  \n\t"
+
+            /* even 1. pixel */
+            "mtlo               %[vector4a], $ac3                           \n\t"
+            "preceu.ph.qbr      %[p1],       %[tp1]                         \n\t"
+            "preceu.ph.qbl      %[p2],       %[tp1]                         \n\t"
+            "preceu.ph.qbr      %[p3],       %[tp2]                         \n\t"
+            "dpa.w.ph           $ac3,        %[p1],           %[vector1b]   \n\t"
+            "dpa.w.ph           $ac3,        %[p2],           %[vector2b]   \n\t"
+            "dpa.w.ph           $ac3,        %[p3],           %[vector3b]   \n\t"
+
+            /* even 2. pixel */
+            "mtlo               %[vector4a], $ac2                           \n\t"
+            "preceu.ph.qbl      %[p1],       %[tp2]                         \n\t"
+            "dpa.w.ph           $ac2,        %[p2],           %[vector1b]   \n\t"
+            "dpa.w.ph           $ac2,        %[p3],           %[vector2b]   \n\t"
+            "dpa.w.ph           $ac2,        %[p1],           %[vector3b]   \n\t"
+
+            "balign             %[tp2],      %[tp1],          3             \n\t"
+            "ulw                %[tn2],      3(%[src_ptr])                  \n\t"
+            "extp               %[Temp1],    $ac3,            9             \n\t"
+
+            /* odd 1. pixel */
+            "mtlo               %[vector4a], $ac3                           \n\t"
+            "preceu.ph.qbr      %[n1],       %[tp2]                         \n\t"
+            "preceu.ph.qbl      %[n2],       %[tp2]                         \n\t"
+            "preceu.ph.qbr      %[n3],       %[tn2]                         \n\t"
+            "extp               %[Temp3],    $ac2,            9             \n\t"
+            "dpa.w.ph           $ac3,        %[n1],           %[vector1b]   \n\t"
+            "dpa.w.ph           $ac3,        %[n2],           %[vector2b]   \n\t"
+            "dpa.w.ph           $ac3,        %[n3],           %[vector3b]   \n\t"
+
+            /* odd 2. pixel */
+            "mtlo               %[vector4a], $ac2                           \n\t"
+            "preceu.ph.qbl      %[n1],       %[tn2]                         \n\t"
+            "dpa.w.ph           $ac2,        %[n2],           %[vector1b]   \n\t"
+            "dpa.w.ph           $ac2,        %[n3],           %[vector2b]   \n\t"
+            "dpa.w.ph           $ac2,        %[n1],           %[vector3b]   \n\t"
+            "ulw                %[tp1],      6(%[src_ptr])                  \n\t"
+            "extp               %[Temp2],    $ac3,            9             \n\t"
+            "mtlo               %[vector4a], $ac3                           \n\t"
+            "preceu.ph.qbr      %[p2],       %[tp1]                         \n\t"
+            "extp               %[Temp4],    $ac2,            9             \n\t"
+
+            : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2), [tn2] "=&r" (tn2),
+              [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3),
+              [n1] "=&r" (n1), [n2] "=&r" (n2), [n3] "=&r" (n3),
+              [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
+              [Temp3] "=&r" (Temp3), [Temp4] "=&r" (Temp4)
+            : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
+              [vector4a] "r" (vector4a), [vector3b] "r" (vector3b),
+              [src_ptr] "r" (src_ptr)
+        );
+
+        /* clamp and store results */
+        dst_ptr[0] = cm[Temp1];
+        dst_ptr[1] = cm[Temp2];
+        dst_ptr[2] = cm[Temp3];
+        dst_ptr[3] = cm[Temp4];
+
+        /* next 4 pixels; $ac3 and %[p2] were pre-loaded by the previous statement */
+        __asm__ __volatile__ (
+            /* even 3. pixel */
+            "dpa.w.ph           $ac3,        %[p3],           %[vector1b]   \n\t"
+            "dpa.w.ph           $ac3,        %[p1],           %[vector2b]   \n\t"
+            "dpa.w.ph           $ac3,        %[p2],           %[vector3b]   \n\t"
+
+            /* even 4. pixel */
+            "mtlo               %[vector4a], $ac2                           \n\t"
+            "preceu.ph.qbl      %[p4],       %[tp1]                         \n\t"
+            "dpa.w.ph           $ac2,        %[p1],           %[vector1b]   \n\t"
+            "dpa.w.ph           $ac2,        %[p2],           %[vector2b]   \n\t"
+            "dpa.w.ph           $ac2,        %[p4],           %[vector3b]   \n\t"
+            "ulw                %[tn1],      7(%[src_ptr])                  \n\t"
+            "extp               %[Temp1],    $ac3,            9             \n\t"
+
+            /* odd 3. pixel */
+            "mtlo               %[vector4a], $ac3                           \n\t"
+            "preceu.ph.qbr      %[n2],       %[tn1]                         \n\t"
+            "dpa.w.ph           $ac3,        %[n3],           %[vector1b]   \n\t"
+            "dpa.w.ph           $ac3,        %[n1],           %[vector2b]   \n\t"
+            "dpa.w.ph           $ac3,        %[n2],           %[vector3b]   \n\t"
+            "extp               %[Temp3],    $ac2,            9             \n\t"
+
+            /* odd 4. pixel */
+            "mtlo               %[vector4a], $ac2                           \n\t"
+            "preceu.ph.qbl      %[n4],       %[tn1]                         \n\t"
+            "dpa.w.ph           $ac2,        %[n1],           %[vector1b]   \n\t"
+            "dpa.w.ph           $ac2,        %[n2],           %[vector2b]   \n\t"
+            "dpa.w.ph           $ac2,        %[n4],           %[vector3b]   \n\t"
+            "ulw                %[tp2],      10(%[src_ptr])                 \n\t"
+            "extp               %[Temp2],    $ac3,            9             \n\t"
+            "mtlo               %[vector4a], $ac3                           \n\t"
+            "preceu.ph.qbr      %[p1],       %[tp2]                         \n\t"
+            "extp               %[Temp4],    $ac2,            9             \n\t"
+
+            /* %[p1] is read above and rewritten at the end: read-write operand */
+            : [tn1] "=&r" (tn1), [tp2] "=&r" (tp2), [n2] "=&r" (n2),
+              [p4] "=&r" (p4), [n4] "=&r" (n4), [p1] "+r" (p1),
+              [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
+              [Temp3] "=&r" (Temp3), [Temp4] "=&r" (Temp4)
+            : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
+              [tp1] "r" (tp1), [n1] "r" (n1),
+              [vector4a] "r" (vector4a), [p2] "r" (p2), [vector3b] "r" (vector3b),
+              [p3] "r" (p3), [n3] "r" (n3), [src_ptr] "r" (src_ptr)
+        );
+
+        /* clamp and store results */
+        dst_ptr[4] = cm[Temp1];
+        dst_ptr[5] = cm[Temp2];
+        dst_ptr[6] = cm[Temp3];
+        dst_ptr[7] = cm[Temp4];
+
+        /* next 4 pixels; $ac3 and %[p1] carried in from the previous statement */
+        __asm__ __volatile__ (
+            /* even 5. pixel */
+            "dpa.w.ph           $ac3,        %[p2],           %[vector1b]   \n\t"
+            "dpa.w.ph           $ac3,        %[p4],           %[vector2b]   \n\t"
+            "dpa.w.ph           $ac3,        %[p1],           %[vector3b]   \n\t"
+
+            /* even 6. pixel */
+            "mtlo               %[vector4a], $ac2                           \n\t"
+            "preceu.ph.qbl      %[p3],       %[tp2]                         \n\t"
+            "dpa.w.ph           $ac2,        %[p4],           %[vector1b]   \n\t"
+            "dpa.w.ph           $ac2,        %[p1],           %[vector2b]   \n\t"
+            "dpa.w.ph           $ac2,        %[p3],           %[vector3b]   \n\t"
+
+            "ulw                %[tn1],      11(%[src_ptr])                 \n\t"
+            "extp               %[Temp1],    $ac3,            9             \n\t"
+
+            /* odd 5. pixel */
+            "mtlo               %[vector4a], $ac3                           \n\t"
+            "preceu.ph.qbr      %[n1],       %[tn1]                         \n\t"
+            "dpa.w.ph           $ac3,        %[n2],           %[vector1b]   \n\t"
+            "dpa.w.ph           $ac3,        %[n4],           %[vector2b]   \n\t"
+            "dpa.w.ph           $ac3,        %[n1],           %[vector3b]   \n\t"
+            "extp               %[Temp3],    $ac2,            9             \n\t"
+
+            /* odd 6. pixel */
+            "mtlo               %[vector4a], $ac2                           \n\t"
+            "preceu.ph.qbl      %[n3],       %[tn1]                         \n\t"
+            "dpa.w.ph           $ac2,        %[n4],           %[vector1b]   \n\t"
+            "dpa.w.ph           $ac2,        %[n1],           %[vector2b]   \n\t"
+            "dpa.w.ph           $ac2,        %[n3],           %[vector3b]   \n\t"
+            "ulw                %[tp1],      14(%[src_ptr])                 \n\t"
+            "extp               %[Temp2],    $ac3,            9             \n\t"
+            "mtlo               %[vector4a], $ac3                           \n\t"
+            "preceu.ph.qbr      %[p4],       %[tp1]                         \n\t"
+            "extp               %[Temp4],    $ac2,            9             \n\t"
+
+            /* %[p4] is read above and rewritten at the end: read-write operand */
+            : [tn1] "=&r" (tn1), [tp1] "=&r" (tp1),
+              [n1] "=&r" (n1), [p3] "=&r" (p3), [n3] "=&r" (n3),
+              [p4] "+r" (p4),
+              [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
+              [Temp3] "=&r" (Temp3), [Temp4] "=&r" (Temp4)
+            : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
+              [tp2] "r" (tp2), [p2] "r" (p2), [n2] "r" (n2),
+              [n4] "r" (n4), [p1] "r" (p1), [src_ptr] "r" (src_ptr),
+              [vector4a] "r" (vector4a), [vector3b] "r" (vector3b)
+        );
+
+        /* clamp and store results */
+        dst_ptr[8] = cm[Temp1];
+        dst_ptr[9] = cm[Temp2];
+        dst_ptr[10] = cm[Temp3];
+        dst_ptr[11] = cm[Temp4];
+
+        /* last 4 pixels; clamping and the stores happen inside the asm */
+        __asm__ __volatile__ (
+            /* even 7. pixel */
+            "dpa.w.ph           $ac3,        %[p1],           %[vector1b]   \n\t"
+            "dpa.w.ph           $ac3,        %[p3],           %[vector2b]   \n\t"
+            "dpa.w.ph           $ac3,        %[p4],           %[vector3b]   \n\t"
+
+            /* even 8. pixel */
+            "mtlo               %[vector4a], $ac2                           \n\t"
+            "preceu.ph.qbl      %[p2],       %[tp1]                         \n\t"
+            "dpa.w.ph           $ac2,        %[p3],           %[vector1b]   \n\t"
+            "dpa.w.ph           $ac2,        %[p4],           %[vector2b]   \n\t"
+            "dpa.w.ph           $ac2,        %[p2],           %[vector3b]   \n\t"
+            "ulw                %[tn1],      15(%[src_ptr])                 \n\t"
+            "extp               %[Temp1],    $ac3,            9             \n\t"
+
+            /* odd 7. pixel */
+            "mtlo               %[vector4a], $ac3                           \n\t"
+            "preceu.ph.qbr      %[n4],       %[tn1]                         \n\t"
+            "dpa.w.ph           $ac3,        %[n1],           %[vector1b]   \n\t"
+            "dpa.w.ph           $ac3,        %[n3],           %[vector2b]   \n\t"
+            "dpa.w.ph           $ac3,        %[n4],           %[vector3b]   \n\t"
+            "extp               %[Temp3],    $ac2,            9             \n\t"
+
+            /* odd 8. pixel */
+            "mtlo               %[vector4a], $ac2                           \n\t"
+            "preceu.ph.qbl      %[n2],       %[tn1]                         \n\t"
+            "dpa.w.ph           $ac2,        %[n3],           %[vector1b]   \n\t"
+            "dpa.w.ph           $ac2,        %[n4],           %[vector2b]   \n\t"
+            "dpa.w.ph           $ac2,        %[n2],           %[vector3b]   \n\t"
+            "extp               %[Temp2],    $ac3,            9             \n\t"
+            "extp               %[Temp4],    $ac2,            9             \n\t"
+
+            /* clamp and store results */
+            "lbux               %[tp1],      %[Temp1](%[cm])                \n\t"
+            "lbux               %[tn1],      %[Temp2](%[cm])                \n\t"
+            "lbux               %[p2],       %[Temp3](%[cm])                \n\t"
+            "sb                 %[tp1],      12(%[dst_ptr])                 \n\t"
+            "sb                 %[tn1],      13(%[dst_ptr])                 \n\t"
+            "lbux               %[n2],       %[Temp4](%[cm])                \n\t"
+            "sb                 %[p2],       14(%[dst_ptr])                 \n\t"
+            "sb                 %[n2],       15(%[dst_ptr])                 \n\t"
+
+            /* %[tp1] is read above and clobbered by the lbux: read-write operand.
+               Temp1..Temp4 are earlyclobber because %[cm]/%[dst_ptr] are read
+               after the Temps are written. */
+            : [tn1] "=&r" (tn1), [p2] "=&r" (p2), [n2] "=&r" (n2), [n4] "=&r" (n4),
+              [tp1] "+r" (tp1),
+              [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
+              [Temp3] "=&r" (Temp3), [Temp4] "=&r" (Temp4)
+            : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
+              [p4] "r" (p4), [n1] "r" (n1), [p1] "r" (p1),
+              [vector4a] "r" (vector4a), [vector3b] "r" (vector3b), [p3] "r" (p3),
+              [n3] "r" (n3), [src_ptr] "r" (src_ptr),
+              [cm] "r" (cm), [dst_ptr] "r" (dst_ptr)
+        );
+
+        src_ptr += src_pixels_per_line;
+        dst_ptr += pitch;
+    }
+}
+
+
+/*
+ * First-pass pass-through used when no horizontal filtering is needed:
+ * copies 21 rows (7 iterations x 3 rows each) of 16 pixels from the
+ * source (stride src_pixels_per_line) into a contiguous intermediate
+ * buffer with a 16-byte row stride (48 bytes written per iteration).
+ * ulw tolerates arbitrary source alignment; the sw stores assume the
+ * destination buffer is word-aligned (NOTE(review): confirm against the
+ * caller's buffer allocation).
+ */
+void vp8_filter_block2d_first_pass16_0
+(
+    unsigned char *RESTRICT src_ptr,
+    unsigned char *RESTRICT output_ptr,
+    unsigned int src_pixels_per_line
+)
+{
+    int Temp1, Temp2, Temp3, Temp4;
+    int i;
+
+    /* prefetch the destination area into cache ahead of the stores
+       (note: this is a store prefetch of output_ptr, not a src load) */
+    prefetch_store(output_ptr + 32);
+
+    /* copy memory from src buffer to dst buffer */
+    for (i = 0; i < 7; i++)
+    {
+        /* row 1 of 3: 16 bytes to output_ptr + 0 */
+        __asm__ __volatile__ (
+            "ulw    %[Temp1],   0(%[src_ptr])                               \n\t"
+            "ulw    %[Temp2],   4(%[src_ptr])                               \n\t"
+            "ulw    %[Temp3],   8(%[src_ptr])                               \n\t"
+            "ulw    %[Temp4],   12(%[src_ptr])                              \n\t"
+            "sw     %[Temp1],   0(%[output_ptr])                            \n\t"
+            "sw     %[Temp2],   4(%[output_ptr])                            \n\t"
+            "sw     %[Temp3],   8(%[output_ptr])                            \n\t"
+            "sw     %[Temp4],   12(%[output_ptr])                           \n\t"
+            "addu   %[src_ptr], %[src_ptr],        %[src_pixels_per_line]   \n\t"
+
+            : [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3),
+              [Temp4] "=&r" (Temp4), [src_ptr] "+r" (src_ptr)
+            : [src_pixels_per_line] "r" (src_pixels_per_line),
+              [output_ptr] "r" (output_ptr)
+        );
+
+        /* row 2 of 3: 16 bytes to output_ptr + 16 */
+        __asm__ __volatile__ (
+            "ulw    %[Temp1],   0(%[src_ptr])                               \n\t"
+            "ulw    %[Temp2],   4(%[src_ptr])                               \n\t"
+            "ulw    %[Temp3],   8(%[src_ptr])                               \n\t"
+            "ulw    %[Temp4],   12(%[src_ptr])                              \n\t"
+            "sw     %[Temp1],   16(%[output_ptr])                           \n\t"
+            "sw     %[Temp2],   20(%[output_ptr])                           \n\t"
+            "sw     %[Temp3],   24(%[output_ptr])                           \n\t"
+            "sw     %[Temp4],   28(%[output_ptr])                           \n\t"
+            "addu   %[src_ptr], %[src_ptr],        %[src_pixels_per_line]   \n\t"
+
+            : [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3),
+              [Temp4] "=&r" (Temp4), [src_ptr] "+r" (src_ptr)
+            : [src_pixels_per_line] "r" (src_pixels_per_line),
+              [output_ptr] "r" (output_ptr)
+        );
+
+        /* row 3 of 3: 16 bytes to output_ptr + 32 */
+        __asm__ __volatile__ (
+            "ulw    %[Temp1],   0(%[src_ptr])                               \n\t"
+            "ulw    %[Temp2],   4(%[src_ptr])                               \n\t"
+            "ulw    %[Temp3],   8(%[src_ptr])                               \n\t"
+            "ulw    %[Temp4],   12(%[src_ptr])                              \n\t"
+            "sw     %[Temp1],   32(%[output_ptr])                           \n\t"
+            "sw     %[Temp2],   36(%[output_ptr])                           \n\t"
+            "sw     %[Temp3],   40(%[output_ptr])                           \n\t"
+            "sw     %[Temp4],   44(%[output_ptr])                           \n\t"
+            "addu   %[src_ptr], %[src_ptr],        %[src_pixels_per_line]   \n\t"
+
+            : [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3),
+              [Temp4] "=&r" (Temp4), [src_ptr] "+r" (src_ptr)
+            : [src_pixels_per_line] "r" (src_pixels_per_line),
+              [output_ptr] "r" (output_ptr)
+        );
+
+        /* advance past the 3 rows (3 x 16 bytes) just written */
+        output_ptr += 48;
+    }
+}
+
+
+void vp8_filter_block2d_first_pass16_4tap
+(
+    unsigned char *RESTRICT src_ptr,
+    unsigned char *RESTRICT output_ptr,
+    unsigned int src_pixels_per_line,
+    unsigned int output_width,
+    unsigned int output_height,
+    int xoffset,
+    int yoffset,
+    unsigned char *RESTRICT dst_ptr,
+    int pitch
+)
+{
+    unsigned int i, j;
+    int Temp1, Temp2, Temp3, Temp4;
+
+    unsigned int vector4a;
+    int vector1b, vector2b;
+    unsigned int tp1, tp2, tp3, tn1;
+    unsigned int p1, p2, p3;
+    unsigned int n1, n2, n3;
+    unsigned char *cm = ff_cropTbl + CROP_WIDTH;
+
+    vector4a = 64;
+
+    vector1b = sub_pel_filters_inv_tap_4[xoffset][0];
+    vector2b = sub_pel_filters_inv_tap_4[xoffset][1];
+
+    /* if (yoffset == 0) don't need temp buffer, data will be stored in dst_ptr */
+    if (yoffset == 0)
+    {
+        output_height -= 5;
+        src_ptr += (src_pixels_per_line + src_pixels_per_line);
+
+        for (i = output_height; i--;)
+        {
+            __asm__ __volatile__ (
+                "ulw     %[tp3],   -1(%[src_ptr])               \n\t"
+                : [tp3] "=&r" (tp3)
+                : [src_ptr] "r" (src_ptr)
+            );
+
+            /* processing 4 adjacent pixels */
+            for (j = 0; j < 16; j += 4)
+            {
+                /* apply filter with vectors pairs */
+                __asm__ __volatile__ (
+                    "ulw              %[tp2],      3(%[src_ptr])                    \n\t"
+                    "move             %[tp1],      %[tp3]                           \n\t"
+
+                    /* even 1. pixel */
+                    "mtlo             %[vector4a], $ac3                             \n\t"
+                    "mthi             $0,          $ac3                             \n\t"
+                    "move             %[tp3],      %[tp2]                           \n\t"
+                    "preceu.ph.qbr    %[p1],       %[tp1]                           \n\t"
+                    "preceu.ph.qbl    %[p2],       %[tp1]                           \n\t"
+                    "preceu.ph.qbr    %[p3],       %[tp2]                           \n\t"
+                    "dpa.w.ph         $ac3,        %[p1],           %[vector1b]     \n\t"
+                    "dpa.w.ph         $ac3,        %[p2],           %[vector2b]     \n\t"
+
+                    /* even 2. pixel */
+                    "mtlo             %[vector4a], $ac2                             \n\t"
+                    "mthi             $0,          $ac2                             \n\t"
+                    "dpa.w.ph         $ac2,        %[p2],           %[vector1b]     \n\t"
+                    "dpa.w.ph         $ac2,        %[p3],           %[vector2b]     \n\t"
+                    "extr.w           %[Temp1],    $ac3,            7               \n\t"
+
+                    /* odd 1. pixel */
+                    "ulw              %[tn1],      4(%[src_ptr])                    \n\t"
+                    "balign           %[tp2],      %[tp1],          3               \n\t"
+                    "mtlo             %[vector4a], $ac3                             \n\t"
+                    "mthi             $0,          $ac3                             \n\t"
+                    "preceu.ph.qbr    %[n1],       %[tp2]                           \n\t"
+                    "preceu.ph.qbl    %[n2],       %[tp2]                           \n\t"
+                    "preceu.ph.qbr    %[n3],       %[tn1]                           \n\t"
+                    "extr.w           %[Temp3],    $ac2,            7               \n\t"
+                    "dpa.w.ph         $ac3,        %[n1],           %[vector1b]     \n\t"
+                    "dpa.w.ph         $ac3,        %[n2],           %[vector2b]     \n\t"
+
+                    /* odd 2. pixel */
+                    "mtlo             %[vector4a], $ac2                             \n\t"
+                    "mthi             $0,          $ac2                             \n\t"
+                    "extr.w           %[Temp2],    $ac3,            7               \n\t"
+                    "dpa.w.ph         $ac2,        %[n2],           %[vector1b]     \n\t"
+                    "dpa.w.ph         $ac2,        %[n3],           %[vector2b]     \n\t"
+                    "extr.w           %[Temp4],    $ac2,            7               \n\t"
+
+                    /* clamp and store results */
+                    "lbux             %[tp1],      %[Temp1](%[cm])                  \n\t"
+                    "lbux             %[tn1],      %[Temp2](%[cm])                  \n\t"
+                    "lbux             %[tp2],      %[Temp3](%[cm])                  \n\t"
+                    "sb               %[tp1],      0(%[dst_ptr])                    \n\t"
+                    "sb               %[tn1],      1(%[dst_ptr])                    \n\t"
+                    "lbux             %[n2],       %[Temp4](%[cm])                  \n\t"
+                    "sb               %[tp2],      2(%[dst_ptr])                    \n\t"
+                    "sb               %[n2],       3(%[dst_ptr])                    \n\t"
+
+                    : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2), [tp3] "=&r" (tp3),
+                      [tn1] "=&r" (tn1), [p1] "=&r" (p1), [p2] "=&r" (p2),
+                      [n1] "=&r" (n1), [n2] "=&r" (n2), [n3] "=&r" (n3),
+                      [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [p3] "=&r" (p3),
+                      [Temp3] "=&r" (Temp3), [Temp4] "=&r" (Temp4)
+                    : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
+                      [vector4a] "r" (vector4a), [cm] "r" (cm), [dst_ptr] "r" (dst_ptr),
+                      [src_ptr] "r" (src_ptr)
+                );
+
+                src_ptr += 4;
+            }
+
+            /* Next row... */
+            src_ptr += src_pixels_per_line - 16;
+            dst_ptr += pitch;
+        }
+    }
+    else
+    {
+        for (i = output_height; i--;)
+        {
+            /* processing 4 adjacent pixels */
+            for (j = 0; j < 16; j += 4)
+            {
+                /* apply filter with vectors pairs */
+                __asm__ __volatile__ (
+                    "ulw              %[tp1],      -1(%[src_ptr])                   \n\t"
+                    "ulw              %[tp2],      3(%[src_ptr])                    \n\t"
+
+                    /* even 1. pixel */
+                    "mtlo             %[vector4a], $ac3                             \n\t"
+                    "mthi             $0,          $ac3                             \n\t"
+                    "preceu.ph.qbr    %[p1],       %[tp1]                           \n\t"
+                    "preceu.ph.qbl    %[p2],       %[tp1]                           \n\t"
+                    "preceu.ph.qbr    %[p3],       %[tp2]                           \n\t"
+                    "dpa.w.ph         $ac3,        %[p1],           %[vector1b]     \n\t"
+                    "dpa.w.ph         $ac3,        %[p2],           %[vector2b]     \n\t"
+
+                    /* even 2. pixel */
+                    "mtlo             %[vector4a], $ac2                             \n\t"
+                    "mthi             $0,          $ac2                             \n\t"
+                    "dpa.w.ph         $ac2,        %[p2],           %[vector1b]     \n\t"
+                    "dpa.w.ph         $ac2,        %[p3],           %[vector2b]     \n\t"
+                    "extr.w           %[Temp1],    $ac3,            7               \n\t"
+
+                    /* odd 1. pixel */
+                    "ulw              %[tn1],      4(%[src_ptr])                    \n\t"
+                    "balign           %[tp2],      %[tp1],          3               \n\t"
+                    "mtlo             %[vector4a], $ac3                             \n\t"
+                    "mthi             $0,          $ac3                             \n\t"
+                    "preceu.ph.qbr    %[n1],       %[tp2]                           \n\t"
+                    "preceu.ph.qbl    %[n2],       %[tp2]                           \n\t"
+                    "preceu.ph.qbr    %[n3],       %[tn1]                           \n\t"
+                    "extr.w           %[Temp3],    $ac2,            7               \n\t"
+                    "dpa.w.ph         $ac3,        %[n1],           %[vector1b]     \n\t"
+                    "dpa.w.ph         $ac3,        %[n2],           %[vector2b]     \n\t"
+
+                    /* odd 2. pixel */
+                    "mtlo             %[vector4a], $ac2                             \n\t"
+                    "mthi             $0,          $ac2                             \n\t"
+                    "extr.w           %[Temp2],    $ac3,            7               \n\t"
+                    "dpa.w.ph         $ac2,        %[n2],           %[vector1b]     \n\t"
+                    "dpa.w.ph         $ac2,        %[n3],           %[vector2b]     \n\t"
+                    "extr.w           %[Temp4],    $ac2,            7               \n\t"
+
+                    /* clamp and store results */
+                    "lbux             %[tp1],      %[Temp1](%[cm])                  \n\t"
+                    "lbux             %[tn1],      %[Temp2](%[cm])                  \n\t"
+                    "lbux             %[tp2],      %[Temp3](%[cm])                  \n\t"
+                    "sb               %[tp1],      0(%[output_ptr])                 \n\t"
+                    "sb               %[tn1],      1(%[output_ptr])                 \n\t"
+                    "lbux             %[n2],       %[Temp4](%[cm])                  \n\t"
+                    "sb               %[tp2],      2(%[output_ptr])                 \n\t"
+                    "sb               %[n2],       3(%[output_ptr])                 \n\t"
+
+                    : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2), [tn1] "=&r" (tn1),
+                      [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3),
+                      [n1] "=&r" (n1), [n2] "=&r" (n2), [n3] "=&r" (n3),
+                      [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
+                      [Temp3] "=&r" (Temp3), [Temp4] "=&r" (Temp4)
+                    : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
+                      [vector4a] "r" (vector4a), [cm] "r" (cm),
+                      [output_ptr] "r" (output_ptr), [src_ptr] "r" (src_ptr)
+                );
+
+                src_ptr += 4;
+            }
+
+            /* next row... */
+            src_ptr += src_pixels_per_line;
+            output_ptr += output_width;
+        }
+    }
+}
+
+
+/* vp8_filter_block2d_second_pass4 -- second (vertical) sub-pixel filter
+ * pass for a 4-pixel-wide block, MIPS32 DSP ASE implementation.
+ *
+ *  src_ptr      : first-pass intermediate data.  The byte loads below use
+ *                 row offsets that are multiples of 4, so the intermediate
+ *                 buffer is assumed to have a 4-byte row stride -- TODO
+ *                 confirm against the matching first-pass routine.
+ *  output_ptr   : destination pixel buffer.
+ *  output_pitch : destination row stride in bytes.
+ *  yoffset      : vertical sub-pel position selecting the filter taps.
+ *
+ * NOTE(review): each asm block declares Temp4 as "=r" (no earlyclobber);
+ * this is safe only because Temp4 is written by the final instruction,
+ * after every input operand has been consumed.
+ */
+void vp8_filter_block2d_second_pass4
+(
+    unsigned char *RESTRICT src_ptr,
+    unsigned char *RESTRICT output_ptr,
+    int output_pitch,
+    int yoffset
+)
+{
+    unsigned int i;
+
+    int Temp1, Temp2, Temp3, Temp4;
+    unsigned int vector1b, vector2b, vector3b, vector4a;
+
+    unsigned char src_ptr_l2;
+    unsigned char src_ptr_l1;
+    unsigned char src_ptr_0;
+    unsigned char src_ptr_r1;
+    unsigned char src_ptr_r2;
+    unsigned char src_ptr_r3;
+
+    /* clamping table: indexing cm[x] clips the filter sum to 0..255
+       (presumably ff_cropTbl is padded by CROP_WIDTH on either side --
+       confirm where the table is defined). */
+    unsigned char *cm = ff_cropTbl + CROP_WIDTH;
+
+    /* rounding term: pre-seeds each accumulator's lo half via mtlo before
+       the dot products and is folded in by the final extp extraction. */
+    vector4a = 64;
+
+    /* load filter coefficients */
+    /* Each vectorNb carries a pair of 8-bit taps so one dpau/dpsu
+       instruction applies two taps at once:
+         vector1b = tap pair [0] -- outermost pair, added (dpau),
+         vector2b = tap pair [2] -- centre pair,    added (dpau),
+         vector3b = tap pair [1] -- inner pair,     subtracted (dpsu).
+       The exact packing is defined by sub_pel_filterss, which is declared
+       elsewhere in this file. */
+    vector1b = sub_pel_filterss[yoffset][0];
+    vector2b = sub_pel_filterss[yoffset][2];
+    vector3b = sub_pel_filterss[yoffset][1];
+
+    /* a zero outer-tap pair means the filter degenerates to 4 taps */
+    if (vector1b)
+    {
+        /* 6 tap filter */
+
+        /* two iterations, each emitting two 4-pixel output rows;
+           src_ptr advances 8 bytes (two 4-byte rows) per iteration */
+        for (i = 2; i--;)
+        {
+            /* prefetch src_ptr data to cache memory */
+            prefetch_load(src_ptr);
+
+            /* do not allow compiler to reorder instructions */
+            __asm__ __volatile__ (
+                ".set noreorder                                                 \n\t"
+                :
+                :
+            );
+
+            /* apply filter with vectors pairs */
+            /* For each of the four output pixels: six vertical taps are
+               gathered with lbu at row offsets -2..+3 (byte offsets
+               -8..+12, stride 4), packed two-at-a-time with "append",
+               accumulated with dpau/dpsu into $ac0..$ac3, and extracted
+               (rounded by the pre-seeded 64) with extp. */
+            __asm__ __volatile__ (
+                "lbu            %[src_ptr_l2],  -8(%[src_ptr])                  \n\t"
+                "lbu            %[src_ptr_l1],  -4(%[src_ptr])                  \n\t"
+                "lbu            %[src_ptr_0],   0(%[src_ptr])                   \n\t"
+                "lbu            %[src_ptr_r1],  4(%[src_ptr])                   \n\t"
+                "lbu            %[src_ptr_r2],  8(%[src_ptr])                   \n\t"
+                "lbu            %[src_ptr_r3],  12(%[src_ptr])                  \n\t"
+                "mtlo           %[vector4a],    $ac2                            \n\t"
+
+                "append         %[src_ptr_l2],  %[src_ptr_r3],  8               \n\t"
+                "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
+                "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
+                "dpau.h.qbr     $ac2,           %[src_ptr_l2],  %[vector1b]     \n\t"
+                "dpau.h.qbr     $ac2,           %[src_ptr_0],   %[vector2b]     \n\t"
+                "dpsu.h.qbr     $ac2,           %[src_ptr_l1],  %[vector3b]     \n\t"
+
+                "lbu            %[src_ptr_l2],  -7(%[src_ptr])                  \n\t"
+                "lbu            %[src_ptr_l1],  -3(%[src_ptr])                  \n\t"
+                "lbu            %[src_ptr_0],   1(%[src_ptr])                   \n\t"
+                "lbu            %[src_ptr_r1],  5(%[src_ptr])                   \n\t"
+                "lbu            %[src_ptr_r2],  9(%[src_ptr])                   \n\t"
+                "lbu            %[src_ptr_r3],  13(%[src_ptr])                  \n\t"
+                "mtlo           %[vector4a],    $ac3                            \n\t"
+                "extp           %[Temp1],       $ac2,           9               \n\t"
+
+                "append         %[src_ptr_l2],  %[src_ptr_r3],  8               \n\t"
+                "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
+                "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
+                "dpau.h.qbr     $ac3,           %[src_ptr_l2],  %[vector1b]     \n\t"
+                "dpau.h.qbr     $ac3,           %[src_ptr_0],   %[vector2b]     \n\t"
+                "dpsu.h.qbr     $ac3,           %[src_ptr_l1],  %[vector3b]     \n\t"
+
+                "lbu            %[src_ptr_l2],  -6(%[src_ptr])                  \n\t"
+                "lbu            %[src_ptr_l1],  -2(%[src_ptr])                  \n\t"
+                "lbu            %[src_ptr_0],   2(%[src_ptr])                   \n\t"
+                "lbu            %[src_ptr_r1],  6(%[src_ptr])                   \n\t"
+                "lbu            %[src_ptr_r2],  10(%[src_ptr])                  \n\t"
+                "lbu            %[src_ptr_r3],  14(%[src_ptr])                  \n\t"
+                "mtlo           %[vector4a],    $ac0                            \n\t"
+                "extp           %[Temp2],       $ac3,           9               \n\t"
+
+                "append         %[src_ptr_l2],  %[src_ptr_r3],  8               \n\t"
+                "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
+                "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
+                "dpau.h.qbr     $ac0,           %[src_ptr_l2],  %[vector1b]     \n\t"
+                "dpau.h.qbr     $ac0,           %[src_ptr_0],   %[vector2b]     \n\t"
+                "dpsu.h.qbr     $ac0,           %[src_ptr_l1],  %[vector3b]     \n\t"
+
+                "lbu            %[src_ptr_l2],  -5(%[src_ptr])                  \n\t"
+                "lbu            %[src_ptr_l1],  -1(%[src_ptr])                  \n\t"
+                "lbu            %[src_ptr_0],   3(%[src_ptr])                   \n\t"
+                "lbu            %[src_ptr_r1],  7(%[src_ptr])                   \n\t"
+                "lbu            %[src_ptr_r2],  11(%[src_ptr])                  \n\t"
+                "lbu            %[src_ptr_r3],  15(%[src_ptr])                  \n\t"
+                "mtlo           %[vector4a],    $ac1                            \n\t"
+                "extp           %[Temp3],       $ac0,           9               \n\t"
+
+                "append         %[src_ptr_l2],  %[src_ptr_r3],  8               \n\t"
+                "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
+                "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
+                "dpau.h.qbr     $ac1,           %[src_ptr_l2],  %[vector1b]     \n\t"
+                "dpau.h.qbr     $ac1,           %[src_ptr_0],   %[vector2b]     \n\t"
+                "dpsu.h.qbr     $ac1,           %[src_ptr_l1],  %[vector3b]     \n\t"
+                "extp           %[Temp4],       $ac1,           9               \n\t"
+
+                : [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
+                  [Temp3] "=&r" (Temp3), [Temp4] "=r" (Temp4),
+                  [src_ptr_l1] "=&r" (src_ptr_l1), [src_ptr_0] "=&r" (src_ptr_0),
+                  [src_ptr_r1] "=&r" (src_ptr_r1), [src_ptr_r2] "=&r" (src_ptr_r2),
+                  [src_ptr_l2] "=&r" (src_ptr_l2), [src_ptr_r3] "=&r" (src_ptr_r3)
+                : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
+                  [vector3b] "r" (vector3b), [vector4a] "r" (vector4a),
+                  [src_ptr] "r" (src_ptr)
+            );
+
+            /* clamp and store results */
+            output_ptr[0] = cm[Temp1];
+            output_ptr[1] = cm[Temp2];
+            output_ptr[2] = cm[Temp3];
+            output_ptr[3] = cm[Temp4];
+
+            output_ptr += output_pitch;
+
+            /* apply filter with vectors pairs */
+            /* same filter for the next output row: the source window is
+               shifted down one row (byte offsets -4..+19) */
+            __asm__ __volatile__ (
+                "lbu            %[src_ptr_l2],  -4(%[src_ptr])                  \n\t"
+                "lbu            %[src_ptr_l1],  0(%[src_ptr])                   \n\t"
+                "lbu            %[src_ptr_0],   4(%[src_ptr])                   \n\t"
+                "lbu            %[src_ptr_r1],  8(%[src_ptr])                   \n\t"
+                "lbu            %[src_ptr_r2],  12(%[src_ptr])                  \n\t"
+                "lbu            %[src_ptr_r3],  16(%[src_ptr])                  \n\t"
+                "mtlo           %[vector4a],    $ac2                            \n\t"
+                "append         %[src_ptr_l2],  %[src_ptr_r3],  8               \n\t"
+                "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
+                "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
+                "dpau.h.qbr     $ac2,           %[src_ptr_l2],  %[vector1b]     \n\t"
+                "dpau.h.qbr     $ac2,           %[src_ptr_0],   %[vector2b]     \n\t"
+                "dpsu.h.qbr     $ac2,           %[src_ptr_l1],  %[vector3b]     \n\t"
+
+                "lbu            %[src_ptr_l2],  -3(%[src_ptr])                  \n\t"
+                "lbu            %[src_ptr_l1],  1(%[src_ptr])                   \n\t"
+                "lbu            %[src_ptr_0],   5(%[src_ptr])                   \n\t"
+                "lbu            %[src_ptr_r1],  9(%[src_ptr])                   \n\t"
+                "lbu            %[src_ptr_r2],  13(%[src_ptr])                  \n\t"
+                "lbu            %[src_ptr_r3],  17(%[src_ptr])                  \n\t"
+                "mtlo           %[vector4a],    $ac3                            \n\t"
+                "extp           %[Temp1],       $ac2,           9               \n\t"
+
+                "append         %[src_ptr_l2],  %[src_ptr_r3],  8               \n\t"
+                "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
+                "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
+                "dpau.h.qbr     $ac3,           %[src_ptr_l2],  %[vector1b]     \n\t"
+                "dpau.h.qbr     $ac3,           %[src_ptr_0],   %[vector2b]     \n\t"
+                "dpsu.h.qbr     $ac3,           %[src_ptr_l1],  %[vector3b]     \n\t"
+
+                "lbu            %[src_ptr_l2],  -2(%[src_ptr])                  \n\t"
+                "lbu            %[src_ptr_l1],  2(%[src_ptr])                   \n\t"
+                "lbu            %[src_ptr_0],   6(%[src_ptr])                   \n\t"
+                "lbu            %[src_ptr_r1],  10(%[src_ptr])                  \n\t"
+                "lbu            %[src_ptr_r2],  14(%[src_ptr])                  \n\t"
+                "lbu            %[src_ptr_r3],  18(%[src_ptr])                  \n\t"
+                "mtlo           %[vector4a],    $ac0                            \n\t"
+                "extp           %[Temp2],       $ac3,           9               \n\t"
+
+                "append         %[src_ptr_l2],  %[src_ptr_r3],  8               \n\t"
+                "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
+                "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
+                "dpau.h.qbr     $ac0,           %[src_ptr_l2],  %[vector1b]     \n\t"
+                "dpau.h.qbr     $ac0,           %[src_ptr_0],   %[vector2b]     \n\t"
+                "dpsu.h.qbr     $ac0,           %[src_ptr_l1],  %[vector3b]     \n\t"
+
+                "lbu            %[src_ptr_l2],  -1(%[src_ptr])                  \n\t"
+                "lbu            %[src_ptr_l1],  3(%[src_ptr])                   \n\t"
+                "lbu            %[src_ptr_0],   7(%[src_ptr])                   \n\t"
+                "lbu            %[src_ptr_r1],  11(%[src_ptr])                  \n\t"
+                "lbu            %[src_ptr_r2],  15(%[src_ptr])                  \n\t"
+                "lbu            %[src_ptr_r3],  19(%[src_ptr])                  \n\t"
+                "mtlo           %[vector4a],    $ac1                            \n\t"
+                "extp           %[Temp3],       $ac0,           9               \n\t"
+
+                "append         %[src_ptr_l2],  %[src_ptr_r3],  8               \n\t"
+                "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
+                "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
+                "dpau.h.qbr     $ac1,           %[src_ptr_l2],  %[vector1b]     \n\t"
+                "dpau.h.qbr     $ac1,           %[src_ptr_0],   %[vector2b]     \n\t"
+                "dpsu.h.qbr     $ac1,           %[src_ptr_l1],  %[vector3b]     \n\t"
+                "extp           %[Temp4],       $ac1,           9               \n\t"
+
+                : [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
+                  [Temp3] "=&r" (Temp3), [Temp4] "=r" (Temp4),
+                  [src_ptr_l1] "=&r" (src_ptr_l1), [src_ptr_0] "=&r" (src_ptr_0),
+                  [src_ptr_r1] "=&r" (src_ptr_r1), [src_ptr_r2] "=&r" (src_ptr_r2),
+                  [src_ptr_l2] "=&r" (src_ptr_l2), [src_ptr_r3] "=&r" (src_ptr_r3)
+                : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
+                  [vector3b] "r" (vector3b), [vector4a] "r" (vector4a),
+                  [src_ptr] "r" (src_ptr)
+            );
+
+            /* clamp and store results */
+            output_ptr[0] = cm[Temp1];
+            output_ptr[1] = cm[Temp2];
+            output_ptr[2] = cm[Temp3];
+            output_ptr[3] = cm[Temp4];
+
+            /* advance two source rows (2 * 4-byte stride) for the next pass */
+            src_ptr += 8;
+            output_ptr += output_pitch;
+        }
+    }
+    else
+    {
+        /* 4 tap filter */
+        /* only the centre (vector2b, added) and inner (vector3b,
+           subtracted) tap pairs are applied; the outer pair is zero */
+
+        /* prefetch src_ptr data to cache memory */
+        prefetch_load(src_ptr);
+
+        for (i = 2; i--;)
+        {
+            /* do not allow compiler to reorder instructions */
+            __asm__ __volatile__ (
+                ".set noreorder                                                 \n\t"
+                :
+                :
+            );
+
+            /* apply filter with vectors pairs */
+            /* four vertical taps per output pixel, gathered at row
+               offsets -1..+2 (byte offsets -4..+11, stride 4) */
+            __asm__ __volatile__ (
+                "lbu            %[src_ptr_l1],  -4(%[src_ptr])                  \n\t"
+                "lbu            %[src_ptr_0],   0(%[src_ptr])                   \n\t"
+                "lbu            %[src_ptr_r1],  4(%[src_ptr])                   \n\t"
+                "lbu            %[src_ptr_r2],  8(%[src_ptr])                   \n\t"
+                "mtlo           %[vector4a],    $ac2                            \n\t"
+                "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
+                "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
+                "dpau.h.qbr     $ac2,           %[src_ptr_0],   %[vector2b]     \n\t"
+                "dpsu.h.qbr     $ac2,           %[src_ptr_l1],  %[vector3b]     \n\t"
+
+                "lbu            %[src_ptr_l1],  -3(%[src_ptr])                  \n\t"
+                "lbu            %[src_ptr_0],   1(%[src_ptr])                   \n\t"
+                "lbu            %[src_ptr_r1],  5(%[src_ptr])                   \n\t"
+                "lbu            %[src_ptr_r2],  9(%[src_ptr])                   \n\t"
+                "mtlo           %[vector4a],    $ac3                            \n\t"
+                "extp           %[Temp1],       $ac2,           9               \n\t"
+
+                "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
+                "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
+                "dpau.h.qbr     $ac3,           %[src_ptr_0],   %[vector2b]     \n\t"
+                "dpsu.h.qbr     $ac3,           %[src_ptr_l1],  %[vector3b]     \n\t"
+
+                "lbu            %[src_ptr_l1],  -2(%[src_ptr])                  \n\t"
+                "lbu            %[src_ptr_0],   2(%[src_ptr])                   \n\t"
+                "lbu            %[src_ptr_r1],  6(%[src_ptr])                   \n\t"
+                "lbu            %[src_ptr_r2],  10(%[src_ptr])                  \n\t"
+                "mtlo           %[vector4a],    $ac0                            \n\t"
+                "extp           %[Temp2],       $ac3,           9               \n\t"
+
+                "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
+                "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
+                "dpau.h.qbr     $ac0,           %[src_ptr_0],   %[vector2b]     \n\t"
+                "dpsu.h.qbr     $ac0,           %[src_ptr_l1],  %[vector3b]     \n\t"
+
+                "lbu            %[src_ptr_l1],  -1(%[src_ptr])                  \n\t"
+                "lbu            %[src_ptr_0],   3(%[src_ptr])                   \n\t"
+                "lbu            %[src_ptr_r1],  7(%[src_ptr])                   \n\t"
+                "lbu            %[src_ptr_r2],  11(%[src_ptr])                  \n\t"
+                "mtlo           %[vector4a],    $ac1                            \n\t"
+                "extp           %[Temp3],       $ac0,           9               \n\t"
+                "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
+                "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
+                "dpau.h.qbr     $ac1,           %[src_ptr_0],   %[vector2b]     \n\t"
+                "dpsu.h.qbr     $ac1,           %[src_ptr_l1],  %[vector3b]     \n\t"
+                "extp           %[Temp4],       $ac1,           9               \n\t"
+
+                : [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
+                  [Temp3] "=&r" (Temp3), [Temp4] "=r" (Temp4),
+                  [src_ptr_l1] "=&r" (src_ptr_l1), [src_ptr_0] "=&r" (src_ptr_0),
+                  [src_ptr_r1] "=&r" (src_ptr_r1), [src_ptr_r2] "=&r" (src_ptr_r2)
+                : [vector2b] "r" (vector2b), [vector3b] "r" (vector3b),
+                  [vector4a] "r" (vector4a), [src_ptr] "r" (src_ptr)
+            );
+
+            /* clamp and store results */
+            output_ptr[0] = cm[Temp1];
+            output_ptr[1] = cm[Temp2];
+            output_ptr[2] = cm[Temp3];
+            output_ptr[3] = cm[Temp4];
+
+            output_ptr += output_pitch;
+
+            /* apply filter with vectors pairs */
+            /* next output row: source window shifted down one row
+               (byte offsets 0..+15) */
+            __asm__ __volatile__ (
+                "lbu            %[src_ptr_l1],  0(%[src_ptr])                   \n\t"
+                "lbu            %[src_ptr_0],   4(%[src_ptr])                   \n\t"
+                "lbu            %[src_ptr_r1],  8(%[src_ptr])                   \n\t"
+                "lbu            %[src_ptr_r2],  12(%[src_ptr])                  \n\t"
+                "mtlo           %[vector4a],    $ac2                            \n\t"
+                "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
+                "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
+                "dpau.h.qbr     $ac2,           %[src_ptr_0],   %[vector2b]     \n\t"
+                "dpsu.h.qbr     $ac2,           %[src_ptr_l1],  %[vector3b]     \n\t"
+
+                "lbu            %[src_ptr_l1],  1(%[src_ptr])                   \n\t"
+                "lbu            %[src_ptr_0],   5(%[src_ptr])                   \n\t"
+                "lbu            %[src_ptr_r1],  9(%[src_ptr])                   \n\t"
+                "lbu            %[src_ptr_r2],  13(%[src_ptr])                  \n\t"
+                "mtlo           %[vector4a],    $ac3                            \n\t"
+                "extp           %[Temp1],       $ac2,           9               \n\t"
+
+                "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
+                "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
+                "dpau.h.qbr     $ac3,           %[src_ptr_0],   %[vector2b]     \n\t"
+                "dpsu.h.qbr     $ac3,           %[src_ptr_l1],  %[vector3b]     \n\t"
+
+                "lbu            %[src_ptr_l1],  2(%[src_ptr])                   \n\t"
+                "lbu            %[src_ptr_0],   6(%[src_ptr])                   \n\t"
+                "lbu            %[src_ptr_r1],  10(%[src_ptr])                  \n\t"
+                "lbu            %[src_ptr_r2],  14(%[src_ptr])                  \n\t"
+                "mtlo           %[vector4a],    $ac0                            \n\t"
+                "extp           %[Temp2],       $ac3,           9               \n\t"
+
+                "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
+                "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
+                "dpau.h.qbr     $ac0,           %[src_ptr_0],   %[vector2b]     \n\t"
+                "dpsu.h.qbr     $ac0,           %[src_ptr_l1],  %[vector3b]     \n\t"
+
+                "lbu            %[src_ptr_l1],  3(%[src_ptr])                   \n\t"
+                "lbu            %[src_ptr_0],   7(%[src_ptr])                   \n\t"
+                "lbu            %[src_ptr_r1],  11(%[src_ptr])                  \n\t"
+                "lbu            %[src_ptr_r2],  15(%[src_ptr])                  \n\t"
+                "mtlo           %[vector4a],    $ac1                            \n\t"
+                "extp           %[Temp3],       $ac0,           9               \n\t"
+                "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
+                "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
+                "dpau.h.qbr     $ac1,           %[src_ptr_0],   %[vector2b]     \n\t"
+                "dpsu.h.qbr     $ac1,           %[src_ptr_l1],  %[vector3b]     \n\t"
+                "extp           %[Temp4],       $ac1,           9               \n\t"
+
+                : [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
+                  [Temp3] "=&r" (Temp3), [Temp4] "=r" (Temp4),
+                  [src_ptr_l1] "=&r" (src_ptr_l1), [src_ptr_0] "=&r" (src_ptr_0),
+                  [src_ptr_r1] "=&r" (src_ptr_r1), [src_ptr_r2] "=&r" (src_ptr_r2)
+                : [vector2b] "r" (vector2b), [vector3b] "r" (vector3b),
+                  [vector4a] "r" (vector4a), [src_ptr] "r" (src_ptr)
+            );
+
+            /* clamp and store results */
+            output_ptr[0] = cm[Temp1];
+            output_ptr[1] = cm[Temp2];
+            output_ptr[2] = cm[Temp3];
+            output_ptr[3] = cm[Temp4];
+
+            /* advance two source rows (2 * 4-byte stride) for the next pass */
+            src_ptr += 8;
+            output_ptr += output_pitch;
+        }
+    }
+}
+
+
+void vp8_filter_block2d_second_pass_8
+(
+    unsigned char *RESTRICT src_ptr,
+    unsigned char *RESTRICT output_ptr,
+    int output_pitch,
+    unsigned int output_height,
+    unsigned int output_width,
+    unsigned int yoffset
+)
+{
+    unsigned int i;
+
+    int Temp1, Temp2, Temp3, Temp4, Temp5, Temp6, Temp7, Temp8;
+    unsigned int vector1b, vector2b, vector3b, vector4a;
+
+    unsigned char src_ptr_l2;
+    unsigned char src_ptr_l1;
+    unsigned char src_ptr_0;
+    unsigned char src_ptr_r1;
+    unsigned char src_ptr_r2;
+    unsigned char src_ptr_r3;
+    unsigned char *cm = ff_cropTbl + CROP_WIDTH;
+
+    vector4a = 64;
+
+    vector1b = sub_pel_filterss[yoffset][0];
+    vector2b = sub_pel_filterss[yoffset][2];
+    vector3b = sub_pel_filterss[yoffset][1];
+
+    if (vector1b)
+    {
+        /* 6 tap filter */
+
+        /* prefetch src_ptr data to cache memory */
+        prefetch_load(src_ptr);
+
+        for (i = output_height; i--;)
+        {
+            /* apply filter with vectors pairs */
+            __asm__ __volatile__ (
+                "lbu            %[src_ptr_l2],  -16(%[src_ptr])                 \n\t"
+                "lbu            %[src_ptr_l1],  -8(%[src_ptr])                  \n\t"
+                "lbu            %[src_ptr_0],   0(%[src_ptr])                   \n\t"
+                "lbu            %[src_ptr_r1],  8(%[src_ptr])                   \n\t"
+                "lbu            %[src_ptr_r2],  16(%[src_ptr])                  \n\t"
+                "lbu            %[src_ptr_r3],  24(%[src_ptr])                  \n\t"
+                "mtlo           %[vector4a],    $ac2                            \n\t"
+
+                "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
+                "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
+                "append         %[src_ptr_l2],  %[src_ptr_r3],  8               \n\t"
+                "dpau.h.qbr     $ac2,           %[src_ptr_l2],  %[vector1b]     \n\t"
+                "dpau.h.qbr     $ac2,           %[src_ptr_0],   %[vector2b]     \n\t"
+                "dpsu.h.qbr     $ac2,           %[src_ptr_l1],  %[vector3b]     \n\t"
+
+                "lbu            %[src_ptr_l2],  -15(%[src_ptr])                 \n\t"
+                "lbu            %[src_ptr_l1],  -7(%[src_ptr])                  \n\t"
+                "lbu            %[src_ptr_0],   1(%[src_ptr])                   \n\t"
+                "lbu            %[src_ptr_r1],  9(%[src_ptr])                   \n\t"
+                "lbu            %[src_ptr_r2],  17(%[src_ptr])                  \n\t"
+                "lbu            %[src_ptr_r3],  25(%[src_ptr])                  \n\t"
+                "mtlo           %[vector4a],    $ac3                            \n\t"
+                "extp           %[Temp1],       $ac2,           9               \n\t"
+
+                "append         %[src_ptr_l2],  %[src_ptr_r3],  8               \n\t"
+                "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
+                "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
+                "dpau.h.qbr     $ac3,           %[src_ptr_l2],  %[vector1b]     \n\t"
+                "dpau.h.qbr     $ac3,           %[src_ptr_0],   %[vector2b]     \n\t"
+                "dpsu.h.qbr     $ac3,           %[src_ptr_l1],  %[vector3b]     \n\t"
+
+                "lbu            %[src_ptr_l2],  -14(%[src_ptr])                 \n\t"
+                "lbu            %[src_ptr_l1],  -6(%[src_ptr])                  \n\t"
+                "lbu            %[src_ptr_0],   2(%[src_ptr])                   \n\t"
+                "lbu            %[src_ptr_r1],  10(%[src_ptr])                  \n\t"
+                "lbu            %[src_ptr_r2],  18(%[src_ptr])                  \n\t"
+                "lbu            %[src_ptr_r3],  26(%[src_ptr])                  \n\t"
+                "mtlo           %[vector4a],    $ac0                            \n\t"
+                "extp           %[Temp2],       $ac3,           9               \n\t"
+
+                "append         %[src_ptr_l2],  %[src_ptr_r3],  8               \n\t"
+                "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
+                "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
+                "dpau.h.qbr     $ac0,           %[src_ptr_l2],  %[vector1b]     \n\t"
+                "dpau.h.qbr     $ac0,           %[src_ptr_0],   %[vector2b]     \n\t"
+                "dpsu.h.qbr     $ac0,           %[src_ptr_l1],  %[vector3b]     \n\t"
+
+                "lbu            %[src_ptr_l2],  -13(%[src_ptr])                 \n\t"
+                "lbu            %[src_ptr_l1],  -5(%[src_ptr])                  \n\t"
+                "lbu            %[src_ptr_0],   3(%[src_ptr])                   \n\t"
+                "lbu            %[src_ptr_r1],  11(%[src_ptr])                  \n\t"
+                "lbu            %[src_ptr_r2],  19(%[src_ptr])                  \n\t"
+                "lbu            %[src_ptr_r3],  27(%[src_ptr])                  \n\t"
+                "mtlo           %[vector4a],    $ac1                            \n\t"
+                "extp           %[Temp3],       $ac0,           9               \n\t"
+
+                "append         %[src_ptr_l2],  %[src_ptr_r3],  8               \n\t"
+                "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
+                "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
+                "dpau.h.qbr     $ac1,           %[src_ptr_l2],  %[vector1b]     \n\t"
+                "dpau.h.qbr     $ac1,           %[src_ptr_0],   %[vector2b]     \n\t"
+                "dpsu.h.qbr     $ac1,           %[src_ptr_l1],  %[vector3b]     \n\t"
+
+                : [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3),
+                  [src_ptr_l1] "=&r" (src_ptr_l1), [src_ptr_0] "=&r" (src_ptr_0),
+                  [src_ptr_r1] "=&r" (src_ptr_r1), [src_ptr_r2] "=&r" (src_ptr_r2),
+                  [src_ptr_l2] "=&r" (src_ptr_l2), [src_ptr_r3] "=&r" (src_ptr_r3)
+                : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
+                  [vector3b] "r" (vector3b), [vector4a] "r" (vector4a),
+                  [src_ptr] "r" (src_ptr)
+            );
+
+            /* apply filter with vector pairs */
+            __asm__ __volatile__ (
+                "lbu            %[src_ptr_l2],  -12(%[src_ptr])                 \n\t"
+                "lbu            %[src_ptr_l1],  -4(%[src_ptr])                  \n\t"
+                "lbu            %[src_ptr_0],   4(%[src_ptr])                   \n\t"
+                "lbu            %[src_ptr_r1],  12(%[src_ptr])                  \n\t"
+                "lbu            %[src_ptr_r2],  20(%[src_ptr])                  \n\t"
+                "lbu            %[src_ptr_r3],  28(%[src_ptr])                  \n\t"
+                "mtlo           %[vector4a],    $ac2                            \n\t"
+
+                "append         %[src_ptr_l2],  %[src_ptr_r3],  8               \n\t"
+                "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
+                "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
+                "dpau.h.qbr     $ac2,           %[src_ptr_l2],  %[vector1b]     \n\t"
+                "dpau.h.qbr     $ac2,           %[src_ptr_0],   %[vector2b]     \n\t"
+                "dpsu.h.qbr     $ac2,           %[src_ptr_l1],  %[vector3b]     \n\t"
+                "extp           %[Temp4],       $ac1,           9               \n\t"
+
+                "lbu            %[src_ptr_l2],  -11(%[src_ptr])                 \n\t"
+                "lbu            %[src_ptr_l1],  -3(%[src_ptr])                  \n\t"
+                "lbu            %[src_ptr_0],   5(%[src_ptr])                   \n\t"
+                "lbu            %[src_ptr_r1],  13(%[src_ptr])                  \n\t"
+                "lbu            %[src_ptr_r2],  21(%[src_ptr])                  \n\t"
+                "lbu            %[src_ptr_r3],  29(%[src_ptr])                  \n\t"
+                "mtlo           %[vector4a],    $ac3                            \n\t"
+                "extp           %[Temp5],       $ac2,           9               \n\t"
+
+                "append         %[src_ptr_l2],  %[src_ptr_r3],  8               \n\t"
+                "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
+                "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
+                "dpau.h.qbr     $ac3,           %[src_ptr_l2],  %[vector1b]     \n\t"
+                "dpau.h.qbr     $ac3,           %[src_ptr_0],   %[vector2b]     \n\t"
+                "dpsu.h.qbr     $ac3,           %[src_ptr_l1],  %[vector3b]     \n\t"
+
+                "lbu            %[src_ptr_l2],  -10(%[src_ptr])                 \n\t"
+                "lbu            %[src_ptr_l1],  -2(%[src_ptr])                  \n\t"
+                "lbu            %[src_ptr_0],   6(%[src_ptr])                   \n\t"
+                "lbu            %[src_ptr_r1],  14(%[src_ptr])                  \n\t"
+                "lbu            %[src_ptr_r2],  22(%[src_ptr])                  \n\t"
+                "lbu            %[src_ptr_r3],  30(%[src_ptr])                  \n\t"
+                "mtlo           %[vector4a],    $ac0                            \n\t"
+                "extp           %[Temp6],       $ac3,           9               \n\t"
+
+                "append         %[src_ptr_l2],  %[src_ptr_r3],  8               \n\t"
+                "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
+                "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
+                "dpau.h.qbr     $ac0,           %[src_ptr_l2],  %[vector1b]     \n\t"
+                "dpau.h.qbr     $ac0,           %[src_ptr_0],   %[vector2b]     \n\t"
+                "dpsu.h.qbr     $ac0,           %[src_ptr_l1],  %[vector3b]     \n\t"
+
+                "lbu            %[src_ptr_l2],  -9(%[src_ptr])                  \n\t"
+                "lbu            %[src_ptr_l1],  -1(%[src_ptr])                  \n\t"
+                "lbu            %[src_ptr_0],   7(%[src_ptr])                   \n\t"
+                "lbu            %[src_ptr_r1],  15(%[src_ptr])                  \n\t"
+                "lbu            %[src_ptr_r2],  23(%[src_ptr])                  \n\t"
+                "lbu            %[src_ptr_r3],  31(%[src_ptr])                  \n\t"
+                "mtlo           %[vector4a],    $ac1                            \n\t"
+                "extp           %[Temp7],       $ac0,           9               \n\t"
+
+                "append         %[src_ptr_l2],  %[src_ptr_r3],  8               \n\t"
+                "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
+                "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
+                "dpau.h.qbr     $ac1,           %[src_ptr_l2],  %[vector1b]     \n\t"
+                "dpau.h.qbr     $ac1,           %[src_ptr_0],   %[vector2b]     \n\t"
+                "dpsu.h.qbr     $ac1,           %[src_ptr_l1],  %[vector3b]     \n\t"
+                "extp           %[Temp8],       $ac1,           9               \n\t"
+
+                : [Temp4] "=&r" (Temp4), [Temp5] "=&r" (Temp5),
+                  [Temp6] "=&r" (Temp6), [Temp7] "=&r" (Temp7), [Temp8] "=r" (Temp8),
+                  [src_ptr_l1] "=&r" (src_ptr_l1), [src_ptr_0] "=&r" (src_ptr_0),
+                  [src_ptr_r1] "=&r" (src_ptr_r1), [src_ptr_r2] "=&r" (src_ptr_r2),
+                  [src_ptr_l2] "=&r" (src_ptr_l2),[src_ptr_r3] "=&r" (src_ptr_r3)
+                : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
+                  [vector3b] "r" (vector3b), [vector4a] "r" (vector4a),
+                  [src_ptr] "r" (src_ptr)
+            );
+
+            /* clamp and store results */
+            output_ptr[0] = cm[Temp1];
+            output_ptr[1] = cm[Temp2];
+            output_ptr[2] = cm[Temp3];
+            output_ptr[3] = cm[Temp4];
+            output_ptr[4] = cm[Temp5];
+            output_ptr[5] = cm[Temp6];
+            output_ptr[6] = cm[Temp7];
+            output_ptr[7] = cm[Temp8];
+
+            src_ptr += 8;
+            output_ptr += output_pitch;
+        }
+    }
+    else
+    {
+        /* 4 tap filter */
+
+        /* prefetch src_ptr data to cache memory */
+        prefetch_load(src_ptr);
+
+        for (i = output_height; i--;)
+        {
+            __asm__ __volatile__ (
+                "lbu            %[src_ptr_l1],  -8(%[src_ptr])                  \n\t"
+                "lbu            %[src_ptr_0],   0(%[src_ptr])                   \n\t"
+                "lbu            %[src_ptr_r1],  8(%[src_ptr])                   \n\t"
+                "lbu            %[src_ptr_r2],  16(%[src_ptr])                  \n\t"
+                "mtlo           %[vector4a],    $ac2                            \n\t"
+                "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
+                "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
+                "dpau.h.qbr     $ac2,           %[src_ptr_0],   %[vector2b]     \n\t"
+                "dpsu.h.qbr     $ac2,           %[src_ptr_l1],  %[vector3b]     \n\t"
+
+                : [src_ptr_l1] "=&r" (src_ptr_l1), [src_ptr_0] "=&r" (src_ptr_0),
+                  [src_ptr_r1] "=&r" (src_ptr_r1), [src_ptr_r2] "=&r" (src_ptr_r2)
+                : [vector2b] "r" (vector2b), [vector3b] "r" (vector3b),
+                  [vector4a] "r" (vector4a), [src_ptr] "r" (src_ptr)
+            );
+
+            __asm__ __volatile__ (
+                "lbu            %[src_ptr_l1],  -7(%[src_ptr])                  \n\t"
+                "lbu            %[src_ptr_0],   1(%[src_ptr])                   \n\t"
+                "lbu            %[src_ptr_r1],  9(%[src_ptr])                   \n\t"
+                "lbu            %[src_ptr_r2],  17(%[src_ptr])                  \n\t"
+                "mtlo           %[vector4a],    $ac3                            \n\t"
+                "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
+                "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
+                "dpau.h.qbr     $ac3,           %[src_ptr_0],   %[vector2b]     \n\t"
+                "dpsu.h.qbr     $ac3,           %[src_ptr_l1],  %[vector3b]     \n\t"
+                "extp           %[Temp1],       $ac2,           9               \n\t"
+
+                : [Temp1] "=r" (Temp1),
+                  [src_ptr_l1] "=&r" (src_ptr_l1), [src_ptr_0] "=&r" (src_ptr_0),
+                  [src_ptr_r1] "=&r" (src_ptr_r1), [src_ptr_r2] "=&r" (src_ptr_r2)
+                : [vector2b] "r" (vector2b), [vector3b] "r" (vector3b),
+                  [vector4a] "r" (vector4a), [src_ptr] "r" (src_ptr)
+            );
+
+            src_ptr_l1 = src_ptr[-6];
+            src_ptr_0  = src_ptr[2];
+            src_ptr_r1 = src_ptr[10];
+            src_ptr_r2 = src_ptr[18];
+
+            __asm__ __volatile__ (
+                "mtlo           %[vector4a],    $ac0                            \n\t"
+                "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
+                "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
+                "dpau.h.qbr     $ac0,           %[src_ptr_0],   %[vector2b]     \n\t"
+                "dpsu.h.qbr     $ac0,           %[src_ptr_l1],  %[vector3b]     \n\t"
+                "extp           %[Temp2],       $ac3,           9               \n\t"
+
+                : [Temp2] "=r" (Temp2)
+                : [vector2b] "r" (vector2b), [vector3b] "r" (vector3b),
+                  [src_ptr_l1] "r" (src_ptr_l1), [src_ptr_0] "r" (src_ptr_0),
+                  [src_ptr_r1] "r" (src_ptr_r1), [src_ptr_r2] "r" (src_ptr_r2),
+                  [vector4a] "r" (vector4a)
+            );
+
+            src_ptr_l1 = src_ptr[-5];
+            src_ptr_0  = src_ptr[3];
+            src_ptr_r1 = src_ptr[11];
+            src_ptr_r2 = src_ptr[19];
+
+            __asm__ __volatile__ (
+                "mtlo           %[vector4a],    $ac1                            \n\t"
+                "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
+                "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
+                "dpau.h.qbr     $ac1,           %[src_ptr_0],   %[vector2b]     \n\t"
+                "dpsu.h.qbr     $ac1,           %[src_ptr_l1],  %[vector3b]     \n\t"
+                "extp           %[Temp3],       $ac0,           9               \n\t"
+
+                : [Temp3] "=r" (Temp3)
+                : [vector2b] "r" (vector2b), [vector3b] "r" (vector3b),
+                  [src_ptr_l1] "r" (src_ptr_l1), [src_ptr_0] "r" (src_ptr_0),
+                  [src_ptr_r1] "r" (src_ptr_r1), [src_ptr_r2] "r" (src_ptr_r2),
+                  [vector4a] "r" (vector4a)
+            );
+
+            src_ptr_l1 = src_ptr[-4];
+            src_ptr_0  = src_ptr[4];
+            src_ptr_r1 = src_ptr[12];
+            src_ptr_r2 = src_ptr[20];
+
+            __asm__ __volatile__ (
+                "mtlo           %[vector4a],    $ac2                            \n\t"
+                "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
+                "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
+                "dpau.h.qbr     $ac2,           %[src_ptr_0],   %[vector2b]     \n\t"
+                "dpsu.h.qbr     $ac2,           %[src_ptr_l1],  %[vector3b]     \n\t"
+                "extp           %[Temp4],       $ac1,           9               \n\t"
+
+                : [Temp4] "=r" (Temp4)
+                : [vector2b] "r" (vector2b), [vector3b] "r" (vector3b),
+                  [src_ptr_l1] "r" (src_ptr_l1), [src_ptr_0] "r" (src_ptr_0),
+                  [src_ptr_r1] "r" (src_ptr_r1), [src_ptr_r2] "r" (src_ptr_r2),
+                  [vector4a] "r" (vector4a)
+            );
+
+            src_ptr_l1 = src_ptr[-3];
+            src_ptr_0  = src_ptr[5];
+            src_ptr_r1 = src_ptr[13];
+            src_ptr_r2 = src_ptr[21];
+
+            __asm__ __volatile__ (
+                "mtlo           %[vector4a],    $ac3                            \n\t"
+                "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
+                "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
+                "dpau.h.qbr     $ac3,           %[src_ptr_0],   %[vector2b]     \n\t"
+                "dpsu.h.qbr     $ac3,           %[src_ptr_l1],  %[vector3b]     \n\t"
+                "extp           %[Temp5],       $ac2,           9               \n\t"
+
+                : [Temp5] "=&r" (Temp5)
+                : [vector2b] "r" (vector2b), [vector3b] "r" (vector3b),
+                  [src_ptr_l1] "r" (src_ptr_l1), [src_ptr_0] "r" (src_ptr_0),
+                  [src_ptr_r1] "r" (src_ptr_r1), [src_ptr_r2] "r" (src_ptr_r2),
+                  [vector4a] "r" (vector4a)
+            );
+
+            src_ptr_l1 = src_ptr[-2];
+            src_ptr_0  = src_ptr[6];
+            src_ptr_r1 = src_ptr[14];
+            src_ptr_r2 = src_ptr[22];
+
+            __asm__ __volatile__ (
+                "mtlo           %[vector4a],    $ac0                            \n\t"
+                "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
+                "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
+                "dpau.h.qbr     $ac0,           %[src_ptr_0],   %[vector2b]     \n\t"
+                "dpsu.h.qbr     $ac0,           %[src_ptr_l1],  %[vector3b]     \n\t"
+                "extp           %[Temp6],       $ac3,           9               \n\t"
+
+                : [Temp6] "=r" (Temp6)
+                : [vector2b] "r" (vector2b), [vector3b] "r" (vector3b),
+                  [src_ptr_l1] "r" (src_ptr_l1), [src_ptr_0] "r" (src_ptr_0),
+                  [src_ptr_r1] "r" (src_ptr_r1), [src_ptr_r2] "r" (src_ptr_r2),
+                  [vector4a] "r" (vector4a)
+            );
+
+            src_ptr_l1 = src_ptr[-1];
+            src_ptr_0  = src_ptr[7];
+            src_ptr_r1 = src_ptr[15];
+            src_ptr_r2 = src_ptr[23];
+
+            __asm__ __volatile__ (
+                "mtlo           %[vector4a],    $ac1                            \n\t"
+                "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
+                "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
+                "dpau.h.qbr     $ac1,           %[src_ptr_0],   %[vector2b]     \n\t"
+                "dpsu.h.qbr     $ac1,           %[src_ptr_l1],  %[vector3b]     \n\t"
+                "extp           %[Temp7],       $ac0,           9               \n\t"
+                "extp           %[Temp8],       $ac1,           9               \n\t"
+
+                : [Temp7] "=&r" (Temp7), [Temp8] "=r" (Temp8)
+                : [vector2b] "r" (vector2b), [vector3b] "r" (vector3b),
+                  [src_ptr_l1] "r" (src_ptr_l1), [src_ptr_0] "r" (src_ptr_0),
+                  [src_ptr_r1] "r" (src_ptr_r1), [src_ptr_r2] "r" (src_ptr_r2),
+                  [vector4a] "r" (vector4a)
+            );
+
+            /* clamp and store results */
+            output_ptr[0] = cm[Temp1];
+            output_ptr[1] = cm[Temp2];
+            output_ptr[2] = cm[Temp3];
+            output_ptr[3] = cm[Temp4];
+            output_ptr[4] = cm[Temp5];
+            output_ptr[5] = cm[Temp6];
+            output_ptr[6] = cm[Temp7];
+            output_ptr[7] = cm[Temp8];
+
+            src_ptr += 8;
+            output_ptr += output_pitch;
+        }
+    }
+}
+
+
+void vp8_filter_block2d_second_pass161
+(
+    unsigned char *RESTRICT src_ptr,
+    unsigned char *RESTRICT output_ptr,
+    int output_pitch,
+    const unsigned short *vp8_filter
+)
+{
+    unsigned int i, j;
+
+    int Temp1, Temp2, Temp3, Temp4, Temp5, Temp6, Temp7, Temp8;
+    unsigned int vector4a;
+    unsigned int vector1b, vector2b, vector3b;
+
+    unsigned char src_ptr_l2;
+    unsigned char src_ptr_l1;
+    unsigned char src_ptr_0;
+    unsigned char src_ptr_r1;
+    unsigned char src_ptr_r2;
+    unsigned char src_ptr_r3;
+    unsigned char *cm = ff_cropTbl + CROP_WIDTH;
+
+    vector4a = 64;
+
+    vector1b = vp8_filter[0];
+    vector2b = vp8_filter[2];
+    vector3b = vp8_filter[1];
+
+    if (vector1b == 0)
+    {
+        /* 4 tap filter */
+
+        /* prefetch src_ptr data to cache memory */
+        prefetch_load(src_ptr + 16);
+
+        for (i = 16; i--;)
+        {
+            /* unrolling for loop */
+            for (j = 0; j < 16; j += 8)
+            {
+                /* apply filter with vector pairs */
+                __asm__ __volatile__ (
+                    "lbu            %[src_ptr_l1],  -16(%[src_ptr])                 \n\t"
+                    "lbu            %[src_ptr_0],   0(%[src_ptr])                   \n\t"
+                    "lbu            %[src_ptr_r1],  16(%[src_ptr])                  \n\t"
+                    "lbu            %[src_ptr_r2],  32(%[src_ptr])                  \n\t"
+                    "mtlo           %[vector4a],    $ac2                            \n\t"
+                    "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
+                    "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
+                    "dpau.h.qbr     $ac2,           %[src_ptr_0],   %[vector2b]     \n\t"
+                    "dpsu.h.qbr     $ac2,           %[src_ptr_l1],  %[vector3b]     \n\t"
+
+                    "lbu            %[src_ptr_l1],  -15(%[src_ptr])                 \n\t"
+                    "lbu            %[src_ptr_0],   1(%[src_ptr])                   \n\t"
+                    "lbu            %[src_ptr_r1],  17(%[src_ptr])                  \n\t"
+                    "lbu            %[src_ptr_r2],  33(%[src_ptr])                  \n\t"
+                    "mtlo           %[vector4a],    $ac3                            \n\t"
+                    "extp           %[Temp1],       $ac2,           9               \n\t"
+
+                    "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
+                    "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
+                    "dpau.h.qbr     $ac3,           %[src_ptr_0],   %[vector2b]     \n\t"
+                    "dpsu.h.qbr     $ac3,           %[src_ptr_l1],  %[vector3b]     \n\t"
+
+                    "lbu            %[src_ptr_l1],  -14(%[src_ptr])                 \n\t"
+                    "lbu            %[src_ptr_0],   2(%[src_ptr])                   \n\t"
+                    "lbu            %[src_ptr_r1],  18(%[src_ptr])                  \n\t"
+                    "lbu            %[src_ptr_r2],  34(%[src_ptr])                  \n\t"
+                    "mtlo           %[vector4a],    $ac1                            \n\t"
+                    "extp           %[Temp2],       $ac3,           9               \n\t"
+
+                    "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
+                    "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
+                    "dpau.h.qbr     $ac1,           %[src_ptr_0],   %[vector2b]     \n\t"
+                    "dpsu.h.qbr     $ac1,           %[src_ptr_l1],  %[vector3b]     \n\t"
+
+                    "lbu            %[src_ptr_l1],  -13(%[src_ptr])                 \n\t"
+                    "lbu            %[src_ptr_0],   3(%[src_ptr])                   \n\t"
+                    "lbu            %[src_ptr_r1],  19(%[src_ptr])                  \n\t"
+                    "lbu            %[src_ptr_r2],  35(%[src_ptr])                  \n\t"
+                    "mtlo           %[vector4a],    $ac3                            \n\t"
+                    "extp           %[Temp3],       $ac1,           9               \n\t"
+
+                    "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
+                    "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
+                    "dpau.h.qbr     $ac3,           %[src_ptr_0],   %[vector2b]     \n\t"
+                    "dpsu.h.qbr     $ac3,           %[src_ptr_l1],  %[vector3b]     \n\t"
+
+                    "lbu            %[src_ptr_l1],  -12(%[src_ptr])                 \n\t"
+                    "lbu            %[src_ptr_0],   4(%[src_ptr])                   \n\t"
+                    "lbu            %[src_ptr_r1],  20(%[src_ptr])                  \n\t"
+                    "lbu            %[src_ptr_r2],  36(%[src_ptr])                  \n\t"
+                    "mtlo           %[vector4a],    $ac2                            \n\t"
+                    "extp           %[Temp4],       $ac3,           9               \n\t"
+
+                    "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
+                    "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
+                    "dpau.h.qbr     $ac2,           %[src_ptr_0],   %[vector2b]     \n\t"
+                    "dpsu.h.qbr     $ac2,           %[src_ptr_l1],  %[vector3b]     \n\t"
+
+                    "lbu            %[src_ptr_l1],  -11(%[src_ptr])                 \n\t"
+                    "lbu            %[src_ptr_0],   5(%[src_ptr])                   \n\t"
+                    "lbu            %[src_ptr_r1],  21(%[src_ptr])                  \n\t"
+                    "lbu            %[src_ptr_r2],  37(%[src_ptr])                  \n\t"
+                    "mtlo           %[vector4a],    $ac3                            \n\t"
+                    "extp           %[Temp5],       $ac2,           9               \n\t"
+
+                    "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
+                    "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
+                    "dpau.h.qbr     $ac3,           %[src_ptr_0],   %[vector2b]     \n\t"
+                    "dpsu.h.qbr     $ac3,           %[src_ptr_l1],  %[vector3b]     \n\t"
+
+                    "lbu            %[src_ptr_l1],  -10(%[src_ptr])                 \n\t"
+                    "lbu            %[src_ptr_0],   6(%[src_ptr])                   \n\t"
+                    "lbu            %[src_ptr_r1],  22(%[src_ptr])                  \n\t"
+                    "lbu            %[src_ptr_r2],  38(%[src_ptr])                  \n\t"
+                    "mtlo           %[vector4a],    $ac1                            \n\t"
+                    "extp           %[Temp6],       $ac3,           9               \n\t"
+
+                    "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
+                    "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
+                    "dpau.h.qbr     $ac1,           %[src_ptr_0],   %[vector2b]     \n\t"
+                    "dpsu.h.qbr     $ac1,           %[src_ptr_l1],  %[vector3b]     \n\t"
+
+                    "lbu            %[src_ptr_l1],  -9(%[src_ptr])                  \n\t"
+                    "lbu            %[src_ptr_0],   7(%[src_ptr])                   \n\t"
+                    "lbu            %[src_ptr_r1],  23(%[src_ptr])                  \n\t"
+                    "lbu            %[src_ptr_r2],  39(%[src_ptr])                  \n\t"
+                    "mtlo           %[vector4a],    $ac3                            \n\t"
+                    "extp           %[Temp7],       $ac1,           9               \n\t"
+
+                    "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
+                    "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
+                    "dpau.h.qbr     $ac3,           %[src_ptr_0],   %[vector2b]     \n\t"
+                    "dpsu.h.qbr     $ac3,           %[src_ptr_l1],  %[vector3b]     \n\t"
+                    "extp           %[Temp8],       $ac3,           9               \n\t"
+
+                    : [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
+                      [Temp3] "=&r" (Temp3), [Temp4] "=&r" (Temp4),
+                      [Temp5] "=&r" (Temp5), [Temp6] "=&r" (Temp6),
+                      [Temp7] "=&r" (Temp7), [Temp8] "=r" (Temp8),
+                      [src_ptr_l1] "=&r" (src_ptr_l1), [src_ptr_0] "=&r" (src_ptr_0),
+                      [src_ptr_r1] "=&r" (src_ptr_r1), [src_ptr_r2] "=&r" (src_ptr_r2)
+                    : [vector2b] "r" (vector2b), [vector3b] "r" (vector3b),
+                      [vector4a] "r" (vector4a), [src_ptr] "r" (src_ptr)
+                );
+
+                /* clamp and store results */
+                output_ptr[j] = cm[Temp1];
+                output_ptr[j + 1] = cm[Temp2];
+                output_ptr[j + 2] = cm[Temp3];
+                output_ptr[j + 3] = cm[Temp4];
+                output_ptr[j + 4] = cm[Temp5];
+                output_ptr[j + 5] = cm[Temp6];
+                output_ptr[j + 6] = cm[Temp7];
+                output_ptr[j + 7] = cm[Temp8];
+
+                src_ptr += 8;
+            }
+
+            output_ptr += output_pitch;
+        }
+    }
+    else
+    {
+        /* 6 tap filter */
+
+        /* prefetch src_ptr data to cache memory */
+        prefetch_load(src_ptr + 16);
+
+        /* unroll for loop */
+        for (i = 16; i--;)
+        {
+            /* apply filter with vector pairs */
+            __asm__ __volatile__ (
+                "lbu            %[src_ptr_l2],  -32(%[src_ptr])                 \n\t"
+                "lbu            %[src_ptr_l1],  -16(%[src_ptr])                 \n\t"
+                "lbu            %[src_ptr_0],   0(%[src_ptr])                   \n\t"
+                "lbu            %[src_ptr_r1],  16(%[src_ptr])                  \n\t"
+                "lbu            %[src_ptr_r2],  32(%[src_ptr])                  \n\t"
+                "lbu            %[src_ptr_r3],  48(%[src_ptr])                  \n\t"
+                "mtlo           %[vector4a],    $ac2                            \n\t"
+
+                "append         %[src_ptr_l2],  %[src_ptr_r3],  8               \n\t"
+                "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
+                "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
+                "dpau.h.qbr     $ac2,           %[src_ptr_l2],  %[vector1b]     \n\t"
+                "dpau.h.qbr     $ac2,           %[src_ptr_0],   %[vector2b]     \n\t"
+                "dpsu.h.qbr     $ac2,           %[src_ptr_l1],  %[vector3b]     \n\t"
+
+                "lbu            %[src_ptr_l2],  -31(%[src_ptr])                 \n\t"
+                "lbu            %[src_ptr_l1],  -15(%[src_ptr])                 \n\t"
+                "lbu            %[src_ptr_0],   1(%[src_ptr])                   \n\t"
+                "lbu            %[src_ptr_r1],  17(%[src_ptr])                  \n\t"
+                "lbu            %[src_ptr_r2],  33(%[src_ptr])                  \n\t"
+                "lbu            %[src_ptr_r3],  49(%[src_ptr])                  \n\t"
+                "mtlo           %[vector4a],    $ac0                            \n\t"
+                "extp           %[Temp1],       $ac2,           9               \n\t"
+
+                "append         %[src_ptr_l2],  %[src_ptr_r3],  8               \n\t"
+                "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
+                "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
+                "dpau.h.qbr     $ac0,           %[src_ptr_l2],  %[vector1b]     \n\t"
+                "dpau.h.qbr     $ac0,           %[src_ptr_0],   %[vector2b]     \n\t"
+                "dpsu.h.qbr     $ac0,           %[src_ptr_l1],  %[vector3b]     \n\t"
+
+                "lbu            %[src_ptr_l2],  -30(%[src_ptr])                 \n\t"
+                "lbu            %[src_ptr_l1],  -14(%[src_ptr])                 \n\t"
+                "lbu            %[src_ptr_0],   2(%[src_ptr])                   \n\t"
+                "lbu            %[src_ptr_r1],  18(%[src_ptr])                  \n\t"
+                "lbu            %[src_ptr_r2],  34(%[src_ptr])                  \n\t"
+                "lbu            %[src_ptr_r3],  50(%[src_ptr])                  \n\t"
+                "mtlo           %[vector4a],    $ac1                            \n\t"
+                "extp           %[Temp2],       $ac0,           9               \n\t"
+
+                "append         %[src_ptr_l2],  %[src_ptr_r3],  8               \n\t"
+                "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
+                "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
+                "dpau.h.qbr     $ac1,           %[src_ptr_l2],  %[vector1b]     \n\t"
+                "dpau.h.qbr     $ac1,           %[src_ptr_0],   %[vector2b]     \n\t"
+                "dpsu.h.qbr     $ac1,           %[src_ptr_l1],  %[vector3b]     \n\t"
+
+                "lbu            %[src_ptr_l2],  -29(%[src_ptr])                 \n\t"
+                "lbu            %[src_ptr_l1],  -13(%[src_ptr])                 \n\t"
+                "lbu            %[src_ptr_0],   3(%[src_ptr])                   \n\t"
+                "lbu            %[src_ptr_r1],  19(%[src_ptr])                  \n\t"
+                "lbu            %[src_ptr_r2],  35(%[src_ptr])                  \n\t"
+                "lbu            %[src_ptr_r3],  51(%[src_ptr])                  \n\t"
+                "mtlo           %[vector4a],    $ac3                            \n\t"
+                "extp           %[Temp3],       $ac1,           9               \n\t"
+
+                "append         %[src_ptr_l2],  %[src_ptr_r3],  8               \n\t"
+                "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
+                "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
+                "dpau.h.qbr     $ac3,           %[src_ptr_l2],  %[vector1b]     \n\t"
+                "dpau.h.qbr     $ac3,           %[src_ptr_0],   %[vector2b]     \n\t"
+                "dpsu.h.qbr     $ac3,           %[src_ptr_l1],  %[vector3b]     \n\t"
+
+                "lbu            %[src_ptr_l2],  -28(%[src_ptr])                 \n\t"
+                "lbu            %[src_ptr_l1],  -12(%[src_ptr])                 \n\t"
+                "lbu            %[src_ptr_0],   4(%[src_ptr])                   \n\t"
+                "lbu            %[src_ptr_r1],  20(%[src_ptr])                  \n\t"
+                "lbu            %[src_ptr_r2],  36(%[src_ptr])                  \n\t"
+                "lbu            %[src_ptr_r3],  52(%[src_ptr])                  \n\t"
+                "mtlo           %[vector4a],    $ac2                            \n\t"
+                "extp           %[Temp4],       $ac3,           9               \n\t"
+
+                "append         %[src_ptr_l2],  %[src_ptr_r3],  8               \n\t"
+                "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
+                "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
+                "dpau.h.qbr     $ac2,           %[src_ptr_l2],  %[vector1b]     \n\t"
+                "dpau.h.qbr     $ac2,           %[src_ptr_0],   %[vector2b]     \n\t"
+                "dpsu.h.qbr     $ac2,           %[src_ptr_l1],  %[vector3b]     \n\t"
+
+                "lbu            %[src_ptr_l2],  -27(%[src_ptr])                 \n\t"
+                "lbu            %[src_ptr_l1],  -11(%[src_ptr])                 \n\t"
+                "lbu            %[src_ptr_0],   5(%[src_ptr])                   \n\t"
+                "lbu            %[src_ptr_r1],  21(%[src_ptr])                  \n\t"
+                "lbu            %[src_ptr_r2],  37(%[src_ptr])                  \n\t"
+                "lbu            %[src_ptr_r3],  53(%[src_ptr])                  \n\t"
+                "mtlo           %[vector4a],    $ac0                            \n\t"
+                "extp           %[Temp5],       $ac2,           9               \n\t"
+
+                "append         %[src_ptr_l2],  %[src_ptr_r3],  8               \n\t"
+                "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
+                "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
+                "dpau.h.qbr     $ac0,           %[src_ptr_l2],  %[vector1b]     \n\t"
+                "dpau.h.qbr     $ac0,           %[src_ptr_0],   %[vector2b]     \n\t"
+                "dpsu.h.qbr     $ac0,           %[src_ptr_l1],  %[vector3b]     \n\t"
+
+                "lbu            %[src_ptr_l2],  -26(%[src_ptr])                 \n\t"
+                "lbu            %[src_ptr_l1],  -10(%[src_ptr])                 \n\t"
+                "lbu            %[src_ptr_0],   6(%[src_ptr])                   \n\t"
+                "lbu            %[src_ptr_r1],  22(%[src_ptr])                  \n\t"
+                "lbu            %[src_ptr_r2],  38(%[src_ptr])                  \n\t"
+                "lbu            %[src_ptr_r3],  54(%[src_ptr])                  \n\t"
+                "mtlo           %[vector4a],    $ac1                            \n\t"
+                "extp           %[Temp6],       $ac0,           9               \n\t"
+
+                "append         %[src_ptr_l2],  %[src_ptr_r3],  8               \n\t"
+                "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
+                "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
+                "dpau.h.qbr     $ac1,           %[src_ptr_l2],  %[vector1b]     \n\t"
+                "dpau.h.qbr     $ac1,           %[src_ptr_0],   %[vector2b]     \n\t"
+                "dpsu.h.qbr     $ac1,           %[src_ptr_l1],  %[vector3b]     \n\t"
+
+                "lbu            %[src_ptr_l2],  -25(%[src_ptr])                 \n\t"
+                "lbu            %[src_ptr_l1],  -9(%[src_ptr])                  \n\t"
+                "lbu            %[src_ptr_0],   7(%[src_ptr])                   \n\t"
+                "lbu            %[src_ptr_r1],  23(%[src_ptr])                  \n\t"
+                "lbu            %[src_ptr_r2],  39(%[src_ptr])                  \n\t"
+                "lbu            %[src_ptr_r3],  55(%[src_ptr])                  \n\t"
+                "mtlo           %[vector4a],    $ac3                            \n\t"
+                "extp           %[Temp7],       $ac1,           9               \n\t"
+
+                "append         %[src_ptr_l2],  %[src_ptr_r3],  8               \n\t"
+                "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
+                "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
+                "dpau.h.qbr     $ac3,           %[src_ptr_l2],  %[vector1b]     \n\t"
+                "dpau.h.qbr     $ac3,           %[src_ptr_0],   %[vector2b]     \n\t"
+                "dpsu.h.qbr     $ac3,           %[src_ptr_l1],  %[vector3b]     \n\t"
+                "extp           %[Temp8],       $ac3,           9               \n\t"
+
+                : [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
+                  [Temp3] "=&r" (Temp3), [Temp4] "=&r" (Temp4),
+                  [Temp5] "=&r" (Temp5), [Temp6] "=&r" (Temp6),
+                  [Temp7] "=&r" (Temp7), [Temp8] "=r" (Temp8),
+                  [src_ptr_l1] "=&r" (src_ptr_l1), [src_ptr_0] "=&r" (src_ptr_0),
+                  [src_ptr_r1] "=&r" (src_ptr_r1), [src_ptr_r2] "=&r" (src_ptr_r2),
+                  [src_ptr_l2] "=&r" (src_ptr_l2),[src_ptr_r3] "=&r" (src_ptr_r3)
+                : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
+                  [vector3b] "r" (vector3b), [vector4a] "r" (vector4a),
+                  [src_ptr] "r" (src_ptr)
+            );
+
+            /* clamp and store results */
+            output_ptr[0] = cm[Temp1];
+            output_ptr[1] = cm[Temp2];
+            output_ptr[2] = cm[Temp3];
+            output_ptr[3] = cm[Temp4];
+            output_ptr[4] = cm[Temp5];
+            output_ptr[5] = cm[Temp6];
+            output_ptr[6] = cm[Temp7];
+            output_ptr[7] = cm[Temp8];
+
+            /* apply filter with vectors pairs */
+            __asm__ __volatile__ (
+                "lbu            %[src_ptr_l2],  -24(%[src_ptr])                 \n\t"
+                "lbu            %[src_ptr_l1],  -8(%[src_ptr])                  \n\t"
+                "lbu            %[src_ptr_0],   8(%[src_ptr])                   \n\t"
+                "lbu            %[src_ptr_r1],  24(%[src_ptr])                  \n\t"
+                "lbu            %[src_ptr_r2],  40(%[src_ptr])                  \n\t"
+                "lbu            %[src_ptr_r3],  56(%[src_ptr])                  \n\t"
+                "mtlo           %[vector4a],    $ac2                            \n\t"
+
+                "append         %[src_ptr_l2],  %[src_ptr_r3],  8               \n\t"
+                "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
+                "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
+                "dpau.h.qbr     $ac2,           %[src_ptr_l2],  %[vector1b]     \n\t"
+                "dpau.h.qbr     $ac2,           %[src_ptr_0],   %[vector2b]     \n\t"
+                "dpsu.h.qbr     $ac2,           %[src_ptr_l1],  %[vector3b]     \n\t"
+
+                "lbu            %[src_ptr_l2],  -23(%[src_ptr])                 \n\t"
+                "lbu            %[src_ptr_l1],  -7(%[src_ptr])                  \n\t"
+                "lbu            %[src_ptr_0],   9(%[src_ptr])                   \n\t"
+                "lbu            %[src_ptr_r1],  25(%[src_ptr])                  \n\t"
+                "lbu            %[src_ptr_r2],  41(%[src_ptr])                  \n\t"
+                "lbu            %[src_ptr_r3],  57(%[src_ptr])                  \n\t"
+                "mtlo           %[vector4a],    $ac0                            \n\t"
+                "extp           %[Temp1],       $ac2,           9               \n\t"
+
+                "append         %[src_ptr_l2],  %[src_ptr_r3],  8               \n\t"
+                "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
+                "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
+                "dpau.h.qbr     $ac0,           %[src_ptr_l2],  %[vector1b]     \n\t"
+                "dpau.h.qbr     $ac0,           %[src_ptr_0],   %[vector2b]     \n\t"
+                "dpsu.h.qbr     $ac0,           %[src_ptr_l1],  %[vector3b]     \n\t"
+
+                "lbu            %[src_ptr_l2],  -22(%[src_ptr])                 \n\t"
+                "lbu            %[src_ptr_l1],  -6(%[src_ptr])                  \n\t"
+                "lbu            %[src_ptr_0],   10(%[src_ptr])                  \n\t"
+                "lbu            %[src_ptr_r1],  26(%[src_ptr])                  \n\t"
+                "lbu            %[src_ptr_r2],  42(%[src_ptr])                  \n\t"
+                "lbu            %[src_ptr_r3],  58(%[src_ptr])                  \n\t"
+                "mtlo           %[vector4a],    $ac1                            \n\t"
+                "extp           %[Temp2],       $ac0,           9               \n\t"
+
+                "append         %[src_ptr_l2],  %[src_ptr_r3],  8               \n\t"
+                "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
+                "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
+                "dpau.h.qbr     $ac1,           %[src_ptr_l2],  %[vector1b]     \n\t"
+                "dpau.h.qbr     $ac1,           %[src_ptr_0],   %[vector2b]     \n\t"
+                "dpsu.h.qbr     $ac1,           %[src_ptr_l1],  %[vector3b]     \n\t"
+
+                "lbu            %[src_ptr_l2],  -21(%[src_ptr])                 \n\t"
+                "lbu            %[src_ptr_l1],  -5(%[src_ptr])                  \n\t"
+                "lbu            %[src_ptr_0],   11(%[src_ptr])                  \n\t"
+                "lbu            %[src_ptr_r1],  27(%[src_ptr])                  \n\t"
+                "lbu            %[src_ptr_r2],  43(%[src_ptr])                  \n\t"
+                "lbu            %[src_ptr_r3],  59(%[src_ptr])                  \n\t"
+                "mtlo           %[vector4a],    $ac3                            \n\t"
+                "extp           %[Temp3],       $ac1,           9               \n\t"
+
+                "append         %[src_ptr_l2],  %[src_ptr_r3],  8               \n\t"
+                "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
+                "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
+                "dpau.h.qbr     $ac3,           %[src_ptr_l2],  %[vector1b]     \n\t"
+                "dpau.h.qbr     $ac3,           %[src_ptr_0],   %[vector2b]     \n\t"
+                "dpsu.h.qbr     $ac3,           %[src_ptr_l1],  %[vector3b]     \n\t"
+
+                "lbu            %[src_ptr_l2],  -20(%[src_ptr])                 \n\t"
+                "lbu            %[src_ptr_l1],  -4(%[src_ptr])                  \n\t"
+                "lbu            %[src_ptr_0],   12(%[src_ptr])                  \n\t"
+                "lbu            %[src_ptr_r1],  28(%[src_ptr])                  \n\t"
+                "lbu            %[src_ptr_r2],  44(%[src_ptr])                  \n\t"
+                "lbu            %[src_ptr_r3],  60(%[src_ptr])                  \n\t"
+                "mtlo           %[vector4a],    $ac2                            \n\t"
+                "extp           %[Temp4],       $ac3,           9               \n\t"
+
+                "append         %[src_ptr_l2],  %[src_ptr_r3],  8               \n\t"
+                "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
+                "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
+                "dpau.h.qbr     $ac2,           %[src_ptr_l2],  %[vector1b]     \n\t"
+                "dpau.h.qbr     $ac2,           %[src_ptr_0],   %[vector2b]     \n\t"
+                "dpsu.h.qbr     $ac2,           %[src_ptr_l1],  %[vector3b]     \n\t"
+
+                "lbu            %[src_ptr_l2],  -19(%[src_ptr])                 \n\t"
+                "lbu            %[src_ptr_l1],  -3(%[src_ptr])                  \n\t"
+                "lbu            %[src_ptr_0],   13(%[src_ptr])                  \n\t"
+                "lbu            %[src_ptr_r1],  29(%[src_ptr])                  \n\t"
+                "lbu            %[src_ptr_r2],  45(%[src_ptr])                  \n\t"
+                "lbu            %[src_ptr_r3],  61(%[src_ptr])                  \n\t"
+                "mtlo           %[vector4a],    $ac0                            \n\t"
+                "extp           %[Temp5],       $ac2,           9               \n\t"
+
+                "append         %[src_ptr_l2],  %[src_ptr_r3],  8               \n\t"
+                "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
+                "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
+                "dpau.h.qbr     $ac0,           %[src_ptr_l2],  %[vector1b]     \n\t"
+                "dpau.h.qbr     $ac0,           %[src_ptr_0],   %[vector2b]     \n\t"
+                "dpsu.h.qbr     $ac0,           %[src_ptr_l1],  %[vector3b]     \n\t"
+
+                "lbu            %[src_ptr_l2],  -18(%[src_ptr])                 \n\t"
+                "lbu            %[src_ptr_l1],  -2(%[src_ptr])                  \n\t"
+                "lbu            %[src_ptr_0],   14(%[src_ptr])                  \n\t"
+                "lbu            %[src_ptr_r1],  30(%[src_ptr])                  \n\t"
+                "lbu            %[src_ptr_r2],  46(%[src_ptr])                  \n\t"
+                "lbu            %[src_ptr_r3],  62(%[src_ptr])                  \n\t"
+                "mtlo           %[vector4a],    $ac1                            \n\t"
+                "extp           %[Temp6],       $ac0,           9               \n\t"
+
+                "append         %[src_ptr_l2],  %[src_ptr_r3],  8               \n\t"
+                "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
+                "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
+                "dpau.h.qbr     $ac1,           %[src_ptr_l2],  %[vector1b]     \n\t"
+                "dpau.h.qbr     $ac1,           %[src_ptr_0],   %[vector2b]     \n\t"
+                "dpsu.h.qbr     $ac1,           %[src_ptr_l1],  %[vector3b]     \n\t"
+
+                "lbu            %[src_ptr_l2],  -17(%[src_ptr])                 \n\t"
+                "lbu            %[src_ptr_l1],  -1(%[src_ptr])                  \n\t"
+                "lbu            %[src_ptr_0],   15(%[src_ptr])                  \n\t"
+                "lbu            %[src_ptr_r1],  31(%[src_ptr])                  \n\t"
+                "lbu            %[src_ptr_r2],  47(%[src_ptr])                  \n\t"
+                "lbu            %[src_ptr_r3],  63(%[src_ptr])                  \n\t"
+                "mtlo           %[vector4a],    $ac3                            \n\t"
+                "extp           %[Temp7],       $ac1,           9               \n\t"
+
+                "append         %[src_ptr_l2],  %[src_ptr_r3],  8               \n\t"
+                "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
+                "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
+                "dpau.h.qbr     $ac3,           %[src_ptr_l2],  %[vector1b]     \n\t"
+                "dpau.h.qbr     $ac3,           %[src_ptr_0],   %[vector2b]     \n\t"
+                "dpsu.h.qbr     $ac3,           %[src_ptr_l1],  %[vector3b]     \n\t"
+                "extp           %[Temp8],       $ac3,           9               \n\t"
+
+                : [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
+                  [Temp3] "=&r" (Temp3), [Temp4] "=&r" (Temp4),
+                  [Temp5] "=&r" (Temp5), [Temp6] "=&r" (Temp6),
+                  [Temp7] "=&r" (Temp7), [Temp8] "=r" (Temp8),
+                  [src_ptr_l1] "=&r" (src_ptr_l1), [src_ptr_0] "=&r" (src_ptr_0),
+                  [src_ptr_r1] "=&r" (src_ptr_r1), [src_ptr_r2] "=&r" (src_ptr_r2),
+                  [src_ptr_l2] "=&r" (src_ptr_l2), [src_ptr_r3] "=&r" (src_ptr_r3)
+                : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
+                  [vector3b] "r" (vector3b), [vector4a] "r" (vector4a),
+                  [src_ptr] "r" (src_ptr)
+            );
+
+            src_ptr += 16;
+            output_ptr[8] = cm[Temp1];
+            output_ptr[9] = cm[Temp2];
+            output_ptr[10] = cm[Temp3];
+            output_ptr[11] = cm[Temp4];
+            output_ptr[12] = cm[Temp5];
+            output_ptr[13] = cm[Temp6];
+            output_ptr[14] = cm[Temp7];
+            output_ptr[15] = cm[Temp8];
+
+            output_ptr += output_pitch;
+        }
+    }
+}
+
+
+void vp8_sixtap_predict4x4_dspr2
+(
+    unsigned char *RESTRICT src_ptr,
+    int   src_pixels_per_line,
+    int  xoffset,
+    int  yoffset,
+    unsigned char *RESTRICT dst_ptr,
+    int dst_pitch
+)
+{
+    unsigned char FData[9 * 4]; /* Temp data buffer used in filtering */
+    unsigned int pos = 16;
+
+    /* bit position for extract from acc; written to the DSP control register */
+    __asm__ __volatile__ (
+        "wrdsp      %[pos],     1           \n\t"
+        :
+        : [pos] "r" (pos)
+    );
+
+    if (yoffset)
+    {
+        /* First filter 1-D horizontally... */
+        vp8_filter_block2d_first_pass_4(src_ptr - (2 * src_pixels_per_line), FData,
+                                        src_pixels_per_line, 9, xoffset, 4);
+        /* then filter vertically... */
+        vp8_filter_block2d_second_pass4(FData + 8, dst_ptr, dst_pitch, yoffset);
+    }
+    else
+        /* if (yoffset == 0) vp8_filter_block2d_first_pass saves data to dst_ptr */
+        vp8_filter_block2d_first_pass_4(src_ptr, dst_ptr, src_pixels_per_line,
+                                        4, xoffset, dst_pitch);
+}
+
+
+void vp8_sixtap_predict8x8_dspr2
+(
+    unsigned char   *RESTRICT src_ptr,
+    int  src_pixels_per_line,
+    int  xoffset,
+    int  yoffset,
+    unsigned char *RESTRICT dst_ptr,
+    int  dst_pitch
+)
+{
+
+    unsigned char FData[13 * 8]; /* Temp data buffer used in filtering */
+    unsigned int pos, Temp1, Temp2;
+
+    pos = 16;
+
+    /* bit position for extract from acc; written to the DSP control register */
+    __asm__ __volatile__ (
+        "wrdsp      %[pos],     1               \n\t"
+        :
+        : [pos] "r" (pos)
+    );
+
+    if (yoffset)
+    {
+
+        src_ptr = src_ptr - (2 * src_pixels_per_line);
+
+        if (xoffset)
+            /* filter 1-D horizontally... */
+            vp8_filter_block2d_first_pass_8_all(src_ptr, FData, src_pixels_per_line,
+                                                13, xoffset, 8);
+
+        else
+        {
+            /* prefetch src_ptr data to cache memory */
+            prefetch_load(src_ptr + 2 * src_pixels_per_line);
+
+            /* xoffset == 0: copy 13 rows of 8 source pixels into FData unchanged */
+            __asm__ __volatile__ (
+                "ulw    %[Temp1],   0(%[src_ptr])                           \n\t"
+                "ulw    %[Temp2],   4(%[src_ptr])                           \n\t"
+                "sw     %[Temp1],   0(%[FData])                             \n\t"
+                "sw     %[Temp2],   4(%[FData])                             \n\t"
+                "addu   %[src_ptr], %[src_ptr],    %[src_pixels_per_line]   \n\t"
+
+                "ulw    %[Temp1],   0(%[src_ptr])                           \n\t"
+                "ulw    %[Temp2],   4(%[src_ptr])                           \n\t"
+                "sw     %[Temp1],   8(%[FData])                             \n\t"
+                "sw     %[Temp2],   12(%[FData])                            \n\t"
+                "addu   %[src_ptr], %[src_ptr],    %[src_pixels_per_line]   \n\t"
+
+                "ulw    %[Temp1],   0(%[src_ptr])                           \n\t"
+                "ulw    %[Temp2],   4(%[src_ptr])                           \n\t"
+                "sw     %[Temp1],   16(%[FData])                            \n\t"
+                "sw     %[Temp2],   20(%[FData])                            \n\t"
+                "addu   %[src_ptr], %[src_ptr],    %[src_pixels_per_line]   \n\t"
+
+                "ulw    %[Temp1],   0(%[src_ptr])                           \n\t"
+                "ulw    %[Temp2],   4(%[src_ptr])                           \n\t"
+                "sw     %[Temp1],   24(%[FData])                            \n\t"
+                "sw     %[Temp2],   28(%[FData])                            \n\t"
+                "addu   %[src_ptr], %[src_ptr],    %[src_pixels_per_line]   \n\t"
+
+                "ulw    %[Temp1],   0(%[src_ptr])                           \n\t"
+                "ulw    %[Temp2],   4(%[src_ptr])                           \n\t"
+                "sw     %[Temp1],   32(%[FData])                            \n\t"
+                "sw     %[Temp2],   36(%[FData])                            \n\t"
+                "addu   %[src_ptr], %[src_ptr],    %[src_pixels_per_line]   \n\t"
+
+                "ulw    %[Temp1],   0(%[src_ptr])                           \n\t"
+                "ulw    %[Temp2],   4(%[src_ptr])                           \n\t"
+                "sw     %[Temp1],   40(%[FData])                            \n\t"
+                "sw     %[Temp2],   44(%[FData])                            \n\t"
+                "addu   %[src_ptr], %[src_ptr],    %[src_pixels_per_line]   \n\t"
+
+                "ulw    %[Temp1],   0(%[src_ptr])                           \n\t"
+                "ulw    %[Temp2],   4(%[src_ptr])                           \n\t"
+                "sw     %[Temp1],   48(%[FData])                            \n\t"
+                "sw     %[Temp2],   52(%[FData])                            \n\t"
+                "addu   %[src_ptr], %[src_ptr],    %[src_pixels_per_line]   \n\t"
+
+                "ulw    %[Temp1],   0(%[src_ptr])                           \n\t"
+                "ulw    %[Temp2],   4(%[src_ptr])                           \n\t"
+                "sw     %[Temp1],   56(%[FData])                            \n\t"
+                "sw     %[Temp2],   60(%[FData])                            \n\t"
+                "addu   %[src_ptr], %[src_ptr],    %[src_pixels_per_line]   \n\t"
+
+                "ulw    %[Temp1],   0(%[src_ptr])                           \n\t"
+                "ulw    %[Temp2],   4(%[src_ptr])                           \n\t"
+                "sw     %[Temp1],   64(%[FData])                            \n\t"
+                "sw     %[Temp2],   68(%[FData])                            \n\t"
+                "addu   %[src_ptr], %[src_ptr],    %[src_pixels_per_line]   \n\t"
+
+                "ulw    %[Temp1],   0(%[src_ptr])                           \n\t"
+                "ulw    %[Temp2],   4(%[src_ptr])                           \n\t"
+                "sw     %[Temp1],   72(%[FData])                            \n\t"
+                "sw     %[Temp2],   76(%[FData])                            \n\t"
+                "addu   %[src_ptr], %[src_ptr],    %[src_pixels_per_line]   \n\t"
+
+                "ulw    %[Temp1],   0(%[src_ptr])                           \n\t"
+                "ulw    %[Temp2],   4(%[src_ptr])                           \n\t"
+                "sw     %[Temp1],   80(%[FData])                            \n\t"
+                "sw     %[Temp2],   84(%[FData])                            \n\t"
+                "addu   %[src_ptr], %[src_ptr],    %[src_pixels_per_line]   \n\t"
+
+                "ulw    %[Temp1],   0(%[src_ptr])                           \n\t"
+                "ulw    %[Temp2],   4(%[src_ptr])                           \n\t"
+                "sw     %[Temp1],   88(%[FData])                            \n\t"
+                "sw     %[Temp2],   92(%[FData])                            \n\t"
+                "addu   %[src_ptr], %[src_ptr],    %[src_pixels_per_line]   \n\t"
+
+                "ulw    %[Temp1],   0(%[src_ptr])                           \n\t"
+                "ulw    %[Temp2],   4(%[src_ptr])                           \n\t"
+                "sw     %[Temp1],   96(%[FData])                            \n\t"
+                "sw     %[Temp2],   100(%[FData])                           \n\t"
+
+                : [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2)
+                : [FData] "r" (FData), [src_ptr] "r" (src_ptr),
+                  [src_pixels_per_line] "r" (src_pixels_per_line)
+            );
+        }
+
+        /* filter vertically... */
+        vp8_filter_block2d_second_pass_8(FData + 16, dst_ptr, dst_pitch, 8, 8, yoffset);
+    }
+
+    /* if (yoffset == 0) vp8_filter_block2d_first_pass saves data to dst_ptr */
+    else
+    {
+        if (xoffset)
+            vp8_filter_block2d_first_pass_8_all(src_ptr, dst_ptr, src_pixels_per_line,
+                                                8, xoffset, dst_pitch);
+
+        else
+        {
+            /* copy from src buffer to dst buffer */
+            __asm__ __volatile__ (
+                "ulw    %[Temp1],   0(%[src_ptr])                           \n\t"
+                "ulw    %[Temp2],   4(%[src_ptr])                           \n\t"
+                "sw     %[Temp1],   0(%[dst_ptr])                           \n\t"
+                "sw     %[Temp2],   4(%[dst_ptr])                           \n\t"
+                "addu   %[src_ptr], %[src_ptr],    %[src_pixels_per_line]   \n\t"
+
+                "ulw    %[Temp1],   0(%[src_ptr])                           \n\t"
+                "ulw    %[Temp2],   4(%[src_ptr])                           \n\t"
+                "sw     %[Temp1],   8(%[dst_ptr])                           \n\t"
+                "sw     %[Temp2],   12(%[dst_ptr])                          \n\t"
+                "addu   %[src_ptr], %[src_ptr],    %[src_pixels_per_line]   \n\t"
+
+                "ulw    %[Temp1],   0(%[src_ptr])                           \n\t"
+                "ulw    %[Temp2],   4(%[src_ptr])                           \n\t"
+                "sw     %[Temp1],   16(%[dst_ptr])                          \n\t"
+                "sw     %[Temp2],   20(%[dst_ptr])                          \n\t"
+                "addu   %[src_ptr], %[src_ptr],    %[src_pixels_per_line]   \n\t"
+
+                "ulw    %[Temp1],   0(%[src_ptr])                           \n\t"
+                "ulw    %[Temp2],   4(%[src_ptr])                           \n\t"
+                "sw     %[Temp1],   24(%[dst_ptr])                          \n\t"
+                "sw     %[Temp2],   28(%[dst_ptr])                          \n\t"
+                "addu   %[src_ptr], %[src_ptr],   %[src_pixels_per_line]    \n\t"
+
+                "ulw    %[Temp1],   0(%[src_ptr])                           \n\t"
+                "ulw    %[Temp2],   4(%[src_ptr])                           \n\t"
+                "sw     %[Temp1],   32(%[dst_ptr])                          \n\t"
+                "sw     %[Temp2],   36(%[dst_ptr])                          \n\t"
+                "addu   %[src_ptr], %[src_ptr],    %[src_pixels_per_line]   \n\t"
+
+                "ulw    %[Temp1],   0(%[src_ptr])                           \n\t"
+                "ulw    %[Temp2],   4(%[src_ptr])                           \n\t"
+                "sw     %[Temp1],   40(%[dst_ptr])                          \n\t"
+                "sw     %[Temp2],   44(%[dst_ptr])                          \n\t"
+                "addu   %[src_ptr], %[src_ptr],    %[src_pixels_per_line]   \n\t"
+
+                "ulw    %[Temp1],   0(%[src_ptr])                           \n\t"
+                "ulw    %[Temp2],   4(%[src_ptr])                           \n\t"
+                "sw     %[Temp1],   48(%[dst_ptr])                          \n\t"
+                "sw     %[Temp2],   52(%[dst_ptr])                          \n\t"
+                "addu   %[src_ptr], %[src_ptr],    %[src_pixels_per_line]   \n\t"
+
+                "ulw    %[Temp1],   0(%[src_ptr])                           \n\t"
+                "ulw    %[Temp2],   4(%[src_ptr])                           \n\t"
+                "sw     %[Temp1],   56(%[dst_ptr])                          \n\t"
+                "sw     %[Temp2],   60(%[dst_ptr])                          \n\t"
+
+                : [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2)
+                : [dst_ptr] "r" (dst_ptr), [src_ptr] "r" (src_ptr),
+                  [src_pixels_per_line] "r" (src_pixels_per_line)
+            );
+        }
+    }
+}
+
+
+void vp8_sixtap_predict8x4_dspr2
+(
+    unsigned char   *RESTRICT src_ptr,
+    int  src_pixels_per_line,
+    int  xoffset,
+    int  yoffset,
+    unsigned char *RESTRICT dst_ptr,
+    int  dst_pitch
+)
+{
+    unsigned char FData[9 * 8]; /* Temp data bufffer used in filtering */
+    unsigned int pos, Temp1, Temp2;
+
+    pos = 16;
+
+    /* bit positon for extract from acc */
+    __asm__ __volatile__ (
+        "wrdsp      %[pos],     1           \n\t"
+        :
+        : [pos] "r" (pos)
+    );
+
+    if (yoffset)
+    {
+
+        src_ptr = src_ptr - (2 * src_pixels_per_line);
+
+        if (xoffset)
+            /* filter 1-D horizontally... */
+            vp8_filter_block2d_first_pass_8_all(src_ptr, FData, src_pixels_per_line,
+                                                9, xoffset, 8);
+
+        else
+        {
+            /* prefetch src_ptr data to cache memory */
+            prefetch_load(src_ptr + 2 * src_pixels_per_line);
+
+            __asm__ __volatile__ (
+                "ulw    %[Temp1],   0(%[src_ptr])                           \n\t"
+                "ulw    %[Temp2],   4(%[src_ptr])                           \n\t"
+                "sw     %[Temp1],   0(%[FData])                             \n\t"
+                "sw     %[Temp2],   4(%[FData])                             \n\t"
+                "addu   %[src_ptr], %[src_ptr],    %[src_pixels_per_line]   \n\t"
+
+                "ulw    %[Temp1],   0(%[src_ptr])                           \n\t"
+                "ulw    %[Temp2],   4(%[src_ptr])                           \n\t"
+                "sw     %[Temp1],   8(%[FData])                             \n\t"
+                "sw     %[Temp2],   12(%[FData])                            \n\t"
+                "addu   %[src_ptr], %[src_ptr],    %[src_pixels_per_line]   \n\t"
+
+                "ulw    %[Temp1],   0(%[src_ptr])                           \n\t"
+                "ulw    %[Temp2],   4(%[src_ptr])                           \n\t"
+                "sw     %[Temp1],   16(%[FData])                            \n\t"
+                "sw     %[Temp2],   20(%[FData])                            \n\t"
+                "addu   %[src_ptr], %[src_ptr],    %[src_pixels_per_line]   \n\t"
+
+                "ulw    %[Temp1],   0(%[src_ptr])                           \n\t"
+                "ulw    %[Temp2],   4(%[src_ptr])                           \n\t"
+                "sw     %[Temp1],   24(%[FData])                            \n\t"
+                "sw     %[Temp2],   28(%[FData])                            \n\t"
+                "addu   %[src_ptr], %[src_ptr],    %[src_pixels_per_line]   \n\t"
+
+                "ulw    %[Temp1],   0(%[src_ptr])                           \n\t"
+                "ulw    %[Temp2],   4(%[src_ptr])                           \n\t"
+                "sw     %[Temp1],   32(%[FData])                            \n\t"
+                "sw     %[Temp2],   36(%[FData])                            \n\t"
+                "addu   %[src_ptr], %[src_ptr],    %[src_pixels_per_line]   \n\t"
+
+                "ulw    %[Temp1],   0(%[src_ptr])                           \n\t"
+                "ulw    %[Temp2],   4(%[src_ptr])                           \n\t"
+                "sw     %[Temp1],   40(%[FData])                            \n\t"
+                "sw     %[Temp2],   44(%[FData])                            \n\t"
+                "addu   %[src_ptr], %[src_ptr],    %[src_pixels_per_line]   \n\t"
+
+                "ulw    %[Temp1],   0(%[src_ptr])                           \n\t"
+                "ulw    %[Temp2],   4(%[src_ptr])                           \n\t"
+                "sw     %[Temp1],   48(%[FData])                            \n\t"
+                "sw     %[Temp2],   52(%[FData])                            \n\t"
+                "addu   %[src_ptr], %[src_ptr],    %[src_pixels_per_line]   \n\t"
+
+                "ulw    %[Temp1],   0(%[src_ptr])                           \n\t"
+                "ulw    %[Temp2],   4(%[src_ptr])                           \n\t"
+                "sw     %[Temp1],   56(%[FData])                            \n\t"
+                "sw     %[Temp2],   60(%[FData])                            \n\t"
+                "addu   %[src_ptr], %[src_ptr],    %[src_pixels_per_line]   \n\t"
+
+                "ulw    %[Temp1],   0(%[src_ptr])                           \n\t"
+                "ulw    %[Temp2],   4(%[src_ptr])                           \n\t"
+                "sw     %[Temp1],   64(%[FData])                            \n\t"
+                "sw     %[Temp2],   68(%[FData])                            \n\t"
+
+                : [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2)
+                : [FData] "r" (FData), [src_ptr] "r" (src_ptr),
+                  [src_pixels_per_line] "r" (src_pixels_per_line)
+            );
+        }
+
+        /* filter verticaly... */
+        vp8_filter_block2d_second_pass_8(FData + 16, dst_ptr, dst_pitch, 4, 8, yoffset);
+    }
+
+    /* if (yoffsset == 0) vp8_filter_block2d_first_pass save data to dst_ptr */
+    else
+    {
+        if (xoffset)
+            vp8_filter_block2d_first_pass_8_all(src_ptr, dst_ptr, src_pixels_per_line,
+                                                4, xoffset, dst_pitch);
+
+        else
+        {
+            /* copy from src buffer to dst buffer */
+            __asm__ __volatile__ (
+                "ulw    %[Temp1],   0(%[src_ptr])                           \n\t"
+                "ulw    %[Temp2],   4(%[src_ptr])                           \n\t"
+                "sw     %[Temp1],   0(%[dst_ptr])                           \n\t"
+                "sw     %[Temp2],   4(%[dst_ptr])                           \n\t"
+                "addu   %[src_ptr], %[src_ptr],    %[src_pixels_per_line]   \n\t"
+
+                "ulw    %[Temp1],   0(%[src_ptr])                           \n\t"
+                "ulw    %[Temp2],   4(%[src_ptr])                           \n\t"
+                "sw     %[Temp1],   8(%[dst_ptr])                           \n\t"
+                "sw     %[Temp2],   12(%[dst_ptr])                          \n\t"
+                "addu   %[src_ptr], %[src_ptr],    %[src_pixels_per_line]   \n\t"
+
+                "ulw    %[Temp1],   0(%[src_ptr])                           \n\t"
+                "ulw    %[Temp2],   4(%[src_ptr])                           \n\t"
+                "sw     %[Temp1],   16(%[dst_ptr])                          \n\t"
+                "sw     %[Temp2],   20(%[dst_ptr])                          \n\t"
+                "addu   %[src_ptr], %[src_ptr],    %[src_pixels_per_line]   \n\t"
+
+                "ulw    %[Temp1],   0(%[src_ptr])                           \n\t"
+                "ulw    %[Temp2],   4(%[src_ptr])                           \n\t"
+                "sw     %[Temp1],   24(%[dst_ptr])                          \n\t"
+                "sw     %[Temp2],   28(%[dst_ptr])                          \n\t"
+
+                : [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2)
+                : [dst_ptr] "r" (dst_ptr), [src_ptr] "r" (src_ptr),
+                  [src_pixels_per_line] "r" (src_pixels_per_line)
+            );
+        }
+    }
+}
+
+
+/* 16x16 six-tap sub-pixel prediction, MIPS DSPr2 version.
+ *
+ * src_ptr              - source block (RESTRICT: assumed not to alias dst)
+ * src_pixels_per_line  - source stride in bytes
+ * xoffset, yoffset     - 1/8-pel sub-pixel offsets (0..7) selecting the
+ *                        horizontal and vertical filter phases
+ * dst_ptr              - destination block
+ * dst_pitch            - destination stride in bytes
+ *
+ * When a vertical filter is needed (yoffset != 0) the horizontal pass
+ * writes 21 intermediate rows (16 + 5 extra for the 6-tap support) into
+ * FData, and the vertical pass consumes them.  Otherwise the horizontal
+ * pass writes straight to dst_ptr.
+ * NOTE(review): the else-branch switch has no case 0; this assumes the
+ * caller never requests xoffset == 0 && yoffset == 0 (full-pel copy is
+ * handled elsewhere) -- confirm against the prediction dispatch code.
+ */
+void vp8_sixtap_predict16x16_dspr2
+(
+    unsigned char   *RESTRICT src_ptr,
+    int  src_pixels_per_line,
+    int  xoffset,
+    int  yoffset,
+    unsigned char *RESTRICT dst_ptr,
+    int  dst_pitch
+)
+{
+    const unsigned short *VFilter;
+    unsigned char FData[21 * 16]; /* temp data buffer used in filtering */
+    unsigned int pos;
+
+    VFilter = sub_pel_filterss[yoffset];
+
+    pos = 16;
+
+    /* set the DSPControl "pos" field: bit position used by extr
+     * instructions when extracting results from the accumulator */
+    __asm__ __volatile__ (
+        "wrdsp      %[pos],     1           \n\t"
+        :
+        : [pos] "r" (pos)
+    );
+
+    if (yoffset)
+    {
+
+        /* back up two rows so the 6-tap vertical filter has its support */
+        src_ptr = src_ptr - (2 * src_pixels_per_line);
+
+        switch (xoffset)
+        {
+            /* filter 1-D horizontally... */
+        case 2:
+        case 4:
+        case 6:
+            /* 6 tap filter */
+            vp8_filter_block2d_first_pass16_6tap(src_ptr, FData, src_pixels_per_line,
+                                                 21, xoffset, 16);
+            break;
+
+        case 0:
+            /* no horizontal filtering: only copy into the temp buffer */
+            vp8_filter_block2d_first_pass16_0(src_ptr, FData, src_pixels_per_line);
+            break;
+
+        case 1:
+        case 3:
+        case 5:
+        case 7:
+            /* 4 tap filter */
+            vp8_filter_block2d_first_pass16_4tap(src_ptr, FData, src_pixels_per_line, 16,
+                                                 21, xoffset, yoffset, dst_ptr, dst_pitch);
+            break;
+        }
+
+        /* filter vertically... (skip the first two padding rows: +32 bytes) */
+        vp8_filter_block2d_second_pass161(FData + 32, dst_ptr, dst_pitch, VFilter);
+    }
+    else
+    {
+        /* if (yoffset == 0) the first pass saves data directly to dst_ptr */
+        switch (xoffset)
+        {
+        case 2:
+        case 4:
+        case 6:
+            /* 6 tap filter */
+            vp8_filter_block2d_first_pass16_6tap(src_ptr, dst_ptr, src_pixels_per_line,
+                                                 16, xoffset, dst_pitch);
+            break;
+
+        case 1:
+        case 3:
+        case 5:
+        case 7:
+            /* 4 tap filter */
+            vp8_filter_block2d_first_pass16_4tap(src_ptr, dst_ptr, src_pixels_per_line, 16,
+                                                 21, xoffset, yoffset, dst_ptr, dst_pitch);
+            break;
+        }
+    }
+}
+
+#endif
diff --git a/vp8/common/mips/dspr2/idct_blk_dspr2.c b/vp8/common/mips/dspr2/idct_blk_dspr2.c
new file mode 100644
index 0000000..1e0ebd1
--- /dev/null
+++ b/vp8/common/mips/dspr2/idct_blk_dspr2.c
@@ -0,0 +1,88 @@
+/*
+ *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vpx_config.h"
+#include "vpx_rtcd.h"
+
+#if HAVE_DSPR2
+
+/* Dequantize and inverse-transform the 16 luma 4x4 sub-blocks of a
+ * macroblock, adding each result into the reconstruction buffer.
+ *
+ * q      - 16 blocks of 16 quantized coefficients (contiguous)
+ * dq     - dequantization factors
+ * dst    - luma reconstruction buffer (prediction already present)
+ * stride - reconstruction stride in bytes
+ * eobs   - per-block end-of-block markers; > 1 means AC coefficients exist
+ */
+void vp8_dequant_idct_add_y_block_dspr2
+(short *q, short *dq,
+ unsigned char *dst, int stride, char *eobs)
+{
+    int row, col;
+
+    /* walk the 4x4 grid of sub-blocks; after each row of sub-blocks,
+     * rewind 16 pixels and drop down four lines */
+    for (row = 0; row < 4; row++, dst += 4 * stride - 16)
+    {
+        for (col = 0; col < 4; col++, q += 16, dst += 4)
+        {
+            if (eobs[0] > 1)
+            {
+                /* full dequant + IDCT + add */
+                vp8_dequant_idct_add_dspr2(q, dq, dst, stride);
+            }
+            else
+            {
+                /* DC-only block: add the constant and clear the
+                 * first two coefficients in one 32-bit store */
+                vp8_dc_only_idct_add_dspr2(q[0]*dq[0], dst, stride, dst, stride);
+                ((int *)q)[0] = 0;
+            }
+
+            eobs++;
+        }
+    }
+}
+
+/* Dequantize and inverse-transform the chroma 4x4 sub-blocks of a
+ * macroblock (four U blocks followed by four V blocks), adding each
+ * result into its reconstruction buffer.
+ *
+ * q          - 8 blocks of 16 quantized coefficients (U then V)
+ * dq         - dequantization factors
+ * dstu, dstv - chroma reconstruction buffers
+ * stride     - chroma stride in bytes
+ * eobs       - per-block end-of-block markers; > 1 means AC coeffs exist
+ */
+void vp8_dequant_idct_add_uv_block_dspr2
+(short *q, short *dq,
+ unsigned char *dstu, unsigned char *dstv, int stride, char *eobs)
+{
+    unsigned char *dst = dstu;
+    int plane, row, col;
+
+    /* the two chroma planes are handled identically: U first, then V */
+    for (plane = 0; plane < 2; plane++)
+    {
+        for (row = 0; row < 2; row++, dst += 4 * stride - 8)
+        {
+            for (col = 0; col < 2; col++, q += 16, dst += 4)
+            {
+                if (eobs[0] > 1)
+                {
+                    /* full dequant + IDCT + add */
+                    vp8_dequant_idct_add_dspr2(q, dq, dst, stride);
+                }
+                else
+                {
+                    /* DC-only block: add the constant and clear the
+                     * first two coefficients in one 32-bit store */
+                    vp8_dc_only_idct_add_dspr2(q[0]*dq[0], dst, stride, dst, stride);
+                    ((int *)q)[0] = 0;
+                }
+
+                eobs++;
+            }
+        }
+
+        dst = dstv;
+    }
+}
+
+#endif
+
diff --git a/vp8/common/mips/dspr2/idctllm_dspr2.c b/vp8/common/mips/dspr2/idctllm_dspr2.c
new file mode 100644
index 0000000..25b7936
--- /dev/null
+++ b/vp8/common/mips/dspr2/idctllm_dspr2.c
@@ -0,0 +1,369 @@
+/*
+ *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vpx_rtcd.h"
+
+#if HAVE_DSPR2
+#define CROP_WIDTH 256
+
+/******************************************************************************
+ * Notes:
+ *
+ * This implementation makes use of 16 bit fixed point version of two multiply
+ * constants:
+ *         1.   sqrt(2) * cos (pi/8)
+ *         2.   sqrt(2) * sin (pi/8)
+ * Since the first constant is bigger than 1, to maintain the same 16 bit
+ * fixed point precision as the second one, we use a trick of
+ *         x * a = x + x*(a-1)
+ * so
+ *         x * sqrt(2) * cos (pi/8) = x + x * (sqrt(2) *cos(pi/8)-1).
+ ****************************************************************************/
+extern unsigned char ff_cropTbl[256 + 2 * CROP_WIDTH];
+static const int cospi8sqrt2minus1 = 20091;
+static const int sinpi8sqrt2      = 35468;
+
+/* Hint the data cache to fetch the line containing *src ("pref" hint 0 =
+ * prefetch for load).  Purely a performance hint; no architectural effect.
+ * NOTE(review): plain "inline" without "static" relies on GNU89 inline
+ * semantics for an out-of-line definition -- confirm the build flags, or
+ * consider making this static inline.
+ */
+inline void prefetch_load_short(short *src)
+{
+    __asm__ __volatile__ (
+        "pref   0,  0(%[src])   \n\t"
+        :
+        : [src] "r" (src)
+    );
+}
+
+/* 4x4 inverse DCT ("llm" variant), fully unrolled, with the result added
+ * to the prediction and clamped to [0, 255] via the ff_cropTbl lookup.
+ *
+ * input       - 16 dequantized coefficients (row-major 4x4)
+ * pred_ptr    - prediction block, pred_stride bytes per row
+ * dst_ptr     - reconstruction output, dst_stride bytes per row
+ *
+ * Pass 1 transforms columns (stride-4 accesses) into output[]; pass 2
+ * transforms rows in place with (+4)>>3 rounding.  The fixed-point
+ * multiply trick for sqrt(2)*cos(pi/8) is described in the file header.
+ */
+void vp8_short_idct4x4llm_dspr2(short *input, unsigned char *pred_ptr,
+                                int pred_stride, unsigned char *dst_ptr,
+                                int dst_stride)
+{
+    int r, c;
+    int a1, b1, c1, d1;
+    short output[16];
+    short *ip = input;
+    short *op = output;
+    int temp1, temp2;
+    int shortpitch = 4;
+
+    int c2, d2;
+    int temp3, temp4;
+    /* clamp table biased so cm[x] saturates x to [0, 255] */
+    unsigned char *cm = ff_cropTbl + CROP_WIDTH;
+
+    /* prepare data for load */
+    prefetch_load_short(ip + 8);
+
+    /* first (column) pass is unrolled; columns 0 and 1 are interleaved */
+    a1 = ip[0] + ip[8];
+    b1 = ip[0] - ip[8];
+
+    /* c1/d1: odd-element butterfly of column 0 */
+    temp1 = (ip[4] * sinpi8sqrt2) >> 16;
+    temp2 = ip[12] + ((ip[12] * cospi8sqrt2minus1) >> 16);
+    c1 = temp1 - temp2;
+
+    temp1 = ip[4] + ((ip[4] * cospi8sqrt2minus1) >> 16);
+    temp2 = (ip[12] * sinpi8sqrt2) >> 16;
+    d1 = temp1 + temp2;
+
+    /* c2/d2: odd-element butterfly of column 1 */
+    temp3 = (ip[5] * sinpi8sqrt2) >> 16;
+    temp4 = ip[13] + ((ip[13] * cospi8sqrt2minus1) >> 16);
+    c2 = temp3 - temp4;
+
+    temp3 = ip[5] + ((ip[5] * cospi8sqrt2minus1) >> 16);
+    temp4 = (ip[13] * sinpi8sqrt2) >> 16;
+    d2 = temp3 + temp4;
+
+    op[0] = a1 + d1;
+    op[12] = a1 - d1;
+    op[4] = b1 + c1;
+    op[8] = b1 - c1;
+
+    a1 = ip[1] + ip[9];
+    b1 = ip[1] - ip[9];
+
+    op[1] = a1 + d2;
+    op[13] = a1 - d2;
+    op[5] = b1 + c2;
+    op[9] = b1 - c2;
+
+    /* columns 2 and 3, same interleaving */
+    a1 = ip[2] + ip[10];
+    b1 = ip[2] - ip[10];
+
+    temp1 = (ip[6] * sinpi8sqrt2) >> 16;
+    temp2 = ip[14] + ((ip[14] * cospi8sqrt2minus1) >> 16);
+    c1 = temp1 - temp2;
+
+    temp1 = ip[6] + ((ip[6] * cospi8sqrt2minus1) >> 16);
+    temp2 = (ip[14] * sinpi8sqrt2) >> 16;
+    d1 = temp1 + temp2;
+
+    temp3 = (ip[7] * sinpi8sqrt2) >> 16;
+    temp4 = ip[15] + ((ip[15] * cospi8sqrt2minus1) >> 16);
+    c2 = temp3 - temp4;
+
+    temp3 = ip[7] + ((ip[7] * cospi8sqrt2minus1) >> 16);
+    temp4 = (ip[15] * sinpi8sqrt2) >> 16;
+    d2 = temp3 + temp4;
+
+    op[2] = a1 + d1;
+    op[14] = a1 - d1;
+    op[6] = b1 + c1;
+    op[10] = b1 - c1;
+
+    a1 = ip[3] + ip[11];
+    b1 = ip[3] - ip[11];
+
+    op[3] = a1 + d2;
+    op[15] = a1 - d2;
+    op[7] = b1 + c2;
+    op[11] = b1 - c2;
+
+    ip = output;
+
+    /* prepare data for load */
+    prefetch_load_short(ip + shortpitch);
+
+    /* second (row) pass is unrolled; rows 0 and 1 are interleaved and
+     * results are rounded with (+4) >> 3 */
+    a1 = ip[0] + ip[2];
+    b1 = ip[0] - ip[2];
+
+    temp1 = (ip[1] * sinpi8sqrt2) >> 16;
+    temp2 = ip[3] + ((ip[3] * cospi8sqrt2minus1) >> 16);
+    c1 = temp1 - temp2;
+
+    temp1 = ip[1] + ((ip[1] * cospi8sqrt2minus1) >> 16);
+    temp2 = (ip[3] * sinpi8sqrt2) >> 16;
+    d1 = temp1 + temp2;
+
+    temp3 = (ip[5] * sinpi8sqrt2) >> 16;
+    temp4 = ip[7] + ((ip[7] * cospi8sqrt2minus1) >> 16);
+    c2 = temp3 - temp4;
+
+    temp3 = ip[5] + ((ip[5] * cospi8sqrt2minus1) >> 16);
+    temp4 = (ip[7] * sinpi8sqrt2) >> 16;
+    d2 = temp3 + temp4;
+
+    op[0] = (a1 + d1 + 4) >> 3;
+    op[3] = (a1 - d1 + 4) >> 3;
+    op[1] = (b1 + c1 + 4) >> 3;
+    op[2] = (b1 - c1 + 4) >> 3;
+
+    a1 = ip[4] + ip[6];
+    b1 = ip[4] - ip[6];
+
+    op[4] = (a1 + d2 + 4) >> 3;
+    op[7] = (a1 - d2 + 4) >> 3;
+    op[5] = (b1 + c2 + 4) >> 3;
+    op[6] = (b1 - c2 + 4) >> 3;
+
+    /* rows 2 and 3 */
+    a1 = ip[8] + ip[10];
+    b1 = ip[8] - ip[10];
+
+    temp1 = (ip[9] * sinpi8sqrt2) >> 16;
+    temp2 = ip[11] + ((ip[11] * cospi8sqrt2minus1) >> 16);
+    c1 = temp1 - temp2;
+
+    temp1 = ip[9] + ((ip[9] * cospi8sqrt2minus1) >> 16);
+    temp2 = (ip[11] * sinpi8sqrt2) >> 16;
+    d1 = temp1 + temp2;
+
+    temp3 = (ip[13] * sinpi8sqrt2) >> 16;
+    temp4 = ip[15] + ((ip[15] * cospi8sqrt2minus1) >> 16);
+    c2 = temp3 - temp4;
+
+    temp3 = ip[13] + ((ip[13] * cospi8sqrt2minus1) >> 16);
+    temp4 = (ip[15] * sinpi8sqrt2) >> 16;
+    d2 = temp3 + temp4;
+
+    op[8] = (a1 + d1 + 4) >> 3;
+    op[11] = (a1 - d1 + 4) >> 3;
+    op[9] = (b1 + c1 + 4) >> 3;
+    op[10] = (b1 - c1 + 4) >> 3;
+
+    a1 = ip[12] + ip[14];
+    b1 = ip[12] - ip[14];
+
+    op[12] = (a1 + d2 + 4) >> 3;
+    op[15] = (a1 - d2 + 4) >> 3;
+    op[13] = (b1 + c2 + 4) >> 3;
+    op[14] = (b1 - c2 + 4) >> 3;
+
+    ip = output;
+
+    /* add the residual to the prediction and clamp to 8 bits */
+    for (r = 0; r < 4; r++)
+    {
+        for (c = 0; c < 4; c++)
+        {
+            short a = ip[c] + pred_ptr[c] ;
+            dst_ptr[c] = cm[a] ;
+        }
+
+        ip += 4;
+        dst_ptr += dst_stride;
+        pred_ptr += pred_stride;
+    }
+}
+
+/* DC-only inverse transform: the whole 4x4 residual is the single value
+ * ((input_dc + 4) >> 3); add it to the prediction with per-byte
+ * saturation and write the result.
+ *
+ * input_dc    - dequantized DC coefficient
+ * pred_ptr    - prediction block; assumed 4-byte aligned (read with lw)
+ * dst_ptr     - output block; assumed 4-byte aligned (written with sw)
+ *
+ * Because the SIMD add/sub instructions are unsigned-saturating, a
+ * negative DC is handled by replicating |a1| and using subu_s.qb instead.
+ */
+void vp8_dc_only_idct_add_dspr2(short input_dc, unsigned char *pred_ptr, int pred_stride, unsigned char *dst_ptr, int dst_stride)
+{
+    int a1;
+    int i, absa1;
+    int t2, vector_a1, vector_a;
+
+    /* a1 = ((input_dc + 4) >> 3); */
+    __asm__ __volatile__ (
+        "addi  %[a1], %[input_dc], 4   \n\t"
+        "sra   %[a1], %[a1],       3   \n\t"
+        : [a1] "=r" (a1)
+        : [input_dc] "r" (input_dc)
+    );
+
+    if (a1 < 0)
+    {
+        /* use quad-byte
+         * input and output memory are four byte aligned
+         */
+        /* replicate |a1| into all four byte lanes */
+        __asm__ __volatile__ (
+            "abs        %[absa1],     %[a1]         \n\t"
+            "replv.qb   %[vector_a1], %[absa1]      \n\t"
+            : [absa1] "=r" (absa1), [vector_a1] "=r" (vector_a1)
+            : [a1] "r" (a1)
+        );
+
+        /* use (a1 - predptr[c]) instead a1 + predptr[c] */
+        for (i = 4; i--;)
+        {
+            /* saturating per-byte subtract: pred - |a1|, one row (4 px)
+             * per iteration */
+            __asm__ __volatile__ (
+                "lw             %[t2],       0(%[pred_ptr])                     \n\t"
+                "add            %[pred_ptr], %[pred_ptr],    %[pred_stride]     \n\t"
+                "subu_s.qb      %[vector_a], %[t2],          %[vector_a1]       \n\t"
+                "sw             %[vector_a], 0(%[dst_ptr])                      \n\t"
+                "add            %[dst_ptr],  %[dst_ptr],     %[dst_stride]      \n\t"
+                : [t2] "=&r" (t2), [vector_a] "=&r" (vector_a),
+                  [dst_ptr] "+&r" (dst_ptr), [pred_ptr] "+&r" (pred_ptr)
+                : [dst_stride] "r" (dst_stride), [pred_stride] "r" (pred_stride), [vector_a1] "r" (vector_a1)
+            );
+        }
+    }
+    else
+    {
+        /* use quad-byte
+         * input and output memory are four byte aligned
+         */
+        /* replicate a1 into all four byte lanes */
+        __asm__ __volatile__ (
+            "replv.qb       %[vector_a1], %[a1]     \n\t"
+            : [vector_a1] "=r" (vector_a1)
+            : [a1] "r" (a1)
+        );
+
+        for (i = 4; i--;)
+        {
+            /* saturating per-byte add: pred + a1, one row (4 px) per
+             * iteration */
+            __asm__ __volatile__ (
+                "lw             %[t2],       0(%[pred_ptr])                 \n\t"
+                "add            %[pred_ptr], %[pred_ptr],    %[pred_stride] \n\t"
+                "addu_s.qb      %[vector_a], %[vector_a1],   %[t2]          \n\t"
+                "sw             %[vector_a], 0(%[dst_ptr])                  \n\t"
+                "add            %[dst_ptr],  %[dst_ptr],     %[dst_stride]  \n\t"
+                : [t2] "=&r" (t2), [vector_a] "=&r" (vector_a),
+                  [dst_ptr] "+&r" (dst_ptr), [pred_ptr] "+&r" (pred_ptr)
+                : [dst_stride] "r" (dst_stride), [pred_stride] "r" (pred_stride), [vector_a1] "r" (vector_a1)
+            );
+        }
+    }
+
+}
+
+/* 4x4 inverse Walsh-Hadamard transform of the second-order (Y2) block.
+ * The 16 results are scattered to the DC position of each of the 16
+ * luma coefficient blocks (stride 16 shorts in mb_dqcoeff).
+ *
+ * input      - 16 Y2 coefficients (row-major 4x4)
+ * mb_dqcoeff - macroblock coefficient array receiving the DC terms
+ */
+void vp8_short_inv_walsh4x4_dspr2(short *input, short *mb_dqcoeff)
+{
+    short output[16];
+    int i;
+    int a1, b1, c1, d1;
+    int a2, b2, c2, d2;
+    short *ip = input;
+    short *op = output;
+
+    prefetch_load_short(ip);
+
+    /* first pass: transform columns (stride-4 accesses) */
+    for (i = 4; i--;)
+    {
+        a1 = ip[0] + ip[12];
+        b1 = ip[4] + ip[8];
+        c1 = ip[4] - ip[8];
+        d1 = ip[0] - ip[12];
+
+        op[0] = a1 + b1;
+        op[4] = c1 + d1;
+        op[8] = a1 - b1;
+        op[12] = d1 - c1;
+
+        ip++;
+        op++;
+    }
+
+    ip = output;
+    op = output;
+
+    prefetch_load_short(ip);
+
+    /* second pass: transform rows in place; the +3 added to both a1 and
+     * d1 rounds every output of the final >> 3 */
+    for (i = 4; i--;)
+    {
+        a1 = ip[0] + ip[3] + 3;
+        b1 = ip[1] + ip[2];
+        c1 = ip[1] - ip[2];
+        d1 = ip[0] - ip[3] + 3;
+
+        a2 = a1 + b1;
+        b2 = d1 + c1;
+        c2 = a1 - b1;
+        d2 = d1 - c1;
+
+        op[0] = a2 >> 3;
+        op[1] = b2 >> 3;
+        op[2] = c2 >> 3;
+        op[3] = d2 >> 3;
+
+        ip += 4;
+        op += 4;
+    }
+
+    /* scatter each result into the DC slot of its 16-coefficient block */
+    for (i = 0; i < 16; i++)
+    {
+        mb_dqcoeff[i * 16] = output[i];
+    }
+}
+
+/* DC-only inverse WHT: every one of the 16 luma blocks receives the same
+ * DC value (input[0] + 3) >> 3.  The sh offsets step by 32 bytes, i.e.
+ * 16 shorts, matching mb_dqcoeff[i * 16] in the general version above.
+ */
+void vp8_short_inv_walsh4x4_1_dspr2(short *input, short *mb_dqcoeff)
+{
+    int a1;
+
+    a1 = ((input[0] + 3) >> 3);
+
+    __asm__ __volatile__ (
+        "sh             %[a1], 0(%[mb_dqcoeff])                    \n\t"
+        "sh             %[a1], 32(%[mb_dqcoeff])                   \n\t"
+        "sh             %[a1], 64(%[mb_dqcoeff])                   \n\t"
+        "sh             %[a1], 96(%[mb_dqcoeff])                   \n\t"
+        "sh             %[a1], 128(%[mb_dqcoeff])                  \n\t"
+        "sh             %[a1], 160(%[mb_dqcoeff])                  \n\t"
+        "sh             %[a1], 192(%[mb_dqcoeff])                  \n\t"
+        "sh             %[a1], 224(%[mb_dqcoeff])                  \n\t"
+        "sh             %[a1], 256(%[mb_dqcoeff])                  \n\t"
+        "sh             %[a1], 288(%[mb_dqcoeff])                  \n\t"
+        "sh             %[a1], 320(%[mb_dqcoeff])                  \n\t"
+        "sh             %[a1], 352(%[mb_dqcoeff])                  \n\t"
+        "sh             %[a1], 384(%[mb_dqcoeff])                  \n\t"
+        "sh             %[a1], 416(%[mb_dqcoeff])                  \n\t"
+        "sh             %[a1], 448(%[mb_dqcoeff])                  \n\t"
+        "sh             %[a1], 480(%[mb_dqcoeff])                  \n\t"
+
+        :
+        : [a1] "r" (a1), [mb_dqcoeff] "r" (mb_dqcoeff)
+    );
+}
+
+#endif
diff --git a/vp8/common/mips/dspr2/loopfilter_filters_dspr2.c b/vp8/common/mips/dspr2/loopfilter_filters_dspr2.c
new file mode 100644
index 0000000..b8e5e4d
--- /dev/null
+++ b/vp8/common/mips/dspr2/loopfilter_filters_dspr2.c
@@ -0,0 +1,2622 @@
+/*
+ *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include <stdlib.h>
+#include "vpx_rtcd.h"
+#include "vp8/common/onyxc_int.h"
+
+#if HAVE_DSPR2
+typedef unsigned char uc;
+
+/* prefetch data for load: hint the cache to fetch the line containing
+ * *src ("pref" hint 0).  Performance hint only; no architectural effect. */
+inline void prefetch_load_lf(unsigned char *src)
+{
+    __asm__ __volatile__ (
+        "pref   0,  0(%[src])   \n\t"
+        :
+        : [src] "r" (src)
+    );
+}
+
+
+/* prefetch data for store: hint the cache that the line containing *dst
+ * will be written ("pref" hint 1).  Performance hint only. */
+inline void prefetch_store_lf(unsigned char *dst)
+{
+    __asm__ __volatile__ (
+        "pref   1,  0(%[dst])   \n\t"
+        :
+        : [dst] "r" (dst)
+    );
+}
+
+/* processing 4 pixels at the same time
+ * compute hev and mask in the same function
+ */
+static __inline void vp8_filter_mask_vec_mips
+(
+    uint32_t limit,
+    uint32_t flimit,
+    uint32_t p1,
+    uint32_t p0,
+    uint32_t p3,
+    uint32_t p2,
+    uint32_t q0,
+    uint32_t q1,
+    uint32_t q2,
+    uint32_t q3,
+    uint32_t thresh,
+    uint32_t *hev,
+    uint32_t *mask
+)
+{
+    uint32_t c, r, r3, r_k;
+    uint32_t s1, s2, s3;
+    uint32_t ones = 0xFFFFFFFF;
+    uint32_t hev1;
+
+    __asm__ __volatile__ (
+        /* mask |= (abs(p3 - p2) > limit) */
+        "subu_s.qb      %[c],   %[p3],     %[p2]        \n\t"
+        "subu_s.qb      %[r_k], %[p2],     %[p3]        \n\t"
+        "or             %[r_k], %[r_k],    %[c]         \n\t"
+        "cmpgu.lt.qb    %[c],   %[limit],  %[r_k]       \n\t"
+        "or             %[r],   $0,        %[c]         \n\t"
+
+        /* mask |= (abs(p2 - p1) > limit) */
+        "subu_s.qb      %[c],   %[p2],     %[p1]        \n\t"
+        "subu_s.qb      %[r_k], %[p1],     %[p2]        \n\t"
+        "or             %[r_k], %[r_k],    %[c]         \n\t"
+        "cmpgu.lt.qb    %[c],   %[limit],  %[r_k]       \n\t"
+        "or             %[r],   %[r],      %[c]         \n\t"
+
+        /* mask |= (abs(p1 - p0) > limit)
+         * hev  |= (abs(p1 - p0) > thresh)
+         */
+        "subu_s.qb      %[c],   %[p1],     %[p0]        \n\t"
+        "subu_s.qb      %[r_k], %[p0],     %[p1]        \n\t"
+        "or             %[r_k], %[r_k],    %[c]         \n\t"
+        "cmpgu.lt.qb    %[c],   %[thresh], %[r_k]       \n\t"
+        "or             %[r3],  $0,        %[c]         \n\t"
+        "cmpgu.lt.qb    %[c],   %[limit],  %[r_k]       \n\t"
+        "or             %[r],   %[r],      %[c]         \n\t"
+
+        /* mask |= (abs(q1 - q0) > limit)
+         * hev  |= (abs(q1 - q0) > thresh)
+         */
+        "subu_s.qb      %[c],   %[q1],     %[q0]        \n\t"
+        "subu_s.qb      %[r_k], %[q0],     %[q1]        \n\t"
+        "or             %[r_k], %[r_k],    %[c]         \n\t"
+        "cmpgu.lt.qb    %[c],   %[thresh], %[r_k]       \n\t"
+        "or             %[r3],  %[r3],     %[c]         \n\t"
+        "cmpgu.lt.qb    %[c],   %[limit],  %[r_k]       \n\t"
+        "or             %[r],   %[r],      %[c]         \n\t"
+
+        /* mask |= (abs(q2 - q1) > limit) */
+        "subu_s.qb      %[c],   %[q2],     %[q1]        \n\t"
+        "subu_s.qb      %[r_k], %[q1],     %[q2]        \n\t"
+        "or             %[r_k], %[r_k],    %[c]         \n\t"
+        "cmpgu.lt.qb    %[c],   %[limit],  %[r_k]       \n\t"
+        "or             %[r],   %[r],      %[c]         \n\t"
+        "sll            %[r3],    %[r3],    24          \n\t"
+
+        /* mask |= (abs(q3 - q2) > limit) */
+        "subu_s.qb      %[c],   %[q3],     %[q2]        \n\t"
+        "subu_s.qb      %[r_k], %[q2],     %[q3]        \n\t"
+        "or             %[r_k], %[r_k],    %[c]         \n\t"
+        "cmpgu.lt.qb    %[c],   %[limit],  %[r_k]       \n\t"
+        "or             %[r],   %[r],      %[c]         \n\t"
+
+        : [c] "=&r" (c), [r_k] "=&r" (r_k),
+          [r] "=&r" (r), [r3] "=&r" (r3)
+        : [limit] "r" (limit), [p3] "r" (p3), [p2] "r" (p2),
+          [p1] "r" (p1), [p0] "r" (p0), [q1] "r" (q1), [q0] "r" (q0),
+          [q2] "r" (q2), [q3] "r" (q3), [thresh] "r" (thresh)
+    );
+
+    __asm__ __volatile__ (
+        /* abs(p0 - q0) */
+        "subu_s.qb      %[c],   %[p0],     %[q0]        \n\t"
+        "subu_s.qb      %[r_k], %[q0],     %[p0]        \n\t"
+        "wrdsp          %[r3]                           \n\t"
+        "or             %[s1],  %[r_k],    %[c]         \n\t"
+
+        /* abs(p1 - q1) */
+        "subu_s.qb      %[c],    %[p1],    %[q1]        \n\t"
+        "addu_s.qb      %[s3],   %[s1],    %[s1]        \n\t"
+        "pick.qb        %[hev1], %[ones],  $0           \n\t"
+        "subu_s.qb      %[r_k],  %[q1],    %[p1]        \n\t"
+        "or             %[s2],   %[r_k],   %[c]         \n\t"
+
+        /* abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > flimit * 2 + limit */
+        "shrl.qb        %[s2],   %[s2],     1           \n\t"
+        "addu_s.qb      %[s1],   %[s2],     %[s3]       \n\t"
+        "cmpgu.lt.qb    %[c],    %[flimit], %[s1]       \n\t"
+        "or             %[r],    %[r],      %[c]        \n\t"
+        "sll            %[r],    %[r],      24          \n\t"
+
+        "wrdsp          %[r]                            \n\t"
+        "pick.qb        %[s2],  $0,         %[ones]     \n\t"
+
+        : [c] "=&r" (c), [r_k] "=&r" (r_k), [s1] "=&r" (s1), [hev1] "=&r" (hev1),
+          [s2] "=&r" (s2), [r] "+r" (r), [s3] "=&r" (s3)
+        : [p0] "r" (p0), [q0] "r" (q0), [p1] "r" (p1), [r3] "r" (r3),
+          [q1] "r" (q1), [ones] "r" (ones), [flimit] "r" (flimit)
+    );
+
+    *hev = hev1;
+    *mask = s2;
+}
+
+
+/* inputs & outputs are quad-byte vectors */
+/* inputs & outputs are quad-byte vectors
+ *
+ * Core VP8 loop filter applied to four pixel positions at once.
+ * mask/hev are the per-byte flag vectors from vp8_filter_mask_vec_mips;
+ * *ps1, *ps0, *qs0, *qs1 are the two pixels on each side of the edge and
+ * are updated in place.  Arithmetic is done on halfword pairs (values in
+ * the high byte of each halfword, selected by HWM) so the saturating
+ * .ph instructions give full 8-bit signed precision.
+ */
+static __inline void vp8_filter_mips
+(
+    uint32_t mask,
+    uint32_t hev,
+    uint32_t *ps1,
+    uint32_t *ps0,
+    uint32_t *qs0,
+    uint32_t *qs1
+)
+{
+    int32_t vp8_filter_l, vp8_filter_r;
+    int32_t Filter1_l, Filter1_r, Filter2_l, Filter2_r;
+    int32_t subr_r, subr_l;
+    uint32_t t1, t2, HWM, t3;
+    uint32_t hev_l, hev_r, mask_l, mask_r, invhev_l, invhev_r;
+
+    int32_t vps1, vps0, vqs0, vqs1;
+    int32_t vps1_l, vps1_r, vps0_l, vps0_r, vqs0_l, vqs0_r, vqs1_l, vqs1_r;
+    uint32_t N128;
+
+    N128 = 0x80808080;
+    t1  = 0x03000300;  /* +3 in the high byte of each halfword */
+    t2  = 0x04000400;  /* +4 in the high byte of each halfword */
+    t3  = 0x01000100;  /* +1 in the high byte of each halfword */
+    HWM = 0xFF00FF00;
+
+    /* convert unsigned pixels to signed by flipping the sign bit */
+    vps0 = (*ps0) ^ N128;
+    vps1 = (*ps1) ^ N128;
+    vqs0 = (*qs0) ^ N128;
+    vqs1 = (*qs1) ^ N128;
+
+    /* use halfword pairs instead quad-bytes because of accuracy:
+     * _l keeps bytes 3 and 1 in the halfword high bytes, _r shifts
+     * bytes 2 and 0 up into the same position */
+    vps0_l = vps0 & HWM;
+    vps0_r = vps0 << 8;
+    vps0_r = vps0_r & HWM;
+
+    vps1_l = vps1 & HWM;
+    vps1_r = vps1 << 8;
+    vps1_r = vps1_r & HWM;
+
+    vqs0_l = vqs0 & HWM;
+    vqs0_r = vqs0 << 8;
+    vqs0_r = vqs0_r & HWM;
+
+    vqs1_l = vqs1 & HWM;
+    vqs1_r = vqs1 << 8;
+    vqs1_r = vqs1_r & HWM;
+
+    mask_l = mask & HWM;
+    mask_r = mask << 8;
+    mask_r = mask_r & HWM;
+
+    hev_l = hev & HWM;
+    hev_r = hev << 8;
+    hev_r = hev_r & HWM;
+
+    __asm__ __volatile__ (
+        /* vp8_filter = vp8_signed_char_clamp(ps1 - qs1); */
+        "subq_s.ph    %[vp8_filter_l], %[vps1_l],       %[vqs1_l]       \n\t"
+        "subq_s.ph    %[vp8_filter_r], %[vps1_r],       %[vqs1_r]       \n\t"
+
+        /* qs0 - ps0 */
+        "subq_s.ph    %[subr_l],       %[vqs0_l],       %[vps0_l]       \n\t"
+        "subq_s.ph    %[subr_r],       %[vqs0_r],       %[vps0_r]       \n\t"
+
+        /* vp8_filter &= hev; */
+        "and          %[vp8_filter_l], %[vp8_filter_l], %[hev_l]        \n\t"
+        "and          %[vp8_filter_r], %[vp8_filter_r], %[hev_r]        \n\t"
+
+        /* vp8_filter = vp8_signed_char_clamp(vp8_filter + 3 * (qs0 - ps0));
+         * the three saturating adds implement the *3; ~hev is computed
+         * in the gaps to fill pipeline slots */
+        "addq_s.ph    %[vp8_filter_l], %[vp8_filter_l], %[subr_l]       \n\t"
+        "addq_s.ph    %[vp8_filter_r], %[vp8_filter_r], %[subr_r]       \n\t"
+        "xor          %[invhev_l],     %[hev_l],        %[HWM]          \n\t"
+        "addq_s.ph    %[vp8_filter_l], %[vp8_filter_l], %[subr_l]       \n\t"
+        "addq_s.ph    %[vp8_filter_r], %[vp8_filter_r], %[subr_r]       \n\t"
+        "xor          %[invhev_r],     %[hev_r],        %[HWM]          \n\t"
+        "addq_s.ph    %[vp8_filter_l], %[vp8_filter_l], %[subr_l]       \n\t"
+        "addq_s.ph    %[vp8_filter_r], %[vp8_filter_r], %[subr_r]       \n\t"
+
+        /* vp8_filter &= mask; */
+        "and          %[vp8_filter_l], %[vp8_filter_l], %[mask_l]       \n\t"
+        "and          %[vp8_filter_r], %[vp8_filter_r], %[mask_r]       \n\t"
+
+        : [vp8_filter_l] "=&r" (vp8_filter_l), [vp8_filter_r] "=&r" (vp8_filter_r),
+          [subr_l] "=&r" (subr_l), [subr_r] "=&r" (subr_r),
+          [invhev_l] "=&r" (invhev_l), [invhev_r] "=&r" (invhev_r)
+
+        : [vps0_l] "r" (vps0_l), [vps0_r] "r" (vps0_r), [vps1_l] "r" (vps1_l),
+          [vps1_r] "r" (vps1_r), [vqs0_l] "r" (vqs0_l), [vqs0_r] "r" (vqs0_r),
+          [vqs1_l] "r" (vqs1_l), [vqs1_r] "r" (vqs1_r),
+          [mask_l] "r" (mask_l), [mask_r] "r" (mask_r),
+          [hev_l] "r" (hev_l), [hev_r] "r" (hev_r),
+          [HWM] "r" (HWM)
+    );
+
+    /* save bottom 3 bits so that we round one side +4 and the other +3 */
+    __asm__ __volatile__ (
+        /* Filter1 = vp8_signed_char_clamp(vp8_filter + 4) >> 3; (t2 = +4) */
+        "addq_s.ph    %[Filter1_l],    %[vp8_filter_l], %[t2]           \n\t"
+        "addq_s.ph    %[Filter1_r],    %[vp8_filter_r], %[t2]           \n\t"
+
+        /* Filter2 = vp8_signed_char_clamp(vp8_filter + 3) >> 3; (t1 = +3) */
+        "addq_s.ph    %[Filter2_l],    %[vp8_filter_l], %[t1]           \n\t"
+        "addq_s.ph    %[Filter2_r],    %[vp8_filter_r], %[t1]           \n\t"
+        "shra.ph      %[Filter1_r],    %[Filter1_r],    3               \n\t"
+        "shra.ph      %[Filter1_l],    %[Filter1_l],    3               \n\t"
+
+        "shra.ph      %[Filter2_l],    %[Filter2_l],    3               \n\t"
+        "shra.ph      %[Filter2_r],    %[Filter2_r],    3               \n\t"
+
+        /* drop the shifted-in low bits so the values stay in the
+         * halfword high-byte format */
+        "and          %[Filter1_l],    %[Filter1_l],    %[HWM]          \n\t"
+        "and          %[Filter1_r],    %[Filter1_r],    %[HWM]          \n\t"
+
+        /* vps0 = vp8_signed_char_clamp(ps0 + Filter2); */
+        "addq_s.ph    %[vps0_l],       %[vps0_l],       %[Filter2_l]    \n\t"
+        "addq_s.ph    %[vps0_r],       %[vps0_r],       %[Filter2_r]    \n\t"
+
+        /* vqs0 = vp8_signed_char_clamp(qs0 - Filter1); */
+        "subq_s.ph    %[vqs0_l],       %[vqs0_l],       %[Filter1_l]    \n\t"
+        "subq_s.ph    %[vqs0_r],       %[vqs0_r],       %[Filter1_r]    \n\t"
+
+        : [Filter1_l] "=&r" (Filter1_l), [Filter1_r] "=&r" (Filter1_r),
+          [Filter2_l] "=&r" (Filter2_l), [Filter2_r] "=&r" (Filter2_r),
+          [vps0_l] "+r" (vps0_l), [vps0_r] "+r" (vps0_r),
+          [vqs0_l] "+r" (vqs0_l), [vqs0_r] "+r" (vqs0_r)
+
+        : [t1] "r" (t1), [t2] "r" (t2),
+          [vp8_filter_l] "r" (vp8_filter_l), [vp8_filter_r] "r" (vp8_filter_r),
+          [HWM] "r" (HWM)
+    );
+
+    __asm__ __volatile__ (
+        /* (vp8_filter += 1) >>= 1 : addqh adds then halves */
+        "addqh.ph    %[Filter1_l],    %[Filter1_l],     %[t3]           \n\t"
+        "addqh.ph    %[Filter1_r],    %[Filter1_r],     %[t3]           \n\t"
+
+        /* vp8_filter &= ~hev; */
+        "and          %[Filter1_l],    %[Filter1_l],    %[invhev_l]     \n\t"
+        "and          %[Filter1_r],    %[Filter1_r],    %[invhev_r]     \n\t"
+
+        /* vps1 = vp8_signed_char_clamp(ps1 + vp8_filter); */
+        "addq_s.ph    %[vps1_l],       %[vps1_l],       %[Filter1_l]    \n\t"
+        "addq_s.ph    %[vps1_r],       %[vps1_r],       %[Filter1_r]    \n\t"
+
+        /* vqs1 = vp8_signed_char_clamp(qs1 - vp8_filter); */
+        "subq_s.ph    %[vqs1_l],       %[vqs1_l],       %[Filter1_l]    \n\t"
+        "subq_s.ph    %[vqs1_r],       %[vqs1_r],       %[Filter1_r]    \n\t"
+
+        : [Filter1_l] "+r" (Filter1_l), [Filter1_r] "+r" (Filter1_r),
+          [vps1_l] "+r" (vps1_l), [vps1_r] "+r" (vps1_r),
+          [vqs1_l] "+r" (vqs1_l), [vqs1_r] "+r" (vqs1_r)
+
+        : [t3] "r" (t3), [invhev_l] "r" (invhev_l), [invhev_r] "r" (invhev_r)
+    );
+
+    /* Create quad-bytes from halfword pairs */
+    vqs0_l = vqs0_l & HWM;
+    vqs1_l = vqs1_l & HWM;
+    vps0_l = vps0_l & HWM;
+    vps1_l = vps1_l & HWM;
+
+    __asm__ __volatile__ (
+        "shrl.ph      %[vqs0_r],       %[vqs0_r],       8               \n\t"
+        "shrl.ph      %[vps0_r],       %[vps0_r],       8               \n\t"
+        "shrl.ph      %[vqs1_r],       %[vqs1_r],       8               \n\t"
+        "shrl.ph      %[vps1_r],       %[vps1_r],       8               \n\t"
+
+        : [vps1_r] "+r" (vps1_r), [vqs1_r] "+r" (vqs1_r),
+          [vps0_r] "+r" (vps0_r), [vqs0_r] "+r" (vqs0_r)
+        :
+    );
+
+    vqs0 = vqs0_l | vqs0_r;
+    vqs1 = vqs1_l | vqs1_r;
+    vps0 = vps0_l | vps0_r;
+    vps1 = vps1_l | vps1_r;
+
+    /* flip the sign bit back to return to unsigned pixel values */
+    *ps0 = vps0 ^ N128;
+    *ps1 = vps1 ^ N128;
+    *qs0 = vqs0 ^ N128;
+    *qs1 = vqs1 ^ N128;
+}
+
+void vp8_loop_filter_horizontal_edge_mips
+(
+    unsigned char *s,
+    int p,
+    unsigned int flimit,
+    unsigned int limit,
+    unsigned int thresh,
+    int count
+)
+{
+    uint32_t mask = 0;
+    uint32_t hev = 0;
+    uint32_t pm1, p0, p1, p2, p3, p4, p5, p6;
+    unsigned char *sm1, *s0, *s1, *s2, *s3, *s4, *s5, *s6;
+    int group;
+
+    /* prefetch data for store */
+    prefetch_store_lf(s);
+
+    /* The filter works on chars so that maximum use can be made of the
+     * 8 bit SIMD instructions; the 16-pixel edge is processed as four
+     * groups of four pixels.
+     */
+    sm1 = s - (p << 2);
+    s0  = sm1 + p;
+    s1  = s0 + p;
+    s2  = s1 + p;
+    s3  = s;
+    s4  = s3 + p;
+    s5  = s4 + p;
+    s6  = s5 + p;
+
+    for (group = 0; group < 4; group++)
+    {
+        /* load quad-byte vectors; memory is 4 byte aligned */
+        p1 = *((uint32_t *)(s1));
+        p2 = *((uint32_t *)(s2));
+        p3 = *((uint32_t *)(s3));
+        p4 = *((uint32_t *)(s4));
+
+        /* when p1 == p4 and p2 == p3 the mask is guaranteed zero,
+         * so the whole group can be skipped without computing it
+         */
+        if ((p1 != p4) || (p2 != p3))
+        {
+            pm1 = *((uint32_t *)(sm1));
+            p0  = *((uint32_t *)(s0));
+            p5  = *((uint32_t *)(s5));
+            p6  = *((uint32_t *)(s6));
+
+            vp8_filter_mask_vec_mips(limit, flimit, p1, p2, pm1, p0, p3, p4, p5, p6,
+                                     thresh, &hev, &mask);
+
+            /* mask == 0 means none of the four pixels needs filtering */
+            if (mask)
+            {
+                vp8_filter_mips(mask, hev, &p1, &p2, &p3, &p4);
+
+                /* write back the filtered 4x4 neighborhood */
+                *((uint32_t *)s1) = p1;
+                *((uint32_t *)s2) = p2;
+                *((uint32_t *)s3) = p3;
+                *((uint32_t *)s4) = p4;
+            }
+        }
+
+        /* advance every row pointer to the next 4-pixel group */
+        sm1 += 4;
+        s0  += 4;
+        s1  += 4;
+        s2  += 4;
+        s3  += 4;
+        s4  += 4;
+        s5  += 4;
+        s6  += 4;
+    }
+}
+
+void vp8_loop_filter_uvhorizontal_edge_mips
+(
+    unsigned char *s,
+    int p,
+    unsigned int flimit,
+    unsigned int limit,
+    unsigned int thresh,
+    int count
+)
+{
+    uint32_t mask = 0;
+    uint32_t hev = 0;
+    uint32_t pm1, p0, p1, p2, p3, p4, p5, p6;
+    unsigned char *sm1, *s0, *s1, *s2, *s3, *s4, *s5, *s6;
+    int group;
+
+    /* The filter works on chars so that maximum use can be made of the
+     * 8 bit SIMD instructions; the 8-pixel chroma edge is processed as
+     * two groups of four pixels.
+     */
+    sm1 = s - (p << 2);
+    s0  = sm1 + p;
+    s1  = s0 + p;
+    s2  = s1 + p;
+    s3  = s;
+    s4  = s3 + p;
+    s5  = s4 + p;
+    s6  = s5 + p;
+
+    for (group = 0; group < 2; group++)
+    {
+        /* load quad-byte vectors; memory is 4 byte aligned */
+        p1 = *((uint32_t *)(s1));
+        p2 = *((uint32_t *)(s2));
+        p3 = *((uint32_t *)(s3));
+        p4 = *((uint32_t *)(s4));
+
+        /* when p1 == p4 and p2 == p3 the mask is guaranteed zero,
+         * so the whole group can be skipped without computing it
+         */
+        if ((p1 != p4) || (p2 != p3))
+        {
+            pm1 = *((uint32_t *)(sm1));
+            p0  = *((uint32_t *)(s0));
+            p5  = *((uint32_t *)(s5));
+            p6  = *((uint32_t *)(s6));
+
+            vp8_filter_mask_vec_mips(limit, flimit, p1, p2, pm1, p0, p3, p4, p5, p6,
+                                     thresh, &hev, &mask);
+
+            /* mask == 0 means none of the four pixels needs filtering */
+            if (mask)
+            {
+                vp8_filter_mips(mask, hev, &p1, &p2, &p3, &p4);
+
+                /* write back the filtered 4x4 neighborhood */
+                *((uint32_t *)s1) = p1;
+                *((uint32_t *)s2) = p2;
+                *((uint32_t *)s3) = p3;
+                *((uint32_t *)s4) = p4;
+            }
+        }
+
+        /* advance every row pointer to the next 4-pixel group */
+        sm1 += 4;
+        s0  += 4;
+        s1  += 4;
+        s2  += 4;
+        s3  += 4;
+        s4  += 4;
+        s5  += 4;
+        s6  += 4;
+    }
+}
+
+void vp8_loop_filter_vertical_edge_mips
+(
+    unsigned char *s,
+    int p,
+    const unsigned int flimit,
+    const unsigned int limit,
+    const unsigned int thresh,
+    int count
+)
+{
+    int i = 0;
+    int half;
+    uint32_t mask = 0;
+    uint32_t hev = 0;
+    uint32_t pm1 = 0, p0 = 0, p1 = 0, p2 = 0, p3 = 0, p4 = 0, p5 = 0, p6 = 0;
+    unsigned char *s1, *s2, *s3, *s4;
+    uint32_t prim1, prim2, sec3, sec4, prim3, prim4;
+
+    /* The filter works on chars so that maximum use can be made of the
+     * 8 bit SIMD instructions. Four rows are filtered per pass; each
+     * iteration of the outer loop performs two passes (8 rows).
+     */
+    do
+    {
+        /* prefetch data for store */
+        prefetch_store_lf(s + p);
+
+        for (half = 0; half < 2; half++)
+        {
+            s1 = s;
+            s2 = s + p;
+            s3 = s2 + p;
+            s4 = s3 + p;
+            s  = s4 + p;
+
+            /* load quad-byte vectors; memory is 4 byte aligned */
+            p2  = *((uint32_t *)(s1 - 4));
+            p6  = *((uint32_t *)(s1));
+            p1  = *((uint32_t *)(s2 - 4));
+            p5  = *((uint32_t *)(s2));
+            p0  = *((uint32_t *)(s3 - 4));
+            p4  = *((uint32_t *)(s3));
+            pm1 = *((uint32_t *)(s4 - 4));
+            p3  = *((uint32_t *)(s4));
+
+            /* transpose pm1, p0, p1, p2 */
+            __asm__ __volatile__ (
+                "precrq.qb.ph   %[prim1],   %[p2],      %[p1]       \n\t"
+                "precr.qb.ph    %[prim2],   %[p2],      %[p1]       \n\t"
+                "precrq.qb.ph   %[prim3],   %[p0],      %[pm1]      \n\t"
+                "precr.qb.ph    %[prim4],   %[p0],      %[pm1]      \n\t"
+
+                "precrq.qb.ph   %[p1],      %[prim1],   %[prim2]    \n\t"
+                "precr.qb.ph    %[pm1],     %[prim1],   %[prim2]    \n\t"
+                "precrq.qb.ph   %[sec3],    %[prim3],   %[prim4]    \n\t"
+                "precr.qb.ph    %[sec4],    %[prim3],   %[prim4]    \n\t"
+
+                "precrq.ph.w    %[p2],      %[p1],      %[sec3]     \n\t"
+                "precrq.ph.w    %[p0],      %[pm1],     %[sec4]     \n\t"
+                "append         %[p1],      %[sec3],    16          \n\t"
+                "append         %[pm1],     %[sec4],    16          \n\t"
+
+                : [prim1] "=&r" (prim1), [prim2] "=&r" (prim2),
+                  [prim3] "=&r" (prim3), [prim4] "=&r" (prim4),
+                  [p2] "+r" (p2), [p1] "+r" (p1), [p0] "+r" (p0), [pm1] "+r" (pm1),
+                  [sec3] "=&r" (sec3), [sec4] "=&r" (sec4)
+                :
+            );
+
+            /* transpose p3, p4, p5, p6 */
+            __asm__ __volatile__ (
+                "precrq.qb.ph   %[prim1],   %[p6],      %[p5]       \n\t"
+                "precr.qb.ph    %[prim2],   %[p6],      %[p5]       \n\t"
+                "precrq.qb.ph   %[prim3],   %[p4],      %[p3]       \n\t"
+                "precr.qb.ph    %[prim4],   %[p4],      %[p3]       \n\t"
+
+                "precrq.qb.ph   %[p5],      %[prim1],   %[prim2]    \n\t"
+                "precr.qb.ph    %[p3],      %[prim1],   %[prim2]    \n\t"
+                "precrq.qb.ph   %[sec3],    %[prim3],   %[prim4]    \n\t"
+                "precr.qb.ph    %[sec4],    %[prim3],   %[prim4]    \n\t"
+
+                "precrq.ph.w    %[p6],      %[p5],      %[sec3]     \n\t"
+                "precrq.ph.w    %[p4],      %[p3],      %[sec4]     \n\t"
+                "append         %[p5],      %[sec3],    16          \n\t"
+                "append         %[p3],      %[sec4],    16          \n\t"
+
+                : [prim1] "=&r" (prim1), [prim2] "=&r" (prim2),
+                  [prim3] "=&r" (prim3), [prim4] "=&r" (prim4),
+                  [p6] "+r" (p6), [p5] "+r" (p5), [p4] "+r" (p4), [p3] "+r" (p3),
+                  [sec3] "=&r" (sec3), [sec4] "=&r" (sec4)
+                :
+            );
+
+            /* when p1 == p4 and p2 == p3 the mask is guaranteed zero,
+             * so the whole pass can be skipped without computing it
+             */
+            if ((p1 != p4) || (p2 != p3))
+            {
+                vp8_filter_mask_vec_mips(limit, flimit, p1, p2, pm1, p0, p3, p4, p5, p6,
+                                         thresh, &hev, &mask);
+
+                /* mask == 0 means none of the four rows needs filtering */
+                if (mask)
+                {
+                    vp8_filter_mips(mask, hev, &p1, &p2, &p3, &p4);
+
+                    /* scatter the filtered 4x4 neighborhood byte by byte:
+                     * the output columns are not 4 byte aligned, so a
+                     * transpose-and-store of whole words is not possible
+                     */
+                    __asm__ __volatile__ (
+                        "sb         %[p4],  1(%[s4])    \n\t"
+                        "sb         %[p3],  0(%[s4])    \n\t"
+                        "sb         %[p2], -1(%[s4])    \n\t"
+                        "sb         %[p1], -2(%[s4])    \n\t"
+                        :
+                        : [p4] "r" (p4), [p3] "r" (p3), [s4] "r" (s4),
+                          [p2] "r" (p2), [p1] "r" (p1)
+                    );
+
+                    __asm__ __volatile__ (
+                        "srl        %[p4], %[p4], 8     \n\t"
+                        "srl        %[p3], %[p3], 8     \n\t"
+                        "srl        %[p2], %[p2], 8     \n\t"
+                        "srl        %[p1], %[p1], 8     \n\t"
+                        : [p4] "+r" (p4), [p3] "+r" (p3), [p2] "+r" (p2), [p1] "+r" (p1)
+                        :
+                    );
+
+                    __asm__ __volatile__ (
+                        "sb         %[p4],  1(%[s3])    \n\t"
+                        "sb         %[p3],  0(%[s3])    \n\t"
+                        "sb         %[p2], -1(%[s3])    \n\t"
+                        "sb         %[p1], -2(%[s3])    \n\t"
+                        :
+                        : [p4] "r" (p4), [p3] "r" (p3), [s3] "r" (s3),
+                          [p2] "r" (p2), [p1] "r" (p1)
+                    );
+
+                    __asm__ __volatile__ (
+                        "srl        %[p4], %[p4], 8     \n\t"
+                        "srl        %[p3], %[p3], 8     \n\t"
+                        "srl        %[p2], %[p2], 8     \n\t"
+                        "srl        %[p1], %[p1], 8     \n\t"
+                        : [p4] "+r" (p4), [p3] "+r" (p3), [p2] "+r" (p2), [p1] "+r" (p1)
+                        :
+                    );
+
+                    __asm__ __volatile__ (
+                        "sb         %[p4],  1(%[s2])    \n\t"
+                        "sb         %[p3],  0(%[s2])    \n\t"
+                        "sb         %[p2], -1(%[s2])    \n\t"
+                        "sb         %[p1], -2(%[s2])    \n\t"
+                        :
+                        : [p4] "r" (p4), [p3] "r" (p3), [s2] "r" (s2),
+                          [p2] "r" (p2), [p1] "r" (p1)
+                    );
+
+                    __asm__ __volatile__ (
+                        "srl        %[p4], %[p4], 8     \n\t"
+                        "srl        %[p3], %[p3], 8     \n\t"
+                        "srl        %[p2], %[p2], 8     \n\t"
+                        "srl        %[p1], %[p1], 8     \n\t"
+                        : [p4] "+r" (p4), [p3] "+r" (p3), [p2] "+r" (p2), [p1] "+r" (p1)
+                        :
+                    );
+
+                    __asm__ __volatile__ (
+                        "sb         %[p4],  1(%[s1])    \n\t"
+                        "sb         %[p3],  0(%[s1])    \n\t"
+                        "sb         %[p2], -1(%[s1])    \n\t"
+                        "sb         %[p1], -2(%[s1])    \n\t"
+                        :
+                        : [p4] "r" (p4), [p3] "r" (p3), [s1] "r" (s1),
+                          [p2] "r" (p2), [p1] "r" (p1)
+                    );
+                }
+            }
+        }
+
+        i += 8;
+    }
+    while (i < count);
+}
+
+void vp8_loop_filter_uvvertical_edge_mips
+(
+    unsigned char *s,
+    int p,
+    unsigned int flimit,
+    unsigned int limit,
+    unsigned int thresh,
+    int count
+)
+{
+    uint32_t mask, hev;
+    uint32_t pm1, p0, p1, p2, p3, p4, p5, p6;
+    unsigned char *s1, *s2, *s3, *s4;
+    uint32_t prim1, prim2, sec3, sec4, prim3, prim4;
+
+    /* loop filter designed to work using chars so that we can make maximum use
+     * of 8 bit simd instructions.
+     */
+
+    /* apply filter on 4 pixesl at the same time */
+
+    s1 = s;
+    s2 = s + p;
+    s3 = s2 + p;
+    s4 = s3 + p;
+
+    /* load quad-byte vectors
+    * memory is 4 byte aligned
+    */
+    p2  = *((uint32_t *)(s1 - 4));
+    p6  = *((uint32_t *)(s1));
+    p1  = *((uint32_t *)(s2 - 4));
+    p5  = *((uint32_t *)(s2));
+    p0  = *((uint32_t *)(s3 - 4));
+    p4  = *((uint32_t *)(s3));
+    pm1 = *((uint32_t *)(s4 - 4));
+    p3  = *((uint32_t *)(s4));
+
+    /* transpose pm1, p0, p1, p2 */
+    __asm__ __volatile__ (
+        "precrq.qb.ph   %[prim1],   %[p2],      %[p1]       \n\t"
+        "precr.qb.ph    %[prim2],   %[p2],      %[p1]       \n\t"
+        "precrq.qb.ph   %[prim3],   %[p0],      %[pm1]      \n\t"
+        "precr.qb.ph    %[prim4],   %[p0],      %[pm1]      \n\t"
+
+        "precrq.qb.ph   %[p1],      %[prim1],   %[prim2]    \n\t"
+        "precr.qb.ph    %[pm1],     %[prim1],   %[prim2]    \n\t"
+        "precrq.qb.ph   %[sec3],    %[prim3],   %[prim4]    \n\t"
+        "precr.qb.ph    %[sec4],    %[prim3],   %[prim4]    \n\t"
+
+        "precrq.ph.w    %[p2],      %[p1],      %[sec3]     \n\t"
+        "precrq.ph.w    %[p0],      %[pm1],     %[sec4]     \n\t"
+        "append         %[p1],      %[sec3],    16          \n\t"
+        "append         %[pm1],     %[sec4],    16          \n\t"
+
+        : [prim1] "=&r" (prim1), [prim2] "=&r" (prim2),
+          [prim3] "=&r" (prim3), [prim4] "=&r" (prim4),
+          [p2] "+r" (p2), [p1] "+r" (p1), [p0] "+r" (p0), [pm1] "+r" (pm1),
+          [sec3] "=&r" (sec3), [sec4] "=&r" (sec4)
+        :
+    );
+
+    /* transpose p3, p4, p5, p6 */
+    __asm__ __volatile__ (
+        "precrq.qb.ph   %[prim1],   %[p6],      %[p5]       \n\t"
+        "precr.qb.ph    %[prim2],   %[p6],      %[p5]       \n\t"
+        "precrq.qb.ph   %[prim3],   %[p4],      %[p3]       \n\t"
+        "precr.qb.ph    %[prim4],   %[p4],      %[p3]       \n\t"
+
+        "precrq.qb.ph   %[p5],      %[prim1],   %[prim2]    \n\t"
+        "precr.qb.ph    %[p3],      %[prim1],   %[prim2]    \n\t"
+        "precrq.qb.ph   %[sec3],    %[prim3],   %[prim4]    \n\t"
+        "precr.qb.ph    %[sec4],    %[prim3],   %[prim4]    \n\t"
+
+        "precrq.ph.w    %[p6],      %[p5],      %[sec3]     \n\t"
+        "precrq.ph.w    %[p4],      %[p3],      %[sec4]     \n\t"
+        "append         %[p5],      %[sec3],    16          \n\t"
+        "append         %[p3],      %[sec4],    16          \n\t"
+
+        : [prim1] "=&r" (prim1), [prim2] "=&r" (prim2),
+          [prim3] "=&r" (prim3), [prim4] "=&r" (prim4),
+          [p6] "+r" (p6), [p5] "+r" (p5), [p4] "+r" (p4), [p3] "+r" (p3),
+          [sec3] "=&r" (sec3), [sec4] "=&r" (sec4)
+        :
+    );
+
+    /* if (p1 - p4 == 0) and (p2 - p3 == 0)
+    * mask will be zero and filtering is not needed
+    */
+    if (!(((p1 - p4) == 0) && ((p2 - p3) == 0)))
+    {
+
+        vp8_filter_mask_vec_mips(limit, flimit, p1, p2, pm1, p0, p3, p4, p5, p6,
+                                 thresh, &hev, &mask);
+
+        /* if mask == 0 do filtering is not needed */
+        if (mask)
+        {
+            /* filtering */
+            vp8_filter_mips(mask, hev, &p1, &p2, &p3, &p4);
+
+            /* unpack processed 4x4 neighborhood
+             * don't use transpose on output data
+             * because memory isn't aligned
+             */
+            __asm__ __volatile__ (
+                "sb         %[p4],  1(%[s4])    \n\t"
+                "sb         %[p3],  0(%[s4])    \n\t"
+                "sb         %[p2], -1(%[s4])    \n\t"
+                "sb         %[p1], -2(%[s4])    \n\t"
+                :
+                : [p4] "r" (p4), [p3] "r" (p3), [s4] "r" (s4),
+                  [p2] "r" (p2), [p1] "r" (p1)
+            );
+
+            __asm__ __volatile__ (
+                "srl        %[p4], %[p4], 8     \n\t"
+                "srl        %[p3], %[p3], 8     \n\t"
+                "srl        %[p2], %[p2], 8     \n\t"
+                "srl        %[p1], %[p1], 8     \n\t"
+                : [p4] "+r" (p4), [p3] "+r" (p3), [p2] "+r" (p2), [p1] "+r" (p1)
+                :
+            );
+
+            __asm__ __volatile__ (
+                "sb         %[p4],  1(%[s3])    \n\t"
+                "sb         %[p3],  0(%[s3])    \n\t"
+                "sb         %[p2], -1(%[s3])    \n\t"
+                "sb         %[p1], -2(%[s3])    \n\t"
+                : [p1] "+r" (p1)
+                : [p4] "r" (p4), [p3] "r" (p3), [s3] "r" (s3), [p2] "r" (p2)
+            );
+
+            __asm__ __volatile__ (
+                "srl        %[p4], %[p4], 8     \n\t"
+                "srl        %[p3], %[p3], 8     \n\t"
+                "srl        %[p2], %[p2], 8     \n\t"
+                "srl        %[p1], %[p1], 8     \n\t"
+                : [p4] "+r" (p4), [p3] "+r" (p3), [p2] "+r" (p2), [p1] "+r" (p1)
+                :
+            );
+
+            __asm__ __volatile__ (
+                "sb         %[p4],  1(%[s2])    \n\t"
+                "sb         %[p3],  0(%[s2])    \n\t"
+                "sb         %[p2], -1(%[s2])    \n\t"
+                "sb         %[p1], -2(%[s2])    \n\t"
+                :
+                : [p4] "r" (p4), [p3] "r" (p3), [s2] "r" (s2),
+                  [p2] "r" (p2), [p1] "r" (p1)
+            );
+
+            __asm__ __volatile__ (
+                "srl        %[p4], %[p4], 8     \n\t"
+                "srl        %[p3], %[p3], 8     \n\t"
+                "srl        %[p2], %[p2], 8     \n\t"
+                "srl        %[p1], %[p1], 8     \n\t"
+                : [p4] "+r" (p4), [p3] "+r" (p3), [p2] "+r" (p2), [p1] "+r" (p1)
+                :
+            );
+
+            __asm__ __volatile__ (
+                "sb         %[p4],  1(%[s1])    \n\t"
+                "sb         %[p3],  0(%[s1])    \n\t"
+                "sb         %[p2], -1(%[s1])    \n\t"
+                "sb         %[p1], -2(%[s1])    \n\t"
+                :
+                : [p4] "r" (p4), [p3] "r" (p3), [s1] "r" (s1), [p2] "r" (p2), [p1] "r" (p1)
+            );
+        }
+    }
+
+    s1 = s4 + p;
+    s2 = s1 + p;
+    s3 = s2 + p;
+    s4 = s3 + p;
+
+    /* load quad-byte vectors
+     * memory is 4 byte aligned
+     */
+    p2  = *((uint32_t *)(s1 - 4));
+    p6  = *((uint32_t *)(s1));
+    p1  = *((uint32_t *)(s2 - 4));
+    p5  = *((uint32_t *)(s2));
+    p0  = *((uint32_t *)(s3 - 4));
+    p4  = *((uint32_t *)(s3));
+    pm1 = *((uint32_t *)(s4 - 4));
+    p3  = *((uint32_t *)(s4));
+
+    /* transpose pm1, p0, p1, p2 */
+    __asm__ __volatile__ (
+        "precrq.qb.ph   %[prim1],   %[p2],      %[p1]       \n\t"
+        "precr.qb.ph    %[prim2],   %[p2],      %[p1]       \n\t"
+        "precrq.qb.ph   %[prim3],   %[p0],      %[pm1]      \n\t"
+        "precr.qb.ph    %[prim4],   %[p0],      %[pm1]      \n\t"
+
+        "precrq.qb.ph   %[p1],      %[prim1],   %[prim2]    \n\t"
+        "precr.qb.ph    %[pm1],     %[prim1],   %[prim2]    \n\t"
+        "precrq.qb.ph   %[sec3],    %[prim3],   %[prim4]    \n\t"
+        "precr.qb.ph    %[sec4],    %[prim3],   %[prim4]    \n\t"
+
+        "precrq.ph.w    %[p2],      %[p1],      %[sec3]     \n\t"
+        "precrq.ph.w    %[p0],      %[pm1],     %[sec4]     \n\t"
+        "append         %[p1],      %[sec3],    16          \n\t"
+        "append         %[pm1],     %[sec4],    16          \n\t"
+
+        : [prim1] "=&r" (prim1), [prim2] "=&r" (prim2),
+          [prim3] "=&r" (prim3), [prim4] "=&r" (prim4),
+          [p2] "+r" (p2), [p1] "+r" (p1), [p0] "+r" (p0), [pm1] "+r" (pm1),
+          [sec3] "=&r" (sec3), [sec4] "=&r" (sec4)
+        :
+    );
+
+    /* transpose p3, p4, p5, p6 */
+    __asm__ __volatile__ (
+        "precrq.qb.ph   %[prim1],   %[p6],      %[p5]       \n\t"
+        "precr.qb.ph    %[prim2],   %[p6],      %[p5]       \n\t"
+        "precrq.qb.ph   %[prim3],   %[p4],      %[p3]       \n\t"
+        "precr.qb.ph    %[prim4],   %[p4],      %[p3]       \n\t"
+
+        "precrq.qb.ph   %[p5],      %[prim1],   %[prim2]    \n\t"
+        "precr.qb.ph    %[p3],      %[prim1],   %[prim2]    \n\t"
+        "precrq.qb.ph   %[sec3],    %[prim3],   %[prim4]    \n\t"
+        "precr.qb.ph    %[sec4],    %[prim3],   %[prim4]    \n\t"
+
+        "precrq.ph.w    %[p6],      %[p5],      %[sec3]     \n\t"
+        "precrq.ph.w    %[p4],      %[p3],      %[sec4]     \n\t"
+        "append         %[p5],      %[sec3],    16          \n\t"
+        "append         %[p3],      %[sec4],    16          \n\t"
+
+        : [prim1] "=&r" (prim1), [prim2] "=&r" (prim2),
+          [prim3] "=&r" (prim3), [prim4] "=&r" (prim4),
+          [p6] "+r" (p6), [p5] "+r" (p5), [p4] "+r" (p4), [p3] "+r" (p3),
+          [sec3] "=&r" (sec3), [sec4] "=&r" (sec4)
+        :
+    );
+
+    /* if (p1 - p4 == 0) and (p2 - p3 == 0)
+     * mask will be zero and filtering is not needed
+     */
+    if (!(((p1 - p4) == 0) && ((p2 - p3) == 0)))
+    {
+
+        vp8_filter_mask_vec_mips(limit, flimit, p1, p2, pm1, p0, p3, p4, p5, p6,
+                                 thresh, &hev, &mask);
+
+        /* if mask == 0 do filtering is not needed */
+        if (mask)
+        {
+            /* filtering */
+            vp8_filter_mips(mask, hev, &p1, &p2, &p3, &p4);
+
+            /* unpack processed 4x4 neighborhood
+             * don't use transpose on output data
+             * because memory isn't aligned
+             */
+            __asm__ __volatile__ (
+                "sb         %[p4],  1(%[s4])    \n\t"
+                "sb         %[p3],  0(%[s4])    \n\t"
+                "sb         %[p2], -1(%[s4])    \n\t"
+                "sb         %[p1], -2(%[s4])    \n\t"
+                :
+                : [p4] "r" (p4), [p3] "r" (p3), [s4] "r" (s4),
+                  [p2] "r" (p2), [p1] "r" (p1)
+            );
+
+            __asm__ __volatile__ (
+                "srl        %[p4], %[p4], 8     \n\t"
+                "srl        %[p3], %[p3], 8     \n\t"
+                "srl        %[p2], %[p2], 8     \n\t"
+                "srl        %[p1], %[p1], 8     \n\t"
+                : [p4] "+r" (p4), [p3] "+r" (p3), [p2] "+r" (p2), [p1] "+r" (p1)
+                :
+            );
+
+            __asm__ __volatile__ (
+                "sb         %[p4],  1(%[s3])    \n\t"
+                "sb         %[p3],  0(%[s3])    \n\t"
+                "sb         %[p2], -1(%[s3])    \n\t"
+                "sb         %[p1], -2(%[s3])    \n\t"
+                : [p1] "+r" (p1)
+                : [p4] "r" (p4), [p3] "r" (p3), [s3] "r" (s3), [p2] "r" (p2)
+            );
+
+            __asm__ __volatile__ (
+                "srl        %[p4], %[p4], 8     \n\t"
+                "srl        %[p3], %[p3], 8     \n\t"
+                "srl        %[p2], %[p2], 8     \n\t"
+                "srl        %[p1], %[p1], 8     \n\t"
+                : [p4] "+r" (p4), [p3] "+r" (p3), [p2] "+r" (p2), [p1] "+r" (p1)
+                :
+            );
+
+            __asm__ __volatile__ (
+                "sb         %[p4],  1(%[s2])    \n\t"
+                "sb         %[p3],  0(%[s2])    \n\t"
+                "sb         %[p2], -1(%[s2])    \n\t"
+                "sb         %[p1], -2(%[s2])    \n\t"
+                :
+                : [p4] "r" (p4), [p3] "r" (p3), [s2] "r" (s2),
+                  [p2] "r" (p2), [p1] "r" (p1)
+            );
+
+            __asm__ __volatile__ (
+                "srl        %[p4], %[p4], 8     \n\t"
+                "srl        %[p3], %[p3], 8     \n\t"
+                "srl        %[p2], %[p2], 8     \n\t"
+                "srl        %[p1], %[p1], 8     \n\t"
+                : [p4] "+r" (p4), [p3] "+r" (p3), [p2] "+r" (p2), [p1] "+r" (p1)
+                :
+            );
+
+            __asm__ __volatile__ (
+                "sb         %[p4],  1(%[s1])    \n\t"
+                "sb         %[p3],  0(%[s1])    \n\t"
+                "sb         %[p2], -1(%[s1])    \n\t"
+                "sb         %[p1], -2(%[s1])    \n\t"
+                :
+                : [p4] "r" (p4), [p3] "r" (p3), [s1] "r" (s1),
+                  [p2] "r" (p2), [p1] "r" (p1)
+            );
+        }
+    }
+}
+
/* Wide (macroblock) VP8 loop filter applied to one quad-byte vector.
 *
 * inputs & outputs are quad-byte vectors: *ps2..*ps0 hold the three
 * pixels before the edge and *qs0..*qs2 the three after it, four
 * pixels packed per 32-bit word; all six are rewritten in place.
 * mask and hev are per-byte 0x00/0xFF selector vectors produced by
 * the caller (vp8_filter_mask_vec_mips).
 *
 * Implementation notes:
 *  - N128 (0x80808080) biases each byte into signed range;
 *  - every quad-byte word is widened to two halfword-pair words:
 *    *_l keeps the two bytes already sitting in the high byte of each
 *    halfword, *_r shifts the remaining two bytes up into that
 *    position, so the saturating halfword DSP instructions can be
 *    used without precision loss;
 *  - t1/t2 (0x03000300 / 0x04000400) are the +3/+4 rounding constants
 *    and R63 (0x003F003F) the +63 bias of the 27/18/9 tap weights,
 *    pre-shifted to match the halfword layout described above.
 */
static __inline void vp8_mbfilter_mips
(
    uint32_t mask,
    uint32_t hev,
    uint32_t *ps2,
    uint32_t *ps1,
    uint32_t *ps0,
    uint32_t *qs0,
    uint32_t *qs1,
    uint32_t *qs2
)
{
    int32_t vps2, vps1, vps0, vqs0, vqs1, vqs2;
    int32_t vps2_l, vps1_l, vps0_l, vqs0_l, vqs1_l, vqs2_l;
    int32_t vps2_r, vps1_r, vps0_r, vqs0_r, vqs1_r, vqs2_r;
    uint32_t HWM, vp8_filter_l, vp8_filter_r, mask_l, mask_r, hev_l, hev_r, subr_r, subr_l;
    uint32_t Filter2_l, Filter2_r, t1, t2, Filter1_l, Filter1_r, invhev_l, invhev_r;
    uint32_t N128, R63;
    uint32_t u1_l, u1_r, u2_l, u2_r, u3_l, u3_r;

    R63  = 0x003F003F;
    HWM  = 0xFF00FF00;
    N128 = 0x80808080;
    t1   = 0x03000300;
    t2   = 0x04000400;

    /* move all six pixel vectors into signed range */
    vps0 = (*ps0) ^ N128;
    vps1 = (*ps1) ^ N128;
    vps2 = (*ps2) ^ N128;
    vqs0 = (*qs0) ^ N128;
    vqs1 = (*qs1) ^ N128;
    vqs2 = (*qs2) ^ N128;

    /* use halfword pairs instead quad-bytes because of accuracy */
    vps0_l = vps0 & HWM;
    vps0_r = vps0 << 8;
    vps0_r = vps0_r & HWM;

    vqs0_l = vqs0 & HWM;
    vqs0_r = vqs0 << 8;
    vqs0_r = vqs0_r & HWM;

    vps1_l = vps1 & HWM;
    vps1_r = vps1 << 8;
    vps1_r = vps1_r & HWM;

    vqs1_l = vqs1 & HWM;
    vqs1_r = vqs1 << 8;
    vqs1_r = vqs1_r & HWM;

    vqs2_l = vqs2 & HWM;
    vqs2_r = vqs2 << 8;
    vqs2_r = vqs2_r & HWM;

    __asm__ __volatile__ (
        /* qs0 - ps0 */
        "subq_s.ph    %[subr_l],       %[vqs0_l],       %[vps0_l]       \n\t"
        "subq_s.ph    %[subr_r],       %[vqs0_r],       %[vps0_r]       \n\t"

        /* vp8_filter = vp8_signed_char_clamp(ps1 - qs1); */
        "subq_s.ph    %[vp8_filter_l], %[vps1_l],       %[vqs1_l]       \n\t"
        "subq_s.ph    %[vp8_filter_r], %[vps1_r],       %[vqs1_r]       \n\t"

        : [vp8_filter_l] "=&r" (vp8_filter_l), [vp8_filter_r] "=r" (vp8_filter_r),
          [subr_l] "=&r" (subr_l), [subr_r] "=&r" (subr_r)
        : [vps0_l] "r" (vps0_l), [vps0_r] "r" (vps0_r), [vps1_l] "r" (vps1_l),
          [vps1_r] "r" (vps1_r), [vqs0_l] "r" (vqs0_l), [vqs0_r] "r" (vqs0_r),
          [vqs1_l] "r" (vqs1_l), [vqs1_r] "r" (vqs1_r)
    );

    /* ps2 is split here, after the asm block above */
    vps2_l = vps2 & HWM;
    vps2_r = vps2 << 8;
    vps2_r = vps2_r & HWM;

    /* add outer taps if we have high edge variance */
    __asm__ __volatile__ (
        /* vp8_filter = vp8_signed_char_clamp(vp8_filter + 3 * (qs0 - ps0)); */
        "addq_s.ph    %[vp8_filter_l], %[vp8_filter_l], %[subr_l]       \n\t"
        "addq_s.ph    %[vp8_filter_r], %[vp8_filter_r], %[subr_r]       \n\t"
        /* expand mask and hev to the same halfword-pair layout */
        "and          %[mask_l],       %[HWM],          %[mask]         \n\t"
        "sll          %[mask_r],       %[mask],         8               \n\t"
        "and          %[mask_r],       %[HWM],          %[mask_r]       \n\t"
        "addq_s.ph    %[vp8_filter_l], %[vp8_filter_l], %[subr_l]       \n\t"
        "addq_s.ph    %[vp8_filter_r], %[vp8_filter_r], %[subr_r]       \n\t"
        "and          %[hev_l],        %[HWM],          %[hev]          \n\t"
        "sll          %[hev_r],        %[hev],          8               \n\t"
        "and          %[hev_r],        %[HWM],          %[hev_r]        \n\t"
        "addq_s.ph    %[vp8_filter_l], %[vp8_filter_l], %[subr_l]       \n\t"
        "addq_s.ph    %[vp8_filter_r], %[vp8_filter_r], %[subr_r]       \n\t"

        /* vp8_filter &= mask; */
        "and          %[vp8_filter_l], %[vp8_filter_l], %[mask_l]       \n\t"
        "and          %[vp8_filter_r], %[vp8_filter_r], %[mask_r]       \n\t"

        /* Filter2 = vp8_filter & hev; */
        "and          %[Filter2_l],    %[vp8_filter_l], %[hev_l]        \n\t"
        "and          %[Filter2_r],    %[vp8_filter_r], %[hev_r]        \n\t"

        : [vp8_filter_l] "+r" (vp8_filter_l), [vp8_filter_r] "+r" (vp8_filter_r),
          [hev_l] "=&r" (hev_l), [hev_r] "=&r" (hev_r),
          [mask_l] "=&r" (mask_l), [mask_r] "=&r" (mask_r),
          [Filter2_l] "=&r" (Filter2_l), [Filter2_r] "=&r" (Filter2_r)
        : [subr_l] "r" (subr_l), [subr_r] "r" (subr_r),
          [HWM] "r" (HWM), [hev]  "r" (hev), [mask] "r" (mask)
    );

    /* save bottom 3 bits so that we round one side +4 and the other +3 */
    __asm__ __volatile__ (
        /* Filter1 = vp8_signed_char_clamp(Filter2 + 4) >>= 3; */
        "addq_s.ph    %[Filter1_l],    %[Filter2_l],    %[t2]           \n\t"
        "xor          %[invhev_l],     %[hev_l],        %[HWM]          \n\t"
        "addq_s.ph    %[Filter1_r],    %[Filter2_r],    %[t2]           \n\t"

        /* Filter2 = vp8_signed_char_clamp(Filter2 + 3) >>= 3; */
        "addq_s.ph    %[Filter2_l],    %[Filter2_l],    %[t1]           \n\t"
        "addq_s.ph    %[Filter2_r],    %[Filter2_r],    %[t1]           \n\t"

        "shra.ph      %[Filter1_l],    %[Filter1_l],    3               \n\t"
        "shra.ph      %[Filter1_r],    %[Filter1_r],    3               \n\t"

        "shra.ph      %[Filter2_l],    %[Filter2_l],    3               \n\t"
        "shra.ph      %[Filter2_r],    %[Filter2_r],    3               \n\t"
        "and          %[Filter1_l],    %[Filter1_l],    %[HWM]          \n\t"
        "and          %[Filter1_r],    %[Filter1_r],    %[HWM]          \n\t"
        "xor          %[invhev_r],     %[hev_r],        %[HWM]          \n\t"

        /* qs0 = vp8_signed_char_clamp(qs0 - Filter1); */
        "subq_s.ph    %[vqs0_l],       %[vqs0_l],       %[Filter1_l]    \n\t"
        "subq_s.ph    %[vqs0_r],       %[vqs0_r],       %[Filter1_r]    \n\t"

        /* ps0 = vp8_signed_char_clamp(ps0 + Filter2); */
        "addq_s.ph    %[vps0_l],       %[vps0_l],       %[Filter2_l]    \n\t"
        "addq_s.ph    %[vps0_r],       %[vps0_r],       %[Filter2_r]    \n\t"

        : [invhev_l] "=&r" (invhev_l), [invhev_r] "=&r" (invhev_r),
          [Filter1_l] "=&r" (Filter1_l), [Filter1_r] "=&r" (Filter1_r),
          [Filter2_l] "+r" (Filter2_l), [Filter2_r] "+r" (Filter2_r),
          [vps0_l] "+r" (vps0_l), [vps0_r] "+r" (vps0_r),
          [vqs0_l] "+r" (vqs0_l), [vqs0_r] "+r" (vqs0_r)
        : [t1] "r" (t1), [t2] "r" (t2), [HWM] "r" (HWM),
          [hev_l] "r" (hev_l), [hev_r] "r" (hev_r)
    );

    /* only apply wider filter if not high edge variance */
    __asm__ __volatile__ (
        /* vp8_filter &= ~hev; */
        "and          %[Filter2_l],    %[vp8_filter_l], %[invhev_l]     \n\t"
        "and          %[Filter2_r],    %[vp8_filter_r], %[invhev_r]     \n\t"

        /* arithmetic shift brings the filter value down from the high byte */
        "shra.ph      %[Filter2_l],    %[Filter2_l],    8               \n\t"
        "shra.ph      %[Filter2_r],    %[Filter2_r],    8               \n\t"

        : [Filter2_l] "=&r" (Filter2_l), [Filter2_r] "=&r" (Filter2_r)
        : [vp8_filter_l] "r" (vp8_filter_l), [vp8_filter_r] "r" (vp8_filter_r),
          [invhev_l] "r" (invhev_l), [invhev_r] "r" (invhev_r)
    );

    /* roughly 3/7th difference across boundary */
    /* u3 = 9*Filter2, u2 = 18*Filter2, u1 = 27*Filter2 (per halfword) */
    __asm__ __volatile__ (
        "shll.ph      %[u3_l],         %[Filter2_l],    3               \n\t"
        "shll.ph      %[u3_r],         %[Filter2_r],    3               \n\t"

        "addq.ph      %[u3_l],         %[u3_l],         %[Filter2_l]    \n\t"
        "addq.ph      %[u3_r],         %[u3_r],         %[Filter2_r]    \n\t"

        "shll.ph      %[u2_l],         %[u3_l],         1               \n\t"
        "shll.ph      %[u2_r],         %[u3_r],         1               \n\t"

        "addq.ph      %[u1_l],         %[u3_l],         %[u2_l]         \n\t"
        "addq.ph      %[u1_r],         %[u3_r],         %[u2_r]         \n\t"

        "addq.ph      %[u2_l],         %[u2_l],         %[R63]          \n\t"
        "addq.ph      %[u2_r],         %[u2_r],         %[R63]          \n\t"

        "addq.ph      %[u3_l],         %[u3_l],         %[R63]          \n\t"
        "addq.ph      %[u3_r],         %[u3_r],         %[R63]          \n\t"

        /* vp8_signed_char_clamp((63 + Filter2 * 27) >> 7)
         * vp8_signed_char_clamp((63 + Filter2 * 18) >> 7)
         */
        "addq.ph      %[u1_l],         %[u1_l],         %[R63]          \n\t"
        "addq.ph      %[u1_r],         %[u1_r],         %[R63]          \n\t"
        "shra.ph      %[u1_l],         %[u1_l],         7               \n\t"
        "shra.ph      %[u1_r],         %[u1_r],         7               \n\t"
        "shra.ph      %[u2_l],         %[u2_l],         7               \n\t"
        "shra.ph      %[u2_r],         %[u2_r],         7               \n\t"
        "shll.ph      %[u1_l],         %[u1_l],         8               \n\t"
        "shll.ph      %[u1_r],         %[u1_r],         8               \n\t"
        "shll.ph      %[u2_l],         %[u2_l],         8               \n\t"
        "shll.ph      %[u2_r],         %[u2_r],         8               \n\t"

        /* vqs0 = vp8_signed_char_clamp(qs0 - u); */
        "subq_s.ph    %[vqs0_l],       %[vqs0_l],       %[u1_l]         \n\t"
        "subq_s.ph    %[vqs0_r],       %[vqs0_r],       %[u1_r]         \n\t"

        /* vps0 = vp8_signed_char_clamp(ps0 + u); */
        "addq_s.ph    %[vps0_l],       %[vps0_l],       %[u1_l]         \n\t"
        "addq_s.ph    %[vps0_r],       %[vps0_r],       %[u1_r]         \n\t"

        : [u1_l] "=&r" (u1_l), [u1_r] "=&r" (u1_r), [u2_l] "=&r" (u2_l),
          [u2_r] "=&r" (u2_r), [u3_l] "=&r" (u3_l), [u3_r] "=&r" (u3_r),
          [vps0_l] "+r" (vps0_l), [vps0_r] "+r" (vps0_r),
          [vqs0_l] "+r" (vqs0_l), [vqs0_r] "+r" (vqs0_r)
        : [R63]  "r" (R63),
          [Filter2_l] "r" (Filter2_l), [Filter2_r] "r" (Filter2_r)
    );

    /* roughly 2/7th difference across boundary (u2 computed above) */
    __asm__ __volatile__ (
        /* vqs1 = vp8_signed_char_clamp(qs1 - u); */
        "subq_s.ph    %[vqs1_l],       %[vqs1_l],       %[u2_l]         \n\t"
        "addq_s.ph    %[vps1_l],       %[vps1_l],       %[u2_l]         \n\t"

        /* vps1 = vp8_signed_char_clamp(ps1 + u); */
        "addq_s.ph    %[vps1_r],       %[vps1_r],       %[u2_r]         \n\t"
        "subq_s.ph    %[vqs1_r],       %[vqs1_r],       %[u2_r]         \n\t"

        : [vps1_l] "+r" (vps1_l), [vps1_r] "+r" (vps1_r),
          [vqs1_l] "+r" (vqs1_l), [vqs1_r] "+r" (vqs1_r)
        : [u2_l] "r" (u2_l), [u2_r] "r" (u2_r)
    );

    /* roughly 1/7th difference across boundary */
    __asm__ __volatile__ (
        /* u = vp8_signed_char_clamp((63 + Filter2 * 9) >> 7); */
        "shra.ph      %[u3_l],         %[u3_l],         7               \n\t"
        "shra.ph      %[u3_r],         %[u3_r],         7               \n\t"
        "shll.ph      %[u3_l],         %[u3_l],         8               \n\t"
        "shll.ph      %[u3_r],         %[u3_r],         8               \n\t"

        /* vqs2 = vp8_signed_char_clamp(qs2 - u); */
        "subq_s.ph    %[vqs2_l],       %[vqs2_l],       %[u3_l]         \n\t"
        "subq_s.ph    %[vqs2_r],       %[vqs2_r],       %[u3_r]         \n\t"

        /* vps2 = vp8_signed_char_clamp(ps2 + u); */
        "addq_s.ph    %[vps2_l],       %[vps2_l],       %[u3_l]         \n\t"
        "addq_s.ph    %[vps2_r],       %[vps2_r],       %[u3_r]         \n\t"

        : [u3_l] "+r" (u3_l), [u3_r] "+r" (u3_r), [vps2_l] "+r" (vps2_l),
          [vps2_r] "+r" (vps2_r), [vqs2_l] "+r" (vqs2_l), [vqs2_r] "+r" (vqs2_r)
        :
    );

    /* Create quad-bytes from halfword pairs */
    __asm__ __volatile__ (
        "and          %[vqs0_l],       %[vqs0_l],       %[HWM]          \n\t"
        "shrl.ph      %[vqs0_r],       %[vqs0_r],       8               \n\t"

        "and          %[vps0_l],       %[vps0_l],       %[HWM]          \n\t"
        "shrl.ph      %[vps0_r],       %[vps0_r],       8               \n\t"

        "and          %[vqs1_l],       %[vqs1_l],       %[HWM]          \n\t"
        "shrl.ph      %[vqs1_r],       %[vqs1_r],       8               \n\t"

        "and          %[vps1_l],       %[vps1_l],       %[HWM]          \n\t"
        "shrl.ph      %[vps1_r],       %[vps1_r],       8               \n\t"

        "and          %[vqs2_l],       %[vqs2_l],       %[HWM]          \n\t"
        "shrl.ph      %[vqs2_r],       %[vqs2_r],       8               \n\t"

        "and          %[vps2_l],       %[vps2_l],       %[HWM]          \n\t"
        "shrl.ph      %[vps2_r],       %[vps2_r],       8               \n\t"

        "or           %[vqs0_r],       %[vqs0_l],       %[vqs0_r]       \n\t"
        "or           %[vps0_r],       %[vps0_l],       %[vps0_r]       \n\t"
        "or           %[vqs1_r],       %[vqs1_l],       %[vqs1_r]       \n\t"
        "or           %[vps1_r],       %[vps1_l],       %[vps1_r]       \n\t"
        "or           %[vqs2_r],       %[vqs2_l],       %[vqs2_r]       \n\t"
        "or           %[vps2_r],       %[vps2_l],       %[vps2_r]       \n\t"

        : [vps1_l] "+r" (vps1_l), [vps1_r] "+r" (vps1_r), [vqs1_l] "+r" (vqs1_l),
          [vqs1_r] "+r" (vqs1_r), [vps0_l] "+r" (vps0_l), [vps0_r] "+r" (vps0_r),
          [vqs0_l] "+r" (vqs0_l), [vqs0_r] "+r" (vqs0_r), [vqs2_l] "+r" (vqs2_l),
          [vqs2_r] "+r" (vqs2_r), [vps2_r] "+r" (vps2_r), [vps2_l] "+r" (vps2_l)
        : [HWM] "r" (HWM)
    );

    /* remove the signed bias and store the filtered vectors */
    *ps0 = vps0_r ^ N128;
    *ps1 = vps1_r ^ N128;
    *ps2 = vps2_r ^ N128;
    *qs0 = vqs0_r ^ N128;
    *qs1 = vqs1_r ^ N128;
    *qs2 = vqs2_r ^ N128;
}
+
/* VP8 macroblock loop filter, horizontal edge (MIPS DSP version).
 *
 * s points at the first pixel below the edge, p is the stride and
 * count the number of pixels along the edge to filter.  Pixels are
 * handled four at a time as packed 32-bit words; memory is assumed
 * to be 4-byte aligned.
 */
void vp8_mbloop_filter_horizontal_edge_mips
(
    unsigned char *s,
    int p,
    unsigned int flimit,
    unsigned int limit,
    unsigned int thresh,
    int count
)
{
    int done = 0;
    int chunk;
    uint32_t mask = 0, hev = 0;
    uint32_t pm1, p0, p1 = 0, p2 = 0, p3 = 0, p4 = 0, p5, p6;
    unsigned char *sm1, *s0, *s1, *s2, *s3, *s4, *s5, *s6;

    /* loop filter designed to work using chars so that we can make maximum use
     * of 8 bit simd instructions.
     */

    /* row pointers: sm1 is four rows above the edge, s6 three rows below */
    sm1 = s - (p << 2);
    s0  = s - p - p - p;
    s1  = s - p - p;
    s2  = s - p;
    s3  = s;
    s4  = s + p;
    s5  = s + p + p;
    s6  = s + p + p + p;

    /* prefetch data for load */
    prefetch_load_lf(s + p);

    /* each pass of the outer loop handles two 4-pixel groups (8 pixels) */
    do
    {
        for (chunk = 0; chunk < 2; chunk++)
        {
            /* load the four rows nearest the edge (4-byte aligned) */
            p1 = *((uint32_t *)(s1));
            p2 = *((uint32_t *)(s2));
            p3 = *((uint32_t *)(s3));
            p4 = *((uint32_t *)(s4));

            /* when p1 == p4 and p2 == p3 the mask is all zero,
             * so filtering can be skipped without computing it
             */
            if ((p1 != p4) || (p2 != p3))
            {
                pm1 = *((uint32_t *)(sm1));
                p0  = *((uint32_t *)(s0));
                p5  = *((uint32_t *)(s5));
                p6  = *((uint32_t *)(s6));

                vp8_filter_mask_vec_mips(limit, flimit, p1, p2, pm1, p0, p3, p4, p5, p6,
                                         thresh, &hev, &mask);

                /* mask == 0 means no pixel in this group needs filtering */
                if (mask)
                {
                    /* filtering */
                    vp8_mbfilter_mips(mask, hev, &p0, &p1, &p2, &p3, &p4, &p5);

                    /* write the filtered rows back (4-byte aligned) */
                    *((uint32_t *)s0) = p0;
                    *((uint32_t *)s1) = p1;
                    *((uint32_t *)s2) = p2;
                    *((uint32_t *)s3) = p3;
                    *((uint32_t *)s4) = p4;
                    *((uint32_t *)s5) = p5;
                }
            }

            /* advance every row pointer to the next 4-pixel group */
            sm1 += 4;
            s0  += 4;
            s1  += 4;
            s2  += 4;
            s3  += 4;
            s4  += 4;
            s5  += 4;
            s6  += 4;
        }

        done += 8;
    }
    while (done < count);
}
+
/* VP8 macroblock loop filter for the horizontal edge of a chroma
 * (U/V) block.  Filters exactly eight pixels along the edge as two
 * packed 4-pixel groups; memory is assumed to be 4-byte aligned.
 * The count argument is unused.
 */
void vp8_mbloop_filter_uvhorizontal_edge_mips
(
    unsigned char *s,
    int p,
    unsigned int flimit,
    unsigned int limit,
    unsigned int thresh,
    int count
)
{
    int grp;
    uint32_t mask = 0, hev = 0;
    uint32_t pm1, p0, p1 = 0, p2 = 0, p3 = 0, p4 = 0, p5, p6;
    unsigned char *sm1, *s0, *s1, *s2, *s3, *s4, *s5, *s6;

    (void)count;  /* width is fixed at 8 pixels for chroma */

    /* loop filter designed to work using chars so that we can make maximum use
     * of 8 bit simd instructions.
     */

    /* row pointers: sm1 is four rows above the edge, s6 three rows below */
    sm1 = s - (p << 2);
    s0  = s - p - p - p;
    s1  = s - p - p;
    s2  = s - p;
    s3  = s;
    s4  = s + p;
    s5  = s + p + p;
    s6  = s + p + p + p;

    for (grp = 0; grp < 2; grp++)
    {
        /* load the four rows nearest the edge (4-byte aligned) */
        p1 = *((uint32_t *)(s1));
        p2 = *((uint32_t *)(s2));
        p3 = *((uint32_t *)(s3));
        p4 = *((uint32_t *)(s4));

        /* identical rows across the edge produce an all-zero mask,
         * so filtering can be skipped without computing it
         */
        if ((p1 != p4) || (p2 != p3))
        {
            pm1 = *((uint32_t *)(sm1));
            p0  = *((uint32_t *)(s0));
            p5  = *((uint32_t *)(s5));
            p6  = *((uint32_t *)(s6));

            vp8_filter_mask_vec_mips(limit, flimit, p1, p2, pm1, p0, p3, p4, p5, p6,
                                     thresh, &hev, &mask);

            /* mask == 0 means no pixel in this group needs filtering */
            if (mask)
            {
                /* filtering */
                vp8_mbfilter_mips(mask, hev, &p0, &p1, &p2, &p3, &p4, &p5);

                /* write the filtered rows back (4-byte aligned) */
                *((uint32_t *)s0) = p0;
                *((uint32_t *)s1) = p1;
                *((uint32_t *)s2) = p2;
                *((uint32_t *)s3) = p3;
                *((uint32_t *)s4) = p4;
                *((uint32_t *)s5) = p5;
            }
        }

        /* step every row pointer to the next 4-pixel group */
        sm1 += 4;
        s0  += 4;
        s1  += 4;
        s2  += 4;
        s3  += 4;
        s4  += 4;
        s5  += 4;
        s6  += 4;
    }
}
+
+
/* VP8 macroblock loop filter, vertical edge (MIPS DSP version).
 *
 * s points into the first row to filter, p is the stride and count
 * the number of rows.  Rows are handled four at a time: the 4x8
 * neighborhood around the edge is loaded as aligned words, the two
 * 4x4 halves are transposed in registers with DSP pack instructions
 * so that columns become packed vectors, and after filtering the
 * results are scattered back with byte stores (the transposed
 * destinations are not word aligned).
 *
 * NOTE(review): the `append` instruction used below is from the MIPS
 * DSP-R2 module - presumably this file is only built for dspr2
 * targets; confirm against the build configuration.
 */
void vp8_mbloop_filter_vertical_edge_mips
(
    unsigned char *s,
    int p,
    unsigned int flimit,
    unsigned int limit,
    unsigned int thresh,
    int count
)
{

    int i;
    uint32_t mask, hev;
    uint32_t pm1, p0, p1, p2, p3, p4, p5, p6;
    unsigned char *s1, *s2, *s3, *s4;
    uint32_t prim1, prim2, sec3, sec4, prim3, prim4;

    mask = 0;
    hev = 0;
    i = 0;
    pm1 = 0;
    p0 = 0;
    p1 = 0;
    p2 = 0;
    p3 = 0;
    p4 = 0;
    p5 = 0;
    p6 = 0;

    /* loop filter designed to work using chars so that we can make maximum use
     * of 8 bit simd instructions.
     */

    /* apply filter on 4 pixesl at the same time */
    do
    {
        /* s1..s4 are the four rows of this group; s advances past them
         * for the next pass
         */
        s1 = s;
        s2 = s + p;
        s3 = s2 + p;
        s4 = s3 + p;
        s  = s4 + p;

        /* load quad-byte vectors
         * memory is 4 byte aligned
         */
        p2  = *((uint32_t *)(s1 - 4));
        p6  = *((uint32_t *)(s1));
        p1  = *((uint32_t *)(s2 - 4));
        p5  = *((uint32_t *)(s2));
        p0  = *((uint32_t *)(s3 - 4));
        p4  = *((uint32_t *)(s3));
        pm1 = *((uint32_t *)(s4 - 4));
        p3  = *((uint32_t *)(s4));

        /* transpose pm1, p0, p1, p2 (left 4x4 half, columns before the edge) */
        __asm__ __volatile__ (
            "precrq.qb.ph   %[prim1],   %[p2],      %[p1]       \n\t"
            "precr.qb.ph    %[prim2],   %[p2],      %[p1]       \n\t"
            "precrq.qb.ph   %[prim3],   %[p0],      %[pm1]      \n\t"
            "precr.qb.ph    %[prim4],   %[p0],      %[pm1]      \n\t"

            "precrq.qb.ph   %[p1],      %[prim1],   %[prim2]    \n\t"
            "precr.qb.ph    %[pm1],     %[prim1],   %[prim2]    \n\t"
            "precrq.qb.ph   %[sec3],    %[prim3],   %[prim4]    \n\t"
            "precr.qb.ph    %[sec4],    %[prim3],   %[prim4]    \n\t"

            "precrq.ph.w    %[p2],      %[p1],      %[sec3]     \n\t"
            "precrq.ph.w    %[p0],      %[pm1],     %[sec4]     \n\t"
            "append         %[p1],      %[sec3],    16          \n\t"
            "append         %[pm1],     %[sec4],    16          \n\t"

            : [prim1] "=&r" (prim1), [prim2] "=&r" (prim2),
              [prim3] "=&r" (prim3), [prim4] "=&r" (prim4),
              [p2] "+r" (p2), [p1] "+r" (p1), [p0] "+r" (p0), [pm1] "+r" (pm1),
              [sec3] "=&r" (sec3), [sec4] "=&r" (sec4)
            :
        );

        /* transpose p3, p4, p5, p6 (right 4x4 half, columns after the edge) */
        __asm__ __volatile__ (
            "precrq.qb.ph   %[prim1],   %[p6],      %[p5]       \n\t"
            "precr.qb.ph    %[prim2],   %[p6],      %[p5]       \n\t"
            "precrq.qb.ph   %[prim3],   %[p4],      %[p3]       \n\t"
            "precr.qb.ph    %[prim4],   %[p4],      %[p3]       \n\t"

            "precrq.qb.ph   %[p5],      %[prim1],   %[prim2]    \n\t"
            "precr.qb.ph    %[p3],      %[prim1],   %[prim2]    \n\t"
            "precrq.qb.ph   %[sec3],    %[prim3],   %[prim4]    \n\t"
            "precr.qb.ph    %[sec4],    %[prim3],   %[prim4]    \n\t"

            "precrq.ph.w    %[p6],      %[p5],      %[sec3]     \n\t"
            "precrq.ph.w    %[p4],      %[p3],      %[sec4]     \n\t"
            "append         %[p5],      %[sec3],    16          \n\t"
            "append         %[p3],      %[sec4],    16          \n\t"

            : [prim1] "=&r" (prim1), [prim2] "=&r" (prim2),
              [prim3] "=&r" (prim3), [prim4] "=&r" (prim4),
              [p6] "+r" (p6), [p5] "+r" (p5), [p4] "+r" (p4), [p3] "+r" (p3),
              [sec3] "=&r" (sec3), [sec4] "=&r" (sec4)
            :
        );

        /* if (p1 - p4 == 0) and (p2 - p3 == 0)
         * mask will be zero and filtering is not needed
         */
        if (!(((p1 - p4) == 0) && ((p2 - p3) == 0)))
        {

            vp8_filter_mask_vec_mips(limit, flimit, p1, p2, pm1, p0, p3, p4, p5, p6,
                                     thresh, &hev, &mask);

            /* if mask == 0 do filtering is not needed */
            if (mask)
            {
                /* filtering */
                vp8_mbfilter_mips(mask, hev, &p0, &p1, &p2, &p3, &p4, &p5);

                /* don't use transpose on output data
                 * because memory isn't aligned.
                 * The low byte of each filtered vector belongs to row
                 * s4; each srl-by-8 group below exposes the byte for
                 * the next row up (s3, then s2, then s1).
                 */
                __asm__ __volatile__ (
                    "sb         %[p5],  2(%[s4])        \n\t"
                    "sb         %[p4],  1(%[s4])        \n\t"
                    "sb         %[p3],  0(%[s4])        \n\t"
                    "sb         %[p2], -1(%[s4])        \n\t"
                    "sb         %[p1], -2(%[s4])        \n\t"
                    "sb         %[p0], -3(%[s4])        \n\t"
                    :
                    : [p5] "r" (p5), [p4] "r" (p4), [p3] "r" (p3), [s4] "r" (s4),
                      [p2] "r" (p2), [p1] "r" (p1), [p0] "r" (p0)
                );

                __asm__ __volatile__ (
                    "srl        %[p5], %[p5], 8         \n\t"
                    "srl        %[p4], %[p4], 8         \n\t"
                    "srl        %[p3], %[p3], 8         \n\t"
                    "srl        %[p2], %[p2], 8         \n\t"
                    "srl        %[p1], %[p1], 8         \n\t"
                    "srl        %[p0], %[p0], 8         \n\t"
                    : [p5] "+r" (p5), [p4] "+r" (p4), [p3] "+r" (p3),
                      [p2] "+r" (p2), [p1] "+r" (p1), [p0] "+r" (p0)
                    :
                );

                __asm__ __volatile__ (
                    "sb         %[p5],  2(%[s3])        \n\t"
                    "sb         %[p4],  1(%[s3])        \n\t"
                    "sb         %[p3],  0(%[s3])        \n\t"
                    "sb         %[p2], -1(%[s3])        \n\t"
                    "sb         %[p1], -2(%[s3])        \n\t"
                    "sb         %[p0], -3(%[s3])        \n\t"
                    :
                    : [p5] "r" (p5), [p4] "r" (p4), [p3] "r" (p3), [s3] "r" (s3),
                      [p2] "r" (p2), [p1] "r" (p1), [p0] "r" (p0)
                );

                __asm__ __volatile__ (
                    "srl        %[p5], %[p5], 8         \n\t"
                    "srl        %[p4], %[p4], 8         \n\t"
                    "srl        %[p3], %[p3], 8         \n\t"
                    "srl        %[p2], %[p2], 8         \n\t"
                    "srl        %[p1], %[p1], 8         \n\t"
                    "srl        %[p0], %[p0], 8         \n\t"
                    : [p5] "+r" (p5), [p4] "+r" (p4), [p3] "+r" (p3),
                      [p2] "+r" (p2), [p1] "+r" (p1), [p0] "+r" (p0)
                    :
                );

                __asm__ __volatile__ (
                    "sb         %[p5],  2(%[s2])        \n\t"
                    "sb         %[p4],  1(%[s2])        \n\t"
                    "sb         %[p3],  0(%[s2])        \n\t"
                    "sb         %[p2], -1(%[s2])        \n\t"
                    "sb         %[p1], -2(%[s2])        \n\t"
                    "sb         %[p0], -3(%[s2])        \n\t"
                    :
                    : [p5] "r" (p5), [p4] "r" (p4), [p3] "r" (p3), [s2] "r" (s2),
                      [p2] "r" (p2), [p1] "r" (p1), [p0] "r" (p0)
                );

                __asm__ __volatile__ (
                    "srl        %[p5], %[p5], 8         \n\t"
                    "srl        %[p4], %[p4], 8         \n\t"
                    "srl        %[p3], %[p3], 8         \n\t"
                    "srl        %[p2], %[p2], 8         \n\t"
                    "srl        %[p1], %[p1], 8         \n\t"
                    "srl        %[p0], %[p0], 8         \n\t"
                    : [p5] "+r" (p5), [p4] "+r" (p4), [p3] "+r" (p3),
                      [p2] "+r" (p2), [p1] "+r" (p1), [p0] "+r" (p0)
                    :
                );

                __asm__ __volatile__ (
                    "sb         %[p5],  2(%[s1])        \n\t"
                    "sb         %[p4],  1(%[s1])        \n\t"
                    "sb         %[p3],  0(%[s1])        \n\t"
                    "sb         %[p2], -1(%[s1])        \n\t"
                    "sb         %[p1], -2(%[s1])        \n\t"
                    "sb         %[p0], -3(%[s1])        \n\t"
                    :
                    : [p5] "r" (p5), [p4] "r" (p4), [p3] "r" (p3), [s1] "r" (s1),
                      [p2] "r" (p2), [p1] "r" (p1), [p0] "r" (p0)
                );
            }
        }

        i += 4;
    }

    while (i < count);
}
+
+void vp8_mbloop_filter_uvvertical_edge_mips
+(
+    unsigned char *s,
+    int p,
+    unsigned int flimit,
+    unsigned int limit,
+    unsigned int thresh,
+    int count
+)
+{
+    uint32_t mask, hev;
+    uint32_t pm1, p0, p1, p2, p3, p4, p5, p6;
+    unsigned char *s1, *s2, *s3, *s4;
+    uint32_t prim1, prim2, sec3, sec4, prim3, prim4;
+
+    mask = 0;
+    hev = 0;
+    pm1 = 0;
+    p0 = 0;
+    p1 = 0;
+    p2 = 0;
+    p3 = 0;
+    p4 = 0;
+    p5 = 0;
+    p6 = 0;
+
+    /* loop filter designed to work using chars so that we can make maximum use
+     * of 8 bit simd instructions.
+     */
+
+    /* apply filter on 4 pixels at the same time */
+
+    s1 = s;
+    s2 = s + p;
+    s3 = s2 + p;
+    s4 = s3 + p;
+
+    /* prefetch data for load */
+    prefetch_load_lf(s + 2 * p);
+
+    /* load quad-byte vectors
+     * memory is 4 byte aligned
+     */
+    p2  = *((uint32_t *)(s1 - 4));
+    p6  = *((uint32_t *)(s1));
+    p1  = *((uint32_t *)(s2 - 4));
+    p5  = *((uint32_t *)(s2));
+    p0  = *((uint32_t *)(s3 - 4));
+    p4  = *((uint32_t *)(s3));
+    pm1 = *((uint32_t *)(s4 - 4));
+    p3  = *((uint32_t *)(s4));
+
+    /* transpose pm1, p0, p1, p2 */
+    __asm__ __volatile__ (
+        "precrq.qb.ph   %[prim1],   %[p2],      %[p1]       \n\t"
+        "precr.qb.ph    %[prim2],   %[p2],      %[p1]       \n\t"
+        "precrq.qb.ph   %[prim3],   %[p0],      %[pm1]      \n\t"
+        "precr.qb.ph    %[prim4],   %[p0],      %[pm1]      \n\t"
+
+        "precrq.qb.ph   %[p1],      %[prim1],   %[prim2]    \n\t"
+        "precr.qb.ph    %[pm1],     %[prim1],   %[prim2]    \n\t"
+        "precrq.qb.ph   %[sec3],    %[prim3],   %[prim4]    \n\t"
+        "precr.qb.ph    %[sec4],    %[prim3],   %[prim4]    \n\t"
+
+        "precrq.ph.w    %[p2],      %[p1],      %[sec3]     \n\t"
+        "precrq.ph.w    %[p0],      %[pm1],     %[sec4]     \n\t"
+        "append         %[p1],      %[sec3],    16          \n\t"
+        "append         %[pm1],     %[sec4],    16          \n\t"
+
+        : [prim1] "=&r" (prim1), [prim2] "=&r" (prim2),
+          [prim3] "=&r" (prim3), [prim4] "=&r" (prim4),
+          [p2] "+r" (p2), [p1] "+r" (p1), [p0] "+r" (p0), [pm1] "+r" (pm1),
+          [sec3] "=&r" (sec3), [sec4] "=&r" (sec4)
+        :
+    );
+
+    /* transpose p3, p4, p5, p6 */
+    __asm__ __volatile__ (
+        "precrq.qb.ph   %[prim1],   %[p6],      %[p5]       \n\t"
+        "precr.qb.ph    %[prim2],   %[p6],      %[p5]       \n\t"
+        "precrq.qb.ph   %[prim3],   %[p4],      %[p3]       \n\t"
+        "precr.qb.ph    %[prim4],   %[p4],      %[p3]       \n\t"
+
+        "precrq.qb.ph   %[p5],      %[prim1],   %[prim2]    \n\t"
+        "precr.qb.ph    %[p3],      %[prim1],   %[prim2]    \n\t"
+        "precrq.qb.ph   %[sec3],    %[prim3],   %[prim4]    \n\t"
+        "precr.qb.ph    %[sec4],    %[prim3],   %[prim4]    \n\t"
+
+        "precrq.ph.w    %[p6],      %[p5],      %[sec3]     \n\t"
+        "precrq.ph.w    %[p4],      %[p3],      %[sec4]     \n\t"
+        "append         %[p5],      %[sec3],    16          \n\t"
+        "append         %[p3],      %[sec4],    16          \n\t"
+
+        : [prim1] "=&r" (prim1), [prim2] "=&r" (prim2),
+          [prim3] "=&r" (prim3), [prim4] "=&r" (prim4),
+          [p6] "+r" (p6), [p5] "+r" (p5), [p4] "+r" (p4), [p3] "+r" (p3),
+          [sec3] "=&r" (sec3), [sec4] "=&r" (sec4)
+        :
+    );
+
+    /* if (p1 - p4 == 0) and (p2 - p3 == 0)
+     * mask will be zero and filtering is not needed
+     */
+    if (!(((p1 - p4) == 0) && ((p2 - p3) == 0)))
+    {
+
+        vp8_filter_mask_vec_mips(limit, flimit, p1, p2, pm1, p0, p3, p4, p5, p6,
+                                 thresh, &hev, &mask);
+
+        /* if mask == 0, filtering is not needed */
+        if (mask)
+        {
+            /* filtering */
+            vp8_mbfilter_mips(mask, hev, &p0, &p1, &p2, &p3, &p4, &p5);
+
+            /* don't use transpose on output data
+             * because memory isn't aligned
+             */
+            __asm__ __volatile__ (
+                "sb         %[p5],  2(%[s4])        \n\t"
+                "sb         %[p4],  1(%[s4])        \n\t"
+                "sb         %[p3],  0(%[s4])        \n\t"
+                "sb         %[p2], -1(%[s4])        \n\t"
+                "sb         %[p1], -2(%[s4])        \n\t"
+                "sb         %[p0], -3(%[s4])        \n\t"
+                :
+                : [p5] "r" (p5), [p4] "r" (p4), [p3] "r" (p3), [s4] "r" (s4),
+                  [p2] "r" (p2), [p1] "r" (p1), [p0] "r" (p0)
+            );
+
+            __asm__ __volatile__ (
+                "srl        %[p5], %[p5], 8         \n\t"
+                "srl        %[p4], %[p4], 8         \n\t"
+                "srl        %[p3], %[p3], 8         \n\t"
+                "srl        %[p2], %[p2], 8         \n\t"
+                "srl        %[p1], %[p1], 8         \n\t"
+                "srl        %[p0], %[p0], 8         \n\t"
+                : [p5] "+r" (p5), [p4] "+r" (p4), [p3] "+r" (p3),
+                  [p2] "+r" (p2), [p1] "+r" (p1), [p0] "+r" (p0)
+                :
+            );
+
+            __asm__ __volatile__ (
+                "sb         %[p5],  2(%[s3])        \n\t"
+                "sb         %[p4],  1(%[s3])        \n\t"
+                "sb         %[p3],  0(%[s3])        \n\t"
+                "sb         %[p2], -1(%[s3])        \n\t"
+                "sb         %[p1], -2(%[s3])        \n\t"
+                "sb         %[p0], -3(%[s3])        \n\t"
+                :
+                : [p5] "r" (p5), [p4] "r" (p4), [p3] "r" (p3), [s3] "r" (s3),
+                  [p2] "r" (p2), [p1] "r" (p1), [p0] "r" (p0)
+            );
+
+            __asm__ __volatile__ (
+                "srl        %[p5], %[p5], 8         \n\t"
+                "srl        %[p4], %[p4], 8         \n\t"
+                "srl        %[p3], %[p3], 8         \n\t"
+                "srl        %[p2], %[p2], 8         \n\t"
+                "srl        %[p1], %[p1], 8         \n\t"
+                "srl        %[p0], %[p0], 8         \n\t"
+                : [p5] "+r" (p5), [p4] "+r" (p4), [p3] "+r" (p3),
+                  [p2] "+r" (p2), [p1] "+r" (p1), [p0] "+r" (p0)
+                :
+            );
+
+            __asm__ __volatile__ (
+                "sb         %[p5],  2(%[s2])        \n\t"
+                "sb         %[p4],  1(%[s2])        \n\t"
+                "sb         %[p3],  0(%[s2])        \n\t"
+                "sb         %[p2], -1(%[s2])        \n\t"
+                "sb         %[p1], -2(%[s2])        \n\t"
+                "sb         %[p0], -3(%[s2])        \n\t"
+                :
+                : [p5] "r" (p5), [p4] "r" (p4), [p3] "r" (p3), [s2] "r" (s2),
+                  [p2] "r" (p2), [p1] "r" (p1), [p0] "r" (p0)
+            );
+
+            __asm__ __volatile__ (
+                "srl        %[p5], %[p5], 8         \n\t"
+                "srl        %[p4], %[p4], 8         \n\t"
+                "srl        %[p3], %[p3], 8         \n\t"
+                "srl        %[p2], %[p2], 8         \n\t"
+                "srl        %[p1], %[p1], 8         \n\t"
+                "srl        %[p0], %[p0], 8         \n\t"
+                : [p5] "+r" (p5), [p4] "+r" (p4), [p3] "+r" (p3),
+                  [p2] "+r" (p2), [p1] "+r" (p1), [p0] "+r" (p0)
+                :
+            );
+
+            __asm__ __volatile__ (
+                "sb         %[p5],  2(%[s1])        \n\t"
+                "sb         %[p4],  1(%[s1])        \n\t"
+                "sb         %[p3],  0(%[s1])        \n\t"
+                "sb         %[p2], -1(%[s1])        \n\t"
+                "sb         %[p1], -2(%[s1])        \n\t"
+                "sb         %[p0], -3(%[s1])        \n\t"
+                :
+                : [p5] "r" (p5), [p4] "r" (p4), [p3] "r" (p3), [s1] "r" (s1),
+                  [p2] "r" (p2), [p1] "r" (p1), [p0] "r" (p0)
+            );
+        }
+    }
+
+    s1 = s4 + p;
+    s2 = s1 + p;
+    s3 = s2 + p;
+    s4 = s3 + p;
+
+    /* load quad-byte vectors
+    * memory is 4 byte aligned
+    */
+    p2  = *((uint32_t *)(s1 - 4));
+    p6  = *((uint32_t *)(s1));
+    p1  = *((uint32_t *)(s2 - 4));
+    p5  = *((uint32_t *)(s2));
+    p0  = *((uint32_t *)(s3 - 4));
+    p4  = *((uint32_t *)(s3));
+    pm1 = *((uint32_t *)(s4 - 4));
+    p3  = *((uint32_t *)(s4));
+
+    /* transpose pm1, p0, p1, p2 */
+    __asm__ __volatile__ (
+        "precrq.qb.ph   %[prim1],   %[p2],      %[p1]       \n\t"
+        "precr.qb.ph    %[prim2],   %[p2],      %[p1]       \n\t"
+        "precrq.qb.ph   %[prim3],   %[p0],      %[pm1]      \n\t"
+        "precr.qb.ph    %[prim4],   %[p0],      %[pm1]      \n\t"
+
+        "precrq.qb.ph   %[p1],      %[prim1],   %[prim2]    \n\t"
+        "precr.qb.ph    %[pm1],     %[prim1],   %[prim2]    \n\t"
+        "precrq.qb.ph   %[sec3],    %[prim3],   %[prim4]    \n\t"
+        "precr.qb.ph    %[sec4],    %[prim3],   %[prim4]    \n\t"
+
+        "precrq.ph.w    %[p2],      %[p1],      %[sec3]     \n\t"
+        "precrq.ph.w    %[p0],      %[pm1],     %[sec4]     \n\t"
+        "append         %[p1],      %[sec3],    16          \n\t"
+        "append         %[pm1],     %[sec4],    16          \n\t"
+
+        : [prim1] "=&r" (prim1), [prim2] "=&r" (prim2),
+          [prim3] "=&r" (prim3), [prim4] "=&r" (prim4),
+          [p2] "+r" (p2), [p1] "+r" (p1), [p0] "+r" (p0), [pm1] "+r" (pm1),
+          [sec3] "=&r" (sec3), [sec4] "=&r" (sec4)
+        :
+    );
+
+    /* transpose p3, p4, p5, p6 */
+    __asm__ __volatile__ (
+        "precrq.qb.ph   %[prim1],   %[p6],      %[p5]       \n\t"
+        "precr.qb.ph    %[prim2],   %[p6],      %[p5]       \n\t"
+        "precrq.qb.ph   %[prim3],   %[p4],      %[p3]       \n\t"
+        "precr.qb.ph    %[prim4],   %[p4],      %[p3]       \n\t"
+
+        "precrq.qb.ph   %[p5],      %[prim1],   %[prim2]    \n\t"
+        "precr.qb.ph    %[p3],      %[prim1],   %[prim2]    \n\t"
+        "precrq.qb.ph   %[sec3],    %[prim3],   %[prim4]    \n\t"
+        "precr.qb.ph    %[sec4],    %[prim3],   %[prim4]    \n\t"
+
+        "precrq.ph.w    %[p6],      %[p5],      %[sec3]     \n\t"
+        "precrq.ph.w    %[p4],      %[p3],      %[sec4]     \n\t"
+        "append         %[p5],      %[sec3],    16          \n\t"
+        "append         %[p3],      %[sec4],    16          \n\t"
+
+        : [prim1] "=&r" (prim1), [prim2] "=&r" (prim2),
+          [prim3] "=&r" (prim3), [prim4] "=&r" (prim4),
+          [p6] "+r" (p6), [p5] "+r" (p5), [p4] "+r" (p4), [p3] "+r" (p3),
+          [sec3] "=&r" (sec3), [sec4] "=&r" (sec4)
+        :
+    );
+
+    /* if (p1 - p4 == 0) and (p2 - p3 == 0)
+     * mask will be zero and filtering is not needed
+     */
+    if (!(((p1 - p4) == 0) && ((p2 - p3) == 0)))
+    {
+
+        vp8_filter_mask_vec_mips(limit, flimit, p1, p2, pm1, p0, p3, p4, p5, p6, thresh, &hev, &mask);
+
+        /* if mask == 0, filtering is not needed */
+        if (mask)
+        {
+            /* filtering */
+            vp8_mbfilter_mips(mask, hev, &p0, &p1, &p2, &p3, &p4, &p5);
+
+            /* don't use transpose on output data
+             * because memory isn't aligned
+             */
+            __asm__ __volatile__ (
+                "sb         %[p5],  2(%[s4])        \n\t"
+                "sb         %[p4],  1(%[s4])        \n\t"
+                "sb         %[p3],  0(%[s4])        \n\t"
+                "sb         %[p2], -1(%[s4])        \n\t"
+                "sb         %[p1], -2(%[s4])        \n\t"
+                "sb         %[p0], -3(%[s4])        \n\t"
+                :
+                : [p5] "r" (p5), [p4] "r" (p4), [p3] "r" (p3), [s4] "r" (s4),
+                  [p2] "r" (p2), [p1] "r" (p1), [p0] "r" (p0)
+            );
+
+            __asm__ __volatile__ (
+                "srl        %[p5], %[p5], 8         \n\t"
+                "srl        %[p4], %[p4], 8         \n\t"
+                "srl        %[p3], %[p3], 8         \n\t"
+                "srl        %[p2], %[p2], 8         \n\t"
+                "srl        %[p1], %[p1], 8         \n\t"
+                "srl        %[p0], %[p0], 8         \n\t"
+                : [p5] "+r" (p5), [p4] "+r" (p4), [p3] "+r" (p3),
+                  [p2] "+r" (p2), [p1] "+r" (p1), [p0] "+r" (p0)
+                :
+            );
+
+            __asm__ __volatile__ (
+                "sb         %[p5],  2(%[s3])        \n\t"
+                "sb         %[p4],  1(%[s3])        \n\t"
+                "sb         %[p3],  0(%[s3])        \n\t"
+                "sb         %[p2], -1(%[s3])        \n\t"
+                "sb         %[p1], -2(%[s3])        \n\t"
+                "sb         %[p0], -3(%[s3])        \n\t"
+                :
+                : [p5] "r" (p5), [p4] "r" (p4), [p3] "r" (p3), [s3] "r" (s3),
+                  [p2] "r" (p2), [p1] "r" (p1), [p0] "r" (p0)
+            );
+
+            __asm__ __volatile__ (
+                "srl        %[p5], %[p5], 8         \n\t"
+                "srl        %[p4], %[p4], 8         \n\t"
+                "srl        %[p3], %[p3], 8         \n\t"
+                "srl        %[p2], %[p2], 8         \n\t"
+                "srl        %[p1], %[p1], 8         \n\t"
+                "srl        %[p0], %[p0], 8         \n\t"
+                : [p5] "+r" (p5), [p4] "+r" (p4), [p3] "+r" (p3),
+                  [p2] "+r" (p2), [p1] "+r" (p1), [p0] "+r" (p0)
+                :
+            );
+
+            __asm__ __volatile__ (
+                "sb         %[p5],  2(%[s2])        \n\t"
+                "sb         %[p4],  1(%[s2])        \n\t"
+                "sb         %[p3],  0(%[s2])        \n\t"
+                "sb         %[p2], -1(%[s2])        \n\t"
+                "sb         %[p1], -2(%[s2])        \n\t"
+                "sb         %[p0], -3(%[s2])        \n\t"
+                :
+                : [p5] "r" (p5), [p4] "r" (p4), [p3] "r" (p3), [s2] "r" (s2),
+                  [p2] "r" (p2), [p1] "r" (p1), [p0] "r" (p0)
+            );
+
+            __asm__ __volatile__ (
+                "srl        %[p5], %[p5], 8         \n\t"
+                "srl        %[p4], %[p4], 8         \n\t"
+                "srl        %[p3], %[p3], 8         \n\t"
+                "srl        %[p2], %[p2], 8         \n\t"
+                "srl        %[p1], %[p1], 8         \n\t"
+                "srl        %[p0], %[p0], 8         \n\t"
+                : [p5] "+r" (p5), [p4] "+r" (p4), [p3] "+r" (p3),
+                  [p2] "+r" (p2), [p1] "+r" (p1), [p0] "+r" (p0)
+                :
+            );
+
+            __asm__ __volatile__ (
+                "sb         %[p5],  2(%[s1])        \n\t"
+                "sb         %[p4],  1(%[s1])        \n\t"
+                "sb         %[p3],  0(%[s1])        \n\t"
+                "sb         %[p2], -1(%[s1])        \n\t"
+                "sb         %[p1], -2(%[s1])        \n\t"
+                "sb         %[p0], -3(%[s1])        \n\t"
+                :
+                : [p5] "r" (p5), [p4] "r" (p4), [p3] "r" (p3), [s1] "r" (s1),
+                  [p2] "r" (p2), [p1] "r" (p1), [p0] "r" (p0)
+            );
+        }
+    }
+}
+
+/* Horizontal MB filtering */
+void vp8_loop_filter_mbh_dspr2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
+                               int y_stride, int uv_stride, loop_filter_info *lfi)
+{
+    unsigned int thresh_vec, flimit_vec, limit_vec;
+    unsigned char thresh, flimit, limit, flimit_temp;
+
+    /* use direct values instead of pointers */
+    limit = *(lfi->lim);
+    flimit_temp = *(lfi->mblim);
+    thresh = *(lfi->hev_thr);
+    flimit = flimit_temp;
+
+    /* create quad-byte */
+    __asm__ __volatile__ (
+        "replv.qb       %[thresh_vec], %[thresh]    \n\t"
+        "replv.qb       %[flimit_vec], %[flimit]    \n\t"
+        "replv.qb       %[limit_vec],  %[limit]     \n\t"
+        : [thresh_vec] "=&r" (thresh_vec), [flimit_vec] "=&r" (flimit_vec), [limit_vec] "=r" (limit_vec)
+        : [thresh] "r" (thresh), [flimit] "r" (flimit), [limit] "r" (limit)
+    );
+
+    vp8_mbloop_filter_horizontal_edge_mips(y_ptr, y_stride, flimit_vec, limit_vec, thresh_vec, 16);
+
+    if (u_ptr)
+    {
+        vp8_mbloop_filter_uvhorizontal_edge_mips(u_ptr, uv_stride, flimit_vec, limit_vec, thresh_vec, 0);
+    }
+
+    if (v_ptr)
+    {
+        vp8_mbloop_filter_uvhorizontal_edge_mips(v_ptr, uv_stride, flimit_vec, limit_vec, thresh_vec, 0);
+    }
+}
+
+
+/* Vertical MB Filtering */
+void vp8_loop_filter_mbv_dspr2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
+                               int y_stride, int uv_stride, loop_filter_info *lfi)
+{
+    unsigned int thresh_vec, flimit_vec, limit_vec;
+    unsigned char thresh, flimit, limit, flimit_temp;
+
+    /* use direct values instead of pointers */
+    limit = *(lfi->lim);
+    flimit_temp = *(lfi->mblim);
+    thresh = *(lfi->hev_thr);
+    flimit = flimit_temp;
+
+    /* create quad-byte */
+    __asm__ __volatile__ (
+        "replv.qb       %[thresh_vec], %[thresh]    \n\t"
+        "replv.qb       %[flimit_vec], %[flimit]    \n\t"
+        "replv.qb       %[limit_vec],  %[limit]     \n\t"
+        : [thresh_vec] "=&r" (thresh_vec), [flimit_vec] "=&r" (flimit_vec), [limit_vec] "=r" (limit_vec)
+        : [thresh] "r" (thresh), [flimit] "r" (flimit), [limit] "r" (limit)
+    );
+
+    vp8_mbloop_filter_vertical_edge_mips(y_ptr, y_stride, flimit_vec, limit_vec, thresh_vec, 16);
+
+    if (u_ptr)
+        vp8_mbloop_filter_uvvertical_edge_mips(u_ptr, uv_stride, flimit_vec, limit_vec, thresh_vec, 0);
+
+    if (v_ptr)
+        vp8_mbloop_filter_uvvertical_edge_mips(v_ptr, uv_stride, flimit_vec, limit_vec, thresh_vec, 0);
+}
+
+
+/* Horizontal B Filtering */
+void vp8_loop_filter_bh_dspr2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
+                              int y_stride, int uv_stride, loop_filter_info *lfi)
+{
+    unsigned int thresh_vec, flimit_vec, limit_vec;
+    unsigned char thresh, flimit, limit, flimit_temp;
+
+    /* use direct values instead of pointers */
+    limit = *(lfi->lim);
+    flimit_temp = *(lfi->blim);
+    thresh = *(lfi->hev_thr);
+    flimit = flimit_temp;
+
+    /* create quad-byte */
+    __asm__ __volatile__ (
+        "replv.qb       %[thresh_vec], %[thresh]    \n\t"
+        "replv.qb       %[flimit_vec], %[flimit]    \n\t"
+        "replv.qb       %[limit_vec],  %[limit]     \n\t"
+        : [thresh_vec] "=&r" (thresh_vec), [flimit_vec] "=&r" (flimit_vec), [limit_vec] "=r" (limit_vec)
+        : [thresh] "r" (thresh), [flimit] "r" (flimit), [limit] "r" (limit)
+    );
+
+    vp8_loop_filter_horizontal_edge_mips(y_ptr + 4 * y_stride, y_stride, flimit_vec, limit_vec, thresh_vec, 16);
+    vp8_loop_filter_horizontal_edge_mips(y_ptr + 8 * y_stride, y_stride, flimit_vec, limit_vec, thresh_vec, 16);
+    vp8_loop_filter_horizontal_edge_mips(y_ptr + 12 * y_stride, y_stride, flimit_vec, limit_vec, thresh_vec, 16);
+
+    if (u_ptr)
+        vp8_loop_filter_uvhorizontal_edge_mips(u_ptr + 4 * uv_stride, uv_stride, flimit_vec, limit_vec, thresh_vec, 0);
+
+    if (v_ptr)
+        vp8_loop_filter_uvhorizontal_edge_mips(v_ptr + 4 * uv_stride, uv_stride, flimit_vec, limit_vec, thresh_vec, 0);
+}
+
+
+/* Vertical B Filtering */
+void vp8_loop_filter_bv_dspr2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
+                              int y_stride, int uv_stride, loop_filter_info *lfi)
+{
+    unsigned int thresh_vec, flimit_vec, limit_vec;
+    unsigned char thresh, flimit, limit, flimit_temp;
+
+    /* use direct values instead of pointers */
+    limit = *(lfi->lim);
+    flimit_temp = *(lfi->blim);
+    thresh = *(lfi->hev_thr);
+    flimit = flimit_temp;
+
+    /* create quad-byte */
+    __asm__ __volatile__ (
+        "replv.qb       %[thresh_vec], %[thresh]    \n\t"
+        "replv.qb       %[flimit_vec], %[flimit]    \n\t"
+        "replv.qb       %[limit_vec],  %[limit]     \n\t"
+        : [thresh_vec] "=&r" (thresh_vec), [flimit_vec] "=&r" (flimit_vec), [limit_vec] "=r" (limit_vec)
+        : [thresh] "r" (thresh), [flimit] "r" (flimit), [limit] "r" (limit)
+    );
+
+    vp8_loop_filter_vertical_edge_mips(y_ptr + 4, y_stride, flimit_vec, limit_vec, thresh_vec, 16);
+    vp8_loop_filter_vertical_edge_mips(y_ptr + 8, y_stride, flimit_vec, limit_vec, thresh_vec, 16);
+    vp8_loop_filter_vertical_edge_mips(y_ptr + 12, y_stride, flimit_vec, limit_vec, thresh_vec, 16);
+
+    if (u_ptr)
+        vp8_loop_filter_uvvertical_edge_mips(u_ptr + 4, uv_stride, flimit_vec, limit_vec, thresh_vec, 0);
+
+    if (v_ptr)
+        vp8_loop_filter_uvvertical_edge_mips(v_ptr + 4, uv_stride, flimit_vec, limit_vec, thresh_vec, 0);
+}
+
+#endif
diff --git a/vp8/common/mips/dspr2/reconinter_dspr2.c b/vp8/common/mips/dspr2/reconinter_dspr2.c
new file mode 100644
index 0000000..a5239a3
--- /dev/null
+++ b/vp8/common/mips/dspr2/reconinter_dspr2.c
@@ -0,0 +1,121 @@
+/*
+ *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vpx_config.h"
+#include "vpx_rtcd.h"
+#include "vpx/vpx_integer.h"
+
+#if HAVE_DSPR2
+inline void prefetch_load_int(unsigned char *src)
+{
+    __asm__ __volatile__ (
+        "pref   0,  0(%[src])   \n\t"
+        :
+        : [src] "r" (src)
+    );
+}
+
+
+__inline void vp8_copy_mem16x16_dspr2(
+    unsigned char *RESTRICT src,
+    int src_stride,
+    unsigned char *RESTRICT dst,
+    int dst_stride)
+{
+    int r;
+    unsigned int a0, a1, a2, a3;
+
+    for (r = 16; r--;)
+    {
+        /* load src data in cache memory */
+        prefetch_load_int(src + src_stride);
+
+        /* use unaligned memory load and store */
+        __asm__ __volatile__ (
+            "ulw    %[a0], 0(%[src])            \n\t"
+            "ulw    %[a1], 4(%[src])            \n\t"
+            "ulw    %[a2], 8(%[src])            \n\t"
+            "ulw    %[a3], 12(%[src])           \n\t"
+            "sw     %[a0], 0(%[dst])            \n\t"
+            "sw     %[a1], 4(%[dst])            \n\t"
+            "sw     %[a2], 8(%[dst])            \n\t"
+            "sw     %[a3], 12(%[dst])           \n\t"
+            : [a0] "=&r" (a0), [a1] "=&r" (a1),
+              [a2] "=&r" (a2), [a3] "=&r" (a3)
+            : [src] "r" (src), [dst] "r" (dst)
+        );
+
+        src += src_stride;
+        dst += dst_stride;
+    }
+}
+
+
+__inline void vp8_copy_mem8x8_dspr2(
+    unsigned char *RESTRICT src,
+    int src_stride,
+    unsigned char *RESTRICT dst,
+    int dst_stride)
+{
+    int r;
+    unsigned int a0, a1;
+
+    /* load src data in cache memory */
+    prefetch_load_int(src + src_stride);
+
+    for (r = 8; r--;)
+    {
+        /* use unaligned memory load and store */
+        __asm__ __volatile__ (
+            "ulw    %[a0], 0(%[src])            \n\t"
+            "ulw    %[a1], 4(%[src])            \n\t"
+            "sw     %[a0], 0(%[dst])            \n\t"
+            "sw     %[a1], 4(%[dst])            \n\t"
+            : [a0] "=&r" (a0), [a1] "=&r" (a1)
+            : [src] "r" (src), [dst] "r" (dst)
+        );
+
+        src += src_stride;
+        dst += dst_stride;
+    }
+}
+
+
+__inline void vp8_copy_mem8x4_dspr2(
+    unsigned char *RESTRICT src,
+    int src_stride,
+    unsigned char *RESTRICT dst,
+    int dst_stride)
+{
+    int r;
+    unsigned int a0, a1;
+
+    /* load src data in cache memory */
+    prefetch_load_int(src + src_stride);
+
+    for (r = 4; r--;)
+    {
+        /* use unaligned memory load and store */
+        __asm__ __volatile__ (
+            "ulw    %[a0], 0(%[src])            \n\t"
+            "ulw    %[a1], 4(%[src])            \n\t"
+            "sw     %[a0], 0(%[dst])            \n\t"
+            "sw     %[a1], 4(%[dst])            \n\t"
+           : [a0] "=&r" (a0), [a1] "=&r" (a1)
+           : [src] "r" (src), [dst] "r" (dst)
+        );
+
+        src += src_stride;
+        dst += dst_stride;
+    }
+}
+
+#endif
diff --git a/vp8/common/onyx.h b/vp8/common/onyx.h
index 2c8188a..766b4ea 100644
--- a/vp8/common/onyx.h
+++ b/vp8/common/onyx.h
@@ -39,14 +39,6 @@
 
     typedef enum
     {
-        VP8_LAST_FLAG = 1,
-        VP8_GOLD_FLAG = 2,
-        VP8_ALT_FLAG = 4
-    } VP8_REFFRAME;
-
-
-    typedef enum
-    {
         USAGE_STREAM_FROM_SERVER    = 0x0,
         USAGE_LOCAL_FILE_PLAYBACK   = 0x1,
         USAGE_CONSTRAINED_QUALITY   = 0x2
@@ -102,83 +94,101 @@
 
     typedef struct
     {
-        int Version;            // 4 versions of bitstream defined 0 best quality/slowest decode, 3 lowest quality/fastest decode
-        int Width;              // width of data passed to the compressor
-        int Height;             // height of data passed to the compressor
+        /* 4 versions of bitstream defined:
+         *   0 best quality/slowest decode, 3 lowest quality/fastest decode
+         */
+        int Version;
+        int Width;
+        int Height;
         struct vpx_rational  timebase;
-        unsigned int target_bandwidth;    // bandwidth to be used in kilobits per second
+        unsigned int target_bandwidth;    /* kilobits per second */
 
-        int noise_sensitivity;   // parameter used for applying pre processing blur: recommendation 0
-        int Sharpness;          // parameter used for sharpening output: recommendation 0:
+        /* parameter used for applying pre processing blur: recommendation 0 */
+        int noise_sensitivity;
+
+        /* parameter used for sharpening output: recommendation 0: */
+        int Sharpness;
         int cpu_used;
         unsigned int rc_max_intra_bitrate_pct;
 
-        // mode ->
-        //(0)=Realtime/Live Encoding. This mode is optimized for realtim encoding (for example, capturing
-        //    a television signal or feed from a live camera). ( speed setting controls how fast )
-        //(1)=Good Quality Fast Encoding. The encoder balances quality with the amount of time it takes to
-        //    encode the output. ( speed setting controls how fast )
-        //(2)=One Pass - Best Quality. The encoder places priority on the quality of the output over encoding
-        //    speed. The output is compressed at the highest possible quality. This option takes the longest
-        //    amount of time to encode. ( speed setting ignored )
-        //(3)=Two Pass - First Pass. The encoder generates a file of statistics for use in the second encoding
-        //    pass. ( speed setting controls how fast )
-        //(4)=Two Pass - Second Pass. The encoder uses the statistics that were generated in the first encoding
-        //    pass to create the compressed output. ( speed setting controls how fast )
-        //(5)=Two Pass - Second Pass Best.  The encoder uses the statistics that were generated in the first
-        //    encoding pass to create the compressed output using the highest possible quality, and taking a
-        //    longer amount of time to encode.. ( speed setting ignored )
-        int Mode;               //
+        /* mode ->
+         *(0)=Realtime/Live Encoding. This mode is optimized for realtime
+         *    encoding (for example, capturing a television signal or feed
+         *    from a live camera). ( speed setting controls how fast )
+         *(1)=Good Quality Fast Encoding. The encoder balances quality with
+         *    the amount of time it takes to encode the output. ( speed
+         *    setting controls how fast )
+         *(2)=One Pass - Best Quality. The encoder places priority on the
+         *    quality of the output over encoding speed. The output is
+         *    compressed at the highest possible quality. This option takes
+         *    the longest amount of time to encode. ( speed setting ignored
+         *    )
+         *(3)=Two Pass - First Pass. The encoder generates a file of
+         *    statistics for use in the second encoding pass. ( speed
+         *    setting controls how fast )
+         *(4)=Two Pass - Second Pass. The encoder uses the statistics that
+         *    were generated in the first encoding pass to create the
+         *    compressed output. ( speed setting controls how fast )
+         *(5)=Two Pass - Second Pass Best.  The encoder uses the statistics
+         *    that were generated in the first encoding pass to create the
+         *    compressed output using the highest possible quality, and
+         *    taking a longer amount of time to encode. ( speed setting
+         *    ignored )
+         */
+        int Mode;
 
-        // Key Framing Operations
-        int auto_key;            // automatically detect cut scenes and set the keyframes
-        int key_freq;            // maximum distance to key frame.
+        /* Key Framing Operations */
+        int auto_key;       /* automatically detect cut scenes */
+        int key_freq;       /* maximum distance to key frame. */
 
-        int allow_lag;           // allow lagged compression (if 0 lagin frames is ignored)
-        int lag_in_frames;        // how many frames lag before we start encoding
+        /* lagged compression (if allow_lag == 0 lag_in_frames is ignored) */
+        int allow_lag;
+        int lag_in_frames; /* how many frames lag before we start encoding */
 
-        //----------------------------------------------------------------
-        // DATARATE CONTROL OPTIONS
+        /*
+         * DATARATE CONTROL OPTIONS
+         */
 
-        int end_usage; // vbr or cbr
+        int end_usage; /* vbr or cbr */
 
-        // buffer targeting aggressiveness
+        /* buffer targeting aggressiveness */
         int under_shoot_pct;
         int over_shoot_pct;
 
-        // buffering parameters
-        int64_t starting_buffer_level;  // in bytes
+        /* buffering parameters */
+        int64_t starting_buffer_level;
         int64_t optimal_buffer_level;
         int64_t maximum_buffer_size;
 
-        int64_t starting_buffer_level_in_ms;  // in milli-seconds
+        int64_t starting_buffer_level_in_ms;
         int64_t optimal_buffer_level_in_ms;
         int64_t maximum_buffer_size_in_ms;
 
-        // controlling quality
+        /* controlling quality */
         int fixed_q;
         int worst_allowed_q;
         int best_allowed_q;
         int cq_level;
 
-        // allow internal resizing ( currently disabled in the build !!!!!)
+        /* allow internal resizing */
         int allow_spatial_resampling;
         int resample_down_water_mark;
         int resample_up_water_mark;
 
-        // allow internal frame rate alterations
+        /* allow internal frame rate alterations */
         int allow_df;
         int drop_frames_water_mark;
 
-        // two pass datarate control
-        int two_pass_vbrbias;        // two pass datarate control tweaks
+        /* two pass datarate control */
+        int two_pass_vbrbias;
         int two_pass_vbrmin_section;
         int two_pass_vbrmax_section;
-        // END DATARATE CONTROL OPTIONS
-        //----------------------------------------------------------------
 
+        /*
+         * END DATARATE CONTROL OPTIONS
+         */
 
-        // these parameters aren't to be used in final build don't use!!!
+        /* these parameters aren't to be used in final build don't use!!! */
         int play_alternate;
         int alt_freq;
         int alt_q;
@@ -186,26 +196,28 @@
         int gold_q;
 
 
-        int multi_threaded;   // how many threads to run the encoder on
-        int token_partitions; // how many token partitions to create for multi core decoding
-        int encode_breakout;  // early breakout encode threshold : for video conf recommend 800
+        int multi_threaded;   /* how many threads to run the encoder on */
+        int token_partitions; /* how many token partitions to create */
 
-        unsigned int error_resilient_mode; // Bitfield defining the error
-                                   // resiliency features to enable. Can provide
-                                   // decodable frames after losses in previous
-                                   // frames and decodable partitions after
-                                   // losses in the same frame.
+        /* early breakout threshold: for video conf recommend 800 */
+        int encode_breakout;
+
+        /* Bitfield defining the error resiliency features to enable.
+         * Can provide decodable frames after losses in previous
+         * frames and decodable partitions after losses in the same frame.
+         */
+        unsigned int error_resilient_mode;
 
         int arnr_max_frames;
-        int arnr_strength ;
-        int arnr_type     ;
+        int arnr_strength;
+        int arnr_type;
 
-        struct vpx_fixed_buf         two_pass_stats_in;
+        struct vpx_fixed_buf        two_pass_stats_in;
         struct vpx_codec_pkt_list  *output_pkt_list;
 
         vp8e_tuning tuning;
 
-        // Temporal scaling parameters
+        /* Temporal scaling parameters */
         unsigned int number_of_layers;
         unsigned int target_bitrate[VPX_TS_MAX_PERIODICITY];
         unsigned int rate_decimator[VPX_TS_MAX_PERIODICITY];
@@ -236,16 +248,14 @@
     void vp8_init_config(struct VP8_COMP* onyx, VP8_CONFIG *oxcf);
     void vp8_change_config(struct VP8_COMP* onyx, VP8_CONFIG *oxcf);
 
-// receive a frames worth of data caller can assume that a copy of this frame is made
-// and not just a copy of the pointer..
     int vp8_receive_raw_frame(struct VP8_COMP* comp, unsigned int frame_flags, YV12_BUFFER_CONFIG *sd, int64_t time_stamp, int64_t end_time_stamp);
     int vp8_get_compressed_data(struct VP8_COMP* comp, unsigned int *frame_flags, unsigned long *size, unsigned char *dest, unsigned char *dest_end, int64_t *time_stamp, int64_t *time_end, int flush);
     int vp8_get_preview_raw_frame(struct VP8_COMP* comp, YV12_BUFFER_CONFIG *dest, vp8_ppflags_t *flags);
 
     int vp8_use_as_reference(struct VP8_COMP* comp, int ref_frame_flags);
     int vp8_update_reference(struct VP8_COMP* comp, int ref_frame_flags);
-    int vp8_get_reference(struct VP8_COMP* comp, VP8_REFFRAME ref_frame_flag, YV12_BUFFER_CONFIG *sd);
-    int vp8_set_reference(struct VP8_COMP* comp, VP8_REFFRAME ref_frame_flag, YV12_BUFFER_CONFIG *sd);
+    int vp8_get_reference(struct VP8_COMP* comp, enum vpx_ref_frame_type ref_frame_flag, YV12_BUFFER_CONFIG *sd);
+    int vp8_set_reference(struct VP8_COMP* comp, enum vpx_ref_frame_type ref_frame_flag, YV12_BUFFER_CONFIG *sd);
     int vp8_update_entropy(struct VP8_COMP* comp, int update);
     int vp8_set_roimap(struct VP8_COMP* comp, unsigned char *map, unsigned int rows, unsigned int cols, int delta_q[4], int delta_lf[4], unsigned int threshold[4]);
     int vp8_set_active_map(struct VP8_COMP* comp, unsigned char *map, unsigned int rows, unsigned int cols);
diff --git a/vp8/common/onyxc_int.h b/vp8/common/onyxc_int.h
index d7d02c2..c44ba8d 100644
--- a/vp8/common/onyxc_int.h
+++ b/vp8/common/onyxc_int.h
@@ -108,7 +108,6 @@
     int full_pixel;
 
     int base_qindex;
-    int last_kf_gf_q;  /* Q used on the last GF or KF */
 
     int y1dc_delta_q;
     int y2dc_delta_q;
diff --git a/vp8/common/onyxd.h b/vp8/common/onyxd.h
index 35a8b6e..60af1cc 100644
--- a/vp8/common/onyxd.h
+++ b/vp8/common/onyxd.h
@@ -22,6 +22,7 @@
 #include "ppflags.h"
 #include "vpx_ports/mem.h"
 #include "vpx/vpx_codec.h"
+#include "vpx/vp8.h"
 
     struct VP8D_COMP;
 
@@ -35,12 +36,6 @@
         int     error_concealment;
         int     input_fragments;
     } VP8D_CONFIG;
-    typedef enum
-    {
-        VP8_LAST_FLAG = 1,
-        VP8_GOLD_FLAG = 2,
-        VP8_ALT_FLAG = 4
-    } VP8_REFFRAME;
 
     typedef enum
     {
@@ -56,8 +51,8 @@
     int vp8dx_receive_compressed_data(struct VP8D_COMP* comp, unsigned long size, const unsigned char *dest, int64_t time_stamp);
     int vp8dx_get_raw_frame(struct VP8D_COMP* comp, YV12_BUFFER_CONFIG *sd, int64_t *time_stamp, int64_t *time_end_stamp, vp8_ppflags_t *flags);
 
-    vpx_codec_err_t vp8dx_get_reference(struct VP8D_COMP* comp, VP8_REFFRAME ref_frame_flag, YV12_BUFFER_CONFIG *sd);
-    vpx_codec_err_t vp8dx_set_reference(struct VP8D_COMP* comp, VP8_REFFRAME ref_frame_flag, YV12_BUFFER_CONFIG *sd);
+    vpx_codec_err_t vp8dx_get_reference(struct VP8D_COMP* comp, enum vpx_ref_frame_type ref_frame_flag, YV12_BUFFER_CONFIG *sd);
+    vpx_codec_err_t vp8dx_set_reference(struct VP8D_COMP* comp, enum vpx_ref_frame_type ref_frame_flag, YV12_BUFFER_CONFIG *sd);
 
     struct VP8D_COMP* vp8dx_create_decompressor(VP8D_CONFIG *oxcf);
 
diff --git a/vp8/common/postproc.c b/vp8/common/postproc.c
index ccf6ad7..a7509ff 100644
--- a/vp8/common/postproc.c
+++ b/vp8/common/postproc.c
@@ -143,9 +143,7 @@
     int col;
     int i;
     int v;
-    int pitch = src_pixels_per_line;
     unsigned char d[8];
-    (void)dst_pixels_per_line;
 
     for (row = 0; row < rows; row++)
     {
@@ -161,10 +159,10 @@
 
             for (i = -2; i <= 2; i++)
             {
-                if (abs(v - p_src[col+i*pitch]) > flimit)
+                if (abs(v - p_src[col+i*src_pixels_per_line]) > flimit)
                     goto down_skip_convolve;
 
-                kernel += kernel5[2+i] * p_src[col+i*pitch];
+                kernel += kernel5[2+i] * p_src[col+i*src_pixels_per_line];
             }
 
             v = (kernel >> 3);
@@ -211,10 +209,9 @@
         p_dst[col-2] = d[(col-2)&7];
         p_dst[col-1] = d[(col-1)&7];
 
-
         /* next row */
-        src_ptr += pitch;
-        dst_ptr += pitch;
+        src_ptr += src_pixels_per_line;
+        dst_ptr += dst_pixels_per_line;
     }
 }
 
@@ -240,8 +237,9 @@
         for (i = -8; i<0; i++)
           s[i]=s[0];
 
-        // 17 avoids valgrind warning - we buffer values in c in d
-        // and only write them when we've read 8 ahead...
+        /* 17 avoids valgrind warning - we buffer values in c in d
+         * and only write them when we've read 8 ahead...
+         */
         for (i = cols; i<cols+17; i++)
           s[i]=s[cols-1];
 
@@ -275,9 +273,6 @@
 }
 
 
-
-
-
 void vp8_mbpost_proc_down_c(unsigned char *dst, int pitch, int rows, int cols, int flimit)
 {
     int r, c, i;
@@ -294,8 +289,9 @@
         for (i = -8; i < 0; i++)
           s[i*pitch]=s[0];
 
-        // 17 avoids valgrind warning - we buffer values in c in d
-        // and only write them when we've read 8 ahead...
+        /* 17 avoids valgrind warning - we buffer values in c in d
+         * and only write them when we've read 8 ahead...
+         */
         for (i = rows; i < rows+17; i++)
           s[i*pitch]=s[(rows-1)*pitch];
 
@@ -441,7 +437,7 @@
 
         }
 
-        for (next = next; next < 256; next++)
+        for (; next < 256; next++)
             char_dist[next] = 0;
 
     }
@@ -731,8 +727,9 @@
 
             oci->post_proc_buffer_int_used = 1;
 
-            // insure that postproc is set to all 0's so that post proc
-            // doesn't pull random data in from edge
+            /* ensure that postproc is set to all 0's so that post proc
+             * doesn't pull random data in from edge
+             */
             vpx_memset((&oci->post_proc_buffer_int)->buffer_alloc,128,(&oci->post_proc_buffer)->frame_size);
 
         }
diff --git a/vp8/common/quant_common.c b/vp8/common/quant_common.c
index e9833fe..05f9210 100644
--- a/vp8/common/quant_common.c
+++ b/vp8/common/quant_common.c
@@ -109,7 +109,10 @@
     else if (QIndex < 0)
         QIndex = 0;
 
-    retval = (ac_qlookup[ QIndex ] * 155) / 100;
+    /* For all x in [0..284], x*155/100 is bitwise equal to (x*101581) >> 16.
+     * The smallest precision for that is '(x*6349) >> 12' but 16 is a good
+     * word size. */
+    retval = (ac_qlookup[ QIndex ] * 101581) >> 16;
 
     if (retval < 8)
         retval = 8;
diff --git a/vp8/common/reconintra4x4.c b/vp8/common/reconintra4x4.c
index dcc35ec..7bb8d0a 100644
--- a/vp8/common/reconintra4x4.c
+++ b/vp8/common/reconintra4x4.c
@@ -13,11 +13,11 @@
 #include "vpx_rtcd.h"
 #include "blockd.h"
 
-void vp8_intra4x4_predict_d_c(unsigned char *Above,
-                              unsigned char *yleft, int left_stride,
-                              int b_mode,
-                              unsigned char *dst, int dst_stride,
-                              unsigned char top_left)
+void vp8_intra4x4_predict_c(unsigned char *Above,
+                            unsigned char *yleft, int left_stride,
+                            B_PREDICTION_MODE b_mode,
+                            unsigned char *dst, int dst_stride,
+                            unsigned char top_left)
 {
     int i, r, c;
 
@@ -290,19 +290,8 @@
     }
     break;
 
+    default:
+    break;
 
     }
 }
-
-void vp8_intra4x4_predict_c(unsigned char *src, int src_stride,
-                            int b_mode,
-                            unsigned char *dst, int dst_stride)
-{
-    unsigned char *Above = src - src_stride;
-
-    vp8_intra4x4_predict_d_c(Above,
-                             src - 1, src_stride,
-                             b_mode,
-                             dst, dst_stride,
-                             Above[-1]);
-}
diff --git a/vp8/common/rtcd.c b/vp8/common/rtcd.c
index 232640d..4980f48 100644
--- a/vp8/common/rtcd.c
+++ b/vp8/common/rtcd.c
@@ -10,3 +10,60 @@
 #include "vpx_config.h"
 #define RTCD_C
 #include "vpx_rtcd.h"
+
+#if CONFIG_MULTITHREAD && HAVE_PTHREAD_H
+#include <pthread.h>
+static void once(void (*func)(void))
+{
+    static pthread_once_t lock = PTHREAD_ONCE_INIT;
+    pthread_once(&lock, func);
+}
+
+
+#elif CONFIG_MULTITHREAD && defined(_WIN32)
+#include <windows.h>
+static void once(void (*func)(void))
+{
+    /* Using a static initializer here rather than InitializeCriticalSection()
+     * since there's no race-free context in which to execute it. Protecting
+     * it with an atomic op like InterlockedCompareExchangePointer introduces
+     * an x86 dependency, and InitOnceExecuteOnce requires Vista.
+     */
+    static CRITICAL_SECTION lock = {(void *)-1, -1, 0, 0, 0, 0};
+    static int done;
+
+    EnterCriticalSection(&lock);
+
+    if (!done)
+    {
+        func();
+        done = 1;
+    }
+
+    LeaveCriticalSection(&lock);
+}
+
+
+#else
+/* No-op version that performs no synchronization. vpx_rtcd() is idempotent,
+ * so as long as your platform provides atomic loads/stores of pointers
+ * no synchronization is strictly necessary.
+ */
+
+static void once(void (*func)(void))
+{
+    static int done;
+
+    if(!done)
+    {
+        func();
+        done = 1;
+    }
+}
+#endif
+
+
+void vpx_rtcd()
+{
+    once(setup_rtcd_internal);
+}
diff --git a/vp8/common/rtcd_defs.sh b/vp8/common/rtcd_defs.sh
index d6cbd4a..f0bdf29 100644
--- a/vp8/common/rtcd_defs.sh
+++ b/vp8/common/rtcd_defs.sh
@@ -1,5 +1,7 @@
 common_forward_decls() {
 cat <<EOF
+#include "vp8/common/blockd.h"
+
 struct blockd;
 struct macroblockd;
 struct loop_filter_info;
@@ -22,35 +24,42 @@
 vp8_dequantize_b_media=vp8_dequantize_b_v6
 
 prototype void vp8_dequant_idct_add "short *input, short *dq, unsigned char *output, int stride"
-specialize vp8_dequant_idct_add mmx media neon
+specialize vp8_dequant_idct_add mmx media neon dspr2
 vp8_dequant_idct_add_media=vp8_dequant_idct_add_v6
+vp8_dequant_idct_add_dspr2=vp8_dequant_idct_add_dspr2
 
 prototype void vp8_dequant_idct_add_y_block "short *q, short *dq, unsigned char *dst, int stride, char *eobs"
-specialize vp8_dequant_idct_add_y_block mmx sse2 media neon
+specialize vp8_dequant_idct_add_y_block mmx sse2 media neon dspr2
 vp8_dequant_idct_add_y_block_media=vp8_dequant_idct_add_y_block_v6
+vp8_dequant_idct_add_y_block_dspr2=vp8_dequant_idct_add_y_block_dspr2
 
 prototype void vp8_dequant_idct_add_uv_block "short *q, short *dq, unsigned char *dst_u, unsigned char *dst_v, int stride, char *eobs"
-specialize vp8_dequant_idct_add_uv_block mmx sse2 media neon
+specialize vp8_dequant_idct_add_uv_block mmx sse2 media neon dspr2
 vp8_dequant_idct_add_uv_block_media=vp8_dequant_idct_add_uv_block_v6
+vp8_dequant_idct_add_uv_block_dspr2=vp8_dequant_idct_add_uv_block_dspr2
 
 #
 # Loopfilter
 #
 prototype void vp8_loop_filter_mbv "unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi"
-specialize vp8_loop_filter_mbv mmx sse2 media neon
+specialize vp8_loop_filter_mbv mmx sse2 media neon dspr2
 vp8_loop_filter_mbv_media=vp8_loop_filter_mbv_armv6
+vp8_loop_filter_mbv_dspr2=vp8_loop_filter_mbv_dspr2
 
 prototype void vp8_loop_filter_bv "unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi"
-specialize vp8_loop_filter_bv mmx sse2 media neon
+specialize vp8_loop_filter_bv mmx sse2 media neon dspr2
 vp8_loop_filter_bv_media=vp8_loop_filter_bv_armv6
+vp8_loop_filter_bv_dspr2=vp8_loop_filter_bv_dspr2
 
 prototype void vp8_loop_filter_mbh "unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi"
-specialize vp8_loop_filter_mbh mmx sse2 media neon
+specialize vp8_loop_filter_mbh mmx sse2 media neon dspr2
 vp8_loop_filter_mbh_media=vp8_loop_filter_mbh_armv6
+vp8_loop_filter_mbh_dspr2=vp8_loop_filter_mbh_dspr2
 
 prototype void vp8_loop_filter_bh "unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi"
-specialize vp8_loop_filter_bh mmx sse2 media neon
+specialize vp8_loop_filter_bh mmx sse2 media neon dspr2
 vp8_loop_filter_bh_media=vp8_loop_filter_bh_armv6
+vp8_loop_filter_bh_dspr2=vp8_loop_filter_bh_dspr2
 
 
 prototype void vp8_loop_filter_simple_mbv "unsigned char *y, int ystride, const unsigned char *blimit"
@@ -90,37 +99,45 @@
 #
 #idct16
 prototype void vp8_short_idct4x4llm "short *input, unsigned char *pred, int pitch, unsigned char *dst, int dst_stride"
-specialize vp8_short_idct4x4llm mmx media neon
+specialize vp8_short_idct4x4llm mmx media neon dspr2
 vp8_short_idct4x4llm_media=vp8_short_idct4x4llm_v6_dual
+vp8_short_idct4x4llm_dspr2=vp8_short_idct4x4llm_dspr2
 
 #iwalsh1
 prototype void vp8_short_inv_walsh4x4_1 "short *input, short *output"
+specialize vp8_short_inv_walsh4x4_1 dspr2
+vp8_short_inv_walsh4x4_1_dspr2=vp8_short_inv_walsh4x4_1_dspr2
 # no asm yet
 
 #iwalsh16
 prototype void vp8_short_inv_walsh4x4 "short *input, short *output"
-specialize vp8_short_inv_walsh4x4 mmx sse2 media neon
+specialize vp8_short_inv_walsh4x4 mmx sse2 media neon dspr2
 vp8_short_inv_walsh4x4_media=vp8_short_inv_walsh4x4_v6
+vp8_short_inv_walsh4x4_dspr2=vp8_short_inv_walsh4x4_dspr2
 
 #idct1_scalar_add
 prototype void vp8_dc_only_idct_add "short input, unsigned char *pred, int pred_stride, unsigned char *dst, int dst_stride"
-specialize vp8_dc_only_idct_add	mmx media neon
+specialize vp8_dc_only_idct_add	mmx media neon dspr2
 vp8_dc_only_idct_add_media=vp8_dc_only_idct_add_v6
+vp8_dc_only_idct_add_dspr2=vp8_dc_only_idct_add_dspr2
 
 #
 # RECON
 #
 prototype void vp8_copy_mem16x16 "unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch"
-specialize vp8_copy_mem16x16 mmx sse2 media neon
+specialize vp8_copy_mem16x16 mmx sse2 media neon dspr2
 vp8_copy_mem16x16_media=vp8_copy_mem16x16_v6
+vp8_copy_mem16x16_dspr2=vp8_copy_mem16x16_dspr2
 
 prototype void vp8_copy_mem8x8 "unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch"
-specialize vp8_copy_mem8x8 mmx media neon
+specialize vp8_copy_mem8x8 mmx media neon dspr2
 vp8_copy_mem8x8_media=vp8_copy_mem8x8_v6
+vp8_copy_mem8x8_dspr2=vp8_copy_mem8x8_dspr2
 
 prototype void vp8_copy_mem8x4 "unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch"
-specialize vp8_copy_mem8x4 mmx media neon
+specialize vp8_copy_mem8x4 mmx media neon dspr2
 vp8_copy_mem8x4_media=vp8_copy_mem8x4_v6
+vp8_copy_mem8x4_dspr2=vp8_copy_mem8x4_dspr2
 
 prototype void vp8_build_intra_predictors_mby_s "struct macroblockd *x, unsigned char * yabove_row, unsigned char * yleft, int left_stride, unsigned char * ypred_ptr, int y_stride"
 specialize vp8_build_intra_predictors_mby_s sse2 ssse3
@@ -129,8 +146,7 @@
 prototype void vp8_build_intra_predictors_mbuv_s "struct macroblockd *x, unsigned char * uabove_row, unsigned char * vabove_row,  unsigned char *uleft, unsigned char *vleft, int left_stride, unsigned char * upred_ptr, unsigned char * vpred_ptr, int pred_stride"
 specialize vp8_build_intra_predictors_mbuv_s sse2 ssse3
 
-prototype void vp8_intra4x4_predict_d "unsigned char *above, unsigned char *left, int left_stride, int b_mode, unsigned char *dst, int dst_stride, unsigned char top_left"
-prototype void vp8_intra4x4_predict "unsigned char *src, int src_stride, int b_mode, unsigned char *dst, int dst_stride"
+prototype void vp8_intra4x4_predict "unsigned char *Above, unsigned char *yleft, int left_stride, B_PREDICTION_MODE b_mode, unsigned char *dst, int dst_stride, unsigned char top_left"
 specialize vp8_intra4x4_predict media
 vp8_intra4x4_predict_media=vp8_intra4x4_predict_armv6
 
@@ -177,20 +193,24 @@
 # Subpixel
 #
 prototype void vp8_sixtap_predict16x16 "unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch"
-specialize vp8_sixtap_predict16x16 mmx sse2 ssse3 media neon
+specialize vp8_sixtap_predict16x16 mmx sse2 ssse3 media neon dspr2
 vp8_sixtap_predict16x16_media=vp8_sixtap_predict16x16_armv6
+vp8_sixtap_predict16x16_dspr2=vp8_sixtap_predict16x16_dspr2
 
 prototype void vp8_sixtap_predict8x8 "unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch"
-specialize vp8_sixtap_predict8x8 mmx sse2 ssse3 media neon
+specialize vp8_sixtap_predict8x8 mmx sse2 ssse3 media neon dspr2
 vp8_sixtap_predict8x8_media=vp8_sixtap_predict8x8_armv6
+vp8_sixtap_predict8x8_dspr2=vp8_sixtap_predict8x8_dspr2
 
 prototype void vp8_sixtap_predict8x4 "unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch"
-specialize vp8_sixtap_predict8x4 mmx sse2 ssse3 media neon
+specialize vp8_sixtap_predict8x4 mmx sse2 ssse3 media neon dspr2
 vp8_sixtap_predict8x4_media=vp8_sixtap_predict8x4_armv6
+vp8_sixtap_predict8x4_dspr2=vp8_sixtap_predict8x4_dspr2
 
 prototype void vp8_sixtap_predict4x4 "unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch"
-specialize vp8_sixtap_predict4x4 mmx ssse3 media neon
+specialize vp8_sixtap_predict4x4 mmx ssse3 media neon dspr2
 vp8_sixtap_predict4x4_media=vp8_sixtap_predict4x4_armv6
+vp8_sixtap_predict4x4_dspr2=vp8_sixtap_predict4x4_dspr2
 
 prototype void vp8_bilinear_predict16x16 "unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch"
 specialize vp8_bilinear_predict16x16 mmx sse2 ssse3 media neon
@@ -276,23 +296,23 @@
 #
 # Single block SAD
 #
-prototype unsigned int vp8_sad4x4 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, int max_sad"
+prototype unsigned int vp8_sad4x4 "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad"
 specialize vp8_sad4x4 mmx sse2 neon
 vp8_sad4x4_sse2=vp8_sad4x4_wmt
 
-prototype unsigned int vp8_sad8x8 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, int max_sad"
+prototype unsigned int vp8_sad8x8 "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad"
 specialize vp8_sad8x8 mmx sse2 neon
 vp8_sad8x8_sse2=vp8_sad8x8_wmt
 
-prototype unsigned int vp8_sad8x16 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, int max_sad"
+prototype unsigned int vp8_sad8x16 "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad"
 specialize vp8_sad8x16 mmx sse2 neon
 vp8_sad8x16_sse2=vp8_sad8x16_wmt
 
-prototype unsigned int vp8_sad16x8 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, int max_sad"
+prototype unsigned int vp8_sad16x8 "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad"
 specialize vp8_sad16x8 mmx sse2 neon
 vp8_sad16x8_sse2=vp8_sad16x8_wmt
 
-prototype unsigned int vp8_sad16x16 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, int max_sad"
+prototype unsigned int vp8_sad16x16 "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad"
 specialize vp8_sad16x16 mmx sse2 sse3 media neon
 vp8_sad16x16_sse2=vp8_sad16x16_wmt
 vp8_sad16x16_media=vp8_sad16x16_armv6
@@ -300,59 +320,59 @@
 #
 # Multi-block SAD, comparing a reference to N blocks 1 pixel apart horizontally
 #
-prototype void vp8_sad4x4x3 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int *sad_array"
+prototype void vp8_sad4x4x3 "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int *sad_array"
 specialize vp8_sad4x4x3 sse3
 
-prototype void vp8_sad8x8x3 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int *sad_array"
+prototype void vp8_sad8x8x3 "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int *sad_array"
 specialize vp8_sad8x8x3 sse3
 
-prototype void vp8_sad8x16x3 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int *sad_array"
+prototype void vp8_sad8x16x3 "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int *sad_array"
 specialize vp8_sad8x16x3 sse3
 
-prototype void vp8_sad16x8x3 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int *sad_array"
+prototype void vp8_sad16x8x3 "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int *sad_array"
 specialize vp8_sad16x8x3 sse3 ssse3
 
-prototype void vp8_sad16x16x3 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int *sad_array"
+prototype void vp8_sad16x16x3 "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int *sad_array"
 specialize vp8_sad16x16x3 sse3 ssse3
 
 # Note the only difference in the following prototypes is that they return into
 # an array of short
-prototype void vp8_sad4x4x8 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned short *sad_array"
+prototype void vp8_sad4x4x8 "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned short *sad_array"
 specialize vp8_sad4x4x8 sse4_1
 vp8_sad4x4x8_sse4_1=vp8_sad4x4x8_sse4
 
-prototype void vp8_sad8x8x8 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned short *sad_array"
+prototype void vp8_sad8x8x8 "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned short *sad_array"
 specialize vp8_sad8x8x8 sse4_1
 vp8_sad8x8x8_sse4_1=vp8_sad8x8x8_sse4
 
-prototype void vp8_sad8x16x8 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned short *sad_array"
+prototype void vp8_sad8x16x8 "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned short *sad_array"
 specialize vp8_sad8x16x8 sse4_1
 vp8_sad8x16x8_sse4_1=vp8_sad8x16x8_sse4
 
-prototype void vp8_sad16x8x8 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned short *sad_array"
+prototype void vp8_sad16x8x8 "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned short *sad_array"
 specialize vp8_sad16x8x8 sse4_1
 vp8_sad16x8x8_sse4_1=vp8_sad16x8x8_sse4
 
-prototype void vp8_sad16x16x8 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned short *sad_array"
+prototype void vp8_sad16x16x8 "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned short *sad_array"
 specialize vp8_sad16x16x8 sse4_1
 vp8_sad16x16x8_sse4_1=vp8_sad16x16x8_sse4
 
 #
 # Multi-block SAD, comparing a reference to N independent blocks
 #
-prototype void vp8_sad4x4x4d "const unsigned char *src_ptr, int source_stride, unsigned char *ref_ptr[4], int  ref_stride, unsigned int *sad_array"
+prototype void vp8_sad4x4x4d "const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int  ref_stride, unsigned int *sad_array"
 specialize vp8_sad4x4x4d sse3
 
-prototype void vp8_sad8x8x4d "const unsigned char *src_ptr, int source_stride, unsigned char *ref_ptr[4], int  ref_stride, unsigned int *sad_array"
+prototype void vp8_sad8x8x4d "const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int  ref_stride, unsigned int *sad_array"
 specialize vp8_sad8x8x4d sse3
 
-prototype void vp8_sad8x16x4d "const unsigned char *src_ptr, int source_stride, unsigned char *ref_ptr[4], int  ref_stride, unsigned int *sad_array"
+prototype void vp8_sad8x16x4d "const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int  ref_stride, unsigned int *sad_array"
 specialize vp8_sad8x16x4d sse3
 
-prototype void vp8_sad16x8x4d "const unsigned char *src_ptr, int source_stride, unsigned char *ref_ptr[4], int  ref_stride, unsigned int *sad_array"
+prototype void vp8_sad16x8x4d "const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int  ref_stride, unsigned int *sad_array"
 specialize vp8_sad16x8x4d sse3
 
-prototype void vp8_sad16x16x4d "const unsigned char *src_ptr, int source_stride, unsigned char *ref_ptr[4], int  ref_stride, unsigned int *sad_array"
+prototype void vp8_sad16x16x4d "const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int  ref_stride, unsigned int *sad_array"
 specialize vp8_sad16x16x4d sse3
 
 #
diff --git a/vp8/common/sad_c.c b/vp8/common/sad_c.c
index 6a3e889..5f36fc9 100644
--- a/vp8/common/sad_c.c
+++ b/vp8/common/sad_c.c
@@ -9,21 +9,15 @@
  */
 
 
+#include <limits.h>
 #include <stdlib.h>
 #include "vpx_config.h"
 #include "vpx/vpx_integer.h"
 
-static
-unsigned int sad_mx_n_c(
-    const unsigned char *src_ptr,
-    int  src_stride,
-    const unsigned char *ref_ptr,
-    int  ref_stride,
-    int  max_sad,
-    int  m,
-    int  n)
+static unsigned int sad_mx_n_c(const unsigned char *src_ptr, int src_stride,
+                               const unsigned char *ref_ptr, int ref_stride,
+                               unsigned int max_sad, int m, int n)
 {
-
     int r, c;
     unsigned int sad = 0;
 
@@ -48,298 +42,211 @@
  * implementations of these functions are not required to check it.
  */
 
-unsigned int vp8_sad16x16_c(
-    const unsigned char *src_ptr,
-    int  src_stride,
-    const unsigned char *ref_ptr,
-    int  ref_stride,
-    int  max_sad)
+unsigned int vp8_sad16x16_c(const unsigned char *src_ptr, int src_stride,
+                            const unsigned char *ref_ptr, int ref_stride,
+                            unsigned int max_sad)
 {
-
     return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, max_sad, 16, 16);
 }
 
-
-unsigned int vp8_sad8x8_c(
-    const unsigned char *src_ptr,
-    int  src_stride,
-    const unsigned char *ref_ptr,
-    int  ref_stride,
-    int  max_sad)
+unsigned int vp8_sad8x8_c(const unsigned char *src_ptr, int src_stride,
+                          const unsigned char *ref_ptr, int ref_stride,
+                          unsigned int max_sad)
 {
-
     return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, max_sad, 8, 8);
 }
 
-
-unsigned int vp8_sad16x8_c(
-    const unsigned char *src_ptr,
-    int  src_stride,
-    const unsigned char *ref_ptr,
-    int  ref_stride,
-    int  max_sad)
+unsigned int vp8_sad16x8_c(const unsigned char *src_ptr, int src_stride,
+                           const unsigned char *ref_ptr, int ref_stride,
+                           unsigned int max_sad)
 {
-
     return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, max_sad, 16, 8);
 
 }
 
-
-unsigned int vp8_sad8x16_c(
-    const unsigned char *src_ptr,
-    int  src_stride,
-    const unsigned char *ref_ptr,
-    int  ref_stride,
-    int  max_sad)
+unsigned int vp8_sad8x16_c(const unsigned char *src_ptr, int src_stride,
+                           const unsigned char *ref_ptr, int ref_stride,
+                           unsigned int max_sad)
 {
-
     return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, max_sad, 8, 16);
 }
 
-
-unsigned int vp8_sad4x4_c(
-    const unsigned char *src_ptr,
-    int  src_stride,
-    const unsigned char *ref_ptr,
-    int  ref_stride,
-    int  max_sad)
+unsigned int vp8_sad4x4_c(const unsigned char *src_ptr, int src_stride,
+                          const unsigned char *ref_ptr, int ref_stride,
+                          unsigned int max_sad)
 {
-
     return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, max_sad, 4, 4);
 }
 
-void vp8_sad16x16x3_c(
-    const unsigned char *src_ptr,
-    int  src_stride,
-    const unsigned char *ref_ptr,
-    int  ref_stride,
-    unsigned int *sad_array
-)
+void vp8_sad16x16x3_c(const unsigned char *src_ptr, int src_stride,
+                      const unsigned char *ref_ptr, int ref_stride,
+                      unsigned int *sad_array)
 {
-    sad_array[0] = vp8_sad16x16_c(src_ptr, src_stride, ref_ptr  , ref_stride, 0x7fffffff);
-    sad_array[1] = vp8_sad16x16_c(src_ptr, src_stride, ref_ptr + 1, ref_stride, 0x7fffffff);
-    sad_array[2] = vp8_sad16x16_c(src_ptr, src_stride, ref_ptr + 2, ref_stride, 0x7fffffff);
+    sad_array[0] = vp8_sad16x16_c(src_ptr, src_stride, ref_ptr + 0, ref_stride, UINT_MAX);
+    sad_array[1] = vp8_sad16x16_c(src_ptr, src_stride, ref_ptr + 1, ref_stride, UINT_MAX);
+    sad_array[2] = vp8_sad16x16_c(src_ptr, src_stride, ref_ptr + 2, ref_stride, UINT_MAX);
 }
 
-void vp8_sad16x16x8_c(
-    const unsigned char *src_ptr,
-    int  src_stride,
-    const unsigned char *ref_ptr,
-    int  ref_stride,
-    unsigned short *sad_array
-)
+void vp8_sad16x16x8_c(const unsigned char *src_ptr, int src_stride,
+                      const unsigned char *ref_ptr, int ref_stride,
+                      unsigned short *sad_array)
 {
-    sad_array[0] = (unsigned short)vp8_sad16x16_c(src_ptr, src_stride, ref_ptr  , ref_stride, 0x7fffffff);
-    sad_array[1] = (unsigned short)vp8_sad16x16_c(src_ptr, src_stride, ref_ptr + 1, ref_stride, 0x7fffffff);
-    sad_array[2] = (unsigned short)vp8_sad16x16_c(src_ptr, src_stride, ref_ptr + 2, ref_stride, 0x7fffffff);
-    sad_array[3] = (unsigned short)vp8_sad16x16_c(src_ptr, src_stride, ref_ptr + 3 , ref_stride, 0x7fffffff);
-    sad_array[4] = (unsigned short)vp8_sad16x16_c(src_ptr, src_stride, ref_ptr + 4, ref_stride, 0x7fffffff);
-    sad_array[5] = (unsigned short)vp8_sad16x16_c(src_ptr, src_stride, ref_ptr + 5, ref_stride, 0x7fffffff);
-    sad_array[6] = (unsigned short)vp8_sad16x16_c(src_ptr, src_stride, ref_ptr + 6 , ref_stride, 0x7fffffff);
-    sad_array[7] = (unsigned short)vp8_sad16x16_c(src_ptr, src_stride, ref_ptr + 7, ref_stride, 0x7fffffff);
+    sad_array[0] = (unsigned short)vp8_sad16x16_c(src_ptr, src_stride, ref_ptr + 0, ref_stride, UINT_MAX);
+    sad_array[1] = (unsigned short)vp8_sad16x16_c(src_ptr, src_stride, ref_ptr + 1, ref_stride, UINT_MAX);
+    sad_array[2] = (unsigned short)vp8_sad16x16_c(src_ptr, src_stride, ref_ptr + 2, ref_stride, UINT_MAX);
+    sad_array[3] = (unsigned short)vp8_sad16x16_c(src_ptr, src_stride, ref_ptr + 3, ref_stride, UINT_MAX);
+    sad_array[4] = (unsigned short)vp8_sad16x16_c(src_ptr, src_stride, ref_ptr + 4, ref_stride, UINT_MAX);
+    sad_array[5] = (unsigned short)vp8_sad16x16_c(src_ptr, src_stride, ref_ptr + 5, ref_stride, UINT_MAX);
+    sad_array[6] = (unsigned short)vp8_sad16x16_c(src_ptr, src_stride, ref_ptr + 6, ref_stride, UINT_MAX);
+    sad_array[7] = (unsigned short)vp8_sad16x16_c(src_ptr, src_stride, ref_ptr + 7, ref_stride, UINT_MAX);
 }
 
-void vp8_sad16x8x3_c(
-    const unsigned char *src_ptr,
-    int  src_stride,
-    const unsigned char *ref_ptr,
-    int  ref_stride,
-    unsigned int *sad_array
-)
+void vp8_sad16x8x3_c(const unsigned char *src_ptr, int src_stride,
+                     const unsigned char *ref_ptr, int ref_stride,
+                     unsigned int *sad_array)
 {
-    sad_array[0] = vp8_sad16x8_c(src_ptr, src_stride, ref_ptr  , ref_stride, 0x7fffffff);
-    sad_array[1] = vp8_sad16x8_c(src_ptr, src_stride, ref_ptr + 1, ref_stride, 0x7fffffff);
-    sad_array[2] = vp8_sad16x8_c(src_ptr, src_stride, ref_ptr + 2, ref_stride, 0x7fffffff);
+    sad_array[0] = vp8_sad16x8_c(src_ptr, src_stride, ref_ptr + 0, ref_stride, UINT_MAX);
+    sad_array[1] = vp8_sad16x8_c(src_ptr, src_stride, ref_ptr + 1, ref_stride, UINT_MAX);
+    sad_array[2] = vp8_sad16x8_c(src_ptr, src_stride, ref_ptr + 2, ref_stride, UINT_MAX);
 }
 
-void vp8_sad16x8x8_c(
-    const unsigned char *src_ptr,
-    int  src_stride,
-    const unsigned char *ref_ptr,
-    int  ref_stride,
-    unsigned short *sad_array
-)
+void vp8_sad16x8x8_c(const unsigned char *src_ptr, int src_stride,
+                     const unsigned char *ref_ptr, int ref_stride,
+                     unsigned short *sad_array)
 {
-    sad_array[0] = (unsigned short)vp8_sad16x8_c(src_ptr, src_stride, ref_ptr  , ref_stride, 0x7fffffff);
-    sad_array[1] = (unsigned short)vp8_sad16x8_c(src_ptr, src_stride, ref_ptr + 1, ref_stride, 0x7fffffff);
-    sad_array[2] = (unsigned short)vp8_sad16x8_c(src_ptr, src_stride, ref_ptr + 2, ref_stride, 0x7fffffff);
-    sad_array[3] = (unsigned short)vp8_sad16x8_c(src_ptr, src_stride, ref_ptr + 3 , ref_stride, 0x7fffffff);
-    sad_array[4] = (unsigned short)vp8_sad16x8_c(src_ptr, src_stride, ref_ptr + 4, ref_stride, 0x7fffffff);
-    sad_array[5] = (unsigned short)vp8_sad16x8_c(src_ptr, src_stride, ref_ptr + 5, ref_stride, 0x7fffffff);
-    sad_array[6] = (unsigned short)vp8_sad16x8_c(src_ptr, src_stride, ref_ptr + 6 , ref_stride, 0x7fffffff);
-    sad_array[7] = (unsigned short)vp8_sad16x8_c(src_ptr, src_stride, ref_ptr + 7, ref_stride, 0x7fffffff);
+    sad_array[0] = (unsigned short)vp8_sad16x8_c(src_ptr, src_stride, ref_ptr + 0, ref_stride, UINT_MAX);
+    sad_array[1] = (unsigned short)vp8_sad16x8_c(src_ptr, src_stride, ref_ptr + 1, ref_stride, UINT_MAX);
+    sad_array[2] = (unsigned short)vp8_sad16x8_c(src_ptr, src_stride, ref_ptr + 2, ref_stride, UINT_MAX);
+    sad_array[3] = (unsigned short)vp8_sad16x8_c(src_ptr, src_stride, ref_ptr + 3, ref_stride, UINT_MAX);
+    sad_array[4] = (unsigned short)vp8_sad16x8_c(src_ptr, src_stride, ref_ptr + 4, ref_stride, UINT_MAX);
+    sad_array[5] = (unsigned short)vp8_sad16x8_c(src_ptr, src_stride, ref_ptr + 5, ref_stride, UINT_MAX);
+    sad_array[6] = (unsigned short)vp8_sad16x8_c(src_ptr, src_stride, ref_ptr + 6, ref_stride, UINT_MAX);
+    sad_array[7] = (unsigned short)vp8_sad16x8_c(src_ptr, src_stride, ref_ptr + 7, ref_stride, UINT_MAX);
 }
 
-void vp8_sad8x8x3_c(
-    const unsigned char *src_ptr,
-    int  src_stride,
-    const unsigned char *ref_ptr,
-    int  ref_stride,
-    unsigned int *sad_array
-)
+void vp8_sad8x8x3_c(const unsigned char *src_ptr, int src_stride,
+                    const unsigned char *ref_ptr, int ref_stride,
+                    unsigned int *sad_array)
 {
-    sad_array[0] = vp8_sad8x8_c(src_ptr, src_stride, ref_ptr  , ref_stride, 0x7fffffff);
-    sad_array[1] = vp8_sad8x8_c(src_ptr, src_stride, ref_ptr + 1, ref_stride, 0x7fffffff);
-    sad_array[2] = vp8_sad8x8_c(src_ptr, src_stride, ref_ptr + 2, ref_stride, 0x7fffffff);
+    sad_array[0] = vp8_sad8x8_c(src_ptr, src_stride, ref_ptr + 0, ref_stride, UINT_MAX);
+    sad_array[1] = vp8_sad8x8_c(src_ptr, src_stride, ref_ptr + 1, ref_stride, UINT_MAX);
+    sad_array[2] = vp8_sad8x8_c(src_ptr, src_stride, ref_ptr + 2, ref_stride, UINT_MAX);
 }
 
-void vp8_sad8x8x8_c(
-    const unsigned char *src_ptr,
-    int  src_stride,
-    const unsigned char *ref_ptr,
-    int  ref_stride,
-    unsigned short *sad_array
-)
+void vp8_sad8x8x8_c(const unsigned char *src_ptr, int src_stride,
+                    const unsigned char *ref_ptr, int ref_stride,
+                    unsigned short *sad_array)
 {
-    sad_array[0] = (unsigned short)vp8_sad8x8_c(src_ptr, src_stride, ref_ptr  , ref_stride, 0x7fffffff);
-    sad_array[1] = (unsigned short)vp8_sad8x8_c(src_ptr, src_stride, ref_ptr + 1, ref_stride, 0x7fffffff);
-    sad_array[2] = (unsigned short)vp8_sad8x8_c(src_ptr, src_stride, ref_ptr + 2, ref_stride, 0x7fffffff);
-    sad_array[3] = (unsigned short)vp8_sad8x8_c(src_ptr, src_stride, ref_ptr + 3 , ref_stride, 0x7fffffff);
-    sad_array[4] = (unsigned short)vp8_sad8x8_c(src_ptr, src_stride, ref_ptr + 4, ref_stride, 0x7fffffff);
-    sad_array[5] = (unsigned short)vp8_sad8x8_c(src_ptr, src_stride, ref_ptr + 5, ref_stride, 0x7fffffff);
-    sad_array[6] = (unsigned short)vp8_sad8x8_c(src_ptr, src_stride, ref_ptr + 6 , ref_stride, 0x7fffffff);
-    sad_array[7] = (unsigned short)vp8_sad8x8_c(src_ptr, src_stride, ref_ptr + 7, ref_stride, 0x7fffffff);
+    sad_array[0] = (unsigned short)vp8_sad8x8_c(src_ptr, src_stride, ref_ptr + 0, ref_stride, UINT_MAX);
+    sad_array[1] = (unsigned short)vp8_sad8x8_c(src_ptr, src_stride, ref_ptr + 1, ref_stride, UINT_MAX);
+    sad_array[2] = (unsigned short)vp8_sad8x8_c(src_ptr, src_stride, ref_ptr + 2, ref_stride, UINT_MAX);
+    sad_array[3] = (unsigned short)vp8_sad8x8_c(src_ptr, src_stride, ref_ptr + 3, ref_stride, UINT_MAX);
+    sad_array[4] = (unsigned short)vp8_sad8x8_c(src_ptr, src_stride, ref_ptr + 4, ref_stride, UINT_MAX);
+    sad_array[5] = (unsigned short)vp8_sad8x8_c(src_ptr, src_stride, ref_ptr + 5, ref_stride, UINT_MAX);
+    sad_array[6] = (unsigned short)vp8_sad8x8_c(src_ptr, src_stride, ref_ptr + 6, ref_stride, UINT_MAX);
+    sad_array[7] = (unsigned short)vp8_sad8x8_c(src_ptr, src_stride, ref_ptr + 7, ref_stride, UINT_MAX);
 }
 
-void vp8_sad8x16x3_c(
-    const unsigned char *src_ptr,
-    int  src_stride,
-    const unsigned char *ref_ptr,
-    int  ref_stride,
-    unsigned int *sad_array
-)
+void vp8_sad8x16x3_c(const unsigned char *src_ptr, int src_stride,
+                     const unsigned char *ref_ptr, int ref_stride,
+                     unsigned int *sad_array)
 {
-    sad_array[0] = vp8_sad8x16_c(src_ptr, src_stride, ref_ptr  , ref_stride, 0x7fffffff);
-    sad_array[1] = vp8_sad8x16_c(src_ptr, src_stride, ref_ptr + 1, ref_stride, 0x7fffffff);
-    sad_array[2] = vp8_sad8x16_c(src_ptr, src_stride, ref_ptr + 2, ref_stride, 0x7fffffff);
+    sad_array[0] = vp8_sad8x16_c(src_ptr, src_stride, ref_ptr + 0, ref_stride, UINT_MAX);
+    sad_array[1] = vp8_sad8x16_c(src_ptr, src_stride, ref_ptr + 1, ref_stride, UINT_MAX);
+    sad_array[2] = vp8_sad8x16_c(src_ptr, src_stride, ref_ptr + 2, ref_stride, UINT_MAX);
 }
 
-void vp8_sad8x16x8_c(
-    const unsigned char *src_ptr,
-    int  src_stride,
-    const unsigned char *ref_ptr,
-    int  ref_stride,
-    unsigned short *sad_array
-)
+void vp8_sad8x16x8_c(const unsigned char *src_ptr, int src_stride,
+                     const unsigned char *ref_ptr, int ref_stride,
+                     unsigned short *sad_array)
 {
-    sad_array[0] = (unsigned short)vp8_sad8x16_c(src_ptr, src_stride, ref_ptr  , ref_stride, 0x7fffffff);
-    sad_array[1] = (unsigned short)vp8_sad8x16_c(src_ptr, src_stride, ref_ptr + 1, ref_stride, 0x7fffffff);
-    sad_array[2] = (unsigned short)vp8_sad8x16_c(src_ptr, src_stride, ref_ptr + 2, ref_stride, 0x7fffffff);
-    sad_array[3] = (unsigned short)vp8_sad8x16_c(src_ptr, src_stride, ref_ptr + 3 , ref_stride, 0x7fffffff);
-    sad_array[4] = (unsigned short)vp8_sad8x16_c(src_ptr, src_stride, ref_ptr + 4, ref_stride, 0x7fffffff);
-    sad_array[5] = (unsigned short)vp8_sad8x16_c(src_ptr, src_stride, ref_ptr + 5, ref_stride, 0x7fffffff);
-    sad_array[6] = (unsigned short)vp8_sad8x16_c(src_ptr, src_stride, ref_ptr + 6 , ref_stride, 0x7fffffff);
-    sad_array[7] = (unsigned short)vp8_sad8x16_c(src_ptr, src_stride, ref_ptr + 7, ref_stride, 0x7fffffff);
+    sad_array[0] = (unsigned short)vp8_sad8x16_c(src_ptr, src_stride, ref_ptr + 0, ref_stride, UINT_MAX);
+    sad_array[1] = (unsigned short)vp8_sad8x16_c(src_ptr, src_stride, ref_ptr + 1, ref_stride, UINT_MAX);
+    sad_array[2] = (unsigned short)vp8_sad8x16_c(src_ptr, src_stride, ref_ptr + 2, ref_stride, UINT_MAX);
+    sad_array[3] = (unsigned short)vp8_sad8x16_c(src_ptr, src_stride, ref_ptr + 3, ref_stride, UINT_MAX);
+    sad_array[4] = (unsigned short)vp8_sad8x16_c(src_ptr, src_stride, ref_ptr + 4, ref_stride, UINT_MAX);
+    sad_array[5] = (unsigned short)vp8_sad8x16_c(src_ptr, src_stride, ref_ptr + 5, ref_stride, UINT_MAX);
+    sad_array[6] = (unsigned short)vp8_sad8x16_c(src_ptr, src_stride, ref_ptr + 6, ref_stride, UINT_MAX);
+    sad_array[7] = (unsigned short)vp8_sad8x16_c(src_ptr, src_stride, ref_ptr + 7, ref_stride, UINT_MAX);
 }
 
-void vp8_sad4x4x3_c(
-    const unsigned char *src_ptr,
-    int  src_stride,
-    const unsigned char *ref_ptr,
-    int  ref_stride,
-    unsigned int *sad_array
-)
+void vp8_sad4x4x3_c(const unsigned char *src_ptr, int src_stride,
+                    const unsigned char *ref_ptr, int ref_stride,
+                    unsigned int *sad_array)
 {
-    sad_array[0] = vp8_sad4x4_c(src_ptr, src_stride, ref_ptr  , ref_stride, 0x7fffffff);
-    sad_array[1] = vp8_sad4x4_c(src_ptr, src_stride, ref_ptr + 1, ref_stride, 0x7fffffff);
-    sad_array[2] = vp8_sad4x4_c(src_ptr, src_stride, ref_ptr + 2, ref_stride, 0x7fffffff);
+    sad_array[0] = vp8_sad4x4_c(src_ptr, src_stride, ref_ptr + 0, ref_stride, UINT_MAX);
+    sad_array[1] = vp8_sad4x4_c(src_ptr, src_stride, ref_ptr + 1, ref_stride, UINT_MAX);
+    sad_array[2] = vp8_sad4x4_c(src_ptr, src_stride, ref_ptr + 2, ref_stride, UINT_MAX);
 }
 
-void vp8_sad4x4x8_c(
-    const unsigned char *src_ptr,
-    int  src_stride,
-    const unsigned char *ref_ptr,
-    int  ref_stride,
-    unsigned short *sad_array
-)
+void vp8_sad4x4x8_c(const unsigned char *src_ptr, int src_stride,
+                    const unsigned char *ref_ptr, int ref_stride,
+                    unsigned short *sad_array)
 {
-    sad_array[0] = (unsigned short)vp8_sad4x4_c(src_ptr, src_stride, ref_ptr  , ref_stride, 0x7fffffff);
-    sad_array[1] = (unsigned short)vp8_sad4x4_c(src_ptr, src_stride, ref_ptr + 1, ref_stride, 0x7fffffff);
-    sad_array[2] = (unsigned short)vp8_sad4x4_c(src_ptr, src_stride, ref_ptr + 2, ref_stride, 0x7fffffff);
-    sad_array[3] = (unsigned short)vp8_sad4x4_c(src_ptr, src_stride, ref_ptr + 3 , ref_stride, 0x7fffffff);
-    sad_array[4] = (unsigned short)vp8_sad4x4_c(src_ptr, src_stride, ref_ptr + 4, ref_stride, 0x7fffffff);
-    sad_array[5] = (unsigned short)vp8_sad4x4_c(src_ptr, src_stride, ref_ptr + 5, ref_stride, 0x7fffffff);
-    sad_array[6] = (unsigned short)vp8_sad4x4_c(src_ptr, src_stride, ref_ptr + 6 , ref_stride, 0x7fffffff);
-    sad_array[7] = (unsigned short)vp8_sad4x4_c(src_ptr, src_stride, ref_ptr + 7, ref_stride, 0x7fffffff);
+    sad_array[0] = (unsigned short)vp8_sad4x4_c(src_ptr, src_stride, ref_ptr + 0, ref_stride, UINT_MAX);
+    sad_array[1] = (unsigned short)vp8_sad4x4_c(src_ptr, src_stride, ref_ptr + 1, ref_stride, UINT_MAX);
+    sad_array[2] = (unsigned short)vp8_sad4x4_c(src_ptr, src_stride, ref_ptr + 2, ref_stride, UINT_MAX);
+    sad_array[3] = (unsigned short)vp8_sad4x4_c(src_ptr, src_stride, ref_ptr + 3, ref_stride, UINT_MAX);
+    sad_array[4] = (unsigned short)vp8_sad4x4_c(src_ptr, src_stride, ref_ptr + 4, ref_stride, UINT_MAX);
+    sad_array[5] = (unsigned short)vp8_sad4x4_c(src_ptr, src_stride, ref_ptr + 5, ref_stride, UINT_MAX);
+    sad_array[6] = (unsigned short)vp8_sad4x4_c(src_ptr, src_stride, ref_ptr + 6, ref_stride, UINT_MAX);
+    sad_array[7] = (unsigned short)vp8_sad4x4_c(src_ptr, src_stride, ref_ptr + 7, ref_stride, UINT_MAX);
 }
 
-void vp8_sad16x16x4d_c(
-    const unsigned char *src_ptr,
-    int  src_stride,
-    unsigned char *ref_ptr[],
-    int  ref_stride,
-    unsigned int *sad_array
-)
+void vp8_sad16x16x4d_c(const unsigned char *src_ptr, int src_stride,
+                       const unsigned char * const ref_ptr[], int ref_stride,
+                       unsigned int *sad_array)
 {
-    sad_array[0] = vp8_sad16x16_c(src_ptr, src_stride, ref_ptr[0], ref_stride, 0x7fffffff);
-    sad_array[1] = vp8_sad16x16_c(src_ptr, src_stride, ref_ptr[1], ref_stride, 0x7fffffff);
-    sad_array[2] = vp8_sad16x16_c(src_ptr, src_stride, ref_ptr[2], ref_stride, 0x7fffffff);
-    sad_array[3] = vp8_sad16x16_c(src_ptr, src_stride, ref_ptr[3], ref_stride, 0x7fffffff);
+    sad_array[0] = vp8_sad16x16_c(src_ptr, src_stride, ref_ptr[0], ref_stride, UINT_MAX);
+    sad_array[1] = vp8_sad16x16_c(src_ptr, src_stride, ref_ptr[1], ref_stride, UINT_MAX);
+    sad_array[2] = vp8_sad16x16_c(src_ptr, src_stride, ref_ptr[2], ref_stride, UINT_MAX);
+    sad_array[3] = vp8_sad16x16_c(src_ptr, src_stride, ref_ptr[3], ref_stride, UINT_MAX);
 }
 
-void vp8_sad16x8x4d_c(
-    const unsigned char *src_ptr,
-    int  src_stride,
-    unsigned char *ref_ptr[],
-    int  ref_stride,
-    unsigned int *sad_array
-)
+void vp8_sad16x8x4d_c(const unsigned char *src_ptr, int src_stride,
+                      const unsigned char * const ref_ptr[], int ref_stride,
+                      unsigned int *sad_array)
 {
-    sad_array[0] = vp8_sad16x8_c(src_ptr, src_stride, ref_ptr[0], ref_stride, 0x7fffffff);
-    sad_array[1] = vp8_sad16x8_c(src_ptr, src_stride, ref_ptr[1], ref_stride, 0x7fffffff);
-    sad_array[2] = vp8_sad16x8_c(src_ptr, src_stride, ref_ptr[2], ref_stride, 0x7fffffff);
-    sad_array[3] = vp8_sad16x8_c(src_ptr, src_stride, ref_ptr[3], ref_stride, 0x7fffffff);
+    sad_array[0] = vp8_sad16x8_c(src_ptr, src_stride, ref_ptr[0], ref_stride, UINT_MAX);
+    sad_array[1] = vp8_sad16x8_c(src_ptr, src_stride, ref_ptr[1], ref_stride, UINT_MAX);
+    sad_array[2] = vp8_sad16x8_c(src_ptr, src_stride, ref_ptr[2], ref_stride, UINT_MAX);
+    sad_array[3] = vp8_sad16x8_c(src_ptr, src_stride, ref_ptr[3], ref_stride, UINT_MAX);
 }
 
-void vp8_sad8x8x4d_c(
-    const unsigned char *src_ptr,
-    int  src_stride,
-    unsigned char *ref_ptr[],
-    int  ref_stride,
-    unsigned int *sad_array
-)
+void vp8_sad8x8x4d_c(const unsigned char *src_ptr, int src_stride,
+                     const unsigned char * const ref_ptr[], int ref_stride,
+                     unsigned int *sad_array)
 {
-    sad_array[0] = vp8_sad8x8_c(src_ptr, src_stride, ref_ptr[0], ref_stride, 0x7fffffff);
-    sad_array[1] = vp8_sad8x8_c(src_ptr, src_stride, ref_ptr[1], ref_stride, 0x7fffffff);
-    sad_array[2] = vp8_sad8x8_c(src_ptr, src_stride, ref_ptr[2], ref_stride, 0x7fffffff);
-    sad_array[3] = vp8_sad8x8_c(src_ptr, src_stride, ref_ptr[3], ref_stride, 0x7fffffff);
+    sad_array[0] = vp8_sad8x8_c(src_ptr, src_stride, ref_ptr[0], ref_stride, UINT_MAX);
+    sad_array[1] = vp8_sad8x8_c(src_ptr, src_stride, ref_ptr[1], ref_stride, UINT_MAX);
+    sad_array[2] = vp8_sad8x8_c(src_ptr, src_stride, ref_ptr[2], ref_stride, UINT_MAX);
+    sad_array[3] = vp8_sad8x8_c(src_ptr, src_stride, ref_ptr[3], ref_stride, UINT_MAX);
 }
 
-void vp8_sad8x16x4d_c(
-    const unsigned char *src_ptr,
-    int  src_stride,
-    unsigned char *ref_ptr[],
-    int  ref_stride,
-    unsigned int *sad_array
-)
+void vp8_sad8x16x4d_c(const unsigned char *src_ptr, int src_stride,
+                      const unsigned char * const ref_ptr[], int ref_stride,
+                      unsigned int *sad_array)
 {
-    sad_array[0] = vp8_sad8x16_c(src_ptr, src_stride, ref_ptr[0], ref_stride, 0x7fffffff);
-    sad_array[1] = vp8_sad8x16_c(src_ptr, src_stride, ref_ptr[1], ref_stride, 0x7fffffff);
-    sad_array[2] = vp8_sad8x16_c(src_ptr, src_stride, ref_ptr[2], ref_stride, 0x7fffffff);
-    sad_array[3] = vp8_sad8x16_c(src_ptr, src_stride, ref_ptr[3], ref_stride, 0x7fffffff);
+    sad_array[0] = vp8_sad8x16_c(src_ptr, src_stride, ref_ptr[0], ref_stride, UINT_MAX);
+    sad_array[1] = vp8_sad8x16_c(src_ptr, src_stride, ref_ptr[1], ref_stride, UINT_MAX);
+    sad_array[2] = vp8_sad8x16_c(src_ptr, src_stride, ref_ptr[2], ref_stride, UINT_MAX);
+    sad_array[3] = vp8_sad8x16_c(src_ptr, src_stride, ref_ptr[3], ref_stride, UINT_MAX);
 }
 
-void vp8_sad4x4x4d_c(
-    const unsigned char *src_ptr,
-    int  src_stride,
-    unsigned char *ref_ptr[],
-    int  ref_stride,
-    unsigned int *sad_array
-)
+void vp8_sad4x4x4d_c(const unsigned char *src_ptr, int src_stride,
+                     const unsigned char * const ref_ptr[], int ref_stride,
+                     unsigned int *sad_array)
 {
-    sad_array[0] = vp8_sad4x4_c(src_ptr, src_stride, ref_ptr[0], ref_stride, 0x7fffffff);
-    sad_array[1] = vp8_sad4x4_c(src_ptr, src_stride, ref_ptr[1], ref_stride, 0x7fffffff);
-    sad_array[2] = vp8_sad4x4_c(src_ptr, src_stride, ref_ptr[2], ref_stride, 0x7fffffff);
-    sad_array[3] = vp8_sad4x4_c(src_ptr, src_stride, ref_ptr[3], ref_stride, 0x7fffffff);
+    sad_array[0] = vp8_sad4x4_c(src_ptr, src_stride, ref_ptr[0], ref_stride, UINT_MAX);
+    sad_array[1] = vp8_sad4x4_c(src_ptr, src_stride, ref_ptr[1], ref_stride, UINT_MAX);
+    sad_array[2] = vp8_sad4x4_c(src_ptr, src_stride, ref_ptr[2], ref_stride, UINT_MAX);
+    sad_array[3] = vp8_sad4x4_c(src_ptr, src_stride, ref_ptr[3], ref_stride, UINT_MAX);
 }
 
 /* Copy 2 macroblocks to a buffer */
-void vp8_copy32xn_c(
-    unsigned char *src_ptr,
-    int  src_stride,
-    unsigned char *dst_ptr,
-    int  dst_stride,
-    int height)
+void vp8_copy32xn_c(unsigned char *src_ptr, int src_stride,
+                    unsigned char *dst_ptr, int dst_stride,
+                    int height)
 {
     int r;
 
diff --git a/vp8/common/variance.h b/vp8/common/variance.h
index b77aa28..4bb34cc 100644
--- a/vp8/common/variance.h
+++ b/vp8/common/variance.h
@@ -12,14 +12,12 @@
 #ifndef VARIANCE_H
 #define VARIANCE_H
 
-typedef unsigned int(*vp8_sad_fn_t)
-    (
+typedef unsigned int(*vp8_sad_fn_t)(
     const unsigned char *src_ptr,
     int source_stride,
     const unsigned char *ref_ptr,
     int ref_stride,
-    int max_sad
-    );
+    unsigned int max_sad);
 
 typedef void (*vp8_copy32xn_fn_t)(
     const unsigned char *src_ptr,
@@ -48,7 +46,7 @@
     (
      const unsigned char *src_ptr,
      int source_stride,
-     unsigned char *ref_ptr[4],
+     const unsigned char * const ref_ptr[],
      int  ref_stride,
      unsigned int *sad_array
     );
diff --git a/vp8/common/variance_c.c b/vp8/common/variance_c.c
index 996404d..da08aff 100644
--- a/vp8/common/variance_c.c
+++ b/vp8/common/variance_c.c
@@ -205,14 +205,14 @@
     {
         for (j = 0; j < output_width; j++)
         {
-            // Apply bilinear filter
+            /* Apply bilinear filter */
             output_ptr[j] = (((int)src_ptr[0]          * vp8_filter[0]) +
                              ((int)src_ptr[pixel_step] * vp8_filter[1]) +
                              (VP8_FILTER_WEIGHT / 2)) >> VP8_FILTER_SHIFT;
             src_ptr++;
         }
 
-        // Next row...
+        /* Next row... */
         src_ptr    += src_pixels_per_line - output_width;
         output_ptr += output_width;
     }
@@ -264,15 +264,15 @@
     {
         for (j = 0; j < output_width; j++)
         {
-            // Apply filter
-            Temp = ((int)src_ptr[0]         * vp8_filter[0]) +
+            /* Apply filter */
+            Temp = ((int)src_ptr[0]          * vp8_filter[0]) +
                    ((int)src_ptr[pixel_step] * vp8_filter[1]) +
                    (VP8_FILTER_WEIGHT / 2);
             output_ptr[j] = (unsigned int)(Temp >> VP8_FILTER_SHIFT);
             src_ptr++;
         }
 
-        // Next row...
+        /* Next row... */
         src_ptr    += src_pixels_per_line - output_width;
         output_ptr += output_width;
     }
@@ -292,15 +292,15 @@
 {
     unsigned char  temp2[20*16];
     const short *HFilter, *VFilter;
-    unsigned short FData3[5*4]; // Temp data bufffer used in filtering
+    unsigned short FData3[5*4]; /* Temp data buffer used in filtering */
 
     HFilter = vp8_bilinear_filters[xoffset];
     VFilter = vp8_bilinear_filters[yoffset];
 
-    // First filter 1d Horizontal
+    /* First filter 1d Horizontal */
     var_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line, 1, 5, 4, HFilter);
 
-    // Now filter Verticaly
+    /* Now filter Vertically */
     var_filter_block2d_bil_second_pass(FData3, temp2, 4,  4,  4,  4, VFilter);
 
     return vp8_variance4x4_c(temp2, 4, dst_ptr, dst_pixels_per_line, sse);
@@ -318,7 +318,7 @@
     unsigned int *sse
 )
 {
-    unsigned short FData3[9*8]; // Temp data bufffer used in filtering
+    unsigned short FData3[9*8]; /* Temp data buffer used in filtering */
     unsigned char  temp2[20*16];
     const short *HFilter, *VFilter;
 
@@ -342,7 +342,7 @@
     unsigned int *sse
 )
 {
-    unsigned short FData3[17*16];   // Temp data bufffer used in filtering
+    unsigned short FData3[17*16];   /* Temp data buffer used in filtering */
     unsigned char  temp2[20*16];
     const short *HFilter, *VFilter;
 
@@ -418,7 +418,7 @@
     unsigned int *sse
 )
 {
-    unsigned short FData3[16*9];    // Temp data bufffer used in filtering
+    unsigned short FData3[16*9];    /* Temp data buffer used in filtering */
     unsigned char  temp2[20*16];
     const short *HFilter, *VFilter;
 
@@ -442,7 +442,7 @@
     unsigned int *sse
 )
 {
-    unsigned short FData3[9*16];    // Temp data bufffer used in filtering
+    unsigned short FData3[9*16];    /* Temp data buffer used in filtering */
     unsigned char  temp2[20*16];
     const short *HFilter, *VFilter;
 
diff --git a/vp8/common/vp8_entropymodedata.h b/vp8/common/vp8_entropymodedata.h
old mode 100755
new mode 100644
diff --git a/vp8/common/x86/postproc_x86.c b/vp8/common/x86/postproc_x86.c
index a25921b..3ec0106 100644
--- a/vp8/common/x86/postproc_x86.c
+++ b/vp8/common/x86/postproc_x86.c
@@ -18,4 +18,7 @@
 {
   return __rand();
 }
+#else
+/* ISO C forbids an empty translation unit. */
+int vp8_unused;
 #endif
diff --git a/vp8/common/x86/sad_sse2.asm b/vp8/common/x86/sad_sse2.asm
index 290e676..8d86abc 100644
--- a/vp8/common/x86/sad_sse2.asm
+++ b/vp8/common/x86/sad_sse2.asm
@@ -115,7 +115,7 @@
 
         movq            rax,        mm7
         cmp             eax,        arg(4)
-        jg              .x8x16sad_wmt_early_exit
+        ja              .x8x16sad_wmt_early_exit
 
         movq            mm0,        QWORD PTR [rsi]
         movq            mm1,        QWORD PTR [rdi]
@@ -176,7 +176,7 @@
 
         movq            rax,        mm7
         cmp             eax,        arg(4)
-        jg              .x8x8sad_wmt_early_exit
+        ja              .x8x8sad_wmt_early_exit
 
         movq            mm0,        QWORD PTR [rsi]
         movq            mm1,        QWORD PTR [rdi]
@@ -285,7 +285,7 @@
 
         movq            rax,        mm7
         cmp             eax,        arg(4)
-        jg              .x16x8sad_wmt_early_exit
+        ja              .x16x8sad_wmt_early_exit
 
         movq            mm0,        QWORD PTR [rsi]
         movq            mm2,        QWORD PTR [rsi+8]
diff --git a/vp8/common/x86/variance_sse2.c b/vp8/common/x86/variance_sse2.c
index 2769a30..afd6429 100644
--- a/vp8/common/x86/variance_sse2.c
+++ b/vp8/common/x86/variance_sse2.c
@@ -332,8 +332,9 @@
     unsigned int xxsum0, xxsum1;
 
 
-    // note we could avoid these if statements if the calling function
-    // just called the appropriate functions inside.
+    /* note we could avoid these if statements if the calling function
+     * just called the appropriate functions inside.
+     */
     if (xoffset == 4 && yoffset == 0)
     {
         vp8_half_horiz_variance16x_h_sse2(
diff --git a/vp8/common/x86/variance_ssse3.c b/vp8/common/x86/variance_ssse3.c
index 1be0d92..ba2055c 100644
--- a/vp8/common/x86/variance_ssse3.c
+++ b/vp8/common/x86/variance_ssse3.c
@@ -79,8 +79,9 @@
     int xsum0;
     unsigned int xxsum0;
 
-    // note we could avoid these if statements if the calling function
-    // just called the appropriate functions inside.
+    /* note we could avoid these if statements if the calling function
+     * just called the appropriate functions inside.
+     */
     if (xoffset == 4 && yoffset == 0)
     {
         vp8_half_horiz_variance16x_h_sse2(
diff --git a/vp8/common/x86/vp8_asm_stubs.c b/vp8/common/x86/vp8_asm_stubs.c
index 23a7fdc..3437a23 100644
--- a/vp8/common/x86/vp8_asm_stubs.c
+++ b/vp8/common/x86/vp8_asm_stubs.c
@@ -438,19 +438,35 @@
     {
         if (yoffset)
         {
-            vp8_filter_block1d16_h6_ssse3(src_ptr - (2 * src_pixels_per_line), src_pixels_per_line, FData2, 16, 21, xoffset);
-            vp8_filter_block1d16_v6_ssse3(FData2 , 16, dst_ptr, dst_pitch, 16, yoffset);
+            vp8_filter_block1d16_h6_ssse3(src_ptr - (2 * src_pixels_per_line),
+                                          src_pixels_per_line, FData2,
+                                          16, 21, xoffset);
+            vp8_filter_block1d16_v6_ssse3(FData2 , 16, dst_ptr, dst_pitch,
+                                          16, yoffset);
         }
         else
         {
             /* First-pass only */
-            vp8_filter_block1d16_h6_ssse3(src_ptr, src_pixels_per_line, dst_ptr, dst_pitch, 16, xoffset);
+            vp8_filter_block1d16_h6_ssse3(src_ptr, src_pixels_per_line,
+                                          dst_ptr, dst_pitch, 16, xoffset);
         }
     }
     else
     {
-        /* Second-pass only */
-        vp8_filter_block1d16_v6_ssse3(src_ptr - (2 * src_pixels_per_line) , src_pixels_per_line, dst_ptr, dst_pitch, 16, yoffset);
+        if (yoffset)
+        {
+            /* Second-pass only */
+            vp8_filter_block1d16_v6_ssse3(src_ptr - (2 * src_pixels_per_line),
+                                          src_pixels_per_line,
+                                          dst_ptr, dst_pitch, 16, yoffset);
+        }
+        else
+        {
+            /* ssse3 second-pass only function couldn't handle (xoffset==0 &&
+             * yoffset==0) case correctly. Add copy function here to guarantee
+             * six-tap function handles all possible offsets. */
+            vp8_copy_mem16x16(src_ptr, src_pixels_per_line, dst_ptr, dst_pitch);
+        }
     }
 }
 
@@ -470,18 +486,34 @@
     {
         if (yoffset)
         {
-            vp8_filter_block1d8_h6_ssse3(src_ptr - (2 * src_pixels_per_line), src_pixels_per_line, FData2, 8, 13, xoffset);
-            vp8_filter_block1d8_v6_ssse3(FData2, 8, dst_ptr, dst_pitch, 8, yoffset);
+            vp8_filter_block1d8_h6_ssse3(src_ptr - (2 * src_pixels_per_line),
+                                         src_pixels_per_line, FData2,
+                                         8, 13, xoffset);
+            vp8_filter_block1d8_v6_ssse3(FData2, 8, dst_ptr, dst_pitch,
+                                         8, yoffset);
         }
         else
         {
-            vp8_filter_block1d8_h6_ssse3(src_ptr, src_pixels_per_line, dst_ptr, dst_pitch, 8, xoffset);
+            vp8_filter_block1d8_h6_ssse3(src_ptr, src_pixels_per_line,
+                                         dst_ptr, dst_pitch, 8, xoffset);
         }
     }
     else
     {
-        /* Second-pass only */
-        vp8_filter_block1d8_v6_ssse3(src_ptr - (2 * src_pixels_per_line), src_pixels_per_line, dst_ptr, dst_pitch, 8, yoffset);
+        if (yoffset)
+        {
+            /* Second-pass only */
+            vp8_filter_block1d8_v6_ssse3(src_ptr - (2 * src_pixels_per_line),
+                                         src_pixels_per_line,
+                                         dst_ptr, dst_pitch, 8, yoffset);
+        }
+        else
+        {
+            /* ssse3 second-pass only function couldn't handle (xoffset==0 &&
+             * yoffset==0) case correctly. Add copy function here to guarantee
+             * six-tap function handles all possible offsets. */
+            vp8_copy_mem8x8(src_ptr, src_pixels_per_line, dst_ptr, dst_pitch);
+        }
     }
 }
 
@@ -502,19 +534,35 @@
     {
         if (yoffset)
         {
-            vp8_filter_block1d8_h6_ssse3(src_ptr - (2 * src_pixels_per_line), src_pixels_per_line, FData2, 8, 9, xoffset);
-            vp8_filter_block1d8_v6_ssse3(FData2, 8, dst_ptr, dst_pitch, 4, yoffset);
+            vp8_filter_block1d8_h6_ssse3(src_ptr - (2 * src_pixels_per_line),
+                                         src_pixels_per_line, FData2,
+                                         8, 9, xoffset);
+            vp8_filter_block1d8_v6_ssse3(FData2, 8, dst_ptr, dst_pitch,
+                                         4, yoffset);
         }
         else
         {
             /* First-pass only */
-            vp8_filter_block1d8_h6_ssse3(src_ptr, src_pixels_per_line, dst_ptr, dst_pitch, 4, xoffset);
+            vp8_filter_block1d8_h6_ssse3(src_ptr, src_pixels_per_line,
+                                         dst_ptr, dst_pitch, 4, xoffset);
         }
     }
     else
     {
-        /* Second-pass only */
-        vp8_filter_block1d8_v6_ssse3(src_ptr - (2 * src_pixels_per_line), src_pixels_per_line, dst_ptr, dst_pitch, 4, yoffset);
+        if (yoffset)
+        {
+            /* Second-pass only */
+            vp8_filter_block1d8_v6_ssse3(src_ptr - (2 * src_pixels_per_line),
+                                         src_pixels_per_line,
+                                         dst_ptr, dst_pitch, 4, yoffset);
+        }
+        else
+        {
+            /* ssse3 second-pass only function couldn't handle (xoffset==0 &&
+             * yoffset==0) case correctly. Add copy function here to guarantee
+             * six-tap function handles all possible offsets. */
+            vp8_copy_mem8x4(src_ptr, src_pixels_per_line, dst_ptr, dst_pitch);
+        }
     }
 }
 
@@ -534,19 +582,48 @@
   {
       if (yoffset)
       {
-          vp8_filter_block1d4_h6_ssse3(src_ptr - (2 * src_pixels_per_line), src_pixels_per_line, FData2, 4, 9, xoffset);
-          vp8_filter_block1d4_v6_ssse3(FData2, 4, dst_ptr, dst_pitch, 4, yoffset);
+          vp8_filter_block1d4_h6_ssse3(src_ptr - (2 * src_pixels_per_line),
+                                       src_pixels_per_line,
+                                       FData2, 4, 9, xoffset);
+          vp8_filter_block1d4_v6_ssse3(FData2, 4, dst_ptr, dst_pitch,
+                                       4, yoffset);
       }
       else
       {
-          vp8_filter_block1d4_h6_ssse3(src_ptr, src_pixels_per_line, dst_ptr, dst_pitch, 4, xoffset);
+          vp8_filter_block1d4_h6_ssse3(src_ptr, src_pixels_per_line,
+                                       dst_ptr, dst_pitch, 4, xoffset);
       }
   }
   else
   {
-      vp8_filter_block1d4_v6_ssse3(src_ptr - (2 * src_pixels_per_line), src_pixels_per_line, dst_ptr, dst_pitch, 4, yoffset);
-  }
+      if (yoffset)
+      {
+          vp8_filter_block1d4_v6_ssse3(src_ptr - (2 * src_pixels_per_line),
+                                       src_pixels_per_line,
+                                       dst_ptr, dst_pitch, 4, yoffset);
+      }
+      else
+      {
+        /* ssse3 second-pass only function couldn't handle (xoffset==0 &&
+         * yoffset==0) case correctly. Add copy function here to guarantee
+         * six-tap function handles all possible offsets. */
+          int r;
 
+          for (r = 0; r < 4; r++)
+          {
+  #if !(CONFIG_FAST_UNALIGNED)
+            dst_ptr[0]  = src_ptr[0];
+            dst_ptr[1]  = src_ptr[1];
+            dst_ptr[2]  = src_ptr[2];
+            dst_ptr[3]  = src_ptr[3];
+  #else
+              *(uint32_t *)dst_ptr = *(uint32_t *)src_ptr ;
+  #endif
+              dst_ptr     += dst_pitch;
+              src_ptr     += src_pixels_per_line;
+          }
+      }
+  }
 }
 
 #endif
diff --git a/vp8/decoder/dboolhuff.h b/vp8/decoder/dboolhuff.h
index 880c185..1a08c05 100644
--- a/vp8/decoder/dboolhuff.h
+++ b/vp8/decoder/dboolhuff.h
@@ -55,7 +55,7 @@
         int loop_end, x; \
         size_t bits_left = ((_bufend)-(_bufptr))*CHAR_BIT; \
         \
-        x = shift + CHAR_BIT - bits_left; \
+        x = (int)(shift + CHAR_BIT - bits_left); \
         loop_end = 0; \
         if(x >= 0) \
         { \
diff --git a/vp8/decoder/decodframe.c b/vp8/decoder/decodframe.c
index 48824ce..20bffdb 100644
--- a/vp8/decoder/decodframe.c
+++ b/vp8/decoder/decodframe.c
@@ -177,7 +177,6 @@
         {
             short *DQC = xd->dequant_y1;
             int dst_stride = xd->dst.y_stride;
-            unsigned char *base_dst = xd->dst.y_buffer;
 
             /* clear out residual eob info */
             if(xd->mode_info_context->mbmi.mb_skip_coeff)
@@ -188,38 +187,29 @@
             for (i = 0; i < 16; i++)
             {
                 BLOCKD *b = &xd->block[i];
-                int b_mode = xd->mode_info_context->bmi[i].as_mode;
-                unsigned char *yabove;
-                unsigned char *yleft;
-                int left_stride;
-                unsigned char top_left;
+                unsigned char *dst = xd->dst.y_buffer + b->offset;
+                B_PREDICTION_MODE b_mode =
+                    xd->mode_info_context->bmi[i].as_mode;
+                unsigned char *Above = dst - dst_stride;
+                unsigned char *yleft = dst - 1;
+                int left_stride = dst_stride;
+                unsigned char top_left = Above[-1];
 
-                yabove = base_dst + b->offset - dst_stride;
-                yleft = base_dst + b->offset - 1;
-                left_stride = dst_stride;
-                top_left = yabove[-1];
-
-                //                vp8_intra4x4_predict (base_dst + b->offset, dst_stride, b_mode,
-                  //                                    base_dst + b->offset, dst_stride );
-                vp8_intra4x4_predict_d_c(yabove, yleft, left_stride,
-                                       b_mode,
-                                       base_dst + b->offset, dst_stride,
-                                       top_left);
+                vp8_intra4x4_predict(Above, yleft, left_stride, b_mode,
+                                     dst, dst_stride, top_left);
 
                 if (xd->eobs[i])
                 {
                     if (xd->eobs[i] > 1)
                     {
-                    vp8_dequant_idct_add
-                            (b->qcoeff, DQC,
-                                base_dst + b->offset, dst_stride);
+                    vp8_dequant_idct_add(b->qcoeff, DQC, dst, dst_stride);
                     }
                     else
                     {
                         vp8_dc_only_idct_add
                             (b->qcoeff[0] * DQC[0],
-                                base_dst + b->offset, dst_stride,
-                                base_dst + b->offset, dst_stride);
+                                dst, dst_stride,
+                                dst, dst_stride);
                         ((int *)b->qcoeff)[0] = 0;
                     }
                 }
@@ -322,6 +312,8 @@
     VP8_COMMON *const pc = & pbi->common;
     MACROBLOCKD *const xd  = & pbi->mb;
 
+    MODE_INFO *lf_mic = xd->mode_info_context;
+
     int ibc = 0;
     int num_part = 1 << pc->multi_token_partition;
 
@@ -334,6 +326,7 @@
 
     unsigned char *ref_buffer[MAX_REF_FRAMES][3];
     unsigned char *dst_buffer[3];
+    unsigned char *lf_dst[3];
     int i;
     int ref_fb_index[MAX_REF_FRAMES];
     int ref_fb_corrupted[MAX_REF_FRAMES];
@@ -353,12 +346,17 @@
         ref_fb_corrupted[i] = pc->yv12_fb[ref_fb_index[i]].corrupted;
     }
 
-    dst_buffer[0] = pc->yv12_fb[dst_fb_idx].y_buffer;
-    dst_buffer[1] = pc->yv12_fb[dst_fb_idx].u_buffer;
-    dst_buffer[2] = pc->yv12_fb[dst_fb_idx].v_buffer;
+    /* Set up the buffer pointers */
+    lf_dst[0] = dst_buffer[0] = pc->yv12_fb[dst_fb_idx].y_buffer;
+    lf_dst[1] = dst_buffer[1] = pc->yv12_fb[dst_fb_idx].u_buffer;
+    lf_dst[2] = dst_buffer[2] = pc->yv12_fb[dst_fb_idx].v_buffer;
 
     xd->up_available = 0;
 
+    /* Initialize the loop filter for this frame. */
+    if(pc->filter_level)
+        vp8_loop_filter_frame_init(pc, xd, pc->filter_level);
+
     /* Decode the individual macro block */
     for (mb_row = 0; mb_row < pc->mb_rows; mb_row++)
     {
@@ -395,7 +393,7 @@
         xd->recon_above[1] -= xd->dst.uv_stride;
         xd->recon_above[2] -= xd->dst.uv_stride;
 
-        //TODO: move to outside row loop
+        /* TODO: move to outside row loop */
         xd->recon_left_stride[0] = xd->dst.y_stride;
         xd->recon_left_stride[1] = xd->dst.uv_stride;
 
@@ -460,26 +458,55 @@
             xd->recon_left[1] += 8;
             xd->recon_left[2] += 8;
 
-
             recon_yoffset += 16;
             recon_uvoffset += 8;
 
             ++xd->mode_info_context;  /* next mb */
 
             xd->above_context++;
-
         }
 
         /* adjust to the next row of mbs */
-        vp8_extend_mb_row(
-            &pc->yv12_fb[dst_fb_idx],
-            xd->dst.y_buffer + 16, xd->dst.u_buffer + 8, xd->dst.v_buffer + 8
-        );
+        vp8_extend_mb_row(&pc->yv12_fb[dst_fb_idx], xd->dst.y_buffer + 16,
+                          xd->dst.u_buffer + 8, xd->dst.v_buffer + 8);
 
         ++xd->mode_info_context;      /* skip prediction column */
         xd->up_available = 1;
 
+        if(pc->filter_level)
+        {
+            if(mb_row > 0)
+            {
+                if (pc->filter_type == NORMAL_LOOPFILTER)
+                    vp8_loop_filter_row_normal(pc, lf_mic, mb_row-1,
+                                               recon_y_stride, recon_uv_stride,
+                                               lf_dst[0], lf_dst[1], lf_dst[2]);
+                else
+                    vp8_loop_filter_row_simple(pc, lf_mic, mb_row-1,
+                                               recon_y_stride, recon_uv_stride,
+                                               lf_dst[0], lf_dst[1], lf_dst[2]);
+                lf_dst[0] += recon_y_stride  * 16;
+                lf_dst[1] += recon_uv_stride *  8;
+                lf_dst[2] += recon_uv_stride *  8;
+                lf_mic += pc->mb_cols;
+                lf_mic++;         /* Skip border mb */
+            }
+        }
     }
+
+    if(pc->filter_level)
+    {
+        if (pc->filter_type == NORMAL_LOOPFILTER)
+            vp8_loop_filter_row_normal(pc, lf_mic, mb_row-1, recon_y_stride,
+                                       recon_uv_stride, lf_dst[0], lf_dst[1],
+                                       lf_dst[2]);
+        else
+            vp8_loop_filter_row_simple(pc, lf_mic, mb_row-1, recon_y_stride,
+                                       recon_uv_stride, lf_dst[0], lf_dst[1],
+                                       lf_dst[2]);
+    }
+
+    vp8_yv12_extend_frame_borders(&pc->yv12_fb[dst_fb_idx]);
 }
 
 static unsigned int read_partition_size(const unsigned char *cx_size)
@@ -519,13 +546,13 @@
         if (read_is_valid(partition_size_ptr, 3, first_fragment_end))
             partition_size = read_partition_size(partition_size_ptr);
         else if (pbi->ec_active)
-            partition_size = bytes_left;
+            partition_size = (unsigned int)bytes_left;
         else
             vpx_internal_error(&pc->error, VPX_CODEC_CORRUPT_FRAME,
                                "Truncated partition size data");
     }
     else
-        partition_size = bytes_left;
+        partition_size = (unsigned int)bytes_left;
 
     /* Validate the calculated partition length. If the buffer
      * described by the partition can't be fully read, then restrict
@@ -534,7 +561,7 @@
     if (!read_is_valid(fragment_start, partition_size, fragment_end))
     {
         if (pbi->ec_active)
-            partition_size = bytes_left;
+            partition_size = (unsigned int)bytes_left;
         else
             vpx_internal_error(&pc->error, VPX_CODEC_CORRUPT_FRAME,
                                "Truncated packet or corrupt partition "
@@ -580,10 +607,10 @@
             /* Size of first partition + token partition sizes element */
             ptrdiff_t ext_first_part_size = token_part_sizes -
                 pbi->fragments[0] + 3 * (num_token_partitions - 1);
-            fragment_size -= ext_first_part_size;
+            fragment_size -= (unsigned int)ext_first_part_size;
             if (fragment_size > 0)
             {
-                pbi->fragment_sizes[0] = ext_first_part_size;
+                pbi->fragment_sizes[0] = (unsigned int)ext_first_part_size;
                 /* The fragment contains an additional partition. Move to
                  * next. */
                 fragment_idx++;
@@ -602,8 +629,8 @@
                                                  fragment_end,
                                                  fragment_idx - 1,
                                                  num_token_partitions);
-            pbi->fragment_sizes[fragment_idx] = partition_size;
-            fragment_size -= partition_size;
+            pbi->fragment_sizes[fragment_idx] = (unsigned int)partition_size;
+            fragment_size -= (unsigned int)partition_size;
             assert(fragment_idx <= num_token_partitions);
             if (fragment_size > 0)
             {
@@ -851,10 +878,10 @@
 #endif
 
 #if CONFIG_MULTITHREAD
-
                 if (pbi->b_multithreaded_rd)
                     vp8mt_alloc_temp_buffers(pbi, pc->Width, prev_mb_rows);
-
+#else
+                (void)prev_mb_rows;
 #endif
                 frame_size_change = 1;
             }
@@ -869,7 +896,7 @@
 
     init_frame(pbi);
 
-    if (vp8dx_start_decode(bc, data, data_end - data))
+    if (vp8dx_start_decode(bc, data, (unsigned int)(data_end - data)))
         vpx_internal_error(&pc->error, VPX_CODEC_MEM_ERROR,
                            "Failed to allocate bool decoder 0");
     if (pc->frame_type == KEY_FRAME) {
@@ -1190,13 +1217,6 @@
 
     /* vpx_log("Decoder: Frame Decoded, Size Roughly:%d bytes  \n",bc->pos+pbi->bc2.pos); */
 
-    /* If this was a kf or Gf note the Q used */
-    if ((pc->frame_type == KEY_FRAME) ||
-         pc->refresh_golden_frame || pc->refresh_alt_ref_frame)
-    {
-        pc->last_kf_gf_q = pc->base_qindex;
-    }
-
     if (pc->refresh_entropy_probs == 0)
     {
         vpx_memcpy(&pc->fc, &pc->lfc, sizeof(pc->fc));
diff --git a/vp8/decoder/detokenize.c b/vp8/decoder/detokenize.c
index 0c39848..452ff6c 100644
--- a/vp8/decoder/detokenize.c
+++ b/vp8/decoder/detokenize.c
@@ -53,7 +53,8 @@
 #define NUM_PROBAS  11
 #define NUM_CTX  3
 
-typedef const uint8_t (*ProbaArray)[NUM_CTX][NUM_PROBAS];  // for const-casting
+/* for const-casting */
+typedef const uint8_t (*ProbaArray)[NUM_CTX][NUM_PROBAS];
 
 static int GetSigned(BOOL_DECODER *br, int value_to_sign)
 {
diff --git a/vp8/decoder/onyxd_if.c b/vp8/decoder/onyxd_if.c
index 38839f3..205c150 100644
--- a/vp8/decoder/onyxd_if.c
+++ b/vp8/decoder/onyxd_if.c
@@ -125,16 +125,16 @@
 }
 
 
-vpx_codec_err_t vp8dx_get_reference(VP8D_COMP *pbi, VP8_REFFRAME ref_frame_flag, YV12_BUFFER_CONFIG *sd)
+vpx_codec_err_t vp8dx_get_reference(VP8D_COMP *pbi, enum vpx_ref_frame_type ref_frame_flag, YV12_BUFFER_CONFIG *sd)
 {
     VP8_COMMON *cm = &pbi->common;
     int ref_fb_idx;
 
-    if (ref_frame_flag == VP8_LAST_FLAG)
+    if (ref_frame_flag == VP8_LAST_FRAME)
         ref_fb_idx = cm->lst_fb_idx;
-    else if (ref_frame_flag == VP8_GOLD_FLAG)
+    else if (ref_frame_flag == VP8_GOLD_FRAME)
         ref_fb_idx = cm->gld_fb_idx;
-    else if (ref_frame_flag == VP8_ALT_FLAG)
+    else if (ref_frame_flag == VP8_ALTR_FRAME)
         ref_fb_idx = cm->alt_fb_idx;
     else{
         vpx_internal_error(&pbi->common.error, VPX_CODEC_ERROR,
@@ -156,17 +156,17 @@
 }
 
 
-vpx_codec_err_t vp8dx_set_reference(VP8D_COMP *pbi, VP8_REFFRAME ref_frame_flag, YV12_BUFFER_CONFIG *sd)
+vpx_codec_err_t vp8dx_set_reference(VP8D_COMP *pbi, enum vpx_ref_frame_type ref_frame_flag, YV12_BUFFER_CONFIG *sd)
 {
     VP8_COMMON *cm = &pbi->common;
     int *ref_fb_ptr = NULL;
     int free_fb;
 
-    if (ref_frame_flag == VP8_LAST_FLAG)
+    if (ref_frame_flag == VP8_LAST_FRAME)
         ref_fb_ptr = &cm->lst_fb_idx;
-    else if (ref_frame_flag == VP8_GOLD_FLAG)
+    else if (ref_frame_flag == VP8_GOLD_FRAME)
         ref_fb_ptr = &cm->gld_fb_idx;
-    else if (ref_frame_flag == VP8_ALT_FLAG)
+    else if (ref_frame_flag == VP8_ALTR_FRAME)
         ref_fb_ptr = &cm->alt_fb_idx;
     else{
         vpx_internal_error(&pbi->common.error, VPX_CODEC_ERROR,
@@ -467,16 +467,8 @@
             pbi->num_fragments = 0;
             return -1;
         }
-
-        if(cm->filter_level)
-        {
-            /* Apply the loop filter if appropriate. */
-            vp8_loop_filter_frame(cm, &pbi->mb, cm->frame_type);
-        }
-        vp8_yv12_extend_frame_borders(cm->frame_to_show);
     }
 
-
     vp8_clear_system_state();
 
 #if CONFIG_ERROR_CONCEALMENT
diff --git a/vp8/decoder/threading.c b/vp8/decoder/threading.c
index a51ca64..dce1e4c 100644
--- a/vp8/decoder/threading.c
+++ b/vp8/decoder/threading.c
@@ -166,7 +166,6 @@
         {
             short *DQC = xd->dequant_y1;
             int dst_stride = xd->dst.y_stride;
-            unsigned char *base_dst = xd->dst.y_buffer;
 
             /* clear out residual eob info */
             if(xd->mode_info_context->mbmi.mb_skip_coeff)
@@ -177,17 +176,19 @@
             for (i = 0; i < 16; i++)
             {
                 BLOCKD *b = &xd->block[i];
-                int b_mode = xd->mode_info_context->bmi[i].as_mode;
-                unsigned char *yabove;
+                unsigned char *dst = xd->dst.y_buffer + b->offset;
+                B_PREDICTION_MODE b_mode =
+                    xd->mode_info_context->bmi[i].as_mode;
+                unsigned char *Above;
                 unsigned char *yleft;
                 int left_stride;
                 unsigned char top_left;
 
                 /*Caution: For some b_mode, it needs 8 pixels (4 above + 4 above-right).*/
                 if (i < 4 && pbi->common.filter_level)
-                    yabove = xd->recon_above[0] + b->offset; //i*4;
+                    Above = xd->recon_above[0] + b->offset;
                 else
-                    yabove = (base_dst - dst_stride) + b->offset;
+                    Above = dst - dst_stride;
 
                 if (i%4==0 && pbi->common.filter_level)
                 {
@@ -196,34 +197,28 @@
                 }
                 else
                 {
-                    yleft = (base_dst  - 1) + b->offset;
+                    yleft = dst - 1;
                     left_stride = dst_stride;
                 }
 
                 if ((i==4 || i==8 || i==12) && pbi->common.filter_level)
                     top_left = *(xd->recon_left[0] + i - 1);
                 else
-                    top_left = yabove[-1];
+                    top_left = Above[-1];
 
-                vp8_intra4x4_predict_d_c(yabove, yleft, left_stride,
-                                       b_mode,
-                                       base_dst + b->offset, dst_stride,
-                                       top_left);
+                vp8_intra4x4_predict(Above, yleft, left_stride,
+                                     b_mode, dst, dst_stride, top_left);
 
                 if (xd->eobs[i] )
                 {
                     if (xd->eobs[i] > 1)
                     {
-                        vp8_dequant_idct_add
-                            (b->qcoeff, DQC,
-                            base_dst + b->offset, dst_stride);
+                        vp8_dequant_idct_add(b->qcoeff, DQC, dst, dst_stride);
                     }
                     else
                     {
-                        vp8_dc_only_idct_add
-                            (b->qcoeff[0] * DQC[0],
-                            base_dst + b->offset, dst_stride,
-                            base_dst + b->offset, dst_stride);
+                        vp8_dc_only_idct_add(b->qcoeff[0] * DQC[0],
+                                             dst, dst_stride, dst, dst_stride);
                         ((int *)b->qcoeff)[0] = 0;
                     }
                 }
@@ -383,7 +378,7 @@
           xd->recon_left[1] = pbi->mt_uleft_col[mb_row];
           xd->recon_left[2] = pbi->mt_vleft_col[mb_row];
 
-          //TODO: move to outside row loop
+          /* TODO: move to outside row loop */
           xd->recon_left_stride[0] = 1;
           xd->recon_left_stride[1] = 1;
        }
@@ -401,7 +396,7 @@
           xd->recon_above[1] -= xd->dst.uv_stride;
           xd->recon_above[2] -= xd->dst.uv_stride;
 
-          //TODO: move to outside row loop
+          /* TODO: move to outside row loop */
           xd->recon_left_stride[0] = xd->dst.y_stride;
           xd->recon_left_stride[1] = xd->dst.uv_stride;
        }
diff --git a/vp8/encoder/bitstream.c b/vp8/encoder/bitstream.c
index d7cd5a9..2799143 100644
--- a/vp8/encoder/bitstream.c
+++ b/vp8/encoder/bitstream.c
@@ -172,7 +172,7 @@
     while (p < stop)
     {
         const int t = p->Token;
-        const vp8_token *a = vp8_coef_encodings + t;
+        vp8_token *a = vp8_coef_encodings + t;
         const vp8_extra_bit_struct *b = vp8_extra_bits + t;
         int i = 0;
         const unsigned char *pp = p->context_tree;
@@ -397,7 +397,7 @@
         {
             const TOKENEXTRA *p    = cpi->tplist[mb_row].start;
             const TOKENEXTRA *stop = cpi->tplist[mb_row].stop;
-            int tokens = stop - p;
+            int tokens = (int)(stop - p);
 
             vp8_pack_tokens_c(w, p, tokens);
         }
@@ -416,7 +416,7 @@
     {
         const TOKENEXTRA *p    = cpi->tplist[mb_row].start;
         const TOKENEXTRA *stop = cpi->tplist[mb_row].stop;
-        int tokens = stop - p;
+        int tokens = (int)(stop - p);
 
         vp8_pack_tokens_c(w, p, tokens);
     }
@@ -461,7 +461,7 @@
 
 static void write_mb_features(vp8_writer *w, const MB_MODE_INFO *mi, const MACROBLOCKD *x)
 {
-    // Encode the MB segment id.
+    /* Encode the MB segment id. */
     if (x->segmentation_enabled && x->update_mb_segmentation_map)
     {
         switch (mi->segment_id)
@@ -483,7 +483,7 @@
             vp8_write(w, 1, x->mb_segment_tree_probs[2]);
             break;
 
-            // TRAP.. This should not happen
+            /* TRAP.. This should not happen */
         default:
             vp8_write(w, 0, x->mb_segment_tree_probs[0]);
             vp8_write(w, 0, x->mb_segment_tree_probs[1]);
@@ -497,7 +497,7 @@
     const int rf_intra = rfct[INTRA_FRAME];
     const int rf_inter = rfct[LAST_FRAME] + rfct[GOLDEN_FRAME] + rfct[ALTREF_FRAME];
 
-    // Calculate the probabilities used to code the ref frame based on useage
+    /* Calculate the probabilities used to code the ref frame based on usage */
     if (!(cpi->prob_intra_coded = rf_intra * 255 / (rf_intra + rf_inter)))
         cpi->prob_intra_coded = 1;
 
@@ -571,8 +571,10 @@
 
             MACROBLOCKD *xd = &cpi->mb.e_mbd;
 
-            // Distance of Mb to the various image edges.
-            // These specified to 8th pel as they are always compared to MV values that are in 1/8th pel units
+            /* Distance of Mb to the various image edges.
+             * These specified to 8th pel as they are always compared to MV
+             * values that are in 1/8th pel units
+             */
             xd->mb_to_left_edge = -((mb_col * 16) << 3);
             xd->mb_to_right_edge = ((pc->mb_cols - 1 - mb_col) * 16) << 3;
             xd->mb_to_top_edge = -((mb_row * 16)) << 3;
@@ -779,7 +781,7 @@
             write_uv_mode(bc, (m++)->mbmi.uv_mode, vp8_kf_uv_mode_prob);
         }
 
-        m++;    // skip L prediction border
+        m++;    /* skip L prediction border */
     }
 }
 
@@ -878,9 +880,6 @@
                 /* at every context */
 
                 /* calc probs and branch cts for this frame only */
-                //vp8_prob new_p           [ENTROPY_NODES];
-                //unsigned int branch_ct   [ENTROPY_NODES] [2];
-
                 int t = 0;      /* token/prob index */
 
                 vp8_tree_probs_from_distribution(
@@ -940,9 +939,6 @@
                 /* at every context */
 
                 /* calc probs and branch cts for this frame only */
-                //vp8_prob new_p           [ENTROPY_NODES];
-                //unsigned int branch_ct   [ENTROPY_NODES] [2];
-
                 int t = 0;      /* token/prob index */
 
                 vp8_tree_probs_from_distribution(
@@ -1004,7 +1000,7 @@
     int new_intra, new_last, new_garf, oldtotal, newtotal;
     int ref_frame_cost[MAX_REF_FRAMES];
 
-    vp8_clear_system_state(); //__asm emms;
+    vp8_clear_system_state();
 
     if (cpi->common.frame_type != KEY_FRAME)
     {
@@ -1026,7 +1022,7 @@
             rfct[ALTREF_FRAME] * ref_frame_cost[ALTREF_FRAME];
 
 
-        // old costs
+        /* old costs */
         vp8_calc_ref_frame_costs(ref_frame_cost,cpi->prob_intra_coded,
                                  cpi->prob_last_coded,cpi->prob_gf_coded);
 
@@ -1078,7 +1074,7 @@
 #endif
     int savings = 0;
 
-    vp8_clear_system_state(); //__asm emms;
+    vp8_clear_system_state();
 
     do
     {
@@ -1110,21 +1106,15 @@
             }
             do
             {
-                //note: use result from vp8_estimate_entropy_savings, so no need to call vp8_tree_probs_from_distribution here.
+                /* note: use result from vp8_estimate_entropy_savings, so no
+                 * need to call vp8_tree_probs_from_distribution here.
+                 */
+
                 /* at every context */
 
                 /* calc probs and branch cts for this frame only */
-                //vp8_prob new_p           [ENTROPY_NODES];
-                //unsigned int branch_ct   [ENTROPY_NODES] [2];
-
                 int t = 0;      /* token/prob index */
 
-                //vp8_tree_probs_from_distribution(
-                //    MAX_ENTROPY_TOKENS, vp8_coef_encodings, vp8_coef_tree,
-                //    new_p, branch_ct, (unsigned int *)cpi->coef_counts [i][j][k],
-                //    256, 1
-                //    );
-
                 do
                 {
                     const vp8_prob newp = cpi->frame_coef_probs [i][j][k][t];
@@ -1295,14 +1285,16 @@
     Sectionbits[active_section = 1] += sizeof(VP8_HEADER) * 8 * 256;
 #endif
 
-    // every keyframe send startcode, width, height, scale factor, clamp and color type
+    /* every keyframe send startcode, width, height, scale factor, clamp
+     * and color type
+     */
     if (oh.type == KEY_FRAME)
     {
         int v;
 
         validate_buffer(cx_data, 7, cx_data_end, &cpi->common.error);
 
-        // Start / synch code
+        /* Start / synch code */
         cx_data[0] = 0x9D;
         cx_data[1] = 0x01;
         cx_data[2] = 0x2a;
@@ -1321,7 +1313,7 @@
 
         vp8_start_encode(bc, cx_data, cx_data_end);
 
-        // signal clr type
+        /* signal clr type */
         vp8_write_bit(bc, pc->clr_type);
         vp8_write_bit(bc, pc->clamp_type);
 
@@ -1330,13 +1322,13 @@
         vp8_start_encode(bc, cx_data, cx_data_end);
 
 
-    // Signal whether or not Segmentation is enabled
+    /* Signal whether or not Segmentation is enabled */
     vp8_write_bit(bc, xd->segmentation_enabled);
 
-    // Indicate which features are enabled
+    /* Indicate which features are enabled */
     if (xd->segmentation_enabled)
     {
-        // Signal whether or not the segmentation map is being updated.
+        /* Signal whether or not the segmentation map is being updated. */
         vp8_write_bit(bc, xd->update_mb_segmentation_map);
         vp8_write_bit(bc, xd->update_mb_segmentation_data);
 
@@ -1346,15 +1338,15 @@
 
             vp8_write_bit(bc, xd->mb_segement_abs_delta);
 
-            // For each segmentation feature (Quant and loop filter level)
+            /* For each segmentation feature (Quant and loop filter level) */
             for (i = 0; i < MB_LVL_MAX; i++)
             {
-                // For each of the segments
+                /* For each of the segments */
                 for (j = 0; j < MAX_MB_SEGMENTS; j++)
                 {
                     Data = xd->segment_feature_data[i][j];
 
-                    // Frame level data
+                    /* Frame level data */
                     if (Data)
                     {
                         vp8_write_bit(bc, 1);
@@ -1379,7 +1371,7 @@
 
         if (xd->update_mb_segmentation_map)
         {
-            // Write the probs used to decode the segment id for each macro block.
+            /* Write the probs used to decode the segment id for each mb */
             for (i = 0; i < MB_FEATURE_TREE_PROBS; i++)
             {
                 int Data = xd->mb_segment_tree_probs[i];
@@ -1395,17 +1387,18 @@
         }
     }
 
-    // Code to determine whether or not to update the scan order.
     vp8_write_bit(bc, pc->filter_type);
     vp8_write_literal(bc, pc->filter_level, 6);
     vp8_write_literal(bc, pc->sharpness_level, 3);
 
-    // Write out loop filter deltas applied at the MB level based on mode or ref frame (if they are enabled).
+    /* Write out loop filter deltas applied at the MB level based on mode
+     * or ref frame (if they are enabled).
+     */
     vp8_write_bit(bc, xd->mode_ref_lf_delta_enabled);
 
     if (xd->mode_ref_lf_delta_enabled)
     {
-        // Do the deltas need to be updated
+        /* Do the deltas need to be updated */
         int send_update = xd->mode_ref_lf_delta_update
                           || cpi->oxcf.error_resilient_mode;
 
@@ -1414,12 +1407,12 @@
         {
             int Data;
 
-            // Send update
+            /* Send update */
             for (i = 0; i < MAX_REF_LF_DELTAS; i++)
             {
                 Data = xd->ref_lf_deltas[i];
 
-                // Frame level data
+                /* Frame level data */
                 if (xd->ref_lf_deltas[i] != xd->last_ref_lf_deltas[i]
                     || cpi->oxcf.error_resilient_mode)
                 {
@@ -1429,20 +1422,20 @@
                     if (Data > 0)
                     {
                         vp8_write_literal(bc, (Data & 0x3F), 6);
-                        vp8_write_bit(bc, 0);    // sign
+                        vp8_write_bit(bc, 0);    /* sign */
                     }
                     else
                     {
                         Data = -Data;
                         vp8_write_literal(bc, (Data & 0x3F), 6);
-                        vp8_write_bit(bc, 1);    // sign
+                        vp8_write_bit(bc, 1);    /* sign */
                     }
                 }
                 else
                     vp8_write_bit(bc, 0);
             }
 
-            // Send update
+            /* Send update */
             for (i = 0; i < MAX_MODE_LF_DELTAS; i++)
             {
                 Data = xd->mode_lf_deltas[i];
@@ -1456,13 +1449,13 @@
                     if (Data > 0)
                     {
                         vp8_write_literal(bc, (Data & 0x3F), 6);
-                        vp8_write_bit(bc, 0);    // sign
+                        vp8_write_bit(bc, 0);    /* sign */
                     }
                     else
                     {
                         Data = -Data;
                         vp8_write_literal(bc, (Data & 0x3F), 6);
-                        vp8_write_bit(bc, 1);    // sign
+                        vp8_write_bit(bc, 1);    /* sign */
                     }
                 }
                 else
@@ -1471,34 +1464,42 @@
         }
     }
 
-    //signal here is multi token partition is enabled
+    /* signal here if multi token partition is enabled */
     vp8_write_literal(bc, pc->multi_token_partition, 2);
 
-    // Frame Qbaseline quantizer index
+    /* Frame Qbaseline quantizer index */
     vp8_write_literal(bc, pc->base_qindex, 7);
 
-    // Transmit Dc, Second order and Uv quantizer delta information
+    /* Transmit Dc, Second order and Uv quantizer delta information */
     put_delta_q(bc, pc->y1dc_delta_q);
     put_delta_q(bc, pc->y2dc_delta_q);
     put_delta_q(bc, pc->y2ac_delta_q);
     put_delta_q(bc, pc->uvdc_delta_q);
     put_delta_q(bc, pc->uvac_delta_q);
 
-    // When there is a key frame all reference buffers are updated using the new key frame
+    /* When there is a key frame all reference buffers are updated using
+     * the new key frame
+     */
     if (pc->frame_type != KEY_FRAME)
     {
-        // Should the GF or ARF be updated using the transmitted frame or buffer
+        /* Should the GF or ARF be updated using the transmitted frame
+         * or buffer
+         */
         vp8_write_bit(bc, pc->refresh_golden_frame);
         vp8_write_bit(bc, pc->refresh_alt_ref_frame);
 
-        // If not being updated from current frame should either GF or ARF be updated from another buffer
+        /* If not being updated from current frame should either GF or ARF
+         * be updated from another buffer
+         */
         if (!pc->refresh_golden_frame)
             vp8_write_literal(bc, pc->copy_buffer_to_gf, 2);
 
         if (!pc->refresh_alt_ref_frame)
             vp8_write_literal(bc, pc->copy_buffer_to_arf, 2);
 
-        // Indicate reference frame sign bias for Golden and ARF frames (always 0 for last frame buffer)
+        /* Indicate reference frame sign bias for Golden and ARF frames
+         * (always 0 for last frame buffer)
+         */
         vp8_write_bit(bc, pc->ref_frame_sign_bias[GOLDEN_FRAME]);
         vp8_write_bit(bc, pc->ref_frame_sign_bias[ALTREF_FRAME]);
     }
@@ -1527,14 +1528,14 @@
 
 #endif
 
-    vp8_clear_system_state();  //__asm emms;
+    vp8_clear_system_state();
 
 #if CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING
     pack_coef_probs(cpi);
 #else
     if (pc->refresh_entropy_probs == 0)
     {
-        // save a copy for later refresh
+        /* save a copy for later refresh */
         vpx_memcpy(&cpi->common.lfc, &cpi->common.fc, sizeof(cpi->common.fc));
     }
 
@@ -1545,7 +1546,7 @@
     active_section = 2;
 #endif
 
-    // Write out the mb_no_coeff_skip flag
+    /* Write out the mb_no_coeff_skip flag */
     vp8_write_bit(bc, pc->mb_no_coeff_skip);
 
     if (pc->frame_type == KEY_FRAME)
diff --git a/vp8/encoder/block.h b/vp8/encoder/block.h
index 9756acc..0b0a234 100644
--- a/vp8/encoder/block.h
+++ b/vp8/encoder/block.h
@@ -18,7 +18,7 @@
 #include "vp8/common/entropy.h"
 #include "vpx_ports/mem.h"
 
-// motion search site
+/* motion search site */
 typedef struct
 {
     MV mv;
@@ -27,11 +27,11 @@
 
 typedef struct block
 {
-    // 16 Y blocks, 4 U blocks, 4 V blocks each with 16 entries
+    /* 16 Y blocks, 4 U blocks, 4 V blocks each with 16 entries */
     short *src_diff;
     short *coeff;
 
-    // 16 Y blocks, 4 U blocks, 4 V blocks each with 16 entries
+    /* 16 Y blocks, 4 U blocks, 4 V blocks each with 16 entries */
     short *quant;
     short *quant_fast;
     unsigned char *quant_shift;
@@ -39,7 +39,7 @@
     short *zrun_zbin_boost;
     short *round;
 
-    // Zbin Over Quant value
+    /* Zbin Over Quant value */
     short zbin_extra;
 
     unsigned char **base_src;
@@ -59,12 +59,12 @@
 
 typedef struct macroblock
 {
-    DECLARE_ALIGNED(16, short, src_diff[400]);       // 16x16 Y 8x8 U 8x8 V 4x4 2nd Y
-    DECLARE_ALIGNED(16, short, coeff[400]);     // 16x16 Y 8x8 U 8x8 V 4x4 2nd Y
+    DECLARE_ALIGNED(16, short, src_diff[400]); /* 25 blocks Y,U,V,Y2 */
+    DECLARE_ALIGNED(16, short, coeff[400]); /* 25 blocks Y,U,V,Y2 */
     DECLARE_ALIGNED(16, unsigned char, thismb[256]);
 
     unsigned char *thismb_ptr;
-    // 16 Y blocks, 4 U blocks, 4 V blocks, 1 DC 2nd order block each with 16 entries
+    /* 16 Y, 4 U, 4 V, 1 DC 2nd order block */
     BLOCK block[25];
 
     YV12_BUFFER_CONFIG src;
@@ -99,8 +99,9 @@
     int (*token_costs)[COEF_BANDS][PREV_COEF_CONTEXTS]
     [MAX_ENTROPY_TOKENS];
 
-    // These define limits to motion vector components to prevent
-    // them from extending outside the UMV borders
+    /* These define limits to motion vector components to prevent
+     * them from extending outside the UMV borders.
+     */
     int mv_col_min;
     int mv_col_max;
     int mv_row_min;
@@ -110,7 +111,6 @@
 
     unsigned int encode_breakout;
 
-    //char * gf_active_ptr;
     signed char *gf_active_ptr;
 
     unsigned char *active_ptr;
diff --git a/vp8/encoder/boolhuff.h b/vp8/encoder/boolhuff.h
index fb6cbaf..8309063 100644
--- a/vp8/encoder/boolhuff.h
+++ b/vp8/encoder/boolhuff.h
@@ -32,7 +32,7 @@
     unsigned char *buffer_end;
     struct vpx_internal_error_info *error;
 
-    // Variables used to track bit costs without outputing to the bitstream
+    /* Variables used to track bit costs without outputting to the bitstream */
     unsigned int  measure_cost;
     unsigned long bit_counter;
 } BOOL_CODER;
diff --git a/vp8/encoder/denoising.c b/vp8/encoder/denoising.c
index f392396..d6b03e6 100644
--- a/vp8/encoder/denoising.c
+++ b/vp8/encoder/denoising.c
@@ -16,22 +16,26 @@
 #include "vpx_rtcd.h"
 
 static const unsigned int NOISE_MOTION_THRESHOLD = 25 * 25;
-// SSE_DIFF_THRESHOLD is selected as ~95% confidence assuming var(noise) ~= 100.
+/* SSE_DIFF_THRESHOLD is selected as ~95% confidence assuming
+ * var(noise) ~= 100.
+ */
 static const unsigned int SSE_DIFF_THRESHOLD = 16 * 16 * 20;
 static const unsigned int SSE_THRESHOLD = 16 * 16 * 40;
 
-// The filtering coefficients used for denoizing are adjusted for static
-// blocks, or blocks with very small motion vectors. This is done through
-// the motion magnitude parameter.
-//
-// There are currently 2048 possible mapping from absolute difference to
-// filter coefficient depending on the motion magnitude. Each mapping is
-// in a LUT table. All these tables are staticly allocated but they are only
-// filled on their first use.
-//
-// Each entry is a pair of 16b values, the coefficient and its complement
-// to 256. Each of these value should only be 8b but they are 16b wide to
-// avoid slow partial register manipulations.
+/*
+ * The filtering coefficients used for denoising are adjusted for static
+ * blocks, or blocks with very small motion vectors. This is done through
+ * the motion magnitude parameter.
+ *
+ * There are currently 2048 possible mappings from absolute difference to
+ * filter coefficient depending on the motion magnitude. Each mapping is
+ * in a LUT table. All these tables are statically allocated but they are only
+ * filled on their first use.
+ *
+ * Each entry is a pair of 16b values, the coefficient and its complement
+ * to 256. Each of these values should only be 8b but they are 16b wide to
+ * avoid slow partial register manipulations.
+ */
 enum {num_motion_magnitude_adjustments = 2048};
 
 static union coeff_pair filter_coeff_LUT[num_motion_magnitude_adjustments][256];
@@ -100,7 +104,7 @@
 
     for (r = 0; r < 16; ++r)
     {
-        // Calculate absolute differences
+        /* Calculate absolute differences */
         unsigned char abs_diff[16];
 
         union coeff_pair filter_coefficient[16];
@@ -112,13 +116,13 @@
             abs_diff[c] = absdiff;
         }
 
-        // Use LUT to get filter coefficients (two 16b value; f and 256-f)
+        /* Use LUT to get filter coefficients (two 16b value; f and 256-f) */
         for (c = 0; c < 16; ++c)
         {
             filter_coefficient[c] = LUT[abs_diff[c]];
         }
 
-        // Filtering...
+        /* Filtering... */
         for (c = 0; c < 16; ++c)
         {
             const uint16_t state = (uint16_t)(mc_running_avg_y[c]);
@@ -128,10 +132,11 @@
                     filter_coefficient[c].as_short[1] * sample + 128) >> 8;
         }
 
-        // Depending on the magnitude of the difference between the signal and
-        // filtered version, either replace the signal by the filtered one or
-        // update the filter state with the signal when the change in a pixel
-        // isn't classified as noise.
+        /* Depending on the magnitude of the difference between the signal and
+         * filtered version, either replace the signal by the filtered one or
+         * update the filter state with the signal when the change in a pixel
+         * isn't classified as noise.
+         */
         for (c = 0; c < 16; ++c)
         {
             const int diff = sig[c] - running_avg_y[c];
@@ -148,7 +153,7 @@
             }
         }
 
-        // Update pointers for next iteration.
+        /* Update pointers for next iteration. */
         sig += sig_stride;
         filtered += 16;
         mc_running_avg_y += mc_avg_y_stride;
@@ -228,7 +233,6 @@
 
     enum vp8_denoiser_decision decision = FILTER_BLOCK;
 
-    // Motion compensate the running average.
     if (zero_frame)
     {
         YV12_BUFFER_CONFIG *src = &denoiser->yv12_running_avg[frame];
@@ -243,7 +247,7 @@
 
         saved_mbmi = *mbmi;
 
-        // Use the best MV for the compensation.
+        /* Use the best MV for the compensation. */
         mbmi->ref_frame = x->best_reference_frame;
         mbmi->mode = x->best_sse_inter_mode;
         mbmi->mv = x->best_sse_mv;
@@ -252,14 +256,18 @@
         mv_row = x->best_sse_mv.as_mv.row;
 
         if (frame == INTRA_FRAME ||
-            (mv_row *mv_row + mv_col *mv_col <= NOISE_MOTION_THRESHOLD &&
-             sse_diff < SSE_DIFF_THRESHOLD))
+            ((unsigned int)(mv_row *mv_row + mv_col *mv_col)
+              <= NOISE_MOTION_THRESHOLD &&
+             sse_diff < (int)SSE_DIFF_THRESHOLD))
         {
-            // Handle intra blocks as referring to last frame with zero motion
-            // and let the absolute pixel difference affect the filter factor.
-            // Also consider small amount of motion as being random walk due to
-            // noise, if it doesn't mean that we get a much bigger error.
-            // Note that any changes to the mode info only affects the denoising.
+            /*
+             * Handle intra blocks as referring to last frame with zero motion
+             * and let the absolute pixel difference affect the filter factor.
+             * Also consider small amount of motion as being random walk due
+             * to noise, if it doesn't mean that we get a much bigger error.
+             * Note that any changes to the mode info only affects the
+             * denoising.
+             */
             mbmi->ref_frame =
                     x->best_zeromv_reference_frame;
 
@@ -275,11 +283,11 @@
         saved_pre = filter_xd->pre;
         saved_dst = filter_xd->dst;
 
-        // Compensate the running average.
+        /* Compensate the running average. */
         filter_xd->pre.y_buffer = src->y_buffer + recon_yoffset;
         filter_xd->pre.u_buffer = src->u_buffer + recon_uvoffset;
         filter_xd->pre.v_buffer = src->v_buffer + recon_uvoffset;
-        // Write the compensated running average to the destination buffer.
+        /* Write the compensated running average to the destination buffer. */
         filter_xd->dst.y_buffer = dst->y_buffer + recon_yoffset;
         filter_xd->dst.u_buffer = dst->u_buffer + recon_uvoffset;
         filter_xd->dst.v_buffer = dst->v_buffer + recon_uvoffset;
@@ -314,7 +322,7 @@
 
     if (decision == FILTER_BLOCK)
     {
-        // Filter.
+        /* Filter. */
         decision = vp8_denoiser_filter(&denoiser->yv12_mc_running_avg,
                                        &denoiser->yv12_running_avg[LAST_FRAME],
                                        x,
@@ -323,8 +331,9 @@
     }
     if (decision == COPY_BLOCK)
     {
-        // No filtering of this block; it differs too much from the predictor,
-        // or the motion vector magnitude is considered too big.
+        /* No filtering of this block; it differs too much from the predictor,
+         * or the motion vector magnitude is considered too big.
+         */
         vp8_copy_mem16x16(
                 x->thismb, 16,
                 denoiser->yv12_running_avg[LAST_FRAME].y_buffer + recon_yoffset,
diff --git a/vp8/encoder/denoising.h b/vp8/encoder/denoising.h
index dc78e65..2f5fbff 100644
--- a/vp8/encoder/denoising.h
+++ b/vp8/encoder/denoising.h
@@ -19,7 +19,7 @@
 enum vp8_denoiser_decision
 {
   COPY_BLOCK,
-  FILTER_BLOCK,
+  FILTER_BLOCK
 };
 
 typedef struct vp8_denoiser
@@ -47,4 +47,4 @@
 
 union coeff_pair *vp8_get_filter_coeff_LUT(unsigned int motion_magnitude);
 
-#endif  // VP8_ENCODER_DENOISING_H_
+#endif  /* VP8_ENCODER_DENOISING_H_ */
diff --git a/vp8/encoder/encodeframe.c b/vp8/encoder/encodeframe.c
index 2cef01e..600e815 100644
--- a/vp8/encoder/encodeframe.c
+++ b/vp8/encoder/encodeframe.c
@@ -77,7 +77,7 @@
 };
 
 
-// Original activity measure from Tim T's code.
+/* Original activity measure from Tim T's code. */
 static unsigned int tt_activity_measure( VP8_COMP *cpi, MACROBLOCK *x )
 {
     unsigned int act;
@@ -100,7 +100,7 @@
     return act;
 }
 
-// Stub for alternative experimental activity measures.
+/* Stub for alternative experimental activity measures. */
 static unsigned int alt_activity_measure( VP8_COMP *cpi,
                                           MACROBLOCK *x, int use_dc_pred )
 {
@@ -108,8 +108,9 @@
 }
 
 
-// Measure the activity of the current macroblock
-// What we measure here is TBD so abstracted to this function
+/* Measure the activity of the current macroblock
+ * What we measure here is TBD so abstracted to this function
+ */
 #define ALT_ACT_MEASURE 1
 static unsigned int mb_activity_measure( VP8_COMP *cpi, MACROBLOCK *x,
                                   int mb_row, int mb_col)
@@ -120,12 +121,12 @@
     {
         int use_dc_pred = (mb_col || mb_row) && (!mb_col || !mb_row);
 
-        // Or use and alternative.
+        /* Or use an alternative. */
         mb_activity = alt_activity_measure( cpi, x, use_dc_pred );
     }
     else
     {
-        // Original activity measure from Tim T's code.
+        /* Original activity measure from Tim T's code. */
         mb_activity = tt_activity_measure( cpi, x );
     }
 
@@ -135,36 +136,36 @@
     return mb_activity;
 }
 
-// Calculate an "average" mb activity value for the frame
+/* Calculate an "average" mb activity value for the frame */
 #define ACT_MEDIAN 0
 static void calc_av_activity( VP8_COMP *cpi, int64_t activity_sum )
 {
 #if ACT_MEDIAN
-    // Find median: Simple n^2 algorithm for experimentation
+    /* Find median: Simple n^2 algorithm for experimentation */
     {
         unsigned int median;
         unsigned int i,j;
         unsigned int * sortlist;
         unsigned int tmp;
 
-        // Create a list to sort to
+        /* Create a list to sort to */
         CHECK_MEM_ERROR(sortlist,
                         vpx_calloc(sizeof(unsigned int),
                         cpi->common.MBs));
 
-        // Copy map to sort list
+        /* Copy map to sort list */
         vpx_memcpy( sortlist, cpi->mb_activity_map,
                     sizeof(unsigned int) * cpi->common.MBs );
 
 
-        // Ripple each value down to its correct position
+        /* Ripple each value down to its correct position */
         for ( i = 1; i < cpi->common.MBs; i ++ )
         {
             for ( j = i; j > 0; j -- )
             {
                 if ( sortlist[j] < sortlist[j-1] )
                 {
-                    // Swap values
+                    /* Swap values */
                     tmp = sortlist[j-1];
                     sortlist[j-1] = sortlist[j];
                     sortlist[j] = tmp;
@@ -174,7 +175,7 @@
             }
         }
 
-        // Even number MBs so estimate median as mean of two either side.
+        /* Even number MBs so estimate median as mean of two either side. */
         median = ( 1 + sortlist[cpi->common.MBs >> 1] +
                    sortlist[(cpi->common.MBs >> 1) + 1] ) >> 1;
 
@@ -183,14 +184,14 @@
         vpx_free(sortlist);
     }
 #else
-    // Simple mean for now
+    /* Simple mean for now */
     cpi->activity_avg = (unsigned int)(activity_sum/cpi->common.MBs);
 #endif
 
     if (cpi->activity_avg < VP8_ACTIVITY_AVG_MIN)
         cpi->activity_avg = VP8_ACTIVITY_AVG_MIN;
 
-    // Experimental code: return fixed value normalized for several clips
+    /* Experimental code: return fixed value normalized for several clips */
     if  ( ALT_ACT_MEASURE )
         cpi->activity_avg = 100000;
 }
@@ -199,7 +200,7 @@
 #define OUTPUT_NORM_ACT_STATS   0
 
 #if USE_ACT_INDEX
-// Calculate and activity index for each mb
+/* Calculate an activity index for each mb */
 static void calc_activity_index( VP8_COMP *cpi, MACROBLOCK *x )
 {
     VP8_COMMON *const cm = & cpi->common;
@@ -214,19 +215,19 @@
     fprintf(f, "\n%12d\n", cpi->activity_avg );
 #endif
 
-    // Reset pointers to start of activity map
+    /* Reset pointers to start of activity map */
     x->mb_activity_ptr = cpi->mb_activity_map;
 
-    // Calculate normalized mb activity number.
+    /* Calculate normalized mb activity number. */
     for (mb_row = 0; mb_row < cm->mb_rows; mb_row++)
     {
-        // for each macroblock col in image
+        /* for each macroblock col in image */
         for (mb_col = 0; mb_col < cm->mb_cols; mb_col++)
         {
-            // Read activity from the map
+            /* Read activity from the map */
             act = *(x->mb_activity_ptr);
 
-            // Calculate a normalized activity number
+            /* Calculate a normalized activity number */
             a = act + 4*cpi->activity_avg;
             b = 4*act + cpi->activity_avg;
 
@@ -238,7 +239,7 @@
 #if OUTPUT_NORM_ACT_STATS
             fprintf(f, " %6d", *(x->mb_activity_ptr));
 #endif
-            // Increment activity map pointers
+            /* Increment activity map pointers */
             x->mb_activity_ptr++;
         }
 
@@ -255,8 +256,9 @@
 }
 #endif
 
-// Loop through all MBs. Note activity of each, average activity and
-// calculate a normalized activity for each
+/* Loop through all MBs. Note activity of each, average activity and
+ * calculate a normalized activity for each
+ */
 static void build_activity_map( VP8_COMP *cpi )
 {
     MACROBLOCK *const x = & cpi->mb;
@@ -273,15 +275,15 @@
     unsigned int mb_activity;
     int64_t activity_sum = 0;
 
-    // for each macroblock row in image
+    /* for each macroblock row in image */
     for (mb_row = 0; mb_row < cm->mb_rows; mb_row++)
     {
 #if ALT_ACT_MEASURE
-        // reset above block coeffs
+        /* reset above block coeffs */
         xd->up_available = (mb_row != 0);
         recon_yoffset = (mb_row * recon_y_stride * 16);
 #endif
-        // for each macroblock col in image
+        /* for each macroblock col in image */
         for (mb_col = 0; mb_col < cm->mb_cols; mb_col++)
         {
 #if ALT_ACT_MEASURE
@@ -289,48 +291,48 @@
             xd->left_available = (mb_col != 0);
             recon_yoffset += 16;
 #endif
-            //Copy current mb to a buffer
+            /* Copy current mb to a buffer */
             vp8_copy_mem16x16(x->src.y_buffer, x->src.y_stride, x->thismb, 16);
 
-            // measure activity
+            /* measure activity */
             mb_activity = mb_activity_measure( cpi, x, mb_row, mb_col );
 
-            // Keep frame sum
+            /* Keep frame sum */
             activity_sum += mb_activity;
 
-            // Store MB level activity details.
+            /* Store MB level activity details. */
             *x->mb_activity_ptr = mb_activity;
 
-            // Increment activity map pointer
+            /* Increment activity map pointer */
             x->mb_activity_ptr++;
 
-            // adjust to the next column of source macroblocks
+            /* adjust to the next column of source macroblocks */
             x->src.y_buffer += 16;
         }
 
 
-        // adjust to the next row of mbs
+        /* adjust to the next row of mbs */
         x->src.y_buffer += 16 * x->src.y_stride - 16 * cm->mb_cols;
 
 #if ALT_ACT_MEASURE
-        //extend the recon for intra prediction
+        /* extend the recon for intra prediction */
         vp8_extend_mb_row(new_yv12, xd->dst.y_buffer + 16,
                           xd->dst.u_buffer + 8, xd->dst.v_buffer + 8);
 #endif
 
     }
 
-    // Calculate an "average" MB activity
+    /* Calculate an "average" MB activity */
     calc_av_activity(cpi, activity_sum);
 
 #if USE_ACT_INDEX
-    // Calculate an activity index number of each mb
+    /* Calculate an activity index number of each mb */
     calc_activity_index( cpi, x );
 #endif
 
 }
 
-// Macroblock activity masking
+/* Macroblock activity masking */
 void vp8_activity_masking(VP8_COMP *cpi, MACROBLOCK *x)
 {
 #if USE_ACT_INDEX
@@ -342,7 +344,7 @@
     int64_t b;
     int64_t act = *(x->mb_activity_ptr);
 
-    // Apply the masking to the RD multiplier.
+    /* Apply the masking to the RD multiplier. */
     a = act + (2*cpi->activity_avg);
     b = (2*act) + cpi->activity_avg;
 
@@ -351,7 +353,7 @@
     x->errorperbit += (x->errorperbit==0);
 #endif
 
-    // Activity based Zbin adjustment
+    /* Activity based Zbin adjustment */
     adjust_act_zbin(cpi, x);
 }
 
@@ -398,7 +400,7 @@
         w = &cpi->bc[1];
 #endif
 
-    // reset above block coeffs
+    /* reset above block coeffs */
     xd->above_context = cm->above_context;
 
     xd->up_available = (mb_row != 0);
@@ -406,37 +408,41 @@
     recon_uvoffset = (mb_row * recon_uv_stride * 8);
 
     cpi->tplist[mb_row].start = *tp;
-    //printf("Main mb_row = %d\n", mb_row);
+    /* printf("Main mb_row = %d\n", mb_row); */
 
-    // Distance of Mb to the top & bottom edges, specified in 1/8th pel
-    // units as they are always compared to values that are in 1/8th pel units
+    /* Distance of Mb to the top & bottom edges, specified in 1/8th pel
+     * units as they are always compared to values that are in 1/8th pel
+     */
     xd->mb_to_top_edge = -((mb_row * 16) << 3);
     xd->mb_to_bottom_edge = ((cm->mb_rows - 1 - mb_row) * 16) << 3;
 
-    // Set up limit values for vertical motion vector components
-    // to prevent them extending beyond the UMV borders
+    /* Set up limit values for vertical motion vector components
+     * to prevent them extending beyond the UMV borders
+     */
     x->mv_row_min = -((mb_row * 16) + (VP8BORDERINPIXELS - 16));
     x->mv_row_max = ((cm->mb_rows - 1 - mb_row) * 16)
                         + (VP8BORDERINPIXELS - 16);
 
-    // Set the mb activity pointer to the start of the row.
+    /* Set the mb activity pointer to the start of the row. */
     x->mb_activity_ptr = &cpi->mb_activity_map[map_index];
 
-    // for each macroblock col in image
+    /* for each macroblock col in image */
     for (mb_col = 0; mb_col < cm->mb_cols; mb_col++)
     {
 
 #if  (CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING)
         *tp = cpi->tok;
 #endif
-        // Distance of Mb to the left & right edges, specified in
-        // 1/8th pel units as they are always compared to values
-        // that are in 1/8th pel units
+        /* Distance of Mb to the left & right edges, specified in
+         * 1/8th pel units as they are always compared to values
+         * that are in 1/8th pel units
+         */
         xd->mb_to_left_edge = -((mb_col * 16) << 3);
         xd->mb_to_right_edge = ((cm->mb_cols - 1 - mb_col) * 16) << 3;
 
-        // Set up limit values for horizontal motion vector components
-        // to prevent them extending beyond the UMV borders
+        /* Set up limit values for horizontal motion vector components
+         * to prevent them extending beyond the UMV borders
+         */
         x->mv_col_min = -((mb_col * 16) + (VP8BORDERINPIXELS - 16));
         x->mv_col_max = ((cm->mb_cols - 1 - mb_col) * 16)
                             + (VP8BORDERINPIXELS - 16);
@@ -449,13 +455,13 @@
         x->rddiv = cpi->RDDIV;
         x->rdmult = cpi->RDMULT;
 
-        //Copy current mb to a buffer
+        /* Copy current mb to a buffer */
         vp8_copy_mem16x16(x->src.y_buffer, x->src.y_stride, x->thismb, 16);
 
 #if CONFIG_MULTITHREAD
         if (cpi->b_multi_threaded != 0)
         {
-            *current_mb_col = mb_col - 1; // set previous MB done
+            *current_mb_col = mb_col - 1; /* set previous MB done */
 
             if ((mb_col & (nsync - 1)) == 0)
             {
@@ -471,11 +477,13 @@
         if(cpi->oxcf.tuning == VP8_TUNE_SSIM)
             vp8_activity_masking(cpi, x);
 
-        // Is segmentation enabled
-        // MB level adjustment to quantizer
+        /* Is segmentation enabled */
+        /* MB level adjustment to quantizer */
         if (xd->segmentation_enabled)
         {
-            // Code to set segment id in xd->mbmi.segment_id for current MB (with range checking)
+            /* Code to set segment id in xd->mbmi.segment_id for current MB
+             * (with range checking)
+             */
             if (cpi->segmentation_map[map_index+mb_col] <= 3)
                 xd->mode_info_context->mbmi.segment_id = cpi->segmentation_map[map_index+mb_col];
             else
@@ -484,7 +492,8 @@
             vp8cx_mb_init_quantizer(cpi, x, 1);
         }
         else
-            xd->mode_info_context->mbmi.segment_id = 0;         // Set to Segment 0 by default
+            /* Set to Segment 0 by default */
+            xd->mode_info_context->mbmi.segment_id = 0;
 
         x->active_ptr = cpi->active_map + map_index + mb_col;
 
@@ -514,21 +523,24 @@
 
 #endif
 
-            // Count of last ref frame 0,0 usage
-            if ((xd->mode_info_context->mbmi.mode == ZEROMV) && (xd->mode_info_context->mbmi.ref_frame == LAST_FRAME))
-                cpi->inter_zz_count ++;
-
-            // Special case code for cyclic refresh
-            // If cyclic update enabled then copy xd->mbmi.segment_id; (which may have been updated based on mode
-            // during vp8cx_encode_inter_macroblock()) back into the global segmentation map
+            /* Special case code for cyclic refresh
+             * If cyclic update enabled then copy xd->mbmi.segment_id; (which
+             * may have been updated based on mode during
+             * vp8cx_encode_inter_macroblock()) back into the global
+             * segmentation map
+             */
             if ((cpi->current_layer == 0) &&
                 (cpi->cyclic_refresh_mode_enabled && xd->segmentation_enabled))
             {
                 cpi->segmentation_map[map_index+mb_col] = xd->mode_info_context->mbmi.segment_id;
 
-                // If the block has been refreshed mark it as clean (the magnitude of the -ve influences how long it will be before we consider another refresh):
-                // Else if it was coded (last frame 0,0) and has not already been refreshed then mark it as a candidate for cleanup next time (marked 0)
-                // else mark it as dirty (1).
+                /* If the block has been refreshed mark it as clean (the
+                 * magnitude of the -ve influences how long it will be before
+                 * we consider another refresh):
+                 * Else if it was coded (last frame 0,0) and has not already
+                 * been refreshed then mark it as a candidate for cleanup
+                 * next time (marked 0) else mark it as dirty (1).
+                 */
                 if (xd->mode_info_context->mbmi.segment_id)
                     cpi->cyclic_refresh_map[map_index+mb_col] = -1;
                 else if ((xd->mode_info_context->mbmi.mode == ZEROMV) && (xd->mode_info_context->mbmi.ref_frame == LAST_FRAME))
@@ -551,13 +563,13 @@
             pack_tokens(w, tp_start, tok_count);
         }
 #endif
-        // Increment pointer into gf usage flags structure.
+        /* Increment pointer into gf usage flags structure. */
         x->gf_active_ptr++;
 
-        // Increment the activity mask pointers.
+        /* Increment the activity mask pointers. */
         x->mb_activity_ptr++;
 
-        // adjust to the next column of macroblocks
+        /* adjust to the next column of macroblocks */
         x->src.y_buffer += 16;
         x->src.u_buffer += 8;
         x->src.v_buffer += 8;
@@ -565,16 +577,16 @@
         recon_yoffset += 16;
         recon_uvoffset += 8;
 
-        // Keep track of segment usage
+        /* Keep track of segment usage */
         segment_counts[xd->mode_info_context->mbmi.segment_id] ++;
 
-        // skip to next mb
+        /* skip to next mb */
         xd->mode_info_context++;
         x->partition_info++;
         xd->above_context++;
     }
 
-    //extend the recon for intra prediction
+    /* extend the recon for intra prediction */
     vp8_extend_mb_row( &cm->yv12_fb[dst_fb_idx],
                         xd->dst.y_buffer + 16,
                         xd->dst.u_buffer + 8,
@@ -585,7 +597,7 @@
         *current_mb_col = rightmost_col;
 #endif
 
-    // this is to account for the border
+    /* this is to account for the border */
     xd->mode_info_context++;
     x->partition_info++;
 }
@@ -596,10 +608,10 @@
     VP8_COMMON *const cm = & cpi->common;
     MACROBLOCKD *const xd = & x->e_mbd;
 
-    // GF active flags data structure
+    /* GF active flags data structure */
     x->gf_active_ptr = (signed char *)cpi->gf_active_flags;
 
-    // Activity map pointer
+    /* Activity map pointer */
     x->mb_activity_ptr = cpi->mb_activity_map;
 
     x->act_zbin_adj = 0;
@@ -611,24 +623,20 @@
 
     xd->frame_type = cm->frame_type;
 
-    // reset intra mode contexts
+    /* reset intra mode contexts */
     if (cm->frame_type == KEY_FRAME)
         vp8_init_mbmode_probs(cm);
 
-    // Copy data over into macro block data structures.
+    /* Copy data over into macro block data structures. */
     x->src = * cpi->Source;
     xd->pre = cm->yv12_fb[cm->lst_fb_idx];
     xd->dst = cm->yv12_fb[cm->new_fb_idx];
 
-    // set up frame for intra coded blocks
+    /* set up frame for intra coded blocks */
     vp8_setup_intra_recon(&cm->yv12_fb[cm->new_fb_idx]);
 
     vp8_build_block_offsets(x);
 
-    vp8_setup_block_dptrs(&x->e_mbd);
-
-    vp8_setup_block_ptrs(x);
-
     xd->mode_info_context->mbmi.mode = DC_PRED;
     xd->mode_info_context->mbmi.uv_mode = DC_PRED;
 
@@ -643,16 +651,18 @@
     vpx_memset(cm->above_context, 0,
                sizeof(ENTROPY_CONTEXT_PLANES) * cm->mb_cols);
 
-    // Special case treatment when GF and ARF are not sensible options for reference
-    if (cpi->ref_frame_flags == VP8_LAST_FLAG)
+    /* Special case treatment when GF and ARF are not sensible options
+     * for reference
+     */
+    if (cpi->ref_frame_flags == VP8_LAST_FRAME)
         vp8_calc_ref_frame_costs(x->ref_frame_cost,
                                  cpi->prob_intra_coded,255,128);
     else if ((cpi->oxcf.number_of_layers > 1) &&
-               (cpi->ref_frame_flags == VP8_GOLD_FLAG))
+               (cpi->ref_frame_flags == VP8_GOLD_FRAME))
         vp8_calc_ref_frame_costs(x->ref_frame_cost,
                                  cpi->prob_intra_coded,1,255);
     else if ((cpi->oxcf.number_of_layers > 1) &&
-                (cpi->ref_frame_flags == VP8_ALT_FLAG))
+                (cpi->ref_frame_flags == VP8_ALTR_FRAME))
         vp8_calc_ref_frame_costs(x->ref_frame_cost,
                                  cpi->prob_intra_coded,1,1);
     else
@@ -676,7 +686,7 @@
     int segment_counts[MAX_MB_SEGMENTS];
     int totalrate;
 #if CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING
-    BOOL_CODER * bc = &cpi->bc[1]; // bc[0] is for control partition
+    BOOL_CODER * bc = &cpi->bc[1]; /* bc[0] is for control partition */
     const int num_part = (1 << cm->multi_token_partition);
 #endif
 
@@ -691,7 +701,7 @@
             vp8_auto_select_speed(cpi);
     }
 
-    // Functions setup for all frame types so we can use MC in AltRef
+    /* Functions setup for all frame types so we can use MC in AltRef */
     if(!cm->use_bilinear_mc_filter)
     {
         xd->subpixel_predict        = vp8_sixtap_predict4x4;
@@ -707,16 +717,13 @@
         xd->subpixel_predict16x16   = vp8_bilinear_predict16x16;
     }
 
-    // Reset frame count of inter 0,0 motion vector usage.
-    cpi->inter_zz_count = 0;
-
     cpi->prediction_error = 0;
     cpi->intra_error = 0;
     cpi->skip_true_count = 0;
     cpi->tok_count = 0;
 
 #if 0
-    // Experimental code
+    /* Experimental code */
     cpi->frame_distortion = 0;
     cpi->last_mb_distortion = 0;
 #endif
@@ -736,14 +743,14 @@
 
     if(cpi->oxcf.tuning == VP8_TUNE_SSIM)
     {
-        // Initialize encode frame context.
+        /* Initialize encode frame context. */
         init_encode_frame_mb_context(cpi);
 
-        // Build a frame level activity map
+        /* Build a frame level activity map */
         build_activity_map(cpi);
     }
 
-    // re-init encode frame context.
+    /* re-init encode frame context. */
     init_encode_frame_mb_context(cpi);
 
 #if CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING
@@ -790,7 +797,7 @@
 
                 encode_mb_row(cpi, cm, mb_row, x, xd, &tp, segment_counts, &totalrate);
 
-                // adjust to the next row of mbs
+                /* adjust to the next row of mbs */
                 x->src.y_buffer += 16 * x->src.y_stride * (cpi->encoding_thread_count + 1) - 16 * cm->mb_cols;
                 x->src.u_buffer +=  8 * x->src.uv_stride * (cpi->encoding_thread_count + 1) - 8 * cm->mb_cols;
                 x->src.v_buffer +=  8 * x->src.uv_stride * (cpi->encoding_thread_count + 1) - 8 * cm->mb_cols;
@@ -809,7 +816,8 @@
 
             for (mb_row = 0; mb_row < cm->mb_rows; mb_row ++)
             {
-                cpi->tok_count += cpi->tplist[mb_row].stop - cpi->tplist[mb_row].start;
+                cpi->tok_count += (unsigned int)
+                  (cpi->tplist[mb_row].stop - cpi->tplist[mb_row].start);
             }
 
             if (xd->segmentation_enabled)
@@ -836,7 +844,7 @@
         else
 #endif
         {
-            // for each macroblock row in image
+            /* for each macroblock row in image */
             for (mb_row = 0; mb_row < cm->mb_rows; mb_row++)
             {
                 vp8_zero(cm->left_context)
@@ -847,13 +855,13 @@
 
                 encode_mb_row(cpi, cm, mb_row, x, xd, &tp, segment_counts, &totalrate);
 
-                // adjust to the next row of mbs
+                /* adjust to the next row of mbs */
                 x->src.y_buffer += 16 * x->src.y_stride - 16 * cm->mb_cols;
                 x->src.u_buffer += 8 * x->src.uv_stride - 8 * cm->mb_cols;
                 x->src.v_buffer += 8 * x->src.uv_stride - 8 * cm->mb_cols;
             }
 
-            cpi->tok_count = tp - cpi->tok;
+            cpi->tok_count = (unsigned int)(tp - cpi->tok);
         }
 
 #if CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING
@@ -873,12 +881,13 @@
 
 
     // Work out the segment probabilities if segmentation is enabled
-    if (xd->segmentation_enabled)
+    // and needs to be updated
+    if (xd->segmentation_enabled && xd->update_mb_segmentation_map)
     {
         int tot_count;
         int i;
 
-        // Set to defaults
+        /* Set to defaults */
         vpx_memset(xd->mb_segment_tree_probs, 255 , sizeof(xd->mb_segment_tree_probs));
 
         tot_count = segment_counts[0] + segment_counts[1] + segment_counts[2] + segment_counts[3];
@@ -899,7 +908,7 @@
             if (tot_count > 0)
                 xd->mb_segment_tree_probs[2] = (segment_counts[2] * 255) / tot_count;
 
-            // Zero probabilities not allowed
+            /* Zero probabilities not allowed */
             for (i = 0; i < MB_FEATURE_TREE_PROBS; i ++)
             {
                 if (xd->mb_segment_tree_probs[i] == 0)
@@ -908,10 +917,10 @@
         }
     }
 
-    // 256 rate units to the bit
-    cpi->projected_frame_size = totalrate >> 8;   // projected_frame_size in units of BYTES
+    /* projected_frame_size in units of BYTES */
+    cpi->projected_frame_size = totalrate >> 8;
 
-    // Make a note of the percentage MBs coded Intra.
+    /* Make a note of the percentage MBs coded Intra. */
     if (cm->frame_type == KEY_FRAME)
     {
         cpi->this_frame_percent_intra = 100;
@@ -961,9 +970,11 @@
 #endif
 
 #if ! CONFIG_REALTIME_ONLY
-    // Adjust the projected reference frame usage probability numbers to reflect
-    // what we have just seen. This may be useful when we make multiple iterations
-    // of the recode loop rather than continuing to use values from the previous frame.
+    /* Adjust the projected reference frame usage probability numbers to
+     * reflect what we have just seen. This may be useful when we make
+     * multiple iterations of the recode loop rather than continuing to use
+     * values from the previous frame.
+     */
     if ((cm->frame_type != KEY_FRAME) && ((cpi->oxcf.number_of_layers > 1) ||
         (!cm->refresh_alt_ref_frame && !cm->refresh_golden_frame)))
     {
@@ -1017,16 +1028,13 @@
 
     vp8_build_block_doffsets(&x->e_mbd);
 
-    // y blocks
+    /* y blocks */
     x->thismb_ptr = &x->thismb[0];
     for (br = 0; br < 4; br++)
     {
         for (bc = 0; bc < 4; bc++)
         {
             BLOCK *this_block = &x->block[block];
-            //this_block->base_src = &x->src.y_buffer;
-            //this_block->src_stride = x->src.y_stride;
-            //this_block->src = 4 * br * this_block->src_stride + 4 * bc;
             this_block->base_src = &x->thismb_ptr;
             this_block->src_stride = 16;
             this_block->src = 4 * br * 16 + 4 * bc;
@@ -1034,7 +1042,7 @@
         }
     }
 
-    // u blocks
+    /* u blocks */
     for (br = 0; br < 2; br++)
     {
         for (bc = 0; bc < 2; bc++)
@@ -1047,7 +1055,7 @@
         }
     }
 
-    // v blocks
+    /* v blocks */
     for (br = 0; br < 2; br++)
     {
         for (bc = 0; bc < 2; bc++)
@@ -1092,8 +1100,9 @@
 
 }
 
-// Experimental stub function to create a per MB zbin adjustment based on
-// some previously calculated measure of MB activity.
+/* Experimental stub function to create a per MB zbin adjustment based on
+ * some previously calculated measure of MB activity.
+ */
 static void adjust_act_zbin( VP8_COMP *cpi, MACROBLOCK *x )
 {
 #if USE_ACT_INDEX
@@ -1103,7 +1112,7 @@
     int64_t b;
     int64_t act = *(x->mb_activity_ptr);
 
-    // Apply the masking to the RD multiplier.
+    /* Apply the masking to the RD multiplier. */
     a = act + 4*cpi->activity_avg;
     b = 4*act + cpi->activity_avg;
 
@@ -1176,7 +1185,7 @@
         x->encode_breakout = cpi->oxcf.encode_breakout;
 
 #if CONFIG_TEMPORAL_DENOISING
-    // Reset the best sse mode/mv for each macroblock.
+    /* Reset the best sse mode/mv for each macroblock. */
     x->best_reference_frame = INTRA_FRAME;
     x->best_zeromv_reference_frame = INTRA_FRAME;
     x->best_sse_inter_mode = 0;
@@ -1223,23 +1232,23 @@
 
     if(cpi->oxcf.tuning == VP8_TUNE_SSIM)
     {
-        // Adjust the zbin based on this MB rate.
+        /* Adjust the zbin based on this MB rate. */
         adjust_act_zbin( cpi, x );
     }
 
 #if 0
-    // Experimental RD code
+    /* Experimental RD code */
     cpi->frame_distortion += distortion;
     cpi->last_mb_distortion = distortion;
 #endif
 
-    // MB level adjutment to quantizer setup
+    /* MB level adjustment to quantizer setup */
     if (xd->segmentation_enabled)
     {
-        // If cyclic update enabled
+        /* If cyclic update enabled */
         if (cpi->current_layer == 0 && cpi->cyclic_refresh_mode_enabled)
         {
-            // Clear segment_id back to 0 if not coded (last frame 0,0)
+            /* Clear segment_id back to 0 if not coded (last frame 0,0) */
             if ((xd->mode_info_context->mbmi.segment_id == 1) &&
                 ((xd->mode_info_context->mbmi.ref_frame != LAST_FRAME) || (xd->mode_info_context->mbmi.mode != ZEROMV)))
             {
@@ -1252,8 +1261,9 @@
     }
 
     {
-        // Experimental code. Special case for gf and arf zeromv modes.
-        // Increase zbin size to supress noise
+        /* Experimental code. Special case for gf and arf zeromv modes.
+         * Increase zbin size to suppress noise
+         */
         cpi->zbin_mode_boost = 0;
         if (cpi->zbin_mode_boost_enabled)
         {
diff --git a/vp8/encoder/encodeintra.c b/vp8/encoder/encodeintra.c
index 1f445b7..340dd63 100644
--- a/vp8/encoder/encodeintra.c
+++ b/vp8/encoder/encodeintra.c
@@ -54,10 +54,13 @@
     BLOCKD *b = &x->e_mbd.block[ib];
     BLOCK *be = &x->block[ib];
     int dst_stride = x->e_mbd.dst.y_stride;
-    unsigned char *base_dst = x->e_mbd.dst.y_buffer;
+    unsigned char *dst = x->e_mbd.dst.y_buffer + b->offset;
+    unsigned char *Above = dst - dst_stride;
+    unsigned char *yleft = dst - 1;
+    unsigned char top_left = Above[-1];
 
-    vp8_intra4x4_predict(base_dst + b->offset, dst_stride,
-                 b->bmi.as_mode, b->predictor, 16);
+    vp8_intra4x4_predict(Above, yleft, dst_stride, b->bmi.as_mode,
+                         b->predictor, 16, top_left);
 
     vp8_subtract_b(be, b, 16);
 
@@ -67,14 +70,11 @@
 
     if (*b->eob > 1)
     {
-      vp8_short_idct4x4llm(b->dqcoeff,
-            b->predictor, 16, base_dst + b->offset, dst_stride);
+      vp8_short_idct4x4llm(b->dqcoeff, b->predictor, 16, dst, dst_stride);
     }
     else
     {
-      vp8_dc_only_idct_add
-            (b->dqcoeff[0], b->predictor, 16, base_dst + b->offset,
-                dst_stride);
+      vp8_dc_only_idct_add(b->dqcoeff[0], b->predictor, 16, dst, dst_stride);
     }
 }
 
diff --git a/vp8/encoder/encodemb.c b/vp8/encoder/encodemb.c
index f89e4f7..7d494f2 100644
--- a/vp8/encoder/encodemb.c
+++ b/vp8/encoder/encodemb.c
@@ -137,10 +137,10 @@
             &x->block[i].coeff[0], 32);
     }
 
-    // build dc block from 16 y dc values
+    /* build dc block from 16 y dc values */
     build_dcblock(x);
 
-    // do 2nd order transform on the dc block
+    /* do 2nd order transform on the dc block */
     x->short_walsh4x4(&x->block[24].src_diff[0],
         &x->block[24].coeff[0], 8);
 
@@ -157,7 +157,7 @@
             &x->block[i].coeff[0], 32);
     }
 
-    // build dc block from 16 y dc values
+    /* build dc block from 16 y dc values */
     if (x->e_mbd.mode_info_context->mbmi.mode != SPLITMV)
         build_dcblock(x);
 
@@ -167,7 +167,7 @@
             &x->block[i].coeff[0], 16);
     }
 
-    // do 2nd order transform on the dc block
+    /* do 2nd order transform on the dc block */
     if (x->e_mbd.mode_info_context->mbmi.mode != SPLITMV)
         x->short_walsh4x4(&x->block[24].src_diff[0],
         &x->block[24].coeff[0], 8);
@@ -185,7 +185,7 @@
             &x->block[i].coeff[0], 32);
     }
 
-    // build dc block from 16 y dc values
+    /* build dc block from 16 y dc values */
     if (x->e_mbd.mode_info_context->mbmi.mode != SPLITMV)
     {
         build_dcblock(x);
@@ -208,7 +208,7 @@
   short         qc;
 };
 
-// TODO: experiments to find optimal multiple numbers
+/* TODO: experiments to find optimal multiple numbers */
 #define Y1_RD_MULT 4
 #define UV_RD_MULT 2
 #define Y2_RD_MULT 16
diff --git a/vp8/encoder/encodemv.c b/vp8/encoder/encodemv.c
index 0145f6d..7d8c84d 100644
--- a/vp8/encoder/encodemv.c
+++ b/vp8/encoder/encodemv.c
@@ -29,15 +29,15 @@
     const vp8_prob *p = mvc->prob;
     const int x = v < 0 ? -v : v;
 
-    if (x < mvnum_short)     // Small
+    if (x < mvnum_short)     /* Small */
     {
         vp8_write(w, 0, p [mvpis_short]);
         vp8_treed_write(w, vp8_small_mvtree, p + MVPshort, x, 3);
 
         if (!x)
-            return;         // no sign bit
+            return;         /* no sign bit */
     }
-    else                    // Large
+    else                    /* Large */
     {
         int i = 0;
 
@@ -100,7 +100,7 @@
 static unsigned int cost_mvcomponent(const int v, const struct mv_context *mvc)
 {
     const vp8_prob *p = mvc->prob;
-    const int x = v;   //v<0? -v:v;
+    const int x = v;
     unsigned int cost;
 
     if (x < mvnum_short)
@@ -132,12 +132,12 @@
             cost += vp8_cost_bit(p [MVPbits + 3], (x >> 3) & 1);
     }
 
-    return cost;   // + vp8_cost_bit( p [MVPsign], v < 0);
+    return cost;   /* + vp8_cost_bit( p [MVPsign], v < 0); */
 }
 
 void vp8_build_component_cost_table(int *mvcost[2], const MV_CONTEXT *mvc, int mvc_flag[2])
 {
-    int i = 1;   //-mv_max;
+    int i = 1;
     unsigned int cost0 = 0;
     unsigned int cost1 = 0;
 
@@ -151,7 +151,6 @@
 
         do
         {
-            //mvcost [0] [i] = cost_mvcomponent( i, &mvc[0]);
             cost0 = cost_mvcomponent(i, &mvc[0]);
 
             mvcost [0] [i] = cost0 + vp8_cost_zero(mvc[0].prob[MVPsign]);
@@ -168,7 +167,6 @@
 
         do
         {
-            //mvcost [1] [i] = cost_mvcomponent( i, mvc[1]);
             cost1 = cost_mvcomponent(i, &mvc[1]);
 
             mvcost [1] [i] = cost1 + vp8_cost_zero(mvc[1].prob[MVPsign]);
@@ -179,10 +177,10 @@
 }
 
 
-// Motion vector probability table update depends on benefit.
-// Small correction allows for the fact that an update to an MV probability
-// may have benefit in subsequent frames as well as the current one.
-
+/* Motion vector probability table update depends on benefit.
+ * Small correction allows for the fact that an update to an MV probability
+ * may have benefit in subsequent frames as well as the current one.
+ */
 #define MV_PROB_UPDATE_CORRECTION   -1
 
 
@@ -254,22 +252,22 @@
     vp8_zero(short_bct)
 
 
-    //j=0
+    /* j=0 */
     {
         const int c = events [mv_max];
 
-        is_short_ct [0] += c;     // Short vector
-        short_ct [0] += c;       // Magnitude distribution
+        is_short_ct [0] += c;     /* Short vector */
+        short_ct [0] += c;       /* Magnitude distribution */
     }
 
-    //j: 1 ~ mv_max (1023)
+    /* j: 1 ~ mv_max (1023) */
     {
         int j = 1;
 
         do
         {
-            const int c1 = events [mv_max + j];  //positive
-            const int c2 = events [mv_max - j];  //negative
+            const int c1 = events [mv_max + j];  /* positive */
+            const int c2 = events [mv_max - j];  /* negative */
             const int c  = c1 + c2;
             int a = j;
 
@@ -278,13 +276,13 @@
 
             if (a < mvnum_short)
             {
-                is_short_ct [0] += c;     // Short vector
-                short_ct [a] += c;       // Magnitude distribution
+                is_short_ct [0] += c;     /* Short vector */
+                short_ct [a] += c;       /* Magnitude distribution */
             }
             else
             {
                 int k = mvlong_width - 1;
-                is_short_ct [1] += c;     // Long vector
+                is_short_ct [1] += c;     /* Long vector */
 
                 /*  bit 3 not always encoded. */
                 do
@@ -296,43 +294,6 @@
         while (++j <= mv_max);
     }
 
-    /*
-    {
-        int j = -mv_max;
-        do
-        {
-
-            const int c = events [mv_max + j];
-            int a = j;
-
-            if( j < 0)
-            {
-                sign_ct [1] += c;
-                a = -j;
-            }
-            else if( j)
-                sign_ct [0] += c;
-
-            if( a < mvnum_short)
-            {
-                is_short_ct [0] += c;     // Short vector
-                short_ct [a] += c;       // Magnitude distribution
-            }
-            else
-            {
-                int k = mvlong_width - 1;
-                is_short_ct [1] += c;     // Long vector
-
-                //  bit 3 not always encoded.
-
-                do
-                    bit_ct [k] [(a >> k) & 1] += c;
-                while( --k >= 0);
-            }
-        } while( ++j <= mv_max);
-    }
-    */
-
     calc_prob(Pnew + mvpis_short, is_short_ct);
 
     calc_prob(Pnew + MVPsign, sign_ct);
diff --git a/vp8/encoder/ethreading.c b/vp8/encoder/ethreading.c
index 57d5783..d92462b 100644
--- a/vp8/encoder/ethreading.c
+++ b/vp8/encoder/ethreading.c
@@ -39,7 +39,7 @@
 
         if (sem_wait(&cpi->h_event_start_lpf) == 0)
         {
-            if (cpi->b_multi_threaded == 0) // we're shutting down
+            if (cpi->b_multi_threaded == 0) /* we're shutting down */
                 break;
 
             vp8_loopfilter_frame(cpi, cm);
@@ -59,17 +59,14 @@
     MB_ROW_COMP *mbri = (MB_ROW_COMP *)(((ENCODETHREAD_DATA *)p_data)->ptr2);
     ENTROPY_CONTEXT_PLANES mb_row_left_context;
 
-    const int nsync = cpi->mt_sync_range;
-    //printf("Started thread %d\n", ithread);
-
     while (1)
     {
         if (cpi->b_multi_threaded == 0)
             break;
 
-        //if(WaitForSingleObject(cpi->h_event_mbrencoding[ithread], INFINITE) == WAIT_OBJECT_0)
         if (sem_wait(&cpi->h_event_start_encoding[ithread]) == 0)
         {
+            const int nsync = cpi->mt_sync_range;
             VP8_COMMON *cm = &cpi->common;
             int mb_row;
             MACROBLOCK *x = &mbri->mb;
@@ -83,7 +80,7 @@
             int *segment_counts = mbri->segment_counts;
             int *totalrate = &mbri->totalrate;
 
-            if (cpi->b_multi_threaded == 0) // we're shutting down
+            if (cpi->b_multi_threaded == 0) /* we're shutting down */
                 break;
 
             for (mb_row = ithread + 1; mb_row < cm->mb_rows; mb_row += (cpi->encoding_thread_count + 1))
@@ -108,7 +105,7 @@
 
                 last_row_current_mb_col = &cpi->mt_current_mb_col[mb_row - 1];
 
-                // reset above block coeffs
+                /* reset above block coeffs */
                 xd->above_context = cm->above_context;
                 xd->left_context = &mb_row_left_context;
 
@@ -118,10 +115,10 @@
                 recon_yoffset = (mb_row * recon_y_stride * 16);
                 recon_uvoffset = (mb_row * recon_uv_stride * 8);
 
-                // Set the mb activity pointer to the start of the row.
+                /* Set the mb activity pointer to the start of the row. */
                 x->mb_activity_ptr = &cpi->mb_activity_map[map_index];
 
-                // for each macroblock col in image
+                /* for each macroblock col in image */
                 for (mb_col = 0; mb_col < cm->mb_cols; mb_col++)
                 {
                     *current_mb_col = mb_col - 1;
@@ -139,14 +136,18 @@
                     tp = tp_start;
 #endif
 
-                    // Distance of Mb to the various image edges.
-                    // These specified to 8th pel as they are always compared to values that are in 1/8th pel units
+                    /* Distance of Mb to the various image edges.
+                     * These specified to 8th pel as they are always compared
+                     * to values that are in 1/8th pel units
+                     */
                     xd->mb_to_left_edge = -((mb_col * 16) << 3);
                     xd->mb_to_right_edge = ((cm->mb_cols - 1 - mb_col) * 16) << 3;
                     xd->mb_to_top_edge = -((mb_row * 16) << 3);
                     xd->mb_to_bottom_edge = ((cm->mb_rows - 1 - mb_row) * 16) << 3;
 
-                    // Set up limit values for motion vectors used to prevent them extending outside the UMV borders
+                    /* Set up limit values for motion vectors used to prevent
+                     * them extending outside the UMV borders
+                     */
                     x->mv_col_min = -((mb_col * 16) + (VP8BORDERINPIXELS - 16));
                     x->mv_col_max = ((cm->mb_cols - 1 - mb_col) * 16) + (VP8BORDERINPIXELS - 16);
                     x->mv_row_min = -((mb_row * 16) + (VP8BORDERINPIXELS - 16));
@@ -160,17 +161,19 @@
                     x->rddiv = cpi->RDDIV;
                     x->rdmult = cpi->RDMULT;
 
-                    //Copy current mb to a buffer
+                    /* Copy current mb to a buffer */
                     vp8_copy_mem16x16(x->src.y_buffer, x->src.y_stride, x->thismb, 16);
 
                     if (cpi->oxcf.tuning == VP8_TUNE_SSIM)
                         vp8_activity_masking(cpi, x);
 
-                    // Is segmentation enabled
-                    // MB level adjustment to quantizer
+                    /* Is segmentation enabled */
+                    /* MB level adjustment to quantizer */
                     if (xd->segmentation_enabled)
                     {
-                        // Code to set segment id in xd->mbmi.segment_id for current MB (with range checking)
+                        /* Code to set segment id in xd->mbmi.segment_id for
+                         * current MB (with range checking)
+                         */
                         if (cpi->segmentation_map[map_index + mb_col] <= 3)
                             xd->mode_info_context->mbmi.segment_id = cpi->segmentation_map[map_index + mb_col];
                         else
@@ -179,7 +182,8 @@
                         vp8cx_mb_init_quantizer(cpi, x, 1);
                     }
                     else
-                        xd->mode_info_context->mbmi.segment_id = 0; // Set to Segment 0 by default
+                        /* Set to Segment 0 by default */
+                        xd->mode_info_context->mbmi.segment_id = 0;
 
                     x->active_ptr = cpi->active_map + map_index + mb_col;
 
@@ -209,21 +213,26 @@
 
 #endif
 
-                        // Count of last ref frame 0,0 usage
-                        if ((xd->mode_info_context->mbmi.mode == ZEROMV) && (xd->mode_info_context->mbmi.ref_frame == LAST_FRAME))
-                            cpi->inter_zz_count++;
-
-                        // Special case code for cyclic refresh
-                        // If cyclic update enabled then copy xd->mbmi.segment_id; (which may have been updated based on mode
-                        // during vp8cx_encode_inter_macroblock()) back into the global segmentation map
+                        /* Special case code for cyclic refresh
+                         * If cyclic update enabled then copy
+                         * xd->mbmi.segment_id; (which may have been updated
+                         * based on mode during
+                         * vp8cx_encode_inter_macroblock()) back into the
+                         * global segmentation map
+                         */
                         if (cpi->cyclic_refresh_mode_enabled && xd->segmentation_enabled)
                         {
                             const MB_MODE_INFO * mbmi = &xd->mode_info_context->mbmi;
                             cpi->segmentation_map[map_index + mb_col] = mbmi->segment_id;
 
-                            // If the block has been refreshed mark it as clean (the magnitude of the -ve influences how long it will be before we consider another refresh):
-                            // Else if it was coded (last frame 0,0) and has not already been refreshed then mark it as a candidate for cleanup next time (marked 0)
-                            // else mark it as dirty (1).
+                            /* If the block has been refreshed mark it as clean
+                             * (the magnitude of the -ve influences how long it
+                             * will be before we consider another refresh):
+                             * Else if it was coded (last frame 0,0) and has
+                             * not already been refreshed then mark it as a
+                             * candidate for cleanup next time (marked 0) else
+                             * mark it as dirty (1).
+                             */
                             if (mbmi->segment_id)
                                 cpi->cyclic_refresh_map[map_index + mb_col] = -1;
                             else if ((mbmi->mode == ZEROMV) && (mbmi->ref_frame == LAST_FRAME))
@@ -246,13 +255,13 @@
 #else
                     cpi->tplist[mb_row].stop = tp;
 #endif
-                    // Increment pointer into gf usage flags structure.
+                    /* Increment pointer into gf usage flags structure. */
                     x->gf_active_ptr++;
 
-                    // Increment the activity mask pointers.
+                    /* Increment the activity mask pointers. */
                     x->mb_activity_ptr++;
 
-                    // adjust to the next column of macroblocks
+                    /* adjust to the next column of macroblocks */
                     x->src.y_buffer += 16;
                     x->src.u_buffer += 8;
                     x->src.v_buffer += 8;
@@ -260,10 +269,10 @@
                     recon_yoffset += 16;
                     recon_uvoffset += 8;
 
-                    // Keep track of segment usage
+                    /* Keep track of segment usage */
                     segment_counts[xd->mode_info_context->mbmi.segment_id]++;
 
-                    // skip to next mb
+                    /* skip to next mb */
                     xd->mode_info_context++;
                     x->partition_info++;
                     xd->above_context++;
@@ -276,7 +285,7 @@
 
                 *current_mb_col = mb_col + nsync;
 
-                // this is to account for the border
+                /* this is to account for the border */
                 xd->mode_info_context++;
                 x->partition_info++;
 
@@ -296,7 +305,7 @@
         }
     }
 
-    //printf("exit thread %d\n", ithread);
+    /* printf("exit thread %d\n", ithread); */
     return 0;
 }
 
@@ -460,10 +469,6 @@
 
         vp8_build_block_offsets(mb);
 
-        vp8_setup_block_dptrs(mbd);
-
-        vp8_setup_block_ptrs(mb);
-
         mbd->left_context = &cm->left_context;
         mb->mvc = cm->fc.mvc;
 
@@ -508,8 +513,6 @@
         vpx_memset(cpi->mb_row_ei, 0, sizeof(MB_ROW_COMP) * th_count);
         CHECK_MEM_ERROR(cpi->en_thread_data,
                         vpx_malloc(sizeof(ENCODETHREAD_DATA) * th_count));
-        CHECK_MEM_ERROR(cpi->mt_current_mb_col,
-                        vpx_malloc(sizeof(*cpi->mt_current_mb_col) * cm->mb_rows));
 
         sem_init(&cpi->h_event_end_encoding, 0, 0);
 
@@ -523,9 +526,14 @@
 
         for (ithread = 0; ithread < th_count; ithread++)
         {
-            ENCODETHREAD_DATA * ethd = &cpi->en_thread_data[ithread];
+            ENCODETHREAD_DATA *ethd = &cpi->en_thread_data[ithread];
+
+            /* Setup block ptrs and offsets */
+            vp8_setup_block_ptrs(&cpi->mb_row_ei[ithread].mb);
+            vp8_setup_block_dptrs(&cpi->mb_row_ei[ithread].mb.e_mbd);
 
             sem_init(&cpi->h_event_start_encoding[ithread], 0, 0);
+
             ethd->ithread = ithread;
             ethd->ptr1 = (void *)cpi;
             ethd->ptr2 = (void *)&cpi->mb_row_ei[ithread];
@@ -550,14 +558,13 @@
 {
     if (cpi->b_multi_threaded)
     {
-        //shutdown other threads
+        /* shutdown other threads */
         cpi->b_multi_threaded = 0;
         {
             int i;
 
             for (i = 0; i < cpi->encoding_thread_count; i++)
             {
-                //SetEvent(cpi->h_event_mbrencoding[i]);
                 sem_post(&cpi->h_event_start_encoding[i]);
                 pthread_join(cpi->h_encoding_thread[i], 0);
 
@@ -572,12 +579,11 @@
         sem_destroy(&cpi->h_event_end_lpf);
         sem_destroy(&cpi->h_event_start_lpf);
 
-        //free thread related resources
+        /* free thread related resources */
         vpx_free(cpi->h_event_start_encoding);
         vpx_free(cpi->h_encoding_thread);
         vpx_free(cpi->mb_row_ei);
         vpx_free(cpi->en_thread_data);
-        vpx_free(cpi->mt_current_mb_col);
     }
 }
 #endif
diff --git a/vp8/encoder/firstpass.c b/vp8/encoder/firstpass.c
index 8de1a6a..b668c8f 100644
--- a/vp8/encoder/firstpass.c
+++ b/vp8/encoder/firstpass.c
@@ -30,14 +30,12 @@
 #include "encodemv.h"
 #include "encodeframe.h"
 
-//#define OUTPUT_FPF 1
+/* #define OUTPUT_FPF 1 */
 
 extern void vp8cx_frame_init_quantizer(VP8_COMP *cpi);
 extern void vp8_set_mbmode_and_mvs(MACROBLOCK *x, MB_PREDICTION_MODE mb, int_mv *mv);
 extern void vp8_alloc_compressor_data(VP8_COMP *cpi);
 
-//#define GFQ_ADJUSTMENT (40 + ((15*Q)/10))
-//#define GFQ_ADJUSTMENT (80 + ((15*Q)/10))
 #define GFQ_ADJUSTMENT vp8_gf_boost_qadjustment[Q]
 extern int vp8_kf_boost_qadjustment[QINDEX_RANGE];
 
@@ -77,7 +75,9 @@
 
 static void find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame);
 
-// Resets the first pass file to the given position using a relative seek from the current position
+/* Resets the first pass file to the given position using a relative seek
+ * from the current position
+ */
 static void reset_fpf_position(VP8_COMP *cpi, FIRSTPASS_STATS *Position)
 {
     cpi->twopass.stats_in = Position;
@@ -92,14 +92,14 @@
     return 1;
 }
 
-// Read frame stats at an offset from the current position
+/* Read frame stats at an offset from the current position */
 static int read_frame_stats( VP8_COMP *cpi,
                              FIRSTPASS_STATS *frame_stats,
                              int offset )
 {
     FIRSTPASS_STATS * fps_ptr = cpi->twopass.stats_in;
 
-    // Check legality of offset
+    /* Check legality of offset */
     if ( offset >= 0 )
     {
         if ( &fps_ptr[offset] >= cpi->twopass.stats_in_end )
@@ -136,7 +136,7 @@
     pkt.data.twopass_stats.sz = sizeof(FIRSTPASS_STATS);
     vpx_codec_pkt_list_add(pktlist, &pkt);
 
-// TEMP debug code
+/* TEMP debug code */
 #if OUTPUT_FPF
 
     {
@@ -257,7 +257,9 @@
     section->duration   /= section->count;
 }
 
-// Calculate a modified Error used in distributing bits between easier and harder frames
+/* Calculate a modified Error used in distributing bits between easier
+ * and harder frames
+ */
 static double calculate_modified_err(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
 {
     double av_err = ( cpi->twopass.total_stats.ssim_weighted_pred_err /
@@ -315,7 +317,9 @@
     unsigned char *src = source->y_buffer;
     double sum_weights = 0.0;
 
-    // Loop throught the Y plane raw examining levels and creating a weight for the image
+    /* Loop through the Y plane raw examining levels and creating a weight
+     * for the image
+     */
     i = source->y_height;
     do
     {
@@ -335,41 +339,52 @@
 }
 
 
-// This function returns the current per frame maximum bitrate target
+/* This function returns the current per frame maximum bitrate target */
 static int frame_max_bits(VP8_COMP *cpi)
 {
-    // Max allocation for a single frame based on the max section guidelines passed in and how many bits are left
+    /* Max allocation for a single frame based on the max section guidelines
+     * passed in and how many bits are left
+     */
     int max_bits;
 
-    // For CBR we need to also consider buffer fullness.
-    // If we are running below the optimal level then we need to gradually tighten up on max_bits.
+    /* For CBR we need to also consider buffer fullness.
+     * If we are running below the optimal level then we need to gradually
+     * tighten up on max_bits.
+     */
     if (cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER)
     {
         double buffer_fullness_ratio = (double)cpi->buffer_level / DOUBLE_DIVIDE_CHECK((double)cpi->oxcf.optimal_buffer_level);
 
-        // For CBR base this on the target average bits per frame plus the maximum sedction rate passed in by the user
+        /* For CBR base this on the target average bits per frame plus the
+         * maximum section rate passed in by the user
+         */
         max_bits = (int)(cpi->av_per_frame_bandwidth * ((double)cpi->oxcf.two_pass_vbrmax_section / 100.0));
 
-        // If our buffer is below the optimum level
+        /* If our buffer is below the optimum level */
         if (buffer_fullness_ratio < 1.0)
         {
-            // The lower of max_bits / 4 or cpi->av_per_frame_bandwidth / 4.
+            /* The lower of max_bits / 4 or cpi->av_per_frame_bandwidth / 4. */
             int min_max_bits = ((cpi->av_per_frame_bandwidth >> 2) < (max_bits >> 2)) ? cpi->av_per_frame_bandwidth >> 2 : max_bits >> 2;
 
             max_bits = (int)(max_bits * buffer_fullness_ratio);
 
+            /* Lowest value we will set ... which should allow the buffer to
+             * refill.
+             */
             if (max_bits < min_max_bits)
-                max_bits = min_max_bits;       // Lowest value we will set ... which should allow the buffer to refil.
+                max_bits = min_max_bits;
         }
     }
-    // VBR
+    /* VBR */
     else
     {
-        // For VBR base this on the bits and frames left plus the two_pass_vbrmax_section rate passed in by the user
+        /* For VBR base this on the bits and frames left plus the
+         * two_pass_vbrmax_section rate passed in by the user
+         */
         max_bits = (int)(((double)cpi->twopass.bits_left / (cpi->twopass.total_stats.count - (double)cpi->common.current_video_frame)) * ((double)cpi->oxcf.two_pass_vbrmax_section / 100.0));
     }
 
-    // Trap case where we are out of bits
+    /* Trap case where we are out of bits */
     if (max_bits < 0)
         max_bits = 0;
 
@@ -403,13 +418,13 @@
     unsigned char *ref_ptr;
     int ref_stride = x->e_mbd.pre.y_stride;
 
-    // Set up pointers for this macro block raw buffer
+    /* Set up pointers for this macro block raw buffer */
     raw_ptr = (unsigned char *)(raw_buffer->y_buffer + recon_yoffset
                                 + d->offset);
     vp8_mse16x16 ( src_ptr, src_stride, raw_ptr, raw_stride,
                    (unsigned int *)(raw_motion_err));
 
-    // Set up pointers for this macro block recon buffer
+    /* Set up pointers for this macro block recon buffer */
     xd->pre.y_buffer = recon_buffer->y_buffer + recon_yoffset;
     ref_ptr = (unsigned char *)(xd->pre.y_buffer + d->offset );
     vp8_mse16x16 ( src_ptr, src_stride, ref_ptr, ref_stride,
@@ -430,19 +445,19 @@
     int_mv ref_mv_full;
 
     int tmp_err;
-    int step_param = 3;                                       //3;          // Dont search over full range for first pass
-    int further_steps = (MAX_MVSEARCH_STEPS - 1) - step_param; //3;
+    int step_param = 3; /* Dont search over full range for first pass */
+    int further_steps = (MAX_MVSEARCH_STEPS - 1) - step_param;
     int n;
     vp8_variance_fn_ptr_t v_fn_ptr = cpi->fn_ptr[BLOCK_16X16];
     int new_mv_mode_penalty = 256;
 
-    // override the default variance function to use MSE
+    /* override the default variance function to use MSE */
     v_fn_ptr.vf    = vp8_mse16x16;
 
-    // Set up pointers for this macro block recon buffer
+    /* Set up pointers for this macro block recon buffer */
     xd->pre.y_buffer = recon_buffer->y_buffer + recon_yoffset;
 
-    // Initial step/diamond search centred on best mv
+    /* Initial step/diamond search centred on best mv */
     tmp_mv.as_int = 0;
     ref_mv_full.as_mv.col = ref_mv->as_mv.col>>3;
     ref_mv_full.as_mv.row = ref_mv->as_mv.row>>3;
@@ -459,7 +474,7 @@
         best_mv->col = tmp_mv.as_mv.col;
     }
 
-    // Further step/diamond searches as necessary
+    /* Further step/diamond searches as necessary */
     n = num00;
     num00 = 0;
 
@@ -520,7 +535,7 @@
 
     zero_ref_mv.as_int = 0;
 
-    vp8_clear_system_state();  //__asm emms;
+    vp8_clear_system_state();
 
     x->src = * cpi->Source;
     xd->pre = *lst_yv12;
@@ -530,19 +545,28 @@
 
     xd->mode_info_context = cm->mi;
 
+    if(!cm->use_bilinear_mc_filter)
+    {
+         xd->subpixel_predict        = vp8_sixtap_predict4x4;
+         xd->subpixel_predict8x4     = vp8_sixtap_predict8x4;
+         xd->subpixel_predict8x8     = vp8_sixtap_predict8x8;
+         xd->subpixel_predict16x16   = vp8_sixtap_predict16x16;
+     }
+     else
+     {
+         xd->subpixel_predict        = vp8_bilinear_predict4x4;
+         xd->subpixel_predict8x4     = vp8_bilinear_predict8x4;
+         xd->subpixel_predict8x8     = vp8_bilinear_predict8x8;
+         xd->subpixel_predict16x16   = vp8_bilinear_predict16x16;
+     }
+
     vp8_build_block_offsets(x);
 
-    vp8_setup_block_dptrs(&x->e_mbd);
-
-    vp8_setup_block_ptrs(x);
-
-    // set up frame new frame for intra coded blocks
+    /* set up new frame for intra coded blocks */
     vp8_setup_intra_recon(new_yv12);
     vp8cx_frame_init_quantizer(cpi);
 
-    // Initialise the MV cost table to the defaults
-    //if( cm->current_video_frame == 0)
-    //if ( 0 )
+    /* Initialise the MV cost table to the defaults */
     {
         int flag[2] = {1, 1};
         vp8_initialize_rd_consts(cpi, vp8_dc_quant(cm->base_qindex, cm->y1dc_delta_q));
@@ -550,24 +574,26 @@
         vp8_build_component_cost_table(cpi->mb.mvcost, (const MV_CONTEXT *) cm->fc.mvc, flag);
     }
 
-    // for each macroblock row in image
+    /* for each macroblock row in image */
     for (mb_row = 0; mb_row < cm->mb_rows; mb_row++)
     {
         int_mv best_ref_mv;
 
         best_ref_mv.as_int = 0;
 
-        // reset above block coeffs
+        /* reset above block coeffs */
         xd->up_available = (mb_row != 0);
         recon_yoffset = (mb_row * recon_y_stride * 16);
         recon_uvoffset = (mb_row * recon_uv_stride * 8);
 
-        // Set up limit values for motion vectors to prevent them extending outside the UMV borders
+        /* Set up limit values for motion vectors to prevent them extending
+         * outside the UMV borders
+         */
         x->mv_row_min = -((mb_row * 16) + (VP8BORDERINPIXELS - 16));
         x->mv_row_max = ((cm->mb_rows - 1 - mb_row) * 16) + (VP8BORDERINPIXELS - 16);
 
 
-        // for each macroblock col in image
+        /* for each macroblock col in image */
         for (mb_col = 0; mb_col < cm->mb_cols; mb_col++)
         {
             int this_error;
@@ -579,26 +605,33 @@
             xd->dst.v_buffer = new_yv12->v_buffer + recon_uvoffset;
             xd->left_available = (mb_col != 0);
 
-            //Copy current mb to a buffer
+            /* Copy current mb to a buffer */
             vp8_copy_mem16x16(x->src.y_buffer, x->src.y_stride, x->thismb, 16);
 
-            // do intra 16x16 prediction
+            /* do intra 16x16 prediction */
             this_error = vp8_encode_intra(cpi, x, use_dc_pred);
 
-            // "intrapenalty" below deals with situations where the intra and inter error scores are very low (eg a plain black frame)
-            // We do not have special cases in first pass for 0,0 and nearest etc so all inter modes carry an overhead cost estimate fot the mv.
-            // When the error score is very low this causes us to pick all or lots of INTRA modes and throw lots of key frames.
-            // This penalty adds a cost matching that of a 0,0 mv to the intra case.
+            /* "intrapenalty" below deals with situations where the intra
+             * and inter error scores are very low (eg a plain black frame)
+             * We do not have special cases in first pass for 0,0 and
+             * nearest etc so all inter modes carry an overhead cost
+             * estimate for the mv. When the error score is very low this
+             * causes us to pick all or lots of INTRA modes and throw lots
+             * of key frames. This penalty adds a cost matching that of a
+             * 0,0 mv to the intra case.
+             */
             this_error += intrapenalty;
 
-            // Cumulative intra error total
+            /* Cumulative intra error total */
             intra_error += (int64_t)this_error;
 
-            // Set up limit values for motion vectors to prevent them extending outside the UMV borders
+            /* Set up limit values for motion vectors to prevent them
+             * extending outside the UMV borders
+             */
             x->mv_col_min = -((mb_col * 16) + (VP8BORDERINPIXELS - 16));
             x->mv_col_max = ((cm->mb_cols - 1 - mb_col) * 16) + (VP8BORDERINPIXELS - 16);
 
-            // Other than for the first frame do a motion search
+            /* Other than for the first frame do a motion search */
             if (cm->current_video_frame > 0)
             {
                 BLOCKD *d = &x->e_mbd.block[0];
@@ -607,7 +640,7 @@
                 int motion_error = INT_MAX;
                 int raw_motion_error = INT_MAX;
 
-                // Simple 0,0 motion with no mv overhead
+                /* Simple 0,0 motion with no mv overhead */
                 zz_motion_search( cpi, x, cpi->last_frame_unscaled_source,
                                   &raw_motion_error, lst_yv12, &motion_error,
                                   recon_yoffset );
@@ -617,13 +650,16 @@
                 if (raw_motion_error < cpi->oxcf.encode_breakout)
                     goto skip_motion_search;
 
-                // Test last reference frame using the previous best mv as the
-                // starting point (best reference) for the search
+                /* Test last reference frame using the previous best mv as the
+                 * starting point (best reference) for the search
+                 */
                 first_pass_motion_search(cpi, x, &best_ref_mv,
                                         &d->bmi.mv.as_mv, lst_yv12,
                                         &motion_error, recon_yoffset);
 
-                // If the current best reference mv is not centred on 0,0 then do a 0,0 based search as well
+                /* If the current best reference mv is not centred on 0,0
+                 * then do a 0,0 based search as well
+                 */
                 if (best_ref_mv.as_int)
                 {
                    tmp_err = INT_MAX;
@@ -638,7 +674,9 @@
                    }
                 }
 
-                // Experimental search in a second reference frame ((0,0) based only)
+                /* Experimental search in a second reference frame ((0,0)
+                 * based only)
+                 */
                 if (cm->current_video_frame > 1)
                 {
                     first_pass_motion_search(cpi, x, &zero_ref_mv, &tmp_mv, gld_yv12, &gf_motion_error, recon_yoffset);
@@ -646,19 +684,9 @@
                     if ((gf_motion_error < motion_error) && (gf_motion_error < this_error))
                     {
                         second_ref_count++;
-                        //motion_error = gf_motion_error;
-                        //d->bmi.mv.as_mv.row = tmp_mv.row;
-                        //d->bmi.mv.as_mv.col = tmp_mv.col;
                     }
-                    /*else
-                    {
-                        xd->pre.y_buffer = cm->last_frame.y_buffer + recon_yoffset;
-                        xd->pre.u_buffer = cm->last_frame.u_buffer + recon_uvoffset;
-                        xd->pre.v_buffer = cm->last_frame.v_buffer + recon_uvoffset;
-                    }*/
 
-
-                    // Reset to last frame as reference buffer
+                    /* Reset to last frame as reference buffer */
                     xd->pre.y_buffer = lst_yv12->y_buffer + recon_yoffset;
                     xd->pre.u_buffer = lst_yv12->u_buffer + recon_uvoffset;
                     xd->pre.v_buffer = lst_yv12->v_buffer + recon_uvoffset;
@@ -670,10 +698,11 @@
 
                 if (motion_error <= this_error)
                 {
-                    // Keep a count of cases where the inter and intra were
-                    // very close and very low. This helps with scene cut
-                    // detection for example in cropped clips with black bars
-                    // at the sides or top and bottom.
+                    /* Keep a count of cases where the inter and intra were
+                     * very close and very low. This helps with scene cut
+                     * detection for example in cropped clips with black bars
+                     * at the sides or top and bottom.
+                     */
                     if( (((this_error-intrapenalty) * 9) <=
                          (motion_error*10)) &&
                         (this_error < (2*intrapenalty)) )
@@ -696,17 +725,17 @@
 
                     best_ref_mv.as_int = d->bmi.mv.as_int;
 
-                    // Was the vector non-zero
+                    /* Was the vector non-zero */
                     if (d->bmi.mv.as_int)
                     {
                         mvcount++;
 
-                        // Was it different from the last non zero vector
+                        /* Was it different from the last non zero vector */
                         if ( d->bmi.mv.as_int != lastmv_as_int )
                             new_mv_count++;
                         lastmv_as_int = d->bmi.mv.as_int;
 
-                        // Does the Row vector point inwards or outwards
+                        /* Does the Row vector point inwards or outwards */
                         if (mb_row < cm->mb_rows / 2)
                         {
                             if (d->bmi.mv.as_mv.row > 0)
@@ -722,7 +751,7 @@
                                 sum_in_vectors--;
                         }
 
-                        // Does the Row vector point inwards or outwards
+                        /* Does the Row vector point inwards or outwards */
                         if (mb_col < cm->mb_cols / 2)
                         {
                             if (d->bmi.mv.as_mv.col > 0)
@@ -743,7 +772,7 @@
 
             coded_error += (int64_t)this_error;
 
-            // adjust to the next column of macroblocks
+            /* adjust to the next column of macroblocks */
             x->src.y_buffer += 16;
             x->src.u_buffer += 8;
             x->src.v_buffer += 8;
@@ -752,25 +781,25 @@
             recon_uvoffset += 8;
         }
 
-        // adjust to the next row of mbs
+        /* adjust to the next row of mbs */
         x->src.y_buffer += 16 * x->src.y_stride - 16 * cm->mb_cols;
         x->src.u_buffer += 8 * x->src.uv_stride - 8 * cm->mb_cols;
         x->src.v_buffer += 8 * x->src.uv_stride - 8 * cm->mb_cols;
 
-        //extend the recon for intra prediction
+        /* extend the recon for intra prediction */
         vp8_extend_mb_row(new_yv12, xd->dst.y_buffer + 16, xd->dst.u_buffer + 8, xd->dst.v_buffer + 8);
-        vp8_clear_system_state();  //__asm emms;
+        vp8_clear_system_state();
     }
 
-    vp8_clear_system_state();  //__asm emms;
+    vp8_clear_system_state();
     {
         double weight = 0.0;
 
         FIRSTPASS_STATS fps;
 
         fps.frame      = cm->current_video_frame ;
-        fps.intra_error = intra_error >> 8;
-        fps.coded_error = coded_error >> 8;
+        fps.intra_error = (double)(intra_error >> 8);
+        fps.coded_error = (double)(coded_error >> 8);
         weight = simple_weight(cpi->Source);
 
 
@@ -809,12 +838,13 @@
             fps.pcnt_motion = 1.0 * (double)mvcount / cpi->common.MBs;
         }
 
-        // TODO:  handle the case when duration is set to 0, or something less
-        // than the full time between subsequent cpi->source_time_stamp s  .
-        fps.duration = cpi->source->ts_end
-                       - cpi->source->ts_start;
+        /* TODO:  handle the case when duration is set to 0, or something less
+         * than the full time between subsequent cpi->source_time_stamps
+         */
+        fps.duration = (double)(cpi->source->ts_end
+                       - cpi->source->ts_start);
 
-        // don't want to do output stats with a stack variable!
+        /* don't want to do output stats with a stack variable! */
         memcpy(&cpi->twopass.this_frame_stats,
                &fps,
                sizeof(FIRSTPASS_STATS));
@@ -822,7 +852,9 @@
         accumulate_stats(&cpi->twopass.total_stats, &fps);
     }
 
-    // Copy the previous Last Frame into the GF buffer if specific conditions for doing so are met
+    /* Copy the previous Last Frame into the GF buffer if specific
+     * conditions for doing so are met
+     */
     if ((cm->current_video_frame > 0) &&
         (cpi->twopass.this_frame_stats.pcnt_inter > 0.20) &&
         ((cpi->twopass.this_frame_stats.intra_error / cpi->twopass.this_frame_stats.coded_error) > 2.0))
@@ -830,18 +862,22 @@
         vp8_yv12_copy_frame(lst_yv12, gld_yv12);
     }
 
-    // swap frame pointers so last frame refers to the frame we just compressed
+    /* swap frame pointers so last frame refers to the frame we just
+     * compressed
+     */
     vp8_swap_yv12_buffer(lst_yv12, new_yv12);
     vp8_yv12_extend_frame_borders(lst_yv12);
 
-    // Special case for the first frame. Copy into the GF buffer as a second reference.
+    /* Special case for the first frame. Copy into the GF buffer as a
+     * second reference.
+     */
     if (cm->current_video_frame == 0)
     {
         vp8_yv12_copy_frame(lst_yv12, gld_yv12);
     }
 
 
-    // use this to see what the first pass reconstruction looks like
+    /* use this to see what the first pass reconstruction looks like */
     if (0)
     {
         char filename[512];
@@ -853,7 +889,8 @@
         else
             recon_file = fopen(filename, "ab");
 
-        if(fwrite(lst_yv12->buffer_alloc, lst_yv12->frame_size, 1, recon_file));
+        (void) fwrite(lst_yv12->buffer_alloc, lst_yv12->frame_size, 1,
+                      recon_file);
         fclose(recon_file);
     }
 
@@ -862,11 +899,10 @@
 }
 extern const int vp8_bits_per_mb[2][QINDEX_RANGE];
 
-// Estimate a cost per mb attributable to overheads such as the coding of
-// modes and motion vectors.
-// Currently simplistic in its assumptions for testing.
-//
-
+/* Estimate a cost per mb attributable to overheads such as the coding of
+ * modes and motion vectors.
+ * Currently simplistic in its assumptions for testing.
+ */
 
 static double bitcost( double prob )
 {
@@ -890,12 +926,14 @@
     motion_cost = bitcost(av_pct_motion);
     intra_cost = bitcost(av_intra);
 
-    // Estimate of extra bits per mv overhead for mbs
-    // << 9 is the normalization to the (bits * 512) used in vp8_bits_per_mb
+    /* Estimate of extra bits per mv overhead for mbs
+     * << 9 is the normalization to the (bits * 512) used in vp8_bits_per_mb
+     */
     mv_cost = ((int)(fpstats->new_mv_count / fpstats->count) * 8) << 9;
 
-    // Crude estimate of overhead cost from modes
-    // << 9 is the normalization to (bits * 512) used in vp8_bits_per_mb
+    /* Crude estimate of overhead cost from modes
+     * << 9 is the normalization to (bits * 512) used in vp8_bits_per_mb
+     */
     mode_cost =
         (int)( ( ((av_pct_inter - av_pct_motion) * zz_cost) +
                  (av_pct_motion * motion_cost) +
@@ -914,17 +952,17 @@
     double error_term = err_per_mb / err_devisor;
     double correction_factor;
 
-    // Adjustment based on Q to power term.
+    /* Adjustment based on Q to power term. */
     power_term = pt_low + (Q * 0.01);
     power_term = (power_term > pt_high) ? pt_high : power_term;
 
-    // Adjustments to error term
-    // TBD
+    /* Adjustments to error term */
+    /* TBD */
 
-    // Calculate correction factor
+    /* Calculate correction factor */
     correction_factor = pow(error_term, power_term);
 
-    // Clip range
+    /* Clip range */
     correction_factor =
         (correction_factor < 0.05)
             ? 0.05 : (correction_factor > 5.0) ? 5.0 : correction_factor;
@@ -948,15 +986,16 @@
     int overhead_bits_per_mb;
 
     if (section_target_bandwitdh <= 0)
-        return cpi->twopass.maxq_max_limit;          // Highest value allowed
+        return cpi->twopass.maxq_max_limit;       /* Highest value allowed */
 
     target_norm_bits_per_mb =
         (section_target_bandwitdh < (1 << 20))
             ? (512 * section_target_bandwitdh) / num_mbs
             : 512 * (section_target_bandwitdh / num_mbs);
 
-    // Calculate a corrective factor based on a rolling ratio of bits spent
-    // vs target bits
+    /* Calculate a corrective factor based on a rolling ratio of bits spent
+     * vs target bits
+     */
     if ((cpi->rolling_target_bits > 0) &&
         (cpi->active_worst_quality < cpi->worst_quality))
     {
@@ -977,8 +1016,9 @@
                     ? 10.0 : cpi->twopass.est_max_qcorrection_factor;
     }
 
-    // Corrections for higher compression speed settings
-    // (reduced compression expected)
+    /* Corrections for higher compression speed settings
+     * (reduced compression expected)
+     */
     if ((cpi->compressor_speed == 3) || (cpi->compressor_speed == 1))
     {
         if (cpi->oxcf.cpu_used <= 5)
@@ -987,18 +1027,20 @@
             speed_correction = 1.25;
     }
 
-    // Estimate of overhead bits per mb
-    // Correction to overhead bits for min allowed Q.
+    /* Estimate of overhead bits per mb */
+    /* Correction to overhead bits for min allowed Q. */
     overhead_bits_per_mb = overhead_bits / num_mbs;
-    overhead_bits_per_mb *= pow( 0.98, (double)cpi->twopass.maxq_min_limit );
+    overhead_bits_per_mb = (int)(overhead_bits_per_mb *
+                            pow( 0.98, (double)cpi->twopass.maxq_min_limit ));
 
-    // Try and pick a max Q that will be high enough to encode the
-    // content at the given rate.
+    /* Try and pick a max Q that will be high enough to encode the
+     * content at the given rate.
+     */
     for (Q = cpi->twopass.maxq_min_limit; Q < cpi->twopass.maxq_max_limit; Q++)
     {
         int bits_per_mb_at_this_q;
 
-        // Error per MB based correction factor
+        /* Error per MB based correction factor */
         err_correction_factor =
             calc_correction_factor(err_per_mb, 150.0, 0.40, 0.90, Q);
 
@@ -1010,27 +1052,29 @@
             * cpi->twopass.section_max_qfactor
             * (double)bits_per_mb_at_this_q);
 
-        // Mode and motion overhead
-        // As Q rises in real encode loop rd code will force overhead down
-        // We make a crude adjustment for this here as *.98 per Q step.
+        /* Mode and motion overhead */
+        /* As Q rises in real encode loop rd code will force overhead down
+         * We make a crude adjustment for this here as *.98 per Q step.
+         */
         overhead_bits_per_mb = (int)((double)overhead_bits_per_mb * 0.98);
 
         if (bits_per_mb_at_this_q <= target_norm_bits_per_mb)
             break;
     }
 
-    // Restriction on active max q for constrained quality mode.
+    /* Restriction on active max q for constrained quality mode. */
     if ( (cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY) &&
          (Q < cpi->cq_target_quality) )
     {
         Q = cpi->cq_target_quality;
     }
 
-    // Adjust maxq_min_limit and maxq_max_limit limits based on
-    // averaga q observed in clip for non kf/gf.arf frames
-    // Give average a chance to settle though.
+    /* Adjust maxq_min_limit and maxq_max_limit limits based on
+     * average q observed in clip for non kf/gf.arf frames
+     * Give average a chance to settle though.
+     */
     if ( (cpi->ni_frames >
-                  ((unsigned int)cpi->twopass.total_stats.count >> 8)) &&
+                  ((int)cpi->twopass.total_stats.count >> 8)) &&
          (cpi->ni_frames > 150) )
     {
         cpi->twopass.maxq_max_limit = ((cpi->ni_av_qi + 32) < cpi->worst_quality)
@@ -1042,8 +1086,9 @@
     return Q;
 }
 
-// For cq mode estimate a cq level that matches the observed
-// complexity and data rate.
+/* For cq mode estimate a cq level that matches the observed
+ * complexity and data rate.
+ */
 static int estimate_cq( VP8_COMP *cpi,
                         FIRSTPASS_STATS * fpstats,
                         int section_target_bandwitdh,
@@ -1072,11 +1117,12 @@
                               ? (512 * section_target_bandwitdh) / num_mbs
                               : 512 * (section_target_bandwitdh / num_mbs);
 
-    // Estimate of overhead bits per mb
+    /* Estimate of overhead bits per mb */
     overhead_bits_per_mb = overhead_bits / num_mbs;
 
-    // Corrections for higher compression speed settings
-    // (reduced compression expected)
+    /* Corrections for higher compression speed settings
+     * (reduced compression expected)
+     */
     if ((cpi->compressor_speed == 3) || (cpi->compressor_speed == 1))
     {
         if (cpi->oxcf.cpu_used <= 5)
@@ -1085,19 +1131,19 @@
             speed_correction = 1.25;
     }
 
-    // II ratio correction factor for clip as a whole
+    /* II ratio correction factor for clip as a whole */
     clip_iiratio = cpi->twopass.total_stats.intra_error /
                    DOUBLE_DIVIDE_CHECK(cpi->twopass.total_stats.coded_error);
     clip_iifactor = 1.0 - ((clip_iiratio - 10.0) * 0.025);
     if (clip_iifactor < 0.80)
         clip_iifactor = 0.80;
 
-    // Try and pick a Q that can encode the content at the given rate.
+    /* Try and pick a Q that can encode the content at the given rate. */
     for (Q = 0; Q < MAXQ; Q++)
     {
         int bits_per_mb_at_this_q;
 
-        // Error per MB based correction factor
+        /* Error per MB based correction factor */
         err_correction_factor =
             calc_correction_factor(err_per_mb, 100.0, 0.40, 0.90, Q);
 
@@ -1110,16 +1156,17 @@
                         clip_iifactor *
                         (double)bits_per_mb_at_this_q);
 
-        // Mode and motion overhead
-        // As Q rises in real encode loop rd code will force overhead down
-        // We make a crude adjustment for this here as *.98 per Q step.
+        /* Mode and motion overhead */
+        /* As Q rises in real encode loop rd code will force overhead down
+         * We make a crude adjustment for this here as *.98 per Q step.
+         */
         overhead_bits_per_mb = (int)((double)overhead_bits_per_mb * 0.98);
 
         if (bits_per_mb_at_this_q <= target_norm_bits_per_mb)
             break;
     }
 
-    // Clip value to range "best allowed to (worst allowed - 1)"
+    /* Clip value to range "best allowed to (worst allowed - 1)" */
     Q = cq_level[Q];
     if ( Q >= cpi->worst_quality )
         Q = cpi->worst_quality - 1;
@@ -1141,7 +1188,9 @@
 
     target_norm_bits_per_mb = (section_target_bandwitdh < (1 << 20)) ? (512 * section_target_bandwitdh) / num_mbs : 512 * (section_target_bandwitdh / num_mbs);
 
-    // Corrections for higher compression speed settings (reduced compression expected)
+    /* Corrections for higher compression speed settings
+     * (reduced compression expected)
+     */
     if ((cpi->compressor_speed == 3) || (cpi->compressor_speed == 1))
     {
         if (cpi->oxcf.cpu_used <= 5)
@@ -1150,12 +1199,12 @@
             speed_correction = 1.25;
     }
 
-    // Try and pick a Q that can encode the content at the given rate.
+    /* Try and pick a Q that can encode the content at the given rate. */
     for (Q = 0; Q < MAXQ; Q++)
     {
         int bits_per_mb_at_this_q;
 
-        // Error per MB based correction factor
+        /* Error per MB based correction factor */
         err_correction_factor =
             calc_correction_factor(err_per_mb, 150.0, 0.40, 0.90, Q);
 
@@ -1172,7 +1221,7 @@
     return Q;
 }
 
-// Estimate a worst case Q for a KF group
+/* Estimate a worst case Q for a KF group */
 static int estimate_kf_group_q(VP8_COMP *cpi, double section_err, int section_target_bandwitdh, double group_iiratio)
 {
     int Q;
@@ -1192,12 +1241,14 @@
 
     double combined_correction_factor;
 
-    // Trap special case where the target is <= 0
+    /* Trap special case where the target is <= 0 */
     if (target_norm_bits_per_mb <= 0)
         return MAXQ * 2;
 
-    // Calculate a corrective factor based on a rolling ratio of bits spent vs target bits
-    // This is clamped to the range 0.1 to 10.0
+    /* Calculate a corrective factor based on a rolling ratio of bits spent
+     *  vs target bits
+     * This is clamped to the range 0.1 to 10.0
+     */
     if (cpi->long_rolling_target_bits <= 0)
         current_spend_ratio = 10.0;
     else
@@ -1206,14 +1257,19 @@
         current_spend_ratio = (current_spend_ratio > 10.0) ? 10.0 : (current_spend_ratio < 0.1) ? 0.1 : current_spend_ratio;
     }
 
-    // Calculate a correction factor based on the quality of prediction in the sequence as indicated by intra_inter error score ratio (IIRatio)
-    // The idea here is to favour subsampling in the hardest sections vs the easyest.
+    /* Calculate a correction factor based on the quality of prediction in
+     * the sequence as indicated by intra_inter error score ratio (IIRatio)
+     * The idea here is to favour subsampling in the hardest sections vs
+     * the easiest.
+     */
     iiratio_correction_factor = 1.0 - ((group_iiratio - 6.0) * 0.1);
 
     if (iiratio_correction_factor < 0.5)
         iiratio_correction_factor = 0.5;
 
-    // Corrections for higher compression speed settings (reduced compression expected)
+    /* Corrections for higher compression speed settings
+     * (reduced compression expected)
+     */
     if ((cpi->compressor_speed == 3) || (cpi->compressor_speed == 1))
     {
         if (cpi->oxcf.cpu_used <= 5)
@@ -1222,13 +1278,15 @@
             speed_correction = 1.25;
     }
 
-    // Combine the various factors calculated above
+    /* Combine the various factors calculated above */
     combined_correction_factor = speed_correction * iiratio_correction_factor * current_spend_ratio;
 
-    // Try and pick a Q that should be high enough to encode the content at the given rate.
+    /* Try and pick a Q that should be high enough to encode the content at
+     * the given rate.
+     */
     for (Q = 0; Q < MAXQ; Q++)
     {
-        // Error per MB based correction factor
+        /* Error per MB based correction factor */
         err_correction_factor =
             calc_correction_factor(err_per_mb, 150.0, pow_lowq, pow_highq, Q);
 
@@ -1241,7 +1299,9 @@
             break;
     }
 
-    // If we could not hit the target even at Max Q then estimate what Q would have bee required
+    /* If we could not hit the target even at Max Q then estimate what Q
+     * would have been required
+     */
     while ((bits_per_mb_at_this_q > target_norm_bits_per_mb)  && (Q < (MAXQ * 2)))
     {
 
@@ -1280,30 +1340,34 @@
     cpi->twopass.total_stats = *cpi->twopass.stats_in_end;
     cpi->twopass.total_left_stats = cpi->twopass.total_stats;
 
-    // each frame can have a different duration, as the frame rate in the source
-    // isn't guaranteed to be constant.   The frame rate prior to the first frame
-    // encoded in the second pass is a guess.  However the sum duration is not.
-    // Its calculated based on the actual durations of all frames from the first
-    // pass.
+    /* each frame can have a different duration, as the frame rate in the
+     * source isn't guaranteed to be constant.   The frame rate prior to
+     * the first frame encoded in the second pass is a guess.  However the
+     * sum duration is not. Its calculated based on the actual durations of
+     * all frames from the first pass.
+     */
     vp8_new_frame_rate(cpi, 10000000.0 * cpi->twopass.total_stats.count / cpi->twopass.total_stats.duration);
 
     cpi->output_frame_rate = cpi->frame_rate;
     cpi->twopass.bits_left = (int64_t)(cpi->twopass.total_stats.duration * cpi->oxcf.target_bandwidth / 10000000.0) ;
     cpi->twopass.bits_left -= (int64_t)(cpi->twopass.total_stats.duration * two_pass_min_rate / 10000000.0);
 
-    // Calculate a minimum intra value to be used in determining the IIratio
-    // scores used in the second pass. We have this minimum to make sure
-    // that clips that are static but "low complexity" in the intra domain
-    // are still boosted appropriately for KF/GF/ARF
+    /* Calculate a minimum intra value to be used in determining the IIratio
+     * scores used in the second pass. We have this minimum to make sure
+     * that clips that are static but "low complexity" in the intra domain
+     * are still boosted appropriately for KF/GF/ARF
+     */
     cpi->twopass.kf_intra_err_min = KF_MB_INTRA_MIN * cpi->common.MBs;
     cpi->twopass.gf_intra_err_min = GF_MB_INTRA_MIN * cpi->common.MBs;
 
-    // Scan the first pass file and calculate an average Intra / Inter error score ratio for the sequence
+    /* Scan the first pass file and calculate an average Intra / Inter error
+     * score ratio for the sequence
+     */
     {
         double sum_iiratio = 0.0;
         double IIRatio;
 
-        start_pos = cpi->twopass.stats_in;               // Note starting "file" position
+        start_pos = cpi->twopass.stats_in; /* Note starting "file" position */
 
         while (input_stats(cpi, &this_frame) != EOF)
         {
@@ -1314,14 +1378,15 @@
 
         cpi->twopass.avg_iiratio = sum_iiratio / DOUBLE_DIVIDE_CHECK((double)cpi->twopass.total_stats.count);
 
-        // Reset file position
+        /* Reset file position */
         reset_fpf_position(cpi, start_pos);
     }
 
-    // Scan the first pass file and calculate a modified total error based upon the bias/power function
-    // used to allocate bits
+    /* Scan the first pass file and calculate a modified total error based
+     * upon the bias/power function used to allocate bits
+     */
     {
-        start_pos = cpi->twopass.stats_in;               // Note starting "file" position
+        start_pos = cpi->twopass.stats_in;  /* Note starting "file" position */
 
         cpi->twopass.modified_error_total = 0.0;
         cpi->twopass.modified_error_used = 0.0;
@@ -1332,7 +1397,7 @@
         }
         cpi->twopass.modified_error_left = cpi->twopass.modified_error_total;
 
-        reset_fpf_position(cpi, start_pos);            // Reset file position
+        reset_fpf_position(cpi, start_pos);  /* Reset file position */
 
     }
 }
@@ -1341,23 +1406,24 @@
 {
 }
 
-// This function gives and estimate of how badly we believe
-// the prediction quality is decaying from frame to frame.
+/* This function gives an estimate of how badly we believe the prediction
+ * quality is decaying from frame to frame.
+ */
 static double get_prediction_decay_rate(VP8_COMP *cpi, FIRSTPASS_STATS *next_frame)
 {
     double prediction_decay_rate;
     double motion_decay;
     double motion_pct = next_frame->pcnt_motion;
 
-    // Initial basis is the % mbs inter coded
+    /* Initial basis is the % mbs inter coded */
     prediction_decay_rate = next_frame->pcnt_inter;
 
-    // High % motion -> somewhat higher decay rate
+    /* High % motion -> somewhat higher decay rate */
     motion_decay = (1.0 - (motion_pct / 20.0));
     if (motion_decay < prediction_decay_rate)
         prediction_decay_rate = motion_decay;
 
-    // Adjustment to decay rate based on speed of motion
+    /* Adjustment to decay rate based on speed of motion */
     {
         double this_mv_rabs;
         double this_mv_cabs;
@@ -1377,9 +1443,10 @@
     return prediction_decay_rate;
 }
 
-// Function to test for a condition where a complex transition is followed
-// by a static section. For example in slide shows where there is a fade
-// between slides. This is to help with more optimal kf and gf positioning.
+/* Function to test for a condition where a complex transition is followed
+ * by a static section. For example in slide shows where there is a fade
+ * between slides. This is to help with more optimal kf and gf positioning.
+ */
 static int detect_transition_to_still(
     VP8_COMP *cpi,
     int frame_interval,
@@ -1389,9 +1456,10 @@
 {
     int trans_to_still = 0;
 
-    // Break clause to detect very still sections after motion
-    // For example a static image after a fade or other transition
-    // instead of a clean scene cut.
+    /* Break clause to detect very still sections after motion
+     * For example a static image after a fade or other transition
+     * instead of a clean scene cut.
+     */
     if ( (frame_interval > MIN_GF_INTERVAL) &&
          (loop_decay_rate >= 0.999) &&
          (decay_accumulator < 0.9) )
@@ -1401,8 +1469,7 @@
         FIRSTPASS_STATS tmp_next_frame;
         double decay_rate;
 
-        // Look ahead a few frames to see if static condition
-        // persists...
+        /* Look ahead a few frames to see if static condition persists... */
         for ( j = 0; j < still_interval; j++ )
         {
             if (EOF == input_stats(cpi, &tmp_next_frame))
@@ -1412,10 +1479,10 @@
             if ( decay_rate < 0.999 )
                 break;
         }
-        // Reset file position
+        /* Reset file position */
         reset_fpf_position(cpi, position);
 
-        // Only if it does do we signal a transition to still
+        /* Only if it does do we signal a transition to still */
         if ( j == still_interval )
             trans_to_still = 1;
     }
@@ -1423,24 +1490,26 @@
     return trans_to_still;
 }
 
-// This function detects a flash through the high relative pcnt_second_ref
-// score in the frame following a flash frame. The offset passed in should
-// reflect this
+/* This function detects a flash through the high relative pcnt_second_ref
+ * score in the frame following a flash frame. The offset passed in should
+ * reflect this
+ */
 static int detect_flash( VP8_COMP *cpi, int offset )
 {
     FIRSTPASS_STATS next_frame;
 
     int flash_detected = 0;
 
-    // Read the frame data.
-    // The return is 0 (no flash detected) if not a valid frame
+    /* Read the frame data. */
+    /* The return is 0 (no flash detected) if not a valid frame */
     if ( read_frame_stats(cpi, &next_frame, offset) != EOF )
     {
-        // What we are looking for here is a situation where there is a
-        // brief break in prediction (such as a flash) but subsequent frames
-        // are reasonably well predicted by an earlier (pre flash) frame.
-        // The recovery after a flash is indicated by a high pcnt_second_ref
-        // comapred to pcnt_inter.
+        /* What we are looking for here is a situation where there is a
+         * brief break in prediction (such as a flash) but subsequent frames
+         * are reasonably well predicted by an earlier (pre flash) frame.
+         * The recovery after a flash is indicated by a high pcnt_second_ref
+         * compared to pcnt_inter.
+         */
         if ( (next_frame.pcnt_second_ref > next_frame.pcnt_inter) &&
              (next_frame.pcnt_second_ref >= 0.5 ) )
         {
@@ -1461,7 +1530,7 @@
     return flash_detected;
 }
 
-// Update the motion related elements to the GF arf boost calculation
+/* Update the motion related elements to the GF arf boost calculation */
 static void accumulate_frame_motion_stats(
     VP8_COMP *cpi,
     FIRSTPASS_STATS * this_frame,
@@ -1470,22 +1539,22 @@
     double * abs_mv_in_out_accumulator,
     double * mv_ratio_accumulator )
 {
-    //double this_frame_mv_in_out;
     double this_frame_mvr_ratio;
     double this_frame_mvc_ratio;
     double motion_pct;
 
-    // Accumulate motion stats.
+    /* Accumulate motion stats. */
     motion_pct = this_frame->pcnt_motion;
 
-    // Accumulate Motion In/Out of frame stats
+    /* Accumulate Motion In/Out of frame stats */
     *this_frame_mv_in_out = this_frame->mv_in_out_count * motion_pct;
     *mv_in_out_accumulator += this_frame->mv_in_out_count * motion_pct;
     *abs_mv_in_out_accumulator +=
         fabs(this_frame->mv_in_out_count * motion_pct);
 
-    // Accumulate a measure of how uniform (or conversely how random)
-    // the motion field is. (A ratio of absmv / mv)
+    /* Accumulate a measure of how uniform (or conversely how random)
+     * the motion field is. (A ratio of absmv / mv)
+     */
     if (motion_pct > 0.05)
     {
         this_frame_mvr_ratio = fabs(this_frame->mvr_abs) /
@@ -1507,7 +1576,7 @@
     }
 }
 
-// Calculate a baseline boost number for the current frame.
+/* Calculate a baseline boost number for the current frame. */
 static double calc_frame_boost(
     VP8_COMP *cpi,
     FIRSTPASS_STATS * this_frame,
@@ -1515,7 +1584,7 @@
 {
     double frame_boost;
 
-    // Underlying boost factor is based on inter intra error ratio
+    /* Underlying boost factor is based on inter intra error ratio */
     if (this_frame->intra_error > cpi->twopass.gf_intra_err_min)
         frame_boost = (IIFACTOR * this_frame->intra_error /
                       DOUBLE_DIVIDE_CHECK(this_frame->coded_error));
@@ -1523,17 +1592,18 @@
         frame_boost = (IIFACTOR * cpi->twopass.gf_intra_err_min /
                       DOUBLE_DIVIDE_CHECK(this_frame->coded_error));
 
-    // Increase boost for frames where new data coming into frame
-    // (eg zoom out). Slightly reduce boost if there is a net balance
-    // of motion out of the frame (zoom in).
-    // The range for this_frame_mv_in_out is -1.0 to +1.0
+    /* Increase boost for frames where new data coming into frame
+     * (eg zoom out). Slightly reduce boost if there is a net balance
+     * of motion out of the frame (zoom in).
+     * The range for this_frame_mv_in_out is -1.0 to +1.0
+     */
     if (this_frame_mv_in_out > 0.0)
         frame_boost += frame_boost * (this_frame_mv_in_out * 2.0);
-    // In extreme case boost is halved
+    /* In extreme case boost is halved */
     else
         frame_boost += frame_boost * (this_frame_mv_in_out / 2.0);
 
-    // Clip to maximum
+    /* Clip to maximum */
     if (frame_boost > GF_RMAX)
         frame_boost = GF_RMAX;
 
@@ -1561,26 +1631,27 @@
     double r;
     int flash_detected = 0;
 
-    // Search forward from the proposed arf/next gf position
+    /* Search forward from the proposed arf/next gf position */
     for ( i = 0; i < f_frames; i++ )
     {
         if ( read_frame_stats(cpi, &this_frame, (i+offset)) == EOF )
             break;
 
-        // Update the motion related elements to the boost calculation
+        /* Update the motion related elements to the boost calculation */
         accumulate_frame_motion_stats( cpi, &this_frame,
             &this_frame_mv_in_out, &mv_in_out_accumulator,
             &abs_mv_in_out_accumulator, &mv_ratio_accumulator );
 
-        // Calculate the baseline boost number for this frame
+        /* Calculate the baseline boost number for this frame */
         r = calc_frame_boost( cpi, &this_frame, this_frame_mv_in_out );
 
-        // We want to discount the the flash frame itself and the recovery
-        // frame that follows as both will have poor scores.
+        /* We want to discount the flash frame itself and the recovery
+         * frame that follows as both will have poor scores.
+         */
         flash_detected = detect_flash(cpi, (i+offset)) ||
                          detect_flash(cpi, (i+offset+1));
 
-        // Cumulative effect of prediction quality decay
+        /* Cumulative effect of prediction quality decay */
         if ( !flash_detected )
         {
             decay_accumulator =
@@ -1591,7 +1662,7 @@
         }
         boost_score += (decay_accumulator * r);
 
-        // Break out conditions.
+        /* Break out conditions. */
         if  ( (!flash_detected) &&
               ((mv_ratio_accumulator > 100.0) ||
                (abs_mv_in_out_accumulator > 3.0) ||
@@ -1603,7 +1674,7 @@
 
     *f_boost = (int)(boost_score * 100.0) >> 4;
 
-    // Reset for backward looking loop
+    /* Reset for backward looking loop */
     boost_score = 0.0;
     mv_ratio_accumulator = 0.0;
     decay_accumulator = 1.0;
@@ -1611,26 +1682,27 @@
     mv_in_out_accumulator = 0.0;
     abs_mv_in_out_accumulator = 0.0;
 
-    // Search forward from the proposed arf/next gf position
+    /* Search backward from the proposed arf/next gf position */
     for ( i = -1; i >= -b_frames; i-- )
     {
         if ( read_frame_stats(cpi, &this_frame, (i+offset)) == EOF )
             break;
 
-        // Update the motion related elements to the boost calculation
+        /* Update the motion related elements to the boost calculation */
         accumulate_frame_motion_stats( cpi, &this_frame,
             &this_frame_mv_in_out, &mv_in_out_accumulator,
             &abs_mv_in_out_accumulator, &mv_ratio_accumulator );
 
-        // Calculate the baseline boost number for this frame
+        /* Calculate the baseline boost number for this frame */
         r = calc_frame_boost( cpi, &this_frame, this_frame_mv_in_out );
 
-        // We want to discount the the flash frame itself and the recovery
-        // frame that follows as both will have poor scores.
+        /* We want to discount the flash frame itself and the recovery
+         * frame that follows as both will have poor scores.
+         */
         flash_detected = detect_flash(cpi, (i+offset)) ||
                          detect_flash(cpi, (i+offset+1));
 
-        // Cumulative effect of prediction quality decay
+        /* Cumulative effect of prediction quality decay */
         if ( !flash_detected )
         {
             decay_accumulator =
@@ -1642,7 +1714,7 @@
 
         boost_score += (decay_accumulator * r);
 
-        // Break out conditions.
+        /* Break out conditions. */
         if  ( (!flash_detected) &&
               ((mv_ratio_accumulator > 100.0) ||
                (abs_mv_in_out_accumulator > 3.0) ||
@@ -1657,7 +1729,7 @@
 }
 #endif
 
-// Analyse and define a gf/arf group .
+/* Analyse and define a gf/arf group . */
 static void define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
 {
     FIRSTPASS_STATS next_frame;
@@ -1673,14 +1745,14 @@
     double mv_ratio_accumulator = 0.0;
     double decay_accumulator = 1.0;
 
-    double loop_decay_rate = 1.00;          // Starting decay rate
+    double loop_decay_rate = 1.00;          /* Starting decay rate */
 
     double this_frame_mv_in_out = 0.0;
     double mv_in_out_accumulator = 0.0;
     double abs_mv_in_out_accumulator = 0.0;
     double mod_err_per_mb_accumulator = 0.0;
 
-    int max_bits = frame_max_bits(cpi);     // Max for a single frame
+    int max_bits = frame_max_bits(cpi);     /* Max for a single frame */
 
     unsigned int allow_alt_ref =
                     cpi->oxcf.play_alternate && cpi->oxcf.lag_in_frames;
@@ -1693,37 +1765,40 @@
     cpi->twopass.gf_group_bits = 0;
     cpi->twopass.gf_decay_rate = 0;
 
-    vp8_clear_system_state();  //__asm emms;
+    vp8_clear_system_state();
 
     start_pos = cpi->twopass.stats_in;
 
-    vpx_memset(&next_frame, 0, sizeof(next_frame)); // assure clean
+    vpx_memset(&next_frame, 0, sizeof(next_frame)); /* assure clean */
 
-    // Load stats for the current frame.
+    /* Load stats for the current frame. */
     mod_frame_err = calculate_modified_err(cpi, this_frame);
 
-    // Note the error of the frame at the start of the group (this will be
-    // the GF frame error if we code a normal gf
+    /* Note the error of the frame at the start of the group (this will be
+     * the GF frame error if we code a normal gf)
+     */
     gf_first_frame_err = mod_frame_err;
 
-    // Special treatment if the current frame is a key frame (which is also
-    // a gf). If it is then its error score (and hence bit allocation) need
-    // to be subtracted out from the calculation for the GF group
+    /* Special treatment if the current frame is a key frame (which is also
+     * a gf). If it is then its error score (and hence bit allocation) need
+     * to be subtracted out from the calculation for the GF group
+     */
     if (cpi->common.frame_type == KEY_FRAME)
         gf_group_err -= gf_first_frame_err;
 
-    // Scan forward to try and work out how many frames the next gf group
-    // should contain and what level of boost is appropriate for the GF
-    // or ARF that will be coded with the group
+    /* Scan forward to try and work out how many frames the next gf group
+     * should contain and what level of boost is appropriate for the GF
+     * or ARF that will be coded with the group
+     */
     i = 0;
 
     while (((i < cpi->twopass.static_scene_max_gf_interval) ||
             ((cpi->twopass.frames_to_key - i) < MIN_GF_INTERVAL)) &&
            (i < cpi->twopass.frames_to_key))
     {
-        i++;    // Increment the loop counter
+        i++;
 
-        // Accumulate error score of frames in this gf group
+        /* Accumulate error score of frames in this gf group */
         mod_frame_err = calculate_modified_err(cpi, this_frame);
 
         gf_group_err += mod_frame_err;
@@ -1734,19 +1809,20 @@
         if (EOF == input_stats(cpi, &next_frame))
             break;
 
-        // Test for the case where there is a brief flash but the prediction
-        // quality back to an earlier frame is then restored.
+        /* Test for the case where there is a brief flash but the prediction
+         * quality back to an earlier frame is then restored.
+         */
         flash_detected = detect_flash(cpi, 0);
 
-        // Update the motion related elements to the boost calculation
+        /* Update the motion related elements to the boost calculation */
         accumulate_frame_motion_stats( cpi, &next_frame,
             &this_frame_mv_in_out, &mv_in_out_accumulator,
             &abs_mv_in_out_accumulator, &mv_ratio_accumulator );
 
-        // Calculate a baseline boost number for this frame
+        /* Calculate a baseline boost number for this frame */
         r = calc_frame_boost( cpi, &next_frame, this_frame_mv_in_out );
 
-        // Cumulative effect of prediction quality decay
+        /* Cumulative effect of prediction quality decay */
         if ( !flash_detected )
         {
             loop_decay_rate = get_prediction_decay_rate(cpi, &next_frame);
@@ -1756,8 +1832,9 @@
         }
         boost_score += (decay_accumulator * r);
 
-        // Break clause to detect very still sections after motion
-        // For example a staic image after a fade or other transition.
+        /* Break clause to detect very still sections after motion
+         * For example a static image after a fade or other transition.
+         */
         if ( detect_transition_to_still( cpi, i, 5,
                                          loop_decay_rate,
                                          decay_accumulator ) )
@@ -1767,14 +1844,14 @@
             break;
         }
 
-        // Break out conditions.
+        /* Break out conditions. */
         if  (
-            // Break at cpi->max_gf_interval unless almost totally static
+            /* Break at cpi->max_gf_interval unless almost totally static */
             (i >= cpi->max_gf_interval && (decay_accumulator < 0.995)) ||
             (
-                // Dont break out with a very short interval
+                /* Dont break out with a very short interval */
                 (i > MIN_GF_INTERVAL) &&
-                // Dont break out very close to a key frame
+                /* Dont break out very close to a key frame */
                 ((cpi->twopass.frames_to_key - i) >= MIN_GF_INTERVAL) &&
                 ((boost_score > 20.0) || (next_frame.pcnt_inter < 0.75)) &&
                 (!flash_detected) &&
@@ -1796,15 +1873,15 @@
     cpi->twopass.gf_decay_rate =
         (i > 0) ? (int)(100.0 * (1.0 - decay_accumulator)) / i : 0;
 
-    // When using CBR apply additional buffer related upper limits
+    /* When using CBR apply additional buffer related upper limits */
     if (cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER)
     {
         double max_boost;
 
-        // For cbr apply buffer related limits
+        /* For cbr apply buffer related limits */
         if (cpi->drop_frames_allowed)
         {
-            int df_buffer_level = cpi->oxcf.drop_frames_water_mark *
+            int64_t df_buffer_level = cpi->oxcf.drop_frames_water_mark *
                                   (cpi->oxcf.optimal_buffer_level / 100);
 
             if (cpi->buffer_level > df_buffer_level)
@@ -1825,7 +1902,7 @@
             boost_score = max_boost;
     }
 
-    // Dont allow conventional gf too near the next kf
+    /* Dont allow conventional gf too near the next kf */
     if ((cpi->twopass.frames_to_key - i) < MIN_GF_INTERVAL)
     {
         while (i < cpi->twopass.frames_to_key)
@@ -1846,14 +1923,14 @@
     cpi->gfu_boost = (int)(boost_score * 100.0) >> 4;
 
 #if NEW_BOOST
-    // Alterrnative boost calculation for alt ref
+    /* Alternative boost calculation for alt ref */
     alt_boost = calc_arf_boost( cpi, 0, (i-1), (i-1), &f_boost, &b_boost );
 #endif
 
-    // Should we use the alternate refernce frame
+    /* Should we use the alternate reference frame */
     if (allow_alt_ref &&
         (i >= MIN_GF_INTERVAL) &&
-        // dont use ARF very near next kf
+        /* dont use ARF very near next kf */
         (i <= (cpi->twopass.frames_to_key - MIN_GF_INTERVAL)) &&
 #if NEW_BOOST
         ((next_frame.pcnt_inter > 0.75) ||
@@ -1883,7 +1960,7 @@
         cpi->gfu_boost = alt_boost;
 #endif
 
-        // Estimate the bits to be allocated to the group as a whole
+        /* Estimate the bits to be allocated to the group as a whole */
         if ((cpi->twopass.kf_group_bits > 0) &&
             (cpi->twopass.kf_group_error_left > 0))
         {
@@ -1893,7 +1970,7 @@
         else
             group_bits = 0;
 
-        // Boost for arf frame
+        /* Boost for arf frame */
 #if NEW_BOOST
         Boost = (alt_boost * GFQ_ADJUSTMENT) / 100;
 #else
@@ -1901,7 +1978,7 @@
 #endif
         Boost += (i * 50);
 
-        // Set max and minimum boost and hence minimum allocation
+        /* Set max and minimum boost and hence minimum allocation */
         if (Boost > ((cpi->baseline_gf_interval + 1) * 200))
             Boost = ((cpi->baseline_gf_interval + 1) * 200);
         else if (Boost < 125)
@@ -1909,24 +1986,27 @@
 
         allocation_chunks = (i * 100) + Boost;
 
-        // Normalize Altboost and allocations chunck down to prevent overflow
+        /* Normalize Altboost and allocation chunks down to prevent overflow */
         while (Boost > 1000)
         {
             Boost /= 2;
             allocation_chunks /= 2;
         }
 
-        // Calculate the number of bits to be spent on the arf based on the
-        // boost number
+        /* Calculate the number of bits to be spent on the arf based on the
+         * boost number
+         */
         arf_frame_bits = (int)((double)Boost * (group_bits /
                                (double)allocation_chunks));
 
-        // Estimate if there are enough bits available to make worthwhile use
-        // of an arf.
+        /* Estimate if there are enough bits available to make worthwhile use
+         * of an arf.
+         */
         tmp_q = estimate_q(cpi, mod_frame_err, (int)arf_frame_bits);
 
-        // Only use an arf if it is likely we will be able to code
-        // it at a lower Q than the surrounding frames.
+        /* Only use an arf if it is likely we will be able to code
+         * it at a lower Q than the surrounding frames.
+         */
         if (tmp_q < cpi->worst_quality)
         {
             int half_gf_int;
@@ -1936,42 +2016,46 @@
 
             cpi->source_alt_ref_pending = 1;
 
-            // For alt ref frames the error score for the end frame of the
-            // group (the alt ref frame) should not contribute to the group
-            // total and hence the number of bit allocated to the group.
-            // Rather it forms part of the next group (it is the GF at the
-            // start of the next group)
-            // gf_group_err -= mod_frame_err;
-
-            // For alt ref frames alt ref frame is technically part of the
-            // GF frame for the next group but we always base the error
-            // calculation and bit allocation on the current group of frames.
-
-            // Set the interval till the next gf or arf.
-            // For ARFs this is the number of frames to be coded before the
-            // future frame that is coded as an ARF.
-            // The future frame itself is part of the next group
+            /*
+             * For alt ref frames the error score for the end frame of the
+             * group (the alt ref frame) should not contribute to the group
+             * total and hence the number of bit allocated to the group.
+             * Rather it forms part of the next group (it is the GF at the
+             * start of the next group)
+             * gf_group_err -= mod_frame_err;
+             *
+             * For alt ref frames alt ref frame is technically part of the
+             * GF frame for the next group but we always base the error
+             * calculation and bit allocation on the current group of frames.
+             *
+             * Set the interval till the next gf or arf.
+             * For ARFs this is the number of frames to be coded before the
+             * future frame that is coded as an ARF.
+             * The future frame itself is part of the next group
+             */
             cpi->baseline_gf_interval = i;
 
-            // Define the arnr filter width for this group of frames:
-            // We only filter frames that lie within a distance of half
-            // the GF interval from the ARF frame. We also have to trap
-            // cases where the filter extends beyond the end of clip.
-            // Note: this_frame->frame has been updated in the loop
-            // so it now points at the ARF frame.
+            /*
+             * Define the arnr filter width for this group of frames:
+             * We only filter frames that lie within a distance of half
+             * the GF interval from the ARF frame. We also have to trap
+             * cases where the filter extends beyond the end of clip.
+             * Note: this_frame->frame has been updated in the loop
+             * so it now points at the ARF frame.
+             */
             half_gf_int = cpi->baseline_gf_interval >> 1;
-            frames_after_arf = cpi->twopass.total_stats.count -
-                               this_frame->frame - 1;
+            frames_after_arf = (int)(cpi->twopass.total_stats.count -
+                               this_frame->frame - 1);
 
             switch (cpi->oxcf.arnr_type)
             {
-            case 1: // Backward filter
+            case 1: /* Backward filter */
                 frames_fwd = 0;
                 if (frames_bwd > half_gf_int)
                     frames_bwd = half_gf_int;
                 break;
 
-            case 2: // Forward filter
+            case 2: /* Forward filter */
                 if (frames_fwd > half_gf_int)
                     frames_fwd = half_gf_int;
                 if (frames_fwd > frames_after_arf)
@@ -1979,7 +2063,7 @@
                 frames_bwd = 0;
                 break;
 
-            case 3: // Centered filter
+            case 3: /* Centered filter */
             default:
                 frames_fwd >>= 1;
                 if (frames_fwd > frames_after_arf)
@@ -1989,8 +2073,9 @@
 
                 frames_bwd = frames_fwd;
 
-                // For even length filter there is one more frame backward
-                // than forward: e.g. len=6 ==> bbbAff, len=7 ==> bbbAfff.
+                /* For even length filter there is one more frame backward
+                 * than forward: e.g. len=6 ==> bbbAff, len=7 ==> bbbAfff.
+                 */
                 if (frames_bwd < half_gf_int)
                     frames_bwd += (cpi->oxcf.arnr_max_frames+1) & 0x1;
                 break;
@@ -2010,12 +2095,14 @@
         cpi->baseline_gf_interval = i;
     }
 
-    // Now decide how many bits should be allocated to the GF group as  a
-    // proportion of those remaining in the kf group.
-    // The final key frame group in the clip is treated as a special case
-    // where cpi->twopass.kf_group_bits is tied to cpi->twopass.bits_left.
-    // This is also important for short clips where there may only be one
-    // key frame.
+    /*
+     * Now decide how many bits should be allocated to the GF group as  a
+     * proportion of those remaining in the kf group.
+     * The final key frame group in the clip is treated as a special case
+     * where cpi->twopass.kf_group_bits is tied to cpi->twopass.bits_left.
+     * This is also important for short clips where there may only be one
+     * key frame.
+     */
     if (cpi->twopass.frames_to_key >= (int)(cpi->twopass.total_stats.count -
                                             cpi->common.current_video_frame))
     {
@@ -2023,7 +2110,7 @@
             (cpi->twopass.bits_left > 0) ? cpi->twopass.bits_left : 0;
     }
 
-    // Calculate the bits to be allocated to the group as a whole
+    /* Calculate the bits to be allocated to the group as a whole */
     if ((cpi->twopass.kf_group_bits > 0) &&
         (cpi->twopass.kf_group_error_left > 0))
     {
@@ -2034,31 +2121,32 @@
     else
         cpi->twopass.gf_group_bits = 0;
 
-    cpi->twopass.gf_group_bits =
+    cpi->twopass.gf_group_bits = (int)(
         (cpi->twopass.gf_group_bits < 0)
             ? 0
             : (cpi->twopass.gf_group_bits > cpi->twopass.kf_group_bits)
-                ? cpi->twopass.kf_group_bits : cpi->twopass.gf_group_bits;
+                ? cpi->twopass.kf_group_bits : cpi->twopass.gf_group_bits);
 
-    // Clip cpi->twopass.gf_group_bits based on user supplied data rate
-    // variability limit (cpi->oxcf.two_pass_vbrmax_section)
+    /* Clip cpi->twopass.gf_group_bits based on user supplied data rate
+     * variability limit (cpi->oxcf.two_pass_vbrmax_section)
+     */
     if (cpi->twopass.gf_group_bits > max_bits * cpi->baseline_gf_interval)
         cpi->twopass.gf_group_bits = max_bits * cpi->baseline_gf_interval;
 
-    // Reset the file position
+    /* Reset the file position */
     reset_fpf_position(cpi, start_pos);
 
-    // Update the record of error used so far (only done once per gf group)
+    /* Update the record of error used so far (only done once per gf group) */
     cpi->twopass.modified_error_used += gf_group_err;
 
-    // Assign  bits to the arf or gf.
+    /* Assign  bits to the arf or gf. */
     for (i = 0; i <= (cpi->source_alt_ref_pending && cpi->common.frame_type != KEY_FRAME); i++) {
         int Boost;
         int allocation_chunks;
         int Q = (cpi->oxcf.fixed_q < 0) ? cpi->last_q[INTER_FRAME] : cpi->oxcf.fixed_q;
         int gf_bits;
 
-        // For ARF frames
+        /* For ARF frames */
         if (cpi->source_alt_ref_pending && i == 0)
         {
 #if NEW_BOOST
@@ -2068,7 +2156,7 @@
 #endif
             Boost += (cpi->baseline_gf_interval * 50);
 
-            // Set max and minimum boost and hence minimum allocation
+            /* Set max and minimum boost and hence minimum allocation */
             if (Boost > ((cpi->baseline_gf_interval + 1) * 200))
                 Boost = ((cpi->baseline_gf_interval + 1) * 200);
             else if (Boost < 125)
@@ -2077,13 +2165,13 @@
             allocation_chunks =
                 ((cpi->baseline_gf_interval + 1) * 100) + Boost;
         }
-        // Else for standard golden frames
+        /* Else for standard golden frames */
         else
         {
-            // boost based on inter / intra ratio of subsequent frames
+            /* boost based on inter / intra ratio of subsequent frames */
             Boost = (cpi->gfu_boost * GFQ_ADJUSTMENT) / 100;
 
-            // Set max and minimum boost and hence minimum allocation
+            /* Set max and minimum boost and hence minimum allocation */
             if (Boost > (cpi->baseline_gf_interval * 150))
                 Boost = (cpi->baseline_gf_interval * 150);
             else if (Boost < 125)
@@ -2093,22 +2181,24 @@
                 (cpi->baseline_gf_interval * 100) + (Boost - 100);
         }
 
-        // Normalize Altboost and allocations chunck down to prevent overflow
+        /* Normalize Altboost and allocation chunks down to prevent overflow */
         while (Boost > 1000)
         {
             Boost /= 2;
             allocation_chunks /= 2;
         }
 
-        // Calculate the number of bits to be spent on the gf or arf based on
-        // the boost number
+        /* Calculate the number of bits to be spent on the gf or arf based on
+         * the boost number
+         */
         gf_bits = (int)((double)Boost *
                         (cpi->twopass.gf_group_bits /
                          (double)allocation_chunks));
 
-        // If the frame that is to be boosted is simpler than the average for
-        // the gf/arf group then use an alternative calculation
-        // based on the error score of the frame itself
+        /* If the frame that is to be boosted is simpler than the average for
+         * the gf/arf group then use an alternative calculation
+         * based on the error score of the frame itself
+         */
         if (mod_frame_err < gf_group_err / (double)cpi->baseline_gf_interval)
         {
             double  alt_gf_grp_bits;
@@ -2127,9 +2217,10 @@
                 gf_bits = alt_gf_bits;
             }
         }
-        // Else if it is harder than other frames in the group make sure it at
-        // least receives an allocation in keeping with its relative error
-        // score, otherwise it may be worse off than an "un-boosted" frame
+        /* Else if it is harder than other frames in the group make sure it at
+         * least receives an allocation in keeping with its relative error
+         * score, otherwise it may be worse off than an "un-boosted" frame
+         */
         else
         {
             int alt_gf_bits =
@@ -2143,18 +2234,19 @@
             }
         }
 
-        // Apply an additional limit for CBR
+        /* Apply an additional limit for CBR */
         if (cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER)
         {
-            if (cpi->twopass.gf_bits > (cpi->buffer_level >> 1))
-                cpi->twopass.gf_bits = cpi->buffer_level >> 1;
+            if (cpi->twopass.gf_bits > (int)(cpi->buffer_level >> 1))
+                cpi->twopass.gf_bits = (int)(cpi->buffer_level >> 1);
         }
 
-        // Dont allow a negative value for gf_bits
+        /* Dont allow a negative value for gf_bits */
         if (gf_bits < 0)
             gf_bits = 0;
 
-        gf_bits += cpi->min_frame_bandwidth;                     // Add in minimum for a frame
+        /* Add in minimum for a frame */
+        gf_bits += cpi->min_frame_bandwidth;
 
         if (i == 0)
         {
@@ -2162,33 +2254,39 @@
         }
         if (i == 1 || (!cpi->source_alt_ref_pending && (cpi->common.frame_type != KEY_FRAME)))
         {
-            cpi->per_frame_bandwidth = gf_bits;                 // Per frame bit target for this frame
+            /* Per frame bit target for this frame */
+            cpi->per_frame_bandwidth = gf_bits;
         }
     }
 
     {
-        // Adjust KF group bits and error remainin
-        cpi->twopass.kf_group_error_left -= gf_group_err;
+        /* Adjust KF group bits and error remaining */
+        cpi->twopass.kf_group_error_left -= (int64_t)gf_group_err;
         cpi->twopass.kf_group_bits -= cpi->twopass.gf_group_bits;
 
         if (cpi->twopass.kf_group_bits < 0)
             cpi->twopass.kf_group_bits = 0;
 
-        // Note the error score left in the remaining frames of the group.
-        // For normal GFs we want to remove the error score for the first frame of the group (except in Key frame case where this has already happened)
+        /* Note the error score left in the remaining frames of the group.
+         * For normal GFs we want to remove the error score for the first
+         * frame of the group (except in Key frame case where this has
+         * already happened)
+         */
         if (!cpi->source_alt_ref_pending && cpi->common.frame_type != KEY_FRAME)
-            cpi->twopass.gf_group_error_left = gf_group_err - gf_first_frame_err;
+            cpi->twopass.gf_group_error_left = (int)(gf_group_err -
+                                                     gf_first_frame_err);
         else
-            cpi->twopass.gf_group_error_left = gf_group_err;
+            cpi->twopass.gf_group_error_left = (int) gf_group_err;
 
         cpi->twopass.gf_group_bits -= cpi->twopass.gf_bits - cpi->min_frame_bandwidth;
 
         if (cpi->twopass.gf_group_bits < 0)
             cpi->twopass.gf_group_bits = 0;
 
-        // This condition could fail if there are two kfs very close together
-        // despite (MIN_GF_INTERVAL) and would cause a devide by 0 in the
-        // calculation of cpi->twopass.alt_extra_bits.
+        /* This condition could fail if there are two kfs very close together
+         * despite (MIN_GF_INTERVAL) and would cause a divide by 0 in the
+         * calculation of cpi->twopass.alt_extra_bits.
+         */
         if ( cpi->baseline_gf_interval >= 3 )
         {
 #if NEW_BOOST
@@ -2217,7 +2315,7 @@
             cpi->twopass.alt_extra_bits = 0;
     }
 
-    // Adjustments based on a measure of complexity of the section
+    /* Adjustments based on a measure of complexity of the section */
     if (cpi->common.frame_type != KEY_FRAME)
     {
         FIRSTPASS_STATS sectionstats;
@@ -2234,47 +2332,45 @@
 
         avg_stats(&sectionstats);
 
-        cpi->twopass.section_intra_rating =
-            sectionstats.intra_error /
-            DOUBLE_DIVIDE_CHECK(sectionstats.coded_error);
+        cpi->twopass.section_intra_rating = (unsigned int)
+            (sectionstats.intra_error /
+            DOUBLE_DIVIDE_CHECK(sectionstats.coded_error));
 
         Ratio = sectionstats.intra_error / DOUBLE_DIVIDE_CHECK(sectionstats.coded_error);
-        //if( (Ratio > 11) ) //&& (sectionstats.pcnt_second_ref < .20) )
-        //{
         cpi->twopass.section_max_qfactor = 1.0 - ((Ratio - 10.0) * 0.025);
 
         if (cpi->twopass.section_max_qfactor < 0.80)
             cpi->twopass.section_max_qfactor = 0.80;
 
-        //}
-        //else
-        //    cpi->twopass.section_max_qfactor = 1.0;
-
         reset_fpf_position(cpi, start_pos);
     }
 }
 
-// Allocate bits to a normal frame that is neither a gf an arf or a key frame.
+/* Allocate bits to a normal frame that is neither a gf, an arf, nor a key frame. */
 static void assign_std_frame_bits(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
 {
-    int    target_frame_size;                                                             // gf_group_error_left
+    int    target_frame_size;
 
     double modified_err;
-    double err_fraction;                                                                 // What portion of the remaining GF group error is used by this frame
+    double err_fraction;
 
-    int max_bits = frame_max_bits(cpi);    // Max for a single frame
+    int max_bits = frame_max_bits(cpi);  /* Max for a single frame */
 
-    // Calculate modified prediction error used in bit allocation
+    /* Calculate modified prediction error used in bit allocation */
     modified_err = calculate_modified_err(cpi, this_frame);
 
+    /* What portion of the remaining GF group error is used by this frame */
     if (cpi->twopass.gf_group_error_left > 0)
-        err_fraction = modified_err / cpi->twopass.gf_group_error_left;                              // What portion of the remaining GF group error is used by this frame
+        err_fraction = modified_err / cpi->twopass.gf_group_error_left;
     else
         err_fraction = 0.0;
 
-    target_frame_size = (int)((double)cpi->twopass.gf_group_bits * err_fraction);                    // How many of those bits available for allocation should we give it?
+    /* How many of those bits available for allocation should we give it? */
+    target_frame_size = (int)((double)cpi->twopass.gf_group_bits * err_fraction);
 
-    // Clip to target size to 0 - max_bits (or cpi->twopass.gf_group_bits) at the top end.
+    /* Clip the target size to 0 - max_bits (or cpi->twopass.gf_group_bits)
+     * at the top end.
+     */
     if (target_frame_size < 0)
         target_frame_size = 0;
     else
@@ -2286,22 +2382,25 @@
             target_frame_size = cpi->twopass.gf_group_bits;
     }
 
-    cpi->twopass.gf_group_error_left -= modified_err;                                               // Adjust error remaining
-    cpi->twopass.gf_group_bits -= target_frame_size;                                                // Adjust bits remaining
+    /* Adjust error and bits remaining */
+    cpi->twopass.gf_group_error_left -= (int)modified_err;
+    cpi->twopass.gf_group_bits -= target_frame_size;
 
     if (cpi->twopass.gf_group_bits < 0)
         cpi->twopass.gf_group_bits = 0;
 
-    target_frame_size += cpi->min_frame_bandwidth;                                          // Add in the minimum number of bits that is set aside for every frame.
+    /* Add in the minimum number of bits that is set aside for every frame. */
+    target_frame_size += cpi->min_frame_bandwidth;
 
-    // Every other frame gets a few extra bits
+    /* Every other frame gets a few extra bits */
     if ( (cpi->common.frames_since_golden & 0x01) &&
          (cpi->frames_till_gf_update_due > 0) )
     {
         target_frame_size += cpi->twopass.alt_extra_bits;
     }
 
-    cpi->per_frame_bandwidth = target_frame_size;                                           // Per frame bit target for this frame
+    /* Per frame bit target for this frame */
+    cpi->per_frame_bandwidth = target_frame_size;
 }
 
 void vp8_second_pass(VP8_COMP *cpi)
@@ -2330,20 +2429,25 @@
     this_frame_intra_error = this_frame.intra_error;
     this_frame_coded_error = this_frame.coded_error;
 
-    // keyframe and section processing !
+    /* keyframe and section processing ! */
     if (cpi->twopass.frames_to_key == 0)
     {
-        // Define next KF group and assign bits to it
+        /* Define next KF group and assign bits to it */
         vpx_memcpy(&this_frame_copy, &this_frame, sizeof(this_frame));
         find_next_key_frame(cpi, &this_frame_copy);
 
-        // Special case: Error error_resilient_mode mode does not make much sense for two pass but with its current meaning but this code is designed to stop
-        // outlandish behaviour if someone does set it when using two pass. It effectively disables GF groups.
-        // This is temporary code till we decide what should really happen in this case.
+        /* Special case: error_resilient_mode does not make much
+         * sense for two pass with its current meaning, but this code is
+         * designed to stop outlandish behaviour if someone does set it when
+         * using two pass. It effectively disables GF groups. This is
+         * temporary code till we decide what should really happen in this
+         * case.
+         */
         if (cpi->oxcf.error_resilient_mode)
         {
-            cpi->twopass.gf_group_bits = cpi->twopass.kf_group_bits;
-            cpi->twopass.gf_group_error_left = cpi->twopass.kf_group_error_left;
+            cpi->twopass.gf_group_bits = (int)cpi->twopass.kf_group_bits;
+            cpi->twopass.gf_group_error_left =
+                                  (int)cpi->twopass.kf_group_error_left;
             cpi->baseline_gf_interval = cpi->twopass.frames_to_key;
             cpi->frames_till_gf_update_due = cpi->baseline_gf_interval;
             cpi->source_alt_ref_pending = 0;
@@ -2351,19 +2455,25 @@
 
     }
 
-    // Is this a GF / ARF (Note that a KF is always also a GF)
+    /* Is this a GF / ARF (Note that a KF is always also a GF) */
     if (cpi->frames_till_gf_update_due == 0)
     {
-        // Define next gf group and assign bits to it
+        /* Define next gf group and assign bits to it */
         vpx_memcpy(&this_frame_copy, &this_frame, sizeof(this_frame));
         define_gf_group(cpi, &this_frame_copy);
 
-        // If we are going to code an altref frame at the end of the group and the current frame is not a key frame....
-        // If the previous group used an arf this frame has already benefited from that arf boost and it should not be given extra bits
-        // If the previous group was NOT coded using arf we may want to apply some boost to this GF as well
+        /* If we are going to code an altref frame at the end of the group
+         * and the current frame is not a key frame.... If the previous
+         * group used an arf this frame has already benefited from that arf
+         * boost and it should not be given extra bits. If the previous
+         * group was NOT coded using arf we may want to apply some boost to
+         * this GF as well
+         */
         if (cpi->source_alt_ref_pending && (cpi->common.frame_type != KEY_FRAME))
         {
-            // Assign a standard frames worth of bits from those allocated to the GF group
+            /* Assign a standard frames worth of bits from those allocated
+             * to the GF group
+             */
             int bak = cpi->per_frame_bandwidth;
             vpx_memcpy(&this_frame_copy, &this_frame, sizeof(this_frame));
             assign_std_frame_bits(cpi, &this_frame_copy);
@@ -2371,59 +2481,64 @@
         }
     }
 
-    // Otherwise this is an ordinary frame
+    /* Otherwise this is an ordinary frame */
     else
     {
-        // Special case: Error error_resilient_mode mode does not make much sense for two pass but with its current meaning but this code is designed to stop
-        // outlandish behaviour if someone does set it when using two pass. It effectively disables GF groups.
-        // This is temporary code till we decide what should really happen in this case.
+        /* Special case: error_resilient_mode does not make much
+         * sense for two pass with its current meaning, but this code is
+         * designed to stop outlandish behaviour if someone does set it
+         * when using two pass. It effectively disables GF groups. This is
+         * temporary code till we decide what should really happen in this
+         * case.
+         */
         if (cpi->oxcf.error_resilient_mode)
         {
             cpi->frames_till_gf_update_due = cpi->twopass.frames_to_key;
 
             if (cpi->common.frame_type != KEY_FRAME)
             {
-                // Assign bits from those allocated to the GF group
+                /* Assign bits from those allocated to the GF group */
                 vpx_memcpy(&this_frame_copy, &this_frame, sizeof(this_frame));
                 assign_std_frame_bits(cpi, &this_frame_copy);
             }
         }
         else
         {
-            // Assign bits from those allocated to the GF group
+            /* Assign bits from those allocated to the GF group */
             vpx_memcpy(&this_frame_copy, &this_frame, sizeof(this_frame));
             assign_std_frame_bits(cpi, &this_frame_copy);
         }
     }
 
-    // Keep a globally available copy of this and the next frame's iiratio.
-    cpi->twopass.this_iiratio = this_frame_intra_error /
-                        DOUBLE_DIVIDE_CHECK(this_frame_coded_error);
+    /* Keep a globally available copy of this and the next frame's iiratio. */
+    cpi->twopass.this_iiratio = (unsigned int)(this_frame_intra_error /
+                        DOUBLE_DIVIDE_CHECK(this_frame_coded_error));
     {
         FIRSTPASS_STATS next_frame;
         if ( lookup_next_frame_stats(cpi, &next_frame) != EOF )
         {
-            cpi->twopass.next_iiratio = next_frame.intra_error /
-                                DOUBLE_DIVIDE_CHECK(next_frame.coded_error);
+            cpi->twopass.next_iiratio = (unsigned int)(next_frame.intra_error /
+                                DOUBLE_DIVIDE_CHECK(next_frame.coded_error));
         }
     }
 
-    // Set nominal per second bandwidth for this frame
-    cpi->target_bandwidth = cpi->per_frame_bandwidth * cpi->output_frame_rate;
+    /* Set nominal per second bandwidth for this frame */
+    cpi->target_bandwidth = (int)
+    (cpi->per_frame_bandwidth * cpi->output_frame_rate);
     if (cpi->target_bandwidth < 0)
         cpi->target_bandwidth = 0;
 
 
-    // Account for mv, mode and other overheads.
-    overhead_bits = estimate_modemvcost(
+    /* Account for mv, mode and other overheads. */
+    overhead_bits = (int)estimate_modemvcost(
                         cpi, &cpi->twopass.total_left_stats );
 
-    // Special case code for first frame.
+    /* Special case code for first frame. */
     if (cpi->common.current_video_frame == 0)
     {
         cpi->twopass.est_max_qcorrection_factor = 1.0;
 
-        // Set a cq_level in constrained quality mode.
+        /* Set a cq_level in constrained quality mode. */
         if ( cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY )
         {
             int est_cq;
@@ -2439,7 +2554,7 @@
                 cpi->cq_target_quality = est_cq;
         }
 
-        // guess at maxq needed in 2nd pass
+        /* guess at maxq needed in 2nd pass */
         cpi->twopass.maxq_max_limit = cpi->worst_quality;
         cpi->twopass.maxq_min_limit = cpi->best_quality;
 
@@ -2449,11 +2564,12 @@
                     (int)(cpi->twopass.bits_left / frames_left),
                     overhead_bits );
 
-        // Limit the maxq value returned subsequently.
-        // This increases the risk of overspend or underspend if the initial
-        // estimate for the clip is bad, but helps prevent excessive
-        // variation in Q, especially near the end of a clip
-        // where for example a small overspend may cause Q to crash
+        /* Limit the maxq value returned subsequently.
+         * This increases the risk of overspend or underspend if the initial
+         * estimate for the clip is bad, but helps prevent excessive
+         * variation in Q, especially near the end of a clip
+         * where for example a small overspend may cause Q to crash
+         */
         cpi->twopass.maxq_max_limit = ((tmp_q + 32) < cpi->worst_quality)
                                   ? (tmp_q + 32) : cpi->worst_quality;
         cpi->twopass.maxq_min_limit = ((tmp_q - 32) > cpi->best_quality)
@@ -2463,10 +2579,11 @@
         cpi->ni_av_qi                     = tmp_q;
     }
 
-    // The last few frames of a clip almost always have to few or too many
-    // bits and for the sake of over exact rate control we dont want to make
-    // radical adjustments to the allowed quantizer range just to use up a
-    // few surplus bits or get beneath the target rate.
+    /* The last few frames of a clip almost always have too few or too many
+     * bits and for the sake of overly exact rate control we don't want to make
+     * radical adjustments to the allowed quantizer range just to use up a
+     * few surplus bits or get beneath the target rate.
+     */
     else if ( (cpi->common.current_video_frame <
                  (((unsigned int)cpi->twopass.total_stats.count * 255)>>8)) &&
               ((cpi->common.current_video_frame + cpi->baseline_gf_interval) <
@@ -2481,7 +2598,7 @@
                     (int)(cpi->twopass.bits_left / frames_left),
                     overhead_bits );
 
-        // Move active_worst_quality but in a damped way
+        /* Move active_worst_quality but in a damped way */
         if (tmp_q > cpi->active_worst_quality)
             cpi->active_worst_quality ++;
         else if (tmp_q < cpi->active_worst_quality)
@@ -2493,7 +2610,7 @@
 
     cpi->twopass.frames_to_key --;
 
-    // Update the total stats remaining sturcture
+    /* Update the total stats remaining structure */
     subtract_stats(&cpi->twopass.total_left_stats, &this_frame );
 }
 
@@ -2502,8 +2619,9 @@
 {
     int is_viable_kf = 0;
 
-    // Does the frame satisfy the primary criteria of a key frame
-    //      If so, then examine how well it predicts subsequent frames
+    /* Does the frame satisfy the primary criteria of a key frame
+     *      If so, then examine how well it predicts subsequent frames
+     */
     if ((this_frame->pcnt_second_ref < 0.10) &&
         (next_frame->pcnt_second_ref < 0.10) &&
         ((this_frame->pcnt_inter < 0.05) ||
@@ -2530,10 +2648,10 @@
 
         vpx_memcpy(&local_next_frame, next_frame, sizeof(*next_frame));
 
-        // Note the starting file position so we can reset to it
+        /* Note the starting file position so we can reset to it */
         start_pos = cpi->twopass.stats_in;
 
-        // Examine how well the key frame predicts subsequent frames
+        /* Examine how well the key frame predicts subsequent frames */
         for (i = 0 ; i < 16; i++)
         {
             next_iiratio = (IIKFACTOR1 * local_next_frame.intra_error / DOUBLE_DIVIDE_CHECK(local_next_frame.coded_error)) ;
@@ -2541,18 +2659,16 @@
             if (next_iiratio > RMAX)
                 next_iiratio = RMAX;
 
-            // Cumulative effect of decay in prediction quality
+            /* Cumulative effect of decay in prediction quality */
             if (local_next_frame.pcnt_inter > 0.85)
                 decay_accumulator = decay_accumulator * local_next_frame.pcnt_inter;
             else
                 decay_accumulator = decay_accumulator * ((0.85 + local_next_frame.pcnt_inter) / 2.0);
 
-            //decay_accumulator = decay_accumulator * local_next_frame.pcnt_inter;
-
-            // Keep a running total
+            /* Keep a running total */
             boost_score += (decay_accumulator * next_iiratio);
 
-            // Test various breakout clauses
+            /* Test various breakout clauses */
             if ((local_next_frame.pcnt_inter < 0.05) ||
                 (next_iiratio < 1.5) ||
                 (((local_next_frame.pcnt_inter -
@@ -2567,17 +2683,19 @@
 
             old_boost_score = boost_score;
 
-            // Get the next frame details
+            /* Get the next frame details */
             if (EOF == input_stats(cpi, &local_next_frame))
                 break;
         }
 
-        // If there is tolerable prediction for at least the next 3 frames then break out else discard this pottential key frame and move on
+        /* If there is tolerable prediction for at least the next 3 frames
+         * then break out else discard this potential key frame and move on
+         */
         if (boost_score > 5.0 && (i > 3))
             is_viable_kf = 1;
         else
         {
-            // Reset the file position
+            /* Reset the file position */
             reset_fpf_position(cpi, start_pos);
 
             is_viable_kf = 0;
@@ -2605,65 +2723,71 @@
     double kf_group_coded_err = 0.0;
     double recent_loop_decay[8] = {1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0};
 
-    vpx_memset(&next_frame, 0, sizeof(next_frame)); // assure clean
+    vpx_memset(&next_frame, 0, sizeof(next_frame));
 
-    vp8_clear_system_state();  //__asm emms;
+    vp8_clear_system_state();
     start_position = cpi->twopass.stats_in;
 
     cpi->common.frame_type = KEY_FRAME;
 
-    // is this a forced key frame by interval
+    /* is this a forced key frame by interval */
     cpi->this_key_frame_forced = cpi->next_key_frame_forced;
 
-    // Clear the alt ref active flag as this can never be active on a key frame
+    /* Clear the alt ref active flag as this can never be active on a key
+     * frame
+     */
     cpi->source_alt_ref_active = 0;
 
-    // Kf is always a gf so clear frames till next gf counter
+    /* Kf is always a gf so clear frames till next gf counter */
     cpi->frames_till_gf_update_due = 0;
 
     cpi->twopass.frames_to_key = 1;
 
-    // Take a copy of the initial frame details
+    /* Take a copy of the initial frame details */
     vpx_memcpy(&first_frame, this_frame, sizeof(*this_frame));
 
-    cpi->twopass.kf_group_bits = 0;        // Total bits avaialable to kf group
-    cpi->twopass.kf_group_error_left = 0;  // Group modified error score.
+    cpi->twopass.kf_group_bits = 0;
+    cpi->twopass.kf_group_error_left = 0;
 
     kf_mod_err = calculate_modified_err(cpi, this_frame);
 
-    // find the next keyframe
+    /* find the next keyframe */
     i = 0;
     while (cpi->twopass.stats_in < cpi->twopass.stats_in_end)
     {
-        // Accumulate kf group error
+        /* Accumulate kf group error */
         kf_group_err += calculate_modified_err(cpi, this_frame);
 
-        // These figures keep intra and coded error counts for all frames including key frames in the group.
-        // The effect of the key frame itself can be subtracted out using the first_frame data collected above
+        /* These figures keep intra and coded error counts for all frames
+         * including key frames in the group. The effect of the key frame
+         * itself can be subtracted out using the first_frame data
+         * collected above
+         */
         kf_group_intra_err += this_frame->intra_error;
         kf_group_coded_err += this_frame->coded_error;
 
-        // load a the next frame's stats
+        /* load the next frame's stats */
         vpx_memcpy(&last_frame, this_frame, sizeof(*this_frame));
         input_stats(cpi, this_frame);
 
-        // Provided that we are not at the end of the file...
+        /* Provided that we are not at the end of the file... */
         if (cpi->oxcf.auto_key
             && lookup_next_frame_stats(cpi, &next_frame) != EOF)
         {
-            // Normal scene cut check
+            /* Normal scene cut check */
             if ( ( i >= MIN_GF_INTERVAL ) &&
                  test_candidate_kf(cpi, &last_frame, this_frame, &next_frame) )
             {
                 break;
             }
 
-            // How fast is prediction quality decaying
+            /* How fast is prediction quality decaying */
             loop_decay_rate = get_prediction_decay_rate(cpi, &next_frame);
 
-            // We want to know something about the recent past... rather than
-            // as used elsewhere where we are concened with decay in prediction
-            // quality since the last GF or KF.
+            /* We want to know something about the recent past... rather than
+             * as used elsewhere where we are concerned with decay in prediction
+             * quality since the last GF or KF.
+             */
             recent_loop_decay[i%8] = loop_decay_rate;
             decay_accumulator = 1.0;
             for (j = 0; j < 8; j++)
@@ -2671,8 +2795,9 @@
                 decay_accumulator = decay_accumulator * recent_loop_decay[j];
             }
 
-            // Special check for transition or high motion followed by a
-            // to a static scene.
+            /* Special check for transition or high motion followed by a
+             * static scene.
+             */
             if ( detect_transition_to_still( cpi, i,
                                              (cpi->key_frame_frequency-i),
                                              loop_decay_rate,
@@ -2682,11 +2807,12 @@
             }
 
 
-            // Step on to the next frame
+            /* Step on to the next frame */
             cpi->twopass.frames_to_key ++;
 
-            // If we don't have a real key frame within the next two
-            // forcekeyframeevery intervals then break out of the loop.
+            /* If we don't have a real key frame within the next two
+             * forcekeyframeevery intervals then break out of the loop.
+             */
             if (cpi->twopass.frames_to_key >= 2 *(int)cpi->key_frame_frequency)
                 break;
         } else
@@ -2695,10 +2821,11 @@
         i++;
     }
 
-    // If there is a max kf interval set by the user we must obey it.
-    // We already breakout of the loop above at 2x max.
-    // This code centers the extra kf if the actual natural
-    // interval is between 1x and 2x
+    /* If there is a max kf interval set by the user we must obey it.
+     * We already breakout of the loop above at 2x max.
+     * This code centers the extra kf if the actual natural
+     * interval is between 1x and 2x
+     */
     if (cpi->oxcf.auto_key
         && cpi->twopass.frames_to_key > (int)cpi->key_frame_frequency )
     {
@@ -2707,29 +2834,29 @@
 
         cpi->twopass.frames_to_key /= 2;
 
-        // Copy first frame details
+        /* Copy first frame details */
         vpx_memcpy(&tmp_frame, &first_frame, sizeof(first_frame));
 
-        // Reset to the start of the group
+        /* Reset to the start of the group */
         reset_fpf_position(cpi, start_position);
 
         kf_group_err = 0;
         kf_group_intra_err = 0;
         kf_group_coded_err = 0;
 
-        // Rescan to get the correct error data for the forced kf group
+        /* Rescan to get the correct error data for the forced kf group */
         for( i = 0; i < cpi->twopass.frames_to_key; i++ )
         {
-            // Accumulate kf group errors
+            /* Accumulate kf group errors */
             kf_group_err += calculate_modified_err(cpi, &tmp_frame);
             kf_group_intra_err += tmp_frame.intra_error;
             kf_group_coded_err += tmp_frame.coded_error;
 
-            // Load a the next frame's stats
+            /* Load the next frame's stats */
             input_stats(cpi, &tmp_frame);
         }
 
-        // Reset to the start of the group
+        /* Reset to the start of the group */
         reset_fpf_position(cpi, current_pos);
 
         cpi->next_key_frame_forced = 1;
@@ -2737,58 +2864,63 @@
     else
         cpi->next_key_frame_forced = 0;
 
-    // Special case for the last frame of the file
+    /* Special case for the last frame of the file */
     if (cpi->twopass.stats_in >= cpi->twopass.stats_in_end)
     {
-        // Accumulate kf group error
+        /* Accumulate kf group error */
         kf_group_err += calculate_modified_err(cpi, this_frame);
 
-        // These figures keep intra and coded error counts for all frames including key frames in the group.
-        // The effect of the key frame itself can be subtracted out using the first_frame data collected above
+        /* These figures keep intra and coded error counts for all frames
+         * including key frames in the group. The effect of the key frame
+         * itself can be subtracted out using the first_frame data
+         * collected above
+         */
         kf_group_intra_err += this_frame->intra_error;
         kf_group_coded_err += this_frame->coded_error;
     }
 
-    // Calculate the number of bits that should be assigned to the kf group.
+    /* Calculate the number of bits that should be assigned to the kf group. */
     if ((cpi->twopass.bits_left > 0) && (cpi->twopass.modified_error_left > 0.0))
     {
-        // Max for a single normal frame (not key frame)
+        /* Max for a single normal frame (not key frame) */
         int max_bits = frame_max_bits(cpi);
 
-        // Maximum bits for the kf group
+        /* Maximum bits for the kf group */
         int64_t max_grp_bits;
 
-        // Default allocation based on bits left and relative
-        // complexity of the section
+        /* Default allocation based on bits left and relative
+         * complexity of the section
+         */
         cpi->twopass.kf_group_bits = (int64_t)( cpi->twopass.bits_left *
                                           ( kf_group_err /
                                             cpi->twopass.modified_error_left ));
 
-        // Clip based on maximum per frame rate defined by the user.
+        /* Clip based on maximum per frame rate defined by the user. */
         max_grp_bits = (int64_t)max_bits * (int64_t)cpi->twopass.frames_to_key;
         if (cpi->twopass.kf_group_bits > max_grp_bits)
             cpi->twopass.kf_group_bits = max_grp_bits;
 
-        // Additional special case for CBR if buffer is getting full.
+        /* Additional special case for CBR if buffer is getting full. */
         if (cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER)
         {
-            int opt_buffer_lvl = cpi->oxcf.optimal_buffer_level;
-            int buffer_lvl = cpi->buffer_level;
+            int64_t opt_buffer_lvl = cpi->oxcf.optimal_buffer_level;
+            int64_t buffer_lvl = cpi->buffer_level;
 
-            // If the buffer is near or above the optimal and this kf group is
-            // not being allocated much then increase the allocation a bit.
+            /* If the buffer is near or above the optimal and this kf group is
+             * not being allocated much then increase the allocation a bit.
+             */
             if (buffer_lvl >= opt_buffer_lvl)
             {
-                int high_water_mark = (opt_buffer_lvl +
+                int64_t high_water_mark = (opt_buffer_lvl +
                                        cpi->oxcf.maximum_buffer_size) >> 1;
 
                 int64_t av_group_bits;
 
-                // Av bits per frame * number of frames
+                /* Av bits per frame * number of frames */
                 av_group_bits = (int64_t)cpi->av_per_frame_bandwidth *
                                 (int64_t)cpi->twopass.frames_to_key;
 
-                // We are at or above the maximum.
+                /* We are at or above the maximum. */
                 if (cpi->buffer_level >= high_water_mark)
                 {
                     int64_t min_group_bits;
@@ -2800,7 +2932,7 @@
                     if (cpi->twopass.kf_group_bits < min_group_bits)
                         cpi->twopass.kf_group_bits = min_group_bits;
                 }
-                // We are above optimal but below the maximum
+                /* We are above optimal but below the maximum */
                 else if (cpi->twopass.kf_group_bits < av_group_bits)
                 {
                     int64_t bits_below_av = av_group_bits -
@@ -2817,13 +2949,15 @@
     else
         cpi->twopass.kf_group_bits = 0;
 
-    // Reset the first pass file position
+    /* Reset the first pass file position */
     reset_fpf_position(cpi, start_position);
 
-    // determine how big to make this keyframe based on how well the subsequent frames use inter blocks
+    /* determine how big to make this keyframe based on how well the
+     * subsequent frames use inter blocks
+     */
     decay_accumulator = 1.0;
     boost_score = 0.0;
-    loop_decay_rate = 1.00;       // Starting decay rate
+    loop_decay_rate = 1.00;       /* Starting decay rate */
 
     for (i = 0 ; i < cpi->twopass.frames_to_key ; i++)
     {
@@ -2842,7 +2976,7 @@
         if (r > RMAX)
             r = RMAX;
 
-        // How fast is prediction quality decaying
+        /* How fast is prediction quality decaying */
         loop_decay_rate = get_prediction_decay_rate(cpi, &next_frame);
 
         decay_accumulator = decay_accumulator * loop_decay_rate;
@@ -2875,31 +3009,26 @@
 
         avg_stats(&sectionstats);
 
-        cpi->twopass.section_intra_rating =
-            sectionstats.intra_error
-            / DOUBLE_DIVIDE_CHECK(sectionstats.coded_error);
+        cpi->twopass.section_intra_rating = (unsigned int)
+            (sectionstats.intra_error
+            / DOUBLE_DIVIDE_CHECK(sectionstats.coded_error));
 
         Ratio = sectionstats.intra_error / DOUBLE_DIVIDE_CHECK(sectionstats.coded_error);
-        // if( (Ratio > 11) ) //&& (sectionstats.pcnt_second_ref < .20) )
-        //{
         cpi->twopass.section_max_qfactor = 1.0 - ((Ratio - 10.0) * 0.025);
 
         if (cpi->twopass.section_max_qfactor < 0.80)
             cpi->twopass.section_max_qfactor = 0.80;
-
-        //}
-        //else
-        //    cpi->twopass.section_max_qfactor = 1.0;
     }
 
-    // When using CBR apply additional buffer fullness related upper limits
+    /* When using CBR apply additional buffer fullness related upper limits */
     if (cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER)
     {
         double max_boost;
 
         if (cpi->drop_frames_allowed)
         {
-            int df_buffer_level = cpi->oxcf.drop_frames_water_mark * (cpi->oxcf.optimal_buffer_level / 100);
+            int df_buffer_level = (int)(cpi->oxcf.drop_frames_water_mark
+                                  * (cpi->oxcf.optimal_buffer_level / 100));
 
             if (cpi->buffer_level > df_buffer_level)
                 max_boost = ((double)((cpi->buffer_level - df_buffer_level) * 2 / 3) * 16.0) / DOUBLE_DIVIDE_CHECK((double)cpi->av_per_frame_bandwidth);
@@ -2919,18 +3048,18 @@
             boost_score = max_boost;
     }
 
-    // Reset the first pass file position
+    /* Reset the first pass file position */
     reset_fpf_position(cpi, start_position);
 
-    // Work out how many bits to allocate for the key frame itself
+    /* Work out how many bits to allocate for the key frame itself */
     if (1)
     {
-        int kf_boost = boost_score;
+        int kf_boost = (int)boost_score;
         int allocation_chunks;
         int Counter = cpi->twopass.frames_to_key;
         int alt_kf_bits;
         YV12_BUFFER_CONFIG *lst_yv12 = &cpi->common.yv12_fb[cpi->common.lst_fb_idx];
-        // Min boost based on kf interval
+        /* Min boost based on kf interval */
 #if 0
 
         while ((kf_boost < 48) && (Counter > 0))
@@ -2948,32 +3077,33 @@
             if (kf_boost > 48) kf_boost = 48;
         }
 
-        // bigger frame sizes need larger kf boosts, smaller frames smaller boosts...
+        /* bigger frame sizes need larger kf boosts, smaller frames smaller
+         * boosts...
+         */
         if ((lst_yv12->y_width * lst_yv12->y_height) > (320 * 240))
             kf_boost += 2 * (lst_yv12->y_width * lst_yv12->y_height) / (320 * 240);
         else if ((lst_yv12->y_width * lst_yv12->y_height) < (320 * 240))
             kf_boost -= 4 * (320 * 240) / (lst_yv12->y_width * lst_yv12->y_height);
 
-        kf_boost = (int)((double)kf_boost * 100.0) >> 4;                          // Scale 16 to 100
-
-        // Adjustment to boost based on recent average q
-        //kf_boost = kf_boost * vp8_kf_boost_qadjustment[cpi->ni_av_qi] / 100;
-
-        if (kf_boost < 250)                                                      // Min KF boost
+        /* Min KF boost */
+        kf_boost = (int)((double)kf_boost * 100.0) >> 4; /* Scale 16 to 100 */
+        if (kf_boost < 250)
             kf_boost = 250;
 
-        // We do three calculations for kf size.
-        // The first is based on the error score for the whole kf group.
-        // The second (optionaly) on the key frames own error if this is
-        // smaller than the average for the group.
-        // The final one insures that the frame receives at least the
-        // allocation it would have received based on its own error score vs
-        // the error score remaining
-        // Special case if the sequence appears almost totaly static
-        // as measured by the decay accumulator. In this case we want to
-        // spend almost all of the bits on the key frame.
-        // cpi->twopass.frames_to_key-1 because key frame itself is taken
-        // care of by kf_boost.
+        /*
+         * We do three calculations for kf size.
+         * The first is based on the error score for the whole kf group.
+         * The second (optionally) on the key frame's own error if this is
+         * smaller than the average for the group.
+         * The final one ensures that the frame receives at least the
+         * allocation it would have received based on its own error score vs
+         * the error score remaining
+         * Special case if the sequence appears almost totally static
+         * as measured by the decay accumulator. In this case we want to
+         * spend almost all of the bits on the key frame.
+         * cpi->twopass.frames_to_key-1 because key frame itself is taken
+         * care of by kf_boost.
+         */
         if ( decay_accumulator >= 0.99 )
         {
             allocation_chunks =
@@ -2985,7 +3115,7 @@
                 ((cpi->twopass.frames_to_key - 1) * 100) + kf_boost;
         }
 
-        // Normalize Altboost and allocations chunck down to prevent overflow
+        /* Normalize Altboost and allocations chunck down to prevent overflow */
         while (kf_boost > 1000)
         {
             kf_boost /= 2;
@@ -2994,20 +3124,21 @@
 
         cpi->twopass.kf_group_bits = (cpi->twopass.kf_group_bits < 0) ? 0 : cpi->twopass.kf_group_bits;
 
-        // Calculate the number of bits to be spent on the key frame
+        /* Calculate the number of bits to be spent on the key frame */
         cpi->twopass.kf_bits  = (int)((double)kf_boost * ((double)cpi->twopass.kf_group_bits / (double)allocation_chunks));
 
-        // Apply an additional limit for CBR
+        /* Apply an additional limit for CBR */
         if (cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER)
         {
-            if (cpi->twopass.kf_bits > ((3 * cpi->buffer_level) >> 2))
-                cpi->twopass.kf_bits = (3 * cpi->buffer_level) >> 2;
+            if (cpi->twopass.kf_bits > (int)((3 * cpi->buffer_level) >> 2))
+                cpi->twopass.kf_bits = (int)((3 * cpi->buffer_level) >> 2);
         }
 
-        // If the key frame is actually easier than the average for the
-        // kf group (which does sometimes happen... eg a blank intro frame)
-        // Then use an alternate calculation based on the kf error score
-        // which should give a smaller key frame.
+        /* If the key frame is actually easier than the average for the
+         * kf group (which does sometimes happen... eg a blank intro frame)
+         * Then use an alternate calculation based on the kf error score
+         * which should give a smaller key frame.
+         */
         if (kf_mod_err < kf_group_err / cpi->twopass.frames_to_key)
         {
             double  alt_kf_grp_bits =
@@ -3023,9 +3154,10 @@
                 cpi->twopass.kf_bits = alt_kf_bits;
             }
         }
-        // Else if it is much harder than other frames in the group make sure
-        // it at least receives an allocation in keeping with its relative
-        // error score
+        /* Else if it is much harder than other frames in the group make sure
+         * it at least receives an allocation in keeping with its relative
+         * error score
+         */
         else
         {
             alt_kf_bits =
@@ -3040,17 +3172,23 @@
         }
 
         cpi->twopass.kf_group_bits -= cpi->twopass.kf_bits;
-        cpi->twopass.kf_bits += cpi->min_frame_bandwidth;                                          // Add in the minimum frame allowance
+        /* Add in the minimum frame allowance */
+        cpi->twopass.kf_bits += cpi->min_frame_bandwidth;
 
-        cpi->per_frame_bandwidth = cpi->twopass.kf_bits;                                           // Peer frame bit target for this frame
-        cpi->target_bandwidth = cpi->twopass.kf_bits * cpi->output_frame_rate;                      // Convert to a per second bitrate
+        /* Per frame bit target for this frame */
+        cpi->per_frame_bandwidth = cpi->twopass.kf_bits;
+
+        /* Convert to a per second bitrate */
+        cpi->target_bandwidth = (int)(cpi->twopass.kf_bits *
+                                      cpi->output_frame_rate);
     }
 
-    // Note the total error score of the kf group minus the key frame itself
+    /* Note the total error score of the kf group minus the key frame itself */
     cpi->twopass.kf_group_error_left = (int)(kf_group_err - kf_mod_err);
 
-    // Adjust the count of total modified error left.
-    // The count of bits left is adjusted elsewhere based on real coded frame sizes
+    /* Adjust the count of total modified error left. The count of bits left
+     * is adjusted elsewhere based on real coded frame sizes
+     */
     cpi->twopass.modified_error_left -= kf_group_err;
 
     if (cpi->oxcf.allow_spatial_resampling)
@@ -3063,7 +3201,7 @@
         int new_width = cpi->oxcf.Width;
         int new_height = cpi->oxcf.Height;
 
-        int projected_buffer_level = cpi->buffer_level;
+        int projected_buffer_level = (int)cpi->buffer_level;
         int tmp_q;
 
         double projected_bits_perframe;
@@ -3076,40 +3214,47 @@
         if ((cpi->common.Width != cpi->oxcf.Width) || (cpi->common.Height != cpi->oxcf.Height))
             last_kf_resampled = 1;
 
-        // Set back to unscaled by defaults
+        /* Set back to unscaled by defaults */
         cpi->common.horiz_scale = NORMAL;
         cpi->common.vert_scale = NORMAL;
 
-        // Calculate Average bits per frame.
-        //av_bits_per_frame = cpi->twopass.bits_left/(double)(cpi->twopass.total_stats.count - cpi->common.current_video_frame);
+        /* Calculate Average bits per frame. */
         av_bits_per_frame = cpi->oxcf.target_bandwidth / DOUBLE_DIVIDE_CHECK((double)cpi->frame_rate);
-        //if ( av_bits_per_frame < 0.0 )
-        //  av_bits_per_frame = 0.0
 
-        // CBR... Use the clip average as the target for deciding resample
+        /* CBR... Use the clip average as the target for deciding resample */
         if (cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER)
         {
             bits_per_frame = av_bits_per_frame;
         }
 
-        // In VBR we want to avoid downsampling in easy section unless we are under extreme pressure
-        // So use the larger of target bitrate for this sectoion or average bitrate for sequence
+        /* In VBR we want to avoid downsampling in easy section unless we
+         * are under extreme pressure. So use the larger of target bitrate
+         * for this section or average bitrate for sequence
+         */
         else
         {
-            bits_per_frame = cpi->twopass.kf_group_bits / cpi->twopass.frames_to_key;     // This accounts for how hard the section is...
+            /* This accounts for how hard the section is... */
+            bits_per_frame = (double)
+                (cpi->twopass.kf_group_bits / cpi->twopass.frames_to_key);
 
-            if (bits_per_frame < av_bits_per_frame)                      // Dont turn to resampling in easy sections just because they have been assigned a small number of bits
+            /* Don't turn to resampling in easy sections just because they
+             * have been assigned a small number of bits
+             */
+            if (bits_per_frame < av_bits_per_frame)
                 bits_per_frame = av_bits_per_frame;
         }
 
-        // bits_per_frame should comply with our minimum
+        /* bits_per_frame should comply with our minimum */
         if (bits_per_frame < (cpi->oxcf.target_bandwidth * cpi->oxcf.two_pass_vbrmin_section / 100))
             bits_per_frame = (cpi->oxcf.target_bandwidth * cpi->oxcf.two_pass_vbrmin_section / 100);
 
-        // Work out if spatial resampling is necessary
-        kf_q = estimate_kf_group_q(cpi, err_per_frame, bits_per_frame, group_iiratio);
+        /* Work out if spatial resampling is necessary */
+        kf_q = estimate_kf_group_q(cpi, err_per_frame,
+                                  (int)bits_per_frame, group_iiratio);
 
-        // If we project a required Q higher than the maximum allowed Q then make a guess at the actual size of frames in this section
+        /* If we project a required Q higher than the maximum allowed Q then
+         * make a guess at the actual size of frames in this section
+         */
         projected_bits_perframe = bits_per_frame;
         tmp_q = kf_q;
 
@@ -3119,8 +3264,11 @@
             tmp_q--;
         }
 
-        // Guess at buffer level at the end of the section
-        projected_buffer_level = cpi->buffer_level - (int)((projected_bits_perframe - av_bits_per_frame) * cpi->twopass.frames_to_key);
+        /* Guess at buffer level at the end of the section */
+        projected_buffer_level = (int)
+                    (cpi->buffer_level - (int)
+                    ((projected_bits_perframe - av_bits_per_frame) *
+                    cpi->twopass.frames_to_key));
 
         if (0)
         {
@@ -3129,15 +3277,17 @@
             fclose(f);
         }
 
-        // The trigger for spatial resampling depends on the various parameters such as whether we are streaming (CBR) or VBR.
+        /* The trigger for spatial resampling depends on the various
+         * parameters such as whether we are streaming (CBR) or VBR.
+         */
         if (cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER)
         {
-            // Trigger resample if we are projected to fall below down sample level or
-            // resampled last time and are projected to remain below the up sample level
+            /* Trigger resample if we are projected to fall below down
+             * sample level or resampled last time and are projected to
+             * remain below the up sample level
+             */
             if ((projected_buffer_level < (cpi->oxcf.resample_down_water_mark * cpi->oxcf.optimal_buffer_level / 100)) ||
                 (last_kf_resampled && (projected_buffer_level < (cpi->oxcf.resample_up_water_mark * cpi->oxcf.optimal_buffer_level / 100))))
-                //( ((cpi->buffer_level < (cpi->oxcf.resample_down_water_mark * cpi->oxcf.optimal_buffer_level / 100))) &&
-                //  ((projected_buffer_level < (cpi->oxcf.resample_up_water_mark * cpi->oxcf.optimal_buffer_level / 100))) ))
                 resample_trigger = 1;
             else
                 resample_trigger = 0;
@@ -3147,9 +3297,15 @@
             int64_t clip_bits = (int64_t)(cpi->twopass.total_stats.count * cpi->oxcf.target_bandwidth / DOUBLE_DIVIDE_CHECK((double)cpi->frame_rate));
             int64_t over_spend = cpi->oxcf.starting_buffer_level - cpi->buffer_level;
 
-            if ((last_kf_resampled && (kf_q > cpi->worst_quality)) ||                                               // If triggered last time the threshold for triggering again is reduced
-                ((kf_q > cpi->worst_quality) &&                                                                  // Projected Q higher than allowed and ...
-                 (over_spend > clip_bits / 20)))                                                               // ... Overspend > 5% of total bits
+            /* If triggered last time the threshold for triggering again is
+             * reduced:
+             *
+             * Projected Q higher than allowed and Overspend > 5% of total
+             * bits
+             */
+            if ((last_kf_resampled && (kf_q > cpi->worst_quality)) ||
+                ((kf_q > cpi->worst_quality) &&
+                 (over_spend > clip_bits / 20)))
                 resample_trigger = 1;
             else
                 resample_trigger = 0;
@@ -3171,13 +3327,19 @@
                 new_width = ((hs - 1) + (cpi->oxcf.Width * hr)) / hs;
                 new_height = ((vs - 1) + (cpi->oxcf.Height * vr)) / vs;
 
-                // Reducing the area to 1/4 does not reduce the complexity (err_per_frame) to 1/4...
-                // effective_sizeratio attempts to provide a crude correction for this
+                /* Reducing the area to 1/4 does not reduce the complexity
+                 * (err_per_frame) to 1/4... effective_sizeratio attempts
+                 * to provide a crude correction for this
+                 */
                 effective_size_ratio = (double)(new_width * new_height) / (double)(cpi->oxcf.Width * cpi->oxcf.Height);
                 effective_size_ratio = (1.0 + (3.0 * effective_size_ratio)) / 4.0;
 
-                // Now try again and see what Q we get with the smaller image size
-                kf_q = estimate_kf_group_q(cpi, err_per_frame * effective_size_ratio, bits_per_frame, group_iiratio);
+                /* Now try again and see what Q we get with the smaller
+                 * image size
+                 */
+                kf_q = estimate_kf_group_q(cpi,
+                                          err_per_frame * effective_size_ratio,
+                                          (int)bits_per_frame, group_iiratio);
 
                 if (0)
                 {
diff --git a/vp8/encoder/lookahead.c b/vp8/encoder/lookahead.c
index 4c92281..ce2ce08 100644
--- a/vp8/encoder/lookahead.c
+++ b/vp8/encoder/lookahead.c
@@ -118,10 +118,11 @@
     ctx->sz++;
     buf = pop(ctx, &ctx->write_idx);
 
-    // Only do this partial copy if the following conditions are all met:
-    // 1. Lookahead queue has has size of 1.
-    // 2. Active map is provided.
-    // 3. This is not a key frame, golden nor altref frame.
+    /* Only do this partial copy if the following conditions are all met:
+     * 1. Lookahead queue has size of 1.
+     * 2. Active map is provided.
+     * 3. This is not a key frame, golden nor altref frame.
+     */
     if (ctx->max_sz == 1 && active_map && !flags)
     {
         for (row = 0; row < mb_rows; ++row)
@@ -130,18 +131,18 @@
 
             while (1)
             {
-                // Find the first active macroblock in this row.
+                /* Find the first active macroblock in this row. */
                 for (; col < mb_cols; ++col)
                 {
                     if (active_map[col])
                         break;
                 }
 
-                // No more active macroblock in this row.
+                /* No more active macroblock in this row. */
                 if (col == mb_cols)
                     break;
 
-                // Find the end of active region in this row.
+                /* Find the end of active region in this row. */
                 active_end = col;
 
                 for (; active_end < mb_cols; ++active_end)
@@ -150,13 +151,13 @@
                         break;
                 }
 
-                // Only copy this active region.
+                /* Only copy this active region. */
                 vp8_copy_and_extend_frame_with_rect(src, &buf->img,
                                                     row << 4,
                                                     col << 4, 16,
                                                     (active_end - col) << 4);
 
-                // Start again from the end of this active region.
+                /* Start again from the end of this active region. */
                 col = active_end;
             }
 
diff --git a/vp8/encoder/mcomp.c b/vp8/encoder/mcomp.c
index dc0edfb..b08c7a5 100644
--- a/vp8/encoder/mcomp.c
+++ b/vp8/encoder/mcomp.c
@@ -25,16 +25,19 @@
 
 int vp8_mv_bit_cost(int_mv *mv, int_mv *ref, int *mvcost[2], int Weight)
 {
-    // MV costing is based on the distribution of vectors in the previous frame and as such will tend to
-    // over state the cost of vectors. In addition coding a new vector can have a knock on effect on the
-    // cost of subsequent vectors and the quality of prediction from NEAR and NEAREST for subsequent blocks.
-    // The "Weight" parameter allows, to a limited extent, for some account to be taken of these factors.
+    /* MV costing is based on the distribution of vectors in the previous
+     * frame and as such will tend to over state the cost of vectors. In
+     * addition coding a new vector can have a knock on effect on the cost
+     * of subsequent vectors and the quality of prediction from NEAR and
+     * NEAREST for subsequent blocks. The "Weight" parameter allows, to a
+     * limited extent, for some account to be taken of these factors.
+     */
     return ((mvcost[0][(mv->as_mv.row - ref->as_mv.row) >> 1] + mvcost[1][(mv->as_mv.col - ref->as_mv.col) >> 1]) * Weight) >> 7;
 }
 
 static int mv_err_cost(int_mv *mv, int_mv *ref, int *mvcost[2], int error_per_bit)
 {
-    // Ignore mv costing if mvcost is NULL
+    /* Ignore mv costing if mvcost is NULL */
     if (mvcost)
         return ((mvcost[0][(mv->as_mv.row - ref->as_mv.row) >> 1] +
                  mvcost[1][(mv->as_mv.col - ref->as_mv.col) >> 1])
@@ -44,8 +47,8 @@
 
 static int mvsad_err_cost(int_mv *mv, int_mv *ref, int *mvsadcost[2], int error_per_bit)
 {
-    // Calculate sad error cost on full pixel basis.
-    // Ignore mv costing if mvsadcost is NULL
+    /* Calculate sad error cost on full pixel basis. */
+    /* Ignore mv costing if mvsadcost is NULL */
     if (mvsadcost)
         return ((mvsadcost[0][(mv->as_mv.row - ref->as_mv.row)] +
                  mvsadcost[1][(mv->as_mv.col - ref->as_mv.col)])
@@ -59,7 +62,7 @@
     int search_site_count = 0;
 
 
-    // Generate offsets for 4 search sites per step.
+    /* Generate offsets for 4 search sites per step. */
     Len = MAX_FIRST_STEP;
     x->ss[search_site_count].mv.col = 0;
     x->ss[search_site_count].mv.row = 0;
@@ -69,31 +72,31 @@
     while (Len > 0)
     {
 
-        // Compute offsets for search sites.
+        /* Compute offsets for search sites. */
         x->ss[search_site_count].mv.col = 0;
         x->ss[search_site_count].mv.row = -Len;
         x->ss[search_site_count].offset = -Len * stride;
         search_site_count++;
 
-        // Compute offsets for search sites.
+        /* Compute offsets for search sites. */
         x->ss[search_site_count].mv.col = 0;
         x->ss[search_site_count].mv.row = Len;
         x->ss[search_site_count].offset = Len * stride;
         search_site_count++;
 
-        // Compute offsets for search sites.
+        /* Compute offsets for search sites. */
         x->ss[search_site_count].mv.col = -Len;
         x->ss[search_site_count].mv.row = 0;
         x->ss[search_site_count].offset = -Len;
         search_site_count++;
 
-        // Compute offsets for search sites.
+        /* Compute offsets for search sites. */
         x->ss[search_site_count].mv.col = Len;
         x->ss[search_site_count].mv.row = 0;
         x->ss[search_site_count].offset = Len;
         search_site_count++;
 
-        // Contract.
+        /* Contract. */
         Len /= 2;
     }
 
@@ -106,7 +109,7 @@
     int Len;
     int search_site_count = 0;
 
-    // Generate offsets for 8 search sites per step.
+    /* Generate offsets for 8 search sites per step. */
     Len = MAX_FIRST_STEP;
     x->ss[search_site_count].mv.col = 0;
     x->ss[search_site_count].mv.row = 0;
@@ -116,56 +119,56 @@
     while (Len > 0)
     {
 
-        // Compute offsets for search sites.
+        /* Compute offsets for search sites. */
         x->ss[search_site_count].mv.col = 0;
         x->ss[search_site_count].mv.row = -Len;
         x->ss[search_site_count].offset = -Len * stride;
         search_site_count++;
 
-        // Compute offsets for search sites.
+        /* Compute offsets for search sites. */
         x->ss[search_site_count].mv.col = 0;
         x->ss[search_site_count].mv.row = Len;
         x->ss[search_site_count].offset = Len * stride;
         search_site_count++;
 
-        // Compute offsets for search sites.
+        /* Compute offsets for search sites. */
         x->ss[search_site_count].mv.col = -Len;
         x->ss[search_site_count].mv.row = 0;
         x->ss[search_site_count].offset = -Len;
         search_site_count++;
 
-        // Compute offsets for search sites.
+        /* Compute offsets for search sites. */
         x->ss[search_site_count].mv.col = Len;
         x->ss[search_site_count].mv.row = 0;
         x->ss[search_site_count].offset = Len;
         search_site_count++;
 
-        // Compute offsets for search sites.
+        /* Compute offsets for search sites. */
         x->ss[search_site_count].mv.col = -Len;
         x->ss[search_site_count].mv.row = -Len;
         x->ss[search_site_count].offset = -Len * stride - Len;
         search_site_count++;
 
-        // Compute offsets for search sites.
+        /* Compute offsets for search sites. */
         x->ss[search_site_count].mv.col = Len;
         x->ss[search_site_count].mv.row = -Len;
         x->ss[search_site_count].offset = -Len * stride + Len;
         search_site_count++;
 
-        // Compute offsets for search sites.
+        /* Compute offsets for search sites. */
         x->ss[search_site_count].mv.col = -Len;
         x->ss[search_site_count].mv.row = Len;
         x->ss[search_site_count].offset = Len * stride - Len;
         search_site_count++;
 
-        // Compute offsets for search sites.
+        /* Compute offsets for search sites. */
         x->ss[search_site_count].mv.col = Len;
         x->ss[search_site_count].mv.row = Len;
         x->ss[search_site_count].offset = Len * stride + Len;
         search_site_count++;
 
 
-        // Contract.
+        /* Contract. */
         Len /= 2;
     }
 
@@ -182,13 +185,20 @@
  * 32 cols area that is enough for 16x16 macroblock. Later, for SPLITMV, we
  * could reduce the area.
  */
-#define MVC(r,c) (mvcost ? ((mvcost[0][(r)-rr] + mvcost[1][(c) - rc]) * error_per_bit + 128 )>>8 : 0) // estimated cost of a motion vector (r,c)
-#define PRE(r,c) (y + (((r)>>2) * y_stride + ((c)>>2) -(offset))) // pointer to predictor base of a motionvector
-#define SP(x) (((x)&3)<<1) // convert motion vector component to offset for svf calc
-#define DIST(r,c) vfp->svf( PRE(r,c), y_stride, SP(c),SP(r), z,b->src_stride,&sse) // returns subpixel variance error function.
+
+/* estimated cost of a motion vector (r,c) */
+#define MVC(r,c) (mvcost ? ((mvcost[0][(r)-rr] + mvcost[1][(c) - rc]) * error_per_bit + 128 )>>8 : 0)
+/* pointer to predictor base of a motionvector */
+#define PRE(r,c) (y + (((r)>>2) * y_stride + ((c)>>2) -(offset)))
+/* convert motion vector component to offset for svf calc */
+#define SP(x) (((x)&3)<<1)
+/* returns subpixel variance error function. */
+#define DIST(r,c) vfp->svf( PRE(r,c), y_stride, SP(c),SP(r), z,b->src_stride,&sse)
 #define IFMVCV(r,c,s,e) if ( c >= minc && c <= maxc && r >= minr && r <= maxr) s else e;
-#define ERR(r,c) (MVC(r,c)+DIST(r,c)) // returns distortion + motion vector cost
-#define CHECK_BETTER(v,r,c) IFMVCV(r,c,{thismse = DIST(r,c); if((v = (MVC(r,c)+thismse)) < besterr) { besterr = v; br=r; bc=c; *distortion = thismse; *sse1 = sse; }}, v=INT_MAX;)// checks if (r,c) has better score than previous best
+/* returns distortion + motion vector cost */
+#define ERR(r,c) (MVC(r,c)+DIST(r,c))
+/* checks if (r,c) has better score than previous best */
+#define CHECK_BETTER(v,r,c) IFMVCV(r,c,{thismse = DIST(r,c); if((v = (MVC(r,c)+thismse)) < besterr) { besterr = v; br=r; bc=c; *distortion = thismse; *sse1 = sse; }}, v=UINT_MAX;)
 
 int vp8_find_best_sub_pixel_step_iteratively(MACROBLOCK *x, BLOCK *b, BLOCKD *d,
                                              int_mv *bestmv, int_mv *ref_mv,
@@ -202,7 +212,7 @@
     int rr = ref_mv->as_mv.row >> 1, rc = ref_mv->as_mv.col >> 1;
     int br = bestmv->as_mv.row << 2, bc = bestmv->as_mv.col << 2;
     int tr = br, tc = bc;
-    unsigned int besterr = INT_MAX;
+    unsigned int besterr;
     unsigned int left, right, up, down, diag;
     unsigned int sse;
     unsigned int whichdir;
@@ -227,7 +237,7 @@
     unsigned char *y;
     int buf_r1, buf_r2, buf_c1, buf_c2;
 
-    // Clamping to avoid out-of-range data access
+    /* Clamping to avoid out-of-range data access */
     buf_r1 = ((bestmv->as_mv.row - 3) < x->mv_row_min)?(bestmv->as_mv.row - x->mv_row_min):3;
     buf_r2 = ((bestmv->as_mv.row + 3) > x->mv_row_max)?(x->mv_row_max - bestmv->as_mv.row):3;
     buf_c1 = ((bestmv->as_mv.col - 3) < x->mv_col_min)?(bestmv->as_mv.col - x->mv_col_min):3;
@@ -244,19 +254,21 @@
 
     offset = (bestmv->as_mv.row) * y_stride + bestmv->as_mv.col;
 
-    // central mv
+    /* central mv */
     bestmv->as_mv.row <<= 3;
     bestmv->as_mv.col <<= 3;
 
-    // calculate central point error
+    /* calculate central point error */
     besterr = vfp->vf(y, y_stride, z, b->src_stride, sse1);
     *distortion = besterr;
     besterr += mv_err_cost(bestmv, ref_mv, mvcost, error_per_bit);
 
-    // TODO: Each subsequent iteration checks at least one point in common with the last iteration could be 2 ( if diag selected)
+    /* TODO: Each subsequent iteration checks at least one point in common
+     * with the last iteration. It could be 2 (if diag selected).
+     */
     while (--halfiters)
     {
-        // 1/2 pel
+        /* 1/2 pel */
         CHECK_BETTER(left, tr, tc - 2);
         CHECK_BETTER(right, tr, tc + 2);
         CHECK_BETTER(up, tr - 2, tc);
@@ -280,7 +292,7 @@
             break;
         }
 
-        // no reason to check the same one again.
+        /* no reason to check the same one again. */
         if (tr == br && tc == bc)
             break;
 
@@ -288,8 +300,11 @@
         tc = bc;
     }
 
-    // TODO: Each subsequent iteration checks at least one point in common with the last iteration could be 2 ( if diag selected)
-    // 1/4 pel
+    /* TODO: Each subsequent iteration checks at least one point in common
+     * with the last iteration. It could be 2 (if diag selected).
+     */
+
+    /* 1/4 pel */
     while (--quarteriters)
     {
         CHECK_BETTER(left, tr, tc - 1);
@@ -315,7 +330,7 @@
             break;
         }
 
-        // no reason to check the same one again.
+        /* no reason to check the same one again. */
         if (tr == br && tc == bc)
             break;
 
@@ -373,17 +388,17 @@
      y_stride = pre_stride;
 #endif
 
-    // central mv
+    /* central mv */
     bestmv->as_mv.row <<= 3;
     bestmv->as_mv.col <<= 3;
     startmv = *bestmv;
 
-    // calculate central point error
+    /* calculate central point error */
     bestmse = vfp->vf(y, y_stride, z, b->src_stride, sse1);
     *distortion = bestmse;
     bestmse += mv_err_cost(bestmv, ref_mv, mvcost, error_per_bit);
 
-    // go left then right and check error
+    /* go left then right and check error */
     this_mv.as_mv.row = startmv.as_mv.row;
     this_mv.as_mv.col = ((startmv.as_mv.col - 8) | 4);
     thismse = vfp->svf_halfpix_h(y - 1, y_stride, z, b->src_stride, &sse);
@@ -409,7 +424,7 @@
         *sse1 = sse;
     }
 
-    // go up then down and check error
+    /* go up then down and check error */
     this_mv.as_mv.col = startmv.as_mv.col;
     this_mv.as_mv.row = ((startmv.as_mv.row - 8) | 4);
     thismse =  vfp->svf_halfpix_v(y - y_stride, y_stride, z, b->src_stride, &sse);
@@ -436,10 +451,8 @@
     }
 
 
-    // now check 1 more diagonal
+    /* now check 1 more diagonal */
     whichdir = (left < right ? 0 : 1) + (up < down ? 0 : 2);
-    //for(whichdir =0;whichdir<4;whichdir++)
-    //{
     this_mv = startmv;
 
     switch (whichdir)
@@ -477,10 +490,8 @@
         *sse1 = sse;
     }
 
-//  }
 
-
-    // time to check quarter pels.
+    /* time to check quarter pels. */
     if (bestmv->as_mv.row < startmv.as_mv.row)
         y -= y_stride;
 
@@ -491,7 +502,7 @@
 
 
 
-    // go left then right and check error
+    /* go left then right and check error */
     this_mv.as_mv.row = startmv.as_mv.row;
 
     if (startmv.as_mv.col & 7)
@@ -527,7 +538,7 @@
         *sse1 = sse;
     }
 
-    // go up then down and check error
+    /* go up then down and check error */
     this_mv.as_mv.col = startmv.as_mv.col;
 
     if (startmv.as_mv.row & 7)
@@ -564,11 +575,9 @@
     }
 
 
-    // now check 1 more diagonal
+    /* now check 1 more diagonal */
     whichdir = (left < right ? 0 : 1) + (up < down ? 0 : 2);
 
-//  for(whichdir=0;whichdir<4;whichdir++)
-//  {
     this_mv = startmv;
 
     switch (whichdir)
@@ -690,17 +699,17 @@
     y_stride = pre_stride;
 #endif
 
-    // central mv
+    /* central mv */
     bestmv->as_mv.row <<= 3;
     bestmv->as_mv.col <<= 3;
     startmv = *bestmv;
 
-    // calculate central point error
+    /* calculate central point error */
     bestmse = vfp->vf(y, y_stride, z, b->src_stride, sse1);
     *distortion = bestmse;
     bestmse += mv_err_cost(bestmv, ref_mv, mvcost, error_per_bit);
 
-    // go left then right and check error
+    /* go left then right and check error */
     this_mv.as_mv.row = startmv.as_mv.row;
     this_mv.as_mv.col = ((startmv.as_mv.col - 8) | 4);
     thismse = vfp->svf_halfpix_h(y - 1, y_stride, z, b->src_stride, &sse);
@@ -726,7 +735,7 @@
         *sse1 = sse;
     }
 
-    // go up then down and check error
+    /* go up then down and check error */
     this_mv.as_mv.col = startmv.as_mv.col;
     this_mv.as_mv.row = ((startmv.as_mv.row - 8) | 4);
     thismse = vfp->svf_halfpix_v(y - y_stride, y_stride, z, b->src_stride, &sse);
@@ -752,7 +761,7 @@
         *sse1 = sse;
     }
 
-    // now check 1 more diagonal -
+    /* now check 1 more diagonal - */
     whichdir = (left < right ? 0 : 1) + (up < down ? 0 : 2);
     this_mv = startmv;
 
@@ -861,7 +870,7 @@
     int in_what_stride = pre_stride;
     int br, bc;
     int_mv this_mv;
-    unsigned int bestsad = 0x7fffffff;
+    unsigned int bestsad;
     unsigned int thissad;
     unsigned char *base_offset;
     unsigned char *this_offset;
@@ -875,18 +884,17 @@
     fcenter_mv.as_mv.row = center_mv->as_mv.row >> 3;
     fcenter_mv.as_mv.col = center_mv->as_mv.col >> 3;
 
-    // adjust ref_mv to make sure it is within MV range
+    /* adjust ref_mv to make sure it is within MV range */
     vp8_clamp_mv(ref_mv, x->mv_col_min, x->mv_col_max, x->mv_row_min, x->mv_row_max);
     br = ref_mv->as_mv.row;
     bc = ref_mv->as_mv.col;
 
-    // Work out the start point for the search
+    /* Work out the start point for the search */
     base_offset = (unsigned char *)(base_pre + d->offset);
     this_offset = base_offset + (br * (pre_stride)) + bc;
     this_mv.as_mv.row = br;
     this_mv.as_mv.col = bc;
-    bestsad = vfp->sdf( what, what_stride, this_offset,
-                        in_what_stride, 0x7fffffff)
+    bestsad = vfp->sdf(what, what_stride, this_offset, in_what_stride, UINT_MAX)
             + mvsad_err_cost(&this_mv, &fcenter_mv, mvsadcost, sad_per_bit);
 
 #if CONFIG_MULTI_RES_ENCODING
@@ -901,8 +909,7 @@
     dia_range = 8;
 #endif
 
-    // hex search
-    //j=0
+    /* hex search */
     CHECK_BOUNDS(2)
 
     if(all_in)
@@ -912,7 +919,7 @@
             this_mv.as_mv.row = br + hex[i].row;
             this_mv.as_mv.col = bc + hex[i].col;
             this_offset = base_offset + (this_mv.as_mv.row * in_what_stride) + this_mv.as_mv.col;
-            thissad=vfp->sdf( what, what_stride, this_offset, in_what_stride, bestsad);
+            thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride, bestsad);
             CHECK_BETTER
         }
     }else
@@ -923,7 +930,7 @@
             this_mv.as_mv.col = bc + hex[i].col;
             CHECK_POINT
             this_offset = base_offset + (this_mv.as_mv.row * in_what_stride) + this_mv.as_mv.col;
-            thissad=vfp->sdf( what, what_stride, this_offset, in_what_stride, bestsad);
+            thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride, bestsad);
             CHECK_BETTER
         }
     }
@@ -949,7 +956,7 @@
                 this_mv.as_mv.row = br + next_chkpts[k][i].row;
                 this_mv.as_mv.col = bc + next_chkpts[k][i].col;
                 this_offset = base_offset + (this_mv.as_mv.row * (in_what_stride)) + this_mv.as_mv.col;
-                thissad = vfp->sdf( what, what_stride, this_offset, in_what_stride, bestsad);
+                thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride, bestsad);
                 CHECK_BETTER
             }
         }else
@@ -960,7 +967,7 @@
                 this_mv.as_mv.col = bc + next_chkpts[k][i].col;
                 CHECK_POINT
                 this_offset = base_offset + (this_mv.as_mv.row * (in_what_stride)) + this_mv.as_mv.col;
-                thissad = vfp->sdf( what, what_stride, this_offset, in_what_stride, bestsad);
+                thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride, bestsad);
                 CHECK_BETTER
             }
         }
@@ -977,7 +984,7 @@
         }
     }
 
-    // check 4 1-away neighbors
+    /* check 4 1-away neighbors */
 cal_neighbors:
     for (j = 0; j < dia_range; j++)
     {
@@ -991,7 +998,7 @@
                 this_mv.as_mv.row = br + neighbors[i].row;
                 this_mv.as_mv.col = bc + neighbors[i].col;
                 this_offset = base_offset + (this_mv.as_mv.row * (in_what_stride)) + this_mv.as_mv.col;
-                thissad = vfp->sdf( what, what_stride, this_offset, in_what_stride, bestsad);
+                thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride, bestsad);
                 CHECK_BETTER
             }
         }else
@@ -1002,7 +1009,7 @@
                 this_mv.as_mv.col = bc + neighbors[i].col;
                 CHECK_POINT
                 this_offset = base_offset + (this_mv.as_mv.row * (in_what_stride)) + this_mv.as_mv.col;
-                thissad = vfp->sdf( what, what_stride, this_offset, in_what_stride, bestsad);
+                thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride, bestsad);
                 CHECK_BETTER
             }
         }
@@ -1053,7 +1060,8 @@
     int tot_steps;
     int_mv this_mv;
 
-    int bestsad = INT_MAX;
+    unsigned int bestsad;
+    unsigned int thissad;
     int best_site = 0;
     int last_site = 0;
 
@@ -1064,10 +1072,12 @@
     search_site *ss;
 
     unsigned char *check_here;
-    int thissad;
 
-    int *mvsadcost[2] = {x->mvsadcost[0], x->mvsadcost[1]};
+    int *mvsadcost[2];
     int_mv fcenter_mv;
+
+    mvsadcost[0] = x->mvsadcost[0];
+    mvsadcost[1] = x->mvsadcost[1];
     fcenter_mv.as_mv.row = center_mv->as_mv.row >> 3;
     fcenter_mv.as_mv.col = center_mv->as_mv.col >> 3;
 
@@ -1078,17 +1088,18 @@
     best_mv->as_mv.row = ref_row;
     best_mv->as_mv.col = ref_col;
 
-    // Work out the start point for the search
+    /* Work out the start point for the search */
     in_what = (unsigned char *)(base_pre + d->offset + (ref_row * pre_stride) + ref_col);
     best_address = in_what;
 
-    // Check the starting position
-    bestsad = fn_ptr->sdf(what, what_stride, in_what,
-                          in_what_stride, 0x7fffffff)
-              + mvsad_err_cost(best_mv, &fcenter_mv, mvsadcost, sad_per_bit);
+    /* Check the starting position */
+    bestsad = fn_ptr->sdf(what, what_stride, in_what, in_what_stride, UINT_MAX)
+            + mvsad_err_cost(best_mv, &fcenter_mv, mvsadcost, sad_per_bit);
 
-    // search_param determines the length of the initial step and hence the number of iterations
-    // 0 = initial step (MAX_FIRST_STEP) pel : 1 = (MAX_FIRST_STEP/2) pel, 2 = (MAX_FIRST_STEP/4) pel... etc.
+    /* search_param determines the length of the initial step and hence
+     * the number of iterations 0 = initial step (MAX_FIRST_STEP) pel :
+     * 1 = (MAX_FIRST_STEP/2) pel, 2 = (MAX_FIRST_STEP/4) pel... etc.
+     */
     ss = &x->ss[search_param * x->searches_per_step];
     tot_steps = (x->ss_count / x->searches_per_step) - search_param;
 
@@ -1098,7 +1109,7 @@
     {
         for (j = 0 ; j < x->searches_per_step ; j++)
         {
-            // Trap illegal vectors
+            /* Trap illegal vectors */
             this_row_offset = best_mv->as_mv.row + ss[i].mv.row;
             this_col_offset = best_mv->as_mv.col + ss[i].mv.col;
 
@@ -1107,14 +1118,14 @@
 
             {
                 check_here = ss[i].offset + best_address;
-                thissad = fn_ptr->sdf(what, what_stride, check_here , in_what_stride, bestsad);
+                thissad = fn_ptr->sdf(what, what_stride, check_here, in_what_stride, bestsad);
 
                 if (thissad < bestsad)
                 {
                     this_mv.as_mv.row = this_row_offset;
                     this_mv.as_mv.col = this_col_offset;
                     thissad += mvsad_err_cost(&this_mv, &fcenter_mv,
-                                                mvsadcost, sad_per_bit);
+                                              mvsadcost, sad_per_bit);
 
                     if (thissad < bestsad)
                     {
@@ -1141,11 +1152,8 @@
     this_mv.as_mv.row = best_mv->as_mv.row << 3;
     this_mv.as_mv.col = best_mv->as_mv.col << 3;
 
-    if (bestsad == INT_MAX)
-        return INT_MAX;
-
-    return fn_ptr->vf(what, what_stride, best_address, in_what_stride, (unsigned int *)(&thissad))
-        + mv_err_cost(&this_mv, center_mv, mvcost, x->errorperbit);
+    return fn_ptr->vf(what, what_stride, best_address, in_what_stride, &thissad)
+           + mv_err_cost(&this_mv, center_mv, mvcost, x->errorperbit);
 }
 
 int vp8_diamond_search_sadx4
@@ -1176,7 +1184,8 @@
     int tot_steps;
     int_mv this_mv;
 
-    unsigned int bestsad = UINT_MAX;
+    unsigned int bestsad;
+    unsigned int thissad;
     int best_site = 0;
     int last_site = 0;
 
@@ -1187,10 +1196,12 @@
     search_site *ss;
 
     unsigned char *check_here;
-    unsigned int thissad;
 
-    int *mvsadcost[2] = {x->mvsadcost[0], x->mvsadcost[1]};
+    int *mvsadcost[2];
     int_mv fcenter_mv;
+
+    mvsadcost[0] = x->mvsadcost[0];
+    mvsadcost[1] = x->mvsadcost[1];
     fcenter_mv.as_mv.row = center_mv->as_mv.row >> 3;
     fcenter_mv.as_mv.col = center_mv->as_mv.col >> 3;
 
@@ -1201,17 +1212,18 @@
     best_mv->as_mv.row = ref_row;
     best_mv->as_mv.col = ref_col;
 
-    // Work out the start point for the search
+    /* Work out the start point for the search */
     in_what = (unsigned char *)(base_pre + d->offset + (ref_row * pre_stride) + ref_col);
     best_address = in_what;
 
-    // Check the starting position
-    bestsad = fn_ptr->sdf(what, what_stride,
-                          in_what, in_what_stride, 0x7fffffff)
-              + mvsad_err_cost(best_mv, &fcenter_mv, mvsadcost, sad_per_bit);
+    /* Check the starting position */
+    bestsad = fn_ptr->sdf(what, what_stride, in_what, in_what_stride, UINT_MAX)
+            + mvsad_err_cost(best_mv, &fcenter_mv, mvsadcost, sad_per_bit);
 
-    // search_param determines the length of the initial step and hence the number of iterations
-    // 0 = initial step (MAX_FIRST_STEP) pel : 1 = (MAX_FIRST_STEP/2) pel, 2 = (MAX_FIRST_STEP/4) pel... etc.
+    /* search_param determines the length of the initial step and hence the
+     * number of iterations 0 = initial step (MAX_FIRST_STEP) pel : 1 =
+     * (MAX_FIRST_STEP/2) pel, 2 = (MAX_FIRST_STEP/4) pel... etc.
+     */
     ss = &x->ss[search_param * x->searches_per_step];
     tot_steps = (x->ss_count / x->searches_per_step) - search_param;
 
@@ -1221,8 +1233,10 @@
     {
         int all_in = 1, t;
 
-        // To know if all neighbor points are within the bounds, 4 bounds checking are enough instead of
-        // checking 4 bounds for each points.
+        /* To know if all neighbor points are within the bounds, 4 bounds
+         * checking are enough instead of checking 4 bounds for each
+         * points.
+         */
         all_in &= ((best_mv->as_mv.row + ss[i].mv.row)> x->mv_row_min);
         all_in &= ((best_mv->as_mv.row + ss[i+1].mv.row) < x->mv_row_max);
         all_in &= ((best_mv->as_mv.col + ss[i+2].mv.col) > x->mv_col_min);
@@ -1234,7 +1248,7 @@
 
             for (j = 0 ; j < x->searches_per_step ; j += 4)
             {
-                unsigned char *block_offset[4];
+                const unsigned char *block_offset[4];
 
                 for (t = 0; t < 4; t++)
                     block_offset[t] = ss[i+t].offset + best_address;
@@ -1263,7 +1277,7 @@
         {
             for (j = 0 ; j < x->searches_per_step ; j++)
             {
-                // Trap illegal vectors
+                /* Trap illegal vectors */
                 this_row_offset = best_mv->as_mv.row + ss[i].mv.row;
                 this_col_offset = best_mv->as_mv.col + ss[i].mv.col;
 
@@ -1271,14 +1285,14 @@
                 (this_row_offset > x->mv_row_min) && (this_row_offset < x->mv_row_max))
                 {
                     check_here = ss[i].offset + best_address;
-                    thissad = fn_ptr->sdf(what, what_stride, check_here , in_what_stride, bestsad);
+                    thissad = fn_ptr->sdf(what, what_stride, check_here, in_what_stride, bestsad);
 
                     if (thissad < bestsad)
                     {
                         this_mv.as_mv.row = this_row_offset;
                         this_mv.as_mv.col = this_col_offset;
                         thissad += mvsad_err_cost(&this_mv, &fcenter_mv,
-                                                   mvsadcost, sad_per_bit);
+                                                  mvsadcost, sad_per_bit);
 
                         if (thissad < bestsad)
                         {
@@ -1305,11 +1319,8 @@
     this_mv.as_mv.row = best_mv->as_mv.row << 3;
     this_mv.as_mv.col = best_mv->as_mv.col << 3;
 
-    if (bestsad == INT_MAX)
-        return INT_MAX;
-
-    return fn_ptr->vf(what, what_stride, best_address, in_what_stride, (unsigned int *)(&thissad))
-        + mv_err_cost(&this_mv, center_mv, mvcost, x->errorperbit);
+    return fn_ptr->vf(what, what_stride, best_address, in_what_stride, &thissad)
+           + mv_err_cost(&this_mv, center_mv, mvcost, x->errorperbit);
 }
 
 int vp8_full_search_sad_c(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv,
@@ -1327,11 +1338,11 @@
     unsigned char *bestaddress;
     int_mv *best_mv = &d->bmi.mv;
     int_mv this_mv;
-    int bestsad = INT_MAX;
+    unsigned int bestsad;
+    unsigned int thissad;
     int r, c;
 
     unsigned char *check_here;
-    int thissad;
 
     int ref_row = ref_mv->as_mv.row;
     int ref_col = ref_mv->as_mv.col;
@@ -1341,24 +1352,29 @@
     int col_min = ref_col - distance;
     int col_max = ref_col + distance;
 
-    int *mvsadcost[2] = {x->mvsadcost[0], x->mvsadcost[1]};
+    int *mvsadcost[2];
     int_mv fcenter_mv;
+
+    mvsadcost[0] = x->mvsadcost[0];
+    mvsadcost[1] = x->mvsadcost[1];
     fcenter_mv.as_mv.row = center_mv->as_mv.row >> 3;
     fcenter_mv.as_mv.col = center_mv->as_mv.col >> 3;
 
-    // Work out the mid point for the search
+    /* Work out the mid point for the search */
     in_what = base_pre + d->offset;
     bestaddress = in_what + (ref_row * pre_stride) + ref_col;
 
     best_mv->as_mv.row = ref_row;
     best_mv->as_mv.col = ref_col;
 
-    // Baseline value at the centre
+    /* Baseline value at the centre */
     bestsad = fn_ptr->sdf(what, what_stride, bestaddress,
-                          in_what_stride, 0x7fffffff)
-              + mvsad_err_cost(best_mv, &fcenter_mv, mvsadcost, sad_per_bit);
+                          in_what_stride, UINT_MAX)
+            + mvsad_err_cost(best_mv, &fcenter_mv, mvsadcost, sad_per_bit);
 
-    // Apply further limits to prevent us looking using vectors that stretch beyiond the UMV border
+    /* Apply further limits to prevent us looking using vectors that
+     * stretch beyond the UMV border
+     */
     if (col_min < x->mv_col_min)
         col_min = x->mv_col_min;
 
@@ -1378,11 +1394,11 @@
 
         for (c = col_min; c < col_max; c++)
         {
-            thissad = fn_ptr->sdf(what, what_stride, check_here , in_what_stride, bestsad);
+            thissad = fn_ptr->sdf(what, what_stride, check_here, in_what_stride, bestsad);
 
             this_mv.as_mv.col = c;
-            thissad  += mvsad_err_cost(&this_mv, &fcenter_mv,
-                        mvsadcost, sad_per_bit);
+            thissad += mvsad_err_cost(&this_mv, &fcenter_mv,
+                                      mvsadcost, sad_per_bit);
 
             if (thissad < bestsad)
             {
@@ -1399,11 +1415,8 @@
     this_mv.as_mv.row = best_mv->as_mv.row << 3;
     this_mv.as_mv.col = best_mv->as_mv.col << 3;
 
-    if (bestsad < INT_MAX)
-        return fn_ptr->vf(what, what_stride, bestaddress, in_what_stride, (unsigned int *)(&thissad))
-        + mv_err_cost(&this_mv, center_mv, mvcost, x->errorperbit);
-    else
-        return INT_MAX;
+    return fn_ptr->vf(what, what_stride, bestaddress, in_what_stride, &thissad)
+           + mv_err_cost(&this_mv, center_mv, mvcost, x->errorperbit);
 }
 
 int vp8_full_search_sadx3(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv,
@@ -1421,11 +1434,11 @@
     unsigned char *bestaddress;
     int_mv *best_mv = &d->bmi.mv;
     int_mv this_mv;
-    unsigned int bestsad = UINT_MAX;
+    unsigned int bestsad;
+    unsigned int thissad;
     int r, c;
 
     unsigned char *check_here;
-    unsigned int thissad;
 
     int ref_row = ref_mv->as_mv.row;
     int ref_col = ref_mv->as_mv.col;
@@ -1437,24 +1450,29 @@
 
     unsigned int sad_array[3];
 
-    int *mvsadcost[2] = {x->mvsadcost[0], x->mvsadcost[1]};
+    int *mvsadcost[2];
     int_mv fcenter_mv;
+
+    mvsadcost[0] = x->mvsadcost[0];
+    mvsadcost[1] = x->mvsadcost[1];
     fcenter_mv.as_mv.row = center_mv->as_mv.row >> 3;
     fcenter_mv.as_mv.col = center_mv->as_mv.col >> 3;
 
-    // Work out the mid point for the search
+    /* Work out the mid point for the search */
     in_what = base_pre + d->offset;
     bestaddress = in_what + (ref_row * pre_stride) + ref_col;
 
     best_mv->as_mv.row = ref_row;
     best_mv->as_mv.col = ref_col;
 
-    // Baseline value at the centre
-    bestsad = fn_ptr->sdf(what, what_stride,
-                          bestaddress, in_what_stride, 0x7fffffff)
-              + mvsad_err_cost(best_mv, &fcenter_mv, mvsadcost, sad_per_bit);
+    /* Baseline value at the centre */
+    bestsad = fn_ptr->sdf(what, what_stride, bestaddress,
+                          in_what_stride, UINT_MAX)
+            + mvsad_err_cost(best_mv, &fcenter_mv, mvsadcost, sad_per_bit);
 
-    // Apply further limits to prevent us looking using vectors that stretch beyiond the UMV border
+    /* Apply further limits to prevent us looking using vectors that stretch
+     * beyond the UMV border
+     */
     if (col_min < x->mv_col_min)
         col_min = x->mv_col_min;
 
@@ -1477,7 +1495,7 @@
         {
             int i;
 
-            fn_ptr->sdx3f(what, what_stride, check_here , in_what_stride, sad_array);
+            fn_ptr->sdx3f(what, what_stride, check_here, in_what_stride, sad_array);
 
             for (i = 0; i < 3; i++)
             {
@@ -1486,8 +1504,8 @@
                 if (thissad < bestsad)
                 {
                     this_mv.as_mv.col = c;
-                    thissad  += mvsad_err_cost(&this_mv, &fcenter_mv,
-                                                mvsadcost, sad_per_bit);
+                    thissad += mvsad_err_cost(&this_mv, &fcenter_mv,
+                                              mvsadcost, sad_per_bit);
 
                     if (thissad < bestsad)
                     {
@@ -1505,13 +1523,13 @@
 
         while (c < col_max)
         {
-            thissad = fn_ptr->sdf(what, what_stride, check_here , in_what_stride, bestsad);
+            thissad = fn_ptr->sdf(what, what_stride, check_here, in_what_stride, bestsad);
 
             if (thissad < bestsad)
             {
                 this_mv.as_mv.col = c;
-                thissad  += mvsad_err_cost(&this_mv, &fcenter_mv,
-                                            mvsadcost, sad_per_bit);
+                thissad += mvsad_err_cost(&this_mv, &fcenter_mv,
+                                          mvsadcost, sad_per_bit);
 
                 if (thissad < bestsad)
                 {
@@ -1531,11 +1549,8 @@
     this_mv.as_mv.row = best_mv->as_mv.row << 3;
     this_mv.as_mv.col = best_mv->as_mv.col << 3;
 
-    if (bestsad < INT_MAX)
-        return fn_ptr->vf(what, what_stride, bestaddress, in_what_stride, (unsigned int *)(&thissad))
-        + mv_err_cost(&this_mv, center_mv, mvcost, x->errorperbit);
-    else
-        return INT_MAX;
+    return fn_ptr->vf(what, what_stride, bestaddress, in_what_stride, &thissad)
+           + mv_err_cost(&this_mv, center_mv, mvcost, x->errorperbit);
 }
 
 int vp8_full_search_sadx8(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv,
@@ -1553,11 +1568,11 @@
     unsigned char *bestaddress;
     int_mv *best_mv = &d->bmi.mv;
     int_mv this_mv;
-    unsigned int bestsad = UINT_MAX;
+    unsigned int bestsad;
+    unsigned int thissad;
     int r, c;
 
     unsigned char *check_here;
-    unsigned int thissad;
 
     int ref_row = ref_mv->as_mv.row;
     int ref_col = ref_mv->as_mv.col;
@@ -1570,24 +1585,29 @@
     DECLARE_ALIGNED_ARRAY(16, unsigned short, sad_array8, 8);
     unsigned int sad_array[3];
 
-    int *mvsadcost[2] = {x->mvsadcost[0], x->mvsadcost[1]};
+    int *mvsadcost[2];
     int_mv fcenter_mv;
+
+    mvsadcost[0] = x->mvsadcost[0];
+    mvsadcost[1] = x->mvsadcost[1];
     fcenter_mv.as_mv.row = center_mv->as_mv.row >> 3;
     fcenter_mv.as_mv.col = center_mv->as_mv.col >> 3;
 
-    // Work out the mid point for the search
+    /* Work out the mid point for the search */
     in_what = base_pre + d->offset;
     bestaddress = in_what + (ref_row * pre_stride) + ref_col;
 
     best_mv->as_mv.row = ref_row;
     best_mv->as_mv.col = ref_col;
 
-    // Baseline value at the centre
+    /* Baseline value at the centre */
     bestsad = fn_ptr->sdf(what, what_stride,
-                          bestaddress, in_what_stride, 0x7fffffff)
-              + mvsad_err_cost(best_mv, &fcenter_mv, mvsadcost, sad_per_bit);
+                          bestaddress, in_what_stride, UINT_MAX)
+            + mvsad_err_cost(best_mv, &fcenter_mv, mvsadcost, sad_per_bit);
 
-    // Apply further limits to prevent us looking using vectors that stretch beyiond the UMV border
+    /* Apply further limits to prevent us looking using vectors that stretch
+     * beyond the UMV border
+     */
     if (col_min < x->mv_col_min)
         col_min = x->mv_col_min;
 
@@ -1610,17 +1630,17 @@
         {
             int i;
 
-            fn_ptr->sdx8f(what, what_stride, check_here , in_what_stride, sad_array8);
+            fn_ptr->sdx8f(what, what_stride, check_here, in_what_stride, sad_array8);
 
             for (i = 0; i < 8; i++)
             {
-                thissad = (unsigned int)sad_array8[i];
+                thissad = sad_array8[i];
 
                 if (thissad < bestsad)
                 {
                     this_mv.as_mv.col = c;
-                    thissad  += mvsad_err_cost(&this_mv, &fcenter_mv,
-                                                mvsadcost, sad_per_bit);
+                    thissad += mvsad_err_cost(&this_mv, &fcenter_mv,
+                                              mvsadcost, sad_per_bit);
 
                     if (thissad < bestsad)
                     {
@@ -1693,11 +1713,8 @@
     this_mv.as_mv.row = best_mv->as_mv.row << 3;
     this_mv.as_mv.col = best_mv->as_mv.col << 3;
 
-    if (bestsad < INT_MAX)
-        return fn_ptr->vf(what, what_stride, bestaddress, in_what_stride, (unsigned int *)(&thissad))
-        + mv_err_cost(&this_mv, center_mv, mvcost, x->errorperbit);
-    else
-        return INT_MAX;
+    return fn_ptr->vf(what, what_stride, bestaddress, in_what_stride, &thissad)
+           + mv_err_cost(&this_mv, center_mv, mvcost, x->errorperbit);
 }
 
 int vp8_refining_search_sad_c(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv,
@@ -1717,17 +1734,21 @@
     unsigned char *best_address = (unsigned char *)(base_pre + d->offset +
         (ref_mv->as_mv.row * pre_stride) + ref_mv->as_mv.col);
     unsigned char *check_here;
-    unsigned int thissad;
     int_mv this_mv;
-    unsigned int bestsad = INT_MAX;
+    unsigned int bestsad;
+    unsigned int thissad;
 
-    int *mvsadcost[2] = {x->mvsadcost[0], x->mvsadcost[1]};
+    int *mvsadcost[2];
     int_mv fcenter_mv;
 
+    mvsadcost[0] = x->mvsadcost[0];
+    mvsadcost[1] = x->mvsadcost[1];
     fcenter_mv.as_mv.row = center_mv->as_mv.row >> 3;
     fcenter_mv.as_mv.col = center_mv->as_mv.col >> 3;
 
-    bestsad = fn_ptr->sdf(what, what_stride, best_address, in_what_stride, 0x7fffffff) + mvsad_err_cost(ref_mv, &fcenter_mv, mvsadcost, error_per_bit);
+    bestsad = fn_ptr->sdf(what, what_stride, best_address,
+                          in_what_stride, UINT_MAX)
+            + mvsad_err_cost(ref_mv, &fcenter_mv, mvsadcost, error_per_bit);
 
     for (i=0; i<search_range; i++)
     {
@@ -1772,11 +1793,8 @@
     this_mv.as_mv.row = ref_mv->as_mv.row << 3;
     this_mv.as_mv.col = ref_mv->as_mv.col << 3;
 
-    if (bestsad < INT_MAX)
-        return fn_ptr->vf(what, what_stride, best_address, in_what_stride, (unsigned int *)(&thissad))
-        + mv_err_cost(&this_mv, center_mv, mvcost, x->errorperbit);
-    else
-        return INT_MAX;
+    return fn_ptr->vf(what, what_stride, best_address, in_what_stride, &thissad)
+           + mv_err_cost(&this_mv, center_mv, mvcost, x->errorperbit);
 }
 
 int vp8_refining_search_sadx4(MACROBLOCK *x, BLOCK *b, BLOCKD *d,
@@ -1796,17 +1814,21 @@
     unsigned char *best_address = (unsigned char *)(base_pre + d->offset +
         (ref_mv->as_mv.row * pre_stride) + ref_mv->as_mv.col);
     unsigned char *check_here;
-    unsigned int thissad;
     int_mv this_mv;
-    unsigned int bestsad = INT_MAX;
+    unsigned int bestsad;
+    unsigned int thissad;
 
-    int *mvsadcost[2] = {x->mvsadcost[0], x->mvsadcost[1]};
+    int *mvsadcost[2];
     int_mv fcenter_mv;
 
+    mvsadcost[0] = x->mvsadcost[0];
+    mvsadcost[1] = x->mvsadcost[1];
     fcenter_mv.as_mv.row = center_mv->as_mv.row >> 3;
     fcenter_mv.as_mv.col = center_mv->as_mv.col >> 3;
 
-    bestsad = fn_ptr->sdf(what, what_stride, best_address, in_what_stride, 0x7fffffff) + mvsad_err_cost(ref_mv, &fcenter_mv, mvsadcost, error_per_bit);
+    bestsad = fn_ptr->sdf(what, what_stride, best_address,
+                          in_what_stride, UINT_MAX)
+            + mvsad_err_cost(ref_mv, &fcenter_mv, mvsadcost, error_per_bit);
 
     for (i=0; i<search_range; i++)
     {
@@ -1821,7 +1843,7 @@
         if(all_in)
         {
             unsigned int sad_array[4];
-            unsigned char *block_offset[4];
+            const unsigned char *block_offset[4];
             block_offset[0] = best_address - in_what_stride;
             block_offset[1] = best_address - 1;
             block_offset[2] = best_address + 1;
@@ -1887,11 +1909,8 @@
     this_mv.as_mv.row = ref_mv->as_mv.row << 3;
     this_mv.as_mv.col = ref_mv->as_mv.col << 3;
 
-    if (bestsad < INT_MAX)
-        return fn_ptr->vf(what, what_stride, best_address, in_what_stride, (unsigned int *)(&thissad))
-        + mv_err_cost(&this_mv, center_mv, mvcost, x->errorperbit);
-    else
-        return INT_MAX;
+    return fn_ptr->vf(what, what_stride, best_address, in_what_stride, &thissad)
+           + mv_err_cost(&this_mv, center_mv, mvcost, x->errorperbit);
 }
 
 #ifdef ENTROPY_STATS
@@ -1906,16 +1925,16 @@
 
     for (j = 0; j < 6; j++)
     {
-        fprintf(f, "  { // %d \n", j);
+        fprintf(f, "  { /* %d */\n", j);
         fprintf(f, "    ");
 
         for (i = 0; i < 4; i++)
         {
             int overal_prob;
             int this_prob;
-            int count; // = mv_ref_ct[j][i][0]+mv_ref_ct[j][i][1];
+            int count;
 
-            // Overall probs
+            /* Overall probs */
             count = mv_mode_cts[i][0] + mv_mode_cts[i][1];
 
             if (count)
@@ -1926,7 +1945,7 @@
             if (overal_prob == 0)
                 overal_prob = 1;
 
-            // context probs
+            /* context probs */
             count = mv_ref_ct[j][i][0] + mv_ref_ct[j][i][1];
 
             if (count)
@@ -1938,8 +1957,6 @@
                 this_prob = 1;
 
             fprintf(f, "%5d, ", this_prob);
-            //fprintf(f,"%5d, %5d, %8d,", this_prob, overal_prob, (this_prob << 10)/overal_prob);
-            //fprintf(f,"%8d, ", (this_prob << 10)/overal_prob);
         }
 
         fprintf(f, "  },\n");
diff --git a/vp8/encoder/mcomp.h b/vp8/encoder/mcomp.h
index cdb0cb6..890113f 100644
--- a/vp8/encoder/mcomp.h
+++ b/vp8/encoder/mcomp.h
@@ -21,9 +21,16 @@
 #endif
 
 
-#define MAX_MVSEARCH_STEPS 8                                    // The maximum number of steps in a step search given the largest allowed initial step
-#define MAX_FULL_PEL_VAL ((1 << (MAX_MVSEARCH_STEPS)) - 1)      // Max full pel mv specified in 1 pel units
-#define MAX_FIRST_STEP (1 << (MAX_MVSEARCH_STEPS-1))            // Maximum size of the first step in full pel units
+/* The maximum number of steps in a step search given the largest allowed
+ * initial step
+ */
+#define MAX_MVSEARCH_STEPS 8
+
+/* Max full pel mv specified in 1 pel units */
+#define MAX_FULL_PEL_VAL ((1 << (MAX_MVSEARCH_STEPS)) - 1)
+
+/* Maximum size of the first step in full pel units */
+#define MAX_FIRST_STEP (1 << (MAX_MVSEARCH_STEPS-1))
 
 extern void print_mode_context(void);
 extern int vp8_mv_bit_cost(int_mv *mv, int_mv *ref, int *mvcost[2], int Weight);
diff --git a/vp8/encoder/mr_dissim.c b/vp8/encoder/mr_dissim.c
index 564f963..71218cc 100644
--- a/vp8/encoder/mr_dissim.c
+++ b/vp8/encoder/mr_dissim.c
@@ -53,6 +53,7 @@
 void vp8_cal_dissimilarity(VP8_COMP *cpi)
 {
     VP8_COMMON *cm = &cpi->common;
+    int i;
 
     /* Note: The first row & first column in mip are outside the frame, which
      * were initialized to all 0.(ref_frame, mode, mv...)
@@ -72,6 +73,13 @@
 
         if(cm->frame_type != KEY_FRAME)
         {
+            store_info->is_frame_dropped = 0;
+            for (i = 1; i < MAX_REF_FRAMES; i++)
+                store_info->low_res_ref_frames[i] = cpi->current_ref_frames[i];
+        }
+
+        if(cm->frame_type != KEY_FRAME)
+        {
             int mb_row;
             int mb_col;
             /* Point to beginning of allocated MODE_INFO arrays. */
@@ -203,3 +211,26 @@
         }
     }
 }
+
+/* This function is called only when this frame is dropped at the current
+   resolution level. */
+void vp8_store_drop_frame_info(VP8_COMP *cpi)
+{
+    /* If the frame is dropped in lower-resolution encoding, this information
+       is passed to higher resolution level so that the encoder knows there
+       is no mode & motion info available.
+     */
+    if (cpi->oxcf.mr_total_resolutions >1
+        && cpi->oxcf.mr_encoder_id < (cpi->oxcf.mr_total_resolutions - 1))
+    {
+        /* Store info for show/no-show frames for supporting alt_ref.
+         * If parent frame is alt_ref, child has one too.
+         */
+        LOWER_RES_FRAME_INFO* store_info
+                      = (LOWER_RES_FRAME_INFO*)cpi->oxcf.mr_low_res_mode_info;
+
+        /* Set frame_type to be INTER_FRAME since we won't drop key frame. */
+        store_info->frame_type = INTER_FRAME;
+        store_info->is_frame_dropped = 1;
+    }
+}
diff --git a/vp8/encoder/mr_dissim.h b/vp8/encoder/mr_dissim.h
index 3d2c203..f8cb135 100644
--- a/vp8/encoder/mr_dissim.h
+++ b/vp8/encoder/mr_dissim.h
@@ -15,5 +15,6 @@
 
 extern void vp8_cal_low_res_mb_cols(VP8_COMP *cpi);
 extern void vp8_cal_dissimilarity(VP8_COMP *cpi);
+extern void vp8_store_drop_frame_info(VP8_COMP *cpi);
 
 #endif
diff --git a/vp8/encoder/onyx_if.c b/vp8/encoder/onyx_if.c
index df16637..1d27ae0 100644
--- a/vp8/encoder/onyx_if.c
+++ b/vp8/encoder/onyx_if.c
@@ -11,6 +11,7 @@
 
 #include "vpx_config.h"
 #include "vp8/common/onyxc_int.h"
+#include "vp8/common/blockd.h"
 #include "onyx_int.h"
 #include "vp8/common/systemdependent.h"
 #include "quantize.h"
@@ -57,10 +58,6 @@
 extern void print_tree_update_probs();
 extern void vp8cx_create_encoder_threads(VP8_COMP *cpi);
 extern void vp8cx_remove_encoder_threads(VP8_COMP *cpi);
-#if HAVE_NEON
-extern void vp8_yv12_copy_frame_func_neon(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc);
-extern void vp8_yv12_copy_src_frame_func_neon(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc);
-#endif
 
 int vp8_estimate_entropy_savings(VP8_COMP *cpi);
 
@@ -143,7 +140,7 @@
 extern void vp8cx_init_quantizer(VP8_COMP *cpi);
 extern const int vp8cx_base_skip_false_prob[128];
 
-// Tables relating active max Q to active min Q
+/* Tables relating active max Q to active min Q */
 static const unsigned char kf_low_motion_minq[QINDEX_RANGE] =
 {
     0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
@@ -219,9 +216,8 @@
 {
     LAYER_CONTEXT *lc = &cpi->layer_context[cpi->current_layer];
 
-    // Save layer dependent coding state
+    /* Save layer dependent coding state */
     lc->target_bandwidth                 = cpi->target_bandwidth;
-    //lc->target_bandwidth                 = cpi->oxcf.target_bandwidth;
     lc->starting_buffer_level            = cpi->oxcf.starting_buffer_level;
     lc->optimal_buffer_level             = cpi->oxcf.optimal_buffer_level;
     lc->maximum_buffer_size              = cpi->oxcf.maximum_buffer_size;
@@ -258,7 +254,7 @@
 {
     LAYER_CONTEXT *lc = &cpi->layer_context[layer];
 
-    // Restore layer dependent coding state
+    /* Restore layer dependent coding state */
     cpi->current_layer                    = layer;
     cpi->target_bandwidth                 = lc->target_bandwidth;
     cpi->oxcf.target_bandwidth            = lc->target_bandwidth;
@@ -271,9 +267,7 @@
     cpi->buffer_level                     = lc->buffer_level;
     cpi->bits_off_target                  = lc->bits_off_target;
     cpi->total_actual_bits                = lc->total_actual_bits;
-    //cpi->worst_quality                    = lc->worst_quality;
     cpi->active_worst_quality             = lc->active_worst_quality;
-    //cpi->best_quality                     = lc->best_quality;
     cpi->active_best_quality              = lc->active_best_quality;
     cpi->ni_av_qi                         = lc->ni_av_qi;
     cpi->ni_tot_qi                        = lc->ni_tot_qi;
@@ -296,12 +290,17 @@
 
 static void setup_features(VP8_COMP *cpi)
 {
-    // Set up default state for MB feature flags
-    cpi->mb.e_mbd.segmentation_enabled = 0;
-    cpi->mb.e_mbd.update_mb_segmentation_map = 0;
-    cpi->mb.e_mbd.update_mb_segmentation_data = 0;
-    vpx_memset(cpi->mb.e_mbd.mb_segment_tree_probs, 255, sizeof(cpi->mb.e_mbd.mb_segment_tree_probs));
-    vpx_memset(cpi->mb.e_mbd.segment_feature_data, 0, sizeof(cpi->mb.e_mbd.segment_feature_data));
+    /* If segmentation is enabled, set the update flags */
+    if ( cpi->mb.e_mbd.segmentation_enabled )
+    {
+        cpi->mb.e_mbd.update_mb_segmentation_map = 1;
+        cpi->mb.e_mbd.update_mb_segmentation_data = 1;
+    }
+    else
+    {
+        cpi->mb.e_mbd.update_mb_segmentation_map = 0;
+        cpi->mb.e_mbd.update_mb_segmentation_data = 0;
+    }
 
     cpi->mb.e_mbd.mode_ref_lf_delta_enabled = 0;
     cpi->mb.e_mbd.mode_ref_lf_delta_update = 0;
@@ -323,7 +322,7 @@
     vpx_free(cpi->tplist);
     cpi->tplist = NULL;
 
-    // Delete last frame MV storage buffers
+    /* Delete last frame MV storage buffers */
     vpx_free(cpi->lfmv);
     cpi->lfmv = 0;
 
@@ -333,7 +332,7 @@
     vpx_free(cpi->lf_ref_frame);
     cpi->lf_ref_frame = 0;
 
-    // Delete sementation map
+    /* Delete segmentation map */
     vpx_free(cpi->segmentation_map);
     cpi->segmentation_map = 0;
 
@@ -349,11 +348,11 @@
     vpx_free(cpi->tok);
     cpi->tok = 0;
 
-    // Structure used to monitor GF usage
+    /* Structure used to monitor GF usage */
     vpx_free(cpi->gf_active_flags);
     cpi->gf_active_flags = 0;
 
-    // Activity mask based per mb zbin adjustments
+    /* Activity mask based per mb zbin adjustments */
     vpx_free(cpi->mb_activity_map);
     cpi->mb_activity_map = 0;
     vpx_free(cpi->mb_norm_activity_map);
@@ -361,41 +360,51 @@
 
     vpx_free(cpi->mb.pip);
     cpi->mb.pip = 0;
+
+#if CONFIG_MULTITHREAD
+    vpx_free(cpi->mt_current_mb_col);
+    cpi->mt_current_mb_col = NULL;
+#endif
 }
 
 static void enable_segmentation(VP8_COMP *cpi)
 {
-    // Set the appropriate feature bit
+    /* Set the appropriate feature bit */
     cpi->mb.e_mbd.segmentation_enabled = 1;
     cpi->mb.e_mbd.update_mb_segmentation_map = 1;
     cpi->mb.e_mbd.update_mb_segmentation_data = 1;
 }
 static void disable_segmentation(VP8_COMP *cpi)
 {
-    // Clear the appropriate feature bit
+    /* Clear the appropriate feature bit */
     cpi->mb.e_mbd.segmentation_enabled = 0;
 }
 
-// Valid values for a segment are 0 to 3
-// Segmentation map is arrange as [Rows][Columns]
+/* Valid values for a segment are 0 to 3
+ * Segmentation map is arranged as [Rows][Columns]
+ */
 static void set_segmentation_map(VP8_COMP *cpi, unsigned char *segmentation_map)
 {
-    // Copy in the new segmentation map
+    /* Copy in the new segmentation map */
     vpx_memcpy(cpi->segmentation_map, segmentation_map, (cpi->common.mb_rows * cpi->common.mb_cols));
 
-    // Signal that the map should be updated.
+    /* Signal that the map should be updated. */
     cpi->mb.e_mbd.update_mb_segmentation_map = 1;
     cpi->mb.e_mbd.update_mb_segmentation_data = 1;
 }
 
-// The values given for each segment can be either deltas (from the default value chosen for the frame) or absolute values.
-//
-// Valid range for abs values is (0-127 for MB_LVL_ALT_Q) , (0-63 for SEGMENT_ALT_LF)
-// Valid range for delta values are (+/-127 for MB_LVL_ALT_Q) , (+/-63 for SEGMENT_ALT_LF)
-//
-// abs_delta = SEGMENT_DELTADATA (deltas) abs_delta = SEGMENT_ABSDATA (use the absolute values given).
-//
-//
+/* The values given for each segment can be either deltas (from the default
+ * value chosen for the frame) or absolute values.
+ *
+ * Valid range for abs values is:
+ *    (0-127 for MB_LVL_ALT_Q), (0-63 for SEGMENT_ALT_LF)
+ * Valid range for delta values are:
+ *    (+/-127 for MB_LVL_ALT_Q), (+/-63 for SEGMENT_ALT_LF)
+ *
+ * abs_delta = SEGMENT_DELTADATA (deltas)
+ * abs_delta = SEGMENT_ABSDATA (use the absolute values given).
+ *
+ */
 static void set_segment_data(VP8_COMP *cpi, signed char *feature_data, unsigned char abs_delta)
 {
     cpi->mb.e_mbd.mb_segement_abs_delta = abs_delta;
@@ -411,26 +420,6 @@
     // Create a temporary map for segmentation data.
     CHECK_MEM_ERROR(seg_map, vpx_calloc(cpi->common.mb_rows * cpi->common.mb_cols, 1));
 
-    // MB loop to set local segmentation map
-    /*for ( i = 0; i < cpi->common.mb_rows; i++ )
-    {
-        for ( j = 0; j < cpi->common.mb_cols; j++ )
-        {
-            //seg_map[(i*cpi->common.mb_cols) + j] = (j % 2) + ((i%2)* 2);
-            //if ( j < cpi->common.mb_cols/2 )
-
-            // Segment 1 around the edge else 0
-            if ( (i == 0) || (j == 0) || (i == (cpi->common.mb_rows-1)) || (j == (cpi->common.mb_cols-1)) )
-                seg_map[(i*cpi->common.mb_cols) + j] = 1;
-            //else if ( (i < 2) || (j < 2) || (i > (cpi->common.mb_rows-3)) || (j > (cpi->common.mb_cols-3)) )
-            //  seg_map[(i*cpi->common.mb_cols) + j] = 2;
-            //else if ( (i < 5) || (j < 5) || (i > (cpi->common.mb_rows-6)) || (j > (cpi->common.mb_cols-6)) )
-            //  seg_map[(i*cpi->common.mb_cols) + j] = 3;
-            else
-                seg_map[(i*cpi->common.mb_cols) + j] = 0;
-        }
-    }*/
-
     // Set the segmentation Map
     set_segmentation_map(cpi, seg_map);
 
@@ -453,13 +442,12 @@
     set_segment_data(cpi, &feature_data[0][0], SEGMENT_DELTADATA);
 
     // Delete sementation map
-        vpx_free(seg_map);
+    vpx_free(seg_map);
 
     seg_map = 0;
-
 }
 
-// A simple function to cyclically refresh the background at a lower Q
+/* A simple function to cyclically refresh the background at a lower Q */
 static void cyclic_background_refresh(VP8_COMP *cpi, int Q, int lf_adjustment)
 {
     unsigned char *seg_map;
@@ -468,7 +456,7 @@
     int block_count = cpi->cyclic_refresh_mode_max_mbs_perframe;
     int mbs_in_frame = cpi->common.mb_rows * cpi->common.mb_cols;
 
-    // Create a temporary map for segmentation data.
+    /* Create a temporary map for segmentation data. */
     CHECK_MEM_ERROR(seg_map, vpx_calloc(cpi->common.mb_rows * cpi->common.mb_cols, 1));
 
     cpi->cyclic_refresh_q = Q;
@@ -476,7 +464,6 @@
     for (i = Q; i > 0; i--)
     {
         if (vp8_bits_per_mb[cpi->common.frame_type][i] >= ((vp8_bits_per_mb[cpi->common.frame_type][Q]*(Q + 128)) / 64))
-            //if ( vp8_bits_per_mb[cpi->common.frame_type][i] >= ((vp8_bits_per_mb[cpi->common.frame_type][Q]*((2*Q)+96))/64) )
         {
             break;
         }
@@ -484,16 +471,19 @@
 
     cpi->cyclic_refresh_q = i;
 
-    // Only update for inter frames
+    /* Only update for inter frames */
     if (cpi->common.frame_type != KEY_FRAME)
     {
-        // Cycle through the macro_block rows
-        // MB loop to set local segmentation map
+        /* Cycle through the macro_block rows */
+        /* MB loop to set local segmentation map */
         for (i = cpi->cyclic_refresh_mode_index; i < mbs_in_frame; i++)
         {
-            // If the MB is as a candidate for clean up then mark it for possible boost/refresh (segment 1)
-            // The segment id may get reset to 0 later if the MB gets coded anything other than last frame 0,0
-            // as only (last frame 0,0) MBs are eligable for refresh : that is to say Mbs likely to be background blocks.
+            /* If the MB is a candidate for clean up then mark it for
+             * possible boost/refresh (segment 1). The segment id may get
+             * reset to 0 later if the MB gets coded anything other than
+             * last frame 0,0, as only (last frame 0,0) MBs are eligible for
+             * refresh: that is to say MBs likely to be background blocks.
+             */
             if (cpi->cyclic_refresh_map[i] == 0)
             {
                 seg_map[i] = 1;
@@ -502,9 +492,8 @@
             {
                 seg_map[i] = 0;
 
-                // Skip blocks that have been refreshed recently anyway.
+                /* Skip blocks that have been refreshed recently anyway. */
                 if (cpi->cyclic_refresh_map[i] < 0)
-                    //cpi->cyclic_refresh_map[i] = cpi->cyclic_refresh_map[i] / 16;
                     cpi->cyclic_refresh_map[i]++;
             }
 
@@ -516,36 +505,35 @@
 
         }
 
-        // If we have gone through the frame reset to the start
+        /* If we have gone through the frame reset to the start */
         cpi->cyclic_refresh_mode_index = i;
 
         if (cpi->cyclic_refresh_mode_index >= mbs_in_frame)
             cpi->cyclic_refresh_mode_index = 0;
     }
 
-    // Set the segmentation Map
+    /* Set the segmentation Map */
     set_segmentation_map(cpi, seg_map);
 
-    // Activate segmentation.
+    /* Activate segmentation. */
     enable_segmentation(cpi);
 
-    // Set up the quant segment data
+    /* Set up the quant segment data */
     feature_data[MB_LVL_ALT_Q][0] = 0;
     feature_data[MB_LVL_ALT_Q][1] = (cpi->cyclic_refresh_q - Q);
     feature_data[MB_LVL_ALT_Q][2] = 0;
     feature_data[MB_LVL_ALT_Q][3] = 0;
 
-    // Set up the loop segment data
+    /* Set up the loop segment data */
     feature_data[MB_LVL_ALT_LF][0] = 0;
     feature_data[MB_LVL_ALT_LF][1] = lf_adjustment;
     feature_data[MB_LVL_ALT_LF][2] = 0;
     feature_data[MB_LVL_ALT_LF][3] = 0;
 
-    // Initialise the feature data structure
-    // SEGMENT_DELTADATA    0, SEGMENT_ABSDATA      1
+    /* Initialise the feature data structure */
     set_segment_data(cpi, &feature_data[0][0], SEGMENT_DELTADATA);
 
-    // Delete sementation map
+    /* Delete segmentation map */
     vpx_free(seg_map);
 
     seg_map = 0;
@@ -560,16 +548,21 @@
     vpx_memset(cpi->mb.e_mbd.ref_lf_deltas, 0, sizeof(cpi->mb.e_mbd.ref_lf_deltas));
     vpx_memset(cpi->mb.e_mbd.mode_lf_deltas, 0, sizeof(cpi->mb.e_mbd.mode_lf_deltas));
 
-    // Test of ref frame deltas
+    /* Test of ref frame deltas */
     cpi->mb.e_mbd.ref_lf_deltas[INTRA_FRAME] = 2;
     cpi->mb.e_mbd.ref_lf_deltas[LAST_FRAME] = 0;
     cpi->mb.e_mbd.ref_lf_deltas[GOLDEN_FRAME] = -2;
     cpi->mb.e_mbd.ref_lf_deltas[ALTREF_FRAME] = -2;
 
-    cpi->mb.e_mbd.mode_lf_deltas[0] = 4;               // BPRED
-    cpi->mb.e_mbd.mode_lf_deltas[1] = -2;              // Zero
-    cpi->mb.e_mbd.mode_lf_deltas[2] = 2;               // New mv
-    cpi->mb.e_mbd.mode_lf_deltas[3] = 4;               // Split mv
+    cpi->mb.e_mbd.mode_lf_deltas[0] = 4;               /* BPRED */
+
+    if(cpi->oxcf.Mode == MODE_REALTIME)
+      cpi->mb.e_mbd.mode_lf_deltas[1] = -12;              /* Zero */
+    else
+      cpi->mb.e_mbd.mode_lf_deltas[1] = -2;              /* Zero */
+
+    cpi->mb.e_mbd.mode_lf_deltas[2] = 2;               /* New mv */
+    cpi->mb.e_mbd.mode_lf_deltas[3] = 4;               /* Split mv */
 }
 
 /* Convenience macros for mapping speed and mode into a continuous
@@ -669,7 +662,7 @@
     int last_improved_quant = sf->improved_quant;
     int ref_frames;
 
-    // Initialise default mode frequency sampling variables
+    /* Initialise default mode frequency sampling variables */
     for (i = 0; i < MAX_MODES; i ++)
     {
         cpi->mode_check_freq[i] = 0;
@@ -679,7 +672,7 @@
 
     cpi->mbs_tested_so_far = 0;
 
-    // best quality defaults
+    /* best quality defaults */
     sf->RD = 1;
     sf->search_method = NSTEP;
     sf->improved_quant = 1;
@@ -697,17 +690,17 @@
     sf->max_step_search_steps = MAX_MVSEARCH_STEPS;
     sf->improved_mv_pred = 1;
 
-    // default thresholds to 0
+    /* default thresholds to 0 */
     for (i = 0; i < MAX_MODES; i++)
         sf->thresh_mult[i] = 0;
 
     /* Count enabled references */
     ref_frames = 1;
-    if (cpi->ref_frame_flags & VP8_LAST_FLAG)
+    if (cpi->ref_frame_flags & VP8_LAST_FRAME)
         ref_frames++;
-    if (cpi->ref_frame_flags & VP8_GOLD_FLAG)
+    if (cpi->ref_frame_flags & VP8_GOLD_FRAME)
         ref_frames++;
-    if (cpi->ref_frame_flags & VP8_ALT_FLAG)
+    if (cpi->ref_frame_flags & VP8_ALTR_FRAME)
         ref_frames++;
 
     /* Convert speed to continuous range, with clamping */
@@ -779,7 +772,7 @@
     switch (Mode)
     {
 #if !(CONFIG_REALTIME_ONLY)
-    case 0: // best quality mode
+    case 0: /* best quality mode */
         sf->first_step = 0;
         sf->max_step_search_steps = MAX_MVSEARCH_STEPS;
         break;
@@ -800,8 +793,9 @@
             sf->improved_quant = 0;
             sf->improved_dct = 0;
 
-            // Only do recode loop on key frames, golden frames and
-            // alt ref frames
+            /* Only do recode loop on key frames, golden frames and
+             * alt ref frames
+             */
             sf->recode_loop = 2;
 
         }
@@ -809,14 +803,14 @@
         if (Speed > 3)
         {
             sf->auto_filter = 1;
-            sf->recode_loop = 0; // recode loop off
-            sf->RD = 0;         // Turn rd off
+            sf->recode_loop = 0; /* recode loop off */
+            sf->RD = 0;         /* Turn rd off */
 
         }
 
         if (Speed > 4)
         {
-            sf->auto_filter = 0;                     // Faster selection of loop filter
+            sf->auto_filter = 0;  /* Faster selection of loop filter */
         }
 
         break;
@@ -839,7 +833,7 @@
         }
 
         if (Speed > 2)
-            sf->auto_filter = 0;                     // Faster selection of loop filter
+            sf->auto_filter = 0;  /* Faster selection of loop filter */
 
         if (Speed > 3)
         {
@@ -849,7 +843,7 @@
 
         if (Speed > 4)
         {
-            sf->auto_filter = 0;                     // Faster selection of loop filter
+            sf->auto_filter = 0;  /* Faster selection of loop filter */
             sf->search_method = HEX;
             sf->iterative_sub_pixel = 0;
         }
@@ -876,7 +870,7 @@
             total_skip = sum;
             sum = 0;
 
-            // i starts from 2 to make sure thresh started from 2048
+            /* i starts from 2 to make sure thresh started from 2048 */
             for (; i < 1024; i++)
             {
                 sum += cpi->error_bins[i];
@@ -930,7 +924,7 @@
             cm->filter_type = SIMPLE_LOOPFILTER;
         }
 
-        // This has a big hit on quality. Last resort
+        /* This has a big hit on quality. Last resort */
         if (Speed >= 15)
             sf->half_pixel_search = 0;
 
@@ -938,8 +932,9 @@
 
     }; /* switch */
 
-    // Slow quant, dct and trellis not worthwhile for first pass
-    // so make sure they are always turned off.
+    /* Slow quant, dct and trellis not worthwhile for first pass
+     * so make sure they are always turned off.
+     */
     if ( cpi->pass == 1 )
     {
         sf->improved_quant = 0;
@@ -1107,28 +1102,53 @@
         CHECK_MEM_ERROR(cpi->tok, vpx_calloc(tokens, sizeof(*cpi->tok)));
     }
 
-    // Data used for real time vc mode to see if gf needs refreshing
+    /* Data used for real time vc mode to see if gf needs refreshing */
     cpi->inter_zz_count = 0;
     cpi->gf_bad_count = 0;
     cpi->gf_update_recommended = 0;
 
 
-    // Structures used to minitor GF usage
+    /* Structures used to monitor GF usage */
     vpx_free(cpi->gf_active_flags);
     CHECK_MEM_ERROR(cpi->gf_active_flags,
-                    vpx_calloc(1, cm->mb_rows * cm->mb_cols));
+                    vpx_calloc(sizeof(*cpi->gf_active_flags),
+                    cm->mb_rows * cm->mb_cols));
     cpi->gf_active_count = cm->mb_rows * cm->mb_cols;
 
     vpx_free(cpi->mb_activity_map);
     CHECK_MEM_ERROR(cpi->mb_activity_map,
-                    vpx_calloc(sizeof(unsigned int),
+                    vpx_calloc(sizeof(*cpi->mb_activity_map),
                     cm->mb_rows * cm->mb_cols));
 
     vpx_free(cpi->mb_norm_activity_map);
     CHECK_MEM_ERROR(cpi->mb_norm_activity_map,
-                    vpx_calloc(sizeof(unsigned int),
+                    vpx_calloc(sizeof(*cpi->mb_norm_activity_map),
                     cm->mb_rows * cm->mb_cols));
 
+    /* allocate memory for storing last frame's MVs for MV prediction. */
+    vpx_free(cpi->lfmv);
+    CHECK_MEM_ERROR(cpi->lfmv, vpx_calloc((cm->mb_rows+2) * (cm->mb_cols+2),
+                    sizeof(*cpi->lfmv)));
+    vpx_free(cpi->lf_ref_frame_sign_bias);
+    CHECK_MEM_ERROR(cpi->lf_ref_frame_sign_bias,
+                    vpx_calloc((cm->mb_rows+2) * (cm->mb_cols+2),
+                    sizeof(*cpi->lf_ref_frame_sign_bias)));
+    vpx_free(cpi->lf_ref_frame);
+    CHECK_MEM_ERROR(cpi->lf_ref_frame,
+                    vpx_calloc((cm->mb_rows+2) * (cm->mb_cols+2),
+                    sizeof(*cpi->lf_ref_frame)));
+
+    /* Create the encoder segmentation map and set all entries to 0 */
+    vpx_free(cpi->segmentation_map);
+    CHECK_MEM_ERROR(cpi->segmentation_map,
+                    vpx_calloc(cm->mb_rows * cm->mb_cols,
+                    sizeof(*cpi->segmentation_map)));
+    vpx_free(cpi->active_map);
+    CHECK_MEM_ERROR(cpi->active_map,
+                    vpx_calloc(cm->mb_rows * cm->mb_cols,
+                    sizeof(*cpi->active_map)));
+    vpx_memset(cpi->active_map , 1, (cm->mb_rows * cm->mb_cols));
+
 #if CONFIG_MULTITHREAD
     if (width < 640)
         cpi->mt_sync_range = 1;
@@ -1138,15 +1158,22 @@
         cpi->mt_sync_range = 8;
     else
         cpi->mt_sync_range = 16;
+
+    if (cpi->oxcf.multi_threaded > 1)
+    {
+        vpx_free(cpi->mt_current_mb_col);
+        CHECK_MEM_ERROR(cpi->mt_current_mb_col,
+                    vpx_malloc(sizeof(*cpi->mt_current_mb_col) * cm->mb_rows));
+    }
+
 #endif
 
     vpx_free(cpi->tplist);
-
-    CHECK_MEM_ERROR(cpi->tplist, vpx_malloc(sizeof(TOKENLIST) * cpi->common.mb_rows));
+    CHECK_MEM_ERROR(cpi->tplist, vpx_malloc(sizeof(TOKENLIST) * cm->mb_rows));
 }
 
 
-// Quant MOD
+/* Quant MOD */
 static const int q_trans[] =
 {
     0,   1,  2,  3,  4,  5,  7,  8,
@@ -1168,7 +1195,7 @@
             return i;
 
     return 63;
-};
+}
 void vp8_new_frame_rate(VP8_COMP *cpi, double framerate)
 {
     if(framerate < .1)
@@ -1182,16 +1209,16 @@
     cpi->min_frame_bandwidth    = (int)(cpi->av_per_frame_bandwidth *
                                   cpi->oxcf.two_pass_vbrmin_section / 100);
 
-    // Set Maximum gf/arf interval
+    /* Set Maximum gf/arf interval */
     cpi->max_gf_interval = ((int)(cpi->output_frame_rate / 2.0) + 2);
 
     if(cpi->max_gf_interval < 12)
         cpi->max_gf_interval = 12;
 
-    // Extended interval for genuinely static scenes
+    /* Extended interval for genuinely static scenes */
     cpi->twopass.static_scene_max_gf_interval = cpi->key_frame_frequency >> 1;
 
-     // Special conditions when altr ref frame enabled in lagged compress mode
+    /* Special conditions when alt ref frame enabled in lagged compress mode */
     if (cpi->oxcf.play_alternate && cpi->oxcf.lag_in_frames)
     {
         if (cpi->max_gf_interval > cpi->oxcf.lag_in_frames - 1)
@@ -1213,7 +1240,7 @@
     int64_t llden = denom;
     int64_t llval = val;
 
-    return llval * llnum / llden;
+    return (int)(llval * llnum / llden);
 }
 
 
@@ -1225,7 +1252,6 @@
 
     cpi->auto_gold = 1;
     cpi->auto_adjust_gold_quantizer = 1;
-    cpi->goldfreq = 7;
 
     cm->version = oxcf->Version;
     vp8_setup_version(cm);
@@ -1244,15 +1270,15 @@
 
     cpi->ref_frame_rate = cpi->frame_rate;
 
-    // change includes all joint functionality
+    /* change includes all joint functionality */
     vp8_change_config(cpi, oxcf);
 
-    // Initialize active best and worst q and average q values.
+    /* Initialize active best and worst q and average q values. */
     cpi->active_worst_quality         = cpi->oxcf.worst_allowed_q;
     cpi->active_best_quality          = cpi->oxcf.best_allowed_q;
     cpi->avg_frame_qindex             = cpi->oxcf.worst_allowed_q;
 
-    // Initialise the starting buffer levels
+    /* Initialise the starting buffer levels */
     cpi->buffer_level                 = cpi->oxcf.starting_buffer_level;
     cpi->bits_off_target              = cpi->oxcf.starting_buffer_level;
 
@@ -1264,7 +1290,7 @@
     cpi->total_actual_bits            = 0;
     cpi->total_target_vs_actual       = 0;
 
-    // Temporal scalabilty
+    /* Temporal scalabilty */
     if (cpi->oxcf.number_of_layers > 1)
     {
         unsigned int i;
@@ -1274,7 +1300,7 @@
         {
             LAYER_CONTEXT *lc = &cpi->layer_context[i];
 
-            // Layer configuration
+            /* Layer configuration */
             lc->frame_rate =
                         cpi->output_frame_rate / cpi->oxcf.rate_decimator[i];
             lc->target_bandwidth = cpi->oxcf.target_bitrate[i] * 1000;
@@ -1284,28 +1310,29 @@
             lc->maximum_buffer_size_in_ms   = oxcf->maximum_buffer_size;
 
             lc->starting_buffer_level =
-              rescale(oxcf->starting_buffer_level,
+              rescale((int)(oxcf->starting_buffer_level),
                           lc->target_bandwidth, 1000);
 
             if (oxcf->optimal_buffer_level == 0)
                 lc->optimal_buffer_level = lc->target_bandwidth / 8;
             else
                 lc->optimal_buffer_level =
-                  rescale(oxcf->optimal_buffer_level,
+                  rescale((int)(oxcf->optimal_buffer_level),
                           lc->target_bandwidth, 1000);
 
             if (oxcf->maximum_buffer_size == 0)
                 lc->maximum_buffer_size = lc->target_bandwidth / 8;
             else
                 lc->maximum_buffer_size =
-                  rescale(oxcf->maximum_buffer_size,
+                  rescale((int)oxcf->maximum_buffer_size,
                           lc->target_bandwidth, 1000);
 
-            // Work out the average size of a frame within this layer
+            /* Work out the average size of a frame within this layer */
             if (i > 0)
-                lc->avg_frame_size_for_layer = (cpi->oxcf.target_bitrate[i] -
-                    cpi->oxcf.target_bitrate[i-1]) * 1000 /
-                    (lc->frame_rate - prev_layer_frame_rate);
+                lc->avg_frame_size_for_layer =
+                  (int)((cpi->oxcf.target_bitrate[i] -
+                         cpi->oxcf.target_bitrate[i-1]) * 1000 /
+                        (lc->frame_rate - prev_layer_frame_rate));
 
             lc->active_worst_quality         = cpi->oxcf.worst_allowed_q;
             lc->active_best_quality          = cpi->oxcf.best_allowed_q;
@@ -1321,7 +1348,7 @@
             lc->rate_correction_factor            = 1.0;
             lc->key_frame_rate_correction_factor  = 1.0;
             lc->gf_rate_correction_factor         = 1.0;
-            lc->inter_frame_target                = 0.0;
+            lc->inter_frame_target                = 0;
 
             prev_layer_frame_rate = lc->frame_rate;
         }
@@ -1358,32 +1385,29 @@
             lc->target_bandwidth = oxcf->target_bitrate[i] * 1000;
 
             lc->starting_buffer_level = rescale(
-                          oxcf->starting_buffer_level_in_ms,
+                          (int)oxcf->starting_buffer_level_in_ms,
                           lc->target_bandwidth, 1000);
 
             if (oxcf->optimal_buffer_level == 0)
                 lc->optimal_buffer_level = lc->target_bandwidth / 8;
             else
                 lc->optimal_buffer_level = rescale(
-                          oxcf->optimal_buffer_level_in_ms,
+                          (int)oxcf->optimal_buffer_level_in_ms,
                           lc->target_bandwidth, 1000);
 
             if (oxcf->maximum_buffer_size == 0)
                 lc->maximum_buffer_size = lc->target_bandwidth / 8;
             else
                 lc->maximum_buffer_size = rescale(
-                          oxcf->maximum_buffer_size_in_ms,
+                          (int)oxcf->maximum_buffer_size_in_ms,
                           lc->target_bandwidth, 1000);
 
-            // Work out the average size of a frame within this layer
+            /* Work out the average size of a frame within this layer */
             if (i > 0)
-                lc->avg_frame_size_for_layer = (oxcf->target_bitrate[i] -
-                    oxcf->target_bitrate[i-1]) * 1000 /
-                    (lc->frame_rate - prev_layer_frame_rate);
-
-            lc->active_worst_quality         = oxcf->worst_allowed_q;
-            lc->active_best_quality          = oxcf->best_allowed_q;
-            lc->avg_frame_qindex             = oxcf->worst_allowed_q;
+                lc->avg_frame_size_for_layer =
+                   (int)((oxcf->target_bitrate[i] -
+                          oxcf->target_bitrate[i-1]) * 1000 /
+                          (lc->frame_rate - prev_layer_frame_rate));
 
             prev_layer_frame_rate = lc->frame_rate;
         }
@@ -1514,10 +1538,8 @@
     cpi->baseline_gf_interval =
         cpi->oxcf.alt_freq ? cpi->oxcf.alt_freq : DEFAULT_GF_INTERVAL;
 
-    cpi->ref_frame_flags = VP8_ALT_FLAG | VP8_GOLD_FLAG | VP8_LAST_FLAG;
+    cpi->ref_frame_flags = VP8_ALTR_FRAME | VP8_GOLD_FRAME | VP8_LAST_FRAME;
 
-    //cpi->use_golden_frame_only = 0;
-    //cpi->use_last_frame_only = 0;
     cm->refresh_golden_frame = 0;
     cm->refresh_last_frame = 1;
     cm->refresh_entropy_probs = 1;
@@ -1539,11 +1561,11 @@
             cpi->segment_encode_breakout[i] = cpi->oxcf.encode_breakout;
     }
 
-    // At the moment the first order values may not be > MAXQ
+    /* At the moment the first order values may not be > MAXQ */
     if (cpi->oxcf.fixed_q > MAXQ)
         cpi->oxcf.fixed_q = MAXQ;
 
-    // local file playback mode == really big buffer
+    /* local file playback mode == really big buffer */
     if (cpi->oxcf.end_usage == USAGE_LOCAL_FILE_PLAYBACK)
     {
         cpi->oxcf.starting_buffer_level       = 60000;
@@ -1554,41 +1576,41 @@
         cpi->oxcf.maximum_buffer_size_in_ms   = 240000;
     }
 
-    // Convert target bandwidth from Kbit/s to Bit/s
+    /* Convert target bandwidth from Kbit/s to Bit/s */
     cpi->oxcf.target_bandwidth       *= 1000;
 
     cpi->oxcf.starting_buffer_level =
-        rescale(cpi->oxcf.starting_buffer_level,
+        rescale((int)cpi->oxcf.starting_buffer_level,
                 cpi->oxcf.target_bandwidth, 1000);
 
-    // Set or reset optimal and maximum buffer levels.
+    /* Set or reset optimal and maximum buffer levels. */
     if (cpi->oxcf.optimal_buffer_level == 0)
         cpi->oxcf.optimal_buffer_level = cpi->oxcf.target_bandwidth / 8;
     else
         cpi->oxcf.optimal_buffer_level =
-            rescale(cpi->oxcf.optimal_buffer_level,
+            rescale((int)cpi->oxcf.optimal_buffer_level,
                     cpi->oxcf.target_bandwidth, 1000);
 
     if (cpi->oxcf.maximum_buffer_size == 0)
         cpi->oxcf.maximum_buffer_size = cpi->oxcf.target_bandwidth / 8;
     else
         cpi->oxcf.maximum_buffer_size =
-            rescale(cpi->oxcf.maximum_buffer_size,
+            rescale((int)cpi->oxcf.maximum_buffer_size,
                     cpi->oxcf.target_bandwidth, 1000);
 
-    // Set up frame rate and related parameters rate control values.
+    /* Set up frame rate and related parameters rate control values. */
     vp8_new_frame_rate(cpi, cpi->frame_rate);
 
-    // Set absolute upper and lower quality limits
+    /* Set absolute upper and lower quality limits */
     cpi->worst_quality               = cpi->oxcf.worst_allowed_q;
     cpi->best_quality                = cpi->oxcf.best_allowed_q;
 
-    // active values should only be modified if out of new range
+    /* active values should only be modified if out of new range */
     if (cpi->active_worst_quality > cpi->oxcf.worst_allowed_q)
     {
       cpi->active_worst_quality = cpi->oxcf.worst_allowed_q;
     }
-    // less likely
+    /* less likely */
     else if (cpi->active_worst_quality < cpi->oxcf.best_allowed_q)
     {
       cpi->active_worst_quality = cpi->oxcf.best_allowed_q;
@@ -1597,7 +1619,7 @@
     {
       cpi->active_best_quality = cpi->oxcf.best_allowed_q;
     }
-    // less likely
+    /* less likely */
     else if (cpi->active_best_quality > cpi->oxcf.worst_allowed_q)
     {
       cpi->active_best_quality = cpi->oxcf.worst_allowed_q;
@@ -1607,7 +1629,7 @@
 
     cpi->cq_target_quality = cpi->oxcf.cq_level;
 
-    // Only allow dropped frames in buffered mode
+    /* Only allow dropped frames in buffered mode */
     cpi->drop_frames_allowed = cpi->oxcf.allow_df && cpi->buffered_mode;
 
     cpi->target_bandwidth = cpi->oxcf.target_bandwidth;
@@ -1622,7 +1644,7 @@
      * correct.
      */
 
-    // VP8 sharpness level mapping 0-7 (vs 0-10 in general VPx dialogs)
+    /* VP8 sharpness level mapping 0-7 (vs 0-10 in general VPx dialogs) */
     if (cpi->oxcf.Sharpness > 7)
         cpi->oxcf.Sharpness = 7;
 
@@ -1636,7 +1658,7 @@
         Scale2Ratio(cm->horiz_scale, &hr, &hs);
         Scale2Ratio(cm->vert_scale, &vr, &vs);
 
-        // always go to the next whole number
+        /* always go to the next whole number */
         cm->Width = (hs - 1 + cpi->oxcf.Width * hr) / hs;
         cm->Height = (vs - 1 + cpi->oxcf.Height * vr) / vs;
     }
@@ -1650,6 +1672,7 @@
           cm->yv12_fb[cm->lst_fb_idx].y_height ||
         cm->yv12_fb[cm->lst_fb_idx].y_width == 0)
     {
+        dealloc_raw_frame_buffers(cpi);
         alloc_raw_frame_buffers(cpi);
         vp8_alloc_compressor_data(cpi);
     }
@@ -1662,16 +1685,16 @@
 
     cpi->Speed = cpi->oxcf.cpu_used;
 
-    // force to allowlag to 0 if lag_in_frames is 0;
+    /* force allow_lag to 0 if lag_in_frames is 0 */
     if (cpi->oxcf.lag_in_frames == 0)
     {
         cpi->oxcf.allow_lag = 0;
     }
-    // Limit on lag buffers as these are not currently dynamically allocated
+    /* Limit on lag buffers as these are not currently dynamically allocated */
     else if (cpi->oxcf.lag_in_frames > MAX_LAG_BUFFERS)
         cpi->oxcf.lag_in_frames = MAX_LAG_BUFFERS;
 
-    // YX Temp
+    /* YX Temp */
     cpi->alt_ref_source = NULL;
     cpi->is_src_frame_alt_ref = 0;
 
@@ -1688,7 +1711,7 @@
 #endif
 
 #if 0
-    // Experimental RD Code
+    /* Experimental RD Code */
     cpi->frame_distortion = 0;
     cpi->last_frame_distortion = 0;
 #endif
@@ -1723,7 +1746,7 @@
     VP8_COMMON *cm;
 
     cpi = vpx_memalign(32, sizeof(VP8_COMP));
-    // Check that the CPI instance is valid
+    /* Check that the CPI instance is valid */
     if (!cpi)
         return 0;
 
@@ -1757,14 +1780,15 @@
     cpi->prob_gf_coded                = 128;
     cpi->prob_intra_coded             = 63;
 
-    // Prime the recent reference frame usage counters.
-    // Hereafter they will be maintained as a sort of moving average
+    /* Prime the recent reference frame usage counters.
+     * Hereafter they will be maintained as a sort of moving average
+     */
     cpi->recent_ref_frame_usage[INTRA_FRAME]  = 1;
     cpi->recent_ref_frame_usage[LAST_FRAME]   = 1;
     cpi->recent_ref_frame_usage[GOLDEN_FRAME] = 1;
     cpi->recent_ref_frame_usage[ALTREF_FRAME] = 1;
 
-    // Set reference frame sign bias for ALTREF frame to 1 (for now)
+    /* Set reference frame sign bias for ALTREF frame to 1 (for now) */
     cpi->common.ref_frame_sign_bias[ALTREF_FRAME] = 1;
 
     cpi->twopass.gf_decay_rate = 0;
@@ -1774,21 +1798,12 @@
     cpi->alt_is_last  = 0 ;
     cpi->gold_is_alt  = 0 ;
 
-    // allocate memory for storing last frame's MVs for MV prediction.
-    CHECK_MEM_ERROR(cpi->lfmv, vpx_calloc((cpi->common.mb_rows+2) * (cpi->common.mb_cols+2), sizeof(int_mv)));
-    CHECK_MEM_ERROR(cpi->lf_ref_frame_sign_bias, vpx_calloc((cpi->common.mb_rows+2) * (cpi->common.mb_cols+2), sizeof(int)));
-    CHECK_MEM_ERROR(cpi->lf_ref_frame, vpx_calloc((cpi->common.mb_rows+2) * (cpi->common.mb_cols+2), sizeof(int)));
-
-    // Create the encoder segmentation map and set all entries to 0
-    CHECK_MEM_ERROR(cpi->segmentation_map, vpx_calloc(cpi->common.mb_rows * cpi->common.mb_cols, 1));
-    CHECK_MEM_ERROR(cpi->active_map, vpx_calloc(cpi->common.mb_rows * cpi->common.mb_cols, 1));
-    vpx_memset(cpi->active_map , 1, (cpi->common.mb_rows * cpi->common.mb_cols));
     cpi->active_map_enabled = 0;
 
 #if 0
-    // Experimental code for lagged and one pass
-    // Initialise one_pass GF frames stats
-    // Update stats used for GF selection
+    /* Experimental code for lagged and one pass */
+    /* Initialise one_pass GF frames stats */
+    /* Update stats used for GF selection */
     if (cpi->pass == 0)
     {
         cpi->one_pass_frame_index = 0;
@@ -1808,8 +1823,9 @@
     }
 #endif
 
-    // Should we use the cyclic refresh method.
-    // Currently this is tied to error resilliant mode
+    /* Should we use the cyclic refresh method.
+     * Currently this is tied to error resilient mode
+     */
     cpi->cyclic_refresh_mode_enabled = cpi->oxcf.error_resilient_mode;
     cpi->cyclic_refresh_mode_max_mbs_perframe = (cpi->common.mb_rows * cpi->common.mb_cols) / 40;
     cpi->cyclic_refresh_mode_index = 0;
@@ -1822,9 +1838,6 @@
     else
         cpi->cyclic_refresh_map = (signed char *) NULL;
 
-    // Test function for segmentation
-    //segmentation_test_function( cpi);
-
 #ifdef ENTROPY_STATS
     init_context_counters();
 #endif
@@ -1832,7 +1845,8 @@
     /*Initialize the feed-forward activity masking.*/
     cpi->activity_avg = 90<<12;
 
-    cpi->frames_since_key = 8;        // Give a sensible default for the first frame.
+    /* Give a sensible default for the first frame. */
+    cpi->frames_since_key = 8;
     cpi->key_frame_frequency = cpi->oxcf.key_freq;
     cpi->this_key_frame_forced = 0;
     cpi->next_key_frame_forced = 0;
@@ -1875,10 +1889,7 @@
 
 #endif
 
-#ifndef LLONG_MAX
-#define LLONG_MAX  9223372036854775807LL
-#endif
-    cpi->first_time_stamp_ever = LLONG_MAX;
+    cpi->first_time_stamp_ever = 0x7FFFFFFF;
 
     cpi->frames_till_gf_update_due      = 0;
     cpi->key_frame_count              = 1;
@@ -1889,9 +1900,6 @@
     cpi->total_byte_count             = 0;
 
     cpi->drop_frame                  = 0;
-    cpi->drop_count                  = 0;
-    cpi->max_drop_count               = 0;
-    cpi->max_consec_dropped_frames     = 4;
 
     cpi->rate_correction_factor         = 1.0;
     cpi->key_frame_rate_correction_factor = 1.0;
@@ -1923,7 +1931,7 @@
     else if (cpi->pass == 2)
     {
         size_t packet_sz = sizeof(FIRSTPASS_STATS);
-        int packets = oxcf->two_pass_stats_in.sz / packet_sz;
+        int packets = (int)(oxcf->two_pass_stats_in.sz / packet_sz);
 
         cpi->twopass.stats_in_start = oxcf->two_pass_stats_in.buf;
         cpi->twopass.stats_in = cpi->twopass.stats_in_start;
@@ -1936,14 +1944,13 @@
 
     if (cpi->compressor_speed == 2)
     {
-        cpi->cpu_freq            = 0; //vp8_get_processor_freq();
         cpi->avg_encode_time      = 0;
         cpi->avg_pick_mode_time    = 0;
     }
 
     vp8_set_speed_features(cpi);
 
-    // Set starting values of RD threshold multipliers (128 = *1)
+    /* Set starting values of RD threshold multipliers (128 = *1) */
     for (i = 0; i < MAX_MODES; i++)
     {
         cpi->rd_thresh_mult[i] = 128;
@@ -2019,11 +2026,14 @@
     cpi->diamond_search_sad = vp8_diamond_search_sad;
     cpi->refining_search_sad = vp8_refining_search_sad;
 
-    // make sure frame 1 is okay
+    /* make sure frame 1 is okay */
     cpi->error_bins[0] = cpi->common.MBs;
 
-    //vp8cx_init_quantizer() is first called here. Add check in vp8cx_frame_init_quantizer() so that vp8cx_init_quantizer is only called later
-    //when needed. This will avoid unnecessary calls of vp8cx_init_quantizer() for every frame.
+    /* vp8cx_init_quantizer() is first called here. Add check in
+     * vp8cx_frame_init_quantizer() so that vp8cx_init_quantizer is only
+     * called later when needed. This will avoid unnecessary calls of
+     * vp8cx_init_quantizer() for every frame.
+     */
     vp8cx_init_quantizer(cpi);
 
     vp8_loop_filter_init(cm);
@@ -2053,6 +2063,10 @@
     cpi->mb.inter_bmode_costs = cpi->rd_costs.inter_bmode_costs;
     cpi->mb.token_costs = cpi->rd_costs.token_costs;
 
+    /* setup block ptrs & offsets */
+    vp8_setup_block_ptrs(&cpi->mb);
+    vp8_setup_block_dptrs(&cpi->mb.e_mbd);
+
     return  cpi;
 }
 
@@ -2103,7 +2117,7 @@
 
                     fprintf(f, "Layer\tBitrate\tAVGPsnr\tGLBPsnr\tAVPsnrP\t"
                                "GLPsnrP\tVPXSSIM\t\n");
-                    for (i=0; i<cpi->oxcf.number_of_layers; i++)
+                    for (i=0; i<(int)cpi->oxcf.number_of_layers; i++)
                     {
                         double dr = (double)cpi->bytes_in_layer[i] *
                                               8.0 / 1000.0  / time_encoded;
@@ -2154,7 +2168,7 @@
 
                     fprintf(f, "Layer\tBitRate\tSSIM_Y\tSSIM_U\tSSIM_V\tSSIM_A\t"
                                "Time(us)\n");
-                    for (i=0; i<cpi->oxcf.number_of_layers; i++)
+                    for (i=0; i<(int)cpi->oxcf.number_of_layers; i++)
                     {
                         double dr = (double)cpi->bytes_in_layer[i] *
                                     8.0 / 1000.0  / time_encoded;
@@ -2208,7 +2222,6 @@
                 fprintf(f, "%5d", frames_at_speed[i]);
 
             fprintf(f, "\n");
-            //fprintf(f, "%10d PM %10d %10d %10d EF %10d %10d %10d\n", cpi->Speed, cpi->avg_pick_mode_time, (tot_pm/cnt_pm), cnt_pm,  cpi->avg_encode_time, 0, 0);
             fclose(f);
         }
 
@@ -2270,7 +2283,7 @@
             for (i = 0; i < 10; i++)
             {
 
-                fprintf(fmode, "    { //Above Mode :  %d\n", i);
+                fprintf(fmode, "    { /* Above Mode :  %d */\n", i);
 
                 for (j = 0; j < 10; j++)
                 {
@@ -2285,7 +2298,7 @@
                             fprintf(fmode, " %5d, ", intra_mode_stats[i][j][k]);
                     }
 
-                    fprintf(fmode, "}, // left_mode %d\n", j);
+                    fprintf(fmode, "}, /* left_mode %d */\n", j);
 
                 }
 
@@ -2463,7 +2476,7 @@
 
     for (i = 0; i < 4; i++)
         pkt.data.psnr.psnr[i] = vp8_mse2psnr(pkt.data.psnr.samples[i], 255.0,
-                                             pkt.data.psnr.sse[i]);
+                                             (double)(pkt.data.psnr.sse[i]));
 
     vpx_codec_pkt_list_add(cpi->output_pkt_list, &pkt);
 }
@@ -2486,28 +2499,28 @@
     cpi->common.refresh_alt_ref_frame = 0;
     cpi->common.refresh_last_frame   = 0;
 
-    if (ref_frame_flags & VP8_LAST_FLAG)
+    if (ref_frame_flags & VP8_LAST_FRAME)
         cpi->common.refresh_last_frame = 1;
 
-    if (ref_frame_flags & VP8_GOLD_FLAG)
+    if (ref_frame_flags & VP8_GOLD_FRAME)
         cpi->common.refresh_golden_frame = 1;
 
-    if (ref_frame_flags & VP8_ALT_FLAG)
+    if (ref_frame_flags & VP8_ALTR_FRAME)
         cpi->common.refresh_alt_ref_frame = 1;
 
     return 0;
 }
 
-int vp8_get_reference(VP8_COMP *cpi, VP8_REFFRAME ref_frame_flag, YV12_BUFFER_CONFIG *sd)
+int vp8_get_reference(VP8_COMP *cpi, enum vpx_ref_frame_type ref_frame_flag, YV12_BUFFER_CONFIG *sd)
 {
     VP8_COMMON *cm = &cpi->common;
     int ref_fb_idx;
 
-    if (ref_frame_flag == VP8_LAST_FLAG)
+    if (ref_frame_flag == VP8_LAST_FRAME)
         ref_fb_idx = cm->lst_fb_idx;
-    else if (ref_frame_flag == VP8_GOLD_FLAG)
+    else if (ref_frame_flag == VP8_GOLD_FRAME)
         ref_fb_idx = cm->gld_fb_idx;
-    else if (ref_frame_flag == VP8_ALT_FLAG)
+    else if (ref_frame_flag == VP8_ALTR_FRAME)
         ref_fb_idx = cm->alt_fb_idx;
     else
         return -1;
@@ -2516,17 +2529,17 @@
 
     return 0;
 }
-int vp8_set_reference(VP8_COMP *cpi, VP8_REFFRAME ref_frame_flag, YV12_BUFFER_CONFIG *sd)
+int vp8_set_reference(VP8_COMP *cpi, enum vpx_ref_frame_type ref_frame_flag, YV12_BUFFER_CONFIG *sd)
 {
     VP8_COMMON *cm = &cpi->common;
 
     int ref_fb_idx;
 
-    if (ref_frame_flag == VP8_LAST_FLAG)
+    if (ref_frame_flag == VP8_LAST_FRAME)
         ref_fb_idx = cm->lst_fb_idx;
-    else if (ref_frame_flag == VP8_GOLD_FLAG)
+    else if (ref_frame_flag == VP8_GOLD_FRAME)
         ref_fb_idx = cm->gld_fb_idx;
-    else if (ref_frame_flag == VP8_ALT_FLAG)
+    else if (ref_frame_flag == VP8_ALTR_FRAME)
         ref_fb_idx = cm->alt_fb_idx;
     else
         return -1;
@@ -2587,7 +2600,7 @@
 {
     VP8_COMMON *cm = &cpi->common;
 
-    // are we resizing the image
+    /* are we resizing the image */
     if (cm->horiz_scale != 0 || cm->vert_scale != 0)
     {
 #if CONFIG_SPATIAL_RESAMPLING
@@ -2615,51 +2628,57 @@
 }
 
 
-static void resize_key_frame(VP8_COMP *cpi)
+static int resize_key_frame(VP8_COMP *cpi)
 {
 #if CONFIG_SPATIAL_RESAMPLING
     VP8_COMMON *cm = &cpi->common;
 
-    // Do we need to apply resampling for one pass cbr.
-    // In one pass this is more limited than in two pass cbr
-    // The test and any change is only made one per key frame sequence
+    /* Do we need to apply resampling for one pass cbr.
+     * In one pass this is more limited than in two pass cbr
+     * The test and any change is only made once per key frame sequence
+     */
     if (cpi->oxcf.allow_spatial_resampling && (cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER))
     {
         int UNINITIALIZED_IS_SAFE(hr), UNINITIALIZED_IS_SAFE(hs);
         int UNINITIALIZED_IS_SAFE(vr), UNINITIALIZED_IS_SAFE(vs);
         int new_width, new_height;
 
-        // If we are below the resample DOWN watermark then scale down a notch.
+        /* If we are below the resample DOWN watermark then scale down a
+         * notch.
+         */
         if (cpi->buffer_level < (cpi->oxcf.resample_down_water_mark * cpi->oxcf.optimal_buffer_level / 100))
         {
             cm->horiz_scale = (cm->horiz_scale < ONETWO) ? cm->horiz_scale + 1 : ONETWO;
             cm->vert_scale = (cm->vert_scale < ONETWO) ? cm->vert_scale + 1 : ONETWO;
         }
-        // Should we now start scaling back up
+        /* Should we now start scaling back up */
         else if (cpi->buffer_level > (cpi->oxcf.resample_up_water_mark * cpi->oxcf.optimal_buffer_level / 100))
         {
             cm->horiz_scale = (cm->horiz_scale > NORMAL) ? cm->horiz_scale - 1 : NORMAL;
             cm->vert_scale = (cm->vert_scale > NORMAL) ? cm->vert_scale - 1 : NORMAL;
         }
 
-        // Get the new hieght and width
+        /* Get the new height and width */
         Scale2Ratio(cm->horiz_scale, &hr, &hs);
         Scale2Ratio(cm->vert_scale, &vr, &vs);
         new_width = ((hs - 1) + (cpi->oxcf.Width * hr)) / hs;
         new_height = ((vs - 1) + (cpi->oxcf.Height * vr)) / vs;
 
-        // If the image size has changed we need to reallocate the buffers
-        // and resample the source image
+        /* If the image size has changed we need to reallocate the buffers
+         * and resample the source image
+         */
         if ((cm->Width != new_width) || (cm->Height != new_height))
         {
             cm->Width = new_width;
             cm->Height = new_height;
             vp8_alloc_compressor_data(cpi);
             scale_and_extend_source(cpi->un_scaled_source, cpi);
+            return 1;
         }
     }
 
 #endif
+    return 0;
 }
 
 
@@ -2667,34 +2686,35 @@
 {
     VP8_COMMON *cm = &cpi->common;
 
-    // Select an interval before next GF or altref
+    /* Select an interval before next GF or altref */
     if (!cpi->auto_gold)
-        cpi->frames_till_gf_update_due = cpi->goldfreq;
+        cpi->frames_till_gf_update_due = DEFAULT_GF_INTERVAL;
 
     if ((cpi->pass != 2) && cpi->frames_till_gf_update_due)
     {
         cpi->current_gf_interval = cpi->frames_till_gf_update_due;
 
-        // Set the bits per frame that we should try and recover in subsequent inter frames
-        // to account for the extra GF spend... note that his does not apply for GF updates
-        // that occur coincident with a key frame as the extra cost of key frames is dealt
-        // with elsewhere.
-
+        /* Set the bits per frame that we should try and recover in
+         * subsequent inter frames to account for the extra GF spend...
+         * note that this does not apply for GF updates that occur
+         * coincident with a key frame as the extra cost of key frames is
+         * dealt with elsewhere.
+         */
         cpi->gf_overspend_bits += cpi->projected_frame_size;
         cpi->non_gf_bitrate_adjustment = cpi->gf_overspend_bits / cpi->frames_till_gf_update_due;
     }
 
-    // Update data structure that monitors level of reference to last GF
+    /* Update data structure that monitors level of reference to last GF */
     vpx_memset(cpi->gf_active_flags, 1, (cm->mb_rows * cm->mb_cols));
     cpi->gf_active_count = cm->mb_rows * cm->mb_cols;
 
-    // this frame refreshes means next frames don't unless specified by user
+    /* this frame refreshes means next frames don't unless specified by user */
     cpi->common.frames_since_golden = 0;
 
-    // Clear the alternate reference update pending flag.
+    /* Clear the alternate reference update pending flag. */
     cpi->source_alt_ref_pending = 0;
 
-    // Set the alternate refernce frame active flag
+    /* Set the alternate reference frame active flag */
     cpi->source_alt_ref_active = 1;
 
 
@@ -2703,25 +2723,29 @@
 {
     VP8_COMMON *cm = &cpi->common;
 
-    // Update the Golden frame usage counts.
+    /* Update the Golden frame usage counts. */
     if (cm->refresh_golden_frame)
     {
-        // Select an interval before next GF
+        /* Select an interval before next GF */
         if (!cpi->auto_gold)
-            cpi->frames_till_gf_update_due = cpi->goldfreq;
+            cpi->frames_till_gf_update_due = DEFAULT_GF_INTERVAL;
 
         if ((cpi->pass != 2) && (cpi->frames_till_gf_update_due > 0))
         {
             cpi->current_gf_interval = cpi->frames_till_gf_update_due;
 
-            // Set the bits per frame that we should try and recover in subsequent inter frames
-            // to account for the extra GF spend... note that his does not apply for GF updates
-            // that occur coincident with a key frame as the extra cost of key frames is dealt
-            // with elsewhere.
+            /* Set the bits per frame that we should try and recover in
+             * subsequent inter frames to account for the extra GF spend...
+             * note that this does not apply for GF updates that occur
+             * coincident with a key frame as the extra cost of key frames
+             * is dealt with elsewhere.
+             */
             if ((cm->frame_type != KEY_FRAME) && !cpi->source_alt_ref_active)
             {
-                // Calcluate GF bits to be recovered
-                // Projected size - av frame bits available for inter frames for clip as a whole
+                /* Calculate GF bits to be recovered
+                 * Projected size - av frame bits available for inter
+                 * frames for clip as a whole
+                 */
                 cpi->gf_overspend_bits += (cpi->projected_frame_size - cpi->inter_frame_target);
             }
 
@@ -2729,32 +2753,25 @@
 
         }
 
-        // Update data structure that monitors level of reference to last GF
+        /* Update data structure that monitors level of reference to last GF */
         vpx_memset(cpi->gf_active_flags, 1, (cm->mb_rows * cm->mb_cols));
         cpi->gf_active_count = cm->mb_rows * cm->mb_cols;
 
-        // this frame refreshes means next frames don't unless specified by user
+        /* this frame refreshes means next frames don't unless specified by
+         * user
+         */
         cm->refresh_golden_frame = 0;
         cpi->common.frames_since_golden = 0;
 
-        //if ( cm->frame_type == KEY_FRAME )
-        //{
         cpi->recent_ref_frame_usage[INTRA_FRAME] = 1;
         cpi->recent_ref_frame_usage[LAST_FRAME] = 1;
         cpi->recent_ref_frame_usage[GOLDEN_FRAME] = 1;
         cpi->recent_ref_frame_usage[ALTREF_FRAME] = 1;
-        //}
-        //else
-        //{
-        //  // Carry a potrtion of count over to begining of next gf sequence
-        //  cpi->recent_ref_frame_usage[INTRA_FRAME] >>= 5;
-        //  cpi->recent_ref_frame_usage[LAST_FRAME] >>= 5;
-        //  cpi->recent_ref_frame_usage[GOLDEN_FRAME] >>= 5;
-        //  cpi->recent_ref_frame_usage[ALTREF_FRAME] >>= 5;
-        //}
 
-        // ******** Fixed Q test code only ************
-        // If we are going to use the ALT reference for the next group of frames set a flag to say so.
+        /* ******** Fixed Q test code only ************ */
+        /* If we are going to use the ALT reference for the next group of
+         * frames set a flag to say so.
+         */
         if (cpi->oxcf.fixed_q >= 0 &&
             cpi->oxcf.play_alternate && !cpi->common.refresh_alt_ref_frame)
         {
@@ -2765,14 +2782,14 @@
         if (!cpi->source_alt_ref_pending)
             cpi->source_alt_ref_active = 0;
 
-        // Decrement count down till next gf
+        /* Decrement count down till next gf */
         if (cpi->frames_till_gf_update_due > 0)
             cpi->frames_till_gf_update_due--;
 
     }
     else if (!cpi->common.refresh_alt_ref_frame)
     {
-        // Decrement count down till next gf
+        /* Decrement count down till next gf */
         if (cpi->frames_till_gf_update_due > 0)
             cpi->frames_till_gf_update_due--;
 
@@ -2791,8 +2808,9 @@
     }
 }
 
-// This function updates the reference frame probability estimates that
-// will be used during mode selection
+/* This function updates the reference frame probability estimates that
+ * will be used during mode selection
+ */
 static void update_rd_ref_frame_probs(VP8_COMP *cpi)
 {
     VP8_COMMON *cm = &cpi->common;
@@ -2814,7 +2832,9 @@
         cpi->prob_gf_coded    = 128;
     }
 
-    // update reference frame costs since we can do better than what we got last frame.
+    /* update reference frame costs since we can do better than what we got
+     * last frame.
+     */
     if (cpi->oxcf.number_of_layers == 1)
     {
         if (cpi->common.refresh_alt_ref_frame)
@@ -2845,7 +2865,7 @@
 }
 
 
-// 1 = key, 0 = inter
+/* 1 = key, 0 = inter */
 static int decide_key_frame(VP8_COMP *cpi)
 {
     VP8_COMMON *cm = &cpi->common;
@@ -2857,8 +2877,8 @@
     if (cpi->Speed > 11)
         return 0;
 
-    // Clear down mmx registers
-    vp8_clear_system_state();  //__asm emms;
+    /* Clear down mmx registers */
+    vp8_clear_system_state();
 
     if ((cpi->compressor_speed == 2) && (cpi->Speed >= 5) && (cpi->sf.RD == 0))
     {
@@ -2904,7 +2924,7 @@
 
     }
 
-    // If the following are true we might as well code a key frame
+    /* If the following are true we might as well code a key frame */
     if (((cpi->this_frame_percent_intra == 100) &&
          (cpi->this_frame_percent_intra > (cpi->last_frame_percent_intra + 2))) ||
         ((cpi->this_frame_percent_intra > 95) &&
@@ -2912,9 +2932,12 @@
     {
         code_key_frame = 1;
     }
-    // in addition if the following are true and this is not a golden frame then code a key frame
-    // Note that on golden frames there often seems to be a pop in intra useage anyway hence this
-    // restriction is designed to prevent spurious key frames. The Intra pop needs to be investigated.
+    /* in addition if the following are true and this is not a golden frame
+     * then code a key frame. Note that on golden frames there often seems
+     * to be a pop in intra usage anyway hence this restriction is
+     * designed to prevent spurious key frames. The Intra pop needs to be
+     * investigated.
+     */
     else if (((cpi->this_frame_percent_intra > 60) &&
               (cpi->this_frame_percent_intra > (cpi->last_frame_percent_intra * 2))) ||
              ((cpi->this_frame_percent_intra > 75) &&
@@ -2946,7 +2969,7 @@
 void write_cx_frame_to_file(YV12_BUFFER_CONFIG *frame, int this_frame)
 {
 
-    // write the frame
+    /* write the frame */
     FILE *yframe;
     int i;
     char filename[255];
@@ -2974,10 +2997,11 @@
     fclose(yframe);
 }
 #endif
-// return of 0 means drop frame
+/* return of 0 means drop frame */
 
-// Function to test for conditions that indeicate we should loop
-// back and recode a frame.
+/* Function to test for conditions that indicate we should loop
+ * back and recode a frame.
+ */
 static int recode_loop_test( VP8_COMP *cpi,
                               int high_limit, int low_limit,
                               int q, int maxq, int minq )
@@ -2985,32 +3009,33 @@
     int force_recode = 0;
     VP8_COMMON *cm = &cpi->common;
 
-    // Is frame recode allowed at all
-    // Yes if either recode mode 1 is selected or mode two is selcted
-    // and the frame is a key frame. golden frame or alt_ref_frame
+    /* Is frame recode allowed at all
+     * Yes if either recode mode 1 is selected or mode two is selected
+     * and the frame is a key frame. golden frame or alt_ref_frame
+     */
     if ( (cpi->sf.recode_loop == 1) ||
          ( (cpi->sf.recode_loop == 2) &&
            ( (cm->frame_type == KEY_FRAME) ||
              cm->refresh_golden_frame ||
              cm->refresh_alt_ref_frame ) ) )
     {
-        // General over and under shoot tests
+        /* General over and under shoot tests */
         if ( ((cpi->projected_frame_size > high_limit) && (q < maxq)) ||
              ((cpi->projected_frame_size < low_limit) && (q > minq)) )
         {
             force_recode = 1;
         }
-        // Special Constrained quality tests
+        /* Special Constrained quality tests */
         else if (cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY)
         {
-            // Undershoot and below auto cq level
+            /* Undershoot and below auto cq level */
             if ( (q > cpi->cq_target_quality) &&
                  (cpi->projected_frame_size <
                      ((cpi->this_frame_target * 7) >> 3)))
             {
                 force_recode = 1;
             }
-            // Severe undershoot and between auto and user cq level
+            /* Severe undershoot and between auto and user cq level */
             else if ( (q > cpi->oxcf.cq_level) &&
                       (cpi->projected_frame_size < cpi->min_frame_bandwidth) &&
                       (cpi->active_best_quality > cpi->oxcf.cq_level))
@@ -3024,21 +3049,28 @@
     return force_recode;
 }
 
-static void update_reference_frames(VP8_COMMON *cm)
+static void update_reference_frames(VP8_COMP *cpi)
 {
+    VP8_COMMON *cm = &cpi->common;
     YV12_BUFFER_CONFIG *yv12_fb = cm->yv12_fb;
 
-    // At this point the new frame has been encoded.
-    // If any buffer copy / swapping is signaled it should be done here.
+    /* At this point the new frame has been encoded.
+     * If any buffer copy / swapping is signaled it should be done here.
+     */
 
     if (cm->frame_type == KEY_FRAME)
     {
-        yv12_fb[cm->new_fb_idx].flags |= VP8_GOLD_FLAG | VP8_ALT_FLAG ;
+        yv12_fb[cm->new_fb_idx].flags |= VP8_GOLD_FRAME | VP8_ALTR_FRAME ;
 
-        yv12_fb[cm->gld_fb_idx].flags &= ~VP8_GOLD_FLAG;
-        yv12_fb[cm->alt_fb_idx].flags &= ~VP8_ALT_FLAG;
+        yv12_fb[cm->gld_fb_idx].flags &= ~VP8_GOLD_FRAME;
+        yv12_fb[cm->alt_fb_idx].flags &= ~VP8_ALTR_FRAME;
 
         cm->alt_fb_idx = cm->gld_fb_idx = cm->new_fb_idx;
+
+#if CONFIG_MULTI_RES_ENCODING
+        cpi->current_ref_frames[GOLDEN_FRAME] = cm->current_video_frame;
+        cpi->current_ref_frames[ALTREF_FRAME] = cm->current_video_frame;
+#endif
     }
     else    /* For non key frames */
     {
@@ -3046,9 +3078,13 @@
         {
             assert(!cm->copy_buffer_to_arf);
 
-            cm->yv12_fb[cm->new_fb_idx].flags |= VP8_ALT_FLAG;
-            cm->yv12_fb[cm->alt_fb_idx].flags &= ~VP8_ALT_FLAG;
+            cm->yv12_fb[cm->new_fb_idx].flags |= VP8_ALTR_FRAME;
+            cm->yv12_fb[cm->alt_fb_idx].flags &= ~VP8_ALTR_FRAME;
             cm->alt_fb_idx = cm->new_fb_idx;
+
+#if CONFIG_MULTI_RES_ENCODING
+            cpi->current_ref_frames[ALTREF_FRAME] = cm->current_video_frame;
+#endif
         }
         else if (cm->copy_buffer_to_arf)
         {
@@ -3058,18 +3094,28 @@
             {
                 if(cm->alt_fb_idx != cm->lst_fb_idx)
                 {
-                    yv12_fb[cm->lst_fb_idx].flags |= VP8_ALT_FLAG;
-                    yv12_fb[cm->alt_fb_idx].flags &= ~VP8_ALT_FLAG;
+                    yv12_fb[cm->lst_fb_idx].flags |= VP8_ALTR_FRAME;
+                    yv12_fb[cm->alt_fb_idx].flags &= ~VP8_ALTR_FRAME;
                     cm->alt_fb_idx = cm->lst_fb_idx;
+
+#if CONFIG_MULTI_RES_ENCODING
+                    cpi->current_ref_frames[ALTREF_FRAME] =
+                        cpi->current_ref_frames[LAST_FRAME];
+#endif
                 }
             }
             else /* if (cm->copy_buffer_to_arf == 2) */
             {
                 if(cm->alt_fb_idx != cm->gld_fb_idx)
                 {
-                    yv12_fb[cm->gld_fb_idx].flags |= VP8_ALT_FLAG;
-                    yv12_fb[cm->alt_fb_idx].flags &= ~VP8_ALT_FLAG;
+                    yv12_fb[cm->gld_fb_idx].flags |= VP8_ALTR_FRAME;
+                    yv12_fb[cm->alt_fb_idx].flags &= ~VP8_ALTR_FRAME;
                     cm->alt_fb_idx = cm->gld_fb_idx;
+
+#if CONFIG_MULTI_RES_ENCODING
+                    cpi->current_ref_frames[ALTREF_FRAME] =
+                        cpi->current_ref_frames[GOLDEN_FRAME];
+#endif
                 }
             }
         }
@@ -3078,9 +3124,13 @@
         {
             assert(!cm->copy_buffer_to_gf);
 
-            cm->yv12_fb[cm->new_fb_idx].flags |= VP8_GOLD_FLAG;
-            cm->yv12_fb[cm->gld_fb_idx].flags &= ~VP8_GOLD_FLAG;
+            cm->yv12_fb[cm->new_fb_idx].flags |= VP8_GOLD_FRAME;
+            cm->yv12_fb[cm->gld_fb_idx].flags &= ~VP8_GOLD_FRAME;
             cm->gld_fb_idx = cm->new_fb_idx;
+
+#if CONFIG_MULTI_RES_ENCODING
+            cpi->current_ref_frames[GOLDEN_FRAME] = cm->current_video_frame;
+#endif
         }
         else if (cm->copy_buffer_to_gf)
         {
@@ -3090,18 +3140,28 @@
             {
                 if(cm->gld_fb_idx != cm->lst_fb_idx)
                 {
-                    yv12_fb[cm->lst_fb_idx].flags |= VP8_GOLD_FLAG;
-                    yv12_fb[cm->gld_fb_idx].flags &= ~VP8_GOLD_FLAG;
+                    yv12_fb[cm->lst_fb_idx].flags |= VP8_GOLD_FRAME;
+                    yv12_fb[cm->gld_fb_idx].flags &= ~VP8_GOLD_FRAME;
                     cm->gld_fb_idx = cm->lst_fb_idx;
+
+#if CONFIG_MULTI_RES_ENCODING
+                    cpi->current_ref_frames[GOLDEN_FRAME] =
+                        cpi->current_ref_frames[LAST_FRAME];
+#endif
                 }
             }
             else /* if (cm->copy_buffer_to_gf == 2) */
             {
                 if(cm->alt_fb_idx != cm->gld_fb_idx)
                 {
-                    yv12_fb[cm->alt_fb_idx].flags |= VP8_GOLD_FLAG;
-                    yv12_fb[cm->gld_fb_idx].flags &= ~VP8_GOLD_FLAG;
+                    yv12_fb[cm->alt_fb_idx].flags |= VP8_GOLD_FRAME;
+                    yv12_fb[cm->gld_fb_idx].flags &= ~VP8_GOLD_FRAME;
                     cm->gld_fb_idx = cm->alt_fb_idx;
+
+#if CONFIG_MULTI_RES_ENCODING
+                    cpi->current_ref_frames[GOLDEN_FRAME] =
+                        cpi->current_ref_frames[ALTREF_FRAME];
+#endif
                 }
             }
         }
@@ -3109,9 +3169,13 @@
 
     if (cm->refresh_last_frame)
     {
-        cm->yv12_fb[cm->new_fb_idx].flags |= VP8_LAST_FLAG;
-        cm->yv12_fb[cm->lst_fb_idx].flags &= ~VP8_LAST_FLAG;
+        cm->yv12_fb[cm->new_fb_idx].flags |= VP8_LAST_FRAME;
+        cm->yv12_fb[cm->lst_fb_idx].flags &= ~VP8_LAST_FRAME;
         cm->lst_fb_idx = cm->new_fb_idx;
+
+#if CONFIG_MULTI_RES_ENCODING
+        cpi->current_ref_frames[LAST_FRAME] = cm->current_video_frame;
+#endif
     }
 }
 
@@ -3234,13 +3298,14 @@
     int undershoot_seen = 0;
 #endif
 
-    int drop_mark = cpi->oxcf.drop_frames_water_mark * cpi->oxcf.optimal_buffer_level / 100;
+    int drop_mark = (int)(cpi->oxcf.drop_frames_water_mark *
+                          cpi->oxcf.optimal_buffer_level / 100);
     int drop_mark75 = drop_mark * 2 / 3;
     int drop_mark50 = drop_mark / 4;
     int drop_mark25 = drop_mark / 8;
 
 
-    // Clear down mmx registers to allow floating point in what follows
+    /* Clear down mmx registers to allow floating point in what follows */
     vp8_clear_system_state();
 
 #if CONFIG_MULTITHREAD
@@ -3252,41 +3317,44 @@
     }
 #endif
 
-    // Test code for segmentation of gf/arf (0,0)
-    //segmentation_test_function( cpi);
-
     if(cpi->force_next_frame_intra)
     {
         cm->frame_type = KEY_FRAME;  /* delayed intra frame */
         cpi->force_next_frame_intra = 0;
     }
 
-    // For an alt ref frame in 2 pass we skip the call to the second pass function that sets the target bandwidth
+    /* For an alt ref frame in 2 pass we skip the call to the second pass
+     * function that sets the target bandwidth
+     */
 #if !(CONFIG_REALTIME_ONLY)
 
     if (cpi->pass == 2)
     {
         if (cpi->common.refresh_alt_ref_frame)
         {
-            cpi->per_frame_bandwidth = cpi->twopass.gf_bits;                           // Per frame bit target for the alt ref frame
-            cpi->target_bandwidth = cpi->twopass.gf_bits * cpi->output_frame_rate;      // per second target bitrate
+            /* Per frame bit target for the alt ref frame */
+            cpi->per_frame_bandwidth = cpi->twopass.gf_bits;
+            /* per second target bitrate */
+            cpi->target_bandwidth = (int)(cpi->twopass.gf_bits *
+                                          cpi->output_frame_rate);
         }
     }
     else
 #endif
         cpi->per_frame_bandwidth  = (int)(cpi->target_bandwidth / cpi->output_frame_rate);
 
-    // Default turn off buffer to buffer copying
+    /* Default turn off buffer to buffer copying */
     cm->copy_buffer_to_gf = 0;
     cm->copy_buffer_to_arf = 0;
 
-    // Clear zbin over-quant value and mode boost values.
+    /* Clear zbin over-quant value and mode boost values. */
     cpi->zbin_over_quant = 0;
     cpi->zbin_mode_boost = 0;
 
-    // Enable or disable mode based tweaking of the zbin
-    // For 2 Pass Only used where GF/ARF prediction quality
-    // is above a threshold
+    /* Enable or disable mode based tweaking of the zbin
+     * For 2 Pass Only used where GF/ARF prediction quality
+     * is above a threshold
+     */
     cpi->zbin_mode_boost_enabled = 1;
     if (cpi->pass == 2)
     {
@@ -3296,19 +3364,21 @@
         }
     }
 
-    // Current default encoder behaviour for the altref sign bias
+    /* Current default encoder behaviour for the altref sign bias */
     if (cpi->source_alt_ref_active)
         cpi->common.ref_frame_sign_bias[ALTREF_FRAME] = 1;
     else
         cpi->common.ref_frame_sign_bias[ALTREF_FRAME] = 0;
 
-    // Check to see if a key frame is signalled
-    // For two pass with auto key frame enabled cm->frame_type may already be set, but not for one pass.
+    /* Check to see if a key frame is signalled
+     * For two pass with auto key frame enabled cm->frame_type may already
+     * be set, but not for one pass.
+     */
     if ((cm->current_video_frame == 0) ||
         (cm->frame_flags & FRAMEFLAGS_KEY) ||
         (cpi->oxcf.auto_key && (cpi->frames_since_key % cpi->key_frame_frequency == 0)))
     {
-        // Key frame from VFW/auto-keyframe/first frame
+        /* Key frame from VFW/auto-keyframe/first frame */
         cm->frame_type = KEY_FRAME;
     }
 
@@ -3318,53 +3388,54 @@
      */
     if (cpi->oxcf.mr_encoder_id)
     {
-        cm->frame_type =
-            ((LOWER_RES_FRAME_INFO*)cpi->oxcf.mr_low_res_mode_info)->frame_type;
+        LOWER_RES_FRAME_INFO* low_res_frame_info
+                        = (LOWER_RES_FRAME_INFO*)cpi->oxcf.mr_low_res_mode_info;
+
+        cm->frame_type = low_res_frame_info->frame_type;
+
+        if(cm->frame_type != KEY_FRAME)
+        {
+            cpi->mr_low_res_mv_avail = 1;
+            cpi->mr_low_res_mv_avail &= !(low_res_frame_info->is_frame_dropped);
+
+            if (cpi->ref_frame_flags & VP8_LAST_FRAME)
+                cpi->mr_low_res_mv_avail &= (cpi->current_ref_frames[LAST_FRAME]
+                         == low_res_frame_info->low_res_ref_frames[LAST_FRAME]);
+
+            if (cpi->ref_frame_flags & VP8_GOLD_FRAME)
+                cpi->mr_low_res_mv_avail &= (cpi->current_ref_frames[GOLDEN_FRAME]
+                         == low_res_frame_info->low_res_ref_frames[GOLDEN_FRAME]);
+
+            if (cpi->ref_frame_flags & VP8_ALTR_FRAME)
+                cpi->mr_low_res_mv_avail &= (cpi->current_ref_frames[ALTREF_FRAME]
+                         == low_res_frame_info->low_res_ref_frames[ALTREF_FRAME]);
+        }
     }
 #endif
 
-    // Set default state for segment and mode based loop filter update flags
-    cpi->mb.e_mbd.update_mb_segmentation_map = 0;
-    cpi->mb.e_mbd.update_mb_segmentation_data = 0;
-    cpi->mb.e_mbd.mode_ref_lf_delta_update = 0;
-
-    // Set various flags etc to special state if it is a key frame
+    /* Set various flags etc to special state if it is a key frame */
     if (cm->frame_type == KEY_FRAME)
     {
         int i;
 
-        // Reset the loop filter deltas and segmentation map
+        /* Set the loop filter deltas and segmentation map update */
         setup_features(cpi);
 
-        // If segmentation is enabled force a map update for key frames
-        if (cpi->mb.e_mbd.segmentation_enabled)
-        {
-            cpi->mb.e_mbd.update_mb_segmentation_map = 1;
-            cpi->mb.e_mbd.update_mb_segmentation_data = 1;
-        }
-
-        // The alternate reference frame cannot be active for a key frame
+        /* The alternate reference frame cannot be active for a key frame */
         cpi->source_alt_ref_active = 0;
 
-        // Reset the RD threshold multipliers to default of * 1 (128)
+        /* Reset the RD threshold multipliers to default of * 1 (128) */
         for (i = 0; i < MAX_MODES; i++)
         {
             cpi->rd_thresh_mult[i] = 128;
         }
     }
 
-    // Test code for segmentation
-    //if ( (cm->frame_type == KEY_FRAME) || ((cm->current_video_frame % 2) == 0))
-    //if ( (cm->current_video_frame % 2) == 0 )
-    //  enable_segmentation(cpi);
-    //else
-    //  disable_segmentation(cpi);
-
 #if 0
-    // Experimental code for lagged compress and one pass
-    // Initialise one_pass GF frames stats
-    // Update stats used for GF selection
-    //if ( cpi->pass == 0 )
+    /* Experimental code for lagged compress and one pass
+     * Initialise one_pass GF frames stats
+     * Update stats used for GF selection
+     */
     {
         cpi->one_pass_frame_index = cm->current_video_frame % MAX_LAG_BUFFERS;
 
@@ -3384,8 +3455,9 @@
 
     if (cpi->drop_frames_allowed)
     {
-        // The reset to decimation 0 is only done here for one pass.
-        // Once it is set two pass leaves decimation on till the next kf.
+        /* The reset to decimation 0 is only done here for one pass.
+         * Once it is set two pass leaves decimation on till the next kf.
+         */
         if ((cpi->buffer_level > drop_mark) && (cpi->decimation_factor > 0))
             cpi->decimation_factor --;
 
@@ -3404,14 +3476,17 @@
         {
             cpi->decimation_factor = 1;
         }
-        //vpx_log("Encoder: Decimation Factor: %d \n",cpi->decimation_factor);
     }
 
-    // The following decimates the frame rate according to a regular pattern (i.e. to 1/2 or 2/3 frame rate)
-    // This can be used to help prevent buffer under-run in CBR mode. Alternatively it might be desirable in
-    // some situations to drop frame rate but throw more bits at each frame.
-    //
-    // Note that dropping a key frame can be problematic if spatial resampling is also active
+    /* The following decimates the frame rate according to a regular
+     * pattern (i.e. to 1/2 or 2/3 frame rate) This can be used to help
+     * prevent buffer under-run in CBR mode. Alternatively it might be
+     * desirable in some situations to drop frame rate but throw more bits
+     * at each frame.
+     *
+     * Note that dropping a key frame can be problematic if spatial
+     * resampling is also active
+     */
     if (cpi->decimation_factor > 0)
     {
         switch (cpi->decimation_factor)
@@ -3427,8 +3502,10 @@
             break;
         }
 
-        // Note that we should not throw out a key frame (especially when spatial resampling is enabled).
-        if ((cm->frame_type == KEY_FRAME)) // && cpi->oxcf.allow_spatial_resampling )
+        /* Note that we should not throw out a key frame (especially when
+         * spatial resampling is enabled).
+         */
+        if ((cm->frame_type == KEY_FRAME))
         {
             cpi->decimation_count = cpi->decimation_factor;
         }
@@ -3440,6 +3517,10 @@
             if (cpi->bits_off_target > cpi->oxcf.maximum_buffer_size)
                 cpi->bits_off_target = cpi->oxcf.maximum_buffer_size;
 
+#if CONFIG_MULTI_RES_ENCODING
+            vp8_store_drop_frame_info(cpi);
+#endif
+
             cm->current_video_frame++;
             cpi->frames_since_key++;
 
@@ -3453,7 +3534,9 @@
             {
                 unsigned int i;
 
-                // Propagate bits saved by dropping the frame to higher layers
+                /* Propagate bits saved by dropping the frame to higher
+                 * layers
+                 */
                 for (i=cpi->current_layer+1; i<cpi->oxcf.number_of_layers; i++)
                 {
                     LAYER_CONTEXT *lc = &cpi->layer_context[i];
@@ -3469,24 +3552,32 @@
         else
             cpi->decimation_count = cpi->decimation_factor;
     }
+    else
+        cpi->decimation_count = 0;
 
-    // Decide how big to make the frame
+    /* Decide how big to make the frame */
     if (!vp8_pick_frame_size(cpi))
     {
+        /* TODO: the two drop_frame and return-code cases could be combined. */
+#if CONFIG_MULTI_RES_ENCODING
+        vp8_store_drop_frame_info(cpi);
+#endif
         cm->current_video_frame++;
         cpi->frames_since_key++;
         return;
     }
 
-    // Reduce active_worst_allowed_q for CBR if our buffer is getting too full.
-    // This has a knock on effect on active best quality as well.
-    // For CBR if the buffer reaches its maximum level then we can no longer
-    // save up bits for later frames so we might as well use them up
-    // on the current frame.
+    /* Reduce active_worst_allowed_q for CBR if our buffer is getting too full.
+     * This has a knock on effect on active best quality as well.
+     * For CBR if the buffer reaches its maximum level then we can no longer
+     * save up bits for later frames so we might as well use them up
+     * on the current frame.
+     */
     if ((cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER) &&
         (cpi->buffer_level >= cpi->oxcf.optimal_buffer_level) && cpi->buffered_mode)
     {
-        int Adjustment = cpi->active_worst_quality / 4;       // Max adjustment is 1/4
+        /* Max adjustment is 1/4 */
+        int Adjustment = cpi->active_worst_quality / 4;
 
         if (Adjustment)
         {
@@ -3494,10 +3585,16 @@
 
             if (cpi->buffer_level < cpi->oxcf.maximum_buffer_size)
             {
-                buff_lvl_step = (cpi->oxcf.maximum_buffer_size - cpi->oxcf.optimal_buffer_level) / Adjustment;
+                buff_lvl_step = (int)
+                                ((cpi->oxcf.maximum_buffer_size -
+                                  cpi->oxcf.optimal_buffer_level) /
+                                  Adjustment);
 
                 if (buff_lvl_step)
-                    Adjustment = (cpi->buffer_level - cpi->oxcf.optimal_buffer_level) / buff_lvl_step;
+                    Adjustment = (int)
+                                 ((cpi->buffer_level -
+                                 cpi->oxcf.optimal_buffer_level) /
+                                 buff_lvl_step);
                 else
                     Adjustment = 0;
             }
@@ -3509,8 +3606,9 @@
         }
     }
 
-    // Set an active best quality and if necessary active worst quality
-    // There is some odd behavior for one pass here that needs attention.
+    /* Set an active best quality and if necessary active worst quality
+     * There is some odd behavior for one pass here that needs attention.
+     */
     if ( (cpi->pass == 2) || (cpi->ni_frames > 150))
     {
         vp8_clear_system_state();
@@ -3526,9 +3624,10 @@
                 else
                    cpi->active_best_quality = kf_high_motion_minq[Q];
 
-                // Special case for key frames forced because we have reached
-                // the maximum key frame interval. Here force the Q to a range
-                // based on the ambient Q to reduce the risk of popping
+                /* Special case for key frames forced because we have reached
+                 * the maximum key frame interval. Here force the Q to a range
+                 * based on the ambient Q to reduce the risk of popping
+                 */
                 if ( cpi->this_key_frame_forced )
                 {
                     if ( cpi->active_best_quality > cpi->avg_frame_qindex * 7/8)
@@ -3537,7 +3636,7 @@
                         cpi->active_best_quality = cpi->avg_frame_qindex >> 2;
                 }
             }
-            // One pass more conservative
+            /* One pass more conservative */
             else
                cpi->active_best_quality = kf_high_motion_minq[Q];
         }
@@ -3545,16 +3644,17 @@
         else if (cpi->oxcf.number_of_layers==1 &&
                 (cm->refresh_golden_frame || cpi->common.refresh_alt_ref_frame))
         {
-            // Use the lower of cpi->active_worst_quality and recent
-            // average Q as basis for GF/ARF Q limit unless last frame was
-            // a key frame.
+            /* Use the lower of cpi->active_worst_quality and recent
+             * average Q as basis for GF/ARF Q limit unless last frame was
+             * a key frame.
+             */
             if ( (cpi->frames_since_key > 1) &&
                (cpi->avg_frame_qindex < cpi->active_worst_quality) )
             {
                 Q = cpi->avg_frame_qindex;
             }
 
-            // For constrained quality dont allow Q less than the cq level
+            /* For constrained quality don't allow Q less than the cq level */
             if ( (cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY) &&
                  (Q < cpi->cq_target_quality) )
             {
@@ -3570,14 +3670,14 @@
                 else
                     cpi->active_best_quality = gf_mid_motion_minq[Q];
 
-                // Constrained quality use slightly lower active best.
+                /* Constrained quality use slightly lower active best. */
                 if ( cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY )
                 {
                     cpi->active_best_quality =
                         cpi->active_best_quality * 15/16;
                 }
             }
-            // One pass more conservative
+            /* One pass more conservative */
             else
                 cpi->active_best_quality = gf_high_motion_minq[Q];
         }
@@ -3585,14 +3685,16 @@
         {
             cpi->active_best_quality = inter_minq[Q];
 
-            // For the constant/constrained quality mode we dont want
-            // q to fall below the cq level.
+            /* For the constant/constrained quality mode we don't want
+             * q to fall below the cq level.
+             */
             if ((cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY) &&
                 (cpi->active_best_quality < cpi->cq_target_quality) )
             {
-                // If we are strongly undershooting the target rate in the last
-                // frames then use the user passed in cq value not the auto
-                // cq value.
+                /* If we are strongly undershooting the target rate in the last
+                 * frames then use the user passed in cq value not the auto
+                 * cq value.
+                 */
                 if ( cpi->rolling_actual_bits < cpi->min_frame_bandwidth )
                     cpi->active_best_quality = cpi->oxcf.cq_level;
                 else
@@ -3600,26 +3702,33 @@
             }
         }
 
-        // If CBR and the buffer is as full then it is reasonable to allow
-        // higher quality on the frames to prevent bits just going to waste.
+        /* If CBR and the buffer is as full then it is reasonable to allow
+         * higher quality on the frames to prevent bits just going to waste.
+         */
         if (cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER)
         {
-            // Note that the use of >= here elliminates the risk of a devide
-            // by 0 error in the else if clause
+            /* Note that the use of >= here eliminates the risk of a divide
+             * by 0 error in the else if clause
+             */
             if (cpi->buffer_level >= cpi->oxcf.maximum_buffer_size)
                 cpi->active_best_quality = cpi->best_quality;
 
             else if (cpi->buffer_level > cpi->oxcf.optimal_buffer_level)
             {
-                int Fraction = ((cpi->buffer_level - cpi->oxcf.optimal_buffer_level) * 128) / (cpi->oxcf.maximum_buffer_size - cpi->oxcf.optimal_buffer_level);
-                int min_qadjustment = ((cpi->active_best_quality - cpi->best_quality) * Fraction) / 128;
+                int Fraction = (int)
+                  (((cpi->buffer_level - cpi->oxcf.optimal_buffer_level) * 128)
+                  / (cpi->oxcf.maximum_buffer_size -
+                  cpi->oxcf.optimal_buffer_level));
+                int min_qadjustment = ((cpi->active_best_quality -
+                                        cpi->best_quality) * Fraction) / 128;
 
                 cpi->active_best_quality -= min_qadjustment;
             }
         }
     }
-    // Make sure constrained quality mode limits are adhered to for the first
-    // few frames of one pass encodes
+    /* Make sure constrained quality mode limits are adhered to for the first
+     * few frames of one pass encodes
+     */
     else if (cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY)
     {
         if ( (cm->frame_type == KEY_FRAME) ||
@@ -3633,7 +3742,7 @@
         }
     }
 
-    // Clip the active best and worst quality values to limits
+    /* Clip the active best and worst quality values to limits */
     if (cpi->active_worst_quality > cpi->worst_quality)
         cpi->active_worst_quality = cpi->worst_quality;
 
@@ -3643,14 +3752,14 @@
     if ( cpi->active_worst_quality < cpi->active_best_quality )
         cpi->active_worst_quality = cpi->active_best_quality;
 
-    // Determine initial Q to try
+    /* Determine initial Q to try */
     Q = vp8_regulate_q(cpi, cpi->this_frame_target);
 
 #if !(CONFIG_REALTIME_ONLY)
 
-    // Set highest allowed value for Zbin over quant
+    /* Set highest allowed value for Zbin over quant */
     if (cm->frame_type == KEY_FRAME)
-        zbin_oq_high = 0; //ZBIN_OQ_MAX/16
+        zbin_oq_high = 0;
     else if ((cpi->oxcf.number_of_layers == 1) && ((cm->refresh_alt_ref_frame ||
               (cm->refresh_golden_frame && !cpi->source_alt_ref_active))))
     {
@@ -3660,15 +3769,16 @@
         zbin_oq_high = ZBIN_OQ_MAX;
 #endif
 
-    // Setup background Q adjustment for error resilient mode.
-    // For multi-layer encodes only enable this for the base layer.
+    /* Setup background Q adjustment for error resilient mode.
+     * For multi-layer encodes only enable this for the base layer.
+     */
     if (cpi->cyclic_refresh_mode_enabled && (cpi->current_layer==0))
         cyclic_background_refresh(cpi, Q, 0);
 
     vp8_compute_frame_size_bounds(cpi, &frame_under_shoot_limit, &frame_over_shoot_limit);
 
 #if !(CONFIG_REALTIME_ONLY)
-    // Limit Q range for the adaptive loop.
+    /* Limit Q range for the adaptive loop. */
     bottom_index = cpi->active_best_quality;
     top_index    = cpi->active_worst_quality;
     q_low  = cpi->active_best_quality;
@@ -3736,16 +3846,11 @@
 
     do
     {
-        vp8_clear_system_state();  //__asm emms;
-
-        /*
-        if(cpi->is_src_frame_alt_ref)
-            Q = 127;
-            */
+        vp8_clear_system_state();
 
         vp8_set_quantizer(cpi, Q);
 
-        // setup skip prob for costing in mode/mv decision
+        /* setup skip prob for costing in mode/mv decision */
         if (cpi->common.mb_no_coeff_skip)
         {
             cpi->prob_skip_false = cpi->base_skip_false_prob[Q];
@@ -3789,7 +3894,9 @@
                         */
                 }
 
-                //as this is for cost estimate, let's make sure it does not go extreme eitehr way
+                /* as this is for cost estimate, let's make sure it does not
+                 * go extreme either way
+                 */
                 if (cpi->prob_skip_false < 5)
                     cpi->prob_skip_false = 5;
 
@@ -3815,7 +3922,17 @@
 
         if (cm->frame_type == KEY_FRAME)
         {
-            resize_key_frame(cpi);
+            if(resize_key_frame(cpi))
+            {
+              /* If the frame size has changed, need to reset Q, quantizer,
+               * and background refresh.
+               */
+              Q = vp8_regulate_q(cpi, cpi->this_frame_target);
+              if (cpi->cyclic_refresh_mode_enabled && (cpi->current_layer==0))
+                cyclic_background_refresh(cpi, Q, 0);
+              vp8_set_quantizer(cpi, Q);
+            }
+
             vp8_setup_key_frame(cpi);
         }
 
@@ -3834,7 +3951,7 @@
 
             if (cm->refresh_entropy_probs == 0)
             {
-                // save a copy for later refresh
+                /* save a copy for later refresh */
                 vpx_memcpy(&cm->lfc, &cm->fc, sizeof(cm->fc));
             }
 
@@ -3842,23 +3959,25 @@
 
             vp8_update_coef_probs(cpi);
 
-            // transform / motion compensation build reconstruction frame
-            // +pack coef partitions
+            /* transform / motion compensation build reconstruction frame
+             * +pack coef partitions
+             */
             vp8_encode_frame(cpi);
 
             /* cpi->projected_frame_size is not needed for RT mode */
         }
 #else
-        // transform / motion compensation build reconstruction frame
+        /* transform / motion compensation build reconstruction frame */
         vp8_encode_frame(cpi);
 
         cpi->projected_frame_size -= vp8_estimate_entropy_savings(cpi);
         cpi->projected_frame_size = (cpi->projected_frame_size > 0) ? cpi->projected_frame_size : 0;
 #endif
-        vp8_clear_system_state();  //__asm emms;
+        vp8_clear_system_state();
 
-        // Test to see if the stats generated for this frame indicate that we should have coded a key frame
-        // (assuming that we didn't)!
+        /* Test to see if the stats generated for this frame indicate that
+         * we should have coded a key frame (assuming that we didn't)!
+         */
         if (cpi->pass != 2 && cpi->oxcf.auto_key && cm->frame_type != KEY_FRAME)
         {
             int key_frame_decision = decide_key_frame(cpi);
@@ -3872,31 +3991,26 @@
 #if !(CONFIG_REALTIME_ONLY)
             else if (key_frame_decision)
             {
-                // Reset all our sizing numbers and recode
+                /* Reset all our sizing numbers and recode */
                 cm->frame_type = KEY_FRAME;
 
                 vp8_pick_frame_size(cpi);
 
-                // Clear the Alt reference frame active flag when we have a key frame
+                /* Clear the Alt reference frame active flag when we have
+                 * a key frame
+                 */
                 cpi->source_alt_ref_active = 0;
 
-                // Reset the loop filter deltas and segmentation map
+                /* Set the loop filter deltas and segmentation map update */
                 setup_features(cpi);
 
-                // If segmentation is enabled force a map update for key frames
-                if (cpi->mb.e_mbd.segmentation_enabled)
-                {
-                    cpi->mb.e_mbd.update_mb_segmentation_map = 1;
-                    cpi->mb.e_mbd.update_mb_segmentation_data = 1;
-                }
-
                 vp8_restore_coding_context(cpi);
 
                 Q = vp8_regulate_q(cpi, cpi->this_frame_target);
 
                 vp8_compute_frame_size_bounds(cpi, &frame_under_shoot_limit, &frame_over_shoot_limit);
 
-                // Limit Q range for the adaptive loop.
+                /* Limit Q range for the adaptive loop. */
                 bottom_index = cpi->active_best_quality;
                 top_index    = cpi->active_worst_quality;
                 q_low  = cpi->active_best_quality;
@@ -3915,7 +4029,7 @@
         if (frame_over_shoot_limit == 0)
             frame_over_shoot_limit = 1;
 
-        // Are we are overshooting and up against the limit of active max Q.
+        /* Are we overshooting and up against the limit of active max Q? */
         if (((cpi->pass != 2) || (cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER)) &&
             (Q == cpi->active_worst_quality)                     &&
             (cpi->active_worst_quality < cpi->worst_quality)      &&
@@ -3923,50 +4037,52 @@
         {
             int over_size_percent = ((cpi->projected_frame_size - frame_over_shoot_limit) * 100) / frame_over_shoot_limit;
 
-            // If so is there any scope for relaxing it
+            /* If so is there any scope for relaxing it */
             while ((cpi->active_worst_quality < cpi->worst_quality) && (over_size_percent > 0))
             {
                 cpi->active_worst_quality++;
-
-                over_size_percent = (int)(over_size_percent * 0.96);        // Assume 1 qstep = about 4% on frame size.
+                /* Assume 1 qstep = about 4% on frame size. */
+                over_size_percent = (int)(over_size_percent * 0.96);
             }
 #if !(CONFIG_REALTIME_ONLY)
             top_index = cpi->active_worst_quality;
 #endif
-            // If we have updated the active max Q do not call vp8_update_rate_correction_factors() this loop.
+            /* If we have updated the active max Q do not call
+             * vp8_update_rate_correction_factors() this loop.
+             */
             active_worst_qchanged = 1;
         }
         else
             active_worst_qchanged = 0;
 
 #if !(CONFIG_REALTIME_ONLY)
-        // Special case handling for forced key frames
+        /* Special case handling for forced key frames */
         if ( (cm->frame_type == KEY_FRAME) && cpi->this_key_frame_forced )
         {
             int last_q = Q;
             int kf_err = vp8_calc_ss_err(cpi->Source,
                                          &cm->yv12_fb[cm->new_fb_idx]);
 
-            // The key frame is not good enough
+            /* The key frame is not good enough */
             if ( kf_err > ((cpi->ambient_err * 7) >> 3) )
             {
-                // Lower q_high
+                /* Lower q_high */
                 q_high = (Q > q_low) ? (Q - 1) : q_low;
 
-                // Adjust Q
+                /* Adjust Q */
                 Q = (q_high + q_low) >> 1;
             }
-            // The key frame is much better than the previous frame
+            /* The key frame is much better than the previous frame */
             else if ( kf_err < (cpi->ambient_err >> 1) )
             {
-                // Raise q_low
+                /* Raise q_low */
                 q_low = (Q < q_high) ? (Q + 1) : q_high;
 
-                // Adjust Q
+                /* Adjust Q */
                 Q = (q_high + q_low + 1) >> 1;
             }
 
-            // Clamp Q to upper and lower limits:
+            /* Clamp Q to upper and lower limits: */
             if (Q > q_high)
                 Q = q_high;
             else if (Q < q_low)
@@ -3975,7 +4091,9 @@
             Loop = Q != last_q;
         }
 
-        // Is the projected frame size out of range and are we allowed to attempt to recode.
+        /* Is the projected frame size out of range and are we allowed
+         * to attempt to recode.
+         */
         else if ( recode_loop_test( cpi,
                                frame_over_shoot_limit, frame_under_shoot_limit,
                                Q, top_index, bottom_index ) )
@@ -3983,28 +4101,33 @@
             int last_q = Q;
             int Retries = 0;
 
-            // Frame size out of permitted range:
-            // Update correction factor & compute new Q to try...
+            /* Frame size out of permitted range. Update correction factor
+             * & compute new Q to try...
+             */
 
-            // Frame is too large
+            /* Frame is too large */
             if (cpi->projected_frame_size > cpi->this_frame_target)
             {
-                //if ( cpi->zbin_over_quant == 0 )
-                q_low = (Q < q_high) ? (Q + 1) : q_high; // Raise Qlow as to at least the current value
+                /* Raise Qlow as to at least the current value */
+                q_low = (Q < q_high) ? (Q + 1) : q_high;
 
-                if (cpi->zbin_over_quant > 0)            // If we are using over quant do the same for zbin_oq_low
+                /* If we are using over quant do the same for zbin_oq_low */
+                if (cpi->zbin_over_quant > 0)
                     zbin_oq_low = (cpi->zbin_over_quant < zbin_oq_high) ? (cpi->zbin_over_quant + 1) : zbin_oq_high;
 
-                //if ( undershoot_seen || (Q == MAXQ) )
                 if (undershoot_seen)
                 {
-                    // Update rate_correction_factor unless cpi->active_worst_quality has changed.
+                    /* Update rate_correction_factor unless
+                     * cpi->active_worst_quality has changed.
+                     */
                     if (!active_worst_qchanged)
                         vp8_update_rate_correction_factors(cpi, 1);
 
                     Q = (q_high + q_low + 1) / 2;
 
-                    // Adjust cpi->zbin_over_quant (only allowed when Q is max)
+                    /* Adjust cpi->zbin_over_quant (only allowed when Q
+                     * is max)
+                     */
                     if (Q < MAXQ)
                         cpi->zbin_over_quant = 0;
                     else
@@ -4015,7 +4138,9 @@
                 }
                 else
                 {
-                    // Update rate_correction_factor unless cpi->active_worst_quality has changed.
+                    /* Update rate_correction_factor unless
+                     * cpi->active_worst_quality has changed.
+                     */
                     if (!active_worst_qchanged)
                         vp8_update_rate_correction_factors(cpi, 0);
 
@@ -4031,23 +4156,29 @@
 
                 overshoot_seen = 1;
             }
-            // Frame is too small
+            /* Frame is too small */
             else
             {
                 if (cpi->zbin_over_quant == 0)
-                    q_high = (Q > q_low) ? (Q - 1) : q_low; // Lower q_high if not using over quant
-                else                                    // else lower zbin_oq_high
+                    /* Lower q_high if not using over quant */
+                    q_high = (Q > q_low) ? (Q - 1) : q_low;
+                else
+                    /* else lower zbin_oq_high */
                     zbin_oq_high = (cpi->zbin_over_quant > zbin_oq_low) ? (cpi->zbin_over_quant - 1) : zbin_oq_low;
 
                 if (overshoot_seen)
                 {
-                    // Update rate_correction_factor unless cpi->active_worst_quality has changed.
+                    /* Update rate_correction_factor unless
+                     * cpi->active_worst_quality has changed.
+                     */
                     if (!active_worst_qchanged)
                         vp8_update_rate_correction_factors(cpi, 1);
 
                     Q = (q_high + q_low) / 2;
 
-                    // Adjust cpi->zbin_over_quant (only allowed when Q is max)
+                    /* Adjust cpi->zbin_over_quant (only allowed when Q
+                     * is max)
+                     */
                     if (Q < MAXQ)
                         cpi->zbin_over_quant = 0;
                     else
@@ -4055,16 +4186,19 @@
                 }
                 else
                 {
-                    // Update rate_correction_factor unless cpi->active_worst_quality has changed.
+                    /* Update rate_correction_factor unless
+                     * cpi->active_worst_quality has changed.
+                     */
                     if (!active_worst_qchanged)
                         vp8_update_rate_correction_factors(cpi, 0);
 
                     Q = vp8_regulate_q(cpi, cpi->this_frame_target);
 
-                    // Special case reset for qlow for constrained quality.
-                    // This should only trigger where there is very substantial
-                    // undershoot on a frame and the auto cq level is above
-                    // the user passsed in value.
+                    /* Special case reset for qlow for constrained quality.
+                     * This should only trigger where there is very substantial
+                     * undershoot on a frame and the auto cq level is above
+                     * the user passed in value.
+                     */
                     if ( (cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY) &&
                          (Q < q_low) )
                     {
@@ -4082,13 +4216,13 @@
                 undershoot_seen = 1;
             }
 
-            // Clamp Q to upper and lower limits:
+            /* Clamp Q to upper and lower limits: */
             if (Q > q_high)
                 Q = q_high;
             else if (Q < q_low)
                 Q = q_low;
 
-            // Clamp cpi->zbin_over_quant
+            /* Clamp cpi->zbin_over_quant */
             cpi->zbin_over_quant = (cpi->zbin_over_quant < zbin_oq_low) ? zbin_oq_low : (cpi->zbin_over_quant > zbin_oq_high) ? zbin_oq_high : cpi->zbin_over_quant;
 
             Loop = Q != last_q;
@@ -4112,30 +4246,20 @@
     while (Loop == 1);
 
 #if 0
-    // Experimental code for lagged and one pass
-    // Update stats used for one pass GF selection
+    /* Experimental code for lagged and one pass
+     * Update stats used for one pass GF selection
+     */
     {
-        /*
-            int frames_so_far;
-            double frame_intra_error;
-            double frame_coded_error;
-            double frame_pcnt_inter;
-            double frame_pcnt_motion;
-            double frame_mvr;
-            double frame_mvr_abs;
-            double frame_mvc;
-            double frame_mvc_abs;
-        */
-
         cpi->one_pass_frame_stats[cpi->one_pass_frame_index].frame_coded_error = (double)cpi->prediction_error;
         cpi->one_pass_frame_stats[cpi->one_pass_frame_index].frame_intra_error = (double)cpi->intra_error;
         cpi->one_pass_frame_stats[cpi->one_pass_frame_index].frame_pcnt_inter = (double)(100 - cpi->this_frame_percent_intra) / 100.0;
     }
 #endif
 
-    // Special case code to reduce pulsing when key frames are forced at a
-    // fixed interval. Note the reconstruction error if it is the frame before
-    // the force key frame
+    /* Special case code to reduce pulsing when key frames are forced at a
+     * fixed interval. Note the reconstruction error if it is the frame before
+     * the force key frame
+     */
     if ( cpi->next_key_frame_forced && (cpi->twopass.frames_to_key == 0) )
     {
         cpi->ambient_err = vp8_calc_ss_err(cpi->Source,
@@ -4174,13 +4298,38 @@
         }
     }
 
+    /* Count last ref frame 0,0 usage on current encoded frame. */
+    {
+        int mb_row;
+        int mb_col;
+        /* Point to beginning of MODE_INFO arrays. */
+        MODE_INFO *tmp = cm->mi;
+
+        cpi->inter_zz_count = 0;
+
+        if(cm->frame_type != KEY_FRAME)
+        {
+            for (mb_row = 0; mb_row < cm->mb_rows; mb_row ++)
+            {
+                for (mb_col = 0; mb_col < cm->mb_cols; mb_col ++)
+                {
+                    if(tmp->mbmi.mode == ZEROMV && tmp->mbmi.ref_frame == LAST_FRAME)
+                        cpi->inter_zz_count++;
+                    tmp++;
+                }
+                tmp++;
+            }
+        }
+    }
+
 #if CONFIG_MULTI_RES_ENCODING
     vp8_cal_dissimilarity(cpi);
 #endif
 
-    // Update the GF useage maps.
-    // This is done after completing the compression of a frame when all
-    // modes etc. are finalized but before loop filter
+    /* Update the GF useage maps.
+     * This is done after completing the compression of a frame when all
+     * modes etc. are finalized but before loop filter
+     */
     if (cpi->oxcf.number_of_layers == 1)
         vp8_update_gf_useage_maps(cpi, cm, &cpi->mb);
 
@@ -4195,9 +4344,10 @@
     }
 #endif
 
-    // For inter frames the current default behavior is that when
-    // cm->refresh_golden_frame is set we copy the old GF over to the ARF buffer
-    // This is purely an encoder decision at present.
+    /* For inter frames the current default behavior is that when
+     * cm->refresh_golden_frame is set we copy the old GF over to the ARF buffer
+     * This is purely an encoder decision at present.
+     */
     if (!cpi->oxcf.error_resilient_mode && cm->refresh_golden_frame)
         cm->copy_buffer_to_arf  = 2;
     else
@@ -4208,7 +4358,8 @@
 #if CONFIG_MULTITHREAD
     if (cpi->b_multi_threaded)
     {
-        sem_post(&cpi->h_event_start_lpf); /* start loopfilter in separate thread */
+        /* start loopfilter in separate thread */
+        sem_post(&cpi->h_event_start_lpf);
         cpi->b_lpf_running = 1;
     }
     else
@@ -4217,7 +4368,7 @@
         vp8_loopfilter_frame(cpi, cm);
     }
 
-    update_reference_frames(cm);
+    update_reference_frames(cpi);
 
 #if !(CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING)
     if (cpi->oxcf.error_resilient_mode)
@@ -4232,7 +4383,7 @@
         sem_wait(&cpi->h_event_end_lpf);
 #endif
 
-    // build the bitstream
+    /* build the bitstream */
     vp8_pack_bitstream(cpi, dest, dest_end, size);
 
 #if CONFIG_MULTITHREAD
@@ -4248,7 +4399,7 @@
      * needed in motion search besides loopfilter */
     cm->last_frame_type = cm->frame_type;
 
-    // Update rate control heuristics
+    /* Update rate control heuristics */
     cpi->total_byte_count += (*size);
     cpi->projected_frame_size = (*size) << 3;
 
@@ -4269,18 +4420,21 @@
         vp8_adjust_key_frame_context(cpi);
     }
 
-    // Keep a record of ambient average Q.
+    /* Keep a record of ambient average Q. */
     if (cm->frame_type != KEY_FRAME)
         cpi->avg_frame_qindex = (2 + 3 * cpi->avg_frame_qindex + cm->base_qindex) >> 2;
 
-    // Keep a record from which we can calculate the average Q excluding GF updates and key frames
+    /* Keep a record from which we can calculate the average Q excluding
+     * GF updates and key frames
+     */
     if ((cm->frame_type != KEY_FRAME) && ((cpi->oxcf.number_of_layers > 1) ||
         (!cm->refresh_golden_frame && !cm->refresh_alt_ref_frame)))
     {
         cpi->ni_frames++;
 
-        // Calculate the average Q for normal inter frames (not key or GFU
-        // frames).
+        /* Calculate the average Q for normal inter frames (not key or GFU
+         * frames).
+         */
         if ( cpi->pass == 2 )
         {
             cpi->ni_tot_qi += Q;
@@ -4288,81 +4442,62 @@
         }
         else
         {
-            // Damp value for first few frames
+            /* Damp value for first few frames */
             if (cpi->ni_frames > 150 )
             {
                 cpi->ni_tot_qi += Q;
                 cpi->ni_av_qi = (cpi->ni_tot_qi / cpi->ni_frames);
             }
-            // For one pass, early in the clip ... average the current frame Q
-            // value with the worstq entered by the user as a dampening measure
+            /* For one pass, early in the clip ... average the current frame Q
+             * value with the worstq entered by the user as a dampening measure
+             */
             else
             {
                 cpi->ni_tot_qi += Q;
                 cpi->ni_av_qi = ((cpi->ni_tot_qi / cpi->ni_frames) + cpi->worst_quality + 1) / 2;
             }
 
-            // If the average Q is higher than what was used in the last frame
-            // (after going through the recode loop to keep the frame size within range)
-            // then use the last frame value - 1.
-            // The -1 is designed to stop Q and hence the data rate, from progressively
-            // falling away during difficult sections, but at the same time reduce the number of
-            // itterations around the recode loop.
+            /* If the average Q is higher than what was used in the last
+             * frame (after going through the recode loop to keep the frame
+             * size within range) then use the last frame value - 1. The -1
+             * is designed to stop Q and hence the data rate, from
+             * progressively falling away during difficult sections, but at
+             * the same time reduce the number of iterations around the
+             * recode loop.
+             */
             if (Q > cpi->ni_av_qi)
                 cpi->ni_av_qi = Q - 1;
         }
     }
 
-#if 0
-
-    // If the frame was massively oversize and we are below optimal buffer level drop next frame
-    if ((cpi->drop_frames_allowed) &&
-        (cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER) &&
-        (cpi->buffer_level < cpi->oxcf.drop_frames_water_mark * cpi->oxcf.optimal_buffer_level / 100) &&
-        (cpi->projected_frame_size > (4 * cpi->this_frame_target)))
-    {
-        cpi->drop_frame = 1;
-    }
-
-#endif
-
-    // Set the count for maximum consecutive dropped frames based upon the ratio of
-    // this frame size to the target average per frame bandwidth.
-    // (cpi->av_per_frame_bandwidth > 0) is just a sanity check to prevent / 0.
-    if (cpi->drop_frames_allowed && (cpi->av_per_frame_bandwidth > 0))
-    {
-        cpi->max_drop_count = cpi->projected_frame_size / cpi->av_per_frame_bandwidth;
-
-        if (cpi->max_drop_count > cpi->max_consec_dropped_frames)
-            cpi->max_drop_count = cpi->max_consec_dropped_frames;
-    }
-
-    // Update the buffer level variable.
-    // Non-viewable frames are a special case and are treated as pure overhead.
+    /* Update the buffer level variable. */
+    /* Non-viewable frames are a special case and are treated as pure overhead. */
     if ( !cm->show_frame )
         cpi->bits_off_target -= cpi->projected_frame_size;
     else
         cpi->bits_off_target += cpi->av_per_frame_bandwidth - cpi->projected_frame_size;
 
-    // Clip the buffer level to the maximum specified buffer size
+    /* Clip the buffer level to the maximum specified buffer size */
     if (cpi->bits_off_target > cpi->oxcf.maximum_buffer_size)
         cpi->bits_off_target = cpi->oxcf.maximum_buffer_size;
 
-    // Rolling monitors of whether we are over or underspending used to help regulate min and Max Q in two pass.
+    /* Rolling monitors of whether we are over or underspending used to
+     * help regulate min and Max Q in two pass.
+     */
     cpi->rolling_target_bits = ((cpi->rolling_target_bits * 3) + cpi->this_frame_target + 2) / 4;
     cpi->rolling_actual_bits = ((cpi->rolling_actual_bits * 3) + cpi->projected_frame_size + 2) / 4;
     cpi->long_rolling_target_bits = ((cpi->long_rolling_target_bits * 31) + cpi->this_frame_target + 16) / 32;
     cpi->long_rolling_actual_bits = ((cpi->long_rolling_actual_bits * 31) + cpi->projected_frame_size + 16) / 32;
 
-    // Actual bits spent
+    /* Actual bits spent */
     cpi->total_actual_bits += cpi->projected_frame_size;
 
-    // Debug stats
+    /* Debug stats */
     cpi->total_target_vs_actual += (cpi->this_frame_target - cpi->projected_frame_size);
 
     cpi->buffer_level = cpi->bits_off_target;
 
-    // Propagate values to higher temporal layers
+    /* Propagate values to higher temporal layers */
     if (cpi->oxcf.number_of_layers > 1)
     {
         unsigned int i;
@@ -4370,12 +4505,13 @@
         for (i=cpi->current_layer+1; i<cpi->oxcf.number_of_layers; i++)
         {
             LAYER_CONTEXT *lc = &cpi->layer_context[i];
-            int bits_off_for_this_layer = lc->target_bandwidth / lc->frame_rate
-                                                - cpi->projected_frame_size;
+            int bits_off_for_this_layer =
+               (int)(lc->target_bandwidth / lc->frame_rate -
+                     cpi->projected_frame_size);
 
             lc->bits_off_target += bits_off_for_this_layer;
 
-            // Clip buffer level to maximum buffer size for the layer
+            /* Clip buffer level to maximum buffer size for the layer */
             if (lc->bits_off_target > lc->maximum_buffer_size)
                 lc->bits_off_target = lc->maximum_buffer_size;
 
@@ -4385,7 +4521,9 @@
         }
     }
 
-    // Update bits left to the kf and gf groups to account for overshoot or undershoot on these frames
+    /* Update bits left to the kf and gf groups to account for overshoot
+     * or undershoot on these frames
+     */
     if (cm->frame_type == KEY_FRAME)
     {
         cpi->twopass.kf_group_bits += cpi->this_frame_target - cpi->projected_frame_size;
@@ -4418,7 +4556,7 @@
             cpi->last_skip_false_probs[0] = cpi->prob_skip_false;
             cpi->last_skip_probs_q[0] = cm->base_qindex;
 
-            //update the baseline
+            /* update the baseline */
             cpi->base_skip_false_prob[cm->base_qindex] = cpi->prob_skip_false;
 
         }
@@ -4428,7 +4566,7 @@
     {
         FILE *f = fopen("tmp.stt", "a");
 
-        vp8_clear_system_state();  //__asm emms;
+        vp8_clear_system_state();
 
         if (cpi->twopass.total_left_stats.coded_error != 0.0)
             fprintf(f, "%10d %10d %10d %10d %10d %10d %10d %10d %10d %6d %6d"
@@ -4444,7 +4582,6 @@
                        cpi->active_best_quality, cpi->active_worst_quality,
                        cpi->ni_av_qi, cpi->cq_target_quality,
                        cpi->zbin_over_quant,
-                       //cpi->avg_frame_qindex, cpi->zbin_over_quant,
                        cm->refresh_golden_frame, cm->refresh_alt_ref_frame,
                        cm->frame_type, cpi->gfu_boost,
                        cpi->twopass.est_max_qcorrection_factor,
@@ -4467,7 +4604,6 @@
                        cpi->active_best_quality, cpi->active_worst_quality,
                        cpi->ni_av_qi, cpi->cq_target_quality,
                        cpi->zbin_over_quant,
-                       //cpi->avg_frame_qindex, cpi->zbin_over_quant,
                        cm->refresh_golden_frame, cm->refresh_alt_ref_frame,
                        cm->frame_type, cpi->gfu_boost,
                        cpi->twopass.est_max_qcorrection_factor,
@@ -4497,10 +4633,6 @@
 
 #endif
 
-    // If this was a kf or Gf note the Q
-    if ((cm->frame_type == KEY_FRAME) || cm->refresh_golden_frame || cm->refresh_alt_ref_frame)
-        cm->last_kf_gf_q = cm->base_qindex;
-
     if (cm->refresh_golden_frame == 1)
         cm->frame_flags = cm->frame_flags | FRAMEFLAGS_GOLDEN;
     else
@@ -4512,49 +4644,55 @@
         cm->frame_flags = cm->frame_flags&~FRAMEFLAGS_ALTREF;
 
 
-    if (cm->refresh_last_frame & cm->refresh_golden_frame) // both refreshed
+    if (cm->refresh_last_frame & cm->refresh_golden_frame)
+        /* both refreshed */
         cpi->gold_is_last = 1;
-    else if (cm->refresh_last_frame ^ cm->refresh_golden_frame) // 1 refreshed but not the other
+    else if (cm->refresh_last_frame ^ cm->refresh_golden_frame)
+        /* 1 refreshed but not the other */
         cpi->gold_is_last = 0;
 
-    if (cm->refresh_last_frame & cm->refresh_alt_ref_frame) // both refreshed
+    if (cm->refresh_last_frame & cm->refresh_alt_ref_frame)
+        /* both refreshed */
         cpi->alt_is_last = 1;
-    else if (cm->refresh_last_frame ^ cm->refresh_alt_ref_frame) // 1 refreshed but not the other
+    else if (cm->refresh_last_frame ^ cm->refresh_alt_ref_frame)
+        /* 1 refreshed but not the other */
         cpi->alt_is_last = 0;
 
-    if (cm->refresh_alt_ref_frame & cm->refresh_golden_frame) // both refreshed
+    if (cm->refresh_alt_ref_frame & cm->refresh_golden_frame)
+        /* both refreshed */
         cpi->gold_is_alt = 1;
-    else if (cm->refresh_alt_ref_frame ^ cm->refresh_golden_frame) // 1 refreshed but not the other
+    else if (cm->refresh_alt_ref_frame ^ cm->refresh_golden_frame)
+        /* 1 refreshed but not the other */
         cpi->gold_is_alt = 0;
 
-    cpi->ref_frame_flags = VP8_ALT_FLAG | VP8_GOLD_FLAG | VP8_LAST_FLAG;
+    cpi->ref_frame_flags = VP8_ALTR_FRAME | VP8_GOLD_FRAME | VP8_LAST_FRAME;
 
     if (cpi->gold_is_last)
-        cpi->ref_frame_flags &= ~VP8_GOLD_FLAG;
+        cpi->ref_frame_flags &= ~VP8_GOLD_FRAME;
 
     if (cpi->alt_is_last)
-        cpi->ref_frame_flags &= ~VP8_ALT_FLAG;
+        cpi->ref_frame_flags &= ~VP8_ALTR_FRAME;
 
     if (cpi->gold_is_alt)
-        cpi->ref_frame_flags &= ~VP8_ALT_FLAG;
+        cpi->ref_frame_flags &= ~VP8_ALTR_FRAME;
 
 
     if (!cpi->oxcf.error_resilient_mode)
     {
         if (cpi->oxcf.play_alternate && cm->refresh_alt_ref_frame && (cm->frame_type != KEY_FRAME))
-            // Update the alternate reference frame stats as appropriate.
+            /* Update the alternate reference frame stats as appropriate. */
             update_alt_ref_frame_stats(cpi);
         else
-            // Update the Golden frame stats as appropriate.
+            /* Update the Golden frame stats as appropriate. */
             update_golden_frame_stats(cpi);
     }
 
     if (cm->frame_type == KEY_FRAME)
     {
-        // Tell the caller that the frame was coded as a key frame
+        /* Tell the caller that the frame was coded as a key frame */
         *frame_flags = cm->frame_flags | FRAMEFLAGS_KEY;
 
-        // As this frame is a key frame  the next defaults to an inter frame.
+        /* As this frame is a key frame the next defaults to an inter frame. */
         cm->frame_type = INTER_FRAME;
 
         cpi->last_frame_percent_intra = 100;
@@ -4566,20 +4704,24 @@
         cpi->last_frame_percent_intra = cpi->this_frame_percent_intra;
     }
 
-    // Clear the one shot update flags for segmentation map and mode/ref loop filter deltas.
+    /* Clear the one shot update flags for segmentation map and mode/ref
+     * loop filter deltas.
+     */
     cpi->mb.e_mbd.update_mb_segmentation_map = 0;
     cpi->mb.e_mbd.update_mb_segmentation_data = 0;
     cpi->mb.e_mbd.mode_ref_lf_delta_update = 0;
 
 
-    // Dont increment frame counters if this was an altref buffer update not a real frame
+    /* Don't increment frame counters if this was an altref buffer update
+     * not a real frame
+     */
     if (cm->show_frame)
     {
         cm->current_video_frame++;
         cpi->frames_since_key++;
     }
 
-    // reset to normal state now that we are done.
+    /* reset to normal state now that we are done. */
 
 
 
@@ -4595,8 +4737,8 @@
     }
 #endif
 
-    // DEBUG
-    //vp8_write_yuv_frame("encoder_recon.yuv", cm->frame_to_show);
+    /* DEBUG */
+    /* vp8_write_yuv_frame("encoder_recon.yuv", cm->frame_to_show); */
 
 
 }
@@ -4609,33 +4751,38 @@
     int gf_ref_usage_pct = (cpi->count_mb_ref_frame_usage[GOLDEN_FRAME] * 100) / (cm->mb_rows * cm->mb_cols);
     int last_ref_zz_useage = (cpi->inter_zz_count * 100) / (cm->mb_rows * cm->mb_cols);
 
-    // Gf refresh is not currently being signalled
+    /* Gf refresh is not currently being signalled */
     if (cpi->gf_update_recommended == 0)
     {
         if (cpi->common.frames_since_golden > 7)
         {
-            // Low use of gf
+            /* Low use of gf */
             if ((gf_active_pct < 10) || ((gf_active_pct + gf_ref_usage_pct) < 15))
             {
-                // ...but last frame zero zero usage is reasonbable so a new gf might be appropriate
+                /* ...but last frame zero zero usage is reasonable so a
+                 * new gf might be appropriate
+                 */
                 if (last_ref_zz_useage >= 25)
                 {
                     cpi->gf_bad_count ++;
 
-                    if (cpi->gf_bad_count >= 8)   // Check that the condition is stable
+                    /* Check that the condition is stable */
+                    if (cpi->gf_bad_count >= 8)
                     {
                         cpi->gf_update_recommended = 1;
                         cpi->gf_bad_count = 0;
                     }
                 }
                 else
-                    cpi->gf_bad_count = 0;        // Restart count as the background is not stable enough
+                    /* Restart count as the background is not stable enough */
+                    cpi->gf_bad_count = 0;
             }
             else
-                cpi->gf_bad_count = 0;            // Gf useage has picked up so reset count
+                /* Gf useage has picked up so reset count */
+                cpi->gf_bad_count = 0;
         }
     }
-    // If the signal is set but has not been read should we cancel it.
+    /* If the signal is set but has not been read should we cancel it. */
     else if (last_ref_zz_useage < 15)
     {
         cpi->gf_update_recommended = 0;
@@ -4675,7 +4822,7 @@
 }
 #endif
 
-//For ARM NEON, d8-d15 are callee-saved registers, and need to be saved by us.
+/* For ARM NEON, d8-d15 are callee-saved registers, and need to be saved. */
 #if HAVE_NEON
 extern void vp8_push_neon(int64_t *store);
 extern void vp8_pop_neon(int64_t *store);
@@ -4782,7 +4929,7 @@
     cpi->source = NULL;
 
 #if !(CONFIG_REALTIME_ONLY)
-    // Should we code an alternate reference frame
+    /* Should we code an alternate reference frame */
     if (cpi->oxcf.error_resilient_mode == 0 &&
         cpi->oxcf.play_alternate &&
         cpi->source_alt_ref_pending)
@@ -4803,7 +4950,8 @@
             cm->refresh_golden_frame = 0;
             cm->refresh_last_frame = 0;
             cm->show_frame = 0;
-            cpi->source_alt_ref_pending = 0;  // Clear Pending alt Ref flag.
+            /* Clear Pending alt Ref flag. */
+            cpi->source_alt_ref_pending = 0;
             cpi->is_src_frame_alt_ref = 0;
         }
     }
@@ -4875,7 +5023,7 @@
         cpi->last_end_time_stamp_seen = cpi->source->ts_start;
     }
 
-    // adjust frame rates based on timestamps given
+    /* adjust frame rates based on timestamps given */
     if (cm->show_frame)
     {
         int64_t this_duration;
@@ -4893,9 +5041,10 @@
             this_duration = cpi->source->ts_end - cpi->last_end_time_stamp_seen;
             last_duration = cpi->last_end_time_stamp_seen
                             - cpi->last_time_stamp_seen;
-            // do a step update if the duration changes by 10%
+            /* do a step update if the duration changes by 10% */
             if (last_duration)
-                step = ((this_duration - last_duration) * 10 / last_duration);
+                step = (int)(((this_duration - last_duration) *
+                            10 / last_duration));
         }
 
         if (this_duration)
@@ -4910,7 +5059,8 @@
                  * frame rate. If we haven't seen 1 second yet, then average
                  * over the whole interval seen.
                  */
-                interval = cpi->source->ts_end - cpi->first_time_stamp_ever;
+                interval = (double)(cpi->source->ts_end -
+                                    cpi->first_time_stamp_ever);
                 if(interval > 10000000.0)
                     interval = 10000000;
 
@@ -4925,7 +5075,7 @@
             {
                 unsigned int i;
 
-                // Update frame rates for each layer
+                /* Update frame rates for each layer */
                 for (i=0; i<cpi->oxcf.number_of_layers; i++)
                 {
                     LAYER_CONTEXT *lc = &cpi->layer_context[i];
@@ -4947,7 +5097,7 @@
 
         update_layer_contexts (cpi);
 
-        // Restore layer specific context & set frame rate
+        /* Restore layer specific context & set frame rate */
         layer = cpi->oxcf.layer_id[
                             cm->current_video_frame % cpi->oxcf.periodicity];
         restore_layer_context (cpi, layer);
@@ -4985,11 +5135,11 @@
     }
 #endif
 
-    // start with a 0 size frame
+    /* start with a 0 size frame */
     *size = 0;
 
-    // Clear down mmx registers
-    vp8_clear_system_state();  //__asm emms;
+    /* Clear down mmx registers */
+    vp8_clear_system_state();
 
     cm->frame_type = INTER_FRAME;
     cm->frame_flags = *frame_flags;
@@ -4998,7 +5148,6 @@
 
     if (cm->refresh_alt_ref_frame)
     {
-        //cm->refresh_golden_frame = 1;
         cm->refresh_golden_frame = 0;
         cm->refresh_last_frame = 0;
     }
@@ -5043,7 +5192,7 @@
         vpx_usec_timer_mark(&tsctimer);
         vpx_usec_timer_mark(&ticktimer);
 
-        duration = vpx_usec_timer_elapsed(&ticktimer);
+        duration = (int)(vpx_usec_timer_elapsed(&ticktimer));
         duration2 = (unsigned int)((double)duration / 2);
 
         if (cm->frame_type != KEY_FRAME)
@@ -5056,7 +5205,6 @@
 
         if (duration2)
         {
-            //if(*frame_flags!=1)
             {
 
                 if (cpi->avg_pick_mode_time == 0)
@@ -5073,8 +5221,8 @@
         vpx_memcpy(&cm->fc, &cm->lfc, sizeof(cm->fc));
     }
 
-    // Save the contexts separately for alt ref, gold and last.
-    // (TODO jbb -> Optimize this with pointers to avoid extra copies. )
+    /* Save the contexts separately for alt ref, gold and last. */
+    /* (TODO jbb -> Optimize this with pointers to avoid extra copies. ) */
     if(cm->refresh_alt_ref_frame)
         vpx_memcpy(&cpi->lfc_a, &cm->fc, sizeof(cm->fc));
 
@@ -5084,12 +5232,12 @@
     if(cm->refresh_last_frame)
         vpx_memcpy(&cpi->lfc_n, &cm->fc, sizeof(cm->fc));
 
-    // if its a dropped frame honor the requests on subsequent frames
+    /* if its a dropped frame honor the requests on subsequent frames */
     if (*size > 0)
     {
         cpi->droppable = !frame_is_reference(cpi);
 
-        // return to normal state
+        /* return to normal state */
         cm->refresh_entropy_probs = 1;
         cm->refresh_alt_ref_frame = 0;
         cm->refresh_golden_frame = 0;
@@ -5098,7 +5246,7 @@
 
     }
 
-    // Save layer specific state
+    /* Save layer specific state */
     if (cpi->oxcf.number_of_layers > 1)
         save_layer_context (cpi);
 
@@ -5123,14 +5271,14 @@
 
             if (cpi->b_calculate_psnr)
             {
-                double ye,ue,ve;
+                uint64_t ye,ue,ve;
                 double frame_psnr;
                 YV12_BUFFER_CONFIG      *orig = cpi->Source;
                 YV12_BUFFER_CONFIG      *recon = cpi->common.frame_to_show;
                 int y_samples = orig->y_height * orig->y_width ;
                 int uv_samples = orig->uv_height * orig->uv_width ;
                 int t_samples = y_samples + 2 * uv_samples;
-                int64_t sq_error, sq_error2;
+                double sq_error, sq_error2;
 
                 ye = calc_plane_error(orig->y_buffer, orig->y_stride,
                   recon->y_buffer, recon->y_stride, orig->y_width, orig->y_height);
@@ -5141,13 +5289,13 @@
                 ve = calc_plane_error(orig->v_buffer, orig->uv_stride,
                   recon->v_buffer, recon->uv_stride, orig->uv_width, orig->uv_height);
 
-                sq_error = ye + ue + ve;
+                sq_error = (double)(ye + ue + ve);
 
                 frame_psnr = vp8_mse2psnr(t_samples, 255.0, sq_error);
 
-                cpi->total_y += vp8_mse2psnr(y_samples, 255.0, ye);
-                cpi->total_u += vp8_mse2psnr(uv_samples, 255.0, ue);
-                cpi->total_v += vp8_mse2psnr(uv_samples, 255.0, ve);
+                cpi->total_y += vp8_mse2psnr(y_samples, 255.0, (double)ye);
+                cpi->total_u += vp8_mse2psnr(uv_samples, 255.0, (double)ue);
+                cpi->total_v += vp8_mse2psnr(uv_samples, 255.0, (double)ve);
                 cpi->total_sq_error += sq_error;
                 cpi->total  += frame_psnr;
 #if CONFIG_POSTPROC
@@ -5168,13 +5316,16 @@
                     ve = calc_plane_error(orig->v_buffer, orig->uv_stride,
                       pp->v_buffer, pp->uv_stride, orig->uv_width, orig->uv_height);
 
-                    sq_error2 = ye + ue + ve;
+                    sq_error2 = (double)(ye + ue + ve);
 
                     frame_psnr2 = vp8_mse2psnr(t_samples, 255.0, sq_error2);
 
-                    cpi->totalp_y += vp8_mse2psnr(y_samples, 255.0, ye);
-                    cpi->totalp_u += vp8_mse2psnr(uv_samples, 255.0, ue);
-                    cpi->totalp_v += vp8_mse2psnr(uv_samples, 255.0, ve);
+                    cpi->totalp_y += vp8_mse2psnr(y_samples,
+                                                  255.0, (double)ye);
+                    cpi->totalp_u += vp8_mse2psnr(uv_samples,
+                                                  255.0, (double)ue);
+                    cpi->totalp_v += vp8_mse2psnr(uv_samples,
+                                                  255.0, (double)ve);
                     cpi->total_sq_error2 += sq_error2;
                     cpi->totalp  += frame_psnr2;
 
@@ -5186,7 +5337,7 @@
 
                     if (cpi->oxcf.number_of_layers > 1)
                     {
-                         int i;
+                         unsigned int i;
 
                          for (i=cpi->current_layer;
                                        i<cpi->oxcf.number_of_layers; i++)
@@ -5214,7 +5365,7 @@
 
                 if (cpi->oxcf.number_of_layers > 1)
                 {
-                    int i;
+                    unsigned int i;
 
                     for (i=cpi->current_layer;
                          i<cpi->oxcf.number_of_layers; i++)
@@ -5312,7 +5463,7 @@
             ret = -1;
         }
 
-#endif //!CONFIG_POSTPROC
+#endif
         vp8_clear_system_state();
         return ret;
     }
@@ -5321,29 +5472,53 @@
 int vp8_set_roimap(VP8_COMP *cpi, unsigned char *map, unsigned int rows, unsigned int cols, int delta_q[4], int delta_lf[4], unsigned int threshold[4])
 {
     signed char feature_data[MB_LVL_MAX][MAX_MB_SEGMENTS];
+    int internal_delta_q[MAX_MB_SEGMENTS];
+    const int range = 63;
+    int i;
 
+    /* This method is currently incompatible with the cyclic refresh method */
+    if ( cpi->cyclic_refresh_mode_enabled )
+        return -1;
+
+    /* Check number of rows and columns match */
     if (cpi->common.mb_rows != rows || cpi->common.mb_cols != cols)
         return -1;
 
+    /* Range check the delta Q values and convert the external Q range
+     * values to internal ones. */
+    if ( (abs(delta_q[0]) > range) || (abs(delta_q[1]) > range) ||
+         (abs(delta_q[2]) > range) || (abs(delta_q[3]) > range) )
+        return -1;
+
+    /* Range check the delta lf values */
+    if ( (abs(delta_lf[0]) > range) || (abs(delta_lf[1]) > range) ||
+         (abs(delta_lf[2]) > range) || (abs(delta_lf[3]) > range) )
+        return -1;
+
     if (!map)
     {
         disable_segmentation(cpi);
         return 0;
     }
 
-    // Set the segmentation Map
+    /* Translate the external delta q values to internal values. */
+    for ( i = 0; i < MAX_MB_SEGMENTS; i++ )
+        internal_delta_q[i] =
+            ( delta_q[i] >= 0 ) ? q_trans[delta_q[i]] : -q_trans[-delta_q[i]];
+
+    /* Set the segmentation Map */
     set_segmentation_map(cpi, map);
 
-    // Activate segmentation.
+    /* Activate segmentation. */
     enable_segmentation(cpi);
 
-    // Set up the quant segment data
-    feature_data[MB_LVL_ALT_Q][0] = delta_q[0];
-    feature_data[MB_LVL_ALT_Q][1] = delta_q[1];
-    feature_data[MB_LVL_ALT_Q][2] = delta_q[2];
-    feature_data[MB_LVL_ALT_Q][3] = delta_q[3];
+    /* Set up the quant segment data */
+    feature_data[MB_LVL_ALT_Q][0] = internal_delta_q[0];
+    feature_data[MB_LVL_ALT_Q][1] = internal_delta_q[1];
+    feature_data[MB_LVL_ALT_Q][2] = internal_delta_q[2];
+    feature_data[MB_LVL_ALT_Q][3] = internal_delta_q[3];
 
-    // Set up the loop segment data s
+    /* Set up the loop segment data */
     feature_data[MB_LVL_ALT_LF][0] = delta_lf[0];
     feature_data[MB_LVL_ALT_LF][1] = delta_lf[1];
     feature_data[MB_LVL_ALT_LF][2] = delta_lf[2];
@@ -5354,8 +5529,7 @@
     cpi->segment_encode_breakout[2] = threshold[2];
     cpi->segment_encode_breakout[3] = threshold[3];
 
-    // Initialise the feature data structure
-    // SEGMENT_DELTADATA    0, SEGMENT_ABSDATA      1
+    /* Initialise the feature data structure */
     set_segment_data(cpi, &feature_data[0][0], SEGMENT_DELTADATA);
 
     return 0;
@@ -5377,7 +5551,6 @@
     }
     else
     {
-        //cpi->active_map_enabled = 0;
         return -1 ;
     }
 }
@@ -5407,7 +5580,9 @@
     unsigned char *src = source->y_buffer;
     unsigned char *dst = dest->y_buffer;
 
-    // Loop through the Y plane raw and reconstruction data summing (square differences)
+    /* Loop through the Y plane raw and reconstruction data summing
+     * (square differences)
+     */
     for (i = 0; i < source->y_height; i += 16)
     {
         for (j = 0; j < source->y_width; j += 16)
diff --git a/vp8/encoder/onyx_int.h b/vp8/encoder/onyx_int.h
index cf70231..caccc60 100644
--- a/vp8/encoder/onyx_int.h
+++ b/vp8/encoder/onyx_int.h
@@ -25,6 +25,7 @@
 #include "vp8/common/threading.h"
 #include "vpx_ports/mem.h"
 #include "vpx/internal/vpx_codec_internal.h"
+#include "vpx/vp8.h"
 #include "mcomp.h"
 #include "vp8/common/findnearmv.h"
 #include "lookahead.h"
@@ -32,7 +33,6 @@
 #include "vp8/encoder/denoising.h"
 #endif
 
-//#define SPEEDSTATS 1
 #define MIN_GF_INTERVAL             4
 #define DEFAULT_GF_INTERVAL         7
 
@@ -73,7 +73,6 @@
     int mvcosts[2][MVvals+1];
 
 #ifdef MODE_STATS
-    // Stats
     int y_modes[5];
     int uv_modes[4];
     int b_modes[10];
@@ -232,22 +231,22 @@
 
 typedef struct
 {
-    // Layer configuration
+    /* Layer configuration */
     double frame_rate;
     int target_bandwidth;
 
-    // Layer specific coding parameters
-    int starting_buffer_level;
-    int optimal_buffer_level;
-    int maximum_buffer_size;
-    int starting_buffer_level_in_ms;
-    int optimal_buffer_level_in_ms;
-    int maximum_buffer_size_in_ms;
+    /* Layer specific coding parameters */
+    int64_t starting_buffer_level;
+    int64_t optimal_buffer_level;
+    int64_t maximum_buffer_size;
+    int64_t starting_buffer_level_in_ms;
+    int64_t optimal_buffer_level_in_ms;
+    int64_t maximum_buffer_size_in_ms;
 
     int avg_frame_size_for_layer;
 
-    int buffer_level;
-    int bits_off_target;
+    int64_t buffer_level;
+    int64_t bits_off_target;
 
     int64_t total_actual_bits;
     int total_target_vs_actual;
@@ -307,7 +306,7 @@
 
     MACROBLOCK mb;
     VP8_COMMON common;
-    vp8_writer bc[9]; // one boolcoder for each partition
+    vp8_writer bc[9]; /* one boolcoder for each partition */
 
     VP8_CONFIG oxcf;
 
@@ -321,16 +320,20 @@
     YV12_BUFFER_CONFIG scaled_source;
     YV12_BUFFER_CONFIG *last_frame_unscaled_source;
 
-    int source_alt_ref_pending; // frame in src_buffers has been identified to be encoded as an alt ref
-    int source_alt_ref_active;  // an alt ref frame has been encoded and is usable
+    /* frame in src_buffers has been identified to be encoded as an alt ref */
+    int source_alt_ref_pending;
+    /* an alt ref frame has been encoded and is usable */
+    int source_alt_ref_active;
+    /* source of frame to encode is an exact copy of an alt ref frame */
+    int is_src_frame_alt_ref;
 
-    int is_src_frame_alt_ref;   // source of frame to encode is an exact copy of an alt ref frame
+    /* golden frame same as last frame ( short circuit gold searches) */
+    int gold_is_last;
+    /* Alt reference frame same as last ( short circuit altref search) */
+    int alt_is_last;
+    /* don't do both alt and gold search ( just do gold). */
+    int gold_is_alt;
 
-    int gold_is_last; // golden frame same as last frame ( short circuit gold searches)
-    int alt_is_last;  // Alt reference frame same as last ( short circuit altref search)
-    int gold_is_alt;  // don't do both alt and gold search ( just do gold).
-
-    //int refresh_alt_ref_frame;
     YV12_BUFFER_CONFIG pick_lf_lvl_frame;
 
     TOKENEXTRA *tok;
@@ -342,7 +345,7 @@
     unsigned int this_key_frame_forced;
     unsigned int next_key_frame_forced;
 
-    // Ambient reconstruction err target for force key frames
+    /* Ambient reconstruction err target for force key frames */
     int ambient_err;
 
     unsigned int mode_check_freq[MAX_MODES];
@@ -359,7 +362,7 @@
 
     CODING_CONTEXT coding_context;
 
-    // Rate targetting variables
+    /* Rate targeting variables */
     int64_t prediction_error;
     int64_t last_prediction_error;
     int64_t intra_error;
@@ -367,30 +370,43 @@
 
     int this_frame_target;
     int projected_frame_size;
-    int last_q[2];                   // Separate values for Intra/Inter
+    int last_q[2];                   /* Separate values for Intra/Inter */
 
     double rate_correction_factor;
     double key_frame_rate_correction_factor;
     double gf_rate_correction_factor;
 
-    int frames_till_gf_update_due;      // Count down till next GF
-    int current_gf_interval;          // GF interval chosen when we coded the last GF
+    /* Count down till next GF */
+    int frames_till_gf_update_due;
 
-    int gf_overspend_bits;            // Total bits overspent becasue of GF boost (cumulative)
+    /* GF interval chosen when we coded the last GF */
+    int current_gf_interval;
 
-    int non_gf_bitrate_adjustment;     // Used in the few frames following a GF to recover the extra bits spent in that GF
+    /* Total bits overspent because of GF boost (cumulative) */
+    int gf_overspend_bits;
 
-    int kf_overspend_bits;            // Extra bits spent on key frames that need to be recovered on inter frames
-    int kf_bitrate_adjustment;        // Current number of bit s to try and recover on each inter frame.
+    /* Used in the few frames following a GF to recover the extra bits
+     * spent in that GF
+     */
+    int non_gf_bitrate_adjustment;
+
+    /* Extra bits spent on key frames that need to be recovered */
+    int kf_overspend_bits;
+
+    /* Current number of bits to try and recover on each inter frame. */
+    int kf_bitrate_adjustment;
     int max_gf_interval;
     int baseline_gf_interval;
-    int active_arnr_frames;           // <= cpi->oxcf.arnr_max_frames
+    int active_arnr_frames;
 
     int64_t key_frame_count;
     int prior_key_frame_distance[KEY_FRAME_CONTEXT];
-    int per_frame_bandwidth;          // Current section per frame bandwidth target
-    int av_per_frame_bandwidth;        // Average frame size target for clip
-    int min_frame_bandwidth;          // Minimum allocation that should be used for any frame
+    /* Current section per frame bandwidth target */
+    int per_frame_bandwidth;
+    /* Average frame size target for clip */
+    int av_per_frame_bandwidth;
+    /* Minimum allocation that should be used for any frame */
+    int min_frame_bandwidth;
     int inter_frame_target;
     double output_frame_rate;
     int64_t last_time_stamp_seen;
@@ -415,7 +431,7 @@
     double frame_rate;
     double ref_frame_rate;
     int64_t buffer_level;
-    int bits_off_target;
+    int64_t bits_off_target;
 
     int rolling_target_bits;
     int rolling_actual_bits;
@@ -424,7 +440,7 @@
     int long_rolling_actual_bits;
 
     int64_t total_actual_bits;
-    int total_target_vs_actual;        // debug stats
+    int total_target_vs_actual; /* debug stats */
 
     int worst_quality;
     int active_worst_quality;
@@ -433,22 +449,16 @@
 
     int cq_target_quality;
 
-    int drop_frames_allowed;          // Are we permitted to drop frames?
-    int drop_frame;                  // Drop this frame?
-    int drop_count;                  // How many frames have we dropped?
-    int max_drop_count;               // How many frames should we drop?
-    int max_consec_dropped_frames;     // Limit number of consecutive frames that can be dropped.
-
+    int drop_frames_allowed; /* Are we permitted to drop frames? */
+    int drop_frame;          /* Drop this frame? */
 
     int ymode_count [VP8_YMODES];        /* intra MB type cts this frame */
-    int uv_mode_count[VP8_UV_MODES];       /* intra MB type cts this frame */
+    int uv_mode_count[VP8_UV_MODES];     /* intra MB type cts this frame */
 
     unsigned int MVcount [2] [MVvals];  /* (row,col) MV cts this frame */
 
     unsigned int coef_counts [BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS];  /* for this frame */
 
-    //DECLARE_ALIGNED(16, int, coef_counts_backup [BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS]);   //not used any more
-    //save vp8_tree_probs_from_distribution result for each frame to avoid repeat calculation
     vp8_prob frame_coef_probs [BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES];
     char update_probs [BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES];
 
@@ -462,7 +472,7 @@
     struct vpx_codec_pkt_list  *output_pkt_list;
 
 #if 0
-    // Experimental code for lagged and one pass
+    /* Experimental code for lagged and one pass */
     ONEPASS_FRAMESTATS one_pass_frame_stats[MAX_LAG_BUFFERS];
     int one_pass_frame_index;
 #endif
@@ -470,17 +480,15 @@
     int decimation_factor;
     int decimation_count;
 
-    // for real time encoding
-    int avg_encode_time;              //microsecond
-    int avg_pick_mode_time;            //microsecond
+    /* for real time encoding */
+    int avg_encode_time;     /* microsecond */
+    int avg_pick_mode_time;  /* microsecond */
     int Speed;
-    unsigned int cpu_freq;           //Mhz
     int compressor_speed;
 
     int interquantizer;
     int auto_gold;
     int auto_adjust_gold_quantizer;
-    int goldfreq;
     int auto_worst_q;
     int cpu_used;
     int pass;
@@ -503,20 +511,25 @@
     SPEED_FEATURES sf;
     int error_bins[1024];
 
-    // Data used for real time conferencing mode to help determine if it would be good to update the gf
+    /* Data used for real time conferencing mode to help determine if it
+     * would be good to update the gf
+     */
     int inter_zz_count;
     int gf_bad_count;
     int gf_update_recommended;
     int skip_true_count;
 
     unsigned char *segmentation_map;
-    signed char segment_feature_data[MB_LVL_MAX][MAX_MB_SEGMENTS];            // Segment data (can be deltas or absolute values)
-    int  segment_encode_breakout[MAX_MB_SEGMENTS];                    // segment threashold for encode breakout
+    signed char segment_feature_data[MB_LVL_MAX][MAX_MB_SEGMENTS];
+    int  segment_encode_breakout[MAX_MB_SEGMENTS];
 
     unsigned char *active_map;
     unsigned int active_map_enabled;
-    // Video conferencing cyclic refresh mode flags etc
-    // This is a mode designed to clean up the background over time in live encoding scenarious. It uses segmentation
+
+    /* Video conferencing cyclic refresh mode flags. This is a mode
+     * designed to clean up the background over time in live encoding
+     * scenarios. It uses segmentation.
+     */
     int cyclic_refresh_mode_enabled;
     int cyclic_refresh_mode_max_mbs_perframe;
     int cyclic_refresh_mode_index;
@@ -524,7 +537,7 @@
     signed char *cyclic_refresh_map;
 
 #if CONFIG_MULTITHREAD
-    // multithread data
+    /* multithread data */
     int * mt_current_mb_col;
     int mt_sync_range;
     int b_multi_threaded;
@@ -538,7 +551,7 @@
     ENCODETHREAD_DATA *en_thread_data;
     LPFTHREAD_DATA lpf_thread_data;
 
-    //events
+    /* events */
     sem_t *h_event_start_encoding;
     sem_t h_event_end_encoding;
     sem_t h_event_start_lpf;
@@ -549,7 +562,6 @@
     unsigned int partition_sz[MAX_PARTITIONS];
     unsigned char *partition_d[MAX_PARTITIONS];
     unsigned char *partition_d_end[MAX_PARTITIONS];
-    // end of multithread data
 
 
     fractional_mv_step_fp *find_fractional_mv_step;
@@ -557,10 +569,10 @@
     vp8_refining_search_fn_t refining_search_sad;
     vp8_diamond_search_fn_t diamond_search_sad;
     vp8_variance_fn_ptr_t fn_ptr[BLOCK_MAX_SEGMENTS];
-    unsigned int time_receive_data;
-    unsigned int time_compress_data;
-    unsigned int time_pick_lpf;
-    unsigned int time_encode_mb_row;
+    uint64_t time_receive_data;
+    uint64_t time_compress_data;
+    uint64_t time_pick_lpf;
+    uint64_t time_encode_mb_row;
 
     int base_skip_false_prob[128];
 
@@ -594,16 +606,16 @@
         int gf_decay_rate;
         int static_scene_max_gf_interval;
         int kf_bits;
-        int gf_group_error_left;           // Remaining error from uncoded frames in a gf group. Two pass use only
-
-        // Projected total bits available for a key frame group of frames
+        /* Remaining error from uncoded frames in a gf group. */
+        int gf_group_error_left;
+        /* Projected total bits available for a key frame group of frames */
         int64_t kf_group_bits;
-
-        // Error score of frames still to be coded in kf group
+        /* Error score of frames still to be coded in kf group */
         int64_t kf_group_error_left;
-
-        int gf_group_bits;                // Projected Bits available for a group of frames including 1 GF or ARF
-        int gf_bits;                     // Bits for the golden frame or ARF - 2 pass only
+        /* Projected Bits available for a group including 1 GF or ARF */
+        int gf_group_bits;
+        /* Bits for the golden frame or ARF */
+        int gf_bits;
         int alt_extra_bits;
         double est_max_qcorrection_factor;
     } twopass;
@@ -641,24 +653,26 @@
 #endif
     int b_calculate_psnr;
 
-    // Per MB activity measurement
+    /* Per MB activity measurement */
     unsigned int activity_avg;
     unsigned int * mb_activity_map;
     int * mb_norm_activity_map;
 
-    // Record of which MBs still refer to last golden frame either
-    // directly or through 0,0
+    /* Record of which MBs still refer to last golden frame either
+     * directly or through 0,0
+     */
     unsigned char *gf_active_flags;
     int gf_active_count;
 
     int output_partition;
 
-    //Store last frame's MV info for next frame MV prediction
+    /* Store last frame's MV info for next frame MV prediction */
     int_mv *lfmv;
     int *lf_ref_frame_sign_bias;
     int *lf_ref_frame;
 
-    int force_next_frame_intra; /* force next frame to intra when kf_auto says so */
+    /* force next frame to intra when kf_auto says so */
+    int force_next_frame_intra;
 
     int droppable;
 
@@ -666,7 +680,7 @@
     VP8_DENOISER denoiser;
 #endif
 
-    // Coding layer state variables
+    /* Coding layer state variables */
     unsigned int current_layer;
     LAYER_CONTEXT layer_context[VPX_TS_MAX_LAYERS];
 
@@ -687,6 +701,10 @@
 #if CONFIG_MULTI_RES_ENCODING
     /* Number of MBs per row at lower-resolution level */
     int    mr_low_res_mb_cols;
+    /* Indicate if lower-res mv info is available */
+    unsigned char  mr_low_res_mv_avail;
+    /* The frame number of each reference frame */
+    unsigned int current_ref_frames[MAX_REF_FRAMES];
 #endif
 
     struct rd_costs_struct
diff --git a/vp8/encoder/pickinter.c b/vp8/encoder/pickinter.c
index 47d646f..b67f04b 100644
--- a/vp8/encoder/pickinter.c
+++ b/vp8/encoder/pickinter.c
@@ -141,20 +141,24 @@
     BLOCKD *b = &x->e_mbd.block[ib];
     BLOCK *be = &x->block[ib];
     int dst_stride = x->e_mbd.dst.y_stride;
-    unsigned char *base_dst = x->e_mbd.dst.y_buffer;
+    unsigned char *dst = x->e_mbd.dst.y_buffer + b->offset;
     B_PREDICTION_MODE mode;
-    int best_rd = INT_MAX;       // 1<<30
+    int best_rd = INT_MAX;
     int rate;
     int distortion;
 
-    for (mode = B_DC_PRED; mode <= B_HE_PRED /*B_HU_PRED*/; mode++)
+    unsigned char *Above = dst - dst_stride;
+    unsigned char *yleft = dst - 1;
+    unsigned char top_left = Above[-1];
+
+    for (mode = B_DC_PRED; mode <= B_HE_PRED; mode++)
     {
         int this_rd;
 
         rate = mode_costs[mode];
-        vp8_intra4x4_predict
-                     (base_dst + b->offset, dst_stride,
-                      mode, b->predictor, 16);
+
+        vp8_intra4x4_predict(Above, yleft, dst_stride, mode,
+                             b->predictor, 16, top_left);
         distortion = get_prediction_error(be, b);
         this_rd = RDCOST(x->rdmult, x->rddiv, rate, distortion);
 
@@ -167,7 +171,7 @@
         }
     }
 
-    b->bmi.as_mode = (B_PREDICTION_MODE)(*best_mode);
+    b->bmi.as_mode = *best_mode;
     vp8_encode_intra4x4block(x, ib);
     return best_rd;
 }
@@ -214,8 +218,9 @@
         distortion += d;
         mic->bmi[i].as_mode = best_mode;
 
-        // Break out case where we have already exceeded best so far value
-        // that was passed in
+        /* Break out case where we have already exceeded best so far value
+         * that was passed in
+         */
         if (distortion > *best_dist)
             break;
     }
@@ -408,7 +413,6 @@
     LOWER_RES_MB_INFO* store_mode_info
                           = ((LOWER_RES_FRAME_INFO*)cpi->oxcf.mr_low_res_mode_info)->mb_info;
     unsigned int parent_mb_index;
-    //unsigned int parent_mb_index = map_640x480_to_320x240[mb_row][mb_col];
 
     /* Consider different down_sampling_factor.  */
     {
@@ -440,7 +444,6 @@
         /* Consider different down_sampling_factor.
          * The result can be rounded to be more precise, but it takes more time.
          */
-        //int round = cpi->oxcf.mr_down_sampling_factor.den/2;
         (*parent_ref_mv).as_mv.row = store_mode_info[parent_mb_index].mv.as_mv.row
                                   *cpi->oxcf.mr_down_sampling_factor.num
                                   /cpi->oxcf.mr_down_sampling_factor.den;
@@ -455,9 +458,17 @@
 
 static void check_for_encode_breakout(unsigned int sse, MACROBLOCK* x)
 {
-    if (sse < x->encode_breakout)
+    MACROBLOCKD *xd = &x->e_mbd;
+
+    unsigned int threshold = (xd->block[0].dequant[1]
+        * xd->block[0].dequant[1] >>4);
+
+    if(threshold < x->encode_breakout)
+        threshold = x->encode_breakout;
+
+    if (sse < threshold )
     {
-        // Check u and v to make sure skip is ok
+        /* Check u and v to make sure skip is ok */
         unsigned int sse2 = 0;
 
         sse2 = VP8_UVSSE(x);
@@ -513,7 +524,7 @@
     MB_PREDICTION_MODE this_mode;
     int num00;
     int mdcounts[4];
-    int best_rd = INT_MAX; // 1 << 30;
+    int best_rd = INT_MAX;
     int best_intra_rd = INT_MAX;
     int mode_index;
     int rate;
@@ -530,7 +541,8 @@
 
     int near_sadidx[8] = {0, 1, 2, 3, 4, 5, 6, 7};
     int saddone=0;
-    int sr=0;    //search range got from mv_pred(). It uses step_param levels. (0-7)
+    /* search range got from mv_pred(). It uses step_param levels. (0-7) */
+    int sr=0;
 
     unsigned char *plane[4][3];
     int ref_frame_map[4];
@@ -542,7 +554,7 @@
     int_mv parent_ref_mv;
     MB_PREDICTION_MODE parent_mode = 0;
 
-    if (cpi->oxcf.mr_encoder_id)
+    if (cpi->oxcf.mr_encoder_id && cpi->mr_low_res_mv_avail)
         get_lower_res_motion_info(cpi, xd, &dissim, &parent_ref_frame,
                                   &parent_mode, &parent_ref_mv, mb_row, mb_col);
 #endif
@@ -574,15 +586,17 @@
 
     get_predictor_pointers(cpi, plane, recon_yoffset, recon_uvoffset);
 
-    cpi->mbs_tested_so_far++; // Count of the number of MBs tested so far this frame
+    /* Count of the number of MBs tested so far this frame */
+    cpi->mbs_tested_so_far++;
 
     *returnintra = INT_MAX;
     x->skip = 0;
 
     x->e_mbd.mode_info_context->mbmi.ref_frame = INTRA_FRAME;
 
-    // if we encode a new mv this is important
-    // find the best new motion vector
+    /* if we encode a new mv this is important
+     * find the best new motion vector
+     */
     for (mode_index = 0; mode_index < MAX_MODES; mode_index++)
     {
         int frame_cost;
@@ -598,11 +612,14 @@
         x->e_mbd.mode_info_context->mbmi.ref_frame = this_ref_frame;
 
 #if CONFIG_MULTI_RES_ENCODING
-        if (cpi->oxcf.mr_encoder_id)
+        if (cpi->oxcf.mr_encoder_id && cpi->mr_low_res_mv_avail)
         {
-            /* If parent MB is intra, child MB is intra. */
-            if (!parent_ref_frame && this_ref_frame)
-                continue;
+            /* TODO: If parent MB is intra, child MB is intra. This is removed
+             * now since it cause noticeable quality loss for some test clip.
+             * Will come back to evaluate more.
+             * if (!parent_ref_frame && this_ref_frame)
+             *     continue;
+             */
 
             /* If parent MB is inter, and it is unlikely there are multiple
              * objects in parent MB, we use parent ref frame as child MB's
@@ -613,7 +630,7 @@
         }
 #endif
 
-        // everything but intra
+        /* everything but intra */
         if (x->e_mbd.mode_info_context->mbmi.ref_frame)
         {
             x->e_mbd.pre.y_buffer = plane[this_ref_frame][0];
@@ -628,7 +645,7 @@
             }
 
 #if CONFIG_MULTI_RES_ENCODING
-            if (cpi->oxcf.mr_encoder_id)
+            if (cpi->oxcf.mr_encoder_id && cpi->mr_low_res_mv_avail)
             {
                 if (vp8_mode_order[mode_index] == NEARESTMV &&
                     mode_mv[NEARESTMV].as_int ==0)
@@ -638,7 +655,7 @@
                     continue;
 
                 if (vp8_mode_order[mode_index] == NEWMV && parent_mode == ZEROMV
-                    && best_ref_mv.as_int==0) //&& dissim==0
+                    && best_ref_mv.as_int==0)
                     continue;
                 else if(vp8_mode_order[mode_index] == NEWMV && dissim==0
                     && best_ref_mv.as_int==parent_ref_mv.as_int)
@@ -728,7 +745,7 @@
 
         case SPLITMV:
 
-            // Split MV modes currently not supported when RD is nopt enabled.
+            /* Split MV modes currently not supported when RD is not enabled. */
             break;
 
         case DC_PRED:
@@ -777,13 +794,22 @@
 
             int speed_adjust = (cpi->Speed > 5) ? ((cpi->Speed >= 8)? 3 : 2) : 1;
 
-            // Further step/diamond searches as necessary
+            /* Further step/diamond searches as necessary */
             step_param = cpi->sf.first_step + speed_adjust;
 
 #if CONFIG_MULTI_RES_ENCODING
-            if (cpi->oxcf.mr_encoder_id)
+            /* If lower-res drops this frame, then higher-res encoder does
+               motion search without any previous knowledge. Also, since
+               last frame motion info is not stored, then we can not
+               use improved_mv_pred. */
+            if (cpi->oxcf.mr_encoder_id && !cpi->mr_low_res_mv_avail)
+                cpi->sf.improved_mv_pred = 0;
+
+            if (cpi->oxcf.mr_encoder_id && cpi->mr_low_res_mv_avail)
             {
-                // Use parent MV as predictor. Adjust search range accordingly.
+                /* Use parent MV as predictor. Adjust search range
+                 * accordingly.
+                 */
                 mvp.as_int = parent_ref_mv.as_int;
                 mvp_full.as_mv.col = parent_ref_mv.as_mv.col>>3;
                 mvp_full.as_mv.row = parent_ref_mv.as_mv.row>>3;
@@ -808,7 +834,7 @@
                                 &near_sadidx[0]);
 
                     sr += speed_adjust;
-                    //adjust search range according to sr from mv prediction
+                    /* adjust search range according to sr from mv prediction */
                     if(sr > step_param)
                         step_param = sr;
 
@@ -823,7 +849,8 @@
             }
 
 #if CONFIG_MULTI_RES_ENCODING
-            if (cpi->oxcf.mr_encoder_id && dissim <= 2 &&
+            if (cpi->oxcf.mr_encoder_id && cpi->mr_low_res_mv_avail &&
+                dissim <= 2 &&
                 MAX(abs(best_ref_mv.as_mv.row - parent_ref_mv.as_mv.row),
                     abs(best_ref_mv.as_mv.col - parent_ref_mv.as_mv.col)) <= 4)
             {
@@ -860,7 +887,10 @@
                  * change the behavior in lowest-resolution encoder.
                  * Will improve it later.
                  */
-                if (!cpi->oxcf.mr_encoder_id)
+                 /* Set step_param to 0 to ensure large-range motion search
+                    when encoder drops this frame at lower-resolution.
+                  */
+                if (!cpi->oxcf.mr_encoder_id || !cpi->mr_low_res_mv_avail)
                     step_param = 0;
 #endif
                     bestsme = vp8_hex_search(x, b, d, &mvp_full, &d->bmi.mv,
@@ -877,10 +907,7 @@
                                           x->mvcost, &best_ref_mv);
                     mode_mv[NEWMV].as_int = d->bmi.mv.as_int;
 
-                    // Further step/diamond searches as necessary
-                    n = 0;
-                    //further_steps = (cpi->sf.max_step_search_steps - 1) - step_param;
-
+                    /* Further step/diamond searches as necessary */
                     n = num00;
                     num00 = 0;
 
@@ -927,7 +954,7 @@
 
             mode_mv[NEWMV].as_int = d->bmi.mv.as_int;
 
-            // mv cost;
+            /* mv cost; */
             rate2 += vp8_mv_bit_cost(&mode_mv[NEWMV], &best_ref_mv,
                                      cpi->mb.mvcost, 128);
         }
@@ -965,7 +992,7 @@
         if (cpi->oxcf.noise_sensitivity)
         {
 
-            // Store for later use by denoiser.
+            /* Store for later use by denoiser. */
             if (this_mode == ZEROMV && sse < zero_mv_sse )
             {
                 zero_mv_sse = sse;
@@ -973,7 +1000,7 @@
                         x->e_mbd.mode_info_context->mbmi.ref_frame;
             }
 
-            // Store the best NEWMV in x for later use in the denoiser.
+            /* Store the best NEWMV in x for later use in the denoiser. */
             if (x->e_mbd.mode_info_context->mbmi.mode == NEWMV &&
                     sse < best_sse)
             {
@@ -990,7 +1017,7 @@
 
         if (this_rd < best_rd || x->skip)
         {
-            // Note index of best mode
+            /* Note index of best mode */
             best_mode_index = mode_index;
 
             *returnrate = rate2;
@@ -1030,7 +1057,7 @@
             break;
     }
 
-    // Reduce the activation RD thresholds for the best choice mode
+    /* Reduce the activation RD thresholds for the best choice mode */
     if ((cpi->rd_baseline_thresh[best_mode_index] > 0) && (cpi->rd_baseline_thresh[best_mode_index] < (INT_MAX >> 2)))
     {
         int best_adjustment = (cpi->rd_thresh_mult[best_mode_index] >> 3);
@@ -1062,7 +1089,7 @@
     {
         if (x->best_sse_inter_mode == DC_PRED)
         {
-            // No best MV found.
+            /* No best MV found. */
             x->best_sse_inter_mode = best_mbmode.mode;
             x->best_sse_mv = best_mbmode.mv;
             x->need_to_clamp_best_mvs = best_mbmode.need_to_clamp_mvs;
@@ -1073,7 +1100,7 @@
                                 recon_yoffset, recon_uvoffset);
 
 
-        // Reevaluate ZEROMV after denoising.
+        /* Reevaluate ZEROMV after denoising. */
         if (best_mbmode.ref_frame == INTRA_FRAME &&
             x->best_zeromv_reference_frame != INTRA_FRAME)
         {
@@ -1083,7 +1110,7 @@
                     vp8_cost_mv_ref(ZEROMV, mdcounts);
             distortion2 = 0;
 
-            // set up the proper prediction buffers for the frame
+            /* set up the proper prediction buffers for the frame */
             x->e_mbd.mode_info_context->mbmi.ref_frame = this_ref_frame;
             x->e_mbd.pre.y_buffer = plane[this_ref_frame][0];
             x->e_mbd.pre.u_buffer = plane[this_ref_frame][1];
diff --git a/vp8/encoder/picklpf.c b/vp8/encoder/picklpf.c
index 21af45a..4121349 100644
--- a/vp8/encoder/picklpf.c
+++ b/vp8/encoder/picklpf.c
@@ -74,7 +74,9 @@
     src += srcoffset;
     dst += dstoffset;
 
-    // Loop through the Y plane raw and reconstruction data summing (square differences)
+    /* Loop through the Y plane raw and reconstruction data summing
+     * (square differences)
+     */
     for (i = 0; i < linestocopy; i += 16)
     {
         for (j = 0; j < source->y_width; j += 16)
@@ -92,7 +94,7 @@
     return Total;
 }
 
-// Enforce a minimum filter level based upon baseline Q
+/* Enforce a minimum filter level based upon baseline Q */
 static int get_min_filter_level(VP8_COMP *cpi, int base_qindex)
 {
     int min_filter_level;
@@ -113,14 +115,15 @@
     return min_filter_level;
 }
 
-// Enforce a maximum filter level based upon baseline Q
+/* Enforce a maximum filter level based upon baseline Q */
 static int get_max_filter_level(VP8_COMP *cpi, int base_qindex)
 {
-    // PGW August 2006: Highest filter values almost always a bad idea
+    /* PGW August 2006: Highest filter values almost always a bad idea */
 
-    // jbb chg: 20100118 - not so any more with this overquant stuff allow high values
-    // with lots of intra coming in.
-    int max_filter_level = MAX_LOOP_FILTER ;//* 3 / 4;
+    /* jbb chg: 20100118 - not so any more with this overquant stuff allow
+     * high values with lots of intra coming in.
+     */
+    int max_filter_level = MAX_LOOP_FILTER;
     (void)base_qindex;
 
     if (cpi->twopass.section_intra_rating > 8)
@@ -155,7 +158,9 @@
         cm->last_sharpness_level = cm->sharpness_level;
     }
 
-    // Start the search at the previous frame filter level unless it is now out of range.
+    /* Start the search at the previous frame filter level unless it is
+     * now out of range.
+     */
     if (cm->filter_level < min_filter_level)
         cm->filter_level = min_filter_level;
     else if (cm->filter_level > max_filter_level)
@@ -164,7 +169,7 @@
     filt_val = cm->filter_level;
     best_filt_val = filt_val;
 
-    // Get the err using the previous frame's filter value.
+    /* Get the err using the previous frame's filter value. */
 
     /* Copy the unfiltered / processed recon buffer to the new buffer */
     vp8_yv12_copy_partial_frame(saved_frame, cm->frame_to_show);
@@ -174,17 +179,17 @@
 
     filt_val -= 1 + (filt_val > 10);
 
-    // Search lower filter levels
+    /* Search lower filter levels */
     while (filt_val >= min_filter_level)
     {
-        // Apply the loop filter
+        /* Apply the loop filter */
         vp8_yv12_copy_partial_frame(saved_frame, cm->frame_to_show);
         vp8_loop_filter_partial_frame(cm, &cpi->mb.e_mbd, filt_val);
 
-        // Get the err for filtered frame
+        /* Get the err for filtered frame */
         filt_err = calc_partial_ssl_err(sd, cm->frame_to_show);
 
-        // Update the best case record or exit loop.
+        /* Update the best case record or exit loop. */
         if (filt_err < best_err)
         {
             best_err = filt_err;
@@ -193,32 +198,34 @@
         else
             break;
 
-        // Adjust filter level
+        /* Adjust filter level */
         filt_val -= 1 + (filt_val > 10);
     }
 
-    // Search up (note that we have already done filt_val = cm->filter_level)
+    /* Search up (note that we have already done filt_val = cm->filter_level) */
     filt_val = cm->filter_level + 1 + (filt_val > 10);
 
     if (best_filt_val == cm->filter_level)
     {
-        // Resist raising filter level for very small gains
+        /* Resist raising filter level for very small gains */
         best_err -= (best_err >> 10);
 
         while (filt_val < max_filter_level)
         {
-            // Apply the loop filter
+            /* Apply the loop filter */
             vp8_yv12_copy_partial_frame(saved_frame, cm->frame_to_show);
 
             vp8_loop_filter_partial_frame(cm, &cpi->mb.e_mbd, filt_val);
 
-            // Get the err for filtered frame
+            /* Get the err for filtered frame */
             filt_err = calc_partial_ssl_err(sd, cm->frame_to_show);
 
-            // Update the best case record or exit loop.
+            /* Update the best case record or exit loop. */
             if (filt_err < best_err)
             {
-                // Do not raise filter level if improvement is < 1 part in 4096
+                /* Do not raise filter level if improvement is < 1 part
+                 * in 4096
+                 */
                 best_err = filt_err - (filt_err >> 10);
 
                 best_filt_val = filt_val;
@@ -226,7 +233,7 @@
             else
                 break;
 
-            // Adjust filter level
+            /* Adjust filter level */
             filt_val += 1 + (filt_val > 10);
         }
     }
@@ -243,7 +250,7 @@
     cm->frame_to_show = saved_frame;
 }
 
-// Stub function for now Alt LF not used
+/* Stub function for now Alt LF not used */
 void vp8cx_set_alt_lf_level(VP8_COMP *cpi, int filt_val)
 {
     MACROBLOCKD *mbd = &cpi->mb.e_mbd;
@@ -266,12 +273,14 @@
 
     int filter_step;
     int filt_high = 0;
-    int filt_mid = cm->filter_level;      // Start search at previous frame filter level
+    /* Start search at previous frame filter level */
+    int filt_mid = cm->filter_level;
     int filt_low = 0;
     int filt_best;
     int filt_direction = 0;
 
-    int Bias = 0;                       // Bias against raising loop filter and in favor of lowering it
+    /* Bias against raising loop filter and in favor of lowering it */
+    int Bias = 0;
 
     int ss_err[MAX_LOOP_FILTER + 1];
 
@@ -287,7 +296,9 @@
     else
         cm->sharpness_level = cpi->oxcf.Sharpness;
 
-    // Start the search at the previous frame filter level unless it is now out of range.
+    /* Start the search at the previous frame filter level unless it is
+     * now out of range.
+     */
     filt_mid = cm->filter_level;
 
     if (filt_mid < min_filter_level)
@@ -295,10 +306,10 @@
     else if (filt_mid > max_filter_level)
         filt_mid = max_filter_level;
 
-    // Define the initial step size
+    /* Define the initial step size */
     filter_step = (filt_mid < 16) ? 4 : filt_mid / 4;
 
-    // Get baseline error score
+    /* Get baseline error score */
 
     /* Copy the unfiltered / processed recon buffer to the new buffer */
     vp8_yv12_copy_y(saved_frame, cm->frame_to_show);
@@ -314,9 +325,8 @@
 
     while (filter_step > 0)
     {
-        Bias = (best_err >> (15 - (filt_mid / 8))) * filter_step; //PGW change 12/12/06 for small images
+        Bias = (best_err >> (15 - (filt_mid / 8))) * filter_step;
 
-        // jbb chg: 20100118 - in sections with lots of new material coming in don't bias as much to a low filter value
         if (cpi->twopass.section_intra_rating < 20)
             Bias = Bias * cpi->twopass.section_intra_rating / 20;
 
@@ -327,7 +337,7 @@
         {
             if(ss_err[filt_low] == 0)
             {
-                // Get Low filter error score
+                /* Get Low filter error score */
                 vp8_yv12_copy_y(saved_frame, cm->frame_to_show);
                 vp8cx_set_alt_lf_level(cpi, filt_low);
                 vp8_loop_filter_frame_yonly(cm, &cpi->mb.e_mbd, filt_low);
@@ -338,10 +348,12 @@
             else
                 filt_err = ss_err[filt_low];
 
-            // If value is close to the best so far then bias towards a lower loop filter value.
+            /* If value is close to the best so far then bias towards a
+             * lower loop filter value.
+             */
             if ((filt_err - Bias) < best_err)
             {
-                // Was it actually better than the previous best?
+                /* Was it actually better than the previous best? */
                 if (filt_err < best_err)
                     best_err = filt_err;
 
@@ -349,7 +361,7 @@
             }
         }
 
-        // Now look at filt_high
+        /* Now look at filt_high */
         if ((filt_direction >= 0) && (filt_high != filt_mid))
         {
             if(ss_err[filt_high] == 0)
@@ -364,7 +376,7 @@
             else
                 filt_err = ss_err[filt_high];
 
-            // Was it better than the previous best?
+            /* Was it better than the previous best? */
             if (filt_err < (best_err - Bias))
             {
                 best_err = filt_err;
@@ -372,7 +384,9 @@
             }
         }
 
-        // Half the step distance if the best filter value was the same as last time
+        /* Halve the step distance if the best filter value was the same
+         * as last time
+         */
         if (filt_best == filt_mid)
         {
             filter_step = filter_step / 2;
diff --git a/vp8/encoder/psnr.c b/vp8/encoder/psnr.c
index 5119bb8..5bb49ad 100644
--- a/vp8/encoder/psnr.c
+++ b/vp8/encoder/psnr.c
@@ -22,7 +22,7 @@
     if ((double)Mse > 0.0)
         psnr = 10.0 * log10(Peak * Peak * Samples / Mse);
     else
-        psnr = MAX_PSNR;      // Limit to prevent / 0
+        psnr = MAX_PSNR;      /* Limit to prevent / 0 */
 
     if (psnr > MAX_PSNR)
         psnr = MAX_PSNR;
diff --git a/vp8/encoder/quantize.c b/vp8/encoder/quantize.c
index 766d2b2..88fea11 100644
--- a/vp8/encoder/quantize.c
+++ b/vp8/encoder/quantize.c
@@ -44,21 +44,21 @@
         z    = coeff_ptr[rc];
         zbin = zbin_ptr[rc] ;
 
-        sz = (z >> 31);                                 // sign of z
-        x  = (z ^ sz) - sz;                             // x = abs(z)
+        sz = (z >> 31);                              /* sign of z */
+        x  = (z ^ sz) - sz;                          /* x = abs(z) */
 
         if (x >= zbin)
         {
             x += round_ptr[rc];
             y  = (((x * quant_ptr[rc]) >> 16) + x)
-                 >> quant_shift_ptr[rc];                // quantize (x)
-            x  = (y ^ sz) - sz;                         // get the sign back
-            qcoeff_ptr[rc] = x;                          // write to destination
-            dqcoeff_ptr[rc] = x * dequant_ptr[rc];        // dequantized value
+                 >> quant_shift_ptr[rc];             /* quantize (x) */
+            x  = (y ^ sz) - sz;                      /* get the sign back */
+            qcoeff_ptr[rc] = x;                      /* write to destination */
+            dqcoeff_ptr[rc] = x * dequant_ptr[rc];   /* dequantized value */
 
             if (y)
             {
-                eob = i;                                // last nonzero coeffs
+                eob = i;                             /* last nonzero coeffs */
             }
         }
     }
@@ -84,17 +84,17 @@
         rc   = vp8_default_zig_zag1d[i];
         z    = coeff_ptr[rc];
 
-        sz = (z >> 31);                                 // sign of z
-        x  = (z ^ sz) - sz;                             // x = abs(z)
+        sz = (z >> 31);                              /* sign of z */
+        x  = (z ^ sz) - sz;                          /* x = abs(z) */
 
-        y  = ((x + round_ptr[rc]) * quant_ptr[rc]) >> 16; // quantize (x)
-        x  = (y ^ sz) - sz;                         // get the sign back
-        qcoeff_ptr[rc] = x;                          // write to destination
-        dqcoeff_ptr[rc] = x * dequant_ptr[rc];        // dequantized value
+        y  = ((x + round_ptr[rc]) * quant_ptr[rc]) >> 16; /* quantize (x) */
+        x  = (y ^ sz) - sz;                          /* get the sign back */
+        qcoeff_ptr[rc] = x;                          /* write to destination */
+        dqcoeff_ptr[rc] = x * dequant_ptr[rc];       /* dequantized value */
 
         if (y)
         {
-            eob = i;                                // last nonzero coeffs
+            eob = i;                                 /* last nonzero coeffs */
         }
     }
     *d->eob = (char)(eob + 1);
@@ -132,22 +132,22 @@
         zbin = zbin_ptr[rc] + *zbin_boost_ptr + zbin_oq_value;
 
         zbin_boost_ptr ++;
-        sz = (z >> 31);                                 // sign of z
-        x  = (z ^ sz) - sz;                             // x = abs(z)
+        sz = (z >> 31);                              /* sign of z */
+        x  = (z ^ sz) - sz;                          /* x = abs(z) */
 
         if (x >= zbin)
         {
             x += round_ptr[rc];
             y  = (((x * quant_ptr[rc]) >> 16) + x)
-                 >> quant_shift_ptr[rc];                // quantize (x)
-            x  = (y ^ sz) - sz;                         // get the sign back
-            qcoeff_ptr[rc]  = x;                        // write to destination
-            dqcoeff_ptr[rc] = x * dequant_ptr[rc];      // dequantized value
+                 >> quant_shift_ptr[rc];             /* quantize (x) */
+            x  = (y ^ sz) - sz;                      /* get the sign back */
+            qcoeff_ptr[rc]  = x;                     /* write to destination */
+            dqcoeff_ptr[rc] = x * dequant_ptr[rc];   /* dequantized value */
 
             if (y)
             {
-                eob = i;                                // last nonzero coeffs
-                zbin_boost_ptr = b->zrun_zbin_boost;    // reset zero runlength
+                eob = i;                             /* last nonzero coeffs */
+                zbin_boost_ptr = b->zrun_zbin_boost; /* reset zero runlength */
             }
         }
     }
@@ -240,26 +240,23 @@
         rc   = vp8_default_zig_zag1d[i];
         z    = coeff_ptr[rc];
 
-        //if ( i == 0 )
-        //    zbin = zbin_ptr[rc] + *zbin_boost_ptr + zbin_oq_value/2;
-        //else
         zbin = zbin_ptr[rc] + *zbin_boost_ptr + zbin_oq_value;
 
         zbin_boost_ptr ++;
-        sz = (z >> 31);                                 // sign of z
-        x  = (z ^ sz) - sz;                             // x = abs(z)
+        sz = (z >> 31);                              /* sign of z */
+        x  = (z ^ sz) - sz;                          /* x = abs(z) */
 
         if (x >= zbin)
         {
-            y  = ((x + round_ptr[rc]) * quant_ptr[rc]) >> 16; // quantize (x)
-            x  = (y ^ sz) - sz;                         // get the sign back
-            qcoeff_ptr[rc]  = x;                         // write to destination
-            dqcoeff_ptr[rc] = x * dequant_ptr[rc];        // dequantized value
+            y  = ((x + round_ptr[rc]) * quant_ptr[rc]) >> 16; /* quantize (x) */
+            x  = (y ^ sz) - sz;                      /* get the sign back */
+            qcoeff_ptr[rc]  = x;                     /* write to destination */
+            dqcoeff_ptr[rc] = x * dequant_ptr[rc];   /* dequantized value */
 
             if (y)
             {
-                eob = i;                                // last nonzero coeffs
-                zbin_boost_ptr = &b->zrun_zbin_boost[0];    // reset zero runlength
+                eob = i;                             /* last nonzero coeffs */
+                zbin_boost_ptr = &b->zrun_zbin_boost[0]; /* reset zrl */
             }
         }
     }
@@ -441,7 +438,7 @@
 
     for (Q = 0; Q < QINDEX_RANGE; Q++)
     {
-        // dc values
+        /* dc values */
         quant_val = vp8_dc_quant(Q, cpi->common.y1dc_delta_q);
         cpi->Y1quant_fast[Q][0] = (1 << 16) / quant_val;
         invert_quant(cpi->sf.improved_quant, cpi->Y1quant[Q] + 0,
@@ -469,7 +466,7 @@
         cpi->common.UVdequant[Q][0] = quant_val;
         cpi->zrun_zbin_boost_uv[Q][0] = (quant_val * zbin_boost[0]) >> 7;
 
-        // all the ac values = ;
+        /* all the ac values = ; */
         quant_val = vp8_ac_yquant(Q);
         cpi->Y1quant_fast[Q][1] = (1 << 16) / quant_val;
         invert_quant(cpi->sf.improved_quant, cpi->Y1quant[Q] + 1,
@@ -536,7 +533,7 @@
 
     for (Q = 0; Q < QINDEX_RANGE; Q++)
     {
-        // dc values
+        /* dc values */
         quant_val = vp8_dc_quant(Q, cpi->common.y1dc_delta_q);
         cpi->Y1quant[Q][0] = (1 << 16) / quant_val;
         cpi->Y1zbin[Q][0] = ((qzbin_factors[Q] * quant_val) + 64) >> 7;
@@ -558,7 +555,7 @@
         cpi->common.UVdequant[Q][0] = quant_val;
         cpi->zrun_zbin_boost_uv[Q][0] = (quant_val * zbin_boost[0]) >> 7;
 
-        // all the ac values = ;
+        /* all the ac values = ; */
         for (i = 1; i < 16; i++)
         {
             int rc = vp8_default_zig_zag1d[i];
@@ -613,18 +610,18 @@
     MACROBLOCKD *xd = &x->e_mbd;
     int zbin_extra;
 
-    // Select the baseline MB Q index.
+    /* Select the baseline MB Q index. */
     if (xd->segmentation_enabled)
     {
-        // Abs Value
+        /* Abs Value */
         if (xd->mb_segement_abs_delta == SEGMENT_ABSDATA)
-
             QIndex = xd->segment_feature_data[MB_LVL_ALT_Q][xd->mode_info_context->mbmi.segment_id];
-        // Delta Value
+        /* Delta Value */
         else
         {
             QIndex = cpi->common.base_qindex + xd->segment_feature_data[MB_LVL_ALT_Q][xd->mode_info_context->mbmi.segment_id];
-            QIndex = (QIndex >= 0) ? ((QIndex <= MAXQ) ? QIndex : MAXQ) : 0;    // Clamp to valid range
+            /* Clamp to valid range */
+            QIndex = (QIndex >= 0) ? ((QIndex <= MAXQ) ? QIndex : MAXQ) : 0;
         }
     }
     else
@@ -657,13 +654,13 @@
          * This will also require modifications to the x86 and neon assembly.
          * */
         for (i = 0; i < 16; i++)
-            x->e_mbd.block[i].dequant = xd->dequant_y1; //cpi->common.Y1dequant[QIndex];
+            x->e_mbd.block[i].dequant = xd->dequant_y1;
         for (i = 16; i < 24; i++)
-            x->e_mbd.block[i].dequant = xd->dequant_uv; //cpi->common.UVdequant[QIndex];
-        x->e_mbd.block[24].dequant = xd->dequant_y2; //cpi->common.Y2dequant[QIndex];
+            x->e_mbd.block[i].dequant = xd->dequant_uv;
+        x->e_mbd.block[24].dequant = xd->dequant_y2;
 #endif
 
-        // Y
+        /* Y */
         zbin_extra = ZBIN_EXTRA_Y;
 
         for (i = 0; i < 16; i++)
@@ -677,7 +674,7 @@
             x->block[i].zbin_extra = (short)zbin_extra;
         }
 
-        // UV
+        /* UV */
         zbin_extra = ZBIN_EXTRA_UV;
 
         for (i = 16; i < 24; i++)
@@ -691,7 +688,7 @@
             x->block[i].zbin_extra = (short)zbin_extra;
         }
 
-        // Y2
+        /* Y2 */
         zbin_extra = ZBIN_EXTRA_Y2;
 
         x->block[24].quant_fast = cpi->Y2quant_fast[QIndex];
@@ -716,19 +713,19 @@
             || cpi->last_zbin_mode_boost != cpi->zbin_mode_boost
             || x->last_act_zbin_adj != x->act_zbin_adj)
     {
-        // Y
+        /* Y */
         zbin_extra = ZBIN_EXTRA_Y;
 
         for (i = 0; i < 16; i++)
             x->block[i].zbin_extra = (short)zbin_extra;
 
-        // UV
+        /* UV */
         zbin_extra = ZBIN_EXTRA_UV;
 
         for (i = 16; i < 24; i++)
             x->block[i].zbin_extra = (short)zbin_extra;
 
-        // Y2
+        /* Y2 */
         zbin_extra = ZBIN_EXTRA_Y2;
         x->block[24].zbin_extra = (short)zbin_extra;
 
@@ -744,19 +741,19 @@
     int QIndex = x->q_index;
     int zbin_extra;
 
-    // Y
+    /* Y */
     zbin_extra = ZBIN_EXTRA_Y;
 
     for (i = 0; i < 16; i++)
         x->block[i].zbin_extra = (short)zbin_extra;
 
-    // UV
+    /* UV */
     zbin_extra = ZBIN_EXTRA_UV;
 
     for (i = 16; i < 24; i++)
         x->block[i].zbin_extra = (short)zbin_extra;
 
-    // Y2
+    /* Y2 */
     zbin_extra = ZBIN_EXTRA_Y2;
     x->block[24].zbin_extra = (short)zbin_extra;
 }
@@ -766,10 +763,10 @@
 
 void vp8cx_frame_init_quantizer(VP8_COMP *cpi)
 {
-    // Clear Zbin mode boost for default case
+    /* Clear Zbin mode boost for default case */
     cpi->zbin_mode_boost = 0;
 
-    // MB level quantizer setup
+    /* MB level quantizer setup */
     vp8cx_mb_init_quantizer(cpi, &cpi->mb, 0);
 }
 
@@ -801,7 +798,7 @@
     cm->y2dc_delta_q = new_delta_q;
 
 
-    // Set Segment specific quatizers
+    /* Set Segment specific quantizers */
     mbd->segment_feature_data[MB_LVL_ALT_Q][0] = cpi->segment_feature_data[MB_LVL_ALT_Q][0];
     mbd->segment_feature_data[MB_LVL_ALT_Q][1] = cpi->segment_feature_data[MB_LVL_ALT_Q][1];
     mbd->segment_feature_data[MB_LVL_ALT_Q][2] = cpi->segment_feature_data[MB_LVL_ALT_Q][2];
diff --git a/vp8/encoder/ratectrl.c b/vp8/encoder/ratectrl.c
index 04fd2d0..4dc078a 100644
--- a/vp8/encoder/ratectrl.c
+++ b/vp8/encoder/ratectrl.c
@@ -41,15 +41,16 @@
 extern int inter_b_modes[10];
 #endif
 
-// Bits Per MB at different Q (Multiplied by 512)
+/* Bits Per MB at different Q (Multiplied by 512) */
 #define BPER_MB_NORMBITS    9
 
-// Work in progress recalibration of baseline rate tables based on
-// the assumption that bits per mb is inversely proportional to the
-// quantizer value.
+/* Work in progress recalibration of baseline rate tables based on
+ * the assumption that bits per mb is inversely proportional to the
+ * quantizer value.
+ */
 const int vp8_bits_per_mb[2][QINDEX_RANGE] =
 {
-    // Intra case 450000/Qintra
+    /* Intra case 450000/Qintra */
     {
         1125000,900000, 750000, 642857, 562500, 500000, 450000, 450000,
         409090, 375000, 346153, 321428, 300000, 281250, 264705, 264705,
@@ -68,7 +69,7 @@
         36885,  36290,  35714,  35156,  34615,  34090,  33582,  33088,
         32608,  32142,  31468,  31034,  30405,  29801,  29220,  28662,
     },
-    // Inter case 285000/Qinter
+    /* Inter case 285000/Qinter */
     {
         712500, 570000, 475000, 407142, 356250, 316666, 285000, 259090,
         237500, 219230, 203571, 190000, 178125, 167647, 158333, 150000,
@@ -109,7 +110,7 @@
     220, 220, 220, 220, 220, 220, 220, 220,
 };
 
-//#define GFQ_ADJUSTMENT (Q+100)
+/* #define GFQ_ADJUSTMENT (Q+100) */
 #define GFQ_ADJUSTMENT vp8_gf_boost_qadjustment[Q]
 const int vp8_gf_boost_qadjustment[QINDEX_RANGE] =
 {
@@ -173,7 +174,7 @@
     600, 600, 600, 600, 600, 600, 600, 600,
 };
 
-// % adjustment to target kf size based on seperation from previous frame
+/* % adjustment to target kf size based on separation from previous frame */
 static const int kf_boost_seperation_adjustment[16] =
 {
     30,   40,   50,   55,   60,   65,   70,   75,
@@ -224,10 +225,11 @@
 {
     CODING_CONTEXT *const cc = & cpi->coding_context;
 
-    // Stores a snapshot of key state variables which can subsequently be
-    // restored with a call to vp8_restore_coding_context. These functions are
-    // intended for use in a re-code loop in vp8_compress_frame where the
-    // quantizer value is adjusted between loop iterations.
+    /* Stores a snapshot of key state variables which can subsequently be
+     * restored with a call to vp8_restore_coding_context. These functions are
+     * intended for use in a re-code loop in vp8_compress_frame where the
+     * quantizer value is adjusted between loop iterations.
+     */
 
     cc->frames_since_key          = cpi->frames_since_key;
     cc->filter_level             = cpi->common.filter_level;
@@ -244,7 +246,7 @@
     vp8_copy(cc->uv_mode_count, cpi->uv_mode_count);
 
 
-    // Stats
+    /* Stats */
 #ifdef MODE_STATS
     vp8_copy(cc->y_modes,       y_modes);
     vp8_copy(cc->uv_modes,      uv_modes);
@@ -262,8 +264,9 @@
 {
     CODING_CONTEXT *const cc = & cpi->coding_context;
 
-    // Restore key state variables to the snapshot state stored in the
-    // previous call to vp8_save_coding_context.
+    /* Restore key state variables to the snapshot state stored in the
+     * previous call to vp8_save_coding_context.
+     */
 
     cpi->frames_since_key         =   cc->frames_since_key;
     cpi->common.filter_level     =   cc->filter_level;
@@ -280,7 +283,7 @@
     vp8_copy(cpi->ymode_count, cc->ymode_count);
     vp8_copy(cpi->uv_mode_count, cc->uv_mode_count);
 
-    // Stats
+    /* Stats */
 #ifdef MODE_STATS
     vp8_copy(y_modes, cc->y_modes);
     vp8_copy(uv_modes, cc->uv_modes);
@@ -297,7 +300,7 @@
 
 void vp8_setup_key_frame(VP8_COMP *cpi)
 {
-    // Setup for Key frame:
+    /* Setup for Key frame: */
 
     vp8_default_coef_probs(& cpi->common);
 
@@ -307,23 +310,23 @@
         vp8_build_component_cost_table(cpi->mb.mvcost, (const MV_CONTEXT *) cpi->common.fc.mvc, flag);
     }
 
-    vpx_memset(cpi->common.fc.pre_mvc, 0, sizeof(cpi->common.fc.pre_mvc));  //initialize pre_mvc to all zero.
+    /* initialize pre_mvc to all zero. */
+    vpx_memset(cpi->common.fc.pre_mvc, 0, sizeof(cpi->common.fc.pre_mvc));
 
-    // Make sure we initialize separate contexts for altref,gold, and normal.
-    // TODO shouldn't need 3 different copies of structure to do this!
+    /* Make sure we initialize separate contexts for altref,gold, and normal.
+     * TODO shouldn't need 3 different copies of structure to do this!
+     */
     vpx_memcpy(&cpi->lfc_a, &cpi->common.fc, sizeof(cpi->common.fc));
     vpx_memcpy(&cpi->lfc_g, &cpi->common.fc, sizeof(cpi->common.fc));
     vpx_memcpy(&cpi->lfc_n, &cpi->common.fc, sizeof(cpi->common.fc));
 
-    //cpi->common.filter_level = 0;      // Reset every key frame.
     cpi->common.filter_level = cpi->common.base_qindex * 3 / 8 ;
 
-    // Provisional interval before next GF
+    /* Provisional interval before next GF */
     if (cpi->auto_gold)
-        //cpi->frames_till_gf_update_due = DEFAULT_GF_INTERVAL;
         cpi->frames_till_gf_update_due = cpi->baseline_gf_interval;
     else
-        cpi->frames_till_gf_update_due = cpi->goldfreq;
+        cpi->frames_till_gf_update_due = DEFAULT_GF_INTERVAL;
 
     cpi->common.refresh_golden_frame = 1;
     cpi->common.refresh_alt_ref_frame = 1;
@@ -348,12 +351,12 @@
 
 static void calc_iframe_target_size(VP8_COMP *cpi)
 {
-    // boost defaults to half second
+    /* boost defaults to half second */
     int kf_boost;
-    unsigned int target;
+    uint64_t target;
 
-    // Clear down mmx registers to allow floating point in what follows
-    vp8_clear_system_state();  //__asm emms;
+    /* Clear down mmx registers to allow floating point in what follows */
+    vp8_clear_system_state();
 
     if (cpi->oxcf.fixed_q >= 0)
     {
@@ -364,10 +367,10 @@
     }
     else if (cpi->pass == 2)
     {
-        // New Two pass RC
+        /* New Two pass RC */
         target = cpi->per_frame_bandwidth;
     }
-    // First Frame is a special case
+    /* First Frame is a special case */
     else if (cpi->common.current_video_frame == 0)
     {
         /* 1 Pass there is no information on which to base size so use
@@ -381,29 +384,29 @@
     }
     else
     {
-        // if this keyframe was forced, use a more recent Q estimate
+        /* if this keyframe was forced, use a more recent Q estimate */
         int Q = (cpi->common.frame_flags & FRAMEFLAGS_KEY)
                 ? cpi->avg_frame_qindex : cpi->ni_av_qi;
 
-        int initial_boost = 24; // Corresponds to: |2.5 * per_frame_bandwidth|
-        // Boost depends somewhat on frame rate: only used for 1 layer case.
+        int initial_boost = 32; /* |3.0 * per_frame_bandwidth| */
+        /* Boost depends somewhat on frame rate: only used for 1 layer case. */
         if (cpi->oxcf.number_of_layers == 1) {
           kf_boost = MAX(initial_boost, (int)(2 * cpi->output_frame_rate - 16));
         }
         else {
-          // Initial factor: set target size to: |2.5 * per_frame_bandwidth|.
+          /* Initial factor: set target size to: |3.0 * per_frame_bandwidth|. */
           kf_boost = initial_boost;
         }
 
-        // adjustment up based on q: this factor ranges from ~1.2 to 2.2.
+        /* adjustment up based on q: this factor ranges from ~1.2 to 2.2. */
         kf_boost = kf_boost * kf_boost_qadjustment[Q] / 100;
 
-        // frame separation adjustment ( down)
+        /* frame separation adjustment ( down) */
         if (cpi->frames_since_key  < cpi->output_frame_rate / 2)
             kf_boost = (int)(kf_boost
                        * cpi->frames_since_key / (cpi->output_frame_rate / 2));
 
-        // Minimal target size is |2* per_frame_bandwidth|.
+        /* Minimal target size is |2* per_frame_bandwidth|. */
         if (kf_boost < 16)
             kf_boost = 16;
 
@@ -420,10 +423,11 @@
             target = max_rate;
     }
 
-    cpi->this_frame_target = target;
+    cpi->this_frame_target = (int)target;
 
-    // TODO: if we separate rate targeting from Q targetting, move this.
-    // Reset the active worst quality to the baseline value for key frames.
+    /* TODO: if we separate rate targeting from Q targeting, move this.
+     * Reset the active worst quality to the baseline value for key frames.
+     */
     if (cpi->pass != 2)
         cpi->active_worst_quality = cpi->worst_quality;
 
@@ -432,9 +436,6 @@
         FILE *f;
 
         f = fopen("kf_boost.stt", "a");
-        //fprintf(f, " %8d %10d %10d %10d %10d %10d %10d\n",
-        //  cpi->common.current_video_frame,  cpi->target_bandwidth, cpi->frames_to_key, kf_boost_qadjustment[cpi->ni_av_qi], cpi->kf_boost, (cpi->this_frame_target *100 / cpi->per_frame_bandwidth), cpi->this_frame_target );
-
         fprintf(f, " %8u %10d %10d %10d\n",
                 cpi->common.current_video_frame,  cpi->gfu_boost, cpi->baseline_gf_interval, cpi->source_alt_ref_pending);
 
@@ -444,14 +445,15 @@
 }
 
 
-//  Do the best we can to define the parameters for the next GF based on what
-// information we have available.
+/* Do the best we can to define the parameters for the next GF based on what
+ * information we have available.
+ */
 static void calc_gf_params(VP8_COMP *cpi)
 {
     int Q = (cpi->oxcf.fixed_q < 0) ? cpi->last_q[INTER_FRAME] : cpi->oxcf.fixed_q;
     int Boost = 0;
 
-    int gf_frame_useage = 0;      // Golden frame useage since last GF
+    int gf_frame_useage = 0;      /* Golden frame usage since last GF */
     int tot_mbs = cpi->recent_ref_frame_usage[INTRA_FRAME]  +
                   cpi->recent_ref_frame_usage[LAST_FRAME]   +
                   cpi->recent_ref_frame_usage[GOLDEN_FRAME] +
@@ -459,33 +461,30 @@
 
     int pct_gf_active = (100 * cpi->gf_active_count) / (cpi->common.mb_rows * cpi->common.mb_cols);
 
-    // Reset the last boost indicator
-    //cpi->last_boost = 100;
-
     if (tot_mbs)
         gf_frame_useage = (cpi->recent_ref_frame_usage[GOLDEN_FRAME] + cpi->recent_ref_frame_usage[ALTREF_FRAME]) * 100 / tot_mbs;
 
     if (pct_gf_active > gf_frame_useage)
         gf_frame_useage = pct_gf_active;
 
-    // Not two pass
+    /* Not two pass */
     if (cpi->pass != 2)
     {
-        // Single Pass lagged mode: TBD
+        /* Single Pass lagged mode: TBD */
         if (0)
         {
         }
 
-        // Single Pass compression: Has to use current and historical data
+        /* Single Pass compression: Has to use current and historical data */
         else
         {
 #if 0
-            // Experimental code
+            /* Experimental code */
             int index = cpi->one_pass_frame_index;
             int frames_to_scan = (cpi->max_gf_interval <= MAX_LAG_BUFFERS) ? cpi->max_gf_interval : MAX_LAG_BUFFERS;
 
+            /* ************** Experimental code - incomplete */
             /*
-            // *************** Experimental code - incomplete
             double decay_val = 1.0;
             double IIAccumulator = 0.0;
             double last_iiaccumulator = 0.0;
@@ -528,48 +527,51 @@
 #else
 
             /*************************************************************/
-            // OLD code
+            /* OLD code */
 
-            // Adjust boost based upon ambient Q
+            /* Adjust boost based upon ambient Q */
             Boost = GFQ_ADJUSTMENT;
 
-            // Adjust based upon most recently measure intra useage
+            /* Adjust based upon most recently measure intra useage */
             Boost = Boost * gf_intra_usage_adjustment[(cpi->this_frame_percent_intra < 15) ? cpi->this_frame_percent_intra : 14] / 100;
 
-            // Adjust gf boost based upon GF usage since last GF
+            /* Adjust gf boost based upon GF usage since last GF */
             Boost = Boost * gf_adjust_table[gf_frame_useage] / 100;
 #endif
         }
 
-        // golden frame boost without recode loop often goes awry.  be safe by keeping numbers down.
+        /* golden frame boost without recode loop often goes awry.  be
+         * safe by keeping numbers down.
+         */
         if (!cpi->sf.recode_loop)
         {
             if (cpi->compressor_speed == 2)
                 Boost = Boost / 2;
         }
 
-        // Apply an upper limit based on Q for 1 pass encodes
+        /* Apply an upper limit based on Q for 1 pass encodes */
         if (Boost > kf_gf_boost_qlimits[Q] && (cpi->pass == 0))
             Boost = kf_gf_boost_qlimits[Q];
 
-        // Apply lower limits to boost.
+        /* Apply lower limits to boost. */
         else if (Boost < 110)
             Boost = 110;
 
-        // Note the boost used
+        /* Note the boost used */
         cpi->last_boost = Boost;
 
     }
 
-    // Estimate next interval
-    // This is updated once the real frame size/boost is known.
+    /* Estimate next interval
+     * This is updated once the real frame size/boost is known.
+     */
     if (cpi->oxcf.fixed_q == -1)
     {
-        if (cpi->pass == 2)         // 2 Pass
+        if (cpi->pass == 2)         /* 2 Pass */
         {
             cpi->frames_till_gf_update_due = cpi->baseline_gf_interval;
         }
-        else                            // 1 Pass
+        else                            /* 1 Pass */
         {
             cpi->frames_till_gf_update_due = cpi->baseline_gf_interval;
 
@@ -595,10 +597,10 @@
     else
         cpi->frames_till_gf_update_due = cpi->baseline_gf_interval;
 
-    // ARF on or off
+    /* ARF on or off */
     if (cpi->pass != 2)
     {
-        // For now Alt ref is not allowed except in 2 pass modes.
+        /* For now Alt ref is not allowed except in 2 pass modes. */
         cpi->source_alt_ref_pending = 0;
 
         /*if ( cpi->oxcf.fixed_q == -1)
@@ -635,89 +637,34 @@
         min_frame_target = cpi->per_frame_bandwidth / 4;
 
 
-    // Special alt reference frame case
+    /* Special alt reference frame case */
     if((cpi->common.refresh_alt_ref_frame) && (cpi->oxcf.number_of_layers == 1))
     {
         if (cpi->pass == 2)
         {
-            cpi->per_frame_bandwidth = cpi->twopass.gf_bits;                       // Per frame bit target for the alt ref frame
+            /* Per frame bit target for the alt ref frame */
+            cpi->per_frame_bandwidth = cpi->twopass.gf_bits;
             cpi->this_frame_target = cpi->per_frame_bandwidth;
         }
 
         /* One Pass ??? TBD */
-        /*else
-        {
-            int frames_in_section;
-            int allocation_chunks;
-            int Q = (cpi->oxcf.fixed_q < 0) ? cpi->last_q[INTER_FRAME] : cpi->oxcf.fixed_q;
-            int alt_boost;
-            int max_arf_rate;
-
-            alt_boost = (cpi->gfu_boost * 3 * GFQ_ADJUSTMENT) / (2 * 100);
-            alt_boost += (cpi->frames_till_gf_update_due * 50);
-
-            // If alt ref is not currently active then we have a pottential double hit with GF and ARF so reduce the boost a bit.
-            // A similar thing is done on GFs that preceed a arf update.
-            if ( !cpi->source_alt_ref_active )
-                alt_boost = alt_boost * 3 / 4;
-
-            frames_in_section = cpi->frames_till_gf_update_due+1;                                   // Standard frames + GF
-            allocation_chunks = (frames_in_section * 100) + alt_boost;
-
-            // Normalize Altboost and allocations chunck down to prevent overflow
-            while ( alt_boost > 1000 )
-            {
-                alt_boost /= 2;
-                allocation_chunks /= 2;
-            }
-
-            else
-            {
-                int bits_in_section;
-
-                if ( cpi->kf_overspend_bits > 0 )
-                {
-                    Adjustment = (cpi->kf_bitrate_adjustment <= cpi->kf_overspend_bits) ? cpi->kf_bitrate_adjustment : cpi->kf_overspend_bits;
-
-                    if ( Adjustment > (cpi->per_frame_bandwidth - min_frame_target) )
-                        Adjustment = (cpi->per_frame_bandwidth - min_frame_target);
-
-                    cpi->kf_overspend_bits -= Adjustment;
-
-                    // Calculate an inter frame bandwidth target for the next few frames designed to recover
-                    // any extra bits spent on the key frame.
-                    cpi->inter_frame_target = cpi->per_frame_bandwidth - Adjustment;
-                    if ( cpi->inter_frame_target < min_frame_target )
-                        cpi->inter_frame_target = min_frame_target;
-                }
-                else
-                    cpi->inter_frame_target = cpi->per_frame_bandwidth;
-
-                bits_in_section = cpi->inter_frame_target * frames_in_section;
-
-                // Avoid loss of precision but avoid overflow
-                if ( (bits_in_section>>7) > allocation_chunks )
-                    cpi->this_frame_target = alt_boost * (bits_in_section / allocation_chunks);
-                else
-                    cpi->this_frame_target = (alt_boost * bits_in_section) / allocation_chunks;
-            }
-        }
-        */
     }
 
-    // Normal frames (gf,and inter)
+    /* Normal frames (gf,and inter) */
     else
     {
-        // 2 pass
+        /* 2 pass */
         if (cpi->pass == 2)
         {
             cpi->this_frame_target = cpi->per_frame_bandwidth;
         }
-        // 1 pass
+        /* 1 pass */
         else
         {
-            // Make rate adjustment to recover bits spent in key frame
-            // Test to see if the key frame inter data rate correction should still be in force
+            /* Make rate adjustment to recover bits spent in key frame
+             * Test to see if the key frame inter data rate correction
+             * should still be in force
+             */
             if (cpi->kf_overspend_bits > 0)
             {
                 Adjustment = (cpi->kf_bitrate_adjustment <= cpi->kf_overspend_bits) ? cpi->kf_bitrate_adjustment : cpi->kf_overspend_bits;
@@ -727,8 +674,10 @@
 
                 cpi->kf_overspend_bits -= Adjustment;
 
-                // Calculate an inter frame bandwidth target for the next few frames designed to recover
-                // any extra bits spent on the key frame.
+                /* Calculate an inter frame bandwidth target for the next
+                 * few frames designed to recover any extra bits spent on
+                 * the key frame.
+                 */
                 cpi->this_frame_target = cpi->per_frame_bandwidth - Adjustment;
 
                 if (cpi->this_frame_target < min_frame_target)
@@ -737,7 +686,9 @@
             else
                 cpi->this_frame_target = cpi->per_frame_bandwidth;
 
-            // If appropriate make an adjustment to recover bits spent on a recent GF
+            /* If appropriate make an adjustment to recover bits spent on a
+             * recent GF
+             */
             if ((cpi->gf_overspend_bits > 0) && (cpi->this_frame_target > min_frame_target))
             {
                 int Adjustment = (cpi->non_gf_bitrate_adjustment <= cpi->gf_overspend_bits) ? cpi->non_gf_bitrate_adjustment : cpi->gf_overspend_bits;
@@ -749,11 +700,11 @@
                 cpi->this_frame_target -= Adjustment;
             }
 
-            // Apply small + and - boosts for non gf frames
+            /* Apply small + and - boosts for non gf frames */
             if ((cpi->last_boost > 150) && (cpi->frames_till_gf_update_due > 0) &&
                 (cpi->current_gf_interval >= (MIN_GF_INTERVAL << 1)))
             {
-                // % Adjustment limited to the range 1% to 10%
+                /* % Adjustment limited to the range 1% to 10% */
                 Adjustment = (cpi->last_boost - 100) >> 5;
 
                 if (Adjustment < 1)
@@ -761,7 +712,7 @@
                 else if (Adjustment > 10)
                     Adjustment = 10;
 
-                // Convert to bits
+                /* Convert to bits */
                 Adjustment = (cpi->this_frame_target * Adjustment) / 100;
 
                 if (Adjustment > (cpi->this_frame_target - min_frame_target))
@@ -775,47 +726,53 @@
         }
     }
 
-    // Sanity check that the total sum of adjustments is not above the maximum allowed
-    // That is that having allowed for KF and GF penalties we have not pushed the
-    // current interframe target to low. If the adjustment we apply here is not capable of recovering
-    // all the extra bits we have spent in the KF or GF then the remainder will have to be recovered over
-    // a longer time span via other buffer / rate control mechanisms.
+    /* Sanity check that the total sum of adjustments is not above the
+     * maximum allowed That is that having allowed for KF and GF penalties
+     * we have not pushed the current interframe target to low. If the
+     * adjustment we apply here is not capable of recovering all the extra
+     * bits we have spent in the KF or GF then the remainder will have to
+     * be recovered over a longer time span via other buffer / rate control
+     * mechanisms.
+     */
     if (cpi->this_frame_target < min_frame_target)
         cpi->this_frame_target = min_frame_target;
 
     if (!cpi->common.refresh_alt_ref_frame)
-        // Note the baseline target data rate for this inter frame.
+        /* Note the baseline target data rate for this inter frame. */
         cpi->inter_frame_target = cpi->this_frame_target;
 
-    // One Pass specific code
+    /* One Pass specific code */
     if (cpi->pass == 0)
     {
-        // Adapt target frame size with respect to any buffering constraints:
+        /* Adapt target frame size with respect to any buffering constraints: */
         if (cpi->buffered_mode)
         {
-            int one_percent_bits = 1 + cpi->oxcf.optimal_buffer_level / 100;
+            int one_percent_bits = (int)
+                (1 + cpi->oxcf.optimal_buffer_level / 100);
 
             if ((cpi->buffer_level < cpi->oxcf.optimal_buffer_level) ||
                 (cpi->bits_off_target < cpi->oxcf.optimal_buffer_level))
             {
                 int percent_low = 0;
 
-                // Decide whether or not we need to adjust the frame data rate target.
-                //
-                // If we are are below the optimal buffer fullness level and adherence
-                // to buffering constraints is important to the end usage then adjust
-                // the per frame target.
+                /* Decide whether or not we need to adjust the frame data
+                 * rate target.
+                 *
+                 * If we are are below the optimal buffer fullness level
+                 * and adherence to buffering constraints is important to
+                 * the end usage then adjust the per frame target.
+                 */
                 if ((cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER) &&
                     (cpi->buffer_level < cpi->oxcf.optimal_buffer_level))
                 {
-                    percent_low =
-                        (cpi->oxcf.optimal_buffer_level - cpi->buffer_level) /
-                        one_percent_bits;
+                    percent_low = (int)
+                        ((cpi->oxcf.optimal_buffer_level - cpi->buffer_level) /
+                        one_percent_bits);
                 }
-                // Are we overshooting the long term clip data rate...
+                /* Are we overshooting the long term clip data rate... */
                 else if (cpi->bits_off_target < 0)
                 {
-                    // Adjust per frame data target downwards to compensate.
+                    /* Adjust per frame data target downwards to compensate. */
                     percent_low = (int)(100 * -cpi->bits_off_target /
                                        (cpi->total_byte_count * 8));
                 }
@@ -825,40 +782,46 @@
                 else if (percent_low < 0)
                     percent_low = 0;
 
-                // lower the target bandwidth for this frame.
+                /* lower the target bandwidth for this frame. */
                 cpi->this_frame_target -=
                         (cpi->this_frame_target * percent_low) / 200;
 
-                // Are we using allowing control of active_worst_allowed_q
-                // according to buffer level.
+                /* Are we using allowing control of active_worst_allowed_q
+                 * according to buffer level.
+                 */
                 if (cpi->auto_worst_q && cpi->ni_frames > 150)
                 {
-                    int critical_buffer_level;
+                    int64_t critical_buffer_level;
 
-                    // For streaming applications the most important factor is
-                    // cpi->buffer_level as this takes into account the
-                    // specified short term buffering constraints. However,
-                    // hitting the long term clip data rate target is also
-                    // important.
+                    /* For streaming applications the most important factor is
+                     * cpi->buffer_level as this takes into account the
+                     * specified short term buffering constraints. However,
+                     * hitting the long term clip data rate target is also
+                     * important.
+                     */
                     if (cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER)
                     {
-                        // Take the smaller of cpi->buffer_level and
-                        // cpi->bits_off_target
+                        /* Take the smaller of cpi->buffer_level and
+                         * cpi->bits_off_target
+                         */
                         critical_buffer_level =
                             (cpi->buffer_level < cpi->bits_off_target)
                             ? cpi->buffer_level : cpi->bits_off_target;
                     }
-                    // For local file playback short term buffering constraints
-                    // are less of an issue
+                    /* For local file playback short term buffering constraints
+                     * are less of an issue
+                     */
                     else
                     {
-                        // Consider only how we are doing for the clip as a
-                        // whole
+                        /* Consider only how we are doing for the clip as a
+                         * whole
+                         */
                         critical_buffer_level = cpi->bits_off_target;
                     }
 
-                    // Set the active worst quality based upon the selected
-                    // buffer fullness number.
+                    /* Set the active worst quality based upon the selected
+                     * buffer fullness number.
+                     */
                     if (critical_buffer_level < cpi->oxcf.optimal_buffer_level)
                     {
                         if ( critical_buffer_level >
@@ -870,15 +833,16 @@
                                       (critical_buffer_level -
                                        (cpi->oxcf.optimal_buffer_level >> 2));
 
-                            // Step active worst quality down from
-                            // cpi->ni_av_qi when (critical_buffer_level ==
-                            // cpi->optimal_buffer_level) to
-                            // cpi->worst_quality when
-                            // (critical_buffer_level ==
-                            //     cpi->optimal_buffer_level >> 2)
+                            /* Step active worst quality down from
+                             * cpi->ni_av_qi when (critical_buffer_level ==
+                             * cpi->optimal_buffer_level) to
+                             * cpi->worst_quality when
+                             * (critical_buffer_level ==
+                             *     cpi->optimal_buffer_level >> 2)
+                             */
                             cpi->active_worst_quality =
                                 cpi->worst_quality -
-                                ((qadjustment_range * above_base) /
+                                (int)((qadjustment_range * above_base) /
                                  (cpi->oxcf.optimal_buffer_level*3>>2));
                         }
                         else
@@ -903,9 +867,9 @@
                 if ((cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER)
                      && (cpi->buffer_level > cpi->oxcf.optimal_buffer_level))
                 {
-                    percent_high = (cpi->buffer_level
+                    percent_high = (int)((cpi->buffer_level
                                     - cpi->oxcf.optimal_buffer_level)
-                                   / one_percent_bits;
+                                   / one_percent_bits);
                 }
                 else if (cpi->bits_off_target > cpi->oxcf.optimal_buffer_level)
                 {
@@ -921,11 +885,14 @@
                 cpi->this_frame_target += (cpi->this_frame_target *
                                           percent_high) / 200;
 
-                // Are we allowing control of active_worst_allowed_q according
-                // to buffer level.
+                /* Are we allowing control of active_worst_allowed_q according
+                 * to buffer level.
+                 */
                 if (cpi->auto_worst_q && cpi->ni_frames > 150)
                 {
-                    // When using the relaxed buffer model stick to the user specified value
+                    /* When using the relaxed buffer model stick to the
+                     * user specified value
+                     */
                     cpi->active_worst_quality = cpi->ni_av_qi;
                 }
                 else
@@ -934,26 +901,27 @@
                 }
             }
 
-            // Set active_best_quality to prevent quality rising too high
+            /* Set active_best_quality to prevent quality rising too high */
             cpi->active_best_quality = cpi->best_quality;
 
-            // Worst quality obviously must not be better than best quality
+            /* Worst quality obviously must not be better than best quality */
             if (cpi->active_worst_quality <= cpi->active_best_quality)
                 cpi->active_worst_quality = cpi->active_best_quality + 1;
 
             if(cpi->active_worst_quality > 127)
                 cpi->active_worst_quality = 127;
         }
-        // Unbuffered mode (eg. video conferencing)
+        /* Unbuffered mode (eg. video conferencing) */
         else
         {
-            // Set the active worst quality
+            /* Set the active worst quality */
             cpi->active_worst_quality = cpi->worst_quality;
         }
 
-        // Special trap for constrained quality mode
-        // "active_worst_quality" may never drop below cq level
-        // for any frame type.
+        /* Special trap for constrained quality mode
+         * "active_worst_quality" may never drop below cq level
+         * for any frame type.
+         */
         if ( cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY &&
              cpi->active_worst_quality < cpi->cq_target_quality)
         {
@@ -961,16 +929,19 @@
         }
     }
 
-    // Test to see if we have to drop a frame
-    // The auto-drop frame code is only used in buffered mode.
-    // In unbufferd mode (eg vide conferencing) the descision to
-    // code or drop a frame is made outside the codec in response to real
-    // world comms or buffer considerations.
-    if (cpi->drop_frames_allowed && cpi->buffered_mode &&
+    /* Test to see if we have to drop a frame
+     * The auto-drop frame code is only used in buffered mode.
+     * In unbufferd mode (eg vide conferencing) the descision to
+     * code or drop a frame is made outside the codec in response to real
+     * world comms or buffer considerations.
+     */
+    if (cpi->drop_frames_allowed &&
         (cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER) &&
-        ((cpi->common.frame_type != KEY_FRAME))) //|| !cpi->oxcf.allow_spatial_resampling) )
+        ((cpi->common.frame_type != KEY_FRAME)))
     {
-        // Check for a buffer underun-crisis in which case we have to drop a frame
+        /* Check for a buffer underun-crisis in which case we have to drop
+         * a frame
+         */
         if ((cpi->buffer_level < 0))
         {
 #if 0
@@ -981,41 +952,23 @@
                     (cpi->buffer_level * 100) / cpi->oxcf.optimal_buffer_level);
             fclose(f);
 #endif
-            //vpx_log("Decoder: Drop frame due to bandwidth: %d \n",cpi->buffer_level, cpi->av_per_frame_bandwidth);
-
             cpi->drop_frame = 1;
-        }
 
-#if 0
-        // Check for other drop frame crtieria (Note 2 pass cbr uses decimation on whole KF sections)
-        else if ((cpi->buffer_level < cpi->oxcf.drop_frames_water_mark * cpi->oxcf.optimal_buffer_level / 100) &&
-                 (cpi->drop_count < cpi->max_drop_count) && (cpi->pass == 0))
-        {
-            cpi->drop_frame = 1;
-        }
-
-#endif
-
-        if (cpi->drop_frame)
-        {
-            // Update the buffer level variable.
+            /* Update the buffer level variable. */
             cpi->bits_off_target += cpi->av_per_frame_bandwidth;
             if (cpi->bits_off_target > cpi->oxcf.maximum_buffer_size)
-              cpi->bits_off_target = cpi->oxcf.maximum_buffer_size;
+              cpi->bits_off_target = (int)cpi->oxcf.maximum_buffer_size;
             cpi->buffer_level = cpi->bits_off_target;
         }
-        else
-            cpi->drop_count = 0;
     }
 
-    // Adjust target frame size for Golden Frames:
+    /* Adjust target frame size for Golden Frames: */
     if (cpi->oxcf.error_resilient_mode == 0 &&
         (cpi->frames_till_gf_update_due == 0) && !cpi->drop_frame)
     {
-        //int Boost = 0;
         int Q = (cpi->oxcf.fixed_q < 0) ? cpi->last_q[INTER_FRAME] : cpi->oxcf.fixed_q;
 
-        int gf_frame_useage = 0;      // Golden frame useage since last GF
+        int gf_frame_useage = 0;      /* Golden frame useage since last GF */
         int tot_mbs = cpi->recent_ref_frame_usage[INTRA_FRAME]  +
                       cpi->recent_ref_frame_usage[LAST_FRAME]   +
                       cpi->recent_ref_frame_usage[GOLDEN_FRAME] +
@@ -1023,30 +976,29 @@
 
         int pct_gf_active = (100 * cpi->gf_active_count) / (cpi->common.mb_rows * cpi->common.mb_cols);
 
-        // Reset the last boost indicator
-        //cpi->last_boost = 100;
-
         if (tot_mbs)
             gf_frame_useage = (cpi->recent_ref_frame_usage[GOLDEN_FRAME] + cpi->recent_ref_frame_usage[ALTREF_FRAME]) * 100 / tot_mbs;
 
         if (pct_gf_active > gf_frame_useage)
             gf_frame_useage = pct_gf_active;
 
-        // Is a fixed manual GF frequency being used
+        /* Is a fixed manual GF frequency being used */
         if (cpi->auto_gold)
         {
-            // For one pass throw a GF if recent frame intra useage is low or the GF useage is high
+            /* For one pass throw a GF if recent frame intra useage is
+             * low or the GF useage is high
+             */
             if ((cpi->pass == 0) && (cpi->this_frame_percent_intra < 15 || gf_frame_useage >= 5))
                 cpi->common.refresh_golden_frame = 1;
 
-            // Two pass GF descision
+            /* Two pass GF descision */
             else if (cpi->pass == 2)
                 cpi->common.refresh_golden_frame = 1;
         }
 
 #if 0
 
-        // Debug stats
+        /* Debug stats */
         if (0)
         {
             FILE *f;
@@ -1063,7 +1015,7 @@
         {
 #if 0
 
-            if (0)   // p_gw
+            if (0)
             {
                 FILE *f;
 
@@ -1079,16 +1031,20 @@
                 calc_gf_params(cpi);
             }
 
-            // If we are using alternate ref instead of gf then do not apply the boost
-            // It will instead be applied to the altref update
-            // Jims modified boost
+            /* If we are using alternate ref instead of gf then do not apply the
+             * boost It will instead be applied to the altref update Jims
+             * modified boost
+             */
             if (!cpi->source_alt_ref_active)
             {
                 if (cpi->oxcf.fixed_q < 0)
                 {
                     if (cpi->pass == 2)
                     {
-                        cpi->this_frame_target = cpi->per_frame_bandwidth;          // The spend on the GF is defined in the two pass code for two pass encodes
+                        /* The spend on the GF is defined in the two pass
+                         * code for two pass encodes
+                         */
+                        cpi->this_frame_target = cpi->per_frame_bandwidth;
                     }
                     else
                     {
@@ -1097,14 +1053,16 @@
                         int allocation_chunks = (frames_in_section * 100) + (Boost - 100);
                         int bits_in_section = cpi->inter_frame_target * frames_in_section;
 
-                        // Normalize Altboost and allocations chunck down to prevent overflow
+                        /* Normalize Altboost and allocations chunck down to
+                         * prevent overflow
+                         */
                         while (Boost > 1000)
                         {
                             Boost /= 2;
                             allocation_chunks /= 2;
                         }
 
-                        // Avoid loss of precision but avoid overflow
+                        /* Avoid loss of precision but avoid overflow */
                         if ((bits_in_section >> 7) > allocation_chunks)
                             cpi->this_frame_target = Boost * (bits_in_section / allocation_chunks);
                         else
@@ -1117,10 +1075,11 @@
                          * cpi->last_boost) / 100;
 
             }
-            // If there is an active ARF at this location use the minimum
-            // bits on this frame even if it is a contructed arf.
-            // The active maximum quantizer insures that an appropriate
-            // number of bits will be spent if needed for contstructed ARFs.
+            /* If there is an active ARF at this location use the minimum
+             * bits on this frame even if it is a contructed arf.
+             * The active maximum quantizer insures that an appropriate
+             * number of bits will be spent if needed for contstructed ARFs.
+             */
             else
             {
                 cpi->this_frame_target = 0;
@@ -1144,8 +1103,8 @@
 
     int    projected_size_based_on_q = 0;
 
-    // Clear down mmx registers to allow floating point in what follows
-    vp8_clear_system_state();  //__asm emms;
+    /* Clear down mmx registers to allow floating point in what follows */
+    vp8_clear_system_state();
 
     if (cpi->common.frame_type == KEY_FRAME)
     {
@@ -1159,17 +1118,18 @@
             rate_correction_factor = cpi->rate_correction_factor;
     }
 
-    // Work out how big we would have expected the frame to be at this Q given the current correction factor.
-    // Stay in double to avoid int overflow when values are large
-    //projected_size_based_on_q = ((int)(.5 + rate_correction_factor * vp8_bits_per_mb[cpi->common.frame_type][Q]) * cpi->common.MBs) >> BPER_MB_NORMBITS;
+    /* Work out how big we would have expected the frame to be at this Q
+     * given the current correction factor. Stay in double to avoid int
+     * overflow when values are large
+     */
     projected_size_based_on_q = (int)(((.5 + rate_correction_factor * vp8_bits_per_mb[cpi->common.frame_type][Q]) * cpi->common.MBs) / (1 << BPER_MB_NORMBITS));
 
-    // Make some allowance for cpi->zbin_over_quant
+    /* Make some allowance for cpi->zbin_over_quant */
     if (cpi->zbin_over_quant > 0)
     {
         int Z = cpi->zbin_over_quant;
         double Factor = 0.99;
-        double factor_adjustment = 0.01 / 256.0; //(double)ZBIN_OQ_MAX;
+        double factor_adjustment = 0.01 / 256.0;
 
         while (Z > 0)
         {
@@ -1183,13 +1143,13 @@
         }
     }
 
-    // Work out a size correction factor.
-    //if ( cpi->this_frame_target > 0 )
-    //  correction_factor = (100 * cpi->projected_frame_size) / cpi->this_frame_target;
+    /* Work out a size correction factor. */
     if (projected_size_based_on_q > 0)
         correction_factor = (100 * cpi->projected_frame_size) / projected_size_based_on_q;
 
-    // More heavily damped adjustment used if we have been oscillating either side of target
+    /* More heavily damped adjustment used if we have been oscillating
+     * either side of target
+     */
     switch (damp_var)
     {
     case 0:
@@ -1204,25 +1164,23 @@
         break;
     }
 
-    //if ( (correction_factor > 102) && (Q < cpi->active_worst_quality) )
     if (correction_factor > 102)
     {
-        // We are not already at the worst allowable quality
+        /* We are not already at the worst allowable quality */
         correction_factor = (int)(100.5 + ((correction_factor - 100) * adjustment_limit));
         rate_correction_factor = ((rate_correction_factor * correction_factor) / 100);
 
-        // Keep rate_correction_factor within limits
+        /* Keep rate_correction_factor within limits */
         if (rate_correction_factor > MAX_BPB_FACTOR)
             rate_correction_factor = MAX_BPB_FACTOR;
     }
-    //else if ( (correction_factor < 99) && (Q > cpi->active_best_quality) )
     else if (correction_factor < 99)
     {
-        // We are not already at the best allowable quality
+        /* We are not already at the best allowable quality */
         correction_factor = (int)(100.5 - ((100 - correction_factor) * adjustment_limit));
         rate_correction_factor = ((rate_correction_factor * correction_factor) / 100);
 
-        // Keep rate_correction_factor within limits
+        /* Keep rate_correction_factor within limits */
         if (rate_correction_factor < MIN_BPB_FACTOR)
             rate_correction_factor = MIN_BPB_FACTOR;
     }
@@ -1243,7 +1201,7 @@
 {
     int Q = cpi->active_worst_quality;
 
-    // Reset Zbin OQ value
+    /* Reset Zbin OQ value */
     cpi->zbin_over_quant = 0;
 
     if (cpi->oxcf.fixed_q >= 0)
@@ -1272,7 +1230,7 @@
         int bits_per_mb_at_this_q;
         double correction_factor;
 
-        // Select the appropriate correction factor based upon type of frame.
+        /* Select the appropriate correction factor based upon type of frame. */
         if (cpi->common.frame_type == KEY_FRAME)
             correction_factor = cpi->key_frame_rate_correction_factor;
         else
@@ -1283,9 +1241,12 @@
                 correction_factor = cpi->rate_correction_factor;
         }
 
-        // Calculate required scaling factor based on target frame size and size of frame produced using previous Q
+        /* Calculate required scaling factor based on target frame size and
+         * size of frame produced using previous Q
+         */
         if (target_bits_per_frame >= (INT_MAX >> BPER_MB_NORMBITS))
-            target_bits_per_mb = (target_bits_per_frame / cpi->common.MBs) << BPER_MB_NORMBITS;       // Case where we would overflow int
+            /* Case where we would overflow int */
+            target_bits_per_mb = (target_bits_per_frame / cpi->common.MBs) << BPER_MB_NORMBITS;
         else
             target_bits_per_mb = (target_bits_per_frame << BPER_MB_NORMBITS) / cpi->common.MBs;
 
@@ -1310,17 +1271,19 @@
         while (++i <= cpi->active_worst_quality);
 
 
-        // If we are at MAXQ then enable Q over-run which seeks to claw back additional bits through things like
-        // the RD multiplier and zero bin size.
+        /* If we are at MAXQ then enable Q over-run which seeks to claw
+         * back additional bits through things like the RD multiplier
+         * and zero bin size.
+         */
         if (Q >= MAXQ)
         {
             int zbin_oqmax;
 
             double Factor = 0.99;
-            double factor_adjustment = 0.01 / 256.0; //(double)ZBIN_OQ_MAX;
+            double factor_adjustment = 0.01 / 256.0;
 
             if (cpi->common.frame_type == KEY_FRAME)
-                zbin_oqmax = 0; //ZBIN_OQ_MAX/16
+                zbin_oqmax = 0;
             else if (cpi->common.refresh_alt_ref_frame || (cpi->common.refresh_golden_frame && !cpi->source_alt_ref_active))
                 zbin_oqmax = 16;
             else
@@ -1340,10 +1303,13 @@
                 cpi->zbin_over_quant = (int)Oq;
             }*/
 
-            // Each incrment in the zbin is assumed to have a fixed effect on bitrate. This is not of course true.
-            // The effect will be highly clip dependent and may well have sudden steps.
-            // The idea here is to acheive higher effective quantizers than the normal maximum by expanding the zero
-            // bin and hence decreasing the number of low magnitude non zero coefficients.
+            /* Each increment in the zbin is assumed to have a fixed effect
+             * on bitrate. This is not of course true. The effect will be
+             * highly clip dependent and may well have sudden steps. The
+             * idea here is to achieve higher effective quantizers than the
+             * normal maximum by expanding the zero bin and hence
+             * decreasing the number of low magnitude non zero coefficients.
+             */
             while (cpi->zbin_over_quant < zbin_oqmax)
             {
                 cpi->zbin_over_quant ++;
@@ -1351,14 +1317,15 @@
                 if (cpi->zbin_over_quant > zbin_oqmax)
                     cpi->zbin_over_quant = zbin_oqmax;
 
-                // Adjust bits_per_mb_at_this_q estimate
+                /* Adjust bits_per_mb_at_this_q estimate */
                 bits_per_mb_at_this_q = (int)(Factor * bits_per_mb_at_this_q);
                 Factor += factor_adjustment;
 
                 if (Factor  >= 0.999)
                     Factor = 0.999;
 
-                if (bits_per_mb_at_this_q <= target_bits_per_mb)    // Break out if we get down to the target rate
+                /* Break out if we get down to the target rate */
+                if (bits_per_mb_at_this_q <= target_bits_per_mb)
                     break;
             }
 
@@ -1373,7 +1340,7 @@
 {
     int i;
 
-    // Average key frame frequency
+    /* Average key frame frequency */
     int av_key_frame_frequency = 0;
 
     /* First key frame at start of sequence is a special case. We have no
@@ -1424,11 +1391,11 @@
 
 void vp8_adjust_key_frame_context(VP8_COMP *cpi)
 {
-    // Clear down mmx registers to allow floating point in what follows
+    /* Clear down mmx registers to allow floating point in what follows */
     vp8_clear_system_state();
 
-    // Do we have any key frame overspend to recover?
-    // Two-pass overspend handled elsewhere.
+    /* Do we have any key frame overspend to recover? */
+    /* Two-pass overspend handled elsewhere. */
     if ((cpi->pass != 2)
          && (cpi->projected_frame_size > cpi->per_frame_bandwidth))
     {
@@ -1462,10 +1429,12 @@
 
 void vp8_compute_frame_size_bounds(VP8_COMP *cpi, int *frame_under_shoot_limit, int *frame_over_shoot_limit)
 {
-    // Set-up bounds on acceptable frame size:
+    /* Set-up bounds on acceptable frame size: */
     if (cpi->oxcf.fixed_q >= 0)
     {
-        // Fixed Q scenario: frame size never outranges target (there is no target!)
+        /* Fixed Q scenario: frame size never outranges target
+         * (there is no target!)
+         */
         *frame_under_shoot_limit = 0;
         *frame_over_shoot_limit  = INT_MAX;
     }
@@ -1487,18 +1456,22 @@
             }
             else
             {
-                // For CBR take buffer fullness into account
+                /* For CBR take buffer fullness into account */
                 if (cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER)
                 {
                     if (cpi->buffer_level >= ((cpi->oxcf.optimal_buffer_level + cpi->oxcf.maximum_buffer_size) >> 1))
                     {
-                        // Buffer is too full so relax overshoot and tighten undershoot
+                        /* Buffer is too full so relax overshoot and tighten
+                         * undershoot
+                         */
                         *frame_over_shoot_limit  = cpi->this_frame_target * 12 / 8;
                         *frame_under_shoot_limit = cpi->this_frame_target * 6 / 8;
                     }
                     else if (cpi->buffer_level <= (cpi->oxcf.optimal_buffer_level >> 1))
                     {
-                        // Buffer is too low so relax undershoot and tighten overshoot
+                        /* Buffer is too low so relax undershoot and tighten
+                         * overshoot
+                         */
                         *frame_over_shoot_limit  = cpi->this_frame_target * 10 / 8;
                         *frame_under_shoot_limit = cpi->this_frame_target * 4 / 8;
                     }
@@ -1508,11 +1481,13 @@
                         *frame_under_shoot_limit = cpi->this_frame_target * 5 / 8;
                     }
                 }
-                // VBR and CQ mode
-                // Note that tighter restrictions here can help quality but hurt encode speed
+                /* VBR and CQ mode */
+                /* Note that tighter restrictions here can help quality
+                 * but hurt encode speed
+                 */
                 else
                 {
-                    // Stron overshoot limit for constrained quality
+                    /* Strong overshoot limit for constrained quality */
                     if (cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY)
                     {
                         *frame_over_shoot_limit  = cpi->this_frame_target * 11 / 8;
@@ -1527,9 +1502,10 @@
             }
         }
 
-        // For very small rate targets where the fractional adjustment
-        // (eg * 7/8) may be tiny make sure there is at least a minimum
-        // range.
+        /* For very small rate targets where the fractional adjustment
+         * (eg * 7/8) may be tiny make sure there is at least a minimum
+         * range.
+         */
         *frame_over_shoot_limit += 200;
         *frame_under_shoot_limit -= 200;
         if ( *frame_under_shoot_limit < 0 )
@@ -1539,7 +1515,7 @@
 }
 
 
-// return of 0 means drop frame
+/* return of 0 means drop frame */
 int vp8_pick_frame_size(VP8_COMP *cpi)
 {
     VP8_COMMON *cm = &cpi->common;
@@ -1550,11 +1526,10 @@
     {
         calc_pframe_target_size(cpi);
 
-        // Check if we're dropping the frame:
+        /* Check if we're dropping the frame: */
         if (cpi->drop_frame)
         {
             cpi->drop_frame = 0;
-            cpi->drop_count++;
             return 0;
         }
     }
diff --git a/vp8/encoder/ratectrl.h b/vp8/encoder/ratectrl.h
index d4f7796..c43f08d 100644
--- a/vp8/encoder/ratectrl.h
+++ b/vp8/encoder/ratectrl.h
@@ -22,7 +22,7 @@
 extern void vp8_adjust_key_frame_context(VP8_COMP *cpi);
 extern void vp8_compute_frame_size_bounds(VP8_COMP *cpi, int *frame_under_shoot_limit, int *frame_over_shoot_limit);
 
-// return of 0 means drop frame
+/* return of 0 means drop frame */
 extern int vp8_pick_frame_size(VP8_COMP *cpi);
 
 #endif
diff --git a/vp8/encoder/rdopt.c b/vp8/encoder/rdopt.c
index b457f03..28d5c1e 100644
--- a/vp8/encoder/rdopt.c
+++ b/vp8/encoder/rdopt.c
@@ -160,7 +160,9 @@
         for (j = 0; j < COEF_BANDS; j++)
             for (k = 0; k < PREV_COEF_CONTEXTS; k++)
 
-                // check for pt=0 and band > 1 if block type 0 and 0 if blocktype 1
+                /* check for pt=0 and band > 1 if block type 0
+                 * and 0 if blocktype 1
+                 */
                 if (k == 0 && j > (i == 0))
                     vp8_cost_tokens2(c[i][j][k], p [i][j][k], vp8_coef_tree, 2);
                 else
@@ -228,22 +230,22 @@
     double capped_q = (Qvalue < 160) ? (double)Qvalue : 160.0;
     double rdconst = 2.80;
 
-    vp8_clear_system_state();  //__asm emms;
+    vp8_clear_system_state();
 
-    // Further tests required to see if optimum is different
-    // for key frames, golden frames and arf frames.
-    // if (cpi->common.refresh_golden_frame ||
-    //     cpi->common.refresh_alt_ref_frame)
+    /* Further tests required to see if optimum is different
+     * for key frames, golden frames and arf frames.
+     */
     cpi->RDMULT = (int)(rdconst * (capped_q * capped_q));
 
-    // Extend rate multiplier along side quantizer zbin increases
+    /* Extend rate multiplier along side quantizer zbin increases */
     if (cpi->zbin_over_quant  > 0)
     {
         double oq_factor;
         double modq;
 
-        // Experimental code using the same basic equation as used for Q above
-        // The units of cpi->zbin_over_quant are 1/128 of Q bin size
+        /* Experimental code using the same basic equation as used for Q above
+         * The units of cpi->zbin_over_quant are 1/128 of Q bin size
+         */
         oq_factor = 1.0 + ((double)0.0015625 * cpi->zbin_over_quant);
         modq = (int)((double)capped_q * oq_factor);
         cpi->RDMULT = (int)(rdconst * (modq * modq));
@@ -307,7 +309,7 @@
     }
 
     {
-      // build token cost array for the type of frame we have now
+      /* build token cost array for the type of frame we have now */
       FRAME_CONTEXT *l = &cpi->lfc_n;
 
       if(cpi->common.refresh_alt_ref_frame)
@@ -326,12 +328,8 @@
       */
 
 
-      // TODO make these mode costs depend on last,alt or gold too.  (jbb)
+      /* TODO make these mode costs depend on last,alt or gold too.  (jbb) */
       vp8_init_mode_costs(cpi);
-
-      // TODO figure onnnnuut why making mv cost frame type dependent didn't help (jbb)
-      //vp8_build_component_cost_table(cpi->mb.mvcost, (const MV_CONTEXT *) l->mvc, flags);
-
     }
 
 }
@@ -356,14 +354,6 @@
 
 #endif
 
-    /*
-    // this is done during parameter valid check
-    if( cpi->oxcf.cpu_used > 16)
-        cpi->oxcf.cpu_used = 16;
-    if( cpi->oxcf.cpu_used < -16)
-        cpi->oxcf.cpu_used = -16;
-    */
-
     if (cpi->avg_pick_mode_time < milliseconds_for_compress && (cpi->avg_encode_time - cpi->avg_pick_mode_time) < milliseconds_for_compress)
     {
         if (cpi->avg_pick_mode_time == 0)
@@ -390,10 +380,10 @@
                 cpi->avg_pick_mode_time = 0;
                 cpi->avg_encode_time = 0;
 
-                // In real-time mode, cpi->speed is in [4, 16].
-                if (cpi->Speed < 4)        //if ( cpi->Speed < 0 )
+                /* In real-time mode, cpi->speed is in [4, 16]. */
+                if (cpi->Speed < 4)
                 {
-                    cpi->Speed = 4;        //cpi->Speed = 0;
+                    cpi->Speed = 4;
                 }
             }
         }
@@ -549,7 +539,7 @@
     if (c < 16)
         cost += mb->token_costs [type] [vp8_coef_bands[c]] [pt] [DCT_EOB_TOKEN];
 
-    pt = (c != !type); // is eob first coefficient;
+    pt = (c != !type); /* is eob first coefficient; */
     *a = *l = pt;
 
     return cost;
@@ -595,7 +585,7 @@
     vp8_subtract_mby( mb->src_diff, *(mb->block[0].base_src),
         mb->block[0].src_stride,  mb->e_mbd.predictor, 16);
 
-    // Fdct and building the 2nd order block
+    /* Fdct and building the 2nd order block */
     for (beptr = mb->block; beptr < mb->block + 16; beptr += 2)
     {
         mb->short_fdct8x4(beptr->src_diff, beptr->coeff, 32);
@@ -603,25 +593,25 @@
         *Y2DCPtr++ = beptr->coeff[16];
     }
 
-    // 2nd order fdct
+    /* 2nd order fdct */
     mb->short_walsh4x4(mb_y2->src_diff, mb_y2->coeff, 8);
 
-    // Quantization
+    /* Quantization */
     for (b = 0; b < 16; b++)
     {
         mb->quantize_b(&mb->block[b], &mb->e_mbd.block[b]);
     }
 
-    // DC predication and Quantization of 2nd Order block
+    /* DC prediction and Quantization of 2nd Order block */
     mb->quantize_b(mb_y2, x_y2);
 
-    // Distortion
+    /* Distortion */
     d = vp8_mbblock_error(mb, 1) << 2;
     d += vp8_block_error(mb_y2->coeff, x_y2->dqcoeff);
 
     *Distortion = (d >> 4);
 
-    // rate
+    /* rate */
     *Rate = vp8_rdcost_mby(mb);
 }
 
@@ -663,7 +653,11 @@
     DECLARE_ALIGNED_ARRAY(16, unsigned char,  best_predictor, 16*4);
     DECLARE_ALIGNED_ARRAY(16, short, best_dqcoeff, 16);
     int dst_stride = x->e_mbd.dst.y_stride;
-    unsigned char *base_dst = x->e_mbd.dst.y_buffer;
+    unsigned char *dst = x->e_mbd.dst.y_buffer + b->offset;
+
+    unsigned char *Above = dst - dst_stride;
+    unsigned char *yleft = dst - 1;
+    unsigned char top_left = Above[-1];
 
     for (mode = B_DC_PRED; mode <= B_HU_PRED; mode++)
     {
@@ -672,8 +666,8 @@
 
         rate = bmode_costs[mode];
 
-        vp8_intra4x4_predict(base_dst + b->offset, dst_stride, mode,
-                             b->predictor, 16);
+        vp8_intra4x4_predict(Above, yleft, dst_stride, mode,
+                             b->predictor, 16, top_left);
         vp8_subtract_b(be, b, 16);
         x->short_fdct4x4(be->src_diff, be->coeff, 32);
         x->quantize_b(be, b);
@@ -700,10 +694,9 @@
             vpx_memcpy(best_dqcoeff, b->dqcoeff, 32);
         }
     }
-    b->bmi.as_mode = (B_PREDICTION_MODE)(*best_mode);
+    b->bmi.as_mode = *best_mode;
 
-    vp8_short_idct4x4llm(best_dqcoeff, best_predictor, 16, base_dst + b->offset,
-                         dst_stride);
+    vp8_short_idct4x4llm(best_dqcoeff, best_predictor, 16, dst, dst_stride);
 
     return best_rd;
 }
@@ -787,7 +780,7 @@
     int this_rd;
     MACROBLOCKD *xd = &x->e_mbd;
 
-    //Y Search for 16x16 intra prediction mode
+    /* Y Search for 16x16 intra prediction mode */
     for (mode = DC_PRED; mode <= TM_PRED; mode++)
     {
         xd->mode_info_context->mbmi.mode = mode;
@@ -984,8 +977,9 @@
             m = ABOVE4X4;
         else
         {
-            // the only time we should do costing for new motion vector or mode
-            // is when we are on a new label  (jbb May 08, 2007)
+            /* the only time we should do costing for new motion vector
+             * or mode is when we are on a new label  (jbb May 08, 2007)
+             */
             switch (m = this_mode)
             {
             case NEW4X4 :
@@ -1004,7 +998,7 @@
                 break;
             }
 
-            if (m == ABOVE4X4)  // replace above with left if same
+            if (m == ABOVE4X4)  /* replace above with left if same */
             {
                 int_mv left_mv;
 
@@ -1065,9 +1059,6 @@
             vp8_build_inter_predictors_b(bd, 16, base_pre, pre_stride, x->e_mbd.subpixel_predict);
             vp8_subtract_b(be, bd, 16);
             x->short_fdct4x4(be->src_diff, be->coeff, 32);
-
-            // set to 0 no way to account for 2nd order DC so discount
-            //be->coeff[0] = 0;
             x->quantize_b(be, bd);
 
             distortion += vp8_block_error(be->coeff, bd->dqcoeff);
@@ -1098,8 +1089,8 @@
   int mvthresh;
   int *mdcounts;
 
-  int_mv sv_mvp[4];     // save 4 mvp from 8x8
-  int sv_istep[2];  // save 2 initial step_param for 16x8/8x16
+  int_mv sv_mvp[4]; /* save 4 mvp from 8x8 */
+  int sv_istep[2];  /* save 2 initial step_param for 16x8/8x16 */
 
 } BEST_SEG_INFO;
 
@@ -1146,13 +1137,13 @@
     labels = vp8_mbsplits[segmentation];
     label_count = vp8_mbsplit_count[segmentation];
 
-    // 64 makes this threshold really big effectively
-    // making it so that we very rarely check mvs on
-    // segments.   setting this to 1 would make mv thresh
-    // roughly equal to what it is for macroblocks
+    /* 64 makes this threshold really big effectively making it so that we
+     * very rarely check mvs on segments.   setting this to 1 would make mv
+     * thresh roughly equal to what it is for macroblocks
+     */
     label_mv_thresh = 1 * bsi->mvthresh / label_count ;
 
-    // Segmentation method overheads
+    /* Segmentation method overheads */
     rate = vp8_cost_token(vp8_mbsplit_tree, vp8_mbsplit_probs, vp8_mbsplit_encodings + segmentation);
     rate += vp8_cost_mv_ref(SPLITMV, bsi->mdcounts);
     this_segment_rd += RDCOST(x->rdmult, x->rddiv, rate, 0);
@@ -1165,7 +1156,7 @@
         B_PREDICTION_MODE mode_selected = ZERO4X4;
         int bestlabelyrate = 0;
 
-        // search for the best motion vector on this segment
+        /* search for the best motion vector on this segment */
         for (this_mode = LEFT4X4; this_mode <= NEW4X4 ; this_mode ++)
         {
             int this_rd;
@@ -1194,7 +1185,9 @@
                 BLOCK *c;
                 BLOCKD *e;
 
-                // Is the best so far sufficiently good that we cant justify doing and new motion search.
+                /* Is the best so far sufficiently good that we can't justify
+                 * doing a new motion search.
+                 */
                 if (best_label_rd < label_mv_thresh)
                     break;
 
@@ -1209,7 +1202,9 @@
                         step_param = bsi->sv_istep[i];
                     }
 
-                    // use previous block's result as next block's MV predictor.
+                    /* use previous block's result as next block's MV
+                     * predictor.
+                     */
                     if (segmentation == BLOCK_4X4 && i>0)
                     {
                         bsi->mvp.as_int = x->e_mbd.block[i-1].bmi.mv.as_int;
@@ -1228,7 +1223,7 @@
                     mvp_full.as_mv.row = bsi->mvp.as_mv.row >>3;
                     mvp_full.as_mv.col = bsi->mvp.as_mv.col >>3;
 
-                    // find first label
+                    /* find first label */
                     n = vp8_mbsplit_offset[segmentation][i];
 
                     c = &x->block[n];
@@ -1268,7 +1263,7 @@
 
                     sseshift = segmentation_to_sseshift[segmentation];
 
-                    // Should we do a full search (best quality only)
+                    /* Should we do a full search (best quality only) */
                     if ((cpi->compressor_speed == 0) && (bestsme >> sseshift) > 4000)
                     {
                         /* Check if mvp_full is within the range. */
@@ -1285,7 +1280,9 @@
                         }
                         else
                         {
-                            // The full search result is actually worse so re-instate the previous best vector
+                            /* The full search result is actually worse so
+                             * re-instate the previous best vector
+                             */
                             e->bmi.mv.as_int = mode_mv[NEW4X4].as_int;
                         }
                     }
@@ -1305,7 +1302,7 @@
             rate = labels2mode(x, labels, i, this_mode, &mode_mv[this_mode],
                                bsi->ref_mv, x->mvcost);
 
-            // Trap vectors that reach beyond the UMV borders
+            /* Trap vectors that reach beyond the UMV borders */
             if (((mode_mv[this_mode].as_mv.row >> 3) < x->mv_row_min) || ((mode_mv[this_mode].as_mv.row >> 3) > x->mv_row_max) ||
                 ((mode_mv[this_mode].as_mv.col >> 3) < x->mv_col_min) || ((mode_mv[this_mode].as_mv.col >> 3) > x->mv_col_max))
             {
@@ -1357,7 +1354,7 @@
         bsi->segment_rd = this_segment_rd;
         bsi->segment_num = segmentation;
 
-        // store everything needed to come back to this!!
+        /* store everything needed to come back to this!! */
         for (i = 0; i < 16; i++)
         {
             bsi->mvs[i].as_mv = x->partition_info->bmi[i].mv.as_mv;
@@ -1519,7 +1516,7 @@
     return bsi.segment_rd;
 }
 
-//The improved MV prediction
+/* The improved MV prediction */
 void vp8_mv_pred
 (
     VP8_COMP *cpi,
@@ -1553,7 +1550,9 @@
         near_mvs[0].as_int = near_mvs[1].as_int = near_mvs[2].as_int = near_mvs[3].as_int = near_mvs[4].as_int = near_mvs[5].as_int = near_mvs[6].as_int = near_mvs[7].as_int = 0;
         near_ref[0] = near_ref[1] = near_ref[2] = near_ref[3] = near_ref[4] = near_ref[5] = near_ref[6] = near_ref[7] = 0;
 
-        // read in 3 nearby block's MVs from current frame as prediction candidates.
+        /* read in 3 nearby block's MVs from current frame as prediction
+         * candidates.
+         */
         if (above->mbmi.ref_frame != INTRA_FRAME)
         {
             near_mvs[vcnt].as_int = above->mbmi.mv.as_int;
@@ -1576,12 +1575,12 @@
         }
         vcnt++;
 
-        // read in 5 nearby block's MVs from last frame.
+        /* read in 5 nearby block's MVs from last frame. */
         if(cpi->common.last_frame_type != KEY_FRAME)
         {
             mb_offset = (-xd->mb_to_top_edge/128 + 1) * (xd->mode_info_stride +1) + (-xd->mb_to_left_edge/128 +1) ;
 
-            // current in last frame
+            /* current in last frame */
             if (cpi->lf_ref_frame[mb_offset] != INTRA_FRAME)
             {
                 near_mvs[vcnt].as_int = cpi->lfmv[mb_offset].as_int;
@@ -1590,7 +1589,7 @@
             }
             vcnt++;
 
-            // above in last frame
+            /* above in last frame */
             if (cpi->lf_ref_frame[mb_offset - xd->mode_info_stride-1] != INTRA_FRAME)
             {
                 near_mvs[vcnt].as_int = cpi->lfmv[mb_offset - xd->mode_info_stride-1].as_int;
@@ -1599,7 +1598,7 @@
             }
             vcnt++;
 
-            // left in last frame
+            /* left in last frame */
             if (cpi->lf_ref_frame[mb_offset-1] != INTRA_FRAME)
             {
                 near_mvs[vcnt].as_int = cpi->lfmv[mb_offset -1].as_int;
@@ -1608,7 +1607,7 @@
             }
             vcnt++;
 
-            // right in last frame
+            /* right in last frame */
             if (cpi->lf_ref_frame[mb_offset +1] != INTRA_FRAME)
             {
                 near_mvs[vcnt].as_int = cpi->lfmv[mb_offset +1].as_int;
@@ -1617,7 +1616,7 @@
             }
             vcnt++;
 
-            // below in last frame
+            /* below in last frame */
             if (cpi->lf_ref_frame[mb_offset + xd->mode_info_stride +1] != INTRA_FRAME)
             {
                 near_mvs[vcnt].as_int = cpi->lfmv[mb_offset + xd->mode_info_stride +1].as_int;
@@ -1658,7 +1657,9 @@
             mv.as_mv.col = mvy[vcnt/2];
 
             find = 1;
-            //sr is set to 0 to allow calling function to decide the search range.
+            /* sr is set to 0 to allow calling function to decide the search
+             * range.
+             */
             *sr = 0;
         }
     }
@@ -1670,33 +1671,36 @@
 
 void vp8_cal_sad(VP8_COMP *cpi, MACROBLOCKD *xd, MACROBLOCK *x, int recon_yoffset, int near_sadidx[])
 {
-
-    int near_sad[8] = {0}; // 0-cf above, 1-cf left, 2-cf aboveleft, 3-lf current, 4-lf above, 5-lf left, 6-lf right, 7-lf below
+    /* near_sad indexes:
+     *   0-cf above, 1-cf left, 2-cf aboveleft,
+     *   3-lf current, 4-lf above, 5-lf left, 6-lf right, 7-lf below
+     */
+    int near_sad[8] = {0};
     BLOCK *b = &x->block[0];
     unsigned char *src_y_ptr = *(b->base_src);
 
-    //calculate sad for current frame 3 nearby MBs.
+    /* calculate sad for current frame 3 nearby MBs. */
     if( xd->mb_to_top_edge==0 && xd->mb_to_left_edge ==0)
     {
         near_sad[0] = near_sad[1] = near_sad[2] = INT_MAX;
     }else if(xd->mb_to_top_edge==0)
-    {   //only has left MB for sad calculation.
+    {   /* only has left MB for sad calculation. */
         near_sad[0] = near_sad[2] = INT_MAX;
-        near_sad[1] = cpi->fn_ptr[BLOCK_16X16].sdf(src_y_ptr, b->src_stride, xd->dst.y_buffer - 16,xd->dst.y_stride, 0x7fffffff);
+        near_sad[1] = cpi->fn_ptr[BLOCK_16X16].sdf(src_y_ptr, b->src_stride, xd->dst.y_buffer - 16,xd->dst.y_stride, UINT_MAX);
     }else if(xd->mb_to_left_edge ==0)
-    {   //only has left MB for sad calculation.
+    {   /* only has above MB for sad calculation. */
         near_sad[1] = near_sad[2] = INT_MAX;
-        near_sad[0] = cpi->fn_ptr[BLOCK_16X16].sdf(src_y_ptr, b->src_stride, xd->dst.y_buffer - xd->dst.y_stride *16,xd->dst.y_stride, 0x7fffffff);
+        near_sad[0] = cpi->fn_ptr[BLOCK_16X16].sdf(src_y_ptr, b->src_stride, xd->dst.y_buffer - xd->dst.y_stride *16,xd->dst.y_stride, UINT_MAX);
     }else
     {
-        near_sad[0] = cpi->fn_ptr[BLOCK_16X16].sdf(src_y_ptr, b->src_stride, xd->dst.y_buffer - xd->dst.y_stride *16,xd->dst.y_stride, 0x7fffffff);
-        near_sad[1] = cpi->fn_ptr[BLOCK_16X16].sdf(src_y_ptr, b->src_stride, xd->dst.y_buffer - 16,xd->dst.y_stride, 0x7fffffff);
-        near_sad[2] = cpi->fn_ptr[BLOCK_16X16].sdf(src_y_ptr, b->src_stride, xd->dst.y_buffer - xd->dst.y_stride *16 -16,xd->dst.y_stride, 0x7fffffff);
+        near_sad[0] = cpi->fn_ptr[BLOCK_16X16].sdf(src_y_ptr, b->src_stride, xd->dst.y_buffer - xd->dst.y_stride *16,xd->dst.y_stride, UINT_MAX);
+        near_sad[1] = cpi->fn_ptr[BLOCK_16X16].sdf(src_y_ptr, b->src_stride, xd->dst.y_buffer - 16,xd->dst.y_stride, UINT_MAX);
+        near_sad[2] = cpi->fn_ptr[BLOCK_16X16].sdf(src_y_ptr, b->src_stride, xd->dst.y_buffer - xd->dst.y_stride *16 -16,xd->dst.y_stride, UINT_MAX);
     }
 
     if(cpi->common.last_frame_type != KEY_FRAME)
     {
-        //calculate sad for last frame 5 nearby MBs.
+        /* calculate sad for last frame 5 nearby MBs. */
         unsigned char *pre_y_buffer = cpi->common.yv12_fb[cpi->common.lst_fb_idx].y_buffer + recon_yoffset;
         int pre_y_stride = cpi->common.yv12_fb[cpi->common.lst_fb_idx].y_stride;
 
@@ -1706,14 +1710,14 @@
         if(xd->mb_to_bottom_edge==0) near_sad[7] = INT_MAX;
 
         if(near_sad[4] != INT_MAX)
-            near_sad[4] = cpi->fn_ptr[BLOCK_16X16].sdf(src_y_ptr, b->src_stride, pre_y_buffer - pre_y_stride *16, pre_y_stride, 0x7fffffff);
+            near_sad[4] = cpi->fn_ptr[BLOCK_16X16].sdf(src_y_ptr, b->src_stride, pre_y_buffer - pre_y_stride *16, pre_y_stride, UINT_MAX);
         if(near_sad[5] != INT_MAX)
-            near_sad[5] = cpi->fn_ptr[BLOCK_16X16].sdf(src_y_ptr, b->src_stride, pre_y_buffer - 16, pre_y_stride, 0x7fffffff);
-        near_sad[3] = cpi->fn_ptr[BLOCK_16X16].sdf(src_y_ptr, b->src_stride, pre_y_buffer, pre_y_stride, 0x7fffffff);
+            near_sad[5] = cpi->fn_ptr[BLOCK_16X16].sdf(src_y_ptr, b->src_stride, pre_y_buffer - 16, pre_y_stride, UINT_MAX);
+        near_sad[3] = cpi->fn_ptr[BLOCK_16X16].sdf(src_y_ptr, b->src_stride, pre_y_buffer, pre_y_stride, UINT_MAX);
         if(near_sad[6] != INT_MAX)
-            near_sad[6] = cpi->fn_ptr[BLOCK_16X16].sdf(src_y_ptr, b->src_stride, pre_y_buffer + 16, pre_y_stride, 0x7fffffff);
+            near_sad[6] = cpi->fn_ptr[BLOCK_16X16].sdf(src_y_ptr, b->src_stride, pre_y_buffer + 16, pre_y_stride, UINT_MAX);
         if(near_sad[7] != INT_MAX)
-            near_sad[7] = cpi->fn_ptr[BLOCK_16X16].sdf(src_y_ptr, b->src_stride, pre_y_buffer + pre_y_stride *16, pre_y_stride, 0x7fffffff);
+            near_sad[7] = cpi->fn_ptr[BLOCK_16X16].sdf(src_y_ptr, b->src_stride, pre_y_buffer + pre_y_stride *16, pre_y_stride, UINT_MAX);
     }
 
     if(cpi->common.last_frame_type != KEY_FRAME)
@@ -1787,7 +1791,7 @@
             if ((sse - var < q2dc * q2dc >>4) ||
                 (sse /2 > var && sse-var < 64))
             {
-                // Check u and v to make sure skip is ok
+                /* Check u and v to make sure skip is ok */
                 unsigned int sse2 = VP8_UVSSE(x);
                 if (sse2 * 2 < threshold)
                 {
@@ -1808,17 +1812,15 @@
     }
 
 
-    //intermodecost[mode_index] = vp8_cost_mv_ref(this_mode, mdcounts);   // Experimental debug code
-
-    // Add in the Mv/mode cost
+    /* Add in the Mv/mode cost */
     rd->rate2 += vp8_cost_mv_ref(this_mode, mdcounts);
 
-    // Y cost and distortion
+    /* Y cost and distortion */
     macro_block_yrd(x, &rd->rate_y, &distortion);
     rd->rate2 += rd->rate_y;
     rd->distortion2 += distortion;
 
-    // UV cost and distortion
+    /* UV cost and distortion */
     rd_inter16x16_uv(cpi, x, &rd->rate_uv, &rd->distortion_uv,
                      cpi->common.full_pixel);
     rd->rate2 += rd->rate_uv;
@@ -1835,9 +1837,11 @@
                                     VP8_COMP *cpi, MACROBLOCK *x)
 {
     MB_PREDICTION_MODE this_mode = x->e_mbd.mode_info_context->mbmi.mode;
-    // Where skip is allowable add in the default per mb cost for the no skip case.
-    // where we then decide to skip we have to delete this and replace it with the
-    // cost of signallying a skip
+
+    /* Where skip is allowable add in the default per mb cost for the no
+     * skip case. Where we then decide to skip we have to delete this and
+     * replace it with the cost of signalling a skip
+     */
     if (cpi->common.mb_no_coeff_skip)
     {
         *other_cost += vp8_cost_bit(cpi->prob_skip_false, 0);
@@ -1852,7 +1856,10 @@
 
     if (!disable_skip)
     {
-        // Test for the condition where skip block will be activated because there are no non zero coefficients and make any necessary adjustment for rate
+        /* Test for the condition where skip block will be activated
+         * because there are no non zero coefficients and make any
+         * necessary adjustment for rate
+         */
         if (cpi->common.mb_no_coeff_skip)
         {
             int i;
@@ -1877,10 +1884,10 @@
             if (tteob == 0)
             {
                 rd->rate2 -= (rd->rate_y + rd->rate_uv);
-                //for best_yrd calculation
+                /* for best_yrd calculation */
                 rd->rate_uv = 0;
 
-                // Back out no skip flag costing and add in skip flag costing
+                /* Back out no skip flag costing and add in skip flag costing */
                 if (cpi->prob_skip_false)
                 {
                     int prob_skip_cost;
@@ -1892,7 +1899,7 @@
                 }
             }
         }
-        // Calculate the final RD estimate for this mode
+        /* Calculate the final RD estimate for this mode */
         this_rd = RDCOST(x->rdmult, x->rddiv, rd->rate2, rd->distortion2);
         if (this_rd < INT_MAX && x->e_mbd.mode_info_context->mbmi.ref_frame
                                  == INTRA_FRAME)
@@ -1956,7 +1963,8 @@
     int_mv mvp;
     int near_sadidx[8] = {0, 1, 2, 3, 4, 5, 6, 7};
     int saddone=0;
-    int sr=0;    //search range got from mv_pred(). It uses step_param levels. (0-7)
+    /* search range got from mv_pred(). It uses step_param levels. (0-7) */
+    int sr=0;
 
     unsigned char *plane[4][3];
     int ref_frame_map[4];
@@ -2002,7 +2010,8 @@
     get_predictor_pointers(cpi, plane, recon_yoffset, recon_uvoffset);
 
     *returnintra = INT_MAX;
-    cpi->mbs_tested_so_far++;          // Count of the number of MBs tested so far this frame
+    /* Count of the number of MBs tested so far this frame */
+    cpi->mbs_tested_so_far++;
 
     x->skip = 0;
 
@@ -2013,14 +2022,16 @@
         int other_cost = 0;
         int this_ref_frame = ref_frame_map[vp8_ref_frame_order[mode_index]];
 
-        // Test best rd so far against threshold for trying this mode.
+        /* Test best rd so far against threshold for trying this mode. */
         if (best_mode.rd <= cpi->rd_threshes[mode_index])
             continue;
 
         if (this_ref_frame < 0)
             continue;
 
-        // These variables hold are rolling total cost and distortion for this mode
+        /* These variables hold our rolling total cost and distortion for
+         * this mode
+         */
         rd.rate2 = 0;
         rd.distortion2 = 0;
 
@@ -2029,9 +2040,10 @@
         x->e_mbd.mode_info_context->mbmi.mode = this_mode;
         x->e_mbd.mode_info_context->mbmi.ref_frame = this_ref_frame;
 
-        // Only consider ZEROMV/ALTREF_FRAME for alt ref frame,
-        // unless ARNR filtering is enabled in which case we want
-        // an unfiltered alternative
+        /* Only consider ZEROMV/ALTREF_FRAME for alt ref frame,
+         * unless ARNR filtering is enabled in which case we want
+         * an unfiltered alternative
+         */
         if (cpi->is_src_frame_alt_ref && (cpi->oxcf.arnr_max_frames == 0))
         {
             if (this_mode != ZEROMV || x->e_mbd.mode_info_context->mbmi.ref_frame != ALTREF_FRAME)
@@ -2053,13 +2065,17 @@
             }
         }
 
-        // Check to see if the testing frequency for this mode is at its max
-        // If so then prevent it from being tested and increase the threshold for its testing
+        /* Check to see if the testing frequency for this mode is at its
+         * max. If so then prevent it from being tested and increase the
+         * threshold for its testing
+         */
         if (cpi->mode_test_hit_counts[mode_index] && (cpi->mode_check_freq[mode_index] > 1))
         {
             if (cpi->mbs_tested_so_far  <= cpi->mode_check_freq[mode_index] * cpi->mode_test_hit_counts[mode_index])
             {
-                // Increase the threshold for coding this mode to make it less likely to be chosen
+                /* Increase the threshold for coding this mode to make it
+                 * less likely to be chosen
+                 */
                 cpi->rd_thresh_mult[mode_index] += 4;
 
                 if (cpi->rd_thresh_mult[mode_index] > MAX_THRESHMULT)
@@ -2071,10 +2087,15 @@
             }
         }
 
-        // We have now reached the point where we are going to test the current mode so increment the counter for the number of times it has been tested
+        /* We have now reached the point where we are going to test the
+         * current mode so increment the counter for the number of times
+         * it has been tested
+         */
         cpi->mode_test_hit_counts[mode_index] ++;
 
-        // Experimental code. Special case for gf and arf zeromv modes. Increase zbin size to supress noise
+        /* Experimental code. Special case for gf and arf zeromv modes.
+         * Increase zbin size to suppress noise
+         */
         if (cpi->zbin_mode_boost_enabled)
         {
             if ( this_ref_frame == INTRA_FRAME )
@@ -2121,7 +2142,9 @@
         {
             int tmp_rd;
 
-            // Note the rate value returned here includes the cost of coding the BPRED mode : x->mbmode_cost[x->e_mbd.frame_type][BPRED];
+            /* Note the rate value returned here includes the cost of
+             * coding the BPRED mode: x->mbmode_cost[x->e_mbd.frame_type][BPRED]
+             */
             int distortion;
             tmp_rd = rd_pick_intra4x4mby_modes(cpi, x, &rate, &rd.rate_y, &distortion, best_mode.yrd);
             rd.rate2 += rate;
@@ -2158,10 +2181,12 @@
             rd.rate2 += rate;
             rd.distortion2 += distortion;
 
-            // If even the 'Y' rd value of split is higher than best so far then dont bother looking at UV
+            /* If even the 'Y' rd value of split is higher than best so far
+             * then don't bother looking at UV
+             */
             if (tmp_rd < best_mode.yrd)
             {
-                // Now work out UV cost and add it in
+                /* Now work out UV cost and add it in */
                 rd_inter4x4_uv(cpi, x, &rd.rate_uv, &rd.distortion_uv, cpi->common.full_pixel);
                 rd.rate2 += rd.rate_uv;
                 rd.distortion2 += rd.distortion_uv;
@@ -2233,7 +2258,9 @@
             mvp_full.as_mv.col = mvp.as_mv.col>>3;
             mvp_full.as_mv.row = mvp.as_mv.row>>3;
 
-            // Get intersection of UMV window and valid MV window to reduce # of checks in diamond search.
+            /* Get intersection of UMV window and valid MV window to
+             * reduce # of checks in diamond search.
+             */
             if (x->mv_col_min < col_min )
                 x->mv_col_min = col_min;
             if (x->mv_col_max > col_max )
@@ -2243,11 +2270,11 @@
             if (x->mv_row_max > row_max )
                 x->mv_row_max = row_max;
 
-            //adjust search range according to sr from mv prediction
+            /* adjust search range according to sr from mv prediction */
             if(sr > step_param)
                 step_param = sr;
 
-            // Initial step/diamond search
+            /* Initial step/diamond search */
             {
                 bestsme = cpi->diamond_search_sad(x, b, d, &mvp_full, &d->bmi.mv,
                                         step_param, sadpb, &num00,
@@ -2255,7 +2282,7 @@
                                         x->mvcost, &best_ref_mv);
                 mode_mv[NEWMV].as_int = d->bmi.mv.as_int;
 
-                // Further step/diamond searches as necessary
+                /* Further step/diamond searches as necessary */
                 n = 0;
                 further_steps = (cpi->sf.max_step_search_steps - 1) - step_param;
 
@@ -2301,11 +2328,8 @@
             {
                 int search_range;
 
-                //It seems not a good way to set search_range. Need further investigation.
-                //search_range = MAXF(abs((mvp.row>>3) - d->bmi.mv.as_mv.row), abs((mvp.col>>3) - d->bmi.mv.as_mv.col));
                 search_range = 8;
 
-                //thissme = cpi->full_search_sad(x, b, d, &d->bmi.mv.as_mv, sadpb, search_range, &cpi->fn_ptr[BLOCK_16X16], x->mvcost, &best_ref_mv);
                 thissme = cpi->refining_search_sad(x, b, d, &d->bmi.mv, sadpb,
                                        search_range, &cpi->fn_ptr[BLOCK_16X16],
                                        x->mvcost, &best_ref_mv);
@@ -2338,24 +2362,31 @@
 
             mode_mv[NEWMV].as_int = d->bmi.mv.as_int;
 
-            // Add the new motion vector cost to our rolling cost variable
+            /* Add the new motion vector cost to our rolling cost variable */
             rd.rate2 += vp8_mv_bit_cost(&mode_mv[NEWMV], &best_ref_mv, x->mvcost, 96);
         }
 
         case NEARESTMV:
         case NEARMV:
-            // Clip "next_nearest" so that it does not extend to far out of image
+            /* Clip "next_nearest" so that it does not extend too far out
+             * of image
+             */
             vp8_clamp_mv2(&mode_mv[this_mode], xd);
 
-            // Do not bother proceeding if the vector (from newmv,nearest or near) is 0,0 as this should then be coded using the zeromv mode.
+            /* Do not bother proceeding if the vector (from newmv, nearest
+             * or near) is 0,0 as this should then be coded using the zeromv
+             * mode.
+             */
             if (((this_mode == NEARMV) || (this_mode == NEARESTMV)) && (mode_mv[this_mode].as_int == 0))
                 continue;
 
         case ZEROMV:
 
-            // Trap vectors that reach beyond the UMV borders
-            // Note that ALL New MV, Nearest MV Near MV and Zero MV code drops through to this point
-            // because of the lack of break statements in the previous two cases.
+            /* Trap vectors that reach beyond the UMV borders
+             * Note that ALL New MV, Nearest MV Near MV and Zero MV code
+             * drops through to this point because of the lack of break
+             * statements in the previous two cases.
+             */
             if (((mode_mv[this_mode].as_mv.row >> 3) < x->mv_row_min) || ((mode_mv[this_mode].as_mv.row >> 3) > x->mv_row_max) ||
                 ((mode_mv[this_mode].as_mv.col >> 3) < x->mv_col_min) || ((mode_mv[this_mode].as_mv.col >> 3) > x->mv_col_max))
                 continue;
@@ -2373,7 +2404,7 @@
                                            disable_skip, uv_intra_tteob,
                                            intra_rd_penalty, cpi, x);
 
-        // Keep record of best intra distortion
+        /* Keep record of best intra distortion */
         if ((x->e_mbd.mode_info_context->mbmi.ref_frame == INTRA_FRAME) &&
             (this_rd < best_mode.intra_rd) )
         {
@@ -2390,7 +2421,7 @@
             if (sse < best_rd_sse)
                 best_rd_sse = sse;
 
-            // Store for later use by denoiser.
+            /* Store for later use by denoiser. */
             if (this_mode == ZEROMV && sse < zero_mv_sse )
             {
                 zero_mv_sse = sse;
@@ -2398,7 +2429,7 @@
                         x->e_mbd.mode_info_context->mbmi.ref_frame;
             }
 
-            // Store the best NEWMV in x for later use in the denoiser.
+            /* Store the best NEWMV in x for later use in the denoiser. */
             if (x->e_mbd.mode_info_context->mbmi.mode == NEWMV &&
                     sse < best_sse)
             {
@@ -2415,10 +2446,10 @@
         }
 #endif
 
-        // Did this mode help.. i.i is it the new best mode
+        /* Did this mode help, i.e. is it the new best mode */
         if (this_rd < best_mode.rd || x->skip)
         {
-            // Note index of best mode so far
+            /* Note index of best mode so far */
             best_mode_index = mode_index;
             *returnrate = rd.rate2;
             *returndistortion = rd.distortion2;
@@ -2431,12 +2462,16 @@
             update_best_mode(&best_mode, this_rd, &rd, other_cost, x);
 
 
-            // Testing this mode gave rise to an improvement in best error score. Lower threshold a bit for next time
+            /* Testing this mode gave rise to an improvement in best error
+             * score. Lower threshold a bit for next time
+             */
             cpi->rd_thresh_mult[mode_index] = (cpi->rd_thresh_mult[mode_index] >= (MIN_THRESHMULT + 2)) ? cpi->rd_thresh_mult[mode_index] - 2 : MIN_THRESHMULT;
             cpi->rd_threshes[mode_index] = (cpi->rd_baseline_thresh[mode_index] >> 7) * cpi->rd_thresh_mult[mode_index];
         }
 
-        // If the mode did not help improve the best error case then raise the threshold for testing that mode next time around.
+        /* If the mode did not help improve the best error case then raise
+         * the threshold for testing that mode next time around.
+         */
         else
         {
             cpi->rd_thresh_mult[mode_index] += 4;
@@ -2452,33 +2487,16 @@
 
     }
 
-    // Reduce the activation RD thresholds for the best choice mode
+    /* Reduce the activation RD thresholds for the best choice mode */
     if ((cpi->rd_baseline_thresh[best_mode_index] > 0) && (cpi->rd_baseline_thresh[best_mode_index] < (INT_MAX >> 2)))
     {
         int best_adjustment = (cpi->rd_thresh_mult[best_mode_index] >> 2);
 
         cpi->rd_thresh_mult[best_mode_index] = (cpi->rd_thresh_mult[best_mode_index] >= (MIN_THRESHMULT + best_adjustment)) ? cpi->rd_thresh_mult[best_mode_index] - best_adjustment : MIN_THRESHMULT;
         cpi->rd_threshes[best_mode_index] = (cpi->rd_baseline_thresh[best_mode_index] >> 7) * cpi->rd_thresh_mult[best_mode_index];
-
-        // If we chose a split mode then reset the new MV thresholds as well
-        /*if ( vp8_mode_order[best_mode_index] == SPLITMV )
-        {
-            best_adjustment = 4; //(cpi->rd_thresh_mult[THR_NEWMV] >> 4);
-            cpi->rd_thresh_mult[THR_NEWMV] = (cpi->rd_thresh_mult[THR_NEWMV] >= (MIN_THRESHMULT+best_adjustment)) ? cpi->rd_thresh_mult[THR_NEWMV]-best_adjustment: MIN_THRESHMULT;
-            cpi->rd_threshes[THR_NEWMV] = (cpi->rd_baseline_thresh[THR_NEWMV] >> 7) * cpi->rd_thresh_mult[THR_NEWMV];
-
-            best_adjustment = 4; //(cpi->rd_thresh_mult[THR_NEWG] >> 4);
-            cpi->rd_thresh_mult[THR_NEWG] = (cpi->rd_thresh_mult[THR_NEWG] >= (MIN_THRESHMULT+best_adjustment)) ? cpi->rd_thresh_mult[THR_NEWG]-best_adjustment: MIN_THRESHMULT;
-            cpi->rd_threshes[THR_NEWG] = (cpi->rd_baseline_thresh[THR_NEWG] >> 7) * cpi->rd_thresh_mult[THR_NEWG];
-
-            best_adjustment = 4; //(cpi->rd_thresh_mult[THR_NEWA] >> 4);
-            cpi->rd_thresh_mult[THR_NEWA] = (cpi->rd_thresh_mult[THR_NEWA] >= (MIN_THRESHMULT+best_adjustment)) ? cpi->rd_thresh_mult[THR_NEWA]-best_adjustment: MIN_THRESHMULT;
-            cpi->rd_threshes[THR_NEWA] = (cpi->rd_baseline_thresh[THR_NEWA] >> 7) * cpi->rd_thresh_mult[THR_NEWA];
-        }*/
-
     }
 
-    // Note how often each mode chosen as best
+    /* Note how often each mode chosen as best */
     cpi->mode_chosen_counts[best_mode_index] ++;
 
 #if CONFIG_TEMPORAL_DENOISING
@@ -2486,7 +2504,7 @@
     {
         if (x->best_sse_inter_mode == DC_PRED)
         {
-            // No best MV found.
+            /* No best MV found. */
             x->best_sse_inter_mode = best_mode.mbmode.mode;
             x->best_sse_mv = best_mode.mbmode.mv;
             x->need_to_clamp_best_mvs = best_mode.mbmode.need_to_clamp_mvs;
@@ -2497,7 +2515,7 @@
                                 recon_yoffset, recon_uvoffset);
 
 
-        // Reevaluate ZEROMV after denoising.
+        /* Reevaluate ZEROMV after denoising. */
         if (best_mode.mbmode.ref_frame == INTRA_FRAME &&
             x->best_zeromv_reference_frame != INTRA_FRAME)
         {
@@ -2509,7 +2527,7 @@
                     vp8_cost_mv_ref(ZEROMV, mdcounts);
             rd.distortion2 = 0;
 
-            // set up the proper prediction buffers for the frame
+            /* set up the proper prediction buffers for the frame */
             x->e_mbd.mode_info_context->mbmi.ref_frame = this_ref_frame;
             x->e_mbd.pre.y_buffer = plane[this_ref_frame][0];
             x->e_mbd.pre.u_buffer = plane[this_ref_frame][1];
@@ -2525,7 +2543,7 @@
                                                intra_rd_penalty, cpi, x);
             if (this_rd < best_mode.rd || x->skip)
             {
-                // Note index of best mode so far
+                /* Note index of best mode so far */
                 best_mode_index = mode_index;
                 *returnrate = rd.rate2;
                 *returndistortion = rd.distortion2;
@@ -2550,7 +2568,7 @@
     }
 
 
-    // macroblock modes
+    /* macroblock modes */
     vpx_memcpy(&x->e_mbd.mode_info_context->mbmi, &best_mode.mbmode, sizeof(MB_MODE_INFO));
 
     if (best_mode.mbmode.mode == B_PRED)
diff --git a/vp8/encoder/rdopt.h b/vp8/encoder/rdopt.h
index db939f9..bbcb59f 100644
--- a/vp8/encoder/rdopt.h
+++ b/vp8/encoder/rdopt.h
@@ -86,15 +86,15 @@
                                        unsigned int    recon_yoffset,
                                        unsigned int    recon_uvoffset)
 {
-    if (cpi->ref_frame_flags & VP8_LAST_FLAG)
+    if (cpi->ref_frame_flags & VP8_LAST_FRAME)
         get_plane_pointers(&cpi->common.yv12_fb[cpi->common.lst_fb_idx],
                            plane[LAST_FRAME], recon_yoffset, recon_uvoffset);
 
-    if (cpi->ref_frame_flags & VP8_GOLD_FLAG)
+    if (cpi->ref_frame_flags & VP8_GOLD_FRAME)
         get_plane_pointers(&cpi->common.yv12_fb[cpi->common.gld_fb_idx],
                            plane[GOLDEN_FRAME], recon_yoffset, recon_uvoffset);
 
-    if (cpi->ref_frame_flags & VP8_ALT_FLAG)
+    if (cpi->ref_frame_flags & VP8_ALTR_FRAME)
         get_plane_pointers(&cpi->common.yv12_fb[cpi->common.alt_fb_idx],
                            plane[ALTREF_FRAME], recon_yoffset, recon_uvoffset);
 }
@@ -106,11 +106,11 @@
     int i=0;
 
     ref_frame_map[i++] = INTRA_FRAME;
-    if (cpi->ref_frame_flags & VP8_LAST_FLAG)
+    if (cpi->ref_frame_flags & VP8_LAST_FRAME)
         ref_frame_map[i++] = LAST_FRAME;
-    if (cpi->ref_frame_flags & VP8_GOLD_FLAG)
+    if (cpi->ref_frame_flags & VP8_GOLD_FRAME)
         ref_frame_map[i++] = GOLDEN_FRAME;
-    if (cpi->ref_frame_flags & VP8_ALT_FLAG)
+    if (cpi->ref_frame_flags & VP8_ALTR_FRAME)
         ref_frame_map[i++] = ALTREF_FRAME;
     for(; i<4; i++)
         ref_frame_map[i] = -1;
diff --git a/vp8/encoder/segmentation.c b/vp8/encoder/segmentation.c
index fc0967d..37972e2 100644
--- a/vp8/encoder/segmentation.c
+++ b/vp8/encoder/segmentation.c
@@ -22,22 +22,24 @@
 
     if ((cm->frame_type == KEY_FRAME) || (cm->refresh_golden_frame))
     {
-        // Reset Gf useage monitors
+        /* Reset Gf usage monitors */
         vpx_memset(cpi->gf_active_flags, 1, (cm->mb_rows * cm->mb_cols));
         cpi->gf_active_count = cm->mb_rows * cm->mb_cols;
     }
     else
     {
-        // for each macroblock row in image
+        /* for each macroblock row in image */
         for (mb_row = 0; mb_row < cm->mb_rows; mb_row++)
         {
-            // for each macroblock col in image
+            /* for each macroblock col in image */
             for (mb_col = 0; mb_col < cm->mb_cols; mb_col++)
             {
 
-                // If using golden then set GF active flag if not already set.
-                // If using last frame 0,0 mode then leave flag as it is
-                // else if using non 0,0 motion or intra modes then clear flag if it is currently set
+                /* If using golden then set GF active flag if not already set.
+                 * If using last frame 0,0 mode then leave flag as it is
+                 * else if using non 0,0 motion or intra modes then clear
+                 * flag if it is currently set
+                 */
                 if ((this_mb_mode_info->mbmi.ref_frame == GOLDEN_FRAME) || (this_mb_mode_info->mbmi.ref_frame == ALTREF_FRAME))
                 {
                     if (*(x->gf_active_ptr) == 0)
@@ -52,12 +54,12 @@
                     cpi->gf_active_count--;
                 }
 
-                x->gf_active_ptr++;          // Step onto next entry
-                this_mb_mode_info++;           // skip to next mb
+                x->gf_active_ptr++;          /* Step onto next entry */
+                this_mb_mode_info++;         /* skip to next mb */
 
             }
 
-            // this is to account for the border
+            /* this is to account for the border */
             this_mb_mode_info++;
         }
     }
diff --git a/vp8/encoder/temporal_filter.c b/vp8/encoder/temporal_filter.c
index b391d5a..b83ae89 100644
--- a/vp8/encoder/temporal_filter.c
+++ b/vp8/encoder/temporal_filter.c
@@ -30,8 +30,8 @@
 #include <math.h>
 #include <limits.h>
 
-#define ALT_REF_MC_ENABLED 1    // dis/enable MC in AltRef filtering
-#define ALT_REF_SUBPEL_ENABLED 1 // dis/enable subpel in MC AltRef filtering
+#define ALT_REF_MC_ENABLED 1    /* dis/enable MC in AltRef filtering */
+#define ALT_REF_SUBPEL_ENABLED 1 /* dis/enable subpel in MC AltRef filtering */
 
 #if VP8_TEMPORAL_ALT_REF
 
@@ -50,7 +50,7 @@
     int offset;
     unsigned char *yptr, *uptr, *vptr;
 
-    // Y
+    /* Y */
     yptr = y_mb_ptr + (mv_row >> 3) * stride + (mv_col >> 3);
 
     if ((mv_row | mv_col) & 7)
@@ -63,7 +63,7 @@
         vp8_copy_mem16x16(yptr, stride, &pred[0], 16);
     }
 
-    // U & V
+    /* U & V */
     mv_row >>= 1;
     mv_col >>= 1;
     stride = (stride + 1) >> 1;
@@ -109,9 +109,10 @@
             int pixel_value = *frame2++;
 
             modifier   = src_byte - pixel_value;
-            // This is an integer approximation of:
-            // float coeff = (3.0 * modifer * modifier) / pow(2, strength);
-            // modifier =  (int)roundf(coeff > 16 ? 0 : 16-coeff);
+            /* This is an integer approximation of:
+             * float coeff = (3.0 * modifer * modifier) / pow(2, strength);
+             * modifier =  (int)roundf(coeff > 16 ? 0 : 16-coeff);
+             */
             modifier  *= modifier;
             modifier  *= 3;
             modifier  += 1 << (strength - 1);
@@ -154,7 +155,7 @@
     int_mv best_ref_mv1;
     int_mv best_ref_mv1_full; /* full-pixel value of best_ref_mv1 */
 
-    // Save input state
+    /* Save input state */
     unsigned char **base_src = b->base_src;
     int src = b->src;
     int src_stride = b->src_stride;
@@ -166,7 +167,7 @@
     best_ref_mv1_full.as_mv.col = best_ref_mv1.as_mv.col >>3;
     best_ref_mv1_full.as_mv.row = best_ref_mv1.as_mv.row >>3;
 
-    // Setup frame pointers
+    /* Setup frame pointers */
     b->base_src = &arf_frame->y_buffer;
     b->src_stride = arf_frame->y_stride;
     b->src = mb_offset;
@@ -175,7 +176,7 @@
     x->e_mbd.pre.y_stride = frame_ptr->y_stride;
     d->offset = mb_offset;
 
-    // Further step/diamond searches as necessary
+    /* Further step/diamond searches as necessary */
     if (cpi->Speed < 8)
     {
         step_param = cpi->sf.first_step + (cpi->Speed > 5);
@@ -185,21 +186,19 @@
         step_param = cpi->sf.first_step + 2;
     }
 
-    /*cpi->sf.search_method == HEX*/
-    // TODO Check that the 16x16 vf & sdf are selected here
-    // Ignore mv costing by sending NULL cost arrays
+    /* TODO Check that the 16x16 vf & sdf are selected here */
+    /* Ignore mv costing by sending NULL cost arrays */
     bestsme = vp8_hex_search(x, b, d, &best_ref_mv1_full, &d->bmi.mv,
                              step_param, sadpb,
                              &cpi->fn_ptr[BLOCK_16X16],
                              NULL, NULL, &best_ref_mv1);
 
 #if ALT_REF_SUBPEL_ENABLED
-    // Try sub-pixel MC?
-    //if (bestsme > error_thresh && bestsme < INT_MAX)
+    /* Try sub-pixel MC? */
     {
         int distortion;
         unsigned int sse;
-        // Ignore mv costing by sending NULL cost array
+        /* Ignore mv costing by sending NULL cost array */
         bestsme = cpi->find_fractional_mv_step(x, b, d,
                                                &d->bmi.mv,
                                                &best_ref_mv1,
@@ -209,7 +208,7 @@
     }
 #endif
 
-    // Save input state
+    /* Save input state */
     b->base_src = base_src;
     b->src = src;
     b->src_stride = src_stride;
@@ -244,7 +243,7 @@
     unsigned char *dst1, *dst2;
     DECLARE_ALIGNED_ARRAY(16, unsigned char,  predictor, 16*16 + 8*8 + 8*8);
 
-    // Save input state
+    /* Save input state */
     unsigned char *y_buffer = mbd->pre.y_buffer;
     unsigned char *u_buffer = mbd->pre.u_buffer;
     unsigned char *v_buffer = mbd->pre.v_buffer;
@@ -252,16 +251,17 @@
     for (mb_row = 0; mb_row < mb_rows; mb_row++)
     {
 #if ALT_REF_MC_ENABLED
-        // Source frames are extended to 16 pixels.  This is different than
-        //  L/A/G reference frames that have a border of 32 (VP8BORDERINPIXELS)
-        // A 6 tap filter is used for motion search.  This requires 2 pixels
-        //  before and 3 pixels after.  So the largest Y mv on a border would
-        //  then be 16 - 3.  The UV blocks are half the size of the Y and
-        //  therefore only extended by 8.  The largest mv that a UV block
-        //  can support is 8 - 3.  A UV mv is half of a Y mv.
-        //  (16 - 3) >> 1 == 6 which is greater than 8 - 3.
-        // To keep the mv in play for both Y and UV planes the max that it
-        //  can be on a border is therefore 16 - 5.
+        /* Source frames are extended to 16 pixels.  This is different than
+         *  L/A/G reference frames that have a border of 32 (VP8BORDERINPIXELS)
+         * A 6 tap filter is used for motion search.  This requires 2 pixels
+         *  before and 3 pixels after.  So the largest Y mv on a border would
+         *  then be 16 - 3.  The UV blocks are half the size of the Y and
+         *  therefore only extended by 8.  The largest mv that a UV block
+         *  can support is 8 - 3.  A UV mv is half of a Y mv.
+         *  (16 - 3) >> 1 == 6 which is greater than 8 - 3.
+         * To keep the mv in play for both Y and UV planes the max that it
+         *  can be on a border is therefore 16 - 5.
+         */
         cpi->mb.mv_row_min = -((mb_row * 16) + (16 - 5));
         cpi->mb.mv_row_max = ((cpi->common.mb_rows - 1 - mb_row) * 16)
                                 + (16 - 5);
@@ -299,7 +299,7 @@
 #if ALT_REF_MC_ENABLED
 #define THRESH_LOW   10000
 #define THRESH_HIGH  20000
-                    // Find best match in this frame by MC
+                    /* Find best match in this frame by MC */
                     err = vp8_temporal_filter_find_matching_mb_c
                               (cpi,
                                cpi->frames[alt_ref_index],
@@ -307,16 +307,17 @@
                                mb_y_offset,
                                THRESH_LOW);
 #endif
-                    // Assign higher weight to matching MB if it's error
-                    // score is lower. If not applying MC default behavior
-                    // is to weight all MBs equal.
+                    /* Assign higher weight to matching MB if its error
+                     * score is lower. If not applying MC default behavior
+                     * is to weight all MBs equal.
+                     */
                     filter_weight = err<THRESH_LOW
                                        ? 2 : err<THRESH_HIGH ? 1 : 0;
                 }
 
                 if (filter_weight != 0)
                 {
-                    // Construct the predictors
+                    /* Construct the predictors */
                     vp8_temporal_filter_predictors_mb_c
                         (mbd,
                          cpi->frames[frame]->y_buffer + mb_y_offset,
@@ -327,7 +328,7 @@
                          mbd->block[0].bmi.mv.as_mv.col,
                          predictor);
 
-                    // Apply the filter (YUV)
+                    /* Apply the filter (YUV) */
                     vp8_temporal_filter_apply
                         (f->y_buffer + mb_y_offset,
                          f->y_stride,
@@ -360,7 +361,7 @@
                 }
             }
 
-            // Normalize filter output to produce AltRef frame
+            /* Normalize filter output to produce AltRef frame */
             dst1 = cpi->alt_ref_buffer.y_buffer;
             stride = cpi->alt_ref_buffer.y_stride;
             byte = mb_y_offset;
@@ -374,7 +375,7 @@
 
                     dst1[byte] = (unsigned char)pval;
 
-                    // move to next pixel
+                    /* move to next pixel */
                     byte++;
                 }
 
@@ -391,19 +392,19 @@
                 {
                     int m=k+64;
 
-                    // U
+                    /* U */
                     unsigned int pval = accumulator[k] + (count[k] >> 1);
                     pval *= cpi->fixed_divide[count[k]];
                     pval >>= 19;
                     dst1[byte] = (unsigned char)pval;
 
-                    // V
+                    /* V */
                     pval = accumulator[m] + (count[m] >> 1);
                     pval *= cpi->fixed_divide[count[m]];
                     pval >>= 19;
                     dst2[byte] = (unsigned char)pval;
 
-                    // move to next pixel
+                    /* move to next pixel */
                     byte++;
                 }
 
@@ -418,7 +419,7 @@
         mb_uv_offset += 8*(f->uv_stride-mb_cols);
     }
 
-    // Restore input state
+    /* Restore input state */
     mbd->pre.y_buffer = y_buffer;
     mbd->pre.u_buffer = u_buffer;
     mbd->pre.v_buffer = v_buffer;
@@ -452,8 +453,7 @@
     switch (blur_type)
     {
     case 1:
-        /////////////////////////////////////////
-        // Backward Blur
+        /* Backward Blur */
 
         frames_to_blur_backward = num_frames_backward;
 
@@ -464,8 +464,7 @@
         break;
 
     case 2:
-        /////////////////////////////////////////
-        // Forward Blur
+        /* Forward Blur */
 
         frames_to_blur_forward = num_frames_forward;
 
@@ -477,8 +476,7 @@
 
     case 3:
     default:
-        /////////////////////////////////////////
-        // Center Blur
+        /* Center Blur */
         frames_to_blur_forward = num_frames_forward;
         frames_to_blur_backward = num_frames_backward;
 
@@ -488,7 +486,7 @@
         if (frames_to_blur_backward > frames_to_blur_forward)
             frames_to_blur_backward = frames_to_blur_forward;
 
-        // When max_frames is even we have 1 more frame backward than forward
+        /* When max_frames is even we have 1 more frame backward than forward */
         if (frames_to_blur_forward > (max_frames - 1) / 2)
             frames_to_blur_forward = ((max_frames - 1) / 2);
 
@@ -501,21 +499,7 @@
 
     start_frame = distance + frames_to_blur_forward;
 
-#ifdef DEBUGFWG
-    // DEBUG FWG
-    printf("max:%d FBCK:%d FFWD:%d ftb:%d ftbbck:%d ftbfwd:%d sei:%d lasei:%d start:%d"
-           , max_frames
-           , num_frames_backward
-           , num_frames_forward
-           , frames_to_blur
-           , frames_to_blur_backward
-           , frames_to_blur_forward
-           , cpi->source_encode_index
-           , cpi->last_alt_ref_sei
-           , start_frame);
-#endif
-
-    // Setup frame pointers, NULL indicates frame not included in filter
+    /* Setup frame pointers, NULL indicates frame not included in filter */
     vpx_memset(cpi->frames, 0, max_frames*sizeof(YV12_BUFFER_CONFIG *));
     for (frame = 0; frame < frames_to_blur; frame++)
     {
diff --git a/vp8/encoder/x86/denoising_sse2.c b/vp8/encoder/x86/denoising_sse2.c
index 41991c2..fbce8d1 100644
--- a/vp8/encoder/x86/denoising_sse2.c
+++ b/vp8/encoder/x86/denoising_sse2.c
@@ -55,7 +55,7 @@
         const __m128i k_zero = _mm_set1_epi16(0);
         const __m128i k_128 = _mm_set1_epi32(128);
 
-        // Calculate absolute differences
+        /* Calculate absolute differences */
         DECLARE_ALIGNED_ARRAY(16,unsigned char,abs_diff,16);
         DECLARE_ALIGNED_ARRAY(16,uint32_t,filter_coefficient,16);
         __m128i v_sig = _mm_loadu_si128((__m128i *)(&sig[0]));
@@ -66,14 +66,14 @@
         __m128i v_abs_diff = _mm_adds_epu8(a_minus_b, b_minus_a);
         _mm_store_si128((__m128i *)(&abs_diff[0]), v_abs_diff);
 
-        // Use LUT to get filter coefficients (two 16b value; f and 256-f)
+        /* Use LUT to get filter coefficients (two 16b value; f and 256-f) */
         for (c = 0; c < 16; ++c)
         {
             filter_coefficient[c] = LUT[abs_diff[c]].as_int;
         }
 
-        // Filtering...
-        // load filter coefficients (two 16b value; f and 256-f)
+        /* Filtering... */
+        /* load filter coefficients (two 16b value; f and 256-f) */
         filter_coefficient_00 = _mm_load_si128(
                 (__m128i *)(&filter_coefficient[ 0]));
         filter_coefficient_04 = _mm_load_si128(
@@ -83,18 +83,18 @@
         filter_coefficient_12 = _mm_load_si128(
                 (__m128i *)(&filter_coefficient[12]));
 
-        // expand sig from 8b to 16b
+        /* expand sig from 8b to 16b */
         v_sig0 = _mm_unpacklo_epi8(v_sig, k_zero);
         v_sig1 = _mm_unpackhi_epi8(v_sig, k_zero);
-        // expand mc_running_avg_y from 8b to 16b
+        /* expand mc_running_avg_y from 8b to 16b */
         v_mc_running_avg_y0 = _mm_unpacklo_epi8(v_mc_running_avg_y, k_zero);
         v_mc_running_avg_y1 = _mm_unpackhi_epi8(v_mc_running_avg_y, k_zero);
-        // interleave sig and mc_running_avg_y for upcoming multiply-add
+        /* interleave sig and mc_running_avg_y for upcoming multiply-add */
         state0 = _mm_unpacklo_epi16(v_mc_running_avg_y0, v_sig0);
         state1 = _mm_unpackhi_epi16(v_mc_running_avg_y0, v_sig0);
         state2 = _mm_unpacklo_epi16(v_mc_running_avg_y1, v_sig1);
         state3 = _mm_unpackhi_epi16(v_mc_running_avg_y1, v_sig1);
-        // blend values
+        /* blend values */
         res0 = _mm_madd_epi16(filter_coefficient_00, state0);
         res1 = _mm_madd_epi16(filter_coefficient_04, state1);
         res2 = _mm_madd_epi16(filter_coefficient_08, state2);
@@ -107,15 +107,16 @@
         res1 = _mm_srai_epi32(res1, 8);
         res2 = _mm_srai_epi32(res2, 8);
         res3 = _mm_srai_epi32(res3, 8);
-        // combine the 32b results into a single 8b vector
+        /* combine the 32b results into a single 8b vector */
         res0 = _mm_packs_epi32(res0, res1);
         res2 = _mm_packs_epi32(res2, res3);
         v_running_avg_y = _mm_packus_epi16(res0, res2);
 
-        // Depending on the magnitude of the difference between the signal and
-        // filtered version, either replace the signal by the filtered one or
-        // update the filter state with the signal when the change in a pixel
-        // isn't classified as noise.
+        /* Depending on the magnitude of the difference between the signal and
+         * filtered version, either replace the signal by the filtered one or
+         * update the filter state with the signal when the change in a pixel
+         * isn't classified as noise.
+         */
         diff0 = _mm_sub_epi16(v_sig0, res0);
         diff1 = _mm_sub_epi16(v_sig1, res2);
         acc_diff = _mm_add_epi16(acc_diff, _mm_add_epi16(diff0, diff1));
@@ -130,14 +131,14 @@
         _mm_storeu_si128((__m128i *)(&running_avg_y[0]), p2);
         _mm_storeu_si128((__m128i *)(&filtered[0]), p2);
 
-        // Update pointers for next iteration.
+        /* Update pointers for next iteration. */
         sig += sig_stride;
         filtered += 16;
         mc_running_avg_y += mc_avg_y_stride;
         running_avg_y += avg_y_stride;
     }
     {
-        // Compute the sum of all pixel differences of this MB.
+        /* Compute the sum of all pixel differences of this MB. */
         union sum_union s;
         int sum_diff;
         s.v = acc_diff;
diff --git a/vp8/vp8_common.mk b/vp8/vp8_common.mk
index 1fbe5d4..a328f46 100644
--- a/vp8/vp8_common.mk
+++ b/vp8/vp8_common.mk
@@ -120,6 +120,14 @@
 endif
 
 # common (c)
+VP8_COMMON_SRCS-$(HAVE_DSPR2)  += common/mips/dspr2/idctllm_dspr2.c
+VP8_COMMON_SRCS-$(HAVE_DSPR2)  += common/mips/dspr2/filter_dspr2.c
+VP8_COMMON_SRCS-$(HAVE_DSPR2)  += common/mips/dspr2/loopfilter_filters_dspr2.c
+VP8_COMMON_SRCS-$(HAVE_DSPR2)  += common/mips/dspr2/reconinter_dspr2.c
+VP8_COMMON_SRCS-$(HAVE_DSPR2)  += common/mips/dspr2/idct_blk_dspr2.c
+VP8_COMMON_SRCS-$(HAVE_DSPR2)  += common/mips/dspr2/dequantize_dspr2.c
+
+# common (c)
 VP8_COMMON_SRCS-$(ARCH_ARM)  += common/arm/filter_arm.c
 VP8_COMMON_SRCS-$(ARCH_ARM)  += common/arm/loopfilter_arm.c
 VP8_COMMON_SRCS-$(ARCH_ARM)  += common/arm/reconintra_arm.c
diff --git a/vp8/vp8_cx_iface.c b/vp8/vp8_cx_iface.c
index ca06648..eeac3a8 100644
--- a/vp8/vp8_cx_iface.c
+++ b/vp8/vp8_cx_iface.c
@@ -9,6 +9,7 @@
  */
 
 
+#include "vpx_rtcd.h"
 #include "vpx/vpx_codec.h"
 #include "vpx/internal/vpx_codec_internal.h"
 #include "vpx_version.h"
@@ -22,7 +23,6 @@
 struct vp8_extracfg
 {
     struct vpx_codec_pkt_list *pkt_list;
-    vp8e_encoding_mode      encoding_mode;               /** best, good, realtime            */
     int                         cpu_used;                    /** available cpu percentage in 1/16*/
     unsigned int                enable_auto_alt_ref;           /** if encoder decides to uses alternate reference frame */
     unsigned int                noise_sensitivity;
@@ -51,10 +51,8 @@
         {
             NULL,
 #if !(CONFIG_REALTIME_ONLY)
-            VP8_BEST_QUALITY_ENCODING,  /* Encoding Mode */
             0,                          /* cpu_used      */
 #else
-            VP8_REAL_TIME_ENCODING,     /* Encoding Mode */
             4,                          /* cpu_used      */
 #endif
             0,                          /* enable_auto_alt_ref */
@@ -88,7 +86,8 @@
     vpx_image_t             preview_img;
     unsigned int            next_frame_flag;
     vp8_postproc_cfg_t      preview_ppcfg;
-    vpx_codec_pkt_list_decl(64) pkt_list;              // changed to accomendate the maximum number of lagged frames allowed
+    /* pkt_list size depends on the maximum number of lagged frames allowed. */
+    vpx_codec_pkt_list_decl(64) pkt_list;
     unsigned int                fixed_kf_cntr;
 };
 
@@ -146,25 +145,39 @@
     RANGE_CHECK_HI(cfg, rc_max_quantizer,   63);
     RANGE_CHECK_HI(cfg, rc_min_quantizer,   cfg->rc_max_quantizer);
     RANGE_CHECK_HI(cfg, g_threads,          64);
-#if !(CONFIG_REALTIME_ONLY)
-    RANGE_CHECK_HI(cfg, g_lag_in_frames,    25);
-#else
+#if CONFIG_REALTIME_ONLY
     RANGE_CHECK_HI(cfg, g_lag_in_frames,    0);
+#elif CONFIG_MULTI_RES_ENCODING
+    if (ctx->base.enc.total_encoders > 1)
+        RANGE_CHECK_HI(cfg, g_lag_in_frames,    0);
+#else
+    RANGE_CHECK_HI(cfg, g_lag_in_frames,    25);
 #endif
     RANGE_CHECK(cfg, rc_end_usage,          VPX_VBR, VPX_CQ);
     RANGE_CHECK_HI(cfg, rc_undershoot_pct,  1000);
     RANGE_CHECK_HI(cfg, rc_overshoot_pct,   1000);
     RANGE_CHECK_HI(cfg, rc_2pass_vbr_bias_pct, 100);
     RANGE_CHECK(cfg, kf_mode,               VPX_KF_DISABLED, VPX_KF_AUTO);
-    //RANGE_CHECK_BOOL(cfg,                 g_delete_firstpassfile);
-    RANGE_CHECK_BOOL(cfg,                   rc_resize_allowed);
+
+/* TODO: add spatial re-sampling support and frame dropping in
+ * multi-res-encoder.*/
+#if CONFIG_MULTI_RES_ENCODING
+    if (ctx->base.enc.total_encoders > 1)
+        RANGE_CHECK_HI(cfg, rc_resize_allowed,     0);
+#else
+    RANGE_CHECK_BOOL(cfg, rc_resize_allowed);
+#endif
     RANGE_CHECK_HI(cfg, rc_dropframe_thresh,   100);
     RANGE_CHECK_HI(cfg, rc_resize_up_thresh,   100);
     RANGE_CHECK_HI(cfg, rc_resize_down_thresh, 100);
-#if !(CONFIG_REALTIME_ONLY)
-    RANGE_CHECK(cfg,        g_pass,         VPX_RC_ONE_PASS, VPX_RC_LAST_PASS);
-#else
+
+#if CONFIG_REALTIME_ONLY
     RANGE_CHECK(cfg,        g_pass,         VPX_RC_ONE_PASS, VPX_RC_ONE_PASS);
+#elif CONFIG_MULTI_RES_ENCODING
+    if (ctx->base.enc.total_encoders > 1)
+        RANGE_CHECK(cfg,    g_pass,         VPX_RC_ONE_PASS, VPX_RC_ONE_PASS);
+#else
+    RANGE_CHECK(cfg,        g_pass,         VPX_RC_ONE_PASS, VPX_RC_LAST_PASS);
 #endif
 
     /* VP8 does not support a lower bound on the keyframe interval in
@@ -177,11 +190,6 @@
 
     RANGE_CHECK_BOOL(vp8_cfg,               enable_auto_alt_ref);
     RANGE_CHECK(vp8_cfg, cpu_used,           -16, 16);
-#if !(CONFIG_REALTIME_ONLY)
-    RANGE_CHECK(vp8_cfg, encoding_mode,      VP8_BEST_QUALITY_ENCODING, VP8_REAL_TIME_ENCODING);
-#else
-    RANGE_CHECK(vp8_cfg, encoding_mode,      VP8_REAL_TIME_ENCODING, VP8_REAL_TIME_ENCODING);
-#endif
 
 #if CONFIG_REALTIME_ONLY && !CONFIG_TEMPORAL_DENOISING
     RANGE_CHECK(vp8_cfg, noise_sensitivity,  0, 0);
@@ -189,7 +197,8 @@
     RANGE_CHECK_HI(vp8_cfg, noise_sensitivity,  6);
 #endif
 
-    RANGE_CHECK(vp8_cfg, token_partitions,   VP8_ONE_TOKENPARTITION, VP8_EIGHT_TOKENPARTITION);
+    RANGE_CHECK(vp8_cfg, token_partitions,   VP8_ONE_TOKENPARTITION,
+                VP8_EIGHT_TOKENPARTITION);
     RANGE_CHECK_HI(vp8_cfg, Sharpness,       7);
     RANGE_CHECK(vp8_cfg, arnr_max_frames, 0, 15);
     RANGE_CHECK_HI(vp8_cfg, arnr_strength,   6);
@@ -203,7 +212,8 @@
     if (cfg->g_pass == VPX_RC_LAST_PASS)
     {
         size_t           packet_sz = sizeof(FIRSTPASS_STATS);
-        int              n_packets = cfg->rc_twopass_stats_in.sz / packet_sz;
+        int              n_packets = (int)(cfg->rc_twopass_stats_in.sz /
+                                          packet_sz);
         FIRSTPASS_STATS *stats;
 
         if (!cfg->rc_twopass_stats_in.buf)
@@ -299,7 +309,7 @@
         break;
     }
 
-    if (cfg.g_pass == VPX_RC_FIRST_PASS)
+    if (cfg.g_pass == VPX_RC_FIRST_PASS || cfg.g_pass == VPX_RC_ONE_PASS)
     {
         oxcf->allow_lag     = 0;
         oxcf->lag_in_frames = 0;
@@ -355,7 +365,6 @@
 
     oxcf->auto_key                 = cfg.kf_mode == VPX_KF_AUTO
                                        && cfg.kf_min_dist != cfg.kf_max_dist;
-    //oxcf->kf_min_dist            = cfg.kf_min_dis;
     oxcf->key_freq                 = cfg.kf_max_dist;
 
     oxcf->number_of_layers         = cfg.ts_number_layers;
@@ -385,9 +394,6 @@
     }
 #endif
 
-    //oxcf->delete_first_pass_file = cfg.g_delete_firstpassfile;
-    //strcpy(oxcf->first_pass_file, cfg.g_firstpass_file);
-
     oxcf->cpu_used               = vp8_cfg.cpu_used;
     oxcf->encode_breakout        = vp8_cfg.static_thresh;
     oxcf->play_alternate         = vp8_cfg.enable_auto_alt_ref;
@@ -576,6 +582,8 @@
 
     struct VP8_COMP *optr;
 
+    vpx_rtcd();
+
     if (!ctx->priv)
     {
         priv = calloc(1, sizeof(struct vpx_codec_alg_priv));
@@ -624,15 +632,15 @@
             return VPX_CODEC_MEM_ERROR;
         }
 
+        if(mr_cfg)
+            ctx->priv->enc.total_encoders   = mr_cfg->mr_total_resolutions;
+        else
+            ctx->priv->enc.total_encoders   = 1;
+
         res = validate_config(priv, &priv->cfg, &priv->vp8_cfg, 0);
 
         if (!res)
         {
-            if(mr_cfg)
-                ctx->priv->enc.total_encoders   = mr_cfg->mr_total_resolutions;
-            else
-                ctx->priv->enc.total_encoders   = 1;
-
             set_vp8e_config(&ctx->priv->alg_priv->oxcf,
                              ctx->priv->alg_priv->cfg,
                              ctx->priv->alg_priv->vp8_cfg,
@@ -685,7 +693,7 @@
     yv12->uv_stride = img->stride[VPX_PLANE_U];
 
     yv12->border  = (img->stride[VPX_PLANE_Y] - img->w) / 2;
-    yv12->clrtype = (img->fmt == VPX_IMG_FMT_VPXI420 || img->fmt == VPX_IMG_FMT_VPXYV12); //REG_YUV = 0
+    yv12->clrtype = (img->fmt == VPX_IMG_FMT_VPXI420 || img->fmt == VPX_IMG_FMT_VPXYV12);
     return res;
 }
 
@@ -771,13 +779,13 @@
         int ref = 7;
 
         if (flags & VP8_EFLAG_NO_REF_LAST)
-            ref ^= VP8_LAST_FLAG;
+            ref ^= VP8_LAST_FRAME;
 
         if (flags & VP8_EFLAG_NO_REF_GF)
-            ref ^= VP8_GOLD_FLAG;
+            ref ^= VP8_GOLD_FRAME;
 
         if (flags & VP8_EFLAG_NO_REF_ARF)
-            ref ^= VP8_ALT_FLAG;
+            ref ^= VP8_ALTR_FRAME;
 
         vp8_use_as_reference(ctx->cpi, ref);
     }
@@ -789,13 +797,13 @@
         int upd = 7;
 
         if (flags & VP8_EFLAG_NO_UPD_LAST)
-            upd ^= VP8_LAST_FLAG;
+            upd ^= VP8_LAST_FRAME;
 
         if (flags & VP8_EFLAG_NO_UPD_GF)
-            upd ^= VP8_GOLD_FLAG;
+            upd ^= VP8_GOLD_FRAME;
 
         if (flags & VP8_EFLAG_NO_UPD_ARF)
-            upd ^= VP8_ALT_FLAG;
+            upd ^= VP8_ALTR_FRAME;
 
         vp8_update_reference(ctx->cpi, upd);
     }
@@ -884,15 +892,16 @@
                 VP8_COMP *cpi = (VP8_COMP *)ctx->cpi;
 
                 /* Add the frame packet to the list of returned packets. */
-                round = 1000000 * ctx->cfg.g_timebase.num / 2 - 1;
+                round = (vpx_codec_pts_t)1000000
+                        * ctx->cfg.g_timebase.num / 2 - 1;
                 delta = (dst_end_time_stamp - dst_time_stamp);
                 pkt.kind = VPX_CODEC_CX_FRAME_PKT;
                 pkt.data.frame.pts =
                     (dst_time_stamp * ctx->cfg.g_timebase.den + round)
                     / ctx->cfg.g_timebase.num / 10000000;
-                pkt.data.frame.duration =
-                    (delta * ctx->cfg.g_timebase.den + round)
-                    / ctx->cfg.g_timebase.num / 10000000;
+                pkt.data.frame.duration = (unsigned long)
+                    ((delta * ctx->cfg.g_timebase.den + round)
+                    / ctx->cfg.g_timebase.num / 10000000);
                 pkt.data.frame.flags = lib_flags << 16;
 
                 if (lib_flags & FRAMEFLAGS_KEY)
@@ -902,10 +911,11 @@
                 {
                     pkt.data.frame.flags |= VPX_FRAME_IS_INVISIBLE;
 
-                    // This timestamp should be as close as possible to the
-                    // prior PTS so that if a decoder uses pts to schedule when
-                    // to do this, we start right after last frame was decoded.
-                    // Invisible frames have no duration.
+                    /* This timestamp should be as close as possible to the
+                     * prior PTS so that if a decoder uses pts to schedule when
+                     * to do this, we start right after last frame was decoded.
+                     * Invisible frames have no duration.
+                     */
                     pkt.data.frame.pts = ((cpi->last_time_stamp_seen
                         * ctx->cfg.g_timebase.den + round)
                         / ctx->cfg.g_timebase.num / 10000000) + 1;
@@ -957,8 +967,6 @@
                     cx_data += size;
                     cx_data_sz -= size;
                 }
-
-                //printf("timestamp: %lld, duration: %d\n", pkt->data.frame.pts, pkt->data.frame.duration);
             }
         }
     }
diff --git a/vp8/vp8_dx_iface.c b/vp8/vp8_dx_iface.c
index 37773db..fe8c9a0 100644
--- a/vp8/vp8_dx_iface.c
+++ b/vp8/vp8_dx_iface.c
@@ -11,6 +11,7 @@
 
 #include <stdlib.h>
 #include <string.h>
+#include "vpx_rtcd.h"
 #include "vpx/vpx_decoder.h"
 #include "vpx/vp8dx.h"
 #include "vpx/internal/vpx_codec_internal.h"
@@ -187,6 +188,8 @@
     vpx_codec_err_t        res = VPX_CODEC_OK;
     (void) data;
 
+    vpx_rtcd();
+
     /* This function only allocates space for the vpx_codec_alg_priv_t
      * structure. More memory may be required at the time the stream
      * information becomes known.
diff --git a/vp8_multi_resolution_encoder.c b/vp8_multi_resolution_encoder.c
index 81e7137..eae36a4 100644
--- a/vp8_multi_resolution_encoder.c
+++ b/vp8_multi_resolution_encoder.c
@@ -164,7 +164,7 @@
     mem_put_le32(header+24, frame_cnt);           /* length */
     mem_put_le32(header+28, 0);                   /* unused */
 
-    if(fwrite(header, 1, 32, outfile));
+    (void) fwrite(header, 1, 32, outfile);
 }
 
 static void write_ivf_frame_header(FILE *outfile,
@@ -181,7 +181,7 @@
     mem_put_le32(header+4, pts&0xFFFFFFFF);
     mem_put_le32(header+8, pts >> 32);
 
-    if(fwrite(header, 1, 12, outfile));
+    (void) fwrite(header, 1, 12, outfile);
 }
 
 int main(int argc, char **argv)
@@ -273,7 +273,7 @@
     cfg[0].g_w = width;
     cfg[0].g_h = height;
     cfg[0].g_threads = 1;                           /* number of threads used */
-    cfg[0].rc_dropframe_thresh = 0;
+    cfg[0].rc_dropframe_thresh = 30;
     cfg[0].rc_end_usage = VPX_CBR;
     cfg[0].rc_resize_allowed = 0;
     cfg[0].rc_min_quantizer = 4;
@@ -283,7 +283,6 @@
     cfg[0].rc_buf_initial_sz = 500;
     cfg[0].rc_buf_optimal_sz = 600;
     cfg[0].rc_buf_sz = 1000;
-    //cfg[0].rc_dropframe_thresh = 10;
     cfg[0].g_error_resilient = 1;              /* Enable error resilient mode */
     cfg[0].g_lag_in_frames   = 0;
 
@@ -293,8 +292,8 @@
      */
     //cfg[0].kf_mode           = VPX_KF_DISABLED;
     cfg[0].kf_mode           = VPX_KF_AUTO;
-    cfg[0].kf_min_dist = 0;
-    cfg[0].kf_max_dist = 150;
+    cfg[0].kf_min_dist = 3000;
+    cfg[0].kf_max_dist = 3000;
 
     cfg[0].rc_target_bitrate = target_bitrate[0];       /* Set target bitrate */
     cfg[0].g_timebase.num = 1;                          /* Set fps */
@@ -366,6 +365,12 @@
         if(vpx_codec_control(&codec[i], VP8E_SET_STATIC_THRESHOLD, static_thresh))
             die_codec(&codec[i], "Failed to set static threshold");
     }
+    /* Set NOISE_SENSITIVITY to do TEMPORAL_DENOISING (non-zero enables it) */
+    for ( i=0; i< NUM_ENCODERS; i++)
+    {
+        if(vpx_codec_control(&codec[i], VP8E_SET_NOISE_SENSITIVITY, 0))
+            die_codec(&codec[i], "Failed to set noise_sensitivity");
+    }
 
     frame_avail = 1;
     got_data = 0;
@@ -410,8 +415,8 @@
                 switch(pkt[i]->kind) {
                     case VPX_CODEC_CX_FRAME_PKT:
                         write_ivf_frame_header(outfile[i], pkt[i]);
-                        if(fwrite(pkt[i]->data.frame.buf, 1, pkt[i]->data.frame.sz,
-                                  outfile[i]));
+                        (void) fwrite(pkt[i]->data.frame.buf, 1,
+                                      pkt[i]->data.frame.sz, outfile[i]);
                     break;
                     case VPX_CODEC_PSNR_PKT:
                         if (show_psnr)
diff --git a/vp8_scalable_patterns.c b/vp8_scalable_patterns.c
index 8d151dd..4256a90 100644
--- a/vp8_scalable_patterns.c
+++ b/vp8_scalable_patterns.c
@@ -93,7 +93,7 @@
     mem_put_le32(header+24, frame_cnt);           /* length */
     mem_put_le32(header+28, 0);                   /* unused */
 
-    if(fwrite(header, 1, 32, outfile));
+    (void) fwrite(header, 1, 32, outfile);
 }
 
 
@@ -111,7 +111,7 @@
     mem_put_le32(header+4, pts&0xFFFFFFFF);
     mem_put_le32(header+8, pts >> 32);
 
-    if(fwrite(header, 1, 12, outfile));
+    (void) fwrite(header, 1, 12, outfile);
 }
 
 static int mode_to_num_layers[9] = {2, 2, 3, 3, 3, 3, 5, 2, 3};
@@ -129,8 +129,8 @@
     int                  got_data;
     int                  flags = 0;
     int                  i;
-    int                  pts = 0;              // PTS starts at 0
-    int                  frame_duration = 1;   // 1 timebase tick per frame
+    int                  pts = 0;              /* PTS starts at 0 */
+    int                  frame_duration = 1;   /* 1 timebase tick per frame */
 
     int                  layering_mode = 0;
     int                  frames_in_layer[VPX_TS_MAX_LAYERS] = {0};
@@ -138,7 +138,7 @@
     int                  flag_periodicity;
     int                  max_intra_size_pct;
 
-    // Check usage and arguments
+    /* Check usage and arguments */
     if (argc < 9)
         die("Usage: %s <infile> <outfile> <width> <height> <rate_num> "
             " <rate_den> <mode> <Rate_0> ... <Rate_nlayers-1>\n", argv[0]);
@@ -161,29 +161,29 @@
 
     printf("Using %s\n",vpx_codec_iface_name(interface));
 
-    // Populate encoder configuration
+    /* Populate encoder configuration */
     res = vpx_codec_enc_config_default(interface, &cfg, 0);
     if(res) {
         printf("Failed to get config: %s\n", vpx_codec_err_to_string(res));
         return EXIT_FAILURE;
     }
 
-    // Update the default configuration with our settings
+    /* Update the default configuration with our settings */
     cfg.g_w = width;
     cfg.g_h = height;
 
-    // Timebase format e.g. 30fps: numerator=1, demoninator=30
+    /* Timebase format e.g. 30fps: numerator=1, denominator=30 */
     if (!sscanf (argv[5], "%d", &cfg.g_timebase.num ))
         die ("Invalid timebase numerator %s", argv[5]);
     if (!sscanf (argv[6], "%d", &cfg.g_timebase.den ))
         die ("Invalid timebase denominator %s", argv[6]);
 
     for (i=8; i<8+mode_to_num_layers[layering_mode]; i++)
-        if (!sscanf(argv[i], "%d", &cfg.ts_target_bitrate[i-8]))
+        if (!sscanf(argv[i], "%u", &cfg.ts_target_bitrate[i-8]))
             die ("Invalid data rate %s", argv[i]);
 
-    // Real time parameters
-    cfg.rc_dropframe_thresh = 0;  // 30
+    /* Real time parameters */
+    cfg.rc_dropframe_thresh = 0;
     cfg.rc_end_usage        = VPX_CBR;
     cfg.rc_resize_allowed   = 0;
     cfg.rc_min_quantizer    = 8;
@@ -194,25 +194,26 @@
     cfg.rc_buf_optimal_sz   = 600;
     cfg.rc_buf_sz           = 1000;
 
-    // Enable error resilient mode
+    /* Enable error resilient mode */
     cfg.g_error_resilient = 1;
     cfg.g_lag_in_frames   = 0;
     cfg.kf_mode           = VPX_KF_DISABLED;
 
-    // Disable automatic keyframe placement
+    /* Disable automatic keyframe placement */
     cfg.kf_min_dist = cfg.kf_max_dist = 1000;
 
-    // Temporal scaling parameters:
-    // NOTE: The 3 prediction frames cannot be used interchangeably due to
-    // differences in the way they are handled throughout the code. The
-    // frames should be allocated to layers in the order LAST, GF, ARF.
-    // Other combinations work, but may produce slightly inferior results.
+    /* Temporal scaling parameters: */
+    /* NOTE: The 3 prediction frames cannot be used interchangeably due to
+     * differences in the way they are handled throughout the code. The
+     * frames should be allocated to layers in the order LAST, GF, ARF.
+     * Other combinations work, but may produce slightly inferior results.
+     */
     switch (layering_mode)
     {
 
     case 0:
     {
-        // 2-layers, 2-frame period
+        /* 2-layers, 2-frame period */
         int ids[2] = {0,1};
         cfg.ts_number_layers     = 2;
         cfg.ts_periodicity       = 2;
@@ -222,14 +223,14 @@
 
         flag_periodicity = cfg.ts_periodicity;
 #if 1
-        // 0=L, 1=GF, Intra-layer prediction enabled
+        /* 0=L, 1=GF, Intra-layer prediction enabled */
         layer_flags[0] = VPX_EFLAG_FORCE_KF  |
                          VP8_EFLAG_NO_UPD_GF | VP8_EFLAG_NO_UPD_ARF |
                          VP8_EFLAG_NO_REF_GF | VP8_EFLAG_NO_REF_ARF;
         layer_flags[1] = VP8_EFLAG_NO_UPD_ARF | VP8_EFLAG_NO_UPD_LAST |
                          VP8_EFLAG_NO_REF_ARF;
 #else
-        // 0=L, 1=GF, Intra-layer prediction disabled
+        /* 0=L, 1=GF, Intra-layer prediction disabled */
         layer_flags[0] = VPX_EFLAG_FORCE_KF  |
                          VP8_EFLAG_NO_UPD_GF | VP8_EFLAG_NO_UPD_ARF |
                          VP8_EFLAG_NO_REF_GF | VP8_EFLAG_NO_REF_ARF;
@@ -241,7 +242,7 @@
 
     case 1:
     {
-        // 2-layers, 3-frame period
+        /* 2-layers, 3-frame period */
         int ids[3] = {0,1,1};
         cfg.ts_number_layers     = 2;
         cfg.ts_periodicity       = 3;
@@ -251,7 +252,7 @@
 
         flag_periodicity = cfg.ts_periodicity;
 
-        // 0=L, 1=GF, Intra-layer prediction enabled
+        /* 0=L, 1=GF, Intra-layer prediction enabled */
         layer_flags[0] = VPX_EFLAG_FORCE_KF  |
                          VP8_EFLAG_NO_REF_GF | VP8_EFLAG_NO_REF_ARF |
                          VP8_EFLAG_NO_UPD_GF | VP8_EFLAG_NO_UPD_ARF;
@@ -264,7 +265,7 @@
 
     case 2:
     {
-        // 3-layers, 6-frame period
+        /* 3-layers, 6-frame period */
         int ids[6] = {0,2,2,1,2,2};
         cfg.ts_number_layers     = 3;
         cfg.ts_periodicity       = 6;
@@ -275,7 +276,7 @@
 
         flag_periodicity = cfg.ts_periodicity;
 
-        // 0=L, 1=GF, 2=ARF, Intra-layer prediction enabled
+        /* 0=L, 1=GF, 2=ARF, Intra-layer prediction enabled */
         layer_flags[0] = VPX_EFLAG_FORCE_KF  |
                          VP8_EFLAG_NO_REF_GF | VP8_EFLAG_NO_REF_ARF |
                          VP8_EFLAG_NO_UPD_GF | VP8_EFLAG_NO_UPD_ARF;
@@ -290,7 +291,7 @@
 
     case 3:
     {
-        // 3-layers, 4-frame period
+        /* 3-layers, 4-frame period */
         int ids[4] = {0,2,1,2};
         cfg.ts_number_layers     = 3;
         cfg.ts_periodicity       = 4;
@@ -301,7 +302,7 @@
 
         flag_periodicity = cfg.ts_periodicity;
 
-        // 0=L, 1=GF, 2=ARF, Intra-layer prediction disabled
+        /* 0=L, 1=GF, 2=ARF, Intra-layer prediction disabled */
         layer_flags[0] = VPX_EFLAG_FORCE_KF  |
                          VP8_EFLAG_NO_REF_GF | VP8_EFLAG_NO_REF_ARF |
                          VP8_EFLAG_NO_UPD_GF | VP8_EFLAG_NO_UPD_ARF;
@@ -317,7 +318,7 @@
 
     case 4:
     {
-        // 3-layers, 4-frame period
+        /* 3-layers, 4-frame period */
         int ids[4] = {0,2,1,2};
         cfg.ts_number_layers     = 3;
         cfg.ts_periodicity       = 4;
@@ -328,8 +329,9 @@
 
         flag_periodicity = cfg.ts_periodicity;
 
-        // 0=L, 1=GF, 2=ARF, Intra-layer prediction enabled in layer 1,
-        // disabled in layer 2
+        /* 0=L, 1=GF, 2=ARF, Intra-layer prediction enabled in layer 1,
+         * disabled in layer 2
+         */
         layer_flags[0] = VPX_EFLAG_FORCE_KF  |
                          VP8_EFLAG_NO_REF_GF | VP8_EFLAG_NO_REF_ARF |
                          VP8_EFLAG_NO_UPD_GF | VP8_EFLAG_NO_UPD_ARF;
@@ -344,7 +346,7 @@
 
     case 5:
     {
-        // 3-layers, 4-frame period
+        /* 3-layers, 4-frame period */
         int ids[4] = {0,2,1,2};
         cfg.ts_number_layers     = 3;
         cfg.ts_periodicity       = 4;
@@ -355,7 +357,7 @@
 
         flag_periodicity = cfg.ts_periodicity;
 
-        // 0=L, 1=GF, 2=ARF, Intra-layer prediction enabled
+        /* 0=L, 1=GF, 2=ARF, Intra-layer prediction enabled */
         layer_flags[0] = VPX_EFLAG_FORCE_KF  |
                          VP8_EFLAG_NO_REF_GF | VP8_EFLAG_NO_REF_ARF |
                          VP8_EFLAG_NO_UPD_GF | VP8_EFLAG_NO_UPD_ARF;
@@ -368,9 +370,9 @@
 
     case 6:
     {
-        // NOTE: Probably of academic interest only
+        /* NOTE: Probably of academic interest only */
 
-        // 5-layers, 16-frame period
+        /* 5-layers, 16-frame period */
         int ids[16] = {0,4,3,4,2,4,3,4,1,4,3,4,2,4,3,4};
         cfg.ts_number_layers     = 5;
         cfg.ts_periodicity       = 16;
@@ -407,7 +409,7 @@
 
     case 7:
     {
-        // 2-layers
+        /* 2-layers */
         int ids[2] = {0,1};
         cfg.ts_number_layers     = 2;
         cfg.ts_periodicity       = 2;
@@ -417,7 +419,7 @@
 
         flag_periodicity = 8;
 
-        // 0=L, 1=GF
+        /* 0=L, 1=GF */
         layer_flags[0] = VPX_EFLAG_FORCE_KF  |
                          VP8_EFLAG_NO_REF_GF | VP8_EFLAG_NO_REF_ARF |
                          VP8_EFLAG_NO_UPD_GF | VP8_EFLAG_NO_UPD_ARF;
@@ -440,7 +442,7 @@
     case 8:
     default:
     {
-        // 3-layers
+        /* 3-layers */
         int ids[4] = {0,2,1,2};
         cfg.ts_number_layers     = 3;
         cfg.ts_periodicity       = 4;
@@ -451,7 +453,7 @@
 
         flag_periodicity = 8;
 
-        // 0=L, 1=GF, 2=ARF
+        /* 0=L, 1=GF, 2=ARF */
         layer_flags[0] = VPX_EFLAG_FORCE_KF  |
                          VP8_EFLAG_NO_REF_GF | VP8_EFLAG_NO_REF_ARF |
                          VP8_EFLAG_NO_UPD_GF | VP8_EFLAG_NO_UPD_ARF;
@@ -472,11 +474,11 @@
     }
     }
 
-    // Open input file
+    /* Open input file */
     if(!(infile = fopen(argv[1], "rb")))
         die("Failed to open %s for reading", argv[1]);
 
-    // Open an output file for each stream
+    /* Open an output file for each stream */
     for (i=0; i<cfg.ts_number_layers; i++)
     {
         char file_name[512];
@@ -486,11 +488,11 @@
         write_ivf_file_header(outfile[i], &cfg, 0);
     }
 
-    // Initialize codec
+    /* Initialize codec */
     if (vpx_codec_enc_init (&codec, interface, &cfg, 0))
         die_codec (&codec, "Failed to initialize encoder");
 
-    // Cap CPU & first I-frame size
+    /* Cap CPU & first I-frame size */
     vpx_codec_control (&codec, VP8E_SET_CPUUSED,                -6);
     vpx_codec_control (&codec, VP8E_SET_STATIC_THRESHOLD,      800);
     vpx_codec_control (&codec, VP8E_SET_NOISE_SENSITIVITY,       1);
@@ -498,12 +500,10 @@
     max_intra_size_pct = (int) (((double)cfg.rc_buf_optimal_sz * 0.5)
                          * ((double) cfg.g_timebase.den / cfg.g_timebase.num)
                          / 10.0);
-    //printf ("max_intra_size_pct=%d\n", max_intra_size_pct);
+    /* printf ("max_intra_size_pct=%d\n", max_intra_size_pct); */
 
     vpx_codec_control(&codec, VP8E_SET_MAX_INTRA_BITRATE_PCT,
                       max_intra_size_pct);
-    //    vpx_codec_control (&codec, VP8E_SET_TOKEN_PARTITIONS,
-    //                      static_cast<vp8e_token_partitions>(_tokenPartitions));
 
     frame_avail = 1;
     while (frame_avail || got_data) {
@@ -517,7 +517,7 @@
                             1, flags, VPX_DL_REALTIME))
             die_codec(&codec, "Failed to encode frame");
 
-        // Reset KF flag
+        /* Reset KF flag */
         if (layering_mode != 6)
             layer_flags[0] &= ~VPX_EFLAG_FORCE_KF;
 
@@ -530,8 +530,8 @@
                                               i<cfg.ts_number_layers; i++)
                 {
                     write_ivf_frame_header(outfile[i], pkt);
-                    if (fwrite(pkt->data.frame.buf, 1, pkt->data.frame.sz,
-                              outfile[i]));
+                    (void) fwrite(pkt->data.frame.buf, 1, pkt->data.frame.sz,
+                                  outfile[i]);
                     frames_in_layer[i]++;
                 }
                 break;
@@ -552,7 +552,7 @@
     if (vpx_codec_destroy(&codec))
             die_codec (&codec, "Failed to destroy codec");
 
-    // Try to rewrite the output file headers with the actual frame count
+    /* Try to rewrite the output file headers with the actual frame count */
     for (i=0; i<cfg.ts_number_layers; i++)
     {
         if (!fseek(outfile[i], 0, SEEK_SET))
@@ -562,4 +562,3 @@
 
     return EXIT_SUCCESS;
 }
-
diff --git a/vpx/internal/vpx_codec_internal.h b/vpx/internal/vpx_codec_internal.h
index 0703d6a..4474331 100644
--- a/vpx/internal/vpx_codec_internal.h
+++ b/vpx/internal/vpx_codec_internal.h
@@ -165,7 +165,7 @@
  * mapping. This implies that ctrl_id values chosen by the algorithm
  * \ref MUST be non-zero.
  */
-typedef const struct
+typedef const struct vpx_codec_ctrl_fn_map
 {
     int                    ctrl_id;
     vpx_codec_control_fn_t   fn;
@@ -280,7 +280,7 @@
  * one mapping must be present, in addition to the end-of-list.
  *
  */
-typedef const struct
+typedef const struct vpx_codec_enc_cfg_map
 {
     int                 usage;
     vpx_codec_enc_cfg_t cfg;
@@ -302,14 +302,14 @@
     vpx_codec_ctrl_fn_map_t  *ctrl_maps;   /**< \copydoc ::vpx_codec_ctrl_fn_map_t */
     vpx_codec_get_mmap_fn_t   get_mmap;    /**< \copydoc ::vpx_codec_get_mmap_fn_t */
     vpx_codec_set_mmap_fn_t   set_mmap;    /**< \copydoc ::vpx_codec_set_mmap_fn_t */
-    struct
+    struct vpx_codec_dec_iface
     {
         vpx_codec_peek_si_fn_t    peek_si;     /**< \copydoc ::vpx_codec_peek_si_fn_t */
         vpx_codec_get_si_fn_t     get_si;      /**< \copydoc ::vpx_codec_peek_si_fn_t */
         vpx_codec_decode_fn_t     decode;      /**< \copydoc ::vpx_codec_decode_fn_t */
         vpx_codec_get_frame_fn_t  get_frame;   /**< \copydoc ::vpx_codec_get_frame_fn_t */
     } dec;
-    struct
+    struct vpx_codec_enc_iface
     {
         vpx_codec_enc_cfg_map_t           *cfg_maps;      /**< \copydoc ::vpx_codec_enc_cfg_map_t */
         vpx_codec_encode_fn_t              encode;        /**< \copydoc ::vpx_codec_encode_fn_t */
diff --git a/vpx/src/vpx_encoder.c b/vpx/src/vpx_encoder.c
index db0120c..73c1c66 100644
--- a/vpx/src/vpx_encoder.c
+++ b/vpx/src/vpx_encoder.c
@@ -133,8 +133,20 @@
 
                 if (res)
                 {
-                    ctx->err_detail = ctx->priv ? ctx->priv->err_detail : NULL;
+                    const char *error_detail =
+                        ctx->priv ? ctx->priv->err_detail : NULL;
+                    /* Destroy current ctx */
+                    ctx->err_detail = error_detail;
                     vpx_codec_destroy(ctx);
+
+                    /* Destroy already allocated high-level ctx */
+                    while (i)
+                    {
+                        ctx--;
+                        ctx->err_detail = error_detail;
+                        vpx_codec_destroy(ctx);
+                        i--;
+                    }
                 }
 
                 if (ctx->priv)
diff --git a/vpx/vp8cx.h b/vpx/vp8cx.h
index 0af631c..a3c95d2 100644
--- a/vpx/vp8cx.h
+++ b/vpx/vp8cx.h
@@ -204,8 +204,8 @@
     unsigned char *roi_map;      /**< specify an id between 0 and 3 for each 16x16 region within a frame */
     unsigned int   rows;         /**< number of rows */
     unsigned int   cols;         /**< number of cols */
-    int     delta_q[4];          /**< quantizer delta [-64, 64] off baseline for regions with id between 0 and 3*/
-    int     delta_lf[4];         /**< loop filter strength delta [-32, 32] for regions with id between 0 and 3 */
+    int     delta_q[4];          /**< quantizer delta [-63, 63] off baseline for regions with id between 0 and 3*/
+    int     delta_lf[4];         /**< loop filter strength delta [-63, 63] for regions with id between 0 and 3 */
     unsigned int   static_threshold[4];/**< threshold for region to be treated as static */
 } vpx_roi_map_t;
 
@@ -234,18 +234,6 @@
     VPX_SCALING_MODE    v_scaling_mode;  /**< vertical scaling mode   */
 } vpx_scaling_mode_t;
 
-/*!\brief VP8 encoding mode
- *
- * This defines VP8 encoding mode
- *
- */
-typedef enum
-{
-    VP8_BEST_QUALITY_ENCODING,
-    VP8_GOOD_QUALITY_ENCODING,
-    VP8_REAL_TIME_ENCODING
-} vp8e_encoding_mode;
-
 /*!\brief VP8 token partition mode
  *
  * This defines VP8 partitioning mode for compressed data, i.e., the number of
@@ -298,12 +286,12 @@
 VPX_CTRL_USE_TYPE(VP8E_SET_NOISE_SENSITIVITY,  unsigned int)
 VPX_CTRL_USE_TYPE(VP8E_SET_SHARPNESS,          unsigned int)
 VPX_CTRL_USE_TYPE(VP8E_SET_STATIC_THRESHOLD,   unsigned int)
-VPX_CTRL_USE_TYPE(VP8E_SET_TOKEN_PARTITIONS,   vp8e_token_partitions)
+VPX_CTRL_USE_TYPE(VP8E_SET_TOKEN_PARTITIONS,   int) /* vp8e_token_partitions */
 
 VPX_CTRL_USE_TYPE(VP8E_SET_ARNR_MAXFRAMES,     unsigned int)
 VPX_CTRL_USE_TYPE(VP8E_SET_ARNR_STRENGTH ,     unsigned int)
 VPX_CTRL_USE_TYPE(VP8E_SET_ARNR_TYPE     ,     unsigned int)
-VPX_CTRL_USE_TYPE(VP8E_SET_TUNING,             vp8e_tuning)
+VPX_CTRL_USE_TYPE(VP8E_SET_TUNING,             int) /* vp8e_tuning */
 VPX_CTRL_USE_TYPE(VP8E_SET_CQ_LEVEL     ,      unsigned int)
 
 VPX_CTRL_USE_TYPE(VP8E_GET_LAST_QUANTIZER,     int *)
diff --git a/vpx/vpx_decoder.h b/vpx/vpx_decoder.h
index 7992cc4..1ccf1c5 100644
--- a/vpx/vpx_decoder.h
+++ b/vpx/vpx_decoder.h
@@ -113,6 +113,10 @@
      * function directly, to ensure that the ABI version number parameter
      * is properly initialized.
      *
+     * If the library was configured with --disable-multithread, this call
+     * is not thread safe and should be guarded with a lock if being used
+     * in a multithreaded context.
+     *
      * In XMA mode (activated by setting VPX_CODEC_USE_XMA in the flags
      * parameter), the storage pointed to by the cfg parameter must be
      * kept readable and stable until all memory maps have been set.
diff --git a/vpx/vpx_encoder.h b/vpx/vpx_encoder.h
index 239036e..67d9033 100644
--- a/vpx/vpx_encoder.h
+++ b/vpx/vpx_encoder.h
@@ -655,6 +655,10 @@
      * function directly, to ensure that the ABI version number parameter
      * is properly initialized.
      *
+     * If the library was configured with --disable-multithread, this call
+     * is not thread safe and should be guarded with a lock if being used
+     * in a multithreaded context.
+     *
      * In XMA mode (activated by setting VPX_CODEC_USE_XMA in the flags
      * parameter), the storage pointed to by the cfg parameter must be
      * kept readable and stable until all memory maps have been set.
diff --git a/vpx_ports/arm_cpudetect.c b/vpx_ports/arm_cpudetect.c
index ebe428d..8ff95a1 100644
--- a/vpx_ports/arm_cpudetect.c
+++ b/vpx_ports/arm_cpudetect.c
@@ -32,8 +32,33 @@
     return env && *env ? (int)strtol(env, NULL, 0) : ~0;
 }
 
+#if !CONFIG_RUNTIME_CPU_DETECT
 
-#if defined(_MSC_VER)
+int arm_cpu_caps(void)
+{
+  /* This function should actually be a no-op. There is no way to adjust any of
+   * these because the RTCD tables do not exist: the functions are called
+   * statically */
+    int flags;
+    int mask;
+    if (!arm_cpu_env_flags(&flags))
+    {
+        return flags;
+    }
+    mask = arm_cpu_env_mask();
+#if HAVE_EDSP
+    flags |= HAS_EDSP;
+#endif /* HAVE_EDSP */
+#if HAVE_MEDIA
+    flags |= HAS_MEDIA;
+#endif /* HAVE_MEDIA */
+#if HAVE_NEON
+    flags |= HAS_NEON;
+#endif /* HAVE_NEON */
+    return flags & mask;
+}
+
+#elif defined(_MSC_VER) /* end !CONFIG_RUNTIME_CPU_DETECT */
 /*For GetExceptionCode() and EXCEPTION_ILLEGAL_INSTRUCTION.*/
 #define WIN32_LEAN_AND_MEAN
 #define WIN32_EXTRA_LEAN
@@ -52,7 +77,7 @@
      *  instructions via their assembled hex code.
      * All of these instructions should be essentially nops.
      */
-#if defined(HAVE_EDSP)
+#if HAVE_EDSP
     if (mask & HAS_EDSP)
     {
         __try
@@ -66,7 +91,7 @@
             /*Ignore exception.*/
         }
     }
-#if defined(HAVE_MEDIA)
+#if HAVE_MEDIA
     if (mask & HAS_MEDIA)
         __try
         {
@@ -79,7 +104,7 @@
             /*Ignore exception.*/
         }
     }
-#if defined(HAVE_NEON)
+#if HAVE_NEON
     if (mask & HAS_NEON)
     {
         __try
@@ -93,14 +118,13 @@
             /*Ignore exception.*/
         }
     }
-#endif
-#endif
-#endif
+#endif /* HAVE_NEON */
+#endif /* HAVE_MEDIA */
+#endif /* HAVE_EDSP */
     return flags & mask;
 }
 
-#elif defined(__linux__)
-#if defined(__ANDROID__)
+#elif defined(__ANDROID__) /* end _MSC_VER */
 #include <cpu-features.h>
 
 int arm_cpu_caps(void)
@@ -115,19 +139,20 @@
     mask = arm_cpu_env_mask();
     features = android_getCpuFeatures();
 
-#if defined(HAVE_EDSP)
+#if HAVE_EDSP
     flags |= HAS_EDSP;
-#endif
-#if defined(HAVE_MEDIA)
+#endif /* HAVE_EDSP */
+#if HAVE_MEDIA
     flags |= HAS_MEDIA;
-#endif
-#if defined(HAVE_NEON)
+#endif /* HAVE_MEDIA */
+#if HAVE_NEON
     if (features & ANDROID_CPU_ARM_FEATURE_NEON)
         flags |= HAS_NEON;
-#endif
+#endif /* HAVE_NEON */
     return flags & mask;
 }
-#else // !defined(__ANDROID__)
+
+#elif defined(__linux__) /* end __ANDROID__ */
 #include <stdio.h>
 
 int arm_cpu_caps(void)
@@ -153,27 +178,27 @@
         char buf[512];
         while (fgets(buf, 511, fin) != NULL)
         {
-#if defined(HAVE_EDSP) || defined(HAVE_NEON)
+#if HAVE_EDSP || HAVE_NEON
             if (memcmp(buf, "Features", 8) == 0)
             {
                 char *p;
-#if defined(HAVE_EDSP)
+#if HAVE_EDSP
                 p=strstr(buf, " edsp");
                 if (p != NULL && (p[5] == ' ' || p[5] == '\n'))
                 {
                     flags |= HAS_EDSP;
                 }
-#if defined(HAVE_NEON)
+#if HAVE_NEON
                 p = strstr(buf, " neon");
                 if (p != NULL && (p[5] == ' ' || p[5] == '\n'))
                 {
                     flags |= HAS_NEON;
                 }
-#endif
-#endif
+#endif /* HAVE_NEON */
+#endif /* HAVE_EDSP */
             }
-#endif
-#if defined(HAVE_MEDIA)
+#endif /* HAVE_EDSP || HAVE_NEON */
+#if HAVE_MEDIA
             if (memcmp(buf, "CPU architecture:",17) == 0){
                 int version;
                 version = atoi(buf+17);
@@ -182,37 +207,13 @@
                     flags |= HAS_MEDIA;
                 }
             }
-#endif
+#endif /* HAVE_MEDIA */
         }
         fclose(fin);
     }
     return flags & mask;
 }
-#endif // defined(__linux__)
-#elif !CONFIG_RUNTIME_CPU_DETECT
-
-int arm_cpu_caps(void)
-{
-    int flags;
-    int mask;
-    if (!arm_cpu_env_flags(&flags))
-    {
-        return flags;
-    }
-    mask = arm_cpu_env_mask();
-#if defined(HAVE_EDSP)
-    flags |= HAS_EDSP;
-#endif
-#if defined(HAVE_MEDIA)
-    flags |= HAS_MEDIA;
-#endif
-#if defined(HAVE_NEON)
-    flags |= HAS_NEON;
-#endif
-    return flags & mask;
-}
-
-#else
+#else /* end __linux__ */
 #error "--enable-runtime-cpu-detect selected, but no CPU detection method " \
- "available for your platform. Reconfigure without --enable-runtime-cpu-detect."
+ "available for your platform. Reconfigure with --disable-runtime-cpu-detect."
 #endif
diff --git a/vpx_ports/asm_offsets.h b/vpx_ports/asm_offsets.h
index fc1287e..7b6ae4a 100644
--- a/vpx_ports/asm_offsets.h
+++ b/vpx_ports/asm_offsets.h
@@ -19,11 +19,11 @@
     static void assert_##name(void) {switch(0){case 0:case !!(cond):;}}
 
 #if INLINE_ASM
-#define DEFINE(sym, val) asm("\n" #sym " EQU %0" : : "i" (val));
+#define DEFINE(sym, val) asm("\n" #sym " EQU %0" : : "i" (val))
 #define BEGIN int main(void) {
 #define END return 0; }
 #else
-#define DEFINE(sym, val) const int sym = val;
+#define DEFINE(sym, val) const int sym = val
 #define BEGIN
 #define END
 #endif
diff --git a/vpx_ports/mem_ops.h b/vpx_ports/mem_ops.h
index 0e52368..dec28d5 100644
--- a/vpx_ports/mem_ops.h
+++ b/vpx_ports/mem_ops.h
@@ -145,27 +145,27 @@
 
 #undef  mem_get_sbe16
 #define mem_get_sbe16 mem_ops_wrap_symbol(mem_get_sbe16)
-mem_get_s_generic(be, 16);
+mem_get_s_generic(be, 16)
 
 #undef  mem_get_sbe24
 #define mem_get_sbe24 mem_ops_wrap_symbol(mem_get_sbe24)
-mem_get_s_generic(be, 24);
+mem_get_s_generic(be, 24)
 
 #undef  mem_get_sbe32
 #define mem_get_sbe32 mem_ops_wrap_symbol(mem_get_sbe32)
-mem_get_s_generic(be, 32);
+mem_get_s_generic(be, 32)
 
 #undef  mem_get_sle16
 #define mem_get_sle16 mem_ops_wrap_symbol(mem_get_sle16)
-mem_get_s_generic(le, 16);
+mem_get_s_generic(le, 16)
 
 #undef  mem_get_sle24
 #define mem_get_sle24 mem_ops_wrap_symbol(mem_get_sle24)
-mem_get_s_generic(le, 24);
+mem_get_s_generic(le, 24)
 
 #undef  mem_get_sle32
 #define mem_get_sle32 mem_ops_wrap_symbol(mem_get_sle32)
-mem_get_s_generic(le, 32);
+mem_get_s_generic(le, 32)
 
 #undef  mem_put_be16
 #define mem_put_be16 mem_ops_wrap_symbol(mem_put_be16)
diff --git a/vpx_ports/mem_ops_aligned.h b/vpx_ports/mem_ops_aligned.h
index 0fbba65..fca653a 100644
--- a/vpx_ports/mem_ops_aligned.h
+++ b/vpx_ports/mem_ops_aligned.h
@@ -99,51 +99,51 @@
 
 #undef  mem_get_be16_aligned
 #define mem_get_be16_aligned mem_ops_wrap_symbol(mem_get_be16_aligned)
-mem_get_be_aligned_generic(16);
+mem_get_be_aligned_generic(16)
 
 #undef  mem_get_be32_aligned
 #define mem_get_be32_aligned mem_ops_wrap_symbol(mem_get_be32_aligned)
-mem_get_be_aligned_generic(32);
+mem_get_be_aligned_generic(32)
 
 #undef  mem_get_le16_aligned
 #define mem_get_le16_aligned mem_ops_wrap_symbol(mem_get_le16_aligned)
-mem_get_le_aligned_generic(16);
+mem_get_le_aligned_generic(16)
 
 #undef  mem_get_le32_aligned
 #define mem_get_le32_aligned mem_ops_wrap_symbol(mem_get_le32_aligned)
-mem_get_le_aligned_generic(32);
+mem_get_le_aligned_generic(32)
 
 #undef  mem_get_sbe16_aligned
 #define mem_get_sbe16_aligned mem_ops_wrap_symbol(mem_get_sbe16_aligned)
-mem_get_sbe_aligned_generic(16);
+mem_get_sbe_aligned_generic(16)
 
 #undef  mem_get_sbe32_aligned
 #define mem_get_sbe32_aligned mem_ops_wrap_symbol(mem_get_sbe32_aligned)
-mem_get_sbe_aligned_generic(32);
+mem_get_sbe_aligned_generic(32)
 
 #undef  mem_get_sle16_aligned
 #define mem_get_sle16_aligned mem_ops_wrap_symbol(mem_get_sle16_aligned)
-mem_get_sle_aligned_generic(16);
+mem_get_sle_aligned_generic(16)
 
 #undef  mem_get_sle32_aligned
 #define mem_get_sle32_aligned mem_ops_wrap_symbol(mem_get_sle32_aligned)
-mem_get_sle_aligned_generic(32);
+mem_get_sle_aligned_generic(32)
 
 #undef  mem_put_be16_aligned
 #define mem_put_be16_aligned mem_ops_wrap_symbol(mem_put_be16_aligned)
-mem_put_be_aligned_generic(16);
+mem_put_be_aligned_generic(16)
 
 #undef  mem_put_be32_aligned
 #define mem_put_be32_aligned mem_ops_wrap_symbol(mem_put_be32_aligned)
-mem_put_be_aligned_generic(32);
+mem_put_be_aligned_generic(32)
 
 #undef  mem_put_le16_aligned
 #define mem_put_le16_aligned mem_ops_wrap_symbol(mem_put_le16_aligned)
-mem_put_le_aligned_generic(16);
+mem_put_le_aligned_generic(16)
 
 #undef  mem_put_le32_aligned
 #define mem_put_le32_aligned mem_ops_wrap_symbol(mem_put_le32_aligned)
-mem_put_le_aligned_generic(32);
+mem_put_le_aligned_generic(32)
 
 #undef mem_get_ne_aligned_generic
 #undef mem_get_se_aligned_generic
diff --git a/vpx_ports/x86.h b/vpx_ports/x86.h
index 1341c7f..9dd8c4b 100644
--- a/vpx_ports/x86.h
+++ b/vpx_ports/x86.h
@@ -162,7 +162,7 @@
     return tsc;
 #else
 #if ARCH_X86_64
-    return __rdtsc();
+    return (unsigned int)__rdtsc();
 #else
     __asm  rdtsc;
 #endif
diff --git a/vpx_scale/arm/neon/yv12extend_arm.c b/vpx_scale/arm/neon/yv12extend_arm.c
index 7529fc6..6f34ce3 100644
--- a/vpx_scale/arm/neon/yv12extend_arm.c
+++ b/vpx_scale/arm/neon/yv12extend_arm.c
@@ -8,16 +8,13 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
+#include "vpx_rtcd.h"
 
-#include "vpx_scale/yv12config.h"
-#include "vpx_mem/vpx_mem.h"
-#include "vpx_scale/vpxscale.h"
+extern void vp8_yv12_copy_frame_func_neon(struct yv12_buffer_config *src_ybc,
+                                          struct yv12_buffer_config *dst_ybc);
 
-extern void vp8_yv12_copy_frame_func_neon(YV12_BUFFER_CONFIG *src_ybc,
-                                          YV12_BUFFER_CONFIG *dst_ybc);
-
-void vp8_yv12_copy_frame_neon(YV12_BUFFER_CONFIG *src_ybc,
-                              YV12_BUFFER_CONFIG *dst_ybc)
+void vp8_yv12_copy_frame_neon(struct yv12_buffer_config *src_ybc,
+                              struct yv12_buffer_config *dst_ybc)
 {
     vp8_yv12_copy_frame_func_neon(src_ybc, dst_ybc);
 
diff --git a/vpxdec.c b/vpxdec.c
index 4482f3d..8657905 100644
--- a/vpxdec.c
+++ b/vpxdec.c
@@ -52,7 +52,7 @@
 static const struct
 {
     char const *name;
-    const vpx_codec_iface_t *iface;
+    vpx_codec_iface_t *iface;
     unsigned int             fourcc;
     unsigned int             fourcc_mask;
 } ifaces[] =
@@ -152,7 +152,8 @@
             "write to. If the\n  argument does not include any escape "
             "characters, the output will be\n  written to a single file. "
             "Otherwise, the filename will be calculated by\n  expanding "
-            "the following escape characters:\n"
+            "the following escape characters:\n");
+    fprintf(stderr,
             "\n\t%%w   - Frame width"
             "\n\t%%h   - Frame height"
             "\n\t%%<n> - Frame number, zero padded to <n> places (1..9)"
@@ -356,7 +357,7 @@
     }
     else
     {
-        if(fwrite(buf, 1, len, out));
+        (void) fwrite(buf, 1, len, out);
     }
 }
 
@@ -502,7 +503,7 @@
         case NESTEGG_SEEK_CUR: whence = SEEK_CUR; break;
         case NESTEGG_SEEK_END: whence = SEEK_END; break;
     };
-    return fseek(userdata, offset, whence)? -1 : 0;
+    return fseek(userdata, (long)offset, whence)? -1 : 0;
 }
 
 
@@ -559,7 +560,7 @@
         goto fail;
 
     *fps_num = (i - 1) * 1000000;
-    *fps_den = tstamp / 1000;
+    *fps_den = (unsigned int)(tstamp / 1000);
     return 0;
 fail:
     nestegg_destroy(input->nestegg_ctx);
@@ -580,10 +581,10 @@
     unsigned int i, n;
     int          track_type = -1;
 
-    nestegg_io io = {nestegg_read_cb, nestegg_seek_cb, nestegg_tell_cb,
-                     input->infile};
+    nestegg_io io = {nestegg_read_cb, nestegg_seek_cb, nestegg_tell_cb, 0};
     nestegg_video_params params;
 
+    io.userdata = input->infile;
     if(nestegg_init(&input->nestegg_ctx, io, NULL))
         goto fail;
 
@@ -647,7 +648,7 @@
         {
             size_t pat_len;
 
-            // parse the pattern
+            /* parse the pattern */
             q[q_len - 1] = '\0';
             switch(p[1])
             {
@@ -677,7 +678,7 @@
         {
             size_t copy_len;
 
-            // copy the next segment
+            /* copy the next segment */
             if(!next_pat)
                 copy_len = strlen(p);
             else
@@ -922,7 +923,7 @@
             p = strchr(p, '%');
             if(p && p[1] >= '1' && p[1] <= '9')
             {
-                // pattern contains sequence number, so it's not unique.
+                /* pattern contains sequence number, so it's not unique. */
                 single_file = 0;
                 break;
             }
@@ -962,7 +963,8 @@
           That will have to wait until these tools support WebM natively.*/
         sprintf(buffer, "YUV4MPEG2 C%s W%u H%u F%u:%u I%c\n",
                 "420jpeg", width, height, fps_num, fps_den, 'p');
-        out_put(out, (unsigned char *)buffer, strlen(buffer), do_md5);
+        out_put(out, (unsigned char *)buffer,
+                (unsigned int)strlen(buffer), do_md5);
     }
 
     /* Try to determine the codec from the fourcc. */
@@ -1040,7 +1042,7 @@
 
         vpx_usec_timer_start(&timer);
 
-        if (vpx_codec_decode(&decoder, buf, buf_sz, NULL, 0))
+        if (vpx_codec_decode(&decoder, buf, (unsigned int)buf_sz, NULL, 0))
         {
             const char *detail = vpx_codec_error_detail(&decoder);
             fprintf(stderr, "Failed to decode frame: %s\n", vpx_codec_error(&decoder));
@@ -1052,7 +1054,7 @@
         }
 
         vpx_usec_timer_mark(&timer);
-        dx_time += vpx_usec_timer_elapsed(&timer);
+        dx_time += (unsigned int)vpx_usec_timer_elapsed(&timer);
 
         ++frame_in;
 
diff --git a/vpxenc.c b/vpxenc.c
index b9aa8e1..d68e40e 100644
--- a/vpxenc.c
+++ b/vpxenc.c
@@ -54,11 +54,7 @@
 #define off_t off64_t
 #endif
 
-#if defined(_MSC_VER)
-#define LITERALU64(n) n
-#else
-#define LITERALU64(n) n##LLU
-#endif
+#define LITERALU64(hi,lo) ((((uint64_t)hi)<<32)|lo)
 
 /* We should use 32-bit file operations in WebM file format
  * when building ARM executable file (.axf) with RVCT */
@@ -68,12 +64,28 @@
 #define ftello ftell
 #endif
 
+/* Swallow warnings about unused results of fread/fwrite */
+static size_t wrap_fread(void *ptr, size_t size, size_t nmemb,
+                         FILE *stream)
+{
+    return fread(ptr, size, nmemb, stream);
+}
+#define fread wrap_fread
+
+static size_t wrap_fwrite(const void *ptr, size_t size, size_t nmemb,
+                          FILE *stream)
+{
+    return fwrite(ptr, size, nmemb, stream);
+}
+#define fwrite wrap_fwrite
+
+
 static const char *exec_name;
 
 static const struct codec_item
 {
     char const              *name;
-    const vpx_codec_iface_t *iface;
+    vpx_codec_iface_t       *iface;
     unsigned int             fourcc;
 } codecs[] =
 {
@@ -245,7 +257,7 @@
 {
     if (stats->file)
     {
-        if(fwrite(pkt, 1, len, stats->file));
+        (void) fwrite(pkt, 1, len, stats->file);
     }
     else
     {
@@ -338,7 +350,7 @@
              * write_ivf_frame_header() for documentation on the frame header
              * layout.
              */
-            if(fread(junk, 1, IVF_FRAME_HDR_SZ, f));
+            (void) fread(junk, 1, IVF_FRAME_HDR_SZ, f);
         }
 
         for (plane = 0; plane < 3; plane++)
@@ -468,7 +480,7 @@
     mem_put_le32(header + 24, frame_cnt);         /* length */
     mem_put_le32(header + 28, 0);                 /* unused */
 
-    if(fwrite(header, 1, 32, outfile));
+    (void) fwrite(header, 1, 32, outfile);
 }
 
 
@@ -482,18 +494,18 @@
         return;
 
     pts = pkt->data.frame.pts;
-    mem_put_le32(header, pkt->data.frame.sz);
+    mem_put_le32(header, (int)pkt->data.frame.sz);
     mem_put_le32(header + 4, pts & 0xFFFFFFFF);
     mem_put_le32(header + 8, pts >> 32);
 
-    if(fwrite(header, 1, 12, outfile));
+    (void) fwrite(header, 1, 12, outfile);
 }
 
 static void write_ivf_frame_size(FILE *outfile, size_t size)
 {
     char             header[4];
-    mem_put_le32(header, size);
-    fwrite(header, 1, 4, outfile);
+    mem_put_le32(header, (int)size);
+    (void) fwrite(header, 1, 4, outfile);
 }
 
 
@@ -541,13 +553,13 @@
 
 void Ebml_Write(EbmlGlobal *glob, const void *buffer_in, unsigned long len)
 {
-    if(fwrite(buffer_in, 1, len, glob->stream));
+    (void) fwrite(buffer_in, 1, len, glob->stream);
 }
 
 #define WRITE_BUFFER(s) \
 for(i = len-1; i>=0; i--)\
 { \
-    x = *(const s *)buffer_in >> (i * CHAR_BIT); \
+    x = (char)(*(const s *)buffer_in >> (i * CHAR_BIT)); \
     Ebml_Write(glob, &x, 1); \
 }
 void Ebml_Serialize(EbmlGlobal *glob, const void *buffer_in, int buffer_size, unsigned long len)
@@ -597,9 +609,9 @@
 Ebml_StartSubElement(EbmlGlobal *glob, EbmlLoc *ebmlLoc,
                           unsigned long class_id)
 {
-    //todo this is always taking 8 bytes, this may need later optimization
-    //this is a key that says length unknown
-    uint64_t unknownLen =  LITERALU64(0x01FFFFFFFFFFFFFF);
+    /* todo this is always taking 8 bytes, this may need later optimization */
+    /* this is a key that says length unknown */
+    uint64_t unknownLen = LITERALU64(0x01FFFFFF, 0xFFFFFFFF);
 
     Ebml_WriteID(glob, class_id);
     *ebmlLoc = ftello(glob->stream);
@@ -617,7 +629,7 @@
 
     /* Calculate the size of this element */
     size = pos - *ebmlLoc - 8;
-    size |=  LITERALU64(0x0100000000000000);
+    size |= LITERALU64(0x01000000,0x00000000);
 
     /* Seek back to the beginning of the element and write the new size */
     fseeko(glob->stream, *ebmlLoc, SEEK_SET);
@@ -664,7 +676,7 @@
         Ebml_EndSubElement(ebml, &start);
     }
     {
-        //segment info
+        /* segment info */
         EbmlLoc startInfo;
         uint64_t frame_time;
         char version_string[64];
@@ -686,7 +698,7 @@
         Ebml_StartSubElement(ebml, &startInfo, Info);
         Ebml_SerializeUnsigned(ebml, TimecodeScale, 1000000);
         Ebml_SerializeFloat(ebml, Segment_Duration,
-                            ebml->last_pts_ms + frame_time);
+                            (double)(ebml->last_pts_ms + frame_time));
         Ebml_SerializeString(ebml, 0x4D80, version_string);
         Ebml_SerializeString(ebml, 0x5741, version_string);
         Ebml_EndSubElement(ebml, &startInfo);
@@ -704,16 +716,16 @@
         EbmlLoc start;
         Ebml_StartSubElement(glob, &start, EBML);
         Ebml_SerializeUnsigned(glob, EBMLVersion, 1);
-        Ebml_SerializeUnsigned(glob, EBMLReadVersion, 1); //EBML Read Version
-        Ebml_SerializeUnsigned(glob, EBMLMaxIDLength, 4); //EBML Max ID Length
-        Ebml_SerializeUnsigned(glob, EBMLMaxSizeLength, 8); //EBML Max Size Length
-        Ebml_SerializeString(glob, DocType, "webm"); //Doc Type
-        Ebml_SerializeUnsigned(glob, DocTypeVersion, 2); //Doc Type Version
-        Ebml_SerializeUnsigned(glob, DocTypeReadVersion, 2); //Doc Type Read Version
+        Ebml_SerializeUnsigned(glob, EBMLReadVersion, 1);
+        Ebml_SerializeUnsigned(glob, EBMLMaxIDLength, 4);
+        Ebml_SerializeUnsigned(glob, EBMLMaxSizeLength, 8);
+        Ebml_SerializeString(glob, DocType, "webm");
+        Ebml_SerializeUnsigned(glob, DocTypeVersion, 2);
+        Ebml_SerializeUnsigned(glob, DocTypeReadVersion, 2);
         Ebml_EndSubElement(glob, &start);
     }
     {
-        Ebml_StartSubElement(glob, &glob->startSegment, Segment); //segment
+        Ebml_StartSubElement(glob, &glob->startSegment, Segment);
         glob->position_reference = ftello(glob->stream);
         glob->framerate = *fps;
         write_webm_seek_info(glob);
@@ -731,7 +743,7 @@
                 Ebml_SerializeUnsigned(glob, TrackNumber, trackNumber);
                 glob->track_id_pos = ftello(glob->stream);
                 Ebml_SerializeUnsigned32(glob, TrackUID, trackID);
-                Ebml_SerializeUnsigned(glob, TrackType, 1); //video is always 1
+                Ebml_SerializeUnsigned(glob, TrackType, 1);
                 Ebml_SerializeString(glob, CodecID, "V_VP8");
                 {
                     unsigned int pixelWidth = cfg->g_w;
@@ -744,13 +756,13 @@
                     Ebml_SerializeUnsigned(glob, PixelHeight, pixelHeight);
                     Ebml_SerializeUnsigned(glob, StereoMode, stereo_fmt);
                     Ebml_SerializeFloat(glob, FrameRate, frameRate);
-                    Ebml_EndSubElement(glob, &videoStart); //Video
+                    Ebml_EndSubElement(glob, &videoStart);
                 }
-                Ebml_EndSubElement(glob, &start); //Track Entry
+                Ebml_EndSubElement(glob, &start); /* Track Entry */
             }
             Ebml_EndSubElement(glob, &trackStart);
         }
-        // segment element is open
+        /* segment element is open */
     }
 }
 
@@ -778,7 +790,7 @@
     if(pts_ms - glob->cluster_timecode > SHRT_MAX)
         start_cluster = 1;
     else
-        block_timecode = pts_ms - glob->cluster_timecode;
+        block_timecode = (unsigned short)pts_ms - glob->cluster_timecode;
 
     is_keyframe = (pkt->data.frame.flags & VPX_FRAME_IS_KEY);
     if(start_cluster || is_keyframe)
@@ -789,9 +801,9 @@
         /* Open the new cluster */
         block_timecode = 0;
         glob->cluster_open = 1;
-        glob->cluster_timecode = pts_ms;
+        glob->cluster_timecode = (uint32_t)pts_ms;
         glob->cluster_pos = ftello(glob->stream);
-        Ebml_StartSubElement(glob, &glob->startCluster, Cluster); //cluster
+        Ebml_StartSubElement(glob, &glob->startCluster, Cluster); /* cluster */
         Ebml_SerializeUnsigned(glob, Timecode, glob->cluster_timecode);
 
         /* Save a cue point if this is a keyframe. */
@@ -816,7 +828,7 @@
     /* Write the Simple Block */
     Ebml_WriteID(glob, SimpleBlock);
 
-    block_length = pkt->data.frame.sz + 4;
+    block_length = (unsigned long)pkt->data.frame.sz + 4;
     block_length |= 0x10000000;
     Ebml_Serialize(glob, &block_length, sizeof(block_length), 4);
 
@@ -833,7 +845,7 @@
         flags |= 0x08;
     Ebml_Write(glob, &flags, 1);
 
-    Ebml_Write(glob, pkt->data.frame.buf, pkt->data.frame.sz);
+    Ebml_Write(glob, pkt->data.frame.buf, (unsigned long)pkt->data.frame.sz);
 }
 
 
@@ -865,7 +877,6 @@
                 Ebml_SerializeUnsigned(glob, CueTrack, 1);
                 Ebml_SerializeUnsigned64(glob, CueClusterPosition,
                                          cue->loc - glob->position_reference);
-                //Ebml_SerializeUnsigned(glob, CueBlockNumber, cue->blockNumber);
                 Ebml_EndSubElement(glob, &start);
             }
             Ebml_EndSubElement(glob, &start);
@@ -942,7 +953,7 @@
     if ((double)Mse > 0.0)
         psnr = 10.0 * log10(Peak * Peak * Samples / Mse);
     else
-        psnr = 60;      // Limit to prevent / 0
+        psnr = 60;      /* Limit to prevent / 0 */
 
     if (psnr > 60)
         psnr = 60;
@@ -1225,7 +1236,7 @@
     {
         int last_bucket = buckets - 1;
 
-        // merge the small bucket with an adjacent one.
+        /* merge the small bucket with an adjacent one. */
         if(small_bucket == 0)
             merge_bucket = 1;
         else if(small_bucket == last_bucket)
@@ -1325,7 +1336,7 @@
         int j;
         float pct;
 
-        pct = 100.0 * (float)bucket[i].count / (float)total;
+        pct = (float)(100.0 * bucket[i].count / total);
         len = HIST_BAR_MAX * bucket[i].count / scale;
         if(len < 1)
             len = 1;
@@ -1393,7 +1404,7 @@
      */
     hist->samples = cfg->rc_buf_sz * 5 / 4 * fps->num / fps->den / 1000;
 
-    // prevent division by zero
+    /* prevent division by zero */
     if (hist->samples == 0)
       hist->samples=1;
 
@@ -1427,7 +1438,7 @@
 
     idx = hist->frames++ % hist->samples;
     hist->pts[idx] = now;
-    hist->sz[idx] = pkt->data.frame.sz;
+    hist->sz[idx] = (int)pkt->data.frame.sz;
 
     if(now < cfg->rc_buf_initial_sz)
         return;
@@ -1449,15 +1460,15 @@
         return;
 
     avg_bitrate = sum_sz * 8 * 1000 / (now - then);
-    idx = avg_bitrate * (RATE_BINS/2) / (cfg->rc_target_bitrate * 1000);
+    idx = (int)(avg_bitrate * (RATE_BINS/2) / (cfg->rc_target_bitrate * 1000));
     if(idx < 0)
         idx = 0;
     if(idx > RATE_BINS-1)
         idx = RATE_BINS-1;
     if(hist->bucket[idx].low > avg_bitrate)
-        hist->bucket[idx].low = avg_bitrate;
+        hist->bucket[idx].low = (int)avg_bitrate;
     if(hist->bucket[idx].high < avg_bitrate)
-        hist->bucket[idx].high = avg_bitrate;
+        hist->bucket[idx].high = (int)avg_bitrate;
     hist->bucket[idx].count++;
     hist->total++;
 }
@@ -2000,7 +2011,7 @@
     {
         double framerate = (double)global->framerate.num/global->framerate.den;
         if (framerate > 0.0)
-            stream->config.cfg.kf_max_dist = 5.0*framerate;
+            stream->config.cfg.kf_max_dist = (unsigned int)(5.0*framerate);
     }
 }
 
@@ -2180,7 +2191,7 @@
                         / cfg->g_timebase.num / global->framerate.num;
     vpx_usec_timer_start(&timer);
     vpx_codec_encode(&stream->encoder, img, frame_start,
-                     next_frame_start - frame_start,
+                     (unsigned long)(next_frame_start - frame_start),
                      0, global->deadline);
     vpx_usec_timer_mark(&timer);
     stream->cx_time += vpx_usec_timer_elapsed(&timer);
@@ -2233,7 +2244,8 @@
                 /* Update the hash */
                 if(!stream->ebml.debug)
                     stream->hash = murmur(pkt->data.frame.buf,
-                                          pkt->data.frame.sz, stream->hash);
+                                          (int)pkt->data.frame.sz,
+                                          stream->hash);
 
                 write_webm_block(&stream->ebml, cfg, pkt);
             }
@@ -2259,8 +2271,8 @@
                     }
                 }
 
-                fwrite(pkt->data.frame.buf, 1,
-                       pkt->data.frame.sz, stream->file);
+                (void) fwrite(pkt->data.frame.buf, 1, pkt->data.frame.sz,
+                              stream->file);
             }
             stream->nbytes += pkt->data.raw.sz;
             break;
@@ -2283,7 +2295,7 @@
                 stream->psnr_samples_total += pkt->data.psnr.samples[0];
                 for (i = 0; i < 4; i++)
                 {
-                    fprintf(stderr, "%.3lf ", pkt->data.psnr.psnr[i]);
+                    fprintf(stderr, "%.3f ", pkt->data.psnr.psnr[i]);
                     stream->psnr_totals[i] += pkt->data.psnr.psnr[i];
                 }
                 stream->psnr_count++;
@@ -2306,13 +2318,13 @@
         return;
 
     fprintf(stderr, "Stream %d PSNR (Overall/Avg/Y/U/V)", stream->index);
-    ovpsnr = vp8_mse2psnr(stream->psnr_samples_total, 255.0,
-                          stream->psnr_sse_total);
-    fprintf(stderr, " %.3lf", ovpsnr);
+    ovpsnr = vp8_mse2psnr((double)stream->psnr_samples_total, 255.0,
+                          (double)stream->psnr_sse_total);
+    fprintf(stderr, " %.3f", ovpsnr);
 
     for (i = 0; i < 4; i++)
     {
-        fprintf(stderr, " %.3lf", stream->psnr_totals[i]/stream->psnr_count);
+        fprintf(stderr, " %.3f", stream->psnr_totals[i]/stream->psnr_count);
     }
     fprintf(stderr, "\n");
 }
@@ -2320,7 +2332,7 @@
 
 float usec_to_fps(uint64_t usec, unsigned int frames)
 {
-    return usec > 0 ? (float)frames * 1000000.0 / (float)usec : 0;
+    return (float)(usec > 0 ? frames * 1000000.0 / (float)usec : 0);
 }
 
 
@@ -2484,7 +2496,7 @@
                                         frame_avail ? &raw : NULL,
                                         frames_in));
             vpx_usec_timer_mark(&timer);
-            cx_time += vpx_usec_timer_elapsed(&timer);
+            cx_time += (unsigned long)vpx_usec_timer_elapsed(&timer);
 
             FOREACH_STREAM(update_quantizer_histogram(stream));
 
diff --git a/y4minput.c b/y4minput.c
index dd51421..ff9ffbc 100644
--- a/y4minput.c
+++ b/y4minput.c
@@ -662,7 +662,7 @@
       _nskip--;
     }
     else{
-      ret=fread(buffer+i,1,1,_fin);
+      ret=(int)fread(buffer+i,1,1,_fin);
       if(ret<1)return -1;
     }
     if(buffer[i]=='\n')break;
@@ -818,7 +818,7 @@
   int  c_sz;
   int  ret;
   /*Read and skip the frame header.*/
-  ret=fread(frame,1,6,_fin);
+  ret=(int)fread(frame,1,6,_fin);
   if(ret<6)return 0;
   if(memcmp(frame,"FRAME",5)){
     fprintf(stderr,"Loss of framing in Y4M input data\n");