Merge "configure: enable unused variable warnings"

diff --git a/.mailmap b/.mailmap
index f77ff26..5052f29 100644
--- a/.mailmap
+++ b/.mailmap

@@ -1,2 +1,4 @@
 Adrian Grange <agrange@google.com>
 Johann Koenig <johannkoenig@google.com>
+Tero Rintaluoma <teror@google.com> <tero.rintaluoma@on2.com>
+Tom Finegan <tomfinegan@google.com>

diff --git a/AUTHORS b/AUTHORS
index 110e5e1..b8fc45e 100644
--- a/AUTHORS
+++ b/AUTHORS

@@ -4,13 +4,18 @@
 Aaron Watry <awatry@gmail.com>
 Adrian Grange <agrange@google.com>
 Alex Converse <alex.converse@gmail.com>
+Andoni Morales Alastruey <ylatuya@gmail.com>
 Andres Mejia <mcitadel@gmail.com>
+Attila Nagy <attilanagy@google.com>
 Fabio Pedretti <fabio.ped@libero.it>
 Frank Galligan <fgalligan@google.com>
 Fredrik Söderquist <fs@opera.com>
 Fritz Koenig <frkoenig@google.com>
+Gaute Strokkenes <gaute.strokkenes@broadcom.com>
 Giuseppe Scrivano <gscrivano@gnu.org>
 Guillermo Ballester Valor <gbvalor@gmail.com>
+Henrik Lundin <hlundin@google.com>
+James Berry <jamesberry@google.com>
 James Zern <jzern@google.com>
 Jan Kratochvil <jan.kratochvil@redhat.com>
 Jeff Muizelaar <jmuizelaar@mozilla.com>
@@ -23,10 +28,14 @@
 Makoto Kato <makoto.kt@gmail.com>
 Martin Ettl <ettl.martin78@googlemail.com>
 Michael Kohler <michaelkohler@live.com>
+Mikhal Shemer <mikhal@google.com>
+Pascal Massimino <pascal.massimino@gmail.com>
+Patrik Westin <patrik.westin@gmail.com>
 Paul Wilkins <paulwilkins@google.com>
 Pavol Rusnak <stick@gk2.sk>
 Philip Jägenstedt <philipj@opera.com>
 Scott LaVarnway <slavarnway@google.com>
+Tero Rintaluoma <teror@google.com>
 Timothy B. Terriberry <tterribe@xiph.org>
 Tom Finegan <tomfinegan@google.com>
 Yaowu Xu <yaowu@google.com>

diff --git a/CHANGELOG b/CHANGELOG
index b8da8f8..e8760d1 100644
--- a/CHANGELOG
+++ b/CHANGELOG

@@ -1,3 +1,80 @@
+2011-03-07 v0.9.6 "Bali"
+  Our second named release, focused on a faster, higher quality, encoder.
+
+  - Upgrading:
+    This release is backwards compatible with Aylesbury (v0.9.5). Users
+    of older releases should refer to the Upgrading notes in this
+    document for that release.
+
+  - Enhancements:
+      vpxenc --psnr shows a summary when encode completes
+      --tune=ssim option to enable activity masking
+      improved postproc visualizations for development
+      updated support for Apple iOS to SDK 4.2
+      query decoder to determine which reference frames were updated
+      implemented error tracking in the decoder
+      fix pipe support on windows
+
+  - Speed:
+      Primary focus was on good quality mode, speed 0. Average improvement
+      on x86 about 40%, up to 100% on user-generated content at that speed.
+      Best quality mode speed improved 35%, and realtime speed 10-20%. This
+      release also saw significant improvement in realtime encoding speed
+      on ARM platforms.
+
+        Improved encoder threading
+        Dont pick encoder filter level when loopfilter is disabled.
+        Avoid double copying of key frames into alt and golden buffer
+        FDCT optimizations.
+        x86 sse2 temporal filter
+        SSSE3 version of fast quantizer
+        vp8_rd_pick_best_mbsegmentation code restructure
+        Adjusted breakout RD for SPLITMV
+        Changed segmentation check order
+        Improved rd_pick_intra4x4block
+        Adds armv6 optimized variance calculation
+        ARMv6 optimized sad16x16
+        ARMv6 optimized half pixel variance calculations
+        Full search SAD function optimization in SSE4.1
+        Improve MV prediction accuracy to achieve performance gain
+        Improve MV prediction in vp8_pick_inter_mode() for speed>3
+
+  - Quality:
+      Best quality mode improved PSNR 6.3%, and SSIM 6.1%. This release
+      also includes support for "activity masking," which greatly improves
+      SSIM at the expense of PSNR. For now, this feature is available with
+      the --tune=ssim option. Further experimentation in this area
+      is ongoing. This release also introduces a new rate control mode
+      called "CQ," which changes the allocation of bits within a clip to
+      the sections where they will have the most visual impact.
+
+        Tuning for the more exact quantizer.
+        Relax rate control for last few frames
+        CQ Mode
+        Limit key frame quantizer for forced key frames.
+        KF/GF Pulsing
+        Add simple version of activity masking.
+        make rdmult adaptive for intra in quantizer RDO
+        cap the best quantizer for 2nd order DC
+        change the threshold of DC check for encode breakout
+
+  - Bug Fixes:
+      Fix crash on Sparc Solaris.
+      Fix counter of fixed keyframe distance
+      ARNR filter pointer update bug fix
+      Fixed use of motion percentage in KF/GF group calc
+      Changed condition for using RD in Intra Mode
+      Fix encoder real-time only configuration.
+      Fix ARM encoder crash with multiple token partitions
+      Fixed bug first cluster timecode of webm file is wrong.
+      Fixed various encoder bugs with odd-sized images
+      vp8e_get_preview fixed when spatial resampling enabled
+      quantizer: fix assertion in fast quantizer path
+      Allocate source buffers to be multiples of 16
+      Fix for manual Golden frame frequency
+      Fix drastic undershoot in long form content
+
+
 2010-10-28 v0.9.5 "Aylesbury"
   Our first named release, focused on a faster decoder, and a better encoder.
 

diff --git a/README b/README
index c1a7668..dddc5ea 100644
--- a/README
+++ b/README

@@ -45,18 +45,14 @@
     armv5te-linux-rvct
     armv5te-linux-gcc
     armv5te-symbian-gcc
-    armv5te-wince-vs8
     armv6-darwin-gcc
     armv6-linux-rvct
     armv6-linux-gcc
     armv6-symbian-gcc
-    armv6-wince-vs8
     iwmmxt-linux-rvct
     iwmmxt-linux-gcc
-    iwmmxt-wince-vs8
     iwmmxt2-linux-rvct
     iwmmxt2-linux-gcc
-    iwmmxt2-wince-vs8
     armv7-linux-rvct
     armv7-linux-gcc
     mips32-linux-gcc

diff --git a/build/arm-wince-vs8/armasmv5.rules b/build/arm-wince-vs8/armasmv5.rules
deleted file mode 100644
index efb80bc..0000000
--- a/build/arm-wince-vs8/armasmv5.rules
+++ /dev/null

@@ -1,20 +0,0 @@
-<?xml version="1.0" encoding="utf-8"?>

-<VisualStudioToolFile

-	Name="armasm"

-	Version="8.00"

-	>

-	<Rules>

-		<CustomBuildRule

-			Name="ARMASM"

-			DisplayName="Armasm Assembler"

-			CommandLine="armasm -o &quot;$(IntDir)\$(InputName).obj&quot; $(InputPath) -32 -ARCH 5&#x0D;&#x0A;"

-			Outputs="$(IntDir)\$(InputName).obj"

-			FileExtensions="*.asm"

-			ExecutionDescription="Assembling $(InputName).asm"

-			ShowOnlyRuleProperties="false"

-			>

-			<Properties>

-			</Properties>

-		</CustomBuildRule>

-	</Rules>

-</VisualStudioToolFile>


diff --git a/build/arm-wince-vs8/armasmv6.rules b/build/arm-wince-vs8/armasmv6.rules
deleted file mode 100644
index 67c6bc9..0000000
--- a/build/arm-wince-vs8/armasmv6.rules
+++ /dev/null

@@ -1,20 +0,0 @@
-<?xml version="1.0" encoding="utf-8"?>

-<VisualStudioToolFile

-	Name="armasm"

-	Version="8.00"

-	>

-	<Rules>

-		<CustomBuildRule

-			Name="ARMASM"

-			DisplayName="Armasm Assembler"

-			CommandLine="armasm -o &quot;$(IntDir)\$(InputName).obj&quot; $(InputPath) -32 -ARCH 6&#x0D;&#x0A;"

-			Outputs="$(IntDir)\$(InputName).obj"

-			FileExtensions="*.asm"

-			ExecutionDescription="Assembling $(InputName).asm"

-			ShowOnlyRuleProperties="false"

-			>

-			<Properties>

-			</Properties>

-		</CustomBuildRule>

-	</Rules>

-</VisualStudioToolFile>


diff --git a/build/arm-wince-vs8/armasmxscale.rules b/build/arm-wince-vs8/armasmxscale.rules
deleted file mode 100644
index 4da9d1e..0000000
--- a/build/arm-wince-vs8/armasmxscale.rules
+++ /dev/null

@@ -1,20 +0,0 @@
-<?xml version="1.0" encoding="utf-8"?>

-<VisualStudioToolFile

-	Name="armasm"

-	Version="8.00"

-	>

-	<Rules>

-		<CustomBuildRule

-			Name="ARMASM"

-			DisplayName="Armasm Assembler"

-			CommandLine="armasm -o &quot;$(IntDir)\$(InputName).obj&quot; $(InputPath) -32 -cpu XSCALE&#x0D;&#x0A;"

-			Outputs="$(IntDir)\$(InputName).obj"

-			FileExtensions="*.asm"

-			ExecutionDescription="Assembling $(InputName).asm"

-			ShowOnlyRuleProperties="false"

-			>

-			<Properties>

-			</Properties>

-		</CustomBuildRule>

-	</Rules>

-</VisualStudioToolFile>


diff --git a/build/arm-wince-vs8/obj_int_extract.bat b/build/arm-wince-vs8/obj_int_extract.bat
deleted file mode 100644
index a361fc3..0000000
--- a/build/arm-wince-vs8/obj_int_extract.bat
+++ /dev/null

@@ -1,13 +0,0 @@
-@echo off
-REM   Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-REM
-REM   Use of this source code is governed by a BSD-style license
-REM   that can be found in the LICENSE file in the root of the source
-REM   tree. An additional intellectual property rights grant can be found
-REM   in the file PATENTS.  All contributing project authors may
-REM   be found in the AUTHORS file in the root of the source tree.
-echo on
-
-
-cl /I ".\\" /I "..\vp6_decoder_sdk" /I "..\vp6_decoder_sdk\vpx_ports" /D "NDEBUG" /D "_WIN32_WCE=0x420" /D "UNDER_CE" /D "WIN32_PLATFORM_PSPC" /D "WINCE" /D "_LIB" /D "ARM" /D "_ARM_" /D "_UNICODE" /D "UNICODE" /FD /EHsc /MT /GS- /fp:fast /GR- /Fo"Pocket_PC_2003__ARMV4_\%1/" /Fd"Pocket_PC_2003__ARMV4_\%1/vc80.pdb" /W3 /nologo /c /TC ..\vp6_decoder_sdk\vp6_decoder\algo\common\arm\dec_asm_offsets_arm.c

-obj_int_extract.exe rvds "Pocket_PC_2003__ARMV4_\%1/dec_asm_offsets_arm.obj"


diff --git a/build/arm-wince-vs8/vpx.sln b/build/arm-wince-vs8/vpx.sln
deleted file mode 100644
index 3e49929..0000000
--- a/build/arm-wince-vs8/vpx.sln
+++ /dev/null

@@ -1,88 +0,0 @@
-Microsoft Visual Studio Solution File, Format Version 9.00
-# Visual Studio 2005
-Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "example", "example.vcproj", "{BA5FE66F-38DD-E034-F542-B1578C5FB950}"
-	ProjectSection(ProjectDependencies) = postProject
-		{DCE19DAF-69AC-46DB-B14A-39F0FAA5DB74} = {DCE19DAF-69AC-46DB-B14A-39F0FAA5DB74}
-		{E1360C65-D375-4335-8057-7ED99CC3F9B2} = {E1360C65-D375-4335-8057-7ED99CC3F9B2}
-	EndProjectSection
-EndProject
-Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "obj_int_extract", "obj_int_extract.vcproj", "{E1360C65-D375-4335-8057-7ED99CC3F9B2}"
-EndProject
-Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "vpx", "vpx.vcproj", "{DCE19DAF-69AC-46DB-B14A-39F0FAA5DB74}"
-	ProjectSection(ProjectDependencies) = postProject
-		{E1360C65-D375-4335-8057-7ED99CC3F9B2} = {E1360C65-D375-4335-8057-7ED99CC3F9B2}
-	EndProjectSection
-EndProject
-Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "xma", "xma.vcproj", "{A955FC4A-73F1-44F7-135E-30D84D32F022}"
-	ProjectSection(ProjectDependencies) = postProject
-		{E1360C65-D375-4335-8057-7ED99CC3F9B2} = {E1360C65-D375-4335-8057-7ED99CC3F9B2}
-		{DCE19DAF-69AC-46DB-B14A-39F0FAA5DB74} = {DCE19DAF-69AC-46DB-B14A-39F0FAA5DB74}
-	EndProjectSection
-EndProject
-Global
-	GlobalSection(SolutionConfigurationPlatforms) = preSolution
-		Debug|Mixed Platforms = Debug|Mixed Platforms
-		Debug|Pocket PC 2003 (ARMV4) = Debug|Pocket PC 2003 (ARMV4)
-		Debug|Win32 = Debug|Win32
-		Release|Mixed Platforms = Release|Mixed Platforms
-		Release|Pocket PC 2003 (ARMV4) = Release|Pocket PC 2003 (ARMV4)
-		Release|Win32 = Release|Win32
-	EndGlobalSection
-	GlobalSection(ProjectConfigurationPlatforms) = postSolution
-		{BA5FE66F-38DD-E034-F542-B1578C5FB950}.Debug|Mixed Platforms.ActiveCfg = Debug|Pocket PC 2003 (ARMV4)
-		{BA5FE66F-38DD-E034-F542-B1578C5FB950}.Debug|Mixed Platforms.Build.0 = Debug|Pocket PC 2003 (ARMV4)
-		{BA5FE66F-38DD-E034-F542-B1578C5FB950}.Debug|Mixed Platforms.Deploy.0 = Debug|Pocket PC 2003 (ARMV4)
-		{BA5FE66F-38DD-E034-F542-B1578C5FB950}.Debug|Pocket PC 2003 (ARMV4).ActiveCfg = Debug|Pocket PC 2003 (ARMV4)
-		{BA5FE66F-38DD-E034-F542-B1578C5FB950}.Debug|Pocket PC 2003 (ARMV4).Build.0 = Debug|Pocket PC 2003 (ARMV4)
-		{BA5FE66F-38DD-E034-F542-B1578C5FB950}.Debug|Pocket PC 2003 (ARMV4).Deploy.0 = Debug|Pocket PC 2003 (ARMV4)
-		{BA5FE66F-38DD-E034-F542-B1578C5FB950}.Debug|Win32.ActiveCfg = Debug|Pocket PC 2003 (ARMV4)
-		{BA5FE66F-38DD-E034-F542-B1578C5FB950}.Release|Mixed Platforms.ActiveCfg = Release|Pocket PC 2003 (ARMV4)
-		{BA5FE66F-38DD-E034-F542-B1578C5FB950}.Release|Mixed Platforms.Build.0 = Release|Pocket PC 2003 (ARMV4)
-		{BA5FE66F-38DD-E034-F542-B1578C5FB950}.Release|Mixed Platforms.Deploy.0 = Release|Pocket PC 2003 (ARMV4)
-		{BA5FE66F-38DD-E034-F542-B1578C5FB950}.Release|Pocket PC 2003 (ARMV4).ActiveCfg = Release|Pocket PC 2003 (ARMV4)
-		{BA5FE66F-38DD-E034-F542-B1578C5FB950}.Release|Pocket PC 2003 (ARMV4).Build.0 = Release|Pocket PC 2003 (ARMV4)
-		{BA5FE66F-38DD-E034-F542-B1578C5FB950}.Release|Pocket PC 2003 (ARMV4).Deploy.0 = Release|Pocket PC 2003 (ARMV4)
-		{BA5FE66F-38DD-E034-F542-B1578C5FB950}.Release|Win32.ActiveCfg = Release|Pocket PC 2003 (ARMV4)
-		{E1360C65-D375-4335-8057-7ED99CC3F9B2}.Debug|Mixed Platforms.ActiveCfg = Release|Win32
-		{E1360C65-D375-4335-8057-7ED99CC3F9B2}.Debug|Mixed Platforms.Build.0 = Release|Win32
-		{E1360C65-D375-4335-8057-7ED99CC3F9B2}.Debug|Pocket PC 2003 (ARMV4).ActiveCfg = Release|Win32
-		{E1360C65-D375-4335-8057-7ED99CC3F9B2}.Debug|Win32.ActiveCfg = Release|Win32
-		{E1360C65-D375-4335-8057-7ED99CC3F9B2}.Debug|Win32.Build.0 = Release|Win32
-		{E1360C65-D375-4335-8057-7ED99CC3F9B2}.Release|Mixed Platforms.ActiveCfg = Release|Win32
-		{E1360C65-D375-4335-8057-7ED99CC3F9B2}.Release|Mixed Platforms.Build.0 = Release|Win32
-		{E1360C65-D375-4335-8057-7ED99CC3F9B2}.Release|Pocket PC 2003 (ARMV4).ActiveCfg = Release|Win32
-		{E1360C65-D375-4335-8057-7ED99CC3F9B2}.Release|Win32.ActiveCfg = Release|Win32
-		{E1360C65-D375-4335-8057-7ED99CC3F9B2}.Release|Win32.Build.0 = Release|Win32
-		{DCE19DAF-69AC-46DB-B14A-39F0FAA5DB74}.Debug|Mixed Platforms.ActiveCfg = Debug|Pocket PC 2003 (ARMV4)
-		{DCE19DAF-69AC-46DB-B14A-39F0FAA5DB74}.Debug|Mixed Platforms.Build.0 = Debug|Pocket PC 2003 (ARMV4)
-		{DCE19DAF-69AC-46DB-B14A-39F0FAA5DB74}.Debug|Mixed Platforms.Deploy.0 = Debug|Pocket PC 2003 (ARMV4)
-		{DCE19DAF-69AC-46DB-B14A-39F0FAA5DB74}.Debug|Pocket PC 2003 (ARMV4).ActiveCfg = Debug|Pocket PC 2003 (ARMV4)
-		{DCE19DAF-69AC-46DB-B14A-39F0FAA5DB74}.Debug|Pocket PC 2003 (ARMV4).Build.0 = Debug|Pocket PC 2003 (ARMV4)
-		{DCE19DAF-69AC-46DB-B14A-39F0FAA5DB74}.Debug|Pocket PC 2003 (ARMV4).Deploy.0 = Debug|Pocket PC 2003 (ARMV4)
-		{DCE19DAF-69AC-46DB-B14A-39F0FAA5DB74}.Debug|Win32.ActiveCfg = Debug|Pocket PC 2003 (ARMV4)
-		{DCE19DAF-69AC-46DB-B14A-39F0FAA5DB74}.Release|Mixed Platforms.ActiveCfg = Release|Pocket PC 2003 (ARMV4)
-		{DCE19DAF-69AC-46DB-B14A-39F0FAA5DB74}.Release|Mixed Platforms.Build.0 = Release|Pocket PC 2003 (ARMV4)
-		{DCE19DAF-69AC-46DB-B14A-39F0FAA5DB74}.Release|Mixed Platforms.Deploy.0 = Release|Pocket PC 2003 (ARMV4)
-		{DCE19DAF-69AC-46DB-B14A-39F0FAA5DB74}.Release|Pocket PC 2003 (ARMV4).ActiveCfg = Release|Pocket PC 2003 (ARMV4)
-		{DCE19DAF-69AC-46DB-B14A-39F0FAA5DB74}.Release|Pocket PC 2003 (ARMV4).Build.0 = Release|Pocket PC 2003 (ARMV4)
-		{DCE19DAF-69AC-46DB-B14A-39F0FAA5DB74}.Release|Pocket PC 2003 (ARMV4).Deploy.0 = Release|Pocket PC 2003 (ARMV4)
-		{DCE19DAF-69AC-46DB-B14A-39F0FAA5DB74}.Release|Win32.ActiveCfg = Release|Pocket PC 2003 (ARMV4)
-		{A955FC4A-73F1-44F7-135E-30D84D32F022}.Debug|Mixed Platforms.ActiveCfg = Debug|Pocket PC 2003 (ARMV4)
-		{A955FC4A-73F1-44F7-135E-30D84D32F022}.Debug|Mixed Platforms.Build.0 = Debug|Pocket PC 2003 (ARMV4)
-		{A955FC4A-73F1-44F7-135E-30D84D32F022}.Debug|Mixed Platforms.Deploy.0 = Debug|Pocket PC 2003 (ARMV4)
-		{A955FC4A-73F1-44F7-135E-30D84D32F022}.Debug|Pocket PC 2003 (ARMV4).ActiveCfg = Debug|Pocket PC 2003 (ARMV4)
-		{A955FC4A-73F1-44F7-135E-30D84D32F022}.Debug|Pocket PC 2003 (ARMV4).Build.0 = Debug|Pocket PC 2003 (ARMV4)
-		{A955FC4A-73F1-44F7-135E-30D84D32F022}.Debug|Pocket PC 2003 (ARMV4).Deploy.0 = Debug|Pocket PC 2003 (ARMV4)
-		{A955FC4A-73F1-44F7-135E-30D84D32F022}.Debug|Win32.ActiveCfg = Debug|Pocket PC 2003 (ARMV4)
-		{A955FC4A-73F1-44F7-135E-30D84D32F022}.Release|Mixed Platforms.ActiveCfg = Release|Pocket PC 2003 (ARMV4)
-		{A955FC4A-73F1-44F7-135E-30D84D32F022}.Release|Mixed Platforms.Build.0 = Release|Pocket PC 2003 (ARMV4)
-		{A955FC4A-73F1-44F7-135E-30D84D32F022}.Release|Mixed Platforms.Deploy.0 = Release|Pocket PC 2003 (ARMV4)
-		{A955FC4A-73F1-44F7-135E-30D84D32F022}.Release|Pocket PC 2003 (ARMV4).ActiveCfg = Release|Pocket PC 2003 (ARMV4)
-		{A955FC4A-73F1-44F7-135E-30D84D32F022}.Release|Pocket PC 2003 (ARMV4).Build.0 = Release|Pocket PC 2003 (ARMV4)
-		{A955FC4A-73F1-44F7-135E-30D84D32F022}.Release|Pocket PC 2003 (ARMV4).Deploy.0 = Release|Pocket PC 2003 (ARMV4)
-		{A955FC4A-73F1-44F7-135E-30D84D32F022}.Release|Win32.ActiveCfg = Release|Pocket PC 2003 (ARMV4)
-	EndGlobalSection
-	GlobalSection(SolutionProperties) = preSolution
-		HideSolutionNode = FALSE
-	EndGlobalSection
-EndGlobal

diff --git a/build/make/Makefile b/build/make/Makefile
index 40fa6d5..64d3c93 100755
--- a/build/make/Makefile
+++ b/build/make/Makefile

@@ -152,8 +152,8 @@
 # Rule to extract assembly constants from C sources
 #
 obj_int_extract: build/make/obj_int_extract.c
-	$(if $(quiet),echo "    [HOSTCC] $@")
-	$(qexec)$(HOSTCC) -I. -o $@ $<
+	$(if $(quiet),@echo "    [HOSTCC] $@")
+	$(qexec)$(HOSTCC) -I. -I$(SRC_PATH_BARE) -o $@ $<
 CLEAN-OBJS += obj_int_extract
 
 #
@@ -255,7 +255,7 @@
 endif
 
 #
-# Configuration dependant rules
+# Configuration dependent rules
 #
 $(call pairmap,install_map_templates,$(INSTALL_MAPS))
 
@@ -331,11 +331,8 @@
     DIST-SRCS-$(CONFIG_MSVS)  += build/make/gen_msvs_sln.sh
     DIST-SRCS-$(CONFIG_MSVS)  += build/x86-msvs/yasm.rules
     DIST-SRCS-$(CONFIG_RVCT) += build/make/armlink_adapter.sh
-    #
-    # This isn't really ARCH_ARM dependent, it's dependant on whether we're
-    # using assembly code or not (CONFIG_OPTIMIZATIONS maybe). Just use
-    # this for now.
-    DIST-SRCS-$(ARCH_ARM)    += build/make/obj_int_extract.c
+    # Include obj_int_extract if we use offsets from asm_*_offsets
+    DIST-SRCS-$(ARCH_ARM)$(ARCH_X86)$(ARCH_X86_64)    += build/make/obj_int_extract.c
     DIST-SRCS-$(ARCH_ARM)    += build/make/ads2gas.pl
     DIST-SRCS-yes            += $(target:-$(TOOLCHAIN)=).mk
 endif

diff --git a/build/make/armlink_adapter.sh b/build/make/armlink_adapter.sh
index 571e46e..b53669c 100755
--- a/build/make/armlink_adapter.sh
+++ b/build/make/armlink_adapter.sh

@@ -17,15 +17,17 @@
         on_of=1
     elif [ "$i" == "-v" ]; then
         verbose=1
+    elif [ "$i" == "-g" ]; then
+        args="${args} --debug"
     elif [ "$on_of" == "1" ]; then
         outfile=$i
-    on_of=0
+        on_of=0
     elif [ -f "$i" ]; then
         infiles="$infiles $i"
     elif [ "${i:0:2}" == "-l" ]; then
         libs="$libs ${i#-l}"
     elif [ "${i:0:2}" == "-L" ]; then
-    libpaths="${libpaths} ${i#-L}"
+        libpaths="${libpaths} ${i#-L}"
     else
         args="${args} ${i}"
     fi

diff --git a/build/make/configure.sh b/build/make/configure.sh
index 88698fd..a48fd9f 100755
--- a/build/make/configure.sh
+++ b/build/make/configure.sh

@@ -83,7 +83,7 @@
   ${toggle_werror}            treat warnings as errors, if possible
                               (not available with all compilers)
   ${toggle_optimizations}     turn on/off compiler optimization flags
-  ${toggle_pic}               turn on/off Position Independant Code
+  ${toggle_pic}               turn on/off Position Independent Code
   ${toggle_ccache}            turn on/off compiler cache
   ${toggle_debug}             enable/disable debug mode
   ${toggle_gprof}             enable/disable gprof profiling instrumentation
@@ -624,6 +624,10 @@
 
     # Handle Solaris variants. Solaris 10 needs -lposix4
     case ${toolchain} in
+        sparc-solaris-*)
+            add_extralibs -lposix4
+            add_cflags "-DMUST_BE_ALIGNED"
+            ;;
         *-solaris-*)
             add_extralibs -lposix4
             ;;
@@ -711,7 +715,7 @@
             TOOLCHAIN_PATH=${SDK_PATH}/usr/bin
             CC=${TOOLCHAIN_PATH}/gcc
             AR=${TOOLCHAIN_PATH}/ar
-            LD=${TOOLCHAIN_PATH}/arm-apple-darwin9-gcc-4.2.1
+            LD=${TOOLCHAIN_PATH}/arm-apple-darwin10-gcc-4.2.1
             AS=${TOOLCHAIN_PATH}/as
             STRIP=${TOOLCHAIN_PATH}/strip
             NM=${TOOLCHAIN_PATH}/nm
@@ -725,14 +729,14 @@
             add_cflags -arch ${tgt_isa}
             add_ldflags -arch_only ${tgt_isa}
 
-            add_cflags  "-isysroot /Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS3.1.sdk"
+            add_cflags  "-isysroot /Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS4.2.sdk"
 
             # This should be overridable
-            alt_libc=${SDK_PATH}/SDKs/iPhoneOS3.1.sdk
+            alt_libc=${SDK_PATH}/SDKs/iPhoneOS4.2.sdk
 
             # Add the paths for the alternate libc
 #            for d in usr/include usr/include/gcc/darwin/4.0/; do
-            for d in usr/include usr/include/gcc/darwin/4.0/ usr/lib/gcc/arm-apple-darwin9/4.0.1/include/; do
+            for d in usr/include usr/include/gcc/darwin/4.0/ usr/lib/gcc/arm-apple-darwin10/4.2.1/include/; do
                 try_dir="${alt_libc}/${d}"
                 [ -d "${try_dir}" ] && add_cflags -I"${try_dir}"
             done
@@ -863,7 +867,7 @@
                 setup_gnu_toolchain
                 add_cflags -use-msasm -use-asm
                 add_ldflags -i-static
-                enabled x86_64 && add_cflags -ipo -no-prec-div -static -xSSE3 -axSSE3
+                enabled x86_64 && add_cflags -ipo -no-prec-div -static -xSSE2 -axSSE2
                 enabled x86_64 && AR=xiar
                 case ${tune_cpu} in
                     atom*)
@@ -953,7 +957,7 @@
         enabled small && check_add_cflags -O2 || check_add_cflags -O3
     fi
 
-    # Position Independant Code (PIC) support, for building relocatable
+    # Position Independent Code (PIC) support, for building relocatable
     # shared objects
     enabled gcc && enabled pic && check_add_cflags -fPIC
 

diff --git a/build/make/gen_msvs_proj.sh b/build/make/gen_msvs_proj.sh
index 584477f..c2ef44a 100755
--- a/build/make/gen_msvs_proj.sh
+++ b/build/make/gen_msvs_proj.sh

@@ -32,7 +32,8 @@
     --name=project_name         Name of the project (required)
     --proj-guid=GUID            GUID to use for the project
     --module-def=filename       File containing export definitions (for DLLs)
-    --ver=version               Version (7,8) of visual studio to generate for
+    --ver=version               Version (7,8,9) of visual studio to generate for
+    --src-path-bare=dir         Path to root of source tree
     -Ipath/to/include           Additional include directories
     -DFLAG[=value]              Preprocessor macros to define
     -Lpath/to/lib               Additional library search paths
@@ -132,7 +133,7 @@
     open_tag Filter \
         Name=$name \
         Filter=$pats \
-        UniqueIdentifier=`generate_uuid`
+        UniqueIdentifier=`generate_uuid` \
 
     file_list_sz=${#file_list[@]}
     for i in ${!file_list[@]}; do
@@ -145,31 +146,21 @@
                 if [ "$pat" == "asm" ] && $asm_use_custom_step; then
                     for plat in "${platforms[@]}"; do
                         for cfg in Debug Release; do
-                            open_tag  FileConfiguration \
-                            Name="${cfg}|${plat}"
+                            open_tag FileConfiguration \
+                                Name="${cfg}|${plat}" \
+
                             tag Tool \
                                 Name="VCCustomBuildTool" \
                                 Description="Assembling \$(InputFileName)" \
-                                CommandLine="$(eval echo \$asm_${cfg}_cmdline)"\
-                                Outputs="\$(InputName).obj"
+                                CommandLine="$(eval echo \$asm_${cfg}_cmdline)" \
+                                Outputs="\$(InputName).obj" \
+
                             close_tag FileConfiguration
                         done
                     done
                 fi
 
-                if [ "${f##*.}" == "cpp" ]; then
-                    for plat in "${platforms[@]}"; do
-                        for cfg in Debug Release; do
-                        open_tag FileConfiguration \
-                            Name="${cfg}|${plat}"
-                        tag Tool \
-                            Name="VCCLCompilerTool" \
-                            CompileAs="2"
-                        close_tag FileConfiguration
-                        done
-                    done
-                fi
-                close_tag  File
+                close_tag File
 
                 break
             fi
@@ -185,57 +176,63 @@
 for opt in "$@"; do
     optval="${opt#*=}"
     case "$opt" in
-    --help|-h) show_help
-    ;;
-    --target=*) target="${optval}"
-    ;;
-    --out=*) outfile="$optval"
-    ;;
-    --name=*) name="${optval}"
-    ;;
-    --proj-guid=*) guid="${optval}"
-    ;;
-    --module-def=*)
-        link_opts="${link_opts} ModuleDefinitionFile=${optval}"
-    ;;
-    --exe) proj_kind="exe"
-    ;;
-    --lib) proj_kind="lib"
-    ;;
-    --static-crt) use_static_runtime=true
-    ;;
-    --ver=*) vs_ver="$optval"
-             case $optval in
-             [789])
-             ;;
-             *) die Unrecognized Visual Studio Version in $opt
-             ;;
-             esac
-    ;;
-    -I*) opt="${opt%/}"
-         incs="${incs}${incs:+;}&quot;${opt##-I}&quot;"
-         yasmincs="${yasmincs} ${opt}"
-    ;;
-    -D*) defines="${defines}${defines:+;}${opt##-D}"
-    ;;
-    -L*) # fudge . to $(OutDir)
-         if [ "${opt##-L}" == "." ]; then
-             libdirs="${libdirs}${libdirs:+;}&quot;\$(OutDir)&quot;"
-         else
-             # Also try directories for this platform/configuration
-             libdirs="${libdirs}${libdirs:+;}&quot;${opt##-L}&quot;"
-             libdirs="${libdirs}${libdirs:+;}&quot;${opt##-L}/\$(PlatformName)/\$(ConfigurationName)&quot;"
-             libdirs="${libdirs}${libdirs:+;}&quot;${opt##-L}/\$(PlatformName)&quot;"
-         fi
-    ;;
-    -l*) libs="${libs}${libs:+ }${opt##-l}.lib"
-    ;;
-    -*) die_unknown $opt
-    ;;
-    *) file_list[${#file_list[@]}]="$opt"
-       case "$opt" in
-       *.asm) uses_asm=true;;
-       esac
+        --help|-h) show_help
+        ;;
+        --target=*) target="${optval}"
+        ;;
+        --out=*) outfile="$optval"
+        ;;
+        --name=*) name="${optval}"
+        ;;
+        --proj-guid=*) guid="${optval}"
+        ;;
+        --module-def=*) link_opts="${link_opts} ModuleDefinitionFile=${optval}"
+        ;;
+        --exe) proj_kind="exe"
+        ;;
+        --lib) proj_kind="lib"
+        ;;
+        --src-path-bare=*) src_path_bare="$optval"
+        ;;
+        --static-crt) use_static_runtime=true
+        ;;
+        --ver=*)
+            vs_ver="$optval"
+            case "$optval" in
+                [789])
+                ;;
+                *) die Unrecognized Visual Studio Version in $opt
+                ;;
+            esac
+        ;;
+        -I*)
+            opt="${opt%/}"
+            incs="${incs}${incs:+;}&quot;${opt##-I}&quot;"
+            yasmincs="${yasmincs} ${opt}"
+        ;;
+        -D*) defines="${defines}${defines:+;}${opt##-D}"
+        ;;
+        -L*) # fudge . to $(OutDir)
+            if [ "${opt##-L}" == "." ]; then
+                libdirs="${libdirs}${libdirs:+;}&quot;\$(OutDir)&quot;"
+            else
+                 # Also try directories for this platform/configuration
+                 libdirs="${libdirs}${libdirs:+;}&quot;${opt##-L}&quot;"
+                 libdirs="${libdirs}${libdirs:+;}&quot;${opt##-L}/\$(PlatformName)/\$(ConfigurationName)&quot;"
+                 libdirs="${libdirs}${libdirs:+;}&quot;${opt##-L}/\$(PlatformName)&quot;"
+            fi
+        ;;
+        -l*) libs="${libs}${libs:+ }${opt##-l}.lib"
+        ;;
+        -*) die_unknown $opt
+        ;;
+        *)
+            file_list[${#file_list[@]}]="$opt"
+            case "$opt" in
+                 *.asm) uses_asm=true
+                 ;;
+            esac
+        ;;
     esac
 done
 outfile=${outfile:-/dev/stdout}
@@ -278,11 +275,7 @@
 
 # List Keyword for this target
 case "$target" in
-    x86*)
-        keyword="ManagedCProj"
-    ;;
-    arm*|iwmmx*)
-        keyword="Win32Proj"
+    x86*) keyword="ManagedCProj"
     ;;
     *) die "Unsupported target $target!"
 esac
@@ -298,402 +291,255 @@
         asm_Debug_cmdline="yasm -Xvc -g cv8 -f \$(PlatformName) ${yasmincs} &quot;\$(InputPath)&quot;"
         asm_Release_cmdline="yasm -Xvc -f \$(PlatformName) ${yasmincs} &quot;\$(InputPath)&quot;"
     ;;
-    arm*|iwmmx*)
-        case "${name}" in
-        obj_int_extract) platforms[0]="Win32"
-        ;;
-        *) platforms[0]="Pocket PC 2003 (ARMV4)"
-        ;;
-        esac
-    ;;
     *) die "Unsupported target $target!"
-esac
-
-# List Command-line Arguments for this target
-case "$target" in
-    arm*|iwmmx*)
-        if [ "$name" == "example" ];then
-            ARGU="--codec vp6 --flipuv --progress _bnd.vp6"
-        fi
-        if [ "$name" == "xma" ];then
-            ARGU="--codec vp6 -h 240 -w 320 -v"
-        fi
     ;;
 esac
 
 generate_vcproj() {
     case "$proj_kind" in
-    exe) vs_ConfigurationType=1
-    ;;
-    *)   vs_ConfigurationType=4
-    ;;
+        exe) vs_ConfigurationType=1
+        ;;
+        *)   vs_ConfigurationType=4
+        ;;
     esac
 
     echo "<?xml version=\"1.0\" encoding=\"Windows-1252\"?>"
-    open_tag  VisualStudioProject \
-                  ProjectType="Visual C++" \
-                  Version="${vs_ver_id}" \
-                  Name="${name}" \
-                  ProjectGUID="{${guid}}" \
-                  RootNamespace="${name}" \
-                  Keyword="${keyword}"
+    open_tag VisualStudioProject \
+        ProjectType="Visual C++" \
+        Version="${vs_ver_id}" \
+        Name="${name}" \
+        ProjectGUID="{${guid}}" \
+        RootNamespace="${name}" \
+        Keyword="${keyword}" \
 
-    open_tag  Platforms
+    open_tag Platforms
     for plat in "${platforms[@]}"; do
-        tag   Platform Name="$plat"
+        tag Platform Name="$plat"
     done
     close_tag Platforms
 
-    open_tag  ToolFiles
+    open_tag ToolFiles
     case "$target" in
         x86*) $uses_asm && tag ToolFile RelativePath="$self_dirname/../x86-msvs/yasm.rules"
         ;;
-        arm*|iwmmx*)
-            if [ "$name" == "vpx" ];then
-            case "$target" in
-                armv5*)
-                    tag ToolFile RelativePath="$self_dirname/../arm-wince-vs8/armasmv5.rules"
-                ;;
-                armv6*)
-                    tag ToolFile RelativePath="$self_dirname/../arm-wince-vs8/armasmv6.rules"
-                ;;
-                iwmmxt*)
-                    tag ToolFile RelativePath="$self_dirname/../arm-wince-vs8/armasmxscale.rules"
-                ;;
-            esac
-            fi
-        ;;
     esac
     close_tag ToolFiles
 
-    open_tag  Configurations
+    open_tag Configurations
     for plat in "${platforms[@]}"; do
         plat_no_ws=`echo $plat | sed 's/[^A-Za-z0-9_]/_/g'`
-        open_tag  Configuration \
-                      Name="Debug|$plat" \
-                      OutputDirectory="\$(SolutionDir)$plat_no_ws/\$(ConfigurationName)" \
-                      IntermediateDirectory="$plat_no_ws/\$(ConfigurationName)/${name}" \
-                      ConfigurationType="$vs_ConfigurationType" \
-                      CharacterSet="1"
-
-        if [ "$target" == "armv6-wince-vs8" ] || [ "$target" == "armv5te-wince-vs8" ] || [ "$target" == "iwmmxt-wince-vs8" ] || [ "$target" == "iwmmxt2-wince-vs8" ];then
-            case "$name" in
-                vpx)         tag Tool \
-                             Name="VCPreBuildEventTool" \
-                             CommandLine="call obj_int_extract.bat \$(ConfigurationName)"
-                             tag Tool \
-                             Name="VCMIDLTool" \
-                             TargetEnvironment="1"
-                             tag Tool \
-                             Name="VCCLCompilerTool" \
-                             ExecutionBucket="7" \
-                             Optimization="0" \
-                             AdditionalIncludeDirectories="$incs" \
-                             PreprocessorDefinitions="_DEBUG;_WIN32_WCE=\$(CEVER);UNDER_CE;\$(PLATFORMDEFINES);WINCE;DEBUG;_LIB;\$(ARCHFAM);\$(_ARCHFAM_);_UNICODE;UNICODE;" \
-                             MinimalRebuild="true" \
-                             RuntimeLibrary="1" \
-                             BufferSecurityCheck="false" \
-                             UsePrecompiledHeader="0" \
-                             WarningLevel="3" \
-                             DebugInformationFormat="1" \
-                             CompileAs="1"
-                             tag Tool \
-                             Name="VCResourceCompilerTool" \
-                             PreprocessorDefinitions="_DEBUG;_WIN32_WCE=\$(CEVER);UNDER_CE;\$(PLATFORMDEFINES)" \
-                             Culture="1033" \
-                             AdditionalIncludeDirectories="\$(IntDir)" \
-                ;;
-                example|xma) tag Tool \
-                             Name="VCCLCompilerTool" \
-                             ExecutionBucket="7" \
-                             Optimization="0" \
-                             AdditionalIncludeDirectories="$incs" \
-                             PreprocessorDefinitions="_DEBUG;_WIN32_WCE=\$(CEVER);UNDER_CE;\$(PLATFORMDEFINES);WINCE;DEBUG;_CONSOLE;\$(ARCHFAM);\$(_ARCHFAM_);_UNICODE;UNICODE;" \
-                             MinimalRebuild="true" \
-                             RuntimeLibrary="1" \
-                             BufferSecurityCheck="false" \
-                             UsePrecompiledHeader="0" \
-                             WarningLevel="3" \
-                             DebugInformationFormat="1" \
-                             CompileAs="1"
-                             tag Tool \
-                             Name="VCResourceCompilerTool" \
-                             PreprocessorDefinitions="_DEBUG;_WIN32_WCE=\$(CEVER);UNDER_CE;\$(PLATFORMDEFINES)" \
-                             Culture="1033" \
-                             AdditionalIncludeDirectories="\$(IntDir)" \
-                ;;
-                obj_int_extract) tag Tool \
-                             Name="VCCLCompilerTool" \
-                             Optimization="0" \
-                             AdditionalIncludeDirectories="$incs" \
-                             PreprocessorDefinitions="WIN32;DEBUG;_CONSOLE" \
-                             RuntimeLibrary="1" \
-                             WarningLevel="3" \
-                             DebugInformationFormat="1" \
-                ;;
-            esac
-        fi
+        open_tag Configuration \
+            Name="Debug|$plat" \
+            OutputDirectory="\$(SolutionDir)$plat_no_ws/\$(ConfigurationName)" \
+            IntermediateDirectory="$plat_no_ws/\$(ConfigurationName)/${name}" \
+            ConfigurationType="$vs_ConfigurationType" \
+            CharacterSet="1" \
 
         case "$target" in
-            x86*) tag Tool \
-                Name="VCCLCompilerTool" \
-                Optimization="0" \
-                AdditionalIncludeDirectories="$incs" \
-                PreprocessorDefinitions="WIN32;_DEBUG;_CRT_SECURE_NO_WARNINGS;_CRT_SECURE_NO_DEPRECATE;$defines" \
-                RuntimeLibrary="$debug_runtime" \
-                UsePrecompiledHeader="0" \
-                WarningLevel="3" \
-                DebugInformationFormat="1" \
-                Detect64BitPortabilityProblems="true" \
+            x86*)
+                case "$name" in
+                    obj_int_extract)
+                        tag Tool \
+                            Name="VCCLCompilerTool" \
+                            Optimization="0" \
+                            AdditionalIncludeDirectories="$incs" \
+                            PreprocessorDefinitions="WIN32;DEBUG;_CONSOLE;_CRT_SECURE_NO_WARNINGS;_CRT_SECURE_NO_DEPRECATE" \
+                            RuntimeLibrary="$debug_runtime" \
+                            WarningLevel="3" \
+                            Detect64BitPortabilityProblems="true" \
+                            DebugInformationFormat="1" \
+                    ;;
+                    vpx)
+                        tag Tool \
+                            Name="VCPreBuildEventTool" \
+                            CommandLine="call obj_int_extract.bat $src_path_bare" \
 
-                $uses_asm && tag Tool Name="YASM"  IncludePaths="$incs" Debug="1"
+                        tag Tool \
+                            Name="VCCLCompilerTool" \
+                            Optimization="0" \
+                            AdditionalIncludeDirectories="$incs" \
+                            PreprocessorDefinitions="WIN32;_DEBUG;_CRT_SECURE_NO_WARNINGS;_CRT_SECURE_NO_DEPRECATE;$defines" \
+                            RuntimeLibrary="$debug_runtime" \
+                            UsePrecompiledHeader="0" \
+                            WarningLevel="3" \
+                            DebugInformationFormat="1" \
+                            Detect64BitPortabilityProblems="true" \
+
+                        $uses_asm && tag Tool Name="YASM"  IncludePaths="$incs" Debug="1"
+                    ;;
+                    *)
+                        tag Tool \
+                            Name="VCCLCompilerTool" \
+                            Optimization="0" \
+                            AdditionalIncludeDirectories="$incs" \
+                            PreprocessorDefinitions="WIN32;_DEBUG;_CRT_SECURE_NO_WARNINGS;_CRT_SECURE_NO_DEPRECATE;$defines" \
+                            RuntimeLibrary="$debug_runtime" \
+                            UsePrecompiledHeader="0" \
+                            WarningLevel="3" \
+                            DebugInformationFormat="1" \
+                            Detect64BitPortabilityProblems="true" \
+
+                        $uses_asm && tag Tool Name="YASM"  IncludePaths="$incs" Debug="1"
+                    ;;
+                esac
             ;;
         esac
 
         case "$proj_kind" in
             exe)
                 case "$target" in
-                    x86*) tag Tool \
-                          Name="VCLinkerTool" \
-                          AdditionalDependencies="$debug_libs \$(NoInherit)" \
-                          AdditionalLibraryDirectories="$libdirs" \
-                          GenerateDebugInformation="true" \
-                          ProgramDatabaseFile="\$(OutDir)/${name}.pdb" \
-
-                    ;;
-                    arm*|iwmmx*)
+                    x86*)
                         case "$name" in
-                            obj_int_extract) tag Tool \
-                                Name="VCLinkerTool" \
-                                OutputFile="${name}.exe" \
-                                GenerateDebugInformation="true"
+                            obj_int_extract)
+                                tag Tool \
+                                    Name="VCLinkerTool" \
+                                    OutputFile="${name}.exe" \
+                                    GenerateDebugInformation="true" \
                             ;;
-                            *) tag Tool \
-                                Name="VCLinkerTool" \
-                                AdditionalDependencies="$debug_libs" \
-                                OutputFile="\$(OutDir)/${name}.exe" \
-                                LinkIncremental="2" \
-                                AdditionalLibraryDirectories="${libdirs};&quot;..\lib/$plat_no_ws&quot;" \
-                                DelayLoadDLLs="\$(NOINHERIT)" \
-                                GenerateDebugInformation="true" \
-                                ProgramDatabaseFile="\$(OutDir)/${name}.pdb" \
-                                SubSystem="9" \
-                                StackReserveSize="65536" \
-                                StackCommitSize="4096" \
-                                EntryPointSymbol="mainWCRTStartup" \
-                                TargetMachine="3"
+                            *)
+                                tag Tool \
+                                    Name="VCLinkerTool" \
+                                    AdditionalDependencies="$debug_libs \$(NoInherit)" \
+                                    AdditionalLibraryDirectories="$libdirs" \
+                                    GenerateDebugInformation="true" \
+                                    ProgramDatabaseFile="\$(OutDir)/${name}.pdb" \
                             ;;
                         esac
-                     ;;
+                    ;;
                  esac
             ;;
             lib)
                 case "$target" in
-                      arm*|iwmmx*) tag Tool \
-                                    Name="VCLibrarianTool" \
-                                    AdditionalOptions=" /subsystem:windowsce,4.20 /machine:ARM" \
-                                    OutputFile="\$(OutDir)/${name}.lib" \
-                                ;;
-                                *) tag Tool \
-                                    Name="VCLibrarianTool" \
-                                    OutputFile="\$(OutDir)/${name}${lib_sfx}d.lib" \
-                                ;;
+                    x86*)
+                        tag Tool \
+                            Name="VCLibrarianTool" \
+                            OutputFile="\$(OutDir)/${name}${lib_sfx}d.lib" \
+
+                    ;;
                 esac
             ;;
-            dll) tag Tool \
-                 Name="VCLinkerTool" \
-                AdditionalDependencies="\$(NoInherit)" \
-                LinkIncremental="2" \
-                GenerateDebugInformation="true" \
-                AssemblyDebug="1" \
-                TargetMachine="1" \
-                      $link_opts
+            dll)
+                tag Tool \
+                    Name="VCLinkerTool" \
+                    AdditionalDependencies="\$(NoInherit)" \
+                    LinkIncremental="2" \
+                    GenerateDebugInformation="true" \
+                    AssemblyDebug="1" \
+                    TargetMachine="1" \
+                    $link_opts \
+
+            ;;
         esac
 
-        if [ "$target" == "armv6-wince-vs8" ] || [ "$target" == "armv5te-wince-vs8" ] || [ "$target" == "iwmmxt-wince-vs8" ] || [ "$target" == "iwmmxt2-wince-vs8" ];then
-            case "$name" in
-                vpx)         tag DeploymentTool \
-                             ForceDirty="-1" \
-                             RegisterOutput="0"
-                                ;;
-                example|xma) tag DeploymentTool \
-                             ForceDirty="-1" \
-                             RegisterOutput="0"
-                             tag DebuggerTool \
-                             Arguments="${ARGU}"
-                                ;;
-            esac
-        fi
         close_tag Configuration
 
-        open_tag  Configuration \
-                      Name="Release|$plat" \
-                      OutputDirectory="\$(SolutionDir)$plat_no_ws/\$(ConfigurationName)" \
-                      IntermediateDirectory="$plat_no_ws/\$(ConfigurationName)/${name}" \
-                      ConfigurationType="$vs_ConfigurationType" \
-                      CharacterSet="1" \
-                      WholeProgramOptimization="0"
+        open_tag Configuration \
+            Name="Release|$plat" \
+            OutputDirectory="\$(SolutionDir)$plat_no_ws/\$(ConfigurationName)" \
+            IntermediateDirectory="$plat_no_ws/\$(ConfigurationName)/${name}" \
+            ConfigurationType="$vs_ConfigurationType" \
+            CharacterSet="1" \
+            WholeProgramOptimization="0" \
 
-        if [ "$target" == "armv6-wince-vs8" ] || [ "$target" == "armv5te-wince-vs8" ] || [ "$target" == "iwmmxt-wince-vs8" ] || [ "$target" == "iwmmxt2-wince-vs8" ];then
-            case "$name" in
-                vpx)         tag Tool \
-                                     Name="VCPreBuildEventTool" \
-                                     CommandLine="call obj_int_extract.bat \$(ConfigurationName)"
-                             tag Tool \
-                                     Name="VCMIDLTool" \
-                                     TargetEnvironment="1"
-                             tag Tool \
-                                             Name="VCCLCompilerTool" \
-                                             ExecutionBucket="7" \
-                                             Optimization="2" \
-                                             FavorSizeOrSpeed="1" \
-                                             AdditionalIncludeDirectories="$incs" \
-                                             PreprocessorDefinitions="NDEBUG;_WIN32_WCE=\$(CEVER);UNDER_CE;\$(PLATFORMDEFINES);WINCE;_LIB;\$(ARCHFAM);\$(_ARCHFAM_);_UNICODE;UNICODE;" \
-                                             RuntimeLibrary="0" \
-                                             BufferSecurityCheck="false" \
-                                             UsePrecompiledHeader="0" \
-                                             WarningLevel="3" \
-                                             DebugInformationFormat="0" \
-                                             CompileAs="1"
-                             tag Tool \
-                                             Name="VCResourceCompilerTool" \
-                                             PreprocessorDefinitions="NDEBUG;_WIN32_WCE=\$(CEVER);UNDER_CE;\$(PLATFORMDEFINES)" \
-                                             Culture="1033" \
-                                             AdditionalIncludeDirectories="\$(IntDir)" \
-                ;;
-                example|xma) tag Tool \
-                             Name="VCCLCompilerTool" \
-                             ExecutionBucket="7" \
-                             Optimization="2" \
-                             FavorSizeOrSpeed="1" \
-                             AdditionalIncludeDirectories="$incs" \
-                             PreprocessorDefinitions="NDEBUG;_WIN32_WCE=\$(CEVER);UNDER_CE;\$(PLATFORMDEFINES);WINCE;_CONSOLE;\$(ARCHFAM);\$(_ARCHFAM_);_UNICODE;UNICODE;" \
-                             RuntimeLibrary="0" \
-                             BufferSecurityCheck="false" \
-                             UsePrecompiledHeader="0" \
-                             WarningLevel="3" \
-                             DebugInformationFormat="0" \
-                             CompileAs="1"
-                             tag Tool \
-                             Name="VCResourceCompilerTool" \
-                             PreprocessorDefinitions="NDEBUG;_WIN32_WCE=\$(CEVER);UNDER_CE;\$(PLATFORMDEFINES)" \
-                             Culture="1033" \
-                             AdditionalIncludeDirectories="\$(IntDir)" \
-                ;;
-                obj_int_extract) tag Tool \
-                             Name="VCCLCompilerTool" \
-                             AdditionalIncludeDirectories="$incs" \
-                             PreprocessorDefinitions="WIN32;NDEBUG;_CONSOLE" \
-                             RuntimeLibrary="0" \
-                             UsePrecompiledHeader="0" \
-                             WarningLevel="3" \
-                             Detect64BitPortabilityProblems="true" \
-                             DebugInformationFormat="0" \
-                ;;
-            esac
-        fi
+        case "$target" in
+            x86*)
+                case "$name" in
+                    obj_int_extract)
+                        tag Tool \
+                            Name="VCCLCompilerTool" \
+                            AdditionalIncludeDirectories="$incs" \
+                            PreprocessorDefinitions="WIN32;NDEBUG;_CONSOLE;_CRT_SECURE_NO_WARNINGS;_CRT_SECURE_NO_DEPRECATE" \
+                            RuntimeLibrary="$release_runtime" \
+                            UsePrecompiledHeader="0" \
+                            WarningLevel="3" \
+                            Detect64BitPortabilityProblems="true" \
+                            DebugInformationFormat="0" \
+                    ;;
+                    vpx)
+                        tag Tool \
+                            Name="VCPreBuildEventTool" \
+                            CommandLine="call obj_int_extract.bat $src_path_bare" \
 
-    case "$target" in
-        x86*) tag       Tool \
-                      Name="VCCLCompilerTool" \
-                      AdditionalIncludeDirectories="$incs" \
-                      PreprocessorDefinitions="WIN32;NDEBUG;_CRT_SECURE_NO_WARNINGS;_CRT_SECURE_NO_DEPRECATE;$defines" \
-                      RuntimeLibrary="$release_runtime" \
-                      UsePrecompiledHeader="0" \
-                      WarningLevel="3" \
-                      DebugInformationFormat="0" \
-                      Detect64BitPortabilityProblems="true"
+                        tag Tool \
+                            Name="VCCLCompilerTool" \
+                            AdditionalIncludeDirectories="$incs" \
+                            PreprocessorDefinitions="WIN32;NDEBUG;_CRT_SECURE_NO_WARNINGS;_CRT_SECURE_NO_DEPRECATE;$defines" \
+                            RuntimeLibrary="$release_runtime" \
+                            UsePrecompiledHeader="0" \
+                            WarningLevel="3" \
+                            DebugInformationFormat="0" \
+                            Detect64BitPortabilityProblems="true" \
 
-                $uses_asm && tag Tool Name="YASM"  IncludePaths="$incs"
-                ;;
+                        $uses_asm && tag Tool Name="YASM"  IncludePaths="$incs"
+                    ;;
+                    *)
+                        tag Tool \
+                            Name="VCCLCompilerTool" \
+                            AdditionalIncludeDirectories="$incs" \
+                            PreprocessorDefinitions="WIN32;NDEBUG;_CRT_SECURE_NO_WARNINGS;_CRT_SECURE_NO_DEPRECATE;$defines" \
+                            RuntimeLibrary="$release_runtime" \
+                            UsePrecompiledHeader="0" \
+                            WarningLevel="3" \
+                            DebugInformationFormat="0" \
+                            Detect64BitPortabilityProblems="true" \
+
+                        $uses_asm && tag Tool Name="YASM"  IncludePaths="$incs"
+                    ;;
                 esac
+            ;;
+        esac
 
         case "$proj_kind" in
             exe)
                 case "$target" in
-                    x86*) tag Tool \
-                                  Name="VCLinkerTool" \
-                                  AdditionalDependencies="$libs \$(NoInherit)" \
-                                  AdditionalLibraryDirectories="$libdirs" \
-                    ;;
-                    arm*|iwmmx*)
+                    x86*)
                         case "$name" in
-                            obj_int_extract) tag Tool \
-                                Name="VCLinkerTool" \
-                                OutputFile="${name}.exe" \
-                                LinkIncremental="1" \
-                                GenerateDebugInformation="false" \
-                                SubSystem="0" \
-                                OptimizeReferences="0" \
-                                EnableCOMDATFolding="0" \
-                                TargetMachine="0"
+                            obj_int_extract)
+                                tag Tool \
+                                    Name="VCLinkerTool" \
+                                    OutputFile="${name}.exe" \
+                                    GenerateDebugInformation="true" \
                             ;;
-                            *) tag Tool \
-                                Name="VCLinkerTool" \
-                                AdditionalDependencies="$libs" \
-                                OutputFile="\$(OutDir)/${name}.exe" \
-                                LinkIncremental="1" \
-                                AdditionalLibraryDirectories="${libdirs};&quot;..\lib/$plat_no_ws&quot;" \
-                                DelayLoadDLLs="\$(NOINHERIT)" \
-                                GenerateDebugInformation="true" \
-                                ProgramDatabaseFile="\$(OutDir)/${name}.pdb" \
-                                SubSystem="9" \
-                                StackReserveSize="65536" \
-                                StackCommitSize="4096" \
-                                OptimizeReferences="2" \
-                                EnableCOMDATFolding="2" \
-                                EntryPointSymbol="mainWCRTStartup" \
-                                TargetMachine="3"
+                            *)
+                                tag Tool \
+                                    Name="VCLinkerTool" \
+                                    AdditionalDependencies="$libs \$(NoInherit)" \
+                                    AdditionalLibraryDirectories="$libdirs" \
+
                             ;;
                         esac
-                     ;;
+                    ;;
                  esac
             ;;
-        lib)
+            lib)
                 case "$target" in
-                      arm*|iwmmx*) tag Tool \
-                                    Name="VCLibrarianTool" \
-                                    AdditionalOptions=" /subsystem:windowsce,4.20 /machine:ARM" \
-                                    OutputFile="\$(OutDir)/${name}.lib" \
-                                ;;
-                                *) tag Tool \
-                                    Name="VCLibrarianTool" \
-                                    OutputFile="\$(OutDir)/${name}${lib_sfx}.lib" \
-                                ;;
-                esac
-        ;;
-        dll) # note differences to debug version: LinkIncremental, AssemblyDebug
-             tag Tool \
-                      Name="VCLinkerTool" \
-                      AdditionalDependencies="\$(NoInherit)" \
-                      LinkIncremental="1" \
-                      GenerateDebugInformation="true" \
-                      TargetMachine="1" \
-                      $link_opts
-        esac
+                    x86*)
+                        tag Tool \
+                            Name="VCLibrarianTool" \
+                            OutputFile="\$(OutDir)/${name}${lib_sfx}.lib" \
 
-        if [ "$target" == "armv6-wince-vs8" ] || [ "$target" == "armv5te-wince-vs8" ] || [ "$target" == "iwmmxt-wince-vs8" ] || [ "$target" == "iwmmxt2-wince-vs8" ];then
-            case "$name" in
-                vpx)         tag DeploymentTool \
-                             ForceDirty="-1" \
-                             RegisterOutput="0"
-                ;;
-                example|xma) tag DeploymentTool \
-                             ForceDirty="-1" \
-                             RegisterOutput="0"
-                             tag DebuggerTool \
-                             Arguments="${ARGU}"
-                                ;;
-            esac
-        fi
+                    ;;
+                esac
+            ;;
+            dll) # note differences to debug version: LinkIncremental, AssemblyDebug
+                tag Tool \
+                    Name="VCLinkerTool" \
+                    AdditionalDependencies="\$(NoInherit)" \
+                    LinkIncremental="1" \
+                    GenerateDebugInformation="true" \
+                    TargetMachine="1" \
+                    $link_opts \
+
+            ;;
+        esac
 
         close_tag Configuration
     done
     close_tag Configurations
 
-    open_tag  Files
-    generate_filter srcs   "Source Files"   "cpp;c;cc;cxx;def;odl;idl;hpj;bat;asm;asmx"
-    generate_filter hdrs   "Header Files"   "h;hpp;hxx;hm;inl;inc;xsd"
+    open_tag Files
+    generate_filter srcs   "Source Files"   "c;def;odl;idl;hpj;bat;asm;asmx"
+    generate_filter hdrs   "Header Files"   "h;hm;inl;inc;xsd"
     generate_filter resrcs "Resource Files" "rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav"
     generate_filter resrcs "Build Files"    "mk"
     close_tag Files

diff --git a/build/make/gen_msvs_sln.sh b/build/make/gen_msvs_sln.sh
index 9cf0900..240678b 100755
--- a/build/make/gen_msvs_sln.sh
+++ b/build/make/gen_msvs_sln.sh

@@ -139,9 +139,6 @@
             echo "${indent}${proj_guid}.${config}.ActiveCfg = ${config}"
             echo "${indent}${proj_guid}.${config}.Build.0 = ${config}"
 
-            if [ "$target" == "armv6-wince-vs8" ] || [ "$target" == "armv5te-wince-vs8" ] || [ "$target" == "iwmmxt-wince-vs8" ] || [ "$target" == "iwmmxt2-wince-vs8" ];then
-                echo "${indent}${proj_guid}.${config}.Deploy.0 = ${config}"
-            fi
         done
         IFS=${IFS_bak}
     done

diff --git a/build/make/obj_int_extract.c b/build/make/obj_int_extract.c
index e01870f..c46d9d5 100644
--- a/build/make/obj_int_extract.c
+++ b/build/make/obj_int_extract.c

@@ -14,7 +14,7 @@
 
 #include "vpx_config.h"
 
-#if defined(_MSC_VER)
+#if defined(_MSC_VER) || defined(__MINGW32__)
 #include <io.h>
 #include <share.h>
 #include "vpx/vpx_integer.h"
@@ -59,20 +59,47 @@
     struct mach_header header;
     uint8_t *buf = base_buf;
     int base_data_section = 0;
+    int bits = 0;
 
+    /* We can read in mach_header for 32 and 64 bit architectures
+     * because it's identical to mach_header_64 except for the last
+     * element (uint32_t reserved), which we don't use. Then, when
+     * we know which architecture we're looking at, increment buf
+     * appropriately.
+     */
     memcpy(&header, buf, sizeof(struct mach_header));
-    buf += sizeof(struct mach_header);
 
-    if (header.magic != MH_MAGIC)
+    if (header.magic == MH_MAGIC)
     {
-        log_msg("Bad magic number for object file. 0x%x expected, 0x%x found.\n",
-                header.magic, MH_MAGIC);
-        goto bail;
+        if (header.cputype == CPU_TYPE_ARM
+            || header.cputype == CPU_TYPE_X86)
+        {
+            bits = 32;
+            buf += sizeof(struct mach_header);
+        }
+        else
+        {
+            log_msg("Bad cputype for object file. Currently only tested for CPU_TYPE_[ARM|X86].\n");
+            goto bail;
+        }
     }
-
-    if (header.cputype != CPU_TYPE_ARM)
+    else if (header.magic == MH_MAGIC_64)
     {
-        log_msg("Bad cputype for object file. Currently only tested for CPU_TYPE_ARM.\n");
+        if (header.cputype == CPU_TYPE_X86_64)
+        {
+            bits = 64;
+            buf += sizeof(struct mach_header_64);
+        }
+        else
+        {
+            log_msg("Bad cputype for object file. Currently only tested for CPU_TYPE_X86_64.\n");
+            goto bail;
+        }
+    }
+    else
+    {
+        log_msg("Bad magic number for object file. 0x%x or 0x%x expected, 0x%x found.\n",
+                MH_MAGIC, MH_MAGIC_64, header.magic);
         goto bail;
     }
 
@@ -85,8 +112,6 @@
     for (i = 0; i < header.ncmds; i++)
     {
         struct load_command lc;
-        struct symtab_command sc;
-        struct segment_command seg_c;
 
         memcpy(&lc, buf, sizeof(struct load_command));
 
@@ -94,50 +119,99 @@
         {
             uint8_t *seg_buf = buf;
             struct section s;
+            struct segment_command seg_c;
 
-            memcpy(&seg_c, buf, sizeof(struct segment_command));
-
+            memcpy(&seg_c, seg_buf, sizeof(struct segment_command));
             seg_buf += sizeof(struct segment_command);
 
-            for (j = 0; j < seg_c.nsects; j++)
+            /* Although each section is given it's own offset, nlist.n_value
+             * references the offset of the first section. This isn't
+             * apparent without debug information because the offset of the
+             * data section is the same as the first section. However, with
+             * debug sections mixed in, the offset of the debug section
+             * increases but n_value still references the first section.
+             */
+            if (seg_c.nsects < 1)
             {
-                memcpy(&s, seg_buf + (j * sizeof(struct section)), sizeof(struct section));
-
-                // Need to get this offset which is the start of the symbol table
-                // before matching the strings up with symbols.
-                base_data_section = s.offset;
+                log_msg("Not enough sections\n");
+                goto bail;
             }
+
+            memcpy(&s, seg_buf, sizeof(struct section));
+            base_data_section = s.offset;
+        }
+        else if (lc.cmd == LC_SEGMENT_64)
+        {
+            uint8_t *seg_buf = buf;
+            struct section_64 s;
+            struct segment_command_64 seg_c;
+
+            memcpy(&seg_c, seg_buf, sizeof(struct segment_command_64));
+            seg_buf += sizeof(struct segment_command_64);
+
+            /* Explanation in LG_SEGMENT */
+            if (seg_c.nsects < 1)
+            {
+                log_msg("Not enough sections\n");
+                goto bail;
+            }
+
+            memcpy(&s, seg_buf, sizeof(struct section_64));
+            base_data_section = s.offset;
         }
         else if (lc.cmd == LC_SYMTAB)
         {
-            uint8_t *sym_buf = base_buf;
-            uint8_t *str_buf = base_buf;
-
             if (base_data_section != 0)
             {
+                struct symtab_command sc;
+                uint8_t *sym_buf = base_buf;
+                uint8_t *str_buf = base_buf;
+
                 memcpy(&sc, buf, sizeof(struct symtab_command));
 
                 if (sc.cmdsize != sizeof(struct symtab_command))
+                {
                     log_msg("Can't find symbol table!\n");
+                    goto bail;
+                }
 
                 sym_buf += sc.symoff;
                 str_buf += sc.stroff;
 
                 for (j = 0; j < sc.nsyms; j++)
                 {
-                    struct nlist nl;
-                    int val;
+                    /* Location of string is cacluated each time from the
+                     * start of the string buffer.  On darwin the symbols
+                     * are prefixed by "_", so we bump the pointer by 1.
+                     * The target value is defined as an int in asm_*_offsets.c,
+                     * which is 4 bytes on all targets we currently use.
+                     */
+                    if (bits == 32)
+                    {
+                        struct nlist nl;
+                        int val;
 
-                    memcpy(&nl, sym_buf + (j * sizeof(struct nlist)), sizeof(struct nlist));
+                        memcpy(&nl, sym_buf, sizeof(struct nlist));
+                        sym_buf += sizeof(struct nlist);
 
-                    val = *((int *)(base_buf + base_data_section + nl.n_value));
+                        memcpy(&val, base_buf + base_data_section + nl.n_value,
+                               sizeof(val));
+                        printf("%-40s EQU %5d\n",
+                               str_buf + nl.n_un.n_strx + 1, val);
+                    }
+                    else /* if (bits == 64) */
+                    {
+                        struct nlist_64 nl;
+                        int val;
 
-                    // Location of string is cacluated each time from the
-                    // start of the string buffer.  On darwin the symbols
-                    // are prefixed by "_".  On other platforms it is not
-                    // so it needs to be removed.  That is the reason for
-                    // the +1.
-                    printf("%-40s EQU %5d\n", str_buf + nl.n_un.n_strx + 1, val);
+                        memcpy(&nl, sym_buf, sizeof(struct nlist_64));
+                        sym_buf += sizeof(struct nlist_64);
+
+                        memcpy(&val, base_buf + base_data_section + nl.n_value,
+                               sizeof(val));
+                        printf("%-40s EQU %5d\n",
+                               str_buf + nl.n_un.n_strx + 1, val);
+                    }
                 }
             }
         }
@@ -218,7 +292,7 @@
     return EXIT_FAILURE;
 }
 
-#else
+#elif defined(__ELF__)
 #include "elf.h"
 
 #define COPY_STRUCT(dst, buf, ofst, sz) do {\
@@ -237,212 +311,420 @@
 
 typedef struct
 {
-    uint8_t     *buf; /* Buffer containing ELF data */
-    size_t       sz;  /* Buffer size */
-    int          le_data;   /* Data is little-endian */
-    Elf32_Ehdr   hdr;
+    uint8_t      *buf; /* Buffer containing ELF data */
+    size_t        sz;  /* Buffer size */
+    int           le_data; /* Data is little-endian */
+    unsigned char e_ident[EI_NIDENT]; /* Magic number and other info */
+    int           bits; /* 32 or 64 */
+    Elf32_Ehdr    hdr32;
+    Elf64_Ehdr    hdr64;
 } elf_obj_t;
 
-int parse_elf32_header(elf_obj_t *elf)
+int parse_elf_header(elf_obj_t *elf)
 {
     int res;
-    /* Verify ELF32 header */
-    COPY_STRUCT(&elf->hdr, elf->buf, 0, elf->sz);
-    res = elf->hdr.e_ident[EI_MAG0] == ELFMAG0;
-    res &= elf->hdr.e_ident[EI_MAG1] == ELFMAG1;
-    res &= elf->hdr.e_ident[EI_MAG2] == ELFMAG2;
-    res &= elf->hdr.e_ident[EI_MAG3] == ELFMAG3;
-    res &= elf->hdr.e_ident[EI_CLASS] == ELFCLASS32;
-    res &= elf->hdr.e_ident[EI_DATA] == ELFDATA2LSB
-           || elf->hdr.e_ident[EI_DATA] == ELFDATA2MSB;
+    /* Verify ELF Magic numbers */
+    COPY_STRUCT(&elf->e_ident, elf->buf, 0, elf->sz);
+    res = elf->e_ident[EI_MAG0] == ELFMAG0;
+    res &= elf->e_ident[EI_MAG1] == ELFMAG1;
+    res &= elf->e_ident[EI_MAG2] == ELFMAG2;
+    res &= elf->e_ident[EI_MAG3] == ELFMAG3;
+    res &= elf->e_ident[EI_CLASS] == ELFCLASS32
+        || elf->e_ident[EI_CLASS] == ELFCLASS64;
+    res &= elf->e_ident[EI_DATA] == ELFDATA2LSB;
 
     if (!res) goto bail;
 
-    elf->le_data = elf->hdr.e_ident[EI_DATA] == ELFDATA2LSB;
+    elf->le_data = elf->e_ident[EI_DATA] == ELFDATA2LSB;
 
-    ENDIAN_ASSIGN_IN_PLACE(elf->hdr.e_type);
-    ENDIAN_ASSIGN_IN_PLACE(elf->hdr.e_machine);
-    ENDIAN_ASSIGN_IN_PLACE(elf->hdr.e_version);
-    ENDIAN_ASSIGN_IN_PLACE(elf->hdr.e_entry);
-    ENDIAN_ASSIGN_IN_PLACE(elf->hdr.e_phoff);
-    ENDIAN_ASSIGN_IN_PLACE(elf->hdr.e_shoff);
-    ENDIAN_ASSIGN_IN_PLACE(elf->hdr.e_flags);
-    ENDIAN_ASSIGN_IN_PLACE(elf->hdr.e_ehsize);
-    ENDIAN_ASSIGN_IN_PLACE(elf->hdr.e_phentsize);
-    ENDIAN_ASSIGN_IN_PLACE(elf->hdr.e_phnum);
-    ENDIAN_ASSIGN_IN_PLACE(elf->hdr.e_shentsize);
-    ENDIAN_ASSIGN_IN_PLACE(elf->hdr.e_shnum);
-    ENDIAN_ASSIGN_IN_PLACE(elf->hdr.e_shstrndx);
-    return 0;
-bail:
-    return 1;
-}
-
-int parse_elf32_section(elf_obj_t *elf, int idx, Elf32_Shdr *hdr)
-{
-    if (idx >= elf->hdr.e_shnum)
-        goto bail;
-
-    COPY_STRUCT(hdr, elf->buf, elf->hdr.e_shoff + idx * elf->hdr.e_shentsize,
-                elf->sz);
-    ENDIAN_ASSIGN_IN_PLACE(hdr->sh_name);
-    ENDIAN_ASSIGN_IN_PLACE(hdr->sh_type);
-    ENDIAN_ASSIGN_IN_PLACE(hdr->sh_flags);
-    ENDIAN_ASSIGN_IN_PLACE(hdr->sh_addr);
-    ENDIAN_ASSIGN_IN_PLACE(hdr->sh_offset);
-    ENDIAN_ASSIGN_IN_PLACE(hdr->sh_size);
-    ENDIAN_ASSIGN_IN_PLACE(hdr->sh_link);
-    ENDIAN_ASSIGN_IN_PLACE(hdr->sh_info);
-    ENDIAN_ASSIGN_IN_PLACE(hdr->sh_addralign);
-    ENDIAN_ASSIGN_IN_PLACE(hdr->sh_entsize);
-    return 0;
-bail:
-    return 1;
-}
-
-char *parse_elf32_string_table(elf_obj_t *elf, int s_idx, int idx)
-{
-    Elf32_Shdr shdr;
-
-    if (parse_elf32_section(elf, s_idx, &shdr))
+    /* Read in relevant values */
+    if (elf->e_ident[EI_CLASS] == ELFCLASS32)
     {
-        log_msg("Failed to parse ELF string table: section %d, index %d\n",
-                s_idx, idx);
-        return "";
+        elf->bits = 32;
+        COPY_STRUCT(&elf->hdr32, elf->buf, 0, elf->sz);
+
+        ENDIAN_ASSIGN_IN_PLACE(elf->hdr32.e_type);
+        ENDIAN_ASSIGN_IN_PLACE(elf->hdr32.e_machine);
+        ENDIAN_ASSIGN_IN_PLACE(elf->hdr32.e_version);
+        ENDIAN_ASSIGN_IN_PLACE(elf->hdr32.e_entry);
+        ENDIAN_ASSIGN_IN_PLACE(elf->hdr32.e_phoff);
+        ENDIAN_ASSIGN_IN_PLACE(elf->hdr32.e_shoff);
+        ENDIAN_ASSIGN_IN_PLACE(elf->hdr32.e_flags);
+        ENDIAN_ASSIGN_IN_PLACE(elf->hdr32.e_ehsize);
+        ENDIAN_ASSIGN_IN_PLACE(elf->hdr32.e_phentsize);
+        ENDIAN_ASSIGN_IN_PLACE(elf->hdr32.e_phnum);
+        ENDIAN_ASSIGN_IN_PLACE(elf->hdr32.e_shentsize);
+        ENDIAN_ASSIGN_IN_PLACE(elf->hdr32.e_shnum);
+        ENDIAN_ASSIGN_IN_PLACE(elf->hdr32.e_shstrndx);
+    }
+    else /* if (elf->e_ident[EI_CLASS] == ELFCLASS64) */
+    {
+        elf->bits = 64;
+        COPY_STRUCT(&elf->hdr64, elf->buf, 0, elf->sz);
+
+        ENDIAN_ASSIGN_IN_PLACE(elf->hdr64.e_type);
+        ENDIAN_ASSIGN_IN_PLACE(elf->hdr64.e_machine);
+        ENDIAN_ASSIGN_IN_PLACE(elf->hdr64.e_version);
+        ENDIAN_ASSIGN_IN_PLACE(elf->hdr64.e_entry);
+        ENDIAN_ASSIGN_IN_PLACE(elf->hdr64.e_phoff);
+        ENDIAN_ASSIGN_IN_PLACE(elf->hdr64.e_shoff);
+        ENDIAN_ASSIGN_IN_PLACE(elf->hdr64.e_flags);
+        ENDIAN_ASSIGN_IN_PLACE(elf->hdr64.e_ehsize);
+        ENDIAN_ASSIGN_IN_PLACE(elf->hdr64.e_phentsize);
+        ENDIAN_ASSIGN_IN_PLACE(elf->hdr64.e_phnum);
+        ENDIAN_ASSIGN_IN_PLACE(elf->hdr64.e_shentsize);
+        ENDIAN_ASSIGN_IN_PLACE(elf->hdr64.e_shnum);
+        ENDIAN_ASSIGN_IN_PLACE(elf->hdr64.e_shstrndx);
     }
 
-    return (char *)(elf->buf + shdr.sh_offset + idx);
+    return 0;
+bail:
+    log_msg("Failed to parse ELF file header");
+    return 1;
 }
 
-int parse_elf32_symbol(elf_obj_t *elf, unsigned int ofst, Elf32_Sym *sym)
+int parse_elf_section(elf_obj_t *elf, int idx, Elf32_Shdr *hdr32, Elf64_Shdr *hdr64)
 {
-    COPY_STRUCT(sym, elf->buf, ofst, elf->sz);
-    ENDIAN_ASSIGN_IN_PLACE(sym->st_name);
-    ENDIAN_ASSIGN_IN_PLACE(sym->st_value);
-    ENDIAN_ASSIGN_IN_PLACE(sym->st_size);
-    ENDIAN_ASSIGN_IN_PLACE(sym->st_info);
-    ENDIAN_ASSIGN_IN_PLACE(sym->st_other);
-    ENDIAN_ASSIGN_IN_PLACE(sym->st_shndx);
+    if (hdr32)
+    {
+        if (idx >= elf->hdr32.e_shnum)
+            goto bail;
+
+        COPY_STRUCT(hdr32, elf->buf, elf->hdr32.e_shoff + idx * elf->hdr32.e_shentsize,
+                    elf->sz);
+        ENDIAN_ASSIGN_IN_PLACE(hdr32->sh_name);
+        ENDIAN_ASSIGN_IN_PLACE(hdr32->sh_type);
+        ENDIAN_ASSIGN_IN_PLACE(hdr32->sh_flags);
+        ENDIAN_ASSIGN_IN_PLACE(hdr32->sh_addr);
+        ENDIAN_ASSIGN_IN_PLACE(hdr32->sh_offset);
+        ENDIAN_ASSIGN_IN_PLACE(hdr32->sh_size);
+        ENDIAN_ASSIGN_IN_PLACE(hdr32->sh_link);
+        ENDIAN_ASSIGN_IN_PLACE(hdr32->sh_info);
+        ENDIAN_ASSIGN_IN_PLACE(hdr32->sh_addralign);
+        ENDIAN_ASSIGN_IN_PLACE(hdr32->sh_entsize);
+    }
+    else /* if (hdr64) */
+    {
+        if (idx >= elf->hdr64.e_shnum)
+            goto bail;
+
+        COPY_STRUCT(hdr64, elf->buf, elf->hdr64.e_shoff + idx * elf->hdr64.e_shentsize,
+                    elf->sz);
+        ENDIAN_ASSIGN_IN_PLACE(hdr64->sh_name);
+        ENDIAN_ASSIGN_IN_PLACE(hdr64->sh_type);
+        ENDIAN_ASSIGN_IN_PLACE(hdr64->sh_flags);
+        ENDIAN_ASSIGN_IN_PLACE(hdr64->sh_addr);
+        ENDIAN_ASSIGN_IN_PLACE(hdr64->sh_offset);
+        ENDIAN_ASSIGN_IN_PLACE(hdr64->sh_size);
+        ENDIAN_ASSIGN_IN_PLACE(hdr64->sh_link);
+        ENDIAN_ASSIGN_IN_PLACE(hdr64->sh_info);
+        ENDIAN_ASSIGN_IN_PLACE(hdr64->sh_addralign);
+        ENDIAN_ASSIGN_IN_PLACE(hdr64->sh_entsize);
+    }
+
     return 0;
 bail:
     return 1;
 }
 
-int parse_elf32(uint8_t *buf, size_t sz, output_fmt_t mode)
+char *parse_elf_string_table(elf_obj_t *elf, int s_idx, int idx)
 {
-    elf_obj_t  elf;
-    Elf32_Shdr shdr;
+    if (elf->bits == 32)
+    {
+        Elf32_Shdr shdr;
+
+        if (parse_elf_section(elf, s_idx, &shdr, NULL))
+        {
+            log_msg("Failed to parse ELF string table: section %d, index %d\n",
+                    s_idx, idx);
+            return "";
+        }
+
+        return (char *)(elf->buf + shdr.sh_offset + idx);
+    }
+    else /* if (elf->bits == 64) */
+    {
+        Elf64_Shdr shdr;
+
+        if (parse_elf_section(elf, s_idx, NULL, &shdr))
+        {
+            log_msg("Failed to parse ELF string table: section %d, index %d\n",
+                    s_idx, idx);
+            return "";
+        }
+
+        return (char *)(elf->buf + shdr.sh_offset + idx);
+    }
+}
+
+int parse_elf_symbol(elf_obj_t *elf, unsigned int ofst, Elf32_Sym *sym32, Elf64_Sym *sym64)
+{
+    if (sym32)
+    {
+        COPY_STRUCT(sym32, elf->buf, ofst, elf->sz);
+        ENDIAN_ASSIGN_IN_PLACE(sym32->st_name);
+        ENDIAN_ASSIGN_IN_PLACE(sym32->st_value);
+        ENDIAN_ASSIGN_IN_PLACE(sym32->st_size);
+        ENDIAN_ASSIGN_IN_PLACE(sym32->st_info);
+        ENDIAN_ASSIGN_IN_PLACE(sym32->st_other);
+        ENDIAN_ASSIGN_IN_PLACE(sym32->st_shndx);
+    }
+    else /* if (sym64) */
+    {
+        COPY_STRUCT(sym64, elf->buf, ofst, elf->sz);
+        ENDIAN_ASSIGN_IN_PLACE(sym64->st_name);
+        ENDIAN_ASSIGN_IN_PLACE(sym64->st_value);
+        ENDIAN_ASSIGN_IN_PLACE(sym64->st_size);
+        ENDIAN_ASSIGN_IN_PLACE(sym64->st_info);
+        ENDIAN_ASSIGN_IN_PLACE(sym64->st_other);
+        ENDIAN_ASSIGN_IN_PLACE(sym64->st_shndx);
+    }
+    return 0;
+bail:
+    return 1;
+}
+
+int parse_elf(uint8_t *buf, size_t sz, output_fmt_t mode)
+{
+    elf_obj_t    elf;
     unsigned int ofst;
-    int         i;
-    Elf32_Off strtab_off;   /* save String Table offset for later use */
+    int          i;
+    Elf32_Off    strtab_off32;
+    Elf64_Off    strtab_off64; /* save String Table offset for later use */
 
     memset(&elf, 0, sizeof(elf));
     elf.buf = buf;
     elf.sz = sz;
 
     /* Parse Header */
-    if (parse_elf32_header(&elf))
-    {
-        log_msg("Parse error: File does not appear to be valid ELF32\n");
-        return 1;
-    }
+    if (parse_elf_header(&elf))
+      goto bail;
 
-    for (i = 0; i < elf.hdr.e_shnum; i++)
+    if (elf.bits == 32)
     {
-        parse_elf32_section(&elf, i, &shdr);
-
-        if (shdr.sh_type == SHT_STRTAB)
+        Elf32_Shdr shdr;
+        for (i = 0; i < elf.hdr32.e_shnum; i++)
         {
-            char strtsb_name[128];
+            parse_elf_section(&elf, i, &shdr, NULL);
 
-            strcpy(strtsb_name, (char *)(elf.buf + shdr.sh_offset + shdr.sh_name));
-
-            if (!(strcmp(strtsb_name, ".shstrtab")))
+            if (shdr.sh_type == SHT_STRTAB)
             {
-                log_msg("found section: %s\n", strtsb_name);
-                strtab_off = shdr.sh_offset;
-                break;
+                char strtsb_name[128];
+
+                strcpy(strtsb_name, (char *)(elf.buf + shdr.sh_offset + shdr.sh_name));
+
+                if (!(strcmp(strtsb_name, ".shstrtab")))
+                {
+                    /* log_msg("found section: %s\n", strtsb_name); */
+                    strtab_off32 = shdr.sh_offset;
+                    break;
+                }
+            }
+        }
+    }
+    else /* if (elf.bits == 64) */
+    {
+        Elf64_Shdr shdr;
+        for (i = 0; i < elf.hdr64.e_shnum; i++)
+        {
+            parse_elf_section(&elf, i, NULL, &shdr);
+
+            if (shdr.sh_type == SHT_STRTAB)
+            {
+                char strtsb_name[128];
+
+                strcpy(strtsb_name, (char *)(elf.buf + shdr.sh_offset + shdr.sh_name));
+
+                if (!(strcmp(strtsb_name, ".shstrtab")))
+                {
+                    /* log_msg("found section: %s\n", strtsb_name); */
+                    strtab_off64 = shdr.sh_offset;
+                    break;
+                }
             }
         }
     }
 
     /* Parse all Symbol Tables */
-    for (i = 0; i < elf.hdr.e_shnum; i++)
+    if (elf.bits == 32)
     {
-
-        parse_elf32_section(&elf, i, &shdr);
-
-        if (shdr.sh_type == SHT_SYMTAB)
+        Elf32_Shdr shdr;
+        for (i = 0; i < elf.hdr32.e_shnum; i++)
         {
-            for (ofst = shdr.sh_offset;
-                 ofst < shdr.sh_offset + shdr.sh_size;
-                 ofst += shdr.sh_entsize)
+            parse_elf_section(&elf, i, &shdr, NULL);
+
+            if (shdr.sh_type == SHT_SYMTAB)
             {
-                Elf32_Sym sym;
-
-                parse_elf32_symbol(&elf, ofst, &sym);
-
-                /* For all OBJECTS (data objects), extract the value from the
-                 * proper data segment.
-                 */
-                if (ELF32_ST_TYPE(sym.st_info) == STT_OBJECT && sym.st_name)
-                    log_msg("found data object %s\n",
-                            parse_elf32_string_table(&elf,
-                                                     shdr.sh_link,
-                                                     sym.st_name));
-
-                if (ELF32_ST_TYPE(sym.st_info) == STT_OBJECT
-                    && sym.st_size == 4)
+                for (ofst = shdr.sh_offset;
+                     ofst < shdr.sh_offset + shdr.sh_size;
+                     ofst += shdr.sh_entsize)
                 {
-                    Elf32_Shdr dhdr;
-                    int32_t      val;
-                    char section_name[128];
+                    Elf32_Sym sym;
 
-                    parse_elf32_section(&elf, sym.st_shndx, &dhdr);
+                    parse_elf_symbol(&elf, ofst, &sym, NULL);
 
-                    /* For explanition - refer to _MSC_VER version of code */
-                    strcpy(section_name, (char *)(elf.buf + strtab_off + dhdr.sh_name));
-                    log_msg("Section_name: %s, Section_type: %d\n", section_name, dhdr.sh_type);
+                    /* For all OBJECTS (data objects), extract the value from the
+                     * proper data segment.
+                     */
+                    /* if (ELF32_ST_TYPE(sym.st_info) == STT_OBJECT && sym.st_name)
+                        log_msg("found data object %s\n",
+                                parse_elf_string_table(&elf,
+                                                       shdr.sh_link,
+                                                       sym.st_name));
+                     */
 
-                    if (!(strcmp(section_name, ".bss")))
+                    if (ELF32_ST_TYPE(sym.st_info) == STT_OBJECT
+                        && sym.st_size == 4)
                     {
-                        val = 0;
+                        Elf32_Shdr dhdr;
+                        int val = 0;
+                        char section_name[128];
+
+                        parse_elf_section(&elf, sym.st_shndx, &dhdr, NULL);
+
+                        /* For explanition - refer to _MSC_VER version of code */
+                        strcpy(section_name, (char *)(elf.buf + strtab_off32 + dhdr.sh_name));
+                        /* log_msg("Section_name: %s, Section_type: %d\n", section_name, dhdr.sh_type); */
+
+                        if (strcmp(section_name, ".bss"))
+                        {
+                            if (sizeof(val) != sym.st_size)
+                            {
+                                /* The target value is declared as an int in
+                                 * asm_*_offsets.c, which is 4 bytes on all
+                                 * targets we currently use. Complain loudly if
+                                 * this is not true.
+                                 */
+                                log_msg("Symbol size is wrong\n");
+                                goto bail;
+                            }
+
+                            memcpy(&val,
+                                   elf.buf + dhdr.sh_offset + sym.st_value,
+                                   sym.st_size);
+                        }
+
+                        if (!elf.le_data)
+                        {
+                            log_msg("Big Endian data not supported yet!\n");
+                            goto bail;
+                        }
+
+                        switch (mode)
+                        {
+                            case OUTPUT_FMT_RVDS:
+                                printf("%-40s EQU %5d\n",
+                                       parse_elf_string_table(&elf,
+                                                              shdr.sh_link,
+                                                              sym.st_name),
+                                       val);
+                                break;
+                            case OUTPUT_FMT_GAS:
+                                printf(".equ %-40s, %5d\n",
+                                       parse_elf_string_table(&elf,
+                                                              shdr.sh_link,
+                                                              sym.st_name),
+                                       val);
+                                break;
+                            default:
+                                printf("%s = %d\n",
+                                       parse_elf_string_table(&elf,
+                                                              shdr.sh_link,
+                                                              sym.st_name),
+                                       val);
+                        }
                     }
-                    else
-                    {
-                        memcpy(&val,
-                               elf.buf + dhdr.sh_offset + sym.st_value,
-                               sizeof(val));
-                    }
+                }
+            }
+        }
+    }
+    else /* if (elf.bits == 64) */
+    {
+        Elf64_Shdr shdr;
+        for (i = 0; i < elf.hdr64.e_shnum; i++)
+        {
+            parse_elf_section(&elf, i, NULL, &shdr);
 
-                    if (!elf.le_data)
-                    {
-                        log_msg("Big Endian data not supported yet!\n");
-                        goto bail;
-                    }\
+            if (shdr.sh_type == SHT_SYMTAB)
+            {
+                for (ofst = shdr.sh_offset;
+                     ofst < shdr.sh_offset + shdr.sh_size;
+                     ofst += shdr.sh_entsize)
+                {
+                    Elf64_Sym sym;
 
-                    switch (mode)
+                    parse_elf_symbol(&elf, ofst, NULL, &sym);
+
+                    /* For all OBJECTS (data objects), extract the value from the
+                     * proper data segment.
+                     */
+                    /* if (ELF64_ST_TYPE(sym.st_info) == STT_OBJECT && sym.st_name)
+                        log_msg("found data object %s\n",
+                                parse_elf_string_table(&elf,
+                                                       shdr.sh_link,
+                                                       sym.st_name));
+                     */
+
+                    if (ELF64_ST_TYPE(sym.st_info) == STT_OBJECT
+                        && sym.st_size == 4)
                     {
-                    case OUTPUT_FMT_RVDS:
-                        printf("%-40s EQU %5d\n",
-                               parse_elf32_string_table(&elf,
-                                                        shdr.sh_link,
-                                                        sym.st_name),
-                               val);
-                        break;
-                    case OUTPUT_FMT_GAS:
-                        printf(".equ %-40s, %5d\n",
-                               parse_elf32_string_table(&elf,
-                                                        shdr.sh_link,
-                                                        sym.st_name),
-                               val);
-                        break;
-                    default:
-                        printf("%s = %d\n",
-                               parse_elf32_string_table(&elf,
-                                                        shdr.sh_link,
-                                                        sym.st_name),
-                               val);
+                        Elf64_Shdr dhdr;
+                        int val = 0;
+                        char section_name[128];
+
+                        parse_elf_section(&elf, sym.st_shndx, NULL, &dhdr);
+
+                        /* For explanition - refer to _MSC_VER version of code */
+                        strcpy(section_name, (char *)(elf.buf + strtab_off64 + dhdr.sh_name));
+                        /* log_msg("Section_name: %s, Section_type: %d\n", section_name, dhdr.sh_type); */
+
+                        if ((strcmp(section_name, ".bss")))
+                        {
+                            if (sizeof(val) != sym.st_size)
+                            {
+                                /* The target value is declared as an int in
+                                 * asm_*_offsets.c, which is 4 bytes on all
+                                 * targets we currently use. Complain loudly if
+                                 * this is not true.
+                                 */
+                                log_msg("Symbol size is wrong\n");
+                                goto bail;
+                            }
+
+                            memcpy(&val,
+                                   elf.buf + dhdr.sh_offset + sym.st_value,
+                                   sym.st_size);
+                        }
+
+                        if (!elf.le_data)
+                        {
+                            log_msg("Big Endian data not supported yet!\n");
+                            goto bail;
+                        }
+
+                        switch (mode)
+                        {
+                            case OUTPUT_FMT_RVDS:
+                                printf("%-40s EQU %5d\n",
+                                       parse_elf_string_table(&elf,
+                                                              shdr.sh_link,
+                                                              sym.st_name),
+                                       val);
+                                break;
+                            case OUTPUT_FMT_GAS:
+                                printf(".equ %-40s, %5d\n",
+                                       parse_elf_string_table(&elf,
+                                                              shdr.sh_link,
+                                                              sym.st_name),
+                                       val);
+                                break;
+                            default:
+                                printf("%s = %d\n",
+                                       parse_elf_string_table(&elf,
+                                                              shdr.sh_link,
+                                                              sym.st_name),
+                                       val);
+                        }
                     }
                 }
             }
@@ -454,7 +736,7 @@
 
     return 0;
 bail:
-    log_msg("Parse error: File does not appear to be valid ELF32\n");
+    log_msg("Parse error: File does not appear to be valid ELF32 or ELF64\n");
     return 1;
 }
 
@@ -521,8 +803,7 @@
         goto bail;
     }
 
-    res = parse_elf32(file_buf, stat_buf.st_size, mode);
-    //res = parse_coff(file_buf, stat_buf.st_size);
+    res = parse_elf(file_buf, stat_buf.st_size, mode);
     free(file_buf);
 
     if (!res)
@@ -535,7 +816,7 @@
 #endif
 
 
-#if defined(_MSC_VER)
+#if defined(_MSC_VER) || defined(__MINGW32__)
 /*  See "Microsoft Portable Executable and Common Object File Format Specification"
     for reference.
 */
@@ -549,7 +830,6 @@
     unsigned int i;
     unsigned __int8 *ptr;
     unsigned __int32 symoffset;
-    FILE *fp;
 
     char **sectionlist;  //this array holds all section names in their correct order.
     //it is used to check if the symbol is in .bss or .data section.
@@ -560,9 +840,18 @@
     strtab_ptr = symtab_ptr + symtab_sz * 18;
 
     if (nsections > 96)
-        goto bail;
+    {
+        log_msg("Too many sections\n");
+        return 1;
+    }
 
-    sectionlist = malloc(nsections * sizeof * sectionlist);
+    sectionlist = malloc(nsections * sizeof(sectionlist));
+
+    if (sectionlist == NULL)
+    {
+        log_msg("Allocating first level of section list failed\n");
+        return 1;
+    }
 
     //log_msg("COFF: Found %u symbols in %u sections.\n", symtab_sz, nsections);
 
@@ -580,6 +869,12 @@
         //log_msg("COFF: Parsing section %s\n",sectionname);
 
         sectionlist[i] = malloc(strlen(sectionname) + 1);
+
+        if (sectionlist[i] == NULL)
+        {
+            log_msg("Allocating storage for %s failed\n", sectionname);
+            goto bail;
+        }
         strcpy(sectionlist[i], sectionname);
 
         if (!strcmp(sectionname, ".data")) sectionrawdata_ptr = get_le32(ptr + 20);
@@ -590,14 +885,6 @@
     //log_msg("COFF: Symbol table at offset %u\n", symtab_ptr);
     //log_msg("COFF: raw data pointer ofset for section .data is %u\n", sectionrawdata_ptr);
 
-    fp = fopen("vpx_asm_offsets.asm", "w");
-
-    if (fp == NULL)
-    {
-        perror("open file");
-        goto bail;
-    }
-
     /*  The compiler puts the data with non-zero offset in .data section, but puts the data with
         zero offset in .bss section. So, if the data in in .bss section, set offset=0.
         Note from Wiki: In an object module compiled from C, the bss section contains
@@ -631,13 +918,23 @@
                 char name[9] = {0, 0, 0, 0, 0, 0, 0, 0, 0};
                 strncpy(name, ptr, 8);
                 //log_msg("COFF: Parsing symbol %s\n",name);
-                fprintf(fp, "%-40s EQU ", name);
+                /* The 64bit Windows compiler doesn't prefix with an _.
+                 * Check what's there, and bump if necessary
+                 */
+                if (name[0] == '_')
+                    printf("%-40s EQU ", name + 1);
+                else
+                    printf("%-40s EQU ", name);
             }
             else
             {
                 //log_msg("COFF: Parsing symbol %s\n",
                 //        buf + strtab_ptr + get_le32(ptr+4));
-                fprintf(fp, "%-40s EQU ", buf + strtab_ptr + get_le32(ptr + 4));
+                if ((buf + strtab_ptr + get_le32(ptr + 4))[0] == '_')
+                    printf("%-40s EQU ",
+                           buf + strtab_ptr + get_le32(ptr + 4) + 1);
+                else
+                    printf("%-40s EQU ", buf + strtab_ptr + get_le32(ptr + 4));
             }
 
             if (!(strcmp(sectionlist[section-1], ".bss")))
@@ -654,14 +951,13 @@
             //log_msg("      Address: %u\n",get_le32(ptr+8));
             //log_msg("      Offset: %u\n", symoffset);
 
-            fprintf(fp, "%5d\n", symoffset);
+            printf("%5d\n", symoffset);
         }
 
         ptr += 18;
     }
 
-    fprintf(fp, "    END\n");
-    fclose(fp);
+    printf("    END\n");
 
     for (i = 0; i < nsections; i++)
     {
@@ -711,11 +1007,7 @@
     else
         f = argv[1];
 
-    if (_sopen_s(&fd, f, _O_BINARY, _SH_DENYNO, _S_IREAD | _S_IWRITE))
-    {
-        perror("Unable to open file");
-        goto bail;
-    }
+    fd = _sopen(f, _O_BINARY, _SH_DENYNO, _S_IREAD | _S_IWRITE);
 
     if (_fstat(fd, &stat_buf))
     {

diff --git a/build/x86-msvs/obj_int_extract.bat b/build/x86-msvs/obj_int_extract.bat
new file mode 100644
index 0000000..1bb8653
--- /dev/null
+++ b/build/x86-msvs/obj_int_extract.bat

@@ -0,0 +1,15 @@
+REM   Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+REM
+REM   Use of this source code is governed by a BSD-style license
+REM   that can be found in the LICENSE file in the root of the source
+REM   tree. An additional intellectual property rights grant can be found
+REM   in the file PATENTS.  All contributing project authors may
+REM   be found in the AUTHORS file in the root of the source tree.
+echo on
+
+cl /I "./" /I "%1" /nologo /c "%1/vp8/common/asm_com_offsets.c"
+cl /I "./" /I "%1" /nologo /c "%1/vp8/decoder/asm_dec_offsets.c"
+cl /I "./" /I "%1" /nologo /c "%1/vp8/encoder/asm_enc_offsets.c"
+obj_int_extract.exe rvds "asm_com_offsets.obj" > "asm_com_offsets.asm"
+obj_int_extract.exe rvds "asm_dec_offsets.obj" > "asm_dec_offsets.asm"
+obj_int_extract.exe rvds "asm_enc_offsets.obj" > "asm_enc_offsets.asm"

diff --git a/configure b/configure
index ca7affa..c177865 100755
--- a/configure
+++ b/configure

@@ -40,7 +40,6 @@
   ${toggle_runtime_cpu_detect}    runtime cpu detection
   ${toggle_shared}                shared library support
   ${toggle_small}                 favor smaller size over speed
-  ${toggle_arm_asm_detok}         assembly version of the detokenizer (ARM platforms only)
   ${toggle_postproc_visualizer}   macro block / block level visualizers
 
 Codecs:
@@ -81,19 +80,15 @@
 all_platforms="${all_platforms} armv5te-linux-gcc"
 all_platforms="${all_platforms} armv5te-none-rvct"
 all_platforms="${all_platforms} armv5te-symbian-gcc"
-all_platforms="${all_platforms} armv5te-wince-vs8"
 all_platforms="${all_platforms} armv6-darwin-gcc"
 all_platforms="${all_platforms} armv6-linux-rvct"
 all_platforms="${all_platforms} armv6-linux-gcc"
 all_platforms="${all_platforms} armv6-none-rvct"
 all_platforms="${all_platforms} armv6-symbian-gcc"
-all_platforms="${all_platforms} armv6-wince-vs8"
 all_platforms="${all_platforms} iwmmxt-linux-rvct"
 all_platforms="${all_platforms} iwmmxt-linux-gcc"
-all_platforms="${all_platforms} iwmmxt-wince-vs8"
 all_platforms="${all_platforms} iwmmxt2-linux-rvct"
 all_platforms="${all_platforms} iwmmxt2-linux-gcc"
-all_platforms="${all_platforms} iwmmxt2-wince-vs8"
 all_platforms="${all_platforms} armv7-darwin-gcc"    #neon Cortex-A8
 all_platforms="${all_platforms} armv7-linux-rvct"    #neon Cortex-A8
 all_platforms="${all_platforms} armv7-linux-gcc"     #neon Cortex-A8
@@ -255,7 +250,6 @@
     realtime_only
     shared
     small
-    arm_asm_detok
     postproc_visualizer
     os_support
 "
@@ -296,7 +290,6 @@
     realtime_only
     shared
     small
-    arm_asm_detok
     postproc_visualizer
 "
 

diff --git a/docs.mk b/docs.mk
index 28df9d2..98332a2 100644
--- a/docs.mk
+++ b/docs.mk

@@ -34,7 +34,8 @@
 
 EXAMPLE_PATH += $(SRC_PATH_BARE) #for CHANGELOG, README, etc
 
-doxyfile: libs.doxy_template libs.doxy examples.doxy
+doxyfile: $(if $(findstring examples, $(ALL_TARGETS)),examples.doxy)
+doxyfile: libs.doxy_template libs.doxy
 	@echo "    [CREATE] $@"
 	@cat $^ > $@
 	@echo "STRIP_FROM_PATH += $(SRC_PATH_BARE) $(BUILD_ROOT)" >> $@

diff --git a/examples/decoder_tmpl.c b/examples/decoder_tmpl.c
index 26b745d..c70681b 100644
--- a/examples/decoder_tmpl.c
+++ b/examples/decoder_tmpl.c

@@ -19,7 +19,7 @@
 #define VPX_CODEC_DISABLE_COMPAT 1
 #include "vpx/vpx_decoder.h"
 #include "vpx/vp8dx.h"
-#define interface (&vpx_codec_vp8_dx_algo)
+#define interface (vpx_codec_vp8_dx())
 @EXTRA_INCLUDES
 
 

diff --git a/examples/decoder_tmpl.txt b/examples/decoder_tmpl.txt
index 310c66d..7dd05d1 100644
--- a/examples/decoder_tmpl.txt
+++ b/examples/decoder_tmpl.txt

@@ -2,7 +2,7 @@
 #define VPX_CODEC_DISABLE_COMPAT 1
 #include "vpx/vpx_decoder.h"
 #include "vpx/vp8dx.h"
-#define interface (&vpx_codec_vp8_dx_algo)
+#define interface (vpx_codec_vp8_dx())
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ DEC_INCLUDES
 
 

diff --git a/examples/encoder_tmpl.c b/examples/encoder_tmpl.c
index d9e4d03..f109e62 100644
--- a/examples/encoder_tmpl.c
+++ b/examples/encoder_tmpl.c

@@ -19,7 +19,7 @@
 #define VPX_CODEC_DISABLE_COMPAT 1
 #include "vpx/vpx_encoder.h"
 #include "vpx/vp8cx.h"
-#define interface (&vpx_codec_vp8_cx_algo)
+#define interface (vpx_codec_vp8_cx())
 #define fourcc    0x30385056
 @EXTRA_INCLUDES
 

diff --git a/examples/encoder_tmpl.txt b/examples/encoder_tmpl.txt
index 3273164..0042071 100644
--- a/examples/encoder_tmpl.txt
+++ b/examples/encoder_tmpl.txt

@@ -2,7 +2,7 @@
 #define VPX_CODEC_DISABLE_COMPAT 1
 #include "vpx/vpx_encoder.h"
 #include "vpx/vp8cx.h"
-#define interface (&vpx_codec_vp8_cx_algo)
+#define interface (vpx_codec_vp8_cx())
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ENC_INCLUDES
 
 

diff --git a/examples/simple_decoder.txt b/examples/simple_decoder.txt
index be8ca73..90d9a68 100644
--- a/examples/simple_decoder.txt
+++ b/examples/simple_decoder.txt

@@ -33,7 +33,7 @@
 ----------------------
 The decoder is initialized by the following code. This is an example for
 the VP8 decoder, but the code is analogous for all algorithms. Replace
-`&vpx_codec_vp8_dx_algo` with a pointer to the interface exposed by the
+`vpx_codec_vp8_dx()` with a pointer to the interface exposed by the
 algorithm you want to use. The `cfg` argument is left as NULL in this
 example, because we want the algorithm to determine the stream
 configuration (width/height) and allocate memory automatically. This

diff --git a/libs.mk b/libs.mk
index 9ded394..6a5dc18 100644
--- a/libs.mk
+++ b/libs.mk

@@ -9,7 +9,13 @@
 ##
 
 
-ASM:=$(if $(filter yes,$(CONFIG_GCC)),.asm.s,.asm)
+# ARM assembly files are written in RVCT-style. We use some make magic to
+# filter those files to allow GCC compilation
+ifeq ($(ARCH_ARM),yes)
+  ASM:=$(if $(filter yes,$(CONFIG_GCC)),.asm.s,.asm)
+else
+  ASM:=.asm
+endif
 
 CODEC_SRCS-yes += libs.mk
 
@@ -126,28 +132,22 @@
 ifeq ($(CONFIG_EXTERNAL_BUILD),yes)
 ifeq ($(CONFIG_MSVS),yes)
 
-ifeq ($(ARCH_ARM),yes)
-ifeq ($(HAVE_ARMV5TE),yes)
-ARM_ARCH=v5
-endif
-ifeq ($(HAVE_ARMV6),yes)
-ARM_ARCH=v6
-endif
 obj_int_extract.vcproj: $(SRC_PATH_BARE)/build/make/obj_int_extract.c
-	@cp $(SRC_PATH_BARE)/build/arm-wince-vs8/obj_int_extract.bat .
+	@cp $(SRC_PATH_BARE)/build/x86-msvs/obj_int_extract.bat .
 	@echo "    [CREATE] $@"
-	$(SRC_PATH_BARE)/build/make/gen_msvs_proj.sh\
-			--exe\
-			--target=$(TOOLCHAIN)\
-            $(if $(CONFIG_STATIC_MSVCRT),--static-crt) \
-            --name=obj_int_extract\
-            --proj-guid=E1360C65-D375-4335-8057-7ED99CC3F9B2\
-            --out=$@ $^\
-            -I".&quot;;&quot;$(SRC_PATH_BARE)"
+	$(SRC_PATH_BARE)/build/make/gen_msvs_proj.sh \
+    --exe \
+    --target=$(TOOLCHAIN) \
+    --name=obj_int_extract \
+    --ver=$(CONFIG_VS_VERSION) \
+    --proj-guid=E1360C65-D375-4335-8057-7ED99CC3F9B2 \
+    $(if $(CONFIG_STATIC_MSVCRT),--static-crt) \
+    --out=$@ $^ \
+    -I. \
+    -I"$(SRC_PATH_BARE)" \
 
 PROJECTS-$(BUILD_LIBVPX) += obj_int_extract.vcproj
 PROJECTS-$(BUILD_LIBVPX) += obj_int_extract.bat
-endif
 
 vpx.def: $(call enabled,CODEC_EXPORTS)
 	@echo "    [CREATE] $@"
@@ -158,15 +158,16 @@
 
 vpx.vcproj: $(CODEC_SRCS) vpx.def
 	@echo "    [CREATE] $@"
-	$(SRC_PATH_BARE)/build/make/gen_msvs_proj.sh\
-			--lib\
-			--target=$(TOOLCHAIN)\
+	$(SRC_PATH_BARE)/build/make/gen_msvs_proj.sh \
+			--lib \
+			--target=$(TOOLCHAIN) \
             $(if $(CONFIG_STATIC_MSVCRT),--static-crt) \
-            --name=vpx\
-            --proj-guid=DCE19DAF-69AC-46DB-B14A-39F0FAA5DB74\
-            --module-def=vpx.def\
-            --ver=$(CONFIG_VS_VERSION)\
-            --out=$@ $(CFLAGS) $^\
+            --name=vpx \
+            --proj-guid=DCE19DAF-69AC-46DB-B14A-39F0FAA5DB74 \
+            --module-def=vpx.def \
+            --ver=$(CONFIG_VS_VERSION) \
+            --out=$@ $(CFLAGS) $^ \
+            --src-path-bare="$(SRC_PATH_BARE)" \
 
 PROJECTS-$(BUILD_LIBVPX) += vpx.vcproj
 
@@ -230,9 +231,44 @@
 #
 # Add assembler dependencies for configuration and offsets
 #
-#$(filter %$(ASM).o,$(OBJS-yes)): $(BUILD_PFX)vpx_config.asm $(BUILD_PFX)vpx_asm_offsets.asm
-$(filter %.s.o,$(OBJS-yes)):   $(BUILD_PFX)vpx_config.asm
-$(filter %.asm.o,$(OBJS-yes)): $(BUILD_PFX)vpx_config.asm
+$(filter %.s.o,$(OBJS-yes)):     $(BUILD_PFX)vpx_config.asm
+$(filter %$(ASM).o,$(OBJS-yes)): $(BUILD_PFX)vpx_config.asm
+
+#
+# Calculate platform- and compiler-specific offsets for hand coded assembly
+#
+ifeq ($(CONFIG_EXTERNAL_BUILD),) # Visual Studio uses obj_int_extract.bat
+  ifeq ($(ARCH_ARM), yes)
+    asm_com_offsets.asm: obj_int_extract
+    asm_com_offsets.asm: $(VP8_PREFIX)common/asm_com_offsets.c.o
+	./obj_int_extract rvds $< $(ADS2GAS) > $@
+    OBJS-yes += $(VP8_PREFIX)common/asm_com_offsets.c.o
+    CLEAN-OBJS += asm_com_offsets.asm
+    $(filter %$(ASM).o,$(OBJS-yes)): $(BUILD_PFX)asm_com_offsets.asm
+  endif
+
+  ifeq ($(ARCH_ARM)$(ARCH_X86)$(ARCH_X86_64), yes)
+    ifeq ($(CONFIG_VP8_ENCODER), yes)
+      asm_enc_offsets.asm: obj_int_extract
+      asm_enc_offsets.asm: $(VP8_PREFIX)encoder/asm_enc_offsets.c.o
+	./obj_int_extract rvds $< $(ADS2GAS) > $@
+      OBJS-yes += $(VP8_PREFIX)encoder/asm_enc_offsets.c.o
+      CLEAN-OBJS += asm_enc_offsets.asm
+      $(filter %$(ASM).o,$(OBJS-yes)): $(BUILD_PFX)asm_enc_offsets.asm
+    endif
+  endif
+
+  ifeq ($(ARCH_ARM), yes)
+    ifeq ($(CONFIG_VP8_DECODER), yes)
+      asm_dec_offsets.asm: obj_int_extract
+      asm_dec_offsets.asm: $(VP8_PREFIX)decoder/asm_dec_offsets.c.o
+	./obj_int_extract rvds $< $(ADS2GAS) > $@
+      OBJS-yes += $(VP8_PREFIX)decoder/asm_dec_offsets.c.o
+      CLEAN-OBJS += asm_dec_offsets.asm
+      $(filter %$(ASM).o,$(OBJS-yes)): $(BUILD_PFX)asm_dec_offsets.asm
+    endif
+  endif
+endif
 
 $(shell $(SRC_PATH_BARE)/build/make/version.sh "$(SRC_PATH_BARE)" $(BUILD_PFX)vpx_version.h)
 CLEAN-OBJS += $(BUILD_PFX)vpx_version.h

diff --git a/mainpage.dox b/mainpage.dox
index 49dff7b..5613ae7 100644
--- a/mainpage.dox
+++ b/mainpage.dox

@@ -31,7 +31,7 @@
   The WebM project is an open source project supported by its community. For
   questions about this SDK, please mail the apps-devel@webmproject.org list.
   To contribute, see http://www.webmproject.org/code/contribute and mail
-  vpx-devel@webmproject.org.
+  codec-devel@webmproject.org.
 */
 
 /*!\page changelog CHANGELOG

diff --git a/solution.mk b/solution.mk
index 6d2c08d..782150f 100644
--- a/solution.mk
+++ b/solution.mk

@@ -9,38 +9,13 @@
 ##
 
 
-ifeq ($(ARCH_ARM),yes)
-ARM_DEVELOP=no
-ARM_DEVELOP:=$(if $(filter %vpx.vcproj,$(wildcard *.vcproj)),yes)
-
-ifeq ($(ARM_DEVELOP),yes)
-vpx.sln:
-	@echo "    [COPY] $@"
-	@cp $(SRC_PATH_BARE)/build/arm-wince-vs8/vpx.sln .
-PROJECTS-yes += vpx.sln
-else
-vpx.sln: $(wildcard *.vcproj)
-	@echo "    [CREATE] $@"
-	$(SRC_PATH_BARE)/build/make/gen_msvs_sln.sh \
-            $(if $(filter %vpx.vcproj,$^),--dep=vpxdec:vpx) \
-            $(if $(filter %vpx.vcproj,$^),--dep=xma:vpx) \
-            --ver=$(CONFIG_VS_VERSION)\
-            --target=$(TOOLCHAIN)\
-            --out=$@ $^
-vpx.sln.mk: vpx.sln
-	@true
-
-PROJECTS-yes += vpx.sln vpx.sln.mk
--include vpx.sln.mk
-endif
-
-else
 vpx.sln: $(wildcard *.vcproj)
 	@echo "    [CREATE] $@"
 	$(SRC_PATH_BARE)/build/make/gen_msvs_sln.sh \
             $(if $(filter %vpx.vcproj,$^),\
-                $(foreach vcp,$(filter-out %vpx.vcproj,$^),\
+                $(foreach vcp,$(filter-out %vpx.vcproj %obj_int_extract.vcproj,$^),\
                   --dep=$(vcp:.vcproj=):vpx)) \
+            --dep=vpx:obj_int_extract \
             --ver=$(CONFIG_VS_VERSION)\
             --out=$@ $^
 vpx.sln.mk: vpx.sln
@@ -48,7 +23,6 @@
 
 PROJECTS-yes += vpx.sln vpx.sln.mk
 -include vpx.sln.mk
-endif
 
 # Always install this file, as it is an unconditional post-build rule.
 INSTALL_MAPS += src/%     $(SRC_PATH_BARE)/%

diff --git a/usage.dox b/usage.dox
index 53808fd..0db080b 100644
--- a/usage.dox
+++ b/usage.dox

@@ -25,7 +25,7 @@
     codec may write into to store details about a single instance of that codec.
     Most of the context is implementation specific, and thus opaque to the
     application. The context structure as seen by the application is of fixed
-    size, and thus can be allocated eith with automatic storage or dynamically
+    size, and thus can be allocated with automatic storage or dynamically
     on the heap.
 
     Most operations require an initialized codec context. Codec context
@@ -74,7 +74,7 @@
     the ABI is versioned. The ABI version number must be passed at
     initialization time to ensure the application is using a header file that
     matches the library. The current ABI version number is stored in the
-    prepropcessor macros #VPX_CODEC_ABI_VERSION, #VPX_ENCODER_ABI_VERSION, and
+    preprocessor macros #VPX_CODEC_ABI_VERSION, #VPX_ENCODER_ABI_VERSION, and
     #VPX_DECODER_ABI_VERSION. For convenience, each initialization function has
     a wrapper macro that inserts the correct version number. These macros are
     named like the initialization methods, but without the _ver suffix.
@@ -125,7 +125,7 @@
 
     The special value <code>0</code> is reserved to represent an infinite
     deadline. In this case, the codec will perform as much processing as
-    possible to yeild the highest quality frame.
+    possible to yield the highest quality frame.
 
     By convention, the value <code>1</code> is used to mean "return as fast as
     possible."
@@ -135,7 +135,7 @@
 
 /*! \page usage_xma External Memory Allocation
     Applications that wish to have fine grained control over how and where
-    decoders allocate memory \ref MAY make use of the e_xternal Memory Allocation
+    decoders allocate memory \ref MAY make use of the eXternal Memory Allocation
     (XMA) interface. Not all codecs support the XMA \ref usage_features.
 
     To use a decoder in XMA mode, the decoder \ref MUST be initialized with the
@@ -143,7 +143,7 @@
     allocate is heavily dependent on the size of the encoded video frames. The
     size of the video must be known before requesting the decoder's memory map.
     This stream information can be obtained with the vpx_codec_peek_stream_info()
-    function, which does not require a contructed decoder context. If the exact
+    function, which does not require a constructed decoder context. If the exact
     stream is not known, a stream info structure can be created that reflects
     the maximum size that the decoder instance is required to support.
 
@@ -175,7 +175,7 @@
     \section usage_xma_seg_szalign Segment Size and Alignment
     The sz (size) and align (alignment) parameters describe the required size
     and alignment of the requested segment. Alignment will always be a power of
-    two. Applications \ref MUST honor the aligment requested. Failure to do so
+    two. Applications \ref MUST honor the alignment requested. Failure to do so
     could result in program crashes or may incur a speed penalty.
 
     \section usage_xma_seg_flags Segment Flags

diff --git a/vp8/common/alloccommon.c b/vp8/common/alloccommon.c
index 9dce8c8..edef360 100644
--- a/vp8/common/alloccommon.c
+++ b/vp8/common/alloccommon.c

@@ -16,12 +16,11 @@
 #include "findnearmv.h"
 #include "entropymode.h"
 #include "systemdependent.h"
-#include "vpxerrors.h"
 
 
 extern  void vp8_init_scan_order_mask();
 
-void vp8_update_mode_info_border(MODE_INFO *mi, int rows, int cols)
+static void update_mode_info_border(MODE_INFO *mi, int rows, int cols)
 {
     int i;
     vpx_memset(mi - cols - 2, 0, sizeof(MODE_INFO) * (cols + 1));
@@ -71,7 +70,7 @@
       if (vp8_yv12_alloc_frame_buffer(&oci->yv12_fb[i],  width, height, VP8BORDERINPIXELS) < 0)
         {
             vp8_de_alloc_frame_buffers(oci);
-            return ALLOC_FAILURE;
+            return 1;
         }
     }
 
@@ -88,13 +87,13 @@
     if (vp8_yv12_alloc_frame_buffer(&oci->temp_scale_frame,   width, 16, VP8BORDERINPIXELS) < 0)
     {
         vp8_de_alloc_frame_buffers(oci);
-        return ALLOC_FAILURE;
+        return 1;
     }
 
     if (vp8_yv12_alloc_frame_buffer(&oci->post_proc_buffer, width, height, VP8BORDERINPIXELS) < 0)
     {
         vp8_de_alloc_frame_buffers(oci);
-        return ALLOC_FAILURE;
+        return 1;
     }
 
     oci->mb_rows = height >> 4;
@@ -106,7 +105,7 @@
     if (!oci->mip)
     {
         vp8_de_alloc_frame_buffers(oci);
-        return ALLOC_FAILURE;
+        return 1;
     }
 
     oci->mi = oci->mip + oci->mode_info_stride + 1;
@@ -117,10 +116,10 @@
     if (!oci->above_context)
     {
         vp8_de_alloc_frame_buffers(oci);
-        return ALLOC_FAILURE;
+        return 1;
     }
 
-    vp8_update_mode_info_border(oci->mi, oci->mb_rows, oci->mb_cols);
+    update_mode_info_border(oci->mi, oci->mb_rows, oci->mb_cols);
 
     return 0;
 }

diff --git a/vp8/common/arm/arm_systemdependent.c b/vp8/common/arm/arm_systemdependent.c
index 83921f8..bd5c075 100644
--- a/vp8/common/arm/arm_systemdependent.c
+++ b/vp8/common/arm/arm_systemdependent.c

@@ -11,21 +11,13 @@
 
 #include "vpx_ports/config.h"
 #include "vpx_ports/arm.h"
-#include "g_common.h"
-#include "pragmas.h"
-#include "subpixel.h"
-#include "loopfilter.h"
-#include "recon.h"
-#include "idct.h"
-#include "onyxc_int.h"
-
-extern void (*vp8_build_intra_predictors_mby_ptr)(MACROBLOCKD *x);
-extern void vp8_build_intra_predictors_mby(MACROBLOCKD *x);
-extern void vp8_build_intra_predictors_mby_neon(MACROBLOCKD *x);
-
-extern void (*vp8_build_intra_predictors_mby_s_ptr)(MACROBLOCKD *x);
-extern void vp8_build_intra_predictors_mby_s(MACROBLOCKD *x);
-extern void vp8_build_intra_predictors_mby_s_neon(MACROBLOCKD *x);
+#include "vp8/common/g_common.h"
+#include "vp8/common/pragmas.h"
+#include "vp8/common/subpixel.h"
+#include "vp8/common/loopfilter.h"
+#include "vp8/common/recon.h"
+#include "vp8/common/idct.h"
+#include "vp8/common/onyxc_int.h"
 
 void vp8_arch_arm_common_init(VP8_COMMON *ctx)
 {
@@ -106,31 +98,12 @@
         rtcd->recon.recon2      = vp8_recon2b_neon;
         rtcd->recon.recon4      = vp8_recon4b_neon;
         rtcd->recon.recon_mb    = vp8_recon_mb_neon;
-
+        rtcd->recon.build_intra_predictors_mby =
+            vp8_build_intra_predictors_mby_neon;
+        rtcd->recon.build_intra_predictors_mby_s =
+            vp8_build_intra_predictors_mby_s_neon;
     }
 #endif
 
 #endif
-
-#if HAVE_ARMV6
-#if CONFIG_RUNTIME_CPU_DETECT
-    if (has_media)
-#endif
-    {
-        vp8_build_intra_predictors_mby_ptr = vp8_build_intra_predictors_mby;
-        vp8_build_intra_predictors_mby_s_ptr = vp8_build_intra_predictors_mby_s;
-    }
-#endif
-
-#if HAVE_ARMV7
-#if CONFIG_RUNTIME_CPU_DETECT
-    if (has_neon)
-#endif
-    {
-        vp8_build_intra_predictors_mby_ptr =
-         vp8_build_intra_predictors_mby_neon;
-        vp8_build_intra_predictors_mby_s_ptr =
-         vp8_build_intra_predictors_mby_s_neon;
-    }
-#endif
 }

diff --git a/vp8/common/arm/armv6/bilinearfilter_v6.asm b/vp8/common/arm/armv6/bilinearfilter_v6.asm
index 09d7338..a86ed5d 100644
--- a/vp8/common/arm/armv6/bilinearfilter_v6.asm
+++ b/vp8/common/arm/armv6/bilinearfilter_v6.asm

@@ -15,19 +15,19 @@
     AREA    |.text|, CODE, READONLY  ; name this block of code
 
 ;-------------------------------------
-; r0    unsigned char *src_ptr,
-; r1    unsigned short *output_ptr,
-; r2    unsigned int src_pixels_per_line,
-; r3    unsigned int output_height,
-; stack    unsigned int output_width,
-; stack    const short *vp8_filter
+; r0    unsigned char  *src_ptr,
+; r1    unsigned short *dst_ptr,
+; r2    unsigned int    src_pitch,
+; r3    unsigned int    height,
+; stack unsigned int    width,
+; stack const short    *vp8_filter
 ;-------------------------------------
 ; The output is transposed stroed in output array to make it easy for second pass filtering.
 |vp8_filter_block2d_bil_first_pass_armv6| PROC
     stmdb   sp!, {r4 - r11, lr}
 
     ldr     r11, [sp, #40]                  ; vp8_filter address
-    ldr     r4, [sp, #36]                   ; output width
+    ldr     r4, [sp, #36]                   ; width
 
     mov     r12, r3                         ; outer-loop counter
     sub     r2, r2, r4                      ; src increment for height loop
@@ -38,10 +38,10 @@
 
     ldr     r5, [r11]                       ; load up filter coefficients
 
-    mov     r3, r3, lsl #1                  ; output_height*2
+    mov     r3, r3, lsl #1                  ; height*2
     add     r3, r3, #2                      ; plus 2 to make output buffer 4-bit aligned since height is actually (height+1)
 
-    mov     r11, r1                         ; save output_ptr for each row
+    mov     r11, r1                         ; save dst_ptr for each row
 
     cmp     r5, #128                        ; if filter coef = 128, then skip the filter
     beq     bil_null_1st_filter
@@ -140,17 +140,17 @@
 
 ;---------------------------------
 ; r0    unsigned short *src_ptr,
-; r1    unsigned char *output_ptr,
-; r2    int output_pitch,
-; r3    unsigned int  output_height,
-; stack unsigned int  output_width,
-; stack const short *vp8_filter
+; r1    unsigned char  *dst_ptr,
+; r2    int             dst_pitch,
+; r3    unsigned int    height,
+; stack unsigned int    width,
+; stack const short    *vp8_filter
 ;---------------------------------
 |vp8_filter_block2d_bil_second_pass_armv6| PROC
     stmdb   sp!, {r4 - r11, lr}
 
     ldr     r11, [sp, #40]                  ; vp8_filter address
-    ldr     r4, [sp, #36]                   ; output width
+    ldr     r4, [sp, #36]                   ; width
 
     ldr     r5, [r11]                       ; load up filter coefficients
     mov     r12, r4                         ; outer-loop counter = width, since we work on transposed data matrix

diff --git a/vp8/common/arm/bilinearfilter_arm.c b/vp8/common/arm/bilinearfilter_arm.c
index 65afb41..7340e20 100644
--- a/vp8/common/arm/bilinearfilter_arm.c
+++ b/vp8/common/arm/bilinearfilter_arm.c

@@ -10,128 +10,29 @@
 
 
 #include <math.h>
-#include "subpixel.h"
-
-#define BLOCK_HEIGHT_WIDTH 4
-#define VP8_FILTER_WEIGHT 128
-#define VP8_FILTER_SHIFT  7
-
-static const short bilinear_filters[8][2] =
-{
-    { 128,   0 },
-    { 112,  16 },
-    {  96,  32 },
-    {  80,  48 },
-    {  64,  64 },
-    {  48,  80 },
-    {  32,  96 },
-    {  16, 112 }
-};
-
-
-extern void vp8_filter_block2d_bil_first_pass_armv6
-(
-    unsigned char *src_ptr,
-    unsigned short *output_ptr,
-    unsigned int src_pixels_per_line,
-    unsigned int output_height,
-    unsigned int output_width,
-    const short *vp8_filter
-);
-
-extern void vp8_filter_block2d_bil_second_pass_armv6
-(
-    unsigned short *src_ptr,
-    unsigned char  *output_ptr,
-    int output_pitch,
-    unsigned int  output_height,
-    unsigned int  output_width,
-    const short *vp8_filter
-);
-
-#if 0
-void vp8_filter_block2d_bil_first_pass_6
-(
-    unsigned char *src_ptr,
-    unsigned short *output_ptr,
-    unsigned int src_pixels_per_line,
-    unsigned int output_height,
-    unsigned int output_width,
-    const short *vp8_filter
-)
-{
-    unsigned int i, j;
-
-    for ( i=0; i<output_height; i++ )
-    {
-        for ( j=0; j<output_width; j++ )
-        {
-            /* Apply bilinear filter */
-            output_ptr[j] = ( ( (int)src_ptr[0]          * vp8_filter[0]) +
-                               ((int)src_ptr[1] * vp8_filter[1]) +
-                                (VP8_FILTER_WEIGHT/2) ) >> VP8_FILTER_SHIFT;
-            src_ptr++;
-        }
-
-        /* Next row... */
-        src_ptr    += src_pixels_per_line - output_width;
-        output_ptr += output_width;
-    }
-}
-
-void vp8_filter_block2d_bil_second_pass_6
-(
-    unsigned short *src_ptr,
-    unsigned char  *output_ptr,
-    int output_pitch,
-    unsigned int  output_height,
-    unsigned int  output_width,
-    const short *vp8_filter
-)
-{
-    unsigned int  i,j;
-    int  Temp;
-
-    for ( i=0; i<output_height; i++ )
-    {
-        for ( j=0; j<output_width; j++ )
-        {
-            /* Apply filter */
-            Temp =  ((int)src_ptr[0]         * vp8_filter[0]) +
-                    ((int)src_ptr[output_width] * vp8_filter[1]) +
-                    (VP8_FILTER_WEIGHT/2);
-            output_ptr[j] = (unsigned int)(Temp >> VP8_FILTER_SHIFT);
-            src_ptr++;
-        }
-
-        /* Next row... */
-        /*src_ptr    += src_pixels_per_line - output_width;*/
-        output_ptr += output_pitch;
-    }
-}
-#endif
+#include "vp8/common/filter.h"
+#include "vp8/common/subpixel.h"
+#include "bilinearfilter_arm.h"
 
 void vp8_filter_block2d_bil_armv6
 (
     unsigned char *src_ptr,
-    unsigned char *output_ptr,
-    unsigned int   src_pixels_per_line,
+    unsigned char *dst_ptr,
+    unsigned int   src_pitch,
     unsigned int   dst_pitch,
-    const short      *HFilter,
-    const short      *VFilter,
+    const short   *HFilter,
+    const short   *VFilter,
     int            Width,
     int            Height
 )
 {
-
-    unsigned short FData[36*16]; /* Temp data bufffer used in filtering */
+    unsigned short FData[36*16]; /* Temp data buffer used in filtering */
 
     /* First filter 1-D horizontally... */
-    /* pixel_step = 1; */
-    vp8_filter_block2d_bil_first_pass_armv6(src_ptr, FData, src_pixels_per_line, Height + 1, Width, HFilter);
+    vp8_filter_block2d_bil_first_pass_armv6(src_ptr, FData, src_pitch, Height + 1, Width, HFilter);
 
     /* then 1-D vertically... */
-    vp8_filter_block2d_bil_second_pass_armv6(FData, output_ptr, dst_pitch, Height, Width, VFilter);
+    vp8_filter_block2d_bil_second_pass_armv6(FData, dst_ptr, dst_pitch, Height, Width, VFilter);
 }
 
 
@@ -148,8 +49,8 @@
     const short  *HFilter;
     const short  *VFilter;
 
-    HFilter = bilinear_filters[xoffset];
-    VFilter = bilinear_filters[yoffset];
+    HFilter = vp8_bilinear_filters[xoffset];
+    VFilter = vp8_bilinear_filters[yoffset];
 
     vp8_filter_block2d_bil_armv6(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 4, 4);
 }
@@ -167,8 +68,8 @@
     const short  *HFilter;
     const short  *VFilter;
 
-    HFilter = bilinear_filters[xoffset];
-    VFilter = bilinear_filters[yoffset];
+    HFilter = vp8_bilinear_filters[xoffset];
+    VFilter = vp8_bilinear_filters[yoffset];
 
     vp8_filter_block2d_bil_armv6(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 8, 8);
 }
@@ -186,8 +87,8 @@
     const short  *HFilter;
     const short  *VFilter;
 
-    HFilter = bilinear_filters[xoffset];
-    VFilter = bilinear_filters[yoffset];
+    HFilter = vp8_bilinear_filters[xoffset];
+    VFilter = vp8_bilinear_filters[yoffset];
 
     vp8_filter_block2d_bil_armv6(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 8, 4);
 }
@@ -205,8 +106,8 @@
     const short  *HFilter;
     const short  *VFilter;
 
-    HFilter = bilinear_filters[xoffset];
-    VFilter = bilinear_filters[yoffset];
+    HFilter = vp8_bilinear_filters[xoffset];
+    VFilter = vp8_bilinear_filters[yoffset];
 
     vp8_filter_block2d_bil_armv6(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 16, 16);
 }

diff --git a/vp8/common/arm/bilinearfilter_arm.h b/vp8/common/arm/bilinearfilter_arm.h
new file mode 100644
index 0000000..b7155d3
--- /dev/null
+++ b/vp8/common/arm/bilinearfilter_arm.h

@@ -0,0 +1,35 @@
+/*
+ *  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef BILINEARFILTER_ARM_H
+#define BILINEARFILTER_ARM_H
+
+extern void vp8_filter_block2d_bil_first_pass_armv6
+(
+    const unsigned char  *src_ptr,
+    unsigned short       *dst_ptr,
+    unsigned int          src_pitch,
+    unsigned int          height,
+    unsigned int          width,
+    const short          *vp8_filter
+);
+
+extern void vp8_filter_block2d_bil_second_pass_armv6
+(
+    const unsigned short *src_ptr,
+    unsigned char        *dst_ptr,
+    int                   dst_pitch,
+    unsigned int          height,
+    unsigned int          width,
+    const short         *vp8_filter
+);
+
+#endif /* BILINEARFILTER_ARM_H */

diff --git a/vp8/common/arm/filter_arm.c b/vp8/common/arm/filter_arm.c
index b4f2fe6..fe3c5a5 100644
--- a/vp8/common/arm/filter_arm.c
+++ b/vp8/common/arm/filter_arm.c

@@ -11,26 +11,10 @@
 
 #include "vpx_ports/config.h"
 #include <math.h>
-#include "subpixel.h"
+#include "vp8/common/filter.h"
+#include "vp8/common/subpixel.h"
 #include "vpx_ports/mem.h"
 
-#define BLOCK_HEIGHT_WIDTH 4
-#define VP8_FILTER_WEIGHT 128
-#define VP8_FILTER_SHIFT  7
-
-DECLARE_ALIGNED(16, static const short, sub_pel_filters[8][6]) =
-{
-    { 0,  0,  128,    0,   0,  0 },         /* note that 1/8 pel positions are just as per alpha -0.5 bicubic */
-    { 0, -6,  123,   12,  -1,  0 },
-    { 2, -11, 108,   36,  -8,  1 },         /* New 1/4 pel 6 tap filter */
-    { 0, -9,   93,   50,  -6,  0 },
-    { 3, -16,  77,   77, -16,  3 },         /* New 1/2 pel 6 tap filter */
-    { 0, -6,   50,   93,  -9,  0 },
-    { 1, -8,   36,  108, -11,  2 },         /* New 1/4 pel 6 tap filter */
-    { 0, -1,   12,  123,  -6,  0 },
-};
-
-
 extern void vp8_filter_block2d_first_pass_armv6
 (
     unsigned char *src_ptr,
@@ -93,11 +77,11 @@
 {
     const short  *HFilter;
     const short  *VFilter;
-    DECLARE_ALIGNED_ARRAY(4, short, FData, 12*4); /* Temp data bufffer used in filtering */
+    DECLARE_ALIGNED_ARRAY(4, short, FData, 12*4); /* Temp data buffer used in filtering */
 
 
-    HFilter = sub_pel_filters[xoffset];   /* 6 tap */
-    VFilter = sub_pel_filters[yoffset];       /* 6 tap */
+    HFilter = vp8_sub_pel_filters[xoffset];   /* 6 tap */
+    VFilter = vp8_sub_pel_filters[yoffset];   /* 6 tap */
 
     /* Vfilter is null. First pass only */
     if (xoffset && !yoffset)
@@ -129,47 +113,6 @@
     }
 }
 
-#if 0
-void vp8_sixtap_predict8x4_armv6
-(
-    unsigned char  *src_ptr,
-    int  src_pixels_per_line,
-    int  xoffset,
-    int  yoffset,
-    unsigned char *dst_ptr,
-    int  dst_pitch
-)
-{
-    const short  *HFilter;
-    const short  *VFilter;
-    DECLARE_ALIGNED_ARRAY(4, short, FData, 16*8); /* Temp data bufffer used in filtering */
-
-    HFilter = sub_pel_filters[xoffset];   /* 6 tap */
-    VFilter = sub_pel_filters[yoffset];       /* 6 tap */
-
-
-    /*if (xoffset && !yoffset)
-    {
-        vp8_filter_block2d_first_pass_only_armv6 (  src_ptr, dst_ptr, src_pixels_per_line, 8, dst_pitch, HFilter );
-    }*/
-    /* Hfilter is null. Second pass only */
-    /*else if (!xoffset && yoffset)
-    {
-        vp8_filter_block2d_second_pass_only_armv6 ( src_ptr, dst_ptr, src_pixels_per_line, 8, dst_pitch, VFilter );
-    }
-    else
-    {
-        if (yoffset & 0x1)
-            vp8_filter_block2d_first_pass_armv6 ( src_ptr-src_pixels_per_line, FData+1, src_pixels_per_line, 8, 7, HFilter );
-        else*/
-
-        vp8_filter_block2d_first_pass_armv6 ( src_ptr-(2*src_pixels_per_line), FData, src_pixels_per_line, 8, 9, HFilter );
-
-        vp8_filter_block2d_second_pass_armv6 ( FData+2, dst_ptr, dst_pitch, 4, 8, VFilter );
-    /*}*/
-}
-#endif
-
 void vp8_sixtap_predict8x8_armv6
 (
     unsigned char  *src_ptr,
@@ -182,10 +125,10 @@
 {
     const short  *HFilter;
     const short  *VFilter;
-    DECLARE_ALIGNED_ARRAY(4, short, FData, 16*8); /* Temp data bufffer used in filtering */
+    DECLARE_ALIGNED_ARRAY(4, short, FData, 16*8); /* Temp data buffer used in filtering */
 
-    HFilter = sub_pel_filters[xoffset];   /* 6 tap */
-    VFilter = sub_pel_filters[yoffset];       /* 6 tap */
+    HFilter = vp8_sub_pel_filters[xoffset];   /* 6 tap */
+    VFilter = vp8_sub_pel_filters[yoffset];   /* 6 tap */
 
     if (xoffset && !yoffset)
     {
@@ -224,10 +167,10 @@
 {
     const short  *HFilter;
     const short  *VFilter;
-    DECLARE_ALIGNED_ARRAY(4, short, FData, 24*16);    /* Temp data bufffer used in filtering */
+    DECLARE_ALIGNED_ARRAY(4, short, FData, 24*16);    /* Temp data buffer used in filtering */
 
-    HFilter = sub_pel_filters[xoffset];   /* 6 tap */
-    VFilter = sub_pel_filters[yoffset];       /* 6 tap */
+    HFilter = vp8_sub_pel_filters[xoffset];   /* 6 tap */
+    VFilter = vp8_sub_pel_filters[yoffset];   /* 6 tap */
 
     if (xoffset && !yoffset)
     {

diff --git a/vp8/common/arm/loopfilter_arm.c b/vp8/common/arm/loopfilter_arm.c
index 684a7f1..3532a03 100644
--- a/vp8/common/arm/loopfilter_arm.c
+++ b/vp8/common/arm/loopfilter_arm.c

@@ -11,8 +11,8 @@
 
 #include "vpx_ports/config.h"
 #include <math.h>
-#include "loopfilter.h"
-#include "onyxc_int.h"
+#include "vp8/common/loopfilter.h"
+#include "vp8/common/onyxc_int.h"
 
 extern prototype_loopfilter(vp8_loop_filter_horizontal_edge_armv6);
 extern prototype_loopfilter(vp8_loop_filter_vertical_edge_armv6);

diff --git a/vp8/common/arm/neon/recon_neon.c b/vp8/common/arm/neon/recon_neon.c
index f7930ee..09fd2a5 100644
--- a/vp8/common/arm/neon/recon_neon.c
+++ b/vp8/common/arm/neon/recon_neon.c

@@ -10,8 +10,8 @@
 
 
 #include "vpx_ports/config.h"
-#include "recon.h"
-#include "blockd.h"
+#include "vp8/common/recon.h"
+#include "vp8/common/blockd.h"
 
 extern void vp8_recon16x16mb_neon(unsigned char *pred_ptr, short *diff_ptr, unsigned char *dst_ptr, int ystride, unsigned char *udst_ptr, unsigned char *vdst_ptr);
 

diff --git a/vp8/common/arm/recon_arm.h b/vp8/common/arm/recon_arm.h
index b46b7fc..377cb2a 100644
--- a/vp8/common/arm/recon_arm.h
+++ b/vp8/common/arm/recon_arm.h

@@ -53,6 +53,9 @@
 
 extern prototype_recon_macroblock(vp8_recon_mb_neon);
 
+extern prototype_build_intra_predictors(vp8_build_intra_predictors_mby_neon);
+extern prototype_build_intra_predictors(vp8_build_intra_predictors_mby_s_neon);
+
 #if !CONFIG_RUNTIME_CPU_DETECT
 #undef  vp8_recon_recon
 #define vp8_recon_recon vp8_recon_b_neon
@@ -74,6 +77,13 @@
 
 #undef  vp8_recon_recon_mb
 #define vp8_recon_recon_mb vp8_recon_mb_neon
+
+#undef  vp8_recon_build_intra_predictors_mby
+#define vp8_recon_build_intra_predictors_mby vp8_build_intra_predictors_mby_neon
+
+#undef  vp8_recon_build_intra_predictors_mby_s
+#define vp8_recon_build_intra_predictors_mby_s vp8_build_intra_predictors_mby_s_neon
+
 #endif
 #endif
 

diff --git a/vp8/common/arm/reconintra_arm.c b/vp8/common/arm/reconintra_arm.c
index 4cc93d1..f8f4dca 100644
--- a/vp8/common/arm/reconintra_arm.c
+++ b/vp8/common/arm/reconintra_arm.c

@@ -10,10 +10,10 @@
 
 
 #include "vpx_ports/config.h"
-#include "blockd.h"
-#include "reconintra.h"
+#include "vp8/common/blockd.h"
+#include "vp8/common/reconintra.h"
 #include "vpx_mem/vpx_mem.h"
-#include "recon.h"
+#include "vp8/common/recon.h"
 
 #if HAVE_ARMV7
 extern void vp8_build_intra_predictors_mby_neon_func(

diff --git a/vp8/common/arm/vpx_asm_offsets.c b/vp8/common/arm/vpx_asm_offsets.c
deleted file mode 100644
index 5baf8cc..0000000
--- a/vp8/common/arm/vpx_asm_offsets.c
+++ /dev/null

@@ -1,87 +0,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "vpx_ports/config.h"
-#include <stddef.h>
-
-#if CONFIG_VP8_ENCODER
-#include "vpx_scale/yv12config.h"
-#endif
-
-#if CONFIG_VP8_DECODER
-#include "onyxd_int.h"
-#endif
-
-#define DEFINE(sym, val) int sym = val;
-
-/*
-#define BLANK() asm volatile("\n->" : : )
-*/
-
-/*
- * int main(void)
- * {
- */
-
-#if CONFIG_VP8_DECODER || CONFIG_VP8_ENCODER
-DEFINE(yv12_buffer_config_y_width,              offsetof(YV12_BUFFER_CONFIG, y_width));
-DEFINE(yv12_buffer_config_y_height,             offsetof(YV12_BUFFER_CONFIG, y_height));
-DEFINE(yv12_buffer_config_y_stride,             offsetof(YV12_BUFFER_CONFIG, y_stride));
-DEFINE(yv12_buffer_config_uv_width,             offsetof(YV12_BUFFER_CONFIG, uv_width));
-DEFINE(yv12_buffer_config_uv_height,            offsetof(YV12_BUFFER_CONFIG, uv_height));
-DEFINE(yv12_buffer_config_uv_stride,            offsetof(YV12_BUFFER_CONFIG, uv_stride));
-DEFINE(yv12_buffer_config_y_buffer,             offsetof(YV12_BUFFER_CONFIG, y_buffer));
-DEFINE(yv12_buffer_config_u_buffer,             offsetof(YV12_BUFFER_CONFIG, u_buffer));
-DEFINE(yv12_buffer_config_v_buffer,             offsetof(YV12_BUFFER_CONFIG, v_buffer));
-DEFINE(yv12_buffer_config_border,               offsetof(YV12_BUFFER_CONFIG, border));
-#endif
-
-#if CONFIG_VP8_DECODER
-DEFINE(mb_diff,                                 offsetof(MACROBLOCKD, diff));
-DEFINE(mb_predictor,                            offsetof(MACROBLOCKD, predictor));
-DEFINE(mb_dst_y_stride,                         offsetof(MACROBLOCKD, dst.y_stride));
-DEFINE(mb_dst_y_buffer,                         offsetof(MACROBLOCKD, dst.y_buffer));
-DEFINE(mb_dst_u_buffer,                         offsetof(MACROBLOCKD, dst.u_buffer));
-DEFINE(mb_dst_v_buffer,                         offsetof(MACROBLOCKD, dst.v_buffer));
-DEFINE(mb_up_available,                         offsetof(MACROBLOCKD, up_available));
-DEFINE(mb_left_available,                       offsetof(MACROBLOCKD, left_available));
-
-DEFINE(detok_scan,                              offsetof(DETOK, scan));
-DEFINE(detok_ptr_block2leftabove,               offsetof(DETOK, ptr_block2leftabove));
-DEFINE(detok_coef_tree_ptr,                     offsetof(DETOK, vp8_coef_tree_ptr));
-DEFINE(detok_teb_base_ptr,                      offsetof(DETOK, teb_base_ptr));
-DEFINE(detok_norm_ptr,                          offsetof(DETOK, norm_ptr));
-DEFINE(detok_ptr_coef_bands_x,                  offsetof(DETOK, ptr_coef_bands_x));
-
-DEFINE(detok_A,                                 offsetof(DETOK, A));
-DEFINE(detok_L,                                 offsetof(DETOK, L));
-
-DEFINE(detok_qcoeff_start_ptr,                  offsetof(DETOK, qcoeff_start_ptr));
-DEFINE(detok_current_bc,                        offsetof(DETOK, current_bc));
-DEFINE(detok_coef_probs,                        offsetof(DETOK, coef_probs));
-DEFINE(detok_eob,                               offsetof(DETOK, eob));
-
-DEFINE(bool_decoder_user_buffer_end,            offsetof(BOOL_DECODER, user_buffer_end));
-DEFINE(bool_decoder_user_buffer,                offsetof(BOOL_DECODER, user_buffer));
-DEFINE(bool_decoder_value,                      offsetof(BOOL_DECODER, value));
-DEFINE(bool_decoder_count,                      offsetof(BOOL_DECODER, count));
-DEFINE(bool_decoder_range,                      offsetof(BOOL_DECODER, range));
-
-DEFINE(tokenextrabits_min_val,                  offsetof(TOKENEXTRABITS, min_val));
-DEFINE(tokenextrabits_length,                   offsetof(TOKENEXTRABITS, Length));
-#endif
-
-//add asserts for any offset that is not supported by assembly code
-//add asserts for any size that is not supported by assembly code
-/*
- * return 0;
- * }
- */

diff --git a/vp8/common/asm_com_offsets.c b/vp8/common/asm_com_offsets.c
new file mode 100644
index 0000000..d299dd2
--- /dev/null
+++ b/vp8/common/asm_com_offsets.c

@@ -0,0 +1,49 @@
+/*
+ *  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vpx_ports/config.h"
+#include <stddef.h>
+
+#include "vpx_scale/yv12config.h"
+
+#define ct_assert(name,cond) \
+    static void assert_##name(void) UNUSED;\
+    static void assert_##name(void) {switch(0){case 0:case !!(cond):;}}
+
+#define DEFINE(sym, val) int sym = val;
+
+/*
+#define BLANK() asm volatile("\n->" : : )
+*/
+
+/*
+ * int main(void)
+ * {
+ */
+
+//vpx_scale
+DEFINE(yv12_buffer_config_y_width,              offsetof(YV12_BUFFER_CONFIG, y_width));
+DEFINE(yv12_buffer_config_y_height,             offsetof(YV12_BUFFER_CONFIG, y_height));
+DEFINE(yv12_buffer_config_y_stride,             offsetof(YV12_BUFFER_CONFIG, y_stride));
+DEFINE(yv12_buffer_config_uv_width,             offsetof(YV12_BUFFER_CONFIG, uv_width));
+DEFINE(yv12_buffer_config_uv_height,            offsetof(YV12_BUFFER_CONFIG, uv_height));
+DEFINE(yv12_buffer_config_uv_stride,            offsetof(YV12_BUFFER_CONFIG, uv_stride));
+DEFINE(yv12_buffer_config_y_buffer,             offsetof(YV12_BUFFER_CONFIG, y_buffer));
+DEFINE(yv12_buffer_config_u_buffer,             offsetof(YV12_BUFFER_CONFIG, u_buffer));
+DEFINE(yv12_buffer_config_v_buffer,             offsetof(YV12_BUFFER_CONFIG, v_buffer));
+DEFINE(yv12_buffer_config_border,               offsetof(YV12_BUFFER_CONFIG, border));
+
+//add asserts for any offset that is not supported by assembly code
+//add asserts for any size that is not supported by assembly code
+/*
+ * return 0;
+ * }
+ */

diff --git a/vp8/common/blockd.c b/vp8/common/blockd.c
index 7f75a72..1fc3cd0 100644
--- a/vp8/common/blockd.c
+++ b/vp8/common/blockd.c

@@ -12,8 +12,6 @@
 #include "blockd.h"
 #include "vpx_mem/vpx_mem.h"
 
-const int vp8_block2type[25] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 1};
-
 const unsigned char vp8_block2left[25] =
 {
     0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8

diff --git a/vp8/common/blockd.h b/vp8/common/blockd.h
index 5a8991e..fc8e072 100644
--- a/vp8/common/blockd.h
+++ b/vp8/common/blockd.h

@@ -28,11 +28,6 @@
 #define DCPREDSIMTHRESH 0
 #define DCPREDCNTTHRESH 3
 
-#define Y1CONTEXT 0
-#define UCONTEXT 1
-#define VCONTEXT 2
-#define Y2CONTEXT 3
-
 #define MB_FEATURE_TREE_PROBS   3
 #define MAX_MB_SEGMENTS         4
 
@@ -48,6 +43,11 @@
     int r, c;
 } POS;
 
+#define PLANE_TYPE_Y_NO_DC    0
+#define PLANE_TYPE_Y2         1
+#define PLANE_TYPE_UV         2
+#define PLANE_TYPE_Y_WITH_DC  3
+
 
 typedef char ENTROPY_CONTEXT;
 typedef struct
@@ -58,8 +58,6 @@
     ENTROPY_CONTEXT y2;
 } ENTROPY_CONTEXT_PLANES;
 
-extern const int vp8_block2type[25];
-
 extern const unsigned char vp8_block2left[25];
 extern const unsigned char vp8_block2above[25];
 

diff --git a/vp8/common/boolcoder.h b/vp8/common/boolcoder.h
deleted file mode 100644
index 5658868..0000000
--- a/vp8/common/boolcoder.h
+++ /dev/null

@@ -1,570 +0,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef bool_coder_h
-#define bool_coder_h 1
-
-/* Arithmetic bool coder with largish probability range.
-   Timothy S Murphy  6 August 2004 */
-
-/* So as not to force users to drag in too much of my idiosyncratic C++ world,
-   I avoid fancy storage management. */
-
-#include <assert.h>
-
-#include <stddef.h>
-#include <stdio.h>
-
-typedef unsigned char vp8bc_index_t; // probability index
-
-/* There are a couple of slight variants in the details of finite-precision
-   arithmetic coding.  May be safely ignored by most users. */
-
-enum vp8bc_rounding
-{
-    vp8bc_down = 0,     // just like VP8
-    vp8bc_down_full = 1, // handles minimum probability correctly
-    vp8bc_up = 2
-};
-
-#if _MSC_VER
-
-/* Note that msvc by default does not inline _anything_ (regardless of the
-   setting of inline_depth) and that a command-line option (-Ob1 or -Ob2)
-   is required to inline even the smallest functions. */
-
-#   pragma inline_depth( 255)           // I mean it when I inline something
-#   pragma warning( disable : 4099)     // No class vs. struct harassment
-#   pragma warning( disable : 4250)     // dominance complaints
-#   pragma warning( disable : 4284)     // operator-> in templates
-#   pragma warning( disable : 4800)     // bool conversion
-
-// don't let prefix ++,-- stand in for postfix, disaster would ensue
-
-#   pragma warning( error : 4620 4621)
-
-#endif  // _MSC_VER
-
-
-#if __cplusplus
-
-// Sometimes one wishes to be definite about integer lengths.
-
-struct int_types
-{
-    typedef const bool cbool;
-    typedef const signed char cchar;
-    typedef const short cshort;
-    typedef const int cint;
-    typedef const int clong;
-
-    typedef const double cdouble;
-    typedef const size_t csize_t;
-
-    typedef unsigned char uchar;    // 8 bits
-    typedef const uchar cuchar;
-
-    typedef short int16;
-    typedef unsigned short uint16;
-    typedef const int16 cint16;
-    typedef const uint16 cuint16;
-
-    typedef int int32;
-    typedef unsigned int uint32;
-    typedef const int32 cint32;
-    typedef const uint32 cuint32;
-
-    typedef unsigned int uint;
-    typedef unsigned int ulong;
-    typedef const uint cuint;
-    typedef const ulong culong;
-
-
-    // All structs consume space, may as well have a vptr.
-
-    virtual ~int_types();
-};
-
-
-struct bool_coder_spec;
-struct bool_coder;
-struct bool_writer;
-struct bool_reader;
-
-
-struct bool_coder_namespace : int_types
-{
-    typedef vp8bc_index_t Index;
-    typedef bool_coder_spec Spec;
-    typedef const Spec c_spec;
-
-    enum Rounding
-    {
-        Down = vp8bc_down,
-        down_full = vp8bc_down_full,
-        Up = vp8bc_up
-    };
-};
-
-
-// Archivable specification of a bool coder includes rounding spec
-// and probability mapping table.  The latter replaces a uchar j
-// (0 <= j < 256) with an arbitrary uint16 tbl[j] = p.
-// p/65536 is then the probability of a zero.
-
-struct bool_coder_spec : bool_coder_namespace
-{
-    friend struct bool_coder;
-    friend struct bool_writer;
-    friend struct bool_reader;
-    friend struct bool_coder_spec_float;
-    friend struct bool_coder_spec_explicit_table;
-    friend struct bool_coder_spec_exponential_table;
-    friend struct BPsrc;
-private:
-    uint w;                 // precision
-    Rounding r;
-
-    uint ebits, mbits, ebias;
-    uint32 mmask;
-
-    Index max_index, half_index;
-
-    uint32 mantissa(Index i) const
-    {
-        assert(i < half_index);
-        return (1 << mbits) + (i & mmask);
-    }
-    uint exponent(Index i) const
-    {
-        assert(i < half_index);
-        return ebias - (i >> mbits);
-    }
-
-    uint16 Ptbl[256];       // kinda clunky, but so is storage management.
-
-    /* Cost in bits of encoding a zero at every probability, scaled by 2^20.
-       Assumes that index is at most 8 bits wide. */
-
-    uint32 Ctbl[256];
-
-    uint32 split(Index i, uint32 R) const    // 1 <= split <= max( 1, R-1)
-    {
-        if (!ebias)
-            return 1 + (((R - 1) * Ptbl[i]) >> 16);
-
-        if (i >= half_index)
-            return R - split(max_index - i, R);
-
-        return 1 + (((R - 1) * mantissa(i)) >> exponent(i));
-    }
-
-    uint32 max_range() const
-    {
-        return (1 << w) - (r == down_full ? 0 : 1);
-    }
-    uint32 min_range() const
-    {
-        return (1 << (w - 1)) + (r == down_full ? 1 : 0);
-    }
-    uint32 Rinc() const
-    {
-        return r == Up ? 1 : 0;
-    }
-
-    void check_prec() const;
-
-    bool float_init(uint Ebits, uint Mbits);
-
-    void cost_init();
-
-    bool_coder_spec(
-        uint prec, Rounding rr, uint Ebits = 0, uint Mbits = 0
-    )
-        : w(prec), r(rr)
-    {
-        float_init(Ebits, Mbits);
-    }
-public:
-    // Read complete spec from file.
-    bool_coder_spec(FILE *);
-
-    // Write spec to file.
-    void dump(FILE *) const;
-
-    // return probability index best approximating prob.
-    Index operator()(double prob) const;
-
-    // probability corresponding to index
-    double operator()(Index i) const;
-
-    Index complement(Index i) const
-    {
-        return max_index - i;
-    }
-
-    Index max_index() const
-    {
-        return max_index;
-    }
-    Index half_index() const
-    {
-        return half_index;
-    }
-
-    uint32 cost_zero(Index i) const
-    {
-        return Ctbl[i];
-    }
-    uint32 cost_one(Index i) const
-    {
-        return Ctbl[ max_index - i];
-    }
-    uint32 cost_bit(Index i, bool b) const
-    {
-        return Ctbl[b? max_index-i:i];
-    }
-};
-
-
-/* Pseudo floating-point probability specification.
-
-   At least one of Ebits and Mbits must be nonzero.
-
-   Since all arithmetic is done at 32 bits, Ebits is at most 5.
-
-   Total significant bits in index is Ebits + Mbits + 1.
-
-   Below the halfway point (i.e. when the top significant bit is 0),
-   the index is (e << Mbits) + m.
-
-   The exponent e is between 0 and (2**Ebits) - 1,
-   the mantissa m is between 0 and (2**Mbits) - 1.
-
-   Prepending an implicit 1 to the mantissa, the probability is then
-
-        (2**Mbits + m) >> (e - 2**Ebits - 1 - Mbits),
-
-   which has (1/2)**(2**Ebits + 1) as a minimum
-   and (1/2) * [1 - 2**(Mbits + 1)] as a maximum.
-
-   When the index is above the halfway point, the probability is the
-   complement of the probability associated to the complement of the index.
-
-   Note that the probability increases with the index and that, because of
-   the symmetry, we cannot encode probability exactly 1/2; though we
-   can get as close to 1/2 as we like, provided we have enough Mbits.
-
-   The latter is of course not a problem in practice, one never has
-   exact probabilities and entropy errors are second order, that is, the
-   "overcoding" of a zero will be largely compensated for by the
-   "undercoding" of a one (or vice-versa).
-
-   Compared to arithmetic probability specs (a la VP8), this will do better
-   at very high and low probabilities and worse at probabilities near 1/2,
-   as well as facilitating the usage of wider or narrower probability indices.
-*/
-
-struct bool_coder_spec_float : bool_coder_spec
-{
-    bool_coder_spec_float(
-        uint Ebits = 3, uint Mbits = 4, Rounding rr = down_full, uint prec = 12
-    )
-        : bool_coder_spec(prec, rr, Ebits, Mbits)
-    {
-        cost_init();
-    }
-};
-
-
-struct bool_coder_spec_explicit_table : bool_coder_spec
-{
-    bool_coder_spec_explicit_table(
-        cuint16 probability_table[256] = 0,  // default is tbl[i] = i << 8.
-        Rounding = down_full,
-        uint precision = 16
-    );
-};
-
-// Contruct table via multiplicative interpolation between
-// p[128] = 1/2  and p[0] = (1/2)^x.
-// Since we are working with 16-bit precision, x is at most 16.
-// For probabilities to increase with i, we must have x > 1.
-// For 0 <= i <= 128, p[i] = (1/2)^{ 1 + [1 - (i/128)]*[x-1] }.
-// Finally, p[128+i] = 1 - p[128 - i].
-
-struct bool_coder_spec_exponential_table : bool_coder_spec
-{
-    bool_coder_spec_exponential_table(uint x, Rounding = down_full, uint prec = 16);
-};
-
-
-// Commonalities between writer and reader.
-
-struct bool_coder : bool_coder_namespace
-{
-    friend struct bool_writer;
-    friend struct bool_reader;
-    friend struct BPsrc;
-private:
-    uint32 Low, Range;
-    cuint32 min_range;
-    cuint32 rinc;
-    c_spec spec;
-
-    void _reset()
-    {
-        Low = 0;
-        Range = spec.max_range();
-    }
-
-    bool_coder(c_spec &s)
-        :  min_range(s.min_range()),
-           rinc(s.Rinc()),
-           spec(s)
-    {
-        _reset();
-    }
-
-    uint32 half() const
-    {
-        return 1 + ((Range - 1) >> 1);
-    }
-public:
-    c_spec &Spec() const
-    {
-        return spec;
-    }
-};
-
-
-struct bool_writer : bool_coder
-{
-    friend struct BPsrc;
-private:
-    uchar *Bstart, *Bend, *B;
-    int bit_lag;
-    bool is_toast;
-    void carry();
-    void reset()
-    {
-        _reset();
-        bit_lag = 32 - spec.w;
-        is_toast = 0;
-    }
-    void raw(bool value, uint32 split);
-public:
-    bool_writer(c_spec &, uchar *Dest, size_t Len);
-    virtual ~bool_writer();
-
-    void operator()(Index p, bool v)
-    {
-        raw(v, spec.split(p, Range));
-    }
-
-    uchar *buf() const
-    {
-        return Bstart;
-    }
-    size_t bytes_written() const
-    {
-        return B - Bstart;
-    }
-
-    // Call when done with input, flushes internal state.
-    // DO NOT write any more data after calling this.
-
-    bool_writer &flush();
-
-    void write_bits(int n, uint val)
-    {
-        if (n)
-        {
-            uint m = 1 << (n - 1);
-
-            do
-            {
-                raw((bool)(val & m), half());
-            }
-            while (m >>= 1);
-        }
-    }
-
-#   if 0
-    // We are agnostic about storage management.
-    // By default, overflows throw an assert but user can
-    // override to provide an expanding buffer using ...
-
-    virtual void overflow(uint Len) const;
-
-    // ... this function copies already-written data into new buffer
-    // and retains new buffer location.
-
-    void new_buffer(uchar *dest, uint Len);
-
-    // Note that storage management is the user's responsibility.
-#   endif
-};
-
-
-// This could be adjusted to use a little less lookahead.
-
-struct bool_reader : bool_coder
-{
-    friend struct BPsrc;
-private:
-    cuchar *const Bstart;   // for debugging
-    cuchar *B;
-    cuchar *const Bend;
-    cuint shf;
-    uint bct;
-    bool raw(uint32 split);
-public:
-    bool_reader(c_spec &s, cuchar *src, size_t Len);
-
-    bool operator()(Index p)
-    {
-        return raw(spec.split(p, Range));
-    }
-
-    uint read_bits(int num_bits)
-    {
-        uint v = 0;
-
-        while (--num_bits >= 0)
-            v += v + (raw(half()) ? 1 : 0);
-
-        return v;
-    }
-};
-
-extern "C" {
-
-#endif /*  __cplusplus */
-
-
-    /* C interface */
-
-    typedef struct bool_coder_spec bool_coder_spec;
-    typedef struct bool_writer bool_writer;
-    typedef struct bool_reader bool_reader;
-
-    typedef const bool_coder_spec c_bool_coder_spec;
-    typedef const bool_writer c_bool_writer;
-    typedef const bool_reader c_bool_reader;
-
-
-    /* Optionally override default precision when constructing coder_specs.
-       Just pass a zero pointer if you don't care.
-       Precision is at most 16 bits for table specs, at most 23 otherwise. */
-
-    struct vp8bc_prec
-    {
-        enum vp8bc_rounding r;      /* see top header file for def */
-        unsigned int prec;          /* range precision in bits */
-    };
-
-    typedef const struct vp8bc_prec vp8bc_c_prec;
-
-    /* bool_coder_spec contains mapping of uchars to actual probabilities
-       (16 bit uints) as well as (usually immaterial) selection of
-       exact finite-precision algorithm used (for now, the latter can only
-       be overridden using the C++ interface).
-       See comments above the corresponding C++ constructors for discussion,
-       especially of exponential probability table generation. */
-
-    bool_coder_spec *vp8bc_vp8spec(); // just like vp8
-
-    bool_coder_spec *vp8bc_literal_spec(
-        const unsigned short prob_map[256],  // 0 is like vp8 w/more precision
-        vp8bc_c_prec*
-    );
-
-    bool_coder_spec *vp8bc_float_spec(
-        unsigned int exponent_bits, unsigned int mantissa_bits, vp8bc_c_prec*
-    );
-
-    bool_coder_spec *vp8bc_exponential_spec(unsigned int min_exp, vp8bc_c_prec *);
-
-    bool_coder_spec *vp8bc_spec_from_file(FILE *);
-
-
-    void vp8bc_destroy_spec(c_bool_coder_spec *);
-
-    void vp8bc_spec_to_file(c_bool_coder_spec *, FILE *);
-
-
-    /* Nearest index to supplied probability of zero, 0 <= prob <= 1. */
-
-    vp8bc_index_t vp8bc_index(c_bool_coder_spec *, double prob);
-
-    vp8bc_index_t vp8bc_index_from_counts(
-        c_bool_coder_spec *p, unsigned int zero_ct, unsigned int one_ct
-    );
-
-    /* In case you want to look */
-
-    double vp8bc_probability(c_bool_coder_spec *, vp8bc_index_t);
-
-    /* Opposite index */
-
-    vp8bc_index_t vp8bc_complement(c_bool_coder_spec *, vp8bc_index_t);
-
-    /* Cost in bits of encoding a zero at given probability, scaled by 2^20.
-       (assumes that an int holds at least 32 bits). */
-
-    unsigned int vp8bc_cost_zero(c_bool_coder_spec *, vp8bc_index_t);
-
-    unsigned int vp8bc_cost_one(c_bool_coder_spec *, vp8bc_index_t);
-    unsigned int vp8bc_cost_bit(c_bool_coder_spec *, vp8bc_index_t, int);
-
-
-    /* bool_writer interface */
-
-    /* Length = 0 disables checking for writes beyond buffer end. */
-
-    bool_writer *vp8bc_create_writer(
-        c_bool_coder_spec *, unsigned char *Destination, size_t Length
-    );
-
-    /* Flushes out any buffered data and returns total # of bytes written. */
-
-    size_t vp8bc_destroy_writer(bool_writer *);
-
-    void vp8bc_write_bool(bool_writer *, int boolean_val, vp8bc_index_t false_prob);
-
-    void vp8bc_write_bits(
-        bool_writer *, unsigned int integer_value, int number_of_bits
-    );
-
-    c_bool_coder_spec *vp8bc_writer_spec(c_bool_writer *);
-
-
-    /* bool_reader interface */
-
-    /* Length = 0 disables checking for reads beyond buffer end. */
-
-    bool_reader *vp8bc_create_reader(
-        c_bool_coder_spec *, const unsigned char *Source, size_t Length
-    );
-    void vp8bc_destroy_reader(bool_reader *);
-
-    int vp8bc_read_bool(bool_reader *, vp8bc_index_t false_prob);
-
-    unsigned int vp8bc_read_bits(bool_reader *, int number_of_bits);
-
-    c_bool_coder_spec *vp8bc_reader_spec(c_bool_reader *);
-
-#if __cplusplus
-}
-#endif
-
-#endif  /* bool_coder_h */

diff --git a/vp8/common/codec_common_interface.h b/vp8/common/codec_common_interface.h
deleted file mode 100644
index 7a7db38..0000000
--- a/vp8/common/codec_common_interface.h
+++ /dev/null

@@ -1,93 +0,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#ifndef CODEC_COMMON_INTERFACE_H
-#define CODEC_COMMON_INTERFACE_H
-
-#define __export
-#define _export
-#define dll_export   __declspec( dllexport )
-#define dll_import   __declspec( dllimport )
-
-// Playback ERROR Codes.
-#define NO_DECODER_ERROR            0
-#define REMOTE_DECODER_ERROR        -1
-
-#define DFR_BAD_DCT_COEFF           -100
-#define DFR_ZERO_LENGTH_FRAME       -101
-#define DFR_FRAME_SIZE_INVALID      -102
-#define DFR_OUTPUT_BUFFER_OVERFLOW  -103
-#define DFR_INVALID_FRAME_HEADER    -104
-#define FR_INVALID_MODE_TOKEN       -110
-#define ETR_ALLOCATION_ERROR        -200
-#define ETR_INVALID_ROOT_PTR        -201
-#define SYNCH_ERROR                 -400
-#define BUFFER_UNDERFLOW_ERROR      -500
-#define PB_IB_OVERFLOW_ERROR        -501
-
-// External error triggers
-#define PB_HEADER_CHECKSUM_ERROR    -601
-#define PB_DATA_CHECKSUM_ERROR      -602
-
-// DCT Error Codes
-#define DDCT_EXPANSION_ERROR        -700
-#define DDCT_INVALID_TOKEN_ERROR    -701
-
-// exception_errors
-#define GEN_EXCEPTIONS              -800
-#define EX_UNQUAL_ERROR             -801
-
-// Unrecoverable error codes
-#define FATAL_PLAYBACK_ERROR        -1000
-#define GEN_ERROR_CREATING_CDC      -1001
-#define GEN_THREAD_CREATION_ERROR   -1002
-#define DFR_CREATE_BMP_FAILED       -1003
-
-// YUV buffer configuration structure
-typedef struct
-{
-    int     y_width;
-    int     y_height;
-    int     y_stride;
-
-    int     uv_width;
-    int     uv_height;
-    int     uv_stride;
-
-    unsigned char   *y_buffer;
-    unsigned char   *u_buffer;
-    unsigned char   *v_buffer;
-
-} YUV_BUFFER_CONFIG;
-typedef enum
-{
-    C_SET_KEY_FRAME,
-    C_SET_FIXED_Q,
-    C_SET_FIRSTPASS_FILE,
-    C_SET_EXPERIMENTAL_MIN,
-    C_SET_EXPERIMENTAL_MAX = C_SET_EXPERIMENTAL_MIN + 255,
-    C_SET_CHECKPROTECT,
-    C_SET_TESTMODE,
-    C_SET_INTERNAL_SIZE,
-    C_SET_RECOVERY_FRAME,
-    C_SET_REFERENCEFRAME,
-    C_SET_GOLDENFRAME
-
-#ifndef VP50_COMP_INTERFACE
-    // Specialist test facilities.
-//    C_VCAP_PARAMS,              // DO NOT USE FOR NOW WITH VFW CODEC
-#endif
-
-} C_SETTING;
-
-typedef unsigned long C_SET_VALUE;
-
-
-#endif

diff --git a/vp8/common/filter.c b/vp8/common/filter.c
new file mode 100644
index 0000000..ae59529
--- /dev/null
+++ b/vp8/common/filter.c

@@ -0,0 +1,494 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include <stdlib.h>
+#include "filter.h"
+#include "vpx_ports/mem.h"
+
+DECLARE_ALIGNED(16, const short, vp8_bilinear_filters[8][2]) =
+{
+    { 128,   0 },
+    { 112,  16 },
+    {  96,  32 },
+    {  80,  48 },
+    {  64,  64 },
+    {  48,  80 },
+    {  32,  96 },
+    {  16, 112 }
+};
+
+DECLARE_ALIGNED(16, const short, vp8_sub_pel_filters[8][6]) =
+{
+
+    { 0,  0,  128,    0,   0,  0 },         /* note that 1/8 pel positions are just as per alpha -0.5 bicubic */
+    { 0, -6,  123,   12,  -1,  0 },
+    { 2, -11, 108,   36,  -8,  1 },         /* New 1/4 pel 6 tap filter */
+    { 0, -9,   93,   50,  -6,  0 },
+    { 3, -16,  77,   77, -16,  3 },         /* New 1/2 pel 6 tap filter */
+    { 0, -6,   50,   93,  -9,  0 },
+    { 1, -8,   36,  108, -11,  2 },         /* New 1/4 pel 6 tap filter */
+    { 0, -1,   12,  123,  -6,  0 },
+};
+
+static void filter_block2d_first_pass
+(
+    unsigned char *src_ptr,
+    int *output_ptr,
+    unsigned int src_pixels_per_line,
+    unsigned int pixel_step,
+    unsigned int output_height,
+    unsigned int output_width,
+    const short *vp8_filter
+)
+{
+    unsigned int i, j;
+    int  Temp;
+
+    for (i = 0; i < output_height; i++)
+    {
+        for (j = 0; j < output_width; j++)
+        {
+            Temp = ((int)src_ptr[-2 * (int)pixel_step] * vp8_filter[0]) +
+                   ((int)src_ptr[-1 * (int)pixel_step] * vp8_filter[1]) +
+                   ((int)src_ptr[0]                 * vp8_filter[2]) +
+                   ((int)src_ptr[pixel_step]         * vp8_filter[3]) +
+                   ((int)src_ptr[2*pixel_step]       * vp8_filter[4]) +
+                   ((int)src_ptr[3*pixel_step]       * vp8_filter[5]) +
+                   (VP8_FILTER_WEIGHT >> 1);      /* Rounding */
+
+            /* Normalize back to 0-255 */
+            Temp = Temp >> VP8_FILTER_SHIFT;
+
+            if (Temp < 0)
+                Temp = 0;
+            else if (Temp > 255)
+                Temp = 255;
+
+            output_ptr[j] = Temp;
+            src_ptr++;
+        }
+
+        /* Next row... */
+        src_ptr    += src_pixels_per_line - output_width;
+        output_ptr += output_width;
+    }
+}
+
+static void filter_block2d_second_pass
+(
+    int *src_ptr,
+    unsigned char *output_ptr,
+    int output_pitch,
+    unsigned int src_pixels_per_line,
+    unsigned int pixel_step,
+    unsigned int output_height,
+    unsigned int output_width,
+    const short *vp8_filter
+)
+{
+    unsigned int i, j;
+    int  Temp;
+
+    for (i = 0; i < output_height; i++)
+    {
+        for (j = 0; j < output_width; j++)
+        {
+            /* Apply filter */
+            Temp = ((int)src_ptr[-2 * (int)pixel_step] * vp8_filter[0]) +
+                   ((int)src_ptr[-1 * (int)pixel_step] * vp8_filter[1]) +
+                   ((int)src_ptr[0]                 * vp8_filter[2]) +
+                   ((int)src_ptr[pixel_step]         * vp8_filter[3]) +
+                   ((int)src_ptr[2*pixel_step]       * vp8_filter[4]) +
+                   ((int)src_ptr[3*pixel_step]       * vp8_filter[5]) +
+                   (VP8_FILTER_WEIGHT >> 1);   /* Rounding */
+
+            /* Normalize back to 0-255 */
+            Temp = Temp >> VP8_FILTER_SHIFT;
+
+            if (Temp < 0)
+                Temp = 0;
+            else if (Temp > 255)
+                Temp = 255;
+
+            output_ptr[j] = (unsigned char)Temp;
+            src_ptr++;
+        }
+
+        /* Start next row */
+        src_ptr    += src_pixels_per_line - output_width;
+        output_ptr += output_pitch;
+    }
+}
+
+
+static void filter_block2d
+(
+    unsigned char  *src_ptr,
+    unsigned char  *output_ptr,
+    unsigned int src_pixels_per_line,
+    int output_pitch,
+    const short  *HFilter,
+    const short  *VFilter
+)
+{
+    int FData[9*4]; /* Temp data buffer used in filtering */
+
+    /* First filter 1-D horizontally... */
+    filter_block2d_first_pass(src_ptr - (2 * src_pixels_per_line), FData, src_pixels_per_line, 1, 9, 4, HFilter);
+
+    /* then filter verticaly... */
+    filter_block2d_second_pass(FData + 8, output_ptr, output_pitch, 4, 4, 4, 4, VFilter);
+}
+
+
+void vp8_sixtap_predict_c
+(
+    unsigned char  *src_ptr,
+    int   src_pixels_per_line,
+    int  xoffset,
+    int  yoffset,
+    unsigned char *dst_ptr,
+    int dst_pitch
+)
+{
+    const short  *HFilter;
+    const short  *VFilter;
+
+    HFilter = vp8_sub_pel_filters[xoffset];   /* 6 tap */
+    VFilter = vp8_sub_pel_filters[yoffset];   /* 6 tap */
+
+    filter_block2d(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter);
+}
+void vp8_sixtap_predict8x8_c
+(
+    unsigned char  *src_ptr,
+    int  src_pixels_per_line,
+    int  xoffset,
+    int  yoffset,
+    unsigned char *dst_ptr,
+    int  dst_pitch
+)
+{
+    const short  *HFilter;
+    const short  *VFilter;
+    int FData[13*16];   /* Temp data buffer used in filtering */
+
+    HFilter = vp8_sub_pel_filters[xoffset];   /* 6 tap */
+    VFilter = vp8_sub_pel_filters[yoffset];   /* 6 tap */
+
+    /* First filter 1-D horizontally... */
+    filter_block2d_first_pass(src_ptr - (2 * src_pixels_per_line), FData, src_pixels_per_line, 1, 13, 8, HFilter);
+
+
+    /* then filter verticaly... */
+    filter_block2d_second_pass(FData + 16, dst_ptr, dst_pitch, 8, 8, 8, 8, VFilter);
+
+}
+
+void vp8_sixtap_predict8x4_c
+(
+    unsigned char  *src_ptr,
+    int  src_pixels_per_line,
+    int  xoffset,
+    int  yoffset,
+    unsigned char *dst_ptr,
+    int  dst_pitch
+)
+{
+    const short  *HFilter;
+    const short  *VFilter;
+    int FData[13*16];   /* Temp data buffer used in filtering */
+
+    HFilter = vp8_sub_pel_filters[xoffset];   /* 6 tap */
+    VFilter = vp8_sub_pel_filters[yoffset];   /* 6 tap */
+
+    /* First filter 1-D horizontally... */
+    filter_block2d_first_pass(src_ptr - (2 * src_pixels_per_line), FData, src_pixels_per_line, 1, 9, 8, HFilter);
+
+
+    /* then filter verticaly... */
+    filter_block2d_second_pass(FData + 16, dst_ptr, dst_pitch, 8, 8, 4, 8, VFilter);
+
+}
+
+void vp8_sixtap_predict16x16_c
+(
+    unsigned char  *src_ptr,
+    int  src_pixels_per_line,
+    int  xoffset,
+    int  yoffset,
+    unsigned char *dst_ptr,
+    int  dst_pitch
+)
+{
+    const short  *HFilter;
+    const short  *VFilter;
+    int FData[21*24];   /* Temp data buffer used in filtering */
+
+
+    HFilter = vp8_sub_pel_filters[xoffset];   /* 6 tap */
+    VFilter = vp8_sub_pel_filters[yoffset];   /* 6 tap */
+
+    /* First filter 1-D horizontally... */
+    filter_block2d_first_pass(src_ptr - (2 * src_pixels_per_line), FData, src_pixels_per_line, 1, 21, 16, HFilter);
+
+    /* then filter verticaly... */
+    filter_block2d_second_pass(FData + 32, dst_ptr, dst_pitch, 16, 16, 16, 16, VFilter);
+
+}
+
+
+/****************************************************************************
+ *
+ *  ROUTINE       : filter_block2d_bil_first_pass
+ *
+ *  INPUTS        : UINT8  *src_ptr    : Pointer to source block.
+ *                  UINT32  src_stride : Stride of source block.
+ *                  UINT32  height     : Block height.
+ *                  UINT32  width      : Block width.
+ *                  INT32  *vp8_filter : Array of 2 bi-linear filter taps.
+ *
+ *  OUTPUTS       : INT32  *dst_ptr    : Pointer to filtered block.
+ *
+ *  RETURNS       : void
+ *
+ *  FUNCTION      : Applies a 1-D 2-tap bi-linear filter to the source block
+ *                  in the horizontal direction to produce the filtered output
+ *                  block. Used to implement first-pass of 2-D separable filter.
+ *
+ *  SPECIAL NOTES : Produces INT32 output to retain precision for next pass.
+ *                  Two filter taps should sum to VP8_FILTER_WEIGHT.
+ *
+ ****************************************************************************/
+static void filter_block2d_bil_first_pass
+(
+    unsigned char  *src_ptr,
+    unsigned short *dst_ptr,
+    unsigned int    src_stride,
+    unsigned int    height,
+    unsigned int    width,
+    const short    *vp8_filter
+)
+{
+    unsigned int i, j;
+
+    for (i = 0; i < height; i++)
+    {
+        for (j = 0; j < width; j++)
+        {
+            /* Apply bilinear filter */
+            dst_ptr[j] = (((int)src_ptr[0] * vp8_filter[0]) +
+                          ((int)src_ptr[1] * vp8_filter[1]) +
+                          (VP8_FILTER_WEIGHT / 2)) >> VP8_FILTER_SHIFT;
+            src_ptr++;
+        }
+
+        /* Next row... */
+        src_ptr += src_stride - width;
+        dst_ptr += width;
+    }
+}
+
+/****************************************************************************
+ *
+ *  ROUTINE       : filter_block2d_bil_second_pass
+ *
+ *  INPUTS        : INT32  *src_ptr    : Pointer to source block.
+ *                  UINT32  dst_pitch  : Destination block pitch.
+ *                  UINT32  height     : Block height.
+ *                  UINT32  width      : Block width.
+ *                  INT32  *vp8_filter : Array of 2 bi-linear filter taps.
+ *
+ *  OUTPUTS       : UINT16 *dst_ptr    : Pointer to filtered block.
+ *
+ *  RETURNS       : void
+ *
+ *  FUNCTION      : Applies a 1-D 2-tap bi-linear filter to the source block
+ *                  in the vertical direction to produce the filtered output
+ *                  block. Used to implement second-pass of 2-D separable filter.
+ *
+ *  SPECIAL NOTES : Requires 32-bit input as produced by filter_block2d_bil_first_pass.
+ *                  Two filter taps should sum to VP8_FILTER_WEIGHT.
+ *
+ ****************************************************************************/
+static void filter_block2d_bil_second_pass
+(
+    unsigned short *src_ptr,
+    unsigned char  *dst_ptr,
+    int             dst_pitch,
+    unsigned int    height,
+    unsigned int    width,
+    const short    *vp8_filter
+)
+{
+    unsigned int  i, j;
+    int  Temp;
+
+    for (i = 0; i < height; i++)
+    {
+        for (j = 0; j < width; j++)
+        {
+            /* Apply filter */
+            Temp = ((int)src_ptr[0]     * vp8_filter[0]) +
+                   ((int)src_ptr[width] * vp8_filter[1]) +
+                   (VP8_FILTER_WEIGHT / 2);
+            dst_ptr[j] = (unsigned int)(Temp >> VP8_FILTER_SHIFT);
+            src_ptr++;
+        }
+
+        /* Next row... */
+        dst_ptr += dst_pitch;
+    }
+}
+
+
+/****************************************************************************
+ *
+ *  ROUTINE       : filter_block2d_bil
+ *
+ *  INPUTS        : UINT8  *src_ptr          : Pointer to source block.
+ *                  UINT32  src_pitch        : Stride of source block.
+ *                  UINT32  dst_pitch        : Stride of destination block.
+ *                  INT32  *HFilter          : Array of 2 horizontal filter taps.
+ *                  INT32  *VFilter          : Array of 2 vertical filter taps.
+ *                  INT32  Width             : Block width
+ *                  INT32  Height            : Block height
+ *
+ *  OUTPUTS       : UINT16 *dst_ptr       : Pointer to filtered block.
+ *
+ *  RETURNS       : void
+ *
+ *  FUNCTION      : 2-D filters an input block by applying a 2-tap
+ *                  bi-linear filter horizontally followed by a 2-tap
+ *                  bi-linear filter vertically on the result.
+ *
+ *  SPECIAL NOTES : The largest block size can be handled here is 16x16
+ *
+ ****************************************************************************/
+static void filter_block2d_bil
+(
+    unsigned char *src_ptr,
+    unsigned char *dst_ptr,
+    unsigned int   src_pitch,
+    unsigned int   dst_pitch,
+    const short   *HFilter,
+    const short   *VFilter,
+    int            Width,
+    int            Height
+)
+{
+
+    unsigned short FData[17*16];    /* Temp data buffer used in filtering */
+
+    /* First filter 1-D horizontally... */
+    filter_block2d_bil_first_pass(src_ptr, FData, src_pitch, Height + 1, Width, HFilter);
+
+    /* then 1-D vertically... */
+    filter_block2d_bil_second_pass(FData, dst_ptr, dst_pitch, Height, Width, VFilter);
+}
+
+
+void vp8_bilinear_predict4x4_c
+(
+    unsigned char  *src_ptr,
+    int   src_pixels_per_line,
+    int  xoffset,
+    int  yoffset,
+    unsigned char *dst_ptr,
+    int dst_pitch
+)
+{
+    const short *HFilter;
+    const short *VFilter;
+
+    HFilter = vp8_bilinear_filters[xoffset];
+    VFilter = vp8_bilinear_filters[yoffset];
+#if 0
+    {
+        int i;
+        unsigned char temp1[16];
+        unsigned char temp2[16];
+
+        bilinear_predict4x4_mmx(src_ptr, src_pixels_per_line, xoffset, yoffset, temp1, 4);
+        filter_block2d_bil(src_ptr, temp2, src_pixels_per_line, 4, HFilter, VFilter, 4, 4);
+
+        for (i = 0; i < 16; i++)
+        {
+            if (temp1[i] != temp2[i])
+            {
+                bilinear_predict4x4_mmx(src_ptr, src_pixels_per_line, xoffset, yoffset, temp1, 4);
+                filter_block2d_bil(src_ptr, temp2, src_pixels_per_line, 4, HFilter, VFilter, 4, 4);
+            }
+        }
+    }
+#endif
+    filter_block2d_bil(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 4, 4);
+
+}
+
+void vp8_bilinear_predict8x8_c
+(
+    unsigned char  *src_ptr,
+    int  src_pixels_per_line,
+    int  xoffset,
+    int  yoffset,
+    unsigned char *dst_ptr,
+    int  dst_pitch
+)
+{
+    const short *HFilter;
+    const short *VFilter;
+
+    HFilter = vp8_bilinear_filters[xoffset];
+    VFilter = vp8_bilinear_filters[yoffset];
+
+    filter_block2d_bil(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 8, 8);
+
+}
+
+void vp8_bilinear_predict8x4_c
+(
+    unsigned char  *src_ptr,
+    int  src_pixels_per_line,
+    int  xoffset,
+    int  yoffset,
+    unsigned char *dst_ptr,
+    int  dst_pitch
+)
+{
+    const short *HFilter;
+    const short *VFilter;
+
+    HFilter = vp8_bilinear_filters[xoffset];
+    VFilter = vp8_bilinear_filters[yoffset];
+
+    filter_block2d_bil(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 8, 4);
+
+}
+
+void vp8_bilinear_predict16x16_c
+(
+    unsigned char  *src_ptr,
+    int  src_pixels_per_line,
+    int  xoffset,
+    int  yoffset,
+    unsigned char *dst_ptr,
+    int  dst_pitch
+)
+{
+    const short *HFilter;
+    const short *VFilter;
+
+    HFilter = vp8_bilinear_filters[xoffset];
+    VFilter = vp8_bilinear_filters[yoffset];
+
+    filter_block2d_bil(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 16, 16);
+}

diff --git a/vp8/common/filter.h b/vp8/common/filter.h
new file mode 100644
index 0000000..0f225c2
--- /dev/null
+++ b/vp8/common/filter.h

@@ -0,0 +1,22 @@
+/*
+ *  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef FILTER_H
+#define FILTER_H
+
+#define BLOCK_HEIGHT_WIDTH 4
+#define VP8_FILTER_WEIGHT 128
+#define VP8_FILTER_SHIFT  7
+
+extern const short vp8_bilinear_filters[8][2];
+extern const short vp8_sub_pel_filters[8][6];
+
+#endif //FILTER_H

diff --git a/vp8/common/filter_c.c b/vp8/common/filter_c.c
deleted file mode 100644
index 399a847..0000000
--- a/vp8/common/filter_c.c
+++ /dev/null

@@ -1,540 +0,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include <stdlib.h>
-
-#define BLOCK_HEIGHT_WIDTH 4
-#define VP8_FILTER_WEIGHT 128
-#define VP8_FILTER_SHIFT  7
-
-
-static const int bilinear_filters[8][2] =
-{
-    { 128,   0 },
-    { 112,  16 },
-    {  96,  32 },
-    {  80,  48 },
-    {  64,  64 },
-    {  48,  80 },
-    {  32,  96 },
-    {  16, 112 }
-};
-
-
-static const short sub_pel_filters[8][6] =
-{
-
-    { 0,  0,  128,    0,   0,  0 },         /* note that 1/8 pel positions are just as per alpha -0.5 bicubic */
-    { 0, -6,  123,   12,  -1,  0 },
-    { 2, -11, 108,   36,  -8,  1 },         /* New 1/4 pel 6 tap filter */
-    { 0, -9,   93,   50,  -6,  0 },
-    { 3, -16,  77,   77, -16,  3 },         /* New 1/2 pel 6 tap filter */
-    { 0, -6,   50,   93,  -9,  0 },
-    { 1, -8,   36,  108, -11,  2 },         /* New 1/4 pel 6 tap filter */
-    { 0, -1,   12,  123,  -6,  0 },
-
-
-
-};
-
-void vp8_filter_block2d_first_pass
-(
-    unsigned char *src_ptr,
-    int *output_ptr,
-    unsigned int src_pixels_per_line,
-    unsigned int pixel_step,
-    unsigned int output_height,
-    unsigned int output_width,
-    const short *vp8_filter
-)
-{
-    unsigned int i, j;
-    int  Temp;
-
-    for (i = 0; i < output_height; i++)
-    {
-        for (j = 0; j < output_width; j++)
-        {
-            Temp = ((int)src_ptr[-2 * (int)pixel_step] * vp8_filter[0]) +
-                   ((int)src_ptr[-1 * (int)pixel_step] * vp8_filter[1]) +
-                   ((int)src_ptr[0]                 * vp8_filter[2]) +
-                   ((int)src_ptr[pixel_step]         * vp8_filter[3]) +
-                   ((int)src_ptr[2*pixel_step]       * vp8_filter[4]) +
-                   ((int)src_ptr[3*pixel_step]       * vp8_filter[5]) +
-                   (VP8_FILTER_WEIGHT >> 1);      /* Rounding */
-
-            /* Normalize back to 0-255 */
-            Temp = Temp >> VP8_FILTER_SHIFT;
-
-            if (Temp < 0)
-                Temp = 0;
-            else if (Temp > 255)
-                Temp = 255;
-
-            output_ptr[j] = Temp;
-            src_ptr++;
-        }
-
-        /* Next row... */
-        src_ptr    += src_pixels_per_line - output_width;
-        output_ptr += output_width;
-    }
-}
-
-void vp8_filter_block2d_second_pass
-(
-    int *src_ptr,
-    unsigned char *output_ptr,
-    int output_pitch,
-    unsigned int src_pixels_per_line,
-    unsigned int pixel_step,
-    unsigned int output_height,
-    unsigned int output_width,
-    const short *vp8_filter
-)
-{
-    unsigned int i, j;
-    int  Temp;
-
-    for (i = 0; i < output_height; i++)
-    {
-        for (j = 0; j < output_width; j++)
-        {
-            /* Apply filter */
-            Temp = ((int)src_ptr[-2 * (int)pixel_step] * vp8_filter[0]) +
-                   ((int)src_ptr[-1 * (int)pixel_step] * vp8_filter[1]) +
-                   ((int)src_ptr[0]                 * vp8_filter[2]) +
-                   ((int)src_ptr[pixel_step]         * vp8_filter[3]) +
-                   ((int)src_ptr[2*pixel_step]       * vp8_filter[4]) +
-                   ((int)src_ptr[3*pixel_step]       * vp8_filter[5]) +
-                   (VP8_FILTER_WEIGHT >> 1);   /* Rounding */
-
-            /* Normalize back to 0-255 */
-            Temp = Temp >> VP8_FILTER_SHIFT;
-
-            if (Temp < 0)
-                Temp = 0;
-            else if (Temp > 255)
-                Temp = 255;
-
-            output_ptr[j] = (unsigned char)Temp;
-            src_ptr++;
-        }
-
-        /* Start next row */
-        src_ptr    += src_pixels_per_line - output_width;
-        output_ptr += output_pitch;
-    }
-}
-
-
-void vp8_filter_block2d
-(
-    unsigned char  *src_ptr,
-    unsigned char  *output_ptr,
-    unsigned int src_pixels_per_line,
-    int output_pitch,
-    const short  *HFilter,
-    const short  *VFilter
-)
-{
-    int FData[9*4]; /* Temp data bufffer used in filtering */
-
-    /* First filter 1-D horizontally... */
-    vp8_filter_block2d_first_pass(src_ptr - (2 * src_pixels_per_line), FData, src_pixels_per_line, 1, 9, 4, HFilter);
-
-    /* then filter verticaly... */
-    vp8_filter_block2d_second_pass(FData + 8, output_ptr, output_pitch, 4, 4, 4, 4, VFilter);
-}
-
-
-void vp8_block_variation_c
-(
-    unsigned char  *src_ptr,
-    int   src_pixels_per_line,
-    int *HVar,
-    int *VVar
-)
-{
-    int i, j;
-    unsigned char *Ptr = src_ptr;
-
-    for (i = 0; i < 4; i++)
-    {
-        for (j = 0; j < 4; j++)
-        {
-            *HVar += abs((int)Ptr[j] - (int)Ptr[j+1]);
-            *VVar += abs((int)Ptr[j] - (int)Ptr[j+src_pixels_per_line]);
-        }
-
-        Ptr += src_pixels_per_line;
-    }
-}
-
-
-
-
-void vp8_sixtap_predict_c
-(
-    unsigned char  *src_ptr,
-    int   src_pixels_per_line,
-    int  xoffset,
-    int  yoffset,
-    unsigned char *dst_ptr,
-    int dst_pitch
-)
-{
-    const short  *HFilter;
-    const short  *VFilter;
-
-    HFilter = sub_pel_filters[xoffset];   /* 6 tap */
-    VFilter = sub_pel_filters[yoffset];   /* 6 tap */
-
-    vp8_filter_block2d(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter);
-}
-void vp8_sixtap_predict8x8_c
-(
-    unsigned char  *src_ptr,
-    int  src_pixels_per_line,
-    int  xoffset,
-    int  yoffset,
-    unsigned char *dst_ptr,
-    int  dst_pitch
-)
-{
-    const short  *HFilter;
-    const short  *VFilter;
-    int FData[13*16];   /* Temp data bufffer used in filtering */
-
-    HFilter = sub_pel_filters[xoffset];   /* 6 tap */
-    VFilter = sub_pel_filters[yoffset];   /* 6 tap */
-
-    /* First filter 1-D horizontally... */
-    vp8_filter_block2d_first_pass(src_ptr - (2 * src_pixels_per_line), FData, src_pixels_per_line, 1, 13, 8, HFilter);
-
-
-    /* then filter verticaly... */
-    vp8_filter_block2d_second_pass(FData + 16, dst_ptr, dst_pitch, 8, 8, 8, 8, VFilter);
-
-}
-
-void vp8_sixtap_predict8x4_c
-(
-    unsigned char  *src_ptr,
-    int  src_pixels_per_line,
-    int  xoffset,
-    int  yoffset,
-    unsigned char *dst_ptr,
-    int  dst_pitch
-)
-{
-    const short  *HFilter;
-    const short  *VFilter;
-    int FData[13*16];   /* Temp data bufffer used in filtering */
-
-    HFilter = sub_pel_filters[xoffset];   /* 6 tap */
-    VFilter = sub_pel_filters[yoffset];   /* 6 tap */
-
-    /* First filter 1-D horizontally... */
-    vp8_filter_block2d_first_pass(src_ptr - (2 * src_pixels_per_line), FData, src_pixels_per_line, 1, 9, 8, HFilter);
-
-
-    /* then filter verticaly... */
-    vp8_filter_block2d_second_pass(FData + 16, dst_ptr, dst_pitch, 8, 8, 4, 8, VFilter);
-
-}
-
-void vp8_sixtap_predict16x16_c
-(
-    unsigned char  *src_ptr,
-    int  src_pixels_per_line,
-    int  xoffset,
-    int  yoffset,
-    unsigned char *dst_ptr,
-    int  dst_pitch
-)
-{
-    const short  *HFilter;
-    const short  *VFilter;
-    int FData[21*24];   /* Temp data bufffer used in filtering */
-
-
-    HFilter = sub_pel_filters[xoffset];   /* 6 tap */
-    VFilter = sub_pel_filters[yoffset];   /* 6 tap */
-
-    /* First filter 1-D horizontally... */
-    vp8_filter_block2d_first_pass(src_ptr - (2 * src_pixels_per_line), FData, src_pixels_per_line, 1, 21, 16, HFilter);
-
-    /* then filter verticaly... */
-    vp8_filter_block2d_second_pass(FData + 32, dst_ptr, dst_pitch, 16, 16, 16, 16, VFilter);
-
-}
-
-
-/****************************************************************************
- *
- *  ROUTINE       : filter_block2d_bil_first_pass
- *
- *  INPUTS        : UINT8  *src_ptr          : Pointer to source block.
- *                  UINT32 src_pixels_per_line : Stride of input block.
- *                  UINT32 pixel_step        : Offset between filter input samples (see notes).
- *                  UINT32 output_height     : Input block height.
- *                  UINT32 output_width      : Input block width.
- *                  INT32  *vp8_filter          : Array of 2 bi-linear filter taps.
- *
- *  OUTPUTS       : INT32 *output_ptr        : Pointer to filtered block.
- *
- *  RETURNS       : void
- *
- *  FUNCTION      : Applies a 1-D 2-tap bi-linear filter to the source block in
- *                  either horizontal or vertical direction to produce the
- *                  filtered output block. Used to implement first-pass
- *                  of 2-D separable filter.
- *
- *  SPECIAL NOTES : Produces INT32 output to retain precision for next pass.
- *                  Two filter taps should sum to VP8_FILTER_WEIGHT.
- *                  pixel_step defines whether the filter is applied
- *                  horizontally (pixel_step=1) or vertically (pixel_step=stride).
- *                  It defines the offset required to move from one input
- *                  to the next.
- *
- ****************************************************************************/
-void vp8_filter_block2d_bil_first_pass
-(
-    unsigned char *src_ptr,
-    unsigned short *output_ptr,
-    unsigned int src_pixels_per_line,
-    int pixel_step,
-    unsigned int output_height,
-    unsigned int output_width,
-    const int *vp8_filter
-)
-{
-    unsigned int i, j;
-
-    for (i = 0; i < output_height; i++)
-    {
-        for (j = 0; j < output_width; j++)
-        {
-            /* Apply bilinear filter */
-            output_ptr[j] = (((int)src_ptr[0]          * vp8_filter[0]) +
-                             ((int)src_ptr[pixel_step] * vp8_filter[1]) +
-                             (VP8_FILTER_WEIGHT / 2)) >> VP8_FILTER_SHIFT;
-            src_ptr++;
-        }
-
-        /* Next row... */
-        src_ptr    += src_pixels_per_line - output_width;
-        output_ptr += output_width;
-    }
-}
-
-/****************************************************************************
- *
- *  ROUTINE       : filter_block2d_bil_second_pass
- *
- *  INPUTS        : INT32  *src_ptr          : Pointer to source block.
- *                  UINT32 src_pixels_per_line : Stride of input block.
- *                  UINT32 pixel_step        : Offset between filter input samples (see notes).
- *                  UINT32 output_height     : Input block height.
- *                  UINT32 output_width      : Input block width.
- *                  INT32  *vp8_filter          : Array of 2 bi-linear filter taps.
- *
- *  OUTPUTS       : UINT16 *output_ptr       : Pointer to filtered block.
- *
- *  RETURNS       : void
- *
- *  FUNCTION      : Applies a 1-D 2-tap bi-linear filter to the source block in
- *                  either horizontal or vertical direction to produce the
- *                  filtered output block. Used to implement second-pass
- *                  of 2-D separable filter.
- *
- *  SPECIAL NOTES : Requires 32-bit input as produced by filter_block2d_bil_first_pass.
- *                  Two filter taps should sum to VP8_FILTER_WEIGHT.
- *                  pixel_step defines whether the filter is applied
- *                  horizontally (pixel_step=1) or vertically (pixel_step=stride).
- *                  It defines the offset required to move from one input
- *                  to the next.
- *
- ****************************************************************************/
-void vp8_filter_block2d_bil_second_pass
-(
-    unsigned short *src_ptr,
-    unsigned char  *output_ptr,
-    int output_pitch,
-    unsigned int  src_pixels_per_line,
-    unsigned int  pixel_step,
-    unsigned int  output_height,
-    unsigned int  output_width,
-    const int *vp8_filter
-)
-{
-    unsigned int  i, j;
-    int  Temp;
-
-    for (i = 0; i < output_height; i++)
-    {
-        for (j = 0; j < output_width; j++)
-        {
-            /* Apply filter */
-            Temp = ((int)src_ptr[0]         * vp8_filter[0]) +
-                   ((int)src_ptr[pixel_step] * vp8_filter[1]) +
-                   (VP8_FILTER_WEIGHT / 2);
-            output_ptr[j] = (unsigned int)(Temp >> VP8_FILTER_SHIFT);
-            src_ptr++;
-        }
-
-        /* Next row... */
-        src_ptr    += src_pixels_per_line - output_width;
-        output_ptr += output_pitch;
-    }
-}
-
-
-/****************************************************************************
- *
- *  ROUTINE       : filter_block2d_bil
- *
- *  INPUTS        : UINT8  *src_ptr          : Pointer to source block.
- *                  UINT32 src_pixels_per_line : Stride of input block.
- *                  INT32  *HFilter         : Array of 2 horizontal filter taps.
- *                  INT32  *VFilter         : Array of 2 vertical filter taps.
- *
- *  OUTPUTS       : UINT16 *output_ptr       : Pointer to filtered block.
- *
- *  RETURNS       : void
- *
- *  FUNCTION      : 2-D filters an input block by applying a 2-tap
- *                  bi-linear filter horizontally followed by a 2-tap
- *                  bi-linear filter vertically on the result.
- *
- *  SPECIAL NOTES : The largest block size can be handled here is 16x16
- *
- ****************************************************************************/
-void vp8_filter_block2d_bil
-(
-    unsigned char *src_ptr,
-    unsigned char *output_ptr,
-    unsigned int   src_pixels_per_line,
-    unsigned int   dst_pitch,
-    const int      *HFilter,
-    const int      *VFilter,
-    int            Width,
-    int            Height
-)
-{
-
-    unsigned short FData[17*16];    /* Temp data bufffer used in filtering */
-
-    /* First filter 1-D horizontally... */
-    vp8_filter_block2d_bil_first_pass(src_ptr, FData, src_pixels_per_line, 1, Height + 1, Width, HFilter);
-
-    /* then 1-D vertically... */
-    vp8_filter_block2d_bil_second_pass(FData, output_ptr, dst_pitch, Width, Width, Height, Width, VFilter);
-}
-
-
-void vp8_bilinear_predict4x4_c
-(
-    unsigned char  *src_ptr,
-    int   src_pixels_per_line,
-    int  xoffset,
-    int  yoffset,
-    unsigned char *dst_ptr,
-    int dst_pitch
-)
-{
-    const int  *HFilter;
-    const int  *VFilter;
-
-    HFilter = bilinear_filters[xoffset];
-    VFilter = bilinear_filters[yoffset];
-#if 0
-    {
-        int i;
-        unsigned char temp1[16];
-        unsigned char temp2[16];
-
-        bilinear_predict4x4_mmx(src_ptr, src_pixels_per_line, xoffset, yoffset, temp1, 4);
-        vp8_filter_block2d_bil(src_ptr, temp2, src_pixels_per_line, 4, HFilter, VFilter, 4, 4);
-
-        for (i = 0; i < 16; i++)
-        {
-            if (temp1[i] != temp2[i])
-            {
-                bilinear_predict4x4_mmx(src_ptr, src_pixels_per_line, xoffset, yoffset, temp1, 4);
-                vp8_filter_block2d_bil(src_ptr, temp2, src_pixels_per_line, 4, HFilter, VFilter, 4, 4);
-            }
-        }
-    }
-#endif
-    vp8_filter_block2d_bil(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 4, 4);
-
-}
-
-void vp8_bilinear_predict8x8_c
-(
-    unsigned char  *src_ptr,
-    int  src_pixels_per_line,
-    int  xoffset,
-    int  yoffset,
-    unsigned char *dst_ptr,
-    int  dst_pitch
-)
-{
-    const int  *HFilter;
-    const int  *VFilter;
-
-    HFilter = bilinear_filters[xoffset];
-    VFilter = bilinear_filters[yoffset];
-
-    vp8_filter_block2d_bil(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 8, 8);
-
-}
-
-void vp8_bilinear_predict8x4_c
-(
-    unsigned char  *src_ptr,
-    int  src_pixels_per_line,
-    int  xoffset,
-    int  yoffset,
-    unsigned char *dst_ptr,
-    int  dst_pitch
-)
-{
-    const int  *HFilter;
-    const int  *VFilter;
-
-    HFilter = bilinear_filters[xoffset];
-    VFilter = bilinear_filters[yoffset];
-
-    vp8_filter_block2d_bil(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 8, 4);
-
-}
-
-void vp8_bilinear_predict16x16_c
-(
-    unsigned char  *src_ptr,
-    int  src_pixels_per_line,
-    int  xoffset,
-    int  yoffset,
-    unsigned char *dst_ptr,
-    int  dst_pitch
-)
-{
-    const int  *HFilter;
-    const int  *VFilter;
-
-    HFilter = bilinear_filters[xoffset];
-    VFilter = bilinear_filters[yoffset];
-
-    vp8_filter_block2d_bil(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 16, 16);
-}

diff --git a/vp8/common/findnearmv.c b/vp8/common/findnearmv.c
index cab0403..2041afa 100644
--- a/vp8/common/findnearmv.c
+++ b/vp8/common/findnearmv.c

@@ -11,7 +11,12 @@
 
 #include "findnearmv.h"
 
-#define FINDNEAR_SEARCH_SITES   3
+const unsigned char vp8_mbsplit_offset[4][16] = {
+    { 0,  8,  0,  0,  0,  0,  0,  0,  0,  0,   0,  0,  0,  0,  0,  0},
+    { 0,  2,  0,  0,  0,  0,  0,  0,  0,  0,   0,  0,  0,  0,  0,  0},
+    { 0,  2,  8, 10,  0,  0,  0,  0,  0,  0,   0,  0,  0,  0,  0,  0},
+    { 0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15}
+};
 
 /* Predict motion vectors using those from already-decoded nearby blocks.
    Note that we only consider one 4x4 subblock from each candidate 16x16

diff --git a/vp8/common/findnearmv.h b/vp8/common/findnearmv.h
index cb5a58e..bdf9eca 100644
--- a/vp8/common/findnearmv.h
+++ b/vp8/common/findnearmv.h

@@ -70,4 +70,6 @@
 
 const B_MODE_INFO *vp8_above_bmi(const MODE_INFO *cur_mb, int b, int mi_stride);
 
+extern const unsigned char vp8_mbsplit_offset[4][16];
+
 #endif

diff --git a/vp8/common/fourcc.hpp b/vp8/common/fourcc.hpp
deleted file mode 100644
index c582628..0000000
--- a/vp8/common/fourcc.hpp
+++ /dev/null

@@ -1,121 +0,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef FOURCC_HPP
-#define FOURCC_HPP
-
-#include <iosfwd>
-#include <cstring>
-
-
-#if defined(__POWERPC__) || defined(__APPLE__) || defined(__MERKS__)
-using namespace std;
-#endif
-
-class four_cc
-{
-public:
-
-    four_cc();
-    four_cc(const char*);
-    explicit four_cc(unsigned long);
-
-    bool operator==(const four_cc&) const;
-    bool operator!=(const four_cc&) const;
-
-    bool operator==(const char*) const;
-    bool operator!=(const char*) const;
-
-    operator unsigned long() const;
-    unsigned long as_long() const;
-
-    four_cc& operator=(unsigned long);
-
-    char operator[](int) const;
-
-    std::ostream& put(std::ostream&) const;
-
-    bool printable() const;
-
-private:
-
-    union
-    {
-        char code[4];
-        unsigned long code_as_long;
-    };
-
-};
-
-
-inline four_cc::four_cc()
-{
-}
-
-inline four_cc::four_cc(unsigned long x)
-    : code_as_long(x)
-{
-}
-
-inline four_cc::four_cc(const char* str)
-{
-    memcpy(code, str, 4);
-}
-
-
-inline bool four_cc::operator==(const four_cc& rhs) const
-{
-    return code_as_long == rhs.code_as_long;
-}
-
-inline bool four_cc::operator!=(const four_cc& rhs) const
-{
-    return !operator==(rhs);
-}
-
-inline bool four_cc::operator==(const char* rhs) const
-{
-    return (memcmp(code, rhs, 4) == 0);
-}
-
-inline bool four_cc::operator!=(const char* rhs) const
-{
-    return !operator==(rhs);
-}
-
-
-inline four_cc::operator unsigned long() const
-{
-    return code_as_long;
-}
-
-inline unsigned long four_cc::as_long() const
-{
-    return code_as_long;
-}
-
-inline char four_cc::operator[](int i) const
-{
-    return code[i];
-}
-
-inline four_cc& four_cc::operator=(unsigned long val)
-{
-    code_as_long = val;
-    return *this;
-}
-
-inline std::ostream& operator<<(std::ostream& os, const four_cc& rhs)
-{
-    return rhs.put(os);
-}
-
-#endif

diff --git a/vp8/common/generic/systemdependent.c b/vp8/common/generic/systemdependent.c
index b3eadaf..5c64647 100644
--- a/vp8/common/generic/systemdependent.c
+++ b/vp8/common/generic/systemdependent.c

@@ -10,22 +10,16 @@
 
 
 #include "vpx_ports/config.h"
-#include "g_common.h"
-#include "subpixel.h"
-#include "loopfilter.h"
-#include "recon.h"
-#include "idct.h"
-#include "onyxc_int.h"
+#include "vp8/common/g_common.h"
+#include "vp8/common/subpixel.h"
+#include "vp8/common/loopfilter.h"
+#include "vp8/common/recon.h"
+#include "vp8/common/idct.h"
+#include "vp8/common/onyxc_int.h"
 
 extern void vp8_arch_x86_common_init(VP8_COMMON *ctx);
 extern void vp8_arch_arm_common_init(VP8_COMMON *ctx);
 
-void (*vp8_build_intra_predictors_mby_ptr)(MACROBLOCKD *x);
-extern void vp8_build_intra_predictors_mby(MACROBLOCKD *x);
-
-void (*vp8_build_intra_predictors_mby_s_ptr)(MACROBLOCKD *x);
-extern void vp8_build_intra_predictors_mby_s(MACROBLOCKD *x);
-
 void vp8_machine_specific_config(VP8_COMMON *ctx)
 {
 #if CONFIG_RUNTIME_CPU_DETECT
@@ -45,6 +39,10 @@
     rtcd->recon.recon4      = vp8_recon4b_c;
     rtcd->recon.recon_mb    = vp8_recon_mb_c;
     rtcd->recon.recon_mby   = vp8_recon_mby_c;
+    rtcd->recon.build_intra_predictors_mby =
+        vp8_build_intra_predictors_mby;
+    rtcd->recon.build_intra_predictors_mby_s =
+        vp8_build_intra_predictors_mby_s;
 
     rtcd->subpix.sixtap16x16   = vp8_sixtap_predict16x16_c;
     rtcd->subpix.sixtap8x8     = vp8_sixtap_predict8x8_c;
@@ -75,9 +73,6 @@
 #endif
 
 #endif
-    /* Pure C: */
-    vp8_build_intra_predictors_mby_ptr = vp8_build_intra_predictors_mby;
-    vp8_build_intra_predictors_mby_s_ptr = vp8_build_intra_predictors_mby_s;
 
 #if ARCH_X86 || ARCH_X86_64
     vp8_arch_x86_common_init(ctx);

diff --git a/vp8/common/mac_specs.h b/vp8/common/mac_specs.h
deleted file mode 100644
index 4b8ee58..0000000
--- a/vp8/common/mac_specs.h
+++ /dev/null

@@ -1,31 +0,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#if !defined(_mac_specs_h)
-#define _mac_specs_h
-
-
-#if defined(__cplusplus)
-extern "C" {
-#endif
-
-    extern unsigned int vp8_read_tsc();
-
-    extern unsigned int vp8_get_processor_freq();
-
-    extern unsigned int vpx_has_altivec();
-
-#if defined(__cplusplus)
-}
-#endif
-
-
-#endif

diff --git a/vp8/common/mbpitch.c b/vp8/common/mbpitch.c
index af55e2f..054042c 100644
--- a/vp8/common/mbpitch.c
+++ b/vp8/common/mbpitch.c

@@ -17,7 +17,7 @@
     DEST = 1
 } BLOCKSET;
 
-void vp8_setup_block
+static void setup_block
 (
     BLOCKD *b,
     int mv_stride,
@@ -43,7 +43,8 @@
 
 }
 
-void vp8_setup_macroblock(MACROBLOCKD *x, BLOCKSET bs)
+
+static void setup_macroblock(MACROBLOCKD *x, BLOCKSET bs)
 {
     int block;
 
@@ -64,16 +65,16 @@
 
     for (block = 0; block < 16; block++) /* y blocks */
     {
-        vp8_setup_block(&x->block[block], x->dst.y_stride, y, x->dst.y_stride,
+        setup_block(&x->block[block], x->dst.y_stride, y, x->dst.y_stride,
                         (block >> 2) * 4 * x->dst.y_stride + (block & 3) * 4, bs);
     }
 
     for (block = 16; block < 20; block++) /* U and V blocks */
     {
-        vp8_setup_block(&x->block[block], x->dst.uv_stride, u, x->dst.uv_stride,
+        setup_block(&x->block[block], x->dst.uv_stride, u, x->dst.uv_stride,
                         ((block - 16) >> 1) * 4 * x->dst.uv_stride + (block & 1) * 4, bs);
 
-        vp8_setup_block(&x->block[block+4], x->dst.uv_stride, v, x->dst.uv_stride,
+        setup_block(&x->block[block+4], x->dst.uv_stride, v, x->dst.uv_stride,
                         ((block - 16) >> 1) * 4 * x->dst.uv_stride + (block & 1) * 4, bs);
     }
 }
@@ -124,6 +125,6 @@
 {
 
     /* handle the destination pitch features */
-    vp8_setup_macroblock(x, DEST);
-    vp8_setup_macroblock(x, PRED);
+    setup_macroblock(x, DEST);
+    setup_macroblock(x, PRED);
 }

diff --git a/vp8/common/partialgfupdate.h b/vp8/common/partialgfupdate.h
deleted file mode 100644
index 115134a..0000000
--- a/vp8/common/partialgfupdate.h
+++ /dev/null

@@ -1,19 +0,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef __INC_PARTIALGFUPDATE_H
-#define __INC_PARTIALGFUPDATE_H
-
-#include "onyxc_int.h"
-
-extern void update_gf_selective(ONYX_COMMON *cm, MACROBLOCKD *x);
-
-#endif

diff --git a/vp8/common/postproc.c b/vp8/common/postproc.c
index d30068e..5bfc7d6 100644
--- a/vp8/common/postproc.c
+++ b/vp8/common/postproc.c

@@ -211,7 +211,7 @@
     }
 }
 
-int vp8_q2mbl(int x)
+static int q2mbl(int x)
 {
     if (x < 20) x = 20;
 
@@ -314,8 +314,8 @@
     (void) flag;
 
     POSTPROC_INVOKE(rtcd, downacross)(source->y_buffer, post->y_buffer, source->y_stride,  post->y_stride, source->y_height, source->y_width,  ppl);
-    POSTPROC_INVOKE(rtcd, across)(post->y_buffer, post->y_stride, post->y_height, post->y_width, vp8_q2mbl(q));
-    POSTPROC_INVOKE(rtcd, down)(post->y_buffer, post->y_stride, post->y_height, post->y_width, vp8_q2mbl(q));
+    POSTPROC_INVOKE(rtcd, across)(post->y_buffer, post->y_stride, post->y_height, post->y_width, q2mbl(q));
+    POSTPROC_INVOKE(rtcd, down)(post->y_buffer, post->y_stride, post->y_height, post->y_width, q2mbl(q));
 
     POSTPROC_INVOKE(rtcd, downacross)(source->u_buffer, post->u_buffer, source->uv_stride, post->uv_stride, source->uv_height, source->uv_width, ppl);
     POSTPROC_INVOKE(rtcd, downacross)(source->v_buffer, post->v_buffer, source->uv_stride, post->uv_stride, source->uv_height, source->uv_width, ppl);

diff --git a/vp8/common/predictdc.c b/vp8/common/predictdc.c
deleted file mode 100644
index f315f50..0000000
--- a/vp8/common/predictdc.c
+++ /dev/null

@@ -1,44 +0,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include <stdlib.h>
-#include "blockd.h"
-
-
-void vp8_predict_dc(short *lastdc, short *thisdc, short quant, short *cons)
-{
-    int diff;
-    int sign;
-    int last_dc = *lastdc;
-    int this_dc = *thisdc;
-
-    if (*cons  > DCPREDCNTTHRESH)
-    {
-        this_dc += last_dc;
-    }
-
-    diff = abs(last_dc - this_dc);
-    sign  = (last_dc >> 31) ^(this_dc >> 31);
-    sign |= (!last_dc | !this_dc);
-
-    if (sign)
-    {
-        *cons = 0;
-    }
-    else
-    {
-        if (diff <= DCPREDSIMTHRESH * quant)
-            (*cons)++ ;
-    }
-
-    *thisdc = this_dc;
-    *lastdc = this_dc;
-}

diff --git a/vp8/common/predictdc.h b/vp8/common/predictdc.h
deleted file mode 100644
index fa85968..0000000
--- a/vp8/common/predictdc.h
+++ /dev/null

@@ -1,18 +0,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef __PREDICTDC_H
-#define __PREDICTDC_H
-
-void uvvp8_predict_dc(short *lastdc, short *thisdc, short quant, short *cons);
-void vp8_predict_dc(short *lastdc, short *thisdc, short quant, short *cons);
-
-#endif

diff --git a/vp8/common/proposed.h b/vp8/common/proposed.h
deleted file mode 100644
index c965990..0000000
--- a/vp8/common/proposed.h
+++ /dev/null

@@ -1,71 +0,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-typedef struct core_codec *codec_ptr;
-typedef struct interface_table *interface_ptr;
-
-typedef struct
-{
-    void (*Initialize)();
-    void (*Shutdown)();
-    codec_ptr(*Create)();
-    int (*compress_frame)(codec_ptr, unsigned int *frame_flags, YV12_BUFFER_CONFIG *sd, unsigned long *size, char *dest, INT64 time_stamp);
-    int (*show_frame)(codec_ptr , YV12_BUFFER_CONFIG *dest, int deblock_level, int noise_level, int flags);
-    void (*Remove)(codec_ptr *comp);
-    interface_ptr(*get_interface)(unsigned int id);
-
-} core_codec;
-
-typedef struct
-{
-    int (*set_bitrate)(codec_ptr, END_USAGE usage, int Datarate);
-    int (*get_bitrate)(codec_ptr, END_USAGE *usage, int *Datarate);
-    int (*set_mode)(codec_ptr, MODE mode, int Speed, char *File);
-    int (*get_mode)(codec_ptr, MODE *mode, int *Speed, char **File);
-} codec_settings_basic;
-
-typedef struct
-{
-    int (*set_bitrate)(codec_ptr, END_USAGE usage, int Datarate);
-    int (*get_bitrate)(codec_ptr, END_USAGE *usage, int *Datarate);
-    int (*set_mode)(codec_ptr, MODE  mode, int  Speed, char   *File);
-    int (*get_mode)(codec_ptr, MODE *mode, int *Speed, char **File);
-    int (*set_denoise)(codec_ptr, int  Level);
-    int (*get_denoise)(codec_ptr, int *Level);
-    int (*set_sharpness)(codec_ptr, int  sharpness);
-    int (*get_sharpness)(codec_ptr, int *sharpness);
-    int (*set_keyframing)(codec_ptr, int  Auto, int  max_distance);
-    int (*get_keyframing)(codec_ptr, int *Auto, int *max_distance);
-    int (*set_buffering)(codec_ptr, int  buffer_level, int  max_buffer_level);
-    int (*get_buffering)(codec_ptr, int *buffer_level, int *max_buffer_level);
-    int (*set_adjust_frame_rate)(codec_ptr, int Allowed, int at_buffer_level_pct);
-    int (*get_adjust_frame_rate)(codec_ptr, int *Allowed, int *at_buffer_level_pct);
-    int (*set_adjust_frame_size)(codec_ptr, int Allowed, int down_at_buffer_level_pct, int up_at_buffer_level_pct);
-    int (*get_adjust_frame_size)(codec_ptr, int *Allowed, int *down_at_buffer_level_pct, int *up_at_buffer_level_pct);
-    int (*set_adjust_quality)(codec_ptr, int Allowed, int min_quantizer, int max_quantizer);
-    int (*get_adjust_quality)(codec_ptr, int *Allowed, int *min_quantizer, int *max_quantizer);
-    int (*set_vbrparms)(codec_ptr, int Bias, int Min, int Max);
-    int (*get_vbrparms)(codec_ptr, int *Bias, int *Min, int *Max);
-
-} codec_settings_v1;
-
-typedef struct
-{
-    int (*request_recovery)(codec_ptr);
-    int (*request_droppable)(codec_ptr);
-    int (*internal_size)(codec_ptr, VPX_SCALING Vertical, VPX_SCALING Horizontal);
-    int (*update_last)(codec_ptr);
-    int (*update_gold)(codec_ptr);
-    int (*use_only_last)(codec_ptr);
-    int (*use_only_gold)(codec_ptr);
-    int (*update_entropy)(codec_ptr);
-
-} codec_realtime_requests;

diff --git a/vp8/common/recon.h b/vp8/common/recon.h
index 1e6e343..e608f21 100644
--- a/vp8/common/recon.h
+++ b/vp8/common/recon.h

@@ -23,6 +23,9 @@
 #define prototype_recon_macroblock(sym) \
     void sym(const struct vp8_recon_rtcd_vtable *rtcd, MACROBLOCKD *x)
 
+#define prototype_build_intra_predictors(sym) \
+    void sym(MACROBLOCKD *x)
+
 struct vp8_recon_rtcd_vtable;
 
 #if ARCH_X86 || ARCH_X86_64
@@ -73,9 +76,23 @@
 #endif
 extern prototype_recon_macroblock(vp8_recon_recon_mby);
 
+#ifndef vp8_recon_build_intra_predictors_mby
+#define vp8_recon_build_intra_predictors_mby vp8_build_intra_predictors_mby
+#endif
+extern prototype_build_intra_predictors\
+    (vp8_recon_build_intra_predictors_mby);
+
+#ifndef vp8_recon_build_intra_predictors_mby_s
+#define vp8_recon_build_intra_predictors_mby_s vp8_build_intra_predictors_mby_s
+#endif
+extern prototype_build_intra_predictors\
+    (vp8_recon_build_intra_predictors_mby_s);
+
+
 typedef prototype_copy_block((*vp8_copy_block_fn_t));
 typedef prototype_recon_block((*vp8_recon_fn_t));
 typedef prototype_recon_macroblock((*vp8_recon_mb_fn_t));
+typedef prototype_build_intra_predictors((*vp8_build_intra_pred_fn_t));
 typedef struct vp8_recon_rtcd_vtable
 {
     vp8_copy_block_fn_t  copy16x16;
@@ -86,6 +103,8 @@
     vp8_recon_fn_t       recon4;
     vp8_recon_mb_fn_t    recon_mb;
     vp8_recon_mb_fn_t    recon_mby;
+    vp8_build_intra_pred_fn_t  build_intra_predictors_mby_s;
+    vp8_build_intra_pred_fn_t  build_intra_predictors_mby;
 } vp8_recon_rtcd_vtable_t;
 
 #if CONFIG_RUNTIME_CPU_DETECT
@@ -94,6 +113,5 @@
 #define RECON_INVOKE(ctx,fn) vp8_recon_##fn
 #endif
 
-void vp8_recon_intra4x4mb(const vp8_recon_rtcd_vtable_t *rtcd, MACROBLOCKD *x);
 void vp8_recon_intra_mbuv(const vp8_recon_rtcd_vtable_t *rtcd, MACROBLOCKD *x);
 #endif

diff --git a/vp8/common/reconinter.c b/vp8/common/reconinter.c
index 74871c0..7cfab41 100644
--- a/vp8/common/reconinter.c
+++ b/vp8/common/reconinter.c

@@ -168,7 +168,7 @@
     }
 }
 
-void vp8_build_inter_predictors4b(MACROBLOCKD *x, BLOCKD *d, int pitch)
+static void build_inter_predictors4b(MACROBLOCKD *x, BLOCKD *d, int pitch)
 {
     unsigned char *ptr_base;
     unsigned char *ptr;
@@ -187,7 +187,7 @@
     }
 }
 
-void vp8_build_inter_predictors2b(MACROBLOCKD *x, BLOCKD *d, int pitch)
+static void build_inter_predictors2b(MACROBLOCKD *x, BLOCKD *d, int pitch)
 {
     unsigned char *ptr_base;
     unsigned char *ptr;
@@ -246,7 +246,7 @@
             BLOCKD *d1 = &x->block[i+1];
 
             if (d0->bmi.mv.as_int == d1->bmi.mv.as_int)
-                vp8_build_inter_predictors2b(x, d0, 8);
+                build_inter_predictors2b(x, d0, 8);
             else
             {
                 vp8_build_inter_predictors_b(d0, 8, x->subpixel_predict);
@@ -291,7 +291,7 @@
             for (i = 0; i < 4; i++)
             {
                 BLOCKD *d = &x->block[bbb[i]];
-                vp8_build_inter_predictors4b(x, d, 16);
+                build_inter_predictors4b(x, d, 16);
             }
 
         }
@@ -303,7 +303,7 @@
                 BLOCKD *d1 = &x->block[i+1];
 
                 if (d0->bmi.mv.as_int == d1->bmi.mv.as_int)
-                    vp8_build_inter_predictors2b(x, d0, 16);
+                    build_inter_predictors2b(x, d0, 16);
                 else
                 {
                     vp8_build_inter_predictors_b(d0, 16, x->subpixel_predict);
@@ -372,7 +372,7 @@
             for (i = 0; i < 4; i++)
             {
                 BLOCKD *d = &x->block[bbb[i]];
-                vp8_build_inter_predictors4b(x, d, 16);
+                build_inter_predictors4b(x, d, 16);
             }
         }
         else
@@ -383,7 +383,7 @@
                 BLOCKD *d1 = &x->block[i+1];
 
                 if (d0->bmi.mv.as_int == d1->bmi.mv.as_int)
-                    vp8_build_inter_predictors2b(x, d0, 16);
+                    build_inter_predictors2b(x, d0, 16);
                 else
                 {
                     vp8_build_inter_predictors_b(d0, 16, x->subpixel_predict);
@@ -400,7 +400,7 @@
             BLOCKD *d1 = &x->block[i+1];
 
             if (d0->bmi.mv.as_int == d1->bmi.mv.as_int)
-                vp8_build_inter_predictors2b(x, d0, 8);
+                build_inter_predictors2b(x, d0, 8);
             else
             {
                 vp8_build_inter_predictors_b(d0, 8, x->subpixel_predict);
@@ -600,7 +600,7 @@
             for (i = 0; i < 4; i++)
             {
                 BLOCKD *d = &x->block[bbb[i]];
-                /*vp8_build_inter_predictors4b(x, d, 16);*/
+                /*build_inter_predictors4b(x, d, 16);*/
 
                 {
                     unsigned char *ptr_base;
@@ -630,7 +630,7 @@
 
                 if (d0->bmi.mv.as_int == d1->bmi.mv.as_int)
                 {
-                    /*vp8_build_inter_predictors2b(x, d0, 16);*/
+                    /*build_inter_predictors2b(x, d0, 16);*/
                     unsigned char *ptr_base;
                     unsigned char *ptr;
                     unsigned char *pred_ptr = d0->predictor;
@@ -662,7 +662,7 @@
 
             if (d0->bmi.mv.as_int == d1->bmi.mv.as_int)
             {
-                /*vp8_build_inter_predictors2b(x, d0, 8);*/
+                /*build_inter_predictors2b(x, d0, 8);*/
                 unsigned char *ptr_base;
                 unsigned char *ptr;
                 unsigned char *pred_ptr = d0->predictor;

diff --git a/vp8/common/reconintra.h b/vp8/common/reconintra.h
index 988b43a..4025a53 100644
--- a/vp8/common/reconintra.h
+++ b/vp8/common/reconintra.h

@@ -14,13 +14,6 @@
 
 extern void init_intra_left_above_pixels(MACROBLOCKD *x);
 
-extern void (*vp8_build_intra_predictors_mby_ptr)(MACROBLOCKD *x);
-extern void vp8_build_intra_predictors_mby(MACROBLOCKD *x);
-extern void vp8_build_intra_predictors_mby_neon(MACROBLOCKD *x);
-extern void (*vp8_build_intra_predictors_mby_s_ptr)(MACROBLOCKD *x);
-extern void vp8_build_intra_predictors_mby_s(MACROBLOCKD *x);
-extern void vp8_build_intra_predictors_mby_s_neon(MACROBLOCKD *x);
-
 extern void vp8_build_intra_predictors_mbuv(MACROBLOCKD *x);
 extern void vp8_build_intra_predictors_mbuv_s(MACROBLOCKD *x);
 

diff --git a/vp8/common/reconintra4x4.c b/vp8/common/reconintra4x4.c
index db44fa1..cd70dca 100644
--- a/vp8/common/reconintra4x4.c
+++ b/vp8/common/reconintra4x4.c

@@ -313,89 +313,3 @@
 }
 
 
-void vp8_recon_intra4x4mb(const vp8_recon_rtcd_vtable_t *rtcd, MACROBLOCKD *x)
-{
-    int i;
-
-    vp8_intra_prediction_down_copy(x);
-
-#if ARCH_ARM
-    {
-        BLOCKD *b = &x->block[0];
-
-        vp8_predict_intra4x4(b, b->bmi.mode, b->predictor);
-        RECON_INVOKE(rtcd, recon)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
-        b += 1;
-
-        vp8_predict_intra4x4(b, b->bmi.mode, b->predictor);
-        RECON_INVOKE(rtcd, recon)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
-        b += 1;
-
-        vp8_predict_intra4x4(b, b->bmi.mode, b->predictor);
-        RECON_INVOKE(rtcd, recon)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
-        b += 1;
-
-        vp8_predict_intra4x4(b, b->bmi.mode, b->predictor);
-        RECON_INVOKE(rtcd, recon)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
-        b += 1;
-
-        vp8_predict_intra4x4(b, b->bmi.mode, b->predictor);
-        RECON_INVOKE(rtcd, recon)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
-        b += 1;
-
-        vp8_predict_intra4x4(b, b->bmi.mode, b->predictor);
-        RECON_INVOKE(rtcd, recon)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
-        b += 1;
-
-        vp8_predict_intra4x4(b, b->bmi.mode, b->predictor);
-        RECON_INVOKE(rtcd, recon)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
-        b += 1;
-
-        vp8_predict_intra4x4(b, b->bmi.mode, b->predictor);
-        RECON_INVOKE(rtcd, recon)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
-        b += 1;
-
-        vp8_predict_intra4x4(b, b->bmi.mode, b->predictor);
-        RECON_INVOKE(rtcd, recon)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
-        b += 1;
-
-        vp8_predict_intra4x4(b, b->bmi.mode, b->predictor);
-        RECON_INVOKE(rtcd, recon)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
-        b += 1;
-
-        vp8_predict_intra4x4(b, b->bmi.mode, b->predictor);
-        RECON_INVOKE(rtcd, recon)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
-        b += 1;
-
-        vp8_predict_intra4x4(b, b->bmi.mode, b->predictor);
-        RECON_INVOKE(rtcd, recon)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
-        b += 1;
-
-        vp8_predict_intra4x4(b, b->bmi.mode, b->predictor);
-        RECON_INVOKE(rtcd, recon)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
-        b += 1;
-
-        vp8_predict_intra4x4(b, b->bmi.mode, b->predictor);
-        RECON_INVOKE(rtcd, recon)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
-        b += 1;
-
-        vp8_predict_intra4x4(b, b->bmi.mode, b->predictor);
-        RECON_INVOKE(rtcd, recon)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
-        b += 1;
-
-        vp8_predict_intra4x4(b, b->bmi.mode, b->predictor);
-        RECON_INVOKE(rtcd, recon)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
-    }
-#else
-    for (i = 0; i < 16; i++)
-    {
-        BLOCKD *b = &x->block[i];
-
-        vp8_predict_intra4x4(b, x->block[i].bmi.mode, x->block[i].predictor);
-        RECON_INVOKE(rtcd, recon)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
-    }
-#endif
-
-    vp8_recon_intra_mbuv(rtcd, x);
-
-}

diff --git a/vp8/common/threading.h b/vp8/common/threading.h
index 0dd7bbc..44eaf08 100644
--- a/vp8/common/threading.h
+++ b/vp8/common/threading.h

@@ -14,7 +14,7 @@
 
 #define VPXINFINITE 10000       /* 10second. */
 
-#if CONFIG_OS_SUPPORT
+#if CONFIG_OS_SUPPORT && CONFIG_MULTITHREAD
 
 /* Thread management macros */
 #ifdef _WIN32
@@ -38,6 +38,7 @@
 #define pthread_self() GetCurrentThreadId()
 #else
 #ifdef __APPLE__
+#include <mach/mach_init.h>
 #include <mach/semaphore.h>
 #include <mach/task.h>
 #include <time.h>
@@ -90,8 +91,6 @@
 #define x86_pause_hint()
 #endif
 
-#else /* CONFIG_OS_SUPPORT = 0 */
-#define THREAD_FUNCTION void *
-#endif /* CONFIG_OS_SUPPORT */
+#endif /* CONFIG_OS_SUPPORT && CONFIG_MULTITHREAD */
 
 #endif

diff --git a/vp8/common/vfwsetting.hpp b/vp8/common/vfwsetting.hpp
deleted file mode 100644
index 44869ec..0000000
--- a/vp8/common/vfwsetting.hpp
+++ /dev/null

@@ -1,76 +0,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#if !defined(VFWSETTING_HPP)
-#define VFWSETTING_HPP
-//______________________________________________________________________________
-//
-//  VFWSetting.hpp
-//
-
-#include "four_cc.hpp"
-#include <iosfwd>
-
-namespace vpxvp
-{
-
-    //--------------------------------------
-    class VFWSetting
-    {
-        friend std::ostream& operator<<(std::ostream& os, const VFWSetting& vfws);
-
-    public:
-
-        enum Mode
-        {
-            m_setting,
-            m_config
-        };
-
-        enum
-        {
-            header_size = 8,
-            Size = 16
-        };
-
-        VFWSetting(four_cc fcc);
-        ~VFWSetting();
-
-        four_cc fcc() const;
-        Mode mode() const;
-
-        int setting() const;
-        int value() const;
-        void setting_value(int i_setting, int i_value);  //  Sets mode to m_setting
-
-        long size() const;
-        const void* data() const;
-        int data(const void* p_data, unsigned long ul_size);
-
-    private:
-
-        VFWSetting(const VFWSetting& vfws);  //  Not implemented
-        VFWSetting& operator=(const VFWSetting& vfws);  //  Not implemented
-
-        int extract_(const void* p_data, unsigned long ul_size);
-        void update_() const;
-
-        four_cc m_fcc;
-        Mode m_mode;
-        int m_i_setting;
-        int m_i_value;
-
-        mutable unsigned char m_p_data[Size];
-    };
-
-}  //  namespace vpxvp
-
-#endif  //  VFWSETTING_HPP

diff --git a/vp8/common/vpx_ref_build_prefix.h b/vp8/common/vpx_ref_build_prefix.h
deleted file mode 100644
index a2fce65..0000000
--- a/vp8/common/vpx_ref_build_prefix.h
+++ /dev/null

@@ -1,24 +0,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef _VPX_REF_BUILD_PREFIX_h
-#define _VPX_REF_BUILD_PREFIX_h
-
-#if defined(__cplusplus)
-extern "C" {
-#endif
-
-
-#if defined(__cplusplus)
-}
-#endif
-
-#endif /* include guards */

diff --git a/vp8/common/vpxblit.h b/vp8/common/vpxblit.h
deleted file mode 100644
index a95d905..0000000
--- a/vp8/common/vpxblit.h
+++ /dev/null

@@ -1,112 +0,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef VPXBLIT_H_INCL
-#define VPXBLIT_H_INCL
-/*==============================================================================
-                              Includes
-==============================================================================*/
-
-/*==============================================================================
-                              Defines
-==============================================================================*/
-
-
-#ifdef VPX_BIG_ENDIAN
-#define BYTE_ZERO(X)    ((X & 0xFF000000) >> (24 - 2)   )
-#define BYTE_ONE(X)     ((X & 0x00FF0000) >> (16 - 2)   )
-#define BYTE_TWO(X)     ((X & 0x0000FF00) >> (8 - 2)    )
-#define BYTE_THREE(X)   ((X & 0x000000FF) << (0 + 2)    )
-
-#define BYTE_ZERO_UV(X) ((X & 0x0000FF00) >> (8 - 2)    )
-#define BYTE_ONE_UV(X)  ((X & 0x000000FF) << (0 + 2)    )
-
-#define REREFERENCE(X) (*((int *) &(X)))
-
-#else
-
-#define BYTE_THREE(X)   ((X & 0xFF000000) >> (24 - 2)   )
-#define BYTE_TWO(X)     ((X & 0x00FF0000) >> (16 - 2)   )
-#define BYTE_ONE(X)     ((X & 0x0000FF00) >> (8 - 2)    )
-#define BYTE_ZERO(X)    ((X & 0x000000FF) << (0 + 2)    )
-
-#define BYTE_ONE_UV(X) ((X & 0x0000FF00) >> (8 - 2) )
-#define BYTE_ZERO_UV(X)     ((X & 0x000000FF) << (0 + 2)    )
-
-#define REREFERENCE(X) (*((int *) &(X)))
-
-#endif
-
-
-/*==============================================================================
-                            Type Definitions
-==============================================================================*/
-typedef struct  // YUV buffer configuration structure
-{
-    int   y_width;
-    int   y_height;
-    int   y_stride;
-
-    int   uv_width;
-    int   uv_height;
-    int   uv_stride;
-
-    char *y_buffer;
-    char *u_buffer;
-    char *v_buffer;
-
-    char *uv_start;
-    int   uv_dst_area;
-    int   uv_used_area;
-
-} VPX_BLIT_CONFIG;
-
-typedef struct tx86_params
-{
-    unsigned int pushed_registers[6];
-    unsigned int return_address;
-    unsigned int dst;
-    unsigned int scrn_pitch;
-    VPX_BLIT_CONFIG *buff_config;
-} x86_params;
-
-/*=============================================================================
-                                Enums
-==============================================================================*/
-
-
-/*==============================================================================
-                              Structures
-==============================================================================*/
-
-/*==============================================================================
-                             Constants
-==============================================================================*/
-
-
-/*==============================================================================
-                               Variables
-==============================================================================*/
-
-
-
-
-/*==============================================================================
-                            Function Protoypes/MICROS
-==============================================================================*/
-int vpx_get_size_of_pixel(unsigned int bd);
-void *vpx_get_blitter(unsigned int bd);
-void vpx_set_blit(void);
-void vpx_destroy_blit(void);
-
-
-
-#endif //VPXBLIT_H_INCL

diff --git a/vp8/common/vpxblit_c64.h b/vp8/common/vpxblit_c64.h
deleted file mode 100644
index 4ee617f..0000000
--- a/vp8/common/vpxblit_c64.h
+++ /dev/null

@@ -1,48 +0,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef _VPX_BLIT_C64_h
-#define _VPX_BLIT_C64_h
-
-/****************************************************************************
-*  Typedefs
-****************************************************************************/
-
-typedef struct  // YUV buffer configuration structure
-{
-    int     y_width;
-    int     y_height;
-    int     y_stride;
-
-    int     uv_width;
-    int     uv_height;
-    int     uv_stride;
-
-    unsigned char *y_buffer;
-    unsigned char *u_buffer;
-    unsigned char *v_buffer;
-
-    unsigned char *y_ptr_scrn;
-    unsigned char *u_ptr_scrn;
-    unsigned char *v_ptr_scrn;
-
-} DXV_YUV_BUFFER_CONFIG;
-
-typedef struct
-{
-    unsigned char *rgbptr_scrn;
-    unsigned char *y_ptr_scrn;
-    unsigned char *u_ptr_scrn;
-    unsigned char *v_ptr_scrn;
-    unsigned char *rgbptr_scrn2;
-} DXV_FINAL_VIDEO;
-
-#endif /* include guards */

diff --git a/vp8/common/vpxerrors.h b/vp8/common/vpxerrors.h
deleted file mode 100644
index b70f296..0000000
--- a/vp8/common/vpxerrors.h
+++ /dev/null

@@ -1,13 +0,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-
-#define ALLOC_FAILURE -2

diff --git a/vp8/common/x86/loopfilter_x86.c b/vp8/common/x86/loopfilter_x86.c
index 3621fe1..5837bc0 100644
--- a/vp8/common/x86/loopfilter_x86.c
+++ b/vp8/common/x86/loopfilter_x86.c

@@ -10,7 +10,7 @@
 
 
 #include "vpx_ports/config.h"
-#include "loopfilter.h"
+#include "vp8/common/loopfilter.h"
 
 prototype_loopfilter(vp8_loop_filter_horizontal_edge_c);
 prototype_loopfilter(vp8_loop_filter_vertical_edge_c);

diff --git a/vp8/common/x86/subpixel_mmx.asm b/vp8/common/x86/subpixel_mmx.asm
index 23ed4e2..9004b52 100644
--- a/vp8/common/x86/subpixel_mmx.asm
+++ b/vp8/common/x86/subpixel_mmx.asm

@@ -113,97 +113,6 @@
     ret
 
 
-;
-; THIS FUNCTION APPEARS TO BE UNUSED
-;
-;void vp8_filter_block1d_v6_mmx
-;(
-;   short *src_ptr,
-;   unsigned char *output_ptr,
-;   unsigned int pixels_per_line,
-;   unsigned int pixel_step,
-;   unsigned int output_height,
-;   unsigned int output_width,
-;   short * vp8_filter
-;)
-global sym(vp8_filter_block1d_v6_mmx)
-sym(vp8_filter_block1d_v6_mmx):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 7
-    GET_GOT     rbx
-    push        rsi
-    push        rdi
-    ; end prolog
-
-        movq      mm5, [GLOBAL(rd)]
-        push        rbx
-        mov         rbx, arg(6) ;vp8_filter
-        movq      mm1, [rbx + 16]             ; do both the negative taps first!!!
-        movq      mm2, [rbx + 32]         ;
-        movq      mm6, [rbx + 48]        ;
-        movq      mm7, [rbx + 64]        ;
-
-        movsxd      rdx, dword ptr arg(2) ;pixels_per_line
-        mov         rdi, arg(1) ;output_ptr
-        mov         rsi, arg(0) ;src_ptr
-        sub         rsi, rdx
-        sub         rsi, rdx
-        movsxd      rcx, DWORD PTR arg(4) ;output_height
-        movsxd      rax, DWORD PTR arg(5) ;output_width      ; destination pitch?
-        pxor        mm0, mm0              ; mm0 = 00000000
-
-
-nextrow_v:
-        movq        mm3, [rsi+rdx]        ; mm3 = p0..p8  = row -1
-        pmullw      mm3, mm1              ; mm3 *= kernel 1 modifiers.
-
-
-        movq        mm4, [rsi + 4*rdx]      ; mm4 = p0..p3  = row 2
-        pmullw      mm4, mm7              ; mm4 *= kernel 4 modifiers.
-        paddsw      mm3, mm4              ; mm3 += mm4
-
-        movq        mm4, [rsi + 2*rdx]           ; mm4 = p0..p3  = row 0
-        pmullw      mm4, mm2              ; mm4 *= kernel 2 modifiers.
-        paddsw      mm3, mm4              ; mm3 += mm4
-
-        movq        mm4, [rsi]            ; mm4 = p0..p3  = row -2
-        pmullw      mm4, [rbx]            ; mm4 *= kernel 0 modifiers.
-        paddsw      mm3, mm4              ; mm3 += mm4
-
-
-        add         rsi, rdx              ; move source forward 1 line to avoid 3 * pitch
-        movq        mm4, [rsi + 2*rdx]     ; mm4 = p0..p3  = row 1
-        pmullw      mm4, mm6              ; mm4 *= kernel 3 modifiers.
-        paddsw      mm3, mm4              ; mm3 += mm4
-
-        movq        mm4, [rsi + 4*rdx]    ; mm4 = p0..p3  = row 3
-        pmullw      mm4, [rbx +80]        ; mm4 *= kernel 3 modifiers.
-        paddsw      mm3, mm4              ; mm3 += mm4
-
-
-        paddsw      mm3, mm5               ; mm3 += round value
-        psraw       mm3, VP8_FILTER_SHIFT     ; mm3 /= 128
-        packuswb    mm3, mm0              ; pack and saturate
-
-        movd        [rdi],mm3             ; store the results in the destination
-
-        add         rdi,rax;
-
-        dec         rcx                   ; decrement count
-        jnz         nextrow_v             ; next row
-
-        pop         rbx
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
 ;void vp8_filter_block1dc_v6_mmx
 ;(
 ;   short *src_ptr,

diff --git a/vp8/common/x86/vp8_asm_stubs.c b/vp8/common/x86/vp8_asm_stubs.c
index 8dd07c9..7904006 100644
--- a/vp8/common/x86/vp8_asm_stubs.c
+++ b/vp8/common/x86/vp8_asm_stubs.c

@@ -11,7 +11,7 @@
 
 #include "vpx_ports/config.h"
 #include "vpx_ports/mem.h"
-#include "subpixel.h"
+#include "vp8/common/subpixel.h"
 
 extern const short vp8_six_tap_mmx[8][6*8];
 extern const short vp8_bilinear_filters_mmx[8][2*8];

diff --git a/vp8/common/x86/x86_systemdependent.c b/vp8/common/x86/x86_systemdependent.c
index 38500fd..e89c07a 100644
--- a/vp8/common/x86/x86_systemdependent.c
+++ b/vp8/common/x86/x86_systemdependent.c

@@ -11,13 +11,13 @@
 
 #include "vpx_ports/config.h"
 #include "vpx_ports/x86.h"
-#include "g_common.h"
-#include "subpixel.h"
-#include "loopfilter.h"
-#include "recon.h"
-#include "idct.h"
-#include "pragmas.h"
-#include "onyxc_int.h"
+#include "vp8/common/g_common.h"
+#include "vp8/common/subpixel.h"
+#include "vp8/common/loopfilter.h"
+#include "vp8/common/recon.h"
+#include "vp8/common/idct.h"
+#include "vp8/common/pragmas.h"
+#include "vp8/common/onyxc_int.h"
 
 void vp8_arch_x86_common_init(VP8_COMMON *ctx)
 {

diff --git a/vp8/decoder/arm/arm_dsystemdependent.c b/vp8/decoder/arm/arm_dsystemdependent.c
index e9741e2..51e901d 100644
--- a/vp8/decoder/arm/arm_dsystemdependent.c
+++ b/vp8/decoder/arm/arm_dsystemdependent.c

@@ -11,12 +11,11 @@
 
 #include "vpx_ports/config.h"
 #include "vpx_ports/arm.h"
-#include "blockd.h"
-#include "pragmas.h"
-#include "postproc.h"
-#include "dboolhuff.h"
-#include "dequantize.h"
-#include "onyxd_int.h"
+#include "vp8/common/blockd.h"
+#include "vp8/common/pragmas.h"
+#include "vp8/common/postproc.h"
+#include "vp8/decoder/dequantize.h"
+#include "vp8/decoder/onyxd_int.h"
 
 void vp8_arch_arm_decode_init(VP8D_COMP *pbi)
 {
@@ -35,12 +34,6 @@
         pbi->dequant.dc_idct_add_y_block = vp8_dequant_dc_idct_add_y_block_v6;
         pbi->dequant.idct_add_y_block    = vp8_dequant_idct_add_y_block_v6;
         pbi->dequant.idct_add_uv_block   = vp8_dequant_idct_add_uv_block_v6;
-#if 0 /*For use with RTCD, when implemented*/
-        pbi->dboolhuff.start             = vp8dx_start_decode_c;
-        pbi->dboolhuff.fill              = vp8dx_bool_decoder_fill_c;
-        pbi->dboolhuff.debool            = vp8dx_decode_bool_c;
-        pbi->dboolhuff.devalue           = vp8dx_decode_value_c;
-#endif
     }
 #endif
 
@@ -54,12 +47,6 @@
         pbi->dequant.dc_idct_add_y_block = vp8_dequant_dc_idct_add_y_block_neon;
         pbi->dequant.idct_add_y_block    = vp8_dequant_idct_add_y_block_neon;
         pbi->dequant.idct_add_uv_block   = vp8_dequant_idct_add_uv_block_neon;
-#if 0 /*For use with RTCD, when implemented*/
-        pbi->dboolhuff.start             = vp8dx_start_decode_c;
-        pbi->dboolhuff.fill              = vp8dx_bool_decoder_fill_c;
-        pbi->dboolhuff.debool            = vp8dx_decode_bool_c;
-        pbi->dboolhuff.devalue           = vp8dx_decode_value_c;
-#endif
     }
 #endif
 #endif

diff --git a/vp8/decoder/arm/armv6/dboolhuff_v6.asm b/vp8/decoder/arm/armv6/dboolhuff_v6.asm
deleted file mode 100644
index 6515804..0000000
--- a/vp8/decoder/arm/armv6/dboolhuff_v6.asm
+++ /dev/null

@@ -1,163 +0,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    EXPORT  |vp8_decode_value_v6|
-    EXPORT  |vp8dx_start_decode_v6|
-    EXPORT  |vp8dx_stop_decode_v6|
-    EXPORT  |vp8dx_decode_bool_v6|
-
-    ARM
-    REQUIRE8
-    PRESERVE8
-
-    INCLUDE vpx_asm_offsets.asm
-
-br      RN  r0
-prob    RN  r1
-bits    RN  r1
-    AREA    |.text|, CODE, READONLY  ; name this block of code
-
-;   int z = 0;
-;   int bit;
-;   for ( bit=bits-1; bit>=0; bit-- )
-;   {
-;       z |= (vp8dx_decode_bool(br, 0x80)<<bit);
-;   }
-;   return z;
-
-;int vp8_decode_value_v6 ( BOOL_DECODER *br, int bits )
-|vp8_decode_value_v6| PROC
-    stmdb   sp!, {r4 - r6, lr}
-    mov     r4, br
-    mov     r5, bits
-    mov     r6, #0
-
-    subs    r5, r5, #1
-    bmi     decode_value_exit
-
-decode_value_loop
-    mov     prob, #0x80
-    mov     br, r4
-    bl      vp8dx_decode_bool_v6_internal     ; needed for conversion to s file
-    orr     r6, r6, r0, lsl r5
-    subs    r5, r5, #1
-    bpl     decode_value_loop
-
-decode_value_exit
-    mov     r0, r6
-    ldmia   sp!, {r4 - r6, pc}
-    ENDP    ; |vp8_decode_value_v6|
-
-
-;void vp8dx_start_decode_v6 ( BOOL_DECODER *br, unsigned char *source )
-|vp8dx_start_decode_v6| PROC
-    stmdb   sp!, {r4 - r5, lr}
-    mov     r2, #0
-    mov     r3, #255
-
-    str     r2, [br, #bool_decoder_lowvalue]
-    str     r3, [br, #bool_decoder_range]
-    str     r1, [br, #bool_decoder_buffer]
-
-    mov     r3, #8
-    mov     r2, #4
-    str     r3, [br, #bool_decoder_count]
-    str     r2, [br, #bool_decoder_pos]
-
-    ldrb    r2, [r1, #3]
-    ldrb    r3, [r1, #2]
-    ldrb    r4, [r1, #1]
-    ldrb    r5, [r1]
-
-    orr     r1, r2, r3, lsl #8
-    orr     r1, r1, r4, lsl #16
-    orr     r1, r1, r5, lsl #24
-
-    str     r1, [br, #bool_decoder_value]
-
-    ldmia   sp!, {r4 - r5, pc}
-    ENDP    ; |vp8dx_start_decode_v6|
-
-
-;void vp8dx_stop_decode_v6 ( BOOL_DECODER *bc );
-|vp8dx_stop_decode_v6| PROC
-    mov     pc, lr
-    ENDP    ; |vp8dx_stop_decode_v6|
-
-
-; bigsplit  RN  r1
-; buffer_v  RN  r1
-; count_v       RN  r4
-; range_v       RN  r2
-; value_v       RN  r3
-; pos_v     RN  r5
-; split     RN  r6
-; bit           RN  lr
-;int vp8dx_decode_bool_v6 ( BOOL_DECODER *br, int probability )
-|vp8dx_decode_bool_v6| PROC
-vp8dx_decode_bool_v6_internal
-    stmdb   sp!, {r4 - r6, lr}
-
-    ldr     r2, [br, #bool_decoder_range]
-    ldr     r3, [br, #bool_decoder_value]
-
-    mov     r6, r2, lsl #8
-    sub     r6, r6, #256                ;   split = 1 +  (((range-1) * probability) >> 8)
-    mov     r12, #1
-    smlawb  r6, r6, prob, r12
-
-    mov     lr, #0
-    subs    r5, r3, r6, lsl #24
-
-    ;cmp        r3, r1
-    movhs   lr, #1
-    movhs   r3, r5
-    subhs   r2, r2, r6
-    movlo   r2, r6
-
-    cmp     r2, #0x80
-    blt     range_less_0x80
-    ;strd   r2, r3, [br, #bool_decoder_range]
-    str     r2, [br, #bool_decoder_range]
-    str     r3, [br, #bool_decoder_value]
-    mov     r0, lr
-    ldmia   sp!, {r4 - r6, pc}
-
-range_less_0x80
-    ldr     r5, [br, #bool_decoder_pos]
-    ldr     r1, [br, #bool_decoder_buffer]
-    ldr     r4, [br, #bool_decoder_count]
-    add     r1, r1, r5
-
-    clz       r12, r2
-    sub       r12, r12, #24
-    subs      r4, r4, r12
-    ldrleb    r6, [r1], #1
-    mov       r2, r2, lsl r12
-    mov       r3, r3, lsl r12
-    addle     r4, r4, #8
-    rsble     r12, r4, #8
-    addle     r5, r5, #1
-    orrle     r3, r3, r6, lsl r12
-
-    ;strd       r2, r3, [br, #bool_decoder_range]
-    ;strd       r4, r5, [br, #bool_decoder_count]
-    str         r2, [br, #bool_decoder_range]
-    str         r3, [br, #bool_decoder_value]
-    str         r4, [br, #bool_decoder_count]
-    str         r5, [br, #bool_decoder_pos]
-
-    mov     r0, lr
-
-    ldmia   sp!, {r4 - r6, pc}
-    ENDP    ; |vp8dx_decode_bool_v6|
-
-    END

diff --git a/vp8/decoder/arm/armv6/idct_blk_v6.c b/vp8/decoder/arm/armv6/idct_blk_v6.c
index 3c7bc50..57c3446 100644
--- a/vp8/decoder/arm/armv6/idct_blk_v6.c
+++ b/vp8/decoder/arm/armv6/idct_blk_v6.c

@@ -9,8 +9,8 @@
  */
 
 #include "vpx_ports/config.h"
-#include "idct.h"
-#include "dequantize.h"
+#include "vp8/common/idct.h"
+#include "vp8/decoder/dequantize.h"
 
 void vp8_dequant_dc_idct_add_y_block_v6
             (short *q, short *dq, unsigned char *pre,

diff --git a/vp8/decoder/arm/dboolhuff_arm.h b/vp8/decoder/arm/dboolhuff_arm.h
deleted file mode 100644
index 985951c..0000000
--- a/vp8/decoder/arm/dboolhuff_arm.h
+++ /dev/null

@@ -1,43 +0,0 @@
-#ifndef DBOOLHUFF_ARM_H
-#define DBOOLHUFF_ARM_H
-
-/* JLK
- * There are currently no arm-optimized versions of
- * these functions. As they are implemented, they
- * can be uncommented below and added to
- * arm/dsystemdependent.c
- *
- * The existing asm code is likely so different as
- * to be useless. However, its been left (for now)
- * for reference.
- */
-#if 0
-#if HAVE_ARMV6
-#undef vp8_dbool_start
-#define vp8_dbool_start vp8dx_start_decode_v6
-
-#undef vp8_dbool_fill
-#define vp8_dbool_fill vp8_bool_decoder_fill_v6
-
-#undef vp8_dbool_debool
-#define vp8_dbool_debool vp8_decode_bool_v6
-
-#undef vp8_dbool_devalue
-#define vp8_dbool_devalue vp8_decode_value_v6
-#endif /* HAVE_ARMV6 */
-
-#if HAVE_ARMV7
-#undef vp8_dbool_start
-#define vp8_dbool_start vp8dx_start_decode_neon
-
-#undef vp8_dbool_fill
-#define vp8_dbool_fill vp8_bool_decoder_fill_neon
-
-#undef vp8_dbool_debool
-#define vp8_dbool_debool vp8_decode_bool_neon
-
-#undef vp8_dbool_devalue
-#define vp8_dbool_devalue vp8_decode_value_neon
-#endif /* HAVE_ARMV7 */
-#endif
-#endif /* DBOOLHUFF_ARM_H */

diff --git a/vp8/decoder/arm/dequantize_arm.c b/vp8/decoder/arm/dequantize_arm.c
index b3e14b7..d88adb7 100644
--- a/vp8/decoder/arm/dequantize_arm.c
+++ b/vp8/decoder/arm/dequantize_arm.c

@@ -10,9 +10,8 @@
 
 
 #include "vpx_ports/config.h"
-#include "dequantize.h"
-#include "predictdc.h"
-#include "idct.h"
+#include "vp8/decoder/dequantize.h"
+#include "vp8/common/idct.h"
 #include "vpx_mem/vpx_mem.h"
 
 #if HAVE_ARMV7

diff --git a/vp8/decoder/arm/detokenize.asm b/vp8/decoder/arm/detokenize.asm
deleted file mode 100644
index 45e068a..0000000
--- a/vp8/decoder/arm/detokenize.asm
+++ /dev/null

@@ -1,320 +0,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    EXPORT  |vp8_decode_mb_tokens_v6|
-
-    AREA    |.text|, CODE, READONLY  ; name this block of code
-
-    INCLUDE vpx_asm_offsets.asm
-
-l_qcoeff    EQU     0
-l_i         EQU     4
-l_type      EQU     8
-l_stop      EQU     12
-l_c         EQU     16
-l_l_ptr     EQU     20
-l_a_ptr     EQU     24
-l_bc        EQU     28
-l_coef_ptr  EQU     32
-l_stacksize EQU     64
-
-
-;; constant offsets -- these should be created at build time
-c_block2above_offset         EQU 25
-c_entropy_nodes              EQU 11
-c_dct_eob_token              EQU 11
-
-|vp8_decode_mb_tokens_v6| PROC
-    stmdb       sp!, {r4 - r11, lr}
-    sub         sp, sp, #l_stacksize
-    mov         r7, r1                      ; type
-    mov         r9, r0                      ; detoken
-
-    ldr         r1, [r9, #detok_current_bc]
-    ldr         r0, [r9, #detok_qcoeff_start_ptr]
-    mov         r11, #0                     ; i
-    mov         r3, #16                     ; stop
-
-    cmp         r7, #1                      ; type ?= 1
-    addeq       r11, r11, #24               ; i = 24
-    addeq       r3, r3, #8                  ; stop = 24
-    addeq       r0, r0, #3, 24              ; qcoefptr += 24*16
-
-    str         r0, [sp, #l_qcoeff]
-    str         r11, [sp, #l_i]
-    str         r7, [sp, #l_type]
-    str         r3, [sp, #l_stop]
-    str         r1, [sp, #l_bc]
-
-    add         lr, r9, r7, lsl #2          ; detoken + type*4
-
-    ldr         r8, [r1, #bool_decoder_user_buffer]
-
-    ldr         r10, [lr, #detok_coef_probs]
-    ldr         r5, [r1, #bool_decoder_count]
-    ldr         r6, [r1, #bool_decoder_range]
-    ldr         r4, [r1, #bool_decoder_value]
-
-    str         r10, [sp, #l_coef_ptr]
-
-BLOCK_LOOP
-    ldr         r3, [r9, #detok_ptr_block2leftabove]
-    ldr         r1, [r9, #detok_L]
-    ldr         r2, [r9, #detok_A]
-    ldrb        r12, [r3, r11]!             ; block2left[i]
-    ldrb        r3, [r3, #c_block2above_offset]; block2above[i]
-
-    cmp         r7, #0                      ; c = !type
-    moveq       r7, #1
-    movne       r7, #0
-
-    ldrb        r0, [r1, r12]!              ; *(L += block2left[i])
-    ldrb        r3, [r2, r3]!               ; *(A += block2above[i])
-    mov         lr, #c_entropy_nodes        ; ENTROPY_NODES = 11
-
-; VP8_COMBINEENTROPYCONTETEXTS(t, *a, *l) => t = ((*a) != 0) + ((*l) !=0)
-    cmp         r0, #0                      ; *l ?= 0
-    movne       r0, #1
-    cmp         r3, #0                      ; *a ?= 0
-    addne       r0, r0, #1                  ; t
-
-    str         r1, [sp, #l_l_ptr]          ; save &l
-    str         r2, [sp, #l_a_ptr]          ; save &a
-    smlabb      r0, r0, lr, r10             ; Prob = coef_probs + (t * ENTROPY_NODES)
-    mov         r1, #0                      ; t = 0
-    str         r7, [sp, #l_c]
-
-    ;align 4
-COEFF_LOOP
-    ldr         r3, [r9, #detok_ptr_coef_bands_x]
-    ldr         lr, [r9, #detok_coef_tree_ptr]
-    ;STALL
-    ldrb        r3, [r3, r7]                ; coef_bands_x[c]
-    ;STALL
-    ;STALL
-    add         r0, r0, r3                  ; Prob += coef_bands_x[c]
-
-get_token_loop
-    ldrb        r2, [r0, +r1, asr #1]       ; Prob[t >> 1]
-    mov         r3, r6, lsl #8              ; range << 8
-    sub         r3, r3, #256                ; (range << 8) - (1 << 8)
-    mov         r10, #1                     ; 1
-
-    smlawb      r2, r3, r2, r10             ; split = 1 + (((range-1) * probability) >> 8)
-
-    ldrb        r12, [r8]                   ; load cx data byte in stall slot : r8 = bufptr
-    ;++
-
-    subs        r3, r4, r2, lsl #24         ; value-(split<<24): used later to calculate shift for NORMALIZE
-    addhs       r1, r1, #1                  ; t += 1
-    movhs       r4, r3                      ; value -= bigsplit (split << 24)
-    subhs       r2, r6, r2                  ; range -= split
- ;   movlo       r6, r2                      ; range = split
-
-    ldrsb     r1, [lr, r1]                  ; t = onyx_coef_tree_ptr[t]
-
-; NORMALIZE
-    clz         r3, r2                      ; vp8dx_bitreader_norm[range] + 24
-    sub         r3, r3, #24                 ; vp8dx_bitreader_norm[range]
-    subs        r5, r5, r3                  ; count -= shift
-    mov         r6, r2, lsl r3              ; range <<= shift
-    mov         r4, r4, lsl r3              ; value <<= shift
-
-; if count <= 0, += BR_COUNT; value |= *bufptr++ << (BR_COUNT-count); BR_COUNT = 8, but need to upshift values by +16
-    addle         r5, r5, #8                ; count += 8
-    rsble         r3, r5, #24               ; 24 - count
-    addle         r8, r8, #1                ; bufptr++
-    orrle         r4, r4, r12, lsl r3       ; value |= *bufptr << shift + 16
-
-    cmp         r1, #0                      ; t ?= 0
-    bgt         get_token_loop              ; while (t > 0)
-
-    cmn         r1, #c_dct_eob_token        ; if(t == -DCT_EOB_TOKEN)
-    beq         END_OF_BLOCK                ; break
-
-    rsb         lr, r1, #0                  ; v = -t;
-
-    cmp         lr, #4                      ; if(v > FOUR_TOKEN)
-    ble         SKIP_EXTRABITS
-
-    ldr         r3, [r9, #detok_teb_base_ptr]
-    mov         r11, #1                     ; 1 in split = 1 + ... nope, v+= 1 << bits_count
-    add         r7, r3, lr, lsl #4          ; detok_teb_base_ptr + (v << 4)
-
-    ldrsh       lr, [r7, #tokenextrabits_min_val] ; v = teb_ptr->min_val
-    ldrsh       r0, [r7, #tokenextrabits_length] ; bits_count = teb_ptr->Length
-
-extrabits_loop
-    add         r3, r0, r7                  ; &teb_ptr->Probs[bits_count]
-
-    ldrb        r2, [r3, #4]                ; probability. why +4?
-    mov         r3, r6, lsl #8              ; range << 8
-    sub         r3, r3, #256                ; range << 8 + 1 << 8
-
-    smlawb      r2, r3, r2, r11             ; split = 1 +  (((range-1) * probability) >> 8)
-
-    ldrb        r12, [r8]                   ; *bufptr
-    ;++
-
-    subs        r10, r4, r2, lsl #24        ; value - (split<<24)
-    movhs       r4, r10                     ; value = value - (split << 24)
-    subhs       r2, r6, r2                  ; range = range - split
-    addhs       lr, lr, r11, lsl r0         ; v += ((UINT16)1<<bits_count)
-
-; NORMALIZE
-    clz         r3, r2                      ; shift - leading zeros in split
-    sub         r3, r3, #24                 ; don't count first 3 bytes
-    subs        r5, r5, r3                  ; count -= shift
-    mov         r6, r2, lsl r3              ; range = range << shift
-    mov         r4, r4, lsl r3              ; value <<= shift
-
-    addle       r5, r5, #8                  ; count += BR_COUNT
-    addle       r8, r8, #1                  ; bufptr++
-    rsble       r3, r5, #24                 ; BR_COUNT - count
-    orrle       r4, r4, r12, lsl r3         ; value |= *bufptr << (BR_COUNT - count)
-
-    subs        r0, r0, #1                  ; bits_count --
-    bpl         extrabits_loop
-
-
-SKIP_EXTRABITS
-    ldr         r11, [sp, #l_qcoeff]
-    ldr         r0, [sp, #l_coef_ptr]       ; Prob = coef_probs
-
-    cmp         r1, #0                      ; check for nonzero token - if (t)
-    beq         SKIP_EOB_CHECK              ; if t is zero, we will skip the eob table chec
-
-    add         r3, r6, #1                  ; range + 1
-    mov         r2, r3, lsr #1              ; split = (range + 1) >> 1
-
-    subs        r3, r4, r2, lsl #24         ; value - (split<<24)
-    movhs       r4, r3                      ; value -= (split << 24)
-    subhs       r2, r6, r2                  ; range -= split
-    mvnhs       r3, lr                      ; -v
-    addhs       lr, r3, #1                  ; v = (v ^ -1) + 1
-
-; NORMALIZE
-    clz         r3, r2                      ; leading 0s in split
-    sub         r3, r3, #24                 ; shift
-    subs        r5, r5, r3                  ; count -= shift
-    mov         r6, r2, lsl r3              ; range <<= shift
-    mov         r4, r4, lsl r3              ; value <<= shift
-    ldrleb      r2, [r8], #1                ; *(bufptr++)
-    addle       r5, r5, #8                  ; count += 8
-    rsble       r3, r5, #24                 ; BR_COUNT - count
-    orrle       r4, r4, r2, lsl r3          ; value |= *bufptr << (BR_COUNT - count)
-
-    add         r0, r0, #11                 ; Prob += ENTROPY_NODES (11)
-
-    cmn         r1, #1                      ; t < -ONE_TOKEN
-
-    addlt       r0, r0, #11                 ; Prob += ENTROPY_NODES (11)
-
-    mvn         r1, #1                      ; t = -1 ???? C is -2
-
-SKIP_EOB_CHECK
-    ldr         r7, [sp, #l_c]              ; c
-    ldr         r3, [r9, #detok_scan]
-    add         r1, r1, #2                  ; t+= 2
-    cmp         r7, #15                     ; c should will be one higher
-
-    ldr         r3, [r3, +r7, lsl #2]       ; scan[c] this needs pre-inc c value
-    add         r7, r7, #1                  ; c++
-    add         r3, r11, r3, lsl #1         ; qcoeff + scan[c]
-
-    str         r7, [sp, #l_c]              ; store c
-    strh        lr, [r3]                    ; qcoef_ptr[scan[c]] = v
-
-    blt         COEFF_LOOP
-
-    sub         r7, r7, #1                  ; if(t != -DCT_EOB_TOKEN) --c
-
-END_OF_BLOCK
-    ldr         r3, [sp, #l_type]           ; type
-    ldr         r10, [sp, #l_coef_ptr]      ; coef_ptr
-    ldr         r0, [sp, #l_qcoeff]         ; qcoeff
-    ldr         r11, [sp, #l_i]             ; i
-    ldr         r12, [sp, #l_stop]          ; stop
-
-    cmp         r3, #0                      ; type ?= 0
-    moveq       r1, #1
-    movne       r1, #0
-    add         r3, r11, r9                 ; detok + i
-
-    cmp         r7, r1                      ; c ?= !type
-    strb        r7, [r3, #detok_eob]        ; eob[i] = c
-
-    ldr         r7, [sp, #l_l_ptr]          ; l
-    ldr         r2, [sp, #l_a_ptr]          ; a
-    movne       r3, #1                      ; t
-    moveq       r3, #0
-
-    add         r0, r0, #32                 ; qcoeff += 32 (16 * 2?)
-    add         r11, r11, #1                ; i++
-    strb        r3, [r7]                    ; *l = t
-    strb        r3, [r2]                    ; *a = t
-    str         r0, [sp, #l_qcoeff]         ; qcoeff
-    str         r11, [sp, #l_i]             ; i
-
-    cmp         r11, r12                    ; i < stop
-    ldr         r7, [sp, #l_type]           ; type
-
-    blt         BLOCK_LOOP
-
-    cmp         r11, #25                    ; i ?= 25
-    bne         ln2_decode_mb_to
-
-    ldr         r12, [r9, #detok_qcoeff_start_ptr]
-    ldr         r10, [r9, #detok_coef_probs]
-    mov         r7, #0                      ; type/i = 0
-    mov         r3, #16                     ; stop = 16
-    str         r12, [sp, #l_qcoeff]        ; qcoeff_ptr = qcoeff_start_ptr
-    str         r7, [sp, #l_i]
-    str         r7, [sp, #l_type]
-    str         r3, [sp, #l_stop]
-
-    str         r10, [sp, #l_coef_ptr]      ; coef_probs = coef_probs[type=0]
-
-    b           BLOCK_LOOP
-
-ln2_decode_mb_to
-    cmp         r11, #16                    ; i ?= 16
-    bne         ln1_decode_mb_to
-
-    mov         r10, #detok_coef_probs
-    add         r10, r10, #2*4              ; coef_probs[type]
-    ldr         r10, [r9, r10]              ; detok + detok_coef_probs[type]
-
-    mov         r7, #2                      ; type = 2
-    mov         r3, #24                     ; stop = 24
-
-    str         r7, [sp, #l_type]
-    str         r3, [sp, #l_stop]
-
-    str         r10, [sp, #l_coef_ptr]      ; coef_probs = coef_probs[type]
-    b           BLOCK_LOOP
-
-ln1_decode_mb_to
-    ldr         r2, [sp, #l_bc]
-    mov         r0, #0
-    nop
-
-    str         r8, [r2, #bool_decoder_user_buffer]
-    str         r5, [r2, #bool_decoder_count]
-    str         r4, [r2, #bool_decoder_value]
-    str         r6, [r2, #bool_decoder_range]
-
-    add         sp, sp, #l_stacksize
-    ldmia       sp!, {r4 - r11, pc}
-
-    ENDP  ; |vp8_decode_mb_tokens_v6|
-
-    END

diff --git a/vp8/decoder/arm/detokenize_arm.h b/vp8/decoder/arm/detokenize_arm.h
deleted file mode 100644
index 9bb19b6..0000000
--- a/vp8/decoder/arm/detokenize_arm.h
+++ /dev/null

@@ -1,22 +0,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef DETOKENIZE_ARM_H
-#define DETOKENIZE_ARM_H
-
-#if HAVE_ARMV6
-#if CONFIG_ARM_ASM_DETOK
-void vp8_init_detokenizer(VP8D_COMP *dx);
-void vp8_decode_mb_tokens_v6(DETOK *detoken, int type);
-#endif
-#endif
-
-#endif

diff --git a/vp8/decoder/arm/neon/dboolhuff_neon.asm b/vp8/decoder/arm/neon/dboolhuff_neon.asm
deleted file mode 100644
index ff3ffda..0000000
--- a/vp8/decoder/arm/neon/dboolhuff_neon.asm
+++ /dev/null

@@ -1,160 +0,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    EXPORT  |vp8_decode_value_neon|
-    EXPORT  |vp8dx_start_decode_neon|
-    EXPORT  |vp8dx_stop_decode_neon|
-    EXPORT  |vp8dx_decode_bool_neon|
-
-    ARM
-    REQUIRE8
-    PRESERVE8
-
-    INCLUDE vpx_asm_offsets.asm
-
-    AREA    |.text|, CODE, READONLY  ; name this block of code
-
-;   int z = 0;
-;   int bit;
-;   for ( bit=bits-1; bit>=0; bit-- )
-;   {
-;       z |= (vp8dx_decode_bool(br, 0x80)<<bit);
-;   }
-;   return z;
-
-;int vp8_decode_value_neon ( BOOL_DECODER *br, int bits )
-|vp8_decode_value_neon| PROC
-    stmdb   sp!, {r4 - r6, lr}
-    mov     r4, r0
-    mov     r5, r1
-    mov     r6, #0
-
-    subs    r5, r5, #1
-    bmi     decode_value_exit
-
-decode_value_loop
-    mov     r1, #0x80
-    mov     r0, r4
-    bl      vp8dx_decode_bool_neon_internal       ; needed for conversion to s file
-    orr     r6, r6, r0, lsl r5
-    subs    r5, r5, #1
-    bpl     decode_value_loop
-
-decode_value_exit
-    mov     r0, r6
-    ldmia   sp!, {r4 - r6, pc}
-    ENDP    ; |vp8_decode_value_neon|
-
-
-;void vp8dx_start_decode_neon ( BOOL_DECODER *br, unsigned char *source )
-|vp8dx_start_decode_neon| PROC
-    stmdb   sp!, {r4 - r5, lr}
-    mov     r2, #0
-    mov     r3, #255
-
-    str     r2, [r0, #bool_decoder_lowvalue]
-    str     r3, [r0, #bool_decoder_range]
-    str     r1, [r0, #bool_decoder_buffer]
-
-    mov     r3, #8
-    mov     r2, #4
-    str     r3, [r0, #bool_decoder_count]
-    str     r2, [r0, #bool_decoder_pos]
-
-    ldrb    r2, [r1, #3]
-    ldrb    r3, [r1, #2]
-    ldrb    r4, [r1, #1]
-    ldrb    r5, [r1]
-
-    orr     r1, r2, r3, lsl #8
-    orr     r1, r1, r4, lsl #16
-    orr     r1, r1, r5, lsl #24
-
-    str     r1, [r0, #bool_decoder_value]
-
-    ldmia   sp!, {r4 - r5, pc}
-    ENDP    ; |vp8dx_start_decode_neon|
-
-
-;void vp8dx_stop_decode_neon ( BOOL_DECODER *bc );
-|vp8dx_stop_decode_neon| PROC
-    mov     pc, lr
-    ENDP    ; |vp8dx_stop_decode_neon|
-
-
-; bigsplit  RN  r1
-; buffer_v  RN  r1
-; count_v       RN  r4
-; range_v       RN  r2
-; value_v       RN  r3
-; pos_v     RN  r5
-; split     RN  r6
-; bit           RN  lr
-;int vp8dx_decode_bool_neon ( BOOL_DECODER *br, int probability )
-|vp8dx_decode_bool_neon| PROC
-vp8dx_decode_bool_neon_internal
-;LDRD and STRD doubleword data transfers must be eight-byte aligned. Use ALIGN 8
-;before memory allocation
-    stmdb   sp!, {r4 - r5, lr}
-
-    ldr     r2, [r0, #bool_decoder_range]       ;load range (r2), value(r3)
-    ldr     r3, [r0, #bool_decoder_value]
-    ;ldrd   r2, r3, [r0, #bool_decoder_range]   ;ldrd costs 2 cycles
-    ;
-
-    mov     r4, r2, lsl #8
-    sub     r4, r4, #256
-    mov     r12, #1
-
-    smlawb  r4, r4, r1, r12         ;split = 1 +  (((range-1) * probability) >> 8)
-
-    mov     lr, r0
-    mov     r0, #0                  ;bit = 0
-    ;
-    subs    r5, r3, r4, lsl #24
-
-    subhs   r2, r2, r4              ;range = br->range-split
-    movlo   r2, r4                  ;range = split
-    movhs   r0, #1                  ;bit = 1
-    movhs   r3, r5                  ;value = value-bigsplit
-
-    cmp     r2, #0x80
-    blt     range_less_0x80
-    strd    r2, r3, [lr, #bool_decoder_range]   ;store result
-
-    ldmia   sp!, {r4 - r5, pc}
-
-range_less_0x80
-
-    ldrd    r4, r5, [lr, #bool_decoder_count]   ;load count, pos, buffer
-    ldr     r1, [lr, #bool_decoder_buffer]
-
-    clz     r12, r2
-    add     r1, r1, r5
-
-    sub     r12, r12, #24
-    subs    r4, r4, r12             ;count -= shift
-    mov     r2, r2, lsl r12         ;range <<= shift
-    mov     r3, r3, lsl r12         ;value <<= shift
-    addle   r4, r4, #8              ;count += 8
-    ldrleb  r12, [r1], #1           ;br->buffer[br->pos]
-
-    rsble   r1, r4, #8              ;-count
-    addle   r5, r5, #1              ;br->pos++
-    orrle   r3, r3, r12, lsl r1     ;value |= (br->buffer[br->pos]) << (-count)
-
-    strd    r2, r3, [lr, #bool_decoder_range]   ;store result
-    strd    r4, r5, [lr, #bool_decoder_count]
-
-    ldmia   sp!, {r4 - r5, pc}
-    ENDP    ; |vp8dx_decode_bool_neon|
-
-    END

diff --git a/vp8/decoder/arm/neon/idct_blk_neon.c b/vp8/decoder/arm/neon/idct_blk_neon.c
index fe4f2e0..ee35004 100644
--- a/vp8/decoder/arm/neon/idct_blk_neon.c
+++ b/vp8/decoder/arm/neon/idct_blk_neon.c

@@ -9,8 +9,8 @@
  */
 
 #include "vpx_ports/config.h"
-#include "idct.h"
-#include "dequantize.h"
+#include "vp8/common/idct.h"
+#include "vp8/decoder/dequantize.h"
 
 /* place these declarations here because we don't want to maintain them
  * outside of this scope

diff --git a/vp8/decoder/asm_dec_offsets.c b/vp8/decoder/asm_dec_offsets.c
new file mode 100644
index 0000000..e485cb4
--- /dev/null
+++ b/vp8/decoder/asm_dec_offsets.c

@@ -0,0 +1,57 @@
+/*
+ *  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vpx_ports/config.h"
+#include <stddef.h>
+
+#include "onyxd_int.h"
+
+#define DEFINE(sym, val) int sym = val;
+
+/*
+#define BLANK() asm volatile("\n->" : : )
+*/
+
+/*
+ * int main(void)
+ * {
+ */
+
+DEFINE(detok_scan,                              offsetof(DETOK, scan));
+DEFINE(detok_ptr_block2leftabove,               offsetof(DETOK, ptr_block2leftabove));
+DEFINE(detok_coef_tree_ptr,                     offsetof(DETOK, vp8_coef_tree_ptr));
+DEFINE(detok_teb_base_ptr,                      offsetof(DETOK, teb_base_ptr));
+DEFINE(detok_norm_ptr,                          offsetof(DETOK, norm_ptr));
+DEFINE(detok_ptr_coef_bands_x,                  offsetof(DETOK, ptr_coef_bands_x));
+
+DEFINE(detok_A,                                 offsetof(DETOK, A));
+DEFINE(detok_L,                                 offsetof(DETOK, L));
+
+DEFINE(detok_qcoeff_start_ptr,                  offsetof(DETOK, qcoeff_start_ptr));
+DEFINE(detok_current_bc,                        offsetof(DETOK, current_bc));
+DEFINE(detok_coef_probs,                        offsetof(DETOK, coef_probs));
+DEFINE(detok_eob,                               offsetof(DETOK, eob));
+
+DEFINE(bool_decoder_user_buffer_end,            offsetof(BOOL_DECODER, user_buffer_end));
+DEFINE(bool_decoder_user_buffer,                offsetof(BOOL_DECODER, user_buffer));
+DEFINE(bool_decoder_value,                      offsetof(BOOL_DECODER, value));
+DEFINE(bool_decoder_count,                      offsetof(BOOL_DECODER, count));
+DEFINE(bool_decoder_range,                      offsetof(BOOL_DECODER, range));
+
+DEFINE(tokenextrabits_min_val,                  offsetof(TOKENEXTRABITS, min_val));
+DEFINE(tokenextrabits_length,                   offsetof(TOKENEXTRABITS, Length));
+
+//add asserts for any offset that is not supported by assembly code
+//add asserts for any size that is not supported by assembly code
+/*
+ * return 0;
+ * }
+ */

diff --git a/vp8/decoder/dboolhuff.c b/vp8/decoder/dboolhuff.c
index 57cba16..8527d51 100644
--- a/vp8/decoder/dboolhuff.c
+++ b/vp8/decoder/dboolhuff.c

@@ -26,8 +26,9 @@
 };
 
 
-int vp8dx_start_decode_c(BOOL_DECODER *br, const unsigned char *source,
-                        unsigned int source_sz)
+int vp8dx_start_decode(BOOL_DECODER *br,
+                       const unsigned char *source,
+                       unsigned int source_sz)
 {
     br->user_buffer_end = source+source_sz;
     br->user_buffer     = source;
@@ -39,13 +40,13 @@
         return 1;
 
     /* Populate the buffer */
-    vp8dx_bool_decoder_fill_c(br);
+    vp8dx_bool_decoder_fill(br);
 
     return 0;
 }
 
 
-void vp8dx_bool_decoder_fill_c(BOOL_DECODER *br)
+void vp8dx_bool_decoder_fill(BOOL_DECODER *br)
 {
     const unsigned char *bufptr;
     const unsigned char *bufend;
@@ -62,69 +63,3 @@
     br->value = value;
     br->count = count;
 }
-
-#if 0
-/*
- * Until optimized versions of these functions are available, we
- * keep the implementation in the header to allow inlining.
- *
- * The RTCD-style invocations are still in place so this can
- * be switched by just uncommenting these functions here and
- * the DBOOLHUFF_INVOKE calls in the header.
- */
-int vp8dx_decode_bool_c(BOOL_DECODER *br, int probability)
-{
-    unsigned int bit=0;
-    VP8_BD_VALUE value;
-    unsigned int split;
-    VP8_BD_VALUE bigsplit;
-    int count;
-    unsigned int range;
-
-    value = br->value;
-    count = br->count;
-    range = br->range;
-
-    split = 1 + (((range-1) * probability) >> 8);
-    bigsplit = (VP8_BD_VALUE)split << (VP8_BD_VALUE_SIZE - 8);
-
-    range = split;
-    if(value >= bigsplit)
-    {
-        range = br->range-split;
-        value = value-bigsplit;
-        bit = 1;
-    }
-
-    /*if(range>=0x80)
-    {
-        br->value = value;
-        br->range = range;
-        return bit;
-    }*/
-
-    {
-        register unsigned int shift = vp8dx_bitreader_norm[range];
-        range <<= shift;
-        value <<= shift;
-        count -= shift;
-    }
-    br->value = value;
-    br->count = count;
-    br->range = range;
-    if (count < 0)
-        vp8dx_bool_decoder_fill_c(br);
-    return bit;
-}
-
-int vp8dx_decode_value_c(BOOL_DECODER *br, int bits)
-{
-    int z = 0;
-    int bit;
-    for ( bit=bits-1; bit>=0; bit-- )
-    {
-        z |= (vp8dx_decode_bool(br, 0x80)<<bit);
-    }
-    return z;
-}
-#endif

diff --git a/vp8/decoder/dboolhuff.h b/vp8/decoder/dboolhuff.h
index d14f4dc..a83e3f0 100644
--- a/vp8/decoder/dboolhuff.h
+++ b/vp8/decoder/dboolhuff.h

@@ -25,10 +25,6 @@
   Even relatively modest values like 100 would work fine.*/
 # define VP8_LOTS_OF_BITS (0x40000000)
 
-
-
-struct vp8_dboolhuff_rtcd_vtable;
-
 typedef struct
 {
     const unsigned char *user_buffer_end;
@@ -36,82 +32,15 @@
     VP8_BD_VALUE         value;
     int                  count;
     unsigned int         range;
-#if CONFIG_RUNTIME_CPU_DETECT
-    struct vp8_dboolhuff_rtcd_vtable *rtcd;
-#endif
 } BOOL_DECODER;
 
-#define prototype_dbool_start(sym) int sym(BOOL_DECODER *br, \
-    const unsigned char *source, unsigned int source_sz)
-#define prototype_dbool_fill(sym) void sym(BOOL_DECODER *br)
-#define prototype_dbool_debool(sym) int sym(BOOL_DECODER *br, int probability)
-#define prototype_dbool_devalue(sym) int sym(BOOL_DECODER *br, int bits)
-
-#if ARCH_ARM
-#include "arm/dboolhuff_arm.h"
-#endif
-
-#ifndef vp8_dbool_start
-#define vp8_dbool_start vp8dx_start_decode_c
-#endif
-
-#ifndef vp8_dbool_fill
-#define vp8_dbool_fill vp8dx_bool_decoder_fill_c
-#endif
-
-#ifndef vp8_dbool_debool
-#define vp8_dbool_debool vp8dx_decode_bool_c
-#endif
-
-#ifndef vp8_dbool_devalue
-#define vp8_dbool_devalue vp8dx_decode_value_c
-#endif
-
-extern prototype_dbool_start(vp8_dbool_start);
-extern prototype_dbool_fill(vp8_dbool_fill);
-extern prototype_dbool_debool(vp8_dbool_debool);
-extern prototype_dbool_devalue(vp8_dbool_devalue);
-
-typedef prototype_dbool_start((*vp8_dbool_start_fn_t));
-typedef prototype_dbool_fill((*vp8_dbool_fill_fn_t));
-typedef prototype_dbool_debool((*vp8_dbool_debool_fn_t));
-typedef prototype_dbool_devalue((*vp8_dbool_devalue_fn_t));
-
-typedef struct vp8_dboolhuff_rtcd_vtable {
-    vp8_dbool_start_fn_t   start;
-    vp8_dbool_fill_fn_t    fill;
-    vp8_dbool_debool_fn_t  debool;
-    vp8_dbool_devalue_fn_t devalue;
-} vp8_dboolhuff_rtcd_vtable_t;
-
-/* There are no processor-specific versions of these
- * functions right now. Disable RTCD to avoid using
- * function pointers which gives a speed boost
- */
-/*#ifdef ENABLE_RUNTIME_CPU_DETECT
-#define DBOOLHUFF_INVOKE(ctx,fn) (ctx)->fn
-#define IF_RTCD(x) (x)
-#else*/
-#define DBOOLHUFF_INVOKE(ctx,fn) vp8_dbool_##fn
-#define IF_RTCD(x) NULL
-/*#endif*/
-
 DECLARE_ALIGNED(16, extern const unsigned char, vp8dx_bitreader_norm[256]);
 
-/* wrapper functions to hide RTCD. static means inline means hopefully no
- * penalty
- */
-static int vp8dx_start_decode(BOOL_DECODER *br,
-        struct vp8_dboolhuff_rtcd_vtable *rtcd,
-        const unsigned char *source, unsigned int source_sz) {
-#if CONFIG_RUNTIME_CPU_DETECT
-    br->rtcd = rtcd;
-#endif
-    return DBOOLHUFF_INVOKE(rtcd, start)(br, source, source_sz);
-}
-static void vp8dx_bool_decoder_fill(BOOL_DECODER *br) {
-    DBOOLHUFF_INVOKE(br->rtcd, fill)(br);
-}
+int vp8dx_start_decode(BOOL_DECODER *br,
+                       const unsigned char *source,
+                       unsigned int source_sz);
+
+void vp8dx_bool_decoder_fill(BOOL_DECODER *br);
 
 /*The refill loop is used in several places, so define it in a macro to make
    sure they're all consistent.
@@ -138,12 +67,6 @@
 
 
 static int vp8dx_decode_bool(BOOL_DECODER *br, int probability) {
-  /*
-   * Until optimized versions of this function are available, we
-   * keep the implementation in the header to allow inlining.
-   *
-   *return DBOOLHUFF_INVOKE(br->rtcd, debool)(br, probability);
-   */
     unsigned int bit = 0;
     VP8_BD_VALUE value;
     unsigned int split;
@@ -167,13 +90,6 @@
         bit = 1;
     }
 
-    /*if(range>=0x80)
-    {
-        br->value = value;
-        br->range = range;
-        return bit
-    }*/
-
     {
         register unsigned int shift = vp8dx_bitreader_norm[range];
         range <<= shift;
@@ -190,12 +106,6 @@
 
 static int vp8_decode_value(BOOL_DECODER *br, int bits)
 {
-  /*
-   * Until optimized versions of this function are available, we
-   * keep the implementation in the header to allow inlining.
-   *
-   *return DBOOLHUFF_INVOKE(br->rtcd, devalue)(br, bits);
-   */
     int z = 0;
     int bit;
 

diff --git a/vp8/decoder/decodemv.c b/vp8/decoder/decodemv.c
index 203d72d..e5830e8 100644
--- a/vp8/decoder/decodemv.c
+++ b/vp8/decoder/decodemv.c

@@ -10,10 +10,10 @@
 
 
 #include "treereader.h"
-#include "entropymv.h"
-#include "entropymode.h"
+#include "vp8/common/entropymv.h"
+#include "vp8/common/entropymode.h"
 #include "onyxd_int.h"
-#include "findnearmv.h"
+#include "vp8/common/findnearmv.h"
 
 #if CONFIG_DEBUG
 #include <assert.h>
@@ -228,15 +228,8 @@
 };
 #endif
 
-unsigned char vp8_mbsplit_offset[4][16] = {
-    { 0,  8,  0,  0,  0,  0,  0,  0,  0,  0,   0,  0,  0,  0,  0,  0},
-    { 0,  2,  0,  0,  0,  0,  0,  0,  0,  0,   0,  0,  0,  0,  0,  0},
-    { 0,  2,  8, 10,  0,  0,  0,  0,  0,  0,   0,  0,  0,  0,  0,  0},
-    { 0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15}
-};
-
-unsigned char vp8_mbsplit_fill_count[4] = {8, 8, 4, 1};
-unsigned char vp8_mbsplit_fill_offset[4][16] = {
+static const unsigned char mbsplit_fill_count[4] = {8, 8, 4, 1};
+static const unsigned char mbsplit_fill_offset[4][16] = {
     { 0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15},
     { 0,  1,  4,  5,  8,  9, 12, 13,  2,  3,   6,  7, 10, 11, 14, 15},
     { 0,  1,  4,  5,  2,  3,  6,  7,  8,  9,  12, 13, 10, 11, 14, 15},
@@ -246,7 +239,7 @@
 
 
 
-void vp8_mb_mode_mv_init(VP8D_COMP *pbi)
+static void mb_mode_mv_init(VP8D_COMP *pbi)
 {
     vp8_reader *const bc = & pbi->bc;
     MV_CONTEXT *const mvc = pbi->common.fc.mvc;
@@ -287,7 +280,7 @@
     }
 }
 
-void vp8_read_mb_modes_mv(VP8D_COMP *pbi, MODE_INFO *mi, MB_MODE_INFO *mbmi,
+static void read_mb_modes_mv(VP8D_COMP *pbi, MODE_INFO *mi, MB_MODE_INFO *mbmi,
                             int mb_row, int mb_col)
 {
     const MV Zero = { 0, 0};
@@ -405,10 +398,10 @@
                     /* Fill (uniform) modes, mvs of jth subset.
                      Must do it here because ensuing subsets can
                      refer back to us via "left" or "above". */
-                    unsigned char *fill_offset;
-                    unsigned int fill_count = vp8_mbsplit_fill_count[s];
+                    const unsigned char *fill_offset;
+                    unsigned int fill_count = mbsplit_fill_count[s];
 
-                    fill_offset = &vp8_mbsplit_fill_offset[s][(unsigned char)j * vp8_mbsplit_fill_count[s]];
+                    fill_offset = &mbsplit_fill_offset[s][(unsigned char)j * mbsplit_fill_count[s]];
 
                     do {
                         mi->bmi[ *fill_offset] = bmi;
@@ -525,7 +518,7 @@
     MODE_INFO *mi = pbi->common.mi;
     int mb_row = -1;
 
-    vp8_mb_mode_mv_init(pbi);
+    mb_mode_mv_init(pbi);
 
     while (++mb_row < pbi->common.mb_rows)
     {
@@ -543,11 +536,11 @@
 
         while (++mb_col < pbi->common.mb_cols)
         {
-            /*vp8_read_mb_modes_mv(pbi, xd->mode_info_context, &xd->mode_info_context->mbmi, mb_row, mb_col);*/
+            /*read_mb_modes_mv(pbi, xd->mode_info_context, &xd->mode_info_context->mbmi, mb_row, mb_col);*/
             if(pbi->common.frame_type == KEY_FRAME)
                 vp8_kfread_modes(pbi, mi, mb_row, mb_col);
             else
-                vp8_read_mb_modes_mv(pbi, mi, &mi->mbmi, mb_row, mb_col);
+                read_mb_modes_mv(pbi, mi, &mi->mbmi, mb_row, mb_col);
 
             mi++;       /* next macroblock */
         }

diff --git a/vp8/decoder/decodframe.c b/vp8/decoder/decodframe.c
index d3972b3..82841e8 100644
--- a/vp8/decoder/decodframe.c
+++ b/vp8/decoder/decodframe.c

@@ -10,28 +10,27 @@
 
 
 #include "onyxd_int.h"
-#include "header.h"
-#include "reconintra.h"
-#include "reconintra4x4.h"
-#include "recon.h"
-#include "reconinter.h"
+#include "vp8/common/header.h"
+#include "vp8/common/reconintra.h"
+#include "vp8/common/reconintra4x4.h"
+#include "vp8/common/recon.h"
+#include "vp8/common/reconinter.h"
 #include "dequantize.h"
 #include "detokenize.h"
-#include "invtrans.h"
-#include "alloccommon.h"
-#include "entropymode.h"
-#include "quant_common.h"
+#include "vp8/common/invtrans.h"
+#include "vp8/common/alloccommon.h"
+#include "vp8/common/entropymode.h"
+#include "vp8/common/quant_common.h"
 #include "vpx_scale/vpxscale.h"
 #include "vpx_scale/yv12extend.h"
-#include "setupintrarecon.h"
+#include "vp8/common/setupintrarecon.h"
 
 #include "decodemv.h"
-#include "extend.h"
+#include "vp8/common/extend.h"
 #include "vpx_mem/vpx_mem.h"
-#include "idct.h"
+#include "vp8/common/idct.h"
 #include "dequantize.h"
-#include "predictdc.h"
-#include "threading.h"
+#include "vp8/common/threading.h"
 #include "decoderthreading.h"
 #include "dboolhuff.h"
 
@@ -116,8 +115,8 @@
     {
 
         vp8_build_intra_predictors_mbuv_s(xd);
-        vp8_build_intra_predictors_mby_s_ptr(xd);
-
+        RECON_INVOKE(&pbi->common.rtcd.recon,
+                     build_intra_predictors_mby_s)(xd);
     }
     else
     {
@@ -176,7 +175,7 @@
 
 }
 
-void vp8_decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd)
+static void decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd)
 {
     int eobtotal = 0;
     int i, do_clamp = xd->mode_info_context->mbmi.need_to_clamp_mvs;
@@ -215,7 +214,8 @@
 
         if (xd->mode_info_context->mbmi.mode != B_PRED)
         {
-            vp8_build_intra_predictors_mby_ptr(xd);
+            RECON_INVOKE(&pbi->common.rtcd.recon,
+                         build_intra_predictors_mby)(xd);
         } else {
             vp8_intra_prediction_down_copy(xd);
         }
@@ -320,10 +320,8 @@
 
 
 
-void vp8_decode_mb_row(VP8D_COMP *pbi,
-                       VP8_COMMON *pc,
-                       int mb_row,
-                       MACROBLOCKD *xd)
+static void
+decode_mb_row(VP8D_COMP *pbi, VP8_COMMON *pc, int mb_row, MACROBLOCKD *xd)
 {
 
     int i;
@@ -395,7 +393,7 @@
         else
         pbi->debugoutput =0;
         */
-        vp8_decode_macroblock(pbi, xd);
+        decode_macroblock(pbi, xd);
 
         /* check if the boolean decoder has suffered an error */
         xd->corrupted |= vp8dx_bool_error(xd->current_bc);
@@ -475,8 +473,7 @@
                                "Truncated packet or corrupt partition "
                                "%d length", i + 1);
 
-        if (vp8dx_start_decode(bool_decoder, IF_RTCD(&pbi->dboolhuff),
-                               partition, partition_size))
+        if (vp8dx_start_decode(bool_decoder, partition, partition_size))
             vpx_internal_error(&pc->error, VPX_CODEC_MEM_ERROR,
                                "Failed to allocate bool decoder %d", i + 1);
 
@@ -485,9 +482,11 @@
         bool_decoder++;
     }
 
+#if CONFIG_MULTITHREAD
     /* Clamp number of decoder threads */
     if (pbi->decoding_thread_count > num_part - 1)
         pbi->decoding_thread_count = num_part - 1;
+#endif
 }
 
 
@@ -651,8 +650,7 @@
 
     init_frame(pbi);
 
-    if (vp8dx_start_decode(bc, IF_RTCD(&pbi->dboolhuff),
-                           data, data_end - data))
+    if (vp8dx_start_decode(bc, data, data_end - data))
         vpx_internal_error(&pc->error, VPX_CODEC_MEM_ERROR,
                            "Failed to allocate bool decoder 0");
     if (pc->frame_type == KEY_FRAME) {
@@ -846,7 +844,9 @@
     vpx_memcpy(&xd->dst, &pc->yv12_fb[pc->new_fb_idx], sizeof(YV12_BUFFER_CONFIG));
 
     /* set up frame new frame for intra coded blocks */
+#if CONFIG_MULTITHREAD
     if (!(pbi->b_multithreaded_rd) || pc->multi_token_partition == ONE_PARTITION || !(pc->filter_level))
+#endif
         vp8_setup_intra_recon(&pc->yv12_fb[pc->new_fb_idx]);
 
     vp8_setup_block_dptrs(xd);
@@ -866,6 +866,7 @@
 
     vpx_memcpy(&xd->block[0].bmi, &xd->mode_info_context->bmi[0], sizeof(B_MODE_INFO));
 
+#if CONFIG_MULTITHREAD
     if (pbi->b_multithreaded_rd && pc->multi_token_partition != ONE_PARTITION)
     {
         vp8mt_decode_mb_rows(pbi, xd);
@@ -880,6 +881,7 @@
         vp8_yv12_extend_frame_borders_ptr(&pc->yv12_fb[pc->new_fb_idx]);    /*cm->frame_to_show);*/
     }
     else
+#endif
     {
         int ibc = 0;
         int num_part = 1 << pc->multi_token_partition;
@@ -897,7 +899,7 @@
                     ibc = 0;
             }
 
-            vp8_decode_mb_row(pbi, pc, mb_row, xd);
+            decode_mb_row(pbi, pc, mb_row, xd);
         }
     }
 

diff --git a/vp8/decoder/dequantize.c b/vp8/decoder/dequantize.c
index 84a9fd9..dd0c13b 100644
--- a/vp8/decoder/dequantize.c
+++ b/vp8/decoder/dequantize.c

@@ -11,8 +11,7 @@
 
 #include "vpx_ports/config.h"
 #include "dequantize.h"
-#include "predictdc.h"
-#include "idct.h"
+#include "vp8/common/idct.h"
 #include "vpx_mem/vpx_mem.h"
 
 extern void vp8_short_idct4x4llm_c(short *input, short *output, int pitch) ;

diff --git a/vp8/decoder/dequantize.h b/vp8/decoder/dequantize.h
index b78e39c..2e662a5 100644
--- a/vp8/decoder/dequantize.h
+++ b/vp8/decoder/dequantize.h

@@ -11,7 +11,7 @@
 
 #ifndef DEQUANTIZE_H
 #define DEQUANTIZE_H
-#include "blockd.h"
+#include "vp8/common/blockd.h"
 
 #define prototype_dequant_block(sym) \
     void sym(BLOCKD *x)

diff --git a/vp8/decoder/detokenize.c b/vp8/decoder/detokenize.c
index 7d013d2..b2e0819 100644
--- a/vp8/decoder/detokenize.c
+++ b/vp8/decoder/detokenize.c

@@ -9,8 +9,8 @@
  */
 
 
-#include "type_aliases.h"
-#include "blockd.h"
+#include "vp8/common/type_aliases.h"
+#include "vp8/common/blockd.h"
 #include "onyxd_int.h"
 #include "vpx_mem/vpx_mem.h"
 #include "vpx_ports/mem.h"
@@ -19,7 +19,13 @@
 #define BOOL_DATA UINT8
 
 #define OCB_X PREV_COEF_CONTEXTS * ENTROPY_NODES
-DECLARE_ALIGNED(16, UINT8, vp8_coef_bands_x[16]) = { 0, 1 * OCB_X, 2 * OCB_X, 3 * OCB_X, 6 * OCB_X, 4 * OCB_X, 5 * OCB_X, 6 * OCB_X, 6 * OCB_X, 6 * OCB_X, 6 * OCB_X, 6 * OCB_X, 6 * OCB_X, 6 * OCB_X, 6 * OCB_X, 7 * OCB_X};
+DECLARE_ALIGNED(16, static const unsigned char, coef_bands_x[16]) =
+{
+    0 * OCB_X, 1 * OCB_X, 2 * OCB_X, 3 * OCB_X,
+    6 * OCB_X, 4 * OCB_X, 5 * OCB_X, 6 * OCB_X,
+    6 * OCB_X, 6 * OCB_X, 6 * OCB_X, 6 * OCB_X,
+    6 * OCB_X, 6 * OCB_X, 6 * OCB_X, 7 * OCB_X
+};
 #define EOB_CONTEXT_NODE            0
 #define ZERO_CONTEXT_NODE           1
 #define ONE_CONTEXT_NODE            2
@@ -74,37 +80,6 @@
     }
 }
 
-#if CONFIG_ARM_ASM_DETOK
-/* mashup of vp8_block2left and vp8_block2above so we only need one pointer
- * for the assembly version.
- */
-DECLARE_ALIGNED(16, const UINT8, vp8_block2leftabove[25*2]) =
-{
-    /* vp8_block2left */
-    0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
-    /* vp8_block2above */
-    0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8
-};
-
-void vp8_init_detokenizer(VP8D_COMP *dx)
-{
-    const VP8_COMMON *const oc = & dx->common;
-    MACROBLOCKD *x = & dx->mb;
-
-    dx->detoken.vp8_coef_tree_ptr = vp8_coef_tree;
-    dx->detoken.ptr_block2leftabove = vp8_block2leftabove;
-    dx->detoken.ptr_coef_bands_x = vp8_coef_bands_x;
-    dx->detoken.scan = vp8_default_zig_zag1d;
-    dx->detoken.teb_base_ptr = vp8d_token_extra_bits2;
-    dx->detoken.qcoeff_start_ptr = &x->qcoeff[0];
-
-    dx->detoken.coef_probs[0] = (oc->fc.coef_probs [0] [ 0 ] [0]);
-    dx->detoken.coef_probs[1] = (oc->fc.coef_probs [1] [ 0 ] [0]);
-    dx->detoken.coef_probs[2] = (oc->fc.coef_probs [2] [ 0 ] [0]);
-    dx->detoken.coef_probs[3] = (oc->fc.coef_probs [3] [ 0 ] [0]);
-}
-#endif
-
 DECLARE_ALIGNED(16, extern const unsigned char, vp8dx_bitreader_norm[256]);
 #define FILL \
     if(count < 0) \
@@ -166,7 +141,7 @@
             Prob = coef_probs; \
             if(c<15) {\
             ++c; \
-            Prob += vp8_coef_bands_x[c]; \
+            Prob += coef_bands_x[c]; \
             goto branch; \
             } goto BLOCK_FINISHED; /*for malformed input */\
         } \
@@ -202,35 +177,6 @@
     }\
     NORMALIZE
 
-#if CONFIG_ARM_ASM_DETOK
-int vp8_decode_mb_tokens(VP8D_COMP *dx, MACROBLOCKD *x)
-{
-    int eobtotal = 0;
-    int i, type;
-
-    dx->detoken.current_bc = x->current_bc;
-    dx->detoken.A = x->above_context;
-    dx->detoken.L = x->left_context;
-
-    type = 3;
-
-    if (x->mode_info_context->mbmi.mode != B_PRED && x->mode_info_context->mbmi.mode != SPLITMV)
-    {
-        type = 1;
-        eobtotal -= 16;
-    }
-
-    vp8_decode_mb_tokens_v6(&dx->detoken, type);
-
-    for (i = 0; i < 25; i++)
-    {
-        x->eobs[i] = dx->detoken.eob[i];
-        eobtotal += dx->detoken.eob[i];
-    }
-
-    return eobtotal;
-}
-#else
 int vp8_decode_mb_tokens(VP8D_COMP *dx, MACROBLOCKD *x)
 {
     ENTROPY_CONTEXT *A = (ENTROPY_CONTEXT *)x->above_context;
@@ -304,7 +250,7 @@
     Prob += v * ENTROPY_NODES;
 
 DO_WHILE:
-    Prob += vp8_coef_bands_x[c];
+    Prob += coef_bands_x[c];
     DECODE_AND_BRANCH_IF_ZERO(Prob[EOB_CONTEXT_NODE], BLOCK_FINISHED);
 
 CHECK_0_:
@@ -423,4 +369,3 @@
     return eobtotal;
 
 }
-#endif /*!CONFIG_ASM_DETOK*/

diff --git a/vp8/decoder/detokenize.h b/vp8/decoder/detokenize.h
index 294a4a5..8640bda 100644
--- a/vp8/decoder/detokenize.h
+++ b/vp8/decoder/detokenize.h

@@ -14,10 +14,6 @@
 
 #include "onyxd_int.h"
 
-#if ARCH_ARM
-#include "arm/detokenize_arm.h"
-#endif
-
 void vp8_reset_mb_tokens_context(MACROBLOCKD *x);
 int vp8_decode_mb_tokens(VP8D_COMP *, MACROBLOCKD *);
 

diff --git a/vp8/decoder/generic/dsystemdependent.c b/vp8/decoder/generic/dsystemdependent.c
index 2e28472..2406dea 100644
--- a/vp8/decoder/generic/dsystemdependent.c
+++ b/vp8/decoder/generic/dsystemdependent.c

@@ -10,8 +10,8 @@
 
 
 #include "vpx_ports/config.h"
-#include "dequantize.h"
-#include "onyxd_int.h"
+#include "vp8/decoder/dequantize.h"
+#include "vp8/decoder/onyxd_int.h"
 
 extern void vp8_arch_x86_decode_init(VP8D_COMP *pbi);
 extern void vp8_arch_arm_decode_init(VP8D_COMP *pbi);
@@ -27,12 +27,6 @@
     pbi->dequant.dc_idct_add_y_block = vp8_dequant_dc_idct_add_y_block_c;
     pbi->dequant.idct_add_y_block    = vp8_dequant_idct_add_y_block_c;
     pbi->dequant.idct_add_uv_block   = vp8_dequant_idct_add_uv_block_c;
-    pbi->dboolhuff.start             = vp8dx_start_decode_c;
-    pbi->dboolhuff.fill              = vp8dx_bool_decoder_fill_c;
-#if 0 /*For use with RTCD, when implemented*/
-    pbi->dboolhuff.debool = vp8dx_decode_bool_c;
-    pbi->dboolhuff.devalue = vp8dx_decode_value_c;
-#endif
 #endif
 
 #if ARCH_X86 || ARCH_X86_64

diff --git a/vp8/decoder/idct_blk.c b/vp8/decoder/idct_blk.c
index c98bd5b..df01923 100644
--- a/vp8/decoder/idct_blk.c
+++ b/vp8/decoder/idct_blk.c

@@ -9,7 +9,7 @@
  */
 
 #include "vpx_ports/config.h"
-#include "idct.h"
+#include "vp8/common/idct.h"
 #include "dequantize.h"
 
 void vp8_dequant_dc_idct_add_c(short *input, short *dq, unsigned char *pred,

diff --git a/vp8/decoder/onyxd_if.c b/vp8/decoder/onyxd_if.c
index adbb1f3..ef2e00d 100644
--- a/vp8/decoder/onyxd_if.c
+++ b/vp8/decoder/onyxd_if.c

@@ -9,25 +9,25 @@
  */
 
 
-#include "onyxc_int.h"
+#include "vp8/common/onyxc_int.h"
 #if CONFIG_POSTPROC
-#include "postproc.h"
+#include "vp8/common/postproc.h"
 #endif
-#include "onyxd.h"
+#include "vp8/common/onyxd.h"
 #include "onyxd_int.h"
 #include "vpx_mem/vpx_mem.h"
-#include "alloccommon.h"
+#include "vp8/common/alloccommon.h"
 #include "vpx_scale/yv12extend.h"
-#include "loopfilter.h"
-#include "swapyv12buffer.h"
-#include "g_common.h"
-#include "threading.h"
+#include "vp8/common/loopfilter.h"
+#include "vp8/common/swapyv12buffer.h"
+#include "vp8/common/g_common.h"
+#include "vp8/common/threading.h"
 #include "decoderthreading.h"
 #include <stdio.h>
 
-#include "quant_common.h"
+#include "vp8/common/quant_common.h"
 #include "vpx_scale/vpxscale.h"
-#include "systemdependent.h"
+#include "vp8/common/systemdependent.h"
 #include "vpx_ports/vpx_timer.h"
 #include "detokenize.h"
 #if ARCH_ARM
@@ -37,43 +37,6 @@
 extern void vp8_init_loop_filter(VP8_COMMON *cm);
 extern void vp8cx_init_de_quantizer(VP8D_COMP *pbi);
 
-#if CONFIG_DEBUG
-void vp8_recon_write_yuv_frame(unsigned char *name, YV12_BUFFER_CONFIG *s)
-{
-    FILE *yuv_file = fopen((char *)name, "ab");
-    unsigned char *src = s->y_buffer;
-    int h = s->y_height;
-
-    do
-    {
-        fwrite(src, s->y_width, 1,  yuv_file);
-        src += s->y_stride;
-    }
-    while (--h);
-
-    src = s->u_buffer;
-    h = s->uv_height;
-
-    do
-    {
-        fwrite(src, s->uv_width, 1,  yuv_file);
-        src += s->uv_stride;
-    }
-    while (--h);
-
-    src = s->v_buffer;
-    h = s->uv_height;
-
-    do
-    {
-        fwrite(src, s->uv_width, 1, yuv_file);
-        src += s->uv_stride;
-    }
-    while (--h);
-
-    fclose(yuv_file);
-}
-#endif
 
 void vp8dx_initialize()
 {
@@ -114,8 +77,10 @@
     pbi->ready_for_new_data = 1;
 
     pbi->CPUFreq = 0; /*vp8_get_processor_freq();*/
+#if CONFIG_MULTITHREAD
     pbi->max_threads = oxcf->max_threads;
     vp8_decoder_create_threads(pbi);
+#endif
 
     /* vp8cx_init_de_quantizer() is first called here. Add check in frame_init_dequantizer() to avoid
      *  unnecessary calling of vp8cx_init_de_quantizer() for every frame.
@@ -131,9 +96,6 @@
         cm->last_sharpness_level = cm->sharpness_level;
     }
 
-#if CONFIG_ARM_ASM_DETOK
-    vp8_init_detokenizer(pbi);
-#endif
     pbi->common.error.setjmp = 0;
     return (VP8D_PTR) pbi;
 }
@@ -149,42 +111,13 @@
 #if CONFIG_MULTITHREAD
     if (pbi->b_multithreaded_rd)
         vp8mt_de_alloc_temp_buffers(pbi, pbi->common.mb_rows);
-#endif
     vp8_decoder_remove_threads(pbi);
+#endif
     vp8_remove_common(&pbi->common);
     vpx_free(pbi);
 }
 
 
-void vp8dx_set_setting(VP8D_PTR comp, VP8D_SETTING oxst, int x)
-{
-    VP8D_COMP *pbi = (VP8D_COMP *) comp;
-
-    (void) pbi;
-    (void) x;
-
-    switch (oxst)
-    {
-    case VP8D_OK:
-        break;
-    }
-}
-
-int vp8dx_get_setting(VP8D_PTR comp, VP8D_SETTING oxst)
-{
-    VP8D_COMP *pbi = (VP8D_COMP *) comp;
-
-    (void) pbi;
-
-    switch (oxst)
-    {
-    case VP8D_OK:
-        break;
-    }
-
-    return -1;
-}
-
 int vp8dx_get_reference(VP8D_PTR ptr, VP8_REFFRAME ref_frame_flag, YV12_BUFFER_CONFIG *sd)
 {
     VP8D_COMP *pbi = (VP8D_COMP *) ptr;
@@ -204,6 +137,8 @@
 
     return 0;
 }
+
+
 int vp8dx_set_reference(VP8D_PTR ptr, VP8_REFFRAME ref_frame_flag, YV12_BUFFER_CONFIG *sd)
 {
     VP8D_COMP *pbi = (VP8D_COMP *) ptr;
@@ -407,6 +342,7 @@
         return retcode;
     }
 
+#if CONFIG_MULTITHREAD
     if (pbi->b_multithreaded_rd && cm->multi_token_partition != ONE_PARTITION)
     {
         if (swap_frame_buffers (cm))
@@ -424,6 +360,7 @@
             return -1;
         }
     } else
+#endif
     {
         if (swap_frame_buffers (cm))
         {
@@ -458,12 +395,6 @@
         vp8_yv12_extend_frame_borders_ptr(cm->frame_to_show);
     }
 
-#if 0
-    /* DEBUG code */
-    /*vp8_recon_write_yuv_frame("recon.yuv", cm->frame_to_show);*/
-    if (cm->current_video_frame <= 5)
-        write_dx_frame_to_file(cm->frame_to_show, cm->current_video_frame);
-#endif
 
     vp8_clear_system_state();
 

diff --git a/vp8/decoder/onyxd_int.h b/vp8/decoder/onyxd_int.h
index 7593edf..ac1e332 100644
--- a/vp8/decoder/onyxd_int.h
+++ b/vp8/decoder/onyxd_int.h

@@ -12,10 +12,10 @@
 #ifndef __INC_VP8D_INT_H
 #define __INC_VP8D_INT_H
 #include "vpx_ports/config.h"
-#include "onyxd.h"
+#include "vp8/common/onyxd.h"
 #include "treereader.h"
-#include "onyxc_int.h"
-#include "threading.h"
+#include "vp8/common/onyxc_int.h"
+#include "vp8/common/threading.h"
 #include "dequantize.h"
 
 typedef struct
@@ -87,14 +87,15 @@
     unsigned int time_decoding;
     unsigned int time_loop_filtering;
 
+#if CONFIG_MULTITHREAD
+    /* variable for threading */
+
     volatile int b_multithreaded_rd;
     int max_threads;
     int current_mb_col_main;
     int decoding_thread_count;
     int allocated_decoding_thread_count;
 
-    /* variable for threading */
-#if CONFIG_MULTITHREAD
     int mt_baseline_filter_level[MAX_MB_SEGMENTS];
     int sync_range;
     int *mt_current_mb_col;                  /* Each row remembers its already decoded column. */
@@ -125,7 +126,6 @@
 
 #if CONFIG_RUNTIME_CPU_DETECT
     vp8_dequant_rtcd_vtable_t        dequant;
-    struct vp8_dboolhuff_rtcd_vtable dboolhuff;
 #endif
 
 

diff --git a/vp8/decoder/reconintra_mt.c b/vp8/decoder/reconintra_mt.c
index ad4324b..b9d2b37 100644
--- a/vp8/decoder/reconintra_mt.c
+++ b/vp8/decoder/reconintra_mt.c

@@ -10,8 +10,8 @@
 
 
 #include "vpx_ports/config.h"
-#include "recon.h"
-#include "reconintra.h"
+#include "vp8/common/recon.h"
+#include "vp8/common/reconintra.h"
 #include "vpx_mem/vpx_mem.h"
 #include "onyxd_int.h"
 
@@ -21,7 +21,6 @@
 
 void vp8mt_build_intra_predictors_mby(VP8D_COMP *pbi, MACROBLOCKD *x, int mb_row, int mb_col)
 {
-#if CONFIG_MULTITHREAD
     unsigned char *yabove_row;    /* = x->dst.y_buffer - x->dst.y_stride; */
     unsigned char *yleft_col;
     unsigned char yleft_buf[16];
@@ -146,17 +145,10 @@
     case MB_MODE_COUNT:
         break;
     }
-#else
-    (void) pbi;
-    (void) x;
-    (void) mb_row;
-    (void) mb_col;
-#endif
 }
 
 void vp8mt_build_intra_predictors_mby_s(VP8D_COMP *pbi, MACROBLOCKD *x, int mb_row, int mb_col)
 {
-#if CONFIG_MULTITHREAD
     unsigned char *yabove_row;    /* = x->dst.y_buffer - x->dst.y_stride; */
     unsigned char *yleft_col;
     unsigned char yleft_buf[16];
@@ -289,17 +281,10 @@
     case MB_MODE_COUNT:
         break;
     }
-#else
-    (void) pbi;
-    (void) x;
-    (void) mb_row;
-    (void) mb_col;
-#endif
 }
 
 void vp8mt_build_intra_predictors_mbuv(VP8D_COMP *pbi, MACROBLOCKD *x, int mb_row, int mb_col)
 {
-#if CONFIG_MULTITHREAD
     unsigned char *uabove_row;   /* = x->dst.u_buffer - x->dst.uv_stride; */
     unsigned char *uleft_col;    /*[16];*/
     unsigned char uleft_buf[8];
@@ -452,17 +437,10 @@
     case MB_MODE_COUNT:
         break;
     }
-#else
-    (void) pbi;
-    (void) x;
-    (void) mb_row;
-    (void) mb_col;
-#endif
 }
 
 void vp8mt_build_intra_predictors_mbuv_s(VP8D_COMP *pbi, MACROBLOCKD *x, int mb_row, int mb_col)
 {
-#if CONFIG_MULTITHREAD
     unsigned char *uabove_row;  /* = x->dst.u_buffer - x->dst.uv_stride; */
     unsigned char *uleft_col;   /*[16];*/
     unsigned char uleft_buf[8];
@@ -621,12 +599,6 @@
     case MB_MODE_COUNT:
         break;
     }
-#else
-    (void) pbi;
-    (void) x;
-    (void) mb_row;
-    (void) mb_col;
-#endif
 }
 
 
@@ -638,7 +610,6 @@
                           int mb_col,
                           int num)
 {
-#if CONFIG_MULTITHREAD
     int i, r, c;
 
     unsigned char *Above;   /* = *(x->base_dst) + x->dst - x->dst_stride; */
@@ -935,15 +906,6 @@
 
 
     }
-#else
-    (void) pbi;
-    (void) xd;
-    (void) b_mode;
-    (void) predictor;
-    (void) mb_row;
-    (void) mb_col;
-    (void) num;
-#endif
 }
 
 /* copy 4 bytes from the above right down so that the 4x4 prediction modes using pixels above and
@@ -951,7 +913,6 @@
  */
 void vp8mt_intra_prediction_down_copy(VP8D_COMP *pbi, MACROBLOCKD *x, int mb_row, int mb_col)
 {
-#if CONFIG_MULTITHREAD
     unsigned char *above_right;   /* = *(x->block[0].base_dst) + x->block[0].dst - x->block[0].dst_stride + 16; */
     unsigned int *src_ptr;
     unsigned int *dst_ptr0;
@@ -973,10 +934,4 @@
     *dst_ptr0 = *src_ptr;
     *dst_ptr1 = *src_ptr;
     *dst_ptr2 = *src_ptr;
-#else
-    (void) pbi;
-    (void) x;
-    (void) mb_row;
-    (void) mb_col;
-#endif
 }

diff --git a/vp8/decoder/threading.c b/vp8/decoder/threading.c
index ec2cb2b..3d9d428 100644
--- a/vp8/decoder/threading.c
+++ b/vp8/decoder/threading.c

@@ -12,18 +12,15 @@
 #if !defined(WIN32) && CONFIG_OS_SUPPORT == 1
 # include <unistd.h>
 #endif
-#ifdef __APPLE__
-#include <mach/mach_init.h>
-#endif
 #include "onyxd_int.h"
 #include "vpx_mem/vpx_mem.h"
-#include "threading.h"
+#include "vp8/common/threading.h"
 
-#include "loopfilter.h"
-#include "extend.h"
+#include "vp8/common/loopfilter.h"
+#include "vp8/common/extend.h"
 #include "vpx_ports/vpx_timer.h"
 #include "detokenize.h"
-#include "reconinter.h"
+#include "vp8/common/reconinter.h"
 #include "reconintra_mt.h"
 
 extern void mb_init_dequantizer(VP8D_COMP *pbi, MACROBLOCKD *xd);
@@ -36,9 +33,8 @@
 #define RTCD_VTABLE(x) NULL
 #endif
 
-void vp8_setup_decoding_thread_data(VP8D_COMP *pbi, MACROBLOCKD *xd, MB_ROW_DEC *mbrd, int count)
+static void setup_decoding_thread_data(VP8D_COMP *pbi, MACROBLOCKD *xd, MB_ROW_DEC *mbrd, int count)
 {
-#if CONFIG_MULTITHREAD
     VP8_COMMON *const pc = & pbi->common;
     int i, j;
 
@@ -88,18 +84,11 @@
 
     for (i=0; i< pc->mb_rows; i++)
         pbi->mt_current_mb_col[i]=-1;
-#else
-    (void) pbi;
-    (void) xd;
-    (void) mbrd;
-    (void) count;
-#endif
 }
 
 
-void vp8mt_decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd, int mb_row, int mb_col)
+static void decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd, int mb_row, int mb_col)
 {
-#if CONFIG_MULTITHREAD
     int eobtotal = 0;
     int i, do_clamp = xd->mode_info_context->mbmi.need_to_clamp_mvs;
     VP8_COMMON *pc = &pbi->common;
@@ -222,18 +211,11 @@
                     (xd->qcoeff+16*16, xd->block[16].dequant,
                      xd->predictor+16*16, xd->dst.u_buffer, xd->dst.v_buffer,
                      xd->dst.uv_stride, xd->eobs+16);
-#else
-    (void) pbi;
-    (void) xd;
-    (void) mb_row;
-    (void) mb_col;
-#endif
 }
 
 
-THREAD_FUNCTION vp8_thread_decoding_proc(void *p_data)
+static THREAD_FUNCTION thread_decoding_proc(void *p_data)
 {
-#if CONFIG_MULTITHREAD
     int ithread = ((DECODETHREAD_DATA *)p_data)->ithread;
     VP8D_COMP *pbi = (VP8D_COMP *)(((DECODETHREAD_DATA *)p_data)->ptr1);
     MB_ROW_DEC *mbrd = (MB_ROW_DEC *)(((DECODETHREAD_DATA *)p_data)->ptr2);
@@ -311,18 +293,6 @@
                             }
                         }
 
-                        if(pbi->common.filter_level)
-                        {
-                            /*update loopfilter info*/
-                            Segment = (alt_flt_enabled) ? xd->mode_info_context->mbmi.segment_id : 0;
-                            filter_level = pbi->mt_baseline_filter_level[Segment];
-                            /* Distance of Mb to the various image edges.
-                             * These are specified to 8th pel as they are always compared to values that are in 1/8th pel units
-                             * Apply any context driven MB level adjustment
-                             */
-                            filter_level = vp8_adjust_mb_lf_value(xd, filter_level);
-                        }
-
                         /* Distance of Mb to the various image edges.
                          * These are specified to 8th pel as they are always compared to values that are in 1/8th pel units
                          */
@@ -348,7 +318,7 @@
                         xd->pre.v_buffer = pc->yv12_fb[ref_fb_idx].v_buffer + recon_uvoffset;
 
                         vp8_build_uvmvs(xd, pc->full_pixel);
-                        vp8mt_decode_macroblock(pbi, xd, mb_row, mb_col);
+                        decode_macroblock(pbi, xd, mb_row, mb_col);
 
                         if (pbi->common.filter_level)
                         {
@@ -377,7 +347,16 @@
                                 }
                             }
 
-                          /* loopfilter on this macroblock. */
+                            /* update loopfilter info */
+                            Segment = (alt_flt_enabled) ? xd->mode_info_context->mbmi.segment_id : 0;
+                            filter_level = pbi->mt_baseline_filter_level[Segment];
+                            /* Distance of Mb to the various image edges.
+                             * These are specified to 8th pel as they are always compared to values that are in 1/8th pel units
+                             * Apply any context driven MB level adjustment
+                             */
+                            filter_level = vp8_adjust_mb_lf_value(xd, filter_level);
+
+                            /* loopfilter on this macroblock. */
                             if (filter_level)
                             {
                                 if (mb_col > 0)
@@ -438,9 +417,6 @@
             sem_post(&pbi->h_event_end_decoding);
         }
     }
-#else
-    (void) p_data;
-#endif
 
     return 0 ;
 }
@@ -448,7 +424,6 @@
 
 void vp8_decoder_create_threads(VP8D_COMP *pbi)
 {
-#if CONFIG_MULTITHREAD
     int core_count = 0;
     int ithread;
 
@@ -475,44 +450,33 @@
             pbi->de_thread_data[ithread].ptr1     = (void *)pbi;
             pbi->de_thread_data[ithread].ptr2     = (void *) &pbi->mb_row_di[ithread];
 
-            pthread_create(&pbi->h_decoding_thread[ithread], 0, vp8_thread_decoding_proc, (&pbi->de_thread_data[ithread]));
+            pthread_create(&pbi->h_decoding_thread[ithread], 0, thread_decoding_proc, (&pbi->de_thread_data[ithread]));
         }
 
         sem_init(&pbi->h_event_end_decoding, 0, 0);
 
         pbi->allocated_decoding_thread_count = pbi->decoding_thread_count;
     }
-
-#else
-    (void) pbi;
-#endif
 }
 
 
 void vp8mt_de_alloc_temp_buffers(VP8D_COMP *pbi, int mb_rows)
 {
-#if CONFIG_MULTITHREAD
     VP8_COMMON *const pc = & pbi->common;
     int i;
 
     if (pbi->b_multithreaded_rd)
     {
-        if (pbi->mt_current_mb_col)
-        {
             vpx_free(pbi->mt_current_mb_col);
             pbi->mt_current_mb_col = NULL ;
-        }
 
         /* Free above_row buffers. */
         if (pbi->mt_yabove_row)
         {
             for (i=0; i< mb_rows; i++)
             {
-                if (pbi->mt_yabove_row[i])
-                {
                     vpx_free(pbi->mt_yabove_row[i]);
                     pbi->mt_yabove_row[i] = NULL ;
-                }
             }
             vpx_free(pbi->mt_yabove_row);
             pbi->mt_yabove_row = NULL ;
@@ -522,11 +486,8 @@
         {
             for (i=0; i< mb_rows; i++)
             {
-                if (pbi->mt_uabove_row[i])
-                {
                     vpx_free(pbi->mt_uabove_row[i]);
                     pbi->mt_uabove_row[i] = NULL ;
-                }
             }
             vpx_free(pbi->mt_uabove_row);
             pbi->mt_uabove_row = NULL ;
@@ -536,11 +497,8 @@
         {
             for (i=0; i< mb_rows; i++)
             {
-                if (pbi->mt_vabove_row[i])
-                {
                     vpx_free(pbi->mt_vabove_row[i]);
                     pbi->mt_vabove_row[i] = NULL ;
-                }
             }
             vpx_free(pbi->mt_vabove_row);
             pbi->mt_vabove_row = NULL ;
@@ -551,11 +509,8 @@
         {
             for (i=0; i< mb_rows; i++)
             {
-                if (pbi->mt_yleft_col[i])
-                {
                     vpx_free(pbi->mt_yleft_col[i]);
                     pbi->mt_yleft_col[i] = NULL ;
-                }
             }
             vpx_free(pbi->mt_yleft_col);
             pbi->mt_yleft_col = NULL ;
@@ -565,11 +520,8 @@
         {
             for (i=0; i< mb_rows; i++)
             {
-                if (pbi->mt_uleft_col[i])
-                {
                     vpx_free(pbi->mt_uleft_col[i]);
                     pbi->mt_uleft_col[i] = NULL ;
-                }
             }
             vpx_free(pbi->mt_uleft_col);
             pbi->mt_uleft_col = NULL ;
@@ -579,25 +531,18 @@
         {
             for (i=0; i< mb_rows; i++)
             {
-                if (pbi->mt_vleft_col[i])
-                {
                     vpx_free(pbi->mt_vleft_col[i]);
                     pbi->mt_vleft_col[i] = NULL ;
-                }
             }
             vpx_free(pbi->mt_vleft_col);
             pbi->mt_vleft_col = NULL ;
         }
     }
-#else
-    (void) pbi;
-#endif
 }
 
 
 void vp8mt_alloc_temp_buffers(VP8D_COMP *pbi, int width, int prev_mb_rows)
 {
-#if CONFIG_MULTITHREAD
     VP8_COMMON *const pc = & pbi->common;
     int i;
     int uv_width;
@@ -646,17 +591,11 @@
         for (i=0; i< pc->mb_rows; i++)
             CHECK_MEM_ERROR(pbi->mt_vleft_col[i], vpx_calloc(sizeof(unsigned char) * 8, 1));
     }
-#else
-    (void) pbi;
-    (void) width;
-#endif
 }
 
 
 void vp8_decoder_remove_threads(VP8D_COMP *pbi)
 {
-#if CONFIG_MULTITHREAD
-
     /* shutdown MB Decoding thread; */
     if (pbi->b_multithreaded_rd)
     {
@@ -678,39 +617,23 @@
 
         sem_destroy(&pbi->h_event_end_decoding);
 
-        if (pbi->h_decoding_thread)
-        {
             vpx_free(pbi->h_decoding_thread);
             pbi->h_decoding_thread = NULL;
-        }
 
-        if (pbi->h_event_start_decoding)
-        {
             vpx_free(pbi->h_event_start_decoding);
             pbi->h_event_start_decoding = NULL;
-        }
 
-        if (pbi->mb_row_di)
-        {
             vpx_free(pbi->mb_row_di);
             pbi->mb_row_di = NULL ;
-        }
 
-        if (pbi->de_thread_data)
-        {
             vpx_free(pbi->de_thread_data);
             pbi->de_thread_data = NULL;
-        }
     }
-#else
-    (void) pbi;
-#endif
 }
 
 
-void vp8mt_lpf_init( VP8D_COMP *pbi, int default_filt_lvl)
+static void lpf_init( VP8D_COMP *pbi, int default_filt_lvl)
 {
-#if CONFIG_MULTITHREAD
     VP8_COMMON *cm  = &pbi->common;
     MACROBLOCKD *mbd = &pbi->mb;
     /*YV12_BUFFER_CONFIG *post = &cm->new_frame;*/  /*frame_to_show;*/
@@ -752,16 +675,11 @@
         vp8_init_loop_filter(cm);
     else if (frame_type != cm->last_frame_type)
         vp8_frame_init_loop_filter(lfi, frame_type);
-#else
-    (void) pbi;
-    (void) default_filt_lvl;
-#endif
 }
 
 
 void vp8mt_decode_mb_rows( VP8D_COMP *pbi, MACROBLOCKD *xd)
 {
-#if CONFIG_MULTITHREAD
     int mb_row;
     VP8_COMMON *pc = &pbi->common;
 
@@ -797,10 +715,10 @@
             vpx_memset(pbi->mt_uleft_col[i], (unsigned char)129, 8);
             vpx_memset(pbi->mt_vleft_col[i], (unsigned char)129, 8);
         }
-        vp8mt_lpf_init(pbi, pc->filter_level);
+        lpf_init(pbi, pc->filter_level);
     }
 
-    vp8_setup_decoding_thread_data(pbi, xd, pbi->mb_row_di, pbi->decoding_thread_count);
+    setup_decoding_thread_data(pbi, xd, pbi->mb_row_di, pbi->decoding_thread_count);
 
     for (i = 0; i < pbi->decoding_thread_count; i++)
         sem_post(&pbi->h_event_start_decoding[i]);
@@ -854,18 +772,6 @@
                     }
                 }
 
-                if(pbi->common.filter_level)
-                {
-                    /* update loopfilter info */
-                    Segment = (alt_flt_enabled) ? xd->mode_info_context->mbmi.segment_id : 0;
-                    filter_level = pbi->mt_baseline_filter_level[Segment];
-                    /* Distance of Mb to the various image edges.
-                     * These are specified to 8th pel as they are always compared to values that are in 1/8th pel units
-                     * Apply any context driven MB level adjustment
-                     */
-                    filter_level = vp8_adjust_mb_lf_value(xd, filter_level);
-                }
-
                 /* Distance of Mb to the various image edges.
                  * These are specified to 8th pel as they are always compared to values that are in 1/8th pel units
                  */
@@ -897,7 +803,7 @@
                 }
 
                 vp8_build_uvmvs(xd, pc->full_pixel);
-                vp8mt_decode_macroblock(pbi, xd, mb_row, mb_col);
+                decode_macroblock(pbi, xd, mb_row, mb_col);
 
                 /* check if the boolean decoder has suffered an error */
                 xd->corrupted |= vp8dx_bool_error(xd->current_bc);
@@ -929,6 +835,15 @@
                         }
                     }
 
+                    /* update loopfilter info */
+                    Segment = (alt_flt_enabled) ? xd->mode_info_context->mbmi.segment_id : 0;
+                    filter_level = pbi->mt_baseline_filter_level[Segment];
+                    /* Distance of Mb to the various image edges.
+                     * These are specified to 8th pel as they are always compared to values that are in 1/8th pel units
+                     * Apply any context driven MB level adjustment
+                     */
+                    filter_level = vp8_adjust_mb_lf_value(xd, filter_level);
+
                     /* loopfilter on this macroblock. */
                     if (filter_level)
                     {
@@ -981,8 +896,4 @@
     }
 
     sem_wait(&pbi->h_event_end_decoding);   /* add back for each frame */
-#else
-    (void) pbi;
-    (void) xd;
-#endif
 }

diff --git a/vp8/decoder/treereader.h b/vp8/decoder/treereader.h
index 2778428..b50a4d2 100644
--- a/vp8/decoder/treereader.h
+++ b/vp8/decoder/treereader.h

@@ -12,7 +12,7 @@
 #ifndef tree_reader_h
 #define tree_reader_h 1
 
-#include "treecoder.h"
+#include "vp8/common/treecoder.h"
 
 #include "dboolhuff.h"
 

diff --git a/vp8/decoder/x86/idct_blk_mmx.c b/vp8/decoder/x86/idct_blk_mmx.c
index 78c91d3..8f1a363 100644
--- a/vp8/decoder/x86/idct_blk_mmx.c
+++ b/vp8/decoder/x86/idct_blk_mmx.c

@@ -9,8 +9,8 @@
  */
 
 #include "vpx_ports/config.h"
-#include "idct.h"
-#include "dequantize.h"
+#include "vp8/common/idct.h"
+#include "vp8/decoder/dequantize.h"
 
 void vp8_dequant_dc_idct_add_y_block_mmx
             (short *q, short *dq, unsigned char *pre,

diff --git a/vp8/decoder/x86/idct_blk_sse2.c b/vp8/decoder/x86/idct_blk_sse2.c
index 0273d6e..4c88db4 100644
--- a/vp8/decoder/x86/idct_blk_sse2.c
+++ b/vp8/decoder/x86/idct_blk_sse2.c

@@ -9,8 +9,8 @@
  */
 
 #include "vpx_ports/config.h"
-#include "idct.h"
-#include "dequantize.h"
+#include "vp8/common/idct.h"
+#include "vp8/decoder/dequantize.h"
 
 void idct_dequant_dc_0_2x_sse2
             (short *q, short *dq, unsigned char *pre,

diff --git a/vp8/decoder/x86/onyxdxv.c b/vp8/decoder/x86/onyxdxv.c
deleted file mode 100644
index 50293c7..0000000
--- a/vp8/decoder/x86/onyxdxv.c
+++ /dev/null

@@ -1,1080 +0,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-/****************************************************************************
-*
-*   Module Title :     onyxdxv.c
-*
-*   Description  :     VP80 interface to DXV.
-*
-*****************************************************************************
-*/
-/****************************************************************************
-*  Header Files
-****************************************************************************/
-#include <math.h>   // For Abs()
-#include "pragmas.h"
-
-#include "vpxdxv.h"
-#include "vpxdxv_plugin.h"
-
-#include "onyxd_int.h"
-#include "onyx.h"
-#include "codec_common_interface.h"
-#include "vpx_scale/vpxscale.h"
-#include "vpx_mem/vpx_mem.h"
-#include "postproc.h"
-#include "vpxblit.h"
-#include "g_common.h"
-#include "vpx_scale/yv12extend.h"
-
-#include <limits.h>
-#include <stdio.h>
-#include "scale_mode.h"
-#include "onyx_pb_interface.h"
-
-/****************************************************************************
-*  Macros
-****************************************************************************/
-
-#define VP8_FOURCC DXL_MKFOURCC( 'V', 'P', '8', '0')
-
-extern void vp8_blit_text(const char *msg, unsigned char *address, const int pitch);
-
-
-/****************************************************************************
-*  Typedefs
-****************************************************************************/
-
-typedef struct  // YUV buffer configuration structure
-{
-    int   y_width;
-    int   y_height;
-    int   y_stride;
-
-    int   uv_width;
-    int   uv_height;
-    int   uv_stride;
-
-    char *y_buffer;
-    char *u_buffer;
-    char *v_buffer;
-
-    char *uv_start;
-    int   uv_dst_area;
-    int   uv_used_area;
-
-    unsigned char *y_ptr_scrn;
-    unsigned char *u_ptr_scrn;
-    unsigned char *v_ptr_scrn;
-
-
-} DXV_YUV_BUFFER_CONFIG;
-
-
-typedef void ((*vp8blit_func)(unsigned char *, int, YUV_BUFFER_CONFIG *));
-
-/* define an x_image structure based on the core x_image struct */
-typedef struct t_ximage_codec
-{
-    DXV_YUV_BUFFER_CONFIG frame_buffer;
-    VP8D_COMP *my_pbi;
-    VP8_COMMON *common;
-    int owned;
-    int decompressed_once;
-
-    int sizeof_pixel;
-    vp8blit_func blitter;
-
-    unsigned int ppl_tag;
-    unsigned int bd_tag;
-    unsigned int *supported_output_format_list;
-
-    int cpu_free;
-    int postproc;
-    int add_noise;
-    int deinterlace;
-
-    int post_proc2time;
-    int post_proc4time;
-
-    int hs;
-    int hr;
-    int vs;
-    int vr;
-    YV12_BUFFER_CONFIG this_buffer;
-    YV12_BUFFER_CONFIG scaled_buffer;
-    YV12_BUFFER_CONFIG *passed_in_buffer;
-
-    int avgq;
-    int ppcount;
-
-
-} VP8_XIMAGE, *VP8_XIMAGE_HANDLE;
-
-
-/****************************************************************************
-*  Modul Statics
-****************************************************************************/
-static unsigned int g_vp8_preferred_output_format_list[] =
-{
-    VPXDXV_YUY2,
-    VPXDXV_UYVY,
-    VPXDXV_RGB8888,
-    VPXDXV_RGB888,
-    VPXDXV_RGB555,
-    VPXDXV_RGB565,
-    VPXDXV_YV12,
-    VPXDXV_I420,
-
-//    VPXDXV_YV12,
-//    VPXDXV_YUY2,
-//    VPXDXV_RGB565,
-//    VPXDXV_UYVY,
-    0
-};
-
-/****************************************************************************
-*  Forward declarationss
-****************************************************************************/
-void onyx_set_parameter(XIMAGE_HANDLE src, int Command, unsigned int Parameter);
-
-static int onyx_get_output_format(XIMAGE_HANDLE src, unsigned int *bd_tag);
-static int onyx_set_output_format(XIMAGE_HANDLE src, unsigned int bd_tag);
-
-static int vpx_get_size_of_pixel(unsigned int bd);
-
-/****************************************************************************
-*  Imports
-****************************************************************************/
-
-#define __Clamp255(x)   (unsigned char) ( (x) < 0 ? 0 : ( (x) <= 255 ? (x) : 255 ) )
-
-/*
-//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-*/
-/*
-//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-*/
-void
-convert_yv12_buffer_types(YV12_BUFFER_CONFIG *source, DXV_YUV_BUFFER_CONFIG *dest)
-{
-    dest->y_buffer = (char *)source->y_buffer;
-    dest->u_buffer = (char *)source->u_buffer;
-    dest->v_buffer = (char *)source->v_buffer;
-    dest->y_width  = source->y_width;
-    dest->y_height = source->y_height;
-    dest->y_stride = source->y_stride;
-    dest->uv_width  = source->uv_width;
-    dest->uv_height = source->uv_height;
-    dest->uv_stride = source->uv_stride;
-}
-
-/*
-//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-*/
-
-
-int onyx_blit
-(
-    XIMAGE_HANDLE src,
-    VSCREEN_HANDLE v_screen,
-    DXV_YUV_BUFFER_CONFIG *frame_buffer,
-    int x,
-    int y
-)
-{
-    VP8_XIMAGE_HANDLE tab = (VP8_XIMAGE_HANDLE)vpxdxv_get_algorithm_base_ptr(src);
-    VP8D_COMP *pbi;
-    VP8_COMMON *common = tab->common;
-    pbi = tab->my_pbi;
-
-    if (v_screen) /* if there is a v_screen, blit to it */
-    {
-        unsigned char *ptr_scrn;
-        int this_pitch, vs_height, vs_width;
-        unsigned int start_tick, stop_tick;
-
-        vpxdxv_get_vscreen_attributes(v_screen, (void **)&ptr_scrn,  &vs_width, &vs_height, &this_pitch);
-
-        if (ptr_scrn)
-        {
-            int w, h;
-
-            int p_size;
-            int view_x, view_y, view_w;
-            int hs, hr, vs, vr;
-            int neww, newh;
-            int cw, ch;
-            int microseconds_available = (int)(1000000 / 30);
-
-            microseconds_available = microseconds_available * tab->cpu_free / 100;
-
-            if (pbi)
-            {
-                microseconds_available -= pbi->decode_microseconds;
-
-                if (tab->cpu_free == 0)
-                    microseconds_available = INT_MAX;
-
-                if (tab->post_proc2time == 0)
-                    tab->post_proc2time = pbi->decode_microseconds * 1 / 2;
-
-                if (tab->post_proc4time == 0)
-                    tab->post_proc4time = pbi->decode_microseconds;
-            }
-
-
-            if (tab->ppcount == 0)
-            {
-                tab->post_proc2time = 0;
-                tab->post_proc4time = 0;
-                tab->ppcount = 64;
-            }
-            else
-            {
-                tab->ppcount --;
-            }
-
-            vpxdxv_get_vscreen_view(v_screen, &view_x, &view_y, &view_w, NULL);
-
-            Scale2Ratio(common->horiz_scale, &hr, &hs);
-            Scale2Ratio(common->vert_scale, &vr, &vs);
-
-            if (tab->postproc && tab->passed_in_buffer == 0)
-            {
-                int show_text = 0;
-
-                unsigned char message[512];
-
-                int pp = tab->postproc;
-                int q = (tab->avgq + 4) / 8;
-                int noise = 0;
-
-                vp8_clear_system_state();
-
-                if (pp >= 1000)
-                {
-                    pp -= 1000;
-                    noise = pp / 100;
-                    pp = pp - noise * 100;
-                }
-
-                if (pp >= 300)
-                {
-                    pp -= 300;
-                    show_text = 3;
-                }
-                else if (pp >= 200)
-                {
-                    pp -= 200;
-                    show_text = 2;
-                }
-                else if (pp >= 100)
-                {
-                    pp -= 100;
-                    show_text = 1;
-                }
-
-                if (pbi && (pbi->mb.segmentation_enabled & SEGMENT_PF) && tab->deinterlace)
-                {
-                    de_interlace(common->frame_to_show->y_buffer, common->post_proc_buffer.y_buffer,
-                                 common->post_proc_buffer.y_width, common->post_proc_buffer.y_height,
-                                 common->post_proc_buffer.y_stride);
-
-                    de_interlace(common->frame_to_show->u_buffer, common->post_proc_buffer.u_buffer,
-                                 common->post_proc_buffer.uv_width, common->post_proc_buffer.uv_height,
-                                 common->post_proc_buffer.uv_stride);
-                    de_interlace(common->frame_to_show->v_buffer, common->post_proc_buffer.v_buffer,
-                                 common->post_proc_buffer.uv_width, common->post_proc_buffer.uv_height,
-                                 common->post_proc_buffer.uv_stride);
-                }
-                else
-                {
-                    if (pp >= 10 && pp <= 20)
-                    {
-                        q = q + (pp - 15) * 10;
-
-                        if (q < 0)
-                            q = 0;
-                    }
-
-                    start_tick = vp8_get_high_res_timer_tick();
-
-                    if (pp > 3 && tab->post_proc4time < microseconds_available)
-                    {
-                        vp8_deblock_and_de_macro_block(common->frame_to_show, &common->post_proc_buffer, q, 1, 0);
-
-                        stop_tick = vp8_get_high_res_timer_tick();
-
-                        if (pbi)
-                            tab->post_proc4time = vp8_get_time_in_micro_sec(start_tick, stop_tick);
-                    }
-
-                    else if (pp > 0 && tab->post_proc2time < microseconds_available)
-                    {
-                        vp8_deblock(common->frame_to_show, &common->post_proc_buffer, q , 1,  0);
-                        stop_tick = vp8_get_high_res_timer_tick();
-
-                        if (pbi)
-                            tab->post_proc2time = vp8_get_time_in_micro_sec(start_tick, stop_tick);
-                    }
-                    else
-                    {
-                        vp8_yv12_copy_frame(common->frame_to_show, &common->post_proc_buffer);
-                    }
-
-                }
-
-                vp8_clear_system_state();
-
-                if (tab->add_noise == 1)
-                {
-
-                    vp8_plane_add_noise(common->post_proc_buffer.y_buffer,
-                                        common->post_proc_buffer.y_width, common->post_proc_buffer.y_height,
-                                        common->post_proc_buffer.y_stride, 63 - q, noise);
-                }
-
-
-                if (show_text == 1)
-                {
-#ifdef PACKET_TESTING
-                    {
-                        VP8_HEADER *oh2 = (VP8_HEADER *) pbi->Source;
-                        sprintf(message, "%8d %d%d%d%d%d size:%d\n",
-                        oh2->frame_number ,
-                        oh2->update_gold  ,
-                        oh2->update_last  ,
-                        oh2->uses_gold    ,
-                        oh2->uses_last    ,
-                        oh2->type,
-                        vpxdxv_get_ximage_csize(src));
-                    }
-#else
-                    sprintf(message, "F:%1ldG:%1ldQ:%3ldF:%3ld,%3ldP:%d_s:%6ld,N:%d,",
-                            (common->frame_type == KEY_FRAME),
-                            common->refresh_golden_frame,
-                            common->base_qindex,
-                            common->filter_level,
-                            q,
-                            tab->postproc,
-                            vpxdxv_get_ximage_csize(src), noise);
-#endif
-
-                    vp8_blit_text(message, common->post_proc_buffer.y_buffer, common->post_proc_buffer.y_stride);
-
-                }
-                else if (show_text == 2)
-                {
-                    int i, j;
-                    unsigned char *y_ptr;
-                    YV12_BUFFER_CONFIG *post = &common->post_proc_buffer;
-                    int mb_rows = post->y_height >> 4;
-                    int mb_cols = post->y_width  >> 4;
-                    int mb_index = 0;
-                    MODE_INFO *mi = common->mi;
-
-                    y_ptr = post->y_buffer + 4 * post->y_stride + 4;
-
-                    // vp8_filter each macro block
-                    for (i = 0; i < mb_rows; i++)
-                    {
-                        for (j = 0; j < mb_cols; j++)
-                        {
-                            char zz[4];
-
-                            if (pp == 4)
-                                sprintf(zz, "%c", mi[mb_index].mbmi.mode + 'a');
-                            else
-                                sprintf(zz, "%c", mi[mb_index].mbmi.ref_frame + 'a');
-
-                            vp8_blit_text(zz, y_ptr, post->y_stride);
-                            mb_index ++;
-                            y_ptr += 16;
-                        }
-
-                        mb_index ++; //border
-                        y_ptr += post->y_stride  * 16 - post->y_width;
-
-                    }
-                }
-                else if (show_text == 3)
-                {
-                    int i, j;
-                    unsigned char *y_ptr;
-                    YV12_BUFFER_CONFIG *post = &common->post_proc_buffer;
-                    int mb_rows = post->y_height >> 4;
-                    int mb_cols = post->y_width  >> 4;
-                    int mb_index = 0;
-                    MODE_INFO *mi = common->mi;
-
-                    y_ptr = post->y_buffer + 4 * post->y_stride + 4;
-
-                    // vp8_filter each macro block
-                    for (i = 0; i < mb_rows; i++)
-                    {
-                        for (j = 0; j < mb_cols; j++)
-                        {
-                            char zz[4];
-
-                            if (j == 0)
-                                sprintf(zz, "%c", '0' + i % 10);
-                            else
-                                sprintf(zz, "%c", '0' + j % 10);
-
-                            vp8_blit_text(zz, y_ptr, post->y_stride);
-                            mb_index ++;
-                            y_ptr += 16;
-                        }
-
-                        y_ptr += post->y_stride  * 16 - post->y_width;
-
-                    }
-                }
-
-                vpx_memcpy(&tab->this_buffer, &common->post_proc_buffer, sizeof(YV12_BUFFER_CONFIG));
-            }
-            else
-            {
-                vpx_memcpy(&tab->this_buffer, common->frame_to_show, sizeof(YV12_BUFFER_CONFIG));
-            }
-
-
-            /* get a frame pointer to the scaled and postprocessed reconstructed buffer */
-            if (tab->passed_in_buffer == 0)
-            {
-                if (common->horiz_scale != NORMAL || common->vert_scale != NORMAL)
-                {
-                    neww = hs * tab->this_buffer.y_width / hr;
-                    newh = vs * tab->this_buffer.y_height / vr;
-
-                    neww += neww & 1;
-
-                    if (tab->hs != hs || tab->hr != hr || tab->vs != vs || tab->vr != vr)
-                    {
-                        vp8_yv12_alloc_frame_buffer(&tab->scaled_buffer, neww, newh , 8);
-                    }
-
-                    vp8_yv12_scale_or_center(&tab->this_buffer,
-                                             &tab->scaled_buffer,
-                                             neww, newh, SCALE_TO_FIT, hs, hr, vs, vr);
-
-                    convert_yv12_buffer_types(&tab->scaled_buffer, frame_buffer);
-
-                    cw = hs * common->Width / hr;
-                    ch = vs * common->Height / vr;
-
-                }
-                else
-                {
-                    convert_yv12_buffer_types(&tab->this_buffer, frame_buffer);
-
-                    cw = common->Width;
-                    ch = common->Height;
-                }
-            }
-            else
-            {
-                convert_yv12_buffer_types(tab->passed_in_buffer, frame_buffer);
-                cw = common->Width;
-                ch = common->Height;
-                tab->passed_in_buffer = 0;
-            }
-
-            frame_buffer->y_width = cw;
-            frame_buffer->y_height = ch;
-            frame_buffer->uv_width = cw / 2;
-            frame_buffer->uv_height = ch / 2;
-
-            p_size = vpx_get_size_of_pixel(tab->bd_tag);
-
-            /* remember to offset if requested */
-            y += view_y;
-            x += view_x ;
-
-            /* for planar destinations */
-            w = view_w;
-            h = vs_height;
-
-            if (w < frame_buffer->y_width)
-            {
-                frame_buffer->y_width = w;
-                frame_buffer->uv_width = (w + 1) / 2;
-            }
-
-            if (h < frame_buffer->y_height)
-            {
-                frame_buffer->y_height = h;
-                frame_buffer->uv_height = (h + 1) / 2;
-            }
-
-            if (frame_buffer->y_width < view_w)
-                x += (view_w - frame_buffer->y_width) / 2;
-
-            if (x & 1)
-                x -= 1;
-
-            if (frame_buffer->y_height < vs_height)
-                y += (vs_height - frame_buffer->y_height) / 2;
-
-
-            ptr_scrn += (x * p_size) + (y * this_pitch);
-
-            frame_buffer->y_stride *= -1;
-            frame_buffer->uv_stride *= -1;
-
-            if (tab->bd_tag == VPXDXV_YV12 || tab->bd_tag == VPXDXV_I420)
-            {
-                if (this_pitch < 0)
-                {
-                    frame_buffer->uv_start = (char *)(ptr_scrn + abs(this_pitch) + abs(this_pitch) * h / 4 + this_pitch / 2);
-                    frame_buffer->uv_dst_area = abs((this_pitch * h) / 4);
-                    frame_buffer->uv_used_area = 0;
-                }
-                else
-                {
-                    frame_buffer->uv_start = (char *)(ptr_scrn + (this_pitch * h));
-                    frame_buffer->uv_dst_area = (((this_pitch + 1) / 2) * ((h + 1) / 2));
-                    frame_buffer->uv_used_area = (((this_pitch + 1) / 2) * frame_buffer->uv_height);
-                }
-            }
-
-            if ((pbi->mb.segmentation_enabled & SEGMENT_PF) && (tab->bd_tag != VPXDXV_YV12 && tab->bd_tag != VPXDXV_I420))
-            {
-                int ypitch = frame_buffer->y_stride;
-                int uvpitch = frame_buffer->uv_stride;
-
-                frame_buffer->y_stride <<= 1;
-                frame_buffer->y_height >>= 1;
-                frame_buffer->uv_stride <<= 1;
-                frame_buffer->uv_height >>= 1;
-
-                ptr_scrn += this_pitch;
-                frame_buffer->y_buffer -= ypitch;
-                frame_buffer->u_buffer -= uvpitch;
-                frame_buffer->v_buffer -= uvpitch;
-                tab->blitter(ptr_scrn, 2 * this_pitch, (YUV_BUFFER_CONFIG *)(&tab->frame_buffer));
-
-                ptr_scrn -= this_pitch;
-                frame_buffer->y_buffer += ypitch;
-                frame_buffer->u_buffer += uvpitch;
-                frame_buffer->v_buffer += uvpitch;
-                tab->blitter(ptr_scrn, 2 * this_pitch, (YUV_BUFFER_CONFIG *)(&tab->frame_buffer));
-
-            }
-            else
-            {
-                /* blit the screen */
-                tab->blitter(ptr_scrn, this_pitch, (YUV_BUFFER_CONFIG *)(&tab->frame_buffer));
-                vpx_log("Decoder: Frame shown \n");
-            }
-
-        }
-        else
-            vpx_log("Decoder: Frame not shown scrn pointer 0\n");
-    }
-    else
-        vpx_log("Decoder: Frame not shown vscreen 0\n");
-
-    return DXV_OK;
-}
-/****************************************************************************
- *
- *  ROUTINE       :     onyx_decompress
- *
- *  INPUTS        :     None
- *
- *  OUTPUTS       :     None
- *
- *  RETURNS       :     None.
- *
- *  FUNCTION      :
- *
- *  SPECIAL NOTES :
- *
- ****************************************************************************/
-static
-int onyx_decompress(XIMAGE_HANDLE src, VSCREEN_HANDLE v_screen)
-{
-    VP8_XIMAGE_HANDLE this_algorithm_base = (VP8_XIMAGE_HANDLE)vpxdxv_get_algorithm_base_ptr(src);
-    unsigned char *c_addr;
-    unsigned int c_size;
-    int w, h, x, y;
-    int vp8_rv;
-
-    c_addr = vpxdxv_get_ximage_cdata_addr(src);
-    c_size = vpxdxv_get_ximage_csize(src);
-    vpxdxv_get_ximage_xywh(src, &x, &y, &w, &h);
-
-    // if we have a compressed frame decompress it ( otherwise we'll just redo
-    // the scaling and postprocessing from the last frame )
-    if (c_addr)
-    {
-        if (c_size != 0)
-        {
-            int flags;
-            int ret_val;
-
-            int f;
-
-            // decode the frame
-            ret_val = vp8d_decompress_frame((VP8D_PTR) this_algorithm_base->my_pbi,
-                                            c_size,
-                                            (char *) c_addr,
-                                            &this_algorithm_base->this_buffer,
-                                            &flags);
-
-
-            f = this_algorithm_base->my_pbi->common.filter_level * 10 / 6;
-
-            if (this_algorithm_base->my_pbi->common.frame_type == KEY_FRAME)
-                this_algorithm_base->avgq = 8 * f;
-            else
-                this_algorithm_base->avgq = this_algorithm_base->avgq * 7 / 8 + f;
-
-
-
-            if (ret_val != 0)
-            {
-                if (ret_val == -1)
-                    return DXV_VERSION_CONFLICT;
-                else
-                    return DXV_BAD_DATA;
-            }
-
-        }
-    }
-
-
-    vp8_rv = onyx_blit(src, v_screen, &this_algorithm_base->frame_buffer, x, y);
-
-
-    return vp8_rv;
-}
-/*
-//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-*/
-static
-int vp8_ximagedestroy(XIMAGE_HANDLE src)
-{
-    VP8_XIMAGE_HANDLE this_algorithm_base = (VP8_XIMAGE_HANDLE)vpxdxv_get_algorithm_base_ptr(src);
-
-    if (this_algorithm_base)
-    {
-
-        vp8_yv12_de_alloc_frame_buffer(&this_algorithm_base->scaled_buffer);
-
-        /* safety check in case stopdecode was not called */
-        if (this_algorithm_base->owned)
-            vp8dx_remove_decompressor((VP8D_PTR)(this_algorithm_base->my_pbi));
-
-        duck_free(this_algorithm_base);
-    }
-
-    return DXV_OK;
-}
-/*
-//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-*/
-static int
-onyx_get_post_proc(XIMAGE_HANDLE src, unsigned int *ppl)
-{
-    VP8_XIMAGE_HANDLE this_algorithm_base = (VP8_XIMAGE_HANDLE)vpxdxv_get_algorithm_base_ptr(src);
-
-    if (this_algorithm_base)
-    {
-        *ppl = this_algorithm_base->ppl_tag;
-
-        return DXV_OK;
-    }
-
-    return DXV_NULL_BASE;
-}
-/*
-//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-*/
-static int
-onyx_set_post_proc(XIMAGE_HANDLE src, unsigned int ppl)
-{
-    VP8_XIMAGE_HANDLE this_algorithm_base = (VP8_XIMAGE_HANDLE)vpxdxv_get_algorithm_base_ptr(src);
-
-    if (this_algorithm_base)
-    {
-        this_algorithm_base->ppl_tag = ppl;
-
-        return DXV_OK;
-    }
-
-    return DXV_NULL_BASE;
-}
-/*
-//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-*/
-static
-int vp8_ximagestop_decode(XIMAGE_HANDLE src)
-{
-    VP8_XIMAGE_HANDLE this_algorithm_base = (VP8_XIMAGE_HANDLE)vpxdxv_get_algorithm_base_ptr(src);
-
-    if (this_algorithm_base)
-    {
-
-        vp8_yv12_de_alloc_frame_buffer(&this_algorithm_base->scaled_buffer);
-
-        if (this_algorithm_base->owned)
-            vp8dx_remove_decompressor((VP8D_PTR)(this_algorithm_base->my_pbi));
-
-        this_algorithm_base->owned = 0;
-    }
-
-    return DXV_OK;
-}
-
-
-/*
-//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-*/
-static
-int vp8_ximagestart_decode
-(
-    XIMAGE_HANDLE src
-)
-{
-    VP8_XIMAGE_HANDLE this_algorithm_base = (VP8_XIMAGE_HANDLE)vpxdxv_get_algorithm_base_ptr(src);
-    XIMAGE_INFO_PTR xinfo = vpxdxv_get_ximage_info(src);
-    VP8D_CONFIG ocf;
-
-    if (xinfo)
-    {
-        ocf.Width = xinfo->width;
-        ocf.Height = xinfo->height;
-    }
-
-    if (this_algorithm_base->common == 0)
-    {
-        this_algorithm_base->my_pbi = (VP8D_COMP *) vp8dx_create_decompressor(&ocf);
-        this_algorithm_base->owned = 1;
-        this_algorithm_base->common = &this_algorithm_base->my_pbi->common;
-        this_algorithm_base->avgq = 0;
-
-    }
-
-    this_algorithm_base->passed_in_buffer = 0;
-    this_algorithm_base->post_proc2time = 0;
-    this_algorithm_base->post_proc4time = 0;
-    this_algorithm_base->ppcount = 64;
-
-    return DXV_OK;
-}
-/*
-//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-*/
-static
-DXV_HANDLE vp8_ximagecreate(XIMAGE_HANDLE src)
-{
-    VP8_XIMAGE_HANDLE this_algorithm_base;
-
-    /* create a new algorithm base container */
-    this_algorithm_base = (VP8_XIMAGE_HANDLE)duck_calloc(1, sizeof(VP8_XIMAGE), DMEM_GENERAL);
-
-    if (this_algorithm_base == NULL)
-        return NULL;
-
-    vp8_scale_machine_specific_config();
-
-    vpxdxv_register_ximage_start_decode(src, vp8_ximagestart_decode);
-
-    vpxdxv_register_ximage_stop_decode(src, vp8_ximagestop_decode);
-
-    vpxdxv_register_ximage_destroy(src, vp8_ximagedestroy);
-
-    vpxdxv_register_ximage_dx(src, onyx_decompress);
-
-    vpxdxv_register_ximage_set_parameter(src, onyx_set_parameter);
-
-    vpxdxv_register_ximage_output_format_func(src,
-            onyx_get_output_format,
-            onyx_set_output_format);
-
-    vpxdxv_register_ximage_post_proc_level_func(src,
-            onyx_get_post_proc,
-            onyx_set_post_proc);
-
-    return (DXV_HANDLE)this_algorithm_base;
-}
-
-/*
-//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-*/
-
-static int store_output_list(unsigned int supported, int count,
-                             unsigned int *outlist)
-{
-    int i = 0, j = 0,
-        ret = DXV_OK;
-
-    while (i < count)
-    {
-        while (supported && !(supported & 0x01))
-        {
-            supported >>= 1;
-            ++j;
-        }
-
-        *(outlist + i) = g_vp8_preferred_output_format_list[j];
-        ++i;
-        ++j;
-        supported >>= 1;
-    }
-
-
-    return ret;
-}
-
-/*
-//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-*/
-static int onyx_get_output_list(XIMAGE_INFO_PTR xinfo, unsigned int *outlist,
-                                unsigned int *size)
-{
-    int i,
-        ret = DXV_INVALID_REQUEST;
-    unsigned int supported = 0,
-                 count = 0;
-    (void)xinfo;
-
-    if (size)
-    {
-        for (i = 0; i < sizeof(g_vp8_preferred_output_format_list) / sizeof(unsigned int) && i < 32; ++i)
-        {
-            if (vpx_get_blitter(g_vp8_preferred_output_format_list[i]) != (void *)0xffffffff)
-            {
-                supported |= (1 << i);
-                ++count;
-            }
-        }
-
-        if (outlist)
-        {
-            if (count && ((count + 1) == (*size / sizeof(int))))
-                ret = store_output_list(supported, count, outlist);
-            else
-                *outlist = 0;
-        }
-        else
-        {
-            *size = (count + 1) * sizeof(int);
-            ret = DXV_OK;
-        }
-    }
-
-    return ret;
-}
-
-/*
-//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-*/
-int onyx_init(void)
-{
-    int vp8_rv;
-
-    /* register VPX blitters based on cpu */
-    vpx_set_blit();
-
-    vp8_rv = vpxdxv_register_ximage(vp8_ximagecreate, onyx_get_output_list, VP8_FOURCC);
-    return vp8_rv;
-
-    return DXV_OK;
-}
-/*
-//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-*/
-int onyx_exit(void)
-{
-
-    vpxdxv_un_register_ximage(VP8_FOURCC);
-
-    return DXV_OK;
-}
-/****************************************************************************
- *
- *  ROUTINE       :  onyx_set_parameter
- *
- *  INPUTS        :  XIMAGE_HANDLE src   :
- *                   int Command             :
- *                   unsigned long Parameter :
- *
- *  OUTPUTS       :  None.
- *
- *  RETURNS       :  void
- *
- *  FUNCTION      :
- *
- *
- *  SPECIAL NOTES :  None.
- *
- ****************************************************************************/
-void onyx_set_parameter(XIMAGE_HANDLE src, int Command, unsigned int Parameter)
-{
-    VP8_XIMAGE_HANDLE this_algorithm_base = (VP8_XIMAGE_HANDLE)vpxdxv_get_algorithm_base_ptr(src);
-
-    switch (Command)
-    {
-    case PBC_SET_CPUFREE:
-        this_algorithm_base->cpu_free  = Parameter;
-        break;
-    case PBC_SET_POSTPROC:
-        this_algorithm_base->postproc = Parameter;
-        break;
-
-    case PBC_SET_BLITBUFF:
-        this_algorithm_base->passed_in_buffer = (YV12_BUFFER_CONFIG *) Parameter;
-        break;
-
-    case PBC_SET_REFERENCEFRAME:
-    {
-        VP8_XIMAGE_HANDLE tab = (VP8_XIMAGE_HANDLE)vpxdxv_get_algorithm_base_ptr(src);
-        VP8D_COMP *pbi;
-        pbi = tab->my_pbi;
-        vp8_yv12_copy_frame((YV12_BUFFER_CONFIG *) Parameter, &pbi->common.last_frame);
-    }
-    break;
-
-    case PBC_SET_COMMON:
-
-        if (Parameter)
-        {
-            this_algorithm_base->common = (VP8_COMMON *)Parameter;
-        }
-
-        break;
-    case PBC_SET_ADDNOISE:
-        this_algorithm_base->add_noise = Parameter;
-        break;
-    case PBC_SET_DEINTERLACEMODE:
-        this_algorithm_base->deinterlace = Parameter;
-        break;
-
-    }
-}
-/*
-//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-*/
-static int
-onyx_get_output_format(XIMAGE_HANDLE src, unsigned int *format_tag)
-{
-    VP8_XIMAGE_HANDLE this_algorithm_base = (VP8_XIMAGE_HANDLE)vpxdxv_get_algorithm_base_ptr(src);
-
-    if (this_algorithm_base)
-    {
-        *format_tag = this_algorithm_base->bd_tag;
-        return DXV_OK;
-    }
-
-    return DXV_NULL_BASE;
-}
-
-/*
-//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-*/
-static int
-onyx_set_output_format(XIMAGE_HANDLE src, unsigned int bd_tag)
-{
-    VP8_XIMAGE_HANDLE this_algorithm_base = (VP8_XIMAGE_HANDLE)vpxdxv_get_algorithm_base_ptr(src);
-    int i;
-    unsigned int bd_tag_found;
-
-    if (this_algorithm_base)
-    {
-        i = 0;
-        bd_tag_found = 0;
-
-        while (g_vp8_preferred_output_format_list[i] != 0)
-        {
-            if (g_vp8_preferred_output_format_list[i] == bd_tag)
-            {
-                bd_tag_found = 1;
-                break;
-            }
-
-            i++;
-        }
-
-        if (bd_tag_found)
-        {
-            this_algorithm_base->blitter = (vp8blit_func)vpx_get_blitter(bd_tag);
-            this_algorithm_base->bd_tag = bd_tag;
-            return DXV_OK;
-        }
-
-        return DXV_INVALID_BLIT;
-    }
-
-    return DXV_NULL_BASE;
-}
-
-/*
-//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-*/
-int
-vpx_get_size_of_pixel(unsigned int bd)
-{
-    int vp8_rv;
-
-    switch (bd)
-    {
-    case VPXDXV_YV12:
-    case VPXDXV_I420:
-        vp8_rv = 1;
-        break;
-
-#ifdef _ENABLE_SPLIT_PIXEL_
-    case VPXDXV_SPLIT565:
-#endif
-    case VPXDXV_RGB555:
-    case VPXDXV_RGB565:
-    case VPXDXV_YUY2:
-    case VPXDXV_UYVY:
-    case VPXDXV_YVYU:
-        vp8_rv = 2;
-        break;
-
-    case VPXDXV_RGB888:
-        vp8_rv = 3;
-        break;
-
-    case VPXDXV_RGB8888:
-        vp8_rv = 4;
-        break;
-
-    default:
-        vp8_rv = -1;
-        break;
-    }
-
-    return vp8_rv;
-}

diff --git a/vp8/decoder/x86/x86_dsystemdependent.c b/vp8/decoder/x86/x86_dsystemdependent.c
index 47e346d..5c16842 100644
--- a/vp8/decoder/x86/x86_dsystemdependent.c
+++ b/vp8/decoder/x86/x86_dsystemdependent.c

@@ -11,13 +11,13 @@
 
 #include "vpx_ports/config.h"
 #include "vpx_ports/x86.h"
-#include "onyxd_int.h"
+#include "vp8/decoder/onyxd_int.h"
 
 
 #if HAVE_MMX
 void vp8_dequantize_b_impl_mmx(short *sq, short *dq, short *q);
 
-void vp8_dequantize_b_mmx(BLOCKD *d)
+static void dequantize_b_mmx(BLOCKD *d)
 {
     short *sq = (short *) d->qcoeff;
     short *dq = (short *) d->dqcoeff;
@@ -41,7 +41,7 @@
 #if HAVE_MMX
     if (flags & HAS_MMX)
     {
-        pbi->dequant.block               = vp8_dequantize_b_mmx;
+        pbi->dequant.block               = dequantize_b_mmx;
         pbi->dequant.idct_add            = vp8_dequant_idct_add_mmx;
         pbi->dequant.dc_idct_add         = vp8_dequant_dc_idct_add_mmx;
         pbi->dequant.dc_idct_add_y_block = vp8_dequant_dc_idct_add_y_block_mmx;

diff --git a/vp8/encoder/arm/arm_csystemdependent.c b/vp8/encoder/arm/arm_csystemdependent.c
index a1f1102..d4a88dc 100644
--- a/vp8/encoder/arm/arm_csystemdependent.c
+++ b/vp8/encoder/arm/arm_csystemdependent.c

@@ -11,8 +11,8 @@
 
 #include "vpx_ports/config.h"
 #include "vpx_ports/arm.h"
-#include "variance.h"
-#include "onyx_int.h"
+#include "vp8/encoder/variance.h"
+#include "vp8/encoder/onyx_int.h"
 
 extern void (*vp8_yv12_copy_partial_frame_ptr)(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc, int Fraction);
 extern void vp8_yv12_copy_partial_frame(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc, int Fraction);
@@ -29,26 +29,29 @@
 #if HAVE_ARMV6
     if (has_media)
     {
-        /*cpi->rtcd.variance.sad16x16              = vp8_sad16x16_c;
-        cpi->rtcd.variance.sad16x8               = vp8_sad16x8_c;
+        cpi->rtcd.variance.sad16x16              = vp8_sad16x16_armv6;
+        /*cpi->rtcd.variance.sad16x8               = vp8_sad16x8_c;
         cpi->rtcd.variance.sad8x16               = vp8_sad8x16_c;
         cpi->rtcd.variance.sad8x8                = vp8_sad8x8_c;
         cpi->rtcd.variance.sad4x4                = vp8_sad4x4_c;*/
 
-        /*cpi->rtcd.variance.var4x4                = vp8_variance4x4_c;
-        cpi->rtcd.variance.var8x8                = vp8_variance8x8_c;
-        cpi->rtcd.variance.var8x16               = vp8_variance8x16_c;
-        cpi->rtcd.variance.var16x8               = vp8_variance16x8_c;
-        cpi->rtcd.variance.var16x16              = vp8_variance16x16_c;*/
+        /*cpi->rtcd.variance.var4x4                = vp8_variance4x4_c;*/
+        cpi->rtcd.variance.var8x8                = vp8_variance8x8_armv6;
+        /*cpi->rtcd.variance.var8x16               = vp8_variance8x16_c;
+        cpi->rtcd.variance.var16x8               = vp8_variance16x8_c;*/
+        cpi->rtcd.variance.var16x16              = vp8_variance16x16_armv6;
 
-        /*cpi->rtcd.variance.subpixvar4x4          = vp8_sub_pixel_variance4x4_c;
-        cpi->rtcd.variance.subpixvar8x8          = vp8_sub_pixel_variance8x8_c;
-        cpi->rtcd.variance.subpixvar8x16         = vp8_sub_pixel_variance8x16_c;
-        cpi->rtcd.variance.subpixvar16x8         = vp8_sub_pixel_variance16x8_c;
-        cpi->rtcd.variance.subpixvar16x16        = vp8_sub_pixel_variance16x16_c;*/
+        /*cpi->rtcd.variance.subpixvar4x4          = vp8_sub_pixel_variance4x4_c;*/
+        cpi->rtcd.variance.subpixvar8x8          = vp8_sub_pixel_variance8x8_armv6;
+        /*cpi->rtcd.variance.subpixvar8x16         = vp8_sub_pixel_variance8x16_c;
+        cpi->rtcd.variance.subpixvar16x8         = vp8_sub_pixel_variance16x8_c;*/
+        cpi->rtcd.variance.subpixvar16x16        = vp8_sub_pixel_variance16x16_armv6;
+        cpi->rtcd.variance.halfpixvar16x16_h     = vp8_variance_halfpixvar16x16_h_armv6;
+        cpi->rtcd.variance.halfpixvar16x16_v     = vp8_variance_halfpixvar16x16_v_armv6;
+        cpi->rtcd.variance.halfpixvar16x16_hv    = vp8_variance_halfpixvar16x16_hv_armv6;
 
-        /*cpi->rtcd.variance.mse16x16              = vp8_mse16x16_c;
-        cpi->rtcd.variance.getmbss               = vp8_get_mb_ss_c;*/
+        cpi->rtcd.variance.mse16x16              = vp8_mse16x16_armv6;
+        /*cpi->rtcd.variance.getmbss               = vp8_get_mb_ss_c;*/
 
         /*cpi->rtcd.variance.get16x16prederror     = vp8_get16x16pred_error_c;
         cpi->rtcd.variance.get8x8var             = vp8_get8x8var_c;
@@ -56,9 +59,9 @@
         cpi->rtcd.variance.get4x4sse_cs          = vp8_get4x4sse_cs_c;*/
 
         /*cpi->rtcd.fdct.short4x4                  = vp8_short_fdct4x4_c;
-        cpi->rtcd.fdct.short8x4                  = vp8_short_fdct8x4_c;
-        cpi->rtcd.fdct.fast4x4                   = vp8_fast_fdct4x4_c;
-        cpi->rtcd.fdct.fast8x4                   = vp8_fast_fdct8x4_c;*/
+        cpi->rtcd.fdct.short8x4                  = vp8_short_fdct8x4_c;*/
+        cpi->rtcd.fdct.fast4x4                   = vp8_fast_fdct4x4_armv6;
+        cpi->rtcd.fdct.fast8x4                   = vp8_fast_fdct8x4_armv6;
         cpi->rtcd.fdct.walsh_short4x4            = vp8_short_walsh4x4_armv6;
 
         /*cpi->rtcd.encodemb.berr                  = vp8_block_error_c;
@@ -68,8 +71,8 @@
         cpi->rtcd.encodemb.submby                = vp8_subtract_mby_c;
         cpi->rtcd.encodemb.submbuv               = vp8_subtract_mbuv_c;*/
 
-        /*cpi->rtcd.quantize.quantb                = vp8_regular_quantize_b;
-        cpi->rtcd.quantize.fastquantb            = vp8_fast_quantize_b_c;*/
+        /*cpi->rtcd.quantize.quantb                = vp8_regular_quantize_b;*/
+        cpi->rtcd.quantize.fastquantb            = vp8_fast_quantize_b_armv6;
     }
 #endif
 

diff --git a/vp8/encoder/arm/armv5te/boolhuff_armv5te.asm b/vp8/encoder/arm/armv5te/boolhuff_armv5te.asm
index e78dc33..3c05f57 100644
--- a/vp8/encoder/arm/armv5te/boolhuff_armv5te.asm
+++ b/vp8/encoder/arm/armv5te/boolhuff_armv5te.asm

@@ -14,7 +14,7 @@
     EXPORT |vp8_stop_encode|
     EXPORT |vp8_encode_value|
 
-    INCLUDE vpx_vp8_enc_asm_offsets.asm
+    INCLUDE asm_enc_offsets.asm
 
     ARM
     REQUIRE8

diff --git a/vp8/encoder/arm/armv5te/vp8_packtokens_armv5.asm b/vp8/encoder/arm/armv5te/vp8_packtokens_armv5.asm
index b2abadf..d939287 100644
--- a/vp8/encoder/arm/armv5te/vp8_packtokens_armv5.asm
+++ b/vp8/encoder/arm/armv5te/vp8_packtokens_armv5.asm

@@ -11,7 +11,7 @@
 
     EXPORT |vp8cx_pack_tokens_armv5|
 
-    INCLUDE vpx_vp8_enc_asm_offsets.asm
+    INCLUDE asm_enc_offsets.asm
 
     ARM
     REQUIRE8

diff --git a/vp8/encoder/arm/armv5te/vp8_packtokens_mbrow_armv5.asm b/vp8/encoder/arm/armv5te/vp8_packtokens_mbrow_armv5.asm
index f9c3852..ac2bba6 100644
--- a/vp8/encoder/arm/armv5te/vp8_packtokens_mbrow_armv5.asm
+++ b/vp8/encoder/arm/armv5te/vp8_packtokens_mbrow_armv5.asm

@@ -11,7 +11,7 @@
 
     EXPORT |vp8cx_pack_mb_row_tokens_armv5|
 
-    INCLUDE vpx_vp8_enc_asm_offsets.asm
+    INCLUDE asm_enc_offsets.asm
 
     ARM
     REQUIRE8

diff --git a/vp8/encoder/arm/armv5te/vp8_packtokens_partitions_armv5.asm b/vp8/encoder/arm/armv5te/vp8_packtokens_partitions_armv5.asm
index 42dae13..c2eccdb 100644
--- a/vp8/encoder/arm/armv5te/vp8_packtokens_partitions_armv5.asm
+++ b/vp8/encoder/arm/armv5te/vp8_packtokens_partitions_armv5.asm

@@ -11,7 +11,7 @@
 
     EXPORT |vp8cx_pack_tokens_into_partitions_armv5|
 
-    INCLUDE vpx_vp8_enc_asm_offsets.asm
+    INCLUDE asm_enc_offsets.asm
 
     ARM
     REQUIRE8

diff --git a/vp8/encoder/arm/armv6/vp8_fast_fdct4x4_armv6.asm b/vp8/encoder/arm/armv6/vp8_fast_fdct4x4_armv6.asm
new file mode 100644
index 0000000..65bd2b4
--- /dev/null
+++ b/vp8/encoder/arm/armv6/vp8_fast_fdct4x4_armv6.asm

@@ -0,0 +1,262 @@
+;
+;  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+    EXPORT |vp8_fast_fdct4x4_armv6|
+
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA    |.text|, CODE, READONLY
+; void vp8_short_fdct4x4_c(short *input, short *output, int pitch)
+|vp8_fast_fdct4x4_armv6| PROC
+
+    stmfd       sp!, {r4 - r12, lr}
+
+    ; PART 1
+
+    ; coeffs 0-3
+    ldrd        r4, r5, [r0]        ; [i1 | i0] [i3 | i2]
+
+    ldr         r10, c7500
+    ldr         r11, c14500
+    ldr         r12, c0x22a453a0    ; [2217*4 | 5352*4]
+    ldr         lr, c0x00080008
+    ror         r5, r5, #16         ; [i2 | i3]
+
+    qadd16      r6, r4, r5          ; [i1+i2 | i0+i3] = [b1 | a1] without shift
+    qsub16      r7, r4, r5          ; [i1-i2 | i0-i3] = [c1 | d1] without shift
+
+    add         r0, r0, r2          ; update input pointer
+
+    qadd16      r7, r7, r7          ; 2*[c1|d1] --> we can use smlad and smlsd
+                                    ; with 2217*4 and 5352*4 without losing the
+                                    ; sign bit (overflow)
+
+    smuad       r4, r6, lr          ; o0 = (i1+i2)*8 + (i0+i3)*8
+    smusd       r5, r6, lr          ; o2 = (i1+i2)*8 - (i0+i3)*8
+
+    smlad       r6, r7, r12, r11    ; o1 = (c1 * 2217 + d1 * 5352 +  14500)
+    smlsdx      r7, r7, r12, r10    ; o3 = (d1 * 2217 - c1 * 5352 +   7500)
+
+    ldrd        r8, r9, [r0]        ; [i5 | i4] [i7 | i6]
+
+    pkhbt       r3, r4, r6, lsl #4  ; [o1 | o0], keep in register for PART 2
+    pkhbt       r6, r5, r7, lsl #4  ; [o3 | o2]
+
+    str         r6, [r1, #4]
+
+    ; coeffs 4-7
+    ror         r9, r9, #16         ; [i6 | i7]
+
+    qadd16      r6, r8, r9          ; [i5+i6 | i4+i7] = [b1 | a1] without shift
+    qsub16      r7, r8, r9          ; [i5-i6 | i4-i7] = [c1 | d1] without shift
+
+    add         r0, r0, r2          ; update input pointer
+
+    qadd16      r7, r7, r7          ; 2x[c1|d1] --> we can use smlad and smlsd
+                                    ; with 2217*4 and 5352*4 without losing the
+                                    ; sign bit (overflow)
+
+    smuad       r9, r6, lr          ; o4 = (i5+i6)*8 + (i4+i7)*8
+    smusd       r8, r6, lr          ; o6 = (i5+i6)*8 - (i4+i7)*8
+
+    smlad       r6, r7, r12, r11    ; o5 = (c1 * 2217 + d1 * 5352 +  14500)
+    smlsdx      r7, r7, r12, r10    ; o7 = (d1 * 2217 - c1 * 5352 +   7500)
+
+    ldrd        r4, r5, [r0]        ; [i9 | i8] [i11 | i10]
+
+    pkhbt       r9, r9, r6, lsl #4  ; [o5 | o4], keep in register for PART 2
+    pkhbt       r6, r8, r7, lsl #4  ; [o7 | o6]
+
+    str         r6, [r1, #12]
+
+    ; coeffs 8-11
+    ror         r5, r5, #16         ; [i10 | i11]
+
+    qadd16      r6, r4, r5          ; [i9+i10 | i8+i11]=[b1 | a1] without shift
+    qsub16      r7, r4, r5          ; [i9-i10 | i8-i11]=[c1 | d1] without shift
+
+    add         r0, r0, r2          ; update input pointer
+
+    qadd16      r7, r7, r7          ; 2x[c1|d1] --> we can use smlad and smlsd
+                                    ; with 2217*4 and 5352*4 without losing the
+                                    ; sign bit (overflow)
+
+    smuad       r2, r6, lr          ; o8 = (i9+i10)*8 + (i8+i11)*8
+    smusd       r8, r6, lr          ; o10 = (i9+i10)*8 - (i8+i11)*8
+
+    smlad       r6, r7, r12, r11    ; o9 = (c1 * 2217 + d1 * 5352 +  14500)
+    smlsdx      r7, r7, r12, r10    ; o11 = (d1 * 2217 - c1 * 5352 +   7500)
+
+    ldrd        r4, r5, [r0]        ; [i13 | i12] [i15 | i14]
+
+    pkhbt       r2, r2, r6, lsl #4  ; [o9 | o8], keep in register for PART 2
+    pkhbt       r6, r8, r7, lsl #4  ; [o11 | o10]
+
+    str         r6, [r1, #20]
+
+    ; coeffs 12-15
+    ror         r5, r5, #16         ; [i14 | i15]
+
+    qadd16      r6, r4, r5          ; [i13+i14 | i12+i15]=[b1|a1] without shift
+    qsub16      r7, r4, r5          ; [i13-i14 | i12-i15]=[c1|d1] without shift
+
+    qadd16      r7, r7, r7          ; 2x[c1|d1] --> we can use smlad and smlsd
+                                    ; with 2217*4 and 5352*4 without losing the
+                                    ; sign bit (overflow)
+
+    smuad       r4, r6, lr          ; o12 = (i13+i14)*8 + (i12+i15)*8
+    smusd       r5, r6, lr          ; o14 = (i13+i14)*8 - (i12+i15)*8
+
+    smlad       r6, r7, r12, r11    ; o13 = (c1 * 2217 + d1 * 5352 +  14500)
+    smlsdx      r7, r7, r12, r10    ; o15 = (d1 * 2217 - c1 * 5352 +   7500)
+
+    pkhbt       r0, r4, r6, lsl #4  ; [o13 | o12], keep in register for PART 2
+    pkhbt       r6, r5, r7, lsl #4  ; [o15 | o14]
+
+    str         r6, [r1, #28]
+
+
+    ; PART 2 -------------------------------------------------
+    ldr         r11, c12000
+    ldr         r10, c51000
+    ldr         lr, c0x00070007
+
+    qadd16      r4, r3, r0          ; a1 = [i1+i13 | i0+i12]
+    qadd16      r5, r9, r2          ; b1 = [i5+i9  |  i4+i8]
+    qsub16      r6, r9, r2          ; c1 = [i5-i9  |  i4-i8]
+    qsub16      r7, r3, r0          ; d1 = [i1-i13 | i0-i12]
+
+    qadd16      r4, r4, lr          ; a1 + 7
+
+    add         r0, r11, #0x10000   ; add (d!=0)
+
+    qadd16      r2, r4, r5          ; a1 + b1 + 7
+    qsub16      r3, r4, r5          ; a1 - b1 + 7
+
+    ldr         r12, c0x08a914e8    ; [2217 | 5352]
+
+    lsl         r8, r2, #16         ; prepare bottom halfword for scaling
+    asr         r2, r2, #4          ; scale top halfword
+    lsl         r9, r3, #16         ; prepare bottom halfword for scaling
+    asr         r3, r3, #4          ; scale top halfword
+    pkhtb       r4, r2, r8, asr #20 ; pack and scale bottom halfword
+    pkhtb       r5, r3, r9, asr #20 ; pack and scale bottom halfword
+
+    smulbt      r2, r6, r12         ; [ ------ | c1*2217]
+    str         r4, [r1, #0]        ; [     o1 |      o0]
+    smultt      r3, r6, r12         ; [c1*2217 | ------ ]
+    str         r5, [r1, #16]       ; [     o9 |      o8]
+
+    smlabb      r8, r7, r12, r2     ; [ ------ | d1*5352]
+    smlatb      r9, r7, r12, r3     ; [d1*5352 | ------ ]
+
+    smulbb      r2, r6, r12         ; [ ------ | c1*5352]
+    smultb      r3, r6, r12         ; [c1*5352 | ------ ]
+
+    lsls        r6, r7, #16         ; d1 != 0 ?
+    addeq       r8, r8, r11         ; c1_b*2217+d1_b*5352+12000 + (d==0)
+    addne       r8, r8, r0          ; c1_b*2217+d1_b*5352+12000 + (d!=0)
+    asrs        r6, r7, #16
+    addeq       r9, r9, r11         ; c1_t*2217+d1_t*5352+12000 + (d==0)
+    addne       r9, r9, r0          ; c1_t*2217+d1_t*5352+12000 + (d!=0)
+
+    smlabt      r4, r7, r12, r10    ; [ ------ | d1*2217] + 51000
+    smlatt      r5, r7, r12, r10    ; [d1*2217 | ------ ] + 51000
+
+    pkhtb       r9, r9, r8, asr #16
+
+    sub         r4, r4, r2
+    sub         r5, r5, r3
+
+    ldr         r3, [r1, #4]        ; [i3 | i2]
+
+    pkhtb       r5, r5, r4, asr #16 ; [o13|o12]
+
+    str         r9, [r1, #8]        ; [o5 | 04]
+
+    ldr         r9, [r1, #12]       ; [i7 | i6]
+    ldr         r8, [r1, #28]       ; [i15|i14]
+    ldr         r2, [r1, #20]       ; [i11|i10]
+    str         r5, [r1, #24]       ; [o13|o12]
+
+    qadd16      r4, r3, r8          ; a1 = [i3+i15 | i2+i14]
+    qadd16      r5, r9, r2          ; b1 = [i7+i11 | i6+i10]
+
+    qadd16      r4, r4, lr          ; a1 + 7
+
+    qsub16      r6, r9, r2          ; c1 = [i7-i11 | i6-i10]
+    qadd16      r2, r4, r5          ; a1 + b1 + 7
+    qsub16      r7, r3, r8          ; d1 = [i3-i15 | i2-i14]
+    qsub16      r3, r4, r5          ; a1 - b1 + 7
+
+    lsl         r8, r2, #16         ; prepare bottom halfword for scaling
+    asr         r2, r2, #4          ; scale top halfword
+    lsl         r9, r3, #16         ; prepare bottom halfword for scaling
+    asr         r3, r3, #4          ; scale top halfword
+    pkhtb       r4, r2, r8, asr #20 ; pack and scale bottom halfword
+    pkhtb       r5, r3, r9, asr #20 ; pack and scale bottom halfword
+
+    smulbt      r2, r6, r12         ; [ ------ | c1*2217]
+    str         r4, [r1, #4]        ; [     o3 |      o2]
+    smultt      r3, r6, r12         ; [c1*2217 | ------ ]
+    str         r5, [r1, #20]       ; [    o11 |     o10]
+
+    smlabb      r8, r7, r12, r2     ; [ ------ | d1*5352]
+    smlatb      r9, r7, r12, r3     ; [d1*5352 | ------ ]
+
+    smulbb      r2, r6, r12         ; [ ------ | c1*5352]
+    smultb      r3, r6, r12         ; [c1*5352 | ------ ]
+
+    lsls        r6, r7, #16         ; d1 != 0 ?
+    addeq       r8, r8, r11         ; c1_b*2217+d1_b*5352+12000 + (d==0)
+    addne       r8, r8, r0          ; c1_b*2217+d1_b*5352+12000 + (d!=0)
+
+    asrs        r6, r7, #16
+    addeq       r9, r9, r11         ; c1_t*2217+d1_t*5352+12000 + (d==0)
+    addne       r9, r9, r0          ; c1_t*2217+d1_t*5352+12000 + (d!=0)
+
+    smlabt      r4, r7, r12, r10    ; [ ------ | d1*2217] + 51000
+    smlatt      r5, r7, r12, r10    ; [d1*2217 | ------ ] + 51000
+
+    pkhtb       r9, r9, r8, asr #16
+
+    sub         r4, r4, r2
+    sub         r5, r5, r3
+
+    str         r9, [r1, #12]       ; [o7 | o6]
+    pkhtb       r5, r5, r4, asr #16 ; [o15|o14]
+
+    str         r5, [r1, #28]       ; [o15|o14]
+
+    ldmfd       sp!, {r4 - r12, pc}
+
+    ENDP
+
+; Used constants
+c7500
+    DCD     7500
+c14500
+    DCD     14500
+c0x22a453a0
+    DCD     0x22a453a0
+c0x00080008
+    DCD     0x00080008
+c12000
+    DCD     12000
+c51000
+    DCD     51000
+c0x00070007
+    DCD     0x00070007
+c0x08a914e8
+    DCD     0x08a914e8
+
+    END

diff --git a/vp8/encoder/arm/armv6/vp8_fast_quantize_b_armv6.asm b/vp8/encoder/arm/armv6/vp8_fast_quantize_b_armv6.asm
new file mode 100644
index 0000000..ae2f603
--- /dev/null
+++ b/vp8/encoder/arm/armv6/vp8_fast_quantize_b_armv6.asm

@@ -0,0 +1,224 @@
+;
+;  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    EXPORT  |vp8_fast_quantize_b_armv6|
+
+    INCLUDE asm_enc_offsets.asm
+
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+
+; r0    BLOCK *b
+; r1    BLOCKD *d
+|vp8_fast_quantize_b_armv6| PROC
+    stmfd   sp!, {r1, r4-r11, lr}
+
+    ldr     r3, [r0, #vp8_block_coeff]      ; coeff
+    ldr     r4, [r0, #vp8_block_quant_fast] ; quant_fast
+    ldr     r5, [r0, #vp8_block_round]      ; round
+    ldr     r6, [r1, #vp8_blockd_qcoeff]    ; qcoeff
+    ldr     r7, [r1, #vp8_blockd_dqcoeff]   ; dqcoeff
+    ldr     r8, [r1, #vp8_blockd_dequant]   ; dequant
+
+    ldr     r2, loop_count          ; loop_count=0x1000000. 'lsls' instruction
+                                    ; is used to update the counter so that
+                                    ; it can be used to mark nonzero
+                                    ; quantized coefficient pairs.
+
+    mov     r1, #0                  ; flags for quantized coeffs
+
+    ; PART 1: quantization and dequantization loop
+loop
+    ldr     r9, [r3], #4            ; [z1 | z0]
+    ldr     r10, [r5], #4           ; [r1 | r0]
+    ldr     r11, [r4], #4           ; [q1 | q0]
+
+    ssat16  lr, #1, r9              ; [sz1 | sz0]
+    eor     r9, r9, lr              ; [z1 ^ sz1 | z0 ^ sz0]
+    ssub16  r9, r9, lr              ; x = (z ^ sz) - sz
+    sadd16  r9, r9, r10             ; [x1+r1 | x0+r0]
+
+    ldr     r12, [r3], #4           ; [z3 | z2]
+
+    smulbb  r0, r9, r11             ; [(x0+r0)*q0]
+    smultt  r9, r9, r11             ; [(x1+r1)*q1]
+
+    ldr     r10, [r5], #4           ; [r3 | r2]
+
+    ssat16  r11, #1, r12            ; [sz3 | sz2]
+    eor     r12, r12, r11           ; [z3 ^ sz3 | z2 ^ sz2]
+    pkhtb   r0, r9, r0, asr #16     ; [y1 | y0]
+    ldr     r9, [r4], #4            ; [q3 | q2]
+    ssub16  r12, r12, r11           ; x = (z ^ sz) - sz
+
+    sadd16  r12, r12, r10           ; [x3+r3 | x2+r2]
+
+    eor     r0, r0, lr              ; [(y1 ^ sz1) | (y0 ^ sz0)]
+
+    smulbb  r10, r12, r9            ; [(x2+r2)*q2]
+    smultt  r12, r12, r9            ; [(x3+r3)*q3]
+
+    ssub16  r0, r0, lr              ; x = (y ^ sz) - sz
+
+    cmp     r0, #0                  ; check if zero
+    orrne   r1, r1, r2, lsr #24     ; add flag for nonzero coeffs
+
+    str     r0, [r6], #4            ; *qcoeff++ = x
+    ldr     r9, [r8], #4            ; [dq1 | dq0]
+
+    pkhtb   r10, r12, r10, asr #16  ; [y3 | y2]
+    eor     r10, r10, r11           ; [(y3 ^ sz3) | (y2 ^ sz2)]
+    ssub16  r10, r10, r11           ; x = (y ^ sz) - sz
+
+    cmp     r10, #0                 ; check if zero
+    orrne   r1, r1, r2, lsr #23     ; add flag for nonzero coeffs
+
+    str     r10, [r6], #4           ; *qcoeff++ = x
+    ldr     r11, [r8], #4           ; [dq3 | dq2]
+
+    smulbb  r12, r0, r9             ; [x0*dq0]
+    smultt  r0, r0, r9              ; [x1*dq1]
+
+    smulbb  r9, r10, r11            ; [x2*dq2]
+    smultt  r10, r10, r11           ; [x3*dq3]
+
+    lsls    r2, r2, #2              ; update loop counter
+    strh    r12, [r7, #0]           ; dqcoeff[0] = [x0*dq0]
+    strh    r0, [r7, #2]            ; dqcoeff[1] = [x1*dq1]
+    strh    r9, [r7, #4]            ; dqcoeff[2] = [x2*dq2]
+    strh    r10, [r7, #6]           ; dqcoeff[3] = [x3*dq3]
+    add     r7, r7, #8              ; dqcoeff += 8
+    bne     loop
+
+    ; PART 2: check position for eob...
+    mov     lr, #0                  ; init eob
+    cmp     r1, #0                  ; coeffs after quantization?
+    ldr     r11, [sp, #0]           ; restore BLOCKD pointer
+    beq     end                     ; skip eob calculations if all zero
+
+    ldr     r0, [r11, #vp8_blockd_qcoeff]
+
+    ; check shortcut for nonzero qcoeffs
+    tst    r1, #0x80
+    bne    quant_coeff_15_14
+    tst    r1, #0x20
+    bne    quant_coeff_13_11
+    tst    r1, #0x8
+    bne    quant_coeff_12_7
+    tst    r1, #0x40
+    bne    quant_coeff_10_9
+    tst    r1, #0x10
+    bne    quant_coeff_8_3
+    tst    r1, #0x2
+    bne    quant_coeff_6_5
+    tst    r1, #0x4
+    bne    quant_coeff_4_2
+    b      quant_coeff_1_0
+
+quant_coeff_15_14
+    ldrh    r2, [r0, #30]       ; rc=15, i=15
+    mov     lr, #16
+    cmp     r2, #0
+    bne     end
+
+    ldrh    r3, [r0, #28]       ; rc=14, i=14
+    mov     lr, #15
+    cmp     r3, #0
+    bne     end
+
+quant_coeff_13_11
+    ldrh    r2, [r0, #22]       ; rc=11, i=13
+    mov     lr, #14
+    cmp     r2, #0
+    bne     end
+
+quant_coeff_12_7
+    ldrh    r3, [r0, #14]       ; rc=7,  i=12
+    mov     lr, #13
+    cmp     r3, #0
+    bne     end
+
+    ldrh    r2, [r0, #20]       ; rc=10, i=11
+    mov     lr, #12
+    cmp     r2, #0
+    bne     end
+
+quant_coeff_10_9
+    ldrh    r3, [r0, #26]       ; rc=13, i=10
+    mov     lr, #11
+    cmp     r3, #0
+    bne     end
+
+    ldrh    r2, [r0, #24]       ; rc=12, i=9
+    mov     lr, #10
+    cmp     r2, #0
+    bne     end
+
+quant_coeff_8_3
+    ldrh    r3, [r0, #18]       ; rc=9,  i=8
+    mov     lr, #9
+    cmp     r3, #0
+    bne     end
+
+    ldrh    r2, [r0, #12]       ; rc=6,  i=7
+    mov     lr, #8
+    cmp     r2, #0
+    bne     end
+
+quant_coeff_6_5
+    ldrh    r3, [r0, #6]        ; rc=3,  i=6
+    mov     lr, #7
+    cmp     r3, #0
+    bne     end
+
+    ldrh    r2, [r0, #4]        ; rc=2,  i=5
+    mov     lr, #6
+    cmp     r2, #0
+    bne     end
+
+quant_coeff_4_2
+    ldrh    r3, [r0, #10]       ; rc=5,  i=4
+    mov     lr, #5
+    cmp     r3, #0
+    bne     end
+
+    ldrh    r2, [r0, #16]       ; rc=8,  i=3
+    mov     lr, #4
+    cmp     r2, #0
+    bne     end
+
+    ldrh    r3, [r0, #8]        ; rc=4,  i=2
+    mov     lr, #3
+    cmp     r3, #0
+    bne     end
+
+quant_coeff_1_0
+    ldrh    r2, [r0, #2]        ; rc=1,  i=1
+    mov     lr, #2
+    cmp     r2, #0
+    bne     end
+
+    mov     lr, #1              ; rc=0,  i=0
+
+end
+    str     lr, [r11, #vp8_blockd_eob]
+    ldmfd   sp!, {r1, r4-r11, pc}
+
+    ENDP
+
+loop_count
+    DCD     0x1000000
+
+    END
+

diff --git a/vp8/encoder/arm/armv6/vp8_mse16x16_armv6.asm b/vp8/encoder/arm/armv6/vp8_mse16x16_armv6.asm
new file mode 100644
index 0000000..a9060d7
--- /dev/null
+++ b/vp8/encoder/arm/armv6/vp8_mse16x16_armv6.asm

@@ -0,0 +1,133 @@
+;
+;  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    EXPORT  |vp8_mse16x16_armv6|
+
+    ARM
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+
+; r0    unsigned char *src_ptr
+; r1    int source_stride
+; r2    unsigned char *ref_ptr
+; r3    int  recon_stride
+; stack unsigned int *sse
+;
+;note: Based on vp8_variance16x16_armv6. In this function, sum is never used.
+;      So, we can remove this part of calculation.
+
+|vp8_mse16x16_armv6| PROC
+
+    push    {r4-r9, lr}
+    mov     r12, #16            ; set loop counter to 16 (=block height)
+
+    mov     r4, #0              ; initialize sse = 0
+
+loop
+    ; 1st 4 pixels
+    ldr     r5, [r0, #0x0]      ; load 4 src pixels
+    ldr     r6, [r2, #0x0]      ; load 4 ref pixels
+
+    mov     lr, #0              ; constant zero
+
+    usub8   r8, r5, r6          ; calculate difference
+    sel     r7, r8, lr          ; select bytes with positive difference
+    usub8   r9, r6, r5          ; calculate difference with reversed operands
+    sel     r8, r9, lr          ; select bytes with negative difference
+
+    ; calculate partial sums
+    usad8   r5, r7, lr          ; calculate sum of positive differences
+    usad8   r6, r8, lr          ; calculate sum of negative differences
+    orr     r8, r8, r7          ; differences of all 4 pixels
+
+    ldr     r5, [r0, #0x4]      ; load 4 src pixels
+
+    ; calculate sse
+    uxtb16  r6, r8              ; byte (two pixels) to halfwords
+    uxtb16  r7, r8, ror #8      ; another two pixels to halfwords
+    smlad   r4, r6, r6, r4      ; dual signed multiply, add and accumulate (1)
+
+    ; 2nd 4 pixels
+    ldr     r6, [r2, #0x4]      ; load 4 ref pixels
+    smlad   r4, r7, r7, r4      ; dual signed multiply, add and accumulate (2)
+
+    usub8   r8, r5, r6          ; calculate difference
+    sel     r7, r8, lr          ; select bytes with positive difference
+    usub8   r9, r6, r5          ; calculate difference with reversed operands
+    sel     r8, r9, lr          ; select bytes with negative difference
+
+    ; calculate partial sums
+    usad8   r5, r7, lr          ; calculate sum of positive differences
+    usad8   r6, r8, lr          ; calculate sum of negative differences
+    orr     r8, r8, r7          ; differences of all 4 pixels
+    ldr     r5, [r0, #0x8]      ; load 4 src pixels
+    ; calculate sse
+    uxtb16  r6, r8              ; byte (two pixels) to halfwords
+    uxtb16  r7, r8, ror #8      ; another two pixels to halfwords
+    smlad   r4, r6, r6, r4      ; dual signed multiply, add and accumulate (1)
+
+    ; 3rd 4 pixels
+    ldr     r6, [r2, #0x8]      ; load 4 ref pixels
+    smlad   r4, r7, r7, r4      ; dual signed multiply, add and accumulate (2)
+
+    usub8   r8, r5, r6          ; calculate difference
+    sel     r7, r8, lr          ; select bytes with positive difference
+    usub8   r9, r6, r5          ; calculate difference with reversed operands
+    sel     r8, r9, lr          ; select bytes with negative difference
+
+    ; calculate partial sums
+    usad8   r5, r7, lr          ; calculate sum of positive differences
+    usad8   r6, r8, lr          ; calculate sum of negative differences
+    orr     r8, r8, r7          ; differences of all 4 pixels
+
+    ldr     r5, [r0, #0xc]      ; load 4 src pixels
+
+    ; calculate sse
+    uxtb16  r6, r8              ; byte (two pixels) to halfwords
+    uxtb16  r7, r8, ror #8      ; another two pixels to halfwords
+    smlad   r4, r6, r6, r4      ; dual signed multiply, add and accumulate (1)
+
+    ; 4th 4 pixels
+    ldr     r6, [r2, #0xc]      ; load 4 ref pixels
+    smlad   r4, r7, r7, r4      ; dual signed multiply, add and accumulate (2)
+
+    usub8   r8, r5, r6          ; calculate difference
+    add     r0, r0, r1          ; set src_ptr to next row
+    sel     r7, r8, lr          ; select bytes with positive difference
+    usub8   r9, r6, r5          ; calculate difference with reversed operands
+    add     r2, r2, r3          ; set dst_ptr to next row
+    sel     r8, r9, lr          ; select bytes with negative difference
+
+    ; calculate partial sums
+    usad8   r5, r7, lr          ; calculate sum of positive differences
+    usad8   r6, r8, lr          ; calculate sum of negative differences
+    orr     r8, r8, r7          ; differences of all 4 pixels
+
+    subs    r12, r12, #1        ; next row
+
+    ; calculate sse
+    uxtb16  r6, r8              ; byte (two pixels) to halfwords
+    uxtb16  r7, r8, ror #8      ; another two pixels to halfwords
+    smlad   r4, r6, r6, r4      ; dual signed multiply, add and accumulate (1)
+    smlad   r4, r7, r7, r4      ; dual signed multiply, add and accumulate (2)
+
+    bne     loop
+
+    ; return stuff
+    ldr     r1, [sp, #28]       ; get address of sse
+    mov     r0, r4              ; return sse
+    str     r4, [r1]            ; store sse
+
+    pop     {r4-r9, pc}
+
+    ENDP
+
+    END

diff --git a/vp8/encoder/arm/armv6/vp8_sad16x16_armv6.asm b/vp8/encoder/arm/armv6/vp8_sad16x16_armv6.asm
new file mode 100644
index 0000000..c759f7c
--- /dev/null
+++ b/vp8/encoder/arm/armv6/vp8_sad16x16_armv6.asm

@@ -0,0 +1,84 @@
+;
+;  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    EXPORT  |vp8_sad16x16_armv6|
+
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+
+; r0    const unsigned char *src_ptr
+; r1    int  src_stride
+; r2    const unsigned char *ref_ptr
+; r3    int  ref_stride
+; stack max_sad (not used)
+|vp8_sad16x16_armv6| PROC
+    stmfd   sp!, {r4-r12, lr}
+    mov     r4, #0              ; sad = 0;
+    mov     r5, #8              ; loop count
+
+loop
+    ; 1st row
+    ldr     r6, [r0, #0x0]      ; load 4 src pixels (1A)
+    ldr     r8, [r2, #0x0]      ; load 4 ref pixels (1A)
+    ldr     r7, [r0, #0x4]      ; load 4 src pixels (1A)
+    ldr     r9, [r2, #0x4]      ; load 4 ref pixels (1A)
+    ldr     r10, [r0, #0x8]     ; load 4 src pixels (1B)
+    ldr     r11, [r0, #0xC]     ; load 4 src pixels (1B)
+
+    usada8  r4, r8, r6, r4      ; calculate sad for 4 pixels
+    usad8   r8, r7, r9          ; calculate sad for 4 pixels
+
+    ldr     r12, [r2, #0x8]     ; load 4 ref pixels (1B)
+    ldr     lr, [r2, #0xC]      ; load 4 ref pixels (1B)
+
+    add     r0, r0, r1          ; set src pointer to next row
+    add     r2, r2, r3          ; set dst pointer to next row
+
+    usada8  r4, r10, r12, r4    ; calculate sad for 4 pixels
+    usada8  r8, r11, lr, r8     ; calculate sad for 4 pixels
+
+    ldr     r6, [r0, #0x0]      ; load 4 src pixels (2A)
+    ldr     r7, [r0, #0x4]      ; load 4 src pixels (2A)
+    add     r4, r4, r8          ; add partial sad values
+
+    ; 2nd row
+    ldr     r8, [r2, #0x0]      ; load 4 ref pixels (2A)
+    ldr     r9, [r2, #0x4]      ; load 4 ref pixels (2A)
+    ldr     r10, [r0, #0x8]     ; load 4 src pixels (2B)
+    ldr     r11, [r0, #0xC]     ; load 4 src pixels (2B)
+
+    usada8  r4, r6, r8, r4      ; calculate sad for 4 pixels
+    usad8   r8, r7, r9          ; calculate sad for 4 pixels
+
+    ldr     r12, [r2, #0x8]     ; load 4 ref pixels (2B)
+    ldr     lr, [r2, #0xC]      ; load 4 ref pixels (2B)
+
+    add     r0, r0, r1          ; set src pointer to next row
+    add     r2, r2, r3          ; set dst pointer to next row
+
+    usada8  r4, r10, r12, r4    ; calculate sad for 4 pixels
+    usada8  r8, r11, lr, r8     ; calculate sad for 4 pixels
+
+    subs    r5, r5, #1          ; decrement loop counter
+    add     r4, r4, r8          ; add partial sad values
+
+    bne     loop
+
+    mov     r0, r4              ; return sad
+    ldmfd   sp!, {r4-r12, pc}
+
+    ENDP
+
+    END
+

diff --git a/vp8/encoder/arm/armv6/vp8_variance16x16_armv6.asm b/vp8/encoder/arm/armv6/vp8_variance16x16_armv6.asm
new file mode 100644
index 0000000..8d7258a
--- /dev/null
+++ b/vp8/encoder/arm/armv6/vp8_variance16x16_armv6.asm

@@ -0,0 +1,147 @@
+;
+;  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    EXPORT  |vp8_variance16x16_armv6|
+
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+
+; r0    unsigned char *src_ptr
+; r1    int source_stride
+; r2    unsigned char *ref_ptr
+; r3    int  recon_stride
+; stack unsigned int *sse
+|vp8_variance16x16_armv6| PROC
+
+    stmfd   sp!, {r4-r12, lr}
+    mov     r12, #16            ; set loop counter to 16 (=block height)
+    mov     r8, #0              ; initialize sum = 0
+    mov     r11, #0             ; initialize sse = 0
+
+loop
+    ; 1st 4 pixels
+    ldr     r4, [r0, #0x0]      ; load 4 src pixels
+    ldr     r5, [r2, #0x0]      ; load 4 ref pixels
+
+    mov     lr, #0              ; constant zero
+
+    usub8   r6, r4, r5          ; calculate difference
+    sel     r7, r6, lr          ; select bytes with positive difference
+    usub8   r9, r5, r4          ; calculate difference with reversed operands
+    sel     r6, r9, lr          ; select bytes with negative difference
+
+    ; calculate partial sums
+    usad8   r4, r7, lr          ; calculate sum of positive differences
+    usad8   r5, r6, lr          ; calculate sum of negative differences
+    orr     r6, r6, r7          ; differences of all 4 pixels
+    ; calculate total sum
+    adds    r8, r8, r4          ; add positive differences to sum
+    subs    r8, r8, r5          ; substract negative differences from sum
+
+    ; calculate sse
+    uxtb16  r5, r6              ; byte (two pixels) to halfwords
+    uxtb16  r10, r6, ror #8     ; another two pixels to halfwords
+    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)
+
+    ; 2nd 4 pixels
+    ldr     r4, [r0, #0x4]      ; load 4 src pixels
+    ldr     r5, [r2, #0x4]      ; load 4 ref pixels
+    smlad   r11, r10, r10, r11  ; dual signed multiply, add and accumulate (2)
+
+    usub8   r6, r4, r5          ; calculate difference
+    sel     r7, r6, lr          ; select bytes with positive difference
+    usub8   r9, r5, r4          ; calculate difference with reversed operands
+    sel     r6, r9, lr          ; select bytes with negative difference
+
+    ; calculate partial sums
+    usad8   r4, r7, lr          ; calculate sum of positive differences
+    usad8   r5, r6, lr          ; calculate sum of negative differences
+    orr     r6, r6, r7          ; differences of all 4 pixels
+
+    ; calculate total sum
+    add     r8, r8, r4          ; add positive differences to sum
+    sub     r8, r8, r5          ; substract negative differences from sum
+
+    ; calculate sse
+    uxtb16  r5, r6              ; byte (two pixels) to halfwords
+    uxtb16  r10, r6, ror #8     ; another two pixels to halfwords
+    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)
+
+    ; 3rd 4 pixels
+    ldr     r4, [r0, #0x8]      ; load 4 src pixels
+    ldr     r5, [r2, #0x8]      ; load 4 ref pixels
+    smlad   r11, r10, r10, r11  ; dual signed multiply, add and accumulate (2)
+
+    usub8   r6, r4, r5          ; calculate difference
+    sel     r7, r6, lr          ; select bytes with positive difference
+    usub8   r9, r5, r4          ; calculate difference with reversed operands
+    sel     r6, r9, lr          ; select bytes with negative difference
+
+    ; calculate partial sums
+    usad8   r4, r7, lr          ; calculate sum of positive differences
+    usad8   r5, r6, lr          ; calculate sum of negative differences
+    orr     r6, r6, r7          ; differences of all 4 pixels
+
+    ; calculate total sum
+    add     r8, r8, r4          ; add positive differences to sum
+    sub     r8, r8, r5          ; substract negative differences from sum
+
+    ; calculate sse
+    uxtb16  r5, r6              ; byte (two pixels) to halfwords
+    uxtb16  r10, r6, ror #8     ; another two pixels to halfwords
+    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)
+
+    ; 4th 4 pixels
+    ldr     r4, [r0, #0xc]      ; load 4 src pixels
+    ldr     r5, [r2, #0xc]      ; load 4 ref pixels
+    smlad   r11, r10, r10, r11  ; dual signed multiply, add and accumulate (2)
+
+    usub8   r6, r4, r5          ; calculate difference
+    add     r0, r0, r1          ; set src_ptr to next row
+    sel     r7, r6, lr          ; select bytes with positive difference
+    usub8   r9, r5, r4          ; calculate difference with reversed operands
+    add     r2, r2, r3          ; set dst_ptr to next row
+    sel     r6, r9, lr          ; select bytes with negative difference
+
+    ; calculate partial sums
+    usad8   r4, r7, lr          ; calculate sum of positive differences
+    usad8   r5, r6, lr          ; calculate sum of negative differences
+    orr     r6, r6, r7          ; differences of all 4 pixels
+
+    ; calculate total sum
+    add     r8, r8, r4          ; add positive differences to sum
+    sub     r8, r8, r5          ; substract negative differences from sum
+
+    ; calculate sse
+    uxtb16  r5, r6              ; byte (two pixels) to halfwords
+    uxtb16  r10, r6, ror #8     ; another two pixels to halfwords
+    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)
+    smlad   r11, r10, r10, r11  ; dual signed multiply, add and accumulate (2)
+
+
+    subs    r12, r12, #1
+
+    bne     loop
+
+    ; return stuff
+    ldr     r6, [sp, #0x28]     ; get address of sse
+    mul     r0, r8, r8          ; sum * sum
+    str     r11, [r6]           ; store sse
+    sub     r0, r11, r0, ASR #8 ; return (sse - ((sum * sum) >> 8))
+
+    ldmfd   sp!, {r4-r12, pc}
+
+    ENDP
+
+    END

diff --git a/vp8/encoder/arm/armv6/vp8_variance8x8_armv6.asm b/vp8/encoder/arm/armv6/vp8_variance8x8_armv6.asm
new file mode 100644
index 0000000..7daecb9
--- /dev/null
+++ b/vp8/encoder/arm/armv6/vp8_variance8x8_armv6.asm

@@ -0,0 +1,95 @@
+;
+;  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    EXPORT  |vp8_variance8x8_armv6|
+
+    ARM
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+
+; r0    unsigned char *src_ptr
+; r1    int source_stride
+; r2    unsigned char *ref_ptr
+; r3    int  recon_stride
+; stack unsigned int *sse
+|vp8_variance8x8_armv6| PROC
+
+    push    {r4-r10, lr}
+    mov     r12, #8             ; set loop counter to 8 (=block height)
+    mov     r4, #0              ; initialize sum = 0
+    mov     r5, #0              ; initialize sse = 0
+
+loop
+    ; 1st 4 pixels
+    ldr     r6, [r0, #0x0]      ; load 4 src pixels
+    ldr     r7, [r2, #0x0]      ; load 4 ref pixels
+
+    mov     lr, #0              ; constant zero
+
+    usub8   r8, r6, r7          ; calculate difference
+    sel     r10, r8, lr         ; select bytes with positive difference
+    usub8   r9, r7, r6          ; calculate difference with reversed operands
+    sel     r8, r9, lr          ; select bytes with negative difference
+
+    ; calculate partial sums
+    usad8   r6, r10, lr         ; calculate sum of positive differences
+    usad8   r7, r8, lr          ; calculate sum of negative differences
+    orr     r8, r8, r10         ; differences of all 4 pixels
+    ; calculate total sum
+    add    r4, r4, r6           ; add positive differences to sum
+    sub    r4, r4, r7           ; substract negative differences from sum
+
+    ; calculate sse
+    uxtb16  r7, r8              ; byte (two pixels) to halfwords
+    uxtb16  r10, r8, ror #8     ; another two pixels to halfwords
+    smlad   r5, r7, r7, r5      ; dual signed multiply, add and accumulate (1)
+
+    ; 2nd 4 pixels
+    ldr     r6, [r0, #0x4]      ; load 4 src pixels
+    ldr     r7, [r2, #0x4]      ; load 4 ref pixels
+    smlad   r5, r10, r10, r5    ; dual signed multiply, add and accumulate (2)
+
+    usub8   r8, r6, r7          ; calculate difference
+    add     r0, r0, r1          ; set src_ptr to next row
+    sel     r10, r8, lr         ; select bytes with positive difference
+    usub8   r9, r7, r6          ; calculate difference with reversed operands
+    add     r2, r2, r3          ; set dst_ptr to next row
+    sel     r8, r9, lr          ; select bytes with negative difference
+
+    ; calculate partial sums
+    usad8   r6, r10, lr         ; calculate sum of positive differences
+    usad8   r7, r8, lr          ; calculate sum of negative differences
+    orr     r8, r8, r10         ; differences of all 4 pixels
+
+    ; calculate total sum
+    add     r4, r4, r6          ; add positive differences to sum
+    sub     r4, r4, r7          ; substract negative differences from sum
+
+    ; calculate sse
+    uxtb16  r7, r8              ; byte (two pixels) to halfwords
+    uxtb16  r10, r8, ror #8     ; another two pixels to halfwords
+    smlad   r5, r7, r7, r5      ; dual signed multiply, add and accumulate (1)
+    subs    r12, r12, #1        ; next row
+    smlad   r5, r10, r10, r5    ; dual signed multiply, add and accumulate (2)
+
+    bne     loop
+
+    ; return stuff
+    ldr     r8, [sp, #32]       ; get address of sse
+    mul     r1, r4, r4          ; sum * sum
+    str     r5, [r8]            ; store sse
+    sub     r0, r5, r1, ASR #6  ; return (sse - ((sum * sum) >> 6))
+
+    pop     {r4-r10, pc}
+
+    ENDP
+
+    END

diff --git a/vp8/encoder/arm/boolhuff_arm.c b/vp8/encoder/arm/boolhuff_arm.c
index fe8e70c..9089663 100644
--- a/vp8/encoder/arm/boolhuff_arm.c
+++ b/vp8/encoder/arm/boolhuff_arm.c

@@ -9,8 +9,8 @@
  */
 
 
-#include "boolhuff.h"
-#include "blockd.h"
+#include "vp8/encoder/boolhuff.h"
+#include "vp8/common/blockd.h"
 
 const unsigned int vp8_prob_cost[256] =
 {

diff --git a/vp8/encoder/arm/dct_arm.c b/vp8/encoder/arm/dct_arm.c
new file mode 100644
index 0000000..60d649d
--- /dev/null
+++ b/vp8/encoder/arm/dct_arm.c

@@ -0,0 +1,24 @@
+/*
+ *  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vpx_config.h"
+#include "vp8/encoder/dct.h"
+
+#if HAVE_ARMV6
+
+void vp8_fast_fdct8x4_armv6(short *input, short *output, int pitch)
+{
+    vp8_fast_fdct4x4_armv6(input,   output,    pitch);
+    vp8_fast_fdct4x4_armv6(input + 4, output + 16, pitch);
+}
+
+#endif /* HAVE_ARMV6 */
+
+

diff --git a/vp8/encoder/arm/dct_arm.h b/vp8/encoder/arm/dct_arm.h
index 41fa5d1..769d5f4 100644
--- a/vp8/encoder/arm/dct_arm.h
+++ b/vp8/encoder/arm/dct_arm.h

@@ -14,12 +14,21 @@
 
 #if HAVE_ARMV6
 extern prototype_fdct(vp8_short_walsh4x4_armv6);
+extern prototype_fdct(vp8_fast_fdct4x4_armv6);
+extern prototype_fdct(vp8_fast_fdct8x4_armv6);
 
 #if !CONFIG_RUNTIME_CPU_DETECT
 #undef  vp8_fdct_walsh_short4x4
 #define vp8_fdct_walsh_short4x4 vp8_short_walsh4x4_armv6
+
+#undef  vp8_fdct_fast4x4
+#define vp8_fdct_fast4x4 vp8_fast_fdct4x4_armv6
+
+#undef  vp8_fdct_fast8x4
+#define vp8_fdct_fast8x4 vp8_fast_fdct8x4_armv6
 #endif
-#endif
+
+#endif /* HAVE_ARMV6 */
 
 #if HAVE_ARMV7
 extern prototype_fdct(vp8_short_fdct4x4_neon);

diff --git a/vp8/encoder/arm/encodemb_arm.c b/vp8/encoder/arm/encodemb_arm.c
index cc9e014..88ad3fc 100644
--- a/vp8/encoder/arm/encodemb_arm.c
+++ b/vp8/encoder/arm/encodemb_arm.c

@@ -9,13 +9,13 @@
  */
 
 
-#include "encodemb.h"
-#include "reconinter.h"
-#include "quantize.h"
-#include "invtrans.h"
-#include "recon.h"
-#include "reconintra.h"
-#include "dct.h"
+#include "vp8/encoder/encodemb.h"
+#include "vp8/common/reconinter.h"
+#include "vp8/encoder/quantize.h"
+#include "vp8/common/invtrans.h"
+#include "vp8/common/recon.h"
+#include "vp8/common/reconintra.h"
+#include "vp8/encoder/dct.h"
 #include "vpx_mem/vpx_mem.h"
 
 extern void vp8_subtract_b_neon_func(short *diff, unsigned char *src, unsigned char *pred, int stride, int pitch);

diff --git a/vp8/encoder/arm/picklpf_arm.c b/vp8/encoder/arm/picklpf_arm.c
index b2d8f2b..3fb370c 100644
--- a/vp8/encoder/arm/picklpf_arm.c
+++ b/vp8/encoder/arm/picklpf_arm.c

@@ -9,13 +9,13 @@
  */
 
 
-#include "onyxc_int.h"
-#include "onyx_int.h"
-#include "quantize.h"
+#include "vp8/common/onyxc_int.h"
+#include "vp8/encoder/onyx_int.h"
+#include "vp8/encoder/quantize.h"
 #include "vpx_mem/vpx_mem.h"
 #include "vpx_scale/yv12extend.h"
 #include "vpx_scale/vpxscale.h"
-#include "alloccommon.h"
+#include "vp8/common/alloccommon.h"
 
 extern void vp8_memcpy_neon(unsigned char *dst_ptr, unsigned char *src_ptr, int sz);
 

diff --git a/vp8/encoder/arm/quantize_arm.c b/vp8/encoder/arm/quantize_arm.c
index 225feac..0e3334a 100644
--- a/vp8/encoder/arm/quantize_arm.c
+++ b/vp8/encoder/arm/quantize_arm.c

@@ -12,9 +12,8 @@
 #include <math.h>
 #include "vpx_mem/vpx_mem.h"
 
-#include "quantize.h"
-#include "entropy.h"
-#include "predictdc.h"
+#include "vp8/encoder/quantize.h"
+#include "vp8/common/entropy.h"
 
 DECLARE_ALIGNED(16, const short, vp8_rvsplus1_default_zig_zag1d[16]) =
 {

diff --git a/vp8/encoder/arm/quantize_arm.h b/vp8/encoder/arm/quantize_arm.h
index 5f9155e..0c6adf4 100644
--- a/vp8/encoder/arm/quantize_arm.h
+++ b/vp8/encoder/arm/quantize_arm.h

@@ -12,6 +12,16 @@
 #ifndef QUANTIZE_ARM_H
 #define QUANTIZE_ARM_H
 
+#if HAVE_ARMV6
+
+extern prototype_quantize_block(vp8_fast_quantize_b_armv6);
+
+#undef  vp8_quantize_fastquantb
+#define vp8_quantize_fastquantb vp8_fast_quantize_b_armv6
+
+#endif /* HAVE_ARMV6 */
+
+
 #if HAVE_ARMV7
 extern prototype_quantize_block(vp8_fast_quantize_b_neon);
 

diff --git a/vp8/encoder/arm/variance_arm.c b/vp8/encoder/arm/variance_arm.c
index b40c048..ed1fb16 100644
--- a/vp8/encoder/arm/variance_arm.c
+++ b/vp8/encoder/arm/variance_arm.c

@@ -9,7 +9,103 @@
  */
 
 #include "vpx_config.h"
-#include "variance.h"
+#include "vp8/encoder/variance.h"
+#include "vp8/common/filter.h"
+#include "vp8/common/arm/bilinearfilter_arm.h"
+
+#if HAVE_ARMV6
+
+unsigned int vp8_sub_pixel_variance8x8_armv6
+(
+    const unsigned char  *src_ptr,
+    int  src_pixels_per_line,
+    int  xoffset,
+    int  yoffset,
+    const unsigned char *dst_ptr,
+    int dst_pixels_per_line,
+    unsigned int *sse
+)
+{
+    unsigned short first_pass[10*8];
+    unsigned char  second_pass[8*8];
+    const short *HFilter, *VFilter;
+
+    HFilter = vp8_bilinear_filters[xoffset];
+    VFilter = vp8_bilinear_filters[yoffset];
+
+    vp8_filter_block2d_bil_first_pass_armv6(src_ptr, first_pass,
+                                            src_pixels_per_line,
+                                            9, 8, HFilter);
+    vp8_filter_block2d_bil_second_pass_armv6(first_pass, second_pass,
+                                             8, 8, 8, VFilter);
+
+    return vp8_variance8x8_armv6(second_pass, 8, dst_ptr,
+                                   dst_pixels_per_line, sse);
+}
+
+unsigned int vp8_sub_pixel_variance16x16_armv6
+(
+    const unsigned char  *src_ptr,
+    int  src_pixels_per_line,
+    int  xoffset,
+    int  yoffset,
+    const unsigned char *dst_ptr,
+    int dst_pixels_per_line,
+    unsigned int *sse
+)
+{
+    unsigned short first_pass[36*16];
+    unsigned char  second_pass[20*16];
+    const short *HFilter, *VFilter;
+
+    HFilter = vp8_bilinear_filters[xoffset];
+    VFilter = vp8_bilinear_filters[yoffset];
+
+    vp8_filter_block2d_bil_first_pass_armv6(src_ptr, first_pass,
+                                            src_pixels_per_line,
+                                            17, 16, HFilter);
+    vp8_filter_block2d_bil_second_pass_armv6(first_pass, second_pass,
+                                             16, 16, 16, VFilter);
+
+    return vp8_variance16x16_armv6(second_pass, 16, dst_ptr,
+                                   dst_pixels_per_line, sse);
+}
+
+unsigned int vp8_variance_halfpixvar16x16_h_armv6(
+    const unsigned char *src_ptr,
+    int  source_stride,
+    const unsigned char *ref_ptr,
+    int  recon_stride,
+    unsigned int *sse)
+{
+    return vp8_sub_pixel_variance16x16_armv6(src_ptr, source_stride, 4, 0,
+                                         ref_ptr, recon_stride, sse);
+}
+
+unsigned int vp8_variance_halfpixvar16x16_v_armv6(
+    const unsigned char *src_ptr,
+    int  source_stride,
+    const unsigned char *ref_ptr,
+    int  recon_stride,
+    unsigned int *sse)
+{
+    return vp8_sub_pixel_variance16x16_armv6(src_ptr, source_stride, 0, 4,
+                                         ref_ptr, recon_stride, sse);
+}
+
+unsigned int vp8_variance_halfpixvar16x16_hv_armv6(
+    const unsigned char *src_ptr,
+    int  source_stride,
+    const unsigned char *ref_ptr,
+    int  recon_stride,
+    unsigned int *sse)
+{
+    return vp8_sub_pixel_variance16x16_armv6(src_ptr, source_stride, 4, 4,
+                                         ref_ptr, recon_stride, sse);
+}
+
+#endif /* HAVE_ARMV6 */
+
 
 #if HAVE_ARMV7
 

diff --git a/vp8/encoder/arm/variance_arm.h b/vp8/encoder/arm/variance_arm.h
index 3cbacfa..86de274 100644
--- a/vp8/encoder/arm/variance_arm.h
+++ b/vp8/encoder/arm/variance_arm.h

@@ -12,6 +12,52 @@
 #ifndef VARIANCE_ARM_H
 #define VARIANCE_ARM_H
 
+#if HAVE_ARMV6
+
+extern prototype_sad(vp8_sad16x16_armv6);
+extern prototype_variance(vp8_variance16x16_armv6);
+extern prototype_variance(vp8_variance8x8_armv6);
+extern prototype_subpixvariance(vp8_sub_pixel_variance16x16_armv6);
+extern prototype_subpixvariance(vp8_sub_pixel_variance8x8_armv6);
+extern prototype_variance(vp8_variance_halfpixvar16x16_h_armv6);
+extern prototype_variance(vp8_variance_halfpixvar16x16_v_armv6);
+extern prototype_variance(vp8_variance_halfpixvar16x16_hv_armv6);
+extern prototype_variance(vp8_mse16x16_armv6);
+
+#if !CONFIG_RUNTIME_CPU_DETECT
+
+#undef  vp8_variance_sad16x16
+#define vp8_variance_sad16x16 vp8_sad16x16_armv6
+
+#undef  vp8_variance_subpixvar16x16
+#define vp8_variance_subpixvar16x16 vp8_sub_pixel_variance16x16_armv6
+
+#undef  vp8_variance_subpixvar8x8
+#define vp8_variance_subpixvar8x8 vp8_sub_pixel_variance8x8_armv6
+
+#undef  vp8_variance_var16x16
+#define vp8_variance_var16x16 vp8_variance16x16_armv6
+
+#undef  vp8_variance_mse16x16
+#define vp8_variance_mse16x16 vp8_mse16x16_armv6
+
+#undef  vp8_variance_var8x8
+#define vp8_variance_var8x8 vp8_variance8x8_armv6
+
+#undef  vp8_variance_halfpixvar16x16_h
+#define vp8_variance_halfpixvar16x16_h vp8_variance_halfpixvar16x16_h_armv6
+
+#undef  vp8_variance_halfpixvar16x16_v
+#define vp8_variance_halfpixvar16x16_v vp8_variance_halfpixvar16x16_v_armv6
+
+#undef  vp8_variance_halfpixvar16x16_hv
+#define vp8_variance_halfpixvar16x16_hv vp8_variance_halfpixvar16x16_hv_armv6
+
+#endif /* !CONFIG_RUNTIME_CPU_DETECT */
+
+#endif /* HAVE_ARMV6 */
+
+
 #if HAVE_ARMV7
 extern prototype_sad(vp8_sad4x4_neon);
 extern prototype_sad(vp8_sad8x8_neon);

diff --git a/vp8/encoder/arm/vpx_vp8_enc_asm_offsets.c b/vp8/encoder/arm/vpx_vp8_enc_asm_offsets.c
deleted file mode 100644
index 4703a84..0000000
--- a/vp8/encoder/arm/vpx_vp8_enc_asm_offsets.c
+++ /dev/null

@@ -1,77 +0,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "vpx_ports/config.h"
-#include <stddef.h>
-
-#include "../treewriter.h"
-#include "../tokenize.h"
-#include "../onyx_int.h"
-
-#define ct_assert(name,cond) \
-    static void assert_##name(void) UNUSED;\
-    static void assert_##name(void) {switch(0){case 0:case !!(cond):;}}
-
-#define DEFINE(sym, val) int sym = val;
-
-/*
-#define BLANK() asm volatile("\n->" : : )
-*/
-
-/*
- * int main(void)
- * {
- */
-
-DEFINE(vp8_writer_lowvalue,                     offsetof(vp8_writer, lowvalue));
-DEFINE(vp8_writer_range,                        offsetof(vp8_writer, range));
-DEFINE(vp8_writer_value,                        offsetof(vp8_writer, value));
-DEFINE(vp8_writer_count,                        offsetof(vp8_writer, count));
-DEFINE(vp8_writer_pos,                          offsetof(vp8_writer, pos));
-DEFINE(vp8_writer_buffer,                       offsetof(vp8_writer, buffer));
-
-DEFINE(tokenextra_token,                        offsetof(TOKENEXTRA, Token));
-DEFINE(tokenextra_extra,                        offsetof(TOKENEXTRA, Extra));
-DEFINE(tokenextra_context_tree,                  offsetof(TOKENEXTRA, context_tree));
-DEFINE(tokenextra_skip_eob_node,                offsetof(TOKENEXTRA, skip_eob_node));
-DEFINE(TOKENEXTRA_SZ,                           sizeof(TOKENEXTRA));
-
-DEFINE(vp8_extra_bit_struct_sz,                   sizeof(vp8_extra_bit_struct));
-
-DEFINE(vp8_token_value,                         offsetof(vp8_token, value));
-DEFINE(vp8_token_len,                           offsetof(vp8_token, Len));
-
-DEFINE(vp8_extra_bit_struct_tree,                 offsetof(vp8_extra_bit_struct, tree));
-DEFINE(vp8_extra_bit_struct_prob,                 offsetof(vp8_extra_bit_struct, prob));
-DEFINE(vp8_extra_bit_struct_len,                  offsetof(vp8_extra_bit_struct, Len));
-DEFINE(vp8_extra_bit_struct_base_val,              offsetof(vp8_extra_bit_struct, base_val));
-
-DEFINE(vp8_comp_tplist,                         offsetof(VP8_COMP, tplist));
-DEFINE(vp8_comp_common,                         offsetof(VP8_COMP, common));
-DEFINE(vp8_comp_bc2,                            offsetof(VP8_COMP, bc2));
-
-DEFINE(tokenlist_start,                         offsetof(TOKENLIST, start));
-DEFINE(tokenlist_stop,                          offsetof(TOKENLIST, stop));
-DEFINE(TOKENLIST_SZ,                            sizeof(TOKENLIST));
-
-DEFINE(vp8_common_mb_rows,                       offsetof(VP8_COMMON, mb_rows));
-
-// These two sizes are used in vp7cx_pack_tokens.  They are hard coded
-//  so if the size changes this will have to be adjusted.
-ct_assert(TOKENEXTRA_SZ, sizeof(TOKENEXTRA) == 8)
-ct_assert(vp8_extra_bit_struct_sz, sizeof(vp8_extra_bit_struct) == 16)
-
-//add asserts for any offset that is not supported by assembly code
-//add asserts for any size that is not supported by assembly code
-/*
- * return 0;
- * }
- */

diff --git a/vp8/encoder/asm_enc_offsets.c b/vp8/encoder/asm_enc_offsets.c
new file mode 100644
index 0000000..c7983c1
--- /dev/null
+++ b/vp8/encoder/asm_enc_offsets.c

@@ -0,0 +1,97 @@
+/*
+ *  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vpx_ports/config.h"
+#include <stddef.h>
+
+#include "block.h"
+#include "vp8/common/blockd.h"
+#include "onyx_int.h"
+#include "treewriter.h"
+#include "tokenize.h"
+
+#define ct_assert(name,cond) \
+    static void assert_##name(void) UNUSED;\
+    static void assert_##name(void) {switch(0){case 0:case !!(cond):;}}
+
+#define DEFINE(sym, val) int sym = val;
+
+/*
+#define BLANK() asm volatile("\n->" : : )
+*/
+
+/*
+ * int main(void)
+ * {
+ */
+
+//regular quantize
+DEFINE(vp8_block_coeff,                         offsetof(BLOCK, coeff));
+DEFINE(vp8_block_zbin,                          offsetof(BLOCK, zbin));
+DEFINE(vp8_block_round,                         offsetof(BLOCK, round));
+DEFINE(vp8_block_quant,                         offsetof(BLOCK, quant));
+DEFINE(vp8_block_quant_fast,                    offsetof(BLOCK, quant_fast));
+DEFINE(vp8_block_zbin_extra,                    offsetof(BLOCK, zbin_extra));
+DEFINE(vp8_block_zrun_zbin_boost,               offsetof(BLOCK, zrun_zbin_boost));
+DEFINE(vp8_block_quant_shift,                   offsetof(BLOCK, quant_shift));
+
+DEFINE(vp8_blockd_qcoeff,                       offsetof(BLOCKD, qcoeff));
+DEFINE(vp8_blockd_dequant,                      offsetof(BLOCKD, dequant));
+DEFINE(vp8_blockd_dqcoeff,                      offsetof(BLOCKD, dqcoeff));
+DEFINE(vp8_blockd_eob,                          offsetof(BLOCKD, eob));
+
+//pack tokens
+DEFINE(vp8_writer_lowvalue,                     offsetof(vp8_writer, lowvalue));
+DEFINE(vp8_writer_range,                        offsetof(vp8_writer, range));
+DEFINE(vp8_writer_value,                        offsetof(vp8_writer, value));
+DEFINE(vp8_writer_count,                        offsetof(vp8_writer, count));
+DEFINE(vp8_writer_pos,                          offsetof(vp8_writer, pos));
+DEFINE(vp8_writer_buffer,                       offsetof(vp8_writer, buffer));
+
+DEFINE(tokenextra_token,                        offsetof(TOKENEXTRA, Token));
+DEFINE(tokenextra_extra,                        offsetof(TOKENEXTRA, Extra));
+DEFINE(tokenextra_context_tree,                 offsetof(TOKENEXTRA, context_tree));
+DEFINE(tokenextra_skip_eob_node,                offsetof(TOKENEXTRA, skip_eob_node));
+DEFINE(TOKENEXTRA_SZ,                           sizeof(TOKENEXTRA));
+
+DEFINE(vp8_extra_bit_struct_sz,                 sizeof(vp8_extra_bit_struct));
+
+DEFINE(vp8_token_value,                         offsetof(vp8_token, value));
+DEFINE(vp8_token_len,                           offsetof(vp8_token, Len));
+
+DEFINE(vp8_extra_bit_struct_tree,               offsetof(vp8_extra_bit_struct, tree));
+DEFINE(vp8_extra_bit_struct_prob,               offsetof(vp8_extra_bit_struct, prob));
+DEFINE(vp8_extra_bit_struct_len,                offsetof(vp8_extra_bit_struct, Len));
+DEFINE(vp8_extra_bit_struct_base_val,           offsetof(vp8_extra_bit_struct, base_val));
+
+DEFINE(vp8_comp_tplist,                         offsetof(VP8_COMP, tplist));
+DEFINE(vp8_comp_common,                         offsetof(VP8_COMP, common));
+DEFINE(vp8_comp_bc2,                            offsetof(VP8_COMP, bc2));
+
+DEFINE(tokenlist_start,                         offsetof(TOKENLIST, start));
+DEFINE(tokenlist_stop,                          offsetof(TOKENLIST, stop));
+DEFINE(TOKENLIST_SZ,                            sizeof(TOKENLIST));
+
+DEFINE(vp8_common_mb_rows,                      offsetof(VP8_COMMON, mb_rows));
+
+// These two sizes are used in vp8cx_pack_tokens.  They are hard coded
+// so if the size changes this will have to be adjusted.
+#if HAVE_ARMV5TE
+ct_assert(TOKENEXTRA_SZ, sizeof(TOKENEXTRA) == 8)
+ct_assert(vp8_extra_bit_struct_sz, sizeof(vp8_extra_bit_struct) == 16)
+#endif
+
+//add asserts for any offset that is not supported by assembly code
+//add asserts for any size that is not supported by assembly code
+/*
+ * return 0;
+ * }
+ */

diff --git a/vp8/encoder/bitstream.c b/vp8/encoder/bitstream.c
index 412542d..adbd106 100644
--- a/vp8/encoder/bitstream.c
+++ b/vp8/encoder/bitstream.c

@@ -9,15 +9,15 @@
  */
 
 
-#include "header.h"
+#include "vp8/common/header.h"
 #include "encodemv.h"
-#include "entropymode.h"
-#include "findnearmv.h"
+#include "vp8/common/entropymode.h"
+#include "vp8/common/findnearmv.h"
 #include "mcomp.h"
-#include "systemdependent.h"
+#include "vp8/common/systemdependent.h"
 #include <assert.h>
 #include <stdio.h>
-#include "pragmas.h"
+#include "vp8/common/pragmas.h"
 #include "vpx_mem/vpx_mem.h"
 #include "bitstream.h"
 
@@ -58,16 +58,6 @@
 int count_mb_seg[4] = { 0, 0, 0, 0 };
 #endif
 
-#if CONFIG_BIG_ENDIAN
-# define make_endian_16(a)  \
-    (((unsigned int)(a & 0xff)) << 8) | (((unsigned int)(a & 0xff00)) >> 8)
-# define make_endian_32(a)                              \
-    (((unsigned int)(a & 0xff)) << 24)    | (((unsigned int)(a & 0xff00)) << 8) |   \
-    (((unsigned int)(a & 0xff0000)) >> 8) | (((unsigned int)(a & 0xff000000)) >> 24)
-#else
-# define make_endian_16(a)  a
-# define make_endian_32(a)  a
-#endif
 
 static void update_mode(
     vp8_writer *const w,
@@ -1376,6 +1366,7 @@
     oh.show_frame = (int) pc->show_frame;
     oh.type = (int)pc->frame_type;
     oh.version = pc->version;
+    oh.first_partition_length_in_bytes = 0;
 
     mb_feature_data_bits = vp8_mb_feature_data_bits;
     cx_data += 3;
@@ -1392,13 +1383,20 @@
     // every keyframe send startcode, width, height, scale factor, clamp and color type
     if (oh.type == KEY_FRAME)
     {
+        int v;
+
         // Start / synch code
         cx_data[0] = 0x9D;
         cx_data[1] = 0x01;
         cx_data[2] = 0x2a;
 
-        *((unsigned short *)(cx_data + 3)) = make_endian_16((pc->horiz_scale << 14) | pc->Width);
-        *((unsigned short *)(cx_data + 5)) = make_endian_16((pc->vert_scale << 14) | pc->Height);
+        v = (pc->horiz_scale << 14) | pc->Width;
+        cx_data[3] = v;
+        cx_data[4] = v >> 8;
+
+        v = (pc->vert_scale << 14) | pc->Height;
+        cx_data[5] = v;
+        cx_data[6] = v >> 8;
 
         extra_bytes_packed = 7;
         cx_data += extra_bytes_packed ;
@@ -1637,6 +1635,21 @@
 
     vp8_stop_encode(bc);
 
+    oh.first_partition_length_in_bytes = cpi->bc.pos;
+
+    /* update frame tag */
+    {
+        int v = (oh.first_partition_length_in_bytes << 5) |
+                (oh.show_frame << 4) |
+                (oh.version << 1) |
+                oh.type;
+
+        dest[0] = v;
+        dest[1] = v >> 8;
+        dest[2] = v >> 16;
+    }
+
+    *size = VP8_HEADER_SIZE + extra_bytes_packed + cpi->bc.pos;
 
     if (pc->multi_token_partition != ONE_PARTITION)
     {
@@ -1646,37 +1659,23 @@
 
         pack_tokens_into_partitions(cpi, cx_data + bc->pos, num_part, &asize);
 
-        oh.first_partition_length_in_bytes = cpi->bc.pos;
-
-        *size = cpi->bc.pos + VP8_HEADER_SIZE + asize + extra_bytes_packed;
+        *size += asize;
     }
     else
     {
         vp8_start_encode(&cpi->bc2, cx_data + bc->pos);
 
-        if (!cpi->b_multi_threaded)
-            pack_tokens(&cpi->bc2, cpi->tok, cpi->tok_count);
-        else
+#if CONFIG_MULTITHREAD
+        if (cpi->b_multi_threaded)
             pack_mb_row_tokens(cpi, &cpi->bc2);
+        else
+#endif
+            pack_tokens(&cpi->bc2, cpi->tok, cpi->tok_count);
 
         vp8_stop_encode(&cpi->bc2);
-        oh.first_partition_length_in_bytes = cpi->bc.pos ;
-        *size = cpi->bc2.pos + cpi->bc.pos + VP8_HEADER_SIZE + extra_bytes_packed;
-    }
 
-#if CONFIG_BIG_ENDIAN
-    {
-        int v = (oh.first_partition_length_in_bytes << 5) |
-                (oh.show_frame << 4) |
-                (oh.version << 1) |
-                oh.type;
-
-        v = make_endian_32(v);
-        vpx_memcpy(dest, &v, 3);
+        *size += cpi->bc2.pos;
     }
-#else
-    vpx_memcpy(dest, &oh, 3);
-#endif
 }
 
 #ifdef ENTROPY_STATS

diff --git a/vp8/encoder/block.h b/vp8/encoder/block.h
index 3ad40ef..2fd6782 100644
--- a/vp8/encoder/block.h
+++ b/vp8/encoder/block.h

@@ -12,10 +12,10 @@
 #ifndef __INC_BLOCK_H
 #define __INC_BLOCK_H
 
-#include "onyx.h"
-#include "blockd.h"
-#include "entropymv.h"
-#include "entropy.h"
+#include "vp8/common/onyx.h"
+#include "vp8/common/blockd.h"
+#include "vp8/common/entropymv.h"
+#include "vp8/common/entropy.h"
 #include "vpx_ports/mem.h"
 
 // motion search site

diff --git a/vp8/encoder/boolhuff.c b/vp8/encoder/boolhuff.c
index 82006b1..788d2b0 100644
--- a/vp8/encoder/boolhuff.c
+++ b/vp8/encoder/boolhuff.c

@@ -10,7 +10,7 @@
 
 
 #include "boolhuff.h"
-#include "blockd.h"
+#include "vp8/common/blockd.h"
 
 
 

diff --git a/vp8/encoder/encodeframe.c b/vp8/encoder/encodeframe.c
index d535894..fa01d20 100644
--- a/vp8/encoder/encodeframe.c
+++ b/vp8/encoder/encodeframe.c

@@ -12,22 +12,22 @@
 #include "vpx_ports/config.h"
 #include "encodemb.h"
 #include "encodemv.h"
-#include "common.h"
+#include "vp8/common/common.h"
 #include "onyx_int.h"
-#include "extend.h"
-#include "entropymode.h"
-#include "quant_common.h"
+#include "vp8/common/extend.h"
+#include "vp8/common/entropymode.h"
+#include "vp8/common/quant_common.h"
 #include "segmentation.h"
-#include "setupintrarecon.h"
+#include "vp8/common/setupintrarecon.h"
 #include "encodeintra.h"
-#include "reconinter.h"
+#include "vp8/common/reconinter.h"
 #include "rdopt.h"
 #include "pickinter.h"
-#include "findnearmv.h"
-#include "reconintra.h"
+#include "vp8/common/findnearmv.h"
+#include "vp8/common/reconintra.h"
 #include <stdio.h>
 #include <limits.h>
-#include "subpixel.h"
+#include "vp8/common/subpixel.h"
 #include "vpx_ports/vpx_timer.h"
 
 #if CONFIG_RUNTIME_CPU_DETECT
@@ -399,13 +399,6 @@
     // Clear Zbin mode boost for default case
     cpi->zbin_mode_boost = 0;
 
-    // vp8cx_init_quantizer() is first called in vp8_create_compressor(). A check is added here so that vp8cx_init_quantizer() is only called
-    // when these values are not all zero.
-    if (cpi->common.y1dc_delta_q | cpi->common.y2dc_delta_q | cpi->common.uvdc_delta_q | cpi->common.y2ac_delta_q | cpi->common.uvac_delta_q)
-    {
-        vp8cx_init_quantizer(cpi);
-    }
-
     // MB level quantizer setup
     vp8cx_mb_init_quantizer(cpi, &cpi->mb);
 }
@@ -807,34 +800,15 @@
         struct vpx_usec_timer  emr_timer;
         vpx_usec_timer_start(&emr_timer);
 
-        if (!cpi->b_multi_threaded)
-        {
-            // for each macroblock row in image
-            for (mb_row = 0; mb_row < cm->mb_rows; mb_row++)
-            {
-
-                vp8_zero(cm->left_context)
-
-                encode_mb_row(cpi, cm, mb_row, x, xd, &tp, segment_counts, &totalrate);
-
-                // adjust to the next row of mbs
-                x->src.y_buffer += 16 * x->src.y_stride - 16 * cm->mb_cols;
-                x->src.u_buffer += 8 * x->src.uv_stride - 8 * cm->mb_cols;
-                x->src.v_buffer += 8 * x->src.uv_stride - 8 * cm->mb_cols;
-            }
-
-            cpi->tok_count = tp - cpi->tok;
-
-        }
-        else
-        {
 #if CONFIG_MULTITHREAD
+        if (cpi->b_multi_threaded)
+        {
             int i;
 
             vp8cx_init_mbrthread_data(cpi, x, cpi->mb_row_ei, 1,  cpi->encoding_thread_count);
 
             for (i = 0; i < cm->mb_rows; i++)
-                cpi->mt_current_mb_col[i] = 0;
+                cpi->mt_current_mb_col[i] = -1;
 
             for (i = 0; i < cpi->encoding_thread_count; i++)
             {
@@ -893,7 +867,25 @@
                 x->activity_sum += cpi->mb_row_ei[i].mb.activity_sum;
             }
 
+        }
+        else
 #endif
+        {
+            // for each macroblock row in image
+            for (mb_row = 0; mb_row < cm->mb_rows; mb_row++)
+            {
+
+                vp8_zero(cm->left_context)
+
+                encode_mb_row(cpi, cm, mb_row, x, xd, &tp, segment_counts, &totalrate);
+
+                // adjust to the next row of mbs
+                x->src.y_buffer += 16 * x->src.y_stride - 16 * cm->mb_cols;
+                x->src.u_buffer += 8 * x->src.uv_stride - 8 * cm->mb_cols;
+                x->src.v_buffer += 8 * x->src.uv_stride - 8 * cm->mb_cols;
+            }
+
+            cpi->tok_count = tp - cpi->tok;
 
         }
 
@@ -1173,7 +1165,7 @@
 
         Error16x16 = vp8_rd_pick_intra16x16mby_mode(cpi, x, &rate16x16, &rate16x16_tokenonly, &dist16x16);
 
-        Error4x4 = vp8_rd_pick_intra4x4mby_modes(cpi, x, &rate4x4, &rate4x4_tokenonly, &dist4x4);
+        Error4x4 = vp8_rd_pick_intra4x4mby_modes(cpi, x, &rate4x4, &rate4x4_tokenonly, &dist4x4, Error16x16);
 
         rate += (Error4x4 < Error16x16) ? rate4x4 : rate16x16;
     }
@@ -1192,7 +1184,8 @@
             int distortion2;
 
             x->e_mbd.mode_info_context->mbmi.mode = mode;
-            vp8_build_intra_predictors_mby_ptr(&x->e_mbd);
+            RECON_INVOKE(&cpi->common.rtcd.recon, build_intra_predictors_mby)
+                (&x->e_mbd);
             distortion2 = VARIANCE_INVOKE(&cpi->rtcd.variance, get16x16prederror)(x->src.y_buffer, x->src.y_stride, x->e_mbd.predictor, 16, 0x7fffffff);
             rate2  = x->mbmode_cost[x->e_mbd.frame_type][mode];
             this_rd = RD_ESTIMATE(x->rdmult, x->rddiv, rate2, distortion2);

diff --git a/vp8/encoder/encodeintra.c b/vp8/encoder/encodeintra.c
index 9163b42..4400006 100644
--- a/vp8/encoder/encodeintra.c
+++ b/vp8/encoder/encodeintra.c

@@ -10,34 +10,21 @@
 
 
 #include "vpx_ports/config.h"
-#include "idct.h"
+#include "vp8/common/idct.h"
 #include "quantize.h"
-#include "reconintra.h"
-#include "reconintra4x4.h"
+#include "vp8/common/reconintra.h"
+#include "vp8/common/reconintra4x4.h"
 #include "encodemb.h"
-#include "invtrans.h"
-#include "recon.h"
+#include "vp8/common/invtrans.h"
+#include "vp8/common/recon.h"
 #include "dct.h"
-#include "g_common.h"
+#include "vp8/common/g_common.h"
 #include "encodeintra.h"
 
 #define intra4x4ibias_rate    128
 #define intra4x4pbias_rate    256
 
 
-void vp8_update_mode_context(int *abmode, int *lbmode, int i, int best_mode)
-{
-    if (i < 12)
-    {
-        abmode[i+4] = best_mode;
-    }
-
-    if ((i & 3) != 3)
-    {
-        lbmode[i+1] = best_mode;
-    }
-
-}
 #if CONFIG_RUNTIME_CPU_DETECT
 #define IF_RTCD(x) (x)
 #else
@@ -58,21 +45,6 @@
     RECON_INVOKE(&rtcd->common->recon, recon)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
 }
 
-void vp8_encode_intra4x4block_rd(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x, BLOCK *be, BLOCKD *b, int best_mode)
-{
-    vp8_predict_intra4x4(b, best_mode, b->predictor);
-
-    ENCODEMB_INVOKE(&rtcd->encodemb, subb)(be, b, 16);
-
-    x->vp8_short_fdct4x4(be->src_diff, be->coeff, 32);
-
-    x->quantize_b(be, b);
-
-    IDCT_INVOKE(&rtcd->common->idct, idct16)(b->dqcoeff, b->diff, 32);
-
-    RECON_INVOKE(&rtcd->common->recon, recon)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
-}
-
 void vp8_encode_intra4x4mby(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *mb)
 {
     int i;
@@ -95,7 +67,7 @@
 {
     int b;
 
-    vp8_build_intra_predictors_mby_ptr(&x->e_mbd);
+    RECON_INVOKE(&rtcd->common->recon, build_intra_predictors_mby)(&x->e_mbd);
 
     ENCODEMB_INVOKE(&rtcd->encodemb, submby)(x->src_diff, x->src.y_buffer, x->e_mbd.predictor, x->src.y_stride);
 
@@ -168,17 +140,3 @@
     vp8_recon_intra_mbuv(IF_RTCD(&rtcd->common->recon), &x->e_mbd);
 }
 
-void vp8_encode_intra16x16mbuvrd(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x)
-{
-    vp8_build_intra_predictors_mbuv(&x->e_mbd);
-
-    ENCODEMB_INVOKE(&rtcd->encodemb, submbuv)(x->src_diff, x->src.u_buffer, x->src.v_buffer, x->e_mbd.predictor, x->src.uv_stride);
-
-    vp8_transform_mbuv(x);
-
-    vp8_quantize_mbuv(x);
-
-    vp8_inverse_transform_mbuv(IF_RTCD(&rtcd->common->idct), &x->e_mbd);
-
-    vp8_recon_intra_mbuv(IF_RTCD(&rtcd->common->recon), &x->e_mbd);
-}

diff --git a/vp8/encoder/encodeintra.h b/vp8/encoder/encodeintra.h
index c0247b0..40930bc 100644
--- a/vp8/encoder/encodeintra.h
+++ b/vp8/encoder/encodeintra.h

@@ -17,8 +17,6 @@
 void vp8_encode_intra16x16mbuv(const VP8_ENCODER_RTCD *, MACROBLOCK *x);
 void vp8_encode_intra4x4mby(const VP8_ENCODER_RTCD *, MACROBLOCK *mb);
 void vp8_encode_intra4x4block(const VP8_ENCODER_RTCD *, MACROBLOCK *x, BLOCK *be, BLOCKD *b, int best_mode);
-void vp8_update_mode_context(int *abmode, int *lbmode, int i, int best_mode);
 void vp8_encode_intra4x4block_rd(const VP8_ENCODER_RTCD *, MACROBLOCK *x, BLOCK *be, BLOCKD *b, int best_mode);
-void vp8_encode_intra16x16mbuvrd(const VP8_ENCODER_RTCD *, MACROBLOCK *x);
 
 #endif

diff --git a/vp8/encoder/encodemb.c b/vp8/encoder/encodemb.c
index efcea74..463dbca 100644
--- a/vp8/encoder/encodemb.c
+++ b/vp8/encoder/encodemb.c

@@ -11,12 +11,12 @@
 
 #include "vpx_ports/config.h"
 #include "encodemb.h"
-#include "reconinter.h"
+#include "vp8/common/reconinter.h"
 #include "quantize.h"
 #include "tokenize.h"
-#include "invtrans.h"
-#include "recon.h"
-#include "reconintra.h"
+#include "vp8/common/invtrans.h"
+#include "vp8/common/recon.h"
+#include "vp8/common/reconintra.h"
 #include "dct.h"
 #include "vpx_mem/vpx_mem.h"
 
@@ -104,7 +104,7 @@
     ENCODEMB_INVOKE(&rtcd->encodemb, submbuv)(x->src_diff, x->src.u_buffer, x->src.v_buffer, x->e_mbd.predictor, x->src.uv_stride);
 }
 
-void vp8_build_dcblock(MACROBLOCK *x)
+static void build_dcblock(MACROBLOCK *x)
 {
     short *src_diff_ptr = &x->src_diff[384];
     int i;
@@ -138,7 +138,7 @@
     }
 
     // build dc block from 16 y dc values
-    vp8_build_dcblock(x);
+    build_dcblock(x);
 
     // do 2nd order transform on the dc block
     x->short_walsh4x4(&x->block[24].src_diff[0],
@@ -147,7 +147,7 @@
 }
 
 
-void vp8_transform_mb(MACROBLOCK *x)
+static void transform_mb(MACROBLOCK *x)
 {
     int i;
 
@@ -159,7 +159,7 @@
 
     // build dc block from 16 y dc values
     if (x->e_mbd.mode_info_context->mbmi.mode != SPLITMV)
-        vp8_build_dcblock(x);
+        build_dcblock(x);
 
     for (i = 16; i < 24; i += 2)
     {
@@ -174,7 +174,8 @@
 
 }
 
-void vp8_transform_mby(MACROBLOCK *x)
+
+static void transform_mby(MACROBLOCK *x)
 {
     int i;
 
@@ -187,7 +188,7 @@
     // build dc block from 16 y dc values
     if (x->e_mbd.mode_info_context->mbmi.mode != SPLITMV)
     {
-        vp8_build_dcblock(x);
+        build_dcblock(x);
         x->short_walsh4x4(&x->block[24].src_diff[0],
             &x->block[24].coeff[0], 8);
     }
@@ -255,9 +256,9 @@
     Y1_RD_MULT
 };
 
-void vp8_optimize_b(MACROBLOCK *mb, int ib, int type,
-                    ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l,
-                    const VP8_ENCODER_RTCD *rtcd)
+static void optimize_b(MACROBLOCK *mb, int ib, int type,
+                       ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l,
+                       const VP8_ENCODER_RTCD *rtcd)
 {
     BLOCK *b;
     BLOCKD *d;
@@ -501,7 +502,7 @@
     *a = *l = (d->eob != !type);
 }
 
-void vp8_optimize_mb(MACROBLOCK *x, const VP8_ENCODER_RTCD *rtcd)
+static void optimize_mb(MACROBLOCK *x, const VP8_ENCODER_RTCD *rtcd)
 {
     int b;
     int type;
@@ -518,31 +519,24 @@
 
     has_2nd_order = (x->e_mbd.mode_info_context->mbmi.mode != B_PRED
         && x->e_mbd.mode_info_context->mbmi.mode != SPLITMV);
-    type = has_2nd_order ? 0 : 3;
+    type = has_2nd_order ? PLANE_TYPE_Y_NO_DC : PLANE_TYPE_Y_WITH_DC;
 
     for (b = 0; b < 16; b++)
     {
-        vp8_optimize_b(x, b, type,
+        optimize_b(x, b, type,
             ta + vp8_block2above[b], tl + vp8_block2left[b], rtcd);
     }
 
-    for (b = 16; b < 20; b++)
+    for (b = 16; b < 24; b++)
     {
-        vp8_optimize_b(x, b, vp8_block2type[b],
+        optimize_b(x, b, PLANE_TYPE_UV,
             ta + vp8_block2above[b], tl + vp8_block2left[b], rtcd);
     }
 
-    for (b = 20; b < 24; b++)
-    {
-        vp8_optimize_b(x, b, vp8_block2type[b],
-            ta + vp8_block2above[b], tl + vp8_block2left[b], rtcd);
-    }
-
-
     if (has_2nd_order)
     {
         b=24;
-        vp8_optimize_b(x, b, vp8_block2type[b],
+        optimize_b(x, b, PLANE_TYPE_Y2,
             ta + vp8_block2above[b], tl + vp8_block2left[b], rtcd);
     }
 }
@@ -572,11 +566,11 @@
 
     has_2nd_order = (x->e_mbd.mode_info_context->mbmi.mode != B_PRED
         && x->e_mbd.mode_info_context->mbmi.mode != SPLITMV);
-    type = has_2nd_order ? 0 : 3;
+    type = has_2nd_order ? PLANE_TYPE_Y_NO_DC : PLANE_TYPE_Y_WITH_DC;
 
     for (b = 0; b < 16; b++)
     {
-        vp8_optimize_b(x, b, type,
+        optimize_b(x, b, type,
         ta + vp8_block2above[b], tl + vp8_block2left[b], rtcd);
     }
 
@@ -584,7 +578,7 @@
     if (has_2nd_order)
     {
         b=24;
-        vp8_optimize_b(x, b, vp8_block2type[b],
+        optimize_b(x, b, PLANE_TYPE_Y2,
             ta + vp8_block2above[b], tl + vp8_block2left[b], rtcd);
     }
 }
@@ -608,18 +602,11 @@
     ta = (ENTROPY_CONTEXT *)&t_above;
     tl = (ENTROPY_CONTEXT *)&t_left;
 
-    for (b = 16; b < 20; b++)
+    for (b = 16; b < 24; b++)
     {
-        vp8_optimize_b(x, b, vp8_block2type[b],
+        optimize_b(x, b, PLANE_TYPE_UV,
             ta + vp8_block2above[b], tl + vp8_block2left[b], rtcd);
     }
-
-    for (b = 20; b < 24; b++)
-    {
-        vp8_optimize_b(x, b, vp8_block2type[b],
-            ta + vp8_block2above[b], tl + vp8_block2left[b], rtcd);
-    }
-
 }
 #endif
 
@@ -629,13 +616,13 @@
 
     vp8_subtract_mb(rtcd, x);
 
-    vp8_transform_mb(x);
+    transform_mb(x);
 
     vp8_quantize_mb(x);
 
 #if !(CONFIG_REALTIME_ONLY)
     if (x->optimize)
-        vp8_optimize_mb(x, rtcd);
+        optimize_mb(x, rtcd);
 #endif
 
     vp8_inverse_transform_mb(IF_RTCD(&rtcd->common->idct), &x->e_mbd);
@@ -652,7 +639,7 @@
 
     ENCODEMB_INVOKE(&rtcd->encodemb, submby)(x->src_diff, x->src.y_buffer, x->e_mbd.predictor, x->src.y_stride);
 
-    vp8_transform_mby(x);
+    transform_mby(x);
 
     vp8_quantize_mby(x);
 
@@ -663,22 +650,6 @@
 }
 
 
-void vp8_encode_inter16x16uv(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x)
-{
-    vp8_build_inter_predictors_mbuv(&x->e_mbd);
-
-    ENCODEMB_INVOKE(&rtcd->encodemb, submbuv)(x->src_diff, x->src.u_buffer, x->src.v_buffer, x->e_mbd.predictor, x->src.uv_stride);
-
-    vp8_transform_mbuv(x);
-
-    vp8_quantize_mbuv(x);
-
-    vp8_inverse_transform_mbuv(IF_RTCD(&rtcd->common->idct), &x->e_mbd);
-
-    vp8_recon_intra_mbuv(IF_RTCD(&rtcd->common->recon), &x->e_mbd);
-}
-
-
 void vp8_encode_inter16x16uvrd(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x)
 {
     vp8_build_inter_predictors_mbuv(&x->e_mbd);

diff --git a/vp8/encoder/encodemb.h b/vp8/encoder/encodemb.h
index 08f75c3..8c93aa1 100644
--- a/vp8/encoder/encodemb.h
+++ b/vp8/encoder/encodemb.h

@@ -101,9 +101,6 @@
 void vp8_transform_mb(MACROBLOCK *mb);
 void vp8_transform_mbuv(MACROBLOCK *x);
 void vp8_transform_intra_mby(MACROBLOCK *x);
-void Encode16x16Y(MACROBLOCK *x);
-void Encode16x16UV(MACROBLOCK *x);
-void vp8_encode_inter16x16uv(const struct VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x);
 void vp8_encode_inter16x16uvrd(const struct VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x);
 void vp8_optimize_mby(MACROBLOCK *x, const struct VP8_ENCODER_RTCD *rtcd);
 void vp8_optimize_mbuv(MACROBLOCK *x, const struct VP8_ENCODER_RTCD *rtcd);

diff --git a/vp8/encoder/encodemv.c b/vp8/encoder/encodemv.c
index cce7530..6b1e6f9 100644
--- a/vp8/encoder/encodemv.c
+++ b/vp8/encoder/encodemv.c

@@ -9,10 +9,10 @@
  */
 
 
-#include "common.h"
+#include "vp8/common/common.h"
 #include "encodemv.h"
-#include "entropymode.h"
-#include "systemdependent.h"
+#include "vp8/common/entropymode.h"
+#include "vp8/common/systemdependent.h"
 
 #include <math.h>
 
@@ -128,7 +128,7 @@
 
         while (--i > 3);
 
-        if (x & 240)
+        if (x & 0xFFF0)
             cost += vp8_cost_bit(p [MVPbits + 3], (x >> 3) & 1);
     }
 

diff --git a/vp8/encoder/ethreading.c b/vp8/encoder/ethreading.c
index 8da93a6..71da103 100644
--- a/vp8/encoder/ethreading.c
+++ b/vp8/encoder/ethreading.c

@@ -9,9 +9,11 @@
  */
 
 #include "onyx_int.h"
-#include "threading.h"
-#include "common.h"
-#include "extend.h"
+#include "vp8/common/threading.h"
+#include "vp8/common/common.h"
+#include "vp8/common/extend.h"
+
+#if CONFIG_MULTITHREAD
 
 extern int vp8cx_encode_inter_macroblock(VP8_COMP *cpi, MACROBLOCK *x,
                                          TOKENEXTRA **t, int recon_yoffset,
@@ -22,10 +24,35 @@
 extern void vp8_build_block_offsets(MACROBLOCK *x);
 extern void vp8_setup_block_ptrs(MACROBLOCK *x);
 
+extern void loopfilter_frame(VP8_COMP *cpi, VP8_COMMON *cm);
+
+static THREAD_FUNCTION loopfilter_thread(void *p_data)
+{
+    VP8_COMP *cpi = (VP8_COMP *)(((LPFTHREAD_DATA *)p_data)->ptr1);
+    VP8_COMMON *cm = &cpi->common;
+
+    while (1)
+    {
+        if (cpi->b_multi_threaded == 0)
+            break;
+
+        if (sem_wait(&cpi->h_event_start_lpf) == 0)
+        {
+            if (cpi->b_multi_threaded == FALSE) // we're shutting down
+                break;
+
+            loopfilter_frame(cpi, cm);
+
+            sem_post(&cpi->h_event_end_lpf);
+        }
+    }
+
+    return 0;
+}
+
 static
 THREAD_FUNCTION thread_encoding_proc(void *p_data)
 {
-#if CONFIG_MULTITHREAD
     int ithread = ((ENCODETHREAD_DATA *)p_data)->ithread;
     VP8_COMP *cpi = (VP8_COMP *)(((ENCODETHREAD_DATA *)p_data)->ptr1);
     MB_ROW_COMP *mbri = (MB_ROW_COMP *)(((ENCODETHREAD_DATA *)p_data)->ptr2);
@@ -247,10 +274,6 @@
         }
     }
 
-#else
-    (void) p_data;
-#endif
-
     //printf("exit thread %d\n", ithread);
     return 0;
 }
@@ -432,69 +455,76 @@
 
 void vp8cx_create_encoder_threads(VP8_COMP *cpi)
 {
+    const VP8_COMMON * cm = &cpi->common;
+
     cpi->b_multi_threaded = 0;
-
+    cpi->encoding_thread_count = 0;
     cpi->processor_core_count = 32; //vp8_get_proc_core_count();
 
-    CHECK_MEM_ERROR(cpi->tplist, vpx_malloc(sizeof(TOKENLIST) * cpi->common.mb_rows));
-
-#if CONFIG_MULTITHREAD
-
     if (cpi->processor_core_count > 1 && cpi->oxcf.multi_threaded > 1)
     {
         int ithread;
+        int th_count = cpi->oxcf.multi_threaded - 1;
 
         if (cpi->oxcf.multi_threaded > cpi->processor_core_count)
-            cpi->encoding_thread_count = cpi->processor_core_count - 1;
-        else
-            cpi->encoding_thread_count = cpi->oxcf.multi_threaded - 1;
+            th_count = cpi->processor_core_count - 1;
 
-        CHECK_MEM_ERROR(cpi->h_encoding_thread, vpx_malloc(sizeof(pthread_t) * cpi->encoding_thread_count));
-        CHECK_MEM_ERROR(cpi->h_event_start_encoding, vpx_malloc(sizeof(sem_t) * cpi->encoding_thread_count));
-        CHECK_MEM_ERROR(cpi->mb_row_ei, vpx_memalign(32, sizeof(MB_ROW_COMP) * cpi->encoding_thread_count));
-        vpx_memset(cpi->mb_row_ei, 0, sizeof(MB_ROW_COMP) * cpi->encoding_thread_count);
-        CHECK_MEM_ERROR(cpi->en_thread_data, vpx_malloc(sizeof(ENCODETHREAD_DATA) * cpi->encoding_thread_count));
-        CHECK_MEM_ERROR(cpi->mt_current_mb_col, vpx_malloc(sizeof(*cpi->mt_current_mb_col) * cpi->common.mb_rows));
+        /* we have th_count + 1 (main) threads processing one row each */
+        /* no point to have more threads than the sync range allows */
+        if(th_count > ((cm->mb_cols / cpi->mt_sync_range) - 1))
+        {
+            th_count = (cm->mb_cols / cpi->mt_sync_range) - 1;
+        }
 
-        //cpi->h_event_main = CreateEvent(NULL, FALSE, FALSE, NULL);
+        if(th_count == 0)
+            return;
+
+        CHECK_MEM_ERROR(cpi->h_encoding_thread, vpx_malloc(sizeof(pthread_t) * th_count));
+        CHECK_MEM_ERROR(cpi->h_event_start_encoding, vpx_malloc(sizeof(sem_t) * th_count));
+        CHECK_MEM_ERROR(cpi->mb_row_ei, vpx_memalign(32, sizeof(MB_ROW_COMP) * th_count));
+        vpx_memset(cpi->mb_row_ei, 0, sizeof(MB_ROW_COMP) * th_count);
+        CHECK_MEM_ERROR(cpi->en_thread_data,
+                        vpx_malloc(sizeof(ENCODETHREAD_DATA) * th_count));
+        CHECK_MEM_ERROR(cpi->mt_current_mb_col,
+                        vpx_malloc(sizeof(*cpi->mt_current_mb_col) * cm->mb_rows));
+
         sem_init(&cpi->h_event_end_encoding, 0, 0);
 
         cpi->b_multi_threaded = 1;
+        cpi->encoding_thread_count = th_count;
 
-        //printf("[VP8:] multi_threaded encoding is enabled with %d threads\n\n", (cpi->encoding_thread_count +1));
+        /*
+        printf("[VP8:] multi_threaded encoding is enabled with %d threads\n\n",
+               (cpi->encoding_thread_count +1));
+        */
 
-        for (ithread = 0; ithread < cpi->encoding_thread_count; ithread++)
+        for (ithread = 0; ithread < th_count; ithread++)
         {
             ENCODETHREAD_DATA * ethd = &cpi->en_thread_data[ithread];
 
-            //cpi->h_event_mbrencoding[ithread] = CreateEvent(NULL, FALSE, FALSE, NULL);
             sem_init(&cpi->h_event_start_encoding[ithread], 0, 0);
             ethd->ithread = ithread;
             ethd->ptr1 = (void *)cpi;
             ethd->ptr2 = (void *)&cpi->mb_row_ei[ithread];
 
-            //printf(" call begin thread %d \n", ithread);
-
-            //cpi->h_encoding_thread[ithread] =   (HANDLE)_beginthreadex(
-            //  NULL,           // security
-            //  0,              // stksize
-            //  thread_encoding_proc,
-            //  (&cpi->en_thread_data[ithread]),          // Thread data
-            //  0,
-            //  NULL);
-
             pthread_create(&cpi->h_encoding_thread[ithread], 0, thread_encoding_proc, ethd);
         }
 
+        {
+            LPFTHREAD_DATA * lpfthd = &cpi->lpf_thread_data;
+
+            sem_init(&cpi->h_event_start_lpf, 0, 0);
+            sem_init(&cpi->h_event_end_lpf, 0, 0);
+
+            lpfthd->ptr1 = (void *)cpi;
+            pthread_create(&cpi->h_filter_thread, 0, loopfilter_thread, lpfthd);
+        }
     }
 
-#endif
 }
 
 void vp8cx_remove_encoder_threads(VP8_COMP *cpi)
 {
-#if CONFIG_MULTITHREAD
-
     if (cpi->b_multi_threaded)
     {
         //shutdown other threads
@@ -510,9 +540,14 @@
 
                 sem_destroy(&cpi->h_event_start_encoding[i]);
             }
+
+            sem_post(&cpi->h_event_start_lpf);
+            pthread_join(cpi->h_filter_thread, 0);
         }
 
         sem_destroy(&cpi->h_event_end_encoding);
+        sem_destroy(&cpi->h_event_end_lpf);
+        sem_destroy(&cpi->h_event_start_lpf);
 
         //free thread related resources
         vpx_free(cpi->h_event_start_encoding);
@@ -521,7 +556,5 @@
         vpx_free(cpi->en_thread_data);
         vpx_free(cpi->mt_current_mb_col);
     }
-
-#endif
-    vpx_free(cpi->tplist);
 }
+#endif

diff --git a/vp8/encoder/firstpass.c b/vp8/encoder/firstpass.c
index fc6f043..6c9433b 100644
--- a/vp8/encoder/firstpass.c
+++ b/vp8/encoder/firstpass.c

@@ -14,18 +14,18 @@
 #include "onyx_int.h"
 #include "variance.h"
 #include "encodeintra.h"
-#include "setupintrarecon.h"
+#include "vp8/common/setupintrarecon.h"
 #include "mcomp.h"
 #include "vpx_scale/vpxscale.h"
 #include "encodemb.h"
-#include "extend.h"
-#include "systemdependent.h"
+#include "vp8/common/extend.h"
+#include "vp8/common/systemdependent.h"
 #include "vpx_scale/yv12extend.h"
 #include "vpx_mem/vpx_mem.h"
-#include "swapyv12buffer.h"
+#include "vp8/common/swapyv12buffer.h"
 #include <stdio.h>
 #include "rdopt.h"
-#include "quant_common.h"
+#include "vp8/common/quant_common.h"
 #include "encodemv.h"
 
 //#define OUTPUT_FPF 1
@@ -67,7 +67,7 @@
 static int hscale_lookup[7] = {0, 0, 1, 1, 2, 2, 3};
 
 
-const int cq_level[QINDEX_RANGE] =
+static const int cq_level[QINDEX_RANGE] =
 {
     0,0,1,1,2,3,3,4,4,5,6,6,7,8,8,9,
     9,10,11,11,12,13,13,14,15,15,16,17,17,18,19,20,
@@ -79,10 +79,9 @@
     86,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100
 };
 
-void vp8_find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame);
-int vp8_input_stats(VP8_COMP *cpi, FIRSTPASS_STATS *fps);
+static void find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame);
 
-int vp8_encode_intra(VP8_COMP *cpi, MACROBLOCK *x, int use_dc_pred)
+static int encode_intra(VP8_COMP *cpi, MACROBLOCK *x, int use_dc_pred)
 {
 
     int i;
@@ -146,7 +145,7 @@
     /*start_pos = cpi->stats_in;
     sum_iiratio = 0.0;
     i = 0;
-    while ( (i < 1) && vp8_input_stats(cpi,&next_frame) != EOF )
+    while ( (i < 1) && input_stats(cpi,&next_frame) != EOF )
     {
 
         next_iiratio = next_frame.intra_error / DOUBLE_DIVIDE_CHECK(next_frame.coded_error);
@@ -212,7 +211,7 @@
 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000
 };
 
-double vp8_simple_weight(YV12_BUFFER_CONFIG *source)
+static double simple_weight(YV12_BUFFER_CONFIG *source)
 {
     int i, j;
 
@@ -240,7 +239,7 @@
 
 
 // This function returns the current per frame maximum bitrate target
-int frame_max_bits(VP8_COMP *cpi)
+static int frame_max_bits(VP8_COMP *cpi)
 {
     // Max allocation for a single frame based on the max section guidelines passed in and how many bits are left
     int max_bits;
@@ -281,38 +280,26 @@
 }
 
 
-extern size_t vp8_firstpass_stats_sz(unsigned int mb_count)
-{
-    /* Calculate the size of a stats packet, which is dependent on the frame
-     * resolution. The FIRSTPASS_STATS struct has a single element array,
-     * motion_map, which is virtually expanded to have one element per
-     * macroblock.
-     */
-    size_t stats_sz;
-
-    stats_sz = sizeof(FIRSTPASS_STATS) + mb_count;
-    stats_sz = (stats_sz + 7) & ~7;
-    return stats_sz;
-}
-
-
-void vp8_output_stats(const VP8_COMP            *cpi,
-                      struct vpx_codec_pkt_list *pktlist,
-                      FIRSTPASS_STATS            *stats)
+static void output_stats(const VP8_COMP            *cpi,
+                         struct vpx_codec_pkt_list *pktlist,
+                         FIRSTPASS_STATS            *stats)
 {
     struct vpx_codec_cx_pkt pkt;
     pkt.kind = VPX_CODEC_STATS_PKT;
     pkt.data.twopass_stats.buf = stats;
-    pkt.data.twopass_stats.sz = vp8_firstpass_stats_sz(cpi->common.MBs);
+    pkt.data.twopass_stats.sz = sizeof(FIRSTPASS_STATS);
     vpx_codec_pkt_list_add(pktlist, &pkt);
 
 // TEMP debug code
 #if OUTPUT_FPF
+
     {
         FILE *fpfile;
         fpfile = fopen("firstpass.stt", "a");
 
-        fprintf(fpfile, "%12.0f %12.0f %12.0f %12.4f %12.4f %12.4f %12.4f %12.4f %12.4f %12.4f %12.4f %12.4f %12.4f %12.4f %12.0f\n",
+        fprintf(fpfile, "%12.0f %12.0f %12.0f %12.4f %12.4f %12.4f %12.4f"
+                " %12.4f %12.4f %12.4f %12.4f %12.4f %12.4f %12.4f %12.4f"
+                " %12.0f %12.4f\n",
                 stats->frame,
                 stats->intra_error,
                 stats->coded_error,
@@ -320,6 +307,7 @@
                 stats->pcnt_inter,
                 stats->pcnt_motion,
                 stats->pcnt_second_ref,
+                stats->pcnt_neutral,
                 stats->MVr,
                 stats->mvr_abs,
                 stats->MVc,
@@ -327,30 +315,24 @@
                 stats->MVrv,
                 stats->MVcv,
                 stats->mv_in_out_count,
-                stats->count);
-        fclose(fpfile);
-
-
-        fpfile = fopen("fpmotionmap.stt", "a");
-        if(fwrite(cpi->fp_motion_map, 1, cpi->common.MBs, fpfile));
+                stats->count,
+                stats->duration);
         fclose(fpfile);
     }
 #endif
 }
 
-int vp8_input_stats(VP8_COMP *cpi, FIRSTPASS_STATS *fps)
+static int input_stats(VP8_COMP *cpi, FIRSTPASS_STATS *fps)
 {
-    size_t stats_sz = vp8_firstpass_stats_sz(cpi->common.MBs);
-
     if (cpi->stats_in >= cpi->stats_in_end)
         return EOF;
 
     *fps = *cpi->stats_in;
-    cpi->stats_in = (void*)((char *)cpi->stats_in + stats_sz);
+    cpi->stats_in = (void*)((char *)cpi->stats_in + sizeof(FIRSTPASS_STATS));
     return 1;
 }
 
-void vp8_zero_stats(FIRSTPASS_STATS *section)
+static void zero_stats(FIRSTPASS_STATS *section)
 {
     section->frame      = 0.0;
     section->intra_error = 0.0;
@@ -359,6 +341,7 @@
     section->pcnt_inter  = 0.0;
     section->pcnt_motion  = 0.0;
     section->pcnt_second_ref = 0.0;
+    section->pcnt_neutral = 0.0;
     section->MVr        = 0.0;
     section->mvr_abs     = 0.0;
     section->MVc        = 0.0;
@@ -369,7 +352,7 @@
     section->count      = 0.0;
     section->duration   = 1.0;
 }
-void vp8_accumulate_stats(FIRSTPASS_STATS *section, FIRSTPASS_STATS *frame)
+static void accumulate_stats(FIRSTPASS_STATS *section, FIRSTPASS_STATS *frame)
 {
     section->frame += frame->frame;
     section->intra_error += frame->intra_error;
@@ -378,6 +361,7 @@
     section->pcnt_inter  += frame->pcnt_inter;
     section->pcnt_motion += frame->pcnt_motion;
     section->pcnt_second_ref += frame->pcnt_second_ref;
+    section->pcnt_neutral += frame->pcnt_neutral;
     section->MVr        += frame->MVr;
     section->mvr_abs     += frame->mvr_abs;
     section->MVc        += frame->MVc;
@@ -388,7 +372,7 @@
     section->count      += frame->count;
     section->duration   += frame->duration;
 }
-void vp8_avg_stats(FIRSTPASS_STATS *section)
+static void avg_stats(FIRSTPASS_STATS *section)
 {
     if (section->count < 1.0)
         return;
@@ -398,6 +382,7 @@
     section->ssim_weighted_pred_err /= section->count;
     section->pcnt_inter  /= section->count;
     section->pcnt_second_ref /= section->count;
+    section->pcnt_neutral /= section->count;
     section->pcnt_motion /= section->count;
     section->MVr        /= section->count;
     section->mvr_abs     /= section->count;
@@ -409,65 +394,17 @@
     section->duration   /= section->count;
 }
 
-unsigned char *vp8_fpmm_get_pos(VP8_COMP *cpi)
-{
-    return cpi->fp_motion_map_stats;
-}
-void vp8_fpmm_reset_pos(VP8_COMP *cpi, unsigned char *target_pos)
-{
-    cpi->fp_motion_map_stats = target_pos;
-}
-
-void vp8_advance_fpmm(VP8_COMP *cpi, int count)
-{
-    cpi->fp_motion_map_stats = (void*)((char*)cpi->fp_motion_map_stats +
-        count * vp8_firstpass_stats_sz(cpi->common.MBs));
-}
-
-void vp8_input_fpmm(VP8_COMP *cpi)
-{
-    unsigned char *fpmm = cpi->fp_motion_map;
-    int MBs = cpi->common.MBs;
-    int max_frames = cpi->active_arnr_frames;
-    int i;
-
-    for (i=0; i<max_frames; i++)
-    {
-        char *motion_map = (char*)cpi->fp_motion_map_stats
-                           + sizeof(FIRSTPASS_STATS);
-
-        memcpy(fpmm, motion_map, MBs);
-        fpmm += MBs;
-        vp8_advance_fpmm(cpi, 1);
-    }
-
-    // Flag the use of weights in the temporal filter
-    cpi->use_weighted_temporal_filter = 1;
-}
-
 void vp8_init_first_pass(VP8_COMP *cpi)
 {
-    vp8_zero_stats(cpi->total_stats);
-
-// TEMP debug code
-#ifdef OUTPUT_FPF
-    {
-        FILE *fpfile;
-        fpfile = fopen("firstpass.stt", "w");
-        fclose(fpfile);
-        fpfile = fopen("fpmotionmap.stt", "wb");
-        fclose(fpfile);
-    }
-#endif
-
+    zero_stats(cpi->total_stats);
 }
 
 void vp8_end_first_pass(VP8_COMP *cpi)
 {
-    vp8_output_stats(cpi, cpi->output_pkt_list, cpi->total_stats);
+    output_stats(cpi, cpi->output_pkt_list, cpi->total_stats);
 }
 
-void vp8_zz_motion_search( VP8_COMP *cpi, MACROBLOCK * x, YV12_BUFFER_CONFIG * recon_buffer, int * best_motion_err, int recon_yoffset )
+static void zz_motion_search( VP8_COMP *cpi, MACROBLOCK * x, YV12_BUFFER_CONFIG * recon_buffer, int * best_motion_err, int recon_yoffset )
 {
     MACROBLOCKD * const xd = & x->e_mbd;
     BLOCK *b = &x->block[0];
@@ -486,7 +423,7 @@
     VARIANCE_INVOKE(IF_RTCD(&cpi->rtcd.variance), mse16x16) ( src_ptr, src_stride, ref_ptr, ref_stride, (unsigned int *)(best_motion_err));
 }
 
-void vp8_first_pass_motion_search(VP8_COMP *cpi, MACROBLOCK *x, MV *ref_mv, MV *best_mv, YV12_BUFFER_CONFIG *recon_buffer, int *best_motion_err, int recon_yoffset )
+static void first_pass_motion_search(VP8_COMP *cpi, MACROBLOCK *x, MV *ref_mv, MV *best_mv, YV12_BUFFER_CONFIG *recon_buffer, int *best_motion_err, int recon_yoffset )
 {
     MACROBLOCKD *const xd = & x->e_mbd;
     BLOCK *b = &x->block[0];
@@ -560,8 +497,8 @@
     YV12_BUFFER_CONFIG *gld_yv12 = &cm->yv12_fb[cm->gld_fb_idx];
     int recon_y_stride = lst_yv12->y_stride;
     int recon_uv_stride = lst_yv12->uv_stride;
-    int intra_error = 0;
-    int coded_error = 0;
+    long long intra_error = 0;
+    long long coded_error = 0;
 
     int sum_mvr = 0, sum_mvc = 0;
     int sum_mvr_abs = 0, sum_mvc_abs = 0;
@@ -570,13 +507,12 @@
     int intercount = 0;
     int second_ref_count = 0;
     int intrapenalty = 256;
+    int neutral_count = 0;
 
     int sum_in_vectors = 0;
 
     MV zero_ref_mv = {0, 0};
 
-    unsigned char *fp_motion_map_ptr = cpi->fp_motion_map;
-
     vp8_clear_system_state();  //__asm emms;
 
     x->src = * cpi->Source;
@@ -628,7 +564,6 @@
         for (mb_col = 0; mb_col < cm->mb_cols; mb_col++)
         {
             int this_error;
-            int zero_error;
             int zz_to_best_ratio;
             int gf_motion_error = INT_MAX;
             int use_dc_pred = (mb_col || mb_row) && (!mb_col || !mb_row);
@@ -639,7 +574,7 @@
             xd->left_available = (mb_col != 0);
 
             // do intra 16x16 prediction
-            this_error = vp8_encode_intra(cpi, x, use_dc_pred);
+            this_error = encode_intra(cpi, x, use_dc_pred);
 
             // "intrapenalty" below deals with situations where the intra and inter error scores are very low (eg a plain black frame)
             // We do not have special cases in first pass for 0,0 and nearest etc so all inter modes carry an overhead cost estimate fot the mv.
@@ -648,10 +583,7 @@
             this_error += intrapenalty;
 
             // Cumulative intra error total
-            intra_error += this_error;
-
-            // Indicate default assumption of intra in the motion map
-            *fp_motion_map_ptr = 0;
+            intra_error += (long long)this_error;
 
             // Set up limit values for motion vectors to prevent them extending outside the UMV borders
             x->mv_col_min = -((mb_col * 16) + (VP8BORDERINPIXELS - 16));
@@ -667,16 +599,13 @@
                 int motion_error = INT_MAX;
 
                 // Simple 0,0 motion with no mv overhead
-                vp8_zz_motion_search( cpi, x, lst_yv12, &motion_error, recon_yoffset );
+                zz_motion_search( cpi, x, lst_yv12, &motion_error, recon_yoffset );
                 d->bmi.mv.as_mv.row = 0;
                 d->bmi.mv.as_mv.col = 0;
 
-                // Save (0,0) error for later use
-                zero_error = motion_error;
-
                 // Test last reference frame using the previous best mv as the
                 // starting point (best reference) for the search
-                vp8_first_pass_motion_search(cpi, x, &best_ref_mv.as_mv,
+                first_pass_motion_search(cpi, x, &best_ref_mv.as_mv,
                                         &d->bmi.mv.as_mv, lst_yv12,
                                         &motion_error, recon_yoffset);
 
@@ -684,7 +613,7 @@
                 if (best_ref_mv.as_int)
                 {
                    tmp_err = INT_MAX;
-                   vp8_first_pass_motion_search(cpi, x, &zero_ref_mv, &tmp_mv,
+                   first_pass_motion_search(cpi, x, &zero_ref_mv, &tmp_mv,
                                      lst_yv12, &tmp_err, recon_yoffset);
 
                    if ( tmp_err < motion_error )
@@ -698,7 +627,7 @@
                 // Experimental search in a second reference frame ((0,0) based only)
                 if (cm->current_video_frame > 1)
                 {
-                    vp8_first_pass_motion_search(cpi, x, &zero_ref_mv, &tmp_mv, gld_yv12, &gf_motion_error, recon_yoffset);
+                    first_pass_motion_search(cpi, x, &zero_ref_mv, &tmp_mv, gld_yv12, &gf_motion_error, recon_yoffset);
 
                     if ((gf_motion_error < motion_error) && (gf_motion_error < this_error))
                     {
@@ -726,6 +655,17 @@
 
                 if (motion_error <= this_error)
                 {
+                    // Keep a count of cases where the inter and intra were
+                    // very close and very low. This helps with scene cut
+                    // detection for example in cropped clips with black bars
+                    // at the sides or top and bottom.
+                    if( (((this_error-intrapenalty) * 9) <=
+                         (motion_error*10)) &&
+                        (this_error < (2*intrapenalty)) )
+                    {
+                        neutral_count++;
+                    }
+
                     d->bmi.mv.as_mv.row <<= 3;
                     d->bmi.mv.as_mv.col <<= 3;
                     this_error = motion_error;
@@ -777,30 +717,11 @@
                             else if (d->bmi.mv.as_mv.col < 0)
                                 sum_in_vectors--;
                         }
-
-                        // Compute how close (0,0) predictor is to best
-                        // predictor in terms of their prediction error
-                        zz_to_best_ratio = (10*zero_error + this_error/2)
-                                            / (this_error+!this_error);
-
-                        if ((zero_error < 50000) &&
-                            (zz_to_best_ratio <= 11) )
-                            *fp_motion_map_ptr = 1;
-                        else
-                            *fp_motion_map_ptr = 0;
-                    }
-                    else
-                    {
-                        // 0,0 mv was best
-                        if( zero_error<50000 )
-                            *fp_motion_map_ptr = 2;
-                        else
-                            *fp_motion_map_ptr = 1;
                     }
                 }
             }
 
-            coded_error += this_error;
+            coded_error += (long long)this_error;
 
             // adjust to the next column of macroblocks
             x->src.y_buffer += 16;
@@ -809,9 +730,6 @@
 
             recon_yoffset += 16;
             recon_uvoffset += 8;
-
-            // Update the motion map
-            fp_motion_map_ptr++;
         }
 
         // adjust to the next row of mbs
@@ -833,7 +751,7 @@
         fps.frame      = cm->current_video_frame ;
         fps.intra_error = intra_error >> 8;
         fps.coded_error = coded_error >> 8;
-        weight = vp8_simple_weight(cpi->Source);
+        weight = simple_weight(cpi->Source);
 
 
         if (weight < 0.1)
@@ -854,6 +772,7 @@
 
         fps.pcnt_inter   = 1.0 * (double)intercount / cm->MBs;
         fps.pcnt_second_ref = 1.0 * (double)second_ref_count / cm->MBs;
+        fps.pcnt_neutral = 1.0 * (double)neutral_count / cm->MBs;
 
         if (mvcount > 0)
         {
@@ -872,15 +791,12 @@
         // than the full time between subsequent cpi->source_time_stamp s  .
         fps.duration = cpi->source_end_time_stamp - cpi->source_time_stamp;
 
-        // don't want to do outputstats with a stack variable!
+        // don't want to do output stats with a stack variable!
         memcpy(cpi->this_frame_stats,
                &fps,
                sizeof(FIRSTPASS_STATS));
-        memcpy((char*)cpi->this_frame_stats + sizeof(FIRSTPASS_STATS),
-               cpi->fp_motion_map,
-               sizeof(cpi->fp_motion_map[0]) * cpi->common.MBs);
-        vp8_output_stats(cpi, cpi->output_pkt_list, cpi->this_frame_stats);
-        vp8_accumulate_stats(cpi->total_stats, &fps);
+        output_stats(cpi, cpi->output_pkt_list, cpi->this_frame_stats);
+        accumulate_stats(cpi->total_stats, &fps);
     }
 
     // Copy the previous Last Frame into the GF buffer if specific conditions for doing so are met
@@ -924,10 +840,10 @@
 extern const int vp8_bits_per_mb[2][QINDEX_RANGE];
 
 #define BASE_ERRPERMB   150
-static int estimate_max_q(VP8_COMP *cpi, double section_err, int section_target_bandwitdh, int Height, int Width)
+static int estimate_max_q(VP8_COMP *cpi, double section_err, int section_target_bandwitdh)
 {
     int Q;
-    int num_mbs = ((Height * Width) / (16 * 16));
+    int num_mbs = cpi->common.MBs;
     int target_norm_bits_per_mb;
 
     double err_per_mb = section_err / num_mbs;
@@ -1024,10 +940,10 @@
 
     return Q;
 }
-static int estimate_q(VP8_COMP *cpi, double section_err, int section_target_bandwitdh, int Height, int Width)
+static int estimate_q(VP8_COMP *cpi, double section_err, int section_target_bandwitdh)
 {
     int Q;
-    int num_mbs = ((Height * Width) / (16 * 16));
+    int num_mbs = cpi->common.MBs;
     int target_norm_bits_per_mb;
 
     double err_per_mb = section_err / num_mbs;
@@ -1075,10 +991,10 @@
 }
 
 // Estimate a worst case Q for a KF group
-static int estimate_kf_group_q(VP8_COMP *cpi, double section_err, int section_target_bandwitdh, int Height, int Width, double group_iiratio)
+static int estimate_kf_group_q(VP8_COMP *cpi, double section_err, int section_target_bandwitdh, double group_iiratio)
 {
     int Q;
-    int num_mbs = ((Height * Width) / (16 * 16));
+    int num_mbs = cpi->common.MBs;
     int target_norm_bits_per_mb = (512 * section_target_bandwitdh) / num_mbs;
     int bits_per_mb_at_this_q;
 
@@ -1173,11 +1089,10 @@
 
 // For cq mode estimate a cq level that matches the observed
 // complexity and data rate.
-static int estimate_cq(VP8_COMP *cpi, double section_err,
-                       int section_target_bandwitdh, int Height, int Width)
+static int estimate_cq(VP8_COMP *cpi, double section_err, int section_target_bandwitdh)
 {
     int Q;
-    int num_mbs = ((Height * Width) / (16 * 16));
+    int num_mbs = cpi->common.MBs;
     int target_norm_bits_per_mb;
 
     double err_per_mb = section_err / num_mbs;
@@ -1252,7 +1167,7 @@
 
     double two_pass_min_rate = (double)(cpi->oxcf.target_bandwidth * cpi->oxcf.two_pass_vbrmin_section / 100);
 
-    vp8_zero_stats(cpi->total_stats);
+    zero_stats(cpi->total_stats);
 
     if (!cpi->stats_in_end)
         return;
@@ -1286,7 +1201,7 @@
     cpi->kf_intra_err_min = KF_MB_INTRA_MIN * cpi->common.MBs;
     cpi->gf_intra_err_min = GF_MB_INTRA_MIN * cpi->common.MBs;
 
-    vp8_avg_stats(cpi->total_stats);
+    avg_stats(cpi->total_stats);
 
     // Scan the first pass file and calculate an average Intra / Inter error score ratio for the sequence
     {
@@ -1295,7 +1210,7 @@
 
         start_pos = cpi->stats_in;               // Note starting "file" position
 
-        while (vp8_input_stats(cpi, &this_frame) != EOF)
+        while (input_stats(cpi, &this_frame) != EOF)
         {
             IIRatio = this_frame.intra_error / DOUBLE_DIVIDE_CHECK(this_frame.coded_error);
             IIRatio = (IIRatio < 1.0) ? 1.0 : (IIRatio > 20.0) ? 20.0 : IIRatio;
@@ -1316,7 +1231,7 @@
         cpi->modified_error_total = 0.0;
         cpi->modified_error_used = 0.0;
 
-        while (vp8_input_stats(cpi, &this_frame) != EOF)
+        while (input_stats(cpi, &this_frame) != EOF)
         {
             cpi->modified_error_total += calculate_modified_err(cpi, &this_frame);
         }
@@ -1331,8 +1246,6 @@
     cpi->clip_bpe =  cpi->bits_left /
                      DOUBLE_DIVIDE_CHECK(cpi->modified_error_total);
     cpi->observed_bpe = cpi->clip_bpe;
-
-    cpi->fp_motion_map_stats = (unsigned char *)cpi->stats_in;
 }
 
 void vp8_end_second_pass(VP8_COMP *cpi)
@@ -1340,8 +1253,8 @@
 }
 
 // This function gives and estimate of how badly we believe
-// the predicition quality is decaying from frame to frame.
-double gf_prediction_decay_rate(VP8_COMP *cpi, FIRSTPASS_STATS *next_frame)
+// the prediction quality is decaying from frame to frame.
+static double get_prediction_decay_rate(VP8_COMP *cpi, FIRSTPASS_STATS *next_frame)
 {
     double prediction_decay_rate;
     double motion_decay;
@@ -1376,6 +1289,52 @@
     return prediction_decay_rate;
 }
 
+// Function to test for a condition where a complex transition is followed
+// by a static section. For example in slide shows where there is a fade
+// between slides. This is to help with more optimal kf and gf positioning.
+static int detect_transition_to_still(
+    VP8_COMP *cpi,
+    int frame_interval,
+    int still_interval,
+    double loop_decay_rate,
+    double decay_accumulator )
+{
+    BOOL trans_to_still = FALSE;
+
+    // Break clause to detect very still sections after motion
+    // For example a static image after a fade or other transition
+    // instead of a clean scene cut.
+    if ( (frame_interval > MIN_GF_INTERVAL) &&
+         (loop_decay_rate >= 0.999) &&
+         (decay_accumulator < 0.9) )
+    {
+        int j;
+        FIRSTPASS_STATS * position = cpi->stats_in;
+        FIRSTPASS_STATS tmp_next_frame;
+        double decay_rate;
+
+        // Look ahead a few frames to see if static condition
+        // persists...
+        for ( j = 0; j < still_interval; j++ )
+        {
+            if (EOF == input_stats(cpi, &tmp_next_frame))
+                break;
+
+            decay_rate = get_prediction_decay_rate(cpi, &tmp_next_frame);
+            if ( decay_rate < 0.999 )
+                break;
+        }
+        // Reset file position
+        reset_fpf_position(cpi, position);
+
+        // Only if it does do we signal a transition to still
+        if ( j == still_interval )
+            trans_to_still = TRUE;
+    }
+
+    return trans_to_still;
+}
+
 // Analyse and define a gf/arf group .
 static void define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
 {
@@ -1406,8 +1365,6 @@
 
     int max_bits = frame_max_bits(cpi);     // Max for a single frame
 
-    unsigned char *fpmm_pos;
-
     unsigned int allow_alt_ref =
                     cpi->oxcf.play_alternate && cpi->oxcf.lag_in_frames;
 
@@ -1416,8 +1373,6 @@
 
     vp8_clear_system_state();  //__asm emms;
 
-    fpmm_pos = vp8_fpmm_get_pos(cpi);
-
     start_pos = cpi->stats_in;
 
     vpx_memset(&next_frame, 0, sizeof(next_frame)); // assure clean
@@ -1461,7 +1416,7 @@
         mod_err_per_mb_accumulator +=
             mod_frame_err / DOUBLE_DIVIDE_CHECK((double)cpi->common.MBs);
 
-        if (EOF == vp8_input_stats(cpi, &next_frame))
+        if (EOF == input_stats(cpi, &next_frame))
             break;
 
         // Accumulate motion stats.
@@ -1528,7 +1483,7 @@
         if (r > GF_RMAX)
             r = GF_RMAX;
 
-        loop_decay_rate = gf_prediction_decay_rate(cpi, &next_frame);
+        loop_decay_rate = get_prediction_decay_rate(cpi, &next_frame);
 
         // Cumulative effect of decay
         decay_accumulator = decay_accumulator * loop_decay_rate;
@@ -1537,48 +1492,13 @@
         boost_score += (decay_accumulator * r);
 
         // Break clause to detect very still sections after motion
-        // For example a staic image after a fade or other transition
-        // instead of a clean key frame.
-        if ( (i > MIN_GF_INTERVAL) &&
-             (loop_decay_rate >= 0.999) &&
-             (decay_accumulator < 0.9) )
+        // For example a staic image after a fade or other transition.
+        if ( detect_transition_to_still( cpi, i, 5,
+                                         loop_decay_rate, decay_accumulator ) )
         {
-            int j;
-            FIRSTPASS_STATS * position = cpi->stats_in;
-            FIRSTPASS_STATS tmp_next_frame;
-            double decay_rate;
-
-            // Look ahead a few frames to see if static condition
-            // persists...
-            for ( j = 0; j < 4; j++ )
-            {
-                if (EOF == vp8_input_stats(cpi, &tmp_next_frame))
-                    break;
-
-                decay_rate = gf_prediction_decay_rate(cpi, &tmp_next_frame);
-                if ( decay_rate < 0.999 )
-                    break;
-            }
-            reset_fpf_position(cpi, position);            // Reset file position
-
-            // Force GF not alt ref
-            if ( j == 4 )
-            {
-                if (0)
-                {
-                    FILE *f = fopen("fadegf.stt", "a");
-                    fprintf(f, " %8d %8d %10.4f %10.4f %10.4f\n",
-                         cpi->common.current_video_frame+i, i,
-                         loop_decay_rate, decay_accumulator,
-                         boost_score );
-                    fclose(f);
-                }
-
-                allow_alt_ref = FALSE;
-
-                boost_score = old_boost_score;
-                break;
-            }
+            allow_alt_ref = FALSE;
+            boost_score = old_boost_score;
+            break;
         }
 
         // Break out conditions.
@@ -1686,7 +1606,7 @@
         arf_frame_bits = (int)((double)Boost * (group_bits / (double)allocation_chunks));
 
         // Estimate if there are enough bits available to make worthwhile use of an arf.
-        tmp_q = estimate_q(cpi, mod_frame_err, (int)arf_frame_bits, cpi->common.Height, cpi->common.Width);
+        tmp_q = estimate_q(cpi, mod_frame_err, (int)arf_frame_bits);
 
         // Only use an arf if it is likely we will be able to code it at a lower Q than the surrounding frames.
         if (tmp_q < cpi->worst_quality)
@@ -1749,20 +1669,6 @@
             }
 
             cpi->active_arnr_frames = frames_bwd + 1 + frames_fwd;
-
-            {
-                // Advance to & read in the motion map for those frames
-                // to be considered for filtering based on the position
-                // of the ARF
-                vp8_fpmm_reset_pos(cpi, cpi->fp_motion_map_stats_save);
-
-                // Position at the 'earliest' frame to be filtered
-                vp8_advance_fpmm(cpi,
-                    cpi->baseline_gf_interval - frames_bwd);
-
-                // Read / create a motion map for the region of interest
-                vp8_input_fpmm(cpi);
-            }
         }
         else
         {
@@ -1784,7 +1690,7 @@
         {
             while (cpi->baseline_gf_interval < cpi->frames_to_key)
             {
-                if (EOF == vp8_input_stats(cpi, this_frame))
+                if (EOF == input_stats(cpi, this_frame))
                     break;
 
                 cpi->baseline_gf_interval++;
@@ -1963,16 +1869,16 @@
         FIRSTPASS_STATS sectionstats;
         double Ratio;
 
-        vp8_zero_stats(&sectionstats);
+        zero_stats(&sectionstats);
         reset_fpf_position(cpi, start_pos);
 
         for (i = 0 ; i < cpi->baseline_gf_interval ; i++)
         {
-            vp8_input_stats(cpi, &next_frame);
-            vp8_accumulate_stats(&sectionstats, &next_frame);
+            input_stats(cpi, &next_frame);
+            accumulate_stats(&sectionstats, &next_frame);
         }
 
-        vp8_avg_stats(&sectionstats);
+        avg_stats(&sectionstats);
 
         cpi->section_intra_rating =
             sectionstats.intra_error /
@@ -1992,9 +1898,6 @@
 
         reset_fpf_position(cpi, start_pos);
     }
-
-    // Reset the First pass motion map file position
-    vp8_fpmm_reset_pos(cpi, fpmm_pos);
 }
 
 // Allocate bits to a normal frame that is neither a gf an arf or a key frame.
@@ -2073,16 +1976,9 @@
 
     vp8_clear_system_state();
 
-    if (EOF == vp8_input_stats(cpi, &this_frame))
+    if (EOF == input_stats(cpi, &this_frame))
         return;
 
-    vpx_memset(cpi->fp_motion_map, 0,
-                cpi->oxcf.arnr_max_frames*cpi->common.MBs);
-    cpi->fp_motion_map_stats_save = vp8_fpmm_get_pos(cpi);
-
-    // Step over this frame's first pass motion map
-    vp8_advance_fpmm(cpi, 1);
-
     this_frame_error = this_frame.ssim_weighted_pred_err;
     this_frame_intra_error = this_frame.intra_error;
     this_frame_coded_error = this_frame.coded_error;
@@ -2101,7 +1997,7 @@
     {
         // Define next KF group and assign bits to it
         vpx_memcpy(&this_frame_copy, &this_frame, sizeof(this_frame));
-        vp8_find_next_key_frame(cpi, &this_frame_copy);
+        find_next_key_frame(cpi, &this_frame_copy);
 
         // Special case: Error error_resilient_mode mode does not make much sense for two pass but with its current meaning but this code is designed to stop
         // outlandish behaviour if someone does set it when using two pass. It effectively disables GF groups.
@@ -2214,8 +2110,7 @@
             est_cq =
                 estimate_cq( cpi,
                              (cpi->total_coded_error_left / frames_left),
-                             (int)(cpi->bits_left / frames_left),
-                             cpi->common.Height, cpi->common.Width);
+                             (int)(cpi->bits_left / frames_left));
 
             cpi->cq_target_quality = cpi->oxcf.cq_level;
             if ( est_cq > cpi->cq_target_quality )
@@ -2227,9 +2122,7 @@
         cpi->maxq_min_limit = cpi->best_quality;
         tmp_q = estimate_max_q( cpi,
                                 (cpi->total_coded_error_left / frames_left),
-                                (int)(cpi->bits_left / frames_left),
-                                cpi->common.Height,
-                                cpi->common.Width);
+                                (int)(cpi->bits_left / frames_left));
 
         // Limit the maxq value returned subsequently.
         // This increases the risk of overspend or underspend if the initial
@@ -2257,7 +2150,7 @@
         if (frames_left < 1)
             frames_left = 1;
 
-        tmp_q = estimate_max_q(cpi, (cpi->total_coded_error_left / frames_left), (int)(cpi->bits_left / frames_left), cpi->common.Height, cpi->common.Width);
+        tmp_q = estimate_max_q(cpi, (cpi->total_coded_error_left / frames_left), (int)(cpi->bits_left / frames_left));
 
         // Move active_worst_quality but in a damped way
         if (tmp_q > cpi->active_worst_quality)
@@ -2285,7 +2178,7 @@
         (next_frame->pcnt_second_ref < 0.10) &&
         ((this_frame->pcnt_inter < 0.05) ||
          (
-             (this_frame->pcnt_inter < .25) &&
+             ((this_frame->pcnt_inter - this_frame->pcnt_neutral) < .25) &&
              ((this_frame->intra_error / DOUBLE_DIVIDE_CHECK(this_frame->coded_error)) < 2.5) &&
              ((fabs(last_frame->coded_error - this_frame->coded_error) / DOUBLE_DIVIDE_CHECK(this_frame->coded_error) > .40) ||
               (fabs(last_frame->intra_error - this_frame->intra_error) / DOUBLE_DIVIDE_CHECK(this_frame->intra_error) > .40) ||
@@ -2332,7 +2225,9 @@
             // Test various breakout clauses
             if ((local_next_frame.pcnt_inter < 0.05) ||
                 (next_iiratio < 1.5) ||
-                ((local_next_frame.pcnt_inter < 0.20) && (next_iiratio < 3.0)) ||
+                (((local_next_frame.pcnt_inter -
+                   local_next_frame.pcnt_neutral) < 0.20) &&
+                 (next_iiratio < 3.0)) ||
                 ((boost_score - old_boost_score) < 0.5) ||
                 (local_next_frame.intra_error < 200)
                )
@@ -2343,7 +2238,7 @@
             old_boost_score = boost_score;
 
             // Get the next frame details
-            if (EOF == vp8_input_stats(cpi, &local_next_frame))
+            if (EOF == input_stats(cpi, &local_next_frame))
                 break;
         }
 
@@ -2361,15 +2256,15 @@
 
     return is_viable_kf;
 }
-void vp8_find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
+static void find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
 {
-    int i;
+    int i,j;
     FIRSTPASS_STATS last_frame;
     FIRSTPASS_STATS first_frame;
     FIRSTPASS_STATS next_frame;
     FIRSTPASS_STATS *start_position;
 
-    double decay_accumulator = 0;
+    double decay_accumulator = 1.0;
     double boost_score = 0;
     double old_boost_score = 0.0;
     double loop_decay_rate;
@@ -2379,6 +2274,7 @@
     double kf_group_intra_err = 0.0;
     double kf_group_coded_err = 0.0;
     double two_pass_min_rate = (double)(cpi->oxcf.target_bandwidth * cpi->oxcf.two_pass_vbrmin_section / 100);
+    double recent_loop_decay[8] = {1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0};
 
     vpx_memset(&next_frame, 0, sizeof(next_frame)); // assure clean
 
@@ -2407,6 +2303,7 @@
     kf_mod_err = calculate_modified_err(cpi, this_frame);
 
     // find the next keyframe
+    i = 0;
     while (cpi->stats_in < cpi->stats_in_end)
     {
         // Accumulate kf group error
@@ -2419,15 +2316,40 @@
 
         // load a the next frame's stats
         vpx_memcpy(&last_frame, this_frame, sizeof(*this_frame));
-        vp8_input_stats(cpi, this_frame);
+        input_stats(cpi, this_frame);
 
         // Provided that we are not at the end of the file...
         if (cpi->oxcf.auto_key
             && lookup_next_frame_stats(cpi, &next_frame) != EOF)
         {
+            // Normal scene cut check
             if (test_candidate_kf(cpi, &last_frame, this_frame, &next_frame))
                 break;
 
+            // How fast is prediction quality decaying
+            loop_decay_rate = get_prediction_decay_rate(cpi, &next_frame);
+
+            // We want to know something about the recent past... rather than
+            // as used elsewhere where we are concened with decay in prediction
+            // quality since the last GF or KF.
+            recent_loop_decay[i%8] = loop_decay_rate;
+            decay_accumulator = 1.0;
+            for (j = 0; j < 8; j++)
+            {
+                decay_accumulator = decay_accumulator * recent_loop_decay[j];
+            }
+
+            // Special check for transition or high motion followed by a
+            // to a static scene.
+            if ( detect_transition_to_still( cpi, i,
+                                             (cpi->key_frame_frequency-i),
+                                             loop_decay_rate,
+                                             decay_accumulator ) )
+            {
+                break;
+            }
+
+
             // Step on to the next frame
             cpi->frames_to_key ++;
 
@@ -2437,6 +2359,8 @@
                 break;
         } else
             cpi->frames_to_key ++;
+
+        i++;
     }
 
     // If there is a max kf interval set by the user we must obey it.
@@ -2470,7 +2394,7 @@
             kf_group_coded_err += tmp_frame.coded_error;
 
             // Load a the next frame's stats
-            vp8_input_stats(cpi, &tmp_frame);
+            input_stats(cpi, &tmp_frame);
         }
 
         // Reset to the start of the group
@@ -2494,7 +2418,7 @@
     }
 
     // Calculate the number of bits that should be assigned to the kf group.
-    if ((cpi->bits_left > 0) && ((int)cpi->modified_error_left > 0))
+    if ((cpi->bits_left > 0) && (cpi->modified_error_left > 0.0))
     {
         // Max for a single normal frame (not key frame)
         int max_bits = frame_max_bits(cpi);
@@ -2575,7 +2499,7 @@
         double motion_decay;
         double motion_pct;
 
-        if (EOF == vp8_input_stats(cpi, &next_frame))
+        if (EOF == input_stats(cpi, &next_frame))
             break;
 
         if (next_frame.intra_error > cpi->kf_intra_err_min)
@@ -2588,32 +2512,8 @@
         if (r > RMAX)
             r = RMAX;
 
-        // Adjust loop decay rate
-        //if ( next_frame.pcnt_inter < loop_decay_rate )
-        loop_decay_rate = next_frame.pcnt_inter;
-
-        // High % motion -> somewhat higher decay rate
-        motion_pct = next_frame.pcnt_motion;
-        motion_decay = (1.0 - (motion_pct / 20.0));
-        if (motion_decay < loop_decay_rate)
-            loop_decay_rate = motion_decay;
-
-        // Adjustment to decay rate based on speed of motion
-        {
-            double this_mv_rabs;
-            double this_mv_cabs;
-            double distance_factor;
-
-            this_mv_rabs = fabs(next_frame.mvr_abs * motion_pct);
-            this_mv_cabs = fabs(next_frame.mvc_abs * motion_pct);
-
-            distance_factor = sqrt((this_mv_rabs * this_mv_rabs) +
-                                   (this_mv_cabs * this_mv_cabs)) / 250.0;
-            distance_factor = ((distance_factor > 1.0)
-                                    ? 0.0 : (1.0 - distance_factor));
-            if (distance_factor < loop_decay_rate)
-                loop_decay_rate = distance_factor;
-        }
+        // How fast is prediction quality decaying
+        loop_decay_rate = get_prediction_decay_rate(cpi, &next_frame);
 
         decay_accumulator = decay_accumulator * loop_decay_rate;
         decay_accumulator = decay_accumulator < 0.1 ? 0.1 : decay_accumulator;
@@ -2634,16 +2534,16 @@
         FIRSTPASS_STATS sectionstats;
         double Ratio;
 
-        vp8_zero_stats(&sectionstats);
+        zero_stats(&sectionstats);
         reset_fpf_position(cpi, start_position);
 
         for (i = 0 ; i < cpi->frames_to_key ; i++)
         {
-            vp8_input_stats(cpi, &next_frame);
-            vp8_accumulate_stats(&sectionstats, &next_frame);
+            input_stats(cpi, &next_frame);
+            accumulate_stats(&sectionstats, &next_frame);
         }
 
-        vp8_avg_stats(&sectionstats);
+        avg_stats(&sectionstats);
 
          cpi->section_intra_rating = sectionstats.intra_error / DOUBLE_DIVIDE_CHECK(sectionstats.coded_error);
 
@@ -2859,7 +2759,7 @@
             bits_per_frame = (cpi->oxcf.target_bandwidth * cpi->oxcf.two_pass_vbrmin_section / 100);
 
         // Work out if spatial resampling is necessary
-        kf_q = estimate_kf_group_q(cpi, err_per_frame, bits_per_frame, new_height, new_width, group_iiratio);
+        kf_q = estimate_kf_group_q(cpi, err_per_frame, bits_per_frame, group_iiratio);
 
         // If we project a required Q higher than the maximum allowed Q then make a guess at the actual size of frames in this section
         projected_bits_perframe = bits_per_frame;
@@ -2930,7 +2830,7 @@
                 effective_size_ratio = (1.0 + (3.0 * effective_size_ratio)) / 4.0;
 
                 // Now try again and see what Q we get with the smaller image size
-                kf_q = estimate_kf_group_q(cpi, err_per_frame * effective_size_ratio, bits_per_frame, new_height, new_width, group_iiratio);
+                kf_q = estimate_kf_group_q(cpi, err_per_frame * effective_size_ratio, bits_per_frame, group_iiratio);
 
                 if (0)
                 {

diff --git a/vp8/encoder/generic/csystemdependent.c b/vp8/encoder/generic/csystemdependent.c
index 4738a5b..1d672be 100644
--- a/vp8/encoder/generic/csystemdependent.c
+++ b/vp8/encoder/generic/csystemdependent.c

@@ -10,15 +10,13 @@
 
 
 #include "vpx_ports/config.h"
-#include "variance.h"
-#include "onyx_int.h"
+#include "vp8/encoder/variance.h"
+#include "vp8/encoder/onyx_int.h"
 
 
 void vp8_arch_x86_encoder_init(VP8_COMP *cpi);
 void vp8_arch_arm_encoder_init(VP8_COMP *cpi);
 
-
-void (*vp8_fast_quantize_b)(BLOCK *b, BLOCKD *d);
 extern void vp8_fast_quantize_b_c(BLOCK *b, BLOCKD *d);
 
 void (*vp8_yv12_copy_partial_frame_ptr)(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc, int Fraction);
@@ -95,13 +93,18 @@
     cpi->rtcd.search.full_search             = vp8_full_search_sad;
 #endif
     cpi->rtcd.search.diamond_search          = vp8_diamond_search_sad;
-
+#if !(CONFIG_REALTIME_ONLY)
     cpi->rtcd.temporal.apply                 = vp8_temporal_filter_apply_c;
 #endif
+#endif
 
     // Pure C:
     vp8_yv12_copy_partial_frame_ptr = vp8_yv12_copy_partial_frame;
 
+#if CONFIG_PSNR
+    cpi->rtcd.variance.ssimpf_8x8            = ssim_parms_8x8_c;
+    cpi->rtcd.variance.ssimpf                = ssim_parms_c;
+#endif
 
 #if ARCH_X86 || ARCH_X86_64
     vp8_arch_x86_encoder_init(cpi);

diff --git a/vp8/encoder/mcomp.c b/vp8/encoder/mcomp.c
index d9923fb..de6642b 100644
--- a/vp8/encoder/mcomp.c
+++ b/vp8/encoder/mcomp.c

@@ -43,7 +43,7 @@
     return ((mvcost[0][(mv->row - ref->row) >> 1] + mvcost[1][(mv->col - ref->col) >> 1]) * Weight) >> 7;
 }
 
-int vp8_mv_err_cost(MV *mv, MV *ref, int *mvcost[2], int error_per_bit)
+static int mv_err_cost(MV *mv, MV *ref, int *mvcost[2], int error_per_bit)
 {
     //int i;
     //return ((mvcost[0][(mv->row - ref->row)>>1] + mvcost[1][(mv->col - ref->col)>>1] + 128) * error_per_bit) >> 8;
@@ -221,7 +221,7 @@
 
     // calculate central point error
     besterr = vfp->vf(y, d->pre_stride, z, b->src_stride, &sse);
-    besterr += vp8_mv_err_cost(bestmv, ref_mv, mvcost, error_per_bit);
+    besterr += mv_err_cost(bestmv, ref_mv, mvcost, error_per_bit);
 
     // TODO: Each subsequent iteration checks at least one point in common with the last iteration could be 2 ( if diag selected)
     while (--halfiters)
@@ -337,13 +337,13 @@
 
     // calculate central point error
     bestmse = vfp->vf(y, d->pre_stride, z, b->src_stride, &sse);
-    bestmse += vp8_mv_err_cost(bestmv, ref_mv, mvcost, error_per_bit);
+    bestmse += mv_err_cost(bestmv, ref_mv, mvcost, error_per_bit);
 
     // go left then right and check error
     this_mv.row = startmv.row;
     this_mv.col = ((startmv.col - 8) | 4);
     left = vfp->svf_halfpix_h(y - 1, d->pre_stride, z, b->src_stride, &sse);
-    left += vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
+    left += mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
 
     if (left < bestmse)
     {
@@ -353,7 +353,7 @@
 
     this_mv.col += 8;
     right = vfp->svf_halfpix_h(y, d->pre_stride, z, b->src_stride, &sse);
-    right += vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
+    right += mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
 
     if (right < bestmse)
     {
@@ -365,7 +365,7 @@
     this_mv.col = startmv.col;
     this_mv.row = ((startmv.row - 8) | 4);
     up = vfp->svf_halfpix_v(y - d->pre_stride, d->pre_stride, z, b->src_stride, &sse);
-    up += vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
+    up += mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
 
     if (up < bestmse)
     {
@@ -375,7 +375,7 @@
 
     this_mv.row += 8;
     down = vfp->svf_halfpix_v(y, d->pre_stride, z, b->src_stride, &sse);
-    down += vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
+    down += mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
 
     if (down < bestmse)
     {
@@ -415,7 +415,7 @@
         break;
     }
 
-    diag += vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
+    diag += mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
 
     if (diag < bestmse)
     {
@@ -451,7 +451,7 @@
         left = vfp->svf(y - 1, d->pre_stride, 6, this_mv.row & 7, z, b->src_stride, &sse);
     }
 
-    left += vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
+    left += mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
 
     if (left < bestmse)
     {
@@ -461,7 +461,7 @@
 
     this_mv.col += 4;
     right = vfp->svf(y, d->pre_stride, this_mv.col & 7, this_mv.row & 7, z, b->src_stride, &sse);
-    right += vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
+    right += mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
 
     if (right < bestmse)
     {
@@ -483,7 +483,7 @@
         up = vfp->svf(y - d->pre_stride, d->pre_stride, this_mv.col & 7, 6, z, b->src_stride, &sse);
     }
 
-    up += vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
+    up += mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
 
     if (up < bestmse)
     {
@@ -493,7 +493,7 @@
 
     this_mv.row += 4;
     down = vfp->svf(y, d->pre_stride, this_mv.col & 7, this_mv.row & 7, z, b->src_stride, &sse);
-    down += vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
+    down += mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
 
     if (down < bestmse)
     {
@@ -582,7 +582,7 @@
         break;
     }
 
-    diag += vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
+    diag += mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
 
     if (diag < bestmse)
     {
@@ -621,13 +621,13 @@
 
     // calculate central point error
     bestmse = vfp->vf(y, d->pre_stride, z, b->src_stride, &sse);
-    bestmse += vp8_mv_err_cost(bestmv, ref_mv, mvcost, error_per_bit);
+    bestmse += mv_err_cost(bestmv, ref_mv, mvcost, error_per_bit);
 
     // go left then right and check error
     this_mv.row = startmv.row;
     this_mv.col = ((startmv.col - 8) | 4);
     left = vfp->svf_halfpix_h(y - 1, d->pre_stride, z, b->src_stride, &sse);
-    left += vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
+    left += mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
 
     if (left < bestmse)
     {
@@ -637,7 +637,7 @@
 
     this_mv.col += 8;
     right = vfp->svf_halfpix_h(y, d->pre_stride, z, b->src_stride, &sse);
-    right += vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
+    right += mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
 
     if (right < bestmse)
     {
@@ -649,7 +649,7 @@
     this_mv.col = startmv.col;
     this_mv.row = ((startmv.row - 8) | 4);
     up = vfp->svf_halfpix_v(y - d->pre_stride, d->pre_stride, z, b->src_stride, &sse);
-    up += vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
+    up += mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
 
     if (up < bestmse)
     {
@@ -659,7 +659,7 @@
 
     this_mv.row += 8;
     down = vfp->svf_halfpix_v(y, d->pre_stride, z, b->src_stride, &sse);
-    down += vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
+    down += mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
 
     if (down < bestmse)
     {
@@ -697,7 +697,7 @@
         break;
     }
 
-    diag += vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
+    diag += mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
 
     if (diag < bestmse)
     {
@@ -709,7 +709,7 @@
     this_mv.col = (this_mv.col - 8) | 4;
     this_mv.row = (this_mv.row - 8) | 4;
     diag = vfp->svf_halfpix_hv(y - 1 - d->pre_stride, d->pre_stride, z, b->src_stride, &sse);
-    diag += vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
+    diag += mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
 
     if (diag < bestmse)
     {
@@ -719,7 +719,7 @@
 
     this_mv.col += 8;
     diag = vfp->svf_halfpix_hv(y - d->pre_stride, d->pre_stride, z, b->src_stride, &sse);
-    diag += vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
+    diag += mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
 
     if (diag < bestmse)
     {
@@ -730,7 +730,7 @@
     this_mv.col = (this_mv.col - 8) | 4;
     this_mv.row = startmv.row + 4;
     diag = vfp->svf_halfpix_hv(y - 1, d->pre_stride, z, b->src_stride, &sse);
-    diag += vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
+    diag += mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
 
     if (diag < bestmse)
     {
@@ -740,7 +740,7 @@
 
     this_mv.col += 8;
     diag = vfp->svf_halfpix_hv(y, d->pre_stride, z, b->src_stride, &sse);
-    diag += vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
+    diag += mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
 
     if (diag < bestmse)
     {
@@ -779,15 +779,17 @@
     int *num00,
     const vp8_variance_fn_ptr_t *vfp,
     int *mvsadcost[2],
-    int *mvcost[2]
+    int *mvcost[2],
+    MV *center_mv
 )
 {
     MV hex[6] = { { -1, -2}, {1, -2}, {2, 0}, {1, 2}, { -1, 2}, { -2, 0} } ;
-    MV neighbors[8] = { { -1, -1}, { -1, 0}, { -1, 1}, {0, -1}, {0, 1}, {1, -1}, {1, 0}, {1, 1} } ;
+    MV neighbors[8] = { { -1, -1}, {0, -1}, {1, -1}, { -1, 0}, {1, 0}, { -1, 1}, {0, 1}, {1, 1} } ;
     int i, j;
     unsigned char *src = (*(b->base_src) + b->src);
     int src_stride = b->src_stride;
-    int rr = ref_mv->row, rc = ref_mv->col, br = rr >> 3, bc = rc >> 3, tr, tc;
+    int rr = center_mv->row, rc = center_mv->col;
+    int br = ref_mv->row >> 3, bc = ref_mv->col >> 3, tr, tc;
     unsigned int besterr, thiserr = 0x7fffffff;
     int k = -1, tk;
 
@@ -892,7 +894,7 @@
     best_mv->row = br;
     best_mv->col = bc;
 
-    return vfp->vf(src, src_stride, PRE(br, bc), d->pre_stride, &thiserr) + MVC(br, bc) ;
+    return vfp->vf(src, src_stride, PRE(br, bc), d->pre_stride, &thiserr) + mv_err_cost(best_mv, center_mv, mvcost, error_per_bit) ;
 }
 #undef MVC
 #undef PRE
@@ -953,7 +955,7 @@
     (ref_row > x->mv_row_min) && (ref_row < x->mv_row_max))
     {
         // Check the starting position
-        bestsad = fn_ptr->sdf(what, what_stride, in_what, in_what_stride, 0x7fffffff) + vp8_mv_err_cost(ref_mv, center_mv, mvsadcost, error_per_bit);
+        bestsad = fn_ptr->sdf(what, what_stride, in_what, in_what_stride, 0x7fffffff) + mv_err_cost(ref_mv, center_mv, mvsadcost, error_per_bit);
     }
 
     // search_param determines the length of the initial step and hence the number of iterations
@@ -984,7 +986,7 @@
                 {
                     this_mv.row = this_row_offset << 3;
                     this_mv.col = this_col_offset << 3;
-                    thissad += vp8_mv_err_cost(&this_mv, center_mv, mvsadcost, error_per_bit);
+                    thissad += mv_err_cost(&this_mv, center_mv, mvsadcost, error_per_bit);
 
                     if (thissad < bestsad)
                     {
@@ -1015,7 +1017,7 @@
         return INT_MAX;
 
     return fn_ptr->vf(what, what_stride, best_address, in_what_stride, (unsigned int *)(&thissad))
-    + vp8_mv_err_cost(&this_mv, center_mv, mvcost, error_per_bit);
+    + mv_err_cost(&this_mv, center_mv, mvcost, error_per_bit);
 }
 
 int vp8_diamond_search_sadx4
@@ -1069,7 +1071,7 @@
     (ref_row > x->mv_row_min) && (ref_row < x->mv_row_max))
     {
         // Check the starting position
-        bestsad = fn_ptr->sdf(what, what_stride, in_what, in_what_stride, 0x7fffffff) + vp8_mv_err_cost(ref_mv, center_mv, mvsadcost, error_per_bit);
+        bestsad = fn_ptr->sdf(what, what_stride, in_what, in_what_stride, 0x7fffffff) + mv_err_cost(ref_mv, center_mv, mvsadcost, error_per_bit);
     }
 
     // search_param determines the length of the initial step and hence the number of iterations
@@ -1111,7 +1113,7 @@
                     {
                         this_mv.row = (best_mv->row + ss[i].mv.row) << 3;
                         this_mv.col = (best_mv->col + ss[i].mv.col) << 3;
-                        sad_array[t] += vp8_mv_err_cost(&this_mv, center_mv, mvsadcost, error_per_bit);
+                        sad_array[t] += mv_err_cost(&this_mv, center_mv, mvsadcost, error_per_bit);
 
                         if (sad_array[t] < bestsad)
                         {
@@ -1140,7 +1142,7 @@
                     {
                         this_mv.row = this_row_offset << 3;
                         this_mv.col = this_col_offset << 3;
-                        thissad += vp8_mv_err_cost(&this_mv, center_mv, mvsadcost, error_per_bit);
+                        thissad += mv_err_cost(&this_mv, center_mv, mvsadcost, error_per_bit);
 
                         if (thissad < bestsad)
                         {
@@ -1171,7 +1173,7 @@
         return INT_MAX;
 
     return fn_ptr->vf(what, what_stride, best_address, in_what_stride, (unsigned int *)(&thissad))
-    + vp8_mv_err_cost(&this_mv, center_mv, mvcost, error_per_bit);
+    + mv_err_cost(&this_mv, center_mv, mvcost, error_per_bit);
 }
 
 
@@ -1213,8 +1215,8 @@
     {
         // Baseline value at the centre
 
-        //bestsad = fn_ptr->sf( what,what_stride,bestaddress,in_what_stride) + (int)sqrt(vp8_mv_err_cost(ref_mv,ref_mv, mvcost,error_per_bit*14));
-        bestsad = fn_ptr->sdf(what, what_stride, bestaddress, in_what_stride, 0x7fffffff) + vp8_mv_err_cost(ref_mv, center_mv, mvsadcost, error_per_bit);
+        //bestsad = fn_ptr->sf( what,what_stride,bestaddress,in_what_stride) + (int)sqrt(mv_err_cost(ref_mv,ref_mv, mvcost,error_per_bit*14));
+        bestsad = fn_ptr->sdf(what, what_stride, bestaddress, in_what_stride, 0x7fffffff) + mv_err_cost(ref_mv, center_mv, mvsadcost, error_per_bit);
     }
 
     // Apply further limits to prevent us looking using vectors that stretch beyiond the UMV border
@@ -1240,9 +1242,9 @@
             thissad = fn_ptr->sdf(what, what_stride, check_here , in_what_stride, bestsad);
 
             this_mv.col = c << 3;
-            //thissad += (int)sqrt(vp8_mv_err_cost(&this_mv,ref_mv, mvcost,error_per_bit*14));
+            //thissad += (int)sqrt(mv_err_cost(&this_mv,ref_mv, mvcost,error_per_bit*14));
             //thissad  += error_per_bit * mv_bits_sadcost[mv_bits(&this_mv, ref_mv, mvcost)];
-            thissad  += vp8_mv_err_cost(&this_mv, center_mv, mvsadcost, error_per_bit); //mv_bits(error_per_bit, &this_mv, ref_mv, mvsadcost);
+            thissad  += mv_err_cost(&this_mv, center_mv, mvsadcost, error_per_bit); //mv_bits(error_per_bit, &this_mv, ref_mv, mvsadcost);
 
             if (thissad < bestsad)
             {
@@ -1261,7 +1263,7 @@
 
     if (bestsad < INT_MAX)
         return fn_ptr->vf(what, what_stride, bestaddress, in_what_stride, (unsigned int *)(&thissad))
-        + vp8_mv_err_cost(&this_mv, center_mv, mvcost, error_per_bit);
+        + mv_err_cost(&this_mv, center_mv, mvcost, error_per_bit);
     else
         return INT_MAX;
 }
@@ -1304,7 +1306,7 @@
     (ref_row > x->mv_row_min) && (ref_row < x->mv_row_max))
     {
         // Baseline value at the centre
-        bestsad = fn_ptr->sdf(what, what_stride, bestaddress, in_what_stride, 0x7fffffff) + vp8_mv_err_cost(ref_mv, center_mv, mvsadcost, error_per_bit);
+        bestsad = fn_ptr->sdf(what, what_stride, bestaddress, in_what_stride, 0x7fffffff) + mv_err_cost(ref_mv, center_mv, mvsadcost, error_per_bit);
     }
 
     // Apply further limits to prevent us looking using vectors that stretch beyiond the UMV border
@@ -1339,7 +1341,7 @@
                 if (thissad < bestsad)
                 {
                     this_mv.col = c << 3;
-                    thissad  += vp8_mv_err_cost(&this_mv, center_mv, mvsadcost, error_per_bit);
+                    thissad  += mv_err_cost(&this_mv, center_mv, mvsadcost, error_per_bit);
 
                     if (thissad < bestsad)
                     {
@@ -1362,7 +1364,7 @@
             if (thissad < bestsad)
             {
                 this_mv.col = c << 3;
-                thissad  += vp8_mv_err_cost(&this_mv, center_mv, mvsadcost, error_per_bit);
+                thissad  += mv_err_cost(&this_mv, center_mv, mvsadcost, error_per_bit);
 
                 if (thissad < bestsad)
                 {
@@ -1384,7 +1386,7 @@
 
     if (bestsad < INT_MAX)
         return fn_ptr->vf(what, what_stride, bestaddress, in_what_stride, (unsigned int *)(&thissad))
-        + vp8_mv_err_cost(&this_mv, center_mv, mvcost, error_per_bit);
+        + mv_err_cost(&this_mv, center_mv, mvcost, error_per_bit);
     else
         return INT_MAX;
 }
@@ -1413,7 +1415,7 @@
     int col_min = ref_col - distance;
     int col_max = ref_col + distance;
 
-    unsigned short sad_array8[8];
+    DECLARE_ALIGNED_ARRAY(16, unsigned short, sad_array8, 8);
     unsigned int sad_array[3];
 
     // Work out the mid point for the search
@@ -1428,7 +1430,7 @@
     (ref_row > x->mv_row_min) && (ref_row < x->mv_row_max))
     {
         // Baseline value at the centre
-        bestsad = fn_ptr->sdf(what, what_stride, bestaddress, in_what_stride, 0x7fffffff) + vp8_mv_err_cost(ref_mv, center_mv, mvsadcost, error_per_bit);
+        bestsad = fn_ptr->sdf(what, what_stride, bestaddress, in_what_stride, 0x7fffffff) + mv_err_cost(ref_mv, center_mv, mvsadcost, error_per_bit);
     }
 
     // Apply further limits to prevent us looking using vectors that stretch beyiond the UMV border
@@ -1463,7 +1465,7 @@
                 if (thissad < bestsad)
                 {
                     this_mv.col = c << 3;
-                    thissad  += vp8_mv_err_cost(&this_mv, center_mv, mvsadcost, error_per_bit);
+                    thissad  += mv_err_cost(&this_mv, center_mv, mvsadcost, error_per_bit);
 
                     if (thissad < bestsad)
                     {
@@ -1492,7 +1494,7 @@
                 if (thissad < bestsad)
                 {
                     this_mv.col = c << 3;
-                    thissad  += vp8_mv_err_cost(&this_mv, center_mv, mvsadcost, error_per_bit);
+                    thissad  += mv_err_cost(&this_mv, center_mv, mvsadcost, error_per_bit);
 
                     if (thissad < bestsad)
                     {
@@ -1515,7 +1517,7 @@
             if (thissad < bestsad)
             {
                 this_mv.col = c << 3;
-                thissad  += vp8_mv_err_cost(&this_mv, center_mv, mvsadcost, error_per_bit);
+                thissad  += mv_err_cost(&this_mv, center_mv, mvsadcost, error_per_bit);
 
                 if (thissad < bestsad)
                 {
@@ -1536,7 +1538,7 @@
 
     if (bestsad < INT_MAX)
         return fn_ptr->vf(what, what_stride, bestaddress, in_what_stride, (unsigned int *)(&thissad))
-        + vp8_mv_err_cost(&this_mv, center_mv, mvcost, error_per_bit);
+        + mv_err_cost(&this_mv, center_mv, mvcost, error_per_bit);
     else
         return INT_MAX;
 }

diff --git a/vp8/encoder/mcomp.h b/vp8/encoder/mcomp.h
index 7600f87..83f95c6 100644
--- a/vp8/encoder/mcomp.h
+++ b/vp8/encoder/mcomp.h

@@ -43,8 +43,8 @@
     int *num00,
     const vp8_variance_fn_ptr_t *vf,
     int *mvsadcost[2],
-    int *mvcost[2]
-
+    int *mvcost[2],
+    MV *center_mv
 );
 
 typedef int (fractional_mv_step_fp)

diff --git a/vp8/encoder/modecosts.c b/vp8/encoder/modecosts.c
index d23c97e..c636c48 100644
--- a/vp8/encoder/modecosts.c
+++ b/vp8/encoder/modecosts.c

@@ -9,10 +9,10 @@
  */
 
 
-#include "blockd.h"
+#include "vp8/common/blockd.h"
 #include "onyx_int.h"
 #include "treewriter.h"
-#include "entropymode.h"
+#include "vp8/common/entropymode.h"
 
 
 void vp8_init_mode_costs(VP8_COMP *c)

diff --git a/vp8/encoder/onyx_if.c b/vp8/encoder/onyx_if.c
index df1a6f5..931c51a 100644
--- a/vp8/encoder/onyx_if.c
+++ b/vp8/encoder/onyx_if.c

@@ -9,27 +9,26 @@
  */
 
 
-#include "onyxc_int.h"
+#include "vp8/common/onyxc_int.h"
 #include "onyx_int.h"
-#include "systemdependent.h"
+#include "vp8/common/systemdependent.h"
 #include "quantize.h"
-#include "alloccommon.h"
+#include "vp8/common/alloccommon.h"
 #include "mcomp.h"
 #include "firstpass.h"
 #include "psnr.h"
 #include "vpx_scale/vpxscale.h"
-#include "extend.h"
+#include "vp8/common/extend.h"
 #include "ratectrl.h"
-#include "quant_common.h"
+#include "vp8/common/quant_common.h"
 #include "segmentation.h"
-#include "g_common.h"
+#include "vp8/common/g_common.h"
 #include "vpx_scale/yv12extend.h"
-#include "postproc.h"
+#include "vp8/common/postproc.h"
 #include "vpx_mem/vpx_mem.h"
-#include "swapyv12buffer.h"
-#include "threading.h"
+#include "vp8/common/swapyv12buffer.h"
+#include "vp8/common/threading.h"
 #include "vpx_ports/vpx_timer.h"
-#include "vpxerrors.h"
 #include "temporal_filter.h"
 #if ARCH_ARM
 #include "vpx_ports/arm.h"
@@ -71,7 +70,6 @@
 
 int vp8_estimate_entropy_savings(VP8_COMP *cpi);
 int vp8_calc_ss_err(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *dest, const vp8_variance_rtcd_vtable_t *rtcd);
-int vp8_calc_low_ss_err(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *dest, const vp8_variance_rtcd_vtable_t *rtcd);
 
 extern void vp8_temporal_filter_prepare_c(VP8_COMP *cpi);
 
@@ -87,9 +85,11 @@
     YV12_BUFFER_CONFIG *source,
     YV12_BUFFER_CONFIG *dest,
     int lumamask,
-    double *weight
+    double *weight,
+    const vp8_variance_rtcd_vtable_t *rtcd
 );
 
+
 extern double vp8_calc_ssimg
 (
     YV12_BUFFER_CONFIG *source,
@@ -260,41 +260,28 @@
 }
 
 
-void vp8_dealloc_compressor_data(VP8_COMP *cpi)
+static void dealloc_compressor_data(VP8_COMP *cpi)
 {
-    // Delete last frame MV storage buffers
-    if (cpi->lfmv != 0)
-        vpx_free(cpi->lfmv);
+    vpx_free(cpi->tplist);
+    cpi->tplist = NULL;
 
+    // Delete last frame MV storage buffers
+    vpx_free(cpi->lfmv);
     cpi->lfmv = 0;
 
-    if (cpi->lf_ref_frame_sign_bias != 0)
-        vpx_free(cpi->lf_ref_frame_sign_bias);
-
+    vpx_free(cpi->lf_ref_frame_sign_bias);
     cpi->lf_ref_frame_sign_bias = 0;
 
-    if (cpi->lf_ref_frame != 0)
-        vpx_free(cpi->lf_ref_frame);
-
+    vpx_free(cpi->lf_ref_frame);
     cpi->lf_ref_frame = 0;
 
     // Delete sementation map
-    if (cpi->segmentation_map != 0)
-        vpx_free(cpi->segmentation_map);
-
+    vpx_free(cpi->segmentation_map);
     cpi->segmentation_map = 0;
 
-    if (cpi->active_map != 0)
-        vpx_free(cpi->active_map);
-
+    vpx_free(cpi->active_map);
     cpi->active_map = 0;
 
-    // Delete first pass motion map
-    if (cpi->fp_motion_map != 0)
-        vpx_free(cpi->fp_motion_map);
-
-    cpi->fp_motion_map = 0;
-
     vp8_de_alloc_frame_buffers(&cpi->common);
 
     vp8_yv12_de_alloc_frame_buffer(&cpi->last_frame_uf);
@@ -315,25 +302,17 @@
     cpi->tok = 0;
 
     // Structure used to monitor GF usage
-    if (cpi->gf_active_flags != 0)
-        vpx_free(cpi->gf_active_flags);
-
+    vpx_free(cpi->gf_active_flags);
     cpi->gf_active_flags = 0;
 
-    if(cpi->mb.pip)
-        vpx_free(cpi->mb.pip);
-
+    vpx_free(cpi->mb.pip);
     cpi->mb.pip = 0;
 
 #if !(CONFIG_REALTIME_ONLY)
-    if(cpi->total_stats)
-        vpx_free(cpi->total_stats);
-
+    vpx_free(cpi->total_stats);
     cpi->total_stats = 0;
 
-    if(cpi->this_frame_stats)
-        vpx_free(cpi->this_frame_stats);
-
+    vpx_free(cpi->this_frame_stats);
     cpi->this_frame_stats = 0;
 #endif
 }
@@ -438,7 +417,6 @@
     set_segment_data(ptr, &feature_data[0][0], SEGMENT_DELTADATA);
 
     // Delete sementation map
-    if (seg_map != 0)
         vpx_free(seg_map);
 
     seg_map = 0;
@@ -532,7 +510,6 @@
     set_segment_data((VP8_PTR)cpi, &feature_data[0][0], SEGMENT_DELTADATA);
 
     // Delete sementation map
-    if (seg_map != 0)
         vpx_free(seg_map);
 
     seg_map = 0;
@@ -598,6 +575,7 @@
 
     sf->first_step = 0;
     sf->max_step_search_steps = MAX_MVSEARCH_STEPS;
+    sf->improved_mv_pred = 1;
 
     cpi->do_full[0] = 0;
     cpi->do_full[1] = 0;
@@ -640,34 +618,6 @@
 
         sf->first_step = 0;
         sf->max_step_search_steps = MAX_MVSEARCH_STEPS;
-
-        if (!(cpi->ref_frame_flags & VP8_LAST_FLAG))
-        {
-            sf->thresh_mult[THR_NEWMV    ] = INT_MAX;
-            sf->thresh_mult[THR_NEARESTMV] = INT_MAX;
-            sf->thresh_mult[THR_ZEROMV   ] = INT_MAX;
-            sf->thresh_mult[THR_NEARMV   ] = INT_MAX;
-            sf->thresh_mult[THR_SPLITMV  ] = INT_MAX;
-        }
-
-        if (!(cpi->ref_frame_flags & VP8_GOLD_FLAG))
-        {
-            sf->thresh_mult[THR_NEARESTG ] = INT_MAX;
-            sf->thresh_mult[THR_ZEROG    ] = INT_MAX;
-            sf->thresh_mult[THR_NEARG    ] = INT_MAX;
-            sf->thresh_mult[THR_NEWG     ] = INT_MAX;
-            sf->thresh_mult[THR_SPLITG   ] = INT_MAX;
-        }
-
-        if (!(cpi->ref_frame_flags & VP8_ALT_FLAG))
-        {
-            sf->thresh_mult[THR_NEARESTA ] = INT_MAX;
-            sf->thresh_mult[THR_ZEROA    ] = INT_MAX;
-            sf->thresh_mult[THR_NEARA    ] = INT_MAX;
-            sf->thresh_mult[THR_NEWA     ] = INT_MAX;
-            sf->thresh_mult[THR_SPLITA   ] = INT_MAX;
-        }
-
         break;
     case 1:
     case 3:
@@ -725,41 +675,22 @@
         sf->full_freq[0] = 15;
         sf->full_freq[1] = 31;
 
-        sf->first_step = 0;
-        sf->max_step_search_steps = MAX_MVSEARCH_STEPS;
-
-        if (!(cpi->ref_frame_flags & VP8_LAST_FLAG))
-        {
-            sf->thresh_mult[THR_NEWMV    ] = INT_MAX;
-            sf->thresh_mult[THR_NEARESTMV] = INT_MAX;
-            sf->thresh_mult[THR_ZEROMV   ] = INT_MAX;
-            sf->thresh_mult[THR_NEARMV   ] = INT_MAX;
-            sf->thresh_mult[THR_SPLITMV  ] = INT_MAX;
-        }
-
-        if (!(cpi->ref_frame_flags & VP8_GOLD_FLAG))
-        {
-            sf->thresh_mult[THR_NEARESTG ] = INT_MAX;
-            sf->thresh_mult[THR_ZEROG    ] = INT_MAX;
-            sf->thresh_mult[THR_NEARG    ] = INT_MAX;
-            sf->thresh_mult[THR_NEWG     ] = INT_MAX;
-            sf->thresh_mult[THR_SPLITG   ] = INT_MAX;
-        }
-
-        if (!(cpi->ref_frame_flags & VP8_ALT_FLAG))
-        {
-            sf->thresh_mult[THR_NEARESTA ] = INT_MAX;
-            sf->thresh_mult[THR_ZEROA    ] = INT_MAX;
-            sf->thresh_mult[THR_NEARA    ] = INT_MAX;
-            sf->thresh_mult[THR_NEWA     ] = INT_MAX;
-            sf->thresh_mult[THR_SPLITA   ] = INT_MAX;
-        }
-
         if (Speed > 0)
         {
-            // Disable coefficient optimization above speed 0
+            /* Disable coefficient optimization above speed 0 */
             sf->optimize_coefficients = 0;
+            sf->use_fastquant_for_pick = 1;
+            sf->no_skip_block4x4_search = 0;
 
+            sf->first_step = 1;
+
+            cpi->mode_check_freq[THR_SPLITG] = 2;
+            cpi->mode_check_freq[THR_SPLITA] = 2;
+            cpi->mode_check_freq[THR_SPLITMV] = 0;
+        }
+
+        if (Speed > 1)
+        {
             cpi->mode_check_freq[THR_SPLITG] = 4;
             cpi->mode_check_freq[THR_SPLITA] = 4;
             cpi->mode_check_freq[THR_SPLITMV] = 2;
@@ -792,18 +723,10 @@
                 sf->thresh_mult[THR_NEWA     ] = 2000;
                 sf->thresh_mult[THR_SPLITA   ] = 20000;
             }
-
-            sf->use_fastquant_for_pick = 1;
-
-            sf->first_step = 1;
-            sf->max_step_search_steps = MAX_MVSEARCH_STEPS;
-            sf->no_skip_block4x4_search = 0;
         }
 
-        if (Speed > 1)
+        if (Speed > 2)
         {
-            sf->use_fastquant_for_pick = 0;
-
             cpi->mode_check_freq[THR_SPLITG] = 15;
             cpi->mode_check_freq[THR_SPLITA] = 15;
             cpi->mode_check_freq[THR_SPLITMV] = 7;
@@ -837,8 +760,6 @@
                 sf->thresh_mult[THR_SPLITA   ] = 50000;
             }
 
-            sf->first_step = 1;
-
             sf->improved_quant = 0;
             sf->improved_dct = 0;
 
@@ -848,38 +769,14 @@
 
             sf->full_freq[0] = 31;
             sf->full_freq[1] = 63;
-
-        }
-
-        if (Speed > 2)
-        {
-            sf->auto_filter = 0;                     // Faster selection of loop filter
-            cpi->mode_check_freq[THR_V_PRED] = 2;
-            cpi->mode_check_freq[THR_H_PRED] = 2;
-            cpi->mode_check_freq[THR_B_PRED] = 2;
-
-            if (cpi->ref_frame_flags & VP8_GOLD_FLAG)
-            {
-                cpi->mode_check_freq[THR_NEARG] = 2;
-                cpi->mode_check_freq[THR_NEWG] = 4;
-            }
-
-            if (cpi->ref_frame_flags & VP8_ALT_FLAG)
-            {
-                cpi->mode_check_freq[THR_NEARA] = 2;
-                cpi->mode_check_freq[THR_NEWA] = 4;
-            }
-
-            sf->thresh_mult[THR_SPLITA  ] = INT_MAX;
-            sf->thresh_mult[THR_SPLITG  ] = INT_MAX;
-            sf->thresh_mult[THR_SPLITMV  ] = INT_MAX;
-
-            sf->full_freq[0] = 63;
-            sf->full_freq[1] = 127;
         }
 
         if (Speed > 3)
         {
+            sf->thresh_mult[THR_SPLITA  ] = INT_MAX;
+            sf->thresh_mult[THR_SPLITG  ] = INT_MAX;
+            sf->thresh_mult[THR_SPLITMV  ] = INT_MAX;
+
             cpi->mode_check_freq[THR_V_PRED] = 0;
             cpi->mode_check_freq[THR_H_PRED] = 0;
             cpi->mode_check_freq[THR_B_PRED] = 0;
@@ -891,13 +788,16 @@
             sf->auto_filter = 1;
             sf->recode_loop = 0; // recode loop off
             sf->RD = 0;         // Turn rd off
-            sf->full_freq[0] = INT_MAX;
-            sf->full_freq[1] = INT_MAX;
+
+            sf->full_freq[0] = 63;
+            sf->full_freq[1] = 127;
         }
 
         if (Speed > 4)
         {
             sf->auto_filter = 0;                     // Faster selection of loop filter
+            sf->full_freq[0] = INT_MAX;
+            sf->full_freq[1] = INT_MAX;
 
             cpi->mode_check_freq[THR_V_PRED] = 2;
             cpi->mode_check_freq[THR_H_PRED] = 2;
@@ -963,33 +863,6 @@
         sf->full_freq[1] = 31;
         sf->search_method = NSTEP;
 
-        if (!(cpi->ref_frame_flags & VP8_LAST_FLAG))
-        {
-            sf->thresh_mult[THR_NEWMV    ] = INT_MAX;
-            sf->thresh_mult[THR_NEARESTMV] = INT_MAX;
-            sf->thresh_mult[THR_ZEROMV   ] = INT_MAX;
-            sf->thresh_mult[THR_NEARMV   ] = INT_MAX;
-            sf->thresh_mult[THR_SPLITMV  ] = INT_MAX;
-        }
-
-        if (!(cpi->ref_frame_flags & VP8_GOLD_FLAG))
-        {
-            sf->thresh_mult[THR_NEARESTG ] = INT_MAX;
-            sf->thresh_mult[THR_ZEROG    ] = INT_MAX;
-            sf->thresh_mult[THR_NEARG    ] = INT_MAX;
-            sf->thresh_mult[THR_NEWG     ] = INT_MAX;
-            sf->thresh_mult[THR_SPLITG   ] = INT_MAX;
-        }
-
-        if (!(cpi->ref_frame_flags & VP8_ALT_FLAG))
-        {
-            sf->thresh_mult[THR_NEARESTA ] = INT_MAX;
-            sf->thresh_mult[THR_ZEROA    ] = INT_MAX;
-            sf->thresh_mult[THR_NEARA    ] = INT_MAX;
-            sf->thresh_mult[THR_NEWA     ] = INT_MAX;
-            sf->thresh_mult[THR_SPLITA   ] = INT_MAX;
-        }
-
         if (Speed > 0)
         {
             cpi->mode_check_freq[THR_SPLITG] = 4;
@@ -1118,6 +991,7 @@
 #else
             sf->search_method = DIAMOND;
 #endif
+            sf->iterative_sub_pixel = 0;
 
             cpi->mode_check_freq[THR_V_PRED] = 4;
             cpi->mode_check_freq[THR_H_PRED] = 4;
@@ -1169,7 +1043,6 @@
             int total_skip;
 
             int min = 2000;
-            sf->iterative_sub_pixel = 0;
 
             if (cpi->oxcf.encode_breakout > 2000)
                 min = cpi->oxcf.encode_breakout;
@@ -1225,6 +1098,7 @@
             sf->thresh_mult[THR_V_PRED] = INT_MAX;
             sf->thresh_mult[THR_H_PRED] = INT_MAX;
 
+            sf->improved_mv_pred = 0;
         }
 
         if (Speed > 8)
@@ -1270,7 +1144,36 @@
 
         vpx_memset(cpi->error_bins, 0, sizeof(cpi->error_bins));
 
-    };
+    }; /* switch */
+
+    /* disable frame modes if flags not set */
+    if (!(cpi->ref_frame_flags & VP8_LAST_FLAG))
+    {
+        sf->thresh_mult[THR_NEWMV    ] = INT_MAX;
+        sf->thresh_mult[THR_NEARESTMV] = INT_MAX;
+        sf->thresh_mult[THR_ZEROMV   ] = INT_MAX;
+        sf->thresh_mult[THR_NEARMV   ] = INT_MAX;
+        sf->thresh_mult[THR_SPLITMV  ] = INT_MAX;
+    }
+
+    if (!(cpi->ref_frame_flags & VP8_GOLD_FLAG))
+    {
+        sf->thresh_mult[THR_NEARESTG ] = INT_MAX;
+        sf->thresh_mult[THR_ZEROG    ] = INT_MAX;
+        sf->thresh_mult[THR_NEARG    ] = INT_MAX;
+        sf->thresh_mult[THR_NEWG     ] = INT_MAX;
+        sf->thresh_mult[THR_SPLITG   ] = INT_MAX;
+    }
+
+    if (!(cpi->ref_frame_flags & VP8_ALT_FLAG))
+    {
+        sf->thresh_mult[THR_NEARESTA ] = INT_MAX;
+        sf->thresh_mult[THR_ZEROA    ] = INT_MAX;
+        sf->thresh_mult[THR_NEARA    ] = INT_MAX;
+        sf->thresh_mult[THR_NEWA     ] = INT_MAX;
+        sf->thresh_mult[THR_SPLITA   ] = INT_MAX;
+    }
+
 
     // Slow quant, dct and trellis not worthwhile for first pass
     // so make sure they are always turned off.
@@ -1350,6 +1253,8 @@
 static void alloc_raw_frame_buffers(VP8_COMP *cpi)
 {
     int i, buffers;
+    /* allocate source_buffer to be multiples of 16 */
+    int width = (cpi->oxcf.Width + 15) & ~15;
 
     buffers = cpi->oxcf.lag_in_frames;
 
@@ -1361,7 +1266,7 @@
 
     for (i = 0; i < buffers; i++)
         if (vp8_yv12_alloc_frame_buffer(&cpi->src_buffer[i].source_buffer,
-                                        cpi->oxcf.Width, cpi->oxcf.Height,
+                                        width, cpi->oxcf.Height,
                                         16))
             vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR,
                                "Failed to allocate lag buffer");
@@ -1369,7 +1274,7 @@
 #if VP8_TEMPORAL_ALT_REF
 
     if (vp8_yv12_alloc_frame_buffer(&cpi->alt_ref_buffer.source_buffer,
-                                    cpi->oxcf.Width, cpi->oxcf.Height, 16))
+                                    width, cpi->oxcf.Height, 16))
         vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR,
                            "Failed to allocate altref buffer");
 
@@ -1380,14 +1285,13 @@
 
 static int vp8_alloc_partition_data(VP8_COMP *cpi)
 {
-    if(cpi->mb.pip)
         vpx_free(cpi->mb.pip);
 
     cpi->mb.pip = vpx_calloc((cpi->common.mb_cols + 1) *
                                 (cpi->common.mb_rows + 1),
                                 sizeof(PARTITION_INFO));
     if(!cpi->mb.pip)
-        return ALLOC_FAILURE;
+        return 1;
 
     cpi->mb.pi = cpi->mb.pip + cpi->common.mode_info_stride + 1;
 
@@ -1427,7 +1331,6 @@
                            "Failed to allocate scaled source buffer");
 
 
-    if (cpi->tok != 0)
         vpx_free(cpi->tok);
 
     {
@@ -1443,7 +1346,6 @@
 
 
     // Structures used to minitor GF usage
-    if (cpi->gf_active_flags != 0)
         vpx_free(cpi->gf_active_flags);
 
     CHECK_MEM_ERROR(cpi->gf_active_flags, vpx_calloc(1, cm->mb_rows * cm->mb_cols));
@@ -1451,15 +1353,13 @@
     cpi->gf_active_count = cm->mb_rows * cm->mb_cols;
 
 #if !(CONFIG_REALTIME_ONLY)
-    if(cpi->total_stats)
         vpx_free(cpi->total_stats);
 
-    cpi->total_stats = vpx_calloc(1, vp8_firstpass_stats_sz(cpi->common.MBs));
+    cpi->total_stats = vpx_calloc(1, sizeof(FIRSTPASS_STATS));
 
-    if(cpi->this_frame_stats)
         vpx_free(cpi->this_frame_stats);
 
-    cpi->this_frame_stats = vpx_calloc(1, vp8_firstpass_stats_sz(cpi->common.MBs));
+    cpi->this_frame_stats = vpx_calloc(1, sizeof(FIRSTPASS_STATS));
 
     if(!cpi->total_stats || !cpi->this_frame_stats)
         vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR,
@@ -1476,6 +1376,10 @@
     else
         cpi->mt_sync_range = 16;
 #endif
+
+        vpx_free(cpi->tplist);
+
+    CHECK_MEM_ERROR(cpi->tplist, vpx_malloc(sizeof(TOKENLIST) * cpi->common.mb_rows));
 }
 
 
@@ -1548,13 +1452,12 @@
 }
 
 
-void vp8_init_config(VP8_PTR ptr, VP8_CONFIG *oxcf)
+static void init_config(VP8_PTR ptr, VP8_CONFIG *oxcf)
 {
     VP8_COMP *cpi = (VP8_COMP *)(ptr);
     VP8_COMMON *cm = &cpi->common;
 
-    if (!cpi)
-        return;
+    cpi->oxcf = *oxcf;
 
     cpi->auto_gold = 1;
     cpi->auto_adjust_gold_quantizer = 1;
@@ -1566,299 +1469,31 @@
     cm->version = oxcf->Version;
     vp8_setup_version(cm);
 
-    if (oxcf == 0)
-    {
-        cpi->pass                     = 0;
+    // change includes all joint functionality
+    vp8_change_config(ptr, oxcf);
 
-        cpi->auto_worst_q              = 0;
-        cpi->oxcf.best_allowed_q            = MINQ;
-        cpi->oxcf.worst_allowed_q           = MAXQ;
-        cpi->oxcf.cq_level = MINQ;
+    // Initialize active best and worst q and average q values.
+    cpi->active_worst_quality         = cpi->oxcf.worst_allowed_q;
+    cpi->active_best_quality          = cpi->oxcf.best_allowed_q;
+    cpi->avg_frame_qindex             = cpi->oxcf.worst_allowed_q;
 
-        cpi->oxcf.end_usage                = USAGE_STREAM_FROM_SERVER;
-        cpi->oxcf.starting_buffer_level     =   4000;
-        cpi->oxcf.optimal_buffer_level      =   5000;
-        cpi->oxcf.maximum_buffer_size       =   6000;
-        cpi->oxcf.under_shoot_pct           =  90;
-        cpi->oxcf.allow_df                 =   0;
-        cpi->oxcf.drop_frames_water_mark     =  20;
-
-        cpi->oxcf.allow_spatial_resampling  = 0;
-        cpi->oxcf.resample_down_water_mark   = 40;
-        cpi->oxcf.resample_up_water_mark     = 60;
-
-        cpi->oxcf.fixed_q = cpi->interquantizer;
-
-        cpi->filter_type = NORMAL_LOOPFILTER;
-
-        if (cm->simpler_lpf)
-            cpi->filter_type = SIMPLE_LOOPFILTER;
-
-        cpi->compressor_speed = 1;
-        cpi->horiz_scale = 0;
-        cpi->vert_scale = 0;
-        cpi->oxcf.two_pass_vbrbias = 50;
-        cpi->oxcf.two_pass_vbrmax_section = 400;
-        cpi->oxcf.two_pass_vbrmin_section = 0;
-
-        cpi->oxcf.Sharpness = 0;
-        cpi->oxcf.noise_sensitivity = 0;
-    }
-    else
-        cpi->oxcf = *oxcf;
-
-
-    switch (cpi->oxcf.Mode)
-    {
-
-    case MODE_REALTIME:
-        cpi->pass = 0;
-        cpi->compressor_speed = 2;
-
-        if (cpi->oxcf.cpu_used < -16)
-        {
-            cpi->oxcf.cpu_used = -16;
-        }
-
-        if (cpi->oxcf.cpu_used > 16)
-            cpi->oxcf.cpu_used = 16;
-
-        break;
-
-#if !(CONFIG_REALTIME_ONLY)
-    case MODE_GOODQUALITY:
-        cpi->pass = 0;
-        cpi->compressor_speed = 1;
-
-        if (cpi->oxcf.cpu_used < -5)
-        {
-            cpi->oxcf.cpu_used = -5;
-        }
-
-        if (cpi->oxcf.cpu_used > 5)
-            cpi->oxcf.cpu_used = 5;
-
-        break;
-
-    case MODE_BESTQUALITY:
-        cpi->pass = 0;
-        cpi->compressor_speed = 0;
-        break;
-
-    case MODE_FIRSTPASS:
-        cpi->pass = 1;
-        cpi->compressor_speed = 1;
-        break;
-    case MODE_SECONDPASS:
-        cpi->pass = 2;
-        cpi->compressor_speed = 1;
-
-        if (cpi->oxcf.cpu_used < -5)
-        {
-            cpi->oxcf.cpu_used = -5;
-        }
-
-        if (cpi->oxcf.cpu_used > 5)
-            cpi->oxcf.cpu_used = 5;
-
-        break;
-    case MODE_SECONDPASS_BEST:
-        cpi->pass = 2;
-        cpi->compressor_speed = 0;
-        break;
-#endif
-    }
-
-    if (cpi->pass == 0)
-        cpi->auto_worst_q = 1;
-
-    cpi->oxcf.worst_allowed_q = q_trans[oxcf->worst_allowed_q];
-    cpi->oxcf.best_allowed_q  = q_trans[oxcf->best_allowed_q];
-    cpi->oxcf.cq_level = q_trans[cpi->oxcf.cq_level];
-
-    if (oxcf->fixed_q >= 0)
-    {
-        if (oxcf->worst_allowed_q < 0)
-            cpi->oxcf.fixed_q = q_trans[0];
-        else
-            cpi->oxcf.fixed_q = q_trans[oxcf->worst_allowed_q];
-
-        if (oxcf->alt_q < 0)
-            cpi->oxcf.alt_q = q_trans[0];
-        else
-            cpi->oxcf.alt_q = q_trans[oxcf->alt_q];
-
-        if (oxcf->key_q < 0)
-            cpi->oxcf.key_q = q_trans[0];
-        else
-            cpi->oxcf.key_q = q_trans[oxcf->key_q];
-
-        if (oxcf->gold_q < 0)
-            cpi->oxcf.gold_q = q_trans[0];
-        else
-            cpi->oxcf.gold_q = q_trans[oxcf->gold_q];
-
-    }
-
-    cpi->baseline_gf_interval = cpi->oxcf.alt_freq ? cpi->oxcf.alt_freq : DEFAULT_GF_INTERVAL;
-    cpi->ref_frame_flags = VP8_ALT_FLAG | VP8_GOLD_FLAG | VP8_LAST_FLAG;
-
-    //cpi->use_golden_frame_only = 0;
-    //cpi->use_last_frame_only = 0;
-    cm->refresh_golden_frame = 0;
-    cm->refresh_last_frame = 1;
-    cm->refresh_entropy_probs = 1;
-
-    if (cpi->oxcf.token_partitions >= 0 && cpi->oxcf.token_partitions <= 3)
-        cm->multi_token_partition = (TOKEN_PARTITION) cpi->oxcf.token_partitions;
-
-    setup_features(cpi);
-
-    {
-        int i;
-
-        for (i = 0; i < MAX_MB_SEGMENTS; i++)
-            cpi->segment_encode_breakout[i] = cpi->oxcf.encode_breakout;
-    }
-
-    // At the moment the first order values may not be > MAXQ
-    if (cpi->oxcf.fixed_q > MAXQ)
-        cpi->oxcf.fixed_q = MAXQ;
-
-    // local file playback mode == really big buffer
-    if (cpi->oxcf.end_usage == USAGE_LOCAL_FILE_PLAYBACK)
-    {
-        cpi->oxcf.starting_buffer_level   = 60000;
-        cpi->oxcf.optimal_buffer_level    = 60000;
-        cpi->oxcf.maximum_buffer_size     = 240000;
-
-    }
-
-
-    // Convert target bandwidth from Kbit/s to Bit/s
-    cpi->oxcf.target_bandwidth       *= 1000;
+    // Initialise the starting buffer levels
     cpi->oxcf.starting_buffer_level =
         rescale(cpi->oxcf.starting_buffer_level,
                 cpi->oxcf.target_bandwidth, 1000);
 
-    if (cpi->oxcf.optimal_buffer_level == 0)
-        cpi->oxcf.optimal_buffer_level = cpi->oxcf.target_bandwidth / 8;
-    else
-        cpi->oxcf.optimal_buffer_level =
-            rescale(cpi->oxcf.optimal_buffer_level,
-                    cpi->oxcf.target_bandwidth, 1000);
-
-    if (cpi->oxcf.maximum_buffer_size == 0)
-        cpi->oxcf.maximum_buffer_size = cpi->oxcf.target_bandwidth / 8;
-    else
-        cpi->oxcf.maximum_buffer_size =
-            rescale(cpi->oxcf.maximum_buffer_size,
-                    cpi->oxcf.target_bandwidth, 1000);
-
-    cpi->buffer_level                = cpi->oxcf.starting_buffer_level;
+    cpi->buffer_level                 = cpi->oxcf.starting_buffer_level;
     cpi->bits_off_target              = cpi->oxcf.starting_buffer_level;
 
-    vp8_new_frame_rate(cpi, cpi->oxcf.frame_rate);
-    cpi->worst_quality               = cpi->oxcf.worst_allowed_q;
-    cpi->active_worst_quality         = cpi->oxcf.worst_allowed_q;
-    cpi->avg_frame_qindex             = cpi->oxcf.worst_allowed_q;
-    cpi->best_quality                = cpi->oxcf.best_allowed_q;
-    cpi->active_best_quality          = cpi->oxcf.best_allowed_q;
-    cpi->cq_target_quality = cpi->oxcf.cq_level;
-
-    cpi->buffered_mode = (cpi->oxcf.optimal_buffer_level > 0) ? TRUE : FALSE;
-
     cpi->rolling_target_bits          = cpi->av_per_frame_bandwidth;
     cpi->rolling_actual_bits          = cpi->av_per_frame_bandwidth;
-    cpi->long_rolling_target_bits      = cpi->av_per_frame_bandwidth;
-    cpi->long_rolling_actual_bits      = cpi->av_per_frame_bandwidth;
+    cpi->long_rolling_target_bits     = cpi->av_per_frame_bandwidth;
+    cpi->long_rolling_actual_bits     = cpi->av_per_frame_bandwidth;
 
     cpi->total_actual_bits            = 0;
-    cpi->total_target_vs_actual        = 0;
-
-    // Only allow dropped frames in buffered mode
-    cpi->drop_frames_allowed          = cpi->oxcf.allow_df && cpi->buffered_mode;
-
-    cm->filter_type      = (LOOPFILTERTYPE) cpi->filter_type;
-
-    if (!cm->use_bilinear_mc_filter)
-        cm->mcomp_filter_type = SIXTAP;
-    else
-        cm->mcomp_filter_type = BILINEAR;
-
-    cpi->target_bandwidth = cpi->oxcf.target_bandwidth;
-
-    cm->Width       = cpi->oxcf.Width     ;
-    cm->Height      = cpi->oxcf.Height    ;
-
-    cpi->intra_frame_target = (4 * (cm->Width + cm->Height) / 15) * 1000; // As per VP8
-
-    cm->horiz_scale  = cpi->horiz_scale;
-    cm->vert_scale   = cpi->vert_scale ;
-
-    // VP8 sharpness level mapping 0-7 (vs 0-10 in general VPx dialogs)
-    if (cpi->oxcf.Sharpness > 7)
-        cpi->oxcf.Sharpness = 7;
-
-    cm->sharpness_level = cpi->oxcf.Sharpness;
-
-    if (cm->horiz_scale != NORMAL || cm->vert_scale != NORMAL)
-    {
-        int UNINITIALIZED_IS_SAFE(hr), UNINITIALIZED_IS_SAFE(hs);
-        int UNINITIALIZED_IS_SAFE(vr), UNINITIALIZED_IS_SAFE(vs);
-
-        Scale2Ratio(cm->horiz_scale, &hr, &hs);
-        Scale2Ratio(cm->vert_scale, &vr, &vs);
-
-        // always go to the next whole number
-        cm->Width = (hs - 1 + cpi->oxcf.Width * hr) / hs;
-        cm->Height = (vs - 1 + cpi->oxcf.Height * vr) / vs;
-    }
-
-    if (((cm->Width + 15) & 0xfffffff0) != cm->yv12_fb[cm->lst_fb_idx].y_width ||
-        ((cm->Height + 15) & 0xfffffff0) != cm->yv12_fb[cm->lst_fb_idx].y_height ||
-        cm->yv12_fb[cm->lst_fb_idx].y_width == 0)
-    {
-        alloc_raw_frame_buffers(cpi);
-        vp8_alloc_compressor_data(cpi);
-    }
-
-    // Clamp KF frame size to quarter of data rate
-    if (cpi->intra_frame_target > cpi->target_bandwidth >> 2)
-        cpi->intra_frame_target = cpi->target_bandwidth >> 2;
-
-    if (cpi->oxcf.fixed_q >= 0)
-    {
-        cpi->last_q[0] = cpi->oxcf.fixed_q;
-        cpi->last_q[1] = cpi->oxcf.fixed_q;
-    }
-
-    cpi->Speed = cpi->oxcf.cpu_used;
-
-    // force to allowlag to 0 if lag_in_frames is 0;
-    if (cpi->oxcf.lag_in_frames == 0)
-    {
-        cpi->oxcf.allow_lag = 0;
-    }
-    // Limit on lag buffers as these are not currently dynamically allocated
-    else if (cpi->oxcf.lag_in_frames > MAX_LAG_BUFFERS)
-        cpi->oxcf.lag_in_frames = MAX_LAG_BUFFERS;
-
-    // YX Temp
-    cpi->last_alt_ref_sei    = -1;
-    cpi->is_src_frame_alt_ref = 0;
-    cpi->is_next_src_alt_ref = 0;
-
-#if 0
-    // Experimental RD Code
-    cpi->frame_distortion = 0;
-    cpi->last_frame_distortion = 0;
-#endif
+    cpi->total_target_vs_actual       = 0;
 
 #if VP8_TEMPORAL_ALT_REF
-
-    cpi->use_weighted_temporal_filter = 0;
-
     {
         int i;
 
@@ -1870,12 +1505,6 @@
 #endif
 }
 
-/*
- * This function needs more clean up, i.e. be more tuned torwards
- * change_config rather than init_config  !!!!!!!!!!!!!!!!
- * YX - 5/28/2009
- *
- */
 
 void vp8_change_config(VP8_PTR ptr, VP8_CONFIG *oxcf)
 {
@@ -1988,7 +1617,8 @@
 
     }
 
-    cpi->baseline_gf_interval = cpi->oxcf.alt_freq ? cpi->oxcf.alt_freq : DEFAULT_GF_INTERVAL;
+    cpi->baseline_gf_interval =
+        cpi->oxcf.alt_freq ? cpi->oxcf.alt_freq : DEFAULT_GF_INTERVAL;
 
     cpi->ref_frame_flags = VP8_ALT_FLAG | VP8_GOLD_FLAG | VP8_LAST_FLAG;
 
@@ -1999,7 +1629,8 @@
     cm->refresh_entropy_probs = 1;
 
     if (cpi->oxcf.token_partitions >= 0 && cpi->oxcf.token_partitions <= 3)
-        cm->multi_token_partition = (TOKEN_PARTITION) cpi->oxcf.token_partitions;
+        cm->multi_token_partition =
+            (TOKEN_PARTITION) cpi->oxcf.token_partitions;
 
     setup_features(cpi);
 
@@ -2020,16 +1651,12 @@
         cpi->oxcf.starting_buffer_level   = 60000;
         cpi->oxcf.optimal_buffer_level    = 60000;
         cpi->oxcf.maximum_buffer_size     = 240000;
-
     }
 
     // Convert target bandwidth from Kbit/s to Bit/s
     cpi->oxcf.target_bandwidth       *= 1000;
 
-    cpi->oxcf.starting_buffer_level =
-        rescale(cpi->oxcf.starting_buffer_level,
-                cpi->oxcf.target_bandwidth, 1000);
-
+    // Set or reset optimal and maximum buffer levels.
     if (cpi->oxcf.optimal_buffer_level == 0)
         cpi->oxcf.optimal_buffer_level = cpi->oxcf.target_bandwidth / 8;
     else
@@ -2044,31 +1671,41 @@
             rescale(cpi->oxcf.maximum_buffer_size,
                     cpi->oxcf.target_bandwidth, 1000);
 
-    cpi->buffer_level                = cpi->oxcf.starting_buffer_level;
-    cpi->bits_off_target              = cpi->oxcf.starting_buffer_level;
-
+    // Set up frame rate and related parameters rate control values.
     vp8_new_frame_rate(cpi, cpi->oxcf.frame_rate);
+
+    // Set absolute upper and lower quality limits
     cpi->worst_quality               = cpi->oxcf.worst_allowed_q;
-    cpi->active_worst_quality         = cpi->oxcf.worst_allowed_q;
-    cpi->avg_frame_qindex             = cpi->oxcf.worst_allowed_q;
     cpi->best_quality                = cpi->oxcf.best_allowed_q;
-    cpi->active_best_quality          = cpi->oxcf.best_allowed_q;
+
+    // active values should only be modified if out of new range
+    if (cpi->active_worst_quality > cpi->oxcf.worst_allowed_q)
+    {
+      cpi->active_worst_quality = cpi->oxcf.worst_allowed_q;
+    }
+    // less likely
+    else if (cpi->active_worst_quality < cpi->oxcf.best_allowed_q)
+    {
+      cpi->active_worst_quality = cpi->oxcf.best_allowed_q;
+    }
+    if (cpi->active_best_quality < cpi->oxcf.best_allowed_q)
+    {
+      cpi->active_best_quality = cpi->oxcf.best_allowed_q;
+    }
+    // less likely
+    else if (cpi->active_best_quality > cpi->oxcf.worst_allowed_q)
+    {
+      cpi->active_best_quality = cpi->oxcf.worst_allowed_q;
+    }
+
     cpi->buffered_mode = (cpi->oxcf.optimal_buffer_level > 0) ? TRUE : FALSE;
 
     cpi->cq_target_quality = cpi->oxcf.cq_level;
 
-    cpi->rolling_target_bits          = cpi->av_per_frame_bandwidth;
-    cpi->rolling_actual_bits          = cpi->av_per_frame_bandwidth;
-    cpi->long_rolling_target_bits      = cpi->av_per_frame_bandwidth;
-    cpi->long_rolling_actual_bits      = cpi->av_per_frame_bandwidth;
-
-    cpi->total_actual_bits            = 0;
-    cpi->total_target_vs_actual        = 0;
-
     // Only allow dropped frames in buffered mode
-    cpi->drop_frames_allowed          = cpi->oxcf.allow_df && cpi->buffered_mode;
+    cpi->drop_frames_allowed = cpi->oxcf.allow_df && cpi->buffered_mode;
 
-    cm->filter_type                  = (LOOPFILTERTYPE) cpi->filter_type;
+    cm->filter_type          = (LOOPFILTERTYPE) cpi->filter_type;
 
     if (!cm->use_bilinear_mc_filter)
         cm->mcomp_filter_type = SIXTAP;
@@ -2083,7 +1720,8 @@
     cm->horiz_scale  = cpi->horiz_scale;
     cm->vert_scale   = cpi->vert_scale ;
 
-    cpi->intra_frame_target           = (4 * (cm->Width + cm->Height) / 15) * 1000; // As per VP8
+    // As per VP8
+    cpi->intra_frame_target = (4 * (cm->Width + cm->Height) / 15) * 1000;
 
     // VP8 sharpness level mapping 0-7 (vs 0-10 in general VPx dialogs)
     if (cpi->oxcf.Sharpness > 7)
@@ -2104,8 +1742,10 @@
         cm->Height = (vs - 1 + cpi->oxcf.Height * vr) / vs;
     }
 
-    if (((cm->Width + 15) & 0xfffffff0) != cm->yv12_fb[cm->lst_fb_idx].y_width ||
-        ((cm->Height + 15) & 0xfffffff0) != cm->yv12_fb[cm->lst_fb_idx].y_height ||
+    if (((cm->Width + 15) & 0xfffffff0) !=
+          cm->yv12_fb[cm->lst_fb_idx].y_width ||
+        ((cm->Height + 15) & 0xfffffff0) !=
+          cm->yv12_fb[cm->lst_fb_idx].y_height ||
         cm->yv12_fb[cm->lst_fb_idx].y_width == 0)
     {
         alloc_raw_frame_buffers(cpi);
@@ -2203,7 +1843,7 @@
     vp8_create_common(&cpi->common);
     vp8_cmachine_specific_config(cpi);
 
-    vp8_init_config((VP8_PTR)cpi, oxcf);
+    init_config((VP8_PTR)cpi, oxcf);
 
     memcpy(cpi->base_skip_false_prob, vp8cx_base_skip_false_prob, sizeof(vp8cx_base_skip_false_prob));
     cpi->common.current_video_frame   = 0;
@@ -2244,10 +1884,6 @@
     vpx_memset(cpi->active_map , 1, (cpi->common.mb_rows * cpi->common.mb_cols));
     cpi->active_map_enabled = 0;
 
-    // Create the first pass motion map structure and set to 0
-    // Allocate space for maximum of 15 buffers
-    CHECK_MEM_ERROR(cpi->fp_motion_map, vpx_calloc(15*cpi->common.MBs, 1));
-
 #if 0
     // Experimental code for lagged and one pass
     // Initialise one_pass GF frames stats
@@ -2397,7 +2033,7 @@
     }
     else if (cpi->pass == 2)
     {
-        size_t packet_sz = vp8_firstpass_stats_sz(cpi->common.MBs);
+        size_t packet_sz = sizeof(FIRSTPASS_STATS);
         int packets = oxcf->two_pass_stats_in.sz / packet_sz;
 
         cpi->stats_in = oxcf->two_pass_stats_in.buf;
@@ -2427,7 +2063,9 @@
     init_mv_ref_counts();
 #endif
 
+#if CONFIG_MULTITHREAD
     vp8cx_create_encoder_threads(cpi);
+#endif
 
     cpi->fn_ptr[BLOCK_16X16].sdf            = VARIANCE_INVOKE(&cpi->rtcd.variance, sad16x16);
     cpi->fn_ptr[BLOCK_16X16].vf             = VARIANCE_INVOKE(&cpi->rtcd.variance, var16x16);
@@ -2702,9 +2340,11 @@
 
     }
 
+#if CONFIG_MULTITHREAD
     vp8cx_remove_encoder_threads(cpi);
+#endif
 
-    vp8_dealloc_compressor_data(cpi);
+    dealloc_compressor_data(cpi);
     vpx_free(cpi->mb.ss);
     vpx_free(cpi->tok);
     vpx_free(cpi->cyclic_refresh_map);
@@ -3123,28 +2763,44 @@
 
     return 1;
 }
+
 static void set_quantizer(VP8_COMP *cpi, int Q)
 {
     VP8_COMMON *cm = &cpi->common;
     MACROBLOCKD *mbd = &cpi->mb.e_mbd;
-
+    int update = 0;
+    int new_delta_q;
     cm->base_qindex = Q;
 
+    /* if any of the delta_q values are changing update flag has to be set */
+    /* currently only y2dc_delta_q may change */
+
     cm->y1dc_delta_q = 0;
-    cm->y2dc_delta_q = 0;
     cm->y2ac_delta_q = 0;
     cm->uvdc_delta_q = 0;
     cm->uvac_delta_q = 0;
 
-    if(Q<4)
+    if (Q < 4)
     {
-        cm->y2dc_delta_q = 4-Q;
+        new_delta_q = 4-Q;
     }
+    else
+        new_delta_q = 0;
+
+    update |= cm->y2dc_delta_q != new_delta_q;
+    cm->y2dc_delta_q = new_delta_q;
+
+
     // Set Segment specific quatizers
     mbd->segment_feature_data[MB_LVL_ALT_Q][0] = cpi->segment_feature_data[MB_LVL_ALT_Q][0];
     mbd->segment_feature_data[MB_LVL_ALT_Q][1] = cpi->segment_feature_data[MB_LVL_ALT_Q][1];
     mbd->segment_feature_data[MB_LVL_ALT_Q][2] = cpi->segment_feature_data[MB_LVL_ALT_Q][2];
     mbd->segment_feature_data[MB_LVL_ALT_Q][3] = cpi->segment_feature_data[MB_LVL_ALT_Q][3];
+
+    /* quantizer has to be reinitialized for any delta_q changes */
+    if(update)
+        vp8cx_init_quantizer(cpi);
+
 }
 
 static void update_alt_ref_frame_and_stats(VP8_COMP *cpi)
@@ -3193,8 +2849,11 @@
     // Update the Golden frame reconstruction buffer if signalled and the GF usage counts.
     if (cm->refresh_golden_frame)
     {
-        // Update the golden frame buffer
-        vp8_yv12_copy_frame_ptr(cm->frame_to_show, &cm->yv12_fb[cm->gld_fb_idx]);
+        if (cm->frame_type != KEY_FRAME)
+        {
+            // Update the golden frame buffer
+            vp8_yv12_copy_frame_ptr(cm->frame_to_show, &cm->yv12_fb[cm->gld_fb_idx]);
+        }
 
         // Select an interval before next GF
         if (!cpi->auto_gold)
@@ -3575,6 +3234,89 @@
     return force_recode;
 }
 
+void loopfilter_frame(VP8_COMP *cpi, VP8_COMMON *cm)
+{
+    if (cm->no_lpf)
+    {
+        cm->filter_level = 0;
+    }
+    else
+    {
+        struct vpx_usec_timer timer;
+
+        vp8_clear_system_state();
+
+        vpx_usec_timer_start(&timer);
+        if (cpi->sf.auto_filter == 0)
+            vp8cx_pick_filter_level_fast(cpi->Source, cpi);
+
+        else
+            vp8cx_pick_filter_level(cpi->Source, cpi);
+
+        vpx_usec_timer_mark(&timer);
+        cpi->time_pick_lpf += vpx_usec_timer_elapsed(&timer);
+    }
+
+#if CONFIG_MULTITHREAD
+    sem_post(&cpi->h_event_end_lpf); /* signal that we have set filter_level */
+#endif
+
+    if (cm->filter_level > 0)
+    {
+        vp8cx_set_alt_lf_level(cpi, cm->filter_level);
+        vp8_loop_filter_frame(cm, &cpi->mb.e_mbd, cm->filter_level);
+        cm->last_filter_type = cm->filter_type;
+        cm->last_sharpness_level = cm->sharpness_level;
+    }
+
+    vp8_yv12_extend_frame_borders_ptr(cm->frame_to_show);
+
+    {
+        YV12_BUFFER_CONFIG *lst_yv12 = &cm->yv12_fb[cm->lst_fb_idx];
+        YV12_BUFFER_CONFIG *new_yv12 = &cm->yv12_fb[cm->new_fb_idx];
+        YV12_BUFFER_CONFIG *gld_yv12 = &cm->yv12_fb[cm->gld_fb_idx];
+        YV12_BUFFER_CONFIG *alt_yv12 = &cm->yv12_fb[cm->alt_fb_idx];
+        // At this point the new frame has been encoded.
+        // If any buffer copy / swapping is signaled it should be done here.
+        if (cm->frame_type == KEY_FRAME)
+        {
+            vp8_yv12_copy_frame_ptr(cm->frame_to_show, gld_yv12);
+            vp8_yv12_copy_frame_ptr(cm->frame_to_show, alt_yv12);
+        }
+        else    // For non key frames
+        {
+            // Code to copy between reference buffers
+            if (cm->copy_buffer_to_arf)
+            {
+                if (cm->copy_buffer_to_arf == 1)
+                {
+                    if (cm->refresh_last_frame)
+                        // We copy new_frame here because last and new buffers will already have been swapped if cm->refresh_last_frame is set.
+                        vp8_yv12_copy_frame_ptr(new_yv12, alt_yv12);
+                    else
+                        vp8_yv12_copy_frame_ptr(lst_yv12, alt_yv12);
+                }
+                else if (cm->copy_buffer_to_arf == 2)
+                    vp8_yv12_copy_frame_ptr(gld_yv12, alt_yv12);
+            }
+
+            if (cm->copy_buffer_to_gf)
+            {
+                if (cm->copy_buffer_to_gf == 1)
+                {
+                    if (cm->refresh_last_frame)
+                        // We copy new_frame here because last and new buffers will already have been swapped if cm->refresh_last_frame is set.
+                        vp8_yv12_copy_frame_ptr(new_yv12, gld_yv12);
+                    else
+                        vp8_yv12_copy_frame_ptr(lst_yv12, gld_yv12);
+                }
+                else if (cm->copy_buffer_to_gf == 2)
+                    vp8_yv12_copy_frame_ptr(alt_yv12, gld_yv12);
+            }
+        }
+    }
+}
+
 static void encode_frame_to_data_rate
 (
     VP8_COMP *cpi,
@@ -3608,6 +3350,7 @@
     int drop_mark50 = drop_mark / 4;
     int drop_mark25 = drop_mark / 8;
 
+
     // Clear down mmx registers to allow floating point in what follows
     vp8_clear_system_state();
 
@@ -3928,11 +3671,12 @@
             }
         }
 
-        // If CBR and the buffer is as full then it is reasonable to allow higher quality on the frames
-        // to prevent bits just going to waste.
+        // If CBR and the buffer is as full then it is reasonable to allow
+        // higher quality on the frames to prevent bits just going to waste.
         if (cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER)
         {
-            // Note that the use of >= here elliminates the risk of a devide by 0 error in the else if clause
+            // Note that the use of >= here elliminates the risk of a devide
+            // by 0 error in the else if clause
             if (cpi->buffer_level >= cpi->oxcf.maximum_buffer_size)
                 cpi->active_best_quality = cpi->best_quality;
 
@@ -3945,6 +3689,20 @@
             }
         }
     }
+    // Make sure constrained quality mode limits are adhered to for the first
+    // few frames of one pass encodes
+    else if (cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY)
+    {
+        if ( (cm->frame_type == KEY_FRAME) ||
+             cm->refresh_golden_frame || cpi->common.refresh_alt_ref_frame )
+        {
+             cpi->active_best_quality = cpi->best_quality;
+        }
+        else if (cpi->active_best_quality < cpi->cq_target_quality)
+        {
+            cpi->active_best_quality = cpi->cq_target_quality;
+        }
+    }
 
     // Clip the active best and worst quality values to limits
     if (cpi->active_worst_quality > cpi->worst_quality)
@@ -4124,8 +3882,8 @@
             vp8_setup_key_frame(cpi);
 
         // transform / motion compensation build reconstruction frame
-
         vp8_encode_frame(cpi);
+
         cpi->projected_frame_size -= vp8_estimate_entropy_savings(cpi);
         cpi->projected_frame_size = (cpi->projected_frame_size > 0) ? cpi->projected_frame_size : 0;
 
@@ -4474,98 +4232,44 @@
     else
         cm->frame_to_show = &cm->yv12_fb[cm->new_fb_idx];
 
-    //#pragma omp parallel sections
+
+#if CONFIG_MULTITHREAD
+    if (cpi->b_multi_threaded)
     {
-
-        //#pragma omp section
-        {
-
-            struct vpx_usec_timer timer;
-
-            vpx_usec_timer_start(&timer);
-
-            if (cpi->sf.auto_filter == 0)
-                vp8cx_pick_filter_level_fast(cpi->Source, cpi);
-            else
-                vp8cx_pick_filter_level(cpi->Source, cpi);
-
-            vpx_usec_timer_mark(&timer);
-
-            cpi->time_pick_lpf +=  vpx_usec_timer_elapsed(&timer);
-
-            if (cm->no_lpf)
-                cm->filter_level = 0;
-
-            if (cm->filter_level > 0)
-            {
-                vp8cx_set_alt_lf_level(cpi, cm->filter_level);
-                vp8_loop_filter_frame(cm, &cpi->mb.e_mbd, cm->filter_level);
-                cm->last_filter_type = cm->filter_type;
-                cm->last_sharpness_level = cm->sharpness_level;
-            }
-            /* Move storing frame_type out of the above loop since it is also needed in motion search besides loopfilter */
-            cm->last_frame_type = cm->frame_type;
-
-            vp8_yv12_extend_frame_borders_ptr(cm->frame_to_show);
-
-            if (cpi->oxcf.error_resilient_mode == 1)
-            {
-                cm->refresh_entropy_probs = 0;
-            }
-
-        }
-//#pragma omp section
-        {
-            // build the bitstream
-            vp8_pack_bitstream(cpi, dest, size);
-        }
+        sem_post(&cpi->h_event_start_lpf); /* start loopfilter in separate thread */
+    }
+    else
+#endif
+    {
+        loopfilter_frame(cpi, cm);
     }
 
+    if (cpi->oxcf.error_resilient_mode == 1)
     {
-        YV12_BUFFER_CONFIG *lst_yv12 = &cm->yv12_fb[cm->lst_fb_idx];
-        YV12_BUFFER_CONFIG *new_yv12 = &cm->yv12_fb[cm->new_fb_idx];
-        YV12_BUFFER_CONFIG *gld_yv12 = &cm->yv12_fb[cm->gld_fb_idx];
-        YV12_BUFFER_CONFIG *alt_yv12 = &cm->yv12_fb[cm->alt_fb_idx];
-        // At this point the new frame has been encoded coded.
-        // If any buffer copy / swaping is signalled it should be done here.
-        if (cm->frame_type == KEY_FRAME)
-        {
-            vp8_yv12_copy_frame_ptr(cm->frame_to_show, gld_yv12);
-            vp8_yv12_copy_frame_ptr(cm->frame_to_show, alt_yv12);
-        }
-        else    // For non key frames
-        {
-            // Code to copy between reference buffers
-            if (cm->copy_buffer_to_arf)
-            {
-                if (cm->copy_buffer_to_arf == 1)
-                {
-                    if (cm->refresh_last_frame)
-                        // We copy new_frame here because last and new buffers will already have been swapped if cm->refresh_last_frame is set.
-                        vp8_yv12_copy_frame_ptr(new_yv12, alt_yv12);
-                    else
-                        vp8_yv12_copy_frame_ptr(lst_yv12, alt_yv12);
-                }
-                else if (cm->copy_buffer_to_arf == 2)
-                    vp8_yv12_copy_frame_ptr(gld_yv12, alt_yv12);
-            }
-
-            if (cm->copy_buffer_to_gf)
-            {
-                if (cm->copy_buffer_to_gf == 1)
-                {
-                    if (cm->refresh_last_frame)
-                        // We copy new_frame here because last and new buffers will already have been swapped if cm->refresh_last_frame is set.
-                        vp8_yv12_copy_frame_ptr(new_yv12, gld_yv12);
-                    else
-                        vp8_yv12_copy_frame_ptr(lst_yv12, gld_yv12);
-                }
-                else if (cm->copy_buffer_to_gf == 2)
-                    vp8_yv12_copy_frame_ptr(alt_yv12, gld_yv12);
-            }
-        }
+        cm->refresh_entropy_probs = 0;
     }
 
+#if CONFIG_MULTITHREAD
+    /* wait that filter_level is picked so that we can continue with stream packing */
+    if (cpi->b_multi_threaded)
+        sem_wait(&cpi->h_event_end_lpf);
+#endif
+
+    // build the bitstream
+    vp8_pack_bitstream(cpi, dest, size);
+
+#if CONFIG_MULTITHREAD
+    /* wait for loopfilter thread done */
+    if (cpi->b_multi_threaded)
+    {
+        sem_wait(&cpi->h_event_end_lpf);
+    }
+#endif
+
+    /* Move storing frame_type out of the above loop since it is also
+     * needed in motion search besides loopfilter */
+      cm->last_frame_type = cm->frame_type;
+
     // Update rate control heuristics
     cpi->total_byte_count += (*size);
     cpi->projected_frame_size = (*size) << 3;
@@ -4816,16 +4520,19 @@
 
     if (cpi->oxcf.error_resilient_mode)
     {
-        // Is this an alternate reference update
-        if (cpi->common.refresh_alt_ref_frame)
-            vp8_yv12_copy_frame_ptr(cm->frame_to_show, &cm->yv12_fb[cm->alt_fb_idx]);
+        if (cm->frame_type != KEY_FRAME)
+        {
+            // Is this an alternate reference update
+            if (cm->refresh_alt_ref_frame)
+                vp8_yv12_copy_frame_ptr(cm->frame_to_show, &cm->yv12_fb[cm->alt_fb_idx]);
 
-        if (cpi->common.refresh_golden_frame)
-            vp8_yv12_copy_frame_ptr(cm->frame_to_show, &cm->yv12_fb[cm->gld_fb_idx]);
+            if (cm->refresh_golden_frame)
+                vp8_yv12_copy_frame_ptr(cm->frame_to_show, &cm->yv12_fb[cm->gld_fb_idx]);
+        }
     }
     else
     {
-        if (cpi->oxcf.play_alternate && cpi->common.refresh_alt_ref_frame && (cpi->common.frame_type != KEY_FRAME))
+        if (cpi->oxcf.play_alternate && cm->refresh_alt_ref_frame && (cm->frame_type != KEY_FRAME))
             // Update the alternate reference frame and stats as appropriate.
             update_alt_ref_frame_and_stats(cpi);
         else
@@ -4885,18 +4592,8 @@
 
 }
 
-int vp8_is_gf_update_needed(VP8_PTR ptr)
-{
-    VP8_COMP *cpi = (VP8_COMP *) ptr;
-    int ret_val;
 
-    ret_val = cpi->gf_update_recommended;
-    cpi->gf_update_recommended = 0;
-
-    return ret_val;
-}
-
-void vp8_check_gf_quality(VP8_COMP *cpi)
+static void check_gf_quality(VP8_COMP *cpi)
 {
     VP8_COMMON *cm = &cpi->common;
     int gf_active_pct = (100 * cpi->gf_active_count) / (cm->mb_rows * cm->mb_cols);
@@ -5145,7 +4842,7 @@
                 if (start_frame < 0)
                     start_frame += cpi->oxcf.lag_in_frames;
 
-                besterr = vp8_calc_low_ss_err(&cpi->src_buffer[cpi->last_alt_ref_sei].source_buffer,
+                besterr = calc_low_ss_err(&cpi->src_buffer[cpi->last_alt_ref_sei].source_buffer,
                                               &cpi->src_buffer[start_frame].source_buffer, IF_RTCD(&cpi->rtcd.variance));
 
                 for (i = 0; i < 7; i++)
@@ -5154,7 +4851,7 @@
                     cpi->oxcf.arnr_strength = i;
                     vp8_temporal_filter_prepare_c(cpi);
 
-                    thiserr = vp8_calc_low_ss_err(&cpi->alt_ref_buffer.source_buffer,
+                    thiserr = calc_low_ss_err(&cpi->alt_ref_buffer.source_buffer,
                                                   &cpi->src_buffer[start_frame].source_buffer, IF_RTCD(&cpi->rtcd.variance));
 
                     if (10 * thiserr < besterr * 8)
@@ -5282,9 +4979,12 @@
         {
             long long nanosecs = cpi->source_end_time_stamp
                 - cpi->last_end_time_stamp_seen;
-            double this_fps = 10000000.000 / nanosecs;
 
-            vp8_new_frame_rate(cpi, (7 * cpi->oxcf.frame_rate + this_fps) / 8);
+            if (nanosecs > 0)
+            {
+              double this_fps = 10000000.000 / nanosecs;
+              vp8_new_frame_rate(cpi, (7 * cpi->oxcf.frame_rate + this_fps) / 8);
+            }
 
         }
 
@@ -5294,24 +4994,7 @@
 
     if (cpi->compressor_speed == 2)
     {
-        vp8_check_gf_quality(cpi);
-    }
-
-    if (!cpi)
-    {
-#if HAVE_ARMV7
-#if CONFIG_RUNTIME_CPU_DETECT
-        if (cm->rtcd.flags & HAS_NEON)
-#endif
-        {
-            vp8_pop_neon(store_reg);
-        }
-#endif
-        return 0;
-    }
-
-    if (cpi->compressor_speed == 2)
-    {
+        check_gf_quality(cpi);
         vpx_usec_timer_start(&tsctimer);
         vpx_usec_timer_start(&ticktimer);
     }
@@ -5410,7 +5093,9 @@
     cpi->time_compress_data += vpx_usec_timer_elapsed(&cmptimer);
 
     if (cpi->b_calculate_psnr && cpi->pass != 1 && cm->show_frame)
+    {
         generate_psnr_packet(cpi);
+    }
 
 #if CONFIG_PSNR
 
@@ -5426,12 +5111,35 @@
             if (cpi->b_calculate_psnr)
             {
                 double y, u, v;
-                double sq_error;
-                double frame_psnr = vp8_calc_psnr(cpi->Source, cm->frame_to_show, &y, &u, &v, &sq_error);
+                double ye,ue,ve;
+                double frame_psnr;
+                YV12_BUFFER_CONFIG      *orig = cpi->Source;
+                YV12_BUFFER_CONFIG      *recon = cpi->common.frame_to_show;
+                YV12_BUFFER_CONFIG      *pp = &cm->post_proc_buffer;
+                int y_samples = orig->y_height * orig->y_width ;
+                int uv_samples = orig->uv_height * orig->uv_width ;
+                int t_samples = y_samples + 2 * uv_samples;
+                long long sq_error;
 
-                cpi->total_y += y;
-                cpi->total_u += u;
-                cpi->total_v += v;
+                ye = calc_plane_error(orig->y_buffer, orig->y_stride,
+                  recon->y_buffer, recon->y_stride, orig->y_width, orig->y_height,
+                  IF_RTCD(&cpi->rtcd.variance));
+
+                ue = calc_plane_error(orig->u_buffer, orig->uv_stride,
+                  recon->u_buffer, recon->uv_stride, orig->uv_width, orig->uv_height,
+                  IF_RTCD(&cpi->rtcd.variance));
+
+                ve = calc_plane_error(orig->v_buffer, orig->uv_stride,
+                  recon->v_buffer, recon->uv_stride, orig->uv_width, orig->uv_height,
+                  IF_RTCD(&cpi->rtcd.variance));
+
+                sq_error = ye + ue + ve;
+
+                frame_psnr = vp8_mse2psnr(t_samples, 255.0, sq_error);
+
+                cpi->total_y += vp8_mse2psnr(y_samples, 255.0, ye);
+                cpi->total_u += vp8_mse2psnr(uv_samples, 255.0, ue);
+                cpi->total_v += vp8_mse2psnr(uv_samples, 255.0, ve);
                 cpi->total_sq_error += sq_error;
                 cpi->total  += frame_psnr;
                 {
@@ -5440,18 +5148,36 @@
 
                     vp8_deblock(cm->frame_to_show, &cm->post_proc_buffer, cm->filter_level * 10 / 6, 1, 0, IF_RTCD(&cm->rtcd.postproc));
                     vp8_clear_system_state();
-                    frame_psnr2 = vp8_calc_psnr(cpi->Source, &cm->post_proc_buffer, &y2, &u2, &v2, &sq_error);
-                    frame_ssim2 = vp8_calc_ssim(cpi->Source, &cm->post_proc_buffer, 1, &weight);
+
+                    ye = calc_plane_error(orig->y_buffer, orig->y_stride,
+                      pp->y_buffer, pp->y_stride, orig->y_width, orig->y_height,
+                      IF_RTCD(&cpi->rtcd.variance));
+
+                    ue = calc_plane_error(orig->u_buffer, orig->uv_stride,
+                      pp->u_buffer, pp->uv_stride, orig->uv_width, orig->uv_height,
+                      IF_RTCD(&cpi->rtcd.variance));
+
+                    ve = calc_plane_error(orig->v_buffer, orig->uv_stride,
+                      pp->v_buffer, pp->uv_stride, orig->uv_width, orig->uv_height,
+                      IF_RTCD(&cpi->rtcd.variance));
+
+                    sq_error = ye + ue + ve;
+
+                    frame_psnr2 = vp8_mse2psnr(t_samples, 255.0, sq_error);
+
+                    cpi->totalp_y += vp8_mse2psnr(y_samples, 255.0, ye);
+                    cpi->totalp_u += vp8_mse2psnr(uv_samples, 255.0, ue);
+                    cpi->totalp_v += vp8_mse2psnr(uv_samples, 255.0, ve);
+                    cpi->total_sq_error2 += sq_error;
+                    cpi->totalp  += frame_psnr2;
+
+                    frame_ssim2 = vp8_calc_ssim(cpi->Source,
+                      &cm->post_proc_buffer, 1, &weight,
+                      IF_RTCD(&cpi->rtcd.variance));
 
                     cpi->summed_quality += frame_ssim2 * weight;
                     cpi->summed_weights += weight;
 
-                    cpi->totalp_y += y2;
-                    cpi->totalp_u += u2;
-                    cpi->totalp_v += v2;
-                    cpi->totalp  += frame_psnr2;
-                    cpi->total_sq_error2 += sq_error;
-
                 }
             }
 
@@ -5647,7 +5373,9 @@
 
     return Total;
 }
-int vp8_calc_low_ss_err(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *dest, const vp8_variance_rtcd_vtable_t *rtcd)
+
+
+static int calc_low_ss_err(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *dest, const vp8_variance_rtcd_vtable_t *rtcd)
 {
     int i, j;
     int Total = 0;
@@ -5675,11 +5403,7 @@
     return Total;
 }
 
-int vp8_get_speed(VP8_PTR c)
-{
-    VP8_COMP   *cpi = (VP8_COMP *) c;
-    return cpi->Speed;
-}
+
 int vp8_get_quantizer(VP8_PTR c)
 {
     VP8_COMP   *cpi = (VP8_COMP *) c;

diff --git a/vp8/encoder/onyx_int.h b/vp8/encoder/onyx_int.h
index e8a452d..0e53f68 100644
--- a/vp8/encoder/onyx_int.h
+++ b/vp8/encoder/onyx_int.h

@@ -14,21 +14,21 @@
 
 #include <stdio.h>
 #include "vpx_ports/config.h"
-#include "onyx.h"
+#include "vp8/common/onyx.h"
 #include "treewriter.h"
 #include "tokenize.h"
-#include "onyxc_int.h"
+#include "vp8/common/onyxc_int.h"
 #include "variance.h"
 #include "dct.h"
 #include "encodemb.h"
 #include "quantize.h"
-#include "entropy.h"
-#include "threading.h"
+#include "vp8/common/entropy.h"
+#include "vp8/common/threading.h"
 #include "vpx_ports/mem.h"
 #include "vpx/internal/vpx_codec_internal.h"
 #include "mcomp.h"
 #include "temporal_filter.h"
-#include "findnearmv.h"
+#include "vp8/common/findnearmv.h"
 
 //#define SPEEDSTATS 1
 #define MIN_GF_INTERVAL             4
@@ -51,7 +51,9 @@
 #define MV_ZBIN_BOOST        4
 #define ZBIN_OQ_MAX 192
 
+#if !(CONFIG_REALTIME_ONLY)
 #define VP8_TEMPORAL_ALT_REF 1
+#endif
 
 typedef struct
 {
@@ -97,6 +99,7 @@
     double pcnt_inter;
     double pcnt_motion;
     double pcnt_second_ref;
+    double pcnt_neutral;
     double MVr;
     double mvr_abs;
     double MVc;
@@ -185,6 +188,7 @@
 
     int use_fastquant_for_pick;
     int no_skip_block4x4_search;
+    int improved_mv_pred;
 
 } SPEED_FEATURES;
 
@@ -491,9 +495,6 @@
     FIRSTPASS_STATS *stats_in, *stats_in_end;
     struct vpx_codec_pkt_list  *output_pkt_list;
     int                          first_pass_done;
-    unsigned char *fp_motion_map;
-
-    unsigned char *fp_motion_map_stats, *fp_motion_map_stats_save;
 
 #if 0
     // Experimental code for lagged and one pass
@@ -589,6 +590,7 @@
     int cyclic_refresh_q;
     signed char *cyclic_refresh_map;
 
+#if CONFIG_MULTITHREAD
     // multithread data
     int * mt_current_mb_col;
     int mt_sync_range;
@@ -596,16 +598,18 @@
     int b_multi_threaded;
     int encoding_thread_count;
 
-#if CONFIG_MULTITHREAD
     pthread_t *h_encoding_thread;
-#endif
+    pthread_t h_filter_thread;
+
     MB_ROW_COMP *mb_row_ei;
     ENCODETHREAD_DATA *en_thread_data;
+    LPFTHREAD_DATA lpf_thread_data;
 
-#if CONFIG_MULTITHREAD
     //events
     sem_t *h_event_start_encoding;
     sem_t h_event_end_encoding;
+    sem_t h_event_start_lpf;
+    sem_t h_event_end_lpf;
 #endif
 
     TOKENLIST *tplist;
@@ -638,8 +642,6 @@
     YV12_BUFFER_CONFIG *frames[MAX_LAG_BUFFERS];
     int fixed_divide[512];
 #endif
-    // Flag to indicate temporal filter method
-    int use_weighted_temporal_filter;
 
 #if CONFIG_PSNR
     int    count;

diff --git a/vp8/encoder/parms.cpp b/vp8/encoder/parms.cpp
index 6cc4501..2a39b2c 100644
--- a/vp8/encoder/parms.cpp
+++ b/vp8/encoder/parms.cpp

@@ -16,7 +16,7 @@
 #include <fstream>
 extern "C"
 {
-    #include "onyx.h"
+    #include "vp8/common/onyx.h"
 }
 
 

diff --git a/vp8/encoder/pickinter.c b/vp8/encoder/pickinter.c
index cfaf497..0790d35 100644
--- a/vp8/encoder/pickinter.c
+++ b/vp8/encoder/pickinter.c

@@ -14,17 +14,17 @@
 #include "onyx_int.h"
 #include "modecosts.h"
 #include "encodeintra.h"
-#include "entropymode.h"
+#include "vp8/common/entropymode.h"
 #include "pickinter.h"
-#include "findnearmv.h"
+#include "vp8/common/findnearmv.h"
 #include "encodemb.h"
-#include "reconinter.h"
-#include "reconintra.h"
-#include "reconintra4x4.h"
-#include "g_common.h"
+#include "vp8/common/reconinter.h"
+#include "vp8/common/reconintra.h"
+#include "vp8/common/reconintra4x4.h"
+#include "vp8/common/g_common.h"
 #include "variance.h"
 #include "mcomp.h"
-
+#include "rdopt.h"
 #include "vpx_mem/vpx_mem.h"
 
 #if CONFIG_RUNTIME_CPU_DETECT
@@ -412,7 +412,6 @@
 
 }
 
-
 int vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int recon_uvoffset, int *returnrate, int *returndistortion, int *returnintra)
 {
     BLOCK *b = &x->block[0];
@@ -421,7 +420,7 @@
     B_MODE_INFO best_bmodes[16];
     MB_MODE_INFO best_mbmode;
     PARTITION_INFO best_partition;
-    MV best_ref_mv1;
+    MV best_ref_mv;
     MV mode_mv[MB_MODE_COUNT];
     MB_PREDICTION_MODE this_mode;
     int num00;
@@ -439,9 +438,14 @@
     int best_mode_index = 0;
     int sse = INT_MAX;
 
+    MV mvp;
+    int near_sadidx[8] = {0, 1, 2, 3, 4, 5, 6, 7};
+    int saddone=0;
+    int sr=0;    //search range got from mv_pred(). It uses step_param levels. (0-7)
+
     MV nearest_mv[4];
     MV near_mv[4];
-    MV best_ref_mv[4];
+    MV frame_best_ref_mv[4];
     int MDCounts[4][4];
     unsigned char *y_buffer[4];
     unsigned char *u_buffer[4];
@@ -461,7 +465,7 @@
         YV12_BUFFER_CONFIG *lst_yv12 = &cpi->common.yv12_fb[cpi->common.lst_fb_idx];
 
         vp8_find_near_mvs(&x->e_mbd, x->e_mbd.mode_info_context, &nearest_mv[LAST_FRAME], &near_mv[LAST_FRAME],
-                          &best_ref_mv[LAST_FRAME], MDCounts[LAST_FRAME], LAST_FRAME, cpi->common.ref_frame_sign_bias);
+                          &frame_best_ref_mv[LAST_FRAME], MDCounts[LAST_FRAME], LAST_FRAME, cpi->common.ref_frame_sign_bias);
 
         y_buffer[LAST_FRAME] = lst_yv12->y_buffer + recon_yoffset;
         u_buffer[LAST_FRAME] = lst_yv12->u_buffer + recon_uvoffset;
@@ -475,7 +479,7 @@
         YV12_BUFFER_CONFIG *gld_yv12 = &cpi->common.yv12_fb[cpi->common.gld_fb_idx];
 
         vp8_find_near_mvs(&x->e_mbd, x->e_mbd.mode_info_context, &nearest_mv[GOLDEN_FRAME], &near_mv[GOLDEN_FRAME],
-                          &best_ref_mv[GOLDEN_FRAME], MDCounts[GOLDEN_FRAME], GOLDEN_FRAME, cpi->common.ref_frame_sign_bias);
+                          &frame_best_ref_mv[GOLDEN_FRAME], MDCounts[GOLDEN_FRAME], GOLDEN_FRAME, cpi->common.ref_frame_sign_bias);
 
         y_buffer[GOLDEN_FRAME] = gld_yv12->y_buffer + recon_yoffset;
         u_buffer[GOLDEN_FRAME] = gld_yv12->u_buffer + recon_uvoffset;
@@ -489,7 +493,7 @@
         YV12_BUFFER_CONFIG *alt_yv12 = &cpi->common.yv12_fb[cpi->common.alt_fb_idx];
 
         vp8_find_near_mvs(&x->e_mbd, x->e_mbd.mode_info_context, &nearest_mv[ALTREF_FRAME], &near_mv[ALTREF_FRAME],
-                          &best_ref_mv[ALTREF_FRAME], MDCounts[ALTREF_FRAME], ALTREF_FRAME, cpi->common.ref_frame_sign_bias);
+                          &frame_best_ref_mv[ALTREF_FRAME], MDCounts[ALTREF_FRAME], ALTREF_FRAME, cpi->common.ref_frame_sign_bias);
 
         y_buffer[ALTREF_FRAME] = alt_yv12->y_buffer + recon_yoffset;
         u_buffer[ALTREF_FRAME] = alt_yv12->u_buffer + recon_uvoffset;
@@ -529,10 +533,6 @@
                                         + vp8_cost_one(cpi->prob_gf_coded);
     }
 
-
-
-    best_rd = INT_MAX;
-
     x->e_mbd.mode_info_context->mbmi.ref_frame = INTRA_FRAME;
 
     // if we encode a new mv this is important
@@ -595,7 +595,7 @@
             x->e_mbd.pre.v_buffer = v_buffer[x->e_mbd.mode_info_context->mbmi.ref_frame];
             mode_mv[NEARESTMV] = nearest_mv[x->e_mbd.mode_info_context->mbmi.ref_frame];
             mode_mv[NEARMV] = near_mv[x->e_mbd.mode_info_context->mbmi.ref_frame];
-            best_ref_mv1 = best_ref_mv[x->e_mbd.mode_info_context->mbmi.ref_frame];
+            best_ref_mv = frame_best_ref_mv[x->e_mbd.mode_info_context->mbmi.ref_frame];
             memcpy(mdcounts, MDCounts[x->e_mbd.mode_info_context->mbmi.ref_frame], sizeof(mdcounts));
         }
 
@@ -608,6 +608,28 @@
                 continue;
         }
 
+        if(cpi->sf.improved_mv_pred && x->e_mbd.mode_info_context->mbmi.mode == NEWMV)
+        {
+            if(!saddone)
+            {
+                vp8_cal_sad(cpi,xd,x, recon_yoffset ,&near_sadidx[0] );
+                saddone = 1;
+            }
+
+            vp8_mv_pred(cpi, &x->e_mbd, x->e_mbd.mode_info_context, &mvp,
+                        x->e_mbd.mode_info_context->mbmi.ref_frame, cpi->common.ref_frame_sign_bias, &sr, &near_sadidx[0]);
+
+            /* adjust mvp to make sure it is within MV range */
+            if(mvp.row > best_ref_mv.row + MAX_FULL_PEL_VAL)
+                mvp.row = best_ref_mv.row + MAX_FULL_PEL_VAL;
+            else if(mvp.row < best_ref_mv.row - MAX_FULL_PEL_VAL)
+                mvp.row = best_ref_mv.row - MAX_FULL_PEL_VAL;
+            if(mvp.col > best_ref_mv.col + MAX_FULL_PEL_VAL)
+                mvp.col = best_ref_mv.col + MAX_FULL_PEL_VAL;
+            else if(mvp.col < best_ref_mv.col - MAX_FULL_PEL_VAL)
+                mvp.col = best_ref_mv.col - MAX_FULL_PEL_VAL;
+        }
+
         switch (this_mode)
         {
         case B_PRED:
@@ -642,7 +664,8 @@
         case V_PRED:
         case H_PRED:
         case TM_PRED:
-            vp8_build_intra_predictors_mby_ptr(&x->e_mbd);
+            RECON_INVOKE(&cpi->common.rtcd.recon, build_intra_predictors_mby)
+                (&x->e_mbd);
             distortion2 = VARIANCE_INVOKE(&cpi->rtcd.variance, get16x16prederror)(x->src.y_buffer, x->src.y_stride, x->e_mbd.predictor, 16, 0x7fffffff);
             rate2 += x->mbmode_cost[x->e_mbd.frame_type][x->e_mbd.mode_info_context->mbmi.mode];
             this_rd = RD_ESTIMATE(x->rdmult, x->rddiv, rate2, distortion2);
@@ -663,61 +686,59 @@
             int n = 0;
             int sadpb = x->sadperbit16;
 
+            int col_min;
+            int col_max;
+            int row_min;
+            int row_max;
+
+            int tmp_col_min = x->mv_col_min;
+            int tmp_col_max = x->mv_col_max;
+            int tmp_row_min = x->mv_row_min;
+            int tmp_row_max = x->mv_row_max;
+
+            int speed_adjust = (cpi->Speed > 5) ? ((cpi->Speed >= 8)? 3 : 2) : 1;
+
             // Further step/diamond searches as necessary
-            if (cpi->Speed < 8)
+            step_param = cpi->sf.first_step + speed_adjust;
+
+            if(cpi->sf.improved_mv_pred)
             {
-                step_param = cpi->sf.first_step + ((cpi->Speed > 5) ? 1 : 0);
-                further_steps = (cpi->sf.max_step_search_steps - 1) - step_param;
-            }
-            else
+                sr += speed_adjust;
+                //adjust search range according to sr from mv prediction
+                if(sr > step_param)
+                    step_param = sr;
+
+                col_min = (best_ref_mv.col - MAX_FULL_PEL_VAL) >>3;
+                col_max = (best_ref_mv.col + MAX_FULL_PEL_VAL) >>3;
+                row_min = (best_ref_mv.row - MAX_FULL_PEL_VAL) >>3;
+                row_max = (best_ref_mv.row + MAX_FULL_PEL_VAL) >>3;
+
+                // Get intersection of UMV window and valid MV window to reduce # of checks in diamond search.
+                if (x->mv_col_min < col_min )
+                    x->mv_col_min = col_min;
+                if (x->mv_col_max > col_max )
+                    x->mv_col_max = col_max;
+                if (x->mv_row_min < row_min )
+                    x->mv_row_min = row_min;
+                if (x->mv_row_max > row_max )
+                    x->mv_row_max = row_max;
+            }else
             {
-                step_param = cpi->sf.first_step + 2;
-                further_steps = 0;
+                mvp.row = best_ref_mv.row;
+                mvp.col = best_ref_mv.col;
             }
 
-#if 0
-
-            // Initial step Search
-            bestsme = vp8_diamond_search_sad(x, b, d, &best_ref_mv1, &d->bmi.mv.as_mv, step_param, x->errorperbit, &num00, &cpi->fn_ptr, cpi->mb.mvsadcost, cpi->mb.mvcost, &best_ref_mv1);
-            mode_mv[NEWMV].row = d->bmi.mv.as_mv.row;
-            mode_mv[NEWMV].col = d->bmi.mv.as_mv.col;
-
-            // Further step searches
-            while (n < further_steps)
-            {
-                n++;
-
-                if (num00)
-                    num00--;
-                else
-                {
-                    thissme = vp8_diamond_search_sad(x, b, d, &best_ref_mv1, &d->bmi.mv.as_mv, step_param + n, x->errorperbit, &num00, &cpi->fn_ptr, cpi->mb.mvsadcost, x->mvcost, &best_ref_mv1);
-
-                    if (thissme < bestsme)
-                    {
-                        bestsme = thissme;
-                        mode_mv[NEWMV].row = d->bmi.mv.as_mv.row;
-                        mode_mv[NEWMV].col = d->bmi.mv.as_mv.col;
-                    }
-                    else
-                    {
-                        d->bmi.mv.as_mv.row = mode_mv[NEWMV].row;
-                        d->bmi.mv.as_mv.col = mode_mv[NEWMV].col;
-                    }
-                }
-            }
-
-#else
+            further_steps = (cpi->Speed >= 8)? 0: (cpi->sf.max_step_search_steps - 1 - step_param);
 
             if (cpi->sf.search_method == HEX)
             {
-                bestsme = vp8_hex_search(x, b, d, &best_ref_mv1, &d->bmi.mv.as_mv, step_param, sadpb/*x->errorperbit*/, &num00, &cpi->fn_ptr[BLOCK_16X16], x->mvsadcost, x->mvcost);
+                bestsme = vp8_hex_search(x, b, d, &mvp, &d->bmi.mv.as_mv, step_param, sadpb/*x->errorperbit*/, &num00, &cpi->fn_ptr[BLOCK_16X16], x->mvsadcost, x->mvcost, &best_ref_mv);
                 mode_mv[NEWMV].row = d->bmi.mv.as_mv.row;
                 mode_mv[NEWMV].col = d->bmi.mv.as_mv.col;
             }
             else
             {
-                bestsme = cpi->diamond_search_sad(x, b, d, &best_ref_mv1, &d->bmi.mv.as_mv, step_param, sadpb / 2/*x->errorperbit*/, &num00, &cpi->fn_ptr[BLOCK_16X16], x->mvsadcost, x->mvcost, &best_ref_mv1); //sadpb < 9
+                bestsme = cpi->diamond_search_sad(x, b, d, &mvp, &d->bmi.mv.as_mv, step_param, sadpb / 2/*x->errorperbit*/, &num00, &cpi->fn_ptr[BLOCK_16X16], x->mvsadcost, x->mvcost, &best_ref_mv); //sadpb < 9
                 mode_mv[NEWMV].row = d->bmi.mv.as_mv.row;
                 mode_mv[NEWMV].col = d->bmi.mv.as_mv.col;
 
@@ -736,7 +757,7 @@
                         num00--;
                     else
                     {
-                        thissme = cpi->diamond_search_sad(x, b, d, &best_ref_mv1, &d->bmi.mv.as_mv, step_param + n, sadpb / 4/*x->errorperbit*/, &num00, &cpi->fn_ptr[BLOCK_16X16], x->mvsadcost, x->mvcost, &best_ref_mv1); //sadpb = 9
+                        thissme = cpi->diamond_search_sad(x, b, d, &mvp, &d->bmi.mv.as_mv, step_param + n, sadpb / 4/*x->errorperbit*/, &num00, &cpi->fn_ptr[BLOCK_16X16], x->mvsadcost, x->mvcost, &best_ref_mv); //sadpb = 9
 
                         if (thissme < bestsme)
                         {
@@ -753,19 +774,24 @@
                 }
             }
 
-#endif
+            if(cpi->sf.improved_mv_pred)
+            {
+                x->mv_col_min = tmp_col_min;
+                x->mv_col_max = tmp_col_max;
+                x->mv_row_min = tmp_row_min;
+                x->mv_row_max = tmp_row_max;
+            }
+
+            if (bestsme < INT_MAX)
+                cpi->find_fractional_mv_step(x, b, d, &d->bmi.mv.as_mv, &best_ref_mv, x->errorperbit, &cpi->fn_ptr[BLOCK_16X16], cpi->mb.mvcost);
+
+            mode_mv[NEWMV].row = d->bmi.mv.as_mv.row;
+            mode_mv[NEWMV].col = d->bmi.mv.as_mv.col;
+
+            // mv cost;
+            rate2 += vp8_mv_bit_cost(&mode_mv[NEWMV], &best_ref_mv, cpi->mb.mvcost, 128);
         }
 
-        if (bestsme < INT_MAX)
-            cpi->find_fractional_mv_step(x, b, d, &d->bmi.mv.as_mv, &best_ref_mv1, x->errorperbit, &cpi->fn_ptr[BLOCK_16X16], cpi->mb.mvcost);
-
-        mode_mv[NEWMV].row = d->bmi.mv.as_mv.row;
-        mode_mv[NEWMV].col = d->bmi.mv.as_mv.col;
-
-        // mv cost;
-        rate2 += vp8_mv_bit_cost(&mode_mv[NEWMV], &best_ref_mv1, cpi->mb.mvcost, 128);
-
-
         case NEARESTMV:
         case NEARMV:
 

diff --git a/vp8/encoder/pickinter.h b/vp8/encoder/pickinter.h
index b80e4c8..af060d7 100644
--- a/vp8/encoder/pickinter.h
+++ b/vp8/encoder/pickinter.h

@@ -12,7 +12,7 @@
 #ifndef __INC_PICKINTER_H
 #define __INC_PICKINTER_H
 #include "vpx_ports/config.h"
-#include "onyxc_int.h"
+#include "vp8/common/onyxc_int.h"
 
 #define RD_ESTIMATE(RM,DM,R,D) ( ((128+(R)*(RM)) >> 8) + (DM)*(D) )
 extern int vp8_pick_intra4x4mby_modes(const VP8_ENCODER_RTCD *, MACROBLOCK *mb, int *Rate, int *Distortion);

diff --git a/vp8/encoder/picklpf.c b/vp8/encoder/picklpf.c
index 78aa866..d294af6 100644
--- a/vp8/encoder/picklpf.c
+++ b/vp8/encoder/picklpf.c

@@ -9,13 +9,13 @@
  */
 
 
-#include "onyxc_int.h"
+#include "vp8/common/onyxc_int.h"
 #include "onyx_int.h"
 #include "quantize.h"
 #include "vpx_mem/vpx_mem.h"
 #include "vpx_scale/yv12extend.h"
 #include "vpx_scale/vpxscale.h"
-#include "alloccommon.h"
+#include "vp8/common/alloccommon.h"
 #if ARCH_ARM
 #include "vpx_ports/arm.h"
 #endif

diff --git a/vp8/encoder/ppc/csystemdependent.c b/vp8/encoder/ppc/csystemdependent.c
index 588656b..8dfd2a5 100644
--- a/vp8/encoder/ppc/csystemdependent.c
+++ b/vp8/encoder/ppc/csystemdependent.c

@@ -9,8 +9,8 @@
  */
 
 
-#include "variance.h"
-#include "onyx_int.h"
+#include "vp8/encoder/variance.h"
+#include "vp8/encoder/onyx_int.h"
 
 SADFunction *vp8_sad16x16;
 SADFunction *vp8_sad16x8;

diff --git a/vp8/encoder/psnr.c b/vp8/encoder/psnr.c
index dc2a03b..5119bb8 100644
--- a/vp8/encoder/psnr.c
+++ b/vp8/encoder/psnr.c

@@ -11,7 +11,7 @@
 
 #include "vpx_scale/yv12config.h"
 #include "math.h"
-#include "systemdependent.h" /* for vp8_clear_system_state() */
+#include "vp8/common/systemdependent.h" /* for vp8_clear_system_state() */
 
 #define MAX_PSNR 60
 
@@ -29,89 +29,3 @@
 
     return psnr;
 }
-
-double vp8_calc_psnr(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *dest, double *YPsnr, double *UPsnr, double *VPsnr, double *sq_error)
-{
-    int i, j;
-    int Diff;
-    double frame_psnr;
-    double Total;
-    double grand_total;
-    unsigned char *src = source->y_buffer;
-    unsigned char *dst = dest->y_buffer;
-
-    Total = 0.0;
-    grand_total = 0.0;
-
-    // Loop throught the Y plane raw and reconstruction data summing (square differences)
-    for (i = 0; i < source->y_height; i++)
-    {
-
-        for (j = 0; j < source->y_width; j++)
-        {
-            Diff        = (int)(src[j]) - (int)(dst[j]);
-            Total      += Diff * Diff;
-        }
-
-        src += source->y_stride;
-        dst += dest->y_stride;
-    }
-
-    // Work out Y PSNR
-    *YPsnr = vp8_mse2psnr(source->y_height * source->y_width, 255.0, Total);
-    grand_total += Total;
-    Total = 0;
-
-
-    // Loop through the U plane
-    src = source->u_buffer;
-    dst = dest->u_buffer;
-
-    for (i = 0; i < source->uv_height; i++)
-    {
-
-        for (j = 0; j < source->uv_width; j++)
-        {
-            Diff        = (int)(src[j]) - (int)(dst[j]);
-            Total      += Diff * Diff;
-        }
-
-        src += source->uv_stride;
-        dst += dest->uv_stride;
-    }
-
-    // Work out U PSNR
-    *UPsnr = vp8_mse2psnr(source->uv_height * source->uv_width, 255.0, Total);
-    grand_total += Total;
-    Total = 0;
-
-
-    // V PSNR
-    src = source->v_buffer;
-    dst = dest->v_buffer;
-
-    for (i = 0; i < source->uv_height; i++)
-    {
-
-        for (j = 0; j < source->uv_width; j++)
-        {
-            Diff        = (int)(src[j]) - (int)(dst[j]);
-            Total      += Diff * Diff;
-        }
-
-        src += source->uv_stride;
-        dst += dest->uv_stride;
-    }
-
-    // Work out UV PSNR
-    *VPsnr = vp8_mse2psnr(source->uv_height * source->uv_width, 255.0, Total);
-    grand_total += Total;
-    Total = 0;
-
-    // Work out total PSNR
-    frame_psnr = vp8_mse2psnr(source->y_height * source->y_width * 3 / 2 , 255.0, grand_total);
-
-    *sq_error = 1.0 * grand_total;
-
-    return frame_psnr;
-}

diff --git a/vp8/encoder/psnr.h b/vp8/encoder/psnr.h
index 8ae4448..7f6269a 100644
--- a/vp8/encoder/psnr.h
+++ b/vp8/encoder/psnr.h

@@ -13,6 +13,5 @@
 #define __INC_PSNR_H
 
 extern double vp8_mse2psnr(double Samples, double Peak, double Mse);
-extern double vp8_calc_psnr(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *dest, double *YPsnr, double *UPsnr, double *VPsnr, double *sq_error);
 
 #endif

diff --git a/vp8/encoder/quantize.c b/vp8/encoder/quantize.c
index 4a2329f..803e3a5 100644
--- a/vp8/encoder/quantize.c
+++ b/vp8/encoder/quantize.c

@@ -13,8 +13,7 @@
 #include "vpx_mem/vpx_mem.h"
 
 #include "quantize.h"
-#include "entropy.h"
-#include "predictdc.h"
+#include "vp8/common/entropy.h"
 
 #define EXACT_QUANT
 

diff --git a/vp8/encoder/ratectrl.c b/vp8/encoder/ratectrl.c
index b69a196..e2c6327 100644
--- a/vp8/encoder/ratectrl.c
+++ b/vp8/encoder/ratectrl.c

@@ -16,11 +16,11 @@
 #include <assert.h>
 
 #include "math.h"
-#include "common.h"
+#include "vp8/common/common.h"
 #include "ratectrl.h"
-#include "entropymode.h"
+#include "vp8/common/entropymode.h"
 #include "vpx_mem/vpx_mem.h"
-#include "systemdependent.h"
+#include "vp8/common/systemdependent.h"
 #include "encodemv.h"
 
 
@@ -90,7 +90,7 @@
     }
 };
 
-const int vp8_kf_boost_qadjustment[QINDEX_RANGE] =
+static const int kf_boost_qadjustment[QINDEX_RANGE] =
 {
     128, 129, 130, 131, 132, 133, 134, 135,
     136, 137, 138, 139, 140, 141, 142, 143,
@@ -154,7 +154,7 @@
 };
 */
 
-const int vp8_kf_gf_boost_qlimits[QINDEX_RANGE] =
+static const int kf_gf_boost_qlimits[QINDEX_RANGE] =
 {
     150, 155, 160, 165, 170, 175, 180, 185,
     190, 195, 200, 205, 210, 215, 220, 225,
@@ -175,14 +175,14 @@
 };
 
 // % adjustment to target kf size based on seperation from previous frame
-const int vp8_kf_boost_seperationt_adjustment[16] =
+static const int kf_boost_seperation_adjustment[16] =
 {
     30,   40,   50,   55,   60,   65,   70,   75,
     80,   85,   90,   95,  100,  100,  100,  100,
 };
 
 
-const int vp8_gf_adjust_table[101] =
+static const int gf_adjust_table[101] =
 {
     100,
     115, 130, 145, 160, 175, 190, 200, 210, 220, 230,
@@ -197,13 +197,13 @@
     400, 400, 400, 400, 400, 400, 400, 400, 400, 400,
 };
 
-const int vp8_gf_intra_useage_adjustment[20] =
+static const int gf_intra_usage_adjustment[20] =
 {
     125, 120, 115, 110, 105, 100,  95,  85,  80,  75,
     70,  65,  60,  55,  50,  50,  50,  50,  50,  50,
 };
 
-const int vp8_gf_interval_table[101] =
+static const int gf_interval_table[101] =
 {
     7,
     7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
@@ -353,7 +353,7 @@
         kf_boost = (int)(2 * cpi->output_frame_rate - 16);
 
         // adjustment up based on q
-        kf_boost = kf_boost * vp8_kf_boost_qadjustment[cpi->ni_av_qi] / 100;
+        kf_boost = kf_boost * kf_boost_qadjustment[cpi->ni_av_qi] / 100;
 
         // frame separation adjustment ( down)
         if (cpi->frames_since_key  < cpi->output_frame_rate / 2)
@@ -488,10 +488,10 @@
             Boost = GFQ_ADJUSTMENT;
 
             // Adjust based upon most recently measure intra useage
-            Boost = Boost * vp8_gf_intra_useage_adjustment[(cpi->this_frame_percent_intra < 15) ? cpi->this_frame_percent_intra : 14] / 100;
+            Boost = Boost * gf_intra_usage_adjustment[(cpi->this_frame_percent_intra < 15) ? cpi->this_frame_percent_intra : 14] / 100;
 
             // Adjust gf boost based upon GF usage since last GF
-            Boost = Boost * vp8_gf_adjust_table[gf_frame_useage] / 100;
+            Boost = Boost * gf_adjust_table[gf_frame_useage] / 100;
 #endif
         }
 
@@ -503,8 +503,8 @@
         }
 
         // Apply an upper limit based on Q for 1 pass encodes
-        if (Boost > vp8_kf_gf_boost_qlimits[Q] && (cpi->pass == 0))
-            Boost = vp8_kf_gf_boost_qlimits[Q];
+        if (Boost > kf_gf_boost_qlimits[Q] && (cpi->pass == 0))
+            Boost = kf_gf_boost_qlimits[Q];
 
         // Apply lower limits to boost.
         else if (Boost < 110)
@@ -539,8 +539,8 @@
             if (cpi->last_boost >= 1500)
                 cpi->frames_till_gf_update_due ++;
 
-            if (vp8_gf_interval_table[gf_frame_useage] > cpi->frames_till_gf_update_due)
-                cpi->frames_till_gf_update_due = vp8_gf_interval_table[gf_frame_useage];
+            if (gf_interval_table[gf_frame_useage] > cpi->frames_till_gf_update_due)
+                cpi->frames_till_gf_update_due = gf_interval_table[gf_frame_useage];
 
             if (cpi->frames_till_gf_update_due > cpi->max_gf_interval)
                 cpi->frames_till_gf_update_due = cpi->max_gf_interval;
@@ -594,17 +594,17 @@
         // between key frames.
 
         // Adjust boost based upon ambient Q
-        Boost = vp8_kf_boost_qadjustment[Q];
+        Boost = kf_boost_qadjustment[Q];
 
         // Make the Key frame boost less if the seperation from the previous key frame is small
         if (cpi->frames_since_key < 16)
-            Boost = Boost * vp8_kf_boost_seperationt_adjustment[cpi->frames_since_key] / 100;
+            Boost = Boost * kf_boost_seperation_adjustment[cpi->frames_since_key] / 100;
         else
-            Boost = Boost * vp8_kf_boost_seperationt_adjustment[15] / 100;
+            Boost = Boost * kf_boost_seperation_adjustment[15] / 100;
 
         // Apply limits on boost
-        if (Boost > vp8_kf_gf_boost_qlimits[Q])
-            Boost = vp8_kf_gf_boost_qlimits[Q];
+        if (Boost > kf_gf_boost_qlimits[Q])
+            Boost = kf_gf_boost_qlimits[Q];
         else if (Boost < 120)
             Boost = 120;
     }
@@ -842,7 +842,8 @@
         {
             int one_percent_bits = 1 + cpi->oxcf.optimal_buffer_level / 100;
 
-            if ((cpi->buffer_level < cpi->oxcf.optimal_buffer_level) || (cpi->bits_off_target < cpi->oxcf.optimal_buffer_level))
+            if ((cpi->buffer_level < cpi->oxcf.optimal_buffer_level) ||
+                (cpi->bits_off_target < cpi->oxcf.optimal_buffer_level))
             {
                 int percent_low = 0;
 
@@ -851,9 +852,12 @@
                 // If we are are below the optimal buffer fullness level and adherence
                 // to buffering contraints is important to the end useage then adjust
                 // the per frame target.
-                if ((cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER) && (cpi->buffer_level < cpi->oxcf.optimal_buffer_level))
+                if ((cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER) &&
+                    (cpi->buffer_level < cpi->oxcf.optimal_buffer_level))
                 {
-                    percent_low = (cpi->oxcf.optimal_buffer_level - cpi->buffer_level) / one_percent_bits;
+                    percent_low =
+                        (cpi->oxcf.optimal_buffer_level - cpi->buffer_level) /
+                        one_percent_bits;
 
                     if (percent_low > 100)
                         percent_low = 100;
@@ -864,7 +868,8 @@
                 else if (cpi->bits_off_target < 0)
                 {
                     // Adjust per frame data target downwards to compensate.
-                    percent_low = (int)(100 * -cpi->bits_off_target / (cpi->total_byte_count * 8));
+                    percent_low = (int)(100 * -cpi->bits_off_target /
+                                       (cpi->total_byte_count * 8));
 
                     if (percent_low > 100)
                         percent_low = 100;
@@ -873,39 +878,60 @@
                 }
 
                 // lower the target bandwidth for this frame.
-                cpi->this_frame_target = (cpi->this_frame_target * (100 - (percent_low / 2))) / 100;
+                cpi->this_frame_target =
+                    (cpi->this_frame_target * (100 - (percent_low / 2))) / 100;
 
-                // Are we using allowing control of active_worst_allowed_q according to buffer level.
+                // Are we using allowing control of active_worst_allowed_q
+                // according to buffer level.
                 if (cpi->auto_worst_q)
                 {
                     int critical_buffer_level;
 
-                    // For streaming applications the most important factor is cpi->buffer_level as this takes
-                    // into account the specified short term buffering constraints. However, hitting the long
-                    // term clip data rate target is also important.
+                    // For streaming applications the most important factor is
+                    // cpi->buffer_level as this takes into account the
+                    // specified short term buffering constraints. However,
+                    // hitting the long term clip data rate target is also
+                    // important.
                     if (cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER)
                     {
-                        // Take the smaller of cpi->buffer_level and cpi->bits_off_target
-                        critical_buffer_level = (cpi->buffer_level < cpi->bits_off_target) ? cpi->buffer_level : cpi->bits_off_target;
+                        // Take the smaller of cpi->buffer_level and
+                        // cpi->bits_off_target
+                        critical_buffer_level =
+                            (cpi->buffer_level < cpi->bits_off_target)
+                            ? cpi->buffer_level : cpi->bits_off_target;
                     }
-                    // For local file playback short term buffering contraints are less of an issue
+                    // For local file playback short term buffering contraints
+                    // are less of an issue
                     else
                     {
-                        // Consider only how we are doing for the clip as a whole
+                        // Consider only how we are doing for the clip as a
+                        // whole
                         critical_buffer_level = cpi->bits_off_target;
                     }
 
-                    // Set the active worst quality based upon the selected buffer fullness number.
+                    // Set the active worst quality based upon the selected
+                    // buffer fullness number.
                     if (critical_buffer_level < cpi->oxcf.optimal_buffer_level)
                     {
-                        if (critical_buffer_level > (cpi->oxcf.optimal_buffer_level / 4))
+                        if ( critical_buffer_level >
+                             (cpi->oxcf.optimal_buffer_level >> 2) )
                         {
-                            int qadjustment_range = cpi->worst_quality - cpi->ni_av_qi;
-                            int above_base = (critical_buffer_level - (cpi->oxcf.optimal_buffer_level / 4));
+                            INT64 qadjustment_range =
+                                      cpi->worst_quality - cpi->ni_av_qi;
+                            INT64 above_base =
+                                      (critical_buffer_level -
+                                       (cpi->oxcf.optimal_buffer_level >> 2));
 
-                            // Step active worst quality down from cpi->ni_av_qi when (critical_buffer_level == cpi->optimal_buffer_level)
-                            // to cpi->oxcf.worst_allowed_q when (critical_buffer_level == cpi->optimal_buffer_level/4)
-                            cpi->active_worst_quality = cpi->worst_quality - ((qadjustment_range * above_base) / (cpi->oxcf.optimal_buffer_level * 3 / 4));
+                            // Step active worst quality down from
+                            // cpi->ni_av_qi when (critical_buffer_level ==
+                            // cpi->optimal_buffer_level) to
+                            // cpi->worst_quality when
+                            // (critical_buffer_level ==
+                            //     cpi->optimal_buffer_level >> 2)
+                            cpi->active_worst_quality =
+                                cpi->worst_quality -
+                                ((qadjustment_range * above_base) /
+                                 (cpi->oxcf.optimal_buffer_level*3>>2));
                         }
                         else
                         {
@@ -965,6 +991,15 @@
             // Set the active worst quality
             cpi->active_worst_quality = cpi->worst_quality;
         }
+
+        // Special trap for constrained quality mode
+        // "active_worst_quality" may never drop below cq level
+        // for any frame type.
+        if ( cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY &&
+             cpi->active_worst_quality < cpi->cq_target_quality)
+        {
+            cpi->active_worst_quality = cpi->cq_target_quality;
+        }
     }
 
     // Test to see if we have to drop a frame

diff --git a/vp8/encoder/rdopt.c b/vp8/encoder/rdopt.c
index 248d9c4..a125cc4 100644
--- a/vp8/encoder/rdopt.c
+++ b/vp8/encoder/rdopt.c

@@ -13,28 +13,28 @@
 #include <math.h>
 #include <limits.h>
 #include <assert.h>
-#include "pragmas.h"
+#include "vp8/common/pragmas.h"
 
 #include "tokenize.h"
 #include "treewriter.h"
 #include "onyx_int.h"
 #include "modecosts.h"
 #include "encodeintra.h"
-#include "entropymode.h"
-#include "reconinter.h"
-#include "reconintra.h"
-#include "reconintra4x4.h"
-#include "findnearmv.h"
+#include "vp8/common/entropymode.h"
+#include "vp8/common/reconinter.h"
+#include "vp8/common/reconintra.h"
+#include "vp8/common/reconintra4x4.h"
+#include "vp8/common/findnearmv.h"
 #include "encodemb.h"
 #include "quantize.h"
-#include "idct.h"
-#include "g_common.h"
+#include "vp8/common/idct.h"
+#include "vp8/common/g_common.h"
 #include "variance.h"
 #include "mcomp.h"
 
 #include "vpx_mem/vpx_mem.h"
 #include "dct.h"
-#include "systemdependent.h"
+#include "vp8/common/systemdependent.h"
 
 #if CONFIG_RUNTIME_CPU_DETECT
 #define IF_RTCD(x)  (x)
@@ -53,7 +53,7 @@
 
 
 
-const int vp8_auto_speed_thresh[17] =
+static const int auto_speed_thresh[17] =
 {
     1000,
     200,
@@ -353,7 +353,7 @@
                 }
             }
 
-            if (milliseconds_for_compress * 100 > cpi->avg_encode_time * vp8_auto_speed_thresh[cpi->Speed])
+            if (milliseconds_for_compress * 100 > cpi->avg_encode_time * auto_speed_thresh[cpi->Speed])
             {
                 cpi->Speed          -= 1;
                 cpi->avg_pick_mode_time = 0;
@@ -529,10 +529,10 @@
     tl = (ENTROPY_CONTEXT *)&t_left;
 
     for (b = 0; b < 16; b++)
-        cost += cost_coeffs(mb, x->block + b, 0,
+        cost += cost_coeffs(mb, x->block + b, PLANE_TYPE_Y_NO_DC,
                     ta + vp8_block2above[b], tl + vp8_block2left[b]);
 
-    cost += cost_coeffs(mb, x->block + 24, 1,
+    cost += cost_coeffs(mb, x->block + 24, PLANE_TYPE_Y2,
                 ta + vp8_block2above[24], tl + vp8_block2left[24]);
 
     return cost;
@@ -584,14 +584,22 @@
     *Rate = vp8_rdcost_mby(mb);
 }
 
-static void rd_pick_intra4x4block(
+static void copy_predictor(unsigned char *dst, const unsigned char *predictor)
+{
+    const unsigned int *p = (const unsigned int *)predictor;
+    unsigned int *d = (unsigned int *)dst;
+    d[0] = p[0];
+    d[4] = p[4];
+    d[8] = p[8];
+    d[12] = p[12];
+}
+static int rd_pick_intra4x4block(
     VP8_COMP *cpi,
     MACROBLOCK *x,
     BLOCK *be,
     BLOCKD *b,
     B_PREDICTION_MODE *best_mode,
-    B_PREDICTION_MODE above,
-    B_PREDICTION_MODE left,
+    unsigned int *bmode_costs,
     ENTROPY_CONTEXT *a,
     ENTROPY_CONTEXT *l,
 
@@ -600,36 +608,36 @@
     int *bestdistortion)
 {
     B_PREDICTION_MODE mode;
-    int best_rd = INT_MAX;       // 1<<30
+    int best_rd = INT_MAX;
     int rate = 0;
     int distortion;
-    unsigned int *mode_costs;
 
     ENTROPY_CONTEXT ta = *a, tempa = *a;
     ENTROPY_CONTEXT tl = *l, templ = *l;
-
-
-    if (x->e_mbd.frame_type == KEY_FRAME)
-    {
-        mode_costs  = x->bmode_costs[above][left];
-    }
-    else
-    {
-        mode_costs = x->inter_bmode_costs;
-    }
+    /*
+     * The predictor buffer is a 2d buffer with a stride of 16.  Create
+     * a temp buffer that meets the stride requirements, but we are only
+     * interested in the left 4x4 block
+     * */
+    DECLARE_ALIGNED_ARRAY(16, unsigned char,  best_predictor, 16*4);
+    DECLARE_ALIGNED_ARRAY(16, short, best_dqcoeff, 16);
 
     for (mode = B_DC_PRED; mode <= B_HU_PRED; mode++)
     {
         int this_rd;
         int ratey;
 
-        rate = mode_costs[mode];
-        vp8_encode_intra4x4block_rd(IF_RTCD(&cpi->rtcd), x, be, b, mode);
+        rate = bmode_costs[mode];
+
+        vp8_predict_intra4x4(b, mode, b->predictor);
+        ENCODEMB_INVOKE(IF_RTCD(&cpi->rtcd.encodemb), subb)(be, b, 16);
+        x->vp8_short_fdct4x4(be->src_diff, be->coeff, 32);
+        x->quantize_b(be, b);
 
         tempa = ta;
         templ = tl;
 
-        ratey = cost_coeffs(x, b, 3, &tempa, &templ);
+        ratey = cost_coeffs(x, b, PLANE_TYPE_Y_WITH_DC, &tempa, &templ);
         rate += ratey;
         distortion = ENCODEMB_INVOKE(IF_RTCD(&cpi->rtcd.encodemb), berr)(be->coeff, b->dqcoeff) >> 2;
 
@@ -644,25 +652,32 @@
             *best_mode = mode;
             *a = tempa;
             *l = templ;
+            copy_predictor(best_predictor, b->predictor);
+            vpx_memcpy(best_dqcoeff, b->dqcoeff, 32);
         }
     }
 
     b->bmi.mode = (B_PREDICTION_MODE)(*best_mode);
-    vp8_encode_intra4x4block_rd(IF_RTCD(&cpi->rtcd), x, be, b, b->bmi.mode);
 
+    IDCT_INVOKE(IF_RTCD(&cpi->rtcd.common->idct), idct16)(best_dqcoeff, b->diff, 32);
+    RECON_INVOKE(IF_RTCD(&cpi->rtcd.common->recon), recon)(best_predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
+
+    return best_rd;
 }
 
-
-int vp8_rd_pick_intra4x4mby_modes(VP8_COMP *cpi, MACROBLOCK *mb, int *Rate, int *rate_y, int *Distortion)
+int vp8_rd_pick_intra4x4mby_modes(VP8_COMP *cpi, MACROBLOCK *mb, int *Rate,
+                                  int *rate_y, int *Distortion, int best_rd)
 {
     MACROBLOCKD *const xd = &mb->e_mbd;
     int i;
     int cost = mb->mbmode_cost [xd->frame_type] [B_PRED];
     int distortion = 0;
     int tot_rate_y = 0;
+    long long total_rd = 0;
     ENTROPY_CONTEXT_PLANES t_above, t_left;
     ENTROPY_CONTEXT *ta;
     ENTROPY_CONTEXT *tl;
+    unsigned int *bmode_costs;
 
     vpx_memcpy(&t_above, mb->e_mbd.above_context, sizeof(ENTROPY_CONTEXT_PLANES));
     vpx_memcpy(&t_left, mb->e_mbd.left_context, sizeof(ENTROPY_CONTEXT_PLANES));
@@ -672,17 +687,25 @@
 
     vp8_intra_prediction_down_copy(xd);
 
+    bmode_costs = mb->inter_bmode_costs;
+
     for (i = 0; i < 16; i++)
     {
         MODE_INFO *const mic = xd->mode_info_context;
         const int mis = xd->mode_info_stride;
-        const B_PREDICTION_MODE A = vp8_above_bmi(mic, i, mis)->mode;
-        const B_PREDICTION_MODE L = vp8_left_bmi(mic, i)->mode;
         B_PREDICTION_MODE UNINITIALIZED_IS_SAFE(best_mode);
         int UNINITIALIZED_IS_SAFE(r), UNINITIALIZED_IS_SAFE(ry), UNINITIALIZED_IS_SAFE(d);
 
-        rd_pick_intra4x4block(
-            cpi, mb, mb->block + i, xd->block + i, &best_mode, A, L,
+        if (mb->e_mbd.frame_type == KEY_FRAME)
+        {
+            const B_PREDICTION_MODE A = vp8_above_bmi(mic, i, mis)->mode;
+            const B_PREDICTION_MODE L = vp8_left_bmi(mic, i)->mode;
+
+            bmode_costs  = mb->bmode_costs[A][L];
+        }
+
+        total_rd += rd_pick_intra4x4block(
+            cpi, mb, mb->block + i, xd->block + i, &best_mode, bmode_costs,
             ta + vp8_block2above[i],
             tl + vp8_block2left[i], &r, &ry, &d);
 
@@ -690,15 +713,20 @@
         distortion += d;
         tot_rate_y += ry;
         mic->bmi[i].mode = xd->block[i].bmi.mode = best_mode;
+
+        if(total_rd >= (long long)best_rd)
+            break;
     }
 
+    if(total_rd >= (long long)best_rd)
+        return INT_MAX;
+
     *Rate = cost;
     *rate_y += tot_rate_y;
     *Distortion = distortion;
 
     return RDCOST(mb->rdmult, mb->rddiv, cost, distortion);
 }
-
 int vp8_rd_pick_intra16x16mby_mode(VP8_COMP *cpi,
                                    MACROBLOCK *x,
                                    int *Rate,
@@ -717,7 +745,8 @@
     {
         x->e_mbd.mode_info_context->mbmi.mode = mode;
 
-        vp8_build_intra_predictors_mby_ptr(&x->e_mbd);
+        RECON_INVOKE(&cpi->common.rtcd.recon, build_intra_predictors_mby)
+            (&x->e_mbd);
 
         macro_block_yrd(x, &ratey, &distortion, IF_RTCD(&cpi->rtcd.encodemb));
         rate = ratey + x->mbmode_cost[x->e_mbd.frame_type]
@@ -754,27 +783,14 @@
     ta = (ENTROPY_CONTEXT *)&t_above;
     tl = (ENTROPY_CONTEXT *)&t_left;
 
-    for (b = 16; b < 20; b++)
-        cost += cost_coeffs(mb, x->block + b, vp8_block2type[b],
-                    ta + vp8_block2above[b], tl + vp8_block2left[b]);
-
-    for (b = 20; b < 24; b++)
-        cost += cost_coeffs(mb, x->block + b, vp8_block2type[b],
+    for (b = 16; b < 24; b++)
+        cost += cost_coeffs(mb, x->block + b, PLANE_TYPE_UV,
                     ta + vp8_block2above[b], tl + vp8_block2left[b]);
 
     return cost;
 }
 
 
-unsigned int vp8_get_mbuvrecon_error(const vp8_variance_rtcd_vtable_t *rtcd, const MACROBLOCK *x) // sum of squares
-{
-    unsigned int sse0, sse1;
-    int sum0, sum1;
-    VARIANCE_INVOKE(rtcd, get8x8var)(x->src.u_buffer, x->src.uv_stride, x->e_mbd.dst.u_buffer, x->e_mbd.dst.uv_stride, &sse0, &sum0);
-    VARIANCE_INVOKE(rtcd, get8x8var)(x->src.v_buffer, x->src.uv_stride, x->e_mbd.dst.v_buffer, x->e_mbd.dst.uv_stride, &sse1, &sum1);
-    return (sse0 + sse1);
-}
-
 static int vp8_rd_inter_uv(VP8_COMP *cpi, MACROBLOCK *x, int *rate, int *distortion, int fullpixel)
 {
     vp8_build_uvmvs(&x->e_mbd, fullpixel);
@@ -802,7 +818,12 @@
         int this_rd;
 
         x->e_mbd.mode_info_context->mbmi.uv_mode = mode;
-        vp8_encode_intra16x16mbuvrd(IF_RTCD(&cpi->rtcd), x);
+        vp8_build_intra_predictors_mbuv(&x->e_mbd);
+        ENCODEMB_INVOKE(IF_RTCD(&cpi->rtcd.encodemb), submbuv)(x->src_diff,
+                      x->src.u_buffer, x->src.v_buffer, x->e_mbd.predictor,
+                      x->src.uv_stride);
+        vp8_transform_mbuv(x);
+        vp8_quantize_mbuv(x);
 
         rate_to = rd_cost_mbuv(x);
         rate = rate_to + x->intra_uv_mode_cost[x->e_mbd.frame_type][x->e_mbd.mode_info_context->mbmi.uv_mode];
@@ -944,7 +965,7 @@
 
     for (b = 0; b < 16; b++)
         if (labels[ b] == which_label)
-            cost += cost_coeffs(mb, x->block + b, 3,
+            cost += cost_coeffs(mb, x->block + b, PLANE_TYPE_Y_WITH_DC,
                                 ta + vp8_block2above[b],
                                 tl + vp8_block2left[b]);
 
@@ -979,13 +1000,6 @@
     return distortion;
 }
 
-unsigned char vp8_mbsplit_offset2[4][16] = {
-    { 0,  8,  0,  0,  0,  0,  0,  0,  0,  0,   0,  0,  0,  0,  0,  0},
-    { 0,  2,  0,  0,  0,  0,  0,  0,  0,  0,   0,  0,  0,  0,  0,  0},
-    { 0,  2,  8, 10,  0,  0,  0,  0,  0,  0,   0,  0,  0,  0,  0,  0},
-    { 0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15}
-};
-
 
 static const unsigned int segmentation_to_sseshift[4] = {3, 3, 2, 0};
 
@@ -1013,8 +1027,8 @@
 } BEST_SEG_INFO;
 
 
-void vp8_rd_check_segment(VP8_COMP *cpi, MACROBLOCK *x, BEST_SEG_INFO *bsi,
-                         unsigned int segmentation)
+static void rd_check_segment(VP8_COMP *cpi, MACROBLOCK *x,
+                             BEST_SEG_INFO *bsi, unsigned int segmentation)
 {
     int i;
     int const *labels;
@@ -1132,14 +1146,14 @@
                     int sadpb = x->sadperbit4;
 
                     // find first label
-                    n = vp8_mbsplit_offset2[segmentation][i];
+                    n = vp8_mbsplit_offset[segmentation][i];
 
                     c = &x->block[n];
                     e = &x->e_mbd.block[n];
 
                     if (cpi->sf.search_method == HEX)
                         bestsme = vp8_hex_search(x, c, e, bsi->ref_mv,
-                                                 &mode_mv[NEW4X4], step_param, sadpb, &num00, v_fn_ptr, x->mvsadcost, x->mvcost);
+                                                 &mode_mv[NEW4X4], step_param, sadpb, &num00, v_fn_ptr, x->mvsadcost, x->mvcost, bsi->ref_mv);
 
                     else
                     {
@@ -1311,16 +1325,16 @@
     {
         /* for now, we will keep the original segmentation order
            when in best quality mode */
-        vp8_rd_check_segment(cpi, x, &bsi, BLOCK_16X8);
-        vp8_rd_check_segment(cpi, x, &bsi, BLOCK_8X16);
-        vp8_rd_check_segment(cpi, x, &bsi, BLOCK_8X8);
-        vp8_rd_check_segment(cpi, x, &bsi, BLOCK_4X4);
+        rd_check_segment(cpi, x, &bsi, BLOCK_16X8);
+        rd_check_segment(cpi, x, &bsi, BLOCK_8X16);
+        rd_check_segment(cpi, x, &bsi, BLOCK_8X8);
+        rd_check_segment(cpi, x, &bsi, BLOCK_4X4);
     }
     else
     {
         int sr;
 
-        vp8_rd_check_segment(cpi, x, &bsi, BLOCK_8X8);
+        rd_check_segment(cpi, x, &bsi, BLOCK_8X8);
 
         if (bsi.segment_rd < best_rd)
         {
@@ -1359,7 +1373,7 @@
                 sr = MAXF((abs(bsi.sv_mvp[1].row - bsi.sv_mvp[3].row))>>3, (abs(bsi.sv_mvp[1].col - bsi.sv_mvp[3].col))>>3);
                 vp8_cal_step_param(sr, &bsi.sv_istep[1]);
 
-                vp8_rd_check_segment(cpi, x, &bsi, BLOCK_8X16);
+                rd_check_segment(cpi, x, &bsi, BLOCK_8X16);
             }
 
             /* block 16X8 */
@@ -1370,7 +1384,7 @@
                 sr = MAXF((abs(bsi.sv_mvp[2].row - bsi.sv_mvp[3].row))>>3, (abs(bsi.sv_mvp[2].col - bsi.sv_mvp[3].col))>>3);
                 vp8_cal_step_param(sr, &bsi.sv_istep[1]);
 
-                vp8_rd_check_segment(cpi, x, &bsi, BLOCK_16X8);
+                rd_check_segment(cpi, x, &bsi, BLOCK_16X8);
             }
 
             /* If 8x8 is better than 16x8/8x16, then do 4x4 search */
@@ -1378,7 +1392,7 @@
             if (cpi->sf.no_skip_block4x4_search || bsi.segment_num == BLOCK_8X8)  /* || (sv_segment_rd8x8-bsi.segment_rd) < sv_segment_rd8x8>>5) */
             {
                 bsi.mvp = &bsi.sv_mvp[0];
-                vp8_rd_check_segment(cpi, x, &bsi, BLOCK_4X4);
+                rd_check_segment(cpi, x, &bsi, BLOCK_4X4);
             }
 
             /* restore UMV window */
@@ -1411,7 +1425,7 @@
     {
         int j;
 
-        j = vp8_mbsplit_offset2[bsi.segment_num][i];
+        j = vp8_mbsplit_offset[bsi.segment_num][i];
 
         x->partition_info->bmi[i].mode = x->e_mbd.block[j].bmi.mode;
         x->partition_info->bmi[i].mv.as_mv = x->e_mbd.block[j].bmi.mv.as_mv;
@@ -1419,6 +1433,7 @@
 
     return bsi.segment_rd;
 }
+#endif
 
 static void swap(int *x,int *y)
 {
@@ -1651,6 +1666,62 @@
     vp8_clamp_mv(mvp, xd);
 }
 
+void vp8_cal_sad(VP8_COMP *cpi, MACROBLOCKD *xd, MACROBLOCK *x, int recon_yoffset, int near_sadidx[])
+{
+
+    int near_sad[8] = {0}; // 0-cf above, 1-cf left, 2-cf aboveleft, 3-lf current, 4-lf above, 5-lf left, 6-lf right, 7-lf below
+
+    //calculate sad for current frame 3 nearby MBs.
+    if( xd->mb_to_top_edge==0 && xd->mb_to_left_edge ==0)
+    {
+        near_sad[0] = near_sad[1] = near_sad[2] = INT_MAX;
+    }else if(xd->mb_to_top_edge==0)
+    {   //only has left MB for sad calculation.
+        near_sad[0] = near_sad[2] = INT_MAX;
+        near_sad[1] = cpi->fn_ptr[BLOCK_16X16].sdf(x->src.y_buffer, x->src.y_stride, xd->dst.y_buffer - 16,xd->dst.y_stride, 0x7fffffff);
+    }else if(xd->mb_to_left_edge ==0)
+    {   //only has left MB for sad calculation.
+        near_sad[1] = near_sad[2] = INT_MAX;
+        near_sad[0] = cpi->fn_ptr[BLOCK_16X16].sdf(x->src.y_buffer, x->src.y_stride, xd->dst.y_buffer - xd->dst.y_stride *16,xd->dst.y_stride, 0x7fffffff);
+    }else
+    {
+        near_sad[0] = cpi->fn_ptr[BLOCK_16X16].sdf(x->src.y_buffer, x->src.y_stride, xd->dst.y_buffer - xd->dst.y_stride *16,xd->dst.y_stride, 0x7fffffff);
+        near_sad[1] = cpi->fn_ptr[BLOCK_16X16].sdf(x->src.y_buffer, x->src.y_stride, xd->dst.y_buffer - 16,xd->dst.y_stride, 0x7fffffff);
+        near_sad[2] = cpi->fn_ptr[BLOCK_16X16].sdf(x->src.y_buffer, x->src.y_stride, xd->dst.y_buffer - xd->dst.y_stride *16 -16,xd->dst.y_stride, 0x7fffffff);
+    }
+
+    if(cpi->common.last_frame_type != KEY_FRAME)
+    {
+        //calculate sad for last frame 5 nearby MBs.
+        unsigned char *pre_y_buffer = cpi->common.yv12_fb[cpi->common.lst_fb_idx].y_buffer + recon_yoffset;
+        int pre_y_stride = cpi->common.yv12_fb[cpi->common.lst_fb_idx].y_stride;
+
+        if(xd->mb_to_top_edge==0) near_sad[4] = INT_MAX;
+        if(xd->mb_to_left_edge ==0) near_sad[5] = INT_MAX;
+        if(xd->mb_to_right_edge ==0) near_sad[6] = INT_MAX;
+        if(xd->mb_to_bottom_edge==0) near_sad[7] = INT_MAX;
+
+        if(near_sad[4] != INT_MAX)
+            near_sad[4] = cpi->fn_ptr[BLOCK_16X16].sdf(x->src.y_buffer, x->src.y_stride, pre_y_buffer - pre_y_stride *16, pre_y_stride, 0x7fffffff);
+        if(near_sad[5] != INT_MAX)
+            near_sad[5] = cpi->fn_ptr[BLOCK_16X16].sdf(x->src.y_buffer, x->src.y_stride, pre_y_buffer - 16, pre_y_stride, 0x7fffffff);
+        near_sad[3] = cpi->fn_ptr[BLOCK_16X16].sdf(x->src.y_buffer, x->src.y_stride, pre_y_buffer, pre_y_stride, 0x7fffffff);
+        if(near_sad[6] != INT_MAX)
+            near_sad[6] = cpi->fn_ptr[BLOCK_16X16].sdf(x->src.y_buffer, x->src.y_stride, pre_y_buffer + 16, pre_y_stride, 0x7fffffff);
+        if(near_sad[7] != INT_MAX)
+            near_sad[7] = cpi->fn_ptr[BLOCK_16X16].sdf(x->src.y_buffer, x->src.y_stride, pre_y_buffer + pre_y_stride *16, pre_y_stride, 0x7fffffff);
+    }
+
+    if(cpi->common.last_frame_type != KEY_FRAME)
+    {
+        quicksortsad(near_sad, near_sadidx, 0, 7);
+    }else
+    {
+        quicksortsad(near_sad, near_sadidx, 0, 2);
+    }
+}
+
+#if !(CONFIG_REALTIME_ONLY)
 int vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int recon_uvoffset, int *returnrate, int *returndistortion, int *returnintra)
 {
     BLOCK *b = &x->block[0];
@@ -1688,7 +1759,6 @@
     int force_no_skip = 0;
 
     MV mvp;
-    int near_sad[8] = {0}; // 0-cf above, 1-cf left, 2-cf aboveleft, 3-lf current, 4-lf above, 5-lf left, 6-lf right, 7-lf below
     int near_sadidx[8] = {0, 1, 2, 3, 4, 5, 6, 7};
     int saddone=0;
     int sr=0;    //search range got from mv_pred(). It uses step_param levels. (0-7)
@@ -1835,60 +1905,11 @@
             lf_or_gf = frame_lf_or_gf[x->e_mbd.mode_info_context->mbmi.ref_frame];
         }
 
-
         if(x->e_mbd.mode_info_context->mbmi.mode == NEWMV)
         {
             if(!saddone)
             {
-                //calculate sad for current frame 3 nearby MBs.
-                if( xd->mb_to_top_edge==0 && xd->mb_to_left_edge ==0)
-                {
-                    near_sad[0] = near_sad[1] = near_sad[2] = INT_MAX;
-                }else if(xd->mb_to_top_edge==0)
-                {   //only has left MB for sad calculation.
-                    near_sad[0] = near_sad[2] = INT_MAX;
-                    near_sad[1] = cpi->fn_ptr[BLOCK_16X16].sdf(x->src.y_buffer, x->src.y_stride, xd->dst.y_buffer - 16,xd->dst.y_stride, 0x7fffffff);
-                }else if(xd->mb_to_left_edge ==0)
-                {   //only has left MB for sad calculation.
-                    near_sad[1] = near_sad[2] = INT_MAX;
-                    near_sad[0] = cpi->fn_ptr[BLOCK_16X16].sdf(x->src.y_buffer, x->src.y_stride, xd->dst.y_buffer - xd->dst.y_stride *16,xd->dst.y_stride, 0x7fffffff);
-                }else
-                {
-                    near_sad[0] = cpi->fn_ptr[BLOCK_16X16].sdf(x->src.y_buffer, x->src.y_stride, xd->dst.y_buffer - xd->dst.y_stride *16,xd->dst.y_stride, 0x7fffffff);
-                    near_sad[1] = cpi->fn_ptr[BLOCK_16X16].sdf(x->src.y_buffer, x->src.y_stride, xd->dst.y_buffer - 16,xd->dst.y_stride, 0x7fffffff);
-                    near_sad[2] = cpi->fn_ptr[BLOCK_16X16].sdf(x->src.y_buffer, x->src.y_stride, xd->dst.y_buffer - xd->dst.y_stride *16 -16,xd->dst.y_stride, 0x7fffffff);
-                }
-
-                if(cpi->common.last_frame_type != KEY_FRAME)
-                {
-                    //calculate sad for last frame 5 nearby MBs.
-                    unsigned char *pre_y_buffer = cpi->common.yv12_fb[cpi->common.lst_fb_idx].y_buffer + recon_yoffset;
-                    int pre_y_stride = cpi->common.yv12_fb[cpi->common.lst_fb_idx].y_stride;
-
-                    if(xd->mb_to_top_edge==0) near_sad[4] = INT_MAX;
-                    if(xd->mb_to_left_edge ==0) near_sad[5] = INT_MAX;
-                    if(xd->mb_to_right_edge ==0) near_sad[6] = INT_MAX;
-                    if(xd->mb_to_bottom_edge==0) near_sad[7] = INT_MAX;
-
-                    if(near_sad[4] != INT_MAX)
-                        near_sad[4] = cpi->fn_ptr[BLOCK_16X16].sdf(x->src.y_buffer, x->src.y_stride, pre_y_buffer - pre_y_stride *16, pre_y_stride, 0x7fffffff);
-                    if(near_sad[5] != INT_MAX)
-                        near_sad[5] = cpi->fn_ptr[BLOCK_16X16].sdf(x->src.y_buffer, x->src.y_stride, pre_y_buffer - 16, pre_y_stride, 0x7fffffff);
-                    near_sad[3] = cpi->fn_ptr[BLOCK_16X16].sdf(x->src.y_buffer, x->src.y_stride, pre_y_buffer, pre_y_stride, 0x7fffffff);
-                    if(near_sad[6] != INT_MAX)
-                        near_sad[6] = cpi->fn_ptr[BLOCK_16X16].sdf(x->src.y_buffer, x->src.y_stride, pre_y_buffer + 16, pre_y_stride, 0x7fffffff);
-                    if(near_sad[7] != INT_MAX)
-                        near_sad[7] = cpi->fn_ptr[BLOCK_16X16].sdf(x->src.y_buffer, x->src.y_stride, pre_y_buffer + pre_y_stride *16, pre_y_stride, 0x7fffffff);
-                }
-
-                if(cpi->common.last_frame_type != KEY_FRAME)
-                {
-                    quicksortsad(near_sad, near_sadidx, 0, 7);
-                }else
-                {
-                    quicksortsad(near_sad, near_sadidx, 0, 2);
-                }
-
+                vp8_cal_sad(cpi,xd,x, recon_yoffset ,&near_sadidx[0] );
                 saddone = 1;
             }
 
@@ -1941,7 +1962,7 @@
                     else
                         cpi->zbin_mode_boost = LF_ZEROMV_ZBIN_BOOST;
                 }
-                else if (vp8_ref_frame_order[mode_index] == SPLITMV)
+                else if (vp8_mode_order[mode_index] == SPLITMV)
                     cpi->zbin_mode_boost = 0;
                 else
                     cpi->zbin_mode_boost = MV_ZBIN_BOOST;
@@ -1953,15 +1974,28 @@
         switch (this_mode)
         {
         case B_PRED:
+        {
+            int tmp_rd;
+
             // Note the rate value returned here includes the cost of coding the BPRED mode : x->mbmode_cost[x->e_mbd.frame_type][BPRED];
-            vp8_rd_pick_intra4x4mby_modes(cpi, x, &rate, &rate_y, &distortion);
+            tmp_rd = vp8_rd_pick_intra4x4mby_modes(cpi, x, &rate, &rate_y, &distortion, best_yrd);
             rate2 += rate;
             distortion2 += distortion;
-            rate2 += uv_intra_rate;
-            rate_uv = uv_intra_rate_tokenonly;
-            distortion2 += uv_intra_distortion;
-            distortion_uv = uv_intra_distortion;
-            break;
+
+            if(tmp_rd < best_yrd)
+            {
+                rate2 += uv_intra_rate;
+                rate_uv = uv_intra_rate_tokenonly;
+                distortion2 += uv_intra_distortion;
+                distortion_uv = uv_intra_distortion;
+            }
+            else
+            {
+                this_rd = INT_MAX;
+                disable_skip = 1;
+            }
+        }
+        break;
 
         case SPLITMV:
         {
@@ -1998,7 +2032,8 @@
         case H_PRED:
         case TM_PRED:
             x->e_mbd.mode_info_context->mbmi.ref_frame = INTRA_FRAME;
-            vp8_build_intra_predictors_mby_ptr(&x->e_mbd);
+            RECON_INVOKE(&cpi->common.rtcd.recon, build_intra_predictors_mby)
+                (&x->e_mbd);
             macro_block_yrd(x, &rate_y, &distortion, IF_RTCD(&cpi->rtcd.encodemb)) ;
             rate2 += rate_y;
             distortion2 += distortion;
@@ -2061,7 +2096,7 @@
 
                     if (cpi->sf.search_method == HEX)
                     {
-                        bestsme = vp8_hex_search(x, b, d, &best_ref_mv, &d->bmi.mv.as_mv, step_param, sadpb/*x->errorperbit*/, &num00, &cpi->fn_ptr[BLOCK_16X16], x->mvsadcost, x->mvcost);
+                        bestsme = vp8_hex_search(x, b, d, &best_ref_mv, &d->bmi.mv.as_mv, step_param, sadpb/*x->errorperbit*/, &num00, &cpi->fn_ptr[BLOCK_16X16], x->mvsadcost, x->mvcost, &best_ref_mv);
                         mode_mv[NEWMV].row = d->bmi.mv.as_mv.row;
                         mode_mv[NEWMV].col = d->bmi.mv.as_mv.col;
                     }

diff --git a/vp8/encoder/rdopt.h b/vp8/encoder/rdopt.h
index fb74dd4..72ba9a0 100644
--- a/vp8/encoder/rdopt.h
+++ b/vp8/encoder/rdopt.h

@@ -12,10 +12,22 @@
 #ifndef __INC_RDOPT_H
 #define __INC_RDOPT_H
 void vp8_initialize_rd_consts(VP8_COMP *cpi, int Qvalue);
-int vp8_rd_pick_intra4x4mby_modes(VP8_COMP *cpi, MACROBLOCK *mb, int *rate, int *rate_to, int *distortion);
+int vp8_rd_pick_intra4x4mby_modes(VP8_COMP *cpi, MACROBLOCK *mb, int *rate, int *rate_to, int *distortion, int best_rd);
 int vp8_rd_pick_intra16x16mby_mode(VP8_COMP *cpi, MACROBLOCK *x, int *returnrate, int *rate_to, int *returndistortion);
 int vp8_rd_pick_intra_mbuv_mode(VP8_COMP *cpi, MACROBLOCK *x, int *rate, int *rate_to, int *distortion);
 extern int vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int recon_uvoffset, int *returnrate, int *returndistortion, int *returnintra);
 
+extern void vp8_mv_pred
+(
+    VP8_COMP *cpi,
+    MACROBLOCKD *xd,
+    const MODE_INFO *here,
+    MV *mvp,
+    int refframe,
+    int *ref_frame_sign_bias,
+    int *sr,
+    int near_sadidx[]
+);
+void vp8_cal_sad(VP8_COMP *cpi, MACROBLOCKD *xd, MACROBLOCK *x, int recon_yoffset, int near_sadidx[]);
 
 #endif

diff --git a/vp8/encoder/segmentation.h b/vp8/encoder/segmentation.h
index 216e194..12815b0 100644
--- a/vp8/encoder/segmentation.h
+++ b/vp8/encoder/segmentation.h

@@ -10,7 +10,7 @@
 
 
 #include "string.h"
-#include "blockd.h"
+#include "vp8/common/blockd.h"
 #include "onyx_int.h"
 
 extern void vp8_update_gf_useage_maps(VP8_COMP *cpi, VP8_COMMON *cm, MACROBLOCK *x);

diff --git a/vp8/encoder/ssim.c b/vp8/encoder/ssim.c
index 4ebcba1..64d67c6 100644
--- a/vp8/encoder/ssim.c
+++ b/vp8/encoder/ssim.c

@@ -11,298 +11,13 @@
 
 #include "vpx_scale/yv12config.h"
 #include "math.h"
+#include "onyx_int.h"
 
-#define C1 (float)(64 * 64 * 0.01*255*0.01*255)
-#define C2 (float)(64 * 64 * 0.03*255*0.03*255)
-
-static int width_y;
-static int height_y;
-static int height_uv;
-static int width_uv;
-static int stride_uv;
-static int stride;
-static int lumimask;
-static int luminance;
-static double plane_summed_weights = 0;
-
-static short img12_sum_block[8*4096*4096*2] ;
-
-static short img1_sum[8*4096*2];
-static short img2_sum[8*4096*2];
-static int   img1_sq_sum[8*4096*2];
-static int   img2_sq_sum[8*4096*2];
-static int   img12_mul_sum[8*4096*2];
-
-
-double vp8_similarity
-(
-    int mu_x,
-    int mu_y,
-    int pre_mu_x2,
-    int pre_mu_y2,
-    int pre_mu_xy2
-)
-{
-    int mu_x2, mu_y2, mu_xy, theta_x2, theta_y2, theta_xy;
-
-    mu_x2 = mu_x * mu_x;
-    mu_y2 = mu_y * mu_y;
-    mu_xy = mu_x * mu_y;
-
-    theta_x2 = 64 * pre_mu_x2 - mu_x2;
-    theta_y2 = 64 * pre_mu_y2 - mu_y2;
-    theta_xy = 64 * pre_mu_xy2 - mu_xy;
-
-    return (2 * mu_xy + C1) * (2 * theta_xy + C2) / ((mu_x2 + mu_y2 + C1) * (theta_x2 + theta_y2 + C2));
-}
-
-double vp8_ssim
-(
-    const unsigned char *img1,
-    const unsigned char *img2,
-    int stride_img1,
-    int stride_img2,
-    int width,
-    int height
-)
-{
-    int x, y, x2, y2, img1_block, img2_block, img1_sq_block, img2_sq_block, img12_mul_block, temp;
-
-    double plane_quality, weight, mean;
-
-    short *img1_sum_ptr1, *img1_sum_ptr2;
-    short *img2_sum_ptr1, *img2_sum_ptr2;
-    int *img1_sq_sum_ptr1, *img1_sq_sum_ptr2;
-    int *img2_sq_sum_ptr1, *img2_sq_sum_ptr2;
-    int *img12_mul_sum_ptr1, *img12_mul_sum_ptr2;
-
-    plane_quality = 0;
-
-    if (lumimask)
-        plane_summed_weights = 0.0f;
-    else
-        plane_summed_weights = (height - 7) * (width - 7);
-
-    //some prologue for the main loop
-    temp = 8 * width;
-
-    img1_sum_ptr1      = img1_sum + temp;
-    img2_sum_ptr1      = img2_sum + temp;
-    img1_sq_sum_ptr1   = img1_sq_sum + temp;
-    img2_sq_sum_ptr1   = img2_sq_sum + temp;
-    img12_mul_sum_ptr1 = img12_mul_sum + temp;
-
-    for (x = 0; x < width; x++)
-    {
-        img1_sum[x]      = img1[x];
-        img2_sum[x]      = img2[x];
-        img1_sq_sum[x]   = img1[x] * img1[x];
-        img2_sq_sum[x]   = img2[x] * img2[x];
-        img12_mul_sum[x] = img1[x] * img2[x];
-
-        img1_sum_ptr1[x]      = 0;
-        img2_sum_ptr1[x]      = 0;
-        img1_sq_sum_ptr1[x]   = 0;
-        img2_sq_sum_ptr1[x]   = 0;
-        img12_mul_sum_ptr1[x] = 0;
-    }
-
-    //the main loop
-    for (y = 1; y < height; y++)
-    {
-        img1 += stride_img1;
-        img2 += stride_img2;
-
-        temp = (y - 1) % 9 * width;
-
-        img1_sum_ptr1      = img1_sum + temp;
-        img2_sum_ptr1      = img2_sum + temp;
-        img1_sq_sum_ptr1   = img1_sq_sum + temp;
-        img2_sq_sum_ptr1   = img2_sq_sum + temp;
-        img12_mul_sum_ptr1 = img12_mul_sum + temp;
-
-        temp = y % 9 * width;
-
-        img1_sum_ptr2      = img1_sum + temp;
-        img2_sum_ptr2      = img2_sum + temp;
-        img1_sq_sum_ptr2   = img1_sq_sum + temp;
-        img2_sq_sum_ptr2   = img2_sq_sum + temp;
-        img12_mul_sum_ptr2 = img12_mul_sum + temp;
-
-        for (x = 0; x < width; x++)
-        {
-            img1_sum_ptr2[x]      = img1_sum_ptr1[x] + img1[x];
-            img2_sum_ptr2[x]      = img2_sum_ptr1[x] + img2[x];
-            img1_sq_sum_ptr2[x]   = img1_sq_sum_ptr1[x] + img1[x] * img1[x];
-            img2_sq_sum_ptr2[x]   = img2_sq_sum_ptr1[x] + img2[x] * img2[x];
-            img12_mul_sum_ptr2[x] = img12_mul_sum_ptr1[x] + img1[x] * img2[x];
-        }
-
-        if (y > 6)
-        {
-            //calculate the sum of the last 8 lines by subtracting the total sum of 8 lines back from the present sum
-            temp = (y + 1) % 9 * width;
-
-            img1_sum_ptr1      = img1_sum + temp;
-            img2_sum_ptr1      = img2_sum + temp;
-            img1_sq_sum_ptr1   = img1_sq_sum + temp;
-            img2_sq_sum_ptr1   = img2_sq_sum + temp;
-            img12_mul_sum_ptr1 = img12_mul_sum + temp;
-
-            for (x = 0; x < width; x++)
-            {
-                img1_sum_ptr1[x]      = img1_sum_ptr2[x] - img1_sum_ptr1[x];
-                img2_sum_ptr1[x]      = img2_sum_ptr2[x] - img2_sum_ptr1[x];
-                img1_sq_sum_ptr1[x]   = img1_sq_sum_ptr2[x] - img1_sq_sum_ptr1[x];
-                img2_sq_sum_ptr1[x]   = img2_sq_sum_ptr2[x] - img2_sq_sum_ptr1[x];
-                img12_mul_sum_ptr1[x] = img12_mul_sum_ptr2[x] - img12_mul_sum_ptr1[x];
-            }
-
-            //here we calculate the sum over the 8x8 block of pixels
-            //this is done by sliding a window across the column sums for the last 8 lines
-            //each time adding the new column sum, and subtracting the one which fell out of the window
-            img1_block      = 0;
-            img2_block      = 0;
-            img1_sq_block   = 0;
-            img2_sq_block   = 0;
-            img12_mul_block = 0;
-
-            //prologue, and calculation of simularity measure from the first 8 column sums
-            for (x = 0; x < 8; x++)
-            {
-                img1_block      += img1_sum_ptr1[x];
-                img2_block      += img2_sum_ptr1[x];
-                img1_sq_block   += img1_sq_sum_ptr1[x];
-                img2_sq_block   += img2_sq_sum_ptr1[x];
-                img12_mul_block += img12_mul_sum_ptr1[x];
-            }
-
-            if (lumimask)
-            {
-                y2 = y - 7;
-                x2 = 0;
-
-                if (luminance)
-                {
-                    mean = (img2_block + img1_block) / 128.0f;
-
-                    if (!(y2 % 2 || x2 % 2))
-                        *(img12_sum_block + y2 / 2 * width_uv + x2 / 2) = img2_block + img1_block;
-                }
-                else
-                {
-                    mean = *(img12_sum_block + y2 * width_uv + x2);
-                    mean += *(img12_sum_block + y2 * width_uv + x2 + 4);
-                    mean += *(img12_sum_block + (y2 + 4) * width_uv + x2);
-                    mean += *(img12_sum_block + (y2 + 4) * width_uv + x2 + 4);
-
-                    mean /= 512.0f;
-                }
-
-                weight = mean < 40 ? 0.0f :
-                         (mean < 50 ? (mean - 40.0f) / 10.0f : 1.0f);
-                plane_summed_weights += weight;
-
-                plane_quality += weight * vp8_similarity(img1_block, img2_block, img1_sq_block, img2_sq_block, img12_mul_block);
-            }
-            else
-                plane_quality += vp8_similarity(img1_block, img2_block, img1_sq_block, img2_sq_block, img12_mul_block);
-
-            //and for the rest
-            for (x = 8; x < width; x++)
-            {
-                img1_block      = img1_block + img1_sum_ptr1[x] - img1_sum_ptr1[x - 8];
-                img2_block      = img2_block + img2_sum_ptr1[x] - img2_sum_ptr1[x - 8];
-                img1_sq_block   = img1_sq_block + img1_sq_sum_ptr1[x] - img1_sq_sum_ptr1[x - 8];
-                img2_sq_block   = img2_sq_block + img2_sq_sum_ptr1[x] - img2_sq_sum_ptr1[x - 8];
-                img12_mul_block = img12_mul_block + img12_mul_sum_ptr1[x] - img12_mul_sum_ptr1[x - 8];
-
-                if (lumimask)
-                {
-                    y2 = y - 7;
-                    x2 = x - 7;
-
-                    if (luminance)
-                    {
-                        mean = (img2_block + img1_block) / 128.0f;
-
-                        if (!(y2 % 2 || x2 % 2))
-                            *(img12_sum_block + y2 / 2 * width_uv + x2 / 2) = img2_block + img1_block;
-                    }
-                    else
-                    {
-                        mean = *(img12_sum_block + y2 * width_uv + x2);
-                        mean += *(img12_sum_block + y2 * width_uv + x2 + 4);
-                        mean += *(img12_sum_block + (y2 + 4) * width_uv + x2);
-                        mean += *(img12_sum_block + (y2 + 4) * width_uv + x2 + 4);
-
-                        mean /= 512.0f;
-                    }
-
-                    weight = mean < 40 ? 0.0f :
-                             (mean < 50 ? (mean - 40.0f) / 10.0f : 1.0f);
-                    plane_summed_weights += weight;
-
-                    plane_quality += weight * vp8_similarity(img1_block, img2_block, img1_sq_block, img2_sq_block, img12_mul_block);
-                }
-                else
-                    plane_quality += vp8_similarity(img1_block, img2_block, img1_sq_block, img2_sq_block, img12_mul_block);
-            }
-        }
-    }
-
-    if (plane_summed_weights == 0)
-        return 1.0f;
-    else
-        return plane_quality / plane_summed_weights;
-}
-
-double vp8_calc_ssim
-(
-    YV12_BUFFER_CONFIG *source,
-    YV12_BUFFER_CONFIG *dest,
-    int lumamask,
-    double *weight
-)
-{
-    double a, b, c;
-    double frame_weight;
-    double ssimv;
-
-    width_y = source->y_width;
-    height_y = source->y_height;
-    height_uv = source->uv_height;
-    width_uv = source->uv_width;
-    stride_uv = dest->uv_stride;
-    stride = dest->y_stride;
-
-    lumimask = lumamask;
-
-    luminance = 1;
-    a = vp8_ssim(source->y_buffer, dest->y_buffer,
-                 source->y_stride, dest->y_stride, source->y_width, source->y_height);
-    luminance = 0;
-
-    frame_weight = plane_summed_weights / ((width_y - 7) * (height_y - 7));
-
-    if (frame_weight == 0)
-        a = b = c = 1.0f;
-    else
-    {
-        b = vp8_ssim(source->u_buffer, dest->u_buffer,
-                     source->uv_stride, dest->uv_stride, source->uv_width, source->uv_height);
-
-        c = vp8_ssim(source->v_buffer, dest->v_buffer,
-                     source->uv_stride, dest->uv_stride, source->uv_width, source->uv_height);
-    }
-
-    ssimv = a * .8 + .1 * (b + c);
-
-    *weight = frame_weight;
-
-    return ssimv;
-}
-
+#if CONFIG_RUNTIME_CPU_DETECT
+#define IF_RTCD(x)  (x)
+#else
+#define IF_RTCD(x)  NULL
+#endif
 // Google version of SSIM
 // SSIM
 #define KERNEL 3
@@ -520,3 +235,174 @@
     *ssim_v /= uvsize;
     return ssim_all;
 }
+
+
+void ssim_parms_c
+(
+    unsigned char *s,
+    int sp,
+    unsigned char *r,
+    int rp,
+    unsigned long *sum_s,
+    unsigned long *sum_r,
+    unsigned long *sum_sq_s,
+    unsigned long *sum_sq_r,
+    unsigned long *sum_sxr
+)
+{
+    int i,j;
+    for(i=0;i<16;i++,s+=sp,r+=rp)
+     {
+         for(j=0;j<16;j++)
+         {
+             *sum_s += s[j];
+             *sum_r += r[j];
+             *sum_sq_s += s[j] * s[j];
+             *sum_sq_r += r[j] * r[j];
+             *sum_sxr += s[j] * r[j];
+         }
+     }
+}
+void ssim_parms_8x8_c
+(
+    unsigned char *s,
+    int sp,
+    unsigned char *r,
+    int rp,
+    unsigned long *sum_s,
+    unsigned long *sum_r,
+    unsigned long *sum_sq_s,
+    unsigned long *sum_sq_r,
+    unsigned long *sum_sxr
+)
+{
+    int i,j;
+    for(i=0;i<8;i++,s+=sp,r+=rp)
+     {
+         for(j=0;j<8;j++)
+         {
+             *sum_s += s[j];
+             *sum_r += r[j];
+             *sum_sq_s += s[j] * s[j];
+             *sum_sq_r += r[j] * r[j];
+             *sum_sxr += s[j] * r[j];
+         }
+     }
+}
+
+const static long long c1 =  426148; // (256^2*(.01*255)^2
+const static long long c2 = 3835331; //(256^2*(.03*255)^2
+
+static double similarity
+(
+    unsigned long sum_s,
+    unsigned long sum_r,
+    unsigned long sum_sq_s,
+    unsigned long sum_sq_r,
+    unsigned long sum_sxr,
+    int count
+)
+{
+    long long ssim_n = (2*sum_s*sum_r+ c1)*(2*count*sum_sxr-2*sum_s*sum_r+c2);
+
+    long long ssim_d = (sum_s*sum_s +sum_r*sum_r+c1)*
+            (count*sum_sq_s-sum_s*sum_s + count*sum_sq_r-sum_r*sum_r +c2) ;
+
+    return ssim_n * 1.0 / ssim_d;
+}
+
+static double ssim_16x16(unsigned char *s,int sp, unsigned char *r,int rp,
+            const vp8_variance_rtcd_vtable_t *rtcd)
+{
+    unsigned long sum_s=0,sum_r=0,sum_sq_s=0,sum_sq_r=0,sum_sxr=0;
+    rtcd->ssimpf(s, sp, r, rp, &sum_s, &sum_r, &sum_sq_s, &sum_sq_r, &sum_sxr);
+    return similarity(sum_s, sum_r, sum_sq_s, sum_sq_r, sum_sxr, 256);
+}
+static double ssim_8x8(unsigned char *s,int sp, unsigned char *r,int rp,
+                const vp8_variance_rtcd_vtable_t *rtcd)
+{
+    unsigned long sum_s=0,sum_r=0,sum_sq_s=0,sum_sq_r=0,sum_sxr=0;
+    rtcd->ssimpf_8x8(s, sp, r, rp, &sum_s, &sum_r, &sum_sq_s, &sum_sq_r, &sum_sxr);
+    return similarity(sum_s, sum_r, sum_sq_s, sum_sq_r, sum_sxr, 64);
+}
+
+// TODO: (jbb) tried to scale this function such that we may be able to use it
+// for distortion metric in mode selection code ( provided we do a reconstruction)
+long dssim(unsigned char *s,int sp, unsigned char *r,int rp,
+           const vp8_variance_rtcd_vtable_t *rtcd)
+{
+    unsigned long sum_s=0,sum_r=0,sum_sq_s=0,sum_sq_r=0,sum_sxr=0;
+    double ssim3;
+    long long ssim_n;
+    long long ssim_d;
+
+    rtcd->ssimpf(s, sp, r, rp, &sum_s, &sum_r, &sum_sq_s, &sum_sq_r, &sum_sxr);
+    ssim_n = (2*sum_s*sum_r+ c1)*(2*256*sum_sxr-2*sum_s*sum_r+c2);
+
+    ssim_d = (sum_s*sum_s +sum_r*sum_r+c1)*
+            (256*sum_sq_s-sum_s*sum_s + 256*sum_sq_r-sum_r*sum_r +c2) ;
+
+    ssim3 = 256 * (ssim_d-ssim_n) / ssim_d;
+    return (long)( 256*ssim3 * ssim3 );
+}
+// TODO: (jbb) this 8x8 window might be too big + we may want to pick pixels
+// such that the window regions overlap block boundaries to penalize blocking
+// artifacts.
+
+double vp8_ssim2
+(
+    unsigned char *img1,
+    unsigned char *img2,
+    int stride_img1,
+    int stride_img2,
+    int width,
+    int height,
+    const vp8_variance_rtcd_vtable_t *rtcd
+)
+{
+    int i,j;
+
+    double ssim_total=0;
+
+    // we can sample points as frequently as we like start with 1 per 8x8
+    for(i=0; i < height; i+=8, img1 += stride_img1*8, img2 += stride_img2*8)
+    {
+        for(j=0; j < width; j+=8 )
+        {
+            ssim_total += ssim_8x8(img1, stride_img1, img2, stride_img2, rtcd);
+        }
+    }
+    ssim_total /= (width/8 * height /8);
+    return ssim_total;
+
+}
+double vp8_calc_ssim
+(
+    YV12_BUFFER_CONFIG *source,
+    YV12_BUFFER_CONFIG *dest,
+    int lumamask,
+    double *weight,
+    const vp8_variance_rtcd_vtable_t *rtcd
+)
+{
+    double a, b, c;
+    double ssimv;
+
+    a = vp8_ssim2(source->y_buffer, dest->y_buffer,
+                 source->y_stride, dest->y_stride, source->y_width,
+                 source->y_height, rtcd);
+
+    b = vp8_ssim2(source->u_buffer, dest->u_buffer,
+                 source->uv_stride, dest->uv_stride, source->uv_width,
+                 source->uv_height, rtcd);
+
+    c = vp8_ssim2(source->v_buffer, dest->v_buffer,
+                 source->uv_stride, dest->uv_stride, source->uv_width,
+                 source->uv_height, rtcd);
+
+    ssimv = a * .8 + .1 * (b + c);
+
+    *weight = 1;
+
+    return ssimv;
+}

diff --git a/vp8/encoder/temporal_filter.c b/vp8/encoder/temporal_filter.c
index f28daaf..fd36b22 100644
--- a/vp8/encoder/temporal_filter.c
+++ b/vp8/encoder/temporal_filter.c

@@ -9,27 +9,26 @@
  */
 
 
-#include "onyxc_int.h"
+#include "vp8/common/onyxc_int.h"
 #include "onyx_int.h"
-#include "systemdependent.h"
+#include "vp8/common/systemdependent.h"
 #include "quantize.h"
-#include "alloccommon.h"
+#include "vp8/common/alloccommon.h"
 #include "mcomp.h"
 #include "firstpass.h"
 #include "psnr.h"
 #include "vpx_scale/vpxscale.h"
-#include "extend.h"
+#include "vp8/common/extend.h"
 #include "ratectrl.h"
-#include "quant_common.h"
+#include "vp8/common/quant_common.h"
 #include "segmentation.h"
-#include "g_common.h"
+#include "vp8/common/g_common.h"
 #include "vpx_scale/yv12extend.h"
-#include "postproc.h"
+#include "vp8/common/postproc.h"
 #include "vpx_mem/vpx_mem.h"
-#include "swapyv12buffer.h"
-#include "threading.h"
+#include "vp8/common/swapyv12buffer.h"
+#include "vp8/common/threading.h"
 #include "vpx_ports/vpx_timer.h"
-#include "vpxerrors.h"
 
 #include <math.h>
 #include <limits.h>
@@ -70,7 +69,7 @@
     // U & V
     mv_row >>= 1;
     mv_col >>= 1;
-    stride >>= 1;
+    stride = (stride + 1) >> 1;
     offset = (mv_row >> 3) * stride + (mv_col >> 3);
     uptr = u_mb_ptr + offset;
     vptr = v_mb_ptr + offset;
@@ -204,7 +203,7 @@
             step_param,
             sadpb/*x->errorperbit*/,
             &num00, &cpi->fn_ptr[BLOCK_16X16],
-            mvsadcost, mvcost);
+            mvsadcost, mvcost, &best_ref_mv1);
     }
     else
     {
@@ -288,8 +287,7 @@
     int byte;
     int frame;
     int mb_col, mb_row;
-    unsigned int filter_weight[MAX_LAG_BUFFERS];
-    unsigned char *mm_ptr = cpi->fp_motion_map;
+    unsigned int filter_weight;
     int mb_cols = cpi->common.mb_cols;
     int mb_rows = cpi->common.mb_rows;
     int MBs  = cpi->common.MBs;
@@ -307,13 +305,6 @@
     unsigned char *u_buffer = mbd->pre.u_buffer;
     unsigned char *v_buffer = mbd->pre.v_buffer;
 
-    if (!cpi->use_weighted_temporal_filter)
-    {
-        // Temporal filtering is unweighted
-        for (frame = 0; frame < frame_count; frame++)
-            filter_weight[frame] = 1;
-    }
-
     for (mb_row = 0; mb_row < mb_rows; mb_row++)
     {
 #if ALT_REF_MC_ENABLED
@@ -339,34 +330,9 @@
                                     + (VP8BORDERINPIXELS - 19);
 #endif
 
-            // Read & process macroblock weights from motion map
-            if (cpi->use_weighted_temporal_filter)
-            {
-                weight_cap = 2;
-
-                for (frame = alt_ref_index-1; frame >= 0; frame--)
-                {
-                    w = *(mm_ptr + (frame+1)*MBs);
-                    filter_weight[frame] = w < weight_cap ? w : weight_cap;
-                    weight_cap = w;
-                }
-
-                filter_weight[alt_ref_index] = 2;
-
-                weight_cap = 2;
-
-                for (frame = alt_ref_index+1; frame < frame_count; frame++)
-                {
-                    w = *(mm_ptr + frame*MBs);
-                    filter_weight[frame] = w < weight_cap ? w : weight_cap;
-                    weight_cap = w;
-                }
-
-            }
-
             for (frame = 0; frame < frame_count; frame++)
             {
-                int err;
+                int err = 0;
 
                 if (cpi->frames[frame] == NULL)
                     continue;
@@ -375,28 +341,25 @@
                 mbd->block[0].bmi.mv.as_mv.col = 0;
 
 #if ALT_REF_MC_ENABLED
-                //if (filter_weight[frame] == 0)
-                {
 #define THRESH_LOW   10000
 #define THRESH_HIGH  20000
 
-                    // Correlation has been lost try MC
-                    err = vp8_temporal_filter_find_matching_mb_c
-                        (cpi,
-                         cpi->frames[alt_ref_index],
-                         cpi->frames[frame],
-                         mb_y_offset,
-                         THRESH_LOW);
+                // Find best match in this frame by MC
+                err = vp8_temporal_filter_find_matching_mb_c
+                      (cpi,
+                       cpi->frames[alt_ref_index],
+                       cpi->frames[frame],
+                       mb_y_offset,
+                       THRESH_LOW);
 
-                    if (filter_weight[frame] < 2)
-                    {
-                        // Set weight depending on error
-                        filter_weight[frame] = err<THRESH_LOW
-                                                ? 2 : err<THRESH_HIGH ? 1 : 0;
-                    }
-                }
 #endif
-                if (filter_weight[frame] != 0)
+                // Assign higher weight to matching MB if it's error
+                // score is lower. If not applying MC default behavior
+                // is to weight all MBs equal.
+                filter_weight = err<THRESH_LOW
+                                  ? 2 : err<THRESH_HIGH ? 1 : 0;
+
+                if (filter_weight != 0)
                 {
                     // Construct the predictors
                     vp8_temporal_filter_predictors_mb_c
@@ -416,7 +379,7 @@
                          predictor,
                          16,
                          strength,
-                         filter_weight[frame],
+                         filter_weight,
                          accumulator,
                          count);
 
@@ -426,7 +389,7 @@
                          predictor + 256,
                          8,
                          strength,
-                         filter_weight[frame],
+                         filter_weight,
                          accumulator + 256,
                          count + 256);
 
@@ -436,7 +399,7 @@
                          predictor + 320,
                          8,
                          strength,
-                         filter_weight[frame],
+                         filter_weight,
                          accumulator + 320,
                          count + 320);
                 }
@@ -492,7 +455,6 @@
                 byte += stride - 8;
             }
 
-            mm_ptr++;
             mb_y_offset += 16;
             mb_uv_offset += 8;
         }

diff --git a/vp8/encoder/tokenize.c b/vp8/encoder/tokenize.c
index 5e01863..e3f423f 100644
--- a/vp8/encoder/tokenize.c
+++ b/vp8/encoder/tokenize.c

@@ -26,9 +26,9 @@
 void vp8_stuff_mb(VP8_COMP *cpi, MACROBLOCKD *x, TOKENEXTRA **t) ;
 void vp8_fix_contexts(MACROBLOCKD *x);
 
-TOKENVALUE vp8_dct_value_tokens[DCT_MAX_VALUE*2];
+static TOKENVALUE dct_value_tokens[DCT_MAX_VALUE*2];
 const TOKENVALUE *vp8_dct_value_tokens_ptr;
-int vp8_dct_value_cost[DCT_MAX_VALUE*2];
+static int dct_value_cost[DCT_MAX_VALUE*2];
 const int *vp8_dct_value_cost_ptr;
 #if 0
 int skip_true_count = 0;
@@ -37,7 +37,7 @@
 static void fill_value_tokens()
 {
 
-    TOKENVALUE *const t = vp8_dct_value_tokens + DCT_MAX_VALUE;
+    TOKENVALUE *const t = dct_value_tokens + DCT_MAX_VALUE;
     vp8_extra_bit_struct *const e = vp8_extra_bits;
 
     int i = -DCT_MAX_VALUE;
@@ -81,7 +81,7 @@
                     cost += vp8_treed_cost(p->tree, p->prob, extra >> 1, Length);
 
                 cost += vp8_cost_bit(vp8_prob_half, extra & 1); /* sign */
-                vp8_dct_value_cost[i + DCT_MAX_VALUE] = cost;
+                dct_value_cost[i + DCT_MAX_VALUE] = cost;
             }
 
         }
@@ -89,8 +89,8 @@
     }
     while (++i < DCT_MAX_VALUE);
 
-    vp8_dct_value_tokens_ptr = vp8_dct_value_tokens + DCT_MAX_VALUE;
-    vp8_dct_value_cost_ptr   = vp8_dct_value_cost + DCT_MAX_VALUE;
+    vp8_dct_value_tokens_ptr = dct_value_tokens + DCT_MAX_VALUE;
+    vp8_dct_value_cost_ptr   = dct_value_cost + DCT_MAX_VALUE;
 }
 
 static void tokenize2nd_order_b

diff --git a/vp8/encoder/tokenize.h b/vp8/encoder/tokenize.h
index ed5eb0c..d87c1a3 100644
--- a/vp8/encoder/tokenize.h
+++ b/vp8/encoder/tokenize.h

@@ -12,7 +12,7 @@
 #ifndef tokenize_h
 #define tokenize_h
 
-#include "entropy.h"
+#include "vp8/common/entropy.h"
 #include "block.h"
 
 void vp8_tokenize_initialize();

diff --git a/vp8/encoder/treewriter.h b/vp8/encoder/treewriter.h
index 88096d8..c28a0fa 100644
--- a/vp8/encoder/treewriter.h
+++ b/vp8/encoder/treewriter.h

@@ -15,7 +15,7 @@
 /* Trees map alphabets into huffman-like codes suitable for an arithmetic
    bit coder.  Timothy S Murphy  11 October 2004 */
 
-#include "treecoder.h"
+#include "vp8/common/treecoder.h"
 
 #include "boolhuff.h"       /* for now */
 

diff --git a/vp8/encoder/variance.h b/vp8/encoder/variance.h
index 5befd3b..bf17ea8 100644
--- a/vp8/encoder/variance.h
+++ b/vp8/encoder/variance.h

@@ -85,6 +85,19 @@
       unsigned int *sse \
     );
 
+#define prototype_ssimpf(sym) \
+    void (sym) \
+      ( \
+        unsigned char *s, \
+        int sp, \
+        unsigned char *r, \
+        int rp, \
+        unsigned long *sum_s, \
+        unsigned long *sum_r, \
+        unsigned long *sum_sq_s, \
+        unsigned long *sum_sq_r, \
+        unsigned long *sum_sxr \
+      );
 
 #define prototype_getmbss(sym) unsigned int (sym)(const short *)
 
@@ -306,6 +319,15 @@
 #endif
 extern prototype_sad(vp8_variance_get4x4sse_cs);
 
+#ifndef vp8_ssimpf
+#define vp8_ssimpf ssim_parms_c
+#endif
+extern prototype_ssimpf(vp8_ssimpf)
+
+#ifndef vp8_ssimpf_8x8
+#define vp8_ssimpf_8x8 ssim_parms_8x8_c
+#endif
+extern prototype_ssimpf(vp8_ssimpf_8x8)
 
 typedef prototype_sad(*vp8_sad_fn_t);
 typedef prototype_sad_multi_same_address(*vp8_sad_multi_fn_t);
@@ -315,6 +337,10 @@
 typedef prototype_variance2(*vp8_variance2_fn_t);
 typedef prototype_subpixvariance(*vp8_subpixvariance_fn_t);
 typedef prototype_getmbss(*vp8_getmbss_fn_t);
+
+typedef prototype_ssimpf(*vp8_ssimpf_fn_t)
+
+
 typedef struct
 {
     vp8_sad_fn_t             sad4x4;
@@ -365,6 +391,11 @@
     vp8_sad_multi_d_fn_t     sad8x8x4d;
     vp8_sad_multi_d_fn_t     sad4x4x4d;
 
+#if CONFIG_PSNR
+    vp8_ssimpf_fn_t          ssimpf_8x8;
+    vp8_ssimpf_fn_t          ssimpf;
+#endif
+
 } vp8_variance_rtcd_vtable_t;
 
 typedef struct
@@ -378,6 +409,7 @@
     vp8_sad_multi_fn_t      sdx3f;
     vp8_sad_multi1_fn_t     sdx8f;
     vp8_sad_multi_d_fn_t    sdx4df;
+
 } vp8_variance_fn_ptr_t;
 
 #if CONFIG_RUNTIME_CPU_DETECT

diff --git a/vp8/encoder/variance_c.c b/vp8/encoder/variance_c.c
index 95ec96c..ede07c8 100644
--- a/vp8/encoder/variance_c.c
+++ b/vp8/encoder/variance_c.c

@@ -10,33 +10,8 @@
 
 
 #include "variance.h"
+#include "vp8/common/filter.h"
 
-const int vp8_six_tap[8][6] =
-{
-    { 0,  0,  128,    0,   0,  0 },         // note that 1/8 pel positions are just as per alpha -0.5 bicubic
-    { 0, -6,  123,   12,  -1,  0 },
-    { 2, -11, 108,   36,  -8,  1 },         // New 1/4 pel 6 tap filter
-    { 0, -9,   93,   50,  -6,  0 },
-    { 3, -16,  77,   77, -16,  3 },         // New 1/2 pel 6 tap filter
-    { 0, -6,   50,   93,  -9,  0 },
-    { 1, -8,   36,  108, -11,  2 },         // New 1/4 pel 6 tap filter
-    { 0, -1,   12,  123,  -6,  0 }
-};
-
-
-const int VP8_FILTER_WEIGHT = 128;
-const int VP8_FILTER_SHIFT  =   7;
-const int vp8_bilinear_taps[8][2] =
-{
-    { 128,   0 },
-    { 112,  16 },
-    {  96,  32 },
-    {  80,  48 },
-    {  64,  64 },
-    {  48,  80 },
-    {  32,  96 },
-    {  16, 112 }
-};
 
 unsigned int vp8_get_mb_ss_c
 (
@@ -56,7 +31,7 @@
 }
 
 
-void  vp8_variance(
+static void variance(
     const unsigned char *src_ptr,
     int  source_stride,
     const unsigned char *ref_ptr,
@@ -98,7 +73,7 @@
 )
 {
 
-    vp8_variance(src_ptr, source_stride, ref_ptr, recon_stride, 8, 8, SSE, Sum);
+    variance(src_ptr, source_stride, ref_ptr, recon_stride, 8, 8, SSE, Sum);
     return (*SSE - (((*Sum) * (*Sum)) >> 6));
 }
 
@@ -114,7 +89,7 @@
 )
 {
 
-    vp8_variance(src_ptr, source_stride, ref_ptr, recon_stride, 16, 16, SSE, Sum);
+    variance(src_ptr, source_stride, ref_ptr, recon_stride, 16, 16, SSE, Sum);
     return (*SSE - (((*Sum) * (*Sum)) >> 8));
 
 }
@@ -132,7 +107,7 @@
     int avg;
 
 
-    vp8_variance(src_ptr, source_stride, ref_ptr, recon_stride, 16, 16, &var, &avg);
+    variance(src_ptr, source_stride, ref_ptr, recon_stride, 16, 16, &var, &avg);
     *sse = var;
     return (var - ((avg * avg) >> 8));
 }
@@ -148,7 +123,7 @@
     int avg;
 
 
-    vp8_variance(src_ptr, source_stride, ref_ptr, recon_stride, 8, 16, &var, &avg);
+    variance(src_ptr, source_stride, ref_ptr, recon_stride, 8, 16, &var, &avg);
     *sse = var;
     return (var - ((avg * avg) >> 7));
 }
@@ -164,7 +139,7 @@
     int avg;
 
 
-    vp8_variance(src_ptr, source_stride, ref_ptr, recon_stride, 16, 8, &var, &avg);
+    variance(src_ptr, source_stride, ref_ptr, recon_stride, 16, 8, &var, &avg);
     *sse = var;
     return (var - ((avg * avg) >> 7));
 }
@@ -181,7 +156,7 @@
     int avg;
 
 
-    vp8_variance(src_ptr, source_stride, ref_ptr, recon_stride, 8, 8, &var, &avg);
+    variance(src_ptr, source_stride, ref_ptr, recon_stride, 8, 8, &var, &avg);
     *sse = var;
     return (var - ((avg * avg) >> 6));
 }
@@ -197,7 +172,7 @@
     int avg;
 
 
-    vp8_variance(src_ptr, source_stride, ref_ptr, recon_stride, 4, 4, &var, &avg);
+    variance(src_ptr, source_stride, ref_ptr, recon_stride, 4, 4, &var, &avg);
     *sse = var;
     return (var - ((avg * avg) >> 4));
 }
@@ -213,7 +188,7 @@
     unsigned int var;
     int avg;
 
-    vp8_variance(src_ptr, source_stride, ref_ptr, recon_stride, 16, 16, &var, &avg);
+    variance(src_ptr, source_stride, ref_ptr, recon_stride, 16, 16, &var, &avg);
     *sse = var;
     return var;
 }
@@ -247,7 +222,7 @@
  *                  to the next.
  *
  ****************************************************************************/
-void vp8e_filter_block2d_bil_first_pass
+static void var_filter_block2d_bil_first_pass
 (
     const unsigned char *src_ptr,
     unsigned short *output_ptr,
@@ -255,7 +230,7 @@
     int pixel_step,
     unsigned int output_height,
     unsigned int output_width,
-    const int *vp8_filter
+    const short *vp8_filter
 )
 {
     unsigned int i, j;
@@ -305,7 +280,7 @@
  *                  to the next.
  *
  ****************************************************************************/
-void vp8e_filter_block2d_bil_second_pass
+static void var_filter_block2d_bil_second_pass
 (
     const unsigned short *src_ptr,
     unsigned char  *output_ptr,
@@ -313,7 +288,7 @@
     unsigned int  pixel_step,
     unsigned int  output_height,
     unsigned int  output_width,
-    const int *vp8_filter
+    const short *vp8_filter
 )
 {
     unsigned int  i, j;
@@ -338,52 +313,6 @@
 }
 
 
-/****************************************************************************
- *
- *  ROUTINE       : filter_block2d_bil
- *
- *  INPUTS        : UINT8  *src_ptr          : Pointer to source block.
- *                  UINT32 src_pixels_per_line : Stride of input block.
- *                  INT32  *HFilter         : Array of 2 horizontal filter taps.
- *                  INT32  *VFilter         : Array of 2 vertical filter taps.
- *
- *  OUTPUTS       : UINT16 *output_ptr       : Pointer to filtered block.
- *
- *  RETURNS       : void
- *
- *  FUNCTION      : 2-D filters an 8x8 input block by applying a 2-tap
- *                  bi-linear filter horizontally followed by a 2-tap
- *                  bi-linear filter vertically on the result.
- *
- *  SPECIAL NOTES : The intermediate horizontally filtered block must produce
- *                  1 more point than the input block in each column. This
- *                  is to ensure that the 2-tap filter has one extra data-point
- *                  at the top of each column so filter taps do not extend
- *                  beyond data. Thus the output of the first stage filter
- *                  is an 8x9 (hx_v) block.
- *
- ****************************************************************************/
-void vp8e_filter_block2d_bil
-(
-    const unsigned char  *src_ptr,
-    unsigned char *output_ptr,
-    unsigned int src_pixels_per_line,
-    int  *HFilter,
-    int  *VFilter
-)
-{
-
-    unsigned short FData[20*16];    // Temp data bufffer used in filtering
-
-    // First filter 1-D horizontally...
-    vp8e_filter_block2d_bil_first_pass(src_ptr, FData, src_pixels_per_line, 1, 9, 8, HFilter);
-
-    // then 1-D vertically...
-    vp8e_filter_block2d_bil_second_pass(FData, output_ptr, 8, 8, 8, 8, VFilter);
-}
-
-
-
 unsigned int vp8_sub_pixel_variance4x4_c
 (
     const unsigned char  *src_ptr,
@@ -396,17 +325,17 @@
 )
 {
     unsigned char  temp2[20*16];
-    const int *HFilter, *VFilter;
+    const short *HFilter, *VFilter;
     unsigned short FData3[5*4]; // Temp data bufffer used in filtering
 
-    HFilter = vp8_bilinear_taps[xoffset];
-    VFilter = vp8_bilinear_taps[yoffset];
+    HFilter = vp8_bilinear_filters[xoffset];
+    VFilter = vp8_bilinear_filters[yoffset];
 
     // First filter 1d Horizontal
-    vp8e_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line, 1, 5, 4, HFilter);
+    var_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line, 1, 5, 4, HFilter);
 
     // Now filter Verticaly
-    vp8e_filter_block2d_bil_second_pass(FData3, temp2, 4,  4,  4,  4, VFilter);
+    var_filter_block2d_bil_second_pass(FData3, temp2, 4,  4,  4,  4, VFilter);
 
     return vp8_variance4x4_c(temp2, 4, dst_ptr, dst_pixels_per_line, sse);
 }
@@ -425,13 +354,13 @@
 {
     unsigned short FData3[9*8]; // Temp data bufffer used in filtering
     unsigned char  temp2[20*16];
-    const int *HFilter, *VFilter;
+    const short *HFilter, *VFilter;
 
-    HFilter = vp8_bilinear_taps[xoffset];
-    VFilter = vp8_bilinear_taps[yoffset];
+    HFilter = vp8_bilinear_filters[xoffset];
+    VFilter = vp8_bilinear_filters[yoffset];
 
-    vp8e_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line, 1, 9, 8, HFilter);
-    vp8e_filter_block2d_bil_second_pass(FData3, temp2, 8, 8, 8, 8, VFilter);
+    var_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line, 1, 9, 8, HFilter);
+    var_filter_block2d_bil_second_pass(FData3, temp2, 8, 8, 8, 8, VFilter);
 
     return vp8_variance8x8_c(temp2, 8, dst_ptr, dst_pixels_per_line, sse);
 }
@@ -449,13 +378,13 @@
 {
     unsigned short FData3[17*16];   // Temp data bufffer used in filtering
     unsigned char  temp2[20*16];
-    const int *HFilter, *VFilter;
+    const short *HFilter, *VFilter;
 
-    HFilter = vp8_bilinear_taps[xoffset];
-    VFilter = vp8_bilinear_taps[yoffset];
+    HFilter = vp8_bilinear_filters[xoffset];
+    VFilter = vp8_bilinear_filters[yoffset];
 
-    vp8e_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line, 1, 17, 16, HFilter);
-    vp8e_filter_block2d_bil_second_pass(FData3, temp2, 16, 16, 16, 16, VFilter);
+    var_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line, 1, 17, 16, HFilter);
+    var_filter_block2d_bil_second_pass(FData3, temp2, 16, 16, 16, 16, VFilter);
 
     return vp8_variance16x16_c(temp2, 16, dst_ptr, dst_pixels_per_line, sse);
 }
@@ -525,13 +454,13 @@
 {
     unsigned short FData3[16*9];    // Temp data bufffer used in filtering
     unsigned char  temp2[20*16];
-    const int *HFilter, *VFilter;
+    const short *HFilter, *VFilter;
 
-    HFilter = vp8_bilinear_taps[xoffset];
-    VFilter = vp8_bilinear_taps[yoffset];
+    HFilter = vp8_bilinear_filters[xoffset];
+    VFilter = vp8_bilinear_filters[yoffset];
 
-    vp8e_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line, 1, 9, 16, HFilter);
-    vp8e_filter_block2d_bil_second_pass(FData3, temp2, 16, 16, 8, 16, VFilter);
+    var_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line, 1, 9, 16, HFilter);
+    var_filter_block2d_bil_second_pass(FData3, temp2, 16, 16, 8, 16, VFilter);
 
     return vp8_variance16x8_c(temp2, 16, dst_ptr, dst_pixels_per_line, sse);
 }
@@ -549,15 +478,15 @@
 {
     unsigned short FData3[9*16];    // Temp data bufffer used in filtering
     unsigned char  temp2[20*16];
-    const int *HFilter, *VFilter;
+    const short *HFilter, *VFilter;
 
 
-    HFilter = vp8_bilinear_taps[xoffset];
-    VFilter = vp8_bilinear_taps[yoffset];
+    HFilter = vp8_bilinear_filters[xoffset];
+    VFilter = vp8_bilinear_filters[yoffset];
 
 
-    vp8e_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line, 1, 17, 8, HFilter);
-    vp8e_filter_block2d_bil_second_pass(FData3, temp2, 8, 8, 16, 8, VFilter);
+    var_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line, 1, 17, 8, HFilter);
+    var_filter_block2d_bil_second_pass(FData3, temp2, 8, 8, 16, 8, VFilter);
 
     return vp8_variance8x16_c(temp2, 8, dst_ptr, dst_pixels_per_line, sse);
 }

diff --git a/vp8/encoder/x86/quantize_sse2.asm b/vp8/encoder/x86/quantize_sse2.asm
index 45e1a2a..bc70b68 100644
--- a/vp8/encoder/x86/quantize_sse2.asm
+++ b/vp8/encoder/x86/quantize_sse2.asm

@@ -9,48 +9,59 @@
 
 
 %include "vpx_ports/x86_abi_support.asm"
+%include "asm_enc_offsets.asm"
 
 
-;int vp8_regular_quantize_b_impl_sse2(
-;               short *coeff_ptr,
-;               short *zbin_ptr,
-;               short *qcoeff_ptr,
-;               short *dequant_ptr,
-;               const int *default_zig_zag,
-;               short *round_ptr,
-;               short *quant_ptr,
-;               short *dqcoeff_ptr,
-;               unsigned short zbin_oq_value,
-;               short *zbin_boost_ptr,
-;               short *quant_shift);
-;
-global sym(vp8_regular_quantize_b_impl_sse2)
-sym(vp8_regular_quantize_b_impl_sse2):
+; void vp8_regular_quantize_b_sse2 | arg
+;  (BLOCK  *b,                     |  0
+;   BLOCKD *d)                     |  1
+
+global sym(vp8_regular_quantize_b_sse2)
+sym(vp8_regular_quantize_b_sse2):
     push        rbp
     mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 11
     SAVE_XMM
+    GET_GOT     rbx
     push        rsi
+
+%if ABI_IS_32BIT
     push        rdi
-    push        rbx
+%else
+  %ifidn __OUTPUT_FORMAT__,x64
+    push        rdi
+  %endif
+%endif
+
     ALIGN_STACK 16, rax
-    %define abs_minus_zbin    0
-    %define temp_qcoeff       32
-    %define qcoeff            64
-    %define eob_tmp           96
+    %define BLOCKD_d          0  ;  8
+    %define zrun_zbin_boost   8  ;  8
+    %define abs_minus_zbin    16 ; 32
+    %define temp_qcoeff       48 ; 32
+    %define qcoeff            80 ; 32
     %define stack_size        112
     sub         rsp, stack_size
     ; end prolog
 
-    mov         rdx, arg(0)                 ; coeff_ptr
-    mov         rcx, arg(1)                 ; zbin_ptr
-    movd        xmm7, arg(8)                ; zbin_oq_value
-    mov         rdi, arg(5)                 ; round_ptr
-    mov         rsi, arg(6)                 ; quant_ptr
+%if ABI_IS_32BIT
+    mov         rdi, arg(0)
+%else
+  %ifidn __OUTPUT_FORMAT__,x64
+    mov         rdi, rcx                    ; BLOCK *b
+    mov         [rsp + BLOCKD_d], rdx
+  %else
+    ;mov         rdi, rdi                    ; BLOCK *b
+    mov         [rsp + BLOCKD_d], rsi
+  %endif
+%endif
+
+    mov         rdx, [rdi + vp8_block_coeff] ; coeff_ptr
+    mov         rcx, [rdi + vp8_block_zbin] ; zbin_ptr
+    movd        xmm7, [rdi + vp8_block_zbin_extra] ; zbin_oq_value
 
     ; z
-    movdqa      xmm0, OWORD PTR[rdx]
-    movdqa      xmm4, OWORD PTR[rdx + 16]
+    movdqa      xmm0, [rdx]
+    movdqa      xmm4, [rdx + 16]
+    mov         rdx, [rdi + vp8_block_round] ; round_ptr
 
     pshuflw     xmm7, xmm7, 0
     punpcklwd   xmm7, xmm7                  ; duplicated zbin_oq_value
@@ -70,8 +81,9 @@
     psubw       xmm1, xmm0
     psubw       xmm5, xmm4
 
-    movdqa      xmm2, OWORD PTR[rcx]
-    movdqa      xmm3, OWORD PTR[rcx + 16]
+    movdqa      xmm2, [rcx]
+    movdqa      xmm3, [rcx + 16]
+    mov         rcx, [rdi + vp8_block_quant] ; quant_ptr
 
     ; *zbin_ptr + zbin_oq_value
     paddw       xmm2, xmm7
@@ -80,18 +92,18 @@
     ; x - (*zbin_ptr + zbin_oq_value)
     psubw       xmm1, xmm2
     psubw       xmm5, xmm3
-    movdqa      OWORD PTR[rsp + abs_minus_zbin], xmm1
-    movdqa      OWORD PTR[rsp + abs_minus_zbin + 16], xmm5
+    movdqa      [rsp + abs_minus_zbin], xmm1
+    movdqa      [rsp + abs_minus_zbin + 16], xmm5
 
     ; add (zbin_ptr + zbin_oq_value) back
     paddw       xmm1, xmm2
     paddw       xmm5, xmm3
 
-    movdqa      xmm2, OWORD PTR[rdi]
-    movdqa      xmm6, OWORD PTR[rdi + 16]
+    movdqa      xmm2, [rdx]
+    movdqa      xmm6, [rdx + 16]
 
-    movdqa      xmm3, OWORD PTR[rsi]
-    movdqa      xmm7, OWORD PTR[rsi + 16]
+    movdqa      xmm3, [rcx]
+    movdqa      xmm7, [rcx + 16]
 
     ; x + round
     paddw       xmm1, xmm2
@@ -105,68 +117,67 @@
     paddw       xmm1, xmm3
     paddw       xmm5, xmm7
 
-    movdqa      OWORD PTR[rsp + temp_qcoeff], xmm1
-    movdqa      OWORD PTR[rsp + temp_qcoeff + 16], xmm5
+    movdqa      [rsp + temp_qcoeff], xmm1
+    movdqa      [rsp + temp_qcoeff + 16], xmm5
 
     pxor        xmm6, xmm6
     ; zero qcoeff
-    movdqa      OWORD PTR[rsp + qcoeff], xmm6
-    movdqa      OWORD PTR[rsp + qcoeff + 16], xmm6
+    movdqa      [rsp + qcoeff], xmm6
+    movdqa      [rsp + qcoeff + 16], xmm6
 
-    mov         [rsp + eob_tmp], DWORD -1   ; eob
-    mov         rsi, arg(9)                 ; zbin_boost_ptr
-    mov         rdi, arg(4)                 ; default_zig_zag
-    mov         rax, arg(10)                ; quant_shift_ptr
+    mov         rsi, [rdi + vp8_block_zrun_zbin_boost] ; zbin_boost_ptr
+    mov         rax, [rdi + vp8_block_quant_shift] ; quant_shift_ptr
+    mov         [rsp + zrun_zbin_boost], rsi
 
-%macro ZIGZAG_LOOP 2
-rq_zigzag_loop_%1:
-    movsxd      rdx, DWORD PTR[rdi + (%1 * 4)] ; rc
-    movsx       ebx, WORD PTR [rsi]         ; *zbin_boost_ptr
-    lea         rsi, [rsi + 2]              ; zbin_boost_ptr++
+%macro ZIGZAG_LOOP 1
+    movsx       edx, WORD PTR[GLOBAL(zig_zag) + (%1 * 2)] ; rc
 
     ; x
     movsx       ecx, WORD PTR[rsp + abs_minus_zbin + rdx *2]
 
     ; if (x >= zbin)
-    sub         ecx, ebx                    ; x - zbin
-    jl          rq_zigzag_loop_%2           ; x < zbin
+    sub         cx, WORD PTR[rsi]           ; x - zbin
+    lea         rsi, [rsi + 2]              ; zbin_boost_ptr++
+    jl          rq_zigzag_loop_%1           ; x < zbin
 
-    movsx       ebx, WORD PTR[rsp + temp_qcoeff + rdx *2]
+    movsx       edi, WORD PTR[rsp + temp_qcoeff + rdx *2]
 
     ; downshift by quant_shift[rdx]
     movsx       ecx, WORD PTR[rax + rdx*2]  ; quant_shift_ptr[rc]
-    sar         ebx, cl                     ; also sets Z bit
-    je          rq_zigzag_loop_%2           ; !y
-    mov         WORD PTR[rsp + qcoeff + rdx * 2], bx ;qcoeff_ptr[rc] = temp_qcoeff[rc]
-
-    mov         rsi, arg(9)                 ; reset to b->zrun_zbin_boost
-    mov         [rsp + eob_tmp], DWORD %1   ; eob = i
+    sar         edi, cl                     ; also sets Z bit
+    je          rq_zigzag_loop_%1           ; !y
+    mov         WORD PTR[rsp + qcoeff + rdx*2], di ;qcoeff_ptr[rc] = temp_qcoeff[rc]
+    mov         rsi, [rsp + zrun_zbin_boost] ; reset to b->zrun_zbin_boost
+rq_zigzag_loop_%1:
 %endmacro
-ZIGZAG_LOOP 0, 1
-ZIGZAG_LOOP 1, 2
-ZIGZAG_LOOP 2, 3
-ZIGZAG_LOOP 3, 4
-ZIGZAG_LOOP 4, 5
-ZIGZAG_LOOP 5, 6
-ZIGZAG_LOOP 6, 7
-ZIGZAG_LOOP 7, 8
-ZIGZAG_LOOP 8, 9
-ZIGZAG_LOOP 9, 10
-ZIGZAG_LOOP 10, 11
-ZIGZAG_LOOP 11, 12
-ZIGZAG_LOOP 12, 13
-ZIGZAG_LOOP 13, 14
-ZIGZAG_LOOP 14, 15
-ZIGZAG_LOOP 15, end
-rq_zigzag_loop_end:
+ZIGZAG_LOOP 0
+ZIGZAG_LOOP 1
+ZIGZAG_LOOP 2
+ZIGZAG_LOOP 3
+ZIGZAG_LOOP 4
+ZIGZAG_LOOP 5
+ZIGZAG_LOOP 6
+ZIGZAG_LOOP 7
+ZIGZAG_LOOP 8
+ZIGZAG_LOOP 9
+ZIGZAG_LOOP 10
+ZIGZAG_LOOP 11
+ZIGZAG_LOOP 12
+ZIGZAG_LOOP 13
+ZIGZAG_LOOP 14
+ZIGZAG_LOOP 15
 
-    mov         rbx, arg(2)                 ; qcoeff_ptr
-    mov         rcx, arg(3)                 ; dequant_ptr
-    mov         rsi, arg(7)                 ; dqcoeff_ptr
-    mov         rax, [rsp + eob_tmp]        ; eob
+    movdqa      xmm2, [rsp + qcoeff]
+    movdqa      xmm3, [rsp + qcoeff + 16]
 
-    movdqa      xmm2, OWORD PTR[rsp + qcoeff]
-    movdqa      xmm3, OWORD PTR[rsp + qcoeff + 16]
+%if ABI_IS_32BIT
+    mov         rdi, arg(1)
+%else
+    mov         rdi, [rsp + BLOCKD_d]
+%endif
+
+    mov         rcx, [rdi + vp8_blockd_dequant] ; dequant_ptr
+    mov         rsi, [rdi + vp8_blockd_dqcoeff] ; dqcoeff_ptr
 
     ; y ^ sz
     pxor        xmm2, xmm0
@@ -175,34 +186,67 @@
     psubw       xmm2, xmm0
     psubw       xmm3, xmm4
 
-    movdqa      xmm0, OWORD PTR[rcx]
-    movdqa      xmm1, OWORD PTR[rcx + 16]
+    ; dequant
+    movdqa      xmm0, [rcx]
+    movdqa      xmm1, [rcx + 16]
+
+    mov         rcx, [rdi + vp8_blockd_qcoeff] ; qcoeff_ptr
 
     pmullw      xmm0, xmm2
     pmullw      xmm1, xmm3
 
-    movdqa      OWORD PTR[rbx], xmm2
-    movdqa      OWORD PTR[rbx + 16], xmm3
-    movdqa      OWORD PTR[rsi], xmm0        ; store dqcoeff
-    movdqa      OWORD PTR[rsi + 16], xmm1   ; store dqcoeff
+    movdqa      [rcx], xmm2        ; store qcoeff
+    movdqa      [rcx + 16], xmm3
+    movdqa      [rsi], xmm0        ; store dqcoeff
+    movdqa      [rsi + 16], xmm1
 
-    add         rax, 1
+    ; select the last value (in zig_zag order) for EOB
+    pcmpeqw     xmm2, xmm6
+    pcmpeqw     xmm3, xmm6
+    ; !
+    pcmpeqw     xmm6, xmm6
+    pxor        xmm2, xmm6
+    pxor        xmm3, xmm6
+    ; mask inv_zig_zag
+    pand        xmm2, [GLOBAL(inv_zig_zag)]
+    pand        xmm3, [GLOBAL(inv_zig_zag) + 16]
+    ; select the max value
+    pmaxsw      xmm2, xmm3
+    pshufd      xmm3, xmm2, 00001110b
+    pmaxsw      xmm2, xmm3
+    pshuflw     xmm3, xmm2, 00001110b
+    pmaxsw      xmm2, xmm3
+    pshuflw     xmm3, xmm2, 00000001b
+    pmaxsw      xmm2, xmm3
+    movd        eax, xmm2
+    and         eax, 0xff
+    mov         [rdi + vp8_blockd_eob], eax
 
     ; begin epilog
     add         rsp, stack_size
     pop         rsp
-    pop         rbx
+%if ABI_IS_32BIT
     pop         rdi
+%else
+  %ifidn __OUTPUT_FORMAT__,x64
+    pop         rdi
+  %endif
+%endif
     pop         rsi
+    RESTORE_GOT
     RESTORE_XMM
-    UNSHADOW_ARGS
     pop         rbp
     ret
 
-;int vp8_fast_quantize_b_impl_sse2(short *coeff_ptr,
-;                           short *qcoeff_ptr,short *dequant_ptr,
-;                           short *inv_scan_order, short *round_ptr,
-;                           short *quant_ptr, short *dqcoeff_ptr);
+; int vp8_fast_quantize_b_impl_sse2 | arg
+;  (short *coeff_ptr,               |  0
+;   short *qcoeff_ptr,              |  1
+;   short *dequant_ptr,             |  2
+;   short *inv_scan_order,          |  3
+;   short *round_ptr,               |  4
+;   short *quant_ptr,               |  5
+;   short *dqcoeff_ptr)             |  6
+
 global sym(vp8_fast_quantize_b_impl_sse2)
 sym(vp8_fast_quantize_b_impl_sse2):
     push        rbp
@@ -300,3 +344,16 @@
     UNSHADOW_ARGS
     pop         rbp
     ret
+
+SECTION_RODATA
+align 16
+zig_zag:
+  dw 0x0000, 0x0001, 0x0004, 0x0008
+  dw 0x0005, 0x0002, 0x0003, 0x0006
+  dw 0x0009, 0x000c, 0x000d, 0x000a
+  dw 0x0007, 0x000b, 0x000e, 0x000f
+inv_zig_zag:
+  dw 0x0001, 0x0002, 0x0006, 0x0007
+  dw 0x0003, 0x0005, 0x0008, 0x000d
+  dw 0x0004, 0x0009, 0x000c, 0x000e
+  dw 0x000a, 0x000b, 0x000f, 0x0010

diff --git a/vp8/encoder/x86/quantize_x86.h b/vp8/encoder/x86/quantize_x86.h
index 266efb4..6f54bec 100644
--- a/vp8/encoder/x86/quantize_x86.h
+++ b/vp8/encoder/x86/quantize_x86.h

@@ -27,11 +27,8 @@
 
 #if !CONFIG_RUNTIME_CPU_DETECT
 
-// Currently, this function realizes a gain on x86 and a loss on x86_64
-#if ARCH_X86
 #undef vp8_quantize_quantb
 #define vp8_quantize_quantb vp8_regular_quantize_b_sse2
-#endif
 
 #endif
 

diff --git a/vp8/encoder/x86/sad_sse3.asm b/vp8/encoder/x86/sad_sse3.asm
index 5754175..f0336ab 100644
--- a/vp8/encoder/x86/sad_sse3.asm
+++ b/vp8/encoder/x86/sad_sse3.asm

@@ -586,52 +586,45 @@
 
     STACK_FRAME_CREATE_X3
 
-        lea             end_ptr,    [src_ptr+src_stride*8]
-
-        lea             end_ptr,    [end_ptr+src_stride*8]
-        pxor            mm7,        mm7
+        mov             end_ptr,    4
+        pxor            xmm7,        xmm7
 
 .vp8_sad16x16_sse3_loop:
-
-        movq            ret_var,    mm7
-        cmp             ret_var,    max_err
-        jg              .vp8_sad16x16_early_exit
-
-        movq            mm0,        QWORD PTR [src_ptr]
-        movq            mm2,        QWORD PTR [src_ptr+8]
-
-        movq            mm1,        QWORD PTR [ref_ptr]
-        movq            mm3,        QWORD PTR [ref_ptr+8]
-
-        movq            mm4,        QWORD PTR [src_ptr+src_stride]
-        movq            mm5,        QWORD PTR [ref_ptr+ref_stride]
-
-        psadbw          mm0,        mm1
-        psadbw          mm2,        mm3
-
-        movq            mm1,        QWORD PTR [src_ptr+src_stride+8]
-        movq            mm3,        QWORD PTR [ref_ptr+ref_stride+8]
-
-        psadbw          mm4,        mm5
-        psadbw          mm1,        mm3
+        movdqa          xmm0,       XMMWORD PTR [src_ptr]
+        movdqu          xmm1,       XMMWORD PTR [ref_ptr]
+        movdqa          xmm2,       XMMWORD PTR [src_ptr+src_stride]
+        movdqu          xmm3,       XMMWORD PTR [ref_ptr+ref_stride]
 
         lea             src_ptr,    [src_ptr+src_stride*2]
         lea             ref_ptr,    [ref_ptr+ref_stride*2]
 
-        paddw           mm0,        mm2
-        paddw           mm4,        mm1
+        movdqa          xmm4,       XMMWORD PTR [src_ptr]
+        movdqu          xmm5,       XMMWORD PTR [ref_ptr]
+        movdqa          xmm6,       XMMWORD PTR [src_ptr+src_stride]
 
-        paddw           mm7,        mm0
-        paddw           mm7,        mm4
+        psadbw          xmm0,       xmm1
 
-        cmp             src_ptr,    end_ptr
+        movdqu          xmm1,       XMMWORD PTR [ref_ptr+ref_stride]
+
+        psadbw          xmm2,       xmm3
+        psadbw          xmm4,       xmm5
+        psadbw          xmm6,       xmm1
+
+        lea             src_ptr,    [src_ptr+src_stride*2]
+        lea             ref_ptr,    [ref_ptr+ref_stride*2]
+
+        paddw           xmm7,        xmm0
+        paddw           xmm7,        xmm2
+        paddw           xmm7,        xmm4
+        paddw           xmm7,        xmm6
+
+        sub             end_ptr,     1
         jne             .vp8_sad16x16_sse3_loop
 
-        movq            ret_var,    mm7
-
-.vp8_sad16x16_early_exit:
-
-        mov             rax,        ret_var
+        movq            xmm0,       xmm7
+        psrldq          xmm7,       8
+        paddw           xmm0,       xmm7
+        movq            rax,        xmm0
 
     STACK_FRAME_DESTROY_X3
 

diff --git a/vp8/encoder/x86/sad_sse4.asm b/vp8/encoder/x86/sad_sse4.asm
index 21e2e50..03ecec4 100644
--- a/vp8/encoder/x86/sad_sse4.asm
+++ b/vp8/encoder/x86/sad_sse4.asm

@@ -186,7 +186,7 @@
         PROCESS_16X2X8 0
 
         mov             rdi,        arg(4)           ;Results
-        movdqu          XMMWORD PTR [rdi],    xmm1
+        movdqa          XMMWORD PTR [rdi],    xmm1
 
     ; begin epilog
     pop         rdi
@@ -224,7 +224,7 @@
         PROCESS_16X2X8 0
 
         mov             rdi,        arg(4)           ;Results
-        movdqu          XMMWORD PTR [rdi],    xmm1
+        movdqa          XMMWORD PTR [rdi],    xmm1
 
     ; begin epilog
     pop         rdi
@@ -262,7 +262,7 @@
         PROCESS_8X2X8 0
 
         mov             rdi,        arg(4)           ;Results
-        movdqu          XMMWORD PTR [rdi],    xmm1
+        movdqa          XMMWORD PTR [rdi],    xmm1
 
     ; begin epilog
     pop         rdi
@@ -303,7 +303,7 @@
         PROCESS_8X2X8 0
         PROCESS_8X2X8 0
         mov             rdi,        arg(4)           ;Results
-        movdqu          XMMWORD PTR [rdi],    xmm1
+        movdqa          XMMWORD PTR [rdi],    xmm1
 
     ; begin epilog
     pop         rdi
@@ -339,7 +339,7 @@
         PROCESS_4X2X8 0
 
         mov             rdi,        arg(4)           ;Results
-        movdqu          XMMWORD PTR [rdi],    xmm1
+        movdqa          XMMWORD PTR [rdi],    xmm1
 
     ; begin epilog
     pop         rdi

diff --git a/vp8/encoder/x86/ssim_opt.asm b/vp8/encoder/x86/ssim_opt.asm
new file mode 100644
index 0000000..c267cdb
--- /dev/null
+++ b/vp8/encoder/x86/ssim_opt.asm

@@ -0,0 +1,215 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+%include "vpx_ports/x86_abi_support.asm"
+
+; tabulate_ssim - sums sum_s,sum_r,sum_sq_s,sum_sq_r, sum_sxr
+%macro TABULATE_SSIM 0
+        paddusw         xmm15, xmm3  ; sum_s
+        paddusw         xmm14, xmm4  ; sum_r
+        movdqa          xmm1, xmm3
+        pmaddwd         xmm1, xmm1
+        paddq           xmm13, xmm1 ; sum_sq_s
+        movdqa          xmm2, xmm4
+        pmaddwd         xmm2, xmm2
+        paddq           xmm12, xmm2 ; sum_sq_r
+        pmaddwd         xmm3, xmm4
+        paddq           xmm11, xmm3  ; sum_sxr
+%endmacro
+
+; Sum across the register %1 starting with q words
+%macro SUM_ACROSS_Q 1
+        movdqa          xmm2,%1
+        punpckldq       %1,xmm0
+        punpckhdq       xmm2,xmm0
+        paddq           %1,xmm2
+        movdqa          xmm2,%1
+        punpcklqdq      %1,xmm0
+        punpckhqdq      xmm2,xmm0
+        paddq           %1,xmm2
+%endmacro
+
+; Sum across the register %1 starting with q words
+%macro SUM_ACROSS_W 1
+        movdqa          xmm1, %1
+        punpcklwd       %1,xmm0
+        punpckhwd       xmm1,xmm0
+        paddd           %1, xmm1
+        SUM_ACROSS_Q    %1
+%endmacro
+;void ssim_parms_sse3(
+;    unsigned char *s,
+;    int sp,
+;    unsigned char *r,
+;    int rp
+;    unsigned long *sum_s,
+;    unsigned long *sum_r,
+;    unsigned long *sum_sq_s,
+;    unsigned long *sum_sq_r,
+;    unsigned long *sum_sxr);
+;
+; TODO: Use parm passing through structure, probably don't need the pxors
+; ( calling app will initialize to 0 ) could easily fit everything in sse2
+; without too much hastle, and can probably do better estimates with psadw
+; or pavgb At this point this is just meant to be first pass for calculating
+; all the parms needed for 16x16 ssim so we can play with dssim as distortion
+; in mode selection code.
+global sym(vp8_ssim_parms_16x16_sse3)
+sym(vp8_ssim_parms_16x16_sse3):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 9
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    mov             rsi,        arg(0) ;s
+    mov             rcx,        arg(1) ;sp
+    mov             rdi,        arg(2) ;r
+    mov             rax,        arg(3) ;rp
+
+    pxor            xmm0, xmm0
+    pxor            xmm15,xmm15  ;sum_s
+    pxor            xmm14,xmm14  ;sum_r
+    pxor            xmm13,xmm13  ;sum_sq_s
+    pxor            xmm12,xmm12  ;sum_sq_r
+    pxor            xmm11,xmm11  ;sum_sxr
+
+    mov             rdx, 16      ;row counter
+NextRow:
+
+    ;grab source and reference pixels
+    movdqu          xmm5, [rsi]
+    movdqu          xmm6, [rdi]
+    movdqa          xmm3, xmm5
+    movdqa          xmm4, xmm6
+    punpckhbw       xmm3, xmm0 ; high_s
+    punpckhbw       xmm4, xmm0 ; high_r
+
+    TABULATE_SSIM
+
+    movdqa          xmm3, xmm5
+    movdqa          xmm4, xmm6
+    punpcklbw       xmm3, xmm0 ; low_s
+    punpcklbw       xmm4, xmm0 ; low_r
+
+    TABULATE_SSIM
+
+    add             rsi, rcx   ; next s row
+    add             rdi, rax   ; next r row
+
+    dec             rdx        ; counter
+    jnz NextRow
+
+    SUM_ACROSS_W    xmm15
+    SUM_ACROSS_W    xmm14
+    SUM_ACROSS_Q    xmm13
+    SUM_ACROSS_Q    xmm12
+    SUM_ACROSS_Q    xmm11
+
+    mov             rdi,arg(4)
+    movq            [rdi], xmm15;
+    mov             rdi,arg(5)
+    movq            [rdi], xmm14;
+    mov             rdi,arg(6)
+    movq            [rdi], xmm13;
+    mov             rdi,arg(7)
+    movq            [rdi], xmm12;
+    mov             rdi,arg(8)
+    movq            [rdi], xmm11;
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+;void ssim_parms_sse3(
+;    unsigned char *s,
+;    int sp,
+;    unsigned char *r,
+;    int rp
+;    unsigned long *sum_s,
+;    unsigned long *sum_r,
+;    unsigned long *sum_sq_s,
+;    unsigned long *sum_sq_r,
+;    unsigned long *sum_sxr);
+;
+; TODO: Use parm passing through structure, probably don't need the pxors
+; ( calling app will initialize to 0 ) could easily fit everything in sse2
+; without too much hastle, and can probably do better estimates with psadw
+; or pavgb At this point this is just meant to be first pass for calculating
+; all the parms needed for 16x16 ssim so we can play with dssim as distortion
+; in mode selection code.
+global sym(vp8_ssim_parms_8x8_sse3)
+sym(vp8_ssim_parms_8x8_sse3):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 9
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    mov             rsi,        arg(0) ;s
+    mov             rcx,        arg(1) ;sp
+    mov             rdi,        arg(2) ;r
+    mov             rax,        arg(3) ;rp
+
+    pxor            xmm0, xmm0
+    pxor            xmm15,xmm15  ;sum_s
+    pxor            xmm14,xmm14  ;sum_r
+    pxor            xmm13,xmm13  ;sum_sq_s
+    pxor            xmm12,xmm12  ;sum_sq_r
+    pxor            xmm11,xmm11  ;sum_sxr
+
+    mov             rdx, 8      ;row counter
+NextRow2:
+
+    ;grab source and reference pixels
+    movq            xmm5, [rsi]
+    movq            xmm6, [rdi]
+
+    movdqa          xmm3, xmm5
+    movdqa          xmm4, xmm6
+    punpcklbw       xmm3, xmm0 ; low_s
+    punpcklbw       xmm4, xmm0 ; low_r
+
+    TABULATE_SSIM
+
+    add             rsi, rcx   ; next s row
+    add             rdi, rax   ; next r row
+
+    dec             rdx        ; counter
+    jnz NextRow2
+
+    SUM_ACROSS_W    xmm15
+    SUM_ACROSS_W    xmm14
+    SUM_ACROSS_Q    xmm13
+    SUM_ACROSS_Q    xmm12
+    SUM_ACROSS_Q    xmm11
+
+    mov             rdi,arg(4)
+    movq            [rdi], xmm15;
+    mov             rdi,arg(5)
+    movq            [rdi], xmm14;
+    mov             rdi,arg(6)
+    movq            [rdi], xmm13;
+    mov             rdi,arg(7)
+    movq            [rdi], xmm12;
+    mov             rdi,arg(8)
+    movq            [rdi], xmm11;
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    UNSHADOW_ARGS
+    pop         rbp
+    ret

diff --git a/vp8/encoder/x86/temporal_filter_apply_sse2.asm b/vp8/encoder/x86/temporal_filter_apply_sse2.asm
index f2adccc..0127b01 100644
--- a/vp8/encoder/x86/temporal_filter_apply_sse2.asm
+++ b/vp8/encoder/x86/temporal_filter_apply_sse2.asm

@@ -84,7 +84,7 @@
         jmp         temporal_filter_apply_load_finished
 
 temporal_filter_apply_load_16:
-        movdqu      xmm0,           [rsi]  ; src (frame1)
+        movdqa      xmm0,           [rsi]  ; src (frame1)
         lea         rsi,            [rsi + rbp] ; += stride
         movdqa      xmm1,           xmm0
         punpcklbw   xmm0,           xmm7   ; src[ 0- 7]

diff --git a/vp8/encoder/x86/variance_impl_sse2.asm b/vp8/encoder/x86/variance_impl_sse2.asm
index 7178e7e..c2c30de 100644
--- a/vp8/encoder/x86/variance_impl_sse2.asm
+++ b/vp8/encoder/x86/variance_impl_sse2.asm

@@ -85,10 +85,9 @@
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 6
-    GET_GOT     rbx
+    push rbx
     push rsi
     push rdi
-    sub         rsp, 16
     ; end prolog
 
         mov         rsi,            arg(0) ;[src_ptr]
@@ -97,6 +96,29 @@
         movsxd      rax,            DWORD PTR arg(1) ;[source_stride]
         movsxd      rdx,            DWORD PTR arg(3) ;[recon_stride]
 
+        ; Prefetch data
+        lea             rcx,    [rax+rax*2]
+        prefetcht0      [rsi]
+        prefetcht0      [rsi+rax]
+        prefetcht0      [rsi+rax*2]
+        prefetcht0      [rsi+rcx]
+        lea             rbx,    [rsi+rax*4]
+        prefetcht0      [rbx]
+        prefetcht0      [rbx+rax]
+        prefetcht0      [rbx+rax*2]
+        prefetcht0      [rbx+rcx]
+
+        lea             rcx,    [rdx+rdx*2]
+        prefetcht0      [rdi]
+        prefetcht0      [rdi+rdx]
+        prefetcht0      [rdi+rdx*2]
+        prefetcht0      [rdi+rcx]
+        lea             rbx,    [rdi+rdx*4]
+        prefetcht0      [rbx]
+        prefetcht0      [rbx+rdx]
+        prefetcht0      [rbx+rdx*2]
+        prefetcht0      [rbx+rcx]
+
         pxor        xmm0,           xmm0                        ; clear xmm0 for unpack
         pxor        xmm7,           xmm7                        ; clear xmm7 for accumulating diffs
 
@@ -107,6 +129,9 @@
         movdqu      xmm1,           XMMWORD PTR [rsi]
         movdqu      xmm2,           XMMWORD PTR [rdi]
 
+        prefetcht0      [rsi+rax*8]
+        prefetcht0      [rdi+rdx*8]
+
         movdqa      xmm3,           xmm1
         movdqa      xmm4,           xmm2
 
@@ -178,10 +203,9 @@
 
 
     ; begin epilog
-    add rsp, 16
     pop rdi
     pop rsi
-    RESTORE_GOT
+    pop rbx
     UNSHADOW_ARGS
     pop         rbp
     ret
@@ -603,6 +627,10 @@
 
 filter_block2d_bil_var_sse2_sp_only:
         movsxd          rdx,            dword ptr arg(6)     ; yoffset
+
+        cmp             rdx,            0                    ; skip all if both xoffset=0 and yoffset=0
+        je              filter_block2d_bil_var_sse2_full_pixel
+
         shl             rdx,            5
         lea             rdx,            [rdx + rcx]          ; VFilter
 
@@ -647,6 +675,35 @@
 
         jmp             filter_block2d_bil_variance
 
+filter_block2d_bil_var_sse2_full_pixel:
+        mov             rsi,            arg(0)               ;ref_ptr
+        mov             rdi,            arg(2)               ;src_ptr
+        movsxd          rcx,            dword ptr arg(4)     ;Height
+        movsxd          rax,            dword ptr arg(1)     ;ref_pixels_per_line
+        movsxd          rbx,            dword ptr arg(3)     ;src_pixels_per_line
+        pxor            xmm0,           xmm0                 ;
+
+filter_block2d_bil_full_pixel_loop:
+        movq            xmm1,           QWORD PTR [rsi]               ;
+        punpcklbw       xmm1,           xmm0                 ;
+
+        movq            xmm2,           QWORD PTR [rdi]               ;
+        punpcklbw       xmm2,           xmm0                 ;
+
+        psubw           xmm1,           xmm2                 ;
+        paddw           xmm6,           xmm1                 ;
+
+        pmaddwd         xmm1,           xmm1                 ;
+        paddd           xmm7,           xmm1                 ;
+
+        lea             rsi,            [rsi + rax]          ;ref_pixels_per_line
+        lea             rdi,            [rdi + rbx]          ;src_pixels_per_line
+
+        sub             rcx,            1                   ;
+        jnz             filter_block2d_bil_full_pixel_loop       ;
+
+        jmp             filter_block2d_bil_variance
+
 filter_block2d_bil_var_sse2_fp_only:
         mov             rsi,            arg(0)               ;ref_ptr
         mov             rdi,            arg(2)               ;src_ptr
@@ -733,7 +790,7 @@
     ret
 
 
-;void vp8_half_horiz_vert_variance16x_h_sse2
+;void vp8_half_horiz_vert_variance8x_h_sse2
 ;(
 ;    unsigned char *ref_ptr,
 ;    int ref_pixels_per_line,
@@ -743,8 +800,8 @@
 ;    int *sum,
 ;    unsigned int *sumsquared
 ;)
-global sym(vp8_half_horiz_vert_variance16x_h_sse2)
-sym(vp8_half_horiz_vert_variance16x_h_sse2):
+global sym(vp8_half_horiz_vert_variance8x_h_sse2)
+sym(vp8_half_horiz_vert_variance8x_h_sse2):
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 7
@@ -778,7 +835,7 @@
         add             rsi, r8
 %endif
 
-vp8_half_horiz_vert_variance16x_h_1:
+vp8_half_horiz_vert_variance8x_h_1:
 
         movq            xmm1,           QWORD PTR [rsi]     ;
         movq            xmm2,           QWORD PTR [rsi+1]   ;
@@ -806,7 +863,7 @@
 %endif
 
         sub             rcx,            1                   ;
-        jnz             vp8_half_horiz_vert_variance16x_h_1     ;
+        jnz             vp8_half_horiz_vert_variance8x_h_1     ;
 
         movdq2q         mm6,            xmm6                ;
         movdq2q         mm7,            xmm7                ;
@@ -853,8 +910,7 @@
     pop         rbp
     ret
 
-
-;void vp8_half_vert_variance16x_h_sse2
+;void vp8_half_horiz_vert_variance16x_h_sse2
 ;(
 ;    unsigned char *ref_ptr,
 ;    int ref_pixels_per_line,
@@ -864,8 +920,124 @@
 ;    int *sum,
 ;    unsigned int *sumsquared
 ;)
-global sym(vp8_half_vert_variance16x_h_sse2)
-sym(vp8_half_vert_variance16x_h_sse2):
+global sym(vp8_half_horiz_vert_variance16x_h_sse2)
+sym(vp8_half_horiz_vert_variance16x_h_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 7
+    SAVE_XMM
+    GET_GOT     rbx
+    push rsi
+    push rdi
+    ; end prolog
+
+        pxor            xmm6,           xmm6                ;  error accumulator
+        pxor            xmm7,           xmm7                ;  sse eaccumulator
+        mov             rsi,            arg(0) ;ref_ptr              ;
+
+        mov             rdi,            arg(2) ;src_ptr              ;
+        movsxd          rcx,            dword ptr arg(4) ;Height              ;
+        movsxd          rax,            dword ptr arg(1) ;ref_pixels_per_line
+        movsxd          rdx,            dword ptr arg(3)    ;src_pixels_per_line
+
+        pxor            xmm0,           xmm0                ;
+
+        movdqu          xmm5,           XMMWORD PTR [rsi]
+        movdqu          xmm3,           XMMWORD PTR [rsi+1]
+        pavgb           xmm5,           xmm3                ;  xmm5 = avg(xmm1,xmm3) horizontal line 1
+
+        lea             rsi,            [rsi + rax]
+
+vp8_half_horiz_vert_variance16x_h_1:
+        movdqu          xmm1,           XMMWORD PTR [rsi]     ;
+        movdqu          xmm2,           XMMWORD PTR [rsi+1]   ;
+        pavgb           xmm1,           xmm2                ;  xmm1 = avg(xmm1,xmm3) horizontal line i+1
+
+        pavgb           xmm5,           xmm1                ;  xmm = vertical average of the above
+
+        movdqa          xmm4,           xmm5
+        punpcklbw       xmm5,           xmm0                ;  xmm5 = words of above
+        punpckhbw       xmm4,           xmm0
+
+        movq            xmm3,           QWORD PTR [rdi]     ;  xmm3 = d0,d1,d2..d7
+        punpcklbw       xmm3,           xmm0                ;  xmm3 = words of above
+        psubw           xmm5,           xmm3                ;  xmm5 -= xmm3
+
+        movq            xmm3,           QWORD PTR [rdi+8]
+        punpcklbw       xmm3,           xmm0
+        psubw           xmm4,           xmm3
+
+        paddw           xmm6,           xmm5                ;  xmm6 += accumulated column differences
+        paddw           xmm6,           xmm4
+        pmaddwd         xmm5,           xmm5                ;  xmm5 *= xmm5
+        pmaddwd         xmm4,           xmm4
+        paddd           xmm7,           xmm5                ;  xmm7 += accumulated square column differences
+        paddd           xmm7,           xmm4
+
+        movdqa          xmm5,           xmm1                ;  save xmm1 for use on the next row
+
+        lea             rsi,            [rsi + rax]
+        lea             rdi,            [rdi + rdx]
+
+        sub             rcx,            1                   ;
+        jnz             vp8_half_horiz_vert_variance16x_h_1     ;
+
+        pxor        xmm1,           xmm1
+        pxor        xmm5,           xmm5
+
+        punpcklwd   xmm0,           xmm6
+        punpckhwd   xmm1,           xmm6
+        psrad       xmm0,           16
+        psrad       xmm1,           16
+        paddd       xmm0,           xmm1
+        movdqa      xmm1,           xmm0
+
+        movdqa      xmm6,           xmm7
+        punpckldq   xmm6,           xmm5
+        punpckhdq   xmm7,           xmm5
+        paddd       xmm6,           xmm7
+
+        punpckldq   xmm0,           xmm5
+        punpckhdq   xmm1,           xmm5
+        paddd       xmm0,           xmm1
+
+        movdqa      xmm7,           xmm6
+        movdqa      xmm1,           xmm0
+
+        psrldq      xmm7,           8
+        psrldq      xmm1,           8
+
+        paddd       xmm6,           xmm7
+        paddd       xmm0,           xmm1
+
+        mov         rsi,            arg(5) ;[Sum]
+        mov         rdi,            arg(6) ;[SSE]
+
+        movd        [rsi],       xmm0
+        movd        [rdi],       xmm6
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+
+;void vp8_half_vert_variance8x_h_sse2
+;(
+;    unsigned char *ref_ptr,
+;    int ref_pixels_per_line,
+;    unsigned char *src_ptr,
+;    int src_pixels_per_line,
+;    unsigned int Height,
+;    int *sum,
+;    unsigned int *sumsquared
+;)
+global sym(vp8_half_vert_variance8x_h_sse2)
+sym(vp8_half_vert_variance8x_h_sse2):
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 7
@@ -888,7 +1060,7 @@
         movsxd          rax,            dword ptr arg(1) ;ref_pixels_per_line
 
         pxor            xmm0,           xmm0                ;
-vp8_half_vert_variance16x_h_1:
+vp8_half_vert_variance8x_h_1:
         movq            xmm5,           QWORD PTR [rsi]     ;  xmm5 = s0,s1,s2..s8
         movq            xmm3,           QWORD PTR [rsi+rax] ;  xmm3 = s1,s2,s3..s9
 
@@ -912,7 +1084,7 @@
 %endif
 
         sub             rcx,            1                   ;
-        jnz             vp8_half_vert_variance16x_h_1          ;
+        jnz             vp8_half_vert_variance8x_h_1          ;
 
         movdq2q         mm6,            xmm6                ;
         movdq2q         mm7,            xmm7                ;
@@ -959,8 +1131,7 @@
     pop         rbp
     ret
 
-
-;void vp8_half_horiz_variance16x_h_sse2
+;void vp8_half_vert_variance16x_h_sse2
 ;(
 ;    unsigned char *ref_ptr,
 ;    int ref_pixels_per_line,
@@ -970,8 +1141,116 @@
 ;    int *sum,
 ;    unsigned int *sumsquared
 ;)
-global sym(vp8_half_horiz_variance16x_h_sse2)
-sym(vp8_half_horiz_variance16x_h_sse2):
+global sym(vp8_half_vert_variance16x_h_sse2)
+sym(vp8_half_vert_variance16x_h_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 7
+    SAVE_XMM
+    GET_GOT     rbx
+    push rsi
+    push rdi
+    ; end prolog
+
+        pxor            xmm6,           xmm6                ;  error accumulator
+        pxor            xmm7,           xmm7                ;  sse eaccumulator
+        mov             rsi,            arg(0)              ;ref_ptr
+
+        mov             rdi,            arg(2)              ;src_ptr
+        movsxd          rcx,            dword ptr arg(4)    ;Height
+        movsxd          rax,            dword ptr arg(1)    ;ref_pixels_per_line
+        movsxd          rdx,            dword ptr arg(3)    ;src_pixels_per_line
+
+        movdqu          xmm5,           XMMWORD PTR [rsi]
+        lea             rsi,            [rsi + rax          ]
+        pxor            xmm0,           xmm0
+
+vp8_half_vert_variance16x_h_1:
+        movdqu          xmm3,           XMMWORD PTR [rsi]
+
+        pavgb           xmm5,           xmm3                ;  xmm5 = avg(xmm1,xmm3)
+        movdqa          xmm4,           xmm5
+        punpcklbw       xmm5,           xmm0
+        punpckhbw       xmm4,           xmm0
+
+        movq            xmm2,           QWORD PTR [rdi]
+        punpcklbw       xmm2,           xmm0
+        psubw           xmm5,           xmm2
+        movq            xmm2,           QWORD PTR [rdi+8]
+        punpcklbw       xmm2,           xmm0
+        psubw           xmm4,           xmm2
+
+        paddw           xmm6,           xmm5                ;  xmm6 += accumulated column differences
+        paddw           xmm6,           xmm4
+        pmaddwd         xmm5,           xmm5                ;  xmm5 *= xmm5
+        pmaddwd         xmm4,           xmm4
+        paddd           xmm7,           xmm5                ;  xmm7 += accumulated square column differences
+        paddd           xmm7,           xmm4
+
+        movdqa          xmm5,           xmm3
+
+        lea             rsi,            [rsi + rax]
+        lea             rdi,            [rdi + rdx]
+
+        sub             rcx,            1
+        jnz             vp8_half_vert_variance16x_h_1
+
+        pxor        xmm1,           xmm1
+        pxor        xmm5,           xmm5
+
+        punpcklwd   xmm0,           xmm6
+        punpckhwd   xmm1,           xmm6
+        psrad       xmm0,           16
+        psrad       xmm1,           16
+        paddd       xmm0,           xmm1
+        movdqa      xmm1,           xmm0
+
+        movdqa      xmm6,           xmm7
+        punpckldq   xmm6,           xmm5
+        punpckhdq   xmm7,           xmm5
+        paddd       xmm6,           xmm7
+
+        punpckldq   xmm0,           xmm5
+        punpckhdq   xmm1,           xmm5
+        paddd       xmm0,           xmm1
+
+        movdqa      xmm7,           xmm6
+        movdqa      xmm1,           xmm0
+
+        psrldq      xmm7,           8
+        psrldq      xmm1,           8
+
+        paddd       xmm6,           xmm7
+        paddd       xmm0,           xmm1
+
+        mov         rsi,            arg(5) ;[Sum]
+        mov         rdi,            arg(6) ;[SSE]
+
+        movd        [rsi],       xmm0
+        movd        [rdi],       xmm6
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+
+;void vp8_half_horiz_variance8x_h_sse2
+;(
+;    unsigned char *ref_ptr,
+;    int ref_pixels_per_line,
+;    unsigned char *src_ptr,
+;    int src_pixels_per_line,
+;    unsigned int Height,
+;    int *sum,
+;    unsigned int *sumsquared
+;)
+global sym(vp8_half_horiz_variance8x_h_sse2)
+sym(vp8_half_horiz_variance8x_h_sse2):
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 7
@@ -993,7 +1272,7 @@
         movsxd          rcx,            dword ptr arg(4) ;Height              ;
 
         pxor            xmm0,           xmm0                ;
-vp8_half_horiz_variance16x16_1:
+vp8_half_horiz_variance8x_h_1:
         movq            xmm5,           QWORD PTR [rsi]     ;  xmm5 = s0,s1,s2..s8
         movq            xmm3,           QWORD PTR [rsi+1]   ;  xmm3 = s1,s2,s3..s9
 
@@ -1016,7 +1295,7 @@
         add             rdi, r9
 %endif
         sub             rcx,            1                   ;
-        jnz             vp8_half_horiz_variance16x16_1        ;
+        jnz             vp8_half_horiz_variance8x_h_1        ;
 
         movdq2q         mm6,            xmm6                ;
         movdq2q         mm7,            xmm7                ;
@@ -1063,6 +1342,109 @@
     pop         rbp
     ret
 
+;void vp8_half_horiz_variance16x_h_sse2
+;(
+;    unsigned char *ref_ptr,
+;    int ref_pixels_per_line,
+;    unsigned char *src_ptr,
+;    int src_pixels_per_line,
+;    unsigned int Height,
+;    int *sum,
+;    unsigned int *sumsquared
+;)
+global sym(vp8_half_horiz_variance16x_h_sse2)
+sym(vp8_half_horiz_variance16x_h_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 7
+    SAVE_XMM
+    GET_GOT     rbx
+    push rsi
+    push rdi
+    ; end prolog
+
+        pxor            xmm6,           xmm6                ;  error accumulator
+        pxor            xmm7,           xmm7                ;  sse eaccumulator
+        mov             rsi,            arg(0) ;ref_ptr              ;
+
+        mov             rdi,            arg(2) ;src_ptr              ;
+        movsxd          rcx,            dword ptr arg(4) ;Height              ;
+        movsxd          rax,            dword ptr arg(1) ;ref_pixels_per_line
+        movsxd          rdx,            dword ptr arg(3)    ;src_pixels_per_line
+
+        pxor            xmm0,           xmm0                ;
+
+vp8_half_horiz_variance16x_h_1:
+        movdqu          xmm5,           XMMWORD PTR [rsi]     ;  xmm5 = s0,s1,s2..s15
+        movdqu          xmm3,           XMMWORD PTR [rsi+1]   ;  xmm3 = s1,s2,s3..s16
+
+        pavgb           xmm5,           xmm3                ;  xmm5 = avg(xmm1,xmm3)
+        movdqa          xmm1,           xmm5
+        punpcklbw       xmm5,           xmm0                ;  xmm5 = words of above
+        punpckhbw       xmm1,           xmm0
+
+        movq            xmm3,           QWORD PTR [rdi]     ;  xmm3 = d0,d1,d2..d7
+        punpcklbw       xmm3,           xmm0                ;  xmm3 = words of above
+        movq            xmm2,           QWORD PTR [rdi+8]
+        punpcklbw       xmm2,           xmm0
+
+        psubw           xmm5,           xmm3                ;  xmm5 -= xmm3
+        psubw           xmm1,           xmm2
+        paddw           xmm6,           xmm5                ;  xmm6 += accumulated column differences
+        paddw           xmm6,           xmm1
+        pmaddwd         xmm5,           xmm5                ;  xmm5 *= xmm5
+        pmaddwd         xmm1,           xmm1
+        paddd           xmm7,           xmm5                ;  xmm7 += accumulated square column differences
+        paddd           xmm7,           xmm1
+
+        lea             rsi,            [rsi + rax]
+        lea             rdi,            [rdi + rdx]
+
+        sub             rcx,            1                   ;
+        jnz             vp8_half_horiz_variance16x_h_1        ;
+
+        pxor        xmm1,           xmm1
+        pxor        xmm5,           xmm5
+
+        punpcklwd   xmm0,           xmm6
+        punpckhwd   xmm1,           xmm6
+        psrad       xmm0,           16
+        psrad       xmm1,           16
+        paddd       xmm0,           xmm1
+        movdqa      xmm1,           xmm0
+
+        movdqa      xmm6,           xmm7
+        punpckldq   xmm6,           xmm5
+        punpckhdq   xmm7,           xmm5
+        paddd       xmm6,           xmm7
+
+        punpckldq   xmm0,           xmm5
+        punpckhdq   xmm1,           xmm5
+        paddd       xmm0,           xmm1
+
+        movdqa      xmm7,           xmm6
+        movdqa      xmm1,           xmm0
+
+        psrldq      xmm7,           8
+        psrldq      xmm1,           8
+
+        paddd       xmm6,           xmm7
+        paddd       xmm0,           xmm1
+
+        mov         rsi,            arg(5) ;[Sum]
+        mov         rdi,            arg(6) ;[SSE]
+
+        movd        [rsi],       xmm0
+        movd        [rdi],       xmm6
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
 
 SECTION_RODATA
 ;    short xmm_bi_rd[8] = { 64, 64, 64, 64,64, 64, 64, 64};

diff --git a/vp8/encoder/x86/variance_impl_ssse3.asm b/vp8/encoder/x86/variance_impl_ssse3.asm
new file mode 100644
index 0000000..b197632
--- /dev/null
+++ b/vp8/encoder/x86/variance_impl_ssse3.asm

@@ -0,0 +1,348 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+%define xmm_filter_shift            7
+
+
+;void vp8_filter_block2d_bil_var_ssse3
+;(
+;    unsigned char *ref_ptr,
+;    int ref_pixels_per_line,
+;    unsigned char *src_ptr,
+;    int src_pixels_per_line,
+;    unsigned int Height,
+;    int  xoffset,
+;    int  yoffset,
+;    int *sum,
+;    unsigned int *sumsquared;;
+;
+;)
+;Note: The filter coefficient at offset=0 is 128. Since the second register
+;for Pmaddubsw is signed bytes, we must calculate zero offset seperately.
+global sym(vp8_filter_block2d_bil_var_ssse3)
+sym(vp8_filter_block2d_bil_var_ssse3):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 9
+    SAVE_XMM
+    GET_GOT     rbx
+    push rsi
+    push rdi
+    push rbx
+    ; end prolog
+
+        pxor            xmm6,           xmm6
+        pxor            xmm7,           xmm7
+
+        lea             rcx,            [GLOBAL(vp8_bilinear_filters_ssse3)]
+        movsxd          rax,            dword ptr arg(5)     ; xoffset
+
+        cmp             rax,            0                    ; skip first_pass filter if xoffset=0
+        je              filter_block2d_bil_var_ssse3_sp_only
+
+        shl             rax,            4                    ; point to filter coeff with xoffset
+        lea             rax,            [rax + rcx]          ; HFilter
+
+        movsxd          rdx,            dword ptr arg(6)     ; yoffset
+
+        cmp             rdx,            0                    ; skip second_pass filter if yoffset=0
+        je              filter_block2d_bil_var_ssse3_fp_only
+
+        shl             rdx,            4
+        lea             rdx,            [rdx + rcx]          ; VFilter
+
+        mov             rsi,            arg(0)               ;ref_ptr
+        mov             rdi,            arg(2)               ;src_ptr
+        movsxd          rcx,            dword ptr arg(4)     ;Height
+
+        movdqu          xmm0,           XMMWORD PTR [rsi]
+        movdqu          xmm1,           XMMWORD PTR [rsi+1]
+        movdqa          xmm2,           xmm0
+
+        punpcklbw       xmm0,           xmm1
+        punpckhbw       xmm2,           xmm1
+        pmaddubsw       xmm0,           [rax]
+        pmaddubsw       xmm2,           [rax]
+
+        paddw           xmm0,           [GLOBAL(xmm_bi_rd)]
+        paddw           xmm2,           [GLOBAL(xmm_bi_rd)]
+        psraw           xmm0,           xmm_filter_shift
+        psraw           xmm2,           xmm_filter_shift
+
+        packuswb        xmm0,           xmm2
+
+        movsxd          rbx,            dword ptr arg(1) ;ref_pixels_per_line
+        lea             rsi,            [rsi + rbx]
+%if ABI_IS_32BIT=0
+        movsxd          r9,             dword ptr arg(3) ;src_pixels_per_line
+%endif
+
+filter_block2d_bil_var_ssse3_loop:
+        movdqu          xmm1,           XMMWORD PTR [rsi]
+        movdqu          xmm2,           XMMWORD PTR [rsi+1]
+        movdqa          xmm3,           xmm1
+
+        punpcklbw       xmm1,           xmm2
+        punpckhbw       xmm3,           xmm2
+        pmaddubsw       xmm1,           [rax]
+        pmaddubsw       xmm3,           [rax]
+
+        paddw           xmm1,           [GLOBAL(xmm_bi_rd)]
+        paddw           xmm3,           [GLOBAL(xmm_bi_rd)]
+        psraw           xmm1,           xmm_filter_shift
+        psraw           xmm3,           xmm_filter_shift
+        packuswb        xmm1,           xmm3
+
+        movdqa          xmm2,           xmm0
+        movdqa          xmm0,           xmm1
+        movdqa          xmm3,           xmm2
+
+        punpcklbw       xmm2,           xmm1
+        punpckhbw       xmm3,           xmm1
+        pmaddubsw       xmm2,           [rdx]
+        pmaddubsw       xmm3,           [rdx]
+
+        paddw           xmm2,           [GLOBAL(xmm_bi_rd)]
+        paddw           xmm3,           [GLOBAL(xmm_bi_rd)]
+        psraw           xmm2,           xmm_filter_shift
+        psraw           xmm3,           xmm_filter_shift
+
+        movq            xmm1,           QWORD PTR [rdi]
+        pxor            xmm4,           xmm4
+        punpcklbw       xmm1,           xmm4
+        movq            xmm5,           QWORD PTR [rdi+8]
+        punpcklbw       xmm5,           xmm4
+
+        psubw           xmm2,           xmm1
+        psubw           xmm3,           xmm5
+        paddw           xmm6,           xmm2
+        paddw           xmm6,           xmm3
+        pmaddwd         xmm2,           xmm2
+        pmaddwd         xmm3,           xmm3
+        paddd           xmm7,           xmm2
+        paddd           xmm7,           xmm3
+
+        lea             rsi,            [rsi + rbx]          ;ref_pixels_per_line
+%if ABI_IS_32BIT
+        add             rdi,            dword ptr arg(3)     ;src_pixels_per_line
+%else
+        lea             rdi,            [rdi + r9]
+%endif
+
+        sub             rcx,            1
+        jnz             filter_block2d_bil_var_ssse3_loop
+
+        jmp             filter_block2d_bil_variance
+
+filter_block2d_bil_var_ssse3_sp_only:
+        movsxd          rdx,            dword ptr arg(6)     ; yoffset
+
+        cmp             rdx,            0                    ; Both xoffset =0 and yoffset=0
+        je              filter_block2d_bil_var_ssse3_full_pixel
+
+        shl             rdx,            4
+        lea             rdx,            [rdx + rcx]          ; VFilter
+
+        mov             rsi,            arg(0)               ;ref_ptr
+        mov             rdi,            arg(2)               ;src_ptr
+        movsxd          rcx,            dword ptr arg(4)     ;Height
+        movsxd          rax,            dword ptr arg(1)     ;ref_pixels_per_line
+
+        movdqu          xmm1,           XMMWORD PTR [rsi]
+        movdqa          xmm0,           xmm1
+
+        movsxd          rbx,            dword ptr arg(3)     ;src_pixels_per_line
+        lea             rsi,            [rsi + rax]
+
+filter_block2d_bil_sp_only_loop:
+        movdqu          xmm3,           XMMWORD PTR [rsi]
+        movdqa          xmm2,           xmm1
+        movdqa          xmm0,           xmm3
+
+        punpcklbw       xmm1,           xmm3
+        punpckhbw       xmm2,           xmm3
+        pmaddubsw       xmm1,           [rdx]
+        pmaddubsw       xmm2,           [rdx]
+
+        paddw           xmm1,           [GLOBAL(xmm_bi_rd)]
+        paddw           xmm2,           [GLOBAL(xmm_bi_rd)]
+        psraw           xmm1,           xmm_filter_shift
+        psraw           xmm2,           xmm_filter_shift
+
+        movq            xmm3,           QWORD PTR [rdi]
+        pxor            xmm4,           xmm4
+        punpcklbw       xmm3,           xmm4
+        movq            xmm5,           QWORD PTR [rdi+8]
+        punpcklbw       xmm5,           xmm4
+
+        psubw           xmm1,           xmm3
+        psubw           xmm2,           xmm5
+        paddw           xmm6,           xmm1
+        paddw           xmm6,           xmm2
+        pmaddwd         xmm1,           xmm1
+        pmaddwd         xmm2,           xmm2
+        paddd           xmm7,           xmm1
+        paddd           xmm7,           xmm2
+
+        movdqa          xmm1,           xmm0
+        lea             rsi,            [rsi + rax]          ;ref_pixels_per_line
+        lea             rdi,            [rdi + rbx]          ;src_pixels_per_line
+
+        sub             rcx,            1
+        jnz             filter_block2d_bil_sp_only_loop
+
+        jmp             filter_block2d_bil_variance
+
+filter_block2d_bil_var_ssse3_full_pixel:
+        mov             rsi,            arg(0)               ;ref_ptr
+        mov             rdi,            arg(2)               ;src_ptr
+        movsxd          rcx,            dword ptr arg(4)     ;Height
+        movsxd          rax,            dword ptr arg(1)     ;ref_pixels_per_line
+        movsxd          rbx,            dword ptr arg(3)     ;src_pixels_per_line
+        pxor            xmm0,           xmm0
+
+filter_block2d_bil_full_pixel_loop:
+        movq            xmm1,           QWORD PTR [rsi]
+        punpcklbw       xmm1,           xmm0
+        movq            xmm2,           QWORD PTR [rsi+8]
+        punpcklbw       xmm2,           xmm0
+
+        movq            xmm3,           QWORD PTR [rdi]
+        punpcklbw       xmm3,           xmm0
+        movq            xmm4,           QWORD PTR [rdi+8]
+        punpcklbw       xmm4,           xmm0
+
+        psubw           xmm1,           xmm3
+        psubw           xmm2,           xmm4
+        paddw           xmm6,           xmm1
+        paddw           xmm6,           xmm2
+        pmaddwd         xmm1,           xmm1
+        pmaddwd         xmm2,           xmm2
+        paddd           xmm7,           xmm1
+        paddd           xmm7,           xmm2
+
+        lea             rsi,            [rsi + rax]          ;ref_pixels_per_line
+        lea             rdi,            [rdi + rbx]          ;src_pixels_per_line
+        sub             rcx,            1
+        jnz             filter_block2d_bil_full_pixel_loop
+
+        jmp             filter_block2d_bil_variance
+
+filter_block2d_bil_var_ssse3_fp_only:
+        mov             rsi,            arg(0)               ;ref_ptr
+        mov             rdi,            arg(2)               ;src_ptr
+        movsxd          rcx,            dword ptr arg(4)     ;Height
+        movsxd          rdx,            dword ptr arg(1)     ;ref_pixels_per_line
+
+        pxor            xmm0,           xmm0
+        movsxd          rbx,            dword ptr arg(3)     ;src_pixels_per_line
+
+filter_block2d_bil_fp_only_loop:
+        movdqu          xmm1,           XMMWORD PTR [rsi]
+        movdqu          xmm2,           XMMWORD PTR [rsi+1]
+        movdqa          xmm3,           xmm1
+
+        punpcklbw       xmm1,           xmm2
+        punpckhbw       xmm3,           xmm2
+        pmaddubsw       xmm1,           [rax]
+        pmaddubsw       xmm3,           [rax]
+
+        paddw           xmm1,           [GLOBAL(xmm_bi_rd)]
+        paddw           xmm3,           [GLOBAL(xmm_bi_rd)]
+        psraw           xmm1,           xmm_filter_shift
+        psraw           xmm3,           xmm_filter_shift
+
+        movq            xmm2,           XMMWORD PTR [rdi]
+        pxor            xmm4,           xmm4
+        punpcklbw       xmm2,           xmm4
+        movq            xmm5,           QWORD PTR [rdi+8]
+        punpcklbw       xmm5,           xmm4
+
+        psubw           xmm1,           xmm2
+        psubw           xmm3,           xmm5
+        paddw           xmm6,           xmm1
+        paddw           xmm6,           xmm3
+        pmaddwd         xmm1,           xmm1
+        pmaddwd         xmm3,           xmm3
+        paddd           xmm7,           xmm1
+        paddd           xmm7,           xmm3
+
+        lea             rsi,            [rsi + rdx]
+        lea             rdi,            [rdi + rbx]          ;src_pixels_per_line
+
+        sub             rcx,            1
+        jnz             filter_block2d_bil_fp_only_loop
+
+        jmp             filter_block2d_bil_variance
+
+filter_block2d_bil_variance:
+        pxor        xmm0,           xmm0
+        pxor        xmm1,           xmm1
+        pxor        xmm5,           xmm5
+
+        punpcklwd   xmm0,           xmm6
+        punpckhwd   xmm1,           xmm6
+        psrad       xmm0,           16
+        psrad       xmm1,           16
+        paddd       xmm0,           xmm1
+        movdqa      xmm1,           xmm0
+
+        movdqa      xmm6,           xmm7
+        punpckldq   xmm6,           xmm5
+        punpckhdq   xmm7,           xmm5
+        paddd       xmm6,           xmm7
+
+        punpckldq   xmm0,           xmm5
+        punpckhdq   xmm1,           xmm5
+        paddd       xmm0,           xmm1
+
+        movdqa      xmm7,           xmm6
+        movdqa      xmm1,           xmm0
+
+        psrldq      xmm7,           8
+        psrldq      xmm1,           8
+
+        paddd       xmm6,           xmm7
+        paddd       xmm0,           xmm1
+
+        mov         rsi,            arg(7) ;[Sum]
+        mov         rdi,            arg(8) ;[SSE]
+
+        movd        [rsi],       xmm0
+        movd        [rdi],       xmm6
+
+    ; begin epilog
+    pop rbx
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+
+SECTION_RODATA
+align 16
+xmm_bi_rd:
+    times 8 dw 64
+align 16
+vp8_bilinear_filters_ssse3:
+    times 8 db 128, 0
+    times 8 db 112, 16
+    times 8 db 96,  32
+    times 8 db 80,  48
+    times 8 db 64,  64
+    times 8 db 48,  80
+    times 8 db 32,  96
+    times 8 db 16,  112

diff --git a/vp8/encoder/x86/variance_mmx.c b/vp8/encoder/x86/variance_mmx.c
index 2df73a6..cc8c1d0 100644
--- a/vp8/encoder/x86/variance_mmx.c
+++ b/vp8/encoder/x86/variance_mmx.c

@@ -9,8 +9,8 @@
  */
 
 
-#include "variance.h"
-#include "pragmas.h"
+#include "vp8/encoder/variance.h"
+#include "vp8/common/pragmas.h"
 #include "vpx_ports/mem.h"
 
 extern void filter_block1d_h6_mmx
@@ -53,13 +53,6 @@
     unsigned int *SSE,
     int *Sum
 );
-extern unsigned int vp8_get4x4sse_cs_mmx
-(
-    const unsigned char *src_ptr,
-    int  source_stride,
-    const unsigned char *ref_ptr,
-    int  recon_stride
-);
 extern void vp8_filter_block2d_bil4x4_var_mmx
 (
     const unsigned char *ref_ptr,
@@ -92,39 +85,6 @@
 );
 
 
-void vp8_test_get_mb_ss(void)
-{
-    short zz[] =
-    {
-        -4, -4, -4, -4, 4, 4, 4, 4, -4, -4, -4, -4, 4, 4, 4, 4,
-        -2, -2, -2, -2, 2, 2, 2, 2, -2, -2, -2, -2, 2, 2, 2, 2,
-        -4, -4, -4, -4, 4, 4, 4, 4, -4, -4, -4, -4, 4, 4, 4, 4,
-        -4, -4, -4, -4, 4, 4, 4, 4, -4, -4, -4, -4, 4, 4, 4, 4,
-        -4, -4, -4, -4, 4, 4, 4, 4, -4, -4, -4, -4, 4, 4, 4, 4,
-        -4, -4, -4, -4, 4, 4, 4, 4, -4, -4, -4, -4, 4, 4, 4, 4,
-        -4, -4, -4, -4, 4, 4, 4, 4, -4, -4, -4, -4, 4, 4, 4, 4,
-        -4, -4, -4, -4, 4, 4, 4, 4, -4, -4, -4, -4, 4, 4, 4, 4,
-        -4, -4, -4, -4, 4, 4, 4, 4, -4, -4, -4, -4, 4, 4, 4, 4,
-        -4, -4, -4, -4, 4, 4, 4, 4, -4, -4, -4, -4, 4, 4, 4, 4,
-        -4, -4, -4, -4, 4, 4, 4, 4, -4, -4, -4, -4, 4, 4, 4, 4,
-        -4, -4, -4, -4, 4, 4, 4, 4, -4, -4, -4, -4, 4, 4, 4, 4,
-        -4, -4, -4, -4, 4, 4, 4, 4, -4, -4, -4, -4, 4, 4, 4, 4,
-        -3, -3, -3, -3, 3, 3, 3, 3, -3, -3, -3, -3, 3, 3, 3, 3,
-        -4, -4, -4, -4, 4, 4, 4, 4, -4, -4, -4, -4, 4, 4, 4, 4,
-        -4, -4, -4, -4, 4, 4, 4, 4, -4, -4, -4, -4, 4, 4, 4, 4,
-    };
-    int s = 0, x = vp8_get_mb_ss_mmx(zz);
-    {
-        int y;
-
-        for (y = 0; y < 256; y++)
-            s += (zz[y] * zz[y]);
-    }
-
-    x += 0;
-}
-
-
 unsigned int vp8_get16x16var_mmx(
     const unsigned char *src_ptr,
     int  source_stride,
@@ -456,146 +416,6 @@
     return (xxsum - ((xsum * xsum) >> 7));
 }
 
-unsigned int vp8_i_variance16x16_mmx(
-    const unsigned char *src_ptr,
-    int  source_stride,
-    const unsigned char *ref_ptr,
-    int  recon_stride,
-    unsigned int *sse)
-{
-    unsigned int sse0, sse1, sse2, sse3, var;
-    int sum0, sum1, sum2, sum3, avg;
-
-
-    vp8_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ;
-    vp8_get8x8var_mmx(src_ptr + 8, source_stride, ref_ptr + 8, recon_stride, &sse1, &sum1);
-    vp8_get8x8var_mmx(src_ptr + (source_stride >> 1), source_stride, ref_ptr + (recon_stride >> 1), recon_stride, &sse2, &sum2) ;
-    vp8_get8x8var_mmx(src_ptr + (source_stride >> 1) + 8, source_stride, ref_ptr + (recon_stride >> 1) + 8, recon_stride, &sse3, &sum3);
-
-    var = sse0 + sse1 + sse2 + sse3;
-    avg = sum0 + sum1 + sum2 + sum3;
-    *sse = var;
-    return (var - ((avg * avg) >> 8));
-
-}
-
-unsigned int vp8_i_variance8x16_mmx(
-    const unsigned char *src_ptr,
-    int  source_stride,
-    const unsigned char *ref_ptr,
-    int  recon_stride,
-    unsigned int *sse)
-{
-    unsigned int sse0, sse1, var;
-    int sum0, sum1, avg;
-    vp8_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ;
-    vp8_get8x8var_mmx(src_ptr + (source_stride >> 1), source_stride, ref_ptr + (recon_stride >> 1), recon_stride, &sse1, &sum1) ;
-
-    var = sse0 + sse1;
-    avg = sum0 + sum1;
-
-    *sse = var;
-    return (var - ((avg * avg) >> 7));
-
-}
-
-unsigned int vp8_i_sub_pixel_variance16x16_mmx
-(
-    const unsigned char  *src_ptr,
-    int  src_pixels_per_line,
-    int  xoffset,
-    int  yoffset,
-    const unsigned char *dst_ptr,
-    int dst_pixels_per_line,
-    unsigned int *sse
-)
-{
-    int xsum0, xsum1;
-    unsigned int xxsum0, xxsum1;
-    int f2soffset = (src_pixels_per_line >> 1);
-    int f2doffset = (dst_pixels_per_line >> 1);
-
-
-    vp8_filter_block2d_bil_var_mmx(
-        src_ptr, src_pixels_per_line,
-        dst_ptr, dst_pixels_per_line, 8,
-        vp8_vp7_bilinear_filters_mmx[xoffset], vp8_vp7_bilinear_filters_mmx[yoffset],
-        &xsum0, &xxsum0
-    );
-
-
-    vp8_filter_block2d_bil_var_mmx(
-        src_ptr + 8, src_pixels_per_line,
-        dst_ptr + 8, dst_pixels_per_line, 8,
-        vp8_vp7_bilinear_filters_mmx[xoffset], vp8_vp7_bilinear_filters_mmx[yoffset],
-        &xsum1, &xxsum1
-    );
-
-    xsum0 += xsum1;
-    xxsum0 += xxsum1;
-
-    vp8_filter_block2d_bil_var_mmx(
-        src_ptr + f2soffset, src_pixels_per_line,
-        dst_ptr + f2doffset, dst_pixels_per_line, 8,
-        vp8_vp7_bilinear_filters_mmx[xoffset], vp8_vp7_bilinear_filters_mmx[yoffset],
-        &xsum1, &xxsum1
-    );
-
-    xsum0 += xsum1;
-    xxsum0 += xxsum1;
-
-    vp8_filter_block2d_bil_var_mmx(
-        src_ptr + f2soffset + 8, src_pixels_per_line,
-        dst_ptr + f2doffset + 8, dst_pixels_per_line, 8,
-        vp8_vp7_bilinear_filters_mmx[xoffset], vp8_vp7_bilinear_filters_mmx[yoffset],
-        &xsum1, &xxsum1
-    );
-
-    xsum0 += xsum1;
-    xxsum0 += xxsum1;
-    *sse = xxsum0;
-    return (xxsum0 - ((xsum0 * xsum0) >> 8));
-}
-
-
-unsigned int vp8_i_sub_pixel_variance8x16_mmx
-(
-    const unsigned char  *src_ptr,
-    int  src_pixels_per_line,
-    int  xoffset,
-    int  yoffset,
-    const unsigned char *dst_ptr,
-    int dst_pixels_per_line,
-    unsigned int *sse
-)
-{
-    int xsum0, xsum1;
-    unsigned int xxsum0, xxsum1;
-    int f2soffset = (src_pixels_per_line >> 1);
-    int f2doffset = (dst_pixels_per_line >> 1);
-
-
-    vp8_filter_block2d_bil_var_mmx(
-        src_ptr, src_pixels_per_line,
-        dst_ptr, dst_pixels_per_line, 8,
-        vp8_vp7_bilinear_filters_mmx[xoffset], vp8_vp7_bilinear_filters_mmx[yoffset],
-        &xsum0, &xxsum0
-    );
-
-
-    vp8_filter_block2d_bil_var_mmx(
-        src_ptr + f2soffset, src_pixels_per_line,
-        dst_ptr + f2doffset, dst_pixels_per_line, 8,
-        vp8_vp7_bilinear_filters_mmx[xoffset], vp8_vp7_bilinear_filters_mmx[yoffset],
-        &xsum1, &xxsum1
-    );
-
-    xsum0 += xsum1;
-    xxsum0 += xxsum1;
-    *sse = xxsum0;
-    return (xxsum0 - ((xsum0 * xsum0) >> 7));
-}
-
 
 unsigned int vp8_variance_halfpixvar16x16_h_mmx(
     const unsigned char *src_ptr,

diff --git a/vp8/encoder/x86/variance_sse2.c b/vp8/encoder/x86/variance_sse2.c
index 6f79f0d..0edda30 100644
--- a/vp8/encoder/x86/variance_sse2.c
+++ b/vp8/encoder/x86/variance_sse2.c

@@ -9,8 +9,8 @@
  */
 
 
-#include "variance.h"
-#include "pragmas.h"
+#include "vp8/encoder/variance.h"
+#include "vp8/common/pragmas.h"
 #include "vpx_ports/mem.h"
 
 extern void filter_block1d_h6_mmx(const unsigned char *src_ptr, unsigned short *output_ptr, unsigned int src_pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, short *vp7_filter);
@@ -81,6 +81,16 @@
     int *sum,
     unsigned int *sumsquared
 );
+void vp8_half_horiz_vert_variance8x_h_sse2
+(
+    const unsigned char *ref_ptr,
+    int ref_pixels_per_line,
+    const unsigned char *src_ptr,
+    int src_pixels_per_line,
+    unsigned int Height,
+    int *sum,
+    unsigned int *sumsquared
+);
 void vp8_half_horiz_vert_variance16x_h_sse2
 (
     const unsigned char *ref_ptr,
@@ -91,6 +101,16 @@
     int *sum,
     unsigned int *sumsquared
 );
+void vp8_half_horiz_variance8x_h_sse2
+(
+    const unsigned char *ref_ptr,
+    int ref_pixels_per_line,
+    const unsigned char *src_ptr,
+    int src_pixels_per_line,
+    unsigned int Height,
+    int *sum,
+    unsigned int *sumsquared
+);
 void vp8_half_horiz_variance16x_h_sse2
 (
     const unsigned char *ref_ptr,
@@ -101,6 +121,16 @@
     int *sum,
     unsigned int *sumsquared
 );
+void vp8_half_vert_variance8x_h_sse2
+(
+    const unsigned char *ref_ptr,
+    int ref_pixels_per_line,
+    const unsigned char *src_ptr,
+    int src_pixels_per_line,
+    unsigned int Height,
+    int *sum,
+    unsigned int *sumsquared
+);
 void vp8_half_vert_variance16x_h_sse2
 (
     const unsigned char *ref_ptr,
@@ -262,21 +292,21 @@
 
     if (xoffset == 4 && yoffset == 0)
     {
-        vp8_half_horiz_variance16x_h_sse2(
+        vp8_half_horiz_variance8x_h_sse2(
             src_ptr, src_pixels_per_line,
             dst_ptr, dst_pixels_per_line, 8,
             &xsum, &xxsum);
     }
     else if (xoffset == 0 && yoffset == 4)
     {
-        vp8_half_vert_variance16x_h_sse2(
+        vp8_half_vert_variance8x_h_sse2(
             src_ptr, src_pixels_per_line,
             dst_ptr, dst_pixels_per_line, 8,
             &xsum, &xxsum);
     }
     else if (xoffset == 4 && yoffset == 4)
     {
-        vp8_half_horiz_vert_variance16x_h_sse2(
+        vp8_half_horiz_vert_variance8x_h_sse2(
             src_ptr, src_pixels_per_line,
             dst_ptr, dst_pixels_per_line, 8,
             &xsum, &xxsum);
@@ -317,11 +347,6 @@
             src_ptr, src_pixels_per_line,
             dst_ptr, dst_pixels_per_line, 16,
             &xsum0, &xxsum0);
-
-        vp8_half_horiz_variance16x_h_sse2(
-            src_ptr + 8, src_pixels_per_line,
-            dst_ptr + 8, dst_pixels_per_line, 16,
-            &xsum1, &xxsum1);
     }
     else if (xoffset == 0 && yoffset == 4)
     {
@@ -329,11 +354,6 @@
             src_ptr, src_pixels_per_line,
             dst_ptr, dst_pixels_per_line, 16,
             &xsum0, &xxsum0);
-
-        vp8_half_vert_variance16x_h_sse2(
-            src_ptr + 8, src_pixels_per_line,
-            dst_ptr + 8, dst_pixels_per_line, 16,
-            &xsum1, &xxsum1);
     }
     else if (xoffset == 4 && yoffset == 4)
     {
@@ -341,11 +361,6 @@
             src_ptr, src_pixels_per_line,
             dst_ptr, dst_pixels_per_line, 16,
             &xsum0, &xxsum0);
-
-        vp8_half_horiz_vert_variance16x_h_sse2(
-            src_ptr + 8, src_pixels_per_line,
-            dst_ptr + 8, dst_pixels_per_line, 16,
-            &xsum1, &xxsum1);
     }
     else
     {
@@ -356,17 +371,16 @@
             &xsum0, &xxsum0
         );
 
-
         vp8_filter_block2d_bil_var_sse2(
             src_ptr + 8, src_pixels_per_line,
             dst_ptr + 8, dst_pixels_per_line, 16,
             xoffset, yoffset,
             &xsum1, &xxsum1
         );
+        xsum0 += xsum1;
+        xxsum0 += xxsum1;
     }
 
-    xsum0 += xsum1;
-    xxsum0 += xxsum1;
     *sse = xxsum0;
     return (xxsum0 - ((xsum0 * xsum0) >> 8));
 }
@@ -406,11 +420,6 @@
             src_ptr, src_pixels_per_line,
             dst_ptr, dst_pixels_per_line, 8,
             &xsum0, &xxsum0);
-
-        vp8_half_horiz_variance16x_h_sse2(
-            src_ptr + 8, src_pixels_per_line,
-            dst_ptr + 8, dst_pixels_per_line, 8,
-            &xsum1, &xxsum1);
     }
     else if (xoffset == 0 && yoffset == 4)
     {
@@ -418,11 +427,6 @@
             src_ptr, src_pixels_per_line,
             dst_ptr, dst_pixels_per_line, 8,
             &xsum0, &xxsum0);
-
-        vp8_half_vert_variance16x_h_sse2(
-            src_ptr + 8, src_pixels_per_line,
-            dst_ptr + 8, dst_pixels_per_line, 8,
-            &xsum1, &xxsum1);
     }
     else if (xoffset == 4 && yoffset == 4)
     {
@@ -430,11 +434,6 @@
             src_ptr, src_pixels_per_line,
             dst_ptr, dst_pixels_per_line, 8,
             &xsum0, &xxsum0);
-
-        vp8_half_horiz_vert_variance16x_h_sse2(
-            src_ptr + 8, src_pixels_per_line,
-            dst_ptr + 8, dst_pixels_per_line, 8,
-            &xsum1, &xxsum1);
     }
     else
     {
@@ -449,11 +448,10 @@
             dst_ptr + 8, dst_pixels_per_line, 8,
             xoffset, yoffset,
             &xsum1, &xxsum1);
+        xsum0 += xsum1;
+        xxsum0 += xxsum1;
     }
 
-    xsum0 += xsum1;
-    xxsum0 += xxsum1;
-
     *sse = xxsum0;
     return (xxsum0 - ((xsum0 * xsum0) >> 7));
 }
@@ -474,21 +472,21 @@
 
     if (xoffset == 4 && yoffset == 0)
     {
-        vp8_half_horiz_variance16x_h_sse2(
+        vp8_half_horiz_variance8x_h_sse2(
             src_ptr, src_pixels_per_line,
             dst_ptr, dst_pixels_per_line, 16,
             &xsum, &xxsum);
     }
     else if (xoffset == 0 && yoffset == 4)
     {
-        vp8_half_vert_variance16x_h_sse2(
+        vp8_half_vert_variance8x_h_sse2(
             src_ptr, src_pixels_per_line,
             dst_ptr, dst_pixels_per_line, 16,
             &xsum, &xxsum);
     }
     else if (xoffset == 4 && yoffset == 4)
     {
-        vp8_half_horiz_vert_variance16x_h_sse2(
+        vp8_half_horiz_vert_variance8x_h_sse2(
             src_ptr, src_pixels_per_line,
             dst_ptr, dst_pixels_per_line, 16,
             &xsum, &xxsum);
@@ -506,81 +504,6 @@
     return (xxsum - ((xsum * xsum) >> 7));
 }
 
-unsigned int vp8_i_variance16x16_wmt(
-    const unsigned char *src_ptr,
-    int  source_stride,
-    const unsigned char *ref_ptr,
-    int  recon_stride,
-    unsigned int *sse)
-{
-    unsigned int sse0, sse1, sse2, sse3, var;
-    int sum0, sum1, sum2, sum3, avg;
-
-
-    vp8_get8x8var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ;
-    vp8_get8x8var_sse2(src_ptr + 8, source_stride, ref_ptr + 8, recon_stride, &sse1, &sum1);
-    vp8_get8x8var_sse2(src_ptr + (source_stride >> 1), source_stride, ref_ptr + (recon_stride >> 1), recon_stride, &sse2, &sum2) ;
-    vp8_get8x8var_sse2(src_ptr + (source_stride >> 1) + 8, source_stride, ref_ptr + (recon_stride >> 1) + 8, recon_stride, &sse3, &sum3);
-
-    var = sse0 + sse1 + sse2 + sse3;
-    avg = sum0 + sum1 + sum2 + sum3;
-
-    *sse = var;
-    return (var - ((avg * avg) >> 8));
-
-}
-
-unsigned int vp8_i_variance8x16_wmt(
-    const unsigned char *src_ptr,
-    int  source_stride,
-    const unsigned char *ref_ptr,
-    int  recon_stride,
-    unsigned int *sse)
-{
-    unsigned int sse0, sse1, var;
-    int sum0, sum1, avg;
-    vp8_get8x8var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ;
-    vp8_get8x8var_sse2(src_ptr + (source_stride >> 1), source_stride, ref_ptr + (recon_stride >> 1), recon_stride, &sse1, &sum1) ;
-
-    var = sse0 + sse1;
-    avg = sum0 + sum1;
-
-    *sse = var;
-    return (var - ((avg * avg) >> 7));
-
-}
-
-
-unsigned int vp8_i_sub_pixel_variance16x16_wmt
-(
-    const unsigned char  *src_ptr,
-    int  src_pixels_per_line,
-    int  xoffset,
-    int  yoffset,
-    const unsigned char *dst_ptr,
-    int dst_pixels_per_line,
-    unsigned int *sse
-)
-{
-    return vp8_sub_pixel_variance16x16_wmt(src_ptr, (src_pixels_per_line >> 1), xoffset, yoffset, dst_ptr, (dst_pixels_per_line >> 1), sse);
-}
-
-
-unsigned int vp8_i_sub_pixel_variance8x16_wmt
-(
-    const unsigned char  *src_ptr,
-    int  src_pixels_per_line,
-    int  xoffset,
-    int  yoffset,
-    const unsigned char *dst_ptr,
-    int dst_pixels_per_line,
-    unsigned int *sse
-)
-{
-
-    return vp8_sub_pixel_variance8x16_wmt(src_ptr, (src_pixels_per_line >> 1), xoffset, yoffset, dst_ptr, (dst_pixels_per_line >> 1), sse);
-}
-
 
 unsigned int vp8_variance_halfpixvar16x16_h_wmt(
     const unsigned char *src_ptr,
@@ -589,21 +512,14 @@
     int  dst_pixels_per_line,
     unsigned int *sse)
 {
-    int xsum0, xsum1;
-    unsigned int xxsum0, xxsum1;
+    int xsum0;
+    unsigned int xxsum0;
 
     vp8_half_horiz_variance16x_h_sse2(
         src_ptr, src_pixels_per_line,
         dst_ptr, dst_pixels_per_line, 16,
         &xsum0, &xxsum0);
 
-    vp8_half_horiz_variance16x_h_sse2(
-        src_ptr + 8, src_pixels_per_line,
-        dst_ptr + 8, dst_pixels_per_line, 16,
-        &xsum1, &xxsum1);
-
-    xsum0 += xsum1;
-    xxsum0 += xxsum1;
     *sse = xxsum0;
     return (xxsum0 - ((xsum0 * xsum0) >> 8));
 }
@@ -616,21 +532,13 @@
     int  dst_pixels_per_line,
     unsigned int *sse)
 {
-    int xsum0, xsum1;
-    unsigned int xxsum0, xxsum1;
-
+    int xsum0;
+    unsigned int xxsum0;
     vp8_half_vert_variance16x_h_sse2(
         src_ptr, src_pixels_per_line,
         dst_ptr, dst_pixels_per_line, 16,
         &xsum0, &xxsum0);
 
-    vp8_half_vert_variance16x_h_sse2(
-        src_ptr + 8, src_pixels_per_line,
-        dst_ptr + 8, dst_pixels_per_line, 16,
-        &xsum1, &xxsum1);
-
-    xsum0 += xsum1;
-    xxsum0 += xxsum1;
     *sse = xxsum0;
     return (xxsum0 - ((xsum0 * xsum0) >> 8));
 }
@@ -643,21 +551,14 @@
     int  dst_pixels_per_line,
     unsigned int *sse)
 {
-    int xsum0, xsum1;
-    unsigned int xxsum0, xxsum1;
+    int xsum0;
+    unsigned int xxsum0;
 
     vp8_half_horiz_vert_variance16x_h_sse2(
         src_ptr, src_pixels_per_line,
         dst_ptr, dst_pixels_per_line, 16,
         &xsum0, &xxsum0);
 
-    vp8_half_horiz_vert_variance16x_h_sse2(
-        src_ptr + 8, src_pixels_per_line,
-        dst_ptr + 8, dst_pixels_per_line, 16,
-        &xsum1, &xxsum1);
-
-    xsum0 += xsum1;
-    xxsum0 += xxsum1;
     *sse = xxsum0;
     return (xxsum0 - ((xsum0 * xsum0) >> 8));
 }

diff --git a/vp8/encoder/x86/variance_ssse3.c b/vp8/encoder/x86/variance_ssse3.c
new file mode 100644
index 0000000..eb5d486
--- /dev/null
+++ b/vp8/encoder/x86/variance_ssse3.c

@@ -0,0 +1,165 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vp8/encoder/variance.h"
+#include "vp8/common/pragmas.h"
+#include "vpx_ports/mem.h"
+
+extern unsigned int vp8_get16x16var_sse2
+(
+    const unsigned char *src_ptr,
+    int source_stride,
+    const unsigned char *ref_ptr,
+    int recon_stride,
+    unsigned int *SSE,
+    int *Sum
+);
+extern void vp8_half_horiz_vert_variance16x_h_sse2
+(
+    const unsigned char *ref_ptr,
+    int ref_pixels_per_line,
+    const unsigned char *src_ptr,
+    int src_pixels_per_line,
+    unsigned int Height,
+    int *sum,
+    unsigned int *sumsquared
+);
+extern void vp8_half_horiz_variance16x_h_sse2
+(
+    const unsigned char *ref_ptr,
+    int ref_pixels_per_line,
+    const unsigned char *src_ptr,
+    int src_pixels_per_line,
+    unsigned int Height,
+    int *sum,
+    unsigned int *sumsquared
+);
+extern void vp8_half_vert_variance16x_h_sse2
+(
+    const unsigned char *ref_ptr,
+    int ref_pixels_per_line,
+    const unsigned char *src_ptr,
+    int src_pixels_per_line,
+    unsigned int Height,
+    int *sum,
+    unsigned int *sumsquared
+);
+extern void vp8_filter_block2d_bil_var_ssse3
+(
+    const unsigned char *ref_ptr,
+    int ref_pixels_per_line,
+    const unsigned char *src_ptr,
+    int src_pixels_per_line,
+    unsigned int Height,
+    int  xoffset,
+    int  yoffset,
+    int *sum,
+    unsigned int *sumsquared
+);
+
+unsigned int vp8_sub_pixel_variance16x16_ssse3
+(
+    const unsigned char  *src_ptr,
+    int  src_pixels_per_line,
+    int  xoffset,
+    int  yoffset,
+    const unsigned char *dst_ptr,
+    int dst_pixels_per_line,
+    unsigned int *sse
+)
+{
+    int xsum0;
+    unsigned int xxsum0;
+
+    // note we could avoid these if statements if the calling function
+    // just called the appropriate functions inside.
+    if (xoffset == 4 && yoffset == 0)
+    {
+        vp8_half_horiz_variance16x_h_sse2(
+            src_ptr, src_pixels_per_line,
+            dst_ptr, dst_pixels_per_line, 16,
+            &xsum0, &xxsum0);
+    }
+    else if (xoffset == 0 && yoffset == 4)
+    {
+        vp8_half_vert_variance16x_h_sse2(
+            src_ptr, src_pixels_per_line,
+            dst_ptr, dst_pixels_per_line, 16,
+            &xsum0, &xxsum0);
+    }
+    else if (xoffset == 4 && yoffset == 4)
+    {
+        vp8_half_horiz_vert_variance16x_h_sse2(
+            src_ptr, src_pixels_per_line,
+            dst_ptr, dst_pixels_per_line, 16,
+            &xsum0, &xxsum0);
+    }
+    else
+    {
+        vp8_filter_block2d_bil_var_ssse3(
+            src_ptr, src_pixels_per_line,
+            dst_ptr, dst_pixels_per_line, 16,
+            xoffset, yoffset,
+            &xsum0, &xxsum0);
+    }
+
+    *sse = xxsum0;
+    return (xxsum0 - ((xsum0 * xsum0) >> 8));
+}
+
+unsigned int vp8_sub_pixel_variance16x8_ssse3
+(
+    const unsigned char  *src_ptr,
+    int  src_pixels_per_line,
+    int  xoffset,
+    int  yoffset,
+    const unsigned char *dst_ptr,
+    int dst_pixels_per_line,
+    unsigned int *sse
+
+)
+{
+    int xsum0;
+    unsigned int xxsum0;
+
+    if (xoffset == 4 && yoffset == 0)
+    {
+        vp8_half_horiz_variance16x_h_sse2(
+            src_ptr, src_pixels_per_line,
+            dst_ptr, dst_pixels_per_line, 8,
+            &xsum0, &xxsum0);
+    }
+    else if (xoffset == 0 && yoffset == 4)
+    {
+        vp8_half_vert_variance16x_h_sse2(
+            src_ptr, src_pixels_per_line,
+            dst_ptr, dst_pixels_per_line, 8,
+            &xsum0, &xxsum0);
+    }
+    else if (xoffset == 4 && yoffset == 4)
+    {
+        vp8_half_horiz_vert_variance16x_h_sse2(
+            src_ptr, src_pixels_per_line,
+            dst_ptr, dst_pixels_per_line, 8,
+            &xsum0, &xxsum0);
+    }
+    else
+    {
+        vp8_filter_block2d_bil_var_ssse3(
+            src_ptr, src_pixels_per_line,
+            dst_ptr, dst_pixels_per_line, 8,
+            xoffset, yoffset,
+            &xsum0, &xxsum0);
+    }
+
+    *sse = xxsum0;
+    return (xxsum0 - ((xsum0 * xsum0) >> 7));
+}

diff --git a/vp8/encoder/x86/variance_x86.h b/vp8/encoder/x86/variance_x86.h
index 6bea15e..3560f74 100644
--- a/vp8/encoder/x86/variance_x86.h
+++ b/vp8/encoder/x86/variance_x86.h

@@ -286,6 +286,8 @@
 #if HAVE_SSSE3
 extern prototype_sad_multi_same_address(vp8_sad16x16x3_ssse3);
 extern prototype_sad_multi_same_address(vp8_sad16x8x3_ssse3);
+extern prototype_subpixvariance(vp8_sub_pixel_variance16x8_ssse3);
+extern prototype_subpixvariance(vp8_sub_pixel_variance16x16_ssse3);
 
 #if !CONFIG_RUNTIME_CPU_DETECT
 #undef  vp8_variance_sad16x16x3
@@ -294,6 +296,12 @@
 #undef  vp8_variance_sad16x8x3
 #define vp8_variance_sad16x8x3 vp8_sad16x8x3_ssse3
 
+#undef  vp8_variance_subpixvar16x8
+#define vp8_variance_subpixvar16x8 vp8_sub_pixel_variance16x8_ssse3
+
+#undef  vp8_variance_subpixvar16x16
+#define vp8_variance_subpixvar16x16 vp8_sub_pixel_variance16x16_ssse3
+
 #endif
 #endif
 

diff --git a/vp8/encoder/x86/x86_csystemdependent.c b/vp8/encoder/x86/x86_csystemdependent.c
index 31438f9..2b6bd98 100644
--- a/vp8/encoder/x86/x86_csystemdependent.c
+++ b/vp8/encoder/x86/x86_csystemdependent.c

@@ -11,12 +11,12 @@
 
 #include "vpx_ports/config.h"
 #include "vpx_ports/x86.h"
-#include "variance.h"
-#include "onyx_int.h"
+#include "vp8/encoder/variance.h"
+#include "vp8/encoder/onyx_int.h"
 
 
 #if HAVE_MMX
-void vp8_short_fdct8x4_mmx(short *input, short *output, int pitch)
+static void short_fdct8x4_mmx(short *input, short *output, int pitch)
 {
     vp8_short_fdct4x4_mmx(input,   output,    pitch);
     vp8_short_fdct4x4_mmx(input + 4, output + 16, pitch);
@@ -26,7 +26,7 @@
                                  short *qcoeff_ptr, short *dequant_ptr,
                                  short *scan_mask, short *round_ptr,
                                  short *quant_ptr, short *dqcoeff_ptr);
-void vp8_fast_quantize_b_mmx(BLOCK *b, BLOCKD *d)
+static void fast_quantize_b_mmx(BLOCK *b, BLOCKD *d)
 {
     short *scan_mask   = vp8_default_zig_zag_mask;//d->scan_order_mask_ptr;
     short *coeff_ptr   = b->coeff;
@@ -51,7 +51,7 @@
 }
 
 int vp8_mbblock_error_mmx_impl(short *coeff_ptr, short *dcoef_ptr, int dc);
-int vp8_mbblock_error_mmx(MACROBLOCK *mb, int dc)
+static int mbblock_error_mmx(MACROBLOCK *mb, int dc)
 {
     short *coeff_ptr =  mb->block[0].coeff;
     short *dcoef_ptr =  mb->e_mbd.block[0].dqcoeff;
@@ -59,7 +59,7 @@
 }
 
 int vp8_mbuverror_mmx_impl(short *s_ptr, short *d_ptr);
-int vp8_mbuverror_mmx(MACROBLOCK *mb)
+static int mbuverror_mmx(MACROBLOCK *mb)
 {
     short *s_ptr = &mb->coeff[256];
     short *d_ptr = &mb->e_mbd.dqcoeff[256];
@@ -69,7 +69,7 @@
 void vp8_subtract_b_mmx_impl(unsigned char *z,  int src_stride,
                              short *diff, unsigned char *predictor,
                              int pitch);
-void vp8_subtract_b_mmx(BLOCK *be, BLOCKD *bd, int pitch)
+static void subtract_b_mmx(BLOCK *be, BLOCKD *bd, int pitch)
 {
     unsigned char *z = *(be->base_src) + be->src;
     unsigned int  src_stride = be->src_stride;
@@ -85,7 +85,7 @@
                                  short *qcoeff_ptr, short *dequant_ptr,
                                  const short *inv_scan_order, short *round_ptr,
                                  short *quant_ptr, short *dqcoeff_ptr);
-void vp8_fast_quantize_b_sse2(BLOCK *b, BLOCKD *d)
+static void fast_quantize_b_sse2(BLOCK *b, BLOCKD *d)
 {
     short *scan_mask   = vp8_default_zig_zag_mask;//d->scan_order_mask_ptr;
     short *coeff_ptr   = b->coeff;
@@ -106,32 +106,8 @@
              );
 }
 
-
-int vp8_regular_quantize_b_impl_sse2(short *coeff_ptr, short *zbin_ptr,
-                                     short *qcoeff_ptr,short *dequant_ptr,
-                                     const int *default_zig_zag, short *round_ptr,
-                                     short *quant_ptr, short *dqcoeff_ptr,
-                                     unsigned short zbin_oq_value,
-                                     short *zbin_boost_ptr,
-                                     short *quant_shift_ptr);
-
-void vp8_regular_quantize_b_sse2(BLOCK *b,BLOCKD *d)
-{
-    d->eob = vp8_regular_quantize_b_impl_sse2(b->coeff,
-                                              b->zbin,
-                                              d->qcoeff,
-                                              d->dequant,
-                                              vp8_default_zig_zag1d,
-                                              b->round,
-                                              b->quant,
-                                              d->dqcoeff,
-                                              b->zbin_extra,
-                                              b->zrun_zbin_boost,
-                                              b->quant_shift);
-}
-
 int vp8_mbblock_error_xmm_impl(short *coeff_ptr, short *dcoef_ptr, int dc);
-int vp8_mbblock_error_xmm(MACROBLOCK *mb, int dc)
+static int mbblock_error_xmm(MACROBLOCK *mb, int dc)
 {
     short *coeff_ptr =  mb->block[0].coeff;
     short *dcoef_ptr =  mb->e_mbd.block[0].dqcoeff;
@@ -139,7 +115,7 @@
 }
 
 int vp8_mbuverror_xmm_impl(short *s_ptr, short *d_ptr);
-int vp8_mbuverror_xmm(MACROBLOCK *mb)
+static int mbuverror_xmm(MACROBLOCK *mb)
 {
     short *s_ptr = &mb->coeff[256];
     short *d_ptr = &mb->e_mbd.dqcoeff[256];
@@ -149,7 +125,7 @@
 void vp8_subtract_b_sse2_impl(unsigned char *z,  int src_stride,
                              short *diff, unsigned char *predictor,
                              int pitch);
-void vp8_subtract_b_sse2(BLOCK *be, BLOCKD *bd, int pitch)
+static void subtract_b_sse2(BLOCK *be, BLOCKD *bd, int pitch)
 {
     unsigned char *z = *(be->base_src) + be->src;
     unsigned int  src_stride = be->src_stride;
@@ -165,7 +141,7 @@
                                  short *qcoeff_ptr, short *dequant_ptr,
                                  short *round_ptr,
                                  short *quant_ptr, short *dqcoeff_ptr);
-void vp8_fast_quantize_b_ssse3(BLOCK *b, BLOCKD *d)
+static void fast_quantize_b_ssse3(BLOCK *b, BLOCKD *d)
 {
     d->eob = vp8_fast_quantize_b_impl_ssse3(
                     b->coeff,
@@ -176,6 +152,25 @@
                     d->dqcoeff
                );
 }
+#if CONFIG_PSNR
+#if ARCH_X86_64
+typedef void ssimpf
+(
+    unsigned char *s,
+    int sp,
+    unsigned char *r,
+    int rp,
+    unsigned long *sum_s,
+    unsigned long *sum_r,
+    unsigned long *sum_sq_s,
+    unsigned long *sum_sq_r,
+    unsigned long *sum_sxr
+);
+
+extern ssimpf vp8_ssim_parms_16x16_sse3;
+extern ssimpf vp8_ssim_parms_8x8_sse3;
+#endif
+#endif
 #endif
 
 
@@ -232,20 +227,20 @@
         cpi->rtcd.variance.get4x4sse_cs          = vp8_get4x4sse_cs_mmx;
 
         cpi->rtcd.fdct.short4x4                  = vp8_short_fdct4x4_mmx;
-        cpi->rtcd.fdct.short8x4                  = vp8_short_fdct8x4_mmx;
+        cpi->rtcd.fdct.short8x4                  = short_fdct8x4_mmx;
         cpi->rtcd.fdct.fast4x4                   = vp8_short_fdct4x4_mmx;
-        cpi->rtcd.fdct.fast8x4                   = vp8_short_fdct8x4_mmx;
+        cpi->rtcd.fdct.fast8x4                   = short_fdct8x4_mmx;
 
         cpi->rtcd.fdct.walsh_short4x4            = vp8_short_walsh4x4_c;
 
         cpi->rtcd.encodemb.berr                  = vp8_block_error_mmx;
-        cpi->rtcd.encodemb.mberr                 = vp8_mbblock_error_mmx;
-        cpi->rtcd.encodemb.mbuverr               = vp8_mbuverror_mmx;
-        cpi->rtcd.encodemb.subb                  = vp8_subtract_b_mmx;
+        cpi->rtcd.encodemb.mberr                 = mbblock_error_mmx;
+        cpi->rtcd.encodemb.mbuverr               = mbuverror_mmx;
+        cpi->rtcd.encodemb.subb                  = subtract_b_mmx;
         cpi->rtcd.encodemb.submby                = vp8_subtract_mby_mmx;
         cpi->rtcd.encodemb.submbuv               = vp8_subtract_mbuv_mmx;
 
-        /*cpi->rtcd.quantize.fastquantb            = vp8_fast_quantize_b_mmx;*/
+        /*cpi->rtcd.quantize.fastquantb            = fast_quantize_b_mmx;*/
     }
 #endif
 
@@ -280,6 +275,8 @@
         cpi->rtcd.variance.get16x16prederror     = vp8_get16x16pred_error_sse2;
         cpi->rtcd.variance.get8x8var             = vp8_get8x8var_sse2;
         cpi->rtcd.variance.get16x16var           = vp8_get16x16var_sse2;
+
+
         /* cpi->rtcd.variance.get4x4sse_cs  not implemented for wmt */;
 
         cpi->rtcd.fdct.short4x4                  = vp8_short_fdct4x4_sse2;
@@ -290,18 +287,18 @@
         cpi->rtcd.fdct.walsh_short4x4            = vp8_short_walsh4x4_sse2 ;
 
         cpi->rtcd.encodemb.berr                  = vp8_block_error_xmm;
-        cpi->rtcd.encodemb.mberr                 = vp8_mbblock_error_xmm;
-        cpi->rtcd.encodemb.mbuverr               = vp8_mbuverror_xmm;
-        cpi->rtcd.encodemb.subb                  = vp8_subtract_b_sse2;
+        cpi->rtcd.encodemb.mberr                 = mbblock_error_xmm;
+        cpi->rtcd.encodemb.mbuverr               = mbuverror_xmm;
+        cpi->rtcd.encodemb.subb                  = subtract_b_sse2;
         cpi->rtcd.encodemb.submby                = vp8_subtract_mby_sse2;
         cpi->rtcd.encodemb.submbuv               = vp8_subtract_mbuv_sse2;
 
-#if ARCH_X86
         cpi->rtcd.quantize.quantb                = vp8_regular_quantize_b_sse2;
-#endif
-        cpi->rtcd.quantize.fastquantb            = vp8_fast_quantize_b_sse2;
+        cpi->rtcd.quantize.fastquantb            = fast_quantize_b_sse2;
 
+#if !(CONFIG_REALTIME_ONLY)
         cpi->rtcd.temporal.apply                 = vp8_temporal_filter_apply_sse2;
+#endif
     }
 #endif
 
@@ -332,11 +329,23 @@
         cpi->rtcd.variance.sad16x16x3            = vp8_sad16x16x3_ssse3;
         cpi->rtcd.variance.sad16x8x3             = vp8_sad16x8x3_ssse3;
 
-        cpi->rtcd.quantize.fastquantb            = vp8_fast_quantize_b_ssse3;
+        cpi->rtcd.variance.subpixvar16x8         = vp8_sub_pixel_variance16x8_ssse3;
+        cpi->rtcd.variance.subpixvar16x16        = vp8_sub_pixel_variance16x16_ssse3;
+
+        cpi->rtcd.quantize.fastquantb            = fast_quantize_b_ssse3;
+
+#if CONFIG_PSNR
+#if ARCH_X86_64
+        cpi->rtcd.variance.ssimpf_8x8            = vp8_ssim_parms_8x8_sse3;
+        cpi->rtcd.variance.ssimpf                = vp8_ssim_parms_16x16_sse3;
+#endif
+#endif
 
     }
 #endif
 
+
+
 #if HAVE_SSE4_1
     if (SSE4_1Enabled)
     {

diff --git a/vp8/vp8_common.mk b/vp8/vp8_common.mk
index cf47626..2a2f0cf 100644
--- a/vp8/vp8_common.mk
+++ b/vp8/vp8_common.mk

@@ -8,24 +8,12 @@
 ##  be found in the AUTHORS file in the root of the source tree.
 ##
 
-
-#add this file to the installed sources list
 VP8_COMMON_SRCS-yes += vp8_common.mk
-
-CFLAGS+=-I$(SRC_PATH_BARE)/$(VP8_PREFIX)common
 VP8_COMMON_SRCS-yes += common/type_aliases.h
 VP8_COMMON_SRCS-yes += common/pragmas.h
-
-CFLAGS+=-I$(SRC_PATH_BARE)/$(VP8_PREFIX)common
-VP8_COMMON_SRCS-yes += common/vpxerrors.h
-
-CFLAGS+=-I$(SRC_PATH_BARE)/$(VP8_PREFIX)common
 VP8_COMMON_SRCS-yes += common/ppflags.h
 VP8_COMMON_SRCS-yes += common/onyx.h
 VP8_COMMON_SRCS-yes += common/onyxd.h
-
-CFLAGS+=-I$(SRC_PATH_BARE)/$(VP8_PREFIX)common
-
 VP8_COMMON_SRCS-yes += common/alloccommon.c
 VP8_COMMON_SRCS-yes += common/blockd.c
 VP8_COMMON_SRCS-yes += common/coefupdateprobs.h
@@ -35,7 +23,8 @@
 VP8_COMMON_SRCS-yes += common/entropymode.c
 VP8_COMMON_SRCS-yes += common/entropymv.c
 VP8_COMMON_SRCS-yes += common/extend.c
-VP8_COMMON_SRCS-yes += common/filter_c.c
+VP8_COMMON_SRCS-yes += common/filter.c
+VP8_COMMON_SRCS-yes += common/filter.h
 VP8_COMMON_SRCS-yes += common/findnearmv.c
 VP8_COMMON_SRCS-yes += common/generic/systemdependent.c
 VP8_COMMON_SRCS-yes += common/idctllm.c
@@ -56,7 +45,6 @@
 VP8_COMMON_SRCS-yes += common/modecont.h
 VP8_COMMON_SRCS-yes += common/mv.h
 VP8_COMMON_SRCS-yes += common/onyxc_int.h
-VP8_COMMON_SRCS-yes += common/predictdc.h
 VP8_COMMON_SRCS-yes += common/quant_common.h
 VP8_COMMON_SRCS-yes += common/recon.h
 VP8_COMMON_SRCS-yes += common/reconinter.h
@@ -74,7 +62,6 @@
 VP8_COMMON_SRCS-yes += common/mbpitch.c
 VP8_COMMON_SRCS-yes += common/modecont.c
 VP8_COMMON_SRCS-yes += common/modecontext.c
-VP8_COMMON_SRCS-yes += common/predictdc.c
 VP8_COMMON_SRCS-yes += common/quant_common.c
 VP8_COMMON_SRCS-yes += common/recon.c
 VP8_COMMON_SRCS-yes += common/reconinter.c
@@ -82,7 +69,7 @@
 VP8_COMMON_SRCS-yes += common/reconintra4x4.c
 VP8_COMMON_SRCS-yes += common/setupintrarecon.c
 VP8_COMMON_SRCS-yes += common/swapyv12buffer.c
-VP8_COMMON_SRCS-yes += common/textblit.c
+VP8_COMMON_SRCS-$(CONFIG_POSTPROC_VISUALIZER) += common/textblit.c
 VP8_COMMON_SRCS-yes += common/treecoder.c
 
 VP8_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/idct_x86.h
@@ -111,14 +98,15 @@
 VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/postproc_sse2.asm
 endif
 
+VP8_COMMON_SRCS-$(ARCH_ARM)  += common/asm_com_offsets.c
 VP8_COMMON_SRCS-$(ARCH_ARM)  += common/arm/arm_systemdependent.c
 
 # common (c)
 VP8_COMMON_SRCS-$(HAVE_ARMV6)  += common/arm/bilinearfilter_arm.c
+VP8_COMMON_SRCS-$(HAVE_ARMV6)  += common/arm/bilinearfilter_arm.h
 VP8_COMMON_SRCS-$(HAVE_ARMV6)  += common/arm/filter_arm.c
 VP8_COMMON_SRCS-$(HAVE_ARMV6)  += common/arm/loopfilter_arm.c
 VP8_COMMON_SRCS-$(HAVE_ARMV6)  += common/arm/reconintra_arm.c
-VP8_COMMON_SRCS-$(HAVE_ARMV6)  += common/arm/vpx_asm_offsets.c
 
 # common (armv6)
 VP8_COMMON_SRCS-$(HAVE_ARMV6)  += common/arm/armv6/bilinearfilter_v6$(ASM)
@@ -161,16 +149,3 @@
 VP8_COMMON_SRCS-$(HAVE_ARMV7)  += common/arm/neon/buildintrapredictorsmby_neon$(ASM)
 VP8_COMMON_SRCS-$(HAVE_ARMV7)  += common/arm/neon/save_neon_reg$(ASM)
 VP8_COMMON_SRCS-$(HAVE_ARMV7)  += common/arm/neon/recon_neon.c
-
-
-#
-# Rule to extract assembly constants from C sources
-#
-ifeq ($(ARCH_ARM),yes)
-vpx_asm_offsets.asm: obj_int_extract
-vpx_asm_offsets.asm: $(VP8_PREFIX)common/arm/vpx_asm_offsets.c.o
-	./obj_int_extract rvds $< $(ADS2GAS) > $@
-OBJS-yes += $(VP8_PREFIX)common/arm/vpx_asm_offsets.c.o
-CLEAN-OBJS += vpx_asm_offsets.asm
-$(filter %$(ASM).o,$(OBJS-yes)): vpx_asm_offsets.asm
-endif

diff --git a/vp8/vp8_cx_iface.c b/vp8/vp8_cx_iface.c
index b23bd95..2622738 100644
--- a/vp8/vp8_cx_iface.c
+++ b/vp8/vp8_cx_iface.c

@@ -12,10 +12,10 @@
 #include "vpx/vpx_codec.h"
 #include "vpx/internal/vpx_codec_internal.h"
 #include "vpx_version.h"
-#include "onyx_int.h"
+#include "vp8/encoder/onyx_int.h"
 #include "vpx/vp8e.h"
 #include "vp8/encoder/firstpass.h"
-#include "onyx.h"
+#include "vp8/common/onyx.h"
 #include <stdlib.h>
 #include <string.h>
 
@@ -199,7 +199,7 @@
     {
         int              mb_r = (cfg->g_h + 15) / 16;
         int              mb_c = (cfg->g_w + 15) / 16;
-        size_t           packet_sz = vp8_firstpass_stats_sz(mb_r * mb_c);
+        size_t           packet_sz = sizeof(FIRSTPASS_STATS);
         int              n_packets = cfg->rc_twopass_stats_in.sz / packet_sz;
         FIRSTPASS_STATS *stats;
 
@@ -492,57 +492,67 @@
     {
         priv = calloc(1, sizeof(struct vpx_codec_alg_priv));
 
-        if (priv)
+        if (!priv)
         {
-            ctx->priv = &priv->base;
-            ctx->priv->sz = sizeof(*ctx->priv);
-            ctx->priv->iface = ctx->iface;
-            ctx->priv->alg_priv = priv;
-            ctx->priv->init_flags = ctx->init_flags;
+            return VPX_CODEC_MEM_ERROR;
+        }
 
-            if (ctx->config.enc)
-            {
-                /* Update the reference to the config structure to an
-                 * internal copy.
-                 */
-                ctx->priv->alg_priv->cfg = *ctx->config.enc;
-                ctx->config.enc = &ctx->priv->alg_priv->cfg;
-            }
+        ctx->priv = &priv->base;
+        ctx->priv->sz = sizeof(*ctx->priv);
+        ctx->priv->iface = ctx->iface;
+        ctx->priv->alg_priv = priv;
+        ctx->priv->init_flags = ctx->init_flags;
 
-            cfg =  &ctx->priv->alg_priv->cfg;
-
-            /* Select the extra vp6 configuration table based on the current
-             * usage value. If the current usage value isn't found, use the
-             * values for usage case 0.
+        if (ctx->config.enc)
+        {
+            /* Update the reference to the config structure to an
+             * internal copy.
              */
-            for (i = 0;
-                 extracfg_map[i].usage && extracfg_map[i].usage != cfg->g_usage;
-                 i++);
+            ctx->priv->alg_priv->cfg = *ctx->config.enc;
+            ctx->config.enc = &ctx->priv->alg_priv->cfg;
+        }
 
-            priv->vp8_cfg = extracfg_map[i].cfg;
-            priv->vp8_cfg.pkt_list = &priv->pkt_list.head;
+        cfg =  &ctx->priv->alg_priv->cfg;
 
-            priv->cx_data_sz = priv->cfg.g_w * priv->cfg.g_h * 3 / 2 * 2;
+        /* Select the extra vp6 configuration table based on the current
+         * usage value. If the current usage value isn't found, use the
+         * values for usage case 0.
+         */
+        for (i = 0;
+             extracfg_map[i].usage && extracfg_map[i].usage != cfg->g_usage;
+             i++);
 
-            if (priv->cx_data_sz < 4096) priv->cx_data_sz = 4096;
+        priv->vp8_cfg = extracfg_map[i].cfg;
+        priv->vp8_cfg.pkt_list = &priv->pkt_list.head;
 
-            priv->cx_data = malloc(priv->cx_data_sz);
-            priv->deprecated_mode = NO_MODE_SET;
+        priv->cx_data_sz = priv->cfg.g_w * priv->cfg.g_h * 3 / 2 * 2;
 
-            vp8_initialize();
+        if (priv->cx_data_sz < 4096) priv->cx_data_sz = 4096;
 
-            res = validate_config(priv, &priv->cfg, &priv->vp8_cfg);
+        priv->cx_data = malloc(priv->cx_data_sz);
 
-            if (!res)
-            {
-                set_vp8e_config(&ctx->priv->alg_priv->oxcf, ctx->priv->alg_priv->cfg, ctx->priv->alg_priv->vp8_cfg);
-                optr = vp8_create_compressor(&ctx->priv->alg_priv->oxcf);
+        if (!priv->cx_data)
+        {
+            return VPX_CODEC_MEM_ERROR;
+        }
 
-                if (!optr)
-                    res = VPX_CODEC_MEM_ERROR;
-                else
-                    ctx->priv->alg_priv->cpi = optr;
-            }
+        priv->deprecated_mode = NO_MODE_SET;
+
+        vp8_initialize();
+
+        res = validate_config(priv, &priv->cfg, &priv->vp8_cfg);
+
+        if (!res)
+        {
+            set_vp8e_config(&ctx->priv->alg_priv->oxcf,
+                             ctx->priv->alg_priv->cfg,
+                             ctx->priv->alg_priv->vp8_cfg);
+            optr = vp8_create_compressor(&ctx->priv->alg_priv->oxcf);
+
+            if (!optr)
+                res = VPX_CODEC_MEM_ERROR;
+            else
+                ctx->priv->alg_priv->cpi = optr;
         }
     }
 
@@ -706,7 +716,7 @@
         if (++ctx->fixed_kf_cntr > ctx->cfg.kf_min_dist)
         {
             flags |= VPX_EFLAG_FORCE_KF;
-            ctx->fixed_kf_cntr = 0;
+            ctx->fixed_kf_cntr = 1;
         }
     }
 
@@ -912,8 +922,8 @@
         ctx->preview_img.x_chroma_shift = 1;
         ctx->preview_img.y_chroma_shift = 1;
 
-        ctx->preview_img.d_w = ctx->cfg.g_w;
-        ctx->preview_img.d_h = ctx->cfg.g_h;
+        ctx->preview_img.d_w = sd.y_width;
+        ctx->preview_img.d_h = sd.y_height;
         ctx->preview_img.stride[VPX_PLANE_Y] = sd.y_stride;
         ctx->preview_img.stride[VPX_PLANE_U] = sd.uv_stride;
         ctx->preview_img.stride[VPX_PLANE_V] = sd.uv_stride;

diff --git a/vp8/vp8_dx_iface.c b/vp8/vp8_dx_iface.c
index 1b1cf3b..99657e0 100644
--- a/vp8/vp8_dx_iface.c
+++ b/vp8/vp8_dx_iface.c

@@ -15,24 +15,11 @@
 #include "vpx/vp8dx.h"
 #include "vpx/internal/vpx_codec_internal.h"
 #include "vpx_version.h"
-#include "onyxd.h"
-#include "onyxd_int.h"
+#include "common/onyxd.h"
+#include "decoder/onyxd_int.h"
 
 #define VP8_CAP_POSTPROC (CONFIG_POSTPROC ? VPX_CODEC_CAP_POSTPROC : 0)
 
-#if CONFIG_BIG_ENDIAN
-# define swap4(d)\
-    ((d&0x000000ff)<<24) |  \
-    ((d&0x0000ff00)<<8)  |  \
-    ((d&0x00ff0000)>>8)  |  \
-    ((d&0xff000000)>>24)
-# define swap2(d)\
-    ((d&0x000000ff)<<8) |  \
-    ((d&0x0000ff00)>>8)
-#else
-# define swap4(d) d
-# define swap2(d) d
-#endif
 typedef vpx_codec_stream_info_t  vp8_stream_info_t;
 
 /* Structures for handling memory allocations */
@@ -181,7 +168,7 @@
 {
     int i;
 
-    for (i = 0; i < NELEMENTS(vp8_mem_req_segs); i++)
+    for (i = 0; i < NELEMENTS(ctx->mmaps); i++)
         if (ctx->mmaps[i].id == id)
             return ctx->mmaps[i].base;
 
@@ -189,25 +176,7 @@
 }
 static void vp8_finalize_mmaps(vpx_codec_alg_priv_t *ctx)
 {
-    /*
-    ctx->pbi = mmap_lkup(ctx, VP6_SEG_PB_INSTANCE);
-    ctx->pbi->mbi.block_dx_info[0].idct_output_ptr = mmap_lkup(ctx, VP6_SEG_IDCT_BUFFER);
-    ctx->pbi->loop_filtered_block = mmap_lkup(ctx, VP6_SEG_LF_BLOCK);
-    ctx->pbi->huff = mmap_lkup(ctx, VP6_SEG_HUFF);
-    ctx->pbi->mbi.coeffs_base_ptr = mmap_lkup(ctx, VP6_SEG_COEFFS);
-    ctx->pbi->fc.above_y = mmap_lkup(ctx, VP6_SEG_ABOVEY);
-    ctx->pbi->fc.above_u = mmap_lkup(ctx, VP6_SEG_ABOVEU);
-    ctx->pbi->fc.above_v = mmap_lkup(ctx, VP6_SEG_ABOVEV);
-    ctx->pbi->prediction_mode = mmap_lkup(ctx, VP6_SEG_PRED_MODES);
-    ctx->pbi->mbmotion_vector = mmap_lkup(ctx, VP6_SEG_MV_FIELD);
-    ctx->pbi->fb_storage_ptr[0] = mmap_lkup(ctx, VP6_SEG_IMG0_STRG);
-    ctx->pbi->fb_storage_ptr[1] = mmap_lkup(ctx, VP6_SEG_IMG1_STRG);
-    ctx->pbi->fb_storage_ptr[2] = mmap_lkup(ctx, VP6_SEG_IMG2_STRG);
-    #if CONFIG_POSTPROC
-    ctx->pbi->postproc.deblock.fragment_variances = mmap_lkup(ctx, VP6_SEG_DEBLOCKER);
-    ctx->pbi->fb_storage_ptr[3] = mmap_lkup(ctx, VP6_SEG_PP_IMG_STRG);
-    #endif
-    */
+    /* nothing to clean up */
 }
 
 static vpx_codec_err_t vp8_init(vpx_codec_ctx_t *ctx)
@@ -283,8 +252,8 @@
             if (c[0] != 0x9d || c[1] != 0x01 || c[2] != 0x2a)
                 res = VPX_CODEC_UNSUP_BITSTREAM;
 
-            si->w = swap2(*(const unsigned short *)(c + 3)) & 0x3fff;
-            si->h = swap2(*(const unsigned short *)(c + 5)) & 0x3fff;
+            si->w = (c[3] | (c[4] << 8)) & 0x3fff;
+            si->h = (c[5] | (c[6] << 8)) & 0x3fff;
 
             /*printf("w=%d, h=%d\n", si->w, si->h);*/
             if (!(si->h | si->w))
@@ -556,7 +525,7 @@
 
     if (!res && ctx->priv->alg_priv)
     {
-        for (i = 0; i < NELEMENTS(vp8_mem_req_segs); i++)
+        for (i = 0; i < NELEMENTS(ctx->priv->alg_priv->mmaps); i++)
         {
             if (ctx->priv->alg_priv->mmaps[i].id == mmap->id)
                 if (!ctx->priv->alg_priv->mmaps[i].base)

diff --git a/vp8/vp8cx.mk b/vp8/vp8cx.mk
index 932f145..8037f9a 100644
--- a/vp8/vp8cx.mk
+++ b/vp8/vp8cx.mk

@@ -33,8 +33,7 @@
 #INCLUDES += common
 #INCLUDES += encoder
 
-CFLAGS+=-I$(SRC_PATH_BARE)/$(VP8_PREFIX)encoder
-
+VP8_CX_SRCS-yes += encoder/asm_enc_offsets.c
 VP8_CX_SRCS-yes += encoder/bitstream.c
 VP8_CX_SRCS-yes += encoder/boolhuff.c
 VP8_CX_SRCS-yes += encoder/dct.c
@@ -42,7 +41,7 @@
 VP8_CX_SRCS-yes += encoder/encodeintra.c
 VP8_CX_SRCS-yes += encoder/encodemb.c
 VP8_CX_SRCS-yes += encoder/encodemv.c
-VP8_CX_SRCS-yes += encoder/ethreading.c
+VP8_CX_SRCS-$(CONFIG_MULTITHREAD) += encoder/ethreading.c
 VP8_CX_SRCS-yes += encoder/firstpass.c
 VP8_CX_SRCS-yes += encoder/generic/csystemdependent.c
 VP8_CX_SRCS-yes += encoder/block.h
@@ -87,6 +86,7 @@
 
 ifeq ($(CONFIG_REALTIME_ONLY),yes)
 VP8_CX_SRCS_REMOVE-yes += encoder/firstpass.c
+VP8_CX_SRCS_REMOVE-yes += encoder/temporal_filter.c
 endif
 
 VP8_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/encodemb_x86.h
@@ -111,9 +111,17 @@
 VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/temporal_filter_apply_sse2.asm
 VP8_CX_SRCS-$(HAVE_SSE3) += encoder/x86/sad_sse3.asm
 VP8_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/sad_ssse3.asm
+VP8_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/variance_ssse3.c
+VP8_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/variance_impl_ssse3.asm
 VP8_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/quantize_ssse3.asm
 VP8_CX_SRCS-$(HAVE_SSE4_1) += encoder/x86/sad_sse4.asm
 VP8_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/quantize_mmx.asm
 VP8_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/encodeopt.asm
+VP8_CX_SRCS-$(ARCH_X86_64) += encoder/x86/ssim_opt.asm
+
+ifeq ($(CONFIG_REALTIME_ONLY),yes)
+VP8_CX_SRCS_REMOVE-$(HAVE_SSE2) += encoder/x86/temporal_filter_apply_sse2.asm
+endif
+
 
 VP8_CX_SRCS-yes := $(filter-out $(VP8_CX_SRCS_REMOVE-yes),$(VP8_CX_SRCS-yes))

diff --git a/vp8/vp8cx_arm.mk b/vp8/vp8cx_arm.mk
index 6b624a7..349c3fd 100644
--- a/vp8/vp8cx_arm.mk
+++ b/vp8/vp8cx_arm.mk

@@ -16,9 +16,11 @@
 VP8_CX_SRCS-$(ARCH_ARM)  += encoder/arm/arm_csystemdependent.c
 
 VP8_CX_SRCS-$(HAVE_ARMV7)  += encoder/arm/encodemb_arm.c
-VP8_CX_SRCS-$(HAVE_ARMV7)  += encoder/arm/variance_arm.c
 VP8_CX_SRCS-$(HAVE_ARMV7)  += encoder/arm/quantize_arm.c
 VP8_CX_SRCS-$(HAVE_ARMV7)  += encoder/arm/picklpf_arm.c
+VP8_CX_SRCS-$(HAVE_ARMV6)  += encoder/arm/dct_arm.c
+VP8_CX_SRCS-$(HAVE_ARMV6)  += encoder/arm/variance_arm.c
+VP8_CX_SRCS-$(HAVE_ARMV6)  += encoder/arm/variance_arm.h
 VP8_CX_SRCS-$(HAVE_ARMV5TE) += encoder/arm/boolhuff_arm.c
 
 VP8_CX_SRCS_REMOVE-$(HAVE_ARMV5TE)  += encoder/boolhuff.c
@@ -32,6 +34,12 @@
 
 #File list for armv6
 # encoder
+VP8_CX_SRCS-$(HAVE_ARMV6)  += encoder/arm/armv6/vp8_fast_fdct4x4_armv6$(ASM)
+VP8_CX_SRCS-$(HAVE_ARMV6)  += encoder/arm/armv6/vp8_fast_quantize_b_armv6$(ASM)
+VP8_CX_SRCS-$(HAVE_ARMV6)  += encoder/arm/armv6/vp8_sad16x16_armv6$(ASM)
+VP8_CX_SRCS-$(HAVE_ARMV6)  += encoder/arm/armv6/vp8_variance16x16_armv6$(ASM)
+VP8_CX_SRCS-$(HAVE_ARMV6)  += encoder/arm/armv6/vp8_mse16x16_armv6$(ASM)
+VP8_CX_SRCS-$(HAVE_ARMV6)  += encoder/arm/armv6/vp8_variance8x8_armv6$(ASM)
 VP8_CX_SRCS-$(HAVE_ARMV6)  += encoder/arm/armv6/walsh_v6$(ASM)
 
 #File list for neon
@@ -50,17 +58,3 @@
 VP8_CX_SRCS-$(HAVE_ARMV7)  += encoder/arm/neon/vp8_subpixelvariance16x16s_neon$(ASM)
 VP8_CX_SRCS-$(HAVE_ARMV7)  += encoder/arm/neon/vp8_memcpy_neon$(ASM)
 VP8_CX_SRCS-$(HAVE_ARMV7)  += encoder/arm/neon/vp8_shortwalsh4x4_neon$(ASM)
-
-VP8_CX_SRCS-$(HAVE_ARMV7)  += encoder/arm/vpx_vp8_enc_asm_offsets.c
-
-#
-# Rule to extract assembly constants from C sources
-#
-ifeq ($(ARCH_ARM),yes)
-vpx_vp8_enc_asm_offsets.asm: obj_int_extract
-vpx_vp8_enc_asm_offsets.asm: $(VP8_PREFIX)encoder/arm/vpx_vp8_enc_asm_offsets.c.o
-	./obj_int_extract rvds $< $(ADS2GAS) > $@
-OBJS-yes += $(VP8_PREFIX)encoder/arm/vpx_vp8_enc_asm_offsets.c.o
-CLEAN-OBJS += vpx_vp8_enc_asm_offsets.asm
-$(filter %$(ASM).o,$(OBJS-yes)): vpx_vp8_enc_asm_offsets.asm
-endif

diff --git a/vp8/vp8dx.mk b/vp8/vp8dx.mk
index 1acd674..5649671 100644
--- a/vp8/vp8dx.mk
+++ b/vp8/vp8dx.mk

@@ -24,9 +24,6 @@
 
 VP8_DX_SRCS-yes += vp8_dx_iface.c
 
-CFLAGS+=-I$(SRC_PATH_BARE)/$(VP8_PREFIX)decoder
-
-
 # common
 #define ARM
 #define DISABLE_THREAD
@@ -65,7 +62,7 @@
 VP8_DX_SRCS-yes += decoder/onyxd_int.h
 VP8_DX_SRCS-yes += decoder/treereader.h
 VP8_DX_SRCS-yes += decoder/onyxd_if.c
-VP8_DX_SRCS-yes += decoder/threading.c
+VP8_DX_SRCS-$(CONFIG_MULTITHREAD) += decoder/threading.c
 VP8_DX_SRCS-yes += decoder/idct_blk.c
 VP8_DX_SRCS-$(CONFIG_MULTITHREAD) += decoder/reconintra_mt.h
 VP8_DX_SRCS-$(CONFIG_MULTITHREAD) += decoder/reconintra_mt.c

diff --git a/vp8/vp8dx_arm.mk b/vp8/vp8dx_arm.mk
index 0803a9c..03084c5 100644
--- a/vp8/vp8dx_arm.mk
+++ b/vp8/vp8dx_arm.mk

@@ -12,9 +12,9 @@
 #VP8_DX_SRCS list is modified according to different platforms.
 
 VP8_DX_SRCS-$(ARCH_ARM)  += decoder/arm/arm_dsystemdependent.c
+VP8_CX_SRCS-$(ARCH_ARM)  += decoder/asm_dec_offsets.c
 
 VP8_DX_SRCS-$(HAVE_ARMV6)  += decoder/arm/dequantize_arm.c
-VP8_DX_SRCS-$(CONFIG_ARM_ASM_DETOK)  += decoder/arm/detokenize$(ASM)
 
 #File list for armv6
 VP8_DX_SRCS-$(HAVE_ARMV6)  += decoder/arm/armv6/dequant_dc_idct_v6$(ASM)

diff --git a/vpx/internal/vpx_codec_internal.h b/vpx/internal/vpx_codec_internal.h
index 9cde426..a1ff192 100644
--- a/vpx/internal/vpx_codec_internal.h
+++ b/vpx/internal/vpx_codec_internal.h

@@ -9,7 +9,7 @@
  */
 
 
-/*!\file decoder_impl.h
+/*!\file
  * \brief Describes the decoder algorithm interface for algorithm
  *        implementations.
  *
@@ -214,7 +214,7 @@
         vpx_codec_iter_t     *iter);
 
 
-/*\brief e_xternal Memory Allocation memory map get iterator
+/*\brief eXternal Memory Allocation memory map get iterator
  *
  * Iterates over a list of the memory maps requested by the decoder. The
  * iterator storage should be initialized to NULL to start the iteration.
@@ -230,7 +230,7 @@
         vpx_codec_iter_t           *iter);
 
 
-/*\brief e_xternal Memory Allocation memory map set iterator
+/*\brief eXternal Memory Allocation memory map set iterator
  *
  * Sets a memory descriptor inside the decoder instance.
  *
@@ -332,7 +332,7 @@
  * extended in one of two ways. First, a second, algorithm specific structure
  * can be allocated and the priv member pointed to it. Alternatively, this
  * structure can be made the first member of the algorithm specific structure,
- * and the pointer casted to the proper type.
+ * and the pointer cast to the proper type.
  */
 struct vpx_codec_priv
 {
@@ -405,7 +405,7 @@
 
 /* Internal Utility Functions
  *
- * The following functions are indended to be used inside algorithms as
+ * The following functions are intended to be used inside algorithms as
  * utilities for manipulating vpx_codec_* data structures.
  */
 struct vpx_codec_pkt_list

diff --git a/vpx/src/vpx_codec.c b/vpx/src/vpx_codec.c
index 9c1558c..f1a8b67 100644
--- a/vpx/src/vpx_codec.c
+++ b/vpx/src/vpx_codec.c

@@ -9,7 +9,7 @@
  */
 
 
-/*!\file vpx_decoder.c
+/*!\file
  * \brief Provides the high level interface to wrap decoder algorithms.
  *
  */

diff --git a/vpx/src/vpx_decoder.c b/vpx/src/vpx_decoder.c
index 7b78e5c..4ffb00d 100644
--- a/vpx/src/vpx_decoder.c
+++ b/vpx/src/vpx_decoder.c

@@ -9,7 +9,7 @@
  */
 
 
-/*!\file vpx_decoder.c
+/*!\file
  * \brief Provides the high level interface to wrap decoder algorithms.
  *
  */
@@ -36,6 +36,8 @@
         res = VPX_CODEC_INCAPABLE;
     else if ((flags & VPX_CODEC_USE_POSTPROC) && !(iface->caps & VPX_CODEC_CAP_POSTPROC))
         res = VPX_CODEC_INCAPABLE;
+    else if (!(iface->caps & VPX_CODEC_CAP_DECODER))
+        res = VPX_CODEC_INCAPABLE;
     else
     {
         memset(ctx, 0, sizeof(*ctx));

diff --git a/vpx/src/vpx_decoder_compat.c b/vpx/src/vpx_decoder_compat.c
index e264734..4fe00ce 100644
--- a/vpx/src/vpx_decoder_compat.c
+++ b/vpx/src/vpx_decoder_compat.c

@@ -9,7 +9,7 @@
  */
 
 
-/*!\file vpx_decoder.c
+/*!\file
  * \brief Provides the high level interface to wrap decoder algorithms.
  *
  */

diff --git a/vpx/src/vpx_encoder.c b/vpx/src/vpx_encoder.c
index 1092959..db778ff 100644
--- a/vpx/src/vpx_encoder.c
+++ b/vpx/src/vpx_encoder.c

@@ -9,7 +9,7 @@
  */
 
 
-/*!\file vpx_encoder.c
+/*!\file
  * \brief Provides the high level interface to wrap encoder algorithms.
  *
  */

diff --git a/vpx/vp8.h b/vpx/vp8.h
index 32c0132..7ca5c6e 100644
--- a/vpx/vp8.h
+++ b/vpx/vp8.h

@@ -14,7 +14,7 @@
  * VP8 is vpx's newest video compression algorithm that uses motion
  * compensated prediction, Discrete Cosine Transform (DCT) coding of the
  * prediction error signal and context dependent entropy coding techniques
- * based on arithmatic principles. It features:
+ * based on arithmetic principles. It features:
  *  - YUV 4:2:0 image format
  *  - Macro-block based coding (16x16 luma plus two 8x8 chroma)
  *  - 1/4 (1/8) pixel accuracy motion compensated prediction
@@ -25,7 +25,7 @@
  *
  * @{
  */
-/*!\file vp8.h
+/*!\file
  * \brief Provides controls common to both the VP8 encoder and decoder.
  */
 #ifndef VP8_H
@@ -36,7 +36,7 @@
  *
  * The set of macros define the control functions of VP8 interface
  */
-enum vp8_dec_control_id
+enum vp8_com_control_id
 {
     VP8_SET_REFERENCE           = 1,    /**< pass in an external frame into decoder to be used as reference frame */
     VP8_COPY_REFERENCE          = 2,    /**< get a copy of reference frame from the decoder */
@@ -45,7 +45,8 @@
     VP8_SET_DBG_COLOR_MB_MODES  = 5,    /**< set which macro block modes to color */
     VP8_SET_DBG_COLOR_B_MODES   = 6,    /**< set which blocks modes to color */
     VP8_SET_DBG_DISPLAY_MV      = 7,    /**< set which motion vector modes to draw */
-    VP8_COMMON_CTRL_ID_MAX
+    VP8_COMMON_CTRL_ID_MAX,
+    VP8_DECODER_CTRL_ID_START   = 256,
 };
 
 /*!\brief post process flags
@@ -67,7 +68,7 @@
 /*!\brief post process flags
  *
  * This define a structure that describe the post processing settings. For
- * the best objective measure (using thet PSNR metric) set post_proc_flag
+ * the best objective measure (using the PSNR metric) set post_proc_flag
  * to VP8_DEBLOCK and deblocking_level to 1.
  */
 
@@ -101,7 +102,7 @@
 } vpx_ref_frame_t;
 
 
-/*!\brief vp8 decoder control funciton parameter type
+/*!\brief vp8 decoder control function parameter type
  *
  * defines the data type for each of VP8 decoder control function requires
  */

diff --git a/vpx/vp8cx.h b/vpx/vp8cx.h
index d574c44..5ff6bdc 100644
--- a/vpx/vp8cx.h
+++ b/vpx/vp8cx.h

@@ -16,7 +16,7 @@
  */
 #include "vp8.h"
 
-/*!\file vp8cx.h
+/*!\file
  * \brief Provides definitions for using the VP8 encoder algorithm within the
  *        vpx Codec Interface.
  */
@@ -24,13 +24,15 @@
 #define VP8CX_H
 #include "vpx/vpx_codec_impl_top.h"
 
-/*!\brief Algorithm interface for VP8
+/*!\name Algorithm interface for VP8
  *
  * This interface provides the capability to encode raw VP8 streams, as would
  * be found in AVI files.
+ * @{
  */
 extern vpx_codec_iface_t  vpx_codec_vp8_cx_algo;
 extern vpx_codec_iface_t* vpx_codec_vp8_cx(void);
+/*!@} - end algorithm interface member group*/
 
 
 /*
@@ -114,7 +116,10 @@
 
 /*!\brief VP8 encoder control functions
  *
- * The set of macros define the control functions of VP8 encoder interface
+ * This set of macros define the control functions available for the VP8
+ * encoder interface.
+ *
+ * \sa #vpx_codec_control
  */
 enum vp8e_enc_control_id
 {
@@ -124,7 +129,18 @@
     VP8E_SET_ROI_MAP,                /**< control function to pass an ROI map to encoder */
     VP8E_SET_ACTIVEMAP,              /**< control function to pass an Active map to encoder */
     VP8E_SET_SCALEMODE         = 11, /**< control function to set encoder scaling mode */
-    VP8E_SET_CPUUSED           = 13, /**< control function to set vp8 encoder cpuused  */
+    /*!\brief control function to set vp8 encoder cpuused
+     *
+     * Changes in this value influences, among others, the encoder's selection
+     * of motion estimation methods. Values greater than 0 will increase encoder
+     * speed at the expense of quality.
+     * The full set of adjustments can be found in
+     * onyx_if.c:vp8_set_speed_features().
+     * \todo List highlights of the changes at various levels.
+     *
+     * \note Valid range: -16..16 or {-16..-4, 4..16} w/CONFIG_REALTIME_ONLY
+     */
+    VP8E_SET_CPUUSED           = 13,
     VP8E_SET_ENABLEAUTOALTREF,       /**< control function to enable vp8 to automatic set and use altref frame */
     VP8E_SET_NOISE_SENSITIVITY,      /**< control function to set noise sensitivity */
     VP8E_SET_SHARPNESS,              /**< control function to set sharpness */
@@ -141,7 +157,13 @@
     VP8E_SET_ARNR_STRENGTH ,         /**< control function to set the filter strength for the arf */
     VP8E_SET_ARNR_TYPE     ,         /**< control function to set the type of filter to use for the arf*/
     VP8E_SET_TUNING,                 /**< control function to set visual tuning */
-    VP8E_SET_CQ_LEVEL,               /**< control function to set constrained quality level */
+    /*!\brief control function to set constrained quality level
+     *
+     * \attention For this value to be used vpx_codec_enc_cfg_t::g_usage must be
+     *            set to #VPX_CQ.
+     * \note Valid range: 0..63
+     */
+    VP8E_SET_CQ_LEVEL,
 };
 
 /*!\brief vpx 1-D scaling mode

diff --git a/vpx/vp8dx.h b/vpx/vp8dx.h
index 16bc07c..4a3aef7 100644
--- a/vpx/vp8dx.h
+++ b/vpx/vp8dx.h

@@ -16,7 +16,7 @@
  *
  * @{
  */
-/*!\file vp8dx.h
+/*!\file
  * \brief Provides definitions for using the VP8 algorithm within the vpx Decoder
  *        interface.
  */
@@ -24,13 +24,15 @@
 #define VP8DX_H
 #include "vpx/vpx_codec_impl_top.h"
 
-/*!\brief Algorithm interface for VP8
+/*!\name Algorithm interface for VP8
  *
  * This interface provides the capability to decode raw VP8 streams, as would
  * be found in AVI files and other non-Flash uses.
+ * @{
  */
 extern vpx_codec_iface_t  vpx_codec_vp8_dx_algo;
 extern vpx_codec_iface_t* vpx_codec_vp8_dx(void);
+/*!@} - end algorithm interface member group*/
 
 /* Include controls common to both the encoder and decoder */
 #include "vp8.h"
@@ -38,21 +40,28 @@
 
 /*!\brief VP8 decoder control functions
  *
- * The set of macros define the control functions of VP8 decoder interface
+ * This set of macros define the control functions available for the VP8
+ * decoder interface.
+ *
+ * \sa #vpx_codec_control
  */
-enum vp8d_dec_control_id
+enum vp8_dec_control_id
 {
-    VP8_DECODER_CTRL_ID_START   = 256,
-    VP8D_GET_LAST_REF_UPDATES,              /**< control function to get info on which reference frames were updated
-                                            by the last decode */
-    VP8D_GET_FRAME_CORRUPTED,               /**< check if the indicated frame is corrupted */
+    /** control function to get info on which reference frames were updated
+     *  by the last decode
+     */
+    VP8D_GET_LAST_REF_UPDATES = VP8_DECODER_CTRL_ID_START,
+
+    /** check if the indicated frame is corrupted */
+    VP8D_GET_FRAME_CORRUPTED,
+
     VP8_DECODER_CTRL_ID_MAX
 } ;
 
 
-/*!\brief VP8 encoder control function parameter type
+/*!\brief VP8 decoder control function parameter type
  *
- * Defines the data types that VP8E control functions take. Note that
+ * Defines the data types that VP8D control functions take. Note that
  * additional common controls are defined in vp8.h
  *
  */

diff --git a/vpx/vpx_codec.h b/vpx/vpx_codec.h
index 899b27c..d92e165 100644
--- a/vpx/vpx_codec.h
+++ b/vpx/vpx_codec.h

@@ -16,7 +16,7 @@
  * @{
  */
 
-/*!\file vpx_codec.h
+/*!\file
  * \brief Describes the codec algorithm interface to applications.
  *
  * This file describes the interface between an application and a
@@ -145,7 +145,7 @@
     typedef long vpx_codec_caps_t;
 #define VPX_CODEC_CAP_DECODER 0x1 /**< Is a decoder */
 #define VPX_CODEC_CAP_ENCODER 0x2 /**< Is an encoder */
-#define VPX_CODEC_CAP_XMA     0x4 /**< Supports e_xternal Memory Allocation */
+#define VPX_CODEC_CAP_XMA     0x4 /**< Supports eXternal Memory Allocation */
 
 
     /*! \brief Initialization-time Feature Enabling
@@ -156,7 +156,7 @@
      *  The available flags are specified by VPX_CODEC_USE_* defines.
      */
     typedef long vpx_codec_flags_t;
-#define VPX_CODEC_USE_XMA 0x00000001    /**< Use e_xternal Memory Allocation mode */
+#define VPX_CODEC_USE_XMA 0x00000001    /**< Use eXternal Memory Allocation mode */
 
 
     /*!\brief Codec interface structure.
@@ -232,7 +232,7 @@
     /*!\brief Return the version major number */
 #define vpx_codec_version_major() ((vpx_codec_version()>>16)&0xff)
 
-    /*!\brief Return the version minr number */
+    /*!\brief Return the version minor number */
 #define vpx_codec_version_minor() ((vpx_codec_version()>>8)&0xff)
 
     /*!\brief Return the version patch number */
@@ -338,9 +338,9 @@
 
     /*!\brief Get the capabilities of an algorithm.
      *
-     * Retrieves the capabliities bitfield from the algorithm's interface.
+     * Retrieves the capabilities bitfield from the algorithm's interface.
      *
-     * \param[in] iface   Pointer to the alogrithm interface
+     * \param[in] iface   Pointer to the algorithm interface
      *
      */
     vpx_codec_caps_t vpx_codec_get_caps(vpx_codec_iface_t *iface);
@@ -354,7 +354,7 @@
      *
      * This wrapper function dispatches the request to the helper function
      * associated with the given ctrl_id. It tries to call this function
-     * transparantly, but will return #VPX_CODEC_ERROR if the request could not
+     * transparently, but will return #VPX_CODEC_ERROR if the request could not
      * be dispatched.
      *
      * Note that this function should not be used directly. Call the
@@ -498,7 +498,7 @@
      * Iterates over a list of the segments to allocate. The iterator storage
      * should be initialized to NULL to start the iteration. Iteration is complete
      * when this function returns VPX_CODEC_LIST_END. The amount of memory needed to
-     * allocate is dependant upon the size of the encoded stream. In cases where the
+     * allocate is dependent upon the size of the encoded stream. In cases where the
      * stream is not available at allocation time, a fixed size must be requested.
      * The codec will not be able to operate on streams larger than the size used at
      * allocation time.
@@ -525,7 +525,7 @@
      * passed in the order they are read from vpx_codec_get_mem_map(), but may be
      * passed in groups of any size. Segments \ref MUST be set only once. The
      * allocation function \ref MUST ensure that the vpx_codec_mmap_t::base member
-     * is non-NULL. If the segment requires cleanup handling (eg, calling free()
+     * is non-NULL. If the segment requires cleanup handling (e.g., calling free()
      * or close()) then the vpx_codec_mmap_t::dtor member \ref MUST be populated.
      *
      * \param[in]      ctx     Pointer to this instance's context.

diff --git a/vpx/vpx_decoder.h b/vpx/vpx_decoder.h
index 6ffc2d4..4c57409 100644
--- a/vpx/vpx_decoder.h
+++ b/vpx/vpx_decoder.h

@@ -17,7 +17,7 @@
  * @{
  */
 
-/*!\file vpx_decoder.h
+/*!\file
  * \brief Describes the decoder algorithm interface to applications.
  *
  * This file describes the interface between an application and a
@@ -48,7 +48,7 @@
      *  ::vpx_codec_iface_t interface structure. Capabilities are extra interfaces
      *  or functionality, and are not required to be supported by a decoder.
      *
-     *  The available flags are specifiedby VPX_CODEC_CAP_* defines.
+     *  The available flags are specified by VPX_CODEC_CAP_* defines.
      */
 #define VPX_CODEC_CAP_PUT_SLICE  0x10000 /**< Will issue put_slice callbacks */
 #define VPX_CODEC_CAP_PUT_FRAME  0x20000 /**< Will issue put_frame callbacks */
@@ -109,7 +109,7 @@
      * kept readable and stable until all memory maps have been set.
      *
      * \param[in]    ctx     Pointer to this instance's context.
-     * \param[in]    iface   Pointer to the alogrithm interface to use.
+     * \param[in]    iface   Pointer to the algorithm interface to use.
      * \param[in]    cfg     Configuration to use, if known. May be NULL.
      * \param[in]    flags   Bitfield of VPX_CODEC_USE_* flags
      * \param[in]    ver     ABI version number. Must be set to
@@ -139,7 +139,7 @@
      * context is not necessary. Can be used to determine if the bitstream is
      * of the proper format, and to extract information from the stream.
      *
-     * \param[in]      iface   Pointer to the alogrithm interface
+     * \param[in]      iface   Pointer to the algorithm interface
      * \param[in]      data    Pointer to a block of data to parse
      * \param[in]      data_sz Size of the data buffer
      * \param[in,out]  si      Pointer to stream info to update. The size member

diff --git a/vpx/vpx_decoder_compat.h b/vpx/vpx_decoder_compat.h
index ca6f618..8adc1b9 100644
--- a/vpx/vpx_decoder_compat.h
+++ b/vpx/vpx_decoder_compat.h

@@ -16,7 +16,7 @@
  * @{
  */
 
-/*!\file vpx_decoder_compat.h
+/*!\file
  * \brief Provides a compatibility layer between version 1 and 2 of this API.
  *
  * This interface has been deprecated. Only existing code should make use
@@ -89,12 +89,12 @@
      *  ::vpx_dec_iface_t interface structure. Capabilities are extra interfaces
      *  or functionality, and are not required to be supported by a decoder.
      *
-     *  The available flags are specifiedby VPX_DEC_CAP_* defines.
+     *  The available flags are specified by VPX_DEC_CAP_* defines.
      */
     typedef int vpx_dec_caps_t;
 #define VPX_DEC_CAP_PUT_SLICE  0x0001 /**< Will issue put_slice callbacks */
 #define VPX_DEC_CAP_PUT_FRAME  0x0002 /**< Will issue put_frame callbacks */
-#define VPX_DEC_CAP_XMA        0x0004 /**< Supports e_xternal Memory Allocation */
+#define VPX_DEC_CAP_XMA        0x0004 /**< Supports eXternal Memory Allocation */
 
     /*!\brief Stream properties
      *
@@ -222,7 +222,7 @@
      * is properly initialized.
      *
      * \param[in]    ctx     Pointer to this instance's context.
-     * \param[in]    iface   Pointer to the alogrithm interface to use.
+     * \param[in]    iface   Pointer to the algorithm interface to use.
      * \param[in]    ver     ABI version number. Must be set to
      *                       VPX_DECODER_ABI_VERSION
      * \retval #VPX_DEC_OK
@@ -253,9 +253,9 @@
 
     /*!\brief Get the capabilities of an algorithm.
      *
-     * Retrieves the capabliities bitfield from the algorithm's interface.
+     * Retrieves the capabilities bitfield from the algorithm's interface.
      *
-     * \param[in] iface   Pointer to the alogrithm interface
+     * \param[in] iface   Pointer to the algorithm interface
      *
      */
     vpx_dec_caps_t vpx_dec_get_caps(vpx_dec_iface_t *iface) DEPRECATED;
@@ -267,7 +267,7 @@
      * context is not necessary. Can be used to determine if the bitstream is
      * of the proper format, and to extract information from the stream.
      *
-     * \param[in]      iface   Pointer to the alogrithm interface
+     * \param[in]      iface   Pointer to the algorithm interface
      * \param[in]      data    Pointer to a block of data to parse
      * \param[in]      data_sz Size of the data buffer
      * \param[in,out]  si      Pointer to stream info to update. The size member
@@ -309,7 +309,7 @@
      *
      * This wrapper function dispatches the request to the helper function
      * associated with the given ctrl_id. It tries to call this function
-     * transparantly, but will return #VPX_DEC_ERROR if the request could not
+     * transparently, but will return #VPX_DEC_ERROR if the request could not
      * be dispatched.
      *
      * \param[in]     ctx              Pointer to this instance's context
@@ -507,7 +507,7 @@
      * is properly initialized.
      *
      * \param[in]    ctx     Pointer to this instance's context.
-     * \param[in]    iface   Pointer to the alogrithm interface to use.
+     * \param[in]    iface   Pointer to the algorithm interface to use.
      * \param[in]    ver     ABI version number. Must be set to
      *                       VPX_DECODER_ABI_VERSION
      * \retval #VPX_DEC_OK
@@ -527,7 +527,7 @@
      * Iterates over a list of the segments to allocate. The iterator storage
      * should be initialized to NULL to start the iteration. Iteration is complete
      * when this function returns VPX_DEC_LIST_END. The amount of memory needed to
-     * allocate is dependant upon the size of the encoded stream. This means that
+     * allocate is dependent upon the size of the encoded stream. This means that
      * the stream info structure must be known at allocation time. It can be
      * populated with the vpx_dec_peek_stream_info() function. In cases where the
      * stream to be decoded is not available at allocation time, a fixed size must
@@ -558,7 +558,7 @@
      * passed in the order they are read from vpx_dec_get_mem_map(), but may be
      * passed in groups of any size. Segments \ref MUST be set only once. The
      * allocation function \ref MUST ensure that the vpx_dec_mmap_t::base member
-     * is non-NULL. If the segment requires cleanup handling (eg, calling free()
+     * is non-NULL. If the segment requires cleanup handling (e.g., calling free()
      * or close()) then the vpx_dec_mmap_t::dtor member \ref MUST be populated.
      *
      * \param[in]      ctx     Pointer to this instance's context.

diff --git a/vpx/vpx_encoder.h b/vpx/vpx_encoder.h
index 0d53f41..9c44414 100644
--- a/vpx/vpx_encoder.h
+++ b/vpx/vpx_encoder.h

@@ -17,7 +17,7 @@
  * @{
  */
 
-/*!\file vpx_encoder.h
+/*!\file
  * \brief Describes the encoder algorithm interface to applications.
  *
  * This file describes the interface between an application and a
@@ -51,7 +51,7 @@
      *  interfaces or functionality, and are not required to be supported
      *  by an encoder.
      *
-     *  The available flags are specifiedby VPX_CODEC_CAP_* defines.
+     *  The available flags are specified by VPX_CODEC_CAP_* defines.
      */
 #define VPX_CODEC_CAP_PSNR  0x10000 /**< Can issue PSNR packets */
 
@@ -147,7 +147,7 @@
 
             /* This packet size is fixed to allow codecs to extend this
              * interface without having to manage storage for raw packets,
-             * ie if it's smaller than 128 bytes, you can store in the
+             * i.e., if it's smaller than 128 bytes, you can store in the
              * packet list directly.
              */
             char pad[128 - sizeof(enum vpx_codec_cx_pkt_kind)]; /**< fixed sz */

diff --git a/vpx/vpx_image.h b/vpx/vpx_image.h
index dcb8f31..8e08b36 100644
--- a/vpx/vpx_image.h
+++ b/vpx/vpx_image.h

@@ -9,7 +9,7 @@
  */
 
 
-/*!\file vpx_image.h
+/*!\file
  * \brief Describes the vpx image descriptor and associated operations
  *
  */
@@ -33,7 +33,7 @@
 
 #define VPX_IMG_FMT_PLANAR     0x100  /**< Image is a planar format */
 #define VPX_IMG_FMT_UV_FLIP    0x200  /**< V plane precedes U plane in memory */
-#define VPX_IMG_FMT_HAS_ALPHA  0x400  /**< Image has an alpha channel componnent */
+#define VPX_IMG_FMT_HAS_ALPHA  0x400  /**< Image has an alpha channel component */
 
 
     /*!\brief List of supported image formats */
@@ -115,7 +115,7 @@
 #define VPX_PLANE_Y      0   /**< Y (Luminance) plane */
 #define VPX_PLANE_U      1   /**< U (Chroma) plane */
 #define VPX_PLANE_V      2   /**< V (Chroma) plane */
-#define VPX_PLANE_ALPHA  3   /**< A (Transparancy) plane */
+#define VPX_PLANE_ALPHA  3   /**< A (Transparency) plane */
 #if !defined(VPX_CODEC_DISABLE_COMPAT) || !VPX_CODEC_DISABLE_COMPAT
 #define PLANE_PACKED     VPX_PLANE_PACKED
 #define PLANE_Y          VPX_PLANE_Y

diff --git a/vpx_mem/intel_linux/vpx_mem.c b/vpx_mem/intel_linux/vpx_mem.c
deleted file mode 100644
index 00150ac..0000000
--- a/vpx_mem/intel_linux/vpx_mem.c
+++ /dev/null

@@ -1,951 +0,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#define __VPX_MEM_C__
-
-#include "vpx_mem.h"
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-
-#ifndef CONFIG_MEM_MANAGER
-# if defined(VXWORKS)
-#  define CONFIG_MEM_MANAGER  1 //include heap manager functionality,
-//default: enabled on vxworks
-# else
-#  define CONFIG_MEM_MANAGER  0 //include heap manager functionality
-# endif
-#endif
-
-#ifndef CONFIG_MEM_TRACKER
-# define CONFIG_MEM_TRACKER     1 //include xvpx_* calls in the lib
-#endif
-
-#ifndef CONFIG_MEM_CHECKS
-# define CONFIG_MEM_CHECKS      0 //include some basic safety checks in
-//vpx_memcpy, _memset, and _memmove
-#endif
-
-#ifndef USE_GLOBAL_FUNCTION_POINTERS
-# define USE_GLOBAL_FUNCTION_POINTERS   0  //use function pointers instead of compiled functions.
-#endif
-
-#if CONFIG_MEM_TRACKER
-# include "vpx_mem_tracker.h"
-# if VPX_MEM_TRACKER_VERSION_CHIEF != 2 || VPX_MEM_TRACKER_VERSION_MAJOR != 5
-#  error "vpx_mem requires memory tracker version 2.5 to track memory usage"
-# endif
-#endif
-
-#define ADDRESS_STORAGE_SIZE      sizeof(size_t)
-
-#ifndef DEFAULT_ALIGNMENT
-# if defined(VXWORKS)
-#  define DEFAULT_ALIGNMENT        32        //default addr alignment to use in
-//calls to vpx_* functions other
-//than vpx_memalign
-# else
-#  define DEFAULT_ALIGNMENT        1
-# endif
-#endif
-
-#if DEFAULT_ALIGNMENT < 1
-# error "DEFAULT_ALIGNMENT must be >= 1!"
-#endif
-
-#if CONFIG_MEM_TRACKER
-# define TRY_BOUNDS_CHECK         1         //when set to 1 pads each allocation,
-//integrity can be checked using
-//vpx_memory_tracker_check_integrity
-//or on free by defining
-//TRY_BOUNDS_CHECK_ON_FREE
-static unsigned long g_alloc_count = 0;
-
-#else
-# define TRY_BOUNDS_CHECK         0
-#endif
-
-#if TRY_BOUNDS_CHECK
-# define TRY_BOUNDS_CHECK_ON_FREE 0          //checks mem integrity on every
-//free, very expensive
-# define BOUNDS_CHECK_VALUE       0xdeadbeef //value stored before/after ea.
-//mem addr for bounds checking
-# define BOUNDS_CHECK_PAD_SIZE    32         //size of the padding before and
-//after ea allocation to be filled
-//with BOUNDS_CHECK_VALUE.
-//this should be a multiple of 4
-#else
-# define BOUNDS_CHECK_VALUE       0
-# define BOUNDS_CHECK_PAD_SIZE    0
-#endif
-
-#if CONFIG_MEM_MANAGER
-# include "heapmm.h"
-# include "hmm_intrnl.h"
-
-# define SHIFT_HMM_ADDR_ALIGN_UNIT 5
-# define TOTAL_MEMORY_TO_ALLOCATE  20971520 // 20 * 1024 * 1024
-
-# define MM_DYNAMIC_MEMORY 1
-# if MM_DYNAMIC_MEMORY
-static unsigned char *g_p_mng_memory_raw = NULL;
-static unsigned char *g_p_mng_memory     = NULL;
-# else
-static unsigned char g_p_mng_memory[TOTAL_MEMORY_TO_ALLOCATE];
-# endif
-
-static size_t g_mm_memory_size = TOTAL_MEMORY_TO_ALLOCATE;
-
-static hmm_descriptor hmm_d;
-static int g_mng_memory_allocated = 0;
-
-static int vpx_mm_create_heap_memory();
-static void *vpx_mm_realloc(void *memblk, size_t size);
-#endif //CONFIG_MEM_MANAGER
-
-#if USE_GLOBAL_FUNCTION_POINTERS
-
-struct GLOBAL_FUNC_POINTERS
-{
-    g_malloc_func g_malloc;
-    g_calloc_func g_calloc;
-    g_realloc_func g_realloc;
-    g_free_func g_free;
-    g_memcpy_func g_memcpy;
-    g_memset_func g_memset;
-    g_memmove_func g_memmove;
-};
-struct GLOBAL_FUNC_POINTERS *g_func = 0;
-
-# define VPX_MALLOC_L  g_func->g_malloc
-# define VPX_REALLOC_L g_func->g_realloc
-# define VPX_FREE_L    g_func->g_free
-# define VPX_MEMCPY_L  g_func->g_memcpy
-# define VPX_MEMSET_L  g_func->g_memset
-# define VPX_MEMMOVE_L g_func->g_memmove
-
-#else
-# define VPX_MALLOC_L  malloc
-# define VPX_REALLOC_L realloc
-# define VPX_FREE_L    free
-# define VPX_MEMCPY_L  memcpy
-# define VPX_MEMSET_L  memset
-# define VPX_MEMMOVE_L memmove
-#endif // USE_GLOBAL_FUNCTION_POINTERS
-
-/* Should probably use a vpx_mem logger function. */
-#define __REMOVE_PRINTFS
-#ifdef __REMOVE_PRINTFS
-#define _P(x)
-#else
-#define _P(x) x
-#endif
-
-/*returns an addr aligned to the byte boundary specified by align*/
-#define align_addr(addr,align) \
-    (void*)(((size_t)(addr) + ((align) - 1)) & (size_t)-(align))
-
-unsigned int vpx_mem_get_version()
-{
-    unsigned int ver = ((unsigned int)(unsigned char)VPX_MEM_VERSION_CHIEF << 24 |
-                        (unsigned int)(unsigned char)VPX_MEM_VERSION_MAJOR << 16 |
-                        (unsigned int)(unsigned char)VPX_MEM_VERSION_MINOR << 8  |
-                        (unsigned int)(unsigned char)VPX_MEM_VERSION_PATCH);
-    return ver;
-}
-
-int vpx_mem_set_heap_size(size_t size)
-{
-    int ret = -1;
-
-#if CONFIG_MEM_MANAGER
-#if MM_DYNAMIC_MEMORY
-
-    if (!g_mng_memory_allocated && size)
-    {
-        g_mm_memory_size = size;
-        ret = 0;
-    }
-    else
-        ret = -3;
-
-#else
-    ret = -2;
-#endif
-#else
-    (void)size;
-#endif
-
-    return ret;
-}
-
-void *vpx_memalign(size_t align, size_t size)
-{
-    void *addr,
-         * x = NULL;
-
-#if CONFIG_MEM_MANAGER
-    int number_aau;
-
-    if (vpx_mm_create_heap_memory() < 0)
-    {
-        _P(printf("[vpx][mm] ERROR vpx_memalign() Couldn't create memory for Heap.\n");)
-    }
-
-    number_aau = ((size + align - 1 + ADDRESS_STORAGE_SIZE) >>
-                  SHIFT_HMM_ADDR_ALIGN_UNIT) + 1;
-
-    addr = hmm_alloc(&hmm_d, number_aau);
-#else
-    addr = VPX_MALLOC_L(size + align - 1 + ADDRESS_STORAGE_SIZE);
-#endif //CONFIG_MEM_MANAGER
-
-    if (addr)
-    {
-        x = align_addr((unsigned char *)addr + ADDRESS_STORAGE_SIZE, (int)align);
-        /* save the actual malloc address */
-        ((size_t *)x)[-1] = (size_t)addr;
-    }
-
-    return x;
-}
-
-void *vpx_malloc(size_t size)
-{
-    return vpx_memalign(DEFAULT_ALIGNMENT, size);
-}
-
-void *vpx_calloc(size_t num, size_t size)
-{
-    void *x;
-
-    x = vpx_memalign(DEFAULT_ALIGNMENT, num * size);
-
-    if (x)
-        VPX_MEMSET_L(x, 0, num * size);
-
-    return x;
-}
-
-void *vpx_realloc(void *memblk, size_t size)
-{
-    void *addr,
-         * new_addr = NULL;
-    int align = DEFAULT_ALIGNMENT;
-
-    /*
-    The realloc() function changes the size of the object pointed to by
-    ptr to the size specified by size, and returns a pointer to the
-    possibly moved block. The contents are unchanged up to the lesser
-    of the new and old sizes. If ptr is null, realloc() behaves like
-    malloc() for the specified size. If size is zero (0) and ptr is
-    not a null pointer, the object pointed to is freed.
-    */
-    if (!memblk)
-        new_addr = vpx_malloc(size);
-    else if (!size)
-        vpx_free(memblk);
-    else
-    {
-        addr   = (void *)(((size_t *)memblk)[-1]);
-        memblk = NULL;
-
-#if CONFIG_MEM_MANAGER
-        new_addr = vpx_mm_realloc(addr, size + align + ADDRESS_STORAGE_SIZE);
-#else
-        new_addr = VPX_REALLOC_L(addr, size + align + ADDRESS_STORAGE_SIZE);
-#endif
-
-        if (new_addr)
-        {
-            addr = new_addr;
-            new_addr = (void *)(((size_t)
-                                 ((unsigned char *)new_addr + ADDRESS_STORAGE_SIZE) + (align - 1)) &
-                                (size_t) - align);
-            /* save the actual malloc address */
-            ((size_t *)new_addr)[-1] = (size_t)addr;
-        }
-    }
-
-    return new_addr;
-}
-
-void vpx_free(void *memblk)
-{
-    if (memblk)
-    {
-        void *addr = (void *)(((size_t *)memblk)[-1]);
-#if CONFIG_MEM_MANAGER
-        hmm_free(&hmm_d, addr);
-#else
-        VPX_FREE_L(addr);
-#endif
-    }
-}
-
-void *vpx_mem_alloc(int id, size_t size, size_t align)
-{
-#if defined CHIP_DM642 || defined __uClinux__
-    void *mem = (void *)mem_alloc(id, size, align);
-
-    if (!mem)
-    {
-        _P(fprintf(stderr,
-                   "\n"
-                   "*********************************************************\n"
-                   "WARNING: mem_alloc returned 0 for id=%p size=%u align=%u.\n"
-                   "*********************************************************\n",
-                   mem, size, align));
-        // should no longer need this.  Softier says it's fixed. 2005-01-21 tjf
-        //#if defined __uClinux__
-        //while(1)usleep(1000000);
-        //#endif
-    }
-
-#if defined __uClinux__
-    else if (mem == (void *)0xFFFFFFFF)
-    {
-        // out of memory/error
-        mem = (void *)0;
-
-        _P(fprintf(stderr,
-                   "\n"
-                   "******************************************************\n"
-                   "ERROR: mem_alloc id=%p size=%u align=%u OUT OF MEMORY.\n"
-                   "******************************************************\n",
-                   mem, size, align));
-    }
-
-#endif  // __uClinux__
-
-    return mem;
-#else
-    (void)id;
-    (void)size;
-    (void)align;
-    return (void *)0;
-#endif
-}
-
-void vpx_mem_free(int id, void *mem, size_t size)
-{
-#if defined CHIP_DM642 || defined __uClinux__
-
-    if (!mem)
-    {
-        _P(fprintf(stderr,
-                   "\n"
-                   "**************************************\n"
-                   "WARNING: 0 being free'd id=%p size=%u.\n"
-                   "**************************************\n",
-                   id, size));
-
-        // should no longer need this.  Softier says it's fixed. 2005-01-21 tjf
-        //#if defined __uClinux__
-        //while(1)usleep(1000000);
-        //#endif
-    }
-
-    mem_free(id, mem, size);
-#else
-    (void)id;
-    (void)mem;
-    (void)size;
-#endif
-}
-
-
-#if CONFIG_MEM_TRACKER
-
-void *xvpx_mem_alloc(int id, size_t size, size_t align, char *file, int line)
-{
-    void *mem = vpx_mem_alloc(id, size, align);
-
-    vpx_memory_tracker_add((size_t)mem, size, file, line, 0);
-
-    return mem;
-}
-
-void xvpx_mem_free(int id, void *mem, size_t size, char *file, int line)
-{
-    if (vpx_memory_tracker_remove((size_t)mem) == -2)
-    {
-#if REMOVE_PRINTFS
-        (void)file;
-        (void)line;
-#endif
-        _P(fprintf(stderr, "[vpx_mem][xvpx_mem_free] addr: %p (id=%p size=%u) "
-                   "not found in list; freed from file:%s"
-                   " line:%d\n", mem, id, size, file, line));
-    }
-
-    vpx_mem_free(id, mem, size);
-}
-
-void *xvpx_memalign(size_t align, size_t size, char *file, int line)
-{
-#if TRY_BOUNDS_CHECK
-    unsigned char *x_bounds;
-#endif
-
-    void *x;
-
-    if (g_alloc_count == 0)
-    {
-#if TRY_BOUNDS_CHECK
-        int i_rv = vpx_memory_tracker_init(BOUNDS_CHECK_PAD_SIZE, BOUNDS_CHECK_VALUE);
-#else
-        int i_rv = vpx_memory_tracker_init(0, 0);
-#endif
-
-        if (i_rv < 0)
-        {
-            _P(printf("ERROR xvpx_malloc MEM_TRACK_USAGE error vpx_memory_tracker_init().\n");)
-        }
-    }
-
-#if TRY_BOUNDS_CHECK
-    {
-        int i;
-        unsigned int tempme = BOUNDS_CHECK_VALUE;
-
-        x_bounds = vpx_memalign(align, size + (BOUNDS_CHECK_PAD_SIZE * 2));
-
-        if (x_bounds)
-        {
-            /*we're aligning the address twice here but to keep things
-              consistent we want to have the padding come before the stored
-              address so no matter what free function gets called we will
-              attempt to free the correct address*/
-            x_bounds = (unsigned char *)(((size_t *)x_bounds)[-1]);
-            x = align_addr(x_bounds + BOUNDS_CHECK_PAD_SIZE + ADDRESS_STORAGE_SIZE,
-                           (int)align);
-            /* save the actual malloc address */
-            ((size_t *)x)[-1] = (size_t)x_bounds;
-
-            for (i = 0; i < BOUNDS_CHECK_PAD_SIZE; i += sizeof(unsigned int))
-            {
-                VPX_MEMCPY_L(x_bounds + i, &tempme, sizeof(unsigned int));
-                VPX_MEMCPY_L((unsigned char *)x + size + i,
-                             &tempme, sizeof(unsigned int));
-            }
-        }
-        else
-            x = NULL;
-    }
-#else
-    x = vpx_memalign(align, size);
-#endif //TRY_BOUNDS_CHECK
-
-    g_alloc_count++;
-
-    vpx_memory_tracker_add((size_t)x, size, file, line, 1);
-
-    return x;
-}
-
-void *xvpx_malloc(size_t size, char *file, int line)
-{
-    return xvpx_memalign(DEFAULT_ALIGNMENT, size, file, line);
-}
-
-void *xvpx_calloc(size_t num, size_t size, char *file, int line)
-{
-    void *x = xvpx_memalign(DEFAULT_ALIGNMENT, num * size, file, line);
-
-    if (x)
-        VPX_MEMSET_L(x, 0, num * size);
-
-    return x;
-}
-
-void *xvpx_realloc(void *memblk, size_t size, char *file, int line)
-{
-    struct mem_block *p = NULL;
-    int orig_size = 0,
-        orig_line = 0;
-    char *orig_file = NULL;
-
-#if TRY_BOUNDS_CHECK
-    unsigned char *x_bounds = memblk ?
-                              (unsigned char *)(((size_t *)memblk)[-1]) :
-                              NULL;
-#endif
-
-    void *x;
-
-    if (g_alloc_count == 0)
-    {
-#if TRY_BOUNDS_CHECK
-
-        if (!vpx_memory_tracker_init(BOUNDS_CHECK_PAD_SIZE, BOUNDS_CHECK_VALUE))
-#else
-        if (!vpx_memory_tracker_init(0, 0))
-#endif
-        {
-            _P(printf("ERROR xvpx_malloc MEM_TRACK_USAGE error vpx_memory_tracker_init().\n");)
-        }
-    }
-
-    if (p = vpx_memory_tracker_find((size_t)memblk))
-    {
-        orig_size = p->size;
-        orig_file = p->file;
-        orig_line = p->line;
-    }
-
-#if TRY_BOUNDS_CHECK_ON_FREE
-    vpx_memory_tracker_check_integrity(file, line);
-#endif
-
-    //have to do this regardless of success, because
-    //the memory that does get realloc'd may change
-    //the bounds values of this block
-    vpx_memory_tracker_remove((size_t)memblk);
-
-#if TRY_BOUNDS_CHECK
-    {
-        int i;
-        unsigned int tempme = BOUNDS_CHECK_VALUE;
-
-        x_bounds = vpx_realloc(memblk, size + (BOUNDS_CHECK_PAD_SIZE * 2));
-
-        if (x_bounds)
-        {
-            x_bounds = (unsigned char *)(((size_t *)x_bounds)[-1]);
-            x = align_addr(x_bounds + BOUNDS_CHECK_PAD_SIZE + ADDRESS_STORAGE_SIZE,
-                           (int)DEFAULT_ALIGNMENT);
-            /* save the actual malloc address */
-            ((size_t *)x)[-1] = (size_t)x_bounds;
-
-            for (i = 0; i < BOUNDS_CHECK_PAD_SIZE; i += sizeof(unsigned int))
-            {
-                VPX_MEMCPY_L(x_bounds + i, &tempme, sizeof(unsigned int));
-                VPX_MEMCPY_L((unsigned char *)x + size + i,
-                             &tempme, sizeof(unsigned int));
-            }
-        }
-        else
-            x = NULL;
-    }
-#else
-    x = vpx_realloc(memblk, size);
-#endif //TRY_BOUNDS_CHECK
-
-    if (x)
-        vpx_memory_tracker_add((size_t)x, size, file, line, 1);
-    else
-        vpx_memory_tracker_add((size_t)memblk, orig_size, orig_file, orig_line, 1);
-
-    return x;
-}
-
-void xvpx_free(void *p_address, char *file, int line)
-{
-#if TRY_BOUNDS_CHECK
-    unsigned char *p_bounds_address = (unsigned char *)p_address;
-    //p_bounds_address -= BOUNDS_CHECK_PAD_SIZE;
-#endif
-
-#if !TRY_BOUNDS_CHECK_ON_FREE
-    (void)file;
-    (void)line;
-#endif
-
-    if (p_address)
-    {
-#if TRY_BOUNDS_CHECK_ON_FREE
-        vpx_memory_tracker_check_integrity(file, line);
-#endif
-
-        //if the addr isn't found in the list, assume it was allocated via
-        //vpx_ calls not xvpx_, therefore it does not contain any padding
-        if (vpx_memory_tracker_remove((size_t)p_address) == -2)
-        {
-            p_bounds_address = p_address;
-            _P(fprintf(stderr, "[vpx_mem][xvpx_free] addr: %p not found in"
-                       " list; freed from file:%s"
-                       " line:%d\n", p_address, file, line));
-        }
-        else
-            --g_alloc_count;
-
-#if TRY_BOUNDS_CHECK
-        vpx_free(p_bounds_address);
-#else
-        vpx_free(p_address);
-#endif
-
-        if (!g_alloc_count)
-            vpx_memory_tracker_destroy();
-    }
-}
-
-#endif /*CONFIG_MEM_TRACKER*/
-
-#if CONFIG_MEM_CHECKS
-#if defined(VXWORKS)
-#include <task_lib.h> //for task_delay()
-/* This function is only used to get a stack trace of the player
-object so we can se where we are having a problem. */
-static int get_my_tt(int task)
-{
-    tt(task);
-
-    return 0;
-}
-
-static void vx_sleep(int msec)
-{
-    int ticks_to_sleep = 0;
-
-    if (msec)
-    {
-        int msec_per_tick = 1000 / sys_clk_rate_get();
-
-        if (msec < msec_per_tick)
-            ticks_to_sleep++;
-        else
-            ticks_to_sleep = msec / msec_per_tick;
-    }
-
-    task_delay(ticks_to_sleep);
-}
-#endif
-#endif
-
-void *vpx_memcpy(void *dest, const void *source, size_t length)
-{
-#if CONFIG_MEM_CHECKS
-
-    if (((int)dest < 0x4000) || ((int)source < 0x4000))
-    {
-        _P(printf("WARNING: vpx_memcpy dest:0x%x source:0x%x len:%d\n", (int)dest, (int)source, length);)
-
-#if defined(VXWORKS)
-        sp(get_my_tt, task_id_self(), 0, 0, 0, 0, 0, 0, 0, 0);
-
-        vx_sleep(10000);
-#endif
-    }
-
-#endif
-
-    return VPX_MEMCPY_L(dest, source, length);
-}
-
-void *vpx_memset(void *dest, int val, size_t length)
-{
-#if CONFIG_MEM_CHECKS
-
-    if ((int)dest < 0x4000)
-    {
-        _P(printf("WARNING: vpx_memset dest:0x%x val:%d len:%d\n", (int)dest, val, length);)
-
-#if defined(VXWORKS)
-        sp(get_my_tt, task_id_self(), 0, 0, 0, 0, 0, 0, 0, 0);
-
-        vx_sleep(10000);
-#endif
-    }
-
-#endif
-
-    return VPX_MEMSET_L(dest, val, length);
-}
-
-void *vpx_memmove(void *dest, const void *src, size_t count)
-{
-#if CONFIG_MEM_CHECKS
-
-    if (((int)dest < 0x4000) || ((int)src < 0x4000))
-    {
-        _P(printf("WARNING: vpx_memmove dest:0x%x src:0x%x count:%d\n", (int)dest, (int)src, count);)
-
-#if defined(VXWORKS)
-        sp(get_my_tt, task_id_self(), 0, 0, 0, 0, 0, 0, 0, 0);
-
-        vx_sleep(10000);
-#endif
-    }
-
-#endif
-
-    return VPX_MEMMOVE_L(dest, src, count);
-}
-
-#if CONFIG_MEM_MANAGER
-
-static int vpx_mm_create_heap_memory()
-{
-    int i_rv = 0;
-
-    if (!g_mng_memory_allocated)
-    {
-#if MM_DYNAMIC_MEMORY
-        g_p_mng_memory_raw =
-            (unsigned char *)malloc(g_mm_memory_size + HMM_ADDR_ALIGN_UNIT);
-
-        if (g_p_mng_memory_raw)
-        {
-            g_p_mng_memory = (unsigned char *)((((unsigned int)g_p_mng_memory_raw) +
-                                                HMM_ADDR_ALIGN_UNIT - 1) &
-                                               -(int)HMM_ADDR_ALIGN_UNIT);
-
-            _P(printf("[vpx][mm] total memory size:%d g_p_mng_memory_raw:0x%x g_p_mng_memory:0x%x\n"
-                      , g_mm_memory_size + HMM_ADDR_ALIGN_UNIT
-                      , (unsigned int)g_p_mng_memory_raw
-                      , (unsigned int)g_p_mng_memory);)
-        }
-        else
-        {
-            _P(printf("[vpx][mm] Couldn't allocate memory:%d for vpx memory manager.\n"
-                      , g_mm_memory_size);)
-
-            i_rv = -1;
-        }
-
-        if (g_p_mng_memory)
-#endif
-        {
-            int chunk_size = 0;
-
-            g_mng_memory_allocated = 1;
-
-            hmm_init(&hmm_d);
-
-            chunk_size = g_mm_memory_size >> SHIFT_HMM_ADDR_ALIGN_UNIT;
-
-            chunk_size -= DUMMY_END_BLOCK_BAUS;
-
-            _P(printf("[vpx][mm] memory size:%d for vpx memory manager. g_p_mng_memory:0x%x  chunk_size:%d\n"
-                      , g_mm_memory_size
-                      , (unsigned int)g_p_mng_memory
-                      , chunk_size);)
-
-            hmm_new_chunk(&hmm_d, (void *)g_p_mng_memory, chunk_size);
-        }
-
-#if MM_DYNAMIC_MEMORY
-        else
-        {
-            _P(printf("[vpx][mm] Couldn't allocate memory:%d for vpx memory manager.\n"
-                      , g_mm_memory_size);)
-
-            i_rv = -1;
-        }
-
-#endif
-    }
-
-    return i_rv;
-}
-
-static void *vpx_mm_realloc(void *memblk, size_t size)
-{
-    void *p_ret = NULL;
-
-    if (vpx_mm_create_heap_memory() < 0)
-    {
-        _P(printf("[vpx][mm] ERROR vpx_mm_realloc() Couldn't create memory for Heap.\n");)
-    }
-    else
-    {
-        int i_rv = 0;
-        int old_num_aaus;
-        int new_num_aaus;
-
-        old_num_aaus = hmm_true_size(memblk);
-        new_num_aaus = (size >> SHIFT_HMM_ADDR_ALIGN_UNIT) + 1;
-
-        if (old_num_aaus == new_num_aaus)
-        {
-            p_ret = memblk;
-        }
-        else
-        {
-            i_rv = hmm_resize(&hmm_d, memblk, new_num_aaus);
-
-            if (i_rv == 0)
-            {
-                p_ret = memblk;
-            }
-            else
-            {
-                /* Error. Try to malloc and then copy data. */
-                void *p_from_malloc;
-
-                new_num_aaus = (size >> SHIFT_HMM_ADDR_ALIGN_UNIT) + 1;
-                p_from_malloc  = hmm_alloc(&hmm_d, new_num_aaus);
-
-                if (p_from_malloc)
-                {
-                    vpx_memcpy(p_from_malloc, memblk, size);
-                    hmm_free(&hmm_d, memblk);
-
-                    p_ret = p_from_malloc;
-                }
-            }
-        }
-    }
-
-    return p_ret;
-}
-#endif //CONFIG_MEM_MANAGER
-
-#if USE_GLOBAL_FUNCTION_POINTERS
-# if CONFIG_MEM_TRACKER
-extern int vpx_memory_tracker_set_functions(g_malloc_func g_malloc_l
-        , g_calloc_func g_calloc_l
-        , g_realloc_func g_realloc_l
-        , g_free_func g_free_l
-        , g_memcpy_func g_memcpy_l
-        , g_memset_func g_memset_l
-        , g_memmove_func g_memmove_l);
-# endif
-#endif
-int vpx_mem_set_functions(g_malloc_func g_malloc_l
-                          , g_calloc_func g_calloc_l
-                          , g_realloc_func g_realloc_l
-                          , g_free_func g_free_l
-                          , g_memcpy_func g_memcpy_l
-                          , g_memset_func g_memset_l
-                          , g_memmove_func g_memmove_l)
-{
-#if USE_GLOBAL_FUNCTION_POINTERS
-
-    /* If use global functions is turned on then the
-    application must set the global functions before
-    it does anything else or vpx_mem will have
-    unpredictable results. */
-    if (!g_func)
-    {
-        g_func = (struct GLOBAL_FUNC_POINTERS *)g_malloc_l(sizeof(struct GLOBAL_FUNC_POINTERS));
-
-        if (!g_func)
-        {
-            return -1;
-        }
-    }
-
-#if CONFIG_MEM_TRACKER
-    {
-        int rv = 0;
-        rv = vpx_memory_tracker_set_functions(g_malloc_l
-                                              , g_calloc_l
-                                              , g_realloc_l
-                                              , g_free_l
-                                              , g_memcpy_l
-                                              , g_memset_l
-                                              , g_memmove_l);
-
-        if (rv < 0)
-        {
-            return rv;
-        }
-    }
-#endif
-
-    if (g_malloc_l)
-        g_func->g_malloc = g_malloc_l;
-    else
-        g_func->g_malloc = 0;
-
-    if (g_calloc_l)
-        g_func->g_calloc = g_calloc_l;
-    else
-        g_func->g_calloc = 0;
-
-    if (g_realloc_l)
-        g_func->g_realloc = g_realloc_l;
-    else
-        g_func->g_realloc = 0;
-
-    if (g_free_l)
-        g_func->g_free = g_free_l;
-    else
-        g_func->g_free = 0;
-
-    if (g_memcpy_l)
-        g_func->g_memcpy = g_memcpy_l;
-    else
-        g_func->g_memcpy = 0;
-
-    if (g_memset_l)
-        g_func->g_memset = g_memset_l;
-    else
-        g_func->g_memset = 0;
-
-    if (g_memmove_l)
-        g_func->g_memmove = g_memmove_l;
-    else
-        g_func->g_memmove = 0;
-
-    return 0;
-#else
-    (void)g_malloc_l;
-    (void)g_calloc_l;
-    (void)g_realloc_l;
-    (void)g_free_l;
-    (void)g_memcpy_l;
-    (void)g_memset_l;
-    (void)g_memmove_l;
-    return -1;
-#endif
-}
-
-int vpx_mem_unset_functions()
-{
-#if USE_GLOBAL_FUNCTION_POINTERS
-
-    if (g_func)
-    {
-        g_free_func temp_free;
-
-        temp_free = g_func->g_free;
-
-        temp_free(g_func);
-        g_func = 0;
-    }
-
-#endif
-    return 0;
-}
-
-#ifdef _INTEL_LINUX
-void *_intel_fast_memcpy(void *dest, const void *src, size_t count)
-{
-
-    //memcpy(dest, src, count);
-    char *dst8 = (char *)dest;
-    char *src8 = (char *)src;
-
-    while (count--)
-    {
-        *dst8++ = *src8++;
-    }
-
-    return dest;
-}
-
-void *_intel_fast_memset(void *dest, int c, size_t count)
-{
-    memset(dest, c, count);
-    return dest;
-}
-
-void *_VEC_memzero(void *dest, int c, size_t count)
-{
-    memset(dest, 0, count);
-    return dest;
-}
-
-#endif //_ICC

diff --git a/vpx_mem/intel_linux/vpx_mem_tracker.c b/vpx_mem/intel_linux/vpx_mem_tracker.c
deleted file mode 100644
index 5bed4b5..0000000
--- a/vpx_mem/intel_linux/vpx_mem_tracker.c
+++ /dev/null

@@ -1,812 +0,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-/*
-  vpx_mem_tracker.c
-
-  jwz 2003-09-30:
-   Stores a list of addreses, their size, and file and line they came from.
-   All exposed lib functions are prefaced by vpx_ and allow the global list
-   to be thread safe.
-   Current supported platforms are:
-    Linux, Win32, win_ce and vx_works
-   Further support can be added by defining the platform specific mutex
-   in the memory_tracker struct as well as calls to create/destroy/lock/unlock
-   the mutex in vpx_memory_tracker_init/Destroy and memory_tracker_lock_mutex/unlock_mutex
-*/
-
-#define NO_MUTEX
-
-#if defined(__uClinux__)
-# include <lddk.h>
-#endif
-
-#if defined(LINUX) || defined(__uClinux__)
-# include <pthread.h>
-#elif defined(WIN32) || defined(_WIN32_WCE)
-# define WIN32_LEAN_AND_MEAN
-# include <windows.h>
-# include <winbase.h>
-#elif defined(VXWORKS)
-# include <sem_lib.h>
-#elif defined(NDS_NITRO)
-# include <nitro.h>
-# include <nitro/os.h>
-#endif
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h> //VXWORKS doesn't have a malloc/memory.h file,
-//this should pull in malloc,free,etc.
-#include <stdarg.h>
-
-#include "vpx_mem_tracker.h"
-
-#undef vpx_malloc   //undefine any vpx_mem macros that may affect calls to
-#undef vpx_free     //memory functions in this file
-#undef vpx_memcpy
-#undef vpx_memset
-
-
-#ifndef USE_GLOBAL_FUNCTION_POINTERS
-# define USE_GLOBAL_FUNCTION_POINTERS   0  //use function pointers instead of compiled functions.
-#endif
-
-#if USE_GLOBAL_FUNCTION_POINTERS
-static mem_track_malloc_func g_malloc   = malloc;
-static mem_track_calloc_func g_calloc   = calloc;
-static mem_track_realloc_func g_realloc = realloc;
-static mem_track_free_func g_free       = free;
-static mem_track_memcpy_func g_memcpy   = memcpy;
-static mem_track_memset_func g_memset   = memset;
-static mem_track_memmove_func g_memmove = memmove;
-# define MEM_TRACK_MALLOC g_malloc
-# define MEM_TRACK_FREE   g_free
-# define MEM_TRACK_MEMCPY g_memcpy
-# define MEM_TRACK_MEMSET g_memset
-#else
-# define MEM_TRACK_MALLOC vpx_malloc
-# define MEM_TRACK_FREE   vpx_free
-# define MEM_TRACK_MEMCPY vpx_memcpy
-# define MEM_TRACK_MEMSET vpx_memset
-#endif // USE_GLOBAL_FUNCTION_POINTERS
-
-
-struct memory_tracker
-{
-    struct mem_block *head,
-            * tail;
-    int len,
-        totalsize;
-    unsigned int current_allocated,
-             max_allocated;
-
-#if defined(LINUX) || defined(__uClinux__)
-    pthread_mutex_t mutex;
-#elif defined(WIN32) || defined(_WIN32_WCE)
-    HANDLE mutex;
-#elif defined(VXWORKS)
-    SEM_ID mutex;
-#elif defined(NDS_NITRO)
-    OSMutex mutex;
-#elif defined(NO_MUTEX)
-#else
-#error "No mutex type defined for this platform!"
-#endif
-
-    int padding_size,
-        pad_value;
-};
-
-/* prototypes for internal library functions */
-static void memtrack_log(const char *fmt, ...);
-static void memory_tracker_dump();
-static void memory_tracker_check_integrity(char *file, unsigned int line);
-static void memory_tracker_add(size_t addr, unsigned int size,
-                               char *file, unsigned int line,
-                               int padded);
-static int memory_tracker_remove(size_t addr);
-static struct mem_block *memory_tracker_find(size_t addr);
-
-#if defined(NO_MUTEX)
-# define memory_tracker_lock_mutex() (!g_b_mem_tracker_inited)
-# define memory_tracker_unlock_mutex()
-#else
-static int memory_tracker_lock_mutex();
-static int memory_tracker_unlock_mutex();
-#endif
-
-static struct memory_tracker memtrack;   //our global memory allocation list
-static int g_b_mem_tracker_inited = 0;     //indicates whether the global list has
-//been initialized (1:yes/0:no)
-static struct
-{
-    FILE *file;
-    int type;
-    void (*func)(void *userdata, const char *fmt, va_list args);
-    void *userdata;
-} g_logging = {0};
-
-extern void *vpx_malloc(size_t size);
-extern void vpx_free(void *memblk);
-extern void *vpx_memcpy(void *dest, const void *src, size_t length);
-extern void *vpx_memset(void *dest, int val, size_t length);
-
-/*
- *
- * Exposed library functions
- *
-*/
-
-/*
-    vpx_memory_tracker_init(int padding_size, int pad_value)
-      padding_size - the size of the padding before and after each mem addr.
-                     Values > 0 indicate that integrity checks can be performed
-                     by inspecting these areas.
-      pad_value - the initial value within the padding area before and after
-                  each mem addr.
-
-    Initializes global memory tracker structure
-    Allocates the head of the list
-*/
-int vpx_memory_tracker_init(int padding_size, int pad_value)
-{
-    if (!g_b_mem_tracker_inited)
-    {
-        if (memtrack.head = (struct mem_block *)MEM_TRACK_MALLOC(sizeof(struct mem_block)))
-        {
-            int ret;
-
-            MEM_TRACK_MEMSET(memtrack.head, 0, sizeof(struct mem_block));
-
-            memtrack.tail = memtrack.head;
-
-            memtrack.current_allocated = 0;
-            memtrack.max_allocated     = 0;
-
-            memtrack.padding_size = padding_size;
-            memtrack.pad_value    = pad_value;
-
-#if defined(LINUX) || defined(__uClinux__)
-            ret = pthread_mutex_init(&memtrack.mutex,
-                                     NULL);            /*mutex attributes (NULL=default)*/
-#elif defined(WIN32) || defined(_WIN32_WCE)
-            memtrack.mutex = create_mutex(NULL,   /*security attributes*/
-                                          FALSE,  /*we don't want initial ownership*/
-                                          NULL);  /*mutex name*/
-            ret = !memtrack.mutex;
-#elif defined(VXWORKS)
-            memtrack.mutex = sem_bcreate(SEM_Q_FIFO, /*SEM_Q_FIFO non-priority based mutex*/
-                                         SEM_FULL);  /*SEM_FULL initial state is unlocked*/
-            ret = !memtrack.mutex;
-#elif defined(NDS_NITRO)
-            os_init_mutex(&memtrack.mutex);
-            ret = 0;
-#elif defined(NO_MUTEX)
-            ret = 0;
-#endif
-
-            if (ret)
-            {
-                memtrack_log("vpx_memory_tracker_init: Error creating mutex!\n");
-
-                MEM_TRACK_FREE(memtrack.head);
-                memtrack.head = NULL;
-            }
-            else
-            {
-                memtrack_log("Memory Tracker init'd, v."vpx_mem_tracker_version" pad_size:%d pad_val:0x%x %d\n"
-                             , padding_size
-                             , pad_value
-                             , pad_value);
-                g_b_mem_tracker_inited = 1;
-            }
-        }
-    }
-
-    return g_b_mem_tracker_inited;
-}
-
-/*
-    vpx_memory_tracker_destroy()
-    If our global struct was initialized zeros out all its members,
-    frees memory and destroys it's mutex
-*/
-void vpx_memory_tracker_destroy()
-{
-
-    if (!memory_tracker_lock_mutex())
-    {
-        struct mem_block *p  = memtrack.head,
-                                  * p2 = memtrack.head;
-
-        memory_tracker_dump();
-
-        while (p)
-    {
-            p2 = p;
-            p  = p->next;
-
-            MEM_TRACK_FREE(p2);
-        }
-
-        memtrack.head              = NULL;
-        memtrack.tail              = NULL;
-        memtrack.len               = 0;
-        memtrack.current_allocated = 0;
-        memtrack.max_allocated     = 0;
-
-        if ((g_logging.type == 0) && (g_logging.file != 0)) //&& (g_logging.file != stderr) )
-        {
-#if !defined(NDS_NITRO)
-            fclose(g_logging.file);
-#endif
-            g_logging.file = NULL;
-        }
-
-        memory_tracker_unlock_mutex();
-
-        g_b_mem_tracker_inited = 0;
-
-    }
-
-}
-
-/*
-    vpx_memory_tracker_add(size_t addr, unsigned int size,
-                         char * file, unsigned int line)
-      addr - memory address to be added to list
-      size - size of addr
-      file - the file addr was referenced from
-      line - the line in file addr was referenced from
-    Adds memory address addr, it's size, file and line it came from
-    to the global list via the thread safe internal library function
-*/
-void vpx_memory_tracker_add(size_t addr, unsigned int size,
-                            char *file, unsigned int line,
-                            int padded)
-{
-    memory_tracker_add(addr, size, file, line, padded);
-}
-
-/*
-    vpx_memory_tracker_remove(size_t addr)
-      addr - memory address to be removed from list
-    Removes addr from the global list via the thread safe
-    internal remove function
-    Return:
-      Same as described for memory_tracker_remove
-*/
-int vpx_memory_tracker_remove(size_t addr)
-{
-    return memory_tracker_remove(addr);
-}
-
-/*
-    vpx_memory_tracker_find(size_t addr)
-      addr - address to be found in list
-    Return:
-        If found, pointer to the memory block that matches addr
-        NULL otherwise
-*/
-struct mem_block *vpx_memory_tracker_find(size_t addr)
-{
-    struct mem_block *p = NULL;
-
-    if (!memory_tracker_lock_mutex())
-    {
-        p = memory_tracker_find(addr);
-        memory_tracker_unlock_mutex();
-    }
-
-    return p;
-}
-
-/*
-    vpx_memory_tracker_dump()
-    Locks the memory tracker's mutex and calls the internal
-    library function to dump the current contents of the
-    global memory allocation list
-*/
-void vpx_memory_tracker_dump()
-{
-    if (!memory_tracker_lock_mutex())
-    {
-        memory_tracker_dump();
-        memory_tracker_unlock_mutex();
-    }
-}
-
-/*
-    vpx_memory_tracker_check_integrity(char* file, unsigned int line)
-      file - The file name where the check was placed
-      line - The line in file where the check was placed
-    Locks the memory tracker's mutex and calls the internal
-    integrity check function to inspect every address in the global
-    memory allocation list
-*/
-void vpx_memory_tracker_check_integrity(char *file, unsigned int line)
-{
-    if (!memory_tracker_lock_mutex())
-    {
-        memory_tracker_check_integrity(file, line);
-        memory_tracker_unlock_mutex();
-    }
-}
-
-/*
-    vpx_memory_tracker_set_log_type
-    Sets the logging type for the memory tracker. Based on the value it will
-    direct its output to the appropriate place.
-    Return:
-      0: on success
-      -1: if the logging type could not be set, because the value was invalid
-          or because a file could not be opened
-*/
-int vpx_memory_tracker_set_log_type(int type, char *option)
-{
-    int ret = -1;
-
-
-    switch (type)
-    {
-    case 0:
-        g_logging.type = 0;
-
-        if (!option)
-        {
-            // g_logging.file = stderr;
-            ret = 0;
-        }
-
-#if !defined(NDS_NITRO)
-        else
-        {
-            if (g_logging.file = fopen((char *)option, "w"))
-                ret = 0;
-        }
-
-#endif
-        break;
-#if defined(WIN32) && !defined(_WIN32_WCE)
-    case 1:
-        g_logging.type = type;
-        ret = 0;
-        break;
-#endif
-    default:
-        break;
-    }
-
-    //output the version to the new logging destination
-    if (!ret)
-        memtrack_log("Memory Tracker logging initialized, "
-                     "Memory Tracker v."vpx_mem_tracker_version"\n");
-
-    return ret;
-}
-
-/*
-    vpx_memory_tracker_set_log_func
-    Sets a logging function to be used by the memory tracker.
-    Return:
-      0: on success
-      -1: if the logging type could not be set because logfunc was NULL
-*/
-int vpx_memory_tracker_set_log_func(void *userdata,
-                                    void(*logfunc)(void *userdata,
-                                            const char *fmt, va_list args))
-{
-    int ret = -1;
-
-    if (logfunc)
-    {
-        g_logging.type     = -1;
-        g_logging.userdata = userdata;
-        g_logging.func     = logfunc;
-        ret = 0;
-    }
-
-    //output the version to the new logging destination
-    if (!ret)
-        memtrack_log("Memory Tracker logging initialized, "
-                     "Memory Tracker v."vpx_mem_tracker_version"\n");
-
-    return ret;
-}
-
-/*
- *
- * END - Exposed library functions
- *
-*/
-
-
-/*
- *
- * Internal library functions
- *
-*/
-
-static void memtrack_log(const char *fmt, ...)
-{
-    va_list list;
-
-    va_start(list, fmt);
-
-    switch (g_logging.type)
-    {
-    case -1:
-
-        if (g_logging.func)
-            g_logging.func(g_logging.userdata, fmt, list);
-
-        break;
-    case 0:
-
-        if (g_logging.file)
-        {
-            vfprintf(g_logging.file, fmt, list);
-            fflush(g_logging.file);
-        }
-
-        break;
-#if defined(WIN32) && !defined(_WIN32_WCE)
-    case 1:
-    {
-        char temp[1024];
-        _vsnprintf(temp, sizeof(temp) / sizeof(char) - 1, fmt, list);
-        output_debug_string(temp);
-    }
-    break;
-#endif
-    default:
-        break;
-    }
-
-    va_end(list);
-}
-
-/*
-    memory_tracker_dump()
-    Dumps the current contents of the global memory allocation list
-*/
-static void memory_tracker_dump()
-{
-    int i = 0;
-    struct mem_block *p = (memtrack.head ? memtrack.head->next : NULL);
-
-    memtrack_log("\n_currently Allocated= %d; Max allocated= %d\n",
-                 memtrack.current_allocated, memtrack.max_allocated);
-
-    while (p)
-    {
-#if defined(WIN32) && !defined(_WIN32_WCE)
-
-        /*when using outputdebugstring, output filenames so they
-          can be clicked to be opened in visual studio*/
-        if (g_logging.type == 1)
-            memtrack_log("memblocks[%d].addr= 0x%.8x, memblocks[%d].size= %d, file:\n"
-                         "  %s(%d):\n", i,
-                         p->addr, i, p->size,
-                         p->file, p->line);
-        else
-#endif
-            memtrack_log("memblocks[%d].addr= 0x%.8x, memblocks[%d].size= %d, file: %s, line: %d\n", i,
-                         p->addr, i, p->size,
-                         p->file, p->line);
-
-        p = p->next;
-        ++i;
-    }
-
-    memtrack_log("\n");
-}
-
-/*
-    memory_tracker_check_integrity(char* file, unsigned int file)
-      file - the file name where the check was placed
-      line - the line in file where the check was placed
-    If a padding_size was supplied to vpx_memory_tracker_init()
-    this function will check ea. addr in the list verifying that
-    addr-padding_size and addr+padding_size is filled with pad_value
-*/
-static void memory_tracker_check_integrity(char *file, unsigned int line)
-{
-    if (memtrack.padding_size)
-    {
-        int i,
-            index = 0;
-        unsigned char *p_show_me,
-                 * p_show_me2;
-        unsigned int tempme = memtrack.pad_value,
-                     dead1,
-                     dead2;
-        unsigned char *x_bounds;
-        struct mem_block *p = memtrack.head->next;
-
-        while (p)
-        {
-            //x_bounds = (unsigned char*)p->addr;
-            //back up VPX_BYTE_ALIGNMENT
-            //x_bounds -= memtrack.padding_size;
-
-            if (p->padded)   // can the bounds be checked?
-            {
-                /*yes, move to the address that was actually allocated
-                by the vpx_* calls*/
-                x_bounds = (unsigned char *)(((size_t *)p->addr)[-1]);
-
-                for (i = 0; i < memtrack.padding_size; i += sizeof(unsigned int))
-                {
-                    p_show_me = (x_bounds + i);
-                    p_show_me2 = (unsigned char *)(p->addr + p->size + i);
-
-                    MEM_TRACK_MEMCPY(&dead1, p_show_me, sizeof(unsigned int));
-                    MEM_TRACK_MEMCPY(&dead2, p_show_me2, sizeof(unsigned int));
-
-                    if ((dead1 != tempme) || (dead2 != tempme))
-                    {
-                        memtrack_log("\n[vpx_mem integrity check failed]:\n"
-                                     "    index[%d] {%s:%d} addr=0x%x, size=%d,"
-                                     " file: %s, line: %d c0:0x%x c1:0x%x\n",
-                                     index, file, line, p->addr, p->size, p->file,
-                                     p->line, dead1, dead2);
-                    }
-                }
-            }
-
-            ++index;
-            p = p->next;
-        }
-    }
-}
-
-/*
-    memory_tracker_add(size_t addr, unsigned int size,
-                     char * file, unsigned int line)
-    Adds an address (addr), it's size, file and line number to our list.
-    Adjusts the total bytes allocated and max bytes allocated if necessary.
-    If memory cannot be allocated the list will be destroyed.
-*/
-void memory_tracker_add(size_t addr, unsigned int size,
-                        char *file, unsigned int line,
-                        int padded)
-{
-    if (!memory_tracker_lock_mutex())
-    {
-        struct mem_block *p;
-
-        p = MEM_TRACK_MALLOC(sizeof(struct mem_block));
-
-        if (p)
-        {
-            p->prev       = memtrack.tail;
-            p->prev->next = p;
-            p->addr       = addr;
-            p->size       = size;
-            p->line       = line;
-            p->file       = file;
-            p->padded     = padded;
-            p->next       = NULL;
-
-            memtrack.tail = p;
-
-            memtrack.current_allocated += size;
-
-            if (memtrack.current_allocated > memtrack.max_allocated)
-                memtrack.max_allocated = memtrack.current_allocated;
-
-            //memtrack_log("memory_tracker_add: added addr=0x%.8x\n", addr);
-
-            memory_tracker_unlock_mutex();
-        }
-        else
-        {
-            memtrack_log("memory_tracker_add: error allocating memory!\n");
-            memory_tracker_unlock_mutex();
-            vpx_memory_tracker_destroy();
-        }
-    }
-}
-
-/*
-    memory_tracker_remove(size_t addr)
-    Removes an address and its corresponding size (if they exist)
-    from the memory tracker list and adjusts the current number
-    of bytes allocated.
-    Return:
-      0: on success
-      -1: if the mutex could not be locked
-      -2: if the addr was not found in the list
-*/
-int memory_tracker_remove(size_t addr)
-{
-    int ret = -1;
-
-    if (!memory_tracker_lock_mutex())
-    {
-        struct mem_block *p;
-
-        if (p = memory_tracker_find(addr))
-        {
-            memtrack.current_allocated -= p->size;
-
-            p->prev->next = p->next;
-
-            if (p->next)
-                p->next->prev = p->prev;
-            else
-                memtrack.tail = p->prev;
-
-            ret = 0;
-            MEM_TRACK_FREE(p);
-        }
-        else
-        {
-            memtrack_log("memory_tracker_remove(): addr not found in list, 0x%.8x\n", addr);
-            ret = -2;
-        }
-
-        memory_tracker_unlock_mutex();
-    }
-
-    return ret;
-}
-
-/*
-    memory_tracker_find(size_t addr)
-    Finds an address in our addrs list
-    NOTE: the mutex MUST be locked in the other internal
-          functions before calling this one. This avoids
-          the need for repeated locking and unlocking as in Remove
-    Returns: pointer to the mem block if found, NULL otherwise
-*/
-static struct mem_block *memory_tracker_find(size_t addr)
-{
-    struct mem_block *p = NULL;
-
-    if (memtrack.head)
-    {
-        p = memtrack.head->next;
-
-        while (p && (p->addr != addr))
-            p = p->next;
-    }
-
-    return p;
-}
-
-
-#if !defined(NO_MUTEX)
-/*
-    memory_tracker_lock_mutex()
-    Locks the memory tracker mutex with a platform specific call
-    Returns:
-        0: Success
-       <0: Failure, either the mutex was not initialized
-           or the call to lock the mutex failed
-*/
-static int memory_tracker_lock_mutex()
-{
-    int ret = -1;
-
-    if (g_b_mem_tracker_inited)
-    {
-
-#if defined(LINUX) || defined(__uClinux__)
-        ret = pthread_mutex_lock(&memtrack.mutex);
-#elif defined(WIN32) || defined(_WIN32_WCE)
-        ret = WaitForSingleObject(memtrack.mutex, INFINITE);
-#elif defined(VXWORKS)
-        ret = sem_take(memtrack.mutex, WAIT_FOREVER);
-#elif defined(NDS_NITRO)
-        os_lock_mutex(&memtrack.mutex);
-        ret = 0;
-#endif
-
-        if (ret)
-        {
-            memtrack_log("memory_tracker_lock_mutex: mutex lock failed\n");
-        }
-    }
-
-    return ret;
-}
-
-/*
-    memory_tracker_unlock_mutex()
-    Unlocks the memory tracker mutex with a platform specific call
-    Returns:
-        0: Success
-       <0: Failure, either the mutex was not initialized
-           or the call to unlock the mutex failed
-*/
-static int memory_tracker_unlock_mutex()
-{
-    int ret = -1;
-
-    if (g_b_mem_tracker_inited)
-    {
-
-#if defined(LINUX) || defined(__uClinux__)
-        ret = pthread_mutex_unlock(&memtrack.mutex);
-#elif defined(WIN32) || defined(_WIN32_WCE)
-        ret = !release_mutex(memtrack.mutex);
-#elif defined(VXWORKS)
-        ret = sem_give(memtrack.mutex);
-#elif defined(NDS_NITRO)
-        os_unlock_mutex(&memtrack.mutex);
-        ret = 0;
-#endif
-
-        if (ret)
-        {
-            memtrack_log("memory_tracker_unlock_mutex: mutex unlock failed\n");
-        }
-    }
-
-    return ret;
-}
-#endif
-
-/*
-    vpx_memory_tracker_set_functions
-
-    Sets the function pointers for the standard library functions.
-
-    Return:
-      0: on success
-      -1: if the use global function pointers is not set.
-*/
-int vpx_memory_tracker_set_functions(mem_track_malloc_func g_malloc_l
-                                     , mem_track_calloc_func g_calloc_l
-                                     , mem_track_realloc_func g_realloc_l
-                                     , mem_track_free_func g_free_l
-                                     , mem_track_memcpy_func g_memcpy_l
-                                     , mem_track_memset_func g_memset_l
-                                     , mem_track_memmove_func g_memmove_l)
-{
-#if USE_GLOBAL_FUNCTION_POINTERS
-
-    if (g_malloc_l)
-        g_malloc = g_malloc_l;
-
-    if (g_calloc_l)
-        g_calloc = g_calloc_l;
-
-    if (g_realloc_l)
-        g_realloc = g_realloc_l;
-
-    if (g_free_l)
-        g_free = g_free_l;
-
-    if (g_memcpy_l)
-        g_memcpy = g_memcpy_l;
-
-    if (g_memset_l)
-        g_memset = g_memset_l;
-
-    if (g_memmove_l)
-        g_memmove = g_memmove_l;
-
-    return 0;
-#else
-    (void)g_malloc_l;
-    (void)g_calloc_l;
-    (void)g_realloc_l;
-    (void)g_free_l;
-    (void)g_memcpy_l;
-    (void)g_memset_l;
-    (void)g_memmove_l;
-    return -1;
-#endif
-}

diff --git a/vpx_mem/nds/vpx_mem_nds.c b/vpx_mem/nds/vpx_mem_nds.c
deleted file mode 100644
index 11ac95c..0000000
--- a/vpx_mem/nds/vpx_mem_nds.c
+++ /dev/null

@@ -1,66 +0,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#define __VPX_MEM_C__
-#include "vpx_mem.h"
-#include <nitro.h>
-#include "vpx_mem_intrnl.h"
-
-// Allocate memory from the Arena specified by id.  Align it to
-//  the value specified by align.
-void *vpx_mem_nds_alloc(osarena_id id, osheap_handle handle, size_t size, size_t align)
-{
-    void *addr,
-         * x = NULL;
-
-    addr = os_alloc_from_heap((osarena_id) id, handle,
-                              size + align - 1 + ADDRESS_STORAGE_SIZE);
-
-    if (addr)
-    {
-        x = align_addr((unsigned char *)addr + ADDRESS_STORAGE_SIZE, (int)align);
-
-        // save the actual malloc address
-        ((size_t *)x)[-1] = (size_t)addr;
-    }
-
-    return x;
-}
-
-// Free them memory allocated by vpx_mem_nds_alloc
-void vpx_mem_nds_free(osarena_id id, osheap_handle handle, void *mem)
-{
-    if (mem)
-    {
-        void *addr = (void *)(((size_t *)mem)[-1]);
-        os_free_to_heap(id, handle, addr);
-    }
-}
-
-int vpx_nds_alloc_heap(osarena_id id, u32 size)
-{
-    osheap_handle    arena_handle;
-    void           *nstart;
-    void           *heap_start;
-
-    nstart = os_init_alloc(id, os_get_arena_lo(id), os_get_arena_hi(id), 1);
-    os_set_arena_lo(id, nstart);
-
-    heap_start = os_alloc_from_arena_lo(id, size, 32);
-    arena_handle = os_create_heap(id, heap_start, (void *)((u32)heap_start + size));
-
-    if (os_check_heap(id, arena_handle) == -1)
-        return -1;      //ERROR: DTCM heap is not consistent
-
-    (void)os_set_current_heap(id, arena_handle);
-
-    return arena_handle;
-}

diff --git a/vpx_mem/ti_c6x/vpx_mem_ti_6cx.c b/vpx_mem/ti_c6x/vpx_mem_ti_6cx.c
deleted file mode 100644
index d55b7d9..0000000
--- a/vpx_mem/ti_c6x/vpx_mem_ti_6cx.c
+++ /dev/null

@@ -1,116 +0,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#define __VPX_MEM_C__
-
-#include "..\include\vpx_mem.h"
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include "..\include\vpx_mem_intrnl.h"
-
-void *vpx_mem_alloc(int id, size_t size, size_t align)
-{
-#if defined CHIP_DM642 || defined __uClinux__
-    void *mem = (void *)mem_alloc(id, size, align);
-
-    if (!mem)
-    {
-        _P(fprintf(stderr,
-                   "\n"
-                   "*********************************************************\n"
-                   "WARNING: mem_alloc returned 0 for id=%p size=%u align=%u.\n"
-                   "*********************************************************\n",
-                   mem, size, align));
-        // should no longer need this.  Softier says it's fixed. 2005-01-21 tjf
-        //#if defined __uClinux__
-        //while(1)usleep(1000000);
-        //#endif
-    }
-
-#if defined __uClinux__
-    else if (mem == (void *)0xFFFFFFFF)
-    {
-        // out of memory/error
-        mem = (void *)0;
-
-        _P(fprintf(stderr,
-                   "\n"
-                   "******************************************************\n"
-                   "ERROR: mem_alloc id=%p size=%u align=%u OUT OF MEMORY.\n"
-                   "******************************************************\n",
-                   mem, size, align));
-    }
-
-#endif  // __uClinux__
-
-    return mem;
-#else
-    (void)id;
-    (void)size;
-    (void)align;
-    return (void *)0;
-#endif
-}
-
-void vpx_mem_free(int id, void *mem, size_t size)
-{
-#if defined CHIP_DM642 || defined __uClinux__
-
-    if (!mem)
-    {
-        _P(fprintf(stderr,
-                   "\n"
-                   "**************************************\n"
-                   "WARNING: 0 being free'd id=%p size=%u.\n"
-                   "**************************************\n",
-                   id, size));
-
-        // should no longer need this.  Softier says it's fixed. 2005-01-21 tjf
-        //#if defined __uClinux__
-        //while(1)usleep(1000000);
-        //#endif
-    }
-
-    mem_free(id, mem, size);
-#else
-    (void)id;
-    (void)mem;
-    (void)size;
-#endif
-}
-
-#if CONFIG_MEM_TRACKER
-void *xvpx_mem_alloc(int id, size_t size, size_t align, char *file, int line)
-{
-    void *mem = vpx_mem_alloc(id, size, align);
-
-    vpx_memory_tracker_add((size_t)mem, size, file, line, 0);
-
-    return mem;
-}
-
-void xvpx_mem_free(int id, void *mem, size_t size, char *file, int line)
-{
-    if (vpx_memory_tracker_remove((size_t)mem) == -2)
-    {
-#if REMOVE_PRINTFS
-        (void)file;
-        (void)line;
-#endif
-        _P(fprintf(stderr, "[vpx_mem][xvpx_mem_free] addr: %p (id=%p size=%u) "
-                   "not found in list; freed from file:%s"
-                   " line:%d\n", mem, id, size, file, line));
-    }
-
-    vpx_mem_free(id, mem, size);
-}
-#endif /*CONFIG_MEM_TRACKER*/

diff --git a/vpx_ports/mem_ops.h b/vpx_ports/mem_ops.h
index c178b8b..0e52368 100644
--- a/vpx_ports/mem_ops.h
+++ b/vpx_ports/mem_ops.h

@@ -9,10 +9,10 @@
  */
 
 
-/* \file mem_ops.h
- * \brief Provides portable memory access primatives
+/* \file
+ * \brief Provides portable memory access primitives
  *
- * This function provides portable primatives for getting and setting of
+ * This function provides portable primitives for getting and setting of
  * signed and unsigned integers in 16, 24, and 32 bit sizes. The operations
  * can be performed on unaligned data regardless of hardware support for
  * unaligned accesses.

diff --git a/vpx_ports/mem_ops_aligned.h b/vpx_ports/mem_ops_aligned.h
index 4c44aa2..82a18b2 100644
--- a/vpx_ports/mem_ops_aligned.h
+++ b/vpx_ports/mem_ops_aligned.h

@@ -9,12 +9,12 @@
  */
 
 
-/* \file mem_ops_aligned.h
- * \brief Provides portable memory access primatives for operating on aligned
+/* \file
+ * \brief Provides portable memory access primitives for operating on aligned
  *        data
  *
- * This file is split from mem_ops.h for easier maintainence. See mem_ops.h
- * for a more detailed description of these primatives.
+ * This file is split from mem_ops.h for easier maintenance. See mem_ops.h
+ * for a more detailed description of these primitives.
  */
 #ifndef INCLUDED_BY_MEM_OPS_H
 #error Include mem_ops.h, not mem_ops_aligned.h directly.

diff --git a/vpx_ports/x86_abi_support.asm b/vpx_ports/x86_abi_support.asm
index a872b28..be64cd7 100644
--- a/vpx_ports/x86_abi_support.asm
+++ b/vpx_ports/x86_abi_support.asm

@@ -168,15 +168,10 @@
     %macro GET_GOT 1
       push %1
       call %%get_got
-      %%sub_offset:
-      jmp  %%exitGG
       %%get_got:
-      mov  %1, [esp]
-      add %1, fake_got - %%sub_offset
-      ret
-      %%exitGG:
+      pop  %1
       %undef GLOBAL
-      %define GLOBAL(x) x + %1 - fake_got
+      %define GLOBAL(x) x + %1 - %%get_got
       %undef RESTORE_GOT
       %define RESTORE_GOT pop %1
     %endmacro
@@ -289,7 +284,6 @@
 %elifidn __OUTPUT_FORMAT__,macho32
 %macro SECTION_RODATA 0
 section .text
-fake_got:
 %endmacro
 %else
 %define SECTION_RODATA section .rodata

diff --git a/vpx_scale/arm/neon/vp8_vpxyv12_copyframe_func_neon.asm b/vpx_scale/arm/neon/vp8_vpxyv12_copyframe_func_neon.asm
index 24d46cb..e6bb486 100644
--- a/vpx_scale/arm/neon/vp8_vpxyv12_copyframe_func_neon.asm
+++ b/vpx_scale/arm/neon/vp8_vpxyv12_copyframe_func_neon.asm

@@ -14,7 +14,7 @@
     REQUIRE8
     PRESERVE8
 
-    INCLUDE vpx_asm_offsets.asm
+    INCLUDE asm_com_offsets.asm
 
     AREA ||.text||, CODE, READONLY, ALIGN=2
 

diff --git a/vpx_scale/arm/neon/vp8_vpxyv12_copyframeyonly_neon.asm b/vpx_scale/arm/neon/vp8_vpxyv12_copyframeyonly_neon.asm
index 6534827..febccc2 100644
--- a/vpx_scale/arm/neon/vp8_vpxyv12_copyframeyonly_neon.asm
+++ b/vpx_scale/arm/neon/vp8_vpxyv12_copyframeyonly_neon.asm

@@ -16,7 +16,7 @@
     REQUIRE8
     PRESERVE8
 
-    INCLUDE vpx_asm_offsets.asm
+    INCLUDE asm_com_offsets.asm
 
     AREA ||.text||, CODE, READONLY, ALIGN=2
 ;void vpxyv12_copy_frame_yonly(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc);

diff --git a/vpx_scale/arm/neon/vp8_vpxyv12_copysrcframe_func_neon.asm b/vpx_scale/arm/neon/vp8_vpxyv12_copysrcframe_func_neon.asm
index dfc8db5..ec64dbc 100644
--- a/vpx_scale/arm/neon/vp8_vpxyv12_copysrcframe_func_neon.asm
+++ b/vpx_scale/arm/neon/vp8_vpxyv12_copysrcframe_func_neon.asm

@@ -14,7 +14,7 @@
     REQUIRE8
     PRESERVE8
 
-    INCLUDE vpx_asm_offsets.asm
+    INCLUDE asm_com_offsets.asm
 
     AREA ||.text||, CODE, READONLY, ALIGN=2
 ;Note: This function is used to copy source data in src_buffer[i] at beginning of

diff --git a/vpx_scale/arm/neon/vp8_vpxyv12_extendframeborders_neon.asm b/vpx_scale/arm/neon/vp8_vpxyv12_extendframeborders_neon.asm
index e475b92..b0a3b93 100644
--- a/vpx_scale/arm/neon/vp8_vpxyv12_extendframeborders_neon.asm
+++ b/vpx_scale/arm/neon/vp8_vpxyv12_extendframeborders_neon.asm

@@ -14,7 +14,7 @@
     REQUIRE8
     PRESERVE8
 
-    INCLUDE vpx_asm_offsets.asm
+    INCLUDE asm_com_offsets.asm
 
     AREA ||.text||, CODE, READONLY, ALIGN=2
 ;void vp8_yv12_extend_frame_borders_neon (YV12_BUFFER_CONFIG *ybf);

diff --git a/vpx_scale/arm/scalesystemdependant.c b/vpx_scale/arm/scalesystemdependent.c
similarity index 100%
rename from vpx_scale/arm/scalesystemdependant.c
rename to vpx_scale/arm/scalesystemdependent.c


diff --git a/vpx_scale/blackfin/yv12config.c b/vpx_scale/blackfin/yv12config.c
deleted file mode 100644
index 42538af..0000000
--- a/vpx_scale/blackfin/yv12config.c
+++ /dev/null

@@ -1,120 +0,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-/****************************************************************************
- *
- *   Module Title :     yv12config.c
- *
- *   Description  :
- *
- ***************************************************************************/
-
-/****************************************************************************
-*  Header Files
-****************************************************************************/
-#include "vpx_scale/yv12config.h"
-#include "vpx_mem/vpx_mem.h"
-
-#include <cdef_bf533.h>
-
-/****************************************************************************
-*  Imports
-****************************************************************************/
-void
-extend_memset(void *dst, unsigned char value, unsigned int size);
-
-/****************************************************************************
- *
- ****************************************************************************/
-int
-vp8_yv12_de_alloc_frame_buffer(YV12_BUFFER_CONFIG *ybf)
-{
-    if (ybf)
-    {
-        if (ybf->buffer_alloc)
-        {
-            duck_free(ybf->buffer_alloc);
-        }
-
-        ybf->buffer_alloc = 0;
-    }
-    else
-    {
-        return -1;
-    }
-
-    return 0;
-}
-
-/****************************************************************************
- *
- ****************************************************************************/
-int
-vp8_yv12_alloc_frame_buffer(YV12_BUFFER_CONFIG *ybf, int width, int height, int border)
-{
-//NOTE:
-
-    int yplane_size = (height + 2 * border) * (width + 2 * border);
-    int uvplane_size = (height / 2 + border) * (width / 2 + border);
-
-    if (ybf)
-    {
-        vp8_yv12_de_alloc_frame_buffer(ybf);
-
-        ybf->y_width  = width;
-        ybf->y_height = height;
-        ybf->y_stride = width + 2 * border;
-
-        ybf->uv_width = width / 2;
-        ybf->uv_height = height / 2;
-        ybf->uv_stride = ybf->uv_width + border;
-
-        ybf->border = border;
-
-        // Added 2 extra lines to framebuffer so that copy12x12 doesn't fail
-        // when we have a large motion vector in V on the last v block.
-        // Note : We never use these pixels anyway so this doesn't hurt.
-        ybf->buffer_alloc = (unsigned char *) duck_memalign(32, (yplane_size * 3 / 2) +  ybf->y_stride , 0);
-
-        if (ybf->buffer_alloc == NULL)
-            return -1;
-
-        ybf->y_buffer = ybf->buffer_alloc + border * ybf->y_stride + border;
-        ybf->u_buffer = ybf->buffer_alloc + yplane_size + border / 2  * ybf->uv_stride + border / 2;
-        ybf->v_buffer = ybf->buffer_alloc + yplane_size + uvplane_size + border / 2  * ybf->uv_stride + border / 2;
-    }
-    else
-    {
-        return -2;
-    }
-
-    return 0;
-}
-/****************************************************************************
- *
- ****************************************************************************/
-int
-vp8_yv12_black_frame_buffer(YV12_BUFFER_CONFIG *ybf)
-{
-    if (ybf)
-    {
-        if (ybf->buffer_alloc)
-        {
-            extend_memset(ybf->y_buffer, 0x0, ybf->y_stride *(ybf->y_height + 2 * ybf->border));
-            extend_memset(ybf->u_buffer, 0x80, ybf->uv_stride *(ybf->uv_height + ybf->border));
-            extend_memset(ybf->v_buffer, 0x80, ybf->uv_stride *(ybf->uv_height + ybf->border));
-        }
-
-        return 0;
-    }
-
-    return -1;
-}

diff --git a/vpx_scale/blackfin/yv12extend.c b/vpx_scale/blackfin/yv12extend.c
deleted file mode 100644
index cfd3de0..0000000
--- a/vpx_scale/blackfin/yv12extend.c
+++ /dev/null

@@ -1,350 +0,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-/****************************************************************************
- *
- *   Module Title :     yv12extend.c
- *
- *   Description  :
- *
- ***************************************************************************/
-
-/****************************************************************************
-*  Header Files
-****************************************************************************/
-#include <cdef_bf533.h>
-
-#include "vpx_scale/yv12config.h"
-#include "vpx_mem/vpx_mem.h"
-
-/****************************************************************************
-*
-****************************************************************************/
-
-
-/****************************************************************************
-*
-****************************************************************************/
-void
-extend_memset(void *dst, unsigned char value, unsigned int size)
-{
-#if 0
-    unsigned int quad_value;
-
-    quad_value = (unsigned int) value;
-    quad_value |= (unsigned int) value << 8;
-    quad_value |= (unsigned int) value << 16;
-    quad_value |= (unsigned int) value << 24;
-#else
-    unsigned short quad_value;
-
-    quad_value = (unsigned int) value;
-    quad_value |= (unsigned int) value << 8;
-#endif
-
-
-    if (size / 2 >= 64 * 1024)
-        printf("_Extend_memset__________ dma memset is broken\n");
-
-    *p_mdma_s1_start_addr = &quad_value;
-    *p_mdma_s1_x_count = size / 2;
-    *p_mdma_s1_x_modify = 0x0;
-    *p_mdma_d1_start_addr = dst;
-    *p_mdma_d1_x_count = size / 2;
-    *p_mdma_d1_x_modify = 2;
-
-    *p_mdma_s1_config = DMAEN | WDSIZE_16;
-    asm("ssync;");
-
-    *p_mdma_d1_config = DI_EN | DMAEN | WNR | WDSIZE_16;
-    asm("ssync;");
-
-    while ((*p_mdma_d1_irq_status & DMA_DONE) == 0);
-
-    *p_mdma_d1_irq_status |= DMA_DONE;
-}
-
-/****************************************************************************
-*
-****************************************************************************/
-void
-extend_memcpy(void *dst, void *src, unsigned int size)
-{
-    if (size / 2 >= 64 * 1024)
-        printf("_Extend_memcpy__________ dma memcpy is broken\n");
-
-
-    if ((size & 0x3))
-        printf("_)__________ size not a multiple of 4\n");
-
-//32 bit dma here caused some data to be corrupted --- WHY ??????
-
-    *p_mdma_s1_start_addr = src;
-    *p_mdma_s1_x_count = size / 2;
-    *p_mdma_s1_x_modify = 2;
-    *p_mdma_d1_start_addr = dst;
-    *p_mdma_d1_x_count = size / 2;
-    *p_mdma_d1_x_modify = 2;
-
-    *p_mdma_s1_config = DMAEN | WDSIZE_16;
-    asm("ssync;");
-
-    *p_mdma_d1_config = DI_EN | DMAEN | WNR | WDSIZE_16;
-    asm("ssync;");
-
-    while ((*p_mdma_d1_irq_status & DMA_DONE) == 0);
-
-    *p_mdma_d1_irq_status |= DMA_DONE;
-}
-
-/****************************************************************************
- *
- ****************************************************************************/
-void
-vp8_yv12_extend_frame_borders(YV12_BUFFER_CONFIG *ybf)
-{
-#if 1
-    int i;
-    unsigned char *src_ptr1, *src_ptr2;
-    unsigned char *dest_ptr1, *dest_ptr2;
-
-    unsigned int Border;
-    int plane_stride;
-    int plane_height;
-    int plane_width;
-
-    unsigned int quad_sample;
-    unsigned int sample;
-
-    /***********/
-    /* Y Plane */
-    /***********/
-    Border = ybf->border;
-    plane_stride = ybf->y_stride;
-    plane_height = ybf->y_height;
-    plane_width = ybf->y_width;
-
-    // copy the left and right most columns out
-    src_ptr1 = ybf->y_buffer;
-    src_ptr2 = src_ptr1 + plane_width - 1;
-    dest_ptr1 = src_ptr1 - Border;
-    dest_ptr2 = src_ptr2 + 1;
-
-    for (i = 0; i < plane_height; i++)
-    {
-        extend_memset(dest_ptr1, src_ptr1[0], Border);
-        extend_memset(dest_ptr2, src_ptr2[0], Border);
-        src_ptr1  += plane_stride;
-        src_ptr2  += plane_stride;
-        dest_ptr1 += plane_stride;
-        dest_ptr2 += plane_stride;
-    }
-
-    // Now copy the top and bottom source lines into each line of the respective borders
-    src_ptr1 = ybf->y_buffer - Border;
-    src_ptr2 = src_ptr1 + (plane_height * plane_stride) - plane_stride;
-    dest_ptr1 = src_ptr1 - (Border * plane_stride);
-    dest_ptr2 = src_ptr2 + plane_stride;
-
-    for (i = 0; i < (int)Border; i++)
-    {
-        extend_memcpy(dest_ptr1, src_ptr1, plane_stride);
-        dest_ptr1 += plane_stride;
-    }
-
-    for (i = 0; i < (int)Border; i++)
-    {
-        extend_memcpy(dest_ptr2, src_ptr2, plane_stride);
-        dest_ptr2 += plane_stride;
-    }
-
-    plane_stride /= 2;
-    plane_height /= 2;
-    plane_width /= 2;
-    Border /= 2;
-
-    /***********/
-    /* U Plane */
-    /***********/
-
-    // copy the left and right most columns out
-    src_ptr1 = ybf->u_buffer;
-    src_ptr2 = src_ptr1 + plane_width - 1;
-    dest_ptr1 = src_ptr1 - Border;
-    dest_ptr2 = src_ptr2 + 1;
-
-    for (i = 0; i < plane_height; i++)
-    {
-        extend_memset(dest_ptr1, src_ptr1[0], Border);
-        extend_memset(dest_ptr2, src_ptr2[0], Border);
-        src_ptr1  += plane_stride;
-        src_ptr2  += plane_stride;
-        dest_ptr1 += plane_stride;
-        dest_ptr2 += plane_stride;
-    }
-
-    // Now copy the top and bottom source lines into each line of the respective borders
-    src_ptr1 = ybf->u_buffer - Border;
-    src_ptr2 = src_ptr1 + (plane_height * plane_stride) - plane_stride;
-    dest_ptr1 = src_ptr1 - (Border * plane_stride);
-    dest_ptr2 = src_ptr2 + plane_stride;
-
-    for (i = 0; i < (int)(Border); i++)
-    {
-        extend_memcpy(dest_ptr1, src_ptr1, plane_stride);
-        dest_ptr1 += plane_stride;
-    }
-
-    for (i = 0; i < (int)(Border); i++)
-    {
-        extend_memcpy(dest_ptr2, src_ptr2, plane_stride);
-        dest_ptr2 += plane_stride;
-    }
-
-    /***********/
-    /* V Plane */
-    /***********/
-
-    // copy the left and right most columns out
-    src_ptr1 = ybf->v_buffer;
-    src_ptr2 = src_ptr1 + plane_width - 1;
-    dest_ptr1 = src_ptr1 - Border;
-    dest_ptr2 = src_ptr2 + 1;
-
-    for (i = 0; i < plane_height; i++)
-    {
-        extend_memset(dest_ptr1, src_ptr1[0], Border);
-        extend_memset(dest_ptr2, src_ptr2[0], Border);
-        src_ptr1  += plane_stride;
-        src_ptr2  += plane_stride;
-        dest_ptr1 += plane_stride;
-        dest_ptr2 += plane_stride;
-    }
-
-    // Now copy the top and bottom source lines into each line of the respective borders
-    src_ptr1 = ybf->v_buffer - Border;
-    src_ptr2 = src_ptr1 + (plane_height * plane_stride) - plane_stride;
-    dest_ptr1 = src_ptr1 - (Border * plane_stride);
-    dest_ptr2 = src_ptr2 + plane_stride;
-
-    for (i = 0; i < (int)(Border); i++)
-    {
-        extend_memcpy(dest_ptr1, src_ptr1, plane_stride);
-        dest_ptr1 += plane_stride;
-    }
-
-    for (i = 0; i < (int)(Border); i++)
-    {
-        extend_memcpy(dest_ptr2, src_ptr2, plane_stride);
-        dest_ptr2 += plane_stride;
-    }
-
-#endif
-}
-/****************************************************************************
- *
- *  ROUTINE       : vp8_yv12_copy_frame
- *
- *  INPUTS        :
- *
- *  OUTPUTS       : None.
- *
- *  RETURNS       : void
- *
- *  FUNCTION      : Copies the source image into the destination image and
- *                  updates the destination's UMV borders.
- *
- *  SPECIAL NOTES : The frames are assumed to be identical in size.
- *
- ****************************************************************************/
-void
-vp8_yv12_copy_frame(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc)
-{
-#if 1
-    int row;
-    unsigned char *source, *dest;
-
-    source = src_ybc->y_buffer;
-    dest = dst_ybc->y_buffer;
-
-    for (row = 0; row < src_ybc->y_height; row++)
-    {
-        extend_memcpy(dest, source, src_ybc->y_width);
-        source += src_ybc->y_stride;
-        dest   += dst_ybc->y_stride;
-    }
-
-    source = src_ybc->u_buffer;
-    dest = dst_ybc->u_buffer;
-
-    for (row = 0; row < src_ybc->uv_height; row++)
-    {
-        extend_memcpy(dest, source, src_ybc->uv_width);
-        source += src_ybc->uv_stride;
-        dest   += dst_ybc->uv_stride;
-    }
-
-    source = src_ybc->v_buffer;
-    dest = dst_ybc->v_buffer;
-
-    for (row = 0; row < src_ybc->uv_height; row++)
-    {
-        extend_memcpy(dest, source, src_ybc->uv_width);
-        source += src_ybc->uv_stride;
-        dest   += dst_ybc->uv_stride;
-    }
-
-    vp8_yv12_extend_frame_borders(dst_ybc);
-
-#else
-    int row;
-    char *source, *dest;
-    int height;
-    int width;
-
-    height = src_ybc->y_height + (src_ybc->border * 2);
-    width =  src_ybc->y_width + (src_ybc->border * 2);
-    source = src_ybc->y_buffer;
-    dest = dst_ybc->y_buffer;
-
-    for (row = 0; row < height; row++)
-    {
-        extend_memcpy(dest, source, width);
-        source += src_ybc->y_stride;
-        dest   += dst_ybc->y_stride;
-    }
-
-    height = src_ybc->uv_height + (src_ybc->border);
-    width =  src_ybc->uv_width + (src_ybc->border);
-
-    source = src_ybc->u_buffer;
-    dest = dst_ybc->u_buffer;
-
-    for (row = 0; row < height; row++)
-    {
-        extend_memcpy(dest, source, width);
-        source += src_ybc->uv_stride;
-        dest   += dst_ybc->uv_stride;
-    }
-
-    source = src_ybc->v_buffer;
-    dest = dst_ybc->v_buffer;
-
-    for (row = 0; row < height; row++)
-    {
-        extend_memcpy(dest, source, width);
-        source += src_ybc->uv_stride;
-        dest   += dst_ybc->uv_stride;
-    }
-
-#endif
-
-}

diff --git a/vpx_scale/dm642/bicubic_scaler_c64.c b/vpx_scale/dm642/bicubic_scaler_c64.c
deleted file mode 100644
index 5166ace..0000000
--- a/vpx_scale/dm642/bicubic_scaler_c64.c
+++ /dev/null

@@ -1,194 +0,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include <float.h>
-#include <math.h>
-#include <stdio.h>
-#include "vpx_mem/vpx_mem.h"
-#include "vpxscale_arbitrary.h"
-
-extern BICUBIC_SCALER_STRUCT g_b_scaler;
-
-int bicubic_scale_c64(int in_width, int in_height, int in_stride,
-                      int out_width, int out_height, int out_stride,
-                      unsigned char *input_image, unsigned char *output_image)
-{
-    short *restrict l_w, * restrict l_h;
-    short *restrict c_w, * restrict c_h;
-    unsigned char *restrict ip, * restrict op, *restrict op_w;
-    unsigned char *restrict hbuf;
-    int h, w, lw, lh;
-    int phase_offset_w, phase_offset_h;
-    double coeff;
-    int max_phase;
-
-    c_w = g_b_scaler.c_w;
-    c_h = g_b_scaler.c_h;
-
-    op = output_image;
-
-    l_w = g_b_scaler.l_w;
-    l_h = g_b_scaler.l_h;
-
-    phase_offset_h = 0;
-
-    for (h = 0; h < out_height; h++)
-    {
-        // select the row to work on
-        lh = l_h[h];
-        ip = input_image + (in_stride * lh);
-
-        coeff = _memd8_const(&c_h[phase_offset_h*4]);
-
-        // vp8_filter the row vertically into an temporary buffer.
-        //  If the phase offset == 0 then all the multiplication
-        //  is going to result in the output equalling the input.
-        //  So instead point the temporary buffer to the input.
-        //  Also handle the boundry condition of not being able to
-        //  filter that last lines.
-        if (phase_offset_h && (lh < in_height - 2))
-        {
-            hbuf = g_b_scaler.hbuf;
-
-            for (w = 0; w < in_width; w += 4)
-            {
-                int ip1, ip2, ip3, ip4;
-                int y13_12, y11_10, y23_22, y21_20, y33_32, y31_30, y43_42, y41_40;
-                int y10_20, y11_21, y12_22, y13_23, y30_40, y31_41, y32_42, y33_43;
-                int s1, s2, s3, s4;
-
-                ip1 = _mem4_const(&ip[w - in_stride]);
-                ip2 = _mem4_const(&ip[w]);
-                ip3 = _mem4_const(&ip[w + in_stride]);
-                ip4 = _mem4_const(&ip[w + 2*in_stride]);
-
-                // realignment of data.  Unpack the data so that it is in short
-                //  format instead of bytes.
-                y13_12 = _unpkhu4(ip1);
-                y11_10 = _unpklu4(ip1);
-                y23_22 = _unpkhu4(ip2);
-                y21_20 = _unpklu4(ip2);
-                y33_32 = _unpkhu4(ip3);
-                y31_30 = _unpklu4(ip3);
-                y43_42 = _unpkhu4(ip4);
-                y41_40 = _unpklu4(ip4);
-
-                // repack the data so that elements 1 and 2 are together.  this
-                //  lines up so that a dot product with the coefficients can be
-                //  done.
-                y10_20 = _pack2(y11_10, y21_20);
-                y11_21 = _packh2(y11_10, y21_20);
-                y12_22 = _pack2(y13_12, y23_22);
-                y13_23 = _packh2(y13_12, y23_22);
-
-                s1 = _dotp2(_hi(coeff), y10_20);
-                s2 = _dotp2(_hi(coeff), y11_21);
-                s3 = _dotp2(_hi(coeff), y12_22);
-                s4 = _dotp2(_hi(coeff), y13_23);
-
-                y30_40 = _pack2(y31_30, y41_40);
-                y31_41 = _packh2(y31_30, y41_40);
-                y32_42 = _pack2(y33_32, y43_42);
-                y33_43 = _packh2(y33_32, y43_42);
-
-                // now repack elements 3 and 4 together.
-                s1 += _dotp2(_lo(coeff), y30_40);
-                s2 += _dotp2(_lo(coeff), y31_41);
-                s3 += _dotp2(_lo(coeff), y32_42);
-                s4 += _dotp2(_lo(coeff), y33_43);
-
-                s1 = s1 >> 12;
-                s2 = s2 >> 12;
-                s3 = s3 >> 12;
-                s4 = s4 >> 12;
-
-                s1 = _pack2(s2, s1);
-                s2 = _pack2(s4, s3);
-
-                _amem4(&hbuf[w])  = _spacku4(s2, s1);
-            }
-        }
-        else
-            hbuf = ip;
-
-        // increase the phase offset for the next time around.
-        if (++phase_offset_h >= g_b_scaler.nh)
-            phase_offset_h = 0;
-
-        op_w = op;
-
-        // will never be able to interpolate first pixel, so just copy it
-        // over here.
-        phase_offset_w = 1;
-        *op_w++ = hbuf[0];
-
-        if (1 >= g_b_scaler.nw) phase_offset_w = 0;
-
-        max_phase = g_b_scaler.nw;
-
-        for (w = 1; w < out_width; w++)
-        {
-            double coefficients;
-            int hbuf_high, hbuf_low, hbuf_both;
-            int sum_high, sum_low, sum;
-
-            // get the index to use to expand the image
-            lw = l_w[w];
-            coefficients = _amemd8_const(&c_w[phase_offset_w*4]);
-            hbuf_both = _mem4_const(&hbuf[lw-1]);
-
-            hbuf_high = _unpkhu4(hbuf_both);
-            hbuf_low  = _unpklu4(hbuf_both);
-
-            sum_high = _dotp2(_hi(coefficients), hbuf_high);
-            sum_low  = _dotp2(_lo(coefficients), hbuf_low);
-
-            sum = (sum_high + sum_low) >> 12;
-
-            if (++phase_offset_w >= max_phase)
-                phase_offset_w = 0;
-
-            if ((lw + 2) >= in_width)
-                sum = hbuf[lw];
-
-            *op_w++ = sum;
-        }
-
-        op += out_stride;
-    }
-
-    return 0;
-}
-
-void bicubic_scale_frame_c64(YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst,
-                             int new_width, int new_height)
-{
-
-    dst->y_width = new_width;
-    dst->y_height = new_height;
-    dst->uv_width = new_width / 2;
-    dst->uv_height = new_height / 2;
-
-    dst->y_stride = dst->y_width;
-    dst->uv_stride = dst->uv_width;
-
-    bicubic_scale_c64(src->y_width, src->y_height, src->y_stride,
-                      new_width, new_height, dst->y_stride,
-                      src->y_buffer, dst->y_buffer);
-
-    bicubic_scale_c64(src->uv_width, src->uv_height, src->uv_stride,
-                      new_width / 2, new_height / 2, dst->uv_stride,
-                      src->u_buffer, dst->u_buffer);
-
-    bicubic_scale_c64(src->uv_width, src->uv_height, src->uv_stride,
-                      new_width / 2, new_height / 2, dst->uv_stride,
-                      src->v_buffer, dst->v_buffer);
-}

diff --git a/vpx_scale/dm642/gen_scalers_c64.c b/vpx_scale/dm642/gen_scalers_c64.c
deleted file mode 100644
index 87ff998..0000000
--- a/vpx_scale/dm642/gen_scalers_c64.c
+++ /dev/null

@@ -1,608 +0,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-/****************************************************************************
- *
- *   Module Title :     gen_scalers.c
- *
- *   Description  :     Generic image scaling functions.
- *
- ***************************************************************************/
-
-/****************************************************************************
-*  Header Files
-****************************************************************************/
-#include "vpx_scale/vpxscale.h"
-
-/****************************************************************************
-*  Imports
-****************************************************************************/
-
-/****************************************************************************
- *
- *  ROUTINE       : horizontal_line_4_5_scale_c4
- *
- *  INPUTS        : const unsigned char *source : Pointer to source data.
- *                  unsigned int source_width    : Stride of source.
- *                  unsigned char *dest         : Pointer to destination data.
- *                  unsigned int dest_width      : Stride of destination (NOT USED).
- *
- *  OUTPUTS       : None.
- *
- *  RETURNS       : void
- *
- *  FUNCTION      : Copies horizontal line of pixels from source to
- *                  destination scaling up by 4 to 5.
- *
- *  SPECIAL NOTES : None.
- *
- ****************************************************************************/
-static
-void horizontal_line_4_5_scale_c64
-(
-    const unsigned char *source,
-    unsigned int source_width,
-    unsigned char *dest,
-    unsigned int dest_width
-)
-{
-    unsigned i;
-    unsigned int ba, cb, dc, ed;
-    unsigned char *restrict des = dest;
-    unsigned int *restrict src = (unsigned int *)source;
-    unsigned int const_51_205, const_102_154,
-             const_205_51, const_154_102;
-
-    unsigned int src_current, src_next;
-
-    (void) dest_width;
-
-    // Constants that are to be used for the filtering.  For
-    //  best speed we are going to want to right shift by 16.
-    //  In the generic version they were shift by 8, so put
-    //  an extra 8 in now so that 16 will come out later.
-    const_51_205 = 0x3300CD00; //_pack2 (51 << 8, 205 << 8);
-    const_205_51 = 0xCD003300; //_pack2 (205 << 8, 51 << 8);
-    const_102_154 = 0x66009A00; //_pack2 (102 << 8, 154 << 8);
-    const_154_102 = 0x9A006600; //_pack2 (154 << 8, 102 << 8);
-
-    // 5 points are needed to filter to give 5 output points.
-    //  A load can pull up 4 at a time, and one needs to be
-    //  "borrowed" from the next set of data.  So instead of
-    //  loading those 5 points each time, "steal" a point from
-    //  the next set and only load up 4 each time through.
-    src_current = _mem4(src);
-
-    for (i = 0; i < source_width - 4; i += 4)
-    {
-        src_next = _mem4(src++);
-
-        // Reorder the data so that it is ready for the
-        //  dot product.
-        ba = _unpklu4(src_current);
-        cb = _unpkhu4(_rotl(src_current, 8));
-        dc = _unpkhu4(src_current);
-        ed = _unpkhu4(_shrmb(src_next, src_current));
-
-        // Use the dot product with round and shift.
-        des [0] = src_current & 0xff;
-        des [1] = _dotprsu2(ba, const_205_51);
-        des [2] = _dotprsu2(cb, const_154_102);
-        des [3] = _dotprsu2(dc, const_102_154);
-        des [4] = _dotprsu2(ed, const_51_205);
-
-        des += 5;
-
-        // reuse loaded vales next time around.
-        src_current = src_next;
-    }
-
-    // vp8_filter the last set of points.  Normally a point from the next set
-    //  would be used, but there is no next set, so just fill.
-    ba = _unpklu4(src_current);
-    cb = _unpkhu4(_rotl(src_current, 8));
-    dc = _unpkhu4(src_current);
-
-    des [0] = src_current & 0xff;
-    des [1] = _dotprsu2(ba, const_205_51);
-    des [2] = _dotprsu2(cb, const_154_102);
-    des [3] = _dotprsu2(dc, const_102_154);
-    des [4] = src_current & 0xff;
-
-}
-/****************************************************************************
- *
- *  ROUTINE       : vertical_band_4_5_scale_c64
- *
- *  INPUTS        : unsigned char *dest    : Pointer to destination data.
- *                  unsigned int dest_pitch : Stride of destination data.
- *                  unsigned int dest_width : Width of destination data.
- *
- *  OUTPUTS       : None.
- *
- *  RETURNS       : void
- *
- *  FUNCTION      : Scales vertical band of pixels by scale 4 to 5. The
- *                  height of the band scaled is 4-pixels.
- *
- *  SPECIAL NOTES : The routine uses the first line of the band below
- *                  the current band.
- *
- ****************************************************************************/
-static
-void vertical_band_4_5_scale_c64(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width)
-{
-    unsigned int i;
-    unsigned int a, b, c, d, e;
-    unsigned int ba, cb, dc, ed;
-    unsigned char *restrict src = dest;
-    unsigned char *restrict des = dest;
-    unsigned int const_51_205, const_102_154,
-             const_205_51, const_154_102;
-
-    const_51_205 = 0x3300CD00; //_pack2 (51 << 8, 205 << 8);
-    const_205_51 = 0xCD003300; //_pack2 (205 << 8, 51 << 8);
-    const_102_154 = 0x66009A00; //_pack2 (102 << 8, 154 << 8);
-    const_154_102 = 0x9A006600; //_pack2 (154 << 8, 102 << 8);
-
-    // Force a loop unroll here so that there is not such a
-    //  dependancy.
-    a = src [0];
-    b = src [dest_pitch];
-    c = src [dest_pitch*2];
-    d = src [dest_pitch*3];
-    e = src [dest_pitch*5];
-    src ++;
-
-    for (i = 0; i < dest_width; i++)
-    {
-        ba = _pack2(b, a);
-        cb = _pack2(c, b);
-        dc = _pack2(d, c);
-        ed = _pack2(e, d);
-
-        a = src [0];
-        b = src [dest_pitch];
-        c = src [dest_pitch*2];
-        d = src [dest_pitch*3];
-        e = src [dest_pitch*5];
-        src ++;
-
-        des [dest_pitch] = _dotprsu2(ba, const_205_51);
-        des [dest_pitch*2] = _dotprsu2(cb, const_154_102);
-        des [dest_pitch*3] = _dotprsu2(dc, const_102_154);
-        des [dest_pitch*4] = _dotprsu2(ed, const_51_205);
-
-        des ++;
-    }
-}
-
-/****************************************************************************
- *
- *  ROUTINE       : last_vertical_band_4_5_scale_c64
- *
- *  INPUTS        : unsigned char *dest    : Pointer to destination data.
- *                  unsigned int dest_pitch : Stride of destination data.
- *                  unsigned int dest_width : Width of destination data.
- *
- *  OUTPUTS       : None.
- *
- *  RETURNS       : void
- *
- *  FUNCTION      : Scales last vertical band of pixels by scale 4 to 5. The
- *                  height of the band scaled is 4-pixels.
- *
- *  SPECIAL NOTES : The routine does not have available the first line of
- *                  the band below the current band, since this is the
- *                  last band.
- *
- ****************************************************************************/
-static
-void last_vertical_band_4_5_scale_c64(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width)
-{
-    unsigned int i;
-    unsigned int a, b, c, d;
-    unsigned int ba, cb, dc;
-    unsigned char *restrict src = dest;
-    unsigned char *restrict des = dest;
-    unsigned int const_102_154, const_205_51, const_154_102;
-
-    const_205_51 = 0xCD003300; //_pack2 (205 << 8, 51 << 8);
-    const_102_154 = 0x66009A00; //_pack2 (102 << 8, 154 << 8);
-    const_154_102 = 0x9A006600; //_pack2 (154 << 8, 102 << 8);
-
-    a = src [0];
-    b = src [dest_pitch];
-    c = src [dest_pitch*2];
-    d = src [dest_pitch*3];
-    src ++;
-
-    for (i = 0; i < dest_width; ++i)
-    {
-        ba = _pack2(b, a);
-        cb = _pack2(c, b);
-        dc = _pack2(d, c);
-
-        a = src [0];
-        b = src [dest_pitch];
-        c = src [dest_pitch*2];
-        d = src [dest_pitch*3];
-        src ++;
-
-        des [dest_pitch] = _dotprsu2(ba, const_205_51);
-        des [dest_pitch*2] = _dotprsu2(cb, const_154_102);
-        des [dest_pitch*3] = _dotprsu2(dc, const_102_154);
-        des [dest_pitch*4] = (unsigned char) d;
-
-        des++;
-    }
-}
-
-/****************************************************************************
- *
- *  ROUTINE       : horizontal_line_3_5_scale_c64
- *
- *  INPUTS        : const unsigned char *source : Pointer to source data.
- *                  unsigned int source_width    : Stride of source.
- *                  unsigned char *dest         : Pointer to destination data.
- *                  unsigned int dest_width      : Stride of destination (NOT USED).
- *
- *  OUTPUTS       : None.
- *
- *  RETURNS       : void
- *
- *  FUNCTION      : Copies horizontal line of pixels from source to
- *                  destination scaling up by 3 to 5.
- *
- *  SPECIAL NOTES : None.
- *
- *
- ****************************************************************************/
-static
-void horizontal_line_3_5_scale_c64
-(
-    const unsigned char *source,
-    unsigned int source_width,
-    unsigned char *dest,
-    unsigned int dest_width
-)
-{
-    unsigned int i;
-    unsigned int ba, cb, dc;
-    unsigned int src_current;
-    unsigned char *restrict des = dest;
-    unsigned char *restrict src = (unsigned char *)source;
-    unsigned int const_51_205, const_102_154,
-             const_205_51, const_154_102;
-
-    (void) dest_width;
-
-    const_51_205 = 0x3300CD00; //_pack2 (51 << 8, 205 << 8);
-    const_205_51 = 0xCD003300; //_pack2 (205 << 8, 51 << 8);
-    const_102_154 = 0x66009A00; //_pack2 (102 << 8, 154 << 8);
-    const_154_102 = 0x9A006600; //_pack2 (154 << 8, 102 << 8);
-
-    for (i = 0; i < source_width - 3; i += 3)
-    {
-        src_current = _mem4(src);
-
-        // Reorder the data so that it is ready for the
-        //  dot product.
-        ba = _unpklu4(src_current);
-        cb = _unpkhu4(_rotl(src_current, 8));
-        dc = _unpkhu4(src_current);
-
-        des [0] = src_current & 0xff;
-        des [1] = _dotprsu2(ba, const_154_102);
-        des [2] = _dotprsu2(cb, const_51_205);
-        des [3] = _dotprsu2(cb, const_205_51);
-        des [4] = _dotprsu2(dc, const_102_154);
-
-        src += 3;
-        des += 5;
-    }
-
-    src_current = _mem4(src);
-
-    ba = _unpklu4(src_current);
-    cb = _unpkhu4(_rotl(src_current, 8));
-    dc = _unpkhu4(src_current);
-
-
-    des [0] = src_current & 0xff;
-    des [1] = _dotprsu2(ba, const_154_102);
-    des [2] = _dotprsu2(cb, const_51_205);
-    des [3] = _dotprsu2(cb, const_205_51);
-    des [4] = dc & 0xff;
-
-}
-
-/****************************************************************************
- *
- *  ROUTINE       : vertical_band_3_5_scale_c64
- *
- *  INPUTS        : unsigned char *dest    : Pointer to destination data.
- *                  unsigned int dest_pitch : Stride of destination data.
- *                  unsigned int dest_width : Width of destination data.
- *
- *  OUTPUTS       : None.
- *
- *  RETURNS       : void
- *
- *  FUNCTION      : Scales vertical band of pixels by scale 3 to 5. The
- *                  height of the band scaled is 3-pixels.
- *
- *  SPECIAL NOTES : The routine uses the first line of the band below
- *                  the current band.
- *
- ****************************************************************************/
-static
-void vertical_band_3_5_scale_c64(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width)
-{
-    unsigned int i;
-    unsigned int a, b, c, d;
-    unsigned int ba, cb, dc;
-    unsigned char *restrict src = dest;
-    unsigned char *restrict des = dest;
-    unsigned int const_51_205, const_102_154,
-             const_205_51, const_154_102;
-
-    const_51_205 = 0x3300CD00; //_pack2 (51 << 8, 205 << 8);
-    const_205_51 = 0xCD003300; //_pack2 (205 << 8, 51 << 8);
-    const_102_154 = 0x66009A00; //_pack2 (102 << 8, 154 << 8);
-    const_154_102 = 0x9A006600; //_pack2 (154 << 8, 102 << 8);
-
-    a = src [0];
-    b = src [dest_pitch];
-    c = src [dest_pitch*2];
-    d = src [dest_pitch*5];
-    src ++;
-
-    for (i = 0; i < dest_width; i++)
-    {
-        ba = _pack2(b, a);
-        cb = _pack2(c, b);
-        dc = _pack2(d, c);
-
-        a = src [0];
-        b = src [dest_pitch];
-        c = src [dest_pitch*2];
-        d = src [dest_pitch*5];
-        src ++;
-
-        des [dest_pitch]   = _dotprsu2(ba, const_154_102);
-        des [dest_pitch*2] = _dotprsu2(cb, const_51_205);
-        des [dest_pitch*3] = _dotprsu2(cb, const_205_51);
-        des [dest_pitch*4] = _dotprsu2(dc, const_102_154);
-
-        des++;
-    }
-}
-
-/****************************************************************************
- *
- *  ROUTINE       : last_vertical_band_3_5_scale_c64
- *
- *  INPUTS        : unsigned char *dest    : Pointer to destination data.
- *                  unsigned int dest_pitch : Stride of destination data.
- *                  unsigned int dest_width : Width of destination data.
- *
- *  OUTPUTS       : None.
- *
- *  RETURNS       : void
- *
- *  FUNCTION      : Scales last vertical band of pixels by scale 3 to 5. The
- *                  height of the band scaled is 3-pixels.
- *
- *  SPECIAL NOTES : The routine does not have available the first line of
- *                  the band below the current band, since this is the
- *                  last band.
- *
- ****************************************************************************/
-static
-void last_vertical_band_3_5_scale_c64(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width)
-{
-    unsigned int i;
-    unsigned int a, b, c;
-    unsigned int ba, cb;
-    unsigned char *restrict src = dest;
-    unsigned char *restrict des = dest;
-    unsigned int const_51_205, const_205_51, const_154_102;
-
-    const_51_205 = 0x3300CD00; //_pack2 (51 << 8, 205 << 8);
-    const_205_51 = 0xCD003300; //_pack2 (205 << 8, 51 << 8);
-    const_154_102 = 0x9A006600; //_pack2 (154 << 8, 102 << 8);
-
-    a = src [0];
-    b = src [dest_pitch];
-    c = src [dest_pitch*2];
-    src ++;
-
-    for (i = 0; i < dest_width; ++i)
-    {
-        ba = _pack2(b, a);
-        cb = _pack2(c, b);
-
-        a = src [0];
-        b = src [dest_pitch];
-        c = src [dest_pitch*2];
-        src ++;
-
-        des [dest_pitch]   = _dotprsu2(ba, const_154_102);
-        des [dest_pitch*2] = _dotprsu2(cb, const_51_205);
-        des [dest_pitch*3] = _dotprsu2(cb, const_205_51);
-        des [dest_pitch*4] = (unsigned char)(c) ;
-
-        des++;
-    }
-}
-
-/****************************************************************************
- *
- *  ROUTINE       : horizontal_line_1_2_scale_c64
- *
- *  INPUTS        : const unsigned char *source : Pointer to source data.
- *                  unsigned int source_width    : Stride of source.
- *                  unsigned char *dest         : Pointer to destination data.
- *                  unsigned int dest_width      : Stride of destination (NOT USED).
- *
- *  OUTPUTS       : None.
- *
- *  RETURNS       : void
- *
- *  FUNCTION      : Copies horizontal line of pixels from source to
- *                  destination scaling up by 1 to 2.
- *
- *  SPECIAL NOTES : source width must be a multiple of 4.
- *
- ****************************************************************************/
-void horizontal_line_1_2_scale_c64
-(
-    const unsigned char *source,
-    unsigned int source_width,
-    unsigned char *dest,
-    unsigned int dest_width
-)
-{
-    unsigned int i;
-    unsigned char *restrict des = dest;
-    unsigned char *restrict src = (unsigned char *)source;
-    unsigned int src7_4i, src4_1i, src3_0i;
-    unsigned int a4_0i, ahi, alo;
-    double src7_0d, src3_0d;
-    const unsigned int k01 = 0x01010101;
-
-    for (i = 0; i < source_width / 4; i += 1)
-    {
-        // Load up the data from src.  Here a wide load is
-        //  used to get 8 bytes at once, only 5 will be used
-        //  for the actual computation.
-        src7_0d = _memd8(src);
-        src3_0i = _lo(src7_0d);
-        src7_4i = _hi(src7_0d);
-
-        // Need to average between points.  Shift byte 5 into
-        //  the lower word.  This will result in bytes 5-1
-        //  averaged with 4-0.
-        src4_1i = _shrmb(src7_4i, src3_0i);
-        a4_0i = _avgu4(src4_1i, src3_0i);
-
-        // Expand the data out. Could do an unpack, however
-        //  all but the multiply units are getting pretty hard
-        //  here the multiply unit can take some of the computations.
-        src3_0d = _mpyu4(src3_0i, k01);
-
-        // The averages need to be unpacked so that they are in 16
-        //  bit form and will be able to be interleaved with the
-        //  original data
-        ahi = _unpkhu4(a4_0i);
-        alo = _unpklu4(a4_0i);
-
-        ahi = _swap4(ahi);
-        alo = _swap4(alo);
-
-        // Mix the average result in with the orginal data.
-        ahi = _hi(src3_0d) | ahi;
-        alo = _lo(src3_0d) | alo;
-
-        _memd8(des) = _itod(ahi, alo);
-
-        des += 8;
-        src += 4;
-    }
-}
-
-
-/****************************************************************************
- *
- *  ROUTINE       : vertical_band_1_2_scale_c64
- *
- *  INPUTS        : unsigned char *dest    : Pointer to destination data.
- *                  unsigned int dest_pitch : Stride of destination data.
- *                  unsigned int dest_width : Width of destination data.
- *
- *  OUTPUTS       : None.
- *
- *  RETURNS       : void
- *
- *  FUNCTION      : Scales vertical band of pixels by scale 1 to 2. The
- *                  height of the band scaled is 1-pixel.
- *
- *  SPECIAL NOTES : The routine uses the first line of the band below
- *                  the current band.
- *                  Destination width must be a multiple of 4.  Because the
- *                  intput must be, therefore the output must be.
- *
- ****************************************************************************/
-static
-void vertical_band_1_2_scale_c64(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width)
-{
-    unsigned int i;
-    unsigned int a, b;
-    unsigned int *restrict line_a = (unsigned int *)dest;
-    unsigned int *restrict line_b = (unsigned int *)(dest + (dest_pitch * 2));
-    unsigned int *restrict des = (unsigned int *)(dest + dest_pitch);
-
-    for (i = 0; i < dest_width / 4; i++)
-    {
-        a = _mem4(line_a++);
-        b = _mem4(line_b++);
-
-        _mem4(des++) = _avgu4(a, b);
-    }
-}
-
-/****************************************************************************
- *
- *  ROUTINE       : last_vertical_band_1_2_scale_c64
- *
- *  INPUTS        : unsigned char *dest    : Pointer to destination data.
- *                  unsigned int dest_pitch : Stride of destination data.
- *                  unsigned int dest_width : Width of destination data.
- *
- *  OUTPUTS       : None.
- *
- *  RETURNS       : void
- *
- *  FUNCTION      : Scales last vertical band of pixels by scale 1 to 2. The
- *                  height of the band scaled is 1-pixel.
- *
- *  SPECIAL NOTES : The routine does not have available the first line of
- *                  the band below the current band, since this is the
- *                  last band.  Again, width must be a multiple of 4.
- *
- ****************************************************************************/
-static
-void last_vertical_band_1_2_scale_c64(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width)
-{
-    unsigned int i;
-    unsigned int *restrict src = (unsigned int *)dest;
-    unsigned int *restrict des = (unsigned int *)(dest + dest_pitch);
-
-    for (i = 0; i < dest_width / 4; ++i)
-    {
-        _mem4(des++) = _mem4(src++);
-    }
-}
-
-void
-register_generic_scalers(void)
-{
-    vp8_horizontal_line_1_2_scale        = horizontal_line_1_2_scale_c64;
-    vp8_vertical_band_1_2_scale          = vertical_band_1_2_scale_c64;
-    vp8_last_vertical_band_1_2_scale      = last_vertical_band_1_2_scale_c64;
-    vp8_horizontal_line_3_5_scale        = horizontal_line_3_5_scale_c64;
-    vp8_vertical_band_3_5_scale          = vertical_band_3_5_scale_c64;
-    vp8_last_vertical_band_3_5_scale      = last_vertical_band_3_5_scale_c64;
-    vp8_horizontal_line_4_5_scale        = horizontal_line_4_5_scale_c64;
-    vp8_vertical_band_4_5_scale          = vertical_band_4_5_scale_c64;
-    vp8_last_vertical_band_4_5_scale      = last_vertical_band_4_5_scale_c64;
-}

diff --git a/vpx_scale/dm642/yv12extend.c b/vpx_scale/dm642/yv12extend.c
deleted file mode 100644
index 646e69a..0000000
--- a/vpx_scale/dm642/yv12extend.c
+++ /dev/null

@@ -1,446 +0,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-/****************************************************************************
- *
- *   Module Title :     yv12extend.c
- *
- *   Description  :
- *
- ***************************************************************************/
-
-/****************************************************************************
-*  Header Files
-****************************************************************************/
-//#include <stdlib.h>
-#include "csl_dat.h"
-#include "vpx_scale/yv12config.h"
-#include "vpx_mem/vpx_mem.h"
-
-/****************************************************************************
-*  Exports
-****************************************************************************/
-#define UINT8 unsigned char
-#define UINT32 unsigned int
-
-
-static inline
-void copy_yleft_right_border(
-    UINT8 *restrict src_ptr1,
-    UINT8 *restrict src_ptr2,
-    UINT8 *restrict dest_ptr1,
-    UINT8 *restrict dest_ptr2,
-    UINT32  plane_height,
-    UINT32  plane_stride
-)
-{
-    UINT32 left, right, left2, left4, right2, right4;
-    double dl, dr;
-    int i;
-
-#pragma MUST_ITERATE(16,16,16)
-
-    for (i = 0; i < plane_height; i++)
-    {
-        left  = src_ptr1[0];
-        right = src_ptr2[0];
-
-        left2 = _pack2(left, left);
-        left4 = _packl4(left2, left2);
-
-        right2 = _pack2(right, right);
-        right4 = _packl4(right2, right2);
-
-        dl = _itod(left4, left4);
-        dr = _itod(right4, right4);
-
-        _amemd8(&dest_ptr1[ 0]) = dl;
-        _amemd8(&dest_ptr2[ 0]) = dr;
-
-        _amemd8(&dest_ptr1[ 8]) = dl;
-        _amemd8(&dest_ptr2[ 8]) = dr;
-
-        _amemd8(&dest_ptr1[16]) = dl;
-        _amemd8(&dest_ptr2[16]) = dr;
-
-        _amemd8(&dest_ptr1[24]) = dl;
-        _amemd8(&dest_ptr2[24]) = dr;
-
-        _amemd8(&dest_ptr1[32]) = dl;
-        _amemd8(&dest_ptr2[32]) = dr;
-
-        _amemd8(&dest_ptr1[40]) = dl;
-        _amemd8(&dest_ptr2[40]) = dr;
-
-
-        src_ptr1 += plane_stride;
-        src_ptr2 += plane_stride;
-        dest_ptr1 += plane_stride;
-        dest_ptr2 += plane_stride;
-    }
-}
-/****************************************************************************
- *
- *
- ****************************************************************************/
-static
-void copy_uvleft_right_border(
-    UINT8 *restrict src_ptr1,
-    UINT8 *restrict src_ptr2,
-    UINT8 *restrict dest_ptr1,
-    UINT8 *restrict dest_ptr2,
-    UINT32  plane_height,
-    UINT32  plane_stride
-)
-{
-    UINT32 left, right, left2, left4, right2, right4;
-    double dl, dr;
-    int i;
-
-#pragma MUST_ITERATE(8,8 ,8)
-
-    for (i = 0; i < plane_height; i++)
-    {
-        left  = src_ptr1[0];
-        right = src_ptr2[0];
-
-        left2 = _pack2(left, left);
-        left4 = _packl4(left2, left2);
-
-        right2 = _pack2(right, right);
-        right4 = _packl4(right2, right2);
-
-        dl = _itod(left4, left4);
-        dr = _itod(right4, right4);
-
-        _amemd8(&dest_ptr1[ 0]) = dl;
-        _amemd8(&dest_ptr2[ 0]) = dr;
-
-        _amemd8(&dest_ptr1[ 8]) = dl;
-        _amemd8(&dest_ptr2[ 8]) = dr;
-
-        _amemd8(&dest_ptr1[16]) = dl;
-        _amemd8(&dest_ptr2[16]) = dr;
-
-
-        src_ptr1 += plane_stride;
-        src_ptr2 += plane_stride;
-        dest_ptr1 += plane_stride;
-        dest_ptr2 += plane_stride;
-    }
-}
-/****************************************************************************
- *
- ****************************************************************************/
-void
-vp8_yv12_extend_frame_borders(YV12_BUFFER_CONFIG *ybf)
-{
-    int i;
-    unsigned char *src_ptr1, *src_ptr2;
-    unsigned char *dest_ptr1, *dest_ptr2;
-
-    unsigned int Border;
-    int plane_stride;
-    int plane_height;
-    int plane_width;
-
-    /***********/
-    /* Y Plane */
-    /***********/
-    Border = ybf->border;
-    plane_stride = ybf->y_stride;
-    plane_height = ybf->y_height;
-    plane_width = ybf->y_width;
-
-#if 1
-    // copy the left and right most columns out
-    src_ptr1 = ybf->y_buffer;
-    src_ptr2 = src_ptr1 + plane_width - 1;
-    dest_ptr1 = src_ptr1 - Border;
-    dest_ptr2 = src_ptr2 + 1;
-    copy_yleft_right_border(src_ptr1, src_ptr2, dest_ptr1, dest_ptr2, plane_height, plane_stride);
-#endif
-
-    // Now copy the top and bottom source lines into each line of the respective borders
-    src_ptr1 = ybf->y_buffer - Border;
-    src_ptr2 = src_ptr1 + (plane_height * plane_stride) - plane_stride;
-    dest_ptr1 = src_ptr1 - (Border * plane_stride);
-    dest_ptr2 = src_ptr2 + plane_stride;
-
-    for (i = 0; i < (int)Border; i++)
-    {
-        vpx_memcpy(dest_ptr1, src_ptr1, plane_stride);
-        vpx_memcpy(dest_ptr2, src_ptr2, plane_stride);
-        dest_ptr1 += plane_stride;
-        dest_ptr2 += plane_stride;
-    }
-
-    plane_stride /= 2;
-    plane_height /= 2;
-    plane_width /= 2;
-    Border /= 2;
-
-    /***********/
-    /* U Plane */
-    /***********/
-#if 1
-    // copy the left and right most columns out
-    src_ptr1 = ybf->u_buffer;
-    src_ptr2 = src_ptr1 + plane_width - 1;
-    dest_ptr1 = src_ptr1 - Border;
-    dest_ptr2 = src_ptr2 + 1;
-
-    copy_uvleft_right_border(src_ptr1, src_ptr2, dest_ptr1, dest_ptr2, plane_height, plane_stride);
-
-
-#endif
-
-    // Now copy the top and bottom source lines into each line of the respective borders
-    src_ptr1 = ybf->u_buffer - Border;
-    src_ptr2 = src_ptr1 + (plane_height * plane_stride) - plane_stride;
-    dest_ptr1 = src_ptr1 - (Border * plane_stride);
-    dest_ptr2 = src_ptr2 + plane_stride;
-
-    for (i = 0; i < (int)(Border); i++)
-    {
-        vpx_memcpy(dest_ptr1, src_ptr1, plane_stride);
-        vpx_memcpy(dest_ptr2, src_ptr2, plane_stride);
-        dest_ptr1 += plane_stride;
-        dest_ptr2 += plane_stride;
-    }
-
-    /***********/
-    /* V Plane */
-    /***********/
-#if 1
-    // copy the left and right most columns out
-    src_ptr1 = ybf->v_buffer;
-    src_ptr2 = src_ptr1 + plane_width - 1;
-    dest_ptr1 = src_ptr1 - Border;
-    dest_ptr2 = src_ptr2 + 1;
-
-    copy_uvleft_right_border(src_ptr1, src_ptr2, dest_ptr1, dest_ptr2, plane_height, plane_stride);
-
-#endif
-
-    // Now copy the top and bottom source lines into each line of the respective borders
-    src_ptr1 = ybf->v_buffer - Border;
-    src_ptr2 = src_ptr1 + (plane_height * plane_stride) - plane_stride;
-    dest_ptr1 = src_ptr1 - (Border * plane_stride);
-    dest_ptr2 = src_ptr2 + plane_stride;
-
-    for (i = 0; i < (int)(Border); i++)
-    {
-        vpx_memcpy(dest_ptr1, src_ptr1, plane_stride);
-        vpx_memcpy(dest_ptr2, src_ptr2, plane_stride);
-        dest_ptr1 += plane_stride;
-        dest_ptr2 += plane_stride;
-    }
-}
-/****************************************************************************
- *
- ****************************************************************************/
-void
-vpxyv12_extend_frame_tbborders(YV12_BUFFER_CONFIG *ybf)
-{
-    int i;
-    unsigned char *src_ptr1, *src_ptr2;
-    unsigned char *dest_ptr1, *dest_ptr2;
-    int tid1, tid2;
-
-    unsigned int Border;
-    int plane_stride;
-    int plane_height;
-    int plane_width;
-
-    /***********/
-    /* Y Plane */
-    /***********/
-    Border = ybf->border;
-    plane_stride = ybf->y_stride;
-    plane_height = ybf->y_height;
-    plane_width = ybf->y_width;
-
-
-    // Now copy the top and bottom source lines into each line of the respective borders
-    src_ptr1 = ybf->y_buffer - Border;
-    src_ptr2 = src_ptr1 + (plane_height * plane_stride) - plane_stride;
-    dest_ptr1 = src_ptr1 - (Border * plane_stride);
-    dest_ptr2 = src_ptr2 + plane_stride;
-
-
-    for (i = 0; i < (int)Border; i++)
-    {
-        dat_copy(src_ptr1, dest_ptr1, plane_stride);
-        dat_copy(src_ptr2, dest_ptr2, plane_stride);
-        dest_ptr1 += plane_stride;
-        dest_ptr2 += plane_stride;
-    }
-
-    plane_stride /= 2;
-    plane_height /= 2;
-    plane_width /= 2;
-    Border /= 2;
-
-    /***********/
-    /* U Plane */
-    /***********/
-    // Now copy the top and bottom source lines into each line of the respective borders
-    src_ptr1 = ybf->u_buffer - Border;
-    src_ptr2 = src_ptr1 + (plane_height * plane_stride) - plane_stride;
-    dest_ptr1 = src_ptr1 - (Border * plane_stride);
-    dest_ptr2 = src_ptr2 + plane_stride;
-
-    for (i = 0; i < (int)(Border); i++)
-    {
-        dat_copy(src_ptr1, dest_ptr1, plane_stride);
-        dat_copy(src_ptr2, dest_ptr2, plane_stride);
-        dest_ptr1 += plane_stride;
-        dest_ptr2 += plane_stride;
-    }
-
-    /***********/
-    /* V Plane */
-    /***********/
-    // Now copy the top and bottom source lines into each line of the respective borders
-    src_ptr1 = ybf->v_buffer - Border;
-    src_ptr2 = src_ptr1 + (plane_height * plane_stride) - plane_stride;
-    dest_ptr1 = src_ptr1 - (Border * plane_stride);
-    dest_ptr2 = src_ptr2 + plane_stride;
-
-    for (i = 0; i < (int)(Border); i++)
-    {
-        tid1 = dat_copy(src_ptr1, dest_ptr1, plane_stride);
-        tid2 = dat_copy(src_ptr2, dest_ptr2, plane_stride);
-        dest_ptr1 += plane_stride;
-        dest_ptr2 += plane_stride;
-    }
-
-    dat_wait(tid1);
-    dat_wait(tid2);
-}
-
-/****************************************************************************
- *
- *  ROUTINE       : vp8_yv12_copy_frame
- *
- *  INPUTS        :
- *
- *  OUTPUTS       : None.
- *
- *  RETURNS       : void
- *
- *  FUNCTION      : Copies the source image into the destination image and
- *                  updates the destination's UMV borders.  Because the
- *                  borders have been update prior to this so the whole frame
- *                  is copied, borders and all.  This is also to circumvent
- *                  using copy_left_right Border functions when copying data
- *                  between L2 and main memory.  When that occurs a cache
- *                  clean needs to be done, which would require invalidating
- *                  an entire frame.
- *
- *  SPECIAL NOTES : The frames are assumed to be identical in size.
- *
- ****************************************************************************/
-void
-vpxyv12_copy_frame_dma(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc)
-{
-    int yheight, uv_height;
-    int ystride, uv_stride;
-    int border;
-    int yoffset, uvoffset;
-
-    border = src_ybc->border;
-    yheight = src_ybc->y_height;
-    uv_height = src_ybc->uv_height;
-
-    ystride = src_ybc->y_stride;
-    uv_stride = src_ybc->uv_stride;
-
-    yoffset = border * (ystride + 1);
-    uvoffset = border / 2 * (uv_stride + 1);
-
-    dat_copy2d(DAT_2D2D,
-               src_ybc->y_buffer - yoffset,
-               dst_ybc->y_buffer - yoffset,
-               ystride,
-               yheight + 2 * border,
-               ystride);
-    dat_copy2d(DAT_2D2D,
-               src_ybc->u_buffer - uvoffset,
-               dst_ybc->u_buffer - uvoffset,
-               uv_stride,
-               uv_height + border,
-               uv_stride);
-    dat_copy2d(DAT_2D2D,
-               src_ybc->v_buffer - uvoffset,
-               dst_ybc->v_buffer - uvoffset,
-               uv_stride,
-               uv_height + border,
-               uv_stride);
-
-}
-
-
-/****************************************************************************
- *
- *  ROUTINE       : vp8_yv12_copy_frame
- *
- *  INPUTS        :
- *
- *  OUTPUTS       : None.
- *
- *  RETURNS       : void
- *
- *  FUNCTION      : Copies the source image into the destination image and
- *                  updates the destination's UMV borders.
- *
- *  SPECIAL NOTES : The frames are assumed to be identical in size.
- *
- ****************************************************************************/
-void
-vp8_yv12_copy_frame(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc)
-{
-    int row;
-    unsigned char *source, *dest;
-
-    source = src_ybc->y_buffer;
-    dest = dst_ybc->y_buffer;
-
-    for (row = 0; row < src_ybc->y_height; row++)
-    {
-        vpx_memcpy(dest, source, src_ybc->y_width);
-        source += src_ybc->y_stride;
-        dest   += dst_ybc->y_stride;
-    }
-
-    source = src_ybc->u_buffer;
-    dest = dst_ybc->u_buffer;
-
-    for (row = 0; row < src_ybc->uv_height; row++)
-    {
-        vpx_memcpy(dest, source, src_ybc->uv_width);
-        source += src_ybc->uv_stride;
-        dest   += dst_ybc->uv_stride;
-    }
-
-    source = src_ybc->v_buffer;
-    dest = dst_ybc->v_buffer;
-
-    for (row = 0; row < src_ybc->uv_height; row++)
-    {
-        vpx_memcpy(dest, source, src_ybc->uv_width);
-        source += src_ybc->uv_stride;
-        dest   += dst_ybc->uv_stride;
-    }
-
-    vp8_yv12_extend_frame_borders(dst_ybc);
-}

diff --git a/vpx_scale/generic/bicubic_scaler.c b/vpx_scale/generic/bicubic_scaler.c
index 420f719..4468e9d 100644
--- a/vpx_scale/generic/bicubic_scaler.c
+++ b/vpx_scale/generic/bicubic_scaler.c

@@ -271,17 +271,17 @@
 {
     if (!g_first_time)
     {
-        if (g_b_scaler.l_w) vpx_free(g_b_scaler.l_w);
+        vpx_free(g_b_scaler.l_w);
 
-        if (g_b_scaler.l_h) vpx_free(g_b_scaler.l_h);
+        vpx_free(g_b_scaler.l_h);
 
-        if (g_b_scaler.l_h_uv) vpx_free(g_b_scaler.l_h_uv);
+        vpx_free(g_b_scaler.l_h_uv);
 
-        if (g_b_scaler.c_w) vpx_free(g_b_scaler.c_w);
+        vpx_free(g_b_scaler.c_w);
 
-        if (g_b_scaler.c_h) vpx_free(g_b_scaler.c_h);
+        vpx_free(g_b_scaler.c_h);
 
-        if (g_b_scaler.c_h_uv) vpx_free(g_b_scaler.c_h_uv);
+        vpx_free(g_b_scaler.c_h_uv);
 
         vpx_memset(&g_b_scaler, 0, sizeof(BICUBIC_SCALER_STRUCT));
     }
@@ -342,21 +342,21 @@
     d_h_uv = (in_height / 2) / gcd_h_uv;
 
     // allocate memory for the coefficents
-    if (g_b_scaler.l_w) vpx_free(g_b_scaler.l_w);
+    vpx_free(g_b_scaler.l_w);
 
-    if (g_b_scaler.l_h) vpx_free(g_b_scaler.l_h);
+    vpx_free(g_b_scaler.l_h);
 
-    if (g_b_scaler.l_h_uv) vpx_free(g_b_scaler.l_h_uv);
+    vpx_free(g_b_scaler.l_h_uv);
 
     g_b_scaler.l_w = (short *)vpx_memalign(32, out_width * 2);
     g_b_scaler.l_h = (short *)vpx_memalign(32, out_height * 2);
     g_b_scaler.l_h_uv = (short *)vpx_memalign(32, out_height * 2);
 
-    if (g_b_scaler.c_w) vpx_free(g_b_scaler.c_w);
+    vpx_free(g_b_scaler.c_w);
 
-    if (g_b_scaler.c_h) vpx_free(g_b_scaler.c_h);
+    vpx_free(g_b_scaler.c_h);
 
-    if (g_b_scaler.c_h_uv) vpx_free(g_b_scaler.c_h_uv);
+    vpx_free(g_b_scaler.c_h_uv);
 
     g_b_scaler.c_w = (short *)vpx_memalign(32, g_b_scaler.nw * 4 * 2);
     g_b_scaler.c_h = (short *)vpx_memalign(32, g_b_scaler.nh * 4 * 2);

diff --git a/vpx_scale/generic/scalesystemdependant.c b/vpx_scale/generic/scalesystemdependent.c
similarity index 100%
rename from vpx_scale/generic/scalesystemdependant.c
rename to vpx_scale/generic/scalesystemdependent.c


diff --git a/vpx_scale/generic/yv12config.c b/vpx_scale/generic/yv12config.c
index e7c5b18..cb0ab94 100644
--- a/vpx_scale/generic/yv12config.c
+++ b/vpx_scale/generic/yv12config.c

@@ -24,10 +24,7 @@
 {
     if (ybf)
     {
-        if (ybf->buffer_alloc)
-        {
             duck_free(ybf->buffer_alloc);
-        }
 
         ybf->buffer_alloc = 0;
     }
@@ -91,24 +88,3 @@
 
     return 0;
 }
-
-/****************************************************************************
- *
- ****************************************************************************/
-int
-vp8_yv12_black_frame_buffer(YV12_BUFFER_CONFIG *ybf)
-{
-    if (ybf)
-    {
-        if (ybf->buffer_alloc)
-        {
-            duck_memset(ybf->y_buffer, 0x0, ybf->y_stride * ybf->y_height);
-            duck_memset(ybf->u_buffer, 0x80, ybf->uv_stride * ybf->uv_height);
-            duck_memset(ybf->v_buffer, 0x80, ybf->uv_stride * ybf->uv_height);
-        }
-
-        return 0;
-    }
-
-    return -1;
-}

diff --git a/vpx_scale/generic/yv12extend.c b/vpx_scale/generic/yv12extend.c
index e58aa1f..c087bdd 100644
--- a/vpx_scale/generic/yv12extend.c
+++ b/vpx_scale/generic/yv12extend.c

@@ -145,8 +145,8 @@
 }
 
 
-void
-vp8_yv12_extend_frame_borders_yonly(YV12_BUFFER_CONFIG *ybf)
+static void
+extend_frame_borders_yonly(YV12_BUFFER_CONFIG *ybf)
 {
     int i;
     unsigned char *src_ptr1, *src_ptr2;
@@ -276,5 +276,5 @@
         dest   += dst_ybc->y_stride;
     }
 
-    vp8_yv12_extend_frame_borders_yonly(dst_ybc);
+    extend_frame_borders_yonly(dst_ybc);
 }

diff --git a/vpx_scale/include/leapster/vpxscale.h b/vpx_scale/include/leapster/vpxscale.h
deleted file mode 100644
index 5021849..0000000
--- a/vpx_scale/include/leapster/vpxscale.h
+++ /dev/null

@@ -1,62 +0,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-/****************************************************************************
-*
-*   Module Title :     postp.h
-*
-*   Description  :     Post processor interface
-*
-****************************************************************************/
-#ifndef VPXSCALE_H
-#define VPXSCALE_H
-
-
-// fwg 2004-10-14
-typedef void (*vpxvertical_band_4_5_scale_lf)(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
-typedef void (*vpxlast_vertical_band_4_5_scale_lf)(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
-typedef void (*vpxvertical_band_3_5_scale_lf)(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
-typedef void (*vpxlast_vertical_band_3_5_scale_lf)(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
-typedef void (*vpxhorizontal_line_1_2_scale_lf)(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width);
-typedef void (*vpxhorizontal_line_3_5_scale_lf)(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width);
-typedef void (*vpxhorizontal_line_4_5_scale_lf)(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width);
-typedef void (*vpxvertical_band_1_2_scale_lf)(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
-typedef void (*vpxlast_vertical_band_1_2_scale_lf)(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
-
-
-typedef struct vpxglobal_scalling_ptrs_t
-{
-    vpxvertical_band_4_5_scale_lf        vpxvertical_band_4_5_scale_t;
-    vpxlast_vertical_band_4_5_scale_lf    vpxlast_vertical_band_4_5_scale_t;
-    vpxvertical_band_3_5_scale_lf        vpxvertical_band_3_5_scale_t;
-    vpxlast_vertical_band_3_5_scale_lf    vpxlast_vertical_band_3_5_scale_t;
-    vpxhorizontal_line_1_2_scale_lf      vpxhorizontal_line_1_2_scale_t;
-    vpxhorizontal_line_3_5_scale_lf      vpxhorizontal_line_3_5_scale_t;
-    vpxhorizontal_line_4_5_scale_lf      vpxhorizontal_line_4_5_scale_t;
-    vpxvertical_band_1_2_scale_lf        vpxvertical_band_1_2_scale_t;
-    vpxlast_vertical_band_1_2_scale_lf    vpxlast_vertical_band_1_2_scale_t;
-} vpxglobal_scalling_ptrs;
-
-extern struct vpxglobal_scalling_ptrs_t *g_scaling_ptrs;
-
-/*
-extern void  (*vp8_vertical_band_4_5_scale)(unsigned char * dest,unsigned int dest_pitch,unsigned int dest_width);
-extern void  (*vp8_last_vertical_band_4_5_scale)(unsigned char * dest,unsigned int dest_pitch,unsigned int dest_width);
-extern void  (*vp8_vertical_band_3_5_scale)(unsigned char * dest,unsigned int dest_pitch,unsigned int dest_width);
-extern void  (*vp8_last_vertical_band_3_5_scale)(unsigned char * dest,unsigned int dest_pitch,unsigned int dest_width);
-extern void  (*vp8_horizontal_line_1_2_scale)(const unsigned char * source,unsigned int source_width,unsigned char * dest,unsigned int dest_width);
-extern void  (*vp8_horizontal_line_3_5_scale)(const unsigned char * source,unsigned int source_width,unsigned char * dest,unsigned int dest_width);
-extern void  (*vp8_horizontal_line_4_5_scale)(const unsigned char * source,unsigned int source_width,unsigned char * dest,unsigned int dest_width);
-extern void  (*vp8_vertical_band_1_2_scale)(unsigned char * dest,unsigned int dest_pitch,unsigned int dest_width);
-extern void  (*vp8_last_vertical_band_1_2_scale)(unsigned char * dest,unsigned int dest_pitch,unsigned int dest_width);
-*/
-
-#endif

diff --git a/vpx_scale/intel_linux/scaleopt.c b/vpx_scale/intel_linux/scaleopt.c
deleted file mode 100644
index 499f0ed..0000000
--- a/vpx_scale/intel_linux/scaleopt.c
+++ /dev/null

@@ -1,1853 +0,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-/****************************************************************************
-*
-*   Module Title :     scaleopt.cpp
-*
-*   Description  :     Optimized scaling functions
-*
-****************************************************************************/
-#include "pragmas.h"
-
-/****************************************************************************
-*  Module Statics
-****************************************************************************/
-#if 0
-__declspec(align(16)) const static unsigned short one_fifth[]  = { 51, 51, 51, 51 };
-__declspec(align(16)) const static unsigned short two_fifths[] = { 102, 102, 102, 102 };
-__declspec(align(16)) const static unsigned short three_fifths[] = { 154, 154, 154, 154 };
-__declspec(align(16)) const static unsigned short four_fifths[] = { 205, 205, 205, 205 };
-__declspec(align(16)) const static unsigned short round_values[] = { 128, 128, 128, 128 };
-__declspec(align(16)) const static unsigned short four_ones[] = { 1, 1, 1, 1};
-__declspec(align(16)) const static unsigned short const45_2[] = {205, 154, 102,  51 };
-__declspec(align(16)) const static unsigned short const45_1[] = { 51, 102, 154, 205 };
-__declspec(align(16)) const static unsigned char  mask45[] = { 0, 0, 0, 0, 0, 0, 255, 0};
-__declspec(align(16)) const static unsigned short const35_2[] = { 154,  51, 205, 102 };
-__declspec(align(16)) const static unsigned short const35_1[] = { 102, 205,  51, 154 };
-#endif
-
-#include "vpx_scale/vpxscale.h"
-#include "vpx_mem/vpx_mem.h"
-
-/****************************************************************************
- *
- *  ROUTINE       : horizontal_line_3_5_scale_mmx
- *
- *  INPUTS        : const unsigned char *source :
- *                  unsigned int source_width    :
- *                  unsigned char *dest         :
- *                  unsigned int dest_width      :
- *
- *  OUTPUTS       : None.
- *
- *  RETURNS       : void
- *
- *  FUNCTION      : 3 to 5 up-scaling of a horizontal line of pixels.
- *
- *  SPECIAL NOTES : None.
- *
- ****************************************************************************/
-static
-void horizontal_line_3_5_scale_mmx
-(
-    const unsigned char *source,
-    unsigned int source_width,
-    unsigned char *dest,
-    unsigned int dest_width
-)
-{
-    __declspec(align(16)) unsigned short const35_2[] = { 154,  51, 205, 102 };
-    __declspec(align(16)) unsigned short const35_1[] = { 102, 205,  51, 154 };
-    __declspec(align(16)) unsigned short round_values[] = { 128, 128, 128, 128 };
-
-    (void) dest_width;
-
-    __asm
-    {
-
-        push ebx
-
-        mov         esi,    source
-        mov         edi,    dest
-
-        mov         ecx,    source_width
-        lea         edx,    [esi+ecx-3];
-
-        movq        mm5,    const35_1       // mm5 = 66 xx cd xx 33 xx 9a xx
-        movq        mm6,    const35_2       // mm6 = 9a xx 33 xx cd xx 66 xx
-
-        movq        mm4,    round_values     // mm4 = 80 xx 80 xx 80 xx 80 xx
-        pxor        mm7,    mm7             // clear mm7
-
-        horiz_line_3_5_loop:
-
-        mov        eax,    DWORD PTR [esi] // eax = 00 01 02 03
-        mov        ebx,    eax
-
-        and         ebx,    0xffff00        // ebx = xx 01 02 xx
-        mov         ecx,    eax             // ecx = 00 01 02 03
-
-        and         eax,    0xffff0000      // eax = xx xx 02 03
-        xor         ecx,    eax             // ecx = 00 01 xx xx
-
-        shr         ebx,    8               // ebx = 01 02 xx xx
-        or          eax,    ebx             // eax = 01 02 02 03
-
-        shl         ebx,    16              // ebx = xx xx 01 02
-        movd        mm1,    eax             // mm1 = 01 02 02 03 xx xx xx xx
-
-        or          ebx,    ecx             // ebx = 00 01 01 02
-        punpcklbw   mm1,    mm7             // mm1 = 01 xx 02 xx 02 xx 03 xx
-
-        movd        mm0,    ebx             // mm0 = 00 01 01 02
-        pmullw      mm1,    mm6             //
-
-        punpcklbw   mm0,    mm7             // mm0 = 00 xx 01 xx 01 xx 02 xx
-        pmullw      mm0,    mm5             //
-
-        mov         [edi],  ebx             // writeoutput 00 xx xx xx
-        add         esi,    3
-
-        add         edi,    5
-        paddw       mm0,    mm1
-
-        paddw       mm0,    mm4
-        psrlw       mm0,    8
-
-        cmp         esi,    edx
-        packuswb    mm0,    mm7
-
-        movd        DWORD Ptr [edi-4], mm0
-        jl          horiz_line_3_5_loop
-
-//Exit:
-        mov         eax,    DWORD PTR [esi] // eax = 00 01 02 03
-        mov         ebx,    eax
-
-        and         ebx,    0xffff00        // ebx = xx 01 02 xx
-        mov         ecx,    eax             // ecx = 00 01 02 03
-
-        and         eax,    0xffff0000      // eax = xx xx 02 03
-        xor         ecx,    eax             // ecx = 00 01 xx xx
-
-        shr         ebx,    8               // ebx = 01 02 xx xx
-        or          eax,    ebx             // eax = 01 02 02 03
-
-        shl         eax,    8               // eax = xx 01 02 02
-        and         eax,    0xffff0000      // eax = xx xx 02 02
-
-        or          eax,    ebx             // eax = 01 02 02 02
-
-        shl         ebx,    16              // ebx = xx xx 01 02
-        movd        mm1,    eax             // mm1 = 01 02 02 02 xx xx xx xx
-
-        or          ebx,    ecx             // ebx = 00 01 01 02
-        punpcklbw   mm1,    mm7             // mm1 = 01 xx 02 xx 02 xx 02 xx
-
-        movd        mm0,    ebx             // mm0 = 00 01 01 02
-        pmullw      mm1,    mm6             //
-
-        punpcklbw   mm0,    mm7             // mm0 = 00 xx 01 xx 01 xx 02 xx
-        pmullw      mm0,    mm5             //
-
-        mov         [edi],  ebx             // writeoutput 00 xx xx xx
-        paddw       mm0,    mm1
-
-        paddw       mm0,    mm4
-        psrlw       mm0,    8
-
-        packuswb    mm0,    mm7
-        movd        DWORD Ptr [edi+1], mm0
-
-        pop ebx
-
-    }
-
-    /*
-    const unsigned char *src = source;
-    unsigned char *des = dest;
-    unsigned int a, b, c ;
-    unsigned int i;
-    (void) dest_width;
-
-    for ( i=0; i<source_width-3; i+=3 )
-    {
-        a = src[0];
-        b = src[1];
-        des [0] = (UINT8) (a);
-        // 2 * left + 3 * right /5
-        des [1] = (UINT8) (( a * 102 + 154 * b + 128 ) >> 8);
-        c = src[2] ;
-        // 4 * left + 1 * right /5
-        des [2] = (UINT8) (( b * 205 + c * 51 + 128 ) >> 8);
-        // 1 * left + 4 * right /5
-        des [3] = (UINT8) (( b * 51 + c * 205 + 128 ) >> 8);
-
-        a = src[3];
-        // 3 * left + 2 * right /5
-        des [4] = (UINT8) (( c * 154 + a * 102 + 128 ) >> 8);
-
-        src += 3;
-        des += 5;
-    }
-
-    a = src[0];
-    b = src[1];
-    des [0] = (UINT8) (a);
-    // 2 * left + 3 * right /5
-    des [1] = (UINT8) (( a * 102 + 154 * b + 128 ) >> 8);
-    c = src[2] ;
-    // 4 * left + 1 * right /5
-    des [2] = (UINT8) (( b * 205 + c * 51 + 128 ) >> 8);
-    // 1 * left + 4 * right /5
-    des [3] = (UINT8) (( b * 51 + c * 205 + 128 ) >> 8);
-
-    des [4] = (UINT8) (c);
-    */
-}
-
-
-/****************************************************************************
- *
- *  ROUTINE       : horizontal_line_4_5_scale_mmx
- *
- *  INPUTS        : const unsigned char *source :
- *                  unsigned int source_width    :
- *                  unsigned char *dest         :
- *                  unsigned int dest_width      :
- *
- *  OUTPUTS       : None.
- *
- *  RETURNS       : void
- *
- *  FUNCTION      : 4 to 5 up-scaling of a horizontal line of pixels.
- *
- *  SPECIAL NOTES : None.
- *
- ****************************************************************************/
-static
-void horizontal_line_4_5_scale_mmx
-(
-    const unsigned char *source,
-    unsigned int source_width,
-    unsigned char *dest,
-    unsigned int dest_width
-)
-{
-    __declspec(align(16)) unsigned short round_values[] = { 128, 128, 128, 128 };
-    __declspec(align(16)) unsigned short const45_2[] = {205, 154, 102,  51 };
-    __declspec(align(16)) unsigned short const45_1[] = { 51, 102, 154, 205 };
-    __declspec(align(16)) unsigned char  mask45[] = { 0, 0, 0, 0, 0, 0, 255, 0};
-
-    (void)dest_width;
-
-    __asm
-    {
-
-        mov         esi,    source
-        mov         edi,    dest
-
-        mov         ecx,    source_width
-        lea         edx,    [esi+ecx-8];
-
-        movq        mm5,    const45_1       // mm5 = 33 xx 66 xx 9a xx cd xx
-        movq        mm6,    const45_2       // mm6 = cd xx 9a xx 66 xx 33 xx
-
-        movq        mm4,    round_values     // mm4 = 80 xx 80 xx 80 xx 80 xx
-        pxor        mm7,    mm7             // clear mm7
-
-        horiz_line_4_5_loop:
-
-        movq        mm0,    QWORD PTR [esi]           // mm0 = 00 01 02 03 04 05 06 07
-        movq        mm1,    QWORD PTR [esi+1];        // mm1 = 01 02 03 04 05 06 07 08
-
-        movq        mm2,    mm0             // mm2 = 00 01 02 03 04 05 06 07
-        movq        mm3,    mm1             // mm3 = 01 02 03 04 05 06 07 08
-
-        movd        DWORD PTR [edi],  mm0             // write output 00 xx xx xx
-        punpcklbw   mm0,    mm7             // mm0 = 00 xx 01 xx 02 xx 03 xx
-
-        punpcklbw   mm1,    mm7             // mm1 = 01 xx 02 xx 03 xx 04 xx
-        pmullw      mm0,    mm5             // 00* 51 01*102 02*154 03*205
-
-        pmullw      mm1,    mm6             // 01*205 02*154 03*102 04* 51
-        punpckhbw   mm2,    mm7             // mm2 = 04 xx 05 xx 06 xx 07 xx
-
-        movd        DWORD PTR [edi+5], mm2            // write ouput 05 xx xx xx
-        pmullw      mm2,    mm5             // 04* 51 05*102 06*154 07*205
-
-        punpckhbw   mm3,    mm7             // mm3 = 05 xx 06 xx 07 xx 08 xx
-        pmullw      mm3,    mm6             // 05*205 06*154 07*102 08* 51
-
-        paddw       mm0,    mm1             // added round values
-        paddw       mm0,    mm4
-
-        psrlw       mm0,    8               // output: 01 xx 02 xx 03 xx 04 xx
-        packuswb    mm0,    mm7
-
-        movd        DWORD PTR [edi+1], mm0  // write output 01 02 03 04
-        add         edi,    10
-
-        add         esi,    8
-        paddw       mm2,    mm3             //
-
-        paddw       mm2,    mm4             // added round values
-        cmp         esi,    edx
-
-        psrlw       mm2,    8
-        packuswb    mm2,    mm7
-
-        movd        DWORD PTR [edi-4], mm2 // writeoutput 06 07 08 09
-        jl         horiz_line_4_5_loop
-
-//Exit:
-        movq        mm0,    [esi]           // mm0 = 00 01 02 03 04 05 06 07
-        movq        mm1,    mm0             // mm1 = 00 01 02 03 04 05 06 07
-
-        movq        mm2,    mm0             // mm2 = 00 01 02 03 04 05 06 07
-        psrlq       mm1,    8               // mm1 = 01 02 03 04 05 06 07 00
-
-        movq        mm3,    mask45          // mm3 = 00 00 00 00 00 00 ff 00
-        pand        mm3,    mm1             // mm3 = 00 00 00 00 00 00 07 00
-
-        psllq       mm3,    8               // mm3 = 00 00 00 00 00 00 00 07
-        por         mm1,    mm3             // mm1 = 01 02 03 04 05 06 07 07
-
-        movq        mm3,    mm1
-
-        movd        DWORD PTR [edi],  mm0   // write output 00 xx xx xx
-        punpcklbw   mm0,    mm7             // mm0 = 00 xx 01 xx 02 xx 03 xx
-
-        punpcklbw   mm1,    mm7             // mm1 = 01 xx 02 xx 03 xx 04 xx
-        pmullw      mm0,    mm5             // 00* 51 01*102 02*154 03*205
-
-        pmullw      mm1,    mm6             // 01*205 02*154 03*102 04* 51
-        punpckhbw   mm2,    mm7             // mm2 = 04 xx 05 xx 06 xx 07 xx
-
-        movd        DWORD PTR [edi+5], mm2  // write ouput 05 xx xx xx
-        pmullw      mm2,    mm5             // 04* 51 05*102 06*154 07*205
-
-        punpckhbw   mm3,    mm7             // mm3 = 05 xx 06 xx 07 xx 08 xx
-        pmullw      mm3,    mm6             // 05*205 06*154 07*102 07* 51
-
-        paddw       mm0,    mm1             // added round values
-        paddw       mm0,    mm4
-
-        psrlw       mm0,    8               // output: 01 xx 02 xx 03 xx 04 xx
-        packuswb    mm0,    mm7             // 01 02 03 04 xx xx xx xx
-
-        movd        DWORD PTR [edi+1], mm0  // write output 01 02 03 04
-        paddw       mm2,    mm3             //
-
-        paddw       mm2,    mm4             // added round values
-        psrlw       mm2,    8
-
-        packuswb    mm2,    mm7
-        movd        DWORD PTR [edi+6], mm2  // writeoutput 06 07 08 09
-
-
-    }
-    /*
-        const unsigned char *src = source;
-        unsigned char *des = dest;
-        unsigned int a, b, c ;
-        unsigned i;
-        (void) dest_width;
-
-        for ( i=0; i<source_width-4; i+=4 )
-        {
-            a = src[0];
-            b = src[1];
-            des [0] = (UINT8) a;
-            des [1] = (UINT8) (( a * 51 + 205 * b + 128) >> 8);
-            c = src[2] * 154;
-            a = src[3];
-            des [2] = (UINT8) (( b * 102 + c + 128) >> 8);
-            des [3] = (UINT8) (( c + 102 * a + 128) >> 8);
-            b = src[4];
-            des [4] = (UINT8) (( a * 205 + 51 * b + 128) >> 8);
-
-            src += 4;
-            des += 5;
-        }
-
-        a = src[0];
-        b = src[1];
-        des [0] = (UINT8) (a);
-        des [1] = (UINT8) (( a * 51 + 205 * b + 128) >> 8);
-        c = src[2] * 154;
-        a = src[3];
-        des [2] = (UINT8) (( b * 102 + c + 128) >> 8);
-        des [3] = (UINT8) (( c + 102 * a + 128) >> 8);
-        des [4] = (UINT8) (a);
-    */
-}
-
-/****************************************************************************
- *
- *  ROUTINE       : vertical_band_4_5_scale_mmx
- *
- *  INPUTS        : unsigned char *dest    :
- *                  unsigned int dest_pitch :
- *                  unsigned int dest_width :
- *
- *  OUTPUTS       : None.
- *
- *  RETURNS       : void
- *
- *  FUNCTION      : 4 to 5 up-scaling of a 4 pixel high band of pixels.
- *
- *  SPECIAL NOTES : The routine uses the first line of the band below
- *                  the current band. The function also has a "C" only
- *                  version.
- *
- ****************************************************************************/
-static
-void vertical_band_4_5_scale_mmx
-(
-    unsigned char *dest,
-    unsigned int dest_pitch,
-    unsigned int dest_width
-)
-{
-
-    __declspec(align(16)) unsigned short one_fifth[]  = { 51, 51, 51, 51 };
-    __declspec(align(16)) unsigned short two_fifths[] = { 102, 102, 102, 102 };
-    __declspec(align(16)) unsigned short three_fifths[] = { 154, 154, 154, 154 };
-    __declspec(align(16)) unsigned short four_fifths[] = { 205, 205, 205, 205 };
-    __declspec(align(16)) unsigned short round_values[] = { 128, 128, 128, 128 };
-
-    __asm
-    {
-
-        mov         esi,    dest                    // Get the source and destination pointer
-        mov         ecx,    dest_pitch               // Get the pitch size
-
-        lea         edi,    [esi+ecx*2]             // tow lines below
-        add         edi,    ecx                     // three lines below
-
-        pxor        mm7,    mm7                     // clear out mm7
-        mov         edx,    dest_width               // Loop counter
-
-        vs_4_5_loop:
-
-        movq        mm0,    QWORD ptr [esi]         // src[0];
-        movq        mm1,    QWORD ptr [esi+ecx]     // src[1];
-
-        movq        mm2,    mm0                     // Make a copy
-        punpcklbw   mm0,    mm7                     // unpack low to word
-
-        movq        mm5,    one_fifth
-        punpckhbw   mm2,    mm7                     // unpack high to word
-
-        pmullw      mm0,    mm5                     // a * 1/5
-
-        movq        mm3,    mm1                     // make a copy
-        punpcklbw   mm1,    mm7                     // unpack low to word
-
-        pmullw      mm2,    mm5                     // a * 1/5
-        movq        mm6,    four_fifths               // constan
-
-        movq        mm4,    mm1                     // copy of low b
-        pmullw      mm4,    mm6                     // b * 4/5
-
-        punpckhbw   mm3,    mm7                     // unpack high to word
-        movq        mm5,    mm3                     // copy of high b
-
-        pmullw      mm5,    mm6                     // b * 4/5
-        paddw       mm0,    mm4                     // a * 1/5 + b * 4/5
-
-        paddw       mm2,    mm5                     // a * 1/5 + b * 4/5
-        paddw       mm0,    round_values             // + 128
-
-        paddw       mm2,    round_values             // + 128
-        psrlw       mm0,    8
-
-        psrlw       mm2,    8
-        packuswb    mm0,    mm2                     // des [1]
-
-        movq        QWORD ptr [esi+ecx], mm0        // write des[1]
-        movq        mm0,    [esi+ecx*2]             // mm0 = src[2]
-
-        // mm1, mm3 --- Src[1]
-        // mm0 --- Src[2]
-        // mm7 for unpacking
-
-        movq        mm5,    two_fifths
-        movq        mm2,    mm0                     // make a copy
-
-        pmullw      mm1,    mm5                     // b * 2/5
-        movq        mm6,    three_fifths
-
-
-        punpcklbw   mm0,    mm7                     // unpack low to word
-        pmullw      mm3,    mm5                     // b * 2/5
-
-        movq        mm4,    mm0                     // make copy of c
-        punpckhbw   mm2,    mm7                     // unpack high to word
-
-        pmullw      mm4,    mm6                     // c * 3/5
-        movq        mm5,    mm2
-
-        pmullw      mm5,    mm6                     // c * 3/5
-        paddw       mm1,    mm4                     // b * 2/5 + c * 3/5
-
-        paddw       mm3,    mm5                     // b * 2/5 + c * 3/5
-        paddw       mm1,    round_values             // + 128
-
-        paddw       mm3,    round_values             // + 128
-        psrlw       mm1,    8
-
-        psrlw       mm3,    8
-        packuswb    mm1,    mm3                     // des[2]
-
-        movq        QWORD ptr [esi+ecx*2], mm1      // write des[2]
-        movq        mm1,    [edi]                   // mm1=Src[3];
-
-        // mm0, mm2 --- Src[2]
-        // mm1 --- Src[3]
-        // mm6 --- 3/5
-        // mm7 for unpacking
-
-        pmullw      mm0,    mm6                     // c * 3/5
-        movq        mm5,    two_fifths               // mm5 = 2/5
-
-        movq        mm3,    mm1                     // make a copy
-        pmullw      mm2,    mm6                     // c * 3/5
-
-        punpcklbw   mm1,    mm7                     // unpack low
-        movq        mm4,    mm1                     // make a copy
-
-        punpckhbw   mm3,    mm7                     // unpack high
-        pmullw      mm4,    mm5                     // d * 2/5
-
-        movq        mm6,    mm3                     // make a copy
-        pmullw      mm6,    mm5                     // d * 2/5
-
-        paddw       mm0,    mm4                     // c * 3/5 + d * 2/5
-        paddw       mm2,    mm6                     // c * 3/5 + d * 2/5
-
-        paddw       mm0,    round_values             // + 128
-        paddw       mm2,    round_values             // + 128
-
-        psrlw       mm0,    8
-        psrlw       mm2,    8
-
-        packuswb    mm0,    mm2                     // des[3]
-        movq        QWORD ptr [edi], mm0            // write des[3]
-
-        //  mm1, mm3 --- Src[3]
-        //  mm7 -- cleared for unpacking
-
-        movq        mm0,    [edi+ecx*2]             // mm0, Src[0] of the next group
-
-        movq        mm5,    four_fifths              // mm5 = 4/5
-        pmullw      mm1,    mm5                     // d * 4/5
-
-        movq        mm6,    one_fifth                // mm6 = 1/5
-        movq        mm2,    mm0                     // make a copy
-
-        pmullw      mm3,    mm5                     // d * 4/5
-        punpcklbw   mm0,    mm7                     // unpack low
-
-        pmullw      mm0,    mm6                     // an * 1/5
-        punpckhbw   mm2,    mm7                     // unpack high
-
-        paddw       mm1,    mm0                     // d * 4/5 + an * 1/5
-        pmullw      mm2,    mm6                     // an * 1/5
-
-        paddw       mm3,    mm2                     // d * 4/5 + an * 1/5
-        paddw       mm1,    round_values             // + 128
-
-        paddw       mm3,    round_values             // + 128
-        psrlw       mm1,    8
-
-        psrlw       mm3,    8
-        packuswb    mm1,    mm3                     // des[4]
-
-        movq        QWORD ptr [edi+ecx], mm1        // write des[4]
-
-        add         edi,    8
-        add         esi,    8
-
-        sub         edx,    8
-        jg         vs_4_5_loop
-    }
-}
-
-/****************************************************************************
- *
- *  ROUTINE       : last_vertical_band_4_5_scale_mmx
- *
- *  INPUTS        : unsigned char *dest    :
- *                  unsigned int dest_pitch :
- *                  unsigned int dest_width :
- *
- *  OUTPUTS       : None.
- *
- *  RETURNS       : None
- *
- *  FUNCTION      : 4 to 5 up-scaling of the last 4-pixel high band in an image.
- *
- *  SPECIAL NOTES : The routine uses the first line of the band below
- *                  the current band. The function also has an "C" only
- *                  version.
- *
- ****************************************************************************/
-static
-void last_vertical_band_4_5_scale_mmx
-(
-    unsigned char *dest,
-    unsigned int dest_pitch,
-    unsigned int dest_width
-)
-{
-    __declspec(align(16)) unsigned short one_fifth[]  = { 51, 51, 51, 51 };
-    __declspec(align(16)) unsigned short two_fifths[] = { 102, 102, 102, 102 };
-    __declspec(align(16)) unsigned short three_fifths[] = { 154, 154, 154, 154 };
-    __declspec(align(16)) unsigned short four_fifths[] = { 205, 205, 205, 205 };
-    __declspec(align(16)) unsigned short round_values[] = { 128, 128, 128, 128 };
-
-    __asm
-    {
-        mov         esi,    dest                    // Get the source and destination pointer
-        mov         ecx,    dest_pitch               // Get the pitch size
-
-        lea         edi,    [esi+ecx*2]             // tow lines below
-        add         edi,    ecx                     // three lines below
-
-        pxor        mm7,    mm7                     // clear out mm7
-        mov         edx,    dest_width               // Loop counter
-
-        last_vs_4_5_loop:
-
-        movq        mm0,    QWORD ptr [esi]         // src[0];
-        movq        mm1,    QWORD ptr [esi+ecx]     // src[1];
-
-        movq        mm2,    mm0                     // Make a copy
-        punpcklbw   mm0,    mm7                     // unpack low to word
-
-        movq        mm5,    one_fifth
-        punpckhbw   mm2,    mm7                     // unpack high to word
-
-        pmullw      mm0,    mm5                     // a * 1/5
-
-        movq        mm3,    mm1                     // make a copy
-        punpcklbw   mm1,    mm7                     // unpack low to word
-
-        pmullw      mm2,    mm5                     // a * 1/5
-        movq        mm6,    four_fifths               // constan
-
-        movq        mm4,    mm1                     // copy of low b
-        pmullw      mm4,    mm6                     // b * 4/5
-
-        punpckhbw   mm3,    mm7                     // unpack high to word
-        movq        mm5,    mm3                     // copy of high b
-
-        pmullw      mm5,    mm6                     // b * 4/5
-        paddw       mm0,    mm4                     // a * 1/5 + b * 4/5
-
-        paddw       mm2,    mm5                     // a * 1/5 + b * 4/5
-        paddw       mm0,    round_values             // + 128
-
-        paddw       mm2,    round_values             // + 128
-        psrlw       mm0,    8
-
-        psrlw       mm2,    8
-        packuswb    mm0,    mm2                     // des [1]
-
-        movq        QWORD ptr [esi+ecx], mm0        // write des[1]
-        movq        mm0,    [esi+ecx*2]             // mm0 = src[2]
-
-        // mm1, mm3 --- Src[1]
-        // mm0 --- Src[2]
-        // mm7 for unpacking
-
-        movq        mm5,    two_fifths
-        movq        mm2,    mm0                     // make a copy
-
-        pmullw      mm1,    mm5                     // b * 2/5
-        movq        mm6,    three_fifths
-
-
-        punpcklbw   mm0,    mm7                     // unpack low to word
-        pmullw      mm3,    mm5                     // b * 2/5
-
-        movq        mm4,    mm0                     // make copy of c
-        punpckhbw   mm2,    mm7                     // unpack high to word
-
-        pmullw      mm4,    mm6                     // c * 3/5
-        movq        mm5,    mm2
-
-        pmullw      mm5,    mm6                     // c * 3/5
-        paddw       mm1,    mm4                     // b * 2/5 + c * 3/5
-
-        paddw       mm3,    mm5                     // b * 2/5 + c * 3/5
-        paddw       mm1,    round_values             // + 128
-
-        paddw       mm3,    round_values             // + 128
-        psrlw       mm1,    8
-
-        psrlw       mm3,    8
-        packuswb    mm1,    mm3                     // des[2]
-
-        movq        QWORD ptr [esi+ecx*2], mm1      // write des[2]
-        movq        mm1,    [edi]                   // mm1=Src[3];
-
-        movq        QWORD ptr [edi+ecx], mm1        // write des[4];
-
-        // mm0, mm2 --- Src[2]
-        // mm1 --- Src[3]
-        // mm6 --- 3/5
-        // mm7 for unpacking
-
-        pmullw      mm0,    mm6                     // c * 3/5
-        movq        mm5,    two_fifths               // mm5 = 2/5
-
-        movq        mm3,    mm1                     // make a copy
-        pmullw      mm2,    mm6                     // c * 3/5
-
-        punpcklbw   mm1,    mm7                     // unpack low
-        movq        mm4,    mm1                     // make a copy
-
-        punpckhbw   mm3,    mm7                     // unpack high
-        pmullw      mm4,    mm5                     // d * 2/5
-
-        movq        mm6,    mm3                     // make a copy
-        pmullw      mm6,    mm5                     // d * 2/5
-
-        paddw       mm0,    mm4                     // c * 3/5 + d * 2/5
-        paddw       mm2,    mm6                     // c * 3/5 + d * 2/5
-
-        paddw       mm0,    round_values             // + 128
-        paddw       mm2,    round_values             // + 128
-
-        psrlw       mm0,    8
-        psrlw       mm2,    8
-
-        packuswb    mm0,    mm2                     // des[3]
-        movq        QWORD ptr [edi], mm0            // write des[3]
-
-        //  mm1, mm3 --- Src[3]
-        //  mm7 -- cleared for unpacking
-        add         edi,    8
-        add         esi,    8
-
-        sub         edx,    8
-        jg          last_vs_4_5_loop
-    }
-}
-
-/****************************************************************************
- *
- *  ROUTINE       : vertical_band_3_5_scale_mmx
- *
- *  INPUTS        : unsigned char *dest    :
- *                  unsigned int dest_pitch :
- *                  unsigned int dest_width :
- *
- *  OUTPUTS       : None.
- *
- *  RETURNS       : void
- *
- *  FUNCTION      : 3 to 5 up-scaling of a 3-pixel high band of pixels.
- *
- *  SPECIAL NOTES : The routine uses the first line of the band below
- *                  the current band. The function also has an "C" only
- *                  version.
- *
- ****************************************************************************/
-static
-void vertical_band_3_5_scale_mmx
-(
-    unsigned char *dest,
-    unsigned int dest_pitch,
-    unsigned int dest_width
-)
-{
-    __declspec(align(16)) unsigned short one_fifth[]  = { 51, 51, 51, 51 };
-    __declspec(align(16)) unsigned short two_fifths[] = { 102, 102, 102, 102 };
-    __declspec(align(16)) unsigned short three_fifths[] = { 154, 154, 154, 154 };
-    __declspec(align(16)) unsigned short four_fifths[] = { 205, 205, 205, 205 };
-    __declspec(align(16)) unsigned short round_values[] = { 128, 128, 128, 128 };
-
-    __asm
-    {
-        mov         esi,    dest                    // Get the source and destination pointer
-        mov         ecx,    dest_pitch               // Get the pitch size
-
-        lea         edi,    [esi+ecx*2]             // tow lines below
-        add         edi,    ecx                     // three lines below
-
-        pxor        mm7,    mm7                     // clear out mm7
-        mov         edx,    dest_width               // Loop counter
-
-        vs_3_5_loop:
-
-        movq        mm0,    QWORD ptr [esi]         // src[0];
-        movq        mm1,    QWORD ptr [esi+ecx]     // src[1];
-
-        movq        mm2,    mm0                     // Make a copy
-        punpcklbw   mm0,    mm7                     // unpack low to word
-
-        movq        mm5,    two_fifths               // mm5 = 2/5
-        punpckhbw   mm2,    mm7                     // unpack high to word
-
-        pmullw      mm0,    mm5                     // a * 2/5
-
-        movq        mm3,    mm1                     // make a copy
-        punpcklbw   mm1,    mm7                     // unpack low to word
-
-        pmullw      mm2,    mm5                     // a * 2/5
-        movq        mm6,    three_fifths             // mm6 = 3/5
-
-        movq        mm4,    mm1                     // copy of low b
-        pmullw      mm4,    mm6                     // b * 3/5
-
-        punpckhbw   mm3,    mm7                     // unpack high to word
-        movq        mm5,    mm3                     // copy of high b
-
-        pmullw      mm5,    mm6                     // b * 3/5
-        paddw       mm0,    mm4                     // a * 2/5 + b * 3/5
-
-        paddw       mm2,    mm5                     // a * 2/5 + b * 3/5
-        paddw       mm0,    round_values             // + 128
-
-        paddw       mm2,    round_values             // + 128
-        psrlw       mm0,    8
-
-        psrlw       mm2,    8
-        packuswb    mm0,    mm2                     // des [1]
-
-        movq        QWORD ptr [esi+ecx], mm0        // write des[1]
-        movq        mm0,    [esi+ecx*2]             // mm0 = src[2]
-
-        // mm1, mm3 --- Src[1]
-        // mm0 --- Src[2]
-        // mm7 for unpacking
-
-        movq        mm4,    mm1                     // b low
-        pmullw      mm1,    four_fifths              // b * 4/5 low
-
-        movq        mm5,    mm3                     // b high
-        pmullw      mm3,    four_fifths              // b * 4/5 high
-
-        movq        mm2,    mm0                     // c
-        pmullw      mm4,    one_fifth                // b * 1/5
-
-        punpcklbw   mm0,    mm7                     // c low
-        pmullw      mm5,    one_fifth                // b * 1/5
-
-        movq        mm6,    mm0                     // make copy of c low
-        punpckhbw   mm2,    mm7                     // c high
-
-        pmullw      mm6,    one_fifth                // c * 1/5 low
-        movq        mm7,    mm2                     // make copy of c high
-
-        pmullw      mm7,    one_fifth                // c * 1/5 high
-        paddw       mm1,    mm6                     // b * 4/5 + c * 1/5 low
-
-        paddw       mm3,    mm7                     // b * 4/5 + c * 1/5 high
-        movq        mm6,    mm0                     // make copy of c low
-
-        pmullw      mm6,    four_fifths              // c * 4/5 low
-        movq        mm7,    mm2                     // make copy of c high
-
-        pmullw      mm7,    four_fifths              // c * 4/5 high
-
-        paddw       mm4,    mm6                     // b * 1/5 + c * 4/5 low
-        paddw       mm5,    mm7                     // b * 1/5 + c * 4/5 high
-
-        paddw       mm1,    round_values             // + 128
-        paddw       mm3,    round_values             // + 128
-
-        psrlw       mm1,    8
-        psrlw       mm3,    8
-
-        packuswb    mm1,    mm3                     // des[2]
-        movq        QWORD ptr [esi+ecx*2], mm1      // write des[2]
-
-        paddw       mm4,    round_values             // + 128
-        paddw       mm5,    round_values             // + 128
-
-        psrlw       mm4,    8
-        psrlw       mm5,    8
-
-        packuswb    mm4,    mm5                     // des[3]
-        movq        QWORD ptr [edi], mm4            // write des[3]
-
-        //  mm0, mm2 --- Src[3]
-
-        pxor        mm7,    mm7                     // clear mm7 for unpacking
-        movq        mm1,    [edi+ecx*2]             // mm1 = Src[0] of the next group
-
-        movq        mm5,    three_fifths             // mm5 = 3/5
-        pmullw      mm0,    mm5                     // d * 3/5
-
-        movq        mm6,    two_fifths                // mm6 = 2/5
-        movq        mm3,    mm1                     // make a copy
-
-        pmullw      mm2,    mm5                     // d * 3/5
-        punpcklbw   mm1,    mm7                     // unpack low
-
-        pmullw      mm1,    mm6                     // an * 2/5
-        punpckhbw   mm3,    mm7                     // unpack high
-
-        paddw       mm0,    mm1                     // d * 3/5 + an * 2/5
-        pmullw      mm3,    mm6                     // an * 2/5
-
-        paddw       mm2,    mm3                     // d * 3/5 + an * 2/5
-        paddw       mm0,    round_values             // + 128
-
-        paddw       mm2,    round_values             // + 128
-        psrlw       mm0,    8
-
-        psrlw       mm2,    8
-        packuswb    mm0,    mm2                     // des[4]
-
-        movq        QWORD ptr [edi+ecx], mm0        // write des[4]
-
-        add         edi,    8
-        add         esi,    8
-
-        sub         edx,    8
-        jg          vs_3_5_loop
-    }
-}
-
-/****************************************************************************
- *
- *  ROUTINE       : last_vertical_band_3_5_scale_mmx
- *
- *  INPUTS        : unsigned char *dest    :
- *                  unsigned int dest_pitch :
- *                  unsigned int dest_width :
- *
- *  OUTPUTS       : None.
- *
- *  RETURNS       : void
- *
- *  FUNCTION      : 3 to 5 up-scaling of a 3-pixel high band of pixels.
- *
- *  SPECIAL NOTES : The routine uses the first line of the band below
- *                  the current band. The function also has an "C" only
- *                  version.
- *
- ****************************************************************************/
-static
-void last_vertical_band_3_5_scale_mmx
-(
-    unsigned char *dest,
-    unsigned int dest_pitch,
-    unsigned int dest_width
-)
-{
-    __declspec(align(16)) unsigned short one_fifth[]  = { 51, 51, 51, 51 };
-    __declspec(align(16)) unsigned short two_fifths[] = { 102, 102, 102, 102 };
-    __declspec(align(16)) unsigned short three_fifths[] = { 154, 154, 154, 154 };
-    __declspec(align(16)) unsigned short four_fifths[] = { 205, 205, 205, 205 };
-    __declspec(align(16)) unsigned short round_values[] = { 128, 128, 128, 128 };
-    __asm
-    {
-        mov         esi,    dest                    // Get the source and destination pointer
-        mov         ecx,    dest_pitch               // Get the pitch size
-
-        lea         edi,    [esi+ecx*2]             // tow lines below
-        add         edi,    ecx                     // three lines below
-
-        pxor        mm7,    mm7                     // clear out mm7
-        mov         edx,    dest_width               // Loop counter
-
-
-        last_vs_3_5_loop:
-
-        movq        mm0,    QWORD ptr [esi]         // src[0];
-        movq        mm1,    QWORD ptr [esi+ecx]     // src[1];
-
-        movq        mm2,    mm0                     // Make a copy
-        punpcklbw   mm0,    mm7                     // unpack low to word
-
-        movq        mm5,    two_fifths               // mm5 = 2/5
-        punpckhbw   mm2,    mm7                     // unpack high to word
-
-        pmullw      mm0,    mm5                     // a * 2/5
-
-        movq        mm3,    mm1                     // make a copy
-        punpcklbw   mm1,    mm7                     // unpack low to word
-
-        pmullw      mm2,    mm5                     // a * 2/5
-        movq        mm6,    three_fifths             // mm6 = 3/5
-
-        movq        mm4,    mm1                     // copy of low b
-        pmullw      mm4,    mm6                     // b * 3/5
-
-        punpckhbw   mm3,    mm7                     // unpack high to word
-        movq        mm5,    mm3                     // copy of high b
-
-        pmullw      mm5,    mm6                     // b * 3/5
-        paddw       mm0,    mm4                     // a * 2/5 + b * 3/5
-
-        paddw       mm2,    mm5                     // a * 2/5 + b * 3/5
-        paddw       mm0,    round_values             // + 128
-
-        paddw       mm2,    round_values             // + 128
-        psrlw       mm0,    8
-
-        psrlw       mm2,    8
-        packuswb    mm0,    mm2                     // des [1]
-
-        movq        QWORD ptr [esi+ecx], mm0        // write des[1]
-        movq        mm0,    [esi+ecx*2]             // mm0 = src[2]
-
-
-
-        // mm1, mm3 --- Src[1]
-        // mm0 --- Src[2]
-        // mm7 for unpacking
-
-        movq        mm4,    mm1                     // b low
-        pmullw      mm1,    four_fifths              // b * 4/5 low
-
-        movq        QWORD ptr [edi+ecx], mm0        // write des[4]
-
-        movq        mm5,    mm3                     // b high
-        pmullw      mm3,    four_fifths              // b * 4/5 high
-
-        movq        mm2,    mm0                     // c
-        pmullw      mm4,    one_fifth                // b * 1/5
-
-        punpcklbw   mm0,    mm7                     // c low
-        pmullw      mm5,    one_fifth                // b * 1/5
-
-        movq        mm6,    mm0                     // make copy of c low
-        punpckhbw   mm2,    mm7                     // c high
-
-        pmullw      mm6,    one_fifth                // c * 1/5 low
-        movq        mm7,    mm2                     // make copy of c high
-
-        pmullw      mm7,    one_fifth                // c * 1/5 high
-        paddw       mm1,    mm6                     // b * 4/5 + c * 1/5 low
-
-        paddw       mm3,    mm7                     // b * 4/5 + c * 1/5 high
-        movq        mm6,    mm0                     // make copy of c low
-
-        pmullw      mm6,    four_fifths              // c * 4/5 low
-        movq        mm7,    mm2                     // make copy of c high
-
-        pmullw      mm7,    four_fifths              // c * 4/5 high
-
-        paddw       mm4,    mm6                     // b * 1/5 + c * 4/5 low
-        paddw       mm5,    mm7                     // b * 1/5 + c * 4/5 high
-
-        paddw       mm1,    round_values             // + 128
-        paddw       mm3,    round_values             // + 128
-
-        psrlw       mm1,    8
-        psrlw       mm3,    8
-
-        packuswb    mm1,    mm3                     // des[2]
-        movq        QWORD ptr [esi+ecx*2], mm1      // write des[2]
-
-        paddw       mm4,    round_values             // + 128
-        paddw       mm5,    round_values             // + 128
-
-        psrlw       mm4,    8
-        psrlw       mm5,    8
-
-        packuswb    mm4,    mm5                     // des[3]
-        movq        QWORD ptr [edi], mm4            // write des[3]
-
-        //  mm0, mm2 --- Src[3]
-
-        add         edi,    8
-        add         esi,    8
-
-        sub         edx,    8
-        jg          last_vs_3_5_loop
-    }
-}
-
-/****************************************************************************
- *
- *  ROUTINE       : vertical_band_1_2_scale_mmx
- *
- *  INPUTS        : unsigned char *dest    :
- *                  unsigned int dest_pitch :
- *                  unsigned int dest_width :
- *
- *  OUTPUTS       : None.
- *
- *  RETURNS       : void
- *
- *  FUNCTION      : 1 to 2 up-scaling of a band of pixels.
- *
- *  SPECIAL NOTES : The routine uses the first line of the band below
- *                  the current band. The function also has an "C" only
- *                  version.
- *
- ****************************************************************************/
-static
-void vertical_band_1_2_scale_mmx
-(
-    unsigned char *dest,
-    unsigned int dest_pitch,
-    unsigned int dest_width
-)
-{
-    __declspec(align(16))unsigned short four_ones[] = { 1, 1, 1, 1};
-
-    __asm
-    {
-
-        mov         esi,    dest                    // Get the source and destination pointer
-        mov         ecx,    dest_pitch               // Get the pitch size
-
-        pxor        mm7,    mm7                     // clear out mm7
-        mov         edx,    dest_width               // Loop counter
-
-        vs_1_2_loop:
-
-        movq        mm0,    [esi]                   // get Src[0]
-        movq        mm1,    [esi + ecx * 2]         // get Src[1]
-
-        movq        mm2,    mm0                     // make copy before unpack
-        movq        mm3,    mm1                     // make copy before unpack
-
-        punpcklbw   mm0,    mm7                     // low Src[0]
-        movq        mm6,    four_ones                // mm6= 1, 1, 1, 1
-
-        punpcklbw   mm1,    mm7                     // low Src[1]
-        paddw       mm0,    mm1                     // low (a + b)
-
-        punpckhbw   mm2,    mm7                     // high Src[0]
-        paddw       mm0,    mm6                     // low (a + b + 1)
-
-        punpckhbw   mm3,    mm7
-        paddw       mm2,    mm3                     // high (a + b )
-
-        psraw       mm0,    1                       // low (a + b +1 )/2
-        paddw       mm2,    mm6                     // high (a + b + 1)
-
-        psraw       mm2,    1                       // high (a + b + 1)/2
-        packuswb    mm0,    mm2                     // pack results
-
-        movq        [esi+ecx], mm0                  // write out eight bytes
-        add         esi,    8
-
-        sub         edx,    8
-        jg          vs_1_2_loop
-    }
-
-}
-
-/****************************************************************************
- *
- *  ROUTINE       : last_vertical_band_1_2_scale_mmx
- *
- *  INPUTS        : unsigned char *dest    :
- *                  unsigned int dest_pitch :
- *                  unsigned int dest_width :
- *
- *  OUTPUTS       : None.
- *
- *  RETURNS       : void
- *
- *  FUNCTION      : 1 to 2 up-scaling of band of pixels.
- *
- *  SPECIAL NOTES : The routine uses the first line of the band below
- *                  the current band. The function also has an "C" only
- *                  version.
- *
- ****************************************************************************/
-static
-void last_vertical_band_1_2_scale_mmx
-(
-    unsigned char *dest,
-    unsigned int dest_pitch,
-    unsigned int dest_width
-)
-{
-    __asm
-    {
-        mov         esi,    dest                    // Get the source and destination pointer
-        mov         ecx,    dest_pitch               // Get the pitch size
-
-        mov         edx,    dest_width               // Loop counter
-
-        last_vs_1_2_loop:
-
-        movq        mm0,    [esi]                   // get Src[0]
-        movq        [esi+ecx], mm0                  // write out eight bytes
-
-        add         esi,    8
-        sub         edx,    8
-
-        jg         last_vs_1_2_loop
-    }
-}
-
-/****************************************************************************
- *
- *  ROUTINE       : horizontal_line_1_2_scale
- *
- *  INPUTS        : const unsigned char *source :
- *                  unsigned int source_width    :
- *                  unsigned char *dest         :
- *                  unsigned int dest_width      :
- *
- *  OUTPUTS       : None.
- *
- *  RETURNS       : void
- *
- *  FUNCTION      : 1 to 2 up-scaling of a horizontal line of pixels.
- *
- *  SPECIAL NOTES : None.
- *
- ****************************************************************************/
-static
-void horizontal_line_1_2_scale_mmx
-(
-    const unsigned char *source,
-    unsigned int source_width,
-    unsigned char *dest,
-    unsigned int dest_width
-)
-{
-    __declspec(align(16))unsigned short four_ones[] = { 1, 1, 1, 1};
-
-    (void) dest_width;
-
-    __asm
-    {
-        mov         esi,    source
-        mov         edi,    dest
-
-        pxor        mm7,    mm7
-        movq        mm6,    four_ones
-
-        mov         ecx,    source_width
-
-        hs_1_2_loop:
-
-        movq        mm0,    [esi]
-        movq        mm1,    [esi+1]
-
-        movq        mm2,    mm0
-        movq        mm3,    mm1
-
-        movq        mm4,    mm0
-        punpcklbw   mm0,    mm7
-
-        punpcklbw   mm1,    mm7
-        paddw       mm0,    mm1
-
-        paddw       mm0,    mm6
-        punpckhbw   mm2,    mm7
-
-        punpckhbw   mm3,    mm7
-        paddw       mm2,    mm3
-
-        paddw       mm2,    mm6
-        psraw       mm0,    1
-
-        psraw       mm2,    1
-        packuswb    mm0,    mm2
-
-        movq        mm2,    mm4
-        punpcklbw   mm2,    mm0
-
-        movq        [edi],  mm2
-        punpckhbw   mm4,    mm0
-
-        movq        [edi+8], mm4
-        add         esi,    8
-
-        add         edi,    16
-        sub         ecx,    8
-
-        cmp         ecx,    8
-        jg          hs_1_2_loop
-
-// last eight pixel
-
-        movq        mm0,    [esi]
-        movq        mm1,    mm0
-
-        movq        mm2,    mm0
-        movq        mm3,    mm1
-
-        psrlq       mm1,    8
-        psrlq       mm3,    56
-
-        psllq       mm3,    56
-        por         mm1,    mm3
-
-        movq        mm3,    mm1
-        movq        mm4,    mm0
-
-        punpcklbw   mm0,    mm7
-        punpcklbw   mm1,    mm7
-
-        paddw       mm0,    mm1
-        paddw       mm0,    mm6
-
-        punpckhbw   mm2,    mm7
-        punpckhbw   mm3,    mm7
-
-        paddw       mm2,    mm3
-        paddw       mm2,    mm6
-
-        psraw       mm0,    1
-        psraw       mm2,    1
-
-        packuswb    mm0,    mm2
-        movq        mm2,    mm4
-
-        punpcklbw   mm2,    mm0
-        movq        [edi],  mm2
-
-        punpckhbw   mm4,    mm0
-        movq        [edi+8], mm4
-    }
-}
-
-
-
-
-
-
-/****************************************************************************
- *
- *  ROUTINE       : horizontal_line_5_4_scale_mmx
- *
- *  INPUTS        : const unsigned char *source : Pointer to source data.
- *                  unsigned int source_width    : Stride of source.
- *                  unsigned char *dest         : Pointer to destination data.
- *                  unsigned int dest_width      : Stride of destination (NOT USED).
- *
- *  OUTPUTS       : None.
- *
- *  RETURNS       : void
- *
- *  FUNCTION      : Copies horizontal line of pixels from source to
- *                  destination scaling up by 4 to 5.
- *
- *  SPECIAL NOTES : None.
- *
- ****************************************************************************/
-static
-void horizontal_line_5_4_scale_mmx
-(
-    const unsigned char *source,
-    unsigned int source_width,
-    unsigned char *dest,
-    unsigned int dest_width
-)
-{
-
-    __declspec(align(16)) const unsigned short const54_2[] = {  0,  64, 128, 192 };
-    __declspec(align(16)) const unsigned short const54_1[] = {256, 192, 128,  64 };
-    __declspec(align(16)) unsigned short round_values[] = { 128, 128, 128, 128 };
-    /*
-    unsigned i;
-    unsigned int a, b, c, d, e;
-    unsigned char *des = dest;
-    const unsigned char *src = source;
-
-    (void) dest_width;
-
-    for ( i=0; i<source_width; i+=5 )
-    {
-        a = src[0];
-        b = src[1];
-        c = src[2];
-        d = src[3];
-        e = src[4];
-
-        des[0] = a;
-        des[1] = ((b*192 + c* 64 + 128)>>8);
-        des[2] = ((c*128 + d*128 + 128)>>8);
-        des[3] = ((d* 64 + e*192 + 128)>>8);
-
-        src += 5;
-        des += 4;
-    }
-    */
-    __asm
-    {
-
-        mov         esi,        source              ;
-        mov         edi,        dest                ;
-
-        mov         ecx,        source_width         ;
-        movq        mm5,        const54_1           ;
-
-        pxor        mm7,        mm7                 ;
-        movq        mm6,        const54_2           ;
-
-        movq        mm4,        round_values         ;
-        lea         edx,        [esi+ecx]           ;
-        horizontal_line_5_4_loop:
-
-        movq        mm0,        QWORD PTR  [esi]    ;
-        00 01 02 03 04 05 06 07
-        movq        mm1,        mm0                 ;
-        00 01 02 03 04 05 06 07
-
-        psrlq       mm0,        8                   ;
-        01 02 03 04 05 06 07 xx
-        punpcklbw   mm1,        mm7                 ;
-        xx 00 xx 01 xx 02 xx 03
-
-        punpcklbw   mm0,        mm7                 ;
-        xx 01 xx 02 xx 03 xx 04
-        pmullw      mm1,        mm5
-
-        pmullw      mm0,        mm6
-        add         esi,        5
-
-        add         edi,        4
-        paddw       mm1,        mm0
-
-        paddw       mm1,        mm4
-        psrlw       mm1,        8
-
-        cmp         esi,        edx
-        packuswb    mm1,        mm7
-
-        movd        DWORD PTR [edi-4], mm1
-
-        jl          horizontal_line_5_4_loop
-
-    }
-
-}
-
-static
-void vertical_band_5_4_scale_mmx(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width)
-{
-
-    __declspec(align(16)) const unsigned short one_fourths[]   = {  64,  64,  64, 64  };
-    __declspec(align(16)) const unsigned short two_fourths[]   = { 128, 128, 128, 128 };
-    __declspec(align(16)) const unsigned short three_fourths[] = { 192, 192, 192, 192 };
-
-    __declspec(align(16)) unsigned short round_values[] = { 128, 128, 128, 128 };
-    __asm
-    {
-        push        ebx
-
-        mov         esi,    source                    // Get the source and destination pointer
-        mov         ecx,    src_pitch               // Get the pitch size
-
-        mov         edi,    dest                    // tow lines below
-        pxor        mm7,    mm7                     // clear out mm7
-
-        mov         edx,    dest_pitch               // Loop counter
-        mov         ebx,    dest_width
-
-        vs_5_4_loop:
-
-        movd        mm0,    DWORD ptr [esi]         // src[0];
-        movd        mm1,    DWORD ptr [esi+ecx]     // src[1];
-
-        movd        mm2,    DWORD ptr [esi+ecx*2]
-        lea         eax,    [esi+ecx*2]             //
-
-        punpcklbw   mm1,    mm7
-        punpcklbw   mm2,    mm7
-
-        movq        mm3,    mm2
-        pmullw      mm1,    three_fourths
-
-        pmullw      mm2,    one_fourths
-        movd        mm4,    [eax+ecx]
-
-        pmullw      mm3,    two_fourths
-        punpcklbw   mm4,    mm7
-
-        movq        mm5,    mm4
-        pmullw      mm4,    two_fourths
-
-        paddw       mm1,    mm2
-        movd        mm6,    [eax+ecx*2]
-
-        pmullw      mm5,    one_fourths
-        paddw       mm1,    round_values;
-
-        paddw       mm3,    mm4
-        psrlw       mm1,    8
-
-        punpcklbw   mm6,    mm7
-        paddw       mm3,    round_values
-
-        pmullw      mm6,    three_fourths
-        psrlw       mm3,    8
-
-        packuswb    mm1,    mm7
-        packuswb    mm3,    mm7
-
-        movd        DWORD PTR [edi], mm0
-        movd        DWORD PTR [edi+edx], mm1
-
-
-        paddw       mm5,    mm6
-        movd        DWORD PTR [edi+edx*2], mm3
-
-        lea         eax,    [edi+edx*2]
-        paddw       mm5,    round_values
-
-        psrlw       mm5,    8
-        add         edi,    4
-
-        packuswb    mm5,    mm7
-        movd        DWORD PTR [eax+edx], mm5
-
-        add         esi,    4
-        sub         ebx,    4
-
-        jg         vs_5_4_loop
-
-        pop         ebx
-    }
-}
-
-
-
-static
-void horizontal_line_5_3_scale_mmx
-(
-    const unsigned char *source,
-    unsigned int source_width,
-    unsigned char *dest,
-    unsigned int dest_width
-)
-{
-    __declspec(align(16)) const unsigned short const53_1[] = {  0,  85, 171, 0 };
-    __declspec(align(16)) const unsigned short const53_2[] = {256, 171,  85, 0 };
-    __declspec(align(16)) unsigned short round_values[] = { 128, 128, 128, 128 };
-    __asm
-    {
-
-        mov         esi,        source              ;
-        mov         edi,        dest                ;
-
-        mov         ecx,        source_width         ;
-        movq        mm5,        const53_1           ;
-
-        pxor        mm7,        mm7                 ;
-        movq        mm6,        const53_2           ;
-
-        movq        mm4,        round_values         ;
-        lea         edx,        [esi+ecx-5]         ;
-        horizontal_line_5_3_loop:
-
-        movq        mm0,        QWORD PTR  [esi]    ;
-        00 01 02 03 04 05 06 07
-        movq        mm1,        mm0                 ;
-        00 01 02 03 04 05 06 07
-
-        psllw       mm0,        8                   ;
-        xx 00 xx 02 xx 04 xx 06
-        psrlw       mm1,        8                   ;
-        01 xx 03 xx 05 xx 07 xx
-
-        psrlw       mm0,        8                   ;
-        00 xx 02 xx 04 xx 06 xx
-        psllq       mm1,        16                  ;
-        xx xx 01 xx 03 xx 05 xx
-
-        pmullw      mm0,        mm6
-
-        pmullw      mm1,        mm5
-        add         esi,        5
-
-        add         edi,        3
-        paddw       mm1,        mm0
-
-        paddw       mm1,        mm4
-        psrlw       mm1,        8
-
-        cmp         esi,        edx
-        packuswb    mm1,        mm7
-
-        movd        DWORD PTR [edi-3], mm1
-        jl          horizontal_line_5_3_loop
-
-//exit condition
-        movq        mm0,        QWORD PTR  [esi]    ;
-        00 01 02 03 04 05 06 07
-        movq        mm1,        mm0                 ;
-        00 01 02 03 04 05 06 07
-
-        psllw       mm0,        8                   ;
-        xx 00 xx 02 xx 04 xx 06
-        psrlw       mm1,        8                   ;
-        01 xx 03 xx 05 xx 07 xx
-
-        psrlw       mm0,        8                   ;
-        00 xx 02 xx 04 xx 06 xx
-        psllq       mm1,        16                  ;
-        xx xx 01 xx 03 xx 05 xx
-
-        pmullw      mm0,        mm6
-
-        pmullw      mm1,        mm5
-        paddw       mm1,        mm0
-
-        paddw       mm1,        mm4
-        psrlw       mm1,        8
-
-        packuswb    mm1,        mm7
-        movd        eax,        mm1
-
-        mov         edx,        eax
-        shr         edx,        16
-
-        mov         WORD PTR[edi],   ax
-        mov         BYTE PTR[edi+2], dl
-
-    }
-
-}
-
-
-static
-void vertical_band_5_3_scale_mmx(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width)
-{
-    __declspec(align(16)) unsigned short round_values[] = { 128, 128, 128, 128 };
-    __declspec(align(16)) const unsigned short one_thirds[] = {  85,  85,  85,  85 };
-    __declspec(align(16)) const unsigned short two_thirds[] = { 171, 171, 171, 171 };
-
-    __asm
-    {
-        push        ebx
-
-        mov         esi,    source                    // Get the source and destination pointer
-        mov         ecx,    src_pitch               // Get the pitch size
-
-        mov         edi,    dest                    // tow lines below
-        pxor        mm7,    mm7                     // clear out mm7
-
-        mov         edx,    dest_pitch               // Loop counter
-        movq        mm5,    one_thirds
-
-        movq        mm6,    two_thirds
-        mov         ebx,    dest_width;
-
-        vs_5_3_loop:
-
-        movd        mm0,    DWORD ptr [esi]         // src[0];
-        movd        mm1,    DWORD ptr [esi+ecx]     // src[1];
-
-        movd        mm2,    DWORD ptr [esi+ecx*2]
-        lea         eax,    [esi+ecx*2]             //
-
-        punpcklbw   mm1,    mm7
-        punpcklbw   mm2,    mm7
-
-        pmullw      mm1,    mm5
-        pmullw      mm2,    mm6
-
-        movd        mm3,    DWORD ptr [eax+ecx]
-        movd        mm4,    DWORD ptr [eax+ecx*2]
-
-        punpcklbw   mm3,    mm7
-        punpcklbw   mm4,    mm7
-
-        pmullw      mm3,    mm6
-        pmullw      mm4,    mm5
-
-
-        movd        DWORD PTR [edi], mm0
-        paddw       mm1,    mm2
-
-        paddw       mm1,    round_values
-        psrlw       mm1,    8
-
-        packuswb    mm1,    mm7
-        paddw       mm3,    mm4
-
-        paddw       mm3,    round_values
-        movd        DWORD PTR [edi+edx], mm1
-
-        psrlw       mm3,    8
-        packuswb    mm3,    mm7
-
-        movd        DWORD PTR [edi+edx*2], mm3
-
-
-        add         edi,    4
-        add         esi,    4
-
-        sub         ebx,    4
-        jg          vs_5_3_loop
-
-        pop         ebx
-    }
-}
-
-
-
-
-/****************************************************************************
- *
- *  ROUTINE       : horizontal_line_2_1_scale
- *
- *  INPUTS        : const unsigned char *source :
- *                  unsigned int source_width    :
- *                  unsigned char *dest         :
- *                  unsigned int dest_width      :
- *
- *  OUTPUTS       : None.
- *
- *  RETURNS       : void
- *
- *  FUNCTION      : 1 to 2 up-scaling of a horizontal line of pixels.
- *
- *  SPECIAL NOTES : None.
- *
- ****************************************************************************/
-static
-void horizontal_line_2_1_scale_mmx
-(
-    const unsigned char *source,
-    unsigned int source_width,
-    unsigned char *dest,
-    unsigned int dest_width
-)
-{
-    (void) dest_width;
-
-    __asm
-    {
-        mov         esi,    source
-        mov         edi,    dest
-
-        pxor        mm7,    mm7
-        mov         ecx,    dest_width
-
-        xor         edx,    edx
-        hs_2_1_loop:
-
-        movq        mm0,    [esi+edx*2]
-        psllw       mm0,    8
-
-        psrlw       mm0,    8
-        packuswb    mm0,    mm7
-
-        movd        DWORD Ptr [edi+edx], mm0;
-        add         edx,    4
-
-        cmp         edx,    ecx
-        jl          hs_2_1_loop
-
-    }
-}
-
-
-
-static
-void vertical_band_2_1_scale_mmx(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width)
-{
-    vpx_memcpy(dest, source, dest_width);
-}
-
-
-
-static
-void vertical_band_2_1_scale_i_mmx(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width)
-{
-
-    __declspec(align(16)) const unsigned short three_sixteenths[] = {  48,  48,  48,  48 };
-    __declspec(align(16)) const unsigned short ten_sixteenths[]   = { 160, 160, 160, 160 };
-    __declspec(align(16)) unsigned short round_values[] = { 128, 128, 128, 128 };
-    __asm
-    {
-        mov         esi,        source
-        mov         edi,        dest
-
-        mov         eax,        src_pitch
-        mov         edx,        dest_width
-
-        pxor        mm7,        mm7
-        sub         esi,        eax             //back one line
-
-
-        lea         ecx,        [esi+edx];
-        movq        mm6,        round_values;
-
-        movq        mm5,        three_sixteenths;
-        movq        mm4,        ten_sixteenths;
-
-        vs_2_1_i_loop:
-        movd        mm0,        [esi]           //
-        movd        mm1,        [esi+eax]       //
-
-        movd        mm2,        [esi+eax*2]     //
-        punpcklbw   mm0,        mm7
-
-        pmullw      mm0,        mm5
-        punpcklbw   mm1,        mm7
-
-        pmullw      mm1,        mm4
-        punpcklbw   mm2,        mm7
-
-        pmullw      mm2,        mm5
-        paddw       mm0,        round_values
-
-        paddw       mm1,        mm2
-        paddw       mm0,        mm1
-
-        psrlw       mm0,        8
-        packuswb    mm0,        mm7
-
-        movd        DWORD PTR [edi],        mm0
-        add         esi,        4
-
-        add         edi,        4;
-        cmp         esi,        ecx
-        jl          vs_2_1_i_loop
-
-    }
-}
-
-void
-register_mmxscalers(void)
-{
-    vp8_horizontal_line_1_2_scale        = horizontal_line_1_2_scale_mmx;
-    vp8_vertical_band_1_2_scale          = vertical_band_1_2_scale_mmx;
-    vp8_last_vertical_band_1_2_scale      = last_vertical_band_1_2_scale_mmx;
-    vp8_horizontal_line_3_5_scale        = horizontal_line_3_5_scale_mmx;
-    vp8_vertical_band_3_5_scale          = vertical_band_3_5_scale_mmx;
-    vp8_last_vertical_band_3_5_scale      = last_vertical_band_3_5_scale_mmx;
-    vp8_horizontal_line_4_5_scale        = horizontal_line_4_5_scale_mmx;
-    vp8_vertical_band_4_5_scale          = vertical_band_4_5_scale_mmx;
-    vp8_last_vertical_band_4_5_scale      = last_vertical_band_4_5_scale_mmx;
-
-    vp8_horizontal_line_3_4_scale        = vp8cx_horizontal_line_3_4_scale_c;
-    vp8_vertical_band_3_4_scale          = vp8cx_vertical_band_3_4_scale_c;
-    vp8_last_vertical_band_3_4_scale      = vp8cx_last_vertical_band_3_4_scale_c;
-    vp8_horizontal_line_2_3_scale        = vp8cx_horizontal_line_2_3_scale_c;
-    vp8_vertical_band_2_3_scale          = vp8cx_vertical_band_2_3_scale_c;
-    vp8_last_vertical_band_2_3_scale      = vp8cx_last_vertical_band_2_3_scale_c;
-
-
-
-    vp8_vertical_band_5_4_scale          = vertical_band_5_4_scale_mmx;
-    vp8_vertical_band_5_3_scale          = vertical_band_5_3_scale_mmx;
-    vp8_vertical_band_2_1_scale          = vertical_band_2_1_scale_mmx;
-    vp8_vertical_band_2_1_scale_i        = vertical_band_2_1_scale_i_mmx;
-    vp8_horizontal_line_2_1_scale        = horizontal_line_2_1_scale_mmx;
-    vp8_horizontal_line_5_3_scale        = horizontal_line_5_3_scale_mmx;
-    vp8_horizontal_line_5_4_scale        = horizontal_line_5_4_scale_mmx;
-
-}

diff --git a/vpx_scale/leapster/doptsystemdependant_lf.c b/vpx_scale/leapster/doptsystemdependant_lf.c
deleted file mode 100644
index 7cbb1c5..0000000
--- a/vpx_scale/leapster/doptsystemdependant_lf.c
+++ /dev/null

@@ -1,72 +0,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-/****************************************************************************
-*
-*   Module Title :     system_dependant.c
-*
-*   Description  :     Miscellaneous system dependant functions
-*
-****************************************************************************/
-
-/****************************************************************************
-*  Header Files
-****************************************************************************/
-#include "vpx_scale/vpxscale.h"
-
-/****************************************************************************
-*  Imports
-*****************************************************************************/
-extern int register_generic_scalers(void);
-extern int de_register_generic_scalers(void);
-
-/****************************************************************************
- *
- *  ROUTINE       : vp8_scale_machine_specific_config
- *
- *  INPUTS        : UINT32 Version : Codec version number.
- *
- *  OUTPUTS       : None.
- *
- *  RETURNS       : int
- *
- *  FUNCTION      : Checks for machine specifc features such as MMX support
- *                  sets appropriate flags and function pointers.
- *
- *  SPECIAL NOTES : None.
- *
- ****************************************************************************/
-int
-vp8_scale_machine_specific_config()
-{
-    return register_generic_scalers();
-}
-
-/****************************************************************************
- *
- *  ROUTINE       : vp8_scale_machine_specific_config
- *
- *  INPUTS        : UINT32 Version : Codec version number.
- *
- *  OUTPUTS       : None.
- *
- *  RETURNS       : int
- *
- *  FUNCTION      : Resets the funtion pointers and deallocates memory.
- *
- *  SPECIAL NOTES : None.
- *
- ****************************************************************************/
-int
-scale_machine_specific_de_config()
-{
-    return de_register_generic_scalers();
-}

diff --git a/vpx_scale/leapster/gen_scalers_lf.c b/vpx_scale/leapster/gen_scalers_lf.c
deleted file mode 100644
index f1ee056..0000000
--- a/vpx_scale/leapster/gen_scalers_lf.c
+++ /dev/null

@@ -1,522 +0,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-/****************************************************************************
- *
- *   Module Title :     gen_scalers.c
- *
- *   Description  :     Generic image scaling functions.
- *
- ***************************************************************************/
-
-/****************************************************************************
-*  Header Files
-****************************************************************************/
-#include "vpx_scale/vpxscale.h"
-
-/****************************************************************************
-*  Imports
-****************************************************************************/
-
-/****************************************************************************
- *
- *  ROUTINE       : vp8cx_horizontal_line_4_5_scale_c
- *
- *  INPUTS        : const unsigned char *source : Pointer to source data.
- *                  unsigned int source_width    : Stride of source.
- *                  unsigned char *dest         : Pointer to destination data.
- *                  unsigned int dest_width      : Stride of destination (NOT USED).
- *
- *  OUTPUTS       : None.
- *
- *  RETURNS       : void
- *
- *  FUNCTION      : Copies horizontal line of pixels from source to
- *                  destination scaling up by 4 to 5.
- *
- *  SPECIAL NOTES : None.
- *
- ****************************************************************************/
-static
-void vp8cx_horizontal_line_4_5_scale_c
-(
-    const unsigned char *source,
-    unsigned int source_width,
-    unsigned char *dest,
-    unsigned int dest_width
-)
-{
-    unsigned i;
-    unsigned int a, b, c;
-    unsigned char *des = dest;
-    const unsigned char *src = source;
-
-    (void) dest_width;
-
-    for (i = 0; i < source_width - 4; i += 4)
-    {
-        a = src[0];
-        b = src[1];
-        des [0] = (unsigned char) a;
-        des [1] = (unsigned char)((a * 51 + 205 * b + 128) >> 8);
-        c = src[2] * 154;
-        a = src[3];
-        des [2] = (unsigned char)((b * 102 + c + 128) >> 8);
-        des [3] = (unsigned char)((c + 102 * a + 128) >> 8);
-        b = src[4];
-        des [4] = (unsigned char)((a * 205 + 51 * b + 128) >> 8);
-
-        src += 4;
-        des += 5;
-    }
-
-    a = src[0];
-    b = src[1];
-    des [0] = (unsigned char)(a);
-    des [1] = (unsigned char)((a * 51 + 205 * b + 128) >> 8);
-    c = src[2] * 154;
-    a = src[3];
-    des [2] = (unsigned char)((b * 102 + c + 128) >> 8);
-    des [3] = (unsigned char)((c + 102 * a + 128) >> 8);
-    des [4] = (unsigned char)(a);
-
-}
-
-/****************************************************************************
- *
- *  ROUTINE       : vp8cx_vertical_band_4_5_scale_c
- *
- *  INPUTS        : unsigned char *dest    : Pointer to destination data.
- *                  unsigned int dest_pitch : Stride of destination data.
- *                  unsigned int dest_width : Width of destination data.
- *
- *  OUTPUTS       : None.
- *
- *  RETURNS       : void
- *
- *  FUNCTION      : Scales vertical band of pixels by scale 4 to 5. The
- *                  height of the band scaled is 4-pixels.
- *
- *  SPECIAL NOTES : The routine uses the first line of the band below
- *                  the current band.
- *
- ****************************************************************************/
-static
-void vp8cx_vertical_band_4_5_scale_c(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width)
-{
-    unsigned int i;
-    unsigned int a, b, c, d;
-    unsigned char *des = dest;
-
-    for (i = 0; i < dest_width; i++)
-    {
-        a = des [0];
-        b = des [dest_pitch];
-
-        des[dest_pitch] = (unsigned char)((a * 51 + 205 * b + 128) >> 8);
-
-        c = des[dest_pitch*2] * 154;
-        d = des[dest_pitch*3];
-
-        des [dest_pitch*2] = (unsigned char)((b * 102 + c + 128) >> 8);
-        des [dest_pitch*3] = (unsigned char)((c + 102 * d + 128) >> 8);
-
-        // First line in next band
-        a = des [dest_pitch * 5];
-        des [dest_pitch * 4] = (unsigned char)((d * 205 + 51 * a + 128) >> 8);
-
-        des ++;
-    }
-}
-
-/****************************************************************************
- *
- *  ROUTINE       : vp8cx_last_vertical_band_4_5_scale_c
- *
- *  INPUTS        : unsigned char *dest    : Pointer to destination data.
- *                  unsigned int dest_pitch : Stride of destination data.
- *                  unsigned int dest_width : Width of destination data.
- *
- *  OUTPUTS       : None.
- *
- *  RETURNS       : void
- *
- *  FUNCTION      : Scales last vertical band of pixels by scale 4 to 5. The
- *                  height of the band scaled is 4-pixels.
- *
- *  SPECIAL NOTES : The routine does not have available the first line of
- *                  the band below the current band, since this is the
- *                  last band.
- *
- ****************************************************************************/
-static
-void vp8cx_last_vertical_band_4_5_scale_c(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width)
-{
-    unsigned int i;
-    unsigned int a, b, c, d;
-    unsigned char *des = dest;
-
-    for (i = 0; i < dest_width; ++i)
-    {
-        a = des[0];
-        b = des[dest_pitch];
-
-        des[dest_pitch] = (unsigned char)((a * 51 + 205 * b + 128) >> 8);
-
-        c = des[dest_pitch*2] * 154;
-        d = des[dest_pitch*3];
-
-        des [dest_pitch*2] = (unsigned char)((b * 102 + c + 128) >> 8);
-        des [dest_pitch*3] = (unsigned char)((c + 102 * d + 128) >> 8);
-
-        // No other line for interplation of this line, so ..
-        des[dest_pitch*4] = (unsigned char) d;
-
-        des++;
-    }
-}
-
-/****************************************************************************
- *
- *  ROUTINE       : vp8cx_horizontal_line_3_5_scale_c
- *
- *  INPUTS        : const unsigned char *source : Pointer to source data.
- *                  unsigned int source_width    : Stride of source.
- *                  unsigned char *dest         : Pointer to destination data.
- *                  unsigned int dest_width      : Stride of destination (NOT USED).
- *
- *  OUTPUTS       : None.
- *
- *  RETURNS       : void
- *
- *  FUNCTION      : Copies horizontal line of pixels from source to
- *                  destination scaling up by 3 to 5.
- *
- *  SPECIAL NOTES : None.
- *
- *
- ****************************************************************************/
-static
-void vp8cx_horizontal_line_3_5_scale_c
-(
-    const unsigned char *source,
-    unsigned int source_width,
-    unsigned char *dest,
-    unsigned int dest_width
-)
-{
-    unsigned int i;
-    unsigned int a, b, c;
-    unsigned char *des = dest;
-    const unsigned char *src = source;
-
-    (void) dest_width;
-
-    for (i = 0; i < source_width - 3; i += 3)
-    {
-        a = src[0];
-        b = src[1];
-        des [0] = (unsigned char)(a);
-        des [1] = (unsigned char)((a * 102 + 154 * b + 128) >> 8);
-
-        c = src[2] ;
-        des [2] = (unsigned char)((b * 205 + c * 51 + 128) >> 8);
-        des [3] = (unsigned char)((b * 51 + c * 205 + 128) >> 8);
-
-        a = src[3];
-        des [4] = (unsigned char)((c * 154 + a * 102 + 128) >> 8);
-
-        src += 3;
-        des += 5;
-    }
-
-    a = src[0];
-    b = src[1];
-    des [0] = (unsigned char)(a);
-
-    des [1] = (unsigned char)((a * 102 + 154 * b + 128) >> 8);
-    c = src[2] ;
-    des [2] = (unsigned char)((b * 205 + c * 51 + 128) >> 8);
-    des [3] = (unsigned char)((b * 51 + c * 205 + 128) >> 8);
-
-    des [4] = (unsigned char)(c);
-}
-
-/****************************************************************************
- *
- *  ROUTINE       : vp8cx_vertical_band_3_5_scale_c
- *
- *  INPUTS        : unsigned char *dest    : Pointer to destination data.
- *                  unsigned int dest_pitch : Stride of destination data.
- *                  unsigned int dest_width : Width of destination data.
- *
- *  OUTPUTS       : None.
- *
- *  RETURNS       : void
- *
- *  FUNCTION      : Scales vertical band of pixels by scale 3 to 5. The
- *                  height of the band scaled is 3-pixels.
- *
- *  SPECIAL NOTES : The routine uses the first line of the band below
- *                  the current band.
- *
- ****************************************************************************/
-static
-void vp8cx_vertical_band_3_5_scale_c(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width)
-{
-    unsigned int i;
-    unsigned int a, b, c;
-    unsigned char *des = dest;
-
-    for (i = 0; i < dest_width; i++)
-    {
-        a = des [0];
-        b = des [dest_pitch];
-        des [dest_pitch] = (unsigned char)((a * 102 + 154 * b + 128) >> 8);
-
-        c = des[dest_pitch*2];
-        des [dest_pitch*2] = (unsigned char)((b * 205 + c * 51 + 128) >> 8);
-        des [dest_pitch*3] = (unsigned char)((b * 51 + c * 205 + 128) >> 8);
-
-        // First line in next band...
-        a = des [dest_pitch * 5];
-        des [dest_pitch * 4] = (unsigned char)((c * 154 + a * 102 + 128) >> 8);
-
-        des++;
-    }
-}
-
-/****************************************************************************
- *
- *  ROUTINE       : vp8cx_last_vertical_band_3_5_scale_c
- *
- *  INPUTS        : unsigned char *dest    : Pointer to destination data.
- *                  unsigned int dest_pitch : Stride of destination data.
- *                  unsigned int dest_width : Width of destination data.
- *
- *  OUTPUTS       : None.
- *
- *  RETURNS       : void
- *
- *  FUNCTION      : Scales last vertical band of pixels by scale 3 to 5. The
- *                  height of the band scaled is 3-pixels.
- *
- *  SPECIAL NOTES : The routine does not have available the first line of
- *                  the band below the current band, since this is the
- *                  last band.
- *
- ****************************************************************************/
-static
-void vp8cx_last_vertical_band_3_5_scale_c(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width)
-{
-    unsigned int i;
-    unsigned int a, b, c;
-    unsigned char *des = dest;
-
-    for (i = 0; i < dest_width; ++i)
-    {
-        a = des [0];
-        b = des [dest_pitch];
-
-        des [ dest_pitch ] = (unsigned char)((a * 102 + 154 * b + 128) >> 8);
-
-        c = des[dest_pitch*2];
-        des [dest_pitch*2] = (unsigned char)((b * 205 + c * 51 + 128) >> 8);
-        des [dest_pitch*3] = (unsigned char)((b * 51 + c * 205 + 128) >> 8);
-
-        // No other line for interplation of this line, so ..
-        des [ dest_pitch * 4 ] = (unsigned char)(c) ;
-
-        des++;
-    }
-}
-
-/****************************************************************************
- *
- *  ROUTINE       : vp8cx_horizontal_line_1_2_scale_c
- *
- *  INPUTS        : const unsigned char *source : Pointer to source data.
- *                  unsigned int source_width    : Stride of source.
- *                  unsigned char *dest         : Pointer to destination data.
- *                  unsigned int dest_width      : Stride of destination (NOT USED).
- *
- *  OUTPUTS       : None.
- *
- *  RETURNS       : void
- *
- *  FUNCTION      : Copies horizontal line of pixels from source to
- *                  destination scaling up by 1 to 2.
- *
- *  SPECIAL NOTES : None.
- *
- ****************************************************************************/
-static
-void vp8cx_horizontal_line_1_2_scale_c
-(
-    const unsigned char *source,
-    unsigned int source_width,
-    unsigned char *dest,
-    unsigned int dest_width
-)
-{
-    unsigned int i;
-    unsigned int a, b;
-    unsigned char *des = dest;
-    const unsigned char *src = source;
-
-    (void) dest_width;
-
-    for (i = 0; i < source_width - 1; i += 1)
-    {
-        a = src[0];
-        b = src[1];
-        des [0] = (unsigned char)(a);
-        des [1] = (unsigned char)((a + b + 1) >> 1);
-        src += 1;
-        des += 2;
-    }
-
-    a = src[0];
-    des [0] = (unsigned char)(a);
-    des [1] = (unsigned char)(a);
-}
-
-/****************************************************************************
- *
- *  ROUTINE       : vp8cx_vertical_band_1_2_scale_c
- *
- *  INPUTS        : unsigned char *dest    : Pointer to destination data.
- *                  unsigned int dest_pitch : Stride of destination data.
- *                  unsigned int dest_width : Width of destination data.
- *
- *  OUTPUTS       : None.
- *
- *  RETURNS       : void
- *
- *  FUNCTION      : Scales vertical band of pixels by scale 1 to 2. The
- *                  height of the band scaled is 1-pixel.
- *
- *  SPECIAL NOTES : The routine uses the first line of the band below
- *                  the current band.
- *
- ****************************************************************************/
-static
-void vp8cx_vertical_band_1_2_scale_c(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width)
-{
-    unsigned int i;
-    unsigned int a, b;
-    unsigned char *des = dest;
-
-    for (i = 0; i < dest_width; i++)
-    {
-        a = des [0];
-        b = des [dest_pitch * 2];
-
-        des[dest_pitch] = (unsigned char)((a + b + 1) >> 1);
-
-        des++;
-    }
-}
-
-/****************************************************************************
- *
- *  ROUTINE       : vp8cx_last_vertical_band_1_2_scale_c
- *
- *  INPUTS        : unsigned char *dest    : Pointer to destination data.
- *                  unsigned int dest_pitch : Stride of destination data.
- *                  unsigned int dest_width : Width of destination data.
- *
- *  OUTPUTS       : None.
- *
- *  RETURNS       : void
- *
- *  FUNCTION      : Scales last vertical band of pixels by scale 1 to 2. The
- *                  height of the band scaled is 1-pixel.
- *
- *  SPECIAL NOTES : The routine does not have available the first line of
- *                  the band below the current band, since this is the
- *                  last band.
- *
- ****************************************************************************/
-static
-void vp8cx_last_vertical_band_1_2_scale_c(unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width)
-{
-    unsigned int i;
-    unsigned char *des = dest;
-
-    for (i = 0; i < dest_width; ++i)
-    {
-        des[dest_pitch] = des[0];
-        des++;
-    }
-}
-
-#include "vpx_scale/vpxscale.h"
-#include "vpx_mem/vpx_mem.h"
-
-struct vpxglobal_scalling_ptrs_t *g_scaling_ptrs = 0;
-
-int
-register_generic_scalers(void)
-{
-    int rv = 0;
-
-    g_scaling_ptrs = (struct vpxglobal_scalling_ptrs_t *)vpx_malloc(sizeof(struct vpxglobal_scalling_ptrs_t));
-
-    if (g_scaling_ptrs)
-    {
-        g_scaling_ptrs->vpxhorizontal_line_1_2_scale_t        = vp8cx_horizontal_line_1_2_scale_c;
-        g_scaling_ptrs->vpxvertical_band_1_2_scale_t          = vp8cx_vertical_band_1_2_scale_c;
-        g_scaling_ptrs->vpxlast_vertical_band_1_2_scale_t      = vp8cx_last_vertical_band_1_2_scale_c;
-        g_scaling_ptrs->vpxhorizontal_line_3_5_scale_t        = vp8cx_horizontal_line_3_5_scale_c;
-        g_scaling_ptrs->vpxvertical_band_3_5_scale_t          = vp8cx_vertical_band_3_5_scale_c;
-        g_scaling_ptrs->vpxlast_vertical_band_3_5_scale_t      = vp8cx_last_vertical_band_3_5_scale_c;
-        g_scaling_ptrs->vpxhorizontal_line_4_5_scale_t        = vp8cx_horizontal_line_4_5_scale_c;
-        g_scaling_ptrs->vpxvertical_band_4_5_scale_t          = vp8cx_vertical_band_4_5_scale_c;
-        g_scaling_ptrs->vpxlast_vertical_band_4_5_scale_t      = vp8cx_last_vertical_band_4_5_scale_c;
-    }
-    else
-    {
-        rv = -1;
-    }
-
-    /*
-    vp8_horizontal_line_1_2_scale        = vp8cx_horizontal_line_1_2_scale_c;
-    vp8_vertical_band_1_2_scale          = vp8cx_vertical_band_1_2_scale_c;
-    vp8_last_vertical_band_1_2_scale      = vp8cx_last_vertical_band_1_2_scale_c;
-    vp8_horizontal_line_3_5_scale        = vp8cx_horizontal_line_3_5_scale_c;
-    vp8_vertical_band_3_5_scale          = vp8cx_vertical_band_3_5_scale_c;
-    vp8_last_vertical_band_3_5_scale      = vp8cx_last_vertical_band_3_5_scale_c;
-    vp8_horizontal_line_4_5_scale        = vp8cx_horizontal_line_4_5_scale_c;
-    vp8_vertical_band_4_5_scale          = vp8cx_vertical_band_4_5_scale_c;
-    vp8_last_vertical_band_4_5_scale      = vp8cx_last_vertical_band_4_5_scale_c;
-    */
-
-    return rv;
-}
-
-int
-de_register_generic_scalers(void)
-{
-    int rv = 0;
-
-    if (g_scaling_ptrs)
-    {
-        vpx_free(g_scaling_ptrs);
-        g_scaling_ptrs = 0;
-    }
-    else
-    {
-        rv = -1;
-    }
-
-    return rv;
-}

diff --git a/vpx_scale/leapster/vpxscale_lf.c b/vpx_scale/leapster/vpxscale_lf.c
deleted file mode 100644
index 616ddf5..0000000
--- a/vpx_scale/leapster/vpxscale_lf.c
+++ /dev/null

@@ -1,891 +0,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-/****************************************************************************
- *
- *   Module Title :     scale.c
- *
- *   Description  :     Image scaling functions.
- *
- ***************************************************************************/
-
-/****************************************************************************
-*  Header Files
-****************************************************************************/
-#include "stdlib.h"
-#include "vpx_scale/vpxscale.h"
-#include "vpx_mem/vpx_mem.h"
-#include "vpx_scale/yv12config.h"
-#include "codec_common_interface.h"
-
-/****************************************************************************
-*  Exports
-****************************************************************************/
-/*
-void  (*vp8_vertical_band_4_5_scale)(unsigned char * dest,unsigned int dest_pitch,unsigned int dest_width);
-void  (*vp8_last_vertical_band_4_5_scale)(unsigned char * dest,unsigned int dest_pitch,unsigned int dest_width);
-void  (*vp8_vertical_band_3_5_scale)(unsigned char * dest,unsigned int dest_pitch,unsigned int dest_width);
-void  (*vp8_last_vertical_band_3_5_scale)(unsigned char * dest,unsigned int dest_pitch,unsigned int dest_width);
-void  (*vp8_horizontal_line_1_2_scale)(const unsigned char * source,unsigned int source_width,unsigned char * dest,unsigned int dest_width);
-void  (*vp8_horizontal_line_3_5_scale)(const unsigned char * source,unsigned int source_width,unsigned char * dest,unsigned int dest_width);
-void  (*vp8_horizontal_line_4_5_scale)(const unsigned char * source,unsigned int source_width,unsigned char * dest,unsigned int dest_width);
-void  (*vp8_vertical_band_1_2_scale)(unsigned char * dest,unsigned int dest_pitch,unsigned int dest_width);
-void  (*vp8_last_vertical_band_1_2_scale)(unsigned char * dest,unsigned int dest_pitch,unsigned int dest_width);
-*/
-
-
-typedef struct
-{
-    int     expanded_frame_width;
-    int     expanded_frame_height;
-
-    int HScale;
-    int HRatio;
-    int VScale;
-    int VRatio;
-
-    YV12_BUFFER_CONFIG *src_yuv_config;
-    YV12_BUFFER_CONFIG *dst_yuv_config;
-
-} SCALE_VARS;
-
-
-/****************************************************************************
- *
- *  ROUTINE       :     horizontal_line_copy
- *
- *  INPUTS        :     None
- *
- *
- *  OUTPUTS       :     None.
- *
- *  RETURNS       :     None
- *
- *  FUNCTION      :     1 to 1 scaling up for a horizontal line of pixles
- *
- *  SPECIAL NOTES :     None.
- *
- *  ERRORS        :     None.
- *
- ****************************************************************************/
-static
-void horizontal_line_copy(
-    const unsigned char *source,
-    unsigned int source_width,
-    unsigned char *dest,
-    unsigned int dest_width
-)
-{
-    duck_memcpy(dest, source, source_width);
-}
-/****************************************************************************
- *
- *  ROUTINE       :     null_scale
- *
- *  INPUTS        :     None
- *
- *
- *  OUTPUTS       :     None.
- *
- *  RETURNS       :     None
- *
- *  FUNCTION      :     1 to 1 scaling up for a vertical band
- *
- *  SPECIAL NOTES :     None.
- *
- *  ERRORS        :     None.
- *
- ****************************************************************************/
-static
-void null_scale(
-    unsigned char *dest,
-    unsigned int dest_pitch,
-    unsigned int dest_width
-)
-{
-    return;
-}
-
-/****************************************************************************
- *
- *  ROUTINE       : scale1d_2t1_i
- *
- *  INPUTS        : const unsigned char *source : Pointer to data to be scaled.
- *                  int source_step              : Number of pixels to step on in source.
- *                  unsigned int source_scale    : Scale for source (UNUSED).
- *                  unsigned int source_length   : Length of source (UNUSED).
- *                  unsigned char *dest         : Pointer to output data array.
- *                  int dest_step                : Number of pixels to step on in destination.
- *                  unsigned int dest_scale      : Scale for destination (UNUSED).
- *                  unsigned int dest_length     : Length of destination.
- *
- *  OUTPUTS       : None.
- *
- *  RETURNS       : void
- *
- *  FUNCTION      : Performs 2-to-1 interpolated scaling.
- *
- *  SPECIAL NOTES : None.
- *
- ****************************************************************************/
-static
-void scale1d_2t1_i
-(
-    const unsigned char *source,
-    int source_step,
-    unsigned int source_scale,
-    unsigned int source_length,
-    unsigned char *dest,
-    int dest_step,
-    unsigned int dest_scale,
-    unsigned int dest_length
-)
-{
-    unsigned int i, j;
-    unsigned int temp;
-
-    (void) source_length;
-    (void) source_scale;
-    (void) dest_scale;
-
-    source_step *= 2;
-    dest[0] = source[0];
-
-    for (i = dest_step, j = source_step; i < dest_length * dest_step; i += dest_step, j += source_step)
-    {
-        temp = 8;
-        temp += 3 * source[j-source_step];
-        temp += 10 * source[j];
-        temp += 3 * source[j+source_step];
-        temp >>= 4;
-        dest[i] = (char)(temp);
-    }
-}
-
-/****************************************************************************
- *
- *  ROUTINE       : scale1d_2t1_ps
- *
- *  INPUTS        : const unsigned char *source : Pointer to data to be scaled.
- *                  int source_step              : Number of pixels to step on in source.
- *                  unsigned int source_scale    : Scale for source (UNUSED).
- *                  unsigned int source_length   : Length of source (UNUSED).
- *                  unsigned char *dest         : Pointer to output data array.
- *                  int dest_step                : Number of pixels to step on in destination.
- *                  unsigned int dest_scale      : Scale for destination (UNUSED).
- *                  unsigned int dest_length     : Length of destination.
- *
- *  OUTPUTS       : None.
- *
- *  RETURNS       : void
- *
- *  FUNCTION      : Performs 2-to-1 point subsampled scaling.
- *
- *  SPECIAL NOTES : None.
- *
- ****************************************************************************/
-static
-void scale1d_2t1_ps
-(
-    const unsigned char *source,
-    int source_step,
-    unsigned int source_scale,
-    unsigned int source_length,
-    unsigned char *dest,
-    int dest_step,
-    unsigned int dest_scale,
-    unsigned int dest_length
-)
-{
-    unsigned int i, j;
-
-    (void) source_length;
-    (void) source_scale;
-    (void) dest_scale;
-
-    source_step *= 2;
-    j = 0;
-
-    for (i = 0; i < dest_length * dest_step; i += dest_step, j += source_step)
-        dest[i] = source[j];
-}
-/****************************************************************************
- *
- *  ROUTINE       : scale1d_c
- *
- *  INPUTS        : const unsigned char *source : Pointer to data to be scaled.
- *                  int source_step              : Number of pixels to step on in source.
- *                  unsigned int source_scale    : Scale for source.
- *                  unsigned int source_length   : Length of source (UNUSED).
- *                  unsigned char *dest         : Pointer to output data array.
- *                  int dest_step                : Number of pixels to step on in destination.
- *                  unsigned int dest_scale      : Scale for destination.
- *                  unsigned int dest_length     : Length of destination.
- *
- *  OUTPUTS       : None.
- *
- *  RETURNS       : void
- *
- *  FUNCTION      : Performs linear interpolation in one dimension.
- *
- *  SPECIAL NOTES : None.
- *
- ****************************************************************************/
-static
-void scale1d_c
-(
-    const unsigned char *source,
-    int source_step,
-    unsigned int source_scale,
-    unsigned int source_length,
-    unsigned char *dest,
-    int dest_step,
-    unsigned int dest_scale,
-    unsigned int dest_length
-)
-{
-    unsigned int i;
-    unsigned int round_value = dest_scale / 2;
-    unsigned int left_modifier = dest_scale;
-    unsigned int right_modifier = 0;
-    unsigned char left_pixel = *source;
-    unsigned char right_pixel = *(source + source_step);
-
-    (void) source_length;
-
-    // These asserts are needed if there are boundary issues...
-    //assert ( dest_scale > source_scale );
-    //assert ( (source_length-1) * dest_scale >= (dest_length-1) * source_scale );
-
-    for (i = 0; i < dest_length * dest_step; i += dest_step)
-    {
-        dest[i] = (char)((left_modifier * left_pixel + right_modifier * right_pixel + round_value) / dest_scale);
-
-        right_modifier += source_scale;
-
-        while (right_modifier > dest_scale)
-        {
-            right_modifier -= dest_scale;
-            source += source_step;
-            left_pixel = *source;
-            right_pixel = *(source + source_step);
-        }
-
-        left_modifier = dest_scale - right_modifier;
-    }
-}
-
-/****************************************************************************
- *
- *  ROUTINE       : Scale2D
- *
- *  INPUTS        : const unsigned char *source  : Pointer to data to be scaled.
- *                  int source_pitch              : Stride of source image.
- *                  unsigned int source_width     : Width of input image.
- *                  unsigned int source_height    : Height of input image.
- *                  unsigned char *dest          : Pointer to output data array.
- *                  int dest_pitch                : Stride of destination image.
- *                  unsigned int dest_width       : Width of destination image.
- *                  unsigned int dest_height      : Height of destination image.
- *                  unsigned char *temp_area      : Pointer to temp work area.
- *                  unsigned char temp_area_height : Height of temp work area.
- *                  unsigned int hscale          : Horizontal scale factor numerator.
- *                  unsigned int hratio          : Horizontal scale factor denominator.
- *                  unsigned int vscale          : Vertical scale factor numerator.
- *                  unsigned int vratio          : Vertical scale factor denominator.
- *                  unsigned int interlaced      : Interlace flag.
- *
- *  OUTPUTS       : None.
- *
- *  RETURNS       : void
- *
- *  FUNCTION      : Performs 2-tap linear interpolation in two dimensions.
- *
- *  SPECIAL NOTES : Expansion is performed one band at a time to help with
- *                  caching.
- *
- ****************************************************************************/
-static
-void Scale2D
-(
-    const unsigned char *source,
-    int source_pitch,
-    unsigned int source_width,
-    unsigned int source_height,
-    unsigned char *dest,
-    int dest_pitch,
-    unsigned int dest_width,
-    unsigned int dest_height,
-    unsigned char *temp_area,
-    unsigned char temp_area_height,
-    unsigned int hscale,
-    unsigned int hratio,
-    unsigned int vscale,
-    unsigned int vratio,
-    unsigned int interlaced
-)
-{
-    unsigned int i, j, k;
-    unsigned int bands;
-    unsigned int dest_band_height;
-    unsigned int source_band_height;
-
-    typedef void (*Scale1D)(const unsigned char * source, int source_step, unsigned int source_scale, unsigned int source_length,
-                            unsigned char * dest, int dest_step, unsigned int dest_scale, unsigned int dest_length);
-
-    Scale1D Scale1Dv = scale1d_c;
-    Scale1D Scale1Dh = scale1d_c;
-
-    if (hscale == 2 && hratio == 1)
-        Scale1Dh = scale1d_2t1_ps;
-
-    if (vscale == 2 && vratio == 1)
-    {
-        if (interlaced)
-            Scale1Dv = scale1d_2t1_ps;
-        else
-            Scale1Dv = scale1d_2t1_i;
-    }
-
-    if (source_height == dest_height)
-    {
-        // for each band of the image
-        for (k = 0; k < dest_height; k++)
-        {
-            Scale1Dh(source, 1, hscale, source_width + 1, dest, 1, hratio, dest_width);
-            source += source_pitch;
-            dest   += dest_pitch;
-        }
-
-        return;
-    }
-
-    if (dest_height > source_height)
-    {
-        dest_band_height   = temp_area_height - 1;
-        source_band_height = dest_band_height * source_height / dest_height;
-    }
-    else
-    {
-        source_band_height = temp_area_height - 1;
-        dest_band_height   = source_band_height * vratio / vscale;
-    }
-
-    // first row needs to be done so that we can stay one row ahead for vertical zoom
-    Scale1Dh(source, 1, hscale, source_width + 1, temp_area, 1, hratio, dest_width);
-
-    // for each band of the image
-    bands = (dest_height + dest_band_height - 1) / dest_band_height;
-
-    for (k = 0; k < bands; k++)
-    {
-        // scale one band horizontally
-        for (i = 1; i < source_band_height + 1; i++)
-        {
-            if (k * source_band_height + i < source_height)
-            {
-                Scale1Dh(source + i * source_pitch, 1, hscale, source_width + 1,
-                         temp_area + i * dest_pitch, 1, hratio, dest_width);
-            }
-            else  //  Duplicate the last row
-            {
-                // copy temp_area row 0 over from last row in the past
-                duck_memcpy(temp_area + i * dest_pitch, temp_area + (i - 1)*dest_pitch, dest_pitch);
-            }
-        }
-
-        // scale one band vertically
-        for (j = 0; j < dest_width; j++)
-        {
-            Scale1Dv(&temp_area[j], dest_pitch, vscale, source_band_height + 1,
-                     &dest[j], dest_pitch, vratio, dest_band_height);
-        }
-
-        // copy temp_area row 0 over from last row in the past
-        duck_memcpy(temp_area, temp_area + source_band_height * dest_pitch, dest_pitch);
-
-        // move to the next band
-        source += source_band_height * source_pitch;
-        dest   += dest_band_height * dest_pitch;
-    }
-}
-
-/****************************************************************************
- *
- *  ROUTINE       : vp8_scale_frame
- *
- *  INPUTS        : YV12_BUFFER_CONFIG *src       : Pointer to frame to be scaled.
- *                  YV12_BUFFER_CONFIG *dst       : Pointer to buffer to hold scaled frame.
- *                  unsigned char *temp_area      : Pointer to temp work area.
- *                  unsigned char temp_area_height : Height of temp work area.
- *                  unsigned int hscale          : Horizontal scale factor numerator.
- *                  unsigned int hratio          : Horizontal scale factor denominator.
- *                  unsigned int vscale          : Vertical scale factor numerator.
- *                  unsigned int vratio          : Vertical scale factor denominator.
- *                  unsigned int interlaced      : Interlace flag.
- *
- *  OUTPUTS       : None.
- *
- *  RETURNS       : void
- *
- *  FUNCTION      : Performs 2-tap linear interpolation in two dimensions.
- *
- *  SPECIAL NOTES : Expansion is performed one band at a time to help with
- *                  caching.
- *
- ****************************************************************************/
-void vp8_scale_frame
-(
-    YV12_BUFFER_CONFIG *src,
-    YV12_BUFFER_CONFIG *dst,
-    unsigned char *temp_area,
-    unsigned char temp_height,
-    unsigned int hscale,
-    unsigned int hratio,
-    unsigned int vscale,
-    unsigned int vratio,
-    unsigned int interlaced
-)
-{
-    int i;
-    int dw = (hscale - 1 + src->y_width * hratio) / hscale;
-    int dh = (vscale - 1 + src->y_height * vratio) / vscale;
-
-    // call our internal scaling routines!!
-    Scale2D((unsigned char *) src->y_buffer, src->y_stride, src->y_width, src->y_height,
-            (unsigned char *) dst->y_buffer, dst->y_stride, dw, dh,
-            temp_area, temp_height, hscale, hratio, vscale, vratio, interlaced);
-
-    if (dw < (int)dst->y_width)
-        for (i = 0; i < dh; i++)
-            duck_memset(dst->y_buffer + i * dst->y_stride + dw - 1, dst->y_buffer[i*dst->y_stride+dw-2], dst->y_width - dw + 1);
-
-    if (dh < (int)dst->y_height)
-        for (i = dh - 1; i < (int)dst->y_height; i++)
-            duck_memcpy(dst->y_buffer + i * dst->y_stride, dst->y_buffer + (dh - 2) * dst->y_stride, dst->y_width + 1);
-
-    Scale2D((unsigned char *) src->u_buffer, src->uv_stride, src->uv_width, src->uv_height,
-            (unsigned char *) dst->u_buffer, dst->uv_stride, dw / 2, dh / 2,
-            temp_area, temp_height, hscale, hratio, vscale, vratio, interlaced);
-
-    if (dw / 2 < (int)dst->uv_width)
-        for (i = 0; i < dst->uv_height; i++)
-            duck_memset(dst->u_buffer + i * dst->uv_stride + dw / 2 - 1, dst->u_buffer[i*dst->uv_stride+dw/2-2], dst->uv_width - dw / 2 + 1);
-
-    if (dh / 2 < (int)dst->uv_height)
-        for (i = dh / 2 - 1; i < (int)dst->y_height / 2; i++)
-            duck_memcpy(dst->u_buffer + i * dst->uv_stride, dst->u_buffer + (dh / 2 - 2)*dst->uv_stride, dst->uv_width);
-
-    Scale2D((unsigned char *) src->v_buffer, src->uv_stride, src->uv_width, src->uv_height,
-            (unsigned char *) dst->v_buffer, dst->uv_stride, dw / 2, dh / 2,
-            temp_area, temp_height, hscale, hratio, vscale, vratio, interlaced);
-
-    if (dw / 2 < (int)dst->uv_width)
-        for (i = 0; i < dst->uv_height; i++)
-            duck_memset(dst->v_buffer + i * dst->uv_stride + dw / 2 - 1, dst->v_buffer[i*dst->uv_stride+dw/2-2], dst->uv_width - dw / 2 + 1);
-
-    if (dh / 2 < (int) dst->uv_height)
-        for (i = dh / 2 - 1; i < (int)dst->y_height / 2; i++)
-            duck_memcpy(dst->v_buffer + i * dst->uv_stride, dst->v_buffer + (dh / 2 - 2)*dst->uv_stride, dst->uv_width);
-}
-/****************************************************************************
- *
- *  ROUTINE       : any_ratio_2d_scale
- *
- *  INPUTS        : SCALE_INSTANCE *si      : Pointer to post-processor instance (NOT USED).
- *                  const unsigned char *source : Pointer to source image.
- *                  unsigned int source_pitch    : Stride of source image.
- *                  unsigned int source_width    : Width of source image.
- *                  unsigned int source_height   : Height of source image (NOT USED).
- *                  unsigned char *dest         : Pointer to destination image.
- *                  unsigned int dest_pitch      : Stride of destination image.
- *                  unsigned int dest_width      : Width of destination image.
- *                  unsigned int dest_height     : Height of destination image.
- *
- *  OUTPUTS       : None.
- *
- *  RETURNS       : int: 1 if image scaled, 0 if image could not be scaled.
- *
- *  FUNCTION      : Scale the image with changing apect ratio.
- *
- *  SPECIAL NOTES : This scaling is a bi-linear scaling. Need to re-work the
- *                  whole function for new scaling algorithm.
- *
- ****************************************************************************/
-static
-int any_ratio_2d_scale
-(
-    SCALE_VARS *si,
-    const unsigned char *source,
-    unsigned int source_pitch,
-    unsigned int source_width,
-    unsigned int source_height,
-    unsigned char *dest,
-    unsigned int dest_pitch,
-    unsigned int dest_width,
-    unsigned int dest_height
-)
-{
-    unsigned int i, k;
-    unsigned int src_band_height  = 0;
-    unsigned int dest_band_height = 0;
-
-    // suggested scale factors
-    int hs = si->HScale;
-    int hr = si->HRatio;
-    int vs = si->VScale;
-    int vr = si->VRatio;
-
-    // assume the ratios are scalable instead of should be centered
-    int ratio_scalable = 1;
-
-    void (*horiz_line_scale)(const unsigned char *, unsigned int, unsigned char *, unsigned int) = NULL;
-    void (*vert_band_scale)(unsigned char *, unsigned int, unsigned int) = NULL;
-    void (*last_vert_band_scale)(unsigned char *, unsigned int, unsigned int) = NULL;
-
-    (void) si;
-
-    // find out the ratio for each direction
-    switch (hr * 10 / hs)
-    {
-    case 8:
-        // 4-5 Scale in Width direction
-        horiz_line_scale = g_scaling_ptrs->vpxhorizontal_line_4_5_scale_t;
-        break;
-    case 6:
-        // 3-5 Scale in Width direction
-        horiz_line_scale = g_scaling_ptrs->vpxhorizontal_line_3_5_scale_t;
-        break;
-    case 5:
-        // 1-2 Scale in Width direction
-        horiz_line_scale = g_scaling_ptrs->vpxhorizontal_line_1_2_scale_t;
-        break;
-    case 10:
-        // no scale in Width direction
-        horiz_line_scale = horizontal_line_copy;
-        break;
-    default:
-        // The ratio is not acceptable now
-        // throw("The ratio is not acceptable for now!");
-        ratio_scalable = 0;
-        break;
-    }
-
-    switch (vr * 10 / vs)
-    {
-    case 8:
-        // 4-5 Scale in vertical direction
-        vert_band_scale     = g_scaling_ptrs->vpxvertical_band_4_5_scale_t;
-        last_vert_band_scale = g_scaling_ptrs->vpxlast_vertical_band_4_5_scale_t;
-        src_band_height     = 4;
-        dest_band_height    = 5;
-        break;
-    case 6:
-        // 3-5 Scale in vertical direction
-        vert_band_scale     = g_scaling_ptrs->vpxvertical_band_3_5_scale_t;
-        last_vert_band_scale = g_scaling_ptrs->vpxlast_vertical_band_3_5_scale_t;
-        src_band_height     = 3;
-        dest_band_height    = 5;
-        break;
-    case 5:
-        // 1-2 Scale in vertical direction
-        vert_band_scale     = g_scaling_ptrs->vpxvertical_band_1_2_scale_t;
-        last_vert_band_scale = g_scaling_ptrs->vpxlast_vertical_band_1_2_scale_t;
-        src_band_height     = 1;
-        dest_band_height    = 2;
-        break;
-    case 10:
-        // no scale in Width direction
-        vert_band_scale     = null_scale;
-        last_vert_band_scale = null_scale;
-        src_band_height     = 4;
-        dest_band_height    = 4;
-        break;
-    default:
-        // The ratio is not acceptable now
-        // throw("The ratio is not acceptable for now!");
-        ratio_scalable = 0;
-        break;
-    }
-
-    if (ratio_scalable == 0)
-        return ratio_scalable;
-
-    horiz_line_scale(source, source_width, dest, dest_width);
-
-    // except last band
-    for (k = 0; k < (dest_height + dest_band_height - 1) / dest_band_height - 1; k++)
-    {
-        // scale one band horizontally
-        for (i = 1; i < src_band_height; i++)
-        {
-            horiz_line_scale(source + i * source_pitch,
-                             source_width,
-                             dest + i * dest_pitch,
-                             dest_width);
-        }
-
-        // first line of next band
-        horiz_line_scale(source + src_band_height * source_pitch,
-                         source_width,
-                         dest + dest_band_height * dest_pitch,
-                         dest_width);
-
-        // Vertical scaling is in place
-        vert_band_scale(dest, dest_pitch, dest_width);
-
-        // Next band...
-        source += src_band_height  * source_pitch;
-        dest   += dest_band_height * dest_pitch;
-    }
-
-    // scale one band horizontally
-    for (i = 1; i < src_band_height; i++)
-    {
-        horiz_line_scale(source + i * source_pitch,
-                         source_width,
-                         dest + i * dest_pitch,
-                         dest_width);
-    }
-
-    // Vertical scaling is in place
-    last_vert_band_scale(dest, dest_pitch, dest_width);
-
-    return ratio_scalable;
-}
-
-/****************************************************************************
- *
- *  ROUTINE       : any_ratio_frame_scale
- *
- *  INPUTS        : SCALE_INSTANCE *si       : Pointer to post-processor instance (NOT USED).
- *                  unsigned char *frame_buffer           : Pointer to source image.
- *                  int YOffset                : Offset from start of buffer to Y samples.
- *                  int UVOffset               : Offset from start of buffer to UV samples.
- *
- *  OUTPUTS       : None.
- *
- *  RETURNS       : int: 1 if image scaled, 0 if image could not be scaled.
- *
- *  FUNCTION      : Scale the image with changing apect ratio.
- *
- *  SPECIAL NOTES : None.
- *
- ****************************************************************************/
-static
-int any_ratio_frame_scale(SCALE_VARS *scale_vars, int YOffset, int UVOffset)
-{
-    int i;
-    int ew;
-    int eh;
-
-    // suggested scale factors
-    int hs = scale_vars->HScale;
-    int hr = scale_vars->HRatio;
-    int vs = scale_vars->VScale;
-    int vr = scale_vars->VRatio;
-
-    int ratio_scalable = 1;
-
-    int sw = (scale_vars->expanded_frame_width * hr + hs - 1) / hs;
-    int sh = (scale_vars->expanded_frame_height * vr + vs - 1) / vs;
-    int dw = scale_vars->expanded_frame_width;
-    int dh = scale_vars->expanded_frame_height;
-    YV12_BUFFER_CONFIG *src_yuv_config = scale_vars->src_yuv_config;
-    YV12_BUFFER_CONFIG *dst_yuv_config = scale_vars->dst_yuv_config;
-
-    if (hr == 3)
-        ew = (sw + 2) / 3 * 3 * hs / hr;
-    else
-        ew = (sw + 7) / 8 * 8 * hs / hr;
-
-    if (vr == 3)
-        eh = (sh + 2) / 3 * 3 * vs / vr;
-    else
-        eh = (sh + 7) / 8 * 8 * vs / vr;
-
-    ratio_scalable = any_ratio_2d_scale(scale_vars,
-                                        (const unsigned char *)src_yuv_config->y_buffer,
-                                        src_yuv_config->y_stride, sw, sh,
-                                        (unsigned char *) dst_yuv_config->y_buffer + YOffset,
-                                        dst_yuv_config->y_stride, dw, dh);
-
-    for (i = 0; i < eh; i++)
-        duck_memset(dst_yuv_config->y_buffer + YOffset + i * dst_yuv_config->y_stride + dw, 0, ew - dw);
-
-    for (i = dh; i < eh; i++)
-        duck_memset(dst_yuv_config->y_buffer + YOffset + i * dst_yuv_config->y_stride, 0, ew);
-
-    if (ratio_scalable == 0)
-        return ratio_scalable;
-
-    sw = (sw + 1) >> 1;
-    sh = (sh + 1) >> 1;
-    dw = (dw + 1) >> 1;
-    dh = (dh + 1) >> 1;
-
-    any_ratio_2d_scale(scale_vars,
-                       (const unsigned char *)src_yuv_config->u_buffer,
-                       src_yuv_config->y_stride / 2, sw, sh,
-                       (unsigned char *)dst_yuv_config->u_buffer + UVOffset,
-                       dst_yuv_config->uv_stride, dw, dh);
-
-    any_ratio_2d_scale(scale_vars,
-                       (const unsigned char *)src_yuv_config->v_buffer,
-                       src_yuv_config->y_stride / 2, sw, sh,
-                       (unsigned char *)dst_yuv_config->v_buffer + UVOffset,
-                       dst_yuv_config->uv_stride, dw, dh);
-
-    return ratio_scalable;
-}
-
-/****************************************************************************
- *
- *  ROUTINE       : center_image
- *
- *  INPUTS        : SCALE_INSTANCE *si       : Pointer to post-processor instance.
- *
- *  OUTPUTS       : None.
- *
- *  RETURNS       : void
- *
- *  FUNCTION      : Centers the image without scaling in the output buffer.
- *
- *  SPECIAL NOTES : None.
- *
- ****************************************************************************/
-static void
-center_image(YV12_BUFFER_CONFIG *src_yuv_config, YV12_BUFFER_CONFIG *dst_yuv_config)
-{
-    int i;
-    int row_offset, col_offset;
-    char *src_data_pointer;
-    char *dst_data_pointer;
-
-    // center values
-    row_offset = (dst_yuv_config->y_height - src_yuv_config->y_height) / 2;
-    col_offset = (dst_yuv_config->y_width - src_yuv_config->y_width) / 2;
-
-    // Y's
-    src_data_pointer = src_yuv_config->y_buffer;
-    dst_data_pointer = (char *)dst_yuv_config->y_buffer + (row_offset * dst_yuv_config->y_stride) + col_offset;
-
-    for (i = 0; i < src_yuv_config->y_height; i++)
-    {
-        duck_memcpy(dst_data_pointer, src_data_pointer, src_yuv_config->y_width);
-        dst_data_pointer += dst_yuv_config->y_stride;
-        src_data_pointer += src_yuv_config->y_stride;
-    }
-
-    row_offset /= 2;
-    col_offset /= 2;
-
-    // U's
-    src_data_pointer = src_yuv_config->u_buffer;
-    dst_data_pointer = (char *)dst_yuv_config->u_buffer + (row_offset * dst_yuv_config->uv_stride) + col_offset;
-
-    for (i = 0; i < src_yuv_config->uv_height; i++)
-    {
-        duck_memcpy(dst_data_pointer, src_data_pointer, src_yuv_config->uv_width);
-        dst_data_pointer += dst_yuv_config->uv_stride;
-        src_data_pointer += src_yuv_config->uv_stride;
-    }
-
-    // V's
-    src_data_pointer = src_yuv_config->v_buffer;
-    dst_data_pointer = (char *)dst_yuv_config->v_buffer + (row_offset * dst_yuv_config->uv_stride) + col_offset;
-
-    for (i = 0; i < src_yuv_config->uv_height; i++)
-    {
-        duck_memcpy(dst_data_pointer, src_data_pointer, src_yuv_config->uv_width);
-        dst_data_pointer += dst_yuv_config->uv_stride;
-        src_data_pointer += src_yuv_config->uv_stride;
-    }
-}
-
-/****************************************************************************
- *
- *  ROUTINE       : scale_or_center
- *
- *  INPUTS        : SCALE_INSTANCE *si       : Pointer to post-processor instance.
- *
- *
- *
- *  OUTPUTS       : None.
- *
- *  RETURNS       : void
- *
- *  FUNCTION      : Decides to scale or center image in scale buffer for blit
- *
- *  SPECIAL NOTES : None.
- *
- ****************************************************************************/
-void
-vp8_yv12_scale_or_center
-(
-    YV12_BUFFER_CONFIG *src_yuv_config,
-    YV12_BUFFER_CONFIG *dst_yuv_config,
-    int expanded_frame_width,
-    int expanded_frame_height,
-    int scaling_mode,
-    int HScale,
-    int HRatio,
-    int VScale,
-    int VRatio
-)
-{
-//    if ( ppi->post_processing_level )
-    //      update_umvborder ( ppi, frame_buffer );
-
-
-    switch (scaling_mode)
-    {
-    case SCALE_TO_FIT:
-    case MAINTAIN_ASPECT_RATIO:
-    {
-        SCALE_VARS scale_vars;
-        // center values
-#if 1
-        int row = (dst_yuv_config->y_height - expanded_frame_height) / 2;
-        int col = (dst_yuv_config->y_width  - expanded_frame_width) / 2;
-//        int YOffset  = row * dst_yuv_config->y_width + col;
-//        int UVOffset = (row>>1) * dst_yuv_config->uv_width + (col>>1);
-        int YOffset  = row * dst_yuv_config->y_stride + col;
-        int UVOffset = (row >> 1) * dst_yuv_config->uv_stride + (col >> 1);
-#else
-        int row = (src_yuv_config->y_height - expanded_frame_height) / 2;
-        int col = (src_yuv_config->y_width  - expanded_frame_width) / 2;
-        int YOffset  = row * src_yuv_config->y_width + col;
-        int UVOffset = (row >> 1) * src_yuv_config->uv_width + (col >> 1);
-#endif
-
-        scale_vars.dst_yuv_config = dst_yuv_config;
-        scale_vars.src_yuv_config = src_yuv_config;
-        scale_vars.HScale = HScale;
-        scale_vars.HRatio = HRatio;
-        scale_vars.VScale = VScale;
-        scale_vars.VRatio = VRatio;
-        scale_vars.expanded_frame_width = expanded_frame_width;
-        scale_vars.expanded_frame_height = expanded_frame_height;
-
-        // perform center and scale
-        any_ratio_frame_scale(&scale_vars, YOffset, UVOffset);
-
-        break;
-    }
-    case CENTER:
-        center_image(src_yuv_config, dst_yuv_config);
-        break;
-
-    default:
-        break;
-    }
-}

diff --git a/vpx_scale/leapster/yv12extend.c b/vpx_scale/leapster/yv12extend.c
deleted file mode 100644
index 1cddd95..0000000
--- a/vpx_scale/leapster/yv12extend.c
+++ /dev/null

@@ -1,232 +0,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-/****************************************************************************
- *
- *   Module Title :     yv12extend.c
- *
- *   Description  :
- *
- ***************************************************************************/
-
-/****************************************************************************
-*  Header Files
-****************************************************************************/
-//#include <stdlib.h>
-#include "vpx_scale/yv12config.h"
-#include "vpx_mem/vpx_mem.h"
-
-/****************************************************************************
-*  Exports
-****************************************************************************/
-
-/****************************************************************************
- *
- ****************************************************************************/
-void
-vp8_yv12_extend_frame_borders(YV12_BUFFER_CONFIG *ybf)
-{
-    int i;
-    char *src_ptr1, *src_ptr2;
-    char *dest_ptr1, *dest_ptr2;
-
-    unsigned int Border;
-    int plane_stride;
-    int plane_height;
-    int plane_width;
-
-    /***********/
-    /* Y Plane */
-    /***********/
-    Border = ybf->border;
-    plane_stride = ybf->y_stride;
-    plane_height = ybf->y_height;
-    plane_width = ybf->y_width;
-
-    // copy the left and right most columns out
-    src_ptr1 = ybf->y_buffer;
-    src_ptr2 = src_ptr1 + plane_width - 1;
-    dest_ptr1 = src_ptr1 - Border;
-    dest_ptr2 = src_ptr2 + 1;
-
-    for (i = 0; i < plane_height; i++)
-    {
-        memset(dest_ptr1, src_ptr1[0], Border);
-        memset(dest_ptr2, src_ptr2[0], Border);
-        src_ptr1  += plane_stride;
-        src_ptr2  += plane_stride;
-        dest_ptr1 += plane_stride;
-        dest_ptr2 += plane_stride;
-    }
-
-    // Now copy the top and bottom source lines into each line of the respective borders
-    src_ptr1 = ybf->y_buffer - Border;
-    src_ptr2 = src_ptr1 + (plane_height * plane_stride) - plane_stride;
-    dest_ptr1 = src_ptr1 - (Border * plane_stride);
-    dest_ptr2 = src_ptr2 + plane_stride;
-
-    for (i = 0; i < (int)Border; i++)
-    {
-        memcpy(dest_ptr1, src_ptr1, plane_stride);
-        memcpy(dest_ptr2, src_ptr2, plane_stride);
-        dest_ptr1 += plane_stride;
-        dest_ptr2 += plane_stride;
-    }
-
-    plane_stride /= 2;
-    plane_height /= 2;
-    plane_width /= 2;
-    Border /= 2;
-
-    /***********/
-    /* U Plane */
-    /***********/
-
-    // copy the left and right most columns out
-    src_ptr1 = ybf->u_buffer;
-    src_ptr2 = src_ptr1 + plane_width - 1;
-    dest_ptr1 = src_ptr1 - Border;
-    dest_ptr2 = src_ptr2 + 1;
-
-    for (i = 0; i < plane_height; i++)
-    {
-        memset(dest_ptr1, src_ptr1[0], Border);
-        memset(dest_ptr2, src_ptr2[0], Border);
-        src_ptr1  += plane_stride;
-        src_ptr2  += plane_stride;
-        dest_ptr1 += plane_stride;
-        dest_ptr2 += plane_stride;
-    }
-
-    // Now copy the top and bottom source lines into each line of the respective borders
-    src_ptr1 = ybf->u_buffer - Border;
-    src_ptr2 = src_ptr1 + (plane_height * plane_stride) - plane_stride;
-    dest_ptr1 = src_ptr1 - (Border * plane_stride);
-    dest_ptr2 = src_ptr2 + plane_stride;
-
-    for (i = 0; i < (int)(Border); i++)
-    {
-        memcpy(dest_ptr1, src_ptr1, plane_stride);
-        memcpy(dest_ptr2, src_ptr2, plane_stride);
-        dest_ptr1 += plane_stride;
-        dest_ptr2 += plane_stride;
-    }
-
-    /***********/
-    /* V Plane */
-    /***********/
-
-    // copy the left and right most columns out
-    src_ptr1 = ybf->v_buffer;
-    src_ptr2 = src_ptr1 + plane_width - 1;
-    dest_ptr1 = src_ptr1 - Border;
-    dest_ptr2 = src_ptr2 + 1;
-
-    for (i = 0; i < plane_height; i++)
-    {
-        memset(dest_ptr1, src_ptr1[0], Border);
-        memset(dest_ptr2, src_ptr2[0], Border);
-        src_ptr1  += plane_stride;
-        src_ptr2  += plane_stride;
-        dest_ptr1 += plane_stride;
-        dest_ptr2 += plane_stride;
-    }
-
-    // Now copy the top and bottom source lines into each line of the respective borders
-    src_ptr1 = ybf->v_buffer - Border;
-    src_ptr2 = src_ptr1 + (plane_height * plane_stride) - plane_stride;
-    dest_ptr1 = src_ptr1 - (Border * plane_stride);
-    dest_ptr2 = src_ptr2 + plane_stride;
-
-    for (i = 0; i < (int)(Border); i++)
-    {
-        memcpy(dest_ptr1, src_ptr1, plane_stride);
-        memcpy(dest_ptr2, src_ptr2, plane_stride);
-        dest_ptr1 += plane_stride;
-        dest_ptr2 += plane_stride;
-    }
-}
-/****************************************************************************
- *
- *  ROUTINE       : vp8_yv12_copy_frame
- *
- *  INPUTS        :
- *
- *  OUTPUTS       : None.
- *
- *  RETURNS       : void
- *
- *  FUNCTION      : Copies the source image into the destination image and
- *                  updates the destination's UMV borders.
- *
- *  SPECIAL NOTES : The frames are assumed to be identical in size.
- *
- ****************************************************************************/
-void
-vp8_yv12_copy_frame(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc)
-{
-    int row;
-    int i;
-    unsigned int *source;
-    _Uncached unsigned int *dest;
-    int height;
-    int width;
-
-    height = src_ybc->y_height + (src_ybc->border * 2);
-    width =  src_ybc->y_width + (src_ybc->border * 2);
-    width /= 4;
-    source = (unsigned int *)(src_ybc->y_buffer - (src_ybc->border * src_ybc->y_stride) - src_ybc->border);
-    dest = (_Uncached unsigned int *)(dst_ybc->y_buffer - (dst_ybc->border * dst_ybc->y_stride) - dst_ybc->border);
-
-    for (row = 0; row < height; row++)
-    {
-        for (i = 0; i < width; i++)
-        {
-            dest[i] = source[i];
-        }
-
-        source += width;
-        dest   += width;
-    }
-
-    height = src_ybc->uv_height + (src_ybc->border);
-    width =  src_ybc->uv_width + (src_ybc->border);
-    width /= 4;
-
-    source = (unsigned int *)(src_ybc->u_buffer - (src_ybc->border / 2 * src_ybc->uv_stride) - src_ybc->border / 2);
-    dest = (_Uncached unsigned int *)(dst_ybc->u_buffer - (dst_ybc->border / 2 * dst_ybc->uv_stride) - dst_ybc->border / 2);
-
-    for (row = 0; row < height; row++)
-    {
-        for (i = 0; i < width; i++)
-        {
-            dest[i] = source[i];
-        }
-
-        source += width;
-        dest   += width;
-    }
-
-    source = (unsigned int *)(src_ybc->v_buffer - (src_ybc->border / 2 * src_ybc->uv_stride) - src_ybc->border / 2);
-    dest = (_Uncached unsigned int *)(dst_ybc->v_buffer - (dst_ybc->border / 2 * dst_ybc->uv_stride) - dst_ybc->border / 2);
-
-    for (row = 0; row < height; row++)
-    {
-        for (i = 0; i < width; i++)
-        {
-            dest[i] = source[i];
-        }
-
-        source += width;
-        dest   += width;
-    }
-
-}

diff --git a/vpx_scale/symbian/gen_scalers_armv4.asm b/vpx_scale/symbian/gen_scalers_armv4.asm
deleted file mode 100644
index e495184..0000000
--- a/vpx_scale/symbian/gen_scalers_armv4.asm
+++ /dev/null

@@ -1,774 +0,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    EXPORT  |horizontal_line_4_5_scale_armv4|
-    EXPORT  |vertical_band_4_5_scale_armv4|
-    EXPORT  |horizontal_line_2_3_scale_armv4|
-    EXPORT  |vertical_band_2_3_scale_armv4|
-    EXPORT  |horizontal_line_3_5_scale_armv4|
-    EXPORT  |vertical_band_3_5_scale_armv4|
-    EXPORT  |horizontal_line_3_4_scale_armv4|
-    EXPORT  |vertical_band_3_4_scale_armv4|
-    EXPORT  |horizontal_line_1_2_scale_armv4|
-    EXPORT  |vertical_band_1_2_scale_armv4|
-
-    AREA    |.text|, CODE, READONLY  ; name this block of code
-
-src         RN  r0
-srcw        RN  r1
-dest        RN  r2
-mask        RN  r12
-c51_205     RN  r10
-c102_154    RN  r11
-;/****************************************************************************
-; *
-; *  ROUTINE       : horizontal_line_4_5_scale_armv4
-; *
-; *  INPUTS        : const unsigned char *source : Pointer to source data.
-; *                  unsigned int source_width    : Stride of source.
-; *                  unsigned char *dest         : Pointer to destination data.
-; *                  unsigned int dest_width      : Stride of destination (NOT USED).
-; *
-; *  OUTPUTS       : None.
-; *
-; *  RETURNS       : void
-; *
-; *  FUNCTION      : Copies horizontal line of pixels from source to
-; *                  destination scaling up by 4 to 5.
-; *
-; *  SPECIAL NOTES : None.
-; *
-; ****************************************************************************/
-;void horizontal_line_4_5_scale_armv4
-;(
-;   r0 = UINT8 *source
-;   r1 = UINT32 source_width
-;   r2 = UINT8 *dest
-;   r3 = UINT32 dest_width
-;)
-|horizontal_line_4_5_scale_armv4| PROC
-    stmdb   sp!, {r4 - r11, lr}
-
-    mov     mask, #255              ; mask for selection
-    ldr     c51_205, =0x3300cd
-    ldr     c102_154, =0x66009a
-
-    ldr     r3, [src], #4
-
-hl45_loop
-
-    and     r4, r3, mask            ; a = src[0]
-    and     r5, mask, r3, lsr #8    ; b = src[1]
-    strb    r4, [dest], #1
-
-    orr     r6, r4, r5, lsl #16     ; b | a
-    and     r7, mask, r3, lsr #16   ; c = src[2]
-    mul     r6, c51_205, r6         ; a * 51 + 205 * b
-
-    orr     r5, r5, r7, lsl #16     ; c | b
-    mul     r5, c102_154, r5        ; b * 102 + 154 * c
-    add     r6, r6, #0x8000
-    and     r8, mask, r3, lsr #24   ; d = src[3]
-    mov     r6, r6, lsr #24
-    strb    r6, [dest], #1
-
-    orr     r7, r8, r7, lsl #16     ; c | d
-    mul     r7, c102_154, r7        ; c * 154 + 102 * d
-    add     r5, r5, #0x8000
-    ldr     r3, [src], #4
-    mov     r5, r5, lsr #24
-    strb    r5, [dest], #1
-
-    add     r7, r7, #0x8000
-    and     r9, mask, r3            ; e = src[4]
-    orr     r9, r9, r8, lsl #16     ; d | e
-    mul     r9, c51_205, r9         ; d * 205 + 51 * e
-    mov     r7, r7, lsr #24
-    strb    r7, [dest], #1
-
-    add     r9, r9, #0x8000
-    subs    srcw, srcw, #4
-    mov     r9, r9, lsr #24
-    strb    r9, [dest], #1
-
-    bne     hl45_loop
-
-    and     r4, r3, mask
-    and     r5, mask, r3, lsl #8
-    strb    r4, [dest], #1
-
-    orr     r6, r4, r5, lsl #16     ; b | a
-    mul     r6, c51_205, r6
-
-    and     r7, mask, r3, lsl #16
-    orr     r5, r5, r7, lsl #16     ; c | b
-    mul     r5, c102_154, r5
-    add     r6, r6, #0x8000
-    and     r8, mask, r3, lsl #24
-    mov     r6, r6, lsr #24
-    strb    r6, [dest], #1
-
-    orr     r7, r8, r7, lsl #16     ; c | d
-    mul     r7, c102_154, r7
-    add     r5, r5, #0x8000
-    mov     r5, r5, lsr #24
-    strb    r5, [dest], #1
-
-    add     r7, r7, #0x8000
-    mov     r7, r7, lsr #24
-    strb    r7, [dest], #1
-
-    ldrb    r3, [src]
-    strb    r3, [dest], #1
-
-    ldmia   sp!, {r4 - r11, pc}
-    ENDP    ;|vp8cx_horizontal_line_4_5_scale_c|
-
-;/****************************************************************************
-; *
-; *  ROUTINE       : vertical_band_4_5_scale_armv4
-; *
-; *  INPUTS        : unsigned char *dest    : Pointer to destination data.
-; *                  unsigned int dest_pitch : Stride of destination data.
-; *                  unsigned int dest_width : Width of destination data.
-; *
-; *  OUTPUTS       : None.
-; *
-; *  RETURNS       : void
-; *
-; *  FUNCTION      : Scales vertical band of pixels by scale 4 to 5. The
-; *                  height of the band scaled is 4-pixels.
-; *
-; *  SPECIAL NOTES : The routine uses the first line of the band below
-; *                  the current band.
-; *
-; ****************************************************************************/
-;void vertical_band_4_5_scale_armv4
-;(
-;   r0 = UINT8 *dest
-;   r1 = UINT32 dest_pitch
-;   r2 = UINT32 dest_width
-;)
-|vertical_band_4_5_scale_armv4| PROC
-    stmdb   sp!, {r4 - r11, lr}
-
-    ldr     c51_205, =0x3300cd
-    ldr     c102_154, =0x66009a
-
-vl45_loop
-    mov     r3, src
-    ldrb    r4, [r3], r1            ; a = des [0]
-    ldrb    r5, [r3], r1            ; b = des [dest_pitch]
-    ldrb    r7, [r3], r1            ; c = des[dest_pitch*2]
-    add     lr, src, r1
-
-    orr     r6, r4, r5, lsl #16     ; b | a
-    mul     r6, c51_205, r6         ; a * 51 + 205 * b
-
-    ldrb    r8, [r3], r1            ; d = des[dest_pitch*3]
-    orr     r5, r5, r7, lsl #16     ; c | b
-    mul     r5, c102_154, r5        ; b * 102 + 154 * c
-    add     r6, r6, #0x8000
-    orr     r7, r8, r7, lsl #16     ; c | d
-    mov     r6, r6, lsr #24
-    strb    r6, [lr], r1
-
-    ldrb    r9, [r3, r1]            ; e = des [dest_pitch * 5]
-    mul     r7, c102_154, r7        ; c * 154 + 102 * d
-    add     r5, r5, #0x8000
-    orr     r9, r9, r8, lsl #16     ; d | e
-    mov     r5, r5, lsr #24
-    strb    r5, [lr], r1
-
-    mul     r9, c51_205, r9         ; d * 205 + 51 * e
-    add     r7, r7, #0x8000
-    add     src, src, #1
-    mov     r7, r7, lsr #24
-    strb    r7, [lr], r1
-
-    add     r9, r9, #0x8000
-    subs    r2, r2, #1
-    mov     r9, r9, lsr #24
-    strb    r9, [lr], r1
-
-    bne     vl45_loop
-
-    ldmia   sp!, {r4 - r11, pc}
-    ENDP    ;|vertical_band_4_5_scale_armv4|
-
-;/****************************************************************************
-; *
-; *  ROUTINE       : horizontal_line_2_3_scale_armv4
-; *
-; *  INPUTS        : const unsigned char *source : Pointer to source data.
-; *                  unsigned int source_width    : Stride of source.
-; *                  unsigned char *dest         : Pointer to destination data.
-; *                  unsigned int dest_width      : Stride of destination (NOT USED).
-; *
-; *  OUTPUTS       : None.
-; *
-; *  RETURNS       : void
-; *
-; *  FUNCTION      : Copies horizontal line of pixels from source to
-; *                  destination scaling up by 2 to 3.
-; *
-; *  SPECIAL NOTES : None.
-; *
-; *
-; ****************************************************************************/
-;void horizontal_line_2_3_scale_armv4
-;(
-;   const unsigned char *source,
-;   unsigned int source_width,
-;   unsigned char *dest,
-;   unsigned int dest_width
-;)
-|horizontal_line_2_3_scale_armv4| PROC
-    stmdb   sp!, {r4 - r11, lr}
-    ldr     lr,  =85
-    ldr     r12, =171
-
-hl23_loop
-
-    ldrb    r3, [src], #1           ; a
-    ldrb    r4, [src], #1           ; b
-    ldrb    r5, [src]               ; c
-
-    strb    r3, [dest], #1
-    mul     r4, r12, r4             ; b * 171
-    mla     r6, lr, r3, r4          ; a * 85
-    mla     r7, lr, r5, r4          ; c * 85
-
-    add     r6, r6, #128
-    mov     r6, r6, lsr #8
-    strb    r6, [dest], #1
-
-    add     r7, r7, #128
-    mov     r7, r7, lsr #8
-    strb    r7, [dest], #1
-
-    subs    srcw, srcw, #2
-    bne     hl23_loop
-
-    ldrb    r4, [src, #1]           ; b
-    strb    r5, [dest], #1
-    strb    r4, [dest, #1]
-
-    mul     r4, r12, r4             ; b * 171
-    mla     r6, lr, r5, r4          ; a * 85 + b *171
-
-    add     r6, r6, #128
-    mov     r6, r6, lsr #8
-    strb    r6, [dest]
-
-    ldmia   sp!, {r4 - r11, pc}
-    ENDP    ;|horizontal_line_2_3_scale_armv4|
-
-;/****************************************************************************
-; *
-; *  ROUTINE       : vertical_band_2_3_scale_armv4
-; *
-; *  INPUTS        : unsigned char *dest    : Pointer to destination data.
-; *                  unsigned int dest_pitch : Stride of destination data.
-; *                  unsigned int dest_width : Width of destination data.
-; *
-; *  OUTPUTS       : None.
-; *
-; *  RETURNS       : void
-; *
-; *  FUNCTION      : Scales vertical band of pixels by scale 2 to 3. The
-; *                  height of the band scaled is 2-pixels.
-; *
-; *  SPECIAL NOTES : The routine uses the first line of the band below
-; *                  the current band.
-; *
-; ****************************************************************************/
-;void vertical_band_2_3_scale_armv4
-;(
-;   r0 = UINT8 *dest
-;   r1 = UINT32 dest_pitch
-;   r2 = UINT32 dest_width
-;)
-|vertical_band_2_3_scale_armv4| PROC
-    stmdb   sp!, {r4 - r8, lr}
-    ldr     lr,  =85
-    ldr     r12, =171
-    add     r3, r1, r1, lsl #1      ; 3 * dest_pitch
-
-vl23_loop
-    ldrb    r4, [src]               ; a = des [0]
-    ldrb    r5, [src, r1]           ; b = des [dest_pitch]
-    ldrb    r7, [src, r3]           ; c = des [dest_pitch*3]
-    subs    r2, r2, #1
-
-    mul     r5, r12, r5             ; b * 171
-    mla     r6, lr, r4, r5          ; a * 85
-    mla     r8, lr, r7, r5          ; c * 85
-
-    add     r6, r6, #128
-    mov     r6, r6, lsr #8
-    strb    r6, [src, r1]
-
-    add     r8, r8, #128
-    mov     r8, r8, lsr #8
-    strb    r8, [src, r1, lsl #1]
-
-    add     src, src, #1
-
-    bne     vl23_loop
-
-    ldmia   sp!, {r4 - r8, pc}
-    ENDP    ;|vertical_band_2_3_scale_armv4|
-
-;/****************************************************************************
-; *
-; *  ROUTINE       : vp8cx_horizontal_line_3_5_scale_c
-; *
-; *  INPUTS        : const unsigned char *source : Pointer to source data.
-; *                  unsigned int source_width    : Stride of source.
-; *                  unsigned char *dest         : Pointer to destination data.
-; *                  unsigned int dest_width      : Stride of destination (NOT USED).
-; *
-; *  OUTPUTS       : None.
-; *
-; *  RETURNS       : void
-; *
-; *  FUNCTION      : Copies horizontal line of pixels from source to
-; *                  destination scaling up by 3 to 5.
-; *
-; *  SPECIAL NOTES : None.
-; *
-; *
-; ****************************************************************************/
-;void vp8cx_horizontal_line_3_5_scale_c
-;(
-;   const unsigned char *source,
-;   unsigned int source_width,
-;   unsigned char *dest,
-;   unsigned int dest_width
-;)
-|horizontal_line_3_5_scale_armv4| PROC
-    stmdb   sp!, {r4 - r11, lr}
-
-    ldr     c51_205, =0x3300cd
-    ldr     c102_154, =0x66009a
-
-    ldrb    r4, [src], #1           ; a = src[0]
-
-hl35_loop
-
-    ldrb    r8, [src], #1           ; b = src[1]
-    strb    r4, [dest], #1
-
-    orr     r6, r4, r8, lsl #16     ; b | a
-    ldrb    r9, [src], #1           ; c = src[2]
-    mul     r6, c102_154, r6        ; a * 102 + 154 * b
-
-    orr     r5, r9, r8, lsl #16     ; b | c
-    mul     r5, c51_205, r5         ; b * 205 + 51 * c
-    add     r6, r6, #0x8000
-    ldrb    r4, [src], #1           ; d = src[3]
-    mov     r6, r6, lsr #24
-    strb    r6, [dest], #1
-
-    orr     r7, r8, r9, lsl #16     ; c | b
-    mul     r7, c51_205, r7         ; c * 205 + 154 * b
-    add     r5, r5, #0x8000
-    mov     r5, r5, lsr #24
-    strb    r5, [dest], #1
-
-    orr     r9, r4, r9, lsl #16     ; c | d
-    mul     r9, c102_154, r9        ; c * 154 + 102 * d
-    add     r7, r7, #0x8000
-    mov     r7, r7, lsr #24
-    strb    r7, [dest], #1
-
-    add     r9, r9, #0x8000
-    subs    srcw, srcw, #3
-    mov     r9, r9, lsr #24
-    strb    r9, [dest], #1
-
-    bpl     hl35_loop
-
-    ldrb    r5, [src], #1           ; b = src[1]
-    strb    r4, [dest], #1
-
-    orr     r6, r4, r8, lsl #16     ; b | a
-    ldrb    r9, [src], #1           ; c = src[2]
-    mul     r6, c102_154, r6        ; a * 102 + 154 * b
-
-    orr     r5, r9, r8, lsl #16     ; b | c
-    mul     r5, c51_205, r5         ; b * 205 + 51 * c
-    add     r6, r6, #0x8000
-    mov     r6, r6, lsr #24
-    strb    r6, [dest], #1
-
-    orr     r7, r8, r9, lsl #16     ; c | b
-    mul     r7, c51_205, r7         ; c * 205 + 154 * b
-    add     r5, r5, #0x8000
-    mov     r5, r5, lsr #24
-    strb    r5, [dest], #1
-
-    add     r7, r7, #0x8000
-    mov     r7, r7, lsr #24
-    strb    r7, [dest], #1
-    strb    r9, [dest], #1
-
-    ldmia   sp!, {r4 - r11, pc}
-    ENDP    ;|vp8cx_horizontal_line_3_5_scale_c|
-
-
-;/****************************************************************************
-; *
-; *  ROUTINE       : vp8cx_vertical_band_3_5_scale_c
-; *
-; *  INPUTS        : unsigned char *dest    : Pointer to destination data.
-; *                  unsigned int dest_pitch : Stride of destination data.
-; *                  unsigned int dest_width : Width of destination data.
-; *
-; *  OUTPUTS       : None.
-; *
-; *  RETURNS       : void
-; *
-; *  FUNCTION      : Scales vertical band of pixels by scale 3 to 5. The
-; *                  height of the band scaled is 3-pixels.
-; *
-; *  SPECIAL NOTES : The routine uses the first line of the band below
-; *                  the current band.
-; *
-; ****************************************************************************/
-;void vertical_band_4_5_scale_armv4
-;(
-;   r0 = UINT8 *dest
-;   r1 = UINT32 dest_pitch
-;   r2 = UINT32 dest_width
-;)
-|vertical_band_3_5_scale_armv4| PROC
-    stmdb   sp!, {r4 - r11, lr}
-
-    ldr     c51_205, =0x3300cd
-    ldr     c102_154, =0x66009a
-
-vl35_loop
-    mov     r3, src
-    ldrb    r4, [r3], r1            ; a = des [0]
-    ldrb    r5, [r3], r1            ; b = des [dest_pitch]
-    ldrb    r7, [r3], r1            ; c = des[dest_pitch*2]
-    add     lr, src, r1
-
-    orr     r8, r4, r5, lsl #16     ; b | a
-    mul     r6, c102_154, r8        ; a * 102 + 154 * b
-
-    ldrb    r8, [r3, r1, lsl #1]    ; d = des[dest_pitch*5]
-    orr     r3, r7, r5, lsl #16     ; b | c
-    mul     r9, c51_205, r3         ; b * 205 + 51 * c
-    add     r6, r6, #0x8000
-    orr     r3, r5, r7, lsl #16     ; c | b
-    mov     r6, r6, lsr #24
-    strb    r6, [lr], r1
-
-    mul     r5, c51_205, r3         ; c * 205 + 154 * b
-    add     r9, r9, #0x8000
-    orr     r3, r8, r7, lsl #16     ; c | d
-    mov     r9, r9, lsr #24
-    strb    r9, [lr], r1
-
-    mul     r7, c102_154, r3        ; c * 154 + 102 * d
-    add     r5, r5, #0x8000
-    add     src, src, #1
-    mov     r5, r5, lsr #24
-    strb    r5, [lr], r1
-
-    add     r7, r7, #0x8000
-    subs    r2, r2, #1
-    mov     r7, r7, lsr #24
-    strb    r7, [lr], r1
-
-
-    bne     vl35_loop
-
-    ldmia   sp!, {r4 - r11, pc}
-    ENDP    ;|vertical_band_3_5_scale_armv4|
-
-;/****************************************************************************
-; *
-; *  ROUTINE       : horizontal_line_3_4_scale_armv4
-; *
-; *  INPUTS        : const unsigned char *source : Pointer to source data.
-; *                  unsigned int source_width    : Stride of source.
-; *                  unsigned char *dest         : Pointer to destination data.
-; *                  unsigned int dest_width      : Stride of destination (NOT USED).
-; *
-; *  OUTPUTS       : None.
-; *
-; *  RETURNS       : void
-; *
-; *  FUNCTION      : Copies horizontal line of pixels from source to
-; *                  destination scaling up by 3 to 4.
-; *
-; *  SPECIAL NOTES : None.
-; *
-; *
-; ****************************************************************************/
-;void horizontal_line_3_4_scale_armv4
-;(
-;   const unsigned char *source,
-;   unsigned int source_width,
-;   unsigned char *dest,
-;   unsigned int dest_width
-;)
-|horizontal_line_3_4_scale_armv4| PROC
-    stmdb   sp!, {r4 - r11, lr}
-
-    ldr     r10, =64
-    ldr     r11, =192
-    mov     r9, #128
-
-    ldrb    r4, [src], #1           ; a = src[0]
-
-hl34_loop
-
-    ldrb    r8, [src], #1           ; b = src[1]
-    ldrb    r7, [src], #1           ; c = src[2]
-    strb    r4, [dest], #1
-
-    mla     r4, r10, r4, r9         ; a*64 + 128
-    mla     r4, r11, r8, r4         ; a*64 + b*192 + 1
-
-    add     r8, r8, #1              ; b + 1
-    add     r8, r8, r7              ; b + c + 1
-    mov     r8, r8, asr #1          ; (b + c + 1) >> 1
-
-    mov     r4, r4, asr #8          ; (a*64 + b*192 + 1) >> 8
-    strb    r4, [dest], #1
-
-    strb    r8, [dest], #1
-
-    ldrb    r4, [src], #1           ; [a+1]
-
-    mla     r7, r11, r7, r9         ; c*192 + 128
-    mla     r7, r4, r10, r7         ; a*64 + b*192 + 128
-
-    subs    srcw, srcw, #3
-
-    mov     r7, r7, asr #8          ; (a*64 + b*192 + 128) >> 8
-    strb    r7, [dest], #1
-
-    bpl     hl34_loop
-
-    ldrb    r8, [src], #1           ; b = src[1]
-    ldrb    r7, [src], #1           ; c = src[2]
-    strb    r4, [dest], #1
-
-    mla     r4, r10, r4, r9         ; a*64 + 128
-    mla     r4, r11, r8, r4         ; a*64 + b*192 + 1
-    mov     r4, r4, asr #8          ; (a*64 + b*192 + 1) >> 8
-    strb    r4, [dest], #1
-
-    add     r8, r8, #1              ; b + 1
-    add     r8, r8, r7              ; b + c + 1
-    mov     r8, r8, asr #1          ; (b + c + 1) >> 1
-    strb    r8, [dest], #1
-    strb    r7, [dest], #1
-
-    ldmia   sp!, {r4 - r11, pc}
-    ENDP    ;|vp8cx_horizontal_line_3_4_scale_c|
-
-
-;/****************************************************************************
-; *
-; *  ROUTINE       : vertical_band_3_4_scale_armv4
-; *
-; *  INPUTS        : unsigned char *dest    : Pointer to destination data.
-; *                  unsigned int dest_pitch : Stride of destination data.
-; *                  unsigned int dest_width : Width of destination data.
-; *
-; *  OUTPUTS       : None.
-; *
-; *  RETURNS       : void
-; *
-; *  FUNCTION      : Scales vertical band of pixels by scale 3 to 4. The
-; *                  height of the band scaled is 3-pixels.
-; *
-; *  SPECIAL NOTES : The routine uses the first line of the band below
-; *                  the current band.
-; *
-; ****************************************************************************/
-;void vertical_band_3_4_scale_armv4
-;(
-;   r0 = UINT8 *dest
-;   r1 = UINT32 dest_pitch
-;   r2 = UINT32 dest_width
-;)
-|vertical_band_3_4_scale_armv4| PROC
-    stmdb   sp!, {r4 - r11, lr}
-
-    ldr     r10, =64
-    ldr     r11, =192
-    mov     r9, #128
-
-;   ldr     r1,[r1]
-vl34_loop
-    mov     r3, src
-    ldrb    r4, [r3], r1            ; a = des [0]
-    ldrb    r5, [r3], r1            ; b = des [dest_pitch]
-    ldrb    r7, [r3], r1            ; c = des [dest_pitch*2]
-    add     lr, src, r1
-
-    mla     r4, r10, r4, r9         ; a*64 + 128
-    mla     r4, r11, r5, r4         ; a*64 + b*192 + 1
-
-    add     r5, r5, #1              ; b + 1
-    add     r5, r5, r7              ; b + c + 1
-    mov     r5, r5, asr #1          ; (b + c + 1) >> 1
-
-    mov     r4, r4, asr #8          ; (a*64 + b*192 + 1) >> 8
-    strb    r4, [lr], r1
-
-    ldrb    r4, [r3, r1]            ; a = des [dest_pitch*4]
-
-    strb    r5, [lr], r1
-
-    mla     r7, r11, r7, r9         ; c*192 + 128
-    mla     r7, r4, r10, r7         ; a*64 + b*192 + 128
-    mov     r7, r7, asr #8          ; (a*64 + b*192 + 128) >> 8
-
-    add     src, src, #1
-    subs    r2, r2, #1
-
-    strb    r7, [lr]
-
-    bne     vl34_loop
-
-    ldmia   sp!, {r4 - r11, pc}
-    ENDP    ;|vertical_band_3_4_scale_armv4|
-
-;/****************************************************************************
-; *
-; *  ROUTINE       : vp8cx_horizontal_line_1_2_scale_c
-; *
-; *  INPUTS        : const unsigned char *source : Pointer to source data.
-; *                  unsigned int source_width    : Stride of source.
-; *                  unsigned char *dest         : Pointer to destination data.
-; *                  unsigned int dest_width      : Stride of destination (NOT USED).
-; *
-; *  OUTPUTS       : None.
-; *
-; *  RETURNS       : void
-; *
-; *  FUNCTION      : Copies horizontal line of pixels from source to
-; *                  destination scaling up by 1 to 2.
-; *
-; *  SPECIAL NOTES : None.
-; *
-; ****************************************************************************/
-;void vp8cx_horizontal_line_1_2_scale_c
-;(
-;   const unsigned char *source,
-;   unsigned int source_width,
-;   unsigned char *dest,
-;   unsigned int dest_width
-;)
-|horizontal_line_1_2_scale_armv4| PROC
-    stmdb   sp!, {r4 - r5, lr}
-
-    sub     srcw, srcw, #1
-
-    ldrb    r3, [src], #1
-    ldrb    r4, [src], #1
-hl12_loop
-    subs    srcw, srcw, #1
-
-    add     r5, r3, r4
-    add     r5, r5, #1
-    mov     r5, r5, lsr #1
-
-    orr     r5, r3, r5, lsl #8
-    strh    r5, [dest], #2
-
-    mov     r3, r4
-
-    ldrneb  r4, [src], #1
-    bne     hl12_loop
-
-    orr     r5, r4, r4, lsl #8
-    strh    r5, [dest]
-
-    ldmia   sp!, {r4 - r5, pc}
-    ENDP    ;|vertical_band_3_5_scale_armv4|
-
-;/****************************************************************************
-; *
-; *  ROUTINE       : vp8cx_vertical_band_1_2_scale_c
-; *
-; *  INPUTS        : unsigned char *dest    : Pointer to destination data.
-; *                  unsigned int dest_pitch : Stride of destination data.
-; *                  unsigned int dest_width : Width of destination data.
-; *
-; *  OUTPUTS       : None.
-; *
-; *  RETURNS       : void
-; *
-; *  FUNCTION      : Scales vertical band of pixels by scale 1 to 2. The
-; *                  height of the band scaled is 1-pixel.
-; *
-; *  SPECIAL NOTES : The routine uses the first line of the band below
-; *                  the current band.
-; *
-; ****************************************************************************/
-;void vp8cx_vertical_band_1_2_scale_c
-;(
-;   r0 = UINT8 *dest
-;   r1 = UINT32 dest_pitch
-;   r2 = UINT32 dest_width
-;)
-|vertical_band_1_2_scale_armv4| PROC
-    stmdb   sp!, {r4 - r7, lr}
-
-    ldr     mask, =0xff00ff             ; mask for selection
-    ldr     lr, = 0x010001
-
-vl12_loop
-    mov     r3, src
-    ldr     r4, [r3], r1
-    ldr     r5, [r3, r1]
-
-    add     src, src, #4
-    subs    r2, r2, #4
-
-    and     r6, r4, mask
-    and     r7, r5, mask
-
-    add     r6, r7, r6
-    add     r6, r6, lr
-
-    and     r4, mask, r4, lsr #8
-    and     r5, mask, r5, lsr #8
-
-    mov     r6, r6, lsr #1
-    and     r6, r6, mask
-
-    add     r4, r5, r4
-    add     r4, r4, lr
-
-    mov     r4, r4, lsr #1
-    and     r4, r4, mask
-
-    orr     r5, r6, r4, lsl #8
-
-    str     r5, [r3]
-
-    bpl     vl12_loop
-
-    ldmia   sp!, {r4 - r7, pc}
-    ENDP    ;|vertical_band_3_5_scale_armv4|
-
-    END

diff --git a/vpx_scale/symbian/gen_scalers_armv4.s b/vpx_scale/symbian/gen_scalers_armv4.s
deleted file mode 100644
index 3dfd0b9..0000000
--- a/vpx_scale/symbian/gen_scalers_armv4.s
+++ /dev/null

@@ -1,808 +0,0 @@
-@ This file was created from a .asm file
-@  using the ads2gas.pl script.
-
-    .equ WIDE_REFERENCE, 0
-    .ifndef ARCHITECTURE
-    .equ ARCHITECTURE, 5
-    .endif
-    .global horizontal_line_4_5_scale_armv4
-    .ifndef NO_TYPE_PSEUDO_OP
-    .type horizontal_line_4_5_scale_armv4, function
-    .endif
-    .global vertical_band_4_5_scale_armv4
-    .ifndef NO_TYPE_PSEUDO_OP
-    .type vertical_band_4_5_scale_armv4, function
-    .endif
-    .global horizontal_line_2_3_scale_armv4
-    .ifndef NO_TYPE_PSEUDO_OP
-    .type horizontal_line_2_3_scale_armv4, function
-    .endif
-    .global vertical_band_2_3_scale_armv4
-    .ifndef NO_TYPE_PSEUDO_OP
-    .type vertical_band_2_3_scale_armv4, function
-    .endif
-    .global horizontal_line_3_5_scale_armv4
-    .ifndef NO_TYPE_PSEUDO_OP
-    .type horizontal_line_3_5_scale_armv4, function
-    .endif
-    .global vertical_band_3_5_scale_armv4
-    .ifndef NO_TYPE_PSEUDO_OP
-    .type vertical_band_3_5_scale_armv4, function
-    .endif
-    .global horizontal_line_3_4_scale_armv4
-    .ifndef NO_TYPE_PSEUDO_OP
-    .type horizontal_line_3_4_scale_armv4, function
-    .endif
-    .global vertical_band_3_4_scale_armv4
-    .ifndef NO_TYPE_PSEUDO_OP
-    .type vertical_band_3_4_scale_armv4, function
-    .endif
-    .global horizontal_line_1_2_scale_armv4
-    .ifndef NO_TYPE_PSEUDO_OP
-    .type horizontal_line_1_2_scale_armv4, function
-    .endif
-    .global vertical_band_1_2_scale_armv4
-    .ifndef NO_TYPE_PSEUDO_OP
-    .type vertical_band_1_2_scale_armv4, function
-    .endif
-
-.text
-
-src         .req    r0
-srcw        .req    r1
-dest        .req    r2
-mask        .req    r12
-c51_205     .req    r10
-c102_154    .req    r11
-@/****************************************************************************
-@ *
-@ *  ROUTINE       : horizontal_line_4_5_scale_armv4
-@ *
-@ *  INPUTS        : const unsigned char *source : Pointer to source data.
-@ *                  unsigned int source_width    : Stride of source.
-@ *                  unsigned char *dest         : Pointer to destination data.
-@ *                  unsigned int dest_width      : Stride of destination (NOT USED).
-@ *
-@ *  OUTPUTS       : None.
-@ *
-@ *  RETU.req_s       : void
-@ *
-@ *  FUNCTION      : Copies horizontal line of pixels from source to
-@ *                  destination scaling up by 4 to 5.
-@ *
-@ *  SPECIAL NOTES : None.
-@ *
-@ ****************************************************************************/
-@void horizontal_line_4_5_scale_armv4
-@(
-@   r0 = UINT8 *source
-@   r1 = UINT32 source_width
-@   r2 = UINT8 *dest
-@   r3 = UINT32 dest_width
-@)
-_HorizontalLine_4_5_Scale_ARMv4:
-    horizontal_line_4_5_scale_armv4: @
-    stmdb   sp!, {r4 - r11, lr}
-
-    mov     mask, #255              @ mask for selection
-    ldr     c51_205, =0x3300cd
-    ldr     c102_154, =0x66009a
-
-    ldr     r3, [src], #4
-
-hl45_loop:
-
-    and     r4, r3, mask            @ a = src[0]
-    and     r5, mask, r3, lsr #8    @ b = src[1]
-    strb    r4, [dest], #1
-
-    orr     r6, r4, r5, lsl #16     @ b | a
-    and     r7, mask, r3, lsr #16   @ c = src[2]
-    mul     r6, c51_205, r6         @ a * 51 + 205 * b
-
-    orr     r5, r5, r7, lsl #16     @ c | b
-    mul     r5, c102_154, r5        @ b * 102 + 154 * c
-    add     r6, r6, #0x8000
-    and     r8, mask, r3, lsr #24   @ d = src[3]
-    mov     r6, r6, lsr #24
-    strb    r6, [dest], #1
-
-    orr     r7, r8, r7, lsl #16     @ c | d
-    mul     r7, c102_154, r7        @ c * 154 + 102 * d
-    add     r5, r5, #0x8000
-    ldr     r3, [src], #4
-    mov     r5, r5, lsr #24
-    strb    r5, [dest], #1
-
-    add     r7, r7, #0x8000
-    and     r9, mask, r3            @ e = src[4]
-    orr     r9, r9, r8, lsl #16     @ d | e
-    mul     r9, c51_205, r9         @ d * 205 + 51 * e
-    mov     r7, r7, lsr #24
-    strb    r7, [dest], #1
-
-    add     r9, r9, #0x8000
-    subs    srcw, srcw, #4
-    mov     r9, r9, lsr #24
-    strb    r9, [dest], #1
-
-    bne     hl45_loop
-
-    and     r4, r3, mask
-    and     r5, mask, r3, lsl #8
-    strb    r4, [dest], #1
-
-    orr     r6, r4, r5, lsl #16     @ b | a
-    mul     r6, c51_205, r6
-
-    and     r7, mask, r3, lsl #16
-    orr     r5, r5, r7, lsl #16     @ c | b
-    mul     r5, c102_154, r5
-    add     r6, r6, #0x8000
-    and     r8, mask, r3, lsl #24
-    mov     r6, r6, lsr #24
-    strb    r6, [dest], #1
-
-    orr     r7, r8, r7, lsl #16     @ c | d
-    mul     r7, c102_154, r7
-    add     r5, r5, #0x8000
-    mov     r5, r5, lsr #24
-    strb    r5, [dest], #1
-
-    add     r7, r7, #0x8000
-    mov     r7, r7, lsr #24
-    strb    r7, [dest], #1
-
-    ldrb    r3, [src]
-    strb    r3, [dest], #1
-
-    ldmia   sp!, {r4 - r11, pc}
-    @   @|vp8cx_horizontal_line_4_5_scale_c|
-
-@/****************************************************************************
-@ *
-@ *  ROUTINE       : vertical_band_4_5_scale_armv4
-@ *
-@ *  INPUTS        : unsigned char *dest    : Pointer to destination data.
-@ *                  unsigned int dest_pitch : Stride of destination data.
-@ *                  unsigned int dest_width : Width of destination data.
-@ *
-@ *  OUTPUTS       : None.
-@ *
-@ *  RETU.req_s       : void
-@ *
-@ *  FUNCTION      : Scales vertical band of pixels by scale 4 to 5. The
-@ *                  height of the band scaled is 4-pixels.
-@ *
-@ *  SPECIAL NOTES : The routine uses the first line of the band below
-@ *                  the current band.
-@ *
-@ ****************************************************************************/
-@void vertical_band_4_5_scale_armv4
-@(
-@   r0 = UINT8 *dest
-@   r1 = UINT32 dest_pitch
-@   r2 = UINT32 dest_width
-@)
-_VerticalBand_4_5_Scale_ARMv4:
-    vertical_band_4_5_scale_armv4: @
-    stmdb   sp!, {r4 - r11, lr}
-
-    ldr     c51_205, =0x3300cd
-    ldr     c102_154, =0x66009a
-
-vl45_loop:
-    mov     r3, src
-    ldrb    r4, [r3], r1            @ a = des [0]
-    ldrb    r5, [r3], r1            @ b = des [dest_pitch]
-    ldrb    r7, [r3], r1            @ c = des[dest_pitch*2]
-    add     lr, src, r1
-
-    orr     r6, r4, r5, lsl #16     @ b | a
-    mul     r6, c51_205, r6         @ a * 51 + 205 * b
-
-    ldrb    r8, [r3], r1            @ d = des[dest_pitch*3]
-    orr     r5, r5, r7, lsl #16     @ c | b
-    mul     r5, c102_154, r5        @ b * 102 + 154 * c
-    add     r6, r6, #0x8000
-    orr     r7, r8, r7, lsl #16     @ c | d
-    mov     r6, r6, lsr #24
-    strb    r6, [lr], r1
-
-    ldrb    r9, [r3, r1]            @ e = des [dest_pitch * 5]
-    mul     r7, c102_154, r7        @ c * 154 + 102 * d
-    add     r5, r5, #0x8000
-    orr     r9, r9, r8, lsl #16     @ d | e
-    mov     r5, r5, lsr #24
-    strb    r5, [lr], r1
-
-    mul     r9, c51_205, r9         @ d * 205 + 51 * e
-    add     r7, r7, #0x8000
-    add     src, src, #1
-    mov     r7, r7, lsr #24
-    strb    r7, [lr], r1
-
-    add     r9, r9, #0x8000
-    subs    r2, r2, #1
-    mov     r9, r9, lsr #24
-    strb    r9, [lr], r1
-
-    bne     vl45_loop
-
-    ldmia   sp!, {r4 - r11, pc}
-    @   @|vertical_band_4_5_scale_armv4|
-
-@/****************************************************************************
-@ *
-@ *  ROUTINE       : horizontal_line_2_3_scale_armv4
-@ *
-@ *  INPUTS        : const unsigned char *source : Pointer to source data.
-@ *                  unsigned int source_width    : Stride of source.
-@ *                  unsigned char *dest         : Pointer to destination data.
-@ *                  unsigned int dest_width      : Stride of destination (NOT USED).
-@ *
-@ *  OUTPUTS       : None.
-@ *
-@ *  RETU.req_s       : void
-@ *
-@ *  FUNCTION      : Copies horizontal line of pixels from source to
-@ *                  destination scaling up by 2 to 3.
-@ *
-@ *  SPECIAL NOTES : None.
-@ *
-@ *
-@ ****************************************************************************/
-@void horizontal_line_2_3_scale_armv4
-@(
-@   const unsigned char *source,
-@   unsigned int source_width,
-@   unsigned char *dest,
-@   unsigned int dest_width
-@)
-_HorizontalLine_2_3_Scale_ARMv4:
-    horizontal_line_2_3_scale_armv4: @
-    stmdb   sp!, {r4 - r11, lr}
-    ldr     lr,  =85
-    ldr     r12, =171
-
-hl23_loop:
-
-    ldrb    r3, [src], #1           @ a
-    ldrb    r4, [src], #1           @ b
-    ldrb    r5, [src]               @ c
-
-    strb    r3, [dest], #1
-    mul     r4, r12, r4             @ b * 171
-    mla     r6, lr, r3, r4          @ a * 85
-    mla     r7, lr, r5, r4          @ c * 85
-
-    add     r6, r6, #128
-    mov     r6, r6, lsr #8
-    strb    r6, [dest], #1
-
-    add     r7, r7, #128
-    mov     r7, r7, lsr #8
-    strb    r7, [dest], #1
-
-    subs    srcw, srcw, #2
-    bne     hl23_loop
-
-    ldrb    r4, [src, #1]           @ b
-    strb    r5, [dest], #1
-    strb    r4, [dest, #1]
-
-    mul     r4, r12, r4             @ b * 171
-    mla     r6, lr, r5, r4          @ a * 85 + b *171
-
-    add     r6, r6, #128
-    mov     r6, r6, lsr #8
-    strb    r6, [dest]
-
-    ldmia   sp!, {r4 - r11, pc}
-    @   @|horizontal_line_2_3_scale_armv4|
-
-@/****************************************************************************
-@ *
-@ *  ROUTINE       : vertical_band_2_3_scale_armv4
-@ *
-@ *  INPUTS        : unsigned char *dest    : Pointer to destination data.
-@ *                  unsigned int dest_pitch : Stride of destination data.
-@ *                  unsigned int dest_width : Width of destination data.
-@ *
-@ *  OUTPUTS       : None.
-@ *
-@ *  RETU.req_s       : void
-@ *
-@ *  FUNCTION      : Scales vertical band of pixels by scale 2 to 3. The
-@ *                  height of the band scaled is 2-pixels.
-@ *
-@ *  SPECIAL NOTES : The routine uses the first line of the band below
-@ *                  the current band.
-@ *
-@ ****************************************************************************/
-@void vertical_band_2_3_scale_armv4
-@(
-@   r0 = UINT8 *dest
-@   r1 = UINT32 dest_pitch
-@   r2 = UINT32 dest_width
-@)
-_VerticalBand_2_3_Scale_ARMv4:
-    vertical_band_2_3_scale_armv4: @
-    stmdb   sp!, {r4 - r8, lr}
-    ldr     lr,  =85
-    ldr     r12, =171
-    add     r3, r1, r1, lsl #1      @ 3 * dest_pitch
-
-vl23_loop:
-    ldrb    r4, [src]               @ a = des [0]
-    ldrb    r5, [src, r1]           @ b = des [dest_pitch]
-    ldrb    r7, [src, r3]           @ c = des [dest_pitch*3]
-    subs    r2, r2, #1
-
-    mul     r5, r12, r5             @ b * 171
-    mla     r6, lr, r4, r5          @ a * 85
-    mla     r8, lr, r7, r5          @ c * 85
-
-    add     r6, r6, #128
-    mov     r6, r6, lsr #8
-    strb    r6, [src, r1]
-
-    add     r8, r8, #128
-    mov     r8, r8, lsr #8
-    strb    r8, [src, r1, lsl #1]
-
-    add     src, src, #1
-
-    bne     vl23_loop
-
-    ldmia   sp!, {r4 - r8, pc}
-    @   @|vertical_band_2_3_scale_armv4|
-
-@/****************************************************************************
-@ *
-@ *  ROUTINE       : vp8cx_horizontal_line_3_5_scale_c
-@ *
-@ *  INPUTS        : const unsigned char *source : Pointer to source data.
-@ *                  unsigned int source_width    : Stride of source.
-@ *                  unsigned char *dest         : Pointer to destination data.
-@ *                  unsigned int dest_width      : Stride of destination (NOT USED).
-@ *
-@ *  OUTPUTS       : None.
-@ *
-@ *  RETU.req_s       : void
-@ *
-@ *  FUNCTION      : Copies horizontal line of pixels from source to
-@ *                  destination scaling up by 3 to 5.
-@ *
-@ *  SPECIAL NOTES : None.
-@ *
-@ *
-@ ****************************************************************************/
-@void vp8cx_horizontal_line_3_5_scale_c
-@(
-@   const unsigned char *source,
-@   unsigned int source_width,
-@   unsigned char *dest,
-@   unsigned int dest_width
-@)
-_HorizontalLine_3_5_Scale_ARMv4:
-    horizontal_line_3_5_scale_armv4: @
-    stmdb   sp!, {r4 - r11, lr}
-
-    ldr     c51_205, =0x3300cd
-    ldr     c102_154, =0x66009a
-
-    ldrb    r4, [src], #1           @ a = src[0]
-
-hl35_loop:
-
-    ldrb    r8, [src], #1           @ b = src[1]
-    strb    r4, [dest], #1
-
-    orr     r6, r4, r8, lsl #16     @ b | a
-    ldrb    r9, [src], #1           @ c = src[2]
-    mul     r6, c102_154, r6        @ a * 102 + 154 * b
-
-    orr     r5, r9, r8, lsl #16     @ b | c
-    mul     r5, c51_205, r5         @ b * 205 + 51 * c
-    add     r6, r6, #0x8000
-    ldrb    r4, [src], #1           @ d = src[3]
-    mov     r6, r6, lsr #24
-    strb    r6, [dest], #1
-
-    orr     r7, r8, r9, lsl #16     @ c | b
-    mul     r7, c51_205, r7         @ c * 205 + 154 * b
-    add     r5, r5, #0x8000
-    mov     r5, r5, lsr #24
-    strb    r5, [dest], #1
-
-    orr     r9, r4, r9, lsl #16     @ c | d
-    mul     r9, c102_154, r9        @ c * 154 + 102 * d
-    add     r7, r7, #0x8000
-    mov     r7, r7, lsr #24
-    strb    r7, [dest], #1
-
-    add     r9, r9, #0x8000
-    subs    srcw, srcw, #3
-    mov     r9, r9, lsr #24
-    strb    r9, [dest], #1
-
-    bpl     hl35_loop
-
-    ldrb    r5, [src], #1           @ b = src[1]
-    strb    r4, [dest], #1
-
-    orr     r6, r4, r8, lsl #16     @ b | a
-    ldrb    r9, [src], #1           @ c = src[2]
-    mul     r6, c102_154, r6        @ a * 102 + 154 * b
-
-    orr     r5, r9, r8, lsl #16     @ b | c
-    mul     r5, c51_205, r5         @ b * 205 + 51 * c
-    add     r6, r6, #0x8000
-    mov     r6, r6, lsr #24
-    strb    r6, [dest], #1
-
-    orr     r7, r8, r9, lsl #16     @ c | b
-    mul     r7, c51_205, r7         @ c * 205 + 154 * b
-    add     r5, r5, #0x8000
-    mov     r5, r5, lsr #24
-    strb    r5, [dest], #1
-
-    add     r7, r7, #0x8000
-    mov     r7, r7, lsr #24
-    strb    r7, [dest], #1
-    strb    r9, [dest], #1
-
-    ldmia   sp!, {r4 - r11, pc}
-    @   @|vp8cx_horizontal_line_3_5_scale_c|
-
-
-@/****************************************************************************
-@ *
-@ *  ROUTINE       : vp8cx_vertical_band_3_5_scale_c
-@ *
-@ *  INPUTS        : unsigned char *dest    : Pointer to destination data.
-@ *                  unsigned int dest_pitch : Stride of destination data.
-@ *                  unsigned int dest_width : Width of destination data.
-@ *
-@ *  OUTPUTS       : None.
-@ *
-@ *  RETU.req_s       : void
-@ *
-@ *  FUNCTION      : Scales vertical band of pixels by scale 3 to 5. The
-@ *                  height of the band scaled is 3-pixels.
-@ *
-@ *  SPECIAL NOTES : The routine uses the first line of the band below
-@ *                  the current band.
-@ *
-@ ****************************************************************************/
-@void vertical_band_4_5_scale_armv4
-@(
-@   r0 = UINT8 *dest
-@   r1 = UINT32 dest_pitch
-@   r2 = UINT32 dest_width
-@)
-_VerticalBand_3_5_Scale_ARMv4:
-    vertical_band_3_5_scale_armv4: @
-    stmdb   sp!, {r4 - r11, lr}
-
-    ldr     c51_205, =0x3300cd
-    ldr     c102_154, =0x66009a
-
-vl35_loop:
-    mov     r3, src
-    ldrb    r4, [r3], r1            @ a = des [0]
-    ldrb    r5, [r3], r1            @ b = des [dest_pitch]
-    ldrb    r7, [r3], r1            @ c = des[dest_pitch*2]
-    add     lr, src, r1
-
-    orr     r8, r4, r5, lsl #16     @ b | a
-    mul     r6, c102_154, r8        @ a * 102 + 154 * b
-
-    ldrb    r8, [r3, r1, lsl #1]    @ d = des[dest_pitch*5]
-    orr     r3, r7, r5, lsl #16     @ b | c
-    mul     r9, c51_205, r3         @ b * 205 + 51 * c
-    add     r6, r6, #0x8000
-    orr     r3, r5, r7, lsl #16     @ c | b
-    mov     r6, r6, lsr #24
-    strb    r6, [lr], r1
-
-    mul     r5, c51_205, r3         @ c * 205 + 154 * b
-    add     r9, r9, #0x8000
-    orr     r3, r8, r7, lsl #16     @ c | d
-    mov     r9, r9, lsr #24
-    strb    r9, [lr], r1
-
-    mul     r7, c102_154, r3        @ c * 154 + 102 * d
-    add     r5, r5, #0x8000
-    add     src, src, #1
-    mov     r5, r5, lsr #24
-    strb    r5, [lr], r1
-
-    add     r7, r7, #0x8000
-    subs    r2, r2, #1
-    mov     r7, r7, lsr #24
-    strb    r7, [lr], r1
-
-
-    bne     vl35_loop
-
-    ldmia   sp!, {r4 - r11, pc}
-    @   @|vertical_band_3_5_scale_armv4|
-
-@/****************************************************************************
-@ *
-@ *  ROUTINE       : horizontal_line_3_4_scale_armv4
-@ *
-@ *  INPUTS        : const unsigned char *source : Pointer to source data.
-@ *                  unsigned int source_width    : Stride of source.
-@ *                  unsigned char *dest         : Pointer to destination data.
-@ *                  unsigned int dest_width      : Stride of destination (NOT USED).
-@ *
-@ *  OUTPUTS       : None.
-@ *
-@ *  RETU.req_s       : void
-@ *
-@ *  FUNCTION      : Copies horizontal line of pixels from source to
-@ *                  destination scaling up by 3 to 4.
-@ *
-@ *  SPECIAL NOTES : None.
-@ *
-@ *
-@ ****************************************************************************/
-@void horizontal_line_3_4_scale_armv4
-@(
-@   const unsigned char *source,
-@   unsigned int source_width,
-@   unsigned char *dest,
-@   unsigned int dest_width
-@)
-_HorizontalLine_3_4_Scale_ARMv4:
-    horizontal_line_3_4_scale_armv4: @
-    stmdb   sp!, {r4 - r11, lr}
-
-    ldr     r10, =64
-    ldr     r11, =192
-    mov     r9, #128
-
-    ldrb    r4, [src], #1           @ a = src[0]
-
-hl34_loop:
-
-    ldrb    r8, [src], #1           @ b = src[1]
-    ldrb    r7, [src], #1           @ c = src[2]
-    strb    r4, [dest], #1
-
-    mla     r4, r10, r4, r9         @ a*64 + 128
-    mla     r4, r11, r8, r4         @ a*64 + b*192 + 1
-
-    add     r8, r8, #1              @ b + 1
-    add     r8, r8, r7              @ b + c + 1
-    mov     r8, r8, asr #1          @ (b + c + 1) >> 1
-
-    mov     r4, r4, asr #8          @ (a*64 + b*192 + 1) >> 8
-    strb    r4, [dest], #1
-
-    strb    r8, [dest], #1
-
-    ldrb    r4, [src], #1           @ [a+1]
-
-    mla     r7, r11, r7, r9         @ c*192 + 128
-    mla     r7, r4, r10, r7         @ a*64 + b*192 + 128
-
-    subs    srcw, srcw, #3
-
-    mov     r7, r7, asr #8          @ (a*64 + b*192 + 128) >> 8
-    strb    r7, [dest], #1
-
-    bpl     hl34_loop
-
-    ldrb    r8, [src], #1           @ b = src[1]
-    ldrb    r7, [src], #1           @ c = src[2]
-    strb    r4, [dest], #1
-
-    mla     r4, r10, r4, r9         @ a*64 + 128
-    mla     r4, r11, r8, r4         @ a*64 + b*192 + 1
-    mov     r4, r4, asr #8          @ (a*64 + b*192 + 1) >> 8
-    strb    r4, [dest], #1
-
-    add     r8, r8, #1              @ b + 1
-    add     r8, r8, r7              @ b + c + 1
-    mov     r8, r8, asr #1          @ (b + c + 1) >> 1
-    strb    r8, [dest], #1
-    strb    r7, [dest], #1
-
-    ldmia   sp!, {r4 - r11, pc}
-    @   @|vp8cx_horizontal_line_3_4_scale_c|
-
-
-@/****************************************************************************
-@ *
-@ *  ROUTINE       : vertical_band_3_4_scale_armv4
-@ *
-@ *  INPUTS        : unsigned char *dest    : Pointer to destination data.
-@ *                  unsigned int dest_pitch : Stride of destination data.
-@ *                  unsigned int dest_width : Width of destination data.
-@ *
-@ *  OUTPUTS       : None.
-@ *
-@ *  RETU.req_s       : void
-@ *
-@ *  FUNCTION      : Scales vertical band of pixels by scale 3 to 4. The
-@ *                  height of the band scaled is 3-pixels.
-@ *
-@ *  SPECIAL NOTES : The routine uses the first line of the band below
-@ *                  the current band.
-@ *
-@ ****************************************************************************/
-@void vertical_band_3_4_scale_armv4
-@(
-@   r0 = UINT8 *dest
-@   r1 = UINT32 dest_pitch
-@   r2 = UINT32 dest_width
-@)
-_VerticalBand_3_4_Scale_ARMv4:
-    vertical_band_3_4_scale_armv4: @
-    stmdb   sp!, {r4 - r11, lr}
-
-    ldr     r10, =64
-    ldr     r11, =192
-    mov     r9, #128
-
-@   ldr     r1,[r1]
-vl34_loop:
-    mov     r3, src
-    ldrb    r4, [r3], r1            @ a = des [0]
-    ldrb    r5, [r3], r1            @ b = des [dest_pitch]
-    ldrb    r7, [r3], r1            @ c = des [dest_pitch*2]
-    add     lr, src, r1
-
-    mla     r4, r10, r4, r9         @ a*64 + 128
-    mla     r4, r11, r5, r4         @ a*64 + b*192 + 1
-
-    add     r5, r5, #1              @ b + 1
-    add     r5, r5, r7              @ b + c + 1
-    mov     r5, r5, asr #1          @ (b + c + 1) >> 1
-
-    mov     r4, r4, asr #8          @ (a*64 + b*192 + 1) >> 8
-    strb    r4, [lr], r1
-
-    ldrb    r4, [r3, r1]            @ a = des [dest_pitch*4]
-
-    strb    r5, [lr], r1
-
-    mla     r7, r11, r7, r9         @ c*192 + 128
-    mla     r7, r4, r10, r7         @ a*64 + b*192 + 128
-    mov     r7, r7, asr #8          @ (a*64 + b*192 + 128) >> 8
-
-    add     src, src, #1
-    subs    r2, r2, #1
-
-    strb    r7, [lr]
-
-    bne     vl34_loop
-
-    ldmia   sp!, {r4 - r11, pc}
-    @   @|vertical_band_3_4_scale_armv4|
-
-@/****************************************************************************
-@ *
-@ *  ROUTINE       : vp8cx_horizontal_line_1_2_scale_c
-@ *
-@ *  INPUTS        : const unsigned char *source : Pointer to source data.
-@ *                  unsigned int source_width    : Stride of source.
-@ *                  unsigned char *dest         : Pointer to destination data.
-@ *                  unsigned int dest_width      : Stride of destination (NOT USED).
-@ *
-@ *  OUTPUTS       : None.
-@ *
-@ *  RETU.req_s       : void
-@ *
-@ *  FUNCTION      : Copies horizontal line of pixels from source to
-@ *                  destination scaling up by 1 to 2.
-@ *
-@ *  SPECIAL NOTES : None.
-@ *
-@ ****************************************************************************/
-@void vp8cx_horizontal_line_1_2_scale_c
-@(
-@   const unsigned char *source,
-@   unsigned int source_width,
-@   unsigned char *dest,
-@   unsigned int dest_width
-@)
-_HorizontalLine_1_2_Scale_ARMv4:
-    horizontal_line_1_2_scale_armv4: @
-    stmdb   sp!, {r4 - r5, lr}
-
-    sub     srcw, srcw, #1
-
-    ldrb    r3, [src], #1
-    ldrb    r4, [src], #1
-hl12_loop:
-    subs    srcw, srcw, #1
-
-    add     r5, r3, r4
-    add     r5, r5, #1
-    mov     r5, r5, lsr #1
-
-    orr     r5, r3, r5, lsl #8
-    strh    r5, [dest], #2
-
-    mov     r3, r4
-
-    ldrneb  r4, [src], #1
-    bne     hl12_loop
-
-    orr     r5, r4, r4, lsl #8
-    strh    r5, [dest]
-
-    ldmia   sp!, {r4 - r5, pc}
-    @   @|vertical_band_3_5_scale_armv4|
-
-@/****************************************************************************
-@ *
-@ *  ROUTINE       : vp8cx_vertical_band_1_2_scale_c
-@ *
-@ *  INPUTS        : unsigned char *dest    : Pointer to destination data.
-@ *                  unsigned int dest_pitch : Stride of destination data.
-@ *                  unsigned int dest_width : Width of destination data.
-@ *
-@ *  OUTPUTS       : None.
-@ *
-@ *  RETU.req_s       : void
-@ *
-@ *  FUNCTION      : Scales vertical band of pixels by scale 1 to 2. The
-@ *                  height of the band scaled is 1-pixel.
-@ *
-@ *  SPECIAL NOTES : The routine uses the first line of the band below
-@ *                  the current band.
-@ *
-@ ****************************************************************************/
-@void vp8cx_vertical_band_1_2_scale_c
-@(
-@   r0 = UINT8 *dest
-@   r1 = UINT32 dest_pitch
-@   r2 = UINT32 dest_width
-@)
-_VerticalBand_1_2_Scale_ARMv4:
-    vertical_band_1_2_scale_armv4: @
-    stmdb   sp!, {r4 - r7, lr}
-
-    ldr     mask, =0xff00ff             @ mask for selection
-    ldr     lr, = 0x010001
-
-vl12_loop:
-    mov     r3, src
-    ldr     r4, [r3], r1
-    ldr     r5, [r3, r1]
-
-    add     src, src, #4
-    subs    r2, r2, #4
-
-    and     r6, r4, mask
-    and     r7, r5, mask
-
-    add     r6, r7, r6
-    add     r6, r6, lr
-
-    and     r4, mask, r4, lsr #8
-    and     r5, mask, r5, lsr #8
-
-    mov     r6, r6, lsr #1
-    and     r6, r6, mask
-
-    add     r4, r5, r4
-    add     r4, r4, lr
-
-    mov     r4, r4, lsr #1
-    and     r4, r4, mask
-
-    orr     r5, r6, r4, lsl #8
-
-    str     r5, [r3]
-
-    bpl     vl12_loop
-
-    ldmia   sp!, {r4 - r7, pc}
-    @   @|vertical_band_3_5_scale_armv4|

diff --git a/vpx_scale/symbian/scalesystemdependant.c b/vpx_scale/symbian/scalesystemdependant.c
deleted file mode 100644
index b0bffdd..0000000
--- a/vpx_scale/symbian/scalesystemdependant.c
+++ /dev/null

@@ -1,58 +0,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "vpx_scale/vpxscale.h"
-
-/****************************************************************************
- *
- *  ROUTINE       : vp8_scale_machine_specific_config
- *
- *  INPUTS        : UINT32 Version : Codec version number.
- *
- *  OUTPUTS       : None.
- *
- *  RETURNS       : void
- *
- *  FUNCTION      : Checks for machine specifc features such as MMX support
- *                  sets appropriate flags and function pointers.
- *
- *  SPECIAL NOTES : None.
- *
- ****************************************************************************/
-void vp8_scale_machine_specific_config()
-{
-#ifndef VPX_NO_GLOBALS
-    vp8_horizontal_line_1_2_scale        = horizontal_line_1_2_scale_armv4;
-    vp8_vertical_band_1_2_scale          = vertical_band_1_2_scale_armv4;
-    vp8_last_vertical_band_1_2_scale      = vp8cx_last_vertical_band_1_2_scale_c;
-    vp8_horizontal_line_3_5_scale        = horizontal_line_3_5_scale_armv4;
-    vp8_vertical_band_3_5_scale          = vertical_band_3_5_scale_armv4;
-    vp8_last_vertical_band_3_5_scale      = vp8cx_last_vertical_band_3_5_scale_c;
-    vp8_horizontal_line_3_4_scale        = horizontal_line_3_4_scale_armv4;
-    vp8_vertical_band_3_4_scale          = vertical_band_3_4_scale_armv4;
-    vp8_last_vertical_band_3_4_scale      = vp8cx_last_vertical_band_3_4_scale_c;
-    vp8_horizontal_line_2_3_scale        = horizontal_line_2_3_scale_armv4;
-    vp8_vertical_band_2_3_scale          = vertical_band_2_3_scale_armv4;
-    vp8_last_vertical_band_2_3_scale      = vp8cx_last_vertical_band_2_3_scale_c;
-    vp8_horizontal_line_4_5_scale        = horizontal_line_4_5_scale_armv4;
-    vp8_vertical_band_4_5_scale          = vertical_band_4_5_scale_armv4;
-    vp8_last_vertical_band_4_5_scale      = vp8cx_last_vertical_band_4_5_scale_c;
-
-
-    vp8_vertical_band_5_4_scale           = vp8cx_vertical_band_5_4_scale_c;
-    vp8_vertical_band_5_3_scale           = vp8cx_vertical_band_5_3_scale_c;
-    vp8_vertical_band_2_1_scale           = vp8cx_vertical_band_2_1_scale_c;
-    vp8_vertical_band_2_1_scale_i         = vp8cx_vertical_band_2_1_scale_i_c;
-    vp8_horizontal_line_2_1_scale         = vp8cx_horizontal_line_2_1_scale_c;
-    vp8_horizontal_line_5_3_scale         = vp8cx_horizontal_line_5_3_scale_c;
-    vp8_horizontal_line_5_4_scale         = vp8cx_horizontal_line_5_4_scale_c;
-#endif
-}

diff --git a/vpx_scale/vpx_scale.mk b/vpx_scale/vpx_scale.mk
index f4ab258..edb5419 100644
--- a/vpx_scale/vpx_scale.mk
+++ b/vpx_scale/vpx_scale.mk

@@ -6,13 +6,13 @@
 SCALE_SRCS-yes += generic/vpxscale.c
 SCALE_SRCS-yes += generic/yv12config.c
 SCALE_SRCS-yes += generic/yv12extend.c
-SCALE_SRCS-yes += generic/scalesystemdependant.c
+SCALE_SRCS-yes += generic/scalesystemdependent.c
 SCALE_SRCS-$(CONFIG_SPATIAL_RESAMPLING) += generic/gen_scalers.c
 
 #arm
-SCALE_SRCS-$(HAVE_ARMV7)         += arm/scalesystemdependant.c
+SCALE_SRCS-$(HAVE_ARMV7)         += arm/scalesystemdependent.c
 SCALE_SRCS-$(HAVE_ARMV7)         += arm/yv12extend_arm.c
-SCALE_SRCS_REMOVE-$(HAVE_ARMV7)  += generic/scalesystemdependant.c
+SCALE_SRCS_REMOVE-$(HAVE_ARMV7)  += generic/scalesystemdependent.c
 
 #neon
 SCALE_SRCS-$(HAVE_ARMV7)  += arm/neon/vp8_vpxyv12_copyframe_func_neon$(ASM)

diff --git a/vpx_scale/wce/gen_scalers_armv4.asm b/vpx_scale/wce/gen_scalers_armv4.asm
deleted file mode 100644
index e495184..0000000
--- a/vpx_scale/wce/gen_scalers_armv4.asm
+++ /dev/null

@@ -1,774 +0,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    EXPORT  |horizontal_line_4_5_scale_armv4|
-    EXPORT  |vertical_band_4_5_scale_armv4|
-    EXPORT  |horizontal_line_2_3_scale_armv4|
-    EXPORT  |vertical_band_2_3_scale_armv4|
-    EXPORT  |horizontal_line_3_5_scale_armv4|
-    EXPORT  |vertical_band_3_5_scale_armv4|
-    EXPORT  |horizontal_line_3_4_scale_armv4|
-    EXPORT  |vertical_band_3_4_scale_armv4|
-    EXPORT  |horizontal_line_1_2_scale_armv4|
-    EXPORT  |vertical_band_1_2_scale_armv4|
-
-    AREA    |.text|, CODE, READONLY  ; name this block of code
-
-src         RN  r0
-srcw        RN  r1
-dest        RN  r2
-mask        RN  r12
-c51_205     RN  r10
-c102_154    RN  r11
-;/****************************************************************************
-; *
-; *  ROUTINE       : horizontal_line_4_5_scale_armv4
-; *
-; *  INPUTS        : const unsigned char *source : Pointer to source data.
-; *                  unsigned int source_width    : Stride of source.
-; *                  unsigned char *dest         : Pointer to destination data.
-; *                  unsigned int dest_width      : Stride of destination (NOT USED).
-; *
-; *  OUTPUTS       : None.
-; *
-; *  RETURNS       : void
-; *
-; *  FUNCTION      : Copies horizontal line of pixels from source to
-; *                  destination scaling up by 4 to 5.
-; *
-; *  SPECIAL NOTES : None.
-; *
-; ****************************************************************************/
-;void horizontal_line_4_5_scale_armv4
-;(
-;   r0 = UINT8 *source
-;   r1 = UINT32 source_width
-;   r2 = UINT8 *dest
-;   r3 = UINT32 dest_width
-;)
-|horizontal_line_4_5_scale_armv4| PROC
-    stmdb   sp!, {r4 - r11, lr}
-
-    mov     mask, #255              ; mask for selection
-    ldr     c51_205, =0x3300cd
-    ldr     c102_154, =0x66009a
-
-    ldr     r3, [src], #4
-
-hl45_loop
-
-    and     r4, r3, mask            ; a = src[0]
-    and     r5, mask, r3, lsr #8    ; b = src[1]
-    strb    r4, [dest], #1
-
-    orr     r6, r4, r5, lsl #16     ; b | a
-    and     r7, mask, r3, lsr #16   ; c = src[2]
-    mul     r6, c51_205, r6         ; a * 51 + 205 * b
-
-    orr     r5, r5, r7, lsl #16     ; c | b
-    mul     r5, c102_154, r5        ; b * 102 + 154 * c
-    add     r6, r6, #0x8000
-    and     r8, mask, r3, lsr #24   ; d = src[3]
-    mov     r6, r6, lsr #24
-    strb    r6, [dest], #1
-
-    orr     r7, r8, r7, lsl #16     ; c | d
-    mul     r7, c102_154, r7        ; c * 154 + 102 * d
-    add     r5, r5, #0x8000
-    ldr     r3, [src], #4
-    mov     r5, r5, lsr #24
-    strb    r5, [dest], #1
-
-    add     r7, r7, #0x8000
-    and     r9, mask, r3            ; e = src[4]
-    orr     r9, r9, r8, lsl #16     ; d | e
-    mul     r9, c51_205, r9         ; d * 205 + 51 * e
-    mov     r7, r7, lsr #24
-    strb    r7, [dest], #1
-
-    add     r9, r9, #0x8000
-    subs    srcw, srcw, #4
-    mov     r9, r9, lsr #24
-    strb    r9, [dest], #1
-
-    bne     hl45_loop
-
-    and     r4, r3, mask
-    and     r5, mask, r3, lsl #8
-    strb    r4, [dest], #1
-
-    orr     r6, r4, r5, lsl #16     ; b | a
-    mul     r6, c51_205, r6
-
-    and     r7, mask, r3, lsl #16
-    orr     r5, r5, r7, lsl #16     ; c | b
-    mul     r5, c102_154, r5
-    add     r6, r6, #0x8000
-    and     r8, mask, r3, lsl #24
-    mov     r6, r6, lsr #24
-    strb    r6, [dest], #1
-
-    orr     r7, r8, r7, lsl #16     ; c | d
-    mul     r7, c102_154, r7
-    add     r5, r5, #0x8000
-    mov     r5, r5, lsr #24
-    strb    r5, [dest], #1
-
-    add     r7, r7, #0x8000
-    mov     r7, r7, lsr #24
-    strb    r7, [dest], #1
-
-    ldrb    r3, [src]
-    strb    r3, [dest], #1
-
-    ldmia   sp!, {r4 - r11, pc}
-    ENDP    ;|vp8cx_horizontal_line_4_5_scale_c|
-
-;/****************************************************************************
-; *
-; *  ROUTINE       : vertical_band_4_5_scale_armv4
-; *
-; *  INPUTS        : unsigned char *dest    : Pointer to destination data.
-; *                  unsigned int dest_pitch : Stride of destination data.
-; *                  unsigned int dest_width : Width of destination data.
-; *
-; *  OUTPUTS       : None.
-; *
-; *  RETURNS       : void
-; *
-; *  FUNCTION      : Scales vertical band of pixels by scale 4 to 5. The
-; *                  height of the band scaled is 4-pixels.
-; *
-; *  SPECIAL NOTES : The routine uses the first line of the band below
-; *                  the current band.
-; *
-; ****************************************************************************/
-;void vertical_band_4_5_scale_armv4
-;(
-;   r0 = UINT8 *dest
-;   r1 = UINT32 dest_pitch
-;   r2 = UINT32 dest_width
-;)
-|vertical_band_4_5_scale_armv4| PROC
-    stmdb   sp!, {r4 - r11, lr}
-
-    ldr     c51_205, =0x3300cd
-    ldr     c102_154, =0x66009a
-
-vl45_loop
-    mov     r3, src
-    ldrb    r4, [r3], r1            ; a = des [0]
-    ldrb    r5, [r3], r1            ; b = des [dest_pitch]
-    ldrb    r7, [r3], r1            ; c = des[dest_pitch*2]
-    add     lr, src, r1
-
-    orr     r6, r4, r5, lsl #16     ; b | a
-    mul     r6, c51_205, r6         ; a * 51 + 205 * b
-
-    ldrb    r8, [r3], r1            ; d = des[dest_pitch*3]
-    orr     r5, r5, r7, lsl #16     ; c | b
-    mul     r5, c102_154, r5        ; b * 102 + 154 * c
-    add     r6, r6, #0x8000
-    orr     r7, r8, r7, lsl #16     ; c | d
-    mov     r6, r6, lsr #24
-    strb    r6, [lr], r1
-
-    ldrb    r9, [r3, r1]            ; e = des [dest_pitch * 5]
-    mul     r7, c102_154, r7        ; c * 154 + 102 * d
-    add     r5, r5, #0x8000
-    orr     r9, r9, r8, lsl #16     ; d | e
-    mov     r5, r5, lsr #24
-    strb    r5, [lr], r1
-
-    mul     r9, c51_205, r9         ; d * 205 + 51 * e
-    add     r7, r7, #0x8000
-    add     src, src, #1
-    mov     r7, r7, lsr #24
-    strb    r7, [lr], r1
-
-    add     r9, r9, #0x8000
-    subs    r2, r2, #1
-    mov     r9, r9, lsr #24
-    strb    r9, [lr], r1
-
-    bne     vl45_loop
-
-    ldmia   sp!, {r4 - r11, pc}
-    ENDP    ;|vertical_band_4_5_scale_armv4|
-
-;/****************************************************************************
-; *
-; *  ROUTINE       : horizontal_line_2_3_scale_armv4
-; *
-; *  INPUTS        : const unsigned char *source : Pointer to source data.
-; *                  unsigned int source_width    : Stride of source.
-; *                  unsigned char *dest         : Pointer to destination data.
-; *                  unsigned int dest_width      : Stride of destination (NOT USED).
-; *
-; *  OUTPUTS       : None.
-; *
-; *  RETURNS       : void
-; *
-; *  FUNCTION      : Copies horizontal line of pixels from source to
-; *                  destination scaling up by 2 to 3.
-; *
-; *  SPECIAL NOTES : None.
-; *
-; *
-; ****************************************************************************/
-;void horizontal_line_2_3_scale_armv4
-;(
-;   const unsigned char *source,
-;   unsigned int source_width,
-;   unsigned char *dest,
-;   unsigned int dest_width
-;)
-|horizontal_line_2_3_scale_armv4| PROC
-    stmdb   sp!, {r4 - r11, lr}
-    ldr     lr,  =85
-    ldr     r12, =171
-
-hl23_loop
-
-    ldrb    r3, [src], #1           ; a
-    ldrb    r4, [src], #1           ; b
-    ldrb    r5, [src]               ; c
-
-    strb    r3, [dest], #1
-    mul     r4, r12, r4             ; b * 171
-    mla     r6, lr, r3, r4          ; a * 85
-    mla     r7, lr, r5, r4          ; c * 85
-
-    add     r6, r6, #128
-    mov     r6, r6, lsr #8
-    strb    r6, [dest], #1
-
-    add     r7, r7, #128
-    mov     r7, r7, lsr #8
-    strb    r7, [dest], #1
-
-    subs    srcw, srcw, #2
-    bne     hl23_loop
-
-    ldrb    r4, [src, #1]           ; b
-    strb    r5, [dest], #1
-    strb    r4, [dest, #1]
-
-    mul     r4, r12, r4             ; b * 171
-    mla     r6, lr, r5, r4          ; a * 85 + b *171
-
-    add     r6, r6, #128
-    mov     r6, r6, lsr #8
-    strb    r6, [dest]
-
-    ldmia   sp!, {r4 - r11, pc}
-    ENDP    ;|horizontal_line_2_3_scale_armv4|
-
-;/****************************************************************************
-; *
-; *  ROUTINE       : vertical_band_2_3_scale_armv4
-; *
-; *  INPUTS        : unsigned char *dest    : Pointer to destination data.
-; *                  unsigned int dest_pitch : Stride of destination data.
-; *                  unsigned int dest_width : Width of destination data.
-; *
-; *  OUTPUTS       : None.
-; *
-; *  RETURNS       : void
-; *
-; *  FUNCTION      : Scales vertical band of pixels by scale 2 to 3. The
-; *                  height of the band scaled is 2-pixels.
-; *
-; *  SPECIAL NOTES : The routine uses the first line of the band below
-; *                  the current band.
-; *
-; ****************************************************************************/
-;void vertical_band_2_3_scale_armv4
-;(
-;   r0 = UINT8 *dest
-;   r1 = UINT32 dest_pitch
-;   r2 = UINT32 dest_width
-;)
-|vertical_band_2_3_scale_armv4| PROC
-    stmdb   sp!, {r4 - r8, lr}
-    ldr     lr,  =85
-    ldr     r12, =171
-    add     r3, r1, r1, lsl #1      ; 3 * dest_pitch
-
-vl23_loop
-    ldrb    r4, [src]               ; a = des [0]
-    ldrb    r5, [src, r1]           ; b = des [dest_pitch]
-    ldrb    r7, [src, r3]           ; c = des [dest_pitch*3]
-    subs    r2, r2, #1
-
-    mul     r5, r12, r5             ; b * 171
-    mla     r6, lr, r4, r5          ; a * 85
-    mla     r8, lr, r7, r5          ; c * 85
-
-    add     r6, r6, #128
-    mov     r6, r6, lsr #8
-    strb    r6, [src, r1]
-
-    add     r8, r8, #128
-    mov     r8, r8, lsr #8
-    strb    r8, [src, r1, lsl #1]
-
-    add     src, src, #1
-
-    bne     vl23_loop
-
-    ldmia   sp!, {r4 - r8, pc}
-    ENDP    ;|vertical_band_2_3_scale_armv4|
-
-;/****************************************************************************
-; *
-; *  ROUTINE       : vp8cx_horizontal_line_3_5_scale_c
-; *
-; *  INPUTS        : const unsigned char *source : Pointer to source data.
-; *                  unsigned int source_width    : Stride of source.
-; *                  unsigned char *dest         : Pointer to destination data.
-; *                  unsigned int dest_width      : Stride of destination (NOT USED).
-; *
-; *  OUTPUTS       : None.
-; *
-; *  RETURNS       : void
-; *
-; *  FUNCTION      : Copies horizontal line of pixels from source to
-; *                  destination scaling up by 3 to 5.
-; *
-; *  SPECIAL NOTES : None.
-; *
-; *
-; ****************************************************************************/
-;void vp8cx_horizontal_line_3_5_scale_c
-;(
-;   const unsigned char *source,
-;   unsigned int source_width,
-;   unsigned char *dest,
-;   unsigned int dest_width
-;)
-|horizontal_line_3_5_scale_armv4| PROC
-    stmdb   sp!, {r4 - r11, lr}
-
-    ldr     c51_205, =0x3300cd
-    ldr     c102_154, =0x66009a
-
-    ldrb    r4, [src], #1           ; a = src[0]
-
-hl35_loop
-
-    ldrb    r8, [src], #1           ; b = src[1]
-    strb    r4, [dest], #1
-
-    orr     r6, r4, r8, lsl #16     ; b | a
-    ldrb    r9, [src], #1           ; c = src[2]
-    mul     r6, c102_154, r6        ; a * 102 + 154 * b
-
-    orr     r5, r9, r8, lsl #16     ; b | c
-    mul     r5, c51_205, r5         ; b * 205 + 51 * c
-    add     r6, r6, #0x8000
-    ldrb    r4, [src], #1           ; d = src[3]
-    mov     r6, r6, lsr #24
-    strb    r6, [dest], #1
-
-    orr     r7, r8, r9, lsl #16     ; c | b
-    mul     r7, c51_205, r7         ; c * 205 + 154 * b
-    add     r5, r5, #0x8000
-    mov     r5, r5, lsr #24
-    strb    r5, [dest], #1
-
-    orr     r9, r4, r9, lsl #16     ; c | d
-    mul     r9, c102_154, r9        ; c * 154 + 102 * d
-    add     r7, r7, #0x8000
-    mov     r7, r7, lsr #24
-    strb    r7, [dest], #1
-
-    add     r9, r9, #0x8000
-    subs    srcw, srcw, #3
-    mov     r9, r9, lsr #24
-    strb    r9, [dest], #1
-
-    bpl     hl35_loop
-
-    ldrb    r5, [src], #1           ; b = src[1]
-    strb    r4, [dest], #1
-
-    orr     r6, r4, r8, lsl #16     ; b | a
-    ldrb    r9, [src], #1           ; c = src[2]
-    mul     r6, c102_154, r6        ; a * 102 + 154 * b
-
-    orr     r5, r9, r8, lsl #16     ; b | c
-    mul     r5, c51_205, r5         ; b * 205 + 51 * c
-    add     r6, r6, #0x8000
-    mov     r6, r6, lsr #24
-    strb    r6, [dest], #1
-
-    orr     r7, r8, r9, lsl #16     ; c | b
-    mul     r7, c51_205, r7         ; c * 205 + 154 * b
-    add     r5, r5, #0x8000
-    mov     r5, r5, lsr #24
-    strb    r5, [dest], #1
-
-    add     r7, r7, #0x8000
-    mov     r7, r7, lsr #24
-    strb    r7, [dest], #1
-    strb    r9, [dest], #1
-
-    ldmia   sp!, {r4 - r11, pc}
-    ENDP    ;|vp8cx_horizontal_line_3_5_scale_c|
-
-
-;/****************************************************************************
-; *
-; *  ROUTINE       : vp8cx_vertical_band_3_5_scale_c
-; *
-; *  INPUTS        : unsigned char *dest    : Pointer to destination data.
-; *                  unsigned int dest_pitch : Stride of destination data.
-; *                  unsigned int dest_width : Width of destination data.
-; *
-; *  OUTPUTS       : None.
-; *
-; *  RETURNS       : void
-; *
-; *  FUNCTION      : Scales vertical band of pixels by scale 3 to 5. The
-; *                  height of the band scaled is 3-pixels.
-; *
-; *  SPECIAL NOTES : The routine uses the first line of the band below
-; *                  the current band.
-; *
-; ****************************************************************************/
-;void vertical_band_4_5_scale_armv4
-;(
-;   r0 = UINT8 *dest
-;   r1 = UINT32 dest_pitch
-;   r2 = UINT32 dest_width
-;)
-|vertical_band_3_5_scale_armv4| PROC
-    stmdb   sp!, {r4 - r11, lr}
-
-    ldr     c51_205, =0x3300cd
-    ldr     c102_154, =0x66009a
-
-vl35_loop
-    mov     r3, src
-    ldrb    r4, [r3], r1            ; a = des [0]
-    ldrb    r5, [r3], r1            ; b = des [dest_pitch]
-    ldrb    r7, [r3], r1            ; c = des[dest_pitch*2]
-    add     lr, src, r1
-
-    orr     r8, r4, r5, lsl #16     ; b | a
-    mul     r6, c102_154, r8        ; a * 102 + 154 * b
-
-    ldrb    r8, [r3, r1, lsl #1]    ; d = des[dest_pitch*5]
-    orr     r3, r7, r5, lsl #16     ; b | c
-    mul     r9, c51_205, r3         ; b * 205 + 51 * c
-    add     r6, r6, #0x8000
-    orr     r3, r5, r7, lsl #16     ; c | b
-    mov     r6, r6, lsr #24
-    strb    r6, [lr], r1
-
-    mul     r5, c51_205, r3         ; c * 205 + 154 * b
-    add     r9, r9, #0x8000
-    orr     r3, r8, r7, lsl #16     ; c | d
-    mov     r9, r9, lsr #24
-    strb    r9, [lr], r1
-
-    mul     r7, c102_154, r3        ; c * 154 + 102 * d
-    add     r5, r5, #0x8000
-    add     src, src, #1
-    mov     r5, r5, lsr #24
-    strb    r5, [lr], r1
-
-    add     r7, r7, #0x8000
-    subs    r2, r2, #1
-    mov     r7, r7, lsr #24
-    strb    r7, [lr], r1
-
-
-    bne     vl35_loop
-
-    ldmia   sp!, {r4 - r11, pc}
-    ENDP    ;|vertical_band_3_5_scale_armv4|
-
-;/****************************************************************************
-; *
-; *  ROUTINE       : horizontal_line_3_4_scale_armv4
-; *
-; *  INPUTS        : const unsigned char *source : Pointer to source data.
-; *                  unsigned int source_width    : Stride of source.
-; *                  unsigned char *dest         : Pointer to destination data.
-; *                  unsigned int dest_width      : Stride of destination (NOT USED).
-; *
-; *  OUTPUTS       : None.
-; *
-; *  RETURNS       : void
-; *
-; *  FUNCTION      : Copies horizontal line of pixels from source to
-; *                  destination scaling up by 3 to 4.
-; *
-; *  SPECIAL NOTES : None.
-; *
-; *
-; ****************************************************************************/
-;void horizontal_line_3_4_scale_armv4
-;(
-;   const unsigned char *source,
-;   unsigned int source_width,
-;   unsigned char *dest,
-;   unsigned int dest_width
-;)
-|horizontal_line_3_4_scale_armv4| PROC
-    stmdb   sp!, {r4 - r11, lr}
-
-    ldr     r10, =64
-    ldr     r11, =192
-    mov     r9, #128
-
-    ldrb    r4, [src], #1           ; a = src[0]
-
-hl34_loop
-
-    ldrb    r8, [src], #1           ; b = src[1]
-    ldrb    r7, [src], #1           ; c = src[2]
-    strb    r4, [dest], #1
-
-    mla     r4, r10, r4, r9         ; a*64 + 128
-    mla     r4, r11, r8, r4         ; a*64 + b*192 + 1
-
-    add     r8, r8, #1              ; b + 1
-    add     r8, r8, r7              ; b + c + 1
-    mov     r8, r8, asr #1          ; (b + c + 1) >> 1
-
-    mov     r4, r4, asr #8          ; (a*64 + b*192 + 1) >> 8
-    strb    r4, [dest], #1
-
-    strb    r8, [dest], #1
-
-    ldrb    r4, [src], #1           ; [a+1]
-
-    mla     r7, r11, r7, r9         ; c*192 + 128
-    mla     r7, r4, r10, r7         ; a*64 + b*192 + 128
-
-    subs    srcw, srcw, #3
-
-    mov     r7, r7, asr #8          ; (a*64 + b*192 + 128) >> 8
-    strb    r7, [dest], #1
-
-    bpl     hl34_loop
-
-    ldrb    r8, [src], #1           ; b = src[1]
-    ldrb    r7, [src], #1           ; c = src[2]
-    strb    r4, [dest], #1
-
-    mla     r4, r10, r4, r9         ; a*64 + 128
-    mla     r4, r11, r8, r4         ; a*64 + b*192 + 1
-    mov     r4, r4, asr #8          ; (a*64 + b*192 + 1) >> 8
-    strb    r4, [dest], #1
-
-    add     r8, r8, #1              ; b + 1
-    add     r8, r8, r7              ; b + c + 1
-    mov     r8, r8, asr #1          ; (b + c + 1) >> 1
-    strb    r8, [dest], #1
-    strb    r7, [dest], #1
-
-    ldmia   sp!, {r4 - r11, pc}
-    ENDP    ;|vp8cx_horizontal_line_3_4_scale_c|
-
-
-;/****************************************************************************
-; *
-; *  ROUTINE       : vertical_band_3_4_scale_armv4
-; *
-; *  INPUTS        : unsigned char *dest    : Pointer to destination data.
-; *                  unsigned int dest_pitch : Stride of destination data.
-; *                  unsigned int dest_width : Width of destination data.
-; *
-; *  OUTPUTS       : None.
-; *
-; *  RETURNS       : void
-; *
-; *  FUNCTION      : Scales vertical band of pixels by scale 3 to 4. The
-; *                  height of the band scaled is 3-pixels.
-; *
-; *  SPECIAL NOTES : The routine uses the first line of the band below
-; *                  the current band.
-; *
-; ****************************************************************************/
-;void vertical_band_3_4_scale_armv4
-;(
-;   r0 = UINT8 *dest
-;   r1 = UINT32 dest_pitch
-;   r2 = UINT32 dest_width
-;)
-|vertical_band_3_4_scale_armv4| PROC
-    stmdb   sp!, {r4 - r11, lr}
-
-    ldr     r10, =64
-    ldr     r11, =192
-    mov     r9, #128
-
-;   ldr     r1,[r1]
-vl34_loop
-    mov     r3, src
-    ldrb    r4, [r3], r1            ; a = des [0]
-    ldrb    r5, [r3], r1            ; b = des [dest_pitch]
-    ldrb    r7, [r3], r1            ; c = des [dest_pitch*2]
-    add     lr, src, r1
-
-    mla     r4, r10, r4, r9         ; a*64 + 128
-    mla     r4, r11, r5, r4         ; a*64 + b*192 + 1
-
-    add     r5, r5, #1              ; b + 1
-    add     r5, r5, r7              ; b + c + 1
-    mov     r5, r5, asr #1          ; (b + c + 1) >> 1
-
-    mov     r4, r4, asr #8          ; (a*64 + b*192 + 1) >> 8
-    strb    r4, [lr], r1
-
-    ldrb    r4, [r3, r1]            ; a = des [dest_pitch*4]
-
-    strb    r5, [lr], r1
-
-    mla     r7, r11, r7, r9         ; c*192 + 128
-    mla     r7, r4, r10, r7         ; a*64 + b*192 + 128
-    mov     r7, r7, asr #8          ; (a*64 + b*192 + 128) >> 8
-
-    add     src, src, #1
-    subs    r2, r2, #1
-
-    strb    r7, [lr]
-
-    bne     vl34_loop
-
-    ldmia   sp!, {r4 - r11, pc}
-    ENDP    ;|vertical_band_3_4_scale_armv4|
-
-;/****************************************************************************
-; *
-; *  ROUTINE       : vp8cx_horizontal_line_1_2_scale_c
-; *
-; *  INPUTS        : const unsigned char *source : Pointer to source data.
-; *                  unsigned int source_width    : Stride of source.
-; *                  unsigned char *dest         : Pointer to destination data.
-; *                  unsigned int dest_width      : Stride of destination (NOT USED).
-; *
-; *  OUTPUTS       : None.
-; *
-; *  RETURNS       : void
-; *
-; *  FUNCTION      : Copies horizontal line of pixels from source to
-; *                  destination scaling up by 1 to 2.
-; *
-; *  SPECIAL NOTES : None.
-; *
-; ****************************************************************************/
-;void vp8cx_horizontal_line_1_2_scale_c
-;(
-;   const unsigned char *source,
-;   unsigned int source_width,
-;   unsigned char *dest,
-;   unsigned int dest_width
-;)
-|horizontal_line_1_2_scale_armv4| PROC
-    stmdb   sp!, {r4 - r5, lr}
-
-    sub     srcw, srcw, #1
-
-    ldrb    r3, [src], #1
-    ldrb    r4, [src], #1
-hl12_loop
-    subs    srcw, srcw, #1
-
-    add     r5, r3, r4
-    add     r5, r5, #1
-    mov     r5, r5, lsr #1
-
-    orr     r5, r3, r5, lsl #8
-    strh    r5, [dest], #2
-
-    mov     r3, r4
-
-    ldrneb  r4, [src], #1
-    bne     hl12_loop
-
-    orr     r5, r4, r4, lsl #8
-    strh    r5, [dest]
-
-    ldmia   sp!, {r4 - r5, pc}
-    ENDP    ;|vertical_band_3_5_scale_armv4|
-
-;/****************************************************************************
-; *
-; *  ROUTINE       : vp8cx_vertical_band_1_2_scale_c
-; *
-; *  INPUTS        : unsigned char *dest    : Pointer to destination data.
-; *                  unsigned int dest_pitch : Stride of destination data.
-; *                  unsigned int dest_width : Width of destination data.
-; *
-; *  OUTPUTS       : None.
-; *
-; *  RETURNS       : void
-; *
-; *  FUNCTION      : Scales vertical band of pixels by scale 1 to 2. The
-; *                  height of the band scaled is 1-pixel.
-; *
-; *  SPECIAL NOTES : The routine uses the first line of the band below
-; *                  the current band.
-; *
-; ****************************************************************************/
-;void vp8cx_vertical_band_1_2_scale_c
-;(
-;   r0 = UINT8 *dest
-;   r1 = UINT32 dest_pitch
-;   r2 = UINT32 dest_width
-;)
-|vertical_band_1_2_scale_armv4| PROC
-    stmdb   sp!, {r4 - r7, lr}
-
-    ldr     mask, =0xff00ff             ; mask for selection
-    ldr     lr, = 0x010001
-
-vl12_loop
-    mov     r3, src
-    ldr     r4, [r3], r1
-    ldr     r5, [r3, r1]
-
-    add     src, src, #4
-    subs    r2, r2, #4
-
-    and     r6, r4, mask
-    and     r7, r5, mask
-
-    add     r6, r7, r6
-    add     r6, r6, lr
-
-    and     r4, mask, r4, lsr #8
-    and     r5, mask, r5, lsr #8
-
-    mov     r6, r6, lsr #1
-    and     r6, r6, mask
-
-    add     r4, r5, r4
-    add     r4, r4, lr
-
-    mov     r4, r4, lsr #1
-    and     r4, r4, mask
-
-    orr     r5, r6, r4, lsl #8
-
-    str     r5, [r3]
-
-    bpl     vl12_loop
-
-    ldmia   sp!, {r4 - r7, pc}
-    ENDP    ;|vertical_band_3_5_scale_armv4|
-
-    END

diff --git a/vpx_scale/wce/scalesystemdependant.c b/vpx_scale/wce/scalesystemdependant.c
deleted file mode 100644
index e9f2c26..0000000
--- a/vpx_scale/wce/scalesystemdependant.c
+++ /dev/null

@@ -1,60 +0,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "vpx_scale/vpxscale.h"
-
-/****************************************************************************
-*  Imports
-*****************************************************************************/
-
-/****************************************************************************
- *
- *  ROUTINE       : vp8_scale_machine_specific_config
- *
- *  INPUTS        : UINT32 Version : Codec version number.
- *
- *  OUTPUTS       : None.
- *
- *  RETURNS       : void
- *
- *  FUNCTION      : Checks for machine specifc features such as MMX support
- *                  sets appropriate flags and function pointers.
- *
- *  SPECIAL NOTES : None.
- *
- ****************************************************************************/
-void vp8_scale_machine_specific_config()
-{
-    vp8_horizontal_line_1_2_scale        = horizontal_line_1_2_scale_armv4;
-    vp8_vertical_band_1_2_scale          = vertical_band_1_2_scale_armv4;
-    vp8_last_vertical_band_1_2_scale      = vp8cx_last_vertical_band_1_2_scale_c;
-    vp8_horizontal_line_3_5_scale        = horizontal_line_3_5_scale_armv4;
-    vp8_vertical_band_3_5_scale          = vertical_band_3_5_scale_armv4;
-    vp8_last_vertical_band_3_5_scale      = vp8cx_last_vertical_band_3_5_scale_c;
-    vp8_horizontal_line_3_4_scale        = horizontal_line_3_4_scale_armv4;
-    vp8_vertical_band_3_4_scale          = vertical_band_3_4_scale_armv4;
-    vp8_last_vertical_band_3_4_scale      = vp8cx_last_vertical_band_3_4_scale_c;
-    vp8_horizontal_line_2_3_scale        = horizontal_line_2_3_scale_armv4;
-    vp8_vertical_band_2_3_scale          = vertical_band_2_3_scale_armv4;
-    vp8_last_vertical_band_2_3_scale      = vp8cx_last_vertical_band_2_3_scale_c;
-    vp8_horizontal_line_4_5_scale        = horizontal_line_4_5_scale_armv4;
-    vp8_vertical_band_4_5_scale          = vertical_band_4_5_scale_armv4;
-    vp8_last_vertical_band_4_5_scale      = vp8cx_last_vertical_band_4_5_scale_c;
-
-
-    vp8_vertical_band_5_4_scale           = vp8cx_vertical_band_5_4_scale_c;
-    vp8_vertical_band_5_3_scale           = vp8cx_vertical_band_5_3_scale_c;
-    vp8_vertical_band_2_1_scale           = vp8cx_vertical_band_2_1_scale_c;
-    vp8_vertical_band_2_1_scale_i         = vp8cx_vertical_band_2_1_scale_i_c;
-    vp8_horizontal_line_2_1_scale         = vp8cx_horizontal_line_2_1_scale_c;
-    vp8_horizontal_line_5_3_scale         = vp8cx_horizontal_line_5_3_scale_c;
-    vp8_horizontal_line_5_4_scale         = vp8cx_horizontal_line_5_4_scale_c;
-}

diff --git a/vpx_scale/win32/scalesystemdependant.c b/vpx_scale/win32/scalesystemdependant.c
deleted file mode 100644
index eab741f..0000000
--- a/vpx_scale/win32/scalesystemdependant.c
+++ /dev/null

@@ -1,91 +0,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-/****************************************************************************
-*
-*   Module Title :     system_dependant.c
-*
-*   Description  :     Miscellaneous system dependant functions
-*
-****************************************************************************/
-
-/****************************************************************************
-*  Header Files
-****************************************************************************/
-#include "vpx_scale/vpxscale.h"
-#include "cpuidlib.h"
-
-/****************************************************************************
-*  Imports
-*****************************************************************************/
-extern void register_generic_scalers(void);
-extern void register_mmxscalers(void);
-
-/****************************************************************************
- *
- *  ROUTINE       : post_proc_machine_specific_config
- *
- *  INPUTS        : UINT32 Version : Codec version number.
- *
- *  OUTPUTS       : None.
- *
- *  RETURNS       : void
- *
- *  FUNCTION      : Checks for machine specifc features such as MMX support
- *                  sets appropriate flags and function pointers.
- *
- *  SPECIAL NOTES : None.
- *
- ****************************************************************************/
-void
-vp8_scale_machine_specific_config(void)
-{
-    // If MMX supported then set to use MMX versions of functions else
-    // use original 'C' versions.
-    int mmx_enabled;
-    int xmm_enabled;
-    int wmt_enabled;
-
-    vpx_get_processor_flags(&mmx_enabled, &xmm_enabled, &wmt_enabled);
-
-    if (mmx_enabled || xmm_enabled || wmt_enabled)
-    {
-        register_mmxscalers();
-    }
-    else
-    {
-        vp8_horizontal_line_1_2_scale        = vp8cx_horizontal_line_1_2_scale_c;
-        vp8_vertical_band_1_2_scale          = vp8cx_vertical_band_1_2_scale_c;
-        vp8_last_vertical_band_1_2_scale      = vp8cx_last_vertical_band_1_2_scale_c;
-        vp8_horizontal_line_3_5_scale        = vp8cx_horizontal_line_3_5_scale_c;
-        vp8_vertical_band_3_5_scale          = vp8cx_vertical_band_3_5_scale_c;
-        vp8_last_vertical_band_3_5_scale      = vp8cx_last_vertical_band_3_5_scale_c;
-        vp8_horizontal_line_3_4_scale        = vp8cx_horizontal_line_3_4_scale_c;
-        vp8_vertical_band_3_4_scale          = vp8cx_vertical_band_3_4_scale_c;
-        vp8_last_vertical_band_3_4_scale      = vp8cx_last_vertical_band_3_4_scale_c;
-        vp8_horizontal_line_2_3_scale        = vp8cx_horizontal_line_2_3_scale_c;
-        vp8_vertical_band_2_3_scale          = vp8cx_vertical_band_2_3_scale_c;
-        vp8_last_vertical_band_2_3_scale      = vp8cx_last_vertical_band_2_3_scale_c;
-        vp8_horizontal_line_4_5_scale        = vp8cx_horizontal_line_4_5_scale_c;
-        vp8_vertical_band_4_5_scale          = vp8cx_vertical_band_4_5_scale_c;
-        vp8_last_vertical_band_4_5_scale      = vp8cx_last_vertical_band_4_5_scale_c;
-
-
-        vp8_vertical_band_5_4_scale           = vp8cx_vertical_band_5_4_scale_c;
-        vp8_vertical_band_5_3_scale           = vp8cx_vertical_band_5_3_scale_c;
-        vp8_vertical_band_2_1_scale           = vp8cx_vertical_band_2_1_scale_c;
-        vp8_vertical_band_2_1_scale_i         = vp8cx_vertical_band_2_1_scale_i_c;
-        vp8_horizontal_line_2_1_scale         = vp8cx_horizontal_line_2_1_scale_c;
-        vp8_horizontal_line_5_3_scale         = vp8cx_horizontal_line_5_3_scale_c;
-        vp8_horizontal_line_5_4_scale         = vp8cx_horizontal_line_5_4_scale_c;
-
-    }
-}

diff --git a/vpx_scale/intel_linux/scalesystemdependant.c b/vpx_scale/win32/scalesystemdependent.c
similarity index 96%
rename from vpx_scale/intel_linux/scalesystemdependant.c
rename to vpx_scale/win32/scalesystemdependent.c
index eab741f..19e61c3 100644
--- a/vpx_scale/intel_linux/scalesystemdependant.c
+++ b/vpx_scale/win32/scalesystemdependent.c

@@ -11,9 +11,9 @@
 
 /****************************************************************************
 *
-*   Module Title :     system_dependant.c
+*   Module Title :     system_dependent.c
 *
-*   Description  :     Miscellaneous system dependant functions
+*   Description  :     Miscellaneous system dependent functions
 *
 ****************************************************************************/
 

diff --git a/vpx_scale/x86_64/scaleopt.c b/vpx_scale/x86_64/scaleopt.c
deleted file mode 100644
index 101f5ff..0000000
--- a/vpx_scale/x86_64/scaleopt.c
+++ /dev/null

@@ -1,1750 +0,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-/****************************************************************************
-*
-*   Module Title :     scaleopt.cpp
-*
-*   Description  :     Optimized scaling functions
-*
-****************************************************************************/
-#include "pragmas.h"
-
-
-
-/****************************************************************************
-*  Module Statics
-****************************************************************************/
-__declspec(align(16)) const static unsigned short one_fifth[]  = { 51, 51, 51, 51 };
-__declspec(align(16)) const static unsigned short two_fifths[] = { 102, 102, 102, 102 };
-__declspec(align(16)) const static unsigned short three_fifths[] = { 154, 154, 154, 154 };
-__declspec(align(16)) const static unsigned short four_fifths[] = { 205, 205, 205, 205 };
-__declspec(align(16)) const static unsigned short round_values[] = { 128, 128, 128, 128 };
-__declspec(align(16)) const static unsigned short four_ones[] = { 1, 1, 1, 1};
-__declspec(align(16)) const static unsigned short const45_2[] = {205, 154, 102,  51 };
-__declspec(align(16)) const static unsigned short const45_1[] = { 51, 102, 154, 205 };
-__declspec(align(16)) const static unsigned char  mask45[] = { 0, 0, 0, 0, 0, 0, 255, 0};
-__declspec(align(16)) const static unsigned short const35_2[] = { 154,  51, 205, 102 };
-__declspec(align(16)) const static unsigned short const35_1[] = { 102, 205,  51, 154 };
-
-
-
-#include "vpx_scale/vpxscale.h"
-#include "vpx_mem/vpx_mem.h"
-
-/****************************************************************************
-*
-*  ROUTINE       : horizontal_line_3_5_scale_mmx
-*
-*  INPUTS        : const unsigned char *source :
-*                  unsigned int source_width    :
-*                  unsigned char *dest         :
-*                  unsigned int dest_width      :
-*
-*  OUTPUTS       : None.
-*
-*  RETURNS       : void
-*
-*  FUNCTION      : 3 to 5 up-scaling of a horizontal line of pixels.
-*
-*  SPECIAL NOTES : None.
-*
-****************************************************************************/
-static
-void horizontal_line_3_5_scale_mmx
-(
-    const unsigned char *source,
-    unsigned int source_width,
-    unsigned char *dest,
-    unsigned int dest_width
-)
-{
-    (void) dest_width;
-
-    __asm
-    {
-
-        push        rbx
-
-        mov         rsi,    source
-        mov         rdi,    dest
-
-        mov         ecx,    source_width
-        lea         rdx,    [rsi+rcx-3];
-
-        movq        mm5,    const35_1       // mm5 = 66 xx cd xx 33 xx 9a xx
-        movq        mm6,    const35_2       // mm6 = 9a xx 33 xx cd xx 66 xx
-
-        movq        mm4,    round_values     // mm4 = 80 xx 80 xx 80 xx 80 xx
-        pxor        mm7,    mm7             // clear mm7
-
-        horiz_line_3_5_loop:
-
-        mov         eax,    DWORD PTR [rsi] // eax = 00 01 02 03
-        mov         ebx,    eax
-
-        and         ebx,    0xffff00        // ebx = xx 01 02 xx
-        mov         ecx,    eax             // ecx = 00 01 02 03
-
-        and         eax,    0xffff0000      // eax = xx xx 02 03
-        xor         ecx,    eax             // ecx = 00 01 xx xx
-
-        shr         ebx,    8               // ebx = 01 02 xx xx
-        or          eax,    ebx             // eax = 01 02 02 03
-
-        shl         ebx,    16              // ebx = xx xx 01 02
-        movd        mm1,    eax             // mm1 = 01 02 02 03 xx xx xx xx
-
-        or          ebx,    ecx             // ebx = 00 01 01 02
-        punpcklbw   mm1,    mm7             // mm1 = 01 xx 02 xx 02 xx 03 xx
-
-        movd        mm0,    ebx             // mm0 = 00 01 01 02
-        pmullw      mm1,    mm6             //
-
-        punpcklbw   mm0,    mm7             // mm0 = 00 xx 01 xx 01 xx 02 xx
-        pmullw      mm0,    mm5             //
-
-        mov         [rdi],  ebx             // writeoutput 00 xx xx xx
-        add         rsi,    3
-
-        add         rdi,    5
-        paddw       mm0,    mm1
-
-        paddw       mm0,    mm4
-        psrlw       mm0,    8
-
-        cmp         rsi,    rdx
-        packuswb    mm0,    mm7
-
-        movd        DWORD Ptr [rdi-4], mm0
-        jl          horiz_line_3_5_loop
-
-//Exit:
-        mov         eax,    DWORD PTR [rsi] // eax = 00 01 02 03
-        mov         ebx,    eax
-
-        and         ebx,    0xffff00        // ebx = xx 01 02 xx
-        mov         ecx,    eax             // ecx = 00 01 02 03
-
-        and         eax,    0xffff0000      // eax = xx xx 02 03
-        xor         ecx,    eax             // ecx = 00 01 xx xx
-
-        shr         ebx,    8               // ebx = 01 02 xx xx
-        or          eax,    ebx             // eax = 01 02 02 03
-
-        shl         eax,    8               // eax = xx 01 02 02
-        and         eax,    0xffff0000      // eax = xx xx 02 02
-
-        or          eax,    ebx             // eax = 01 02 02 02
-
-        shl         ebx,    16              // ebx = xx xx 01 02
-        movd        mm1,    eax             // mm1 = 01 02 02 02 xx xx xx xx
-
-        or          ebx,    ecx             // ebx = 00 01 01 02
-        punpcklbw   mm1,    mm7             // mm1 = 01 xx 02 xx 02 xx 02 xx
-
-        movd        mm0,    ebx             // mm0 = 00 01 01 02
-        pmullw      mm1,    mm6             //
-
-        punpcklbw   mm0,    mm7             // mm0 = 00 xx 01 xx 01 xx 02 xx
-        pmullw      mm0,    mm5             //
-
-        mov         [rdi],  ebx             // writeoutput 00 xx xx xx
-        paddw       mm0,    mm1
-
-        paddw       mm0,    mm4
-        psrlw       mm0,    8
-
-        packuswb    mm0,    mm7
-        movd        DWORD Ptr [rdi+1], mm0
-
-        pop rbx
-
-    }
-
-}
-
-
-/****************************************************************************
-*
-*  ROUTINE       : horizontal_line_4_5_scale_mmx
-*
-*  INPUTS        : const unsigned char *source :
-*                  unsigned int source_width    :
-*                  unsigned char *dest         :
-*                  unsigned int dest_width      :
-*
-*  OUTPUTS       : None.
-*
-*  RETURNS       : void
-*
-*  FUNCTION      : 4 to 5 up-scaling of a horizontal line of pixels.
-*
-*  SPECIAL NOTES : None.
-*
-****************************************************************************/
-static
-void horizontal_line_4_5_scale_mmx
-(
-    const unsigned char *source,
-    unsigned int source_width,
-    unsigned char *dest,
-    unsigned int dest_width
-)
-{
-    (void)dest_width;
-
-    __asm
-    {
-
-        mov         rsi,    source
-        mov         rdi,    dest
-
-        mov         ecx,    source_width
-        lea         rdx,    [rsi+rcx-8];
-
-        movq        mm5,    const45_1       // mm5 = 33 xx 66 xx 9a xx cd xx
-        movq        mm6,    const45_2       // mm6 = cd xx 9a xx 66 xx 33 xx
-
-        movq        mm4,    round_values     // mm4 = 80 xx 80 xx 80 xx 80 xx
-        pxor        mm7,    mm7             // clear mm7
-
-        horiz_line_4_5_loop:
-
-        movq        mm0,    QWORD PTR [rsi]           // mm0 = 00 01 02 03 04 05 06 07
-        movq        mm1,    QWORD PTR [rsi+1];        // mm1 = 01 02 03 04 05 06 07 08
-
-        movq        mm2,    mm0             // mm2 = 00 01 02 03 04 05 06 07
-        movq        mm3,    mm1             // mm3 = 01 02 03 04 05 06 07 08
-
-        movd        DWORD PTR [rdi],  mm0             // write output 00 xx xx xx
-        punpcklbw   mm0,    mm7             // mm0 = 00 xx 01 xx 02 xx 03 xx
-
-        punpcklbw   mm1,    mm7             // mm1 = 01 xx 02 xx 03 xx 04 xx
-        pmullw      mm0,    mm5             // 00* 51 01*102 02*154 03*205
-
-        pmullw      mm1,    mm6             // 01*205 02*154 03*102 04* 51
-        punpckhbw   mm2,    mm7             // mm2 = 04 xx 05 xx 06 xx 07 xx
-
-        movd        DWORD PTR [rdi+5], mm2            // write ouput 05 xx xx xx
-        pmullw      mm2,    mm5             // 04* 51 05*102 06*154 07*205
-
-        punpckhbw   mm3,    mm7             // mm3 = 05 xx 06 xx 07 xx 08 xx
-        pmullw      mm3,    mm6             // 05*205 06*154 07*102 08* 51
-
-        paddw       mm0,    mm1             // added round values
-        paddw       mm0,    mm4
-
-        psrlw       mm0,    8               // output: 01 xx 02 xx 03 xx 04 xx
-        packuswb    mm0,    mm7
-
-        movd        DWORD PTR [rdi+1], mm0  // write output 01 02 03 04
-        add         rdi,    10
-
-        add         rsi,    8
-        paddw       mm2,    mm3             //
-
-        paddw       mm2,    mm4             // added round values
-        cmp         rsi,    rdx
-
-        psrlw       mm2,    8
-        packuswb    mm2,    mm7
-
-        movd        DWORD PTR [rdi-4], mm2 // writeoutput 06 07 08 09
-        jl         horiz_line_4_5_loop
-
-//Exit:
-        movq        mm0,    [rsi]           // mm0 = 00 01 02 03 04 05 06 07
-        movq        mm1,    mm0             // mm1 = 00 01 02 03 04 05 06 07
-
-        movq        mm2,    mm0             // mm2 = 00 01 02 03 04 05 06 07
-        psrlq       mm1,    8               // mm1 = 01 02 03 04 05 06 07 00
-
-        movq        mm3,    mask45          // mm3 = 00 00 00 00 00 00 ff 00
-        pand        mm3,    mm1             // mm3 = 00 00 00 00 00 00 07 00
-
-        psllq       mm3,    8               // mm3 = 00 00 00 00 00 00 00 07
-        por         mm1,    mm3             // mm1 = 01 02 03 04 05 06 07 07
-
-        movq        mm3,    mm1
-
-        movd        DWORD PTR [rdi],  mm0   // write output 00 xx xx xx
-        punpcklbw   mm0,    mm7             // mm0 = 00 xx 01 xx 02 xx 03 xx
-
-        punpcklbw   mm1,    mm7             // mm1 = 01 xx 02 xx 03 xx 04 xx
-        pmullw      mm0,    mm5             // 00* 51 01*102 02*154 03*205
-
-        pmullw      mm1,    mm6             // 01*205 02*154 03*102 04* 51
-        punpckhbw   mm2,    mm7             // mm2 = 04 xx 05 xx 06 xx 07 xx
-
-        movd        DWORD PTR [rdi+5], mm2  // write ouput 05 xx xx xx
-        pmullw      mm2,    mm5             // 04* 51 05*102 06*154 07*205
-
-        punpckhbw   mm3,    mm7             // mm3 = 05 xx 06 xx 07 xx 08 xx
-        pmullw      mm3,    mm6             // 05*205 06*154 07*102 07* 51
-
-        paddw       mm0,    mm1             // added round values
-        paddw       mm0,    mm4
-
-        psrlw       mm0,    8               // output: 01 xx 02 xx 03 xx 04 xx
-        packuswb    mm0,    mm7             // 01 02 03 04 xx xx xx xx
-
-        movd        DWORD PTR [rdi+1], mm0  // write output 01 02 03 04
-        paddw       mm2,    mm3             //
-
-        paddw       mm2,    mm4             // added round values
-        psrlw       mm2,    8
-
-        packuswb    mm2,    mm7
-        movd        DWORD PTR [rdi+6], mm2  // writeoutput 06 07 08 09
-
-
-    }
-}
-
-/****************************************************************************
-*
-*  ROUTINE       : vertical_band_4_5_scale_mmx
-*
-*  INPUTS        : unsigned char *dest    :
-*                  unsigned int dest_pitch :
-*                  unsigned int dest_width :
-*
-*  OUTPUTS       : None.
-*
-*  RETURNS       : void
-*
-*  FUNCTION      : 4 to 5 up-scaling of a 4 pixel high band of pixels.
-*
-*  SPECIAL NOTES : The routine uses the first line of the band below
-*                  the current band. The function also has a "C" only
-*                  version.
-*
-****************************************************************************/
-static
-void vertical_band_4_5_scale_mmx
-(
-    unsigned char *dest,
-    unsigned int dest_pitch,
-    unsigned int dest_width
-)
-{
-    __asm
-    {
-
-        mov         rsi,    dest                    // Get the source and destination pointer
-        mov         ecx,    dest_pitch               // Get the pitch size
-
-        lea         rdi,    [rsi+rcx*2]             // tow lines below
-        add         rdi,    rcx                     // three lines below
-
-        pxor        mm7,    mm7                     // clear out mm7
-        mov         edx,    dest_width               // Loop counter
-
-        vs_4_5_loop:
-
-        movq        mm0,    QWORD ptr [rsi]         // src[0];
-        movq        mm1,    QWORD ptr [rsi+rcx]     // src[1];
-
-        movq        mm2,    mm0                     // Make a copy
-        punpcklbw   mm0,    mm7                     // unpack low to word
-
-        movq        mm5,    one_fifth
-        punpckhbw   mm2,    mm7                     // unpack high to word
-
-        pmullw      mm0,    mm5                     // a * 1/5
-
-        movq        mm3,    mm1                     // make a copy
-        punpcklbw   mm1,    mm7                     // unpack low to word
-
-        pmullw      mm2,    mm5                     // a * 1/5
-        movq        mm6,    four_fifths               // constan
-
-        movq        mm4,    mm1                     // copy of low b
-        pmullw      mm4,    mm6                     // b * 4/5
-
-        punpckhbw   mm3,    mm7                     // unpack high to word
-        movq        mm5,    mm3                     // copy of high b
-
-        pmullw      mm5,    mm6                     // b * 4/5
-        paddw       mm0,    mm4                     // a * 1/5 + b * 4/5
-
-        paddw       mm2,    mm5                     // a * 1/5 + b * 4/5
-        paddw       mm0,    round_values             // + 128
-
-        paddw       mm2,    round_values             // + 128
-        psrlw       mm0,    8
-
-        psrlw       mm2,    8
-        packuswb    mm0,    mm2                     // des [1]
-
-        movq        QWORD ptr [rsi+rcx], mm0        // write des[1]
-        movq        mm0,    [rsi+rcx*2]             // mm0 = src[2]
-
-        // mm1, mm3 --- Src[1]
-        // mm0 --- Src[2]
-        // mm7 for unpacking
-
-        movq        mm5,    two_fifths
-        movq        mm2,    mm0                     // make a copy
-
-        pmullw      mm1,    mm5                     // b * 2/5
-        movq        mm6,    three_fifths
-
-
-        punpcklbw   mm0,    mm7                     // unpack low to word
-        pmullw      mm3,    mm5                     // b * 2/5
-
-        movq        mm4,    mm0                     // make copy of c
-        punpckhbw   mm2,    mm7                     // unpack high to word
-
-        pmullw      mm4,    mm6                     // c * 3/5
-        movq        mm5,    mm2
-
-        pmullw      mm5,    mm6                     // c * 3/5
-        paddw       mm1,    mm4                     // b * 2/5 + c * 3/5
-
-        paddw       mm3,    mm5                     // b * 2/5 + c * 3/5
-        paddw       mm1,    round_values             // + 128
-
-        paddw       mm3,    round_values             // + 128
-        psrlw       mm1,    8
-
-        psrlw       mm3,    8
-        packuswb    mm1,    mm3                     // des[2]
-
-        movq        QWORD ptr [rsi+rcx*2], mm1      // write des[2]
-        movq        mm1,    [rdi]                   // mm1=Src[3];
-
-        // mm0, mm2 --- Src[2]
-        // mm1 --- Src[3]
-        // mm6 --- 3/5
-        // mm7 for unpacking
-
-        pmullw      mm0,    mm6                     // c * 3/5
-        movq        mm5,    two_fifths               // mm5 = 2/5
-
-        movq        mm3,    mm1                     // make a copy
-        pmullw      mm2,    mm6                     // c * 3/5
-
-        punpcklbw   mm1,    mm7                     // unpack low
-        movq        mm4,    mm1                     // make a copy
-
-        punpckhbw   mm3,    mm7                     // unpack high
-        pmullw      mm4,    mm5                     // d * 2/5
-
-        movq        mm6,    mm3                     // make a copy
-        pmullw      mm6,    mm5                     // d * 2/5
-
-        paddw       mm0,    mm4                     // c * 3/5 + d * 2/5
-        paddw       mm2,    mm6                     // c * 3/5 + d * 2/5
-
-        paddw       mm0,    round_values             // + 128
-        paddw       mm2,    round_values             // + 128
-
-        psrlw       mm0,    8
-        psrlw       mm2,    8
-
-        packuswb    mm0,    mm2                     // des[3]
-        movq        QWORD ptr [rdi], mm0            // write des[3]
-
-        //  mm1, mm3 --- Src[3]
-        //  mm7 -- cleared for unpacking
-
-        movq        mm0,    [rdi+rcx*2]             // mm0, Src[0] of the next group
-
-        movq        mm5,    four_fifths              // mm5 = 4/5
-        pmullw      mm1,    mm5                     // d * 4/5
-
-        movq        mm6,    one_fifth                // mm6 = 1/5
-        movq        mm2,    mm0                     // make a copy
-
-        pmullw      mm3,    mm5                     // d * 4/5
-        punpcklbw   mm0,    mm7                     // unpack low
-
-        pmullw      mm0,    mm6                     // an * 1/5
-        punpckhbw   mm2,    mm7                     // unpack high
-
-        paddw       mm1,    mm0                     // d * 4/5 + an * 1/5
-        pmullw      mm2,    mm6                     // an * 1/5
-
-        paddw       mm3,    mm2                     // d * 4/5 + an * 1/5
-        paddw       mm1,    round_values             // + 128
-
-        paddw       mm3,    round_values             // + 128
-        psrlw       mm1,    8
-
-        psrlw       mm3,    8
-        packuswb    mm1,    mm3                     // des[4]
-
-        movq        QWORD ptr [rdi+rcx], mm1        // write des[4]
-
-        add         rdi,    8
-        add         rsi,    8
-
-        sub         rdx,    8
-        jg          vs_4_5_loop
-    }
-}
-
-/****************************************************************************
-*
-*  ROUTINE       : last_vertical_band_4_5_scale_mmx
-*
-*  INPUTS        : unsigned char *dest    :
-*                  unsigned int dest_pitch :
-*                  unsigned int dest_width :
-*
-*  OUTPUTS       : None.
-*
-*  RETURNS       : None
-*
-*  FUNCTION      : 4 to 5 up-scaling of the last 4-pixel high band in an image.
-*
-*  SPECIAL NOTES : The routine uses the first line of the band below
-*                  the current band. The function also has an "C" only
-*                  version.
-*
-****************************************************************************/
-static
-void last_vertical_band_4_5_scale_mmx
-(
-    unsigned char *dest,
-    unsigned int dest_pitch,
-    unsigned int dest_width
-)
-{
-    __asm
-    {
-        mov         rsi,    dest                    // Get the source and destination pointer
-        mov         ecx,    dest_pitch               // Get the pitch size
-
-        lea         rdi,    [rsi+rcx*2]             // tow lines below
-        add         rdi,    rcx                     // three lines below
-
-        pxor        mm7,    mm7                     // clear out mm7
-        mov         edx,    dest_width               // Loop counter
-
-        last_vs_4_5_loop:
-
-        movq        mm0,    QWORD ptr [rsi]         // src[0];
-        movq        mm1,    QWORD ptr [rsi+rcx]     // src[1];
-
-        movq        mm2,    mm0                     // Make a copy
-        punpcklbw   mm0,    mm7                     // unpack low to word
-
-        movq        mm5,    one_fifth
-        punpckhbw   mm2,    mm7                     // unpack high to word
-
-        pmullw      mm0,    mm5                     // a * 1/5
-
-        movq        mm3,    mm1                     // make a copy
-        punpcklbw   mm1,    mm7                     // unpack low to word
-
-        pmullw      mm2,    mm5                     // a * 1/5
-        movq        mm6,    four_fifths               // constan
-
-        movq        mm4,    mm1                     // copy of low b
-        pmullw      mm4,    mm6                     // b * 4/5
-
-        punpckhbw   mm3,    mm7                     // unpack high to word
-        movq        mm5,    mm3                     // copy of high b
-
-        pmullw      mm5,    mm6                     // b * 4/5
-        paddw       mm0,    mm4                     // a * 1/5 + b * 4/5
-
-        paddw       mm2,    mm5                     // a * 1/5 + b * 4/5
-        paddw       mm0,    round_values             // + 128
-
-        paddw       mm2,    round_values             // + 128
-        psrlw       mm0,    8
-
-        psrlw       mm2,    8
-        packuswb    mm0,    mm2                     // des [1]
-
-        movq        QWORD ptr [rsi+rcx], mm0        // write des[1]
-        movq        mm0,    [rsi+rcx*2]             // mm0 = src[2]
-
-        // mm1, mm3 --- Src[1]
-        // mm0 --- Src[2]
-        // mm7 for unpacking
-
-        movq        mm5,    two_fifths
-        movq        mm2,    mm0                     // make a copy
-
-        pmullw      mm1,    mm5                     // b * 2/5
-        movq        mm6,    three_fifths
-
-
-        punpcklbw   mm0,    mm7                     // unpack low to word
-        pmullw      mm3,    mm5                     // b * 2/5
-
-        movq        mm4,    mm0                     // make copy of c
-        punpckhbw   mm2,    mm7                     // unpack high to word
-
-        pmullw      mm4,    mm6                     // c * 3/5
-        movq        mm5,    mm2
-
-        pmullw      mm5,    mm6                     // c * 3/5
-        paddw       mm1,    mm4                     // b * 2/5 + c * 3/5
-
-        paddw       mm3,    mm5                     // b * 2/5 + c * 3/5
-        paddw       mm1,    round_values             // + 128
-
-        paddw       mm3,    round_values             // + 128
-        psrlw       mm1,    8
-
-        psrlw       mm3,    8
-        packuswb    mm1,    mm3                     // des[2]
-
-        movq        QWORD ptr [rsi+rcx*2], mm1      // write des[2]
-        movq        mm1,    [rdi]                   // mm1=Src[3];
-
-        movq        QWORD ptr [rdi+rcx], mm1        // write des[4];
-
-        // mm0, mm2 --- Src[2]
-        // mm1 --- Src[3]
-        // mm6 --- 3/5
-        // mm7 for unpacking
-
-        pmullw      mm0,    mm6                     // c * 3/5
-        movq        mm5,    two_fifths               // mm5 = 2/5
-
-        movq        mm3,    mm1                     // make a copy
-        pmullw      mm2,    mm6                     // c * 3/5
-
-        punpcklbw   mm1,    mm7                     // unpack low
-        movq        mm4,    mm1                     // make a copy
-
-        punpckhbw   mm3,    mm7                     // unpack high
-        pmullw      mm4,    mm5                     // d * 2/5
-
-        movq        mm6,    mm3                     // make a copy
-        pmullw      mm6,    mm5                     // d * 2/5
-
-        paddw       mm0,    mm4                     // c * 3/5 + d * 2/5
-        paddw       mm2,    mm6                     // c * 3/5 + d * 2/5
-
-        paddw       mm0,    round_values             // + 128
-        paddw       mm2,    round_values             // + 128
-
-        psrlw       mm0,    8
-        psrlw       mm2,    8
-
-        packuswb    mm0,    mm2                     // des[3]
-        movq        QWORD ptr [rdi], mm0            // write des[3]
-
-        //  mm1, mm3 --- Src[3]
-        //  mm7 -- cleared for unpacking
-        add         rdi,    8
-        add         rsi,    8
-
-        sub         rdx,    8
-        jg          last_vs_4_5_loop
-    }
-}
-
-/****************************************************************************
-*
-*  ROUTINE       : vertical_band_3_5_scale_mmx
-*
-*  INPUTS        : unsigned char *dest    :
-*                  unsigned int dest_pitch :
-*                  unsigned int dest_width :
-*
-*  OUTPUTS       : None.
-*
-*  RETURNS       : void
-*
-*  FUNCTION      : 3 to 5 up-scaling of a 3-pixel high band of pixels.
-*
-*  SPECIAL NOTES : The routine uses the first line of the band below
-*                  the current band. The function also has an "C" only
-*                  version.
-*
-****************************************************************************/
-static
-void vertical_band_3_5_scale_mmx
-(
-    unsigned char *dest,
-    unsigned int dest_pitch,
-    unsigned int dest_width
-)
-{
-    __asm
-    {
-        mov         rsi,    dest                    // Get the source and destination pointer
-        mov         ecx,    dest_pitch               // Get the pitch size
-
-        lea         rdi,    [rsi+rcx*2]             // two lines below
-        add         rdi,    rcx                     // three lines below
-
-        pxor        mm7,    mm7                     // clear out mm7
-        mov         edx,    dest_width               // Loop counter
-
-        vs_3_5_loop:
-
-        movq        mm0,    QWORD ptr [rsi]         // src[0];
-        movq        mm1,    QWORD ptr [rsi+rcx]     // src[1];
-
-        movq        mm2,    mm0                     // Make a copy
-        punpcklbw   mm0,    mm7                     // unpack low to word
-
-        movq        mm5,    two_fifths               // mm5 = 2/5
-        punpckhbw   mm2,    mm7                     // unpack high to word
-
-        pmullw      mm0,    mm5                     // a * 2/5
-
-        movq        mm3,    mm1                     // make a copy
-        punpcklbw   mm1,    mm7                     // unpack low to word
-
-        pmullw      mm2,    mm5                     // a * 2/5
-        movq        mm6,    three_fifths             // mm6 = 3/5
-
-        movq        mm4,    mm1                     // copy of low b
-        pmullw      mm4,    mm6                     // b * 3/5
-
-        punpckhbw   mm3,    mm7                     // unpack high to word
-        movq        mm5,    mm3                     // copy of high b
-
-        pmullw      mm5,    mm6                     // b * 3/5
-        paddw       mm0,    mm4                     // a * 2/5 + b * 3/5
-
-        paddw       mm2,    mm5                     // a * 2/5 + b * 3/5
-        paddw       mm0,    round_values             // + 128
-
-        paddw       mm2,    round_values             // + 128
-        psrlw       mm0,    8
-
-        psrlw       mm2,    8
-        packuswb    mm0,    mm2                     // des [1]
-
-        movq        QWORD ptr [rsi+rcx], mm0        // write des[1]
-        movq        mm0,    [rsi+rcx*2]             // mm0 = src[2]
-
-        // mm1, mm3 --- Src[1]
-        // mm0 --- Src[2]
-        // mm7 for unpacking
-
-        movq        mm4,    mm1                     // b low
-        pmullw      mm1,    four_fifths              // b * 4/5 low
-
-        movq        mm5,    mm3                     // b high
-        pmullw      mm3,    four_fifths              // b * 4/5 high
-
-        movq        mm2,    mm0                     // c
-        pmullw      mm4,    one_fifth                // b * 1/5
-
-        punpcklbw   mm0,    mm7                     // c low
-        pmullw      mm5,    one_fifth                // b * 1/5
-
-        movq        mm6,    mm0                     // make copy of c low
-        punpckhbw   mm2,    mm7                     // c high
-
-        pmullw      mm6,    one_fifth                // c * 1/5 low
-        movq        mm7,    mm2                     // make copy of c high
-
-        pmullw      mm7,    one_fifth                // c * 1/5 high
-        paddw       mm1,    mm6                     // b * 4/5 + c * 1/5 low
-
-        paddw       mm3,    mm7                     // b * 4/5 + c * 1/5 high
-        movq        mm6,    mm0                     // make copy of c low
-
-        pmullw      mm6,    four_fifths              // c * 4/5 low
-        movq        mm7,    mm2                     // make copy of c high
-
-        pmullw      mm7,    four_fifths              // c * 4/5 high
-
-        paddw       mm4,    mm6                     // b * 1/5 + c * 4/5 low
-        paddw       mm5,    mm7                     // b * 1/5 + c * 4/5 high
-
-        paddw       mm1,    round_values             // + 128
-        paddw       mm3,    round_values             // + 128
-
-        psrlw       mm1,    8
-        psrlw       mm3,    8
-
-        packuswb    mm1,    mm3                     // des[2]
-        movq        QWORD ptr [rsi+rcx*2], mm1      // write des[2]
-
-        paddw       mm4,    round_values             // + 128
-        paddw       mm5,    round_values             // + 128
-
-        psrlw       mm4,    8
-        psrlw       mm5,    8
-
-        packuswb    mm4,    mm5                     // des[3]
-        movq        QWORD ptr [rdi], mm4            // write des[3]
-
-        //  mm0, mm2 --- Src[3]
-
-        pxor        mm7,    mm7                     // clear mm7 for unpacking
-        movq        mm1,    [rdi+rcx*2]             // mm1 = Src[0] of the next group
-
-        movq        mm5,    three_fifths             // mm5 = 3/5
-        pmullw      mm0,    mm5                     // d * 3/5
-
-        movq        mm6,    two_fifths                // mm6 = 2/5
-        movq        mm3,    mm1                     // make a copy
-
-        pmullw      mm2,    mm5                     // d * 3/5
-        punpcklbw   mm1,    mm7                     // unpack low
-
-        pmullw      mm1,    mm6                     // an * 2/5
-        punpckhbw   mm3,    mm7                     // unpack high
-
-        paddw       mm0,    mm1                     // d * 3/5 + an * 2/5
-        pmullw      mm3,    mm6                     // an * 2/5
-
-        paddw       mm2,    mm3                     // d * 3/5 + an * 2/5
-        paddw       mm0,    round_values             // + 128
-
-        paddw       mm2,    round_values             // + 128
-        psrlw       mm0,    8
-
-        psrlw       mm2,    8
-        packuswb    mm0,    mm2                     // des[4]
-
-        movq        QWORD ptr [rdi+rcx], mm0        // write des[4]
-
-        add         rdi,    8
-        add         rsi,    8
-
-        sub         rdx,    8
-        jg          vs_3_5_loop
-    }
-}
-
-/****************************************************************************
-*
-*  ROUTINE       : last_vertical_band_3_5_scale_mmx
-*
-*  INPUTS        : unsigned char *dest    :
-*                  unsigned int dest_pitch :
-*                  unsigned int dest_width :
-*
-*  OUTPUTS       : None.
-*
-*  RETURNS       : void
-*
-*  FUNCTION      : 3 to 5 up-scaling of a 3-pixel high band of pixels.
-*
-*  SPECIAL NOTES : The routine uses the first line of the band below
-*                  the current band. The function also has an "C" only
-*                  version.
-*
-****************************************************************************/
-static
-void last_vertical_band_3_5_scale_mmx
-(
-    unsigned char *dest,
-    unsigned int dest_pitch,
-    unsigned int dest_width
-)
-{
-    __asm
-    {
-        mov         rsi,    dest                    // Get the source and destination pointer
-        mov         ecx,    dest_pitch               // Get the pitch size
-
-        lea         rdi,    [rsi+rcx*2]             // tow lines below
-        add         rdi,    rcx                     // three lines below
-
-        pxor        mm7,    mm7                     // clear out mm7
-        mov         edx,    dest_width               // Loop counter
-
-
-        last_vs_3_5_loop:
-
-        movq        mm0,    QWORD ptr [rsi]         // src[0];
-        movq        mm1,    QWORD ptr [rsi+rcx]     // src[1];
-
-        movq        mm2,    mm0                     // Make a copy
-        punpcklbw   mm0,    mm7                     // unpack low to word
-
-        movq        mm5,    two_fifths               // mm5 = 2/5
-        punpckhbw   mm2,    mm7                     // unpack high to word
-
-        pmullw      mm0,    mm5                     // a * 2/5
-
-        movq        mm3,    mm1                     // make a copy
-        punpcklbw   mm1,    mm7                     // unpack low to word
-
-        pmullw      mm2,    mm5                     // a * 2/5
-        movq        mm6,    three_fifths             // mm6 = 3/5
-
-        movq        mm4,    mm1                     // copy of low b
-        pmullw      mm4,    mm6                     // b * 3/5
-
-        punpckhbw   mm3,    mm7                     // unpack high to word
-        movq        mm5,    mm3                     // copy of high b
-
-        pmullw      mm5,    mm6                     // b * 3/5
-        paddw       mm0,    mm4                     // a * 2/5 + b * 3/5
-
-        paddw       mm2,    mm5                     // a * 2/5 + b * 3/5
-        paddw       mm0,    round_values             // + 128
-
-        paddw       mm2,    round_values             // + 128
-        psrlw       mm0,    8
-
-        psrlw       mm2,    8
-        packuswb    mm0,    mm2                     // des [1]
-
-        movq        QWORD ptr [rsi+rcx], mm0        // write des[1]
-        movq        mm0,    [rsi+rcx*2]             // mm0 = src[2]
-
-
-
-        // mm1, mm3 --- Src[1]
-        // mm0 --- Src[2]
-        // mm7 for unpacking
-
-        movq        mm4,    mm1                     // b low
-        pmullw      mm1,    four_fifths              // b * 4/5 low
-
-        movq        QWORD ptr [rdi+rcx], mm0        // write des[4]
-
-        movq        mm5,    mm3                     // b high
-        pmullw      mm3,    four_fifths              // b * 4/5 high
-
-        movq        mm2,    mm0                     // c
-        pmullw      mm4,    one_fifth                // b * 1/5
-
-        punpcklbw   mm0,    mm7                     // c low
-        pmullw      mm5,    one_fifth                // b * 1/5
-
-        movq        mm6,    mm0                     // make copy of c low
-        punpckhbw   mm2,    mm7                     // c high
-
-        pmullw      mm6,    one_fifth                // c * 1/5 low
-        movq        mm7,    mm2                     // make copy of c high
-
-        pmullw      mm7,    one_fifth                // c * 1/5 high
-        paddw       mm1,    mm6                     // b * 4/5 + c * 1/5 low
-
-        paddw       mm3,    mm7                     // b * 4/5 + c * 1/5 high
-        movq        mm6,    mm0                     // make copy of c low
-
-        pmullw      mm6,    four_fifths              // c * 4/5 low
-        movq        mm7,    mm2                     // make copy of c high
-
-        pmullw      mm7,    four_fifths              // c * 4/5 high
-
-        paddw       mm4,    mm6                     // b * 1/5 + c * 4/5 low
-        paddw       mm5,    mm7                     // b * 1/5 + c * 4/5 high
-
-        paddw       mm1,    round_values             // + 128
-        paddw       mm3,    round_values             // + 128
-
-        psrlw       mm1,    8
-        psrlw       mm3,    8
-
-        packuswb    mm1,    mm3                     // des[2]
-        movq        QWORD ptr [rsi+rcx*2], mm1      // write des[2]
-
-        paddw       mm4,    round_values             // + 128
-        paddw       mm5,    round_values             // + 128
-
-        psrlw       mm4,    8
-        psrlw       mm5,    8
-
-        packuswb    mm4,    mm5                     // des[3]
-        movq        QWORD ptr [rdi], mm4            // write des[3]
-
-        //  mm0, mm2 --- Src[3]
-
-        add         rdi,    8
-        add         rsi,    8
-
-        sub         rdx,    8
-        jg          last_vs_3_5_loop
-    }
-}
-
-/****************************************************************************
-*
-*  ROUTINE       : vertical_band_1_2_scale_mmx
-*
-*  INPUTS        : unsigned char *dest    :
-*                  unsigned int dest_pitch :
-*                  unsigned int dest_width :
-*
-*  OUTPUTS       : None.
-*
-*  RETURNS       : void
-*
-*  FUNCTION      : 1 to 2 up-scaling of a band of pixels.
-*
-*  SPECIAL NOTES : The routine uses the first line of the band below
-*                  the current band. The function also has an "C" only
-*                  version.
-*
-****************************************************************************/
-static
-void vertical_band_1_2_scale_mmx
-(
-    unsigned char *dest,
-    unsigned int dest_pitch,
-    unsigned int dest_width
-)
-{
-    __asm
-    {
-
-        mov         rsi,    dest                    // Get the source and destination pointer
-        mov         ecx,    dest_pitch               // Get the pitch size
-
-        pxor        mm7,    mm7                     // clear out mm7
-        mov         edx,    dest_width               // Loop counter
-
-        vs_1_2_loop:
-
-        movq        mm0,    [rsi]                   // get Src[0]
-        movq        mm1,    [rsi + rcx * 2]         // get Src[1]
-
-        movq        mm2,    mm0                     // make copy before unpack
-        movq        mm3,    mm1                     // make copy before unpack
-
-        punpcklbw   mm0,    mm7                     // low Src[0]
-        movq        mm6,    four_ones                // mm6= 1, 1, 1, 1
-
-        punpcklbw   mm1,    mm7                     // low Src[1]
-        paddw       mm0,    mm1                     // low (a + b)
-
-        punpckhbw   mm2,    mm7                     // high Src[0]
-        paddw       mm0,    mm6                     // low (a + b + 1)
-
-        punpckhbw   mm3,    mm7
-        paddw       mm2,    mm3                     // high (a + b )
-
-        psraw       mm0,    1                       // low (a + b +1 )/2
-        paddw       mm2,    mm6                     // high (a + b + 1)
-
-        psraw       mm2,    1                       // high (a + b + 1)/2
-        packuswb    mm0,    mm2                     // pack results
-
-        movq        [rsi+rcx], mm0                  // write out eight bytes
-        add         rsi,    8
-
-        sub         rdx,    8
-        jg          vs_1_2_loop
-    }
-
-}
-
-/****************************************************************************
-*
-*  ROUTINE       : last_vertical_band_1_2_scale_mmx
-*
-*  INPUTS        : unsigned char *dest    :
-*                  unsigned int dest_pitch :
-*                  unsigned int dest_width :
-*
-*  OUTPUTS       : None.
-*
-*  RETURNS       : void
-*
-*  FUNCTION      : 1 to 2 up-scaling of band of pixels.
-*
-*  SPECIAL NOTES : The routine uses the first line of the band below
-*                  the current band. The function also has an "C" only
-*                  version.
-*
-****************************************************************************/
-static
-void last_vertical_band_1_2_scale_mmx
-(
-    unsigned char *dest,
-    unsigned int dest_pitch,
-    unsigned int dest_width
-)
-{
-    __asm
-    {
-        mov         rsi,    dest                    // Get the source and destination pointer
-        mov         ecx,    dest_pitch               // Get the pitch size
-
-        mov         edx,    dest_width               // Loop counter
-
-        last_vs_1_2_loop:
-
-        movq        mm0,    [rsi]                   // get Src[0]
-        movq        [rsi+rcx], mm0                  // write out eight bytes
-
-        add         rsi,    8
-        sub         rdx,    8
-
-        jg          last_vs_1_2_loop
-    }
-}
-
-/****************************************************************************
-*
-*  ROUTINE       : horizontal_line_1_2_scale
-*
-*  INPUTS        : const unsigned char *source :
-*                  unsigned int source_width    :
-*                  unsigned char *dest         :
-*                  unsigned int dest_width      :
-*
-*  OUTPUTS       : None.
-*
-*  RETURNS       : void
-*
-*  FUNCTION      : 1 to 2 up-scaling of a horizontal line of pixels.
-*
-*  SPECIAL NOTES : None.
-*
-****************************************************************************/
-static
-void horizontal_line_1_2_scale_mmx
-(
-    const unsigned char *source,
-    unsigned int source_width,
-    unsigned char *dest,
-    unsigned int dest_width
-)
-{
-    (void) dest_width;
-
-    __asm
-    {
-        mov         rsi,    source
-        mov         rdi,    dest
-
-        pxor        mm7,    mm7
-        movq        mm6,    four_ones
-
-        mov         ecx,    source_width
-
-        hs_1_2_loop:
-
-        movq        mm0,    [rsi]
-        movq        mm1,    [rsi+1]
-
-        movq        mm2,    mm0
-        movq        mm3,    mm1
-
-        movq        mm4,    mm0
-        punpcklbw   mm0,    mm7
-
-        punpcklbw   mm1,    mm7
-        paddw       mm0,    mm1
-
-        paddw       mm0,    mm6
-        punpckhbw   mm2,    mm7
-
-        punpckhbw   mm3,    mm7
-        paddw       mm2,    mm3
-
-        paddw       mm2,    mm6
-        psraw       mm0,    1
-
-        psraw       mm2,    1
-        packuswb    mm0,    mm2
-
-        movq        mm2,    mm4
-        punpcklbw   mm2,    mm0
-
-        movq        [rdi],  mm2
-        punpckhbw   mm4,    mm0
-
-        movq        [rdi+8], mm4
-        add         rsi,    8
-
-        add         rdi,    16
-        sub         rcx,    8
-
-        cmp         rcx,    8
-        jg          hs_1_2_loop
-
-// last eight pixel
-
-        movq        mm0,    [rsi]
-        movq        mm1,    mm0
-
-        movq        mm2,    mm0
-        movq        mm3,    mm1
-
-        psrlq       mm1,    8
-        psrlq       mm3,    56
-
-        psllq       mm3,    56
-        por         mm1,    mm3
-
-        movq        mm3,    mm1
-        movq        mm4,    mm0
-
-        punpcklbw   mm0,    mm7
-        punpcklbw   mm1,    mm7
-
-        paddw       mm0,    mm1
-        paddw       mm0,    mm6
-
-        punpckhbw   mm2,    mm7
-        punpckhbw   mm3,    mm7
-
-        paddw       mm2,    mm3
-        paddw       mm2,    mm6
-
-        psraw       mm0,    1
-        psraw       mm2,    1
-
-        packuswb    mm0,    mm2
-        movq        mm2,    mm4
-
-        punpcklbw   mm2,    mm0
-        movq        [rdi],  mm2
-
-        punpckhbw   mm4,    mm0
-        movq        [rdi+8], mm4
-    }
-}
-
-
-
-
-
-__declspec(align(16)) const static unsigned short const54_2[] = {  0,  64, 128, 192 };
-__declspec(align(16)) const static unsigned short const54_1[] = {256, 192, 128,  64 };
-
-
-/****************************************************************************
-*
-*  ROUTINE       : horizontal_line_5_4_scale_mmx
-*
-*  INPUTS        : const unsigned char *source : Pointer to source data.
-*                  unsigned int source_width    : Stride of source.
-*                  unsigned char *dest         : Pointer to destination data.
-*                  unsigned int dest_width      : Stride of destination (NOT USED).
-*
-*  OUTPUTS       : None.
-*
-*  RETURNS       : void
-*
-*  FUNCTION      : Copies horizontal line of pixels from source to
-*                  destination scaling up by 4 to 5.
-*
-*  SPECIAL NOTES : None.
-*
-****************************************************************************/
-static
-void horizontal_line_5_4_scale_mmx
-(
-    const unsigned char *source,
-    unsigned int source_width,
-    unsigned char *dest,
-    unsigned int dest_width
-)
-{
-    /*
-    unsigned i;
-    unsigned int a, b, c, d, e;
-    unsigned char *des = dest;
-    const unsigned char *src = source;
-
-    (void) dest_width;
-
-    for ( i=0; i<source_width; i+=5 )
-    {
-        a = src[0];
-        b = src[1];
-        c = src[2];
-        d = src[3];
-        e = src[4];
-
-        des[0] = a;
-        des[1] = ((b*192 + c* 64 + 128)>>8);
-        des[2] = ((c*128 + d*128 + 128)>>8);
-        des[3] = ((d* 64 + e*192 + 128)>>8);
-
-        src += 5;
-        des += 4;
-    }
-    */
-    __asm
-    {
-
-        mov         rsi,        source              ;
-        mov         rdi,        dest                ;
-
-        mov         ecx,        source_width         ;
-        movq        mm5,        const54_1           ;
-
-        pxor        mm7,        mm7                 ;
-        movq        mm6,        const54_2           ;
-
-        movq        mm4,        round_values         ;
-        lea         rdx,        [rsi+rcx]           ;
-        horizontal_line_5_4_loop:
-
-        movq        mm0,        QWORD PTR  [rsi]    ;
-        00 01 02 03 04 05 06 07
-        movq        mm1,        mm0                 ;
-        00 01 02 03 04 05 06 07
-
-        psrlq       mm0,        8                   ;
-        01 02 03 04 05 06 07 xx
-        punpcklbw   mm1,        mm7                 ;
-        xx 00 xx 01 xx 02 xx 03
-
-        punpcklbw   mm0,        mm7                 ;
-        xx 01 xx 02 xx 03 xx 04
-        pmullw      mm1,        mm5
-
-        pmullw      mm0,        mm6
-        add         rsi,        5
-
-        add         rdi,        4
-        paddw       mm1,        mm0
-
-        paddw       mm1,        mm4
-        psrlw       mm1,        8
-
-        cmp         rsi,        rdx
-        packuswb    mm1,        mm7
-
-        movd        DWORD PTR [rdi-4], mm1
-
-        jl          horizontal_line_5_4_loop
-
-    }
-
-}
-__declspec(align(16)) const static unsigned short one_fourths[]   = {  64,  64,  64, 64  };
-__declspec(align(16)) const static unsigned short two_fourths[]   = { 128, 128, 128, 128 };
-__declspec(align(16)) const static unsigned short three_fourths[] = { 192, 192, 192, 192 };
-
-static
-void vertical_band_5_4_scale_mmx
-(
-    unsigned char *source,
-    unsigned int src_pitch,
-    unsigned char *dest,
-    unsigned int dest_pitch,
-    unsigned int dest_width
-)
-{
-
-    __asm
-    {
-
-        mov         rsi,    source                    // Get the source and destination pointer
-        mov         ecx,    src_pitch               // Get the pitch size
-
-        mov         rdi,    dest                    // tow lines below
-        pxor        mm7,    mm7                     // clear out mm7
-
-        mov         edx,    dest_pitch               // Loop counter
-        mov         ebx,    dest_width
-
-        vs_5_4_loop:
-
-        movd        mm0,    DWORD ptr [rsi]         // src[0];
-        movd        mm1,    DWORD ptr [rsi+rcx]     // src[1];
-
-        movd        mm2,    DWORD ptr [rsi+rcx*2]
-        lea         rax,    [rsi+rcx*2]             //
-
-        punpcklbw   mm1,    mm7
-        punpcklbw   mm2,    mm7
-
-        movq        mm3,    mm2
-        pmullw      mm1,    three_fourths
-
-        pmullw      mm2,    one_fourths
-        movd        mm4,    [rax+rcx]
-
-        pmullw      mm3,    two_fourths
-        punpcklbw   mm4,    mm7
-
-        movq        mm5,    mm4
-        pmullw      mm4,    two_fourths
-
-        paddw       mm1,    mm2
-        movd        mm6,    [rax+rcx*2]
-
-        pmullw      mm5,    one_fourths
-        paddw       mm1,    round_values;
-
-        paddw       mm3,    mm4
-        psrlw       mm1,    8
-
-        punpcklbw   mm6,    mm7
-        paddw       mm3,    round_values
-
-        pmullw      mm6,    three_fourths
-        psrlw       mm3,    8
-
-        packuswb    mm1,    mm7
-        packuswb    mm3,    mm7
-
-        movd        DWORD PTR [rdi], mm0
-        movd        DWORD PTR [rdi+rdx], mm1
-
-
-        paddw       mm5,    mm6
-        movd        DWORD PTR [rdi+rdx*2], mm3
-
-        lea         rax,    [rdi+rdx*2]
-        paddw       mm5,    round_values
-
-        psrlw       mm5,    8
-        add         rdi,    4
-
-        packuswb    mm5,    mm7
-        movd        DWORD PTR [rax+rdx], mm5
-
-        add         rsi,    4
-        sub         rbx,    4
-
-        jg         vs_5_4_loop
-    }
-}
-
-
-__declspec(align(16)) const static unsigned short const53_1[] = {  0,  85, 171, 0 };
-__declspec(align(16)) const static unsigned short const53_2[] = {256, 171,  85, 0 };
-
-
-static
-void horizontal_line_5_3_scale_mmx
-(
-    const unsigned char *source,
-    unsigned int source_width,
-    unsigned char *dest,
-    unsigned int dest_width
-)
-{
-    __asm
-    {
-
-        mov         rsi,        source              ;
-        mov         rdi,        dest                ;
-
-        mov         ecx,        source_width         ;
-        movq        mm5,        const53_1           ;
-
-        pxor        mm7,        mm7                 ;
-        movq        mm6,        const53_2           ;
-
-        movq        mm4,        round_values         ;
-        lea         rdx,        [rsi+rcx-5]         ;
-        horizontal_line_5_3_loop:
-
-        movq        mm0,        QWORD PTR  [rsi]    ;
-        00 01 02 03 04 05 06 07
-        movq        mm1,        mm0                 ;
-        00 01 02 03 04 05 06 07
-
-        psllw       mm0,        8                   ;
-        xx 00 xx 02 xx 04 xx 06
-        psrlw       mm1,        8                   ;
-        01 xx 03 xx 05 xx 07 xx
-
-        psrlw       mm0,        8                   ;
-        00 xx 02 xx 04 xx 06 xx
-        psllq       mm1,        16                  ;
-        xx xx 01 xx 03 xx 05 xx
-
-        pmullw      mm0,        mm6
-
-        pmullw      mm1,        mm5
-        add         rsi,        5
-
-        add         rdi,        3
-        paddw       mm1,        mm0
-
-        paddw       mm1,        mm4
-        psrlw       mm1,        8
-
-        cmp         rsi,        rdx
-        packuswb    mm1,        mm7
-
-        movd        DWORD PTR [rdi-3], mm1
-        jl          horizontal_line_5_3_loop
-
-//exit condition
-        movq        mm0,        QWORD PTR  [rsi]    ;
-        00 01 02 03 04 05 06 07
-        movq        mm1,        mm0                 ;
-        00 01 02 03 04 05 06 07
-
-        psllw       mm0,        8                   ;
-        xx 00 xx 02 xx 04 xx 06
-        psrlw       mm1,        8                   ;
-        01 xx 03 xx 05 xx 07 xx
-
-        psrlw       mm0,        8                   ;
-        00 xx 02 xx 04 xx 06 xx
-        psllq       mm1,        16                  ;
-        xx xx 01 xx 03 xx 05 xx
-
-        pmullw      mm0,        mm6
-
-        pmullw      mm1,        mm5
-        paddw       mm1,        mm0
-
-        paddw       mm1,        mm4
-        psrlw       mm1,        8
-
-        packuswb    mm1,        mm7
-        movd        rax,        mm1
-
-        mov         rdx,        rax
-        shr         rdx,        16
-
-        mov         WORD PTR[rdi],   ax
-        mov         BYTE PTR[rdi+2], dl
-
-    }
-
-}
-
-__declspec(align(16)) const static unsigned short one_thirds[] = {  85,  85,  85,  85 };
-__declspec(align(16)) const static unsigned short two_thirds[] = { 171, 171, 171, 171 };
-
-static
-void vertical_band_5_3_scale_mmx
-(
-    unsigned char *source,
-    unsigned int src_pitch,
-    unsigned char *dest,
-    unsigned int dest_pitch,
-    unsigned int dest_width
-)
-{
-
-    __asm
-    {
-
-        mov         rsi,    source                    // Get the source and destination pointer
-        mov         ecx,    src_pitch               // Get the pitch size
-
-        mov         rdi,    dest                    // tow lines below
-        pxor        mm7,    mm7                     // clear out mm7
-
-        mov         edx,    dest_pitch               // Loop counter
-        movq        mm5,    one_thirds
-
-        movq        mm6,    two_thirds
-        mov         ebx,    dest_width;
-
-        vs_5_3_loop:
-
-        movd        mm0,    DWORD ptr [rsi]         // src[0];
-        movd        mm1,    DWORD ptr [rsi+rcx]     // src[1];
-
-        movd        mm2,    DWORD ptr [rsi+rcx*2]
-        lea         rax,    [rsi+rcx*2]             //
-
-        punpcklbw   mm1,    mm7
-        punpcklbw   mm2,    mm7
-
-        pmullw      mm1,    mm5
-        pmullw      mm2,    mm6
-
-        movd        mm3,    DWORD ptr [rax+rcx]
-        movd        mm4,    DWORD ptr [rax+rcx*2]
-
-        punpcklbw   mm3,    mm7
-        punpcklbw   mm4,    mm7
-
-        pmullw      mm3,    mm6
-        pmullw      mm4,    mm5
-
-
-        movd        DWORD PTR [rdi], mm0
-        paddw       mm1,    mm2
-
-        paddw       mm1,    round_values
-        psrlw       mm1,    8
-
-        packuswb    mm1,    mm7
-        paddw       mm3,    mm4
-
-        paddw       mm3,    round_values
-        movd        DWORD PTR [rdi+rdx], mm1
-
-        psrlw       mm3,    8
-        packuswb    mm3,    mm7
-
-        movd        DWORD PTR [rdi+rdx*2], mm3
-
-
-        add         rdi,    4
-        add         rsi,    4
-
-        sub         rbx,    4
-        jg          vs_5_3_loop
-    }
-}
-
-
-
-
-/****************************************************************************
-*
-*  ROUTINE       : horizontal_line_2_1_scale
-*
-*  INPUTS        : const unsigned char *source :
-*                  unsigned int source_width    :
-*                  unsigned char *dest         :
-*                  unsigned int dest_width      :
-*
-*  OUTPUTS       : None.
-*
-*  RETURNS       : void
-*
-*  FUNCTION      : 1 to 2 up-scaling of a horizontal line of pixels.
-*
-*  SPECIAL NOTES : None.
-*
-****************************************************************************/
-static
-void horizontal_line_2_1_scale_mmx
-(
-    const unsigned char *source,
-    unsigned int source_width,
-    unsigned char *dest,
-    unsigned int dest_width
-)
-{
-    (void) dest_width;
-
-    __asm
-    {
-        mov         rsi,    source
-        mov         rdi,    dest
-
-        pxor        mm7,    mm7
-        mov         ecx,    dest_width
-
-        xor         rdx,    rdx
-        hs_2_1_loop:
-
-        movq        mm0,    [rsi+rdx*2]
-        psllw       mm0,    8
-
-        psrlw       mm0,    8
-        packuswb    mm0,    mm7
-
-        movd        DWORD Ptr [rdi+rdx], mm0;
-        add         rdx,    4
-
-        cmp         rdx,    rcx
-        jl          hs_2_1_loop
-
-    }
-}
-
-
-
-static
-void vertical_band_2_1_scale_mmx
-(
-    unsigned char *source,
-    unsigned int src_pitch,
-    unsigned char *dest,
-    unsigned int dest_pitch,
-    unsigned int dest_width)
-{
-    vpx_memcpy(dest, source, dest_width);
-}
-
-
-__declspec(align(16)) const static unsigned short three_sixteenths[] = {  48,  48,  48,  48 };
-__declspec(align(16)) const static unsigned short ten_sixteenths[]   = { 160, 160, 160, 160 };
-
-static
-void vertical_band_2_1_scale_i_mmx
-(
-    unsigned char *source,
-    unsigned int src_pitch,
-    unsigned char *dest,
-    unsigned int dest_pitch,
-    unsigned int dest_width
-)
-{
-    __asm
-    {
-        mov         rsi,        source
-        mov         rdi,        dest
-
-        mov         eax,        src_pitch
-        mov         edx,        dest_width
-
-        pxor        mm7,        mm7
-        sub         rsi,        rax             //back one line
-
-
-        lea         rcx,        [rsi+rdx];
-        movq        mm6,        round_values;
-
-        movq        mm5,        three_sixteenths;
-        movq        mm4,        ten_sixteenths;
-
-        vs_2_1_i_loop:
-        movd        mm0,        [rsi]           //
-        movd        mm1,        [rsi+rax]       //
-
-        movd        mm2,        [rsi+rax*2]     //
-        punpcklbw   mm0,        mm7
-
-        pmullw      mm0,        mm5
-        punpcklbw   mm1,        mm7
-
-        pmullw      mm1,        mm4
-        punpcklbw   mm2,        mm7
-
-        pmullw      mm2,        mm5
-        paddw       mm0,        round_values
-
-        paddw       mm1,        mm2
-        paddw       mm0,        mm1
-
-        psrlw       mm0,        8
-        packuswb    mm0,        mm7
-
-        movd        DWORD PTR [rdi],        mm0
-        add         rsi,        4
-
-        add         rdi,        4;
-        cmp         rsi,        rcx
-        jl          vs_2_1_i_loop
-
-    }
-}
-
-
-
-void
-register_mmxscalers(void)
-{
-    vp8_horizontal_line_1_2_scale        = horizontal_line_1_2_scale_mmx;
-    vp8_horizontal_line_3_5_scale        = horizontal_line_3_5_scale_mmx;
-    vp8_horizontal_line_4_5_scale        = horizontal_line_4_5_scale_mmx;
-    vp8_vertical_band_1_2_scale          = vertical_band_1_2_scale_mmx;
-    vp8_last_vertical_band_1_2_scale      = last_vertical_band_1_2_scale_mmx;
-    vp8_vertical_band_3_5_scale          = vertical_band_3_5_scale_mmx;
-    vp8_last_vertical_band_3_5_scale      = last_vertical_band_3_5_scale_mmx;
-    vp8_vertical_band_4_5_scale          = vertical_band_4_5_scale_mmx;
-    vp8_last_vertical_band_4_5_scale      = last_vertical_band_4_5_scale_mmx;
-
-    vp8_vertical_band_5_4_scale           = vertical_band_5_4_scale_mmx;
-    vp8_vertical_band_5_3_scale           = vertical_band_5_3_scale_mmx;
-    vp8_vertical_band_2_1_scale           = vertical_band_2_1_scale_mmx;
-    vp8_vertical_band_2_1_scale_i         = vertical_band_2_1_scale_i_mmx;
-    vp8_horizontal_line_2_1_scale         = horizontal_line_2_1_scale_mmx;
-    vp8_horizontal_line_5_3_scale         = horizontal_line_5_3_scale_mmx;
-    vp8_horizontal_line_5_4_scale         = horizontal_line_5_4_scale_mmx;
-}

diff --git a/vpx_scale/x86_64/scalesystemdependant.c b/vpx_scale/x86_64/scalesystemdependant.c
deleted file mode 100644
index 89ae86e..0000000
--- a/vpx_scale/x86_64/scalesystemdependant.c
+++ /dev/null

@@ -1,61 +0,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-/****************************************************************************
-*
-*   Module Title :     system_dependant.c
-*
-*   Description  :     Miscellaneous system dependant functions
-*
-****************************************************************************/
-
-/****************************************************************************
-*  Header Files
-****************************************************************************/
-#include "vpx_scale/vpxscale.h"
-#include "cpuidlib.h"
-
-/****************************************************************************
-*  Imports
-*****************************************************************************/
-extern void register_generic_scalers(void);
-extern void register_mmxscalers(void);
-
-/****************************************************************************
- *
- *  ROUTINE       : post_proc_machine_specific_config
- *
- *  INPUTS        : UINT32 Version : Codec version number.
- *
- *  OUTPUTS       : None.
- *
- *  RETURNS       : void
- *
- *  FUNCTION      : Checks for machine specifc features such as MMX support
- *                  sets appropriate flags and function pointers.
- *
- *  SPECIAL NOTES : None.
- *
- ****************************************************************************/
-void
-vp8_scale_machine_specific_config(void)
-{
-    int wmt_enabled = 1;
-
-    if (wmt_enabled)
-    {
-        register_mmxscalers();
-    }
-    else
-    {
-        register_generic_scalers();
-    }
-}

diff --git a/vpx_scale/yv12config.h b/vpx_scale/yv12config.h
index 0b08db3..e10db34 100644
--- a/vpx_scale/yv12config.h
+++ b/vpx_scale/yv12config.h

@@ -63,7 +63,6 @@
 
     int vp8_yv12_alloc_frame_buffer(YV12_BUFFER_CONFIG *ybf, int width, int height, int border);
     int vp8_yv12_de_alloc_frame_buffer(YV12_BUFFER_CONFIG *ybf);
-    int vp8_yv12_black_frame_buffer(YV12_BUFFER_CONFIG *ybf);
 
 #ifdef __cplusplus
 }

diff --git a/vpxenc.c b/vpxenc.c
index d0f4c23..6c13cd1 100755
--- a/vpxenc.c
+++ b/vpxenc.c

@@ -163,8 +163,8 @@
 
         if (!stats->buf.buf)
         {
-            fprintf(stderr, "Failed to allocate first-pass stats buffer (%d bytes)\n",
-                    stats->buf_alloc_sz);
+            fprintf(stderr, "Failed to allocate first-pass stats buffer (%lu bytes)\n",
+                    (unsigned long)stats->buf_alloc_sz);
             exit(EXIT_FAILURE);
         }
 
@@ -924,8 +924,14 @@
         "Upscale threshold (buf %)");
 static const arg_def_t resize_down_thresh = ARG_DEF(NULL, "resize-down", 1,
         "Downscale threshold (buf %)");
-static const arg_def_t end_usage          = ARG_DEF(NULL, "end-usage", 1,
-        "VBR=0 | CBR=1 | CQ=2");
+static const struct arg_enum_list end_usage_enum[] = {
+    {"vbr", VPX_VBR},
+    {"cbr", VPX_CBR},
+    {"cq",  VPX_CQ},
+    {NULL, 0}
+};
+static const arg_def_t end_usage          = ARG_DEF_ENUM(NULL, "end-usage", 1,
+        "Rate control mode", end_usage_enum);
 static const arg_def_t target_bitrate     = ARG_DEF(NULL, "target-bitrate", 1,
         "Bitrate (kbps)");
 static const arg_def_t min_quantizer      = ARG_DEF(NULL, "min-q", 1,
@@ -1256,7 +1262,7 @@
         else if (arg_match(&arg, &resize_down_thresh, argi))
             cfg.rc_resize_down_thresh = arg_parse_uint(&arg);
         else if (arg_match(&arg, &end_usage, argi))
-            cfg.rc_end_usage = arg_parse_uint(&arg);
+            cfg.rc_end_usage = arg_parse_enum_or_int(&arg);
         else if (arg_match(&arg, &target_bitrate, argi))
             cfg.rc_target_bitrate = arg_parse_uint(&arg);
         else if (arg_match(&arg, &min_quantizer, argi))

diff --git a/wince_wmain_adapter.cpp b/wince_wmain_adapter.cpp
deleted file mode 100644
index 57f880a..0000000
--- a/wince_wmain_adapter.cpp
+++ /dev/null

@@ -1,50 +0,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-/* This program is created to take command arguments and pass
- * them to main() in example.c or example_xma.c, because the
- * correspending part in example.c or example_xma.c does not
- * work on Pocket PC platform.
- * To modify the command arguments, go to "Property" page and
- * fill in "Command arguments." For example:
- *  --codec vp6 --flipuv --progress _bnd.vp6
- */
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-
-#define MAX_NUM_ARG 64
-#define MAX_SIZ_ARG 512
-
-extern "C"
-{
-    int main(int argc, char **argv);
-}
-
-int wmain(int argc, wchar_t **argv) {
-    char *cargv[MAX_NUM_ARG];
-    char chargv[MAX_SIZ_ARG];
-    int ret;
-
-    /* transform command line arguments from (wchar_t *) to (char *) */
-    for(int i=0; i<argc; i++) {
-        wcstombs( chargv, argv[i], sizeof(chargv));
-        cargv[i] = _strdup(chargv);
-    }
-
-    ret = main(argc, (char **)cargv);
-
-    //free the memory located by _strdup()
-    for(int i=0; i<argc; i++)
-        free(cargv[i]);
-
-    return ret;
-}