Merge "Cleaning up vp9_mvref_common.c."
diff --git a/build/make/ads2armasm_ms.pl b/build/make/ads2armasm_ms.pl
index 1def539..95c8084 100755
--- a/build/make/ads2armasm_ms.pl
+++ b/build/make/ads2armasm_ms.pl
@@ -1,4 +1,4 @@
-#!/usr/bin/perl
+#!/usr/bin/env perl
 ##
 ##  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
 ##
diff --git a/build/make/rtcd.pl b/build/make/rtcd.pl
new file mode 100755
index 0000000..18ee80d
--- /dev/null
+++ b/build/make/rtcd.pl
@@ -0,0 +1,414 @@
+#!/usr/bin/env perl
+
+no strict 'refs';
+use warnings;
+use Getopt::Long;
+Getopt::Long::Configure("auto_help");
+
+my %ALL_FUNCS = ();
+my @ALL_ARCHS;
+my @ALL_FORWARD_DECLS;
+my @REQUIRES;
+
+my %opts = ();
+my %disabled = ();
+my %required = ();
+
+my @argv;
+foreach (@ARGV) {
+  $disabled{$1} = 1, next if /--disable-(.*)/;
+  $required{$1} = 1, next if /--require-(.*)/;
+  push @argv, $_;
+}
+
+# NB: use GetOptions() instead of GetOptionsFromArray() for compatibility.
+@ARGV = @argv;
+GetOptions(
+  \%opts,
+  'arch=s',
+  'sym=s',
+  'config=s',
+);
+
+foreach my $opt (qw/arch config/) {
+  if (!defined($opts{$opt})) {
+    warn "--$opt is required!\n";
+    Getopt::Long::HelpMessage('-exit' => 1);
+  }
+}
+
+foreach my $defs_file (@ARGV) {
+  if (!-f $defs_file) {
+    warn "$defs_file: $!\n";
+    Getopt::Long::HelpMessage('-exit' => 1);
+  }
+}
+
+open CONFIG_FILE, $opts{config} or
+  die "Error opening config file '$opts{config}': $!\n";
+
+my %config = ();
+while (<CONFIG_FILE>) {
+  next if !/^CONFIG_/;
+  chomp;
+  my @pair = split /=/;
+  $config{$pair[0]} = $pair[1];
+}
+close CONFIG_FILE;
+
+#
+# Routines for the RTCD DSL to call
+#
+sub vpx_config($) {
+  return (defined $config{$_[0]}) ? $config{$_[0]} : "";
+}
+
+sub specialize {
+  my $fn=$_[0];
+  shift;
+  foreach my $opt (@_) {
+    eval "\$${fn}_${opt}=${fn}_${opt}";
+  }
+}
+
+sub add_proto {
+  my $fn = splice(@_, -2, 1);
+  $ALL_FUNCS{$fn} = \@_;
+  specialize $fn, "c";
+}
+
+sub require {
+  foreach my $fn (keys %ALL_FUNCS) {
+    foreach my $opt (@_) {
+      my $ofn = eval "\$${fn}_${opt}";
+      next if !$ofn;
+
+      # if we already have a default, then we can disable it, as we know
+      # we can do better.
+      my $best = eval "\$${fn}_default";
+      if ($best) {
+        my $best_ofn = eval "\$${best}";
+        if ($best_ofn && "$best_ofn" ne "$ofn") {
+          eval "\$${best}_link = 'false'";
+        }
+      }
+      eval "\$${fn}_default=${fn}_${opt}";
+      eval "\$${fn}_${opt}_link='true'";
+    }
+  }
+}
+
+sub forward_decls {
+  push @ALL_FORWARD_DECLS, @_;
+}
+
+#
+# Include the user's directives
+#
+foreach my $f (@ARGV) {
+  open FILE, "<", $f or die "cannot open $f: $!\n";
+  my $contents = join('', <FILE>);
+  close FILE;
+  eval $contents or warn "eval failed: $@\n";
+}
+
+#
+# Process the directives according to the command line
+#
+sub process_forward_decls() {
+  foreach (@ALL_FORWARD_DECLS) {
+    $_->();
+  }
+}
+
+sub determine_indirection {
+  vpx_config("CONFIG_RUNTIME_CPU_DETECT") eq "yes" or &require(@ALL_ARCHS);
+  foreach my $fn (keys %ALL_FUNCS) {
+    my $n = "";
+    my @val = @{$ALL_FUNCS{$fn}};
+    my $args = pop @val;
+    my $rtyp = "@val";
+    my $dfn = eval "\$${fn}_default";
+    $dfn = eval "\$${dfn}";
+    foreach my $opt (@_) {
+      my $ofn = eval "\$${fn}_${opt}";
+      next if !$ofn;
+      my $link = eval "\$${fn}_${opt}_link";
+      next if $link && $link eq "false";
+      $n .= "x";
+    }
+    if ($n eq "x") {
+      eval "\$${fn}_indirect = 'false'";
+    } else {
+      eval "\$${fn}_indirect = 'true'";
+    }
+  }
+}
+
+sub declare_function_pointers {
+  foreach my $fn (sort keys %ALL_FUNCS) {
+    my @val = @{$ALL_FUNCS{$fn}};
+    my $args = pop @val;
+    my $rtyp = "@val";
+    my $dfn = eval "\$${fn}_default";
+    $dfn = eval "\$${dfn}";
+    foreach my $opt (@_) {
+      my $ofn = eval "\$${fn}_${opt}";
+      next if !$ofn;
+      print "$rtyp ${ofn}($args);\n";
+    }
+    if (eval "\$${fn}_indirect" eq "false") {
+      print "#define ${fn} ${dfn}\n";
+    } else {
+      print "RTCD_EXTERN $rtyp (*${fn})($args);\n";
+    }
+    print "\n";
+  }
+}
+
+sub set_function_pointers {
+  foreach my $fn (sort keys %ALL_FUNCS) {
+    my @val = @{$ALL_FUNCS{$fn}};
+    my $args = pop @val;
+    my $rtyp = "@val";
+    my $dfn = eval "\$${fn}_default";
+    $dfn = eval "\$${dfn}";
+    if (eval "\$${fn}_indirect" eq "true") {
+      print "    $fn = $dfn;\n";
+      foreach my $opt (@_) {
+        my $ofn = eval "\$${fn}_${opt}";
+        next if !$ofn;
+        next if "$ofn" eq "$dfn";
+        my $link = eval "\$${fn}_${opt}_link";
+        next if $link && $link eq "false";
+        my $cond = eval "\$have_${opt}";
+        print "    if (${cond}) $fn = $ofn;\n"
+      }
+    }
+  }
+}
+
+sub filter {
+  my @filtered;
+  foreach (@_) { push @filtered, $_ unless $disabled{$_}; }
+  return @filtered;
+}
+
+#
+# Helper functions for generating the arch specific RTCD files
+#
+sub common_top() {
+  my $include_guard = uc($opts{sym})."_H_";
+  print <<EOF;
+#ifndef ${include_guard}
+#define ${include_guard}
+
+#ifdef RTCD_C
+#define RTCD_EXTERN
+#else
+#define RTCD_EXTERN extern
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+EOF
+
+process_forward_decls();
+print "\n";
+declare_function_pointers("c", @ALL_ARCHS);
+
+print <<EOF;
+void $opts{sym}(void);
+
+EOF
+}
+
+sub common_bottom() {
+  print <<EOF;
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif
+EOF
+}
+
+sub x86() {
+  determine_indirection("c", @ALL_ARCHS);
+
+  # Assign the helper variable for each enabled extension
+  foreach my $opt (@ALL_ARCHS) {
+    my $opt_uc = uc $opt;
+    eval "\$have_${opt}=\"flags & HAS_${opt_uc}\"";
+  }
+
+  common_top;
+  print <<EOF;
+#ifdef RTCD_C
+#include "vpx_ports/x86.h"
+static void setup_rtcd_internal(void)
+{
+    int flags = x86_simd_caps();
+
+    (void)flags;
+
+EOF
+
+  set_function_pointers("c", @ALL_ARCHS);
+
+  print <<EOF;
+}
+#endif
+EOF
+  common_bottom;
+}
+
+sub arm() {
+  determine_indirection("c", @ALL_ARCHS);
+
+  # Assign the helper variable for each enabled extension
+  foreach my $opt (@ALL_ARCHS) {
+    my $opt_uc = uc $opt;
+    eval "\$have_${opt}=\"flags & HAS_${opt_uc}\"";
+  }
+
+  common_top;
+  print <<EOF;
+#include "vpx_config.h"
+
+#ifdef RTCD_C
+#include "vpx_ports/arm.h"
+static void setup_rtcd_internal(void)
+{
+    int flags = arm_cpu_caps();
+
+    (void)flags;
+
+EOF
+
+  set_function_pointers("c", @ALL_ARCHS);
+
+  print <<EOF;
+}
+#endif
+EOF
+  common_bottom;
+}
+
+sub mips() {
+  determine_indirection("c", @ALL_ARCHS);
+  common_top;
+
+  print <<EOF;
+#include "vpx_config.h"
+
+#ifdef RTCD_C
+static void setup_rtcd_internal(void)
+{
+EOF
+
+  set_function_pointers("c", @ALL_ARCHS);
+
+  print <<EOF;
+#if HAVE_DSPR2
+#if CONFIG_VP8
+void dsputil_static_init();
+dsputil_static_init();
+#endif
+#if CONFIG_VP9
+void vp9_dsputil_static_init();
+vp9_dsputil_static_init();
+#endif
+#endif
+}
+#endif
+EOF
+  common_bottom;
+}
+
+sub unoptimized() {
+  determine_indirection "c";
+  common_top;
+  print <<EOF;
+#include "vpx_config.h"
+
+#ifdef RTCD_C
+static void setup_rtcd_internal(void)
+{
+EOF
+
+  set_function_pointers "c";
+
+  print <<EOF;
+}
+#endif
+EOF
+  common_bottom;
+}
+
+#
+# Main Driver
+#
+
+&require("c");
+if ($opts{arch} eq 'x86') {
+  @ALL_ARCHS = filter(qw/mmx sse sse2 sse3 ssse3 sse4_1 avx avx2/);
+  x86;
+} elsif ($opts{arch} eq 'x86_64') {
+  @ALL_ARCHS = filter(qw/mmx sse sse2 sse3 ssse3 sse4_1 avx avx2/);
+  @REQUIRES = filter(keys %required ? keys %required : qw/mmx sse sse2/);
+  &require(@REQUIRES);
+  x86;
+} elsif ($opts{arch} eq 'mips32') {
+  @ALL_ARCHS = filter(qw/mips32/);
+  open CONFIG_FILE, $opts{config} or
+    die "Error opening config file '$opts{config}': $!\n";
+  while (<CONFIG_FILE>) {
+    if (/HAVE_DSPR2=yes/) {
+      @ALL_ARCHS = filter(qw/mips32 dspr2/);
+      last;
+    }
+  }
+  close CONFIG_FILE;
+  mips;
+} elsif ($opts{arch} eq 'armv5te') {
+  @ALL_ARCHS = filter(qw/edsp/);
+  arm;
+} elsif ($opts{arch} eq 'armv6') {
+  @ALL_ARCHS = filter(qw/edsp media/);
+  arm;
+} elsif ($opts{arch} eq 'armv7') {
+  @ALL_ARCHS = filter(qw/edsp media neon/);
+  arm;
+} else {
+  unoptimized;
+}
+
+__END__
+
+=head1 NAME
+
+rtcd -
+
+=head1 SYNOPSIS
+
+Usage: rtcd.pl [options] FILE
+
+See 'perldoc rtcd.pl' for more details.
+
+=head1 DESCRIPTION
+
+Reads the Run Time CPU Detections definitions from FILE and generates a
+C header file on stdout.
+
+=head1 OPTIONS
+
+Options:
+  --arch=ARCH       Architecture to generate defs for (required)
+  --disable-EXT     Disable support for EXT extensions
+  --require-EXT     Require support for EXT extensions
+  --sym=SYMBOL      Unique symbol to use for RTCD initialization function
+  --config=FILE     File with CONFIG_FOO=yes lines to parse
diff --git a/build/make/rtcd.sh b/build/make/rtcd.sh
deleted file mode 100755
index 93c9adc..0000000
--- a/build/make/rtcd.sh
+++ /dev/null
@@ -1,373 +0,0 @@
-#!/bin/sh
-self=$0
-
-usage() {
-  cat <<EOF >&2
-Usage: $self [options] FILE
-
-Reads the Run Time CPU Detections definitions from FILE and generates a
-C header file on stdout.
-
-Options:
-  --arch=ARCH   Architecture to generate defs for (required)
-  --disable-EXT Disable support for EXT extensions
-  --require-EXT Require support for EXT extensions
-  --sym=SYMBOL  Unique symbol to use for RTCD initialization function
-  --config=FILE File with CONFIG_FOO=yes lines to parse
-EOF
-  exit 1
-}
-
-die() {
-  echo "$@" >&2
-  exit 1
-}
-
-die_argument_required() {
-  die "Option $opt requires argument"
-}
-
-for opt; do
-  optval="${opt#*=}"
-  case "$opt" in
-    --arch) die_argument_required;;
-    --arch=*) arch=${optval};;
-    --disable-*) eval "disable_${opt#--disable-}=true";;
-    --require-*) REQUIRES="${REQUIRES}${opt#--require-} ";;
-    --sym) die_argument_required;;
-    --sym=*) symbol=${optval};;
-    --config=*) config_file=${optval};;
-    -h|--help)
-      usage
-      ;;
-    -*)
-      die "Unrecognized option: ${opt%%=*}"
-      ;;
-    *)
-      defs_file="$defs_file $opt"
-      ;;
-  esac
-  shift
-done
-for f in $defs_file; do [ -f "$f" ] || usage; done
-[ -n "$arch" ] || usage
-
-# Import the configuration
-[ -f "$config_file" ] && eval $(grep CONFIG_ "$config_file")
-
-#
-# Routines for the RTCD DSL to call
-#
-prototype() {
-  rtyp=""
-  case "$1" in
-    unsigned) rtyp="$1 "; shift;;
-  esac
-  rtyp="${rtyp}$1"
-  fn="$2"
-  args="$3"
-
-  eval "${2}_rtyp='$rtyp'"
-  eval "${2}_args='$3'"
-  ALL_FUNCS="$ALL_FUNCS $fn"
-  specialize $fn c
-}
-
-specialize() {
-  fn="$1"
-  shift
-  for opt in "$@"; do
-    eval "${fn}_${opt}=${fn}_${opt}"
-  done
-}
-
-require() {
-  for fn in $ALL_FUNCS; do
-    for opt in "$@"; do
-      ofn=$(eval "echo \$${fn}_${opt}")
-      [ -z "$ofn" ] && continue
-
-      # if we already have a default, then we can disable it, as we know
-      # we can do better.
-      best=$(eval "echo \$${fn}_default")
-      best_ofn=$(eval "echo \$${best}")
-      [ -n "$best" ] && [ "$best_ofn" != "$ofn" ] && eval "${best}_link=false"
-      eval "${fn}_default=${fn}_${opt}"
-      eval "${fn}_${opt}_link=true"
-    done
-  done
-}
-
-forward_decls() {
-  ALL_FORWARD_DECLS="$ALL_FORWARD_DECLS $1"
-}
-
-#
-# Include the user's directives
-#
-for f in $defs_file; do
-  . $f
-done
-
-#
-# Process the directives according to the command line
-#
-process_forward_decls() {
-  for fn in $ALL_FORWARD_DECLS; do
-    eval $fn
-  done
-}
-
-determine_indirection() {
-  [ "$CONFIG_RUNTIME_CPU_DETECT" = "yes" ] || require $ALL_ARCHS
-  for fn in $ALL_FUNCS; do
-    n=""
-    rtyp="$(eval "echo \$${fn}_rtyp")"
-    args="$(eval "echo \"\$${fn}_args\"")"
-    dfn="$(eval "echo \$${fn}_default")"
-    dfn=$(eval "echo \$${dfn}")
-    for opt in "$@"; do
-      ofn=$(eval "echo \$${fn}_${opt}")
-      [ -z "$ofn" ] && continue
-      link=$(eval "echo \$${fn}_${opt}_link")
-      [ "$link" = "false" ] && continue
-      n="${n}x"
-    done
-    if [ "$n" = "x" ]; then
-      eval "${fn}_indirect=false"
-    else
-      eval "${fn}_indirect=true"
-    fi
-  done
-}
-
-declare_function_pointers() {
-  for fn in $ALL_FUNCS; do
-    rtyp="$(eval "echo \$${fn}_rtyp")"
-    args="$(eval "echo \"\$${fn}_args\"")"
-    dfn="$(eval "echo \$${fn}_default")"
-    dfn=$(eval "echo \$${dfn}")
-    for opt in "$@"; do
-      ofn=$(eval "echo \$${fn}_${opt}")
-      [ -z "$ofn" ] && continue
-      echo "$rtyp ${ofn}($args);"
-    done
-    if [ "$(eval "echo \$${fn}_indirect")" = "false" ]; then
-      echo "#define ${fn} ${dfn}"
-    else
-      echo "RTCD_EXTERN $rtyp (*${fn})($args);"
-    fi
-    echo
-  done
-}
-
-set_function_pointers() {
-  for fn in $ALL_FUNCS; do
-    n=""
-    rtyp="$(eval "echo \$${fn}_rtyp")"
-    args="$(eval "echo \"\$${fn}_args\"")"
-    dfn="$(eval "echo \$${fn}_default")"
-    dfn=$(eval "echo \$${dfn}")
-    if $(eval "echo \$${fn}_indirect"); then
-      echo "    $fn = $dfn;"
-      for opt in "$@"; do
-        ofn=$(eval "echo \$${fn}_${opt}")
-        [ -z "$ofn" ] && continue
-        [ "$ofn" = "$dfn" ] && continue;
-        link=$(eval "echo \$${fn}_${opt}_link")
-        [ "$link" = "false" ] && continue
-        cond="$(eval "echo \$have_${opt}")"
-        echo "    if (${cond}) $fn = $ofn;"
-      done
-    fi
-    echo
-  done
-}
-
-filter() {
-  filtered=""
-  for opt in "$@"; do
-    [ -z $(eval "echo \$disable_${opt}") ] && filtered="$filtered $opt"
-  done
-  echo $filtered
-}
-
-#
-# Helper functions for generating the arch specific RTCD files
-#
-common_top() {
-  outfile_basename=$(basename ${symbol:-rtcd})
-  include_guard=$(echo $outfile_basename | tr '[a-z]' '[A-Z]' | \
-    tr -c '[A-Z0-9]' _)H_
-  cat <<EOF
-#ifndef ${include_guard}
-#define ${include_guard}
-
-#ifdef RTCD_C
-#define RTCD_EXTERN
-#else
-#define RTCD_EXTERN extern
-#endif
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-$(process_forward_decls)
-
-$(declare_function_pointers c $ALL_ARCHS)
-
-void ${symbol:-rtcd}(void);
-EOF
-}
-
-common_bottom() {
-  cat <<EOF
-
-#ifdef __cplusplus
-}  // extern "C"
-#endif
-
-#endif
-EOF
-}
-
-x86() {
-  determine_indirection c $ALL_ARCHS
-
-  # Assign the helper variable for each enabled extension
-  for opt in $ALL_ARCHS; do
-    uc=$(echo $opt | tr '[a-z]' '[A-Z]')
-    eval "have_${opt}=\"flags & HAS_${uc}\""
-  done
-
-  cat <<EOF
-$(common_top)
-
-#ifdef RTCD_C
-#include "vpx_ports/x86.h"
-static void setup_rtcd_internal(void)
-{
-    int flags = x86_simd_caps();
-
-    (void)flags;
-
-$(set_function_pointers c $ALL_ARCHS)
-}
-#endif
-$(common_bottom)
-EOF
-}
-
-arm() {
-  determine_indirection c $ALL_ARCHS
-
-  # Assign the helper variable for each enabled extension
-  for opt in $ALL_ARCHS; do
-    uc=$(echo $opt | tr '[a-z]' '[A-Z]')
-    eval "have_${opt}=\"flags & HAS_${uc}\""
-  done
-
-  cat <<EOF
-$(common_top)
-#include "vpx_config.h"
-
-#ifdef RTCD_C
-#include "vpx_ports/arm.h"
-static void setup_rtcd_internal(void)
-{
-    int flags = arm_cpu_caps();
-
-    (void)flags;
-
-$(set_function_pointers c $ALL_ARCHS)
-}
-#endif
-$(common_bottom)
-EOF
-}
-
-
-mips() {
-  determine_indirection c $ALL_ARCHS
-  cat <<EOF
-$(common_top)
-#include "vpx_config.h"
-
-#ifdef RTCD_C
-static void setup_rtcd_internal(void)
-{
-$(set_function_pointers c $ALL_ARCHS)
-#if HAVE_DSPR2
-#if CONFIG_VP8
-void dsputil_static_init();
-dsputil_static_init();
-#endif
-#if CONFIG_VP9
-void vp9_dsputil_static_init();
-vp9_dsputil_static_init();
-#endif
-#endif
-}
-#endif
-$(common_bottom)
-EOF
-}
-
-unoptimized() {
-  determine_indirection c
-  cat <<EOF
-$(common_top)
-#include "vpx_config.h"
-
-#ifdef RTCD_C
-static void setup_rtcd_internal(void)
-{
-$(set_function_pointers c)
-}
-#endif
-$(common_bottom)
-EOF
-
-}
-#
-# Main Driver
-#
-ALL_FUNCS=$(export LC_ALL=C; echo $ALL_FUNCS | tr ' ' '\n' | sort |tr '\n' ' ')
-require c
-case $arch in
-  x86)
-    ALL_ARCHS=$(filter mmx sse sse2 sse3 ssse3 sse4_1 avx avx2)
-    x86
-    ;;
-  x86_64)
-    ALL_ARCHS=$(filter mmx sse sse2 sse3 ssse3 sse4_1 avx avx2)
-    REQUIRES=${REQUIRES:-mmx sse sse2}
-    require $(filter $REQUIRES)
-    x86
-    ;;
-  mips32)
-    ALL_ARCHS=$(filter mips32)
-    dspr2=$([ -f "$config_file" ] && eval echo $(grep HAVE_DSPR2 "$config_file"))
-    HAVE_DSPR2="${dspr2#*=}"
-    if [ "$HAVE_DSPR2" = "yes" ]; then
-        ALL_ARCHS=$(filter mips32 dspr2)
-    fi
-    mips
-    ;;
-  armv5te)
-    ALL_ARCHS=$(filter edsp)
-    arm
-    ;;
-  armv6)
-    ALL_ARCHS=$(filter edsp media)
-    arm
-    ;;
-  armv7)
-    ALL_ARCHS=$(filter edsp media neon)
-    arm
-    ;;
-  *)
-    unoptimized
-    ;;
-esac
diff --git a/build/make/thumb.pm b/build/make/thumb.pm
index d8d04aa..9604c8e 100644
--- a/build/make/thumb.pm
+++ b/build/make/thumb.pm
@@ -1,4 +1,4 @@
-#!/usr/bin/perl
+#!/usr/bin/env perl
 ##
 ##  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
 ##
diff --git a/configure b/configure
index 9f5a435..f37615f 100755
--- a/configure
+++ b/configure
@@ -160,6 +160,10 @@
     [ -f ${source_path}/${t}.mk ] && enable_feature ${t}
 done
 
+if ! perl --version >/dev/null; then
+    die "Perl is required to build"
+fi
+
 # check installed doxygen version
 doxy_version=$(doxygen --version 2>/dev/null)
 doxy_major=${doxy_version%%.*}
diff --git a/examples.mk b/examples.mk
index 85b8457..aeb54ab 100644
--- a/examples.mk
+++ b/examples.mk
@@ -142,10 +142,6 @@
 endif
 decode_with_partial_drops.GUID           = 61C2D026-5754-46AC-916F-1343ECC5537E
 decode_with_partial_drops.DESCRIPTION    = Drops parts of frames while decoding
-EXAMPLES-$(CONFIG_VP8_ENCODER)  += error_resilient.c
-error_resilient.GUID             = DF5837B9-4145-4F92-A031-44E4F832E00C
-error_resilient.DESCRIPTION      = Error Resiliency Feature
-
 EXAMPLES-$(CONFIG_VP8_ENCODER)     += vp8_set_maps.c
 vp8_set_maps.SRCS                  += ivfenc.h ivfenc.c
 vp8_set_maps.SRCS                  += tools_common.h tools_common.c
diff --git a/examples/error_resilient.c b/examples/error_resilient.c
deleted file mode 100644
index 19235c8..0000000
--- a/examples/error_resilient.c
+++ /dev/null
@@ -1,223 +0,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-// Error Resiliency Features
-// =========================
-//
-// This is an example demonstrating how to enable the error resiliency
-// features of the codec.
-//
-// Configuration
-// -------------
-// Error resiliency is controlled by the g_error_resilient member of the
-// configuration structure.
-//
-// Observing The Effects
-// ---------------------
-// Use the `decode_with_drops` example to decode with frames 5-10 dropped.
-// Compare the output for a file encoded with this example versus one
-// encoded with the `simple_encoder` example.
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <stdarg.h>
-#include <string.h>
-#define VPX_CODEC_DISABLE_COMPAT 1
-#include "vpx/vpx_encoder.h"
-#include "vpx/vp8cx.h"
-#define interface (vpx_codec_vp8_cx())
-#define fourcc    0x30385056
-
-#define IVF_FILE_HDR_SZ  (32)
-#define IVF_FRAME_HDR_SZ (12)
-
-static void mem_put_le16(char *mem, unsigned int val) {
-    mem[0] = val;
-    mem[1] = val>>8;
-}
-
-static void mem_put_le32(char *mem, unsigned int val) {
-    mem[0] = val;
-    mem[1] = val>>8;
-    mem[2] = val>>16;
-    mem[3] = val>>24;
-}
-
-static void die(const char *fmt, ...) {
-    va_list ap;
-
-    va_start(ap, fmt);
-    vprintf(fmt, ap);
-    if(fmt[strlen(fmt)-1] != '\n')
-        printf("\n");
-    exit(EXIT_FAILURE);
-}
-
-static void die_codec(vpx_codec_ctx_t *ctx, const char *s) {
-    const char *detail = vpx_codec_error_detail(ctx);
-
-    printf("%s: %s\n", s, vpx_codec_error(ctx));
-    if(detail)
-        printf("    %s\n",detail);
-    exit(EXIT_FAILURE);
-}
-
-static int read_frame(FILE *f, vpx_image_t *img) {
-    size_t nbytes, to_read;
-    int    res = 1;
-
-    to_read = img->w*img->h*3/2;
-    nbytes = fread(img->planes[0], 1, to_read, f);
-    if(nbytes != to_read) {
-        res = 0;
-        if(nbytes > 0)
-            printf("Warning: Read partial frame. Check your width & height!\n");
-    }
-    return res;
-}
-
-static void write_ivf_file_header(FILE *outfile,
-                                  const vpx_codec_enc_cfg_t *cfg,
-                                  int frame_cnt) {
-    char header[32];
-
-    if(cfg->g_pass != VPX_RC_ONE_PASS && cfg->g_pass != VPX_RC_LAST_PASS)
-        return;
-    header[0] = 'D';
-    header[1] = 'K';
-    header[2] = 'I';
-    header[3] = 'F';
-    mem_put_le16(header+4,  0);                   /* version */
-    mem_put_le16(header+6,  32);                  /* headersize */
-    mem_put_le32(header+8,  fourcc);              /* headersize */
-    mem_put_le16(header+12, cfg->g_w);            /* width */
-    mem_put_le16(header+14, cfg->g_h);            /* height */
-    mem_put_le32(header+16, cfg->g_timebase.den); /* rate */
-    mem_put_le32(header+20, cfg->g_timebase.num); /* scale */
-    mem_put_le32(header+24, frame_cnt);           /* length */
-    mem_put_le32(header+28, 0);                   /* unused */
-
-    (void) fwrite(header, 1, 32, outfile);
-}
-
-
-static void write_ivf_frame_header(FILE *outfile,
-                                   const vpx_codec_cx_pkt_t *pkt)
-{
-    char             header[12];
-    vpx_codec_pts_t  pts;
-
-    if(pkt->kind != VPX_CODEC_CX_FRAME_PKT)
-        return;
-
-    pts = pkt->data.frame.pts;
-    mem_put_le32(header, (unsigned int)pkt->data.frame.sz);
-    mem_put_le32(header+4, pts&0xFFFFFFFF);
-    mem_put_le32(header+8, pts >> 32);
-
-    (void) fwrite(header, 1, 12, outfile);
-}
-
-int main(int argc, char **argv) {
-    FILE                *infile, *outfile;
-    vpx_codec_ctx_t      codec;
-    vpx_codec_enc_cfg_t  cfg;
-    int                  frame_cnt = 0;
-    vpx_image_t          raw;
-    vpx_codec_err_t      res;
-    long                 width;
-    long                 height;
-    int                  frame_avail;
-    int                  got_data;
-    int                  flags = 0;
-
-    /* Open files */
-    if(argc!=5)
-        die("Usage: %s <width> <height> <infile> <outfile>\n", argv[0]);
-    width = strtol(argv[1], NULL, 0);
-    height = strtol(argv[2], NULL, 0);
-    if(width < 16 || width%2 || height <16 || height%2)
-        die("Invalid resolution: %ldx%ld", width, height);
-    if(!vpx_img_alloc(&raw, VPX_IMG_FMT_I420, width, height, 1))
-        die("Faile to allocate image", width, height);
-    if(!(outfile = fopen(argv[4], "wb")))
-        die("Failed to open %s for writing", argv[4]);
-
-    printf("Using %s\n",vpx_codec_iface_name(interface));
-
-    /* Populate encoder configuration */
-    res = vpx_codec_enc_config_default(interface, &cfg, 0);
-    if(res) {
-        printf("Failed to get config: %s\n", vpx_codec_err_to_string(res));
-        return EXIT_FAILURE;
-    }
-
-    /* Update the default configuration with our settings */
-    cfg.rc_target_bitrate = width * height * cfg.rc_target_bitrate
-                            / cfg.g_w / cfg.g_h;
-    cfg.g_w = width;
-    cfg.g_h = height;
-
-    /* Enable error resilient mode */
-    cfg.g_error_resilient = 1;
-
-    write_ivf_file_header(outfile, &cfg, 0);
-
-
-        /* Open input file for this encoding pass */
-        if(!(infile = fopen(argv[3], "rb")))
-            die("Failed to open %s for reading", argv[3]);
-
-        /* Initialize codec */
-        if(vpx_codec_enc_init(&codec, interface, &cfg, 0))
-            die_codec(&codec, "Failed to initialize encoder");
-
-        frame_avail = 1;
-        got_data = 0;
-        while(frame_avail || got_data) {
-            vpx_codec_iter_t iter = NULL;
-            const vpx_codec_cx_pkt_t *pkt;
-
-            frame_avail = read_frame(infile, &raw);
-            if(vpx_codec_encode(&codec, frame_avail? &raw : NULL, frame_cnt,
-                                1, flags, VPX_DL_REALTIME))
-                die_codec(&codec, "Failed to encode frame");
-            got_data = 0;
-            while( (pkt = vpx_codec_get_cx_data(&codec, &iter)) ) {
-                got_data = 1;
-                switch(pkt->kind) {
-                case VPX_CODEC_CX_FRAME_PKT:
-                    write_ivf_frame_header(outfile, pkt);
-                    (void) fwrite(pkt->data.frame.buf, 1, pkt->data.frame.sz,
-                                  outfile);
-                    break;
-                default:
-                    break;
-                }
-                printf(pkt->kind == VPX_CODEC_CX_FRAME_PKT
-                       && (pkt->data.frame.flags & VPX_FRAME_IS_KEY)? "K":".");
-                fflush(stdout);
-            }
-            frame_cnt++;
-        }
-        printf("\n");
-        fclose(infile);
-
-    printf("Processed %d frames.\n",frame_cnt-1);
-    vpx_img_free(&raw);
-    if(vpx_codec_destroy(&codec))
-        die_codec(&codec, "Failed to destroy codec");
-
-    /* Try to rewrite the file header with the actual frame count */
-    if(!fseek(outfile, 0, SEEK_SET))
-        write_ivf_file_header(outfile, &cfg, frame_cnt-1);
-    fclose(outfile);
-    return EXIT_SUCCESS;
-}
diff --git a/examples/simple_encoder.c b/examples/simple_encoder.c
index e419e81..6ecd498 100644
--- a/examples/simple_encoder.c
+++ b/examples/simple_encoder.c
@@ -36,7 +36,7 @@
 // ---------------------------------
 // Encoders have the notion of "usage profiles." For example, an encoder
 // may want to publish default configurations for both a video
-// conferencing appliction and a best quality offline encoder. These
+// conferencing application and a best quality offline encoder. These
 // obviously have very different default settings. Consult the
 // documentation for your codec to see if it provides any default
 // configurations. All codecs provide a default configuration, number 0,
@@ -80,6 +80,13 @@
 // an error, a descriptive message is printed and the program exits. With
 // few exeptions, vpx_codec functions return an enumerated error status,
 // with the value `0` indicating success.
+//
+// Error Resiliency Features
+// -------------------------
+// Error resiliency is controlled by the g_error_resilient member of the
+// configuration structure. Use the `decode_with_drops` example to decode with
+// frames 5-10 dropped. Compare the output for a file encoded with this example
+// versus one encoded with the `simple_encoder` example.
 
 #include <stdio.h>
 #include <stdlib.h>
@@ -94,7 +101,10 @@
 static const char *exec_name;
 
 void usage_exit() {
-  fprintf(stderr, "Usage: %s <codec> <width> <height> <infile> <outfile>\n",
+  fprintf(stderr,
+          "Usage: %s <codec> <width> <height> <infile> <outfile> "
+              "[<error-resilient>]\nSee comments in simple_encoder.c for more "
+              "information.\n",
           exec_name);
   exit(EXIT_FAILURE);
 }
@@ -138,17 +148,23 @@
   const VpxInterface *encoder = NULL;
   const int fps = 30;        // TODO(dkovalev) add command line argument
   const int bitrate = 200;   // kbit/s TODO(dkovalev) add command line argument
-  const char *const codec_arg = argv[1];
-  const char *const width_arg = argv[2];
-  const char *const height_arg = argv[3];
-  const char *const infile_arg = argv[4];
-  const char *const outfile_arg = argv[5];
+  const char *codec_arg = NULL;
+  const char *width_arg = NULL;
+  const char *height_arg = NULL;
+  const char *infile_arg = NULL;
+  const char *outfile_arg = NULL;
 
   exec_name = argv[0];
 
-  if (argc != 6)
+  if (argc < 6)
     die("Invalid number of arguments");
 
+  codec_arg = argv[1];
+  width_arg = argv[2];
+  height_arg = argv[3];
+  infile_arg = argv[4];
+  outfile_arg = argv[5];
+
   encoder = get_vpx_encoder_by_name(codec_arg);
   if (!encoder)
      die("Unsupported codec.");
@@ -182,6 +198,7 @@
   cfg.g_timebase.num = info.time_base.numerator;
   cfg.g_timebase.den = info.time_base.denominator;
   cfg.rc_target_bitrate = bitrate;
+  cfg.g_error_resilient = argc > 6 ? strtol(argv[6], NULL, 0) : 0;
 
   writer = vpx_video_writer_open(outfile_arg, kContainerIVF, &info);
   if (!writer)
diff --git a/libs.mk b/libs.mk
index 302d2af..a5c4b76 100644
--- a/libs.mk
+++ b/libs.mk
@@ -49,7 +49,7 @@
 define rtcd_h_template
 $$(BUILD_PFX)$(1).h: $$(SRC_PATH_BARE)/$(2)
 	@echo "    [CREATE] $$@"
-	$$(qexec)$$(SRC_PATH_BARE)/build/make/rtcd.sh --arch=$$(TGT_ISA) \
+	$$(qexec)$$(SRC_PATH_BARE)/build/make/rtcd.pl --arch=$$(TGT_ISA) \
           --sym=$(1) \
           --config=$$(CONFIG_DIR)$$(target)$$(if $$(FAT_ARCHS),,-$$(TOOLCHAIN)).mk \
           $$(RTCD_OPTIONS) $$^ > $$@
@@ -162,7 +162,7 @@
 endif
 
 CODEC_SRCS-$(BUILD_LIBVPX) += build/make/version.sh
-CODEC_SRCS-$(BUILD_LIBVPX) += build/make/rtcd.sh
+CODEC_SRCS-$(BUILD_LIBVPX) += build/make/rtcd.pl
 CODEC_SRCS-$(BUILD_LIBVPX) += vpx_ports/emmintrin_compat.h
 CODEC_SRCS-$(BUILD_LIBVPX) += vpx_ports/mem_ops.h
 CODEC_SRCS-$(BUILD_LIBVPX) += vpx_ports/mem_ops_aligned.h
@@ -236,6 +236,13 @@
             --out=$@ $^
 CLEAN-OBJS += vpx.def
 
+# Assembly files that are included, but don't define symbols themselves.
+# Filtered out to avoid Visual Studio build warnings.
+ASM_INCLUDES := \
+    third_party/x86inc/x86inc.asm \
+    vpx_config.asm \
+    vpx_ports/x86_abi_support.asm \
+
 vpx.$(VCPROJ_SFX): $(CODEC_SRCS) vpx.def obj_int_extract.$(VCPROJ_SFX)
 	@echo "    [CREATE] $@"
 	$(qexec)$(GEN_VCPROJ) \
@@ -246,7 +253,8 @@
             --proj-guid=DCE19DAF-69AC-46DB-B14A-39F0FAA5DB74 \
             --module-def=vpx.def \
             --ver=$(CONFIG_VS_VERSION) \
-            --out=$@ $(CFLAGS) $^ \
+            --out=$@ $(CFLAGS) \
+            $(filter-out $(addprefix %, $(ASM_INCLUDES)), $^) \
             --src-path-bare="$(SRC_PATH_BARE)" \
 
 PROJECTS-$(BUILD_LIBVPX) += vpx.$(VCPROJ_SFX)
diff --git a/test/sad_test.cc b/test/sad_test.cc
index 401fa1d..a692891 100644
--- a/test/sad_test.cc
+++ b/test/sad_test.cc
@@ -545,10 +545,12 @@
 
 #if HAVE_SSSE3
 #if CONFIG_USE_X86INC
+#if CONFIG_VP8_ENCODER
 const sad_m_by_n_fn_t sad_16x16_sse3 = vp8_sad16x16_sse3;
 INSTANTIATE_TEST_CASE_P(SSE3, SADTest, ::testing::Values(
                         make_tuple(16, 16, sad_16x16_sse3)));
 #endif
 #endif
+#endif
 
 }  // namespace
diff --git a/test/vp8_boolcoder_test.cc b/test/vp8_boolcoder_test.cc
index 7c6c601..9cd1987 100644
--- a/test/vp8_boolcoder_test.cc
+++ b/test/vp8_boolcoder_test.cc
@@ -35,14 +35,14 @@
   0x89, 0x9a, 0xab, 0xbc, 0xcd, 0xde, 0xef, 0xf0
 };
 
-void encrypt_buffer(uint8_t *buffer, int size) {
-  for (int i = 0; i < size; ++i) {
+void encrypt_buffer(uint8_t *buffer, size_t size) {
+  for (size_t i = 0; i < size; ++i) {
     buffer[i] ^= secret_key[i & 15];
   }
 }
 
 void test_decrypt_cb(void *decrypt_state, const uint8_t *input,
-                           uint8_t *output, int count) {
+                     uint8_t *output, int count) {
   const size_t offset = input - reinterpret_cast<uint8_t*>(decrypt_state);
   for (int i = 0; i < count; i++) {
     output[i] = input[i] ^ secret_key[(offset + i) & 15];
diff --git a/test/vp8_decrypt_test.cc b/test/vp8_decrypt_test.cc
index b092509..1b5b083 100644
--- a/test/vp8_decrypt_test.cc
+++ b/test/vp8_decrypt_test.cc
@@ -26,9 +26,9 @@
   0x89, 0x9a, 0xab, 0xbc, 0xcd, 0xde, 0xef, 0xf0
 };
 
-void encrypt_buffer(const uint8_t *src, uint8_t *dst,
-                    int size, int offset = 0) {
-  for (int i = 0; i < size; ++i) {
+void encrypt_buffer(const uint8_t *src, uint8_t *dst, size_t size,
+                    ptrdiff_t offset) {
+  for (size_t i = 0; i < size; ++i) {
     dst[i] = src[i] ^ test_key[(offset + i) & 15];
   }
 }
@@ -61,7 +61,7 @@
 
 #if CONFIG_DECRYPT
   std::vector<uint8_t> encrypted(video.frame_size());
-  encrypt_buffer(video.cxdata(), &encrypted[0], video.frame_size());
+  encrypt_buffer(video.cxdata(), &encrypted[0], video.frame_size(), 0);
   vp8_decrypt_init di = { test_decrypt_cb, &encrypted[0] };
   decoder.Control(VP8D_SET_DECRYPTOR, &di);
 #endif  // CONFIG_DECRYPT
diff --git a/third_party/nestegg/README.webm b/third_party/nestegg/README.webm
index c931168..8e3760b 100644
--- a/third_party/nestegg/README.webm
+++ b/third_party/nestegg/README.webm
@@ -14,3 +14,11 @@
 - 0002-ne_read_simple-uninitialized_variable.diff
   fixes:
 nestegg.c|975 col 6| warning: ‘r’ may be used uninitialized in this function [-Wuninitialized]
+- add ne_get_uint32 convenience function
+- fix track_number uint64->uint32 warnings
+- fix track_scale double->uint64 warning
+- nestegg_packet_track: fix uint64->uint32 warning
+- ne_read_(string|binary|block): normalize size_t usage
+- ne_parse: normalize size_t usage
+- quiet read related uint64->size_t warnings
+- ne_buffer_read: quiet uint64->size_t warning
diff --git a/third_party/nestegg/src/nestegg.c b/third_party/nestegg/src/nestegg.c
index 30e0e2b..c7e2b02 100644
--- a/third_party/nestegg/src/nestegg.c
+++ b/third_party/nestegg/src/nestegg.c
@@ -694,14 +694,15 @@
 {
   char * str;
   int r;
+  const size_t alloc_size = (size_t)length + 1;
 
   if (length == 0 || length > LIMIT_STRING)
     return -1;
-  str = ne_pool_alloc(length + 1, ctx->alloc_pool);
-  r = ne_io_read(ctx->io, (unsigned char *) str, length);
+  str = ne_pool_alloc(alloc_size, ctx->alloc_pool);
+  r = ne_io_read(ctx->io, (unsigned char *) str, alloc_size - 1);
   if (r != 1)
     return r;
-  str[length] = '\0';
+  str[alloc_size - 1] = '\0';
   *val = str;
   return 1;
 }
@@ -711,9 +712,9 @@
 {
   if (length == 0 || length > LIMIT_BINARY)
     return -1;
-  val->data = ne_pool_alloc(length, ctx->alloc_pool);
-  val->length = length;
-  return ne_io_read(ctx->io, val->data, length);
+  val->length = (size_t)length;
+  val->data = ne_pool_alloc(val->length, ctx->alloc_pool);
+  return ne_io_read(ctx->io, val->data, val->length);
 }
 
 static int
@@ -730,6 +731,20 @@
 }
 
 static int
+ne_get_uint32(struct ebml_type type, unsigned int * value)
+{
+  uint64_t v;
+  if (ne_get_uint(type, &v))
+    return -1;
+
+  assert((unsigned int)v == v);
+
+  *value = (unsigned int)v;
+
+  return 0;
+}
+
+static int
 ne_get_float(struct ebml_type type, double * value)
 {
   if (!type.read)
@@ -1029,7 +1044,7 @@
           ne_read_single_master(ctx, element);
         continue;
       } else {
-        r = ne_read_simple(ctx, element, size);
+        r = ne_read_simple(ctx, element, (size_t)size);
         if (r < 0)
           break;
       }
@@ -1048,7 +1063,7 @@
 
       if (id != ID_VOID && id != ID_CRC32)
         ctx->log(ctx, NESTEGG_LOG_DEBUG, "unknown element %llx", id);
-      r = ne_io_read_skip(ctx->io, size);
+      r = ne_io_read_skip(ctx->io, (size_t)size);
       if (r != 1)
         break;
     }
@@ -1137,7 +1152,8 @@
   r = ne_read_vint(io, &lace, &length);
   if (r != 1)
     return r;
-  *read += length;
+  assert(length <= 8);
+  *read += (size_t)length;
 
   sizes[i] = lace;
   sum = sizes[i];
@@ -1149,7 +1165,8 @@
     r = ne_read_svint(io, &slace, &length);
     if (r != 1)
       return r;
-    *read += length;
+    assert(length <= 8);
+    *read += (size_t)length;
     sizes[i] = sizes[i - 1] + slace;
     sum += sizes[i];
     i += 1;
@@ -1232,7 +1249,7 @@
   struct cluster * cluster;
   struct frame * f, * last;
   struct track_entry * entry;
-  double track_scale;
+  const int track_scale = 1;
   uint64_t track_number, length, frame_sizes[256], cluster_tc, flags, frames, tc_scale, total;
   unsigned int i, lacing, track;
   size_t consumed = 0;
@@ -1246,10 +1263,11 @@
   if (r != 1)
     return r;
 
-  if (track_number == 0)
+  if (track_number == 0 || (unsigned int)track_number != track_number)
     return -1;
 
-  consumed += length;
+  assert(length <= 8);
+  consumed += (size_t)length;
 
   r = ne_read_int(ctx->io, &timecode, 2);
   if (r != 1)
@@ -1293,7 +1311,7 @@
   case LACING_XIPH:
     if (frames == 1)
       return -1;
-    r = ne_read_xiph_lacing(ctx->io, block_size, &consumed, frames, frame_sizes);
+    r = ne_read_xiph_lacing(ctx->io, (size_t)block_size, &consumed, frames, frame_sizes);
     if (r != 1)
       return r;
     break;
@@ -1306,7 +1324,7 @@
   case LACING_EBML:
     if (frames == 1)
       return -1;
-    r = ne_read_ebml_lacing(ctx->io, block_size, &consumed, frames, frame_sizes);
+    r = ne_read_ebml_lacing(ctx->io, (size_t)block_size, &consumed, frames, frame_sizes);
     if (r != 1)
       return r;
     break;
@@ -1319,15 +1337,13 @@
   if (total > block_size)
     return -1;
 
-  if (ne_map_track_number_to_index(ctx, track_number, &track) != 0)
+  if (ne_map_track_number_to_index(ctx, (unsigned int)track_number, &track) != 0)
     return -1;
 
   entry = ne_find_track_entry(ctx, track);
   if (!entry)
     return -1;
 
-  track_scale = 1.0;
-
   tc_scale = ne_get_timecode_scale(ctx);
 
   assert(ctx->segment.cluster.tail->id == ID_CLUSTER);
@@ -1353,9 +1369,9 @@
       return -1;
     }
     f = ne_alloc(sizeof(*f));
-    f->data = ne_alloc(frame_sizes[i]);
-    f->length = frame_sizes[i];
-    r = ne_io_read(ctx->io, f->data, frame_sizes[i]);
+    f->length = (size_t)frame_sizes[i];
+    f->data = ne_alloc(f->length);
+    r = ne_io_read(ctx->io, f->data, f->length);
     if (r != 1) {
       free(f->data);
       free(f);
@@ -1394,7 +1410,8 @@
   if (!element)
     return 1;
 
-  r = ne_read_simple(ctx, element, size);
+  assert((size_t)size == size);
+  r = ne_read_simple(ctx, element, (size_t)size);
   if (r != 1)
     return r;
   storage = (struct ebml_type *) (ctx->ancestor->data + element->offset);
@@ -1451,13 +1468,13 @@
 ne_find_cue_position_for_track(nestegg * ctx, struct ebml_list_node * node, unsigned int track)
 {
   struct cue_track_positions * pos = NULL;
-  uint64_t track_number;
+  unsigned int track_number;
   unsigned int t;
 
   while (node) {
     assert(node->id == ID_CUE_TRACK_POSITIONS);
     pos = node->data;
-    if (ne_get_uint(pos->track, &track_number) != 0)
+    if (ne_get_uint32(pos->track, &track_number) != 0)
       return NULL;
 
     if (ne_map_track_number_to_index(ctx, track_number, &t) != 0)
@@ -1588,7 +1605,7 @@
   struct sniff_buffer * sb = user_data;
 
   int rv = 1;
-  size_t available = sb->length - sb->offset;
+  size_t available = sb->length - (size_t)sb->offset;
 
   if (available < length)
     return 0;
@@ -1844,7 +1861,7 @@
         if (ne_get_uint(pos->track, &track_number) != 0)
           return -1;
 
-        if (ne_map_track_number_to_index(ctx, track_number, &track_index) != 0)
+        if (ne_map_track_number_to_index(ctx, (unsigned int)track_number, &track_index) != 0)
           return -1;
 
         if (track_index == track) {
@@ -2062,7 +2079,7 @@
         p += sizes[i];
       }
       *data = p;
-      *length = sizes[item];
+      *length = (size_t)sizes[item];
   } else {
     *data = codec_private.data;
     *length = codec_private.length;
@@ -2076,7 +2093,7 @@
                            nestegg_video_params * params)
 {
   struct track_entry * entry;
-  uint64_t value;
+  unsigned int value;
 
   memset(params, 0, sizeof(*params));
 
@@ -2088,41 +2105,41 @@
     return -1;
 
   value = 0;
-  ne_get_uint(entry->video.stereo_mode, &value);
+  ne_get_uint32(entry->video.stereo_mode, &value);
   if (value <= NESTEGG_VIDEO_STEREO_TOP_BOTTOM ||
       value == NESTEGG_VIDEO_STEREO_RIGHT_LEFT)
     params->stereo_mode = value;
 
-  if (ne_get_uint(entry->video.pixel_width, &value) != 0)
+  if (ne_get_uint32(entry->video.pixel_width, &value) != 0)
     return -1;
   params->width = value;
 
-  if (ne_get_uint(entry->video.pixel_height, &value) != 0)
+  if (ne_get_uint32(entry->video.pixel_height, &value) != 0)
     return -1;
   params->height = value;
 
   value = 0;
-  ne_get_uint(entry->video.pixel_crop_bottom, &value);
+  ne_get_uint32(entry->video.pixel_crop_bottom, &value);
   params->crop_bottom = value;
 
   value = 0;
-  ne_get_uint(entry->video.pixel_crop_top, &value);
+  ne_get_uint32(entry->video.pixel_crop_top, &value);
   params->crop_top = value;
 
   value = 0;
-  ne_get_uint(entry->video.pixel_crop_left, &value);
+  ne_get_uint32(entry->video.pixel_crop_left, &value);
   params->crop_left = value;
 
   value = 0;
-  ne_get_uint(entry->video.pixel_crop_right, &value);
+  ne_get_uint32(entry->video.pixel_crop_right, &value);
   params->crop_right = value;
 
   value = params->width;
-  ne_get_uint(entry->video.display_width, &value);
+  ne_get_uint32(entry->video.display_width, &value);
   params->display_width = value;
 
   value = params->height;
-  ne_get_uint(entry->video.display_height, &value);
+  ne_get_uint32(entry->video.display_height, &value);
   params->display_height = value;
 
   return 0;
@@ -2133,7 +2150,7 @@
                            nestegg_audio_params * params)
 {
   struct track_entry * entry;
-  uint64_t value;
+  unsigned int value;
 
   memset(params, 0, sizeof(*params));
 
@@ -2148,19 +2165,19 @@
   ne_get_float(entry->audio.sampling_frequency, &params->rate);
 
   value = 1;
-  ne_get_uint(entry->audio.channels, &value);
+  ne_get_uint32(entry->audio.channels, &value);
   params->channels = value;
 
   value = 16;
-  ne_get_uint(entry->audio.bit_depth, &value);
+  ne_get_uint32(entry->audio.bit_depth, &value);
   params->depth = value;
 
   value = 0;
-  ne_get_uint(entry->codec_delay, &value);
+  ne_get_uint32(entry->codec_delay, &value);
   params->codec_delay = value;
 
   value = 0;
-  ne_get_uint(entry->seek_preroll, &value);
+  ne_get_uint32(entry->seek_preroll, &value);
   params->seek_preroll = value;
 
   return 0;
@@ -2224,7 +2241,7 @@
 int
 nestegg_packet_track(nestegg_packet * pkt, unsigned int * track)
 {
-  *track = pkt->track;
+  *track = (unsigned int)pkt->track;
   return 0;
 }
 
diff --git a/vp8/common/rtcd_defs.pl b/vp8/common/rtcd_defs.pl
new file mode 100644
index 0000000..130d965
--- /dev/null
+++ b/vp8/common/rtcd_defs.pl
@@ -0,0 +1,541 @@
+sub vp8_common_forward_decls() {
+print <<EOF
+/*
+ * VP8
+ */
+
+struct blockd;
+struct macroblockd;
+struct loop_filter_info;
+
+/* Encoder forward decls */
+struct block;
+struct macroblock;
+struct variance_vtable;
+union int_mv;
+struct yv12_buffer_config;
+EOF
+}
+forward_decls qw/vp8_common_forward_decls/;
+
+#
+# system state
+#
+add_proto qw/void vp8_clear_system_state/, "";
+specialize qw/vp8_clear_system_state mmx/;
+$vp8_clear_system_state_mmx=vpx_reset_mmx_state;
+
+#
+# Dequant
+#
+add_proto qw/void vp8_dequantize_b/, "struct blockd*, short *dqc";
+specialize qw/vp8_dequantize_b mmx media neon/;
+$vp8_dequantize_b_media=vp8_dequantize_b_v6;
+
+add_proto qw/void vp8_dequant_idct_add/, "short *input, short *dq, unsigned char *output, int stride";
+specialize qw/vp8_dequant_idct_add mmx media neon dspr2/;
+$vp8_dequant_idct_add_media=vp8_dequant_idct_add_v6;
+$vp8_dequant_idct_add_dspr2=vp8_dequant_idct_add_dspr2;
+
+add_proto qw/void vp8_dequant_idct_add_y_block/, "short *q, short *dq, unsigned char *dst, int stride, char *eobs";
+specialize qw/vp8_dequant_idct_add_y_block mmx sse2 media neon dspr2/;
+$vp8_dequant_idct_add_y_block_media=vp8_dequant_idct_add_y_block_v6;
+$vp8_dequant_idct_add_y_block_dspr2=vp8_dequant_idct_add_y_block_dspr2;
+
+add_proto qw/void vp8_dequant_idct_add_uv_block/, "short *q, short *dq, unsigned char *dst_u, unsigned char *dst_v, int stride, char *eobs";
+specialize qw/vp8_dequant_idct_add_uv_block mmx sse2 media neon dspr2/;
+$vp8_dequant_idct_add_uv_block_media=vp8_dequant_idct_add_uv_block_v6;
+$vp8_dequant_idct_add_y_block_dspr2=vp8_dequant_idct_add_y_block_dspr2;
+
+#
+# Loopfilter
+#
+add_proto qw/void vp8_loop_filter_mbv/, "unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi";
+specialize qw/vp8_loop_filter_mbv mmx sse2 media neon dspr2/;
+$vp8_loop_filter_mbv_media=vp8_loop_filter_mbv_armv6;
+$vp8_loop_filter_mbv_dspr2=vp8_loop_filter_mbv_dspr2;
+
+add_proto qw/void vp8_loop_filter_bv/, "unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi";
+specialize qw/vp8_loop_filter_bv mmx sse2 media neon dspr2/;
+$vp8_loop_filter_bv_media=vp8_loop_filter_bv_armv6;
+$vp8_loop_filter_bv_dspr2=vp8_loop_filter_bv_dspr2;
+
+add_proto qw/void vp8_loop_filter_mbh/, "unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi";
+specialize qw/vp8_loop_filter_mbh mmx sse2 media neon dspr2/;
+$vp8_loop_filter_mbh_media=vp8_loop_filter_mbh_armv6;
+$vp8_loop_filter_mbh_dspr2=vp8_loop_filter_mbh_dspr2;
+
+add_proto qw/void vp8_loop_filter_bh/, "unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi";
+specialize qw/vp8_loop_filter_bh mmx sse2 media neon dspr2/;
+$vp8_loop_filter_bh_media=vp8_loop_filter_bh_armv6;
+$vp8_loop_filter_bh_dspr2=vp8_loop_filter_bh_dspr2;
+
+
+add_proto qw/void vp8_loop_filter_simple_mbv/, "unsigned char *y, int ystride, const unsigned char *blimit";
+specialize qw/vp8_loop_filter_simple_mbv mmx sse2 media neon/;
+$vp8_loop_filter_simple_mbv_c=vp8_loop_filter_simple_vertical_edge_c;
+$vp8_loop_filter_simple_mbv_mmx=vp8_loop_filter_simple_vertical_edge_mmx;
+$vp8_loop_filter_simple_mbv_sse2=vp8_loop_filter_simple_vertical_edge_sse2;
+$vp8_loop_filter_simple_mbv_media=vp8_loop_filter_simple_vertical_edge_armv6;
+$vp8_loop_filter_simple_mbv_neon=vp8_loop_filter_mbvs_neon;
+
+add_proto qw/void vp8_loop_filter_simple_mbh/, "unsigned char *y, int ystride, const unsigned char *blimit";
+specialize qw/vp8_loop_filter_simple_mbh mmx sse2 media neon/;
+$vp8_loop_filter_simple_mbh_c=vp8_loop_filter_simple_horizontal_edge_c;
+$vp8_loop_filter_simple_mbh_mmx=vp8_loop_filter_simple_horizontal_edge_mmx;
+$vp8_loop_filter_simple_mbh_sse2=vp8_loop_filter_simple_horizontal_edge_sse2;
+$vp8_loop_filter_simple_mbh_media=vp8_loop_filter_simple_horizontal_edge_armv6;
+$vp8_loop_filter_simple_mbh_neon=vp8_loop_filter_mbhs_neon;
+
+add_proto qw/void vp8_loop_filter_simple_bv/, "unsigned char *y, int ystride, const unsigned char *blimit";
+specialize qw/vp8_loop_filter_simple_bv mmx sse2 media neon/;
+$vp8_loop_filter_simple_bv_c=vp8_loop_filter_bvs_c;
+$vp8_loop_filter_simple_bv_mmx=vp8_loop_filter_bvs_mmx;
+$vp8_loop_filter_simple_bv_sse2=vp8_loop_filter_bvs_sse2;
+$vp8_loop_filter_simple_bv_media=vp8_loop_filter_bvs_armv6;
+$vp8_loop_filter_simple_bv_neon=vp8_loop_filter_bvs_neon;
+
+add_proto qw/void vp8_loop_filter_simple_bh/, "unsigned char *y, int ystride, const unsigned char *blimit";
+specialize qw/vp8_loop_filter_simple_bh mmx sse2 media neon/;
+$vp8_loop_filter_simple_bh_c=vp8_loop_filter_bhs_c;
+$vp8_loop_filter_simple_bh_mmx=vp8_loop_filter_bhs_mmx;
+$vp8_loop_filter_simple_bh_sse2=vp8_loop_filter_bhs_sse2;
+$vp8_loop_filter_simple_bh_media=vp8_loop_filter_bhs_armv6;
+$vp8_loop_filter_simple_bh_neon=vp8_loop_filter_bhs_neon;
+
+#
+# IDCT
+#
+#idct16
+add_proto qw/void vp8_short_idct4x4llm/, "short *input, unsigned char *pred, int pitch, unsigned char *dst, int dst_stride";
+specialize qw/vp8_short_idct4x4llm mmx media neon dspr2/;
+$vp8_short_idct4x4llm_media=vp8_short_idct4x4llm_v6_dual;
+$vp8_short_idct4x4llm_dspr2=vp8_short_idct4x4llm_dspr2;
+
+#iwalsh1
+add_proto qw/void vp8_short_inv_walsh4x4_1/, "short *input, short *output";
+specialize qw/vp8_short_inv_walsh4x4_1 dspr2/;
+$vp8_short_inv_walsh4x4_1_dspr2=vp8_short_inv_walsh4x4_1_dspr2;
+# no asm yet
+
+#iwalsh16
+add_proto qw/void vp8_short_inv_walsh4x4/, "short *input, short *output";
+specialize qw/vp8_short_inv_walsh4x4 mmx sse2 media neon dspr2/;
+$vp8_short_inv_walsh4x4_media=vp8_short_inv_walsh4x4_v6;
+$vp8_short_inv_walsh4x4_dspr2=vp8_short_inv_walsh4x4_dspr2;
+
+#idct1_scalar_add
+add_proto qw/void vp8_dc_only_idct_add/, "short input, unsigned char *pred, int pred_stride, unsigned char *dst, int dst_stride";
+specialize qw/vp8_dc_only_idct_add	mmx media neon dspr2/;
+$vp8_dc_only_idct_add_media=vp8_dc_only_idct_add_v6;
+$vp8_dc_only_idct_add_dspr2=vp8_dc_only_idct_add_dspr2;
+
+#
+# RECON
+#
+add_proto qw/void vp8_copy_mem16x16/, "unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch";
+specialize qw/vp8_copy_mem16x16 mmx sse2 media neon dspr2/;
+$vp8_copy_mem16x16_media=vp8_copy_mem16x16_v6;
+$vp8_copy_mem16x16_dspr2=vp8_copy_mem16x16_dspr2;
+
+add_proto qw/void vp8_copy_mem8x8/, "unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch";
+specialize qw/vp8_copy_mem8x8 mmx media neon dspr2/;
+$vp8_copy_mem8x8_media=vp8_copy_mem8x8_v6;
+$vp8_copy_mem8x8_dspr2=vp8_copy_mem8x8_dspr2;
+
+add_proto qw/void vp8_copy_mem8x4/, "unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch";
+specialize qw/vp8_copy_mem8x4 mmx media neon dspr2/;
+$vp8_copy_mem8x4_media=vp8_copy_mem8x4_v6;
+$vp8_copy_mem8x4_dspr2=vp8_copy_mem8x4_dspr2;
+
+add_proto qw/void vp8_build_intra_predictors_mby_s/, "struct macroblockd *x, unsigned char * yabove_row, unsigned char * yleft, int left_stride, unsigned char * ypred_ptr, int y_stride";
+specialize qw/vp8_build_intra_predictors_mby_s sse2 ssse3/;
+#TODO: fix assembly for neon
+
+add_proto qw/void vp8_build_intra_predictors_mbuv_s/, "struct macroblockd *x, unsigned char * uabove_row, unsigned char * vabove_row,  unsigned char *uleft, unsigned char *vleft, int left_stride, unsigned char * upred_ptr, unsigned char * vpred_ptr, int pred_stride";
+specialize qw/vp8_build_intra_predictors_mbuv_s sse2 ssse3/;
+
+add_proto qw/void vp8_intra4x4_predict/, "unsigned char *Above, unsigned char *yleft, int left_stride, int b_mode, unsigned char *dst, int dst_stride, unsigned char top_left";
+specialize qw/vp8_intra4x4_predict media/;
+$vp8_intra4x4_predict_media=vp8_intra4x4_predict_armv6;
+
+#
+# Postproc
+#
+if (vpx_config("CONFIG_POSTPROC") eq "yes") {
+    add_proto qw/void vp8_mbpost_proc_down/, "unsigned char *dst, int pitch, int rows, int cols,int flimit";
+    specialize qw/vp8_mbpost_proc_down mmx sse2/;
+    $vp8_mbpost_proc_down_sse2=vp8_mbpost_proc_down_xmm;
+
+    add_proto qw/void vp8_mbpost_proc_across_ip/, "unsigned char *dst, int pitch, int rows, int cols,int flimit";
+    specialize qw/vp8_mbpost_proc_across_ip sse2/;
+    $vp8_mbpost_proc_across_ip_sse2=vp8_mbpost_proc_across_ip_xmm;
+
+    add_proto qw/void vp8_post_proc_down_and_across_mb_row/, "unsigned char *src, unsigned char *dst, int src_pitch, int dst_pitch, int cols, unsigned char *flimits, int size";
+    specialize qw/vp8_post_proc_down_and_across_mb_row sse2/;
+
+    add_proto qw/void vp8_plane_add_noise/, "unsigned char *s, char *noise, char blackclamp[16], char whiteclamp[16], char bothclamp[16], unsigned int w, unsigned int h, int pitch";
+    specialize qw/vp8_plane_add_noise mmx sse2/;
+    $vp8_plane_add_noise_sse2=vp8_plane_add_noise_wmt;
+
+    add_proto qw/void vp8_blend_mb_inner/, "unsigned char *y, unsigned char *u, unsigned char *v, int y1, int u1, int v1, int alpha, int stride";
+    # no asm yet
+
+    add_proto qw/void vp8_blend_mb_outer/, "unsigned char *y, unsigned char *u, unsigned char *v, int y1, int u1, int v1, int alpha, int stride";
+    # no asm yet
+
+    add_proto qw/void vp8_blend_b/, "unsigned char *y, unsigned char *u, unsigned char *v, int y1, int u1, int v1, int alpha, int stride";
+    # no asm yet
+
+    add_proto qw/void vp8_filter_by_weight16x16/, "unsigned char *src, int src_stride, unsigned char *dst, int dst_stride, int src_weight";
+    specialize qw/vp8_filter_by_weight16x16 sse2/;
+
+    add_proto qw/void vp8_filter_by_weight8x8/, "unsigned char *src, int src_stride, unsigned char *dst, int dst_stride, int src_weight";
+    specialize qw/vp8_filter_by_weight8x8 sse2/;
+
+    add_proto qw/void vp8_filter_by_weight4x4/, "unsigned char *src, int src_stride, unsigned char *dst, int dst_stride, int src_weight";
+    # no asm yet
+}
+
+#
+# Subpixel
+#
+add_proto qw/void vp8_sixtap_predict16x16/, "unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch";
+specialize qw/vp8_sixtap_predict16x16 mmx sse2 ssse3 media neon dspr2/;
+$vp8_sixtap_predict16x16_media=vp8_sixtap_predict16x16_armv6;
+$vp8_sixtap_predict16x16_dspr2=vp8_sixtap_predict16x16_dspr2;
+
+add_proto qw/void vp8_sixtap_predict8x8/, "unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch";
+specialize qw/vp8_sixtap_predict8x8 mmx sse2 ssse3 media neon dspr2/;
+$vp8_sixtap_predict8x8_media=vp8_sixtap_predict8x8_armv6;
+$vp8_sixtap_predict8x8_dspr2=vp8_sixtap_predict8x8_dspr2;
+
+add_proto qw/void vp8_sixtap_predict8x4/, "unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch";
+specialize qw/vp8_sixtap_predict8x4 mmx sse2 ssse3 media neon dspr2/;
+$vp8_sixtap_predict8x4_media=vp8_sixtap_predict8x4_armv6;
+$vp8_sixtap_predict8x4_dspr2=vp8_sixtap_predict8x4_dspr2;
+
+add_proto qw/void vp8_sixtap_predict4x4/, "unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch";
+specialize qw/vp8_sixtap_predict4x4 mmx ssse3 media neon dspr2/;
+$vp8_sixtap_predict4x4_media=vp8_sixtap_predict4x4_armv6;
+$vp8_sixtap_predict4x4_dspr2=vp8_sixtap_predict4x4_dspr2;
+
+add_proto qw/void vp8_bilinear_predict16x16/, "unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch";
+specialize qw/vp8_bilinear_predict16x16 mmx sse2 ssse3 media neon/;
+$vp8_bilinear_predict16x16_media=vp8_bilinear_predict16x16_armv6;
+
+add_proto qw/void vp8_bilinear_predict8x8/, "unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch";
+specialize qw/vp8_bilinear_predict8x8 mmx sse2 ssse3 media neon/;
+$vp8_bilinear_predict8x8_media=vp8_bilinear_predict8x8_armv6;
+
+add_proto qw/void vp8_bilinear_predict8x4/, "unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch";
+specialize qw/vp8_bilinear_predict8x4 mmx media neon/;
+$vp8_bilinear_predict8x4_media=vp8_bilinear_predict8x4_armv6;
+
+add_proto qw/void vp8_bilinear_predict4x4/, "unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch";
+specialize qw/vp8_bilinear_predict4x4 mmx media neon/;
+$vp8_bilinear_predict4x4_media=vp8_bilinear_predict4x4_armv6;
+
+#
+# Whole-pixel Variance
+#
+add_proto qw/unsigned int vp8_variance4x4/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int *sse";
+specialize qw/vp8_variance4x4 mmx sse2/;
+$vp8_variance4x4_sse2=vp8_variance4x4_wmt;
+
+add_proto qw/unsigned int vp8_variance8x8/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int *sse";
+specialize qw/vp8_variance8x8 mmx sse2 media neon/;
+$vp8_variance8x8_sse2=vp8_variance8x8_wmt;
+$vp8_variance8x8_media=vp8_variance8x8_armv6;
+
+add_proto qw/unsigned int vp8_variance8x16/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int *sse";
+specialize qw/vp8_variance8x16 mmx sse2 neon/;
+$vp8_variance8x16_sse2=vp8_variance8x16_wmt;
+
+add_proto qw/unsigned int vp8_variance16x8/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int *sse";
+specialize qw/vp8_variance16x8 mmx sse2 neon/;
+$vp8_variance16x8_sse2=vp8_variance16x8_wmt;
+
+add_proto qw/unsigned int vp8_variance16x16/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int *sse";
+specialize qw/vp8_variance16x16 mmx sse2 media neon/;
+$vp8_variance16x16_sse2=vp8_variance16x16_wmt;
+$vp8_variance16x16_media=vp8_variance16x16_armv6;
+
+#
+# Sub-pixel Variance
+#
+add_proto qw/unsigned int vp8_sub_pixel_variance4x4/, "const unsigned char  *src_ptr, int  source_stride, int  xoffset, int  yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse";
+specialize qw/vp8_sub_pixel_variance4x4 mmx sse2/;
+$vp8_sub_pixel_variance4x4_sse2=vp8_sub_pixel_variance4x4_wmt;
+
+add_proto qw/unsigned int vp8_sub_pixel_variance8x8/, "const unsigned char  *src_ptr, int  source_stride, int  xoffset, int  yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse";
+specialize qw/vp8_sub_pixel_variance8x8 mmx sse2 media neon/;
+$vp8_sub_pixel_variance8x8_sse2=vp8_sub_pixel_variance8x8_wmt;
+$vp8_sub_pixel_variance8x8_media=vp8_sub_pixel_variance8x8_armv6;
+
+add_proto qw/unsigned int vp8_sub_pixel_variance8x16/, "const unsigned char  *src_ptr, int  source_stride, int  xoffset, int  yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse";
+specialize qw/vp8_sub_pixel_variance8x16 mmx sse2/;
+$vp8_sub_pixel_variance8x16_sse2=vp8_sub_pixel_variance8x16_wmt;
+
+add_proto qw/unsigned int vp8_sub_pixel_variance16x8/, "const unsigned char  *src_ptr, int  source_stride, int  xoffset, int  yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse";
+specialize qw/vp8_sub_pixel_variance16x8 mmx sse2 ssse3/;
+$vp8_sub_pixel_variance16x8_sse2=vp8_sub_pixel_variance16x8_wmt;
+
+add_proto qw/unsigned int vp8_sub_pixel_variance16x16/, "const unsigned char  *src_ptr, int  source_stride, int  xoffset, int  yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse";
+specialize qw/vp8_sub_pixel_variance16x16 mmx sse2 ssse3 media neon/;
+$vp8_sub_pixel_variance16x16_sse2=vp8_sub_pixel_variance16x16_wmt;
+$vp8_sub_pixel_variance16x16_media=vp8_sub_pixel_variance16x16_armv6;
+
+add_proto qw/unsigned int vp8_variance_halfpixvar16x16_h/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int *sse";
+specialize qw/vp8_variance_halfpixvar16x16_h mmx sse2 media neon/;
+$vp8_variance_halfpixvar16x16_h_sse2=vp8_variance_halfpixvar16x16_h_wmt;
+$vp8_variance_halfpixvar16x16_h_media=vp8_variance_halfpixvar16x16_h_armv6;
+
+add_proto qw/unsigned int vp8_variance_halfpixvar16x16_v/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int *sse";
+specialize qw/vp8_variance_halfpixvar16x16_v mmx sse2 media neon/;
+$vp8_variance_halfpixvar16x16_v_sse2=vp8_variance_halfpixvar16x16_v_wmt;
+$vp8_variance_halfpixvar16x16_v_media=vp8_variance_halfpixvar16x16_v_armv6;
+
+add_proto qw/unsigned int vp8_variance_halfpixvar16x16_hv/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int *sse";
+specialize qw/vp8_variance_halfpixvar16x16_hv mmx sse2 media neon/;
+$vp8_variance_halfpixvar16x16_hv_sse2=vp8_variance_halfpixvar16x16_hv_wmt;
+$vp8_variance_halfpixvar16x16_hv_media=vp8_variance_halfpixvar16x16_hv_armv6;
+
+#
+# Single block SAD
+#
+add_proto qw/unsigned int vp8_sad4x4/, "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad";
+specialize qw/vp8_sad4x4 mmx sse2 neon/;
+$vp8_sad4x4_sse2=vp8_sad4x4_wmt;
+
+add_proto qw/unsigned int vp8_sad8x8/, "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad";
+specialize qw/vp8_sad8x8 mmx sse2 neon/;
+$vp8_sad8x8_sse2=vp8_sad8x8_wmt;
+
+add_proto qw/unsigned int vp8_sad8x16/, "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad";
+specialize qw/vp8_sad8x16 mmx sse2 neon/;
+$vp8_sad8x16_sse2=vp8_sad8x16_wmt;
+
+add_proto qw/unsigned int vp8_sad16x8/, "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad";
+specialize qw/vp8_sad16x8 mmx sse2 neon/;
+$vp8_sad16x8_sse2=vp8_sad16x8_wmt;
+
+add_proto qw/unsigned int vp8_sad16x16/, "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad";
+specialize qw/vp8_sad16x16 mmx sse2 sse3 media neon/;
+$vp8_sad16x16_sse2=vp8_sad16x16_wmt;
+$vp8_sad16x16_media=vp8_sad16x16_armv6;
+
+#
+# Multi-block SAD, comparing a reference to N blocks 1 pixel apart horizontally
+#
+add_proto qw/void vp8_sad4x4x3/, "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int *sad_array";
+specialize qw/vp8_sad4x4x3 sse3/;
+
+add_proto qw/void vp8_sad8x8x3/, "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int *sad_array";
+specialize qw/vp8_sad8x8x3 sse3/;
+
+add_proto qw/void vp8_sad8x16x3/, "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int *sad_array";
+specialize qw/vp8_sad8x16x3 sse3/;
+
+add_proto qw/void vp8_sad16x8x3/, "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int *sad_array";
+specialize qw/vp8_sad16x8x3 sse3 ssse3/;
+
+add_proto qw/void vp8_sad16x16x3/, "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int *sad_array";
+specialize qw/vp8_sad16x16x3 sse3 ssse3/;
+
+# Note the only difference in the following prototypes is that they return into
+# an array of short
+add_proto qw/void vp8_sad4x4x8/, "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned short *sad_array";
+specialize qw/vp8_sad4x4x8 sse4_1/;
+$vp8_sad4x4x8_sse4_1=vp8_sad4x4x8_sse4;
+
+add_proto qw/void vp8_sad8x8x8/, "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned short *sad_array";
+specialize qw/vp8_sad8x8x8 sse4_1/;
+$vp8_sad8x8x8_sse4_1=vp8_sad8x8x8_sse4;
+
+add_proto qw/void vp8_sad8x16x8/, "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned short *sad_array";
+specialize qw/vp8_sad8x16x8 sse4_1/;
+$vp8_sad8x16x8_sse4_1=vp8_sad8x16x8_sse4;
+
+add_proto qw/void vp8_sad16x8x8/, "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned short *sad_array";
+specialize qw/vp8_sad16x8x8 sse4_1/;
+$vp8_sad16x8x8_sse4_1=vp8_sad16x8x8_sse4;
+
+add_proto qw/void vp8_sad16x16x8/, "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned short *sad_array";
+specialize qw/vp8_sad16x16x8 sse4_1/;
+$vp8_sad16x16x8_sse4_1=vp8_sad16x16x8_sse4;
+
+#
+# Multi-block SAD, comparing a reference to N independent blocks
+#
+add_proto qw/void vp8_sad4x4x4d/, "const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int  ref_stride, unsigned int *sad_array";
+specialize qw/vp8_sad4x4x4d sse3/;
+
+add_proto qw/void vp8_sad8x8x4d/, "const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int  ref_stride, unsigned int *sad_array";
+specialize qw/vp8_sad8x8x4d sse3/;
+
+add_proto qw/void vp8_sad8x16x4d/, "const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int  ref_stride, unsigned int *sad_array";
+specialize qw/vp8_sad8x16x4d sse3/;
+
+add_proto qw/void vp8_sad16x8x4d/, "const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int  ref_stride, unsigned int *sad_array";
+specialize qw/vp8_sad16x8x4d sse3/;
+
+add_proto qw/void vp8_sad16x16x4d/, "const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int  ref_stride, unsigned int *sad_array";
+specialize qw/vp8_sad16x16x4d sse3/;
+
+#
+# Encoder functions below this point.
+#
+if (vpx_config("CONFIG_VP8_ENCODER") eq "yes") {
+
+#
+# Sum of squares (vector)
+#
+add_proto qw/unsigned int vp8_get_mb_ss/, "const short *";
+specialize qw/vp8_get_mb_ss mmx sse2/;
+
+#
+# SSE (Sum Squared Error)
+#
+add_proto qw/unsigned int vp8_sub_pixel_mse16x16/, "const unsigned char  *src_ptr, int  source_stride, int  xoffset, int  yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse";
+specialize qw/vp8_sub_pixel_mse16x16 mmx sse2/;
+$vp8_sub_pixel_mse16x16_sse2=vp8_sub_pixel_mse16x16_wmt;
+
+add_proto qw/unsigned int vp8_mse16x16/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int *sse";
+specialize qw/vp8_mse16x16 mmx sse2 media neon/;
+$vp8_mse16x16_sse2=vp8_mse16x16_wmt;
+$vp8_mse16x16_media=vp8_mse16x16_armv6;
+
+add_proto qw/unsigned int vp8_get4x4sse_cs/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride";
+specialize qw/vp8_get4x4sse_cs mmx neon/;
+
+#
+# Block copy
+#
+if ($opts{arch} =~ /x86/) {
+    add_proto qw/void vp8_copy32xn/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, int n";
+    specialize qw/vp8_copy32xn sse2 sse3/;
+}
+
+#
+# Structured Similarity (SSIM)
+#
+if (vpx_config("CONFIG_INTERNAL_STATS") eq "yes") {
+    $opts{arch} eq "x86_64" and $sse2_on_x86_64 = "sse2";
+
+    add_proto qw/void vp8_ssim_parms_8x8/, "unsigned char *s, int sp, unsigned char *r, int rp, unsigned long *sum_s, unsigned long *sum_r, unsigned long *sum_sq_s, unsigned long *sum_sq_r, unsigned long *sum_sxr";
+    specialize qw/vp8_ssim_parms_8x8/, "$sse2_on_x86_64";
+
+    add_proto qw/void vp8_ssim_parms_16x16/, "unsigned char *s, int sp, unsigned char *r, int rp, unsigned long *sum_s, unsigned long *sum_r, unsigned long *sum_sq_s, unsigned long *sum_sq_r, unsigned long *sum_sxr";
+    specialize qw/vp8_ssim_parms_16x16/, "$sse2_on_x86_64";
+}
+
+#
+# Forward DCT
+#
+add_proto qw/void vp8_short_fdct4x4/, "short *input, short *output, int pitch";
+specialize qw/vp8_short_fdct4x4 mmx sse2 media neon/;
+$vp8_short_fdct4x4_media=vp8_short_fdct4x4_armv6;
+
+add_proto qw/void vp8_short_fdct8x4/, "short *input, short *output, int pitch";
+specialize qw/vp8_short_fdct8x4 mmx sse2 media neon/;
+$vp8_short_fdct8x4_media=vp8_short_fdct8x4_armv6;
+
+add_proto qw/void vp8_short_walsh4x4/, "short *input, short *output, int pitch";
+specialize qw/vp8_short_walsh4x4 sse2 media neon/;
+$vp8_short_walsh4x4_media=vp8_short_walsh4x4_armv6;
+
+#
+# Quantizer
+#
+add_proto qw/void vp8_regular_quantize_b/, "struct block *, struct blockd *";
+specialize qw/vp8_regular_quantize_b sse2/;
+# TODO(johann) Update sse4 implementation and re-enable
+#$vp8_regular_quantize_b_sse4_1=vp8_regular_quantize_b_sse4;
+
+add_proto qw/void vp8_fast_quantize_b/, "struct block *, struct blockd *";
+specialize qw/vp8_fast_quantize_b sse2 ssse3 media neon/;
+$vp8_fast_quantize_b_media=vp8_fast_quantize_b_armv6;
+
+add_proto qw/void vp8_regular_quantize_b_pair/, "struct block *b1, struct block *b2, struct blockd *d1, struct blockd *d2";
+# no asm yet
+
+add_proto qw/void vp8_fast_quantize_b_pair/, "struct block *b1, struct block *b2, struct blockd *d1, struct blockd *d2";
+specialize qw/vp8_fast_quantize_b_pair neon/;
+
+add_proto qw/void vp8_quantize_mb/, "struct macroblock *";
+specialize qw/vp8_quantize_mb neon/;
+
+add_proto qw/void vp8_quantize_mby/, "struct macroblock *";
+specialize qw/vp8_quantize_mby neon/;
+
+add_proto qw/void vp8_quantize_mbuv/, "struct macroblock *";
+specialize qw/vp8_quantize_mbuv neon/;
+
+#
+# Block subtraction
+#
+add_proto qw/int vp8_block_error/, "short *coeff, short *dqcoeff";
+specialize qw/vp8_block_error mmx sse2/;
+$vp8_block_error_sse2=vp8_block_error_xmm;
+
+add_proto qw/int vp8_mbblock_error/, "struct macroblock *mb, int dc";
+specialize qw/vp8_mbblock_error mmx sse2/;
+$vp8_mbblock_error_sse2=vp8_mbblock_error_xmm;
+
+add_proto qw/int vp8_mbuverror/, "struct macroblock *mb";
+specialize qw/vp8_mbuverror mmx sse2/;
+$vp8_mbuverror_sse2=vp8_mbuverror_xmm;
+
+add_proto qw/void vp8_subtract_b/, "struct block *be, struct blockd *bd, int pitch";
+specialize qw/vp8_subtract_b mmx sse2 media neon/;
+$vp8_subtract_b_media=vp8_subtract_b_armv6;
+
+add_proto qw/void vp8_subtract_mby/, "short *diff, unsigned char *src, int src_stride, unsigned char *pred, int pred_stride";
+specialize qw/vp8_subtract_mby mmx sse2 media neon/;
+$vp8_subtract_mby_media=vp8_subtract_mby_armv6;
+
+add_proto qw/void vp8_subtract_mbuv/, "short *diff, unsigned char *usrc, unsigned char *vsrc, int src_stride, unsigned char *upred, unsigned char *vpred, int pred_stride";
+specialize qw/vp8_subtract_mbuv mmx sse2 media neon/;
+$vp8_subtract_mbuv_media=vp8_subtract_mbuv_armv6;
+
+#
+# Motion search
+#
+add_proto qw/int vp8_full_search_sad/, "struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv";
+specialize qw/vp8_full_search_sad sse3 sse4_1/;
+$vp8_full_search_sad_sse3=vp8_full_search_sadx3;
+$vp8_full_search_sad_sse4_1=vp8_full_search_sadx8;
+
+add_proto qw/int vp8_refining_search_sad/, "struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv";
+specialize qw/vp8_refining_search_sad sse3/;
+$vp8_refining_search_sad_sse3=vp8_refining_search_sadx4;
+
+add_proto qw/int vp8_diamond_search_sad/, "struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, union int_mv *best_mv, int search_param, int sad_per_bit, int *num00, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv";
+$vp8_diamond_search_sad_sse3=vp8_diamond_search_sadx4;
+
+#
+# Alt-ref Noise Reduction (ARNR)
+#
+if (vpx_config("CONFIG_REALTIME_ONLY") ne "yes") {
+    add_proto qw/void vp8_temporal_filter_apply/, "unsigned char *frame1, unsigned int stride, unsigned char *frame2, unsigned int block_size, int strength, int filter_weight, unsigned int *accumulator, unsigned short *count";
+    specialize qw/vp8_temporal_filter_apply sse2/;
+}
+
+#
+# Pick Loopfilter
+#
+add_proto qw/void vp8_yv12_copy_partial_frame/, "struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc";
+specialize qw/vp8_yv12_copy_partial_frame neon/;
+
+#
+# Denoiser filter
+#
+if (vpx_config("CONFIG_TEMPORAL_DENOISING") eq "yes") {
+    add_proto qw/int vp8_denoiser_filter/, "struct yv12_buffer_config* mc_running_avg, struct yv12_buffer_config* running_avg, struct macroblock* signal, unsigned int motion_magnitude2, int y_offset, int uv_offset";
+    specialize qw/vp8_denoiser_filter sse2 neon/;
+}
+
+# End of encoder only functions
+}
+1;
diff --git a/vp8/common/rtcd_defs.sh b/vp8/common/rtcd_defs.sh
deleted file mode 100644
index 28e6754..0000000
--- a/vp8/common/rtcd_defs.sh
+++ /dev/null
@@ -1,542 +0,0 @@
-vp8_common_forward_decls() {
-cat <<EOF
-/*
- * VP8
- */
-
-struct blockd;
-struct macroblockd;
-struct loop_filter_info;
-
-/* Encoder forward decls */
-struct block;
-struct macroblock;
-struct variance_vtable;
-union int_mv;
-struct yv12_buffer_config;
-EOF
-}
-forward_decls vp8_common_forward_decls
-
-#
-# system state
-#
-prototype void vp8_clear_system_state ""
-specialize vp8_clear_system_state mmx
-vp8_clear_system_state_mmx=vpx_reset_mmx_state
-
-#
-# Dequant
-#
-prototype void vp8_dequantize_b "struct blockd*, short *dqc"
-specialize vp8_dequantize_b mmx media neon
-vp8_dequantize_b_media=vp8_dequantize_b_v6
-
-prototype void vp8_dequant_idct_add "short *input, short *dq, unsigned char *output, int stride"
-specialize vp8_dequant_idct_add mmx media neon dspr2
-vp8_dequant_idct_add_media=vp8_dequant_idct_add_v6
-vp8_dequant_idct_add_dspr2=vp8_dequant_idct_add_dspr2
-
-prototype void vp8_dequant_idct_add_y_block "short *q, short *dq, unsigned char *dst, int stride, char *eobs"
-specialize vp8_dequant_idct_add_y_block mmx sse2 media neon dspr2
-vp8_dequant_idct_add_y_block_media=vp8_dequant_idct_add_y_block_v6
-vp8_dequant_idct_add_y_block_dspr2=vp8_dequant_idct_add_y_block_dspr2
-
-prototype void vp8_dequant_idct_add_uv_block "short *q, short *dq, unsigned char *dst_u, unsigned char *dst_v, int stride, char *eobs"
-specialize vp8_dequant_idct_add_uv_block mmx sse2 media neon dspr2
-vp8_dequant_idct_add_uv_block_media=vp8_dequant_idct_add_uv_block_v6
-vp8_dequant_idct_add_y_block_dspr2=vp8_dequant_idct_add_y_block_dspr2
-
-#
-# Loopfilter
-#
-prototype void vp8_loop_filter_mbv "unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi"
-specialize vp8_loop_filter_mbv mmx sse2 media neon dspr2
-vp8_loop_filter_mbv_media=vp8_loop_filter_mbv_armv6
-vp8_loop_filter_mbv_dspr2=vp8_loop_filter_mbv_dspr2
-
-prototype void vp8_loop_filter_bv "unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi"
-specialize vp8_loop_filter_bv mmx sse2 media neon dspr2
-vp8_loop_filter_bv_media=vp8_loop_filter_bv_armv6
-vp8_loop_filter_bv_dspr2=vp8_loop_filter_bv_dspr2
-
-prototype void vp8_loop_filter_mbh "unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi"
-specialize vp8_loop_filter_mbh mmx sse2 media neon dspr2
-vp8_loop_filter_mbh_media=vp8_loop_filter_mbh_armv6
-vp8_loop_filter_mbh_dspr2=vp8_loop_filter_mbh_dspr2
-
-prototype void vp8_loop_filter_bh "unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi"
-specialize vp8_loop_filter_bh mmx sse2 media neon dspr2
-vp8_loop_filter_bh_media=vp8_loop_filter_bh_armv6
-vp8_loop_filter_bh_dspr2=vp8_loop_filter_bh_dspr2
-
-
-prototype void vp8_loop_filter_simple_mbv "unsigned char *y, int ystride, const unsigned char *blimit"
-specialize vp8_loop_filter_simple_mbv mmx sse2 media neon
-vp8_loop_filter_simple_mbv_c=vp8_loop_filter_simple_vertical_edge_c
-vp8_loop_filter_simple_mbv_mmx=vp8_loop_filter_simple_vertical_edge_mmx
-vp8_loop_filter_simple_mbv_sse2=vp8_loop_filter_simple_vertical_edge_sse2
-vp8_loop_filter_simple_mbv_media=vp8_loop_filter_simple_vertical_edge_armv6
-vp8_loop_filter_simple_mbv_neon=vp8_loop_filter_mbvs_neon
-
-prototype void vp8_loop_filter_simple_mbh "unsigned char *y, int ystride, const unsigned char *blimit"
-specialize vp8_loop_filter_simple_mbh mmx sse2 media neon
-vp8_loop_filter_simple_mbh_c=vp8_loop_filter_simple_horizontal_edge_c
-vp8_loop_filter_simple_mbh_mmx=vp8_loop_filter_simple_horizontal_edge_mmx
-vp8_loop_filter_simple_mbh_sse2=vp8_loop_filter_simple_horizontal_edge_sse2
-vp8_loop_filter_simple_mbh_media=vp8_loop_filter_simple_horizontal_edge_armv6
-vp8_loop_filter_simple_mbh_neon=vp8_loop_filter_mbhs_neon
-
-prototype void vp8_loop_filter_simple_bv "unsigned char *y, int ystride, const unsigned char *blimit"
-specialize vp8_loop_filter_simple_bv mmx sse2 media neon
-vp8_loop_filter_simple_bv_c=vp8_loop_filter_bvs_c
-vp8_loop_filter_simple_bv_mmx=vp8_loop_filter_bvs_mmx
-vp8_loop_filter_simple_bv_sse2=vp8_loop_filter_bvs_sse2
-vp8_loop_filter_simple_bv_media=vp8_loop_filter_bvs_armv6
-vp8_loop_filter_simple_bv_neon=vp8_loop_filter_bvs_neon
-
-prototype void vp8_loop_filter_simple_bh "unsigned char *y, int ystride, const unsigned char *blimit"
-specialize vp8_loop_filter_simple_bh mmx sse2 media neon
-vp8_loop_filter_simple_bh_c=vp8_loop_filter_bhs_c
-vp8_loop_filter_simple_bh_mmx=vp8_loop_filter_bhs_mmx
-vp8_loop_filter_simple_bh_sse2=vp8_loop_filter_bhs_sse2
-vp8_loop_filter_simple_bh_media=vp8_loop_filter_bhs_armv6
-vp8_loop_filter_simple_bh_neon=vp8_loop_filter_bhs_neon
-
-#
-# IDCT
-#
-#idct16
-prototype void vp8_short_idct4x4llm "short *input, unsigned char *pred, int pitch, unsigned char *dst, int dst_stride"
-specialize vp8_short_idct4x4llm mmx media neon dspr2
-vp8_short_idct4x4llm_media=vp8_short_idct4x4llm_v6_dual
-vp8_short_idct4x4llm_dspr2=vp8_short_idct4x4llm_dspr2
-
-#iwalsh1
-prototype void vp8_short_inv_walsh4x4_1 "short *input, short *output"
-specialize vp8_short_inv_walsh4x4_1 dspr2
-vp8_short_inv_walsh4x4_1_dspr2=vp8_short_inv_walsh4x4_1_dspr2
-# no asm yet
-
-#iwalsh16
-prototype void vp8_short_inv_walsh4x4 "short *input, short *output"
-specialize vp8_short_inv_walsh4x4 mmx sse2 media neon dspr2
-vp8_short_inv_walsh4x4_media=vp8_short_inv_walsh4x4_v6
-vp8_short_inv_walsh4x4_dspr2=vp8_short_inv_walsh4x4_dspr2
-
-#idct1_scalar_add
-prototype void vp8_dc_only_idct_add "short input, unsigned char *pred, int pred_stride, unsigned char *dst, int dst_stride"
-specialize vp8_dc_only_idct_add	mmx media neon dspr2
-vp8_dc_only_idct_add_media=vp8_dc_only_idct_add_v6
-vp8_dc_only_idct_add_dspr2=vp8_dc_only_idct_add_dspr2
-
-#
-# RECON
-#
-prototype void vp8_copy_mem16x16 "unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch"
-specialize vp8_copy_mem16x16 mmx sse2 media neon dspr2
-vp8_copy_mem16x16_media=vp8_copy_mem16x16_v6
-vp8_copy_mem16x16_dspr2=vp8_copy_mem16x16_dspr2
-
-prototype void vp8_copy_mem8x8 "unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch"
-specialize vp8_copy_mem8x8 mmx media neon dspr2
-vp8_copy_mem8x8_media=vp8_copy_mem8x8_v6
-vp8_copy_mem8x8_dspr2=vp8_copy_mem8x8_dspr2
-
-prototype void vp8_copy_mem8x4 "unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch"
-specialize vp8_copy_mem8x4 mmx media neon dspr2
-vp8_copy_mem8x4_media=vp8_copy_mem8x4_v6
-vp8_copy_mem8x4_dspr2=vp8_copy_mem8x4_dspr2
-
-prototype void vp8_build_intra_predictors_mby_s "struct macroblockd *x, unsigned char * yabove_row, unsigned char * yleft, int left_stride, unsigned char * ypred_ptr, int y_stride"
-specialize vp8_build_intra_predictors_mby_s sse2 ssse3
-#TODO: fix assembly for neon
-
-prototype void vp8_build_intra_predictors_mbuv_s "struct macroblockd *x, unsigned char * uabove_row, unsigned char * vabove_row,  unsigned char *uleft, unsigned char *vleft, int left_stride, unsigned char * upred_ptr, unsigned char * vpred_ptr, int pred_stride"
-specialize vp8_build_intra_predictors_mbuv_s sse2 ssse3
-
-prototype void vp8_intra4x4_predict "unsigned char *Above, unsigned char *yleft, int left_stride, int b_mode, unsigned char *dst, int dst_stride, unsigned char top_left"
-specialize vp8_intra4x4_predict media
-vp8_intra4x4_predict_media=vp8_intra4x4_predict_armv6
-
-#
-# Postproc
-#
-if [ "$CONFIG_POSTPROC" = "yes" ]; then
-    prototype void vp8_mbpost_proc_down "unsigned char *dst, int pitch, int rows, int cols,int flimit"
-    specialize vp8_mbpost_proc_down mmx sse2
-    vp8_mbpost_proc_down_sse2=vp8_mbpost_proc_down_xmm
-
-    prototype void vp8_mbpost_proc_across_ip "unsigned char *dst, int pitch, int rows, int cols,int flimit"
-    specialize vp8_mbpost_proc_across_ip sse2
-    vp8_mbpost_proc_across_ip_sse2=vp8_mbpost_proc_across_ip_xmm
-
-    prototype void vp8_post_proc_down_and_across_mb_row "unsigned char *src, unsigned char *dst, int src_pitch, int dst_pitch, int cols, unsigned char *flimits, int size"
-    specialize vp8_post_proc_down_and_across_mb_row sse2
-
-    prototype void vp8_plane_add_noise "unsigned char *s, char *noise, char blackclamp[16], char whiteclamp[16], char bothclamp[16], unsigned int w, unsigned int h, int pitch"
-    specialize vp8_plane_add_noise mmx sse2
-    vp8_plane_add_noise_sse2=vp8_plane_add_noise_wmt
-
-    prototype void vp8_blend_mb_inner "unsigned char *y, unsigned char *u, unsigned char *v, int y1, int u1, int v1, int alpha, int stride"
-    # no asm yet
-
-    prototype void vp8_blend_mb_outer "unsigned char *y, unsigned char *u, unsigned char *v, int y1, int u1, int v1, int alpha, int stride"
-    # no asm yet
-
-    prototype void vp8_blend_b "unsigned char *y, unsigned char *u, unsigned char *v, int y1, int u1, int v1, int alpha, int stride"
-    # no asm yet
-
-    prototype void vp8_filter_by_weight16x16 "unsigned char *src, int src_stride, unsigned char *dst, int dst_stride, int src_weight"
-    specialize vp8_filter_by_weight16x16 sse2
-
-    prototype void vp8_filter_by_weight8x8 "unsigned char *src, int src_stride, unsigned char *dst, int dst_stride, int src_weight"
-    specialize vp8_filter_by_weight8x8 sse2
-
-    prototype void vp8_filter_by_weight4x4 "unsigned char *src, int src_stride, unsigned char *dst, int dst_stride, int src_weight"
-    # no asm yet
-fi
-
-#
-# Subpixel
-#
-prototype void vp8_sixtap_predict16x16 "unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch"
-specialize vp8_sixtap_predict16x16 mmx sse2 ssse3 media neon dspr2
-vp8_sixtap_predict16x16_media=vp8_sixtap_predict16x16_armv6
-vp8_sixtap_predict16x16_dspr2=vp8_sixtap_predict16x16_dspr2
-
-prototype void vp8_sixtap_predict8x8 "unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch"
-specialize vp8_sixtap_predict8x8 mmx sse2 ssse3 media neon dspr2
-vp8_sixtap_predict8x8_media=vp8_sixtap_predict8x8_armv6
-vp8_sixtap_predict8x8_dspr2=vp8_sixtap_predict8x8_dspr2
-
-prototype void vp8_sixtap_predict8x4 "unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch"
-specialize vp8_sixtap_predict8x4 mmx sse2 ssse3 media neon dspr2
-vp8_sixtap_predict8x4_media=vp8_sixtap_predict8x4_armv6
-vp8_sixtap_predict8x4_dspr2=vp8_sixtap_predict8x4_dspr2
-
-prototype void vp8_sixtap_predict4x4 "unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch"
-specialize vp8_sixtap_predict4x4 mmx ssse3 media neon dspr2
-vp8_sixtap_predict4x4_media=vp8_sixtap_predict4x4_armv6
-vp8_sixtap_predict4x4_dspr2=vp8_sixtap_predict4x4_dspr2
-
-prototype void vp8_bilinear_predict16x16 "unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch"
-specialize vp8_bilinear_predict16x16 mmx sse2 ssse3 media neon
-vp8_bilinear_predict16x16_media=vp8_bilinear_predict16x16_armv6
-
-prototype void vp8_bilinear_predict8x8 "unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch"
-specialize vp8_bilinear_predict8x8 mmx sse2 ssse3 media neon
-vp8_bilinear_predict8x8_media=vp8_bilinear_predict8x8_armv6
-
-prototype void vp8_bilinear_predict8x4 "unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch"
-specialize vp8_bilinear_predict8x4 mmx media neon
-vp8_bilinear_predict8x4_media=vp8_bilinear_predict8x4_armv6
-
-prototype void vp8_bilinear_predict4x4 "unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch"
-specialize vp8_bilinear_predict4x4 mmx media neon
-vp8_bilinear_predict4x4_media=vp8_bilinear_predict4x4_armv6
-
-#
-# Whole-pixel Variance
-#
-prototype unsigned int vp8_variance4x4 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int *sse"
-specialize vp8_variance4x4 mmx sse2
-vp8_variance4x4_sse2=vp8_variance4x4_wmt
-
-prototype unsigned int vp8_variance8x8 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int *sse"
-specialize vp8_variance8x8 mmx sse2 media neon
-vp8_variance8x8_sse2=vp8_variance8x8_wmt
-vp8_variance8x8_media=vp8_variance8x8_armv6
-
-prototype unsigned int vp8_variance8x16 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int *sse"
-specialize vp8_variance8x16 mmx sse2 neon
-vp8_variance8x16_sse2=vp8_variance8x16_wmt
-
-prototype unsigned int vp8_variance16x8 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int *sse"
-specialize vp8_variance16x8 mmx sse2 neon
-vp8_variance16x8_sse2=vp8_variance16x8_wmt
-
-prototype unsigned int vp8_variance16x16 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int *sse"
-specialize vp8_variance16x16 mmx sse2 media neon
-vp8_variance16x16_sse2=vp8_variance16x16_wmt
-vp8_variance16x16_media=vp8_variance16x16_armv6
-
-#
-# Sub-pixel Variance
-#
-prototype unsigned int vp8_sub_pixel_variance4x4 "const unsigned char  *src_ptr, int  source_stride, int  xoffset, int  yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse"
-specialize vp8_sub_pixel_variance4x4 mmx sse2
-vp8_sub_pixel_variance4x4_sse2=vp8_sub_pixel_variance4x4_wmt
-
-prototype unsigned int vp8_sub_pixel_variance8x8 "const unsigned char  *src_ptr, int  source_stride, int  xoffset, int  yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse"
-specialize vp8_sub_pixel_variance8x8 mmx sse2 media neon
-vp8_sub_pixel_variance8x8_sse2=vp8_sub_pixel_variance8x8_wmt
-vp8_sub_pixel_variance8x8_media=vp8_sub_pixel_variance8x8_armv6
-
-prototype unsigned int vp8_sub_pixel_variance8x16 "const unsigned char  *src_ptr, int  source_stride, int  xoffset, int  yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse"
-specialize vp8_sub_pixel_variance8x16 mmx sse2
-vp8_sub_pixel_variance8x16_sse2=vp8_sub_pixel_variance8x16_wmt
-
-prototype unsigned int vp8_sub_pixel_variance16x8 "const unsigned char  *src_ptr, int  source_stride, int  xoffset, int  yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse"
-specialize vp8_sub_pixel_variance16x8 mmx sse2 ssse3
-vp8_sub_pixel_variance16x8_sse2=vp8_sub_pixel_variance16x8_wmt
-
-prototype unsigned int vp8_sub_pixel_variance16x16 "const unsigned char  *src_ptr, int  source_stride, int  xoffset, int  yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse"
-specialize vp8_sub_pixel_variance16x16 mmx sse2 ssse3 media neon
-vp8_sub_pixel_variance16x16_sse2=vp8_sub_pixel_variance16x16_wmt
-vp8_sub_pixel_variance16x16_media=vp8_sub_pixel_variance16x16_armv6
-
-prototype unsigned int vp8_variance_halfpixvar16x16_h "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int *sse"
-specialize vp8_variance_halfpixvar16x16_h mmx sse2 media neon
-vp8_variance_halfpixvar16x16_h_sse2=vp8_variance_halfpixvar16x16_h_wmt
-vp8_variance_halfpixvar16x16_h_media=vp8_variance_halfpixvar16x16_h_armv6
-
-prototype unsigned int vp8_variance_halfpixvar16x16_v "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int *sse"
-specialize vp8_variance_halfpixvar16x16_v mmx sse2 media neon
-vp8_variance_halfpixvar16x16_v_sse2=vp8_variance_halfpixvar16x16_v_wmt
-vp8_variance_halfpixvar16x16_v_media=vp8_variance_halfpixvar16x16_v_armv6
-
-prototype unsigned int vp8_variance_halfpixvar16x16_hv "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int *sse"
-specialize vp8_variance_halfpixvar16x16_hv mmx sse2 media neon
-vp8_variance_halfpixvar16x16_hv_sse2=vp8_variance_halfpixvar16x16_hv_wmt
-vp8_variance_halfpixvar16x16_hv_media=vp8_variance_halfpixvar16x16_hv_armv6
-
-#
-# Single block SAD
-#
-prototype unsigned int vp8_sad4x4 "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad"
-specialize vp8_sad4x4 mmx sse2 neon
-vp8_sad4x4_sse2=vp8_sad4x4_wmt
-
-prototype unsigned int vp8_sad8x8 "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad"
-specialize vp8_sad8x8 mmx sse2 neon
-vp8_sad8x8_sse2=vp8_sad8x8_wmt
-
-prototype unsigned int vp8_sad8x16 "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad"
-specialize vp8_sad8x16 mmx sse2 neon
-vp8_sad8x16_sse2=vp8_sad8x16_wmt
-
-prototype unsigned int vp8_sad16x8 "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad"
-specialize vp8_sad16x8 mmx sse2 neon
-vp8_sad16x8_sse2=vp8_sad16x8_wmt
-
-prototype unsigned int vp8_sad16x16 "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad"
-specialize vp8_sad16x16 mmx sse2 sse3 media neon
-vp8_sad16x16_sse2=vp8_sad16x16_wmt
-vp8_sad16x16_media=vp8_sad16x16_armv6
-
-#
-# Multi-block SAD, comparing a reference to N blocks 1 pixel apart horizontally
-#
-prototype void vp8_sad4x4x3 "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int *sad_array"
-specialize vp8_sad4x4x3 sse3
-
-prototype void vp8_sad8x8x3 "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int *sad_array"
-specialize vp8_sad8x8x3 sse3
-
-prototype void vp8_sad8x16x3 "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int *sad_array"
-specialize vp8_sad8x16x3 sse3
-
-prototype void vp8_sad16x8x3 "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int *sad_array"
-specialize vp8_sad16x8x3 sse3 ssse3
-
-prototype void vp8_sad16x16x3 "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int *sad_array"
-specialize vp8_sad16x16x3 sse3 ssse3
-
-# Note the only difference in the following prototypes is that they return into
-# an array of short
-prototype void vp8_sad4x4x8 "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned short *sad_array"
-specialize vp8_sad4x4x8 sse4_1
-vp8_sad4x4x8_sse4_1=vp8_sad4x4x8_sse4
-
-prototype void vp8_sad8x8x8 "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned short *sad_array"
-specialize vp8_sad8x8x8 sse4_1
-vp8_sad8x8x8_sse4_1=vp8_sad8x8x8_sse4
-
-prototype void vp8_sad8x16x8 "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned short *sad_array"
-specialize vp8_sad8x16x8 sse4_1
-vp8_sad8x16x8_sse4_1=vp8_sad8x16x8_sse4
-
-prototype void vp8_sad16x8x8 "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned short *sad_array"
-specialize vp8_sad16x8x8 sse4_1
-vp8_sad16x8x8_sse4_1=vp8_sad16x8x8_sse4
-
-prototype void vp8_sad16x16x8 "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned short *sad_array"
-specialize vp8_sad16x16x8 sse4_1
-vp8_sad16x16x8_sse4_1=vp8_sad16x16x8_sse4
-
-#
-# Multi-block SAD, comparing a reference to N independent blocks
-#
-prototype void vp8_sad4x4x4d "const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int  ref_stride, unsigned int *sad_array"
-specialize vp8_sad4x4x4d sse3
-
-prototype void vp8_sad8x8x4d "const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int  ref_stride, unsigned int *sad_array"
-specialize vp8_sad8x8x4d sse3
-
-prototype void vp8_sad8x16x4d "const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int  ref_stride, unsigned int *sad_array"
-specialize vp8_sad8x16x4d sse3
-
-prototype void vp8_sad16x8x4d "const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int  ref_stride, unsigned int *sad_array"
-specialize vp8_sad16x8x4d sse3
-
-prototype void vp8_sad16x16x4d "const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int  ref_stride, unsigned int *sad_array"
-specialize vp8_sad16x16x4d sse3
-
-#
-# Encoder functions below this point.
-#
-if [ "$CONFIG_VP8_ENCODER" = "yes" ]; then
-
-#
-# Sum of squares (vector)
-#
-prototype unsigned int vp8_get_mb_ss "const short *"
-specialize vp8_get_mb_ss mmx sse2
-
-#
-# SSE (Sum Squared Error)
-#
-prototype unsigned int vp8_sub_pixel_mse16x16 "const unsigned char  *src_ptr, int  source_stride, int  xoffset, int  yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse"
-specialize vp8_sub_pixel_mse16x16 mmx sse2
-vp8_sub_pixel_mse16x16_sse2=vp8_sub_pixel_mse16x16_wmt
-
-prototype unsigned int vp8_mse16x16 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int *sse"
-specialize vp8_mse16x16 mmx sse2 media neon
-vp8_mse16x16_sse2=vp8_mse16x16_wmt
-vp8_mse16x16_media=vp8_mse16x16_armv6
-
-prototype unsigned int vp8_get4x4sse_cs "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride"
-specialize vp8_get4x4sse_cs mmx neon
-
-#
-# Block copy
-#
-case $arch in
-    x86*)
-    prototype void vp8_copy32xn "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, int n"
-    specialize vp8_copy32xn sse2 sse3
-    ;;
-esac
-
-#
-# Structured Similarity (SSIM)
-#
-if [ "$CONFIG_INTERNAL_STATS" = "yes" ]; then
-    [ $arch = "x86_64" ] && sse2_on_x86_64=sse2
-
-    prototype void vp8_ssim_parms_8x8 "unsigned char *s, int sp, unsigned char *r, int rp, unsigned long *sum_s, unsigned long *sum_r, unsigned long *sum_sq_s, unsigned long *sum_sq_r, unsigned long *sum_sxr"
-    specialize vp8_ssim_parms_8x8 $sse2_on_x86_64
-
-    prototype void vp8_ssim_parms_16x16 "unsigned char *s, int sp, unsigned char *r, int rp, unsigned long *sum_s, unsigned long *sum_r, unsigned long *sum_sq_s, unsigned long *sum_sq_r, unsigned long *sum_sxr"
-    specialize vp8_ssim_parms_16x16 $sse2_on_x86_64
-fi
-
-#
-# Forward DCT
-#
-prototype void vp8_short_fdct4x4 "short *input, short *output, int pitch"
-specialize vp8_short_fdct4x4 mmx sse2 media neon
-vp8_short_fdct4x4_media=vp8_short_fdct4x4_armv6
-
-prototype void vp8_short_fdct8x4 "short *input, short *output, int pitch"
-specialize vp8_short_fdct8x4 mmx sse2 media neon
-vp8_short_fdct8x4_media=vp8_short_fdct8x4_armv6
-
-prototype void vp8_short_walsh4x4 "short *input, short *output, int pitch"
-specialize vp8_short_walsh4x4 sse2 media neon
-vp8_short_walsh4x4_media=vp8_short_walsh4x4_armv6
-
-#
-# Quantizer
-#
-prototype void vp8_regular_quantize_b "struct block *, struct blockd *"
-specialize vp8_regular_quantize_b sse2 #sse4_1
-# TODO(johann) Update sse4 implementation and re-enable
-#vp8_regular_quantize_b_sse4_1=vp8_regular_quantize_b_sse4
-
-prototype void vp8_fast_quantize_b "struct block *, struct blockd *"
-specialize vp8_fast_quantize_b sse2 ssse3 media neon
-vp8_fast_quantize_b_media=vp8_fast_quantize_b_armv6
-
-prototype void vp8_regular_quantize_b_pair "struct block *b1, struct block *b2, struct blockd *d1, struct blockd *d2"
-# no asm yet
-
-prototype void vp8_fast_quantize_b_pair "struct block *b1, struct block *b2, struct blockd *d1, struct blockd *d2"
-specialize vp8_fast_quantize_b_pair neon
-
-prototype void vp8_quantize_mb "struct macroblock *"
-specialize vp8_quantize_mb neon
-
-prototype void vp8_quantize_mby "struct macroblock *"
-specialize vp8_quantize_mby neon
-
-prototype void vp8_quantize_mbuv "struct macroblock *"
-specialize vp8_quantize_mbuv neon
-
-#
-# Block subtraction
-#
-prototype int vp8_block_error "short *coeff, short *dqcoeff"
-specialize vp8_block_error mmx sse2
-vp8_block_error_sse2=vp8_block_error_xmm
-
-prototype int vp8_mbblock_error "struct macroblock *mb, int dc"
-specialize vp8_mbblock_error mmx sse2
-vp8_mbblock_error_sse2=vp8_mbblock_error_xmm
-
-prototype int vp8_mbuverror "struct macroblock *mb"
-specialize vp8_mbuverror mmx sse2
-vp8_mbuverror_sse2=vp8_mbuverror_xmm
-
-prototype void vp8_subtract_b "struct block *be, struct blockd *bd, int pitch"
-specialize vp8_subtract_b mmx sse2 media neon
-vp8_subtract_b_media=vp8_subtract_b_armv6
-
-prototype void vp8_subtract_mby "short *diff, unsigned char *src, int src_stride, unsigned char *pred, int pred_stride"
-specialize vp8_subtract_mby mmx sse2 media neon
-vp8_subtract_mby_media=vp8_subtract_mby_armv6
-
-prototype void vp8_subtract_mbuv "short *diff, unsigned char *usrc, unsigned char *vsrc, int src_stride, unsigned char *upred, unsigned char *vpred, int pred_stride"
-specialize vp8_subtract_mbuv mmx sse2 media neon
-vp8_subtract_mbuv_media=vp8_subtract_mbuv_armv6
-
-#
-# Motion search
-#
-prototype int vp8_full_search_sad "struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv"
-specialize vp8_full_search_sad sse3 sse4_1
-vp8_full_search_sad_sse3=vp8_full_search_sadx3
-vp8_full_search_sad_sse4_1=vp8_full_search_sadx8
-
-prototype int vp8_refining_search_sad "struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv"
-specialize vp8_refining_search_sad sse3
-vp8_refining_search_sad_sse3=vp8_refining_search_sadx4
-
-prototype int vp8_diamond_search_sad "struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, union int_mv *best_mv, int search_param, int sad_per_bit, int *num00, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv"
-vp8_diamond_search_sad_sse3=vp8_diamond_search_sadx4
-
-#
-# Alt-ref Noise Reduction (ARNR)
-#
-if [ "$CONFIG_REALTIME_ONLY" != "yes" ]; then
-    prototype void vp8_temporal_filter_apply "unsigned char *frame1, unsigned int stride, unsigned char *frame2, unsigned int block_size, int strength, int filter_weight, unsigned int *accumulator, unsigned short *count"
-    specialize vp8_temporal_filter_apply sse2
-fi
-
-#
-# Pick Loopfilter
-#
-prototype void vp8_yv12_copy_partial_frame "struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc"
-specialize vp8_yv12_copy_partial_frame neon
-
-#
-# Denoiser filter
-#
-if [ "$CONFIG_TEMPORAL_DENOISING" = "yes" ]; then
-    prototype int vp8_denoiser_filter "struct yv12_buffer_config* mc_running_avg, struct yv12_buffer_config* running_avg, struct macroblock* signal, unsigned int motion_magnitude2, int y_offset, int uv_offset"
-    specialize vp8_denoiser_filter sse2 neon
-fi
-
-# End of encoder only functions
-fi
diff --git a/vp8/vp8_common.mk b/vp8/vp8_common.mk
index ac91d7a..dfb54a5 100644
--- a/vp8/vp8_common.mk
+++ b/vp8/vp8_common.mk
@@ -47,7 +47,7 @@
 VP8_COMMON_SRCS-yes += common/reconinter.h
 VP8_COMMON_SRCS-yes += common/reconintra4x4.h
 VP8_COMMON_SRCS-yes += common/rtcd.c
-VP8_COMMON_SRCS-yes += common/rtcd_defs.sh
+VP8_COMMON_SRCS-yes += common/rtcd_defs.pl
 VP8_COMMON_SRCS-yes += common/setupintrarecon.h
 VP8_COMMON_SRCS-yes += common/swapyv12buffer.h
 VP8_COMMON_SRCS-yes += common/systemdependent.h
@@ -189,4 +189,4 @@
 VP8_COMMON_SRCS-$(HAVE_NEON)  += common/arm/neon/dequantizeb_neon.c
 
 
-$(eval $(call rtcd_h_template,vp8_rtcd,vp8/common/rtcd_defs.sh))
+$(eval $(call rtcd_h_template,vp8_rtcd,vp8/common/rtcd_defs.pl))
diff --git a/vp9/common/vp9_alloccommon.c b/vp9/common/vp9_alloccommon.c
index ff4b7c1..a72821b 100644
--- a/vp9/common/vp9_alloccommon.c
+++ b/vp9/common/vp9_alloccommon.c
@@ -200,10 +200,6 @@
   return 1;
 }
 
-void vp9_create_common(VP9_COMMON *cm) {
-  vp9_rtcd();
-}
-
 void vp9_remove_common(VP9_COMMON *cm) {
   vp9_free_frame_buffers(cm);
   vp9_free_internal_frame_buffers(&cm->int_frame_buffers);
diff --git a/vp9/common/vp9_alloccommon.h b/vp9/common/vp9_alloccommon.h
index e3b5b95..066c778 100644
--- a/vp9/common/vp9_alloccommon.h
+++ b/vp9/common/vp9_alloccommon.h
@@ -22,7 +22,6 @@
 
 void vp9_update_mode_info_border(VP9_COMMON *cm, MODE_INFO *mi);
 
-void vp9_create_common(VP9_COMMON *cm);
 void vp9_remove_common(VP9_COMMON *cm);
 
 int vp9_resize_frame_buffers(VP9_COMMON *cm, int width, int height);
diff --git a/vp9/common/vp9_blockd.h b/vp9/common/vp9_blockd.h
index 2a0ebfb..84403ae 100644
--- a/vp9/common/vp9_blockd.h
+++ b/vp9/common/vp9_blockd.h
@@ -255,7 +255,16 @@
   return subsize;
 }
 
-extern const TX_TYPE mode2txfm_map[MB_MODE_COUNT];
+extern const TX_TYPE mode2txfm_map[INTRA_MODES];
+
+static INLINE TX_TYPE get_tx_type(PLANE_TYPE plane_type,
+                                  const MACROBLOCKD *xd) {
+  const MB_MODE_INFO *const mbmi = &xd->mi_8x8[0]->mbmi;
+
+  if (plane_type != PLANE_TYPE_Y || is_inter_block(mbmi))
+    return DCT_DCT;
+  return mode2txfm_map[mbmi->mode];
+}
 
 static INLINE TX_TYPE get_tx_type_4x4(PLANE_TYPE plane_type,
                                       const MACROBLOCKD *xd, int ib) {
@@ -267,18 +276,6 @@
   return mode2txfm_map[get_y_mode(mi, ib)];
 }
 
-static INLINE TX_TYPE get_tx_type_8x8(PLANE_TYPE plane_type,
-                                      const MACROBLOCKD *xd) {
-  return plane_type == PLANE_TYPE_Y ? mode2txfm_map[xd->mi_8x8[0]->mbmi.mode]
-                                    : DCT_DCT;
-}
-
-static INLINE TX_TYPE get_tx_type_16x16(PLANE_TYPE plane_type,
-                                        const MACROBLOCKD *xd) {
-  return plane_type == PLANE_TYPE_Y ? mode2txfm_map[xd->mi_8x8[0]->mbmi.mode]
-                                    : DCT_DCT;
-}
-
 void vp9_setup_block_planes(MACROBLOCKD *xd, int ss_x, int ss_y);
 
 static INLINE TX_SIZE get_uv_tx_size_impl(TX_SIZE y_tx_size, BLOCK_SIZE bsize) {
diff --git a/vp9/common/vp9_entropy.h b/vp9/common/vp9_entropy.h
index bd5086a..f009777 100644
--- a/vp9/common/vp9_entropy.h
+++ b/vp9/common/vp9_entropy.h
@@ -16,7 +16,6 @@
 #include "vp9/common/vp9_blockd.h"
 #include "vp9/common/vp9_common.h"
 #include "vp9/common/vp9_scan.h"
-#include "vp9/common/vp9_entropymode.h"
 
 #ifdef __cplusplus
 extern "C" {
diff --git a/vp9/common/vp9_entropymode.c b/vp9/common/vp9_entropymode.c
index 8921539..f2c81bc 100644
--- a/vp9/common/vp9_entropymode.c
+++ b/vp9/common/vp9_entropymode.c
@@ -10,7 +10,6 @@
 
 #include "vpx_mem/vpx_mem.h"
 
-#include "vp9/common/vp9_alloccommon.h"
 #include "vp9/common/vp9_onyxc_int.h"
 #include "vp9/common/vp9_seg_common.h"
 
@@ -315,18 +314,18 @@
   { 149, 144, },
 };
 
-void vp9_init_mbmode_probs(VP9_COMMON *cm) {
-  vp9_copy(cm->fc.uv_mode_prob, default_if_uv_probs);
-  vp9_copy(cm->fc.y_mode_prob, default_if_y_probs);
-  vp9_copy(cm->fc.switchable_interp_prob, default_switchable_interp_prob);
-  vp9_copy(cm->fc.partition_prob, default_partition_probs);
-  vp9_copy(cm->fc.intra_inter_prob, default_intra_inter_p);
-  vp9_copy(cm->fc.comp_inter_prob, default_comp_inter_p);
-  vp9_copy(cm->fc.comp_ref_prob, default_comp_ref_p);
-  vp9_copy(cm->fc.single_ref_prob, default_single_ref_p);
-  cm->fc.tx_probs = default_tx_probs;
-  vp9_copy(cm->fc.skip_probs, default_skip_probs);
-  vp9_copy(cm->fc.inter_mode_probs, default_inter_mode_probs);
+void vp9_init_mode_probs(FRAME_CONTEXT *fc) {
+  vp9_copy(fc->uv_mode_prob, default_if_uv_probs);
+  vp9_copy(fc->y_mode_prob, default_if_y_probs);
+  vp9_copy(fc->switchable_interp_prob, default_switchable_interp_prob);
+  vp9_copy(fc->partition_prob, default_partition_probs);
+  vp9_copy(fc->intra_inter_prob, default_intra_inter_p);
+  vp9_copy(fc->comp_inter_prob, default_comp_inter_p);
+  vp9_copy(fc->comp_ref_prob, default_comp_ref_p);
+  vp9_copy(fc->single_ref_prob, default_single_ref_p);
+  fc->tx_probs = default_tx_probs;
+  vp9_copy(fc->skip_probs, default_skip_probs);
+  vp9_copy(fc->inter_mode_probs, default_inter_mode_probs);
 }
 
 const vp9_tree_index vp9_switchable_interp_tree
@@ -452,7 +451,7 @@
   lf->last_sharpness_level = -1;
 
   vp9_default_coef_probs(cm);
-  vp9_init_mbmode_probs(cm);
+  vp9_init_mode_probs(&cm->fc);
   vp9_init_mv_probs(cm);
 
   if (cm->frame_type == KEY_FRAME ||
diff --git a/vp9/common/vp9_entropymode.h b/vp9/common/vp9_entropymode.h
index deec3f6..f881952 100644
--- a/vp9/common/vp9_entropymode.h
+++ b/vp9/common/vp9_entropymode.h
@@ -12,6 +12,8 @@
 #define VP9_COMMON_VP9_ENTROPYMODE_H_
 
 #include "vp9/common/vp9_blockd.h"
+#include "vp9/common/vp9_entropy.h"
+#include "vp9/common/vp9_entropymv.h"
 
 #ifdef __cplusplus
 extern "C" {
@@ -35,6 +37,42 @@
   unsigned int p8x8[TX_SIZE_CONTEXTS][TX_SIZES - 2];
 };
 
+typedef struct frame_contexts {
+  vp9_prob y_mode_prob[BLOCK_SIZE_GROUPS][INTRA_MODES - 1];
+  vp9_prob uv_mode_prob[INTRA_MODES][INTRA_MODES - 1];
+  vp9_prob partition_prob[PARTITION_CONTEXTS][PARTITION_TYPES - 1];
+  vp9_coeff_probs_model coef_probs[TX_SIZES][PLANE_TYPES];
+  vp9_prob switchable_interp_prob[SWITCHABLE_FILTER_CONTEXTS]
+                                 [SWITCHABLE_FILTERS - 1];
+  vp9_prob inter_mode_probs[INTER_MODE_CONTEXTS][INTER_MODES - 1];
+  vp9_prob intra_inter_prob[INTRA_INTER_CONTEXTS];
+  vp9_prob comp_inter_prob[COMP_INTER_CONTEXTS];
+  vp9_prob single_ref_prob[REF_CONTEXTS][2];
+  vp9_prob comp_ref_prob[REF_CONTEXTS];
+  struct tx_probs tx_probs;
+  vp9_prob skip_probs[SKIP_CONTEXTS];
+  nmv_context nmvc;
+} FRAME_CONTEXT;
+
+typedef struct {
+  unsigned int y_mode[BLOCK_SIZE_GROUPS][INTRA_MODES];
+  unsigned int uv_mode[INTRA_MODES][INTRA_MODES];
+  unsigned int partition[PARTITION_CONTEXTS][PARTITION_TYPES];
+  vp9_coeff_count_model coef[TX_SIZES][PLANE_TYPES];
+  unsigned int eob_branch[TX_SIZES][PLANE_TYPES][REF_TYPES]
+                         [COEF_BANDS][COEFF_CONTEXTS];
+  unsigned int switchable_interp[SWITCHABLE_FILTER_CONTEXTS]
+                                [SWITCHABLE_FILTERS];
+  unsigned int inter_mode[INTER_MODE_CONTEXTS][INTER_MODES];
+  unsigned int intra_inter[INTRA_INTER_CONTEXTS][2];
+  unsigned int comp_inter[COMP_INTER_CONTEXTS][2];
+  unsigned int single_ref[REF_CONTEXTS][2][2];
+  unsigned int comp_ref[REF_CONTEXTS][2];
+  struct tx_counts tx;
+  unsigned int skip[SKIP_CONTEXTS][2];
+  nmv_context_counts mv;
+} FRAME_COUNTS;
+
 extern const vp9_prob vp9_kf_uv_mode_prob[INTRA_MODES][INTRA_MODES - 1];
 extern const vp9_prob vp9_kf_y_mode_prob[INTRA_MODES][INTRA_MODES]
                                         [INTRA_MODES - 1];
@@ -48,7 +86,7 @@
 
 void vp9_setup_past_independence(struct VP9Common *cm);
 
-void vp9_init_mbmode_probs(struct VP9Common *cm);
+void vp9_init_mode_probs(FRAME_CONTEXT *fc);
 
 void vp9_adapt_mode_probs(struct VP9Common *cm);
 
diff --git a/vp9/common/vp9_onyx.h b/vp9/common/vp9_onyx.h
index 2220868..61fd21b 100644
--- a/vp9/common/vp9_onyx.h
+++ b/vp9/common/vp9_onyx.h
@@ -23,7 +23,7 @@
 
 #define MAX_SEGMENTS 8
 
-  typedef int *VP9_PTR;
+  struct VP9_COMP;
 
   /* Create/destroy static data structures. */
 
@@ -187,59 +187,59 @@
   } VP9_CONFIG;
 
 
-  void vp9_initialize_enc();
+void vp9_initialize_enc();
 
-  VP9_PTR vp9_create_compressor(VP9_CONFIG *oxcf);
-  void vp9_remove_compressor(VP9_PTR *comp);
+struct VP9_COMP *vp9_create_compressor(VP9_CONFIG *oxcf);
+void vp9_remove_compressor(struct VP9_COMP *cpi);
 
-  void vp9_change_config(VP9_PTR onyx, VP9_CONFIG *oxcf);
+void vp9_change_config(struct VP9_COMP *cpi, VP9_CONFIG *oxcf);
 
   // receive a frames worth of data. caller can assume that a copy of this
   // frame is made and not just a copy of the pointer..
-  int vp9_receive_raw_frame(VP9_PTR comp, unsigned int frame_flags,
-                            YV12_BUFFER_CONFIG *sd, int64_t time_stamp,
-                            int64_t end_time_stamp);
+int vp9_receive_raw_frame(struct VP9_COMP *cpi, unsigned int frame_flags,
+                          YV12_BUFFER_CONFIG *sd, int64_t time_stamp,
+                          int64_t end_time_stamp);
 
-  int vp9_get_compressed_data(VP9_PTR comp, unsigned int *frame_flags,
-                              size_t *size, uint8_t *dest,
-                              int64_t *time_stamp, int64_t *time_end,
-                              int flush);
+int vp9_get_compressed_data(struct VP9_COMP *cpi, unsigned int *frame_flags,
+                            size_t *size, uint8_t *dest,
+                            int64_t *time_stamp, int64_t *time_end, int flush);
 
-  int vp9_get_preview_raw_frame(VP9_PTR comp, YV12_BUFFER_CONFIG *dest,
-                                vp9_ppflags_t *flags);
+int vp9_get_preview_raw_frame(struct VP9_COMP *cpi, YV12_BUFFER_CONFIG *dest,
+                              vp9_ppflags_t *flags);
 
-  int vp9_use_as_reference(VP9_PTR comp, int ref_frame_flags);
+int vp9_use_as_reference(struct VP9_COMP *cpi, int ref_frame_flags);
 
-  int vp9_update_reference(VP9_PTR comp, int ref_frame_flags);
+int vp9_update_reference(struct VP9_COMP *cpi, int ref_frame_flags);
 
-  int vp9_copy_reference_enc(VP9_PTR comp, VP9_REFFRAME ref_frame_flag,
-                             YV12_BUFFER_CONFIG *sd);
+int vp9_copy_reference_enc(struct VP9_COMP *cpi, VP9_REFFRAME ref_frame_flag,
+                           YV12_BUFFER_CONFIG *sd);
 
-  int vp9_get_reference_enc(VP9_PTR ptr, int index, YV12_BUFFER_CONFIG **fb);
+int vp9_get_reference_enc(struct VP9_COMP *cpi, int index,
+                          YV12_BUFFER_CONFIG **fb);
 
-  int vp9_set_reference_enc(VP9_PTR comp, VP9_REFFRAME ref_frame_flag,
-                            YV12_BUFFER_CONFIG *sd);
+int vp9_set_reference_enc(struct VP9_COMP *cpi, VP9_REFFRAME ref_frame_flag,
+                          YV12_BUFFER_CONFIG *sd);
 
-  int vp9_update_entropy(VP9_PTR comp, int update);
+int vp9_update_entropy(struct VP9_COMP *cpi, int update);
 
-  int vp9_set_roimap(VP9_PTR comp, unsigned char *map,
-                     unsigned int rows, unsigned int cols,
-                     int delta_q[MAX_SEGMENTS],
-                     int delta_lf[MAX_SEGMENTS],
-                     unsigned int threshold[MAX_SEGMENTS]);
+int vp9_set_roimap(struct VP9_COMP *cpi, unsigned char *map,
+                   unsigned int rows, unsigned int cols,
+                   int delta_q[MAX_SEGMENTS],
+                   int delta_lf[MAX_SEGMENTS],
+                   unsigned int threshold[MAX_SEGMENTS]);
 
-  int vp9_set_active_map(VP9_PTR comp, unsigned char *map,
-                         unsigned int rows, unsigned int cols);
+int vp9_set_active_map(struct VP9_COMP *cpi, unsigned char *map,
+                       unsigned int rows, unsigned int cols);
 
-  int vp9_set_internal_size(VP9_PTR comp,
-                            VPX_SCALING horiz_mode, VPX_SCALING vert_mode);
+int vp9_set_internal_size(struct VP9_COMP *cpi,
+                          VPX_SCALING horiz_mode, VPX_SCALING vert_mode);
 
-  int vp9_set_size_literal(VP9_PTR comp, unsigned int width,
-                           unsigned int height);
+int vp9_set_size_literal(struct VP9_COMP *cpi, unsigned int width,
+                         unsigned int height);
 
-  void vp9_set_svc(VP9_PTR comp, int use_svc);
+void vp9_set_svc(struct VP9_COMP *cpi, int use_svc);
 
-  int vp9_get_quantizer(VP9_PTR c);
+int vp9_get_quantizer(struct VP9_COMP *cpi);
 
 #ifdef __cplusplus
 }  // extern "C"
diff --git a/vp9/common/vp9_onyxc_int.h b/vp9/common/vp9_onyxc_int.h
index e6d6ea7..69452d3 100644
--- a/vp9/common/vp9_onyxc_int.h
+++ b/vp9/common/vp9_onyxc_int.h
@@ -48,42 +48,6 @@
   PARTITION_CONTEXT left;
 } partition_context_lookup[BLOCK_SIZES];
 
-typedef struct frame_contexts {
-  vp9_prob y_mode_prob[BLOCK_SIZE_GROUPS][INTRA_MODES - 1];
-  vp9_prob uv_mode_prob[INTRA_MODES][INTRA_MODES - 1];
-  vp9_prob partition_prob[PARTITION_CONTEXTS][PARTITION_TYPES - 1];
-  vp9_coeff_probs_model coef_probs[TX_SIZES][PLANE_TYPES];
-  vp9_prob switchable_interp_prob[SWITCHABLE_FILTER_CONTEXTS]
-                                 [SWITCHABLE_FILTERS - 1];
-  vp9_prob inter_mode_probs[INTER_MODE_CONTEXTS][INTER_MODES - 1];
-  vp9_prob intra_inter_prob[INTRA_INTER_CONTEXTS];
-  vp9_prob comp_inter_prob[COMP_INTER_CONTEXTS];
-  vp9_prob single_ref_prob[REF_CONTEXTS][2];
-  vp9_prob comp_ref_prob[REF_CONTEXTS];
-  struct tx_probs tx_probs;
-  vp9_prob skip_probs[SKIP_CONTEXTS];
-  nmv_context nmvc;
-} FRAME_CONTEXT;
-
-typedef struct {
-  unsigned int y_mode[BLOCK_SIZE_GROUPS][INTRA_MODES];
-  unsigned int uv_mode[INTRA_MODES][INTRA_MODES];
-  unsigned int partition[PARTITION_CONTEXTS][PARTITION_TYPES];
-  vp9_coeff_count_model coef[TX_SIZES][PLANE_TYPES];
-  unsigned int eob_branch[TX_SIZES][PLANE_TYPES][REF_TYPES]
-                         [COEF_BANDS][COEFF_CONTEXTS];
-  unsigned int switchable_interp[SWITCHABLE_FILTER_CONTEXTS]
-                                [SWITCHABLE_FILTERS];
-  unsigned int inter_mode[INTER_MODE_CONTEXTS][INTER_MODES];
-  unsigned int intra_inter[INTRA_INTER_CONTEXTS][2];
-  unsigned int comp_inter[COMP_INTER_CONTEXTS][2];
-  unsigned int single_ref[REF_CONTEXTS][2][2];
-  unsigned int comp_ref[REF_CONTEXTS][2];
-  struct tx_counts tx;
-  unsigned int skip[SKIP_CONTEXTS][2];
-  nmv_context_counts mv;
-} FRAME_COUNTS;
-
 
 typedef enum {
   SINGLE_REFERENCE      = 0,
@@ -270,7 +234,8 @@
   return ALIGN_POWER_OF_TWO(n_mis, MI_BLOCK_SIZE_LOG2);
 }
 
-static INLINE const vp9_prob* get_partition_probs(VP9_COMMON *cm, int ctx) {
+static INLINE const vp9_prob* get_partition_probs(const VP9_COMMON *cm,
+                                                  int ctx) {
   return cm->frame_type == KEY_FRAME ? vp9_kf_partition_probs[ctx]
                                      : cm->fc.partition_prob[ctx];
 }
diff --git a/vp9/common/vp9_reconinter.c b/vp9/common/vp9_reconinter.c
index df603ad..bdcfafa 100644
--- a/vp9/common/vp9_reconinter.c
+++ b/vp9/common/vp9_reconinter.c
@@ -244,7 +244,6 @@
 // TODO(jingning): This function serves as a placeholder for decoder prediction
 // using on demand border extension. It should be moved to /decoder/ directory.
 static void dec_build_inter_predictors(MACROBLOCKD *xd, int plane, int block,
-                                       int bw, int bh,
                                        int x, int y, int w, int h,
                                        int mi_x, int mi_y) {
   struct macroblockd_plane *const pd = &xd->plane[plane];
@@ -378,10 +377,10 @@
       assert(bsize == BLOCK_8X8);
       for (y = 0; y < num_4x4_h; ++y)
         for (x = 0; x < num_4x4_w; ++x)
-          dec_build_inter_predictors(xd, plane, i++, bw, bh,
+          dec_build_inter_predictors(xd, plane, i++,
                                      4 * x, 4 * y, 4, 4, mi_x, mi_y);
     } else {
-      dec_build_inter_predictors(xd, plane, 0, bw, bh,
+      dec_build_inter_predictors(xd, plane, 0,
                                  0, 0, bw, bh, mi_x, mi_y);
     }
   }
diff --git a/vp9/common/vp9_reconintra.c b/vp9/common/vp9_reconintra.c
index 71a41a9..86f4f35 100644
--- a/vp9/common/vp9_reconintra.c
+++ b/vp9/common/vp9_reconintra.c
@@ -18,21 +18,17 @@
 #include "vp9/common/vp9_reconintra.h"
 #include "vp9/common/vp9_onyxc_int.h"
 
-const TX_TYPE mode2txfm_map[MB_MODE_COUNT] = {
-    DCT_DCT,    // DC
-    ADST_DCT,   // V
-    DCT_ADST,   // H
-    DCT_DCT,    // D45
-    ADST_ADST,  // D135
-    ADST_DCT,   // D117
-    DCT_ADST,   // D153
-    DCT_ADST,   // D207
-    ADST_DCT,   // D63
-    ADST_ADST,  // TM
-    DCT_DCT,    // NEARESTMV
-    DCT_DCT,    // NEARMV
-    DCT_DCT,    // ZEROMV
-    DCT_DCT     // NEWMV
+const TX_TYPE mode2txfm_map[INTRA_MODES] = {
+  DCT_DCT,    // DC
+  ADST_DCT,   // V
+  DCT_ADST,   // H
+  DCT_DCT,    // D45
+  ADST_ADST,  // D135
+  ADST_DCT,   // D117
+  DCT_ADST,   // D153
+  DCT_ADST,   // D207
+  ADST_DCT,   // D63
+  ADST_ADST,  // TM
 };
 
 #define intra_pred_sized(type, size) \
diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl
new file mode 100644
index 0000000..e4cd9d4
--- /dev/null
+++ b/vp9/common/vp9_rtcd_defs.pl
@@ -0,0 +1,778 @@
+sub vp9_common_forward_decls() {
+print <<EOF
+/*
+ * VP9
+ */
+
+#include "vpx/vpx_integer.h"
+#include "vp9/common/vp9_enums.h"
+
+struct macroblockd;
+
+/* Encoder forward decls */
+struct macroblock;
+struct vp9_variance_vtable;
+
+#define DEC_MVCOSTS int *mvjcost, int *mvcost[2]
+struct mv;
+union int_mv;
+struct yv12_buffer_config;
+EOF
+}
+forward_decls qw/vp9_common_forward_decls/;
+
+# x86inc.asm doesn't work if pic is enabled on 32 bit platforms so no assembly.
+if (vpx_config("CONFIG_USE_X86INC") eq "yes") {
+  $mmx_x86inc = 'mmx';
+  $sse_x86inc = 'sse';
+  $sse2_x86inc = 'sse2';
+  $ssse3_x86inc = 'ssse3';
+  $avx_x86inc = 'avx';
+  $avx2_x86inc = 'avx2';
+} else {
+  $mmx_x86inc = $sse_x86inc = $sse2_x86inc = $ssse3_x86inc =
+  $avx_x86inc = $avx2_x86inc = '';
+}
+
+# this variable is for functions that are 64 bit only.
+if ($opts{arch} eq "x86_64") {
+  $mmx_x86_64 = 'mmx';
+  $sse2_x86_64 = 'sse2';
+  $ssse3_x86_64 = 'ssse3';
+  $avx_x86_64 = 'avx';
+  $avx2_x86_64 = 'avx2';
+} else {
+  $mmx_x86_64 = $sse2_x86_64 = $ssse3_x86_64 =
+  $avx_x86_64 = $avx2_x86_64 = '';
+}
+
+#
+# RECON
+#
+add_proto qw/void vp9_d207_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vp9_d207_predictor_4x4/, "$ssse3_x86inc";
+
+add_proto qw/void vp9_d45_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vp9_d45_predictor_4x4/, "$ssse3_x86inc";
+
+add_proto qw/void vp9_d63_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vp9_d63_predictor_4x4/, "$ssse3_x86inc";
+
+add_proto qw/void vp9_h_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vp9_h_predictor_4x4 neon dspr2/, "$ssse3_x86inc";
+
+add_proto qw/void vp9_d117_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vp9_d117_predictor_4x4/;
+
+add_proto qw/void vp9_d135_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vp9_d135_predictor_4x4/;
+
+add_proto qw/void vp9_d153_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vp9_d153_predictor_4x4/, "$ssse3_x86inc";
+
+add_proto qw/void vp9_v_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vp9_v_predictor_4x4 neon/, "$sse_x86inc";
+
+add_proto qw/void vp9_tm_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vp9_tm_predictor_4x4 neon dspr2/, "$sse_x86inc";
+
+add_proto qw/void vp9_dc_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vp9_dc_predictor_4x4 dspr2/, "$sse_x86inc";
+
+add_proto qw/void vp9_dc_top_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vp9_dc_top_predictor_4x4/;
+
+add_proto qw/void vp9_dc_left_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vp9_dc_left_predictor_4x4/;
+
+add_proto qw/void vp9_dc_128_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vp9_dc_128_predictor_4x4/;
+
+add_proto qw/void vp9_d207_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vp9_d207_predictor_8x8/, "$ssse3_x86inc";
+
+add_proto qw/void vp9_d45_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vp9_d45_predictor_8x8/, "$ssse3_x86inc";
+
+add_proto qw/void vp9_d63_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vp9_d63_predictor_8x8/, "$ssse3_x86inc";
+
+add_proto qw/void vp9_h_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vp9_h_predictor_8x8 neon dspr2/, "$ssse3_x86inc";
+
+add_proto qw/void vp9_d117_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vp9_d117_predictor_8x8/;
+
+add_proto qw/void vp9_d135_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vp9_d135_predictor_8x8/;
+
+add_proto qw/void vp9_d153_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vp9_d153_predictor_8x8/, "$ssse3_x86inc";
+
+add_proto qw/void vp9_v_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vp9_v_predictor_8x8 neon/, "$sse_x86inc";
+
+add_proto qw/void vp9_tm_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vp9_tm_predictor_8x8 neon dspr2/, "$sse2_x86inc";
+
+add_proto qw/void vp9_dc_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vp9_dc_predictor_8x8 dspr2/, "$sse_x86inc";
+
+add_proto qw/void vp9_dc_top_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vp9_dc_top_predictor_8x8/;
+
+add_proto qw/void vp9_dc_left_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vp9_dc_left_predictor_8x8/;
+
+add_proto qw/void vp9_dc_128_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vp9_dc_128_predictor_8x8/;
+
+add_proto qw/void vp9_d207_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vp9_d207_predictor_16x16/, "$ssse3_x86inc";
+
+add_proto qw/void vp9_d45_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vp9_d45_predictor_16x16/, "$ssse3_x86inc";
+
+add_proto qw/void vp9_d63_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vp9_d63_predictor_16x16/, "$ssse3_x86inc";
+
+add_proto qw/void vp9_h_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vp9_h_predictor_16x16 neon dspr2/, "$ssse3_x86inc";
+
+add_proto qw/void vp9_d117_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vp9_d117_predictor_16x16/;
+
+add_proto qw/void vp9_d135_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vp9_d135_predictor_16x16/;
+
+add_proto qw/void vp9_d153_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vp9_d153_predictor_16x16/, "$ssse3_x86inc";
+
+add_proto qw/void vp9_v_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vp9_v_predictor_16x16 neon/, "$sse2_x86inc";
+
+add_proto qw/void vp9_tm_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vp9_tm_predictor_16x16 neon/, "$sse2_x86inc";
+
+add_proto qw/void vp9_dc_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vp9_dc_predictor_16x16 dspr2/, "$sse2_x86inc";
+
+add_proto qw/void vp9_dc_top_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vp9_dc_top_predictor_16x16/;
+
+add_proto qw/void vp9_dc_left_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vp9_dc_left_predictor_16x16/;
+
+add_proto qw/void vp9_dc_128_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vp9_dc_128_predictor_16x16/;
+
+add_proto qw/void vp9_d207_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vp9_d207_predictor_32x32/, "$ssse3_x86inc";
+
+add_proto qw/void vp9_d45_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vp9_d45_predictor_32x32/, "$ssse3_x86inc";
+
+add_proto qw/void vp9_d63_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vp9_d63_predictor_32x32/, "$ssse3_x86inc";
+
+add_proto qw/void vp9_h_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vp9_h_predictor_32x32 neon/, "$ssse3_x86inc";
+
+add_proto qw/void vp9_d117_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vp9_d117_predictor_32x32/;
+
+add_proto qw/void vp9_d135_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vp9_d135_predictor_32x32/;
+
+add_proto qw/void vp9_d153_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vp9_d153_predictor_32x32/;
+
+add_proto qw/void vp9_v_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vp9_v_predictor_32x32 neon/, "$sse2_x86inc";
+
+add_proto qw/void vp9_tm_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vp9_tm_predictor_32x32 neon/, "$sse2_x86_64";
+
+add_proto qw/void vp9_dc_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vp9_dc_predictor_32x32/, "$sse2_x86inc";
+
+add_proto qw/void vp9_dc_top_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vp9_dc_top_predictor_32x32/;
+
+add_proto qw/void vp9_dc_left_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vp9_dc_left_predictor_32x32/;
+
+add_proto qw/void vp9_dc_128_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+specialize qw/vp9_dc_128_predictor_32x32/;
+
+#
+# Loopfilter
+#
+add_proto qw/void vp9_lpf_vertical_16/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh";
+specialize qw/vp9_lpf_vertical_16 sse2 neon dspr2/;
+
+add_proto qw/void vp9_lpf_vertical_16_dual/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh";
+specialize qw/vp9_lpf_vertical_16_dual sse2 neon dspr2/;
+
+add_proto qw/void vp9_lpf_vertical_8/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count";
+specialize qw/vp9_lpf_vertical_8 sse2 neon dspr2/;
+
+add_proto qw/void vp9_lpf_vertical_8_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1";
+specialize qw/vp9_lpf_vertical_8_dual sse2 neon dspr2/;
+
+add_proto qw/void vp9_lpf_vertical_4/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count";
+specialize qw/vp9_lpf_vertical_4 mmx neon dspr2/;
+
+add_proto qw/void vp9_lpf_vertical_4_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1";
+specialize qw/vp9_lpf_vertical_4_dual sse2 neon dspr2/;
+
+add_proto qw/void vp9_lpf_horizontal_16/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count";
+specialize qw/vp9_lpf_horizontal_16 sse2 avx2 neon dspr2/;
+
+add_proto qw/void vp9_lpf_horizontal_8/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count";
+specialize qw/vp9_lpf_horizontal_8 sse2 neon dspr2/;
+
+add_proto qw/void vp9_lpf_horizontal_8_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1";
+specialize qw/vp9_lpf_horizontal_8_dual sse2 neon dspr2/;
+
+add_proto qw/void vp9_lpf_horizontal_4/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count";
+specialize qw/vp9_lpf_horizontal_4 mmx neon dspr2/;
+
+add_proto qw/void vp9_lpf_horizontal_4_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1";
+specialize qw/vp9_lpf_horizontal_4_dual sse2 neon dspr2/;
+
+#
+# post proc
+#
+if (vpx_config("CONFIG_VP9_POSTPROC") eq "yes") {
+add_proto qw/void vp9_mbpost_proc_down/, "uint8_t *dst, int pitch, int rows, int cols, int flimit";
+specialize qw/vp9_mbpost_proc_down mmx sse2/;
+$vp9_mbpost_proc_down_sse2=vp9_mbpost_proc_down_xmm;
+
+add_proto qw/void vp9_mbpost_proc_across_ip/, "uint8_t *src, int pitch, int rows, int cols, int flimit";
+specialize qw/vp9_mbpost_proc_across_ip sse2/;
+$vp9_mbpost_proc_across_ip_sse2=vp9_mbpost_proc_across_ip_xmm;
+
+add_proto qw/void vp9_post_proc_down_and_across/, "const uint8_t *src_ptr, uint8_t *dst_ptr, int src_pixels_per_line, int dst_pixels_per_line, int rows, int cols, int flimit";
+specialize qw/vp9_post_proc_down_and_across mmx sse2/;
+$vp9_post_proc_down_and_across_sse2=vp9_post_proc_down_and_across_xmm;
+
+add_proto qw/void vp9_plane_add_noise/, "uint8_t *Start, char *noise, char blackclamp[16], char whiteclamp[16], char bothclamp[16], unsigned int Width, unsigned int Height, int Pitch";
+specialize qw/vp9_plane_add_noise mmx sse2/;
+$vp9_plane_add_noise_sse2=vp9_plane_add_noise_wmt;
+}
+
+add_proto qw/void vp9_blend_mb_inner/, "uint8_t *y, uint8_t *u, uint8_t *v, int y1, int u1, int v1, int alpha, int stride";
+specialize qw/vp9_blend_mb_inner/;
+
+add_proto qw/void vp9_blend_mb_outer/, "uint8_t *y, uint8_t *u, uint8_t *v, int y1, int u1, int v1, int alpha, int stride";
+specialize qw/vp9_blend_mb_outer/;
+
+add_proto qw/void vp9_blend_b/, "uint8_t *y, uint8_t *u, uint8_t *v, int y1, int u1, int v1, int alpha, int stride";
+specialize qw/vp9_blend_b/;
+
+#
+# Sub Pixel Filters
+#
+add_proto qw/void vp9_convolve_copy/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
+specialize qw/vp9_convolve_copy neon dspr2/, "$sse2_x86inc";
+
+add_proto qw/void vp9_convolve_avg/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
+specialize qw/vp9_convolve_avg neon dspr2/, "$sse2_x86inc";
+
+add_proto qw/void vp9_convolve8/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
+specialize qw/vp9_convolve8 sse2 ssse3 avx2 neon dspr2/;
+
+add_proto qw/void vp9_convolve8_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
+specialize qw/vp9_convolve8_horiz sse2 ssse3 avx2 neon dspr2/;
+
+add_proto qw/void vp9_convolve8_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
+specialize qw/vp9_convolve8_vert sse2 ssse3 avx2 neon dspr2/;
+
+add_proto qw/void vp9_convolve8_avg/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
+specialize qw/vp9_convolve8_avg sse2 ssse3 neon dspr2/;
+
+add_proto qw/void vp9_convolve8_avg_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
+specialize qw/vp9_convolve8_avg_horiz sse2 ssse3 neon dspr2/;
+
+add_proto qw/void vp9_convolve8_avg_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
+specialize qw/vp9_convolve8_avg_vert sse2 ssse3 neon dspr2/;
+
+#
+# dct
+#
+add_proto qw/void vp9_idct4x4_1_add/, "const int16_t *input, uint8_t *dest, int dest_stride";
+specialize qw/vp9_idct4x4_1_add sse2 neon dspr2/;
+
+add_proto qw/void vp9_idct4x4_16_add/, "const int16_t *input, uint8_t *dest, int dest_stride";
+specialize qw/vp9_idct4x4_16_add sse2 neon dspr2/;
+
+add_proto qw/void vp9_idct8x8_1_add/, "const int16_t *input, uint8_t *dest, int dest_stride";
+specialize qw/vp9_idct8x8_1_add sse2 neon dspr2/;
+
+add_proto qw/void vp9_idct8x8_64_add/, "const int16_t *input, uint8_t *dest, int dest_stride";
+specialize qw/vp9_idct8x8_64_add sse2 neon dspr2/;
+
+add_proto qw/void vp9_idct8x8_10_add/, "const int16_t *input, uint8_t *dest, int dest_stride";
+specialize qw/vp9_idct8x8_10_add sse2 neon dspr2/;
+
+add_proto qw/void vp9_idct16x16_1_add/, "const int16_t *input, uint8_t *dest, int dest_stride";
+specialize qw/vp9_idct16x16_1_add sse2 neon dspr2/;
+
+add_proto qw/void vp9_idct16x16_256_add/, "const int16_t *input, uint8_t *dest, int dest_stride";
+specialize qw/vp9_idct16x16_256_add sse2 neon dspr2/;
+
+add_proto qw/void vp9_idct16x16_10_add/, "const int16_t *input, uint8_t *dest, int dest_stride";
+specialize qw/vp9_idct16x16_10_add sse2 neon dspr2/;
+
+add_proto qw/void vp9_idct32x32_1024_add/, "const int16_t *input, uint8_t *dest, int dest_stride";
+specialize qw/vp9_idct32x32_1024_add sse2 neon dspr2/;
+
+add_proto qw/void vp9_idct32x32_34_add/, "const int16_t *input, uint8_t *dest, int dest_stride";
+specialize qw/vp9_idct32x32_34_add sse2 neon dspr2/;
+$vp9_idct32x32_34_add_neon=vp9_idct32x32_1024_add_neon;
+
+add_proto qw/void vp9_idct32x32_1_add/, "const int16_t *input, uint8_t *dest, int dest_stride";
+specialize qw/vp9_idct32x32_1_add sse2 neon dspr2/;
+
+add_proto qw/void vp9_iht4x4_16_add/, "const int16_t *input, uint8_t *dest, int dest_stride, int tx_type";
+specialize qw/vp9_iht4x4_16_add sse2 neon dspr2/;
+
+add_proto qw/void vp9_iht8x8_64_add/, "const int16_t *input, uint8_t *dest, int dest_stride, int tx_type";
+specialize qw/vp9_iht8x8_64_add sse2 neon dspr2/;
+
+add_proto qw/void vp9_iht16x16_256_add/, "const int16_t *input, uint8_t *output, int pitch, int tx_type";
+specialize qw/vp9_iht16x16_256_add sse2 dspr2/;
+
+# dct and add
+
+add_proto qw/void vp9_iwht4x4_1_add/, "const int16_t *input, uint8_t *dest, int dest_stride";
+specialize qw/vp9_iwht4x4_1_add/;
+
+add_proto qw/void vp9_iwht4x4_16_add/, "const int16_t *input, uint8_t *dest, int dest_stride";
+specialize qw/vp9_iwht4x4_16_add/;
+
+#
+# Encoder functions below this point.
+#
+if (vpx_config("CONFIG_VP9_ENCODER") eq "yes") {
+
+
+# variance
+add_proto qw/unsigned int vp9_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+specialize qw/vp9_variance32x16/, "$sse2_x86inc", "$avx2_x86inc";
+
+add_proto qw/unsigned int vp9_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+specialize qw/vp9_variance16x32/, "$sse2_x86inc";
+
+add_proto qw/unsigned int vp9_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+specialize qw/vp9_variance64x32/, "$sse2_x86inc", "$avx2_x86inc";
+
+add_proto qw/unsigned int vp9_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+specialize qw/vp9_variance32x64/, "$sse2_x86inc";
+
+add_proto qw/unsigned int vp9_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+specialize qw/vp9_variance32x32/, "$sse2_x86inc", "$avx2_x86inc";
+
+add_proto qw/unsigned int vp9_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+specialize qw/vp9_variance64x64/, "$sse2_x86inc", "$avx2_x86inc";
+
+add_proto qw/unsigned int vp9_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+specialize qw/vp9_variance16x16 mmx/, "$sse2_x86inc", "$avx2_x86inc";
+
+add_proto qw/unsigned int vp9_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+specialize qw/vp9_variance16x8 mmx/, "$sse2_x86inc";
+
+add_proto qw/unsigned int vp9_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+specialize qw/vp9_variance8x16 mmx/, "$sse2_x86inc";
+
+add_proto qw/unsigned int vp9_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+specialize qw/vp9_variance8x8 mmx/, "$sse2_x86inc";
+
+add_proto qw/void vp9_get_sse_sum_8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
+specialize qw/vp9_get_sse_sum_8x8 sse2/;
+$vp9_get_sse_sum_8x8_sse2=vp9_get8x8var_sse2;
+
+add_proto qw/unsigned int vp9_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+specialize qw/vp9_variance8x4/, "$sse2_x86inc";
+
+add_proto qw/unsigned int vp9_variance4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+specialize qw/vp9_variance4x8/, "$sse2_x86inc";
+
+add_proto qw/unsigned int vp9_variance4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+specialize qw/vp9_variance4x4 mmx/, "$sse2_x86inc";
+
+add_proto qw/unsigned int vp9_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+specialize qw/vp9_sub_pixel_variance64x64 avx2/, "$sse2_x86inc", "$ssse3_x86inc";
+
+add_proto qw/unsigned int vp9_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
+specialize qw/vp9_sub_pixel_avg_variance64x64 avx2/, "$sse2_x86inc", "$ssse3_x86inc";
+
+add_proto qw/unsigned int vp9_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+specialize qw/vp9_sub_pixel_variance32x64/, "$sse2_x86inc", "$ssse3_x86inc";
+
+add_proto qw/unsigned int vp9_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
+specialize qw/vp9_sub_pixel_avg_variance32x64/, "$sse2_x86inc", "$ssse3_x86inc";
+
+add_proto qw/unsigned int vp9_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+specialize qw/vp9_sub_pixel_variance64x32/, "$sse2_x86inc", "$ssse3_x86inc";
+
+add_proto qw/unsigned int vp9_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
+specialize qw/vp9_sub_pixel_avg_variance64x32/, "$sse2_x86inc", "$ssse3_x86inc";
+
+add_proto qw/unsigned int vp9_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+specialize qw/vp9_sub_pixel_variance32x16/, "$sse2_x86inc", "$ssse3_x86inc";
+
+add_proto qw/unsigned int vp9_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
+specialize qw/vp9_sub_pixel_avg_variance32x16/, "$sse2_x86inc", "$ssse3_x86inc";
+
+add_proto qw/unsigned int vp9_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+specialize qw/vp9_sub_pixel_variance16x32/, "$sse2_x86inc", "$ssse3_x86inc";
+
+add_proto qw/unsigned int vp9_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
+specialize qw/vp9_sub_pixel_avg_variance16x32/, "$sse2_x86inc", "$ssse3_x86inc";
+
+add_proto qw/unsigned int vp9_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+specialize qw/vp9_sub_pixel_variance32x32 avx2/, "$sse2_x86inc", "$ssse3_x86inc";
+
+add_proto qw/unsigned int vp9_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
+specialize qw/vp9_sub_pixel_avg_variance32x32 avx2/, "$sse2_x86inc", "$ssse3_x86inc";
+
+add_proto qw/unsigned int vp9_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+specialize qw/vp9_sub_pixel_variance16x16/, "$sse2_x86inc", "$ssse3_x86inc";
+
+add_proto qw/unsigned int vp9_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
+specialize qw/vp9_sub_pixel_avg_variance16x16/, "$sse2_x86inc", "$ssse3_x86inc";
+
+add_proto qw/unsigned int vp9_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+specialize qw/vp9_sub_pixel_variance8x16/, "$sse2_x86inc", "$ssse3_x86inc";
+
+add_proto qw/unsigned int vp9_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
+specialize qw/vp9_sub_pixel_avg_variance8x16/, "$sse2_x86inc", "$ssse3_x86inc";
+
+add_proto qw/unsigned int vp9_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+specialize qw/vp9_sub_pixel_variance16x8/, "$sse2_x86inc", "$ssse3_x86inc";
+
+add_proto qw/unsigned int vp9_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
+specialize qw/vp9_sub_pixel_avg_variance16x8/, "$sse2_x86inc", "$ssse3_x86inc";
+
+add_proto qw/unsigned int vp9_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+specialize qw/vp9_sub_pixel_variance8x8/, "$sse2_x86inc", "$ssse3_x86inc";
+
+add_proto qw/unsigned int vp9_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
+specialize qw/vp9_sub_pixel_avg_variance8x8/, "$sse2_x86inc", "$ssse3_x86inc";
+
+# TODO(jingning): need to convert 8x4/4x8 functions into mmx/sse form
+add_proto qw/unsigned int vp9_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+specialize qw/vp9_sub_pixel_variance8x4/, "$sse2_x86inc", "$ssse3_x86inc";
+
+add_proto qw/unsigned int vp9_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
+specialize qw/vp9_sub_pixel_avg_variance8x4/, "$sse2_x86inc", "$ssse3_x86inc";
+
+add_proto qw/unsigned int vp9_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+specialize qw/vp9_sub_pixel_variance4x8/, "$sse_x86inc", "$ssse3_x86inc";
+
+add_proto qw/unsigned int vp9_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
+specialize qw/vp9_sub_pixel_avg_variance4x8/, "$sse_x86inc", "$ssse3_x86inc";
+
+add_proto qw/unsigned int vp9_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+specialize qw/vp9_sub_pixel_variance4x4/, "$sse_x86inc", "$ssse3_x86inc";
+#vp9_sub_pixel_variance4x4_sse2=vp9_sub_pixel_variance4x4_wmt
+
+add_proto qw/unsigned int vp9_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
+specialize qw/vp9_sub_pixel_avg_variance4x4/, "$sse_x86inc", "$ssse3_x86inc";
+
+add_proto qw/unsigned int vp9_sad64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, unsigned int max_sad";
+specialize qw/vp9_sad64x64/, "$sse2_x86inc";
+
+add_proto qw/unsigned int vp9_sad32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad";
+specialize qw/vp9_sad32x64/, "$sse2_x86inc";
+
+add_proto qw/unsigned int vp9_sad64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad";
+specialize qw/vp9_sad64x32/, "$sse2_x86inc";
+
+add_proto qw/unsigned int vp9_sad32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad";
+specialize qw/vp9_sad32x16/, "$sse2_x86inc";
+
+add_proto qw/unsigned int vp9_sad16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad";
+specialize qw/vp9_sad16x32/, "$sse2_x86inc";
+
+add_proto qw/unsigned int vp9_sad32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, unsigned int max_sad";
+specialize qw/vp9_sad32x32/, "$sse2_x86inc";
+
+add_proto qw/unsigned int vp9_sad16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, unsigned int max_sad";
+specialize qw/vp9_sad16x16 mmx/, "$sse2_x86inc";
+
+add_proto qw/unsigned int vp9_sad16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, unsigned int max_sad";
+specialize qw/vp9_sad16x8 mmx/, "$sse2_x86inc";
+
+add_proto qw/unsigned int vp9_sad8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, unsigned int max_sad";
+specialize qw/vp9_sad8x16 mmx/, "$sse2_x86inc";
+
+add_proto qw/unsigned int vp9_sad8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, unsigned int max_sad";
+specialize qw/vp9_sad8x8 mmx/, "$sse2_x86inc";
+
+add_proto qw/unsigned int vp9_sad8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad";
+specialize qw/vp9_sad8x4/, "$sse2_x86inc";
+
+add_proto qw/unsigned int vp9_sad4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad";
+specialize qw/vp9_sad4x8/, "$sse_x86inc";
+
+add_proto qw/unsigned int vp9_sad4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, unsigned int max_sad";
+specialize qw/vp9_sad4x4 mmx/, "$sse_x86inc";
+
+add_proto qw/unsigned int vp9_sad64x64_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, const uint8_t *second_pred, unsigned int max_sad";
+specialize qw/vp9_sad64x64_avg/, "$sse2_x86inc";
+
+add_proto qw/unsigned int vp9_sad32x64_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad";
+specialize qw/vp9_sad32x64_avg/, "$sse2_x86inc";
+
+add_proto qw/unsigned int vp9_sad64x32_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad";
+specialize qw/vp9_sad64x32_avg/, "$sse2_x86inc";
+
+add_proto qw/unsigned int vp9_sad32x16_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad";
+specialize qw/vp9_sad32x16_avg/, "$sse2_x86inc";
+
+add_proto qw/unsigned int vp9_sad16x32_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad";
+specialize qw/vp9_sad16x32_avg/, "$sse2_x86inc";
+
+add_proto qw/unsigned int vp9_sad32x32_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, const uint8_t *second_pred, unsigned int max_sad";
+specialize qw/vp9_sad32x32_avg/, "$sse2_x86inc";
+
+add_proto qw/unsigned int vp9_sad16x16_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, const uint8_t *second_pred, unsigned int max_sad";
+specialize qw/vp9_sad16x16_avg/, "$sse2_x86inc";
+
+add_proto qw/unsigned int vp9_sad16x8_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, const uint8_t *second_pred, unsigned int max_sad";
+specialize qw/vp9_sad16x8_avg/, "$sse2_x86inc";
+
+add_proto qw/unsigned int vp9_sad8x16_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, const uint8_t *second_pred, unsigned int max_sad";
+specialize qw/vp9_sad8x16_avg/, "$sse2_x86inc";
+
+add_proto qw/unsigned int vp9_sad8x8_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, const uint8_t *second_pred, unsigned int max_sad";
+specialize qw/vp9_sad8x8_avg/, "$sse2_x86inc";
+
+add_proto qw/unsigned int vp9_sad8x4_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad";
+specialize qw/vp9_sad8x4_avg/, "$sse2_x86inc";
+
+add_proto qw/unsigned int vp9_sad4x8_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad";
+specialize qw/vp9_sad4x8_avg/, "$sse_x86inc";
+
+add_proto qw/unsigned int vp9_sad4x4_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, const uint8_t *second_pred, unsigned int max_sad";
+specialize qw/vp9_sad4x4_avg/, "$sse_x86inc";
+
+add_proto qw/unsigned int vp9_variance_halfpixvar16x16_h/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+specialize qw/vp9_variance_halfpixvar16x16_h/, "$sse2_x86inc";
+
+add_proto qw/unsigned int vp9_variance_halfpixvar16x16_v/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+specialize qw/vp9_variance_halfpixvar16x16_v/, "$sse2_x86inc";
+
+add_proto qw/unsigned int vp9_variance_halfpixvar16x16_hv/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+specialize qw/vp9_variance_halfpixvar16x16_hv/, "$sse2_x86inc";
+
+add_proto qw/unsigned int vp9_variance_halfpixvar64x64_h/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+specialize qw/vp9_variance_halfpixvar64x64_h/;
+
+add_proto qw/unsigned int vp9_variance_halfpixvar64x64_v/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+specialize qw/vp9_variance_halfpixvar64x64_v/;
+
+add_proto qw/unsigned int vp9_variance_halfpixvar64x64_hv/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+specialize qw/vp9_variance_halfpixvar64x64_hv/;
+
+add_proto qw/unsigned int vp9_variance_halfpixvar32x32_h/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+specialize qw/vp9_variance_halfpixvar32x32_h/;
+
+add_proto qw/unsigned int vp9_variance_halfpixvar32x32_v/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+specialize qw/vp9_variance_halfpixvar32x32_v/;
+
+add_proto qw/unsigned int vp9_variance_halfpixvar32x32_hv/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+specialize qw/vp9_variance_halfpixvar32x32_hv/;
+
+add_proto qw/void vp9_sad64x64x3/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, unsigned int *sad_array";
+specialize qw/vp9_sad64x64x3/;
+
+add_proto qw/void vp9_sad32x32x3/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, unsigned int *sad_array";
+specialize qw/vp9_sad32x32x3/;
+
+add_proto qw/void vp9_sad16x16x3/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, unsigned int *sad_array";
+specialize qw/vp9_sad16x16x3 sse3 ssse3/;
+
+add_proto qw/void vp9_sad16x8x3/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, unsigned int *sad_array";
+specialize qw/vp9_sad16x8x3 sse3 ssse3/;
+
+add_proto qw/void vp9_sad8x16x3/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, unsigned int *sad_array";
+specialize qw/vp9_sad8x16x3 sse3/;
+
+add_proto qw/void vp9_sad8x8x3/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, unsigned int *sad_array";
+specialize qw/vp9_sad8x8x3 sse3/;
+
+add_proto qw/void vp9_sad4x4x3/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, unsigned int *sad_array";
+specialize qw/vp9_sad4x4x3 sse3/;
+
+add_proto qw/void vp9_sad64x64x8/, "const uint8_t *src_ptr, int  src_stride, const uint8_t *ref_ptr, int  ref_stride, uint32_t *sad_array";
+specialize qw/vp9_sad64x64x8/;
+
+add_proto qw/void vp9_sad32x32x8/, "const uint8_t *src_ptr, int  src_stride, const uint8_t *ref_ptr, int  ref_stride, uint32_t *sad_array";
+specialize qw/vp9_sad32x32x8/;
+
+add_proto qw/void vp9_sad16x16x8/, "const uint8_t *src_ptr, int  src_stride, const uint8_t *ref_ptr, int  ref_stride, uint32_t *sad_array";
+specialize qw/vp9_sad16x16x8 sse4/;
+
+add_proto qw/void vp9_sad16x8x8/, "const uint8_t *src_ptr, int  src_stride, const uint8_t *ref_ptr, int  ref_stride, uint32_t *sad_array";
+specialize qw/vp9_sad16x8x8 sse4/;
+
+add_proto qw/void vp9_sad8x16x8/, "const uint8_t *src_ptr, int  src_stride, const uint8_t *ref_ptr, int  ref_stride, uint32_t *sad_array";
+specialize qw/vp9_sad8x16x8 sse4/;
+
+add_proto qw/void vp9_sad8x8x8/, "const uint8_t *src_ptr, int  src_stride, const uint8_t *ref_ptr, int  ref_stride, uint32_t *sad_array";
+specialize qw/vp9_sad8x8x8 sse4/;
+
+add_proto qw/void vp9_sad8x4x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+specialize qw/vp9_sad8x4x8/;
+
+add_proto qw/void vp9_sad4x8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+specialize qw/vp9_sad4x8x8/;
+
+add_proto qw/void vp9_sad4x4x8/, "const uint8_t *src_ptr, int  src_stride, const uint8_t *ref_ptr, int  ref_stride, uint32_t *sad_array";
+specialize qw/vp9_sad4x4x8 sse4/;
+
+add_proto qw/void vp9_sad64x64x4d/, "const uint8_t *src_ptr, int  src_stride, const uint8_t* const ref_ptr[], int  ref_stride, unsigned int *sad_array";
+specialize qw/vp9_sad64x64x4d sse2/;
+
+add_proto qw/void vp9_sad32x64x4d/, "const uint8_t *src_ptr, int  src_stride, const uint8_t* const ref_ptr[], int  ref_stride, unsigned int *sad_array";
+specialize qw/vp9_sad32x64x4d sse2/;
+
+add_proto qw/void vp9_sad64x32x4d/, "const uint8_t *src_ptr, int  src_stride, const uint8_t* const ref_ptr[], int  ref_stride, unsigned int *sad_array";
+specialize qw/vp9_sad64x32x4d sse2/;
+
+add_proto qw/void vp9_sad32x16x4d/, "const uint8_t *src_ptr, int  src_stride, const uint8_t* const ref_ptr[], int  ref_stride, unsigned int *sad_array";
+specialize qw/vp9_sad32x16x4d sse2/;
+
+add_proto qw/void vp9_sad16x32x4d/, "const uint8_t *src_ptr, int  src_stride, const uint8_t* const ref_ptr[], int  ref_stride, unsigned int *sad_array";
+specialize qw/vp9_sad16x32x4d sse2/;
+
+add_proto qw/void vp9_sad32x32x4d/, "const uint8_t *src_ptr, int  src_stride, const uint8_t* const ref_ptr[], int  ref_stride, unsigned int *sad_array";
+specialize qw/vp9_sad32x32x4d sse2/;
+
+add_proto qw/void vp9_sad16x16x4d/, "const uint8_t *src_ptr, int  src_stride, const uint8_t* const ref_ptr[], int  ref_stride, unsigned int *sad_array";
+specialize qw/vp9_sad16x16x4d sse2/;
+
+add_proto qw/void vp9_sad16x8x4d/, "const uint8_t *src_ptr, int  src_stride, const uint8_t* const ref_ptr[], int  ref_stride, unsigned int *sad_array";
+specialize qw/vp9_sad16x8x4d sse2/;
+
+add_proto qw/void vp9_sad8x16x4d/, "const uint8_t *src_ptr, int  src_stride, const uint8_t* const ref_ptr[], int  ref_stride, unsigned int *sad_array";
+specialize qw/vp9_sad8x16x4d sse2/;
+
+add_proto qw/void vp9_sad8x8x4d/, "const uint8_t *src_ptr, int  src_stride, const uint8_t* const ref_ptr[], int  ref_stride, unsigned int *sad_array";
+specialize qw/vp9_sad8x8x4d sse2/;
+
+# TODO(jingning): need to convert these 4x8/8x4 functions into sse2 form
+add_proto qw/void vp9_sad8x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array";
+specialize qw/vp9_sad8x4x4d sse2/;
+
+add_proto qw/void vp9_sad4x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array";
+specialize qw/vp9_sad4x8x4d sse/;
+
+add_proto qw/void vp9_sad4x4x4d/, "const uint8_t *src_ptr, int  src_stride, const uint8_t* const ref_ptr[], int  ref_stride, unsigned int *sad_array";
+specialize qw/vp9_sad4x4x4d sse/;
+
+#add_proto qw/unsigned int vp9_sub_pixel_mse16x16/, "const uint8_t *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, const uint8_t *dst_ptr, int dst_pixels_per_line, unsigned int *sse";
+#specialize qw/vp9_sub_pixel_mse16x16 sse2 mmx/;
+
+add_proto qw/unsigned int vp9_mse16x16/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
+specialize qw/vp9_mse16x16 mmx/, "$sse2_x86inc", "$avx2_x86inc";
+
+add_proto qw/unsigned int vp9_mse8x16/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
+specialize qw/vp9_mse8x16/;
+
+add_proto qw/unsigned int vp9_mse16x8/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
+specialize qw/vp9_mse16x8/;
+
+add_proto qw/unsigned int vp9_mse8x8/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
+specialize qw/vp9_mse8x8/;
+
+add_proto qw/unsigned int vp9_sub_pixel_mse64x64/, "const uint8_t *src_ptr, int  source_stride, int  xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+specialize qw/vp9_sub_pixel_mse64x64/;
+
+add_proto qw/unsigned int vp9_sub_pixel_mse32x32/, "const uint8_t *src_ptr, int  source_stride, int  xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+specialize qw/vp9_sub_pixel_mse32x32/;
+
+add_proto qw/unsigned int vp9_get_mb_ss/, "const int16_t *";
+specialize qw/vp9_get_mb_ss mmx sse2/;
+# ENCODEMB INVOKE
+
+add_proto qw/int64_t vp9_block_error/, "const int16_t *coeff, const int16_t *dqcoeff, intptr_t block_size, int64_t *ssz";
+specialize qw/vp9_block_error/, "$sse2_x86inc";
+
+add_proto qw/void vp9_subtract_block/, "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride";
+specialize qw/vp9_subtract_block/, "$sse2_x86inc";
+
+add_proto qw/void vp9_quantize_b/, "const int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+specialize qw/vp9_quantize_b/, "$ssse3_x86_64";
+
+add_proto qw/void vp9_quantize_b_32x32/, "const int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+specialize qw/vp9_quantize_b_32x32/, "$ssse3_x86_64";
+
+#
+# Structured Similarity (SSIM)
+#
+if (vpx_config("CONFIG_INTERNAL_STATS") eq "yes") {
+    add_proto qw/void vp9_ssim_parms_8x8/, "uint8_t *s, int sp, uint8_t *r, int rp, unsigned long *sum_s, unsigned long *sum_r, unsigned long *sum_sq_s, unsigned long *sum_sq_r, unsigned long *sum_sxr";
+    specialize qw/vp9_ssim_parms_8x8/, "$sse2_x86_64";
+
+    add_proto qw/void vp9_ssim_parms_16x16/, "uint8_t *s, int sp, uint8_t *r, int rp, unsigned long *sum_s, unsigned long *sum_r, unsigned long *sum_sq_s, unsigned long *sum_sq_r, unsigned long *sum_sxr";
+    specialize qw/vp9_ssim_parms_16x16/, "$sse2_x86_64";
+}
+
+# fdct functions
+add_proto qw/void vp9_fht4x4/, "const int16_t *input, int16_t *output, int stride, int tx_type";
+specialize qw/vp9_fht4x4 sse2 avx2/;
+
+add_proto qw/void vp9_fht8x8/, "const int16_t *input, int16_t *output, int stride, int tx_type";
+specialize qw/vp9_fht8x8 sse2 avx2/;
+
+add_proto qw/void vp9_fht16x16/, "const int16_t *input, int16_t *output, int stride, int tx_type";
+specialize qw/vp9_fht16x16 sse2 avx2/;
+
+add_proto qw/void vp9_fwht4x4/, "const int16_t *input, int16_t *output, int stride";
+specialize qw/vp9_fwht4x4/;
+
+add_proto qw/void vp9_fdct4x4/, "const int16_t *input, int16_t *output, int stride";
+specialize qw/vp9_fdct4x4 sse2 avx2/;
+
+add_proto qw/void vp9_fdct8x8/, "const int16_t *input, int16_t *output, int stride";
+specialize qw/vp9_fdct8x8 sse2 avx2/;
+
+add_proto qw/void vp9_fdct16x16/, "const int16_t *input, int16_t *output, int stride";
+specialize qw/vp9_fdct16x16 sse2 avx2/;
+
+add_proto qw/void vp9_fdct32x32/, "const int16_t *input, int16_t *output, int stride";
+specialize qw/vp9_fdct32x32 sse2 avx2/;
+
+add_proto qw/void vp9_fdct32x32_rd/, "const int16_t *input, int16_t *output, int stride";
+specialize qw/vp9_fdct32x32_rd sse2 avx2/;
+
+#
+# Motion search
+#
+add_proto qw/int vp9_full_search_sad/, "const struct macroblock *x, const struct mv *ref_mv, int sad_per_bit, int distance, const struct vp9_variance_vtable *fn_ptr, DEC_MVCOSTS, const struct mv *center_mv, struct mv *best_mv";
+specialize qw/vp9_full_search_sad sse3 sse4_1/;
+$vp9_full_search_sad_sse3=vp9_full_search_sadx3;
+$vp9_full_search_sad_sse4_1=vp9_full_search_sadx8;
+
+add_proto qw/int vp9_refining_search_sad/, "const struct macroblock *x, struct mv *ref_mv, int sad_per_bit, int distance, const struct vp9_variance_vtable *fn_ptr, DEC_MVCOSTS, const struct mv *center_mv";
+specialize qw/vp9_refining_search_sad sse3/;
+$vp9_refining_search_sad_sse3=vp9_refining_search_sadx4;
+
+add_proto qw/int vp9_diamond_search_sad/, "const struct macroblock *x, struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, DEC_MVCOSTS, const struct mv *center_mv";
+specialize qw/vp9_diamond_search_sad sse3/;
+$vp9_diamond_search_sad_sse3=vp9_diamond_search_sadx4;
+
+add_proto qw/int vp9_full_range_search/, "const struct macroblock *x, struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, DEC_MVCOSTS, const struct mv *center_mv";
+specialize qw/vp9_full_range_search/;
+
+add_proto qw/void vp9_temporal_filter_apply/, "uint8_t *frame1, unsigned int stride, uint8_t *frame2, unsigned int block_size, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count";
+specialize qw/vp9_temporal_filter_apply sse2/;
+
+}
+# end encoder functions
+1;
diff --git a/vp9/common/vp9_rtcd_defs.sh b/vp9/common/vp9_rtcd_defs.sh
deleted file mode 100644
index 83ee69b..0000000
--- a/vp9/common/vp9_rtcd_defs.sh
+++ /dev/null
@@ -1,760 +0,0 @@
-vp9_common_forward_decls() {
-cat <<EOF
-/*
- * VP9
- */
-
-#include "vpx/vpx_integer.h"
-#include "vp9/common/vp9_enums.h"
-
-struct macroblockd;
-
-/* Encoder forward decls */
-struct macroblock;
-struct vp9_variance_vtable;
-
-#define DEC_MVCOSTS int *mvjcost, int *mvcost[2]
-struct mv;
-union int_mv;
-struct yv12_buffer_config;
-EOF
-}
-forward_decls vp9_common_forward_decls
-
-# x86inc.asm doesn't work if pic is enabled on 32 bit platforms so no assembly.
-[ "$CONFIG_USE_X86INC" = "yes" ] && mmx_x86inc=mmx && sse_x86inc=sse &&
-  sse2_x86inc=sse2 && ssse3_x86inc=ssse3 && avx_x86inc=avx && avx2_x86inc=avx2
-
-# this variable is for functions that are 64 bit only.
-[ $arch = "x86_64" ] && mmx_x86_64=mmx && sse2_x86_64=sse2 && 
-  ssse3_x86_64=ssse3 && avx_x86_64=avx && avx2_x86_64=avx2
-
-#
-# RECON
-#
-prototype void vp9_d207_predictor_4x4 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"
-specialize vp9_d207_predictor_4x4 $ssse3_x86inc
-
-prototype void vp9_d45_predictor_4x4 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"
-specialize vp9_d45_predictor_4x4 $ssse3_x86inc
-
-prototype void vp9_d63_predictor_4x4 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"
-specialize vp9_d63_predictor_4x4 $ssse3_x86inc
-
-prototype void vp9_h_predictor_4x4 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"
-specialize vp9_h_predictor_4x4 $ssse3_x86inc neon dspr2
-
-prototype void vp9_d117_predictor_4x4 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"
-specialize vp9_d117_predictor_4x4
-
-prototype void vp9_d135_predictor_4x4 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"
-specialize vp9_d135_predictor_4x4
-
-prototype void vp9_d153_predictor_4x4 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"
-specialize vp9_d153_predictor_4x4 $ssse3_x86inc
-
-prototype void vp9_v_predictor_4x4 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"
-specialize vp9_v_predictor_4x4 $sse_x86inc neon
-
-prototype void vp9_tm_predictor_4x4 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"
-specialize vp9_tm_predictor_4x4 $sse_x86inc neon dspr2
-
-prototype void vp9_dc_predictor_4x4 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"
-specialize vp9_dc_predictor_4x4 $sse_x86inc dspr2
-
-prototype void vp9_dc_top_predictor_4x4 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"
-specialize vp9_dc_top_predictor_4x4
-
-prototype void vp9_dc_left_predictor_4x4 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"
-specialize vp9_dc_left_predictor_4x4
-
-prototype void vp9_dc_128_predictor_4x4 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"
-specialize vp9_dc_128_predictor_4x4
-
-prototype void vp9_d207_predictor_8x8 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"
-specialize vp9_d207_predictor_8x8 $ssse3_x86inc
-
-prototype void vp9_d45_predictor_8x8 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"
-specialize vp9_d45_predictor_8x8 $ssse3_x86inc
-
-prototype void vp9_d63_predictor_8x8 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"
-specialize vp9_d63_predictor_8x8 $ssse3_x86inc
-
-prototype void vp9_h_predictor_8x8 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"
-specialize vp9_h_predictor_8x8 $ssse3_x86inc neon dspr2
-
-prototype void vp9_d117_predictor_8x8 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"
-specialize vp9_d117_predictor_8x8
-
-prototype void vp9_d135_predictor_8x8 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"
-specialize vp9_d135_predictor_8x8
-
-prototype void vp9_d153_predictor_8x8 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"
-specialize vp9_d153_predictor_8x8 $ssse3_x86inc
-
-prototype void vp9_v_predictor_8x8 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"
-specialize vp9_v_predictor_8x8 $sse_x86inc neon
-
-prototype void vp9_tm_predictor_8x8 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"
-specialize vp9_tm_predictor_8x8 $sse2_x86inc neon dspr2
-
-prototype void vp9_dc_predictor_8x8 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"
-specialize vp9_dc_predictor_8x8 $sse_x86inc dspr2
-
-prototype void vp9_dc_top_predictor_8x8 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"
-specialize vp9_dc_top_predictor_8x8
-
-prototype void vp9_dc_left_predictor_8x8 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"
-specialize vp9_dc_left_predictor_8x8
-
-prototype void vp9_dc_128_predictor_8x8 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"
-specialize vp9_dc_128_predictor_8x8
-
-prototype void vp9_d207_predictor_16x16 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"
-specialize vp9_d207_predictor_16x16 $ssse3_x86inc
-
-prototype void vp9_d45_predictor_16x16 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"
-specialize vp9_d45_predictor_16x16 $ssse3_x86inc
-
-prototype void vp9_d63_predictor_16x16 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"
-specialize vp9_d63_predictor_16x16 $ssse3_x86inc
-
-prototype void vp9_h_predictor_16x16 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"
-specialize vp9_h_predictor_16x16 $ssse3_x86inc neon dspr2
-
-prototype void vp9_d117_predictor_16x16 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"
-specialize vp9_d117_predictor_16x16
-
-prototype void vp9_d135_predictor_16x16 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"
-specialize vp9_d135_predictor_16x16
-
-prototype void vp9_d153_predictor_16x16 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"
-specialize vp9_d153_predictor_16x16 $ssse3_x86inc
-
-prototype void vp9_v_predictor_16x16 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"
-specialize vp9_v_predictor_16x16 $sse2_x86inc neon
-
-prototype void vp9_tm_predictor_16x16 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"
-specialize vp9_tm_predictor_16x16 $sse2_x86inc neon
-
-prototype void vp9_dc_predictor_16x16 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"
-specialize vp9_dc_predictor_16x16 $sse2_x86inc dspr2
-
-prototype void vp9_dc_top_predictor_16x16 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"
-specialize vp9_dc_top_predictor_16x16
-
-prototype void vp9_dc_left_predictor_16x16 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"
-specialize vp9_dc_left_predictor_16x16
-
-prototype void vp9_dc_128_predictor_16x16 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"
-specialize vp9_dc_128_predictor_16x16
-
-prototype void vp9_d207_predictor_32x32 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"
-specialize vp9_d207_predictor_32x32 $ssse3_x86inc
-
-prototype void vp9_d45_predictor_32x32 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"
-specialize vp9_d45_predictor_32x32 $ssse3_x86inc
-
-prototype void vp9_d63_predictor_32x32 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"
-specialize vp9_d63_predictor_32x32 $ssse3_x86inc
-
-prototype void vp9_h_predictor_32x32 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"
-specialize vp9_h_predictor_32x32 $ssse3_x86inc neon
-
-prototype void vp9_d117_predictor_32x32 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"
-specialize vp9_d117_predictor_32x32
-
-prototype void vp9_d135_predictor_32x32 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"
-specialize vp9_d135_predictor_32x32
-
-prototype void vp9_d153_predictor_32x32 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"
-specialize vp9_d153_predictor_32x32
-
-prototype void vp9_v_predictor_32x32 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"
-specialize vp9_v_predictor_32x32 $sse2_x86inc neon
-
-prototype void vp9_tm_predictor_32x32 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"
-specialize vp9_tm_predictor_32x32 $sse2_x86_64 neon
-
-prototype void vp9_dc_predictor_32x32 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"
-specialize vp9_dc_predictor_32x32 $sse2_x86inc
-
-prototype void vp9_dc_top_predictor_32x32 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"
-specialize vp9_dc_top_predictor_32x32
-
-prototype void vp9_dc_left_predictor_32x32 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"
-specialize vp9_dc_left_predictor_32x32
-
-prototype void vp9_dc_128_predictor_32x32 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"
-specialize vp9_dc_128_predictor_32x32
-
-#
-# Loopfilter
-#
-prototype void vp9_lpf_vertical_16 "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh"
-specialize vp9_lpf_vertical_16 sse2 neon dspr2
-
-prototype void vp9_lpf_vertical_16_dual "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh"
-specialize vp9_lpf_vertical_16_dual sse2 neon dspr2
-
-prototype void vp9_lpf_vertical_8 "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count"
-specialize vp9_lpf_vertical_8 sse2 neon dspr2
-
-prototype void vp9_lpf_vertical_8_dual "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1"
-specialize vp9_lpf_vertical_8_dual sse2 neon dspr2
-
-prototype void vp9_lpf_vertical_4 "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count"
-specialize vp9_lpf_vertical_4 mmx neon dspr2
-
-prototype void vp9_lpf_vertical_4_dual "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1"
-specialize vp9_lpf_vertical_4_dual sse2 neon dspr2
-
-prototype void vp9_lpf_horizontal_16 "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count"
-specialize vp9_lpf_horizontal_16 sse2 avx2 neon dspr2
-
-prototype void vp9_lpf_horizontal_8 "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count"
-specialize vp9_lpf_horizontal_8 sse2 neon dspr2
-
-prototype void vp9_lpf_horizontal_8_dual "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1"
-specialize vp9_lpf_horizontal_8_dual sse2 neon dspr2
-
-prototype void vp9_lpf_horizontal_4 "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count"
-specialize vp9_lpf_horizontal_4 mmx neon dspr2
-
-prototype void vp9_lpf_horizontal_4_dual "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1"
-specialize vp9_lpf_horizontal_4_dual sse2 neon dspr2
-
-#
-# post proc
-#
-if [ "$CONFIG_VP9_POSTPROC" = "yes" ]; then
-prototype void vp9_mbpost_proc_down "uint8_t *dst, int pitch, int rows, int cols, int flimit"
-specialize vp9_mbpost_proc_down mmx sse2
-vp9_mbpost_proc_down_sse2=vp9_mbpost_proc_down_xmm
-
-prototype void vp9_mbpost_proc_across_ip "uint8_t *src, int pitch, int rows, int cols, int flimit"
-specialize vp9_mbpost_proc_across_ip sse2
-vp9_mbpost_proc_across_ip_sse2=vp9_mbpost_proc_across_ip_xmm
-
-prototype void vp9_post_proc_down_and_across "const uint8_t *src_ptr, uint8_t *dst_ptr, int src_pixels_per_line, int dst_pixels_per_line, int rows, int cols, int flimit"
-specialize vp9_post_proc_down_and_across mmx sse2
-vp9_post_proc_down_and_across_sse2=vp9_post_proc_down_and_across_xmm
-
-prototype void vp9_plane_add_noise "uint8_t *Start, char *noise, char blackclamp[16], char whiteclamp[16], char bothclamp[16], unsigned int Width, unsigned int Height, int Pitch"
-specialize vp9_plane_add_noise mmx sse2
-vp9_plane_add_noise_sse2=vp9_plane_add_noise_wmt
-fi
-
-prototype void vp9_blend_mb_inner "uint8_t *y, uint8_t *u, uint8_t *v, int y1, int u1, int v1, int alpha, int stride"
-specialize vp9_blend_mb_inner
-
-prototype void vp9_blend_mb_outer "uint8_t *y, uint8_t *u, uint8_t *v, int y1, int u1, int v1, int alpha, int stride"
-specialize vp9_blend_mb_outer
-
-prototype void vp9_blend_b "uint8_t *y, uint8_t *u, uint8_t *v, int y1, int u1, int v1, int alpha, int stride"
-specialize vp9_blend_b
-
-#
-# Sub Pixel Filters
-#
-prototype void vp9_convolve_copy "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
-specialize vp9_convolve_copy $sse2_x86inc neon dspr2
-
-prototype void vp9_convolve_avg "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
-specialize vp9_convolve_avg $sse2_x86inc neon dspr2
-
-prototype void vp9_convolve8 "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
-specialize vp9_convolve8 sse2 ssse3 avx2 neon dspr2
-
-prototype void vp9_convolve8_horiz "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
-specialize vp9_convolve8_horiz sse2 ssse3 avx2 neon dspr2
-
-prototype void vp9_convolve8_vert "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
-specialize vp9_convolve8_vert sse2 ssse3 avx2 neon dspr2
-
-prototype void vp9_convolve8_avg "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
-specialize vp9_convolve8_avg sse2 ssse3 neon dspr2
-
-prototype void vp9_convolve8_avg_horiz "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
-specialize vp9_convolve8_avg_horiz sse2 ssse3 neon dspr2
-
-prototype void vp9_convolve8_avg_vert "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
-specialize vp9_convolve8_avg_vert sse2 ssse3 neon dspr2
-
-#
-# dct
-#
-prototype void vp9_idct4x4_1_add "const int16_t *input, uint8_t *dest, int dest_stride"
-specialize vp9_idct4x4_1_add sse2 neon dspr2
-
-prototype void vp9_idct4x4_16_add "const int16_t *input, uint8_t *dest, int dest_stride"
-specialize vp9_idct4x4_16_add sse2 neon dspr2
-
-prototype void vp9_idct8x8_1_add "const int16_t *input, uint8_t *dest, int dest_stride"
-specialize vp9_idct8x8_1_add sse2 neon dspr2
-
-prototype void vp9_idct8x8_64_add "const int16_t *input, uint8_t *dest, int dest_stride"
-specialize vp9_idct8x8_64_add sse2 neon dspr2
-
-prototype void vp9_idct8x8_10_add "const int16_t *input, uint8_t *dest, int dest_stride"
-specialize vp9_idct8x8_10_add sse2 neon dspr2
-
-prototype void vp9_idct16x16_1_add "const int16_t *input, uint8_t *dest, int dest_stride"
-specialize vp9_idct16x16_1_add sse2 neon dspr2
-
-prototype void vp9_idct16x16_256_add "const int16_t *input, uint8_t *dest, int dest_stride"
-specialize vp9_idct16x16_256_add sse2 neon dspr2
-
-prototype void vp9_idct16x16_10_add "const int16_t *input, uint8_t *dest, int dest_stride"
-specialize vp9_idct16x16_10_add sse2 neon dspr2
-
-prototype void vp9_idct32x32_1024_add "const int16_t *input, uint8_t *dest, int dest_stride"
-specialize vp9_idct32x32_1024_add sse2 neon dspr2
-
-prototype void vp9_idct32x32_34_add "const int16_t *input, uint8_t *dest, int dest_stride"
-specialize vp9_idct32x32_34_add sse2 neon dspr2
-vp9_idct32x32_34_add_neon=vp9_idct32x32_1024_add_neon
-
-prototype void vp9_idct32x32_1_add "const int16_t *input, uint8_t *dest, int dest_stride"
-specialize vp9_idct32x32_1_add sse2 neon dspr2
-
-prototype void vp9_iht4x4_16_add "const int16_t *input, uint8_t *dest, int dest_stride, int tx_type"
-specialize vp9_iht4x4_16_add sse2 neon dspr2
-
-prototype void vp9_iht8x8_64_add "const int16_t *input, uint8_t *dest, int dest_stride, int tx_type"
-specialize vp9_iht8x8_64_add sse2 neon dspr2
-
-prototype void vp9_iht16x16_256_add "const int16_t *input, uint8_t *output, int pitch, int tx_type"
-specialize vp9_iht16x16_256_add sse2 dspr2
-
-# dct and add
-
-prototype void vp9_iwht4x4_1_add "const int16_t *input, uint8_t *dest, int dest_stride"
-specialize vp9_iwht4x4_1_add
-
-prototype void vp9_iwht4x4_16_add "const int16_t *input, uint8_t *dest, int dest_stride"
-specialize vp9_iwht4x4_16_add
-
-#
-# Encoder functions below this point.
-#
-if [ "$CONFIG_VP9_ENCODER" = "yes" ]; then
-
-
-# variance
-prototype unsigned int vp9_variance32x16 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
-specialize vp9_variance32x16 $sse2_x86inc $avx2_x86inc
-
-prototype unsigned int vp9_variance16x32 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
-specialize vp9_variance16x32 $sse2_x86inc
-
-prototype unsigned int vp9_variance64x32 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
-specialize vp9_variance64x32 $sse2_x86inc $avx2_x86inc
-
-prototype unsigned int vp9_variance32x64 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
-specialize vp9_variance32x64 $sse2_x86inc
-
-prototype unsigned int vp9_variance32x32 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
-specialize vp9_variance32x32 $sse2_x86inc $avx2_x86inc
-
-prototype unsigned int vp9_variance64x64 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
-specialize vp9_variance64x64 $sse2_x86inc $avx2_x86inc
-
-prototype unsigned int vp9_variance16x16 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
-specialize vp9_variance16x16 mmx $sse2_x86inc $avx2_x86inc
-
-prototype unsigned int vp9_variance16x8 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
-specialize vp9_variance16x8 mmx $sse2_x86inc
-
-prototype unsigned int vp9_variance8x16 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
-specialize vp9_variance8x16 mmx $sse2_x86inc
-
-prototype unsigned int vp9_variance8x8 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
-specialize vp9_variance8x8 mmx $sse2_x86inc
-
-prototype void vp9_get_sse_sum_8x8 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"
-specialize vp9_get_sse_sum_8x8 sse2
-vp9_get_sse_sum_8x8_sse2=vp9_get8x8var_sse2
-
-prototype unsigned int vp9_variance8x4 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
-specialize vp9_variance8x4 $sse2_x86inc
-
-prototype unsigned int vp9_variance4x8 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
-specialize vp9_variance4x8 $sse2_x86inc
-
-prototype unsigned int vp9_variance4x4 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
-specialize vp9_variance4x4 mmx $sse2_x86inc
-
-prototype unsigned int vp9_sub_pixel_variance64x64 "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
-specialize vp9_sub_pixel_variance64x64 $sse2_x86inc $ssse3_x86inc avx2
-
-prototype unsigned int vp9_sub_pixel_avg_variance64x64 "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
-specialize vp9_sub_pixel_avg_variance64x64 $sse2_x86inc $ssse3_x86inc
-
-prototype unsigned int vp9_sub_pixel_variance32x64 "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
-specialize vp9_sub_pixel_variance32x64 $sse2_x86inc $ssse3_x86inc
-
-prototype unsigned int vp9_sub_pixel_avg_variance32x64 "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
-specialize vp9_sub_pixel_avg_variance32x64 $sse2_x86inc $ssse3_x86inc
-
-prototype unsigned int vp9_sub_pixel_variance64x32 "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
-specialize vp9_sub_pixel_variance64x32 $sse2_x86inc $ssse3_x86inc
-
-prototype unsigned int vp9_sub_pixel_avg_variance64x32 "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
-specialize vp9_sub_pixel_avg_variance64x32 $sse2_x86inc $ssse3_x86inc
-
-prototype unsigned int vp9_sub_pixel_variance32x16 "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
-specialize vp9_sub_pixel_variance32x16 $sse2_x86inc $ssse3_x86inc
-
-prototype unsigned int vp9_sub_pixel_avg_variance32x16 "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
-specialize vp9_sub_pixel_avg_variance32x16 $sse2_x86inc $ssse3_x86inc
-
-prototype unsigned int vp9_sub_pixel_variance16x32 "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
-specialize vp9_sub_pixel_variance16x32 $sse2_x86inc $ssse3_x86inc
-
-prototype unsigned int vp9_sub_pixel_avg_variance16x32 "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
-specialize vp9_sub_pixel_avg_variance16x32 $sse2_x86inc $ssse3_x86inc
-
-prototype unsigned int vp9_sub_pixel_variance32x32 "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
-specialize vp9_sub_pixel_variance32x32 $sse2_x86inc $ssse3_x86inc avx2
-
-prototype unsigned int vp9_sub_pixel_avg_variance32x32 "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
-specialize vp9_sub_pixel_avg_variance32x32 $sse2_x86inc $ssse3_x86inc
-
-prototype unsigned int vp9_sub_pixel_variance16x16 "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
-specialize vp9_sub_pixel_variance16x16 $sse2_x86inc $ssse3_x86inc
-
-prototype unsigned int vp9_sub_pixel_avg_variance16x16 "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
-specialize vp9_sub_pixel_avg_variance16x16 $sse2_x86inc $ssse3_x86inc
-
-prototype unsigned int vp9_sub_pixel_variance8x16 "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
-specialize vp9_sub_pixel_variance8x16 $sse2_x86inc $ssse3_x86inc
-
-prototype unsigned int vp9_sub_pixel_avg_variance8x16 "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
-specialize vp9_sub_pixel_avg_variance8x16 $sse2_x86inc $ssse3_x86inc
-
-prototype unsigned int vp9_sub_pixel_variance16x8 "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
-specialize vp9_sub_pixel_variance16x8 $sse2_x86inc $ssse3_x86inc
-
-prototype unsigned int vp9_sub_pixel_avg_variance16x8 "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
-specialize vp9_sub_pixel_avg_variance16x8 $sse2_x86inc $ssse3_x86inc
-
-prototype unsigned int vp9_sub_pixel_variance8x8 "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
-specialize vp9_sub_pixel_variance8x8 $sse2_x86inc $ssse3_x86inc
-
-prototype unsigned int vp9_sub_pixel_avg_variance8x8 "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
-specialize vp9_sub_pixel_avg_variance8x8 $sse2_x86inc $ssse3_x86inc
-
-# TODO(jingning): need to convert 8x4/4x8 functions into mmx/sse form
-prototype unsigned int vp9_sub_pixel_variance8x4 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
-specialize vp9_sub_pixel_variance8x4 $sse2_x86inc $ssse3_x86inc
-
-prototype unsigned int vp9_sub_pixel_avg_variance8x4 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
-specialize vp9_sub_pixel_avg_variance8x4 $sse2_x86inc $ssse3_x86inc
-
-prototype unsigned int vp9_sub_pixel_variance4x8 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
-specialize vp9_sub_pixel_variance4x8 $sse_x86inc $ssse3_x86inc
-
-prototype unsigned int vp9_sub_pixel_avg_variance4x8 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
-specialize vp9_sub_pixel_avg_variance4x8 $sse_x86inc $ssse3_x86inc
-
-prototype unsigned int vp9_sub_pixel_variance4x4 "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
-specialize vp9_sub_pixel_variance4x4 $sse_x86inc $ssse3_x86inc
-#vp9_sub_pixel_variance4x4_sse2=vp9_sub_pixel_variance4x4_wmt
-
-prototype unsigned int vp9_sub_pixel_avg_variance4x4 "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
-specialize vp9_sub_pixel_avg_variance4x4 $sse_x86inc $ssse3_x86inc
-
-prototype unsigned int vp9_sad64x64 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, unsigned int max_sad"
-specialize vp9_sad64x64 $sse2_x86inc
-
-prototype unsigned int vp9_sad32x64 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad"
-specialize vp9_sad32x64 $sse2_x86inc
-
-prototype unsigned int vp9_sad64x32 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad"
-specialize vp9_sad64x32 $sse2_x86inc
-
-prototype unsigned int vp9_sad32x16 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad"
-specialize vp9_sad32x16 $sse2_x86inc
-
-prototype unsigned int vp9_sad16x32 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad"
-specialize vp9_sad16x32 $sse2_x86inc
-
-prototype unsigned int vp9_sad32x32 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, unsigned int max_sad"
-specialize vp9_sad32x32 $sse2_x86inc
-
-prototype unsigned int vp9_sad16x16 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, unsigned int max_sad"
-specialize vp9_sad16x16 mmx $sse2_x86inc
-
-prototype unsigned int vp9_sad16x8 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, unsigned int max_sad"
-specialize vp9_sad16x8 mmx $sse2_x86inc
-
-prototype unsigned int vp9_sad8x16 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, unsigned int max_sad"
-specialize vp9_sad8x16 mmx $sse2_x86inc
-
-prototype unsigned int vp9_sad8x8 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, unsigned int max_sad"
-specialize vp9_sad8x8 mmx $sse2_x86inc
-
-prototype unsigned int vp9_sad8x4 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad"
-specialize vp9_sad8x4 $sse2_x86inc
-
-prototype unsigned int vp9_sad4x8 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad"
-specialize vp9_sad4x8 $sse_x86inc
-
-prototype unsigned int vp9_sad4x4 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, unsigned int max_sad"
-specialize vp9_sad4x4 mmx $sse_x86inc
-
-prototype unsigned int vp9_sad64x64_avg "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, const uint8_t *second_pred, unsigned int max_sad"
-specialize vp9_sad64x64_avg $sse2_x86inc
-
-prototype unsigned int vp9_sad32x64_avg "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad"
-specialize vp9_sad32x64_avg $sse2_x86inc
-
-prototype unsigned int vp9_sad64x32_avg "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad"
-specialize vp9_sad64x32_avg $sse2_x86inc
-
-prototype unsigned int vp9_sad32x16_avg "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad"
-specialize vp9_sad32x16_avg $sse2_x86inc
-
-prototype unsigned int vp9_sad16x32_avg "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad"
-specialize vp9_sad16x32_avg $sse2_x86inc
-
-prototype unsigned int vp9_sad32x32_avg "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, const uint8_t *second_pred, unsigned int max_sad"
-specialize vp9_sad32x32_avg $sse2_x86inc
-
-prototype unsigned int vp9_sad16x16_avg "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, const uint8_t *second_pred, unsigned int max_sad"
-specialize vp9_sad16x16_avg $sse2_x86inc
-
-prototype unsigned int vp9_sad16x8_avg "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, const uint8_t *second_pred, unsigned int max_sad"
-specialize vp9_sad16x8_avg $sse2_x86inc
-
-prototype unsigned int vp9_sad8x16_avg "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, const uint8_t *second_pred, unsigned int max_sad"
-specialize vp9_sad8x16_avg $sse2_x86inc
-
-prototype unsigned int vp9_sad8x8_avg "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, const uint8_t *second_pred, unsigned int max_sad"
-specialize vp9_sad8x8_avg $sse2_x86inc
-
-prototype unsigned int vp9_sad8x4_avg "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad"
-specialize vp9_sad8x4_avg $sse2_x86inc
-
-prototype unsigned int vp9_sad4x8_avg "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, unsigned int max_sad"
-specialize vp9_sad4x8_avg $sse_x86inc
-
-prototype unsigned int vp9_sad4x4_avg "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, const uint8_t *second_pred, unsigned int max_sad"
-specialize vp9_sad4x4_avg $sse_x86inc
-
-prototype unsigned int vp9_variance_halfpixvar16x16_h "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
-specialize vp9_variance_halfpixvar16x16_h $sse2_x86inc
-
-prototype unsigned int vp9_variance_halfpixvar16x16_v "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
-specialize vp9_variance_halfpixvar16x16_v $sse2_x86inc
-
-prototype unsigned int vp9_variance_halfpixvar16x16_hv "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
-specialize vp9_variance_halfpixvar16x16_hv $sse2_x86inc
-
-prototype unsigned int vp9_variance_halfpixvar64x64_h "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
-specialize vp9_variance_halfpixvar64x64_h
-
-prototype unsigned int vp9_variance_halfpixvar64x64_v "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
-specialize vp9_variance_halfpixvar64x64_v
-
-prototype unsigned int vp9_variance_halfpixvar64x64_hv "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
-specialize vp9_variance_halfpixvar64x64_hv
-
-prototype unsigned int vp9_variance_halfpixvar32x32_h "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
-specialize vp9_variance_halfpixvar32x32_h
-
-prototype unsigned int vp9_variance_halfpixvar32x32_v "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
-specialize vp9_variance_halfpixvar32x32_v
-
-prototype unsigned int vp9_variance_halfpixvar32x32_hv "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
-specialize vp9_variance_halfpixvar32x32_hv
-
-prototype void vp9_sad64x64x3 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, unsigned int *sad_array"
-specialize vp9_sad64x64x3
-
-prototype void vp9_sad32x32x3 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, unsigned int *sad_array"
-specialize vp9_sad32x32x3
-
-prototype void vp9_sad16x16x3 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, unsigned int *sad_array"
-specialize vp9_sad16x16x3 sse3 ssse3
-
-prototype void vp9_sad16x8x3 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, unsigned int *sad_array"
-specialize vp9_sad16x8x3 sse3 ssse3
-
-prototype void vp9_sad8x16x3 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, unsigned int *sad_array"
-specialize vp9_sad8x16x3 sse3
-
-prototype void vp9_sad8x8x3 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, unsigned int *sad_array"
-specialize vp9_sad8x8x3 sse3
-
-prototype void vp9_sad4x4x3 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, unsigned int *sad_array"
-specialize vp9_sad4x4x3 sse3
-
-prototype void vp9_sad64x64x8 "const uint8_t *src_ptr, int  src_stride, const uint8_t *ref_ptr, int  ref_stride, uint32_t *sad_array"
-specialize vp9_sad64x64x8
-
-prototype void vp9_sad32x32x8 "const uint8_t *src_ptr, int  src_stride, const uint8_t *ref_ptr, int  ref_stride, uint32_t *sad_array"
-specialize vp9_sad32x32x8
-
-prototype void vp9_sad16x16x8 "const uint8_t *src_ptr, int  src_stride, const uint8_t *ref_ptr, int  ref_stride, uint32_t *sad_array"
-specialize vp9_sad16x16x8 sse4
-
-prototype void vp9_sad16x8x8 "const uint8_t *src_ptr, int  src_stride, const uint8_t *ref_ptr, int  ref_stride, uint32_t *sad_array"
-specialize vp9_sad16x8x8 sse4
-
-prototype void vp9_sad8x16x8 "const uint8_t *src_ptr, int  src_stride, const uint8_t *ref_ptr, int  ref_stride, uint32_t *sad_array"
-specialize vp9_sad8x16x8 sse4
-
-prototype void vp9_sad8x8x8 "const uint8_t *src_ptr, int  src_stride, const uint8_t *ref_ptr, int  ref_stride, uint32_t *sad_array"
-specialize vp9_sad8x8x8 sse4
-
-prototype void vp9_sad8x4x8 "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array"
-specialize vp9_sad8x4x8
-
-prototype void vp9_sad4x8x8 "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array"
-specialize vp9_sad4x8x8
-
-prototype void vp9_sad4x4x8 "const uint8_t *src_ptr, int  src_stride, const uint8_t *ref_ptr, int  ref_stride, uint32_t *sad_array"
-specialize vp9_sad4x4x8 sse4
-
-prototype void vp9_sad64x64x4d "const uint8_t *src_ptr, int  src_stride, const uint8_t* const ref_ptr[], int  ref_stride, unsigned int *sad_array"
-specialize vp9_sad64x64x4d sse2
-
-prototype void vp9_sad32x64x4d "const uint8_t *src_ptr, int  src_stride, const uint8_t* const ref_ptr[], int  ref_stride, unsigned int *sad_array"
-specialize vp9_sad32x64x4d sse2
-
-prototype void vp9_sad64x32x4d "const uint8_t *src_ptr, int  src_stride, const uint8_t* const ref_ptr[], int  ref_stride, unsigned int *sad_array"
-specialize vp9_sad64x32x4d sse2
-
-prototype void vp9_sad32x16x4d "const uint8_t *src_ptr, int  src_stride, const uint8_t* const ref_ptr[], int  ref_stride, unsigned int *sad_array"
-specialize vp9_sad32x16x4d sse2
-
-prototype void vp9_sad16x32x4d "const uint8_t *src_ptr, int  src_stride, const uint8_t* const ref_ptr[], int  ref_stride, unsigned int *sad_array"
-specialize vp9_sad16x32x4d sse2
-
-prototype void vp9_sad32x32x4d "const uint8_t *src_ptr, int  src_stride, const uint8_t* const ref_ptr[], int  ref_stride, unsigned int *sad_array"
-specialize vp9_sad32x32x4d sse2
-
-prototype void vp9_sad16x16x4d "const uint8_t *src_ptr, int  src_stride, const uint8_t* const ref_ptr[], int  ref_stride, unsigned int *sad_array"
-specialize vp9_sad16x16x4d sse2
-
-prototype void vp9_sad16x8x4d "const uint8_t *src_ptr, int  src_stride, const uint8_t* const ref_ptr[], int  ref_stride, unsigned int *sad_array"
-specialize vp9_sad16x8x4d sse2
-
-prototype void vp9_sad8x16x4d "const uint8_t *src_ptr, int  src_stride, const uint8_t* const ref_ptr[], int  ref_stride, unsigned int *sad_array"
-specialize vp9_sad8x16x4d sse2
-
-prototype void vp9_sad8x8x4d "const uint8_t *src_ptr, int  src_stride, const uint8_t* const ref_ptr[], int  ref_stride, unsigned int *sad_array"
-specialize vp9_sad8x8x4d sse2
-
-# TODO(jingning): need to convert these 4x8/8x4 functions into sse2 form
-prototype void vp9_sad8x4x4d "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array"
-specialize vp9_sad8x4x4d sse2
-
-prototype void vp9_sad4x8x4d "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array"
-specialize vp9_sad4x8x4d sse
-
-prototype void vp9_sad4x4x4d "const uint8_t *src_ptr, int  src_stride, const uint8_t* const ref_ptr[], int  ref_stride, unsigned int *sad_array"
-specialize vp9_sad4x4x4d sse
-
-#prototype unsigned int vp9_sub_pixel_mse16x16 "const uint8_t *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, const uint8_t *dst_ptr, int dst_pixels_per_line, unsigned int *sse"
-#specialize vp9_sub_pixel_mse16x16 sse2 mmx
-
-prototype unsigned int vp9_mse16x16 "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse"
-specialize vp9_mse16x16 mmx $sse2_x86inc $avx2_x86inc
-
-prototype unsigned int vp9_mse8x16 "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse"
-specialize vp9_mse8x16
-
-prototype unsigned int vp9_mse16x8 "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse"
-specialize vp9_mse16x8
-
-prototype unsigned int vp9_mse8x8 "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse"
-specialize vp9_mse8x8
-
-prototype unsigned int vp9_sub_pixel_mse64x64 "const uint8_t *src_ptr, int  source_stride, int  xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
-specialize vp9_sub_pixel_mse64x64
-
-prototype unsigned int vp9_sub_pixel_mse32x32 "const uint8_t *src_ptr, int  source_stride, int  xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
-specialize vp9_sub_pixel_mse32x32
-
-prototype unsigned int vp9_get_mb_ss "const int16_t *"
-specialize vp9_get_mb_ss mmx sse2
-# ENCODEMB INVOKE
-
-prototype int64_t vp9_block_error "const int16_t *coeff, const int16_t *dqcoeff, intptr_t block_size, int64_t *ssz"
-specialize vp9_block_error $sse2_x86inc
-
-prototype void vp9_subtract_block "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride"
-specialize vp9_subtract_block $sse2_x86inc
-
-prototype void vp9_quantize_b "const int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"
-specialize vp9_quantize_b $ssse3_x86_64
-
-prototype void vp9_quantize_b_32x32 "const int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"
-specialize vp9_quantize_b_32x32 $ssse3_x86_64
-
-#
-# Structured Similarity (SSIM)
-#
-if [ "$CONFIG_INTERNAL_STATS" = "yes" ]; then
-    prototype void vp9_ssim_parms_8x8 "uint8_t *s, int sp, uint8_t *r, int rp, unsigned long *sum_s, unsigned long *sum_r, unsigned long *sum_sq_s, unsigned long *sum_sq_r, unsigned long *sum_sxr"
-    specialize vp9_ssim_parms_8x8 $sse2_x86_64
-
-    prototype void vp9_ssim_parms_16x16 "uint8_t *s, int sp, uint8_t *r, int rp, unsigned long *sum_s, unsigned long *sum_r, unsigned long *sum_sq_s, unsigned long *sum_sq_r, unsigned long *sum_sxr"
-    specialize vp9_ssim_parms_16x16 $sse2_x86_64
-fi
-
-# fdct functions
-prototype void vp9_fht4x4 "const int16_t *input, int16_t *output, int stride, int tx_type"
-specialize vp9_fht4x4 sse2 avx2
-
-prototype void vp9_fht8x8 "const int16_t *input, int16_t *output, int stride, int tx_type"
-specialize vp9_fht8x8 sse2 avx2
-
-prototype void vp9_fht16x16 "const int16_t *input, int16_t *output, int stride, int tx_type"
-specialize vp9_fht16x16 sse2 avx2
-
-prototype void vp9_fwht4x4 "const int16_t *input, int16_t *output, int stride"
-specialize vp9_fwht4x4
-
-prototype void vp9_fdct4x4 "const int16_t *input, int16_t *output, int stride"
-specialize vp9_fdct4x4 sse2 avx2
-
-prototype void vp9_fdct8x8 "const int16_t *input, int16_t *output, int stride"
-specialize vp9_fdct8x8 sse2 avx2
-
-prototype void vp9_fdct16x16 "const int16_t *input, int16_t *output, int stride"
-specialize vp9_fdct16x16 sse2 avx2
-
-prototype void vp9_fdct32x32 "const int16_t *input, int16_t *output, int stride"
-specialize vp9_fdct32x32 sse2 avx2
-
-prototype void vp9_fdct32x32_rd "const int16_t *input, int16_t *output, int stride"
-specialize vp9_fdct32x32_rd sse2 avx2
-
-#
-# Motion search
-#
-prototype int vp9_full_search_sad "const struct macroblock *x, const struct mv *ref_mv, int sad_per_bit, int distance, const struct vp9_variance_vtable *fn_ptr, DEC_MVCOSTS, const struct mv *center_mv, struct mv *best_mv"
-specialize vp9_full_search_sad sse3 sse4_1
-vp9_full_search_sad_sse3=vp9_full_search_sadx3
-vp9_full_search_sad_sse4_1=vp9_full_search_sadx8
-
-prototype int vp9_refining_search_sad "const struct macroblock *x, struct mv *ref_mv, int sad_per_bit, int distance, const struct vp9_variance_vtable *fn_ptr, DEC_MVCOSTS, const struct mv *center_mv"
-specialize vp9_refining_search_sad sse3
-vp9_refining_search_sad_sse3=vp9_refining_search_sadx4
-
-prototype int vp9_diamond_search_sad "const struct macroblock *x, struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, DEC_MVCOSTS, const struct mv *center_mv"
-specialize vp9_diamond_search_sad sse3
-vp9_diamond_search_sad_sse3=vp9_diamond_search_sadx4
-
-prototype int vp9_full_range_search "const struct macroblock *x, struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, DEC_MVCOSTS, const struct mv *center_mv"
-specialize vp9_full_range_search
-
-prototype void vp9_temporal_filter_apply "uint8_t *frame1, unsigned int stride, uint8_t *frame2, unsigned int block_size, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count"
-specialize vp9_temporal_filter_apply sse2
-
-fi
-# end encoder functions
diff --git a/vp9/decoder/vp9_decodeframe.c b/vp9/decoder/vp9_decodeframe.c
index 56b993d..54a14f3 100644
--- a/vp9/decoder/vp9_decodeframe.c
+++ b/vp9/decoder/vp9_decodeframe.c
@@ -245,11 +245,11 @@
           vp9_iht4x4_16_add(dqcoeff, dst, stride, tx_type);
         break;
       case TX_8X8:
-        tx_type = get_tx_type_8x8(plane_type, xd);
+        tx_type = get_tx_type(plane_type, xd);
         vp9_iht8x8_add(tx_type, dqcoeff, dst, stride, eob);
         break;
       case TX_16X16:
-        tx_type = get_tx_type_16x16(plane_type, xd);
+        tx_type = get_tx_type(plane_type, xd);
         vp9_iht16x16_add(tx_type, dqcoeff, dst, stride, eob);
         break;
       case TX_32X32:
@@ -1087,7 +1087,7 @@
   }
 }
 
-static void error_handler(void *data, size_t bit_offset) {
+static void error_handler(void *data) {
   VP9_COMMON *const cm = (VP9_COMMON *)data;
   vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME, "Truncated packet");
 }
@@ -1198,8 +1198,7 @@
                                           ref_buf->buf->y_crop_height,
                                           cm->width, cm->height);
         if (vp9_is_scaled(&ref_buf->sf))
-          vp9_extend_frame_borders(ref_buf->buf,
-                                   cm->subsampling_x, cm->subsampling_y);
+          vp9_extend_frame_borders(ref_buf->buf);
       }
     }
   }
diff --git a/vp9/decoder/vp9_onyxd.h b/vp9/decoder/vp9_onyxd.h
index 0fc9d57..203e9fa 100644
--- a/vp9/decoder/vp9_onyxd.h
+++ b/vp9/decoder/vp9_onyxd.h
@@ -19,7 +19,7 @@
 extern "C" {
 #endif
 
-typedef void *VP9D_PTR;
+struct VP9Decompressor;
 
 typedef struct {
   int width;
@@ -39,28 +39,30 @@
 
 void vp9_initialize_dec();
 
-int vp9_receive_compressed_data(VP9D_PTR comp,
+int vp9_receive_compressed_data(struct VP9Decompressor *pbi,
                                 size_t size, const uint8_t **dest,
                                 int64_t time_stamp);
 
-int vp9_get_raw_frame(VP9D_PTR comp, YV12_BUFFER_CONFIG *sd,
+int vp9_get_raw_frame(struct VP9Decompressor *pbi,
+                      YV12_BUFFER_CONFIG *sd,
                       int64_t *time_stamp, int64_t *time_end_stamp,
                       vp9_ppflags_t *flags);
 
-vpx_codec_err_t vp9_copy_reference_dec(VP9D_PTR comp,
+vpx_codec_err_t vp9_copy_reference_dec(struct VP9Decompressor *pbi,
                                        VP9_REFFRAME ref_frame_flag,
                                        YV12_BUFFER_CONFIG *sd);
 
-vpx_codec_err_t vp9_set_reference_dec(VP9D_PTR comp,
+vpx_codec_err_t vp9_set_reference_dec(struct VP9Decompressor *pbi,
                                       VP9_REFFRAME ref_frame_flag,
                                       YV12_BUFFER_CONFIG *sd);
 
-int vp9_get_reference_dec(VP9D_PTR ptr, int index, YV12_BUFFER_CONFIG **fb);
+int vp9_get_reference_dec(struct VP9Decompressor *pbi,
+                          int index, YV12_BUFFER_CONFIG **fb);
 
 
-VP9D_PTR vp9_create_decompressor(VP9D_CONFIG *oxcf);
+struct VP9Decompressor *vp9_create_decompressor(VP9D_CONFIG *oxcf);
 
-void vp9_remove_decompressor(VP9D_PTR comp);
+void vp9_remove_decompressor(struct VP9Decompressor *pbi);
 
 #ifdef __cplusplus
 }  // extern "C"
diff --git a/vp9/decoder/vp9_onyxd_if.c b/vp9/decoder/vp9_onyxd_if.c
index 1d3522e..24248a4 100644
--- a/vp9/decoder/vp9_onyxd_if.c
+++ b/vp9/decoder/vp9_onyxd_if.c
@@ -117,7 +117,7 @@
     pd[i].dqcoeff = pbi->dqcoeff[i];
 }
 
-VP9D_PTR vp9_create_decompressor(VP9D_CONFIG *oxcf) {
+VP9D_COMP *vp9_create_decompressor(VP9D_CONFIG *oxcf) {
   VP9D_COMP *const pbi = vpx_memalign(32, sizeof(VP9D_COMP));
   VP9_COMMON *const cm = pbi ? &pbi->common : NULL;
 
@@ -138,7 +138,7 @@
   cm->error.setjmp = 1;
   vp9_initialize_dec();
 
-  vp9_create_common(cm);
+  vp9_rtcd();
 
   pbi->oxcf = *oxcf;
   pbi->ready_for_new_data = 1;
@@ -161,9 +161,8 @@
   return pbi;
 }
 
-void vp9_remove_decompressor(VP9D_PTR ptr) {
+void vp9_remove_decompressor(VP9D_COMP *pbi) {
   int i;
-  VP9D_COMP *const pbi = (VP9D_COMP *)ptr;
 
   if (!pbi)
     return;
@@ -200,10 +199,9 @@
            a->uv_height == b->uv_height && a->uv_width == b->uv_width;
 }
 
-vpx_codec_err_t vp9_copy_reference_dec(VP9D_PTR ptr,
+vpx_codec_err_t vp9_copy_reference_dec(VP9D_COMP *pbi,
                                        VP9_REFFRAME ref_frame_flag,
                                        YV12_BUFFER_CONFIG *sd) {
-  VP9D_COMP *pbi = (VP9D_COMP *) ptr;
   VP9_COMMON *cm = &pbi->common;
 
   /* TODO(jkoleszar): The decoder doesn't have any real knowledge of what the
@@ -228,9 +226,9 @@
 }
 
 
-vpx_codec_err_t vp9_set_reference_dec(VP9D_PTR ptr, VP9_REFFRAME ref_frame_flag,
+vpx_codec_err_t vp9_set_reference_dec(VP9D_COMP *pbi,
+                                      VP9_REFFRAME ref_frame_flag,
                                       YV12_BUFFER_CONFIG *sd) {
-  VP9D_COMP *pbi = (VP9D_COMP *) ptr;
   VP9_COMMON *cm = &pbi->common;
   RefBuffer *ref_buf = NULL;
 
@@ -273,8 +271,7 @@
 }
 
 
-int vp9_get_reference_dec(VP9D_PTR ptr, int index, YV12_BUFFER_CONFIG **fb) {
-  VP9D_COMP *pbi = (VP9D_COMP *) ptr;
+int vp9_get_reference_dec(VP9D_COMP *pbi, int index, YV12_BUFFER_CONFIG **fb) {
   VP9_COMMON *cm = &pbi->common;
 
   if (index < 0 || index >= REF_FRAMES)
@@ -309,20 +306,20 @@
     cm->frame_refs[ref_index].idx = INT_MAX;
 }
 
-int vp9_receive_compressed_data(VP9D_PTR ptr,
+int vp9_receive_compressed_data(VP9D_COMP *pbi,
                                 size_t size, const uint8_t **psource,
                                 int64_t time_stamp) {
-  VP9D_COMP *pbi = (VP9D_COMP *) ptr;
-  VP9_COMMON *cm = &pbi->common;
+  VP9_COMMON *cm = NULL;
   const uint8_t *source = *psource;
   int retcode = 0;
 
   /*if(pbi->ready_for_new_data == 0)
       return -1;*/
 
-  if (ptr == 0)
+  if (!pbi)
     return -1;
 
+  cm = &pbi->common;
   cm->error.error_code = VPX_CODEC_OK;
 
   pbi->source = source;
@@ -454,11 +451,10 @@
   return retcode;
 }
 
-int vp9_get_raw_frame(VP9D_PTR ptr, YV12_BUFFER_CONFIG *sd,
+int vp9_get_raw_frame(VP9D_COMP *pbi, YV12_BUFFER_CONFIG *sd,
                       int64_t *time_stamp, int64_t *time_end_stamp,
                       vp9_ppflags_t *flags) {
   int ret = -1;
-  VP9D_COMP *pbi = (VP9D_COMP *) ptr;
 
   if (pbi->ready_for_new_data == 1)
     return ret;
diff --git a/vp9/decoder/vp9_read_bit_buffer.h b/vp9/decoder/vp9_read_bit_buffer.h
index 619e39f..8cb4247 100644
--- a/vp9/decoder/vp9_read_bit_buffer.h
+++ b/vp9/decoder/vp9_read_bit_buffer.h
@@ -19,7 +19,7 @@
 extern "C" {
 #endif
 
-typedef void (*vp9_rb_error_handler)(void *data, size_t bit_offset);
+typedef void (*vp9_rb_error_handler)(void *data);
 
 struct vp9_read_bit_buffer {
   const uint8_t *bit_buffer;
@@ -39,7 +39,7 @@
   const size_t p = off / CHAR_BIT;
   const int q = CHAR_BIT - 1 - (int)off % CHAR_BIT;
   if (rb->bit_buffer + p >= rb->bit_buffer_end) {
-    rb->error_handler(rb->error_handler_data, rb->bit_offset);
+    rb->error_handler(rb->error_handler_data);
     return 0;
   } else {
     const int bit = (rb->bit_buffer[p] & (1 << q)) >> q;
diff --git a/vp9/encoder/vp9_bitstream.c b/vp9/encoder/vp9_bitstream.c
index 14600e8..0f1692d 100644
--- a/vp9/encoder/vp9_bitstream.c
+++ b/vp9/encoder/vp9_bitstream.c
@@ -82,7 +82,7 @@
     vp9_cond_prob_diff_update(w, &probs[i], branch_ct[i]);
 }
 
-static void write_selected_tx_size(const VP9_COMP *cpi, MODE_INFO *m,
+static void write_selected_tx_size(const VP9_COMP *cpi,
                                    TX_SIZE tx_size, BLOCK_SIZE bsize,
                                    vp9_writer *w) {
   const TX_SIZE max_tx_size = max_txsize_lookup[bsize];
@@ -267,7 +267,7 @@
   if (bsize >= BLOCK_8X8 && cm->tx_mode == TX_MODE_SELECT &&
       !(ref0 != INTRA_FRAME &&
         (skip || vp9_segfeature_active(seg, segment_id, SEG_LVL_SKIP)))) {
-    write_selected_tx_size(cpi, m, mi->tx_size, bsize, bc);
+    write_selected_tx_size(cpi, mi->tx_size, bsize, bc);
   }
 
   if (ref0 == INTRA_FRAME) {
@@ -369,7 +369,7 @@
   write_skip(cpi, segment_id, m, bc);
 
   if (m->mbmi.sb_type >= BLOCK_8X8 && cm->tx_mode == TX_MODE_SELECT)
-    write_selected_tx_size(cpi, m, m->mbmi.tx_size, m->mbmi.sb_type, bc);
+    write_selected_tx_size(cpi, m->mbmi.tx_size, m->mbmi.sb_type, bc);
 
   if (m->mbmi.sb_type >= BLOCK_8X8) {
     const MB_PREDICTION_MODE A = vp9_above_block_mode(m, above_mi, 0);
@@ -566,7 +566,7 @@
                 if (t == PIVOT_NODE)
                   s = vp9_prob_diff_update_savings_search_model(
                       frame_branch_ct[i][j][k][l][0],
-                      old_frame_coef_probs[i][j][k][l], &newp, upd, i, j);
+                      old_frame_coef_probs[i][j][k][l], &newp, upd);
                 else
                   s = vp9_prob_diff_update_savings_search(
                       frame_branch_ct[i][j][k][l][t], oldp, &newp, upd);
@@ -604,7 +604,7 @@
                 if (t == PIVOT_NODE)
                   s = vp9_prob_diff_update_savings_search_model(
                       frame_branch_ct[i][j][k][l][0],
-                      old_frame_coef_probs[i][j][k][l], &newp, upd, i, j);
+                      old_frame_coef_probs[i][j][k][l], &newp, upd);
                 else
                   s = vp9_prob_diff_update_savings_search(
                       frame_branch_ct[i][j][k][l][t],
@@ -652,7 +652,7 @@
                   if (t == PIVOT_NODE)
                     s = vp9_prob_diff_update_savings_search_model(
                         frame_branch_ct[i][j][k][l][0],
-                        old_frame_coef_probs[i][j][k][l], &newp, upd, i, j);
+                        old_frame_coef_probs[i][j][k][l], &newp, upd);
                   else
                     s = vp9_prob_diff_update_savings_search(
                         frame_branch_ct[i][j][k][l][t],
diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c
index bba0c53..b8dc72a 100644
--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c
@@ -1088,8 +1088,7 @@
   return 0;
 }
 
-static void update_state_rt(VP9_COMP *cpi, PICK_MODE_CONTEXT *ctx,
-                         BLOCK_SIZE bsize, int output_enabled) {
+static void update_state_rt(VP9_COMP *cpi, const PICK_MODE_CONTEXT *ctx) {
   int i;
   VP9_COMMON *const cm = &cpi->common;
   MACROBLOCK *const x = &cpi->mb;
@@ -1128,8 +1127,8 @@
       }
 
       if (cm->interp_filter == SWITCHABLE) {
-        const int ctx = vp9_get_pred_context_switchable_interp(xd);
-        ++cm->counts.switchable_interp[ctx][mbmi->interp_filter];
+        const int pred_ctx = vp9_get_pred_context_switchable_interp(xd);
+        ++cm->counts.switchable_interp[pred_ctx][mbmi->interp_filter];
       }
     }
   }
@@ -1147,7 +1146,7 @@
       return;
   }
   set_offsets(cpi, tile, mi_row, mi_col, bsize);
-  update_state_rt(cpi, get_block_context(x, bsize), bsize, output_enabled);
+  update_state_rt(cpi, get_block_context(x, bsize));
 
   encode_superblock(cpi, tp, output_enabled, mi_row, mi_col, bsize);
   update_stats(cpi);
@@ -2169,9 +2168,10 @@
   }
 }
 
-static void reset_skip_txfm_size_b(VP9_COMMON *cm, MODE_INFO **mi_8x8,
-                                   int mis, TX_SIZE max_tx_size, int bw, int bh,
-                                   int mi_row, int mi_col, BLOCK_SIZE bsize) {
+static void reset_skip_txfm_size_b(const VP9_COMMON *cm, int mis,
+                                   TX_SIZE max_tx_size, int bw, int bh,
+                                   int mi_row, int mi_col,
+                                   MODE_INFO **mi_8x8) {
   if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) {
     return;
   } else {
@@ -2201,19 +2201,18 @@
   bh = num_8x8_blocks_high_lookup[mi_8x8[0]->mbmi.sb_type];
 
   if (bw == bs && bh == bs) {
-    reset_skip_txfm_size_b(cm, mi_8x8, mis, max_tx_size, bs, bs, mi_row,
-                           mi_col, bsize);
+    reset_skip_txfm_size_b(cm, mis, max_tx_size, bs, bs, mi_row, mi_col,
+                           mi_8x8);
   } else if (bw == bs && bh < bs) {
-    reset_skip_txfm_size_b(cm, mi_8x8, mis, max_tx_size, bs, hbs, mi_row,
-                           mi_col, bsize);
-    reset_skip_txfm_size_b(cm, mi_8x8 + hbs * mis, mis, max_tx_size, bs, hbs,
-                           mi_row + hbs, mi_col, bsize);
+    reset_skip_txfm_size_b(cm, mis, max_tx_size, bs, hbs, mi_row, mi_col,
+                           mi_8x8);
+    reset_skip_txfm_size_b(cm, mis, max_tx_size, bs, hbs, mi_row + hbs,
+                           mi_col, mi_8x8 + hbs * mis);
   } else if (bw < bs && bh == bs) {
-    reset_skip_txfm_size_b(cm, mi_8x8, mis, max_tx_size, hbs, bs, mi_row,
-                           mi_col, bsize);
-    reset_skip_txfm_size_b(cm, mi_8x8 + hbs, mis, max_tx_size, hbs, bs, mi_row,
-                           mi_col + hbs, bsize);
-
+    reset_skip_txfm_size_b(cm, mis, max_tx_size, hbs, bs, mi_row, mi_col,
+                           mi_8x8);
+    reset_skip_txfm_size_b(cm, mis, max_tx_size, hbs, bs, mi_row,
+                           mi_col + hbs, mi_8x8 + hbs);
   } else {
     const BLOCK_SIZE subsize = subsize_lookup[PARTITION_SPLIT][bsize];
     int n;
@@ -2296,7 +2295,7 @@
 } motion_vector_context;
 
 static void set_mode_info(MB_MODE_INFO *mbmi, BLOCK_SIZE bsize,
-                          MB_PREDICTION_MODE mode, int mi_row, int mi_col) {
+                          MB_PREDICTION_MODE mode) {
   mbmi->interp_filter = EIGHTTAP;
   mbmi->mode = mode;
   mbmi->mv[0].as_int = 0;
@@ -2356,9 +2355,10 @@
       set_offsets(cpi, tile, row, col, bs);
 
       if (cm->frame_type != KEY_FRAME)
-        vp9_pick_inter_mode(cpi, x, tile, row, col, &brate, &bdist, bs);
+        vp9_pick_inter_mode(cpi, x, tile, row, col,
+                            &brate, &bdist, bs);
       else
-        set_mode_info(&xd->mi_8x8[0]->mbmi, bs, mode, row, col);
+        set_mode_info(&xd->mi_8x8[0]->mbmi, bs, mode);
 
       *rate += brate;
       *dist += bdist;
diff --git a/vp9/encoder/vp9_encodemb.c b/vp9/encoder/vp9_encodemb.c
index 513730e..19dee0e 100644
--- a/vp9/encoder/vp9_encodemb.c
+++ b/vp9/encoder/vp9_encodemb.c
@@ -526,7 +526,7 @@
         vp9_idct32x32_add(dqcoeff, dst, dst_stride, *eob);
       break;
     case TX_16X16:
-      tx_type = get_tx_type_16x16(pd->plane_type, xd);
+      tx_type = get_tx_type(pd->plane_type, xd);
       scan_order = &vp9_scan_orders[TX_16X16][tx_type];
       mode = plane == 0 ? mbmi->mode : mbmi->uv_mode;
       vp9_predict_intra_block(xd, block >> 4, bwl, TX_16X16, mode,
@@ -546,7 +546,7 @@
         vp9_iht16x16_add(tx_type, dqcoeff, dst, dst_stride, *eob);
       break;
     case TX_8X8:
-      tx_type = get_tx_type_8x8(pd->plane_type, xd);
+      tx_type = get_tx_type(pd->plane_type, xd);
       scan_order = &vp9_scan_orders[TX_8X8][tx_type];
       mode = plane == 0 ? mbmi->mode : mbmi->uv_mode;
       vp9_predict_intra_block(xd, block >> 2, bwl, TX_8X8, mode,
diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c
index 32ed969..b507c6e 100644
--- a/vp9/encoder/vp9_firstpass.c
+++ b/vp9/encoder/vp9_firstpass.c
@@ -8,31 +8,34 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#include <math.h>
 #include <limits.h>
+#include <math.h>
 #include <stdio.h>
+
+#include "./vpx_scale_rtcd.h"
+
+#include "vpx_mem/vpx_mem.h"
+#include "vpx_scale/vpx_scale.h"
+#include "vpx_scale/yv12config.h"
+
+#include "vp9/common/vp9_entropymv.h"
+#include "vp9/common/vp9_quant_common.h"
+#include "vp9/common/vp9_reconinter.h"  // setup_dst_planes()
 #include "vp9/common/vp9_systemdependent.h"
+
 #include "vp9/encoder/vp9_block.h"
 #include "vp9/encoder/vp9_encodeframe.h"
 #include "vp9/encoder/vp9_encodemb.h"
+#include "vp9/encoder/vp9_encodemv.h"
 #include "vp9/encoder/vp9_extend.h"
 #include "vp9/encoder/vp9_firstpass.h"
 #include "vp9/encoder/vp9_mcomp.h"
 #include "vp9/encoder/vp9_onyx_int.h"
-#include "vp9/encoder/vp9_variance.h"
-#include "vpx_scale/vpx_scale.h"
-#include "vpx_mem/vpx_mem.h"
-#include "vpx_scale/yv12config.h"
 #include "vp9/encoder/vp9_quantize.h"
-#include "vp9/encoder/vp9_rdopt.h"
 #include "vp9/encoder/vp9_ratectrl.h"
-#include "vp9/common/vp9_quant_common.h"
-#include "vp9/common/vp9_entropymv.h"
-#include "vp9/encoder/vp9_encodemv.h"
+#include "vp9/encoder/vp9_rdopt.h"
 #include "vp9/encoder/vp9_vaq.h"
-#include "./vpx_scale_rtcd.h"
-// TODO(jkoleszar): for setup_dst_planes
-#include "vp9/common/vp9_reconinter.h"
+#include "vp9/encoder/vp9_variance.h"
 
 #define OUTPUT_FPF 0
 
@@ -133,9 +136,8 @@
   return 1;
 }
 
-static void output_stats(const VP9_COMP *cpi,
-                         struct vpx_codec_pkt_list *pktlist,
-                         FIRSTPASS_STATS *stats) {
+static void output_stats(FIRSTPASS_STATS *stats,
+                         struct vpx_codec_pkt_list *pktlist) {
   struct vpx_codec_cx_pkt pkt;
   pkt.kind = VPX_CODEC_STATS_PKT;
   pkt.data.twopass_stats.buf = stats;
@@ -354,7 +356,7 @@
 }
 
 void vp9_end_first_pass(VP9_COMP *cpi) {
-  output_stats(cpi, cpi->output_pkt_list, &cpi->twopass.total_stats);
+  output_stats(&cpi->twopass.total_stats, cpi->output_pkt_list);
 }
 
 static vp9_variance_fn_t get_block_variance_fn(BLOCK_SIZE bsize) {
@@ -370,7 +372,7 @@
   }
 }
 
-static unsigned int zz_motion_search(const VP9_COMP *cpi, const MACROBLOCK *x) {
+static unsigned int zz_motion_search(const MACROBLOCK *x) {
   const MACROBLOCKD *const xd = &x->e_mbd;
   const uint8_t *const src = x->plane[0].src.buf;
   const int src_stride = x->plane[0].src.stride;
@@ -413,6 +415,8 @@
                                     x->sadperbit16, &num00, &v_fn_ptr,
                                     x->nmvjointcost,
                                     x->mvcost, ref_mv);
+  if (tmp_err < INT_MAX)
+    tmp_err = vp9_get_mvpred_var(x, &tmp_mv, ref_mv, &v_fn_ptr, 1);
   if (tmp_err < INT_MAX - new_mv_mode_penalty)
     tmp_err += new_mv_mode_penalty;
 
@@ -437,6 +441,8 @@
                                         &num00, &v_fn_ptr,
                                         x->nmvjointcost,
                                         x->mvcost, ref_mv);
+      if (tmp_err < INT_MAX)
+        tmp_err = vp9_get_mvpred_var(x, &tmp_mv, ref_mv, &v_fn_ptr, 1);
       if (tmp_err < INT_MAX - new_mv_mode_penalty)
         tmp_err += new_mv_mode_penalty;
 
@@ -592,7 +598,7 @@
         int_mv mv, tmp_mv;
 
         xd->plane[0].pre[0].buf = lst_yv12->y_buffer + recon_yoffset;
-        motion_error = zz_motion_search(cpi, x);
+        motion_error = zz_motion_search(x);
         // Assume 0,0 motion with no mv overhead.
         mv.as_int = tmp_mv.as_int = 0;
 
@@ -628,7 +634,7 @@
           int gf_motion_error;
 
           xd->plane[0].pre[0].buf = gld_yv12->y_buffer + recon_yoffset;
-          gf_motion_error = zz_motion_search(cpi, x);
+          gf_motion_error = zz_motion_search(x);
 
           first_pass_motion_search(cpi, x, &zero_mv, &tmp_mv.as_mv,
                                    &gf_motion_error);
@@ -788,7 +794,7 @@
 
     // Don't want to do output stats with a stack variable!
     twopass->this_frame_stats = fps;
-    output_stats(cpi, cpi->output_pkt_list, &twopass->this_frame_stats);
+    output_stats(&twopass->this_frame_stats, cpi->output_pkt_list);
     accumulate_stats(&twopass->total_stats, &fps);
   }
 
@@ -807,7 +813,7 @@
   // Swap frame pointers so last frame refers to the frame we just compressed.
   swap_yv12(lst_yv12, new_yv12);
 
-  vp9_extend_frame_borders(lst_yv12, cm->subsampling_x, cm->subsampling_y);
+  vp9_extend_frame_borders(lst_yv12);
 
   // Special case for the first frame. Copy into the GF buffer as a second
   // reference.
@@ -839,40 +845,6 @@
   return -(log(prob) / log(2.0));
 }
 
-static int64_t estimate_modemvcost(VP9_COMP *cpi,
-                                     FIRSTPASS_STATS *fpstats) {
-#if 0
-  int mv_cost;
-  int mode_cost;
-
-  double av_pct_inter = fpstats->pcnt_inter / fpstats->count;
-  double av_pct_motion = fpstats->pcnt_motion / fpstats->count;
-  double av_intra = (1.0 - av_pct_inter);
-
-  double zz_cost;
-  double motion_cost;
-  double intra_cost;
-
-  zz_cost = bitcost(av_pct_inter - av_pct_motion);
-  motion_cost = bitcost(av_pct_motion);
-  intra_cost = bitcost(av_intra);
-
-  // Estimate the number of extra bits per mv overhead for mbs. We shift (<< 9)
-  // to match the scaling of number of bits by 512.
-  mv_cost = ((int)(fpstats->new_mv_count / fpstats->count) * 8) << 9;
-
-  // Produce a crude estimate of the overhead cost from modes. We shift (<< 9)
-  // to match the scaling of number of bits by 512.
-  mode_cost =
-    (int)((((av_pct_inter - av_pct_motion) * zz_cost) +
-           (av_pct_motion * motion_cost) +
-           (av_intra * intra_cost)) * cpi->common.MBs) << 9;
-
-  // TODO(paulwilkins): Fix overhead costs for extended Q range.
-#endif
-  return 0;
-}
-
 static double calc_correction_factor(double err_per_mb,
                                      double err_divisor,
                                      double pt_low,
@@ -1007,9 +979,6 @@
   }
 }
 
-void vp9_end_second_pass(VP9_COMP *cpi) {
-}
-
 // This function gives an estimate of how badly we believe the prediction
 // quality is decaying from frame to frame.
 static double get_prediction_decay_rate(const VP9_COMMON *cm,
diff --git a/vp9/encoder/vp9_firstpass.h b/vp9/encoder/vp9_firstpass.h
index 83e337b..03c0e20 100644
--- a/vp9/encoder/vp9_firstpass.h
+++ b/vp9/encoder/vp9_firstpass.h
@@ -89,7 +89,6 @@
 
 void vp9_init_second_pass(struct VP9_COMP *cpi);
 void vp9_rc_get_second_pass_params(struct VP9_COMP *cpi);
-void vp9_end_second_pass(struct VP9_COMP *cpi);
 int vp9_twopass_worst_quality(struct VP9_COMP *cpi, FIRSTPASS_STATS *fpstats,
                               int section_target_bandwitdh);
 
diff --git a/vp9/encoder/vp9_lookahead.c b/vp9/encoder/vp9_lookahead.c
index 4b642e2..a88d5ec 100644
--- a/vp9/encoder/vp9_lookahead.c
+++ b/vp9/encoder/vp9_lookahead.c
@@ -88,8 +88,7 @@
 #define USE_PARTIAL_COPY 0
 
 int vp9_lookahead_push(struct lookahead_ctx *ctx, YV12_BUFFER_CONFIG   *src,
-                       int64_t ts_start, int64_t ts_end, unsigned int flags,
-                       unsigned char *active_map) {
+                       int64_t ts_start, int64_t ts_end, unsigned int flags) {
   struct lookahead_entry *buf;
 #if USE_PARTIAL_COPY
   int row, col, active_end;
diff --git a/vp9/encoder/vp9_lookahead.h b/vp9/encoder/vp9_lookahead.h
index 1c00c46..ff63c0d 100644
--- a/vp9/encoder/vp9_lookahead.h
+++ b/vp9/encoder/vp9_lookahead.h
@@ -63,8 +63,7 @@
  * \param[in] active_map  Map that specifies which macroblock is active
  */
 int vp9_lookahead_push(struct lookahead_ctx *ctx, YV12_BUFFER_CONFIG *src,
-                       int64_t ts_start, int64_t ts_end, unsigned int flags,
-                       unsigned char *active_map);
+                       int64_t ts_start, int64_t ts_end, unsigned int flags);
 
 
 /**\brief Get the next source buffer to encode
diff --git a/vp9/encoder/vp9_mbgraph.c b/vp9/encoder/vp9_mbgraph.c
index 44c1f90..d3e19b4 100644
--- a/vp9/encoder/vp9_mbgraph.c
+++ b/vp9/encoder/vp9_mbgraph.c
@@ -132,7 +132,6 @@
   return err;
 }
 static int find_best_16x16_intra(VP9_COMP *cpi,
-                                 int mb_y_offset,
                                  MB_PREDICTION_MODE *pbest_mode) {
   MACROBLOCK   *const x  = &cpi->mb;
   MACROBLOCKD *const xd = &x->e_mbd;
@@ -173,10 +172,7 @@
   int mb_y_offset,
   YV12_BUFFER_CONFIG *golden_ref,
   int_mv *prev_golden_ref_mv,
-  int gld_y_offset,
   YV12_BUFFER_CONFIG *alt_ref,
-  int_mv *prev_alt_ref_mv,
-  int arf_y_offset,
   int mb_row,
   int mb_col
 ) {
@@ -193,7 +189,7 @@
   xd->plane[0].dst.stride = get_frame_new_buffer(cm)->y_stride;
 
   // do intra 16x16 prediction
-  intra_error = find_best_16x16_intra(cpi, mb_y_offset,
+  intra_error = find_best_16x16_intra(cpi,
                                       &stats->ref[INTRA_FRAME].m.mode);
   if (intra_error <= 0)
     intra_error = 1;
@@ -277,8 +273,7 @@
       MBGRAPH_MB_STATS *mb_stats = &stats->mb_stats[offset + mb_col];
 
       update_mbgraph_mb_stats(cpi, mb_stats, buf, mb_y_in_offset,
-                              golden_ref, &gld_left_mv, gld_y_in_offset,
-                              alt_ref,    &arf_left_mv, arf_y_in_offset,
+                              golden_ref, &gld_left_mv, alt_ref,
                               mb_row, mb_col);
       arf_left_mv.as_int = mb_stats->ref[ALTREF_FRAME].m.mv.as_int;
       gld_left_mv.as_int = mb_stats->ref[GOLDEN_FRAME].m.mv.as_int;
@@ -374,10 +369,10 @@
       cpi->static_mb_pct = 0;
 
     cpi->seg0_cnt = ncnt[0];
-    vp9_enable_segmentation((VP9_PTR)cpi);
+    vp9_enable_segmentation(&cm->seg);
   } else {
     cpi->static_mb_pct = 0;
-    vp9_disable_segmentation((VP9_PTR)cpi);
+    vp9_disable_segmentation(&cm->seg);
   }
 
   // Free localy allocated storage
diff --git a/vp9/encoder/vp9_mcomp.c b/vp9/encoder/vp9_mcomp.c
index 7d6fd3b..d3a9977 100644
--- a/vp9/encoder/vp9_mcomp.c
+++ b/vp9/encoder/vp9_mcomp.c
@@ -98,42 +98,23 @@
 }
 
 void vp9_init_dsmotion_compensation(MACROBLOCK *x, int stride) {
-  int len;
-  int search_site_count = 0;
+  int len, ss_count = 1;
 
-  // Generate offsets for 4 search sites per step.
-  x->ss[search_site_count].mv.col = 0;
-  x->ss[search_site_count].mv.row = 0;
-  x->ss[search_site_count].offset = 0;
-  search_site_count++;
+  x->ss[0].mv.col = x->ss[0].mv.row = 0;
+  x->ss[0].offset = 0;
 
   for (len = MAX_FIRST_STEP; len > 0; len /= 2) {
-    // Compute offsets for search sites.
-    x->ss[search_site_count].mv.col = 0;
-    x->ss[search_site_count].mv.row = -len;
-    x->ss[search_site_count].offset = -len * stride;
-    search_site_count++;
-
-    // Compute offsets for search sites.
-    x->ss[search_site_count].mv.col = 0;
-    x->ss[search_site_count].mv.row = len;
-    x->ss[search_site_count].offset = len * stride;
-    search_site_count++;
-
-    // Compute offsets for search sites.
-    x->ss[search_site_count].mv.col = -len;
-    x->ss[search_site_count].mv.row = 0;
-    x->ss[search_site_count].offset = -len;
-    search_site_count++;
-
-    // Compute offsets for search sites.
-    x->ss[search_site_count].mv.col = len;
-    x->ss[search_site_count].mv.row = 0;
-    x->ss[search_site_count].offset = len;
-    search_site_count++;
+    // Generate offsets for 4 search sites per step.
+    const MV ss_mvs[] = {{-len, 0}, {len, 0}, {0, -len}, {0, len}};
+    int i;
+    for (i = 0; i < 4; ++i) {
+      search_site *const ss = &x->ss[ss_count++];
+      ss->mv = ss_mvs[i];
+      ss->offset = ss->mv.row * stride + ss->mv.col;
+    }
   }
 
-  x->ss_count = search_site_count;
+  x->ss_count = ss_count;
   x->searches_per_step = 4;
 }
 
@@ -728,16 +709,52 @@
                                best_mv->col;
   this_mv.row = best_mv->row * 8;
   this_mv.col = best_mv->col * 8;
-  if (bestsad == INT_MAX)
-    return INT_MAX;
-
-  return vfp->vf(what, what_stride, this_offset, in_what_stride,
-                 (unsigned int *)&bestsad) +
-         use_mvcost ? mv_err_cost(&this_mv, center_mv,
-                                  x->nmvjointcost, x->mvcost, x->errorperbit)
-                    : 0;
+  return bestsad;
 }
 
+int vp9_get_mvpred_var(const MACROBLOCK *x,
+                       MV *best_mv,
+                       const MV *center_mv,
+                       const vp9_variance_fn_ptr_t *vfp,
+                       int use_mvcost) {
+  unsigned int bestsad;
+  MV this_mv;
+  const MACROBLOCKD *const xd = &x->e_mbd;
+  const uint8_t *what = x->plane[0].src.buf;
+  const int what_stride = x->plane[0].src.stride;
+  const int in_what_stride = xd->plane[0].pre[0].stride;
+  const uint8_t *base_offset = xd->plane[0].pre[0].buf;
+  const uint8_t *this_offset = base_offset + (best_mv->row * in_what_stride) +
+      best_mv->col;
+  this_mv.row = best_mv->row * 8;
+  this_mv.col = best_mv->col * 8;
+  return vfp->vf(what, what_stride, this_offset, in_what_stride, &bestsad) +
+      (use_mvcost ?  mv_err_cost(&this_mv, center_mv, x->nmvjointcost,
+                                 x->mvcost, x->errorperbit) : 0);
+}
+
+int vp9_get_mvpred_av_var(const MACROBLOCK *x,
+                          MV *best_mv,
+                          const MV *center_mv,
+                          const uint8_t *second_pred,
+                          const vp9_variance_fn_ptr_t *vfp,
+                          int use_mvcost) {
+  unsigned int bestsad;
+  MV this_mv;
+  const MACROBLOCKD *const xd = &x->e_mbd;
+  const uint8_t *what = x->plane[0].src.buf;
+  const int what_stride = x->plane[0].src.stride;
+  const int in_what_stride = xd->plane[0].pre[0].stride;
+  const uint8_t *base_offset = xd->plane[0].pre[0].buf;
+  const uint8_t *this_offset = base_offset + (best_mv->row * in_what_stride) +
+      best_mv->col;
+  this_mv.row = best_mv->row * 8;
+  this_mv.col = best_mv->col * 8;
+  return vfp->svaf(this_offset, in_what_stride, 0, 0, what, what_stride,
+                   &bestsad, second_pred) +
+      (use_mvcost ?  mv_err_cost(&this_mv, center_mv, x->nmvjointcost,
+                                 x->mvcost, x->errorperbit) : 0);
+}
 
 int vp9_hex_search(const MACROBLOCK *x,
                    MV *ref_mv,
@@ -855,182 +872,18 @@
                             square_num_candidates, square_candidates);
 };
 
-// Number of candidates in first hex search
-#define FIRST_HEX_CANDIDATES 6
-// Index of previous hex search's best match
-#define PRE_BEST_CANDIDATE 6
-// Number of candidates in following hex search
-#define NEXT_HEX_CANDIDATES 3
-// Number of candidates in refining search
-#define REFINE_CANDIDATES 4
-
 int vp9_fast_hex_search(const MACROBLOCK *x,
                         MV *ref_mv,
                         int search_param,
                         int sad_per_bit,
+                        int do_init_search,  // must be zero for fast_hex
                         const vp9_variance_fn_ptr_t *vfp,
                         int use_mvcost,
                         const MV *center_mv,
                         MV *best_mv) {
-  const MACROBLOCKD* const xd = &x->e_mbd;
-  static const MV hex[FIRST_HEX_CANDIDATES] = {
-    { -1, -2}, {1, -2}, {2, 0}, {1, 2}, { -1, 2}, { -2, 0}
-  };
-  static const MV next_chkpts[PRE_BEST_CANDIDATE][NEXT_HEX_CANDIDATES] = {
-    {{ -2, 0}, { -1, -2}, {1, -2}},
-    {{ -1, -2}, {1, -2}, {2, 0}},
-    {{1, -2}, {2, 0}, {1, 2}},
-    {{2, 0}, {1, 2}, { -1, 2}},
-    {{1, 2}, { -1, 2}, { -2, 0}},
-    {{ -1, 2}, { -2, 0}, { -1, -2}}
-  };
-  static const MV neighbors[REFINE_CANDIDATES] = {
-      {0, -1}, { -1, 0}, {1, 0}, {0, 1}
-  };
-  int i, j;
-
-  const uint8_t *what = x->plane[0].src.buf;
-  const int what_stride = x->plane[0].src.stride;
-  const int in_what_stride = xd->plane[0].pre[0].stride;
-  int br, bc;
-  MV this_mv;
-  unsigned int bestsad = 0x7fffffff;
-  unsigned int thissad;
-  const uint8_t *base_offset;
-  const uint8_t *this_offset;
-  int k = -1;
-  int best_site = -1;
-  const int max_hex_search = 512;
-  const int max_dia_search = 32;
-
-  const int *mvjsadcost = x->nmvjointsadcost;
-  int *mvsadcost[2] = {x->nmvsadcost[0], x->nmvsadcost[1]};
-
-  const MV fcenter_mv = {center_mv->row >> 3, center_mv->col >> 3};
-
-  // Adjust ref_mv to make sure it is within MV range
-  clamp_mv(ref_mv, x->mv_col_min, x->mv_col_max, x->mv_row_min, x->mv_row_max);
-  br = ref_mv->row;
-  bc = ref_mv->col;
-
-  // Check the start point
-  base_offset = xd->plane[0].pre[0].buf;
-  this_offset = base_offset + (br * in_what_stride) + bc;
-  this_mv.row = br;
-  this_mv.col = bc;
-  bestsad = vfp->sdf(what, what_stride, this_offset, in_what_stride, 0x7fffffff)
-            + mvsad_err_cost(&this_mv, &fcenter_mv, mvjsadcost, mvsadcost,
-                             sad_per_bit);
-
-  // Initial 6-point hex search
-  if (check_bounds(x, br, bc, 2)) {
-    for (i = 0; i < FIRST_HEX_CANDIDATES; i++) {
-      this_mv.row = br + hex[i].row;
-      this_mv.col = bc + hex[i].col;
-      this_offset = base_offset + (this_mv.row * in_what_stride) + this_mv.col;
-      thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride,
-                         bestsad);
-      CHECK_BETTER
-    }
-  } else {
-    for (i = 0; i < FIRST_HEX_CANDIDATES; i++) {
-      this_mv.row = br + hex[i].row;
-      this_mv.col = bc + hex[i].col;
-      if (!is_mv_in(x, &this_mv))
-        continue;
-      this_offset = base_offset + (this_mv.row * in_what_stride) + this_mv.col;
-      thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride,
-                         bestsad);
-      CHECK_BETTER
-    }
-  }
-
-  // Continue hex search if we find a better match in first round
-  if (best_site != -1) {
-    br += hex[best_site].row;
-    bc += hex[best_site].col;
-    k = best_site;
-
-    // Allow search covering maximum MV range
-    for (j = 1; j < max_hex_search; j++) {
-      best_site = -1;
-
-      if (check_bounds(x, br, bc, 2)) {
-        for (i = 0; i < 3; i++) {
-          this_mv.row = br + next_chkpts[k][i].row;
-          this_mv.col = bc + next_chkpts[k][i].col;
-          this_offset = base_offset + (this_mv.row * in_what_stride) +
-              this_mv.col;
-          thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride,
-                             bestsad);
-          CHECK_BETTER
-        }
-      } else {
-        for (i = 0; i < 3; i++) {
-          this_mv.row = br + next_chkpts[k][i].row;
-          this_mv.col = bc + next_chkpts[k][i].col;
-          if (!is_mv_in(x, &this_mv))
-            continue;
-          this_offset = base_offset + (this_mv.row * in_what_stride) +
-              this_mv.col;
-          thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride,
-                             bestsad);
-          CHECK_BETTER
-        }
-      }
-
-      if (best_site == -1) {
-        break;
-      } else {
-        br += next_chkpts[k][best_site].row;
-        bc += next_chkpts[k][best_site].col;
-        k += 5 + best_site;
-        if (k >= 12) k -= 12;
-        else if (k >= 6) k -= 6;
-      }
-    }
-  }
-
-  // Check 4 1-away neighbors
-  for (j = 0; j < max_dia_search; j++) {
-    best_site = -1;
-
-    if (check_bounds(x, br, bc, 1)) {
-      for (i = 0; i < REFINE_CANDIDATES; i++) {
-        this_mv.row = br + neighbors[i].row;
-        this_mv.col = bc + neighbors[i].col;
-        this_offset = base_offset + (this_mv.row * in_what_stride) +
-            this_mv.col;
-        thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride,
-                           bestsad);
-        CHECK_BETTER
-      }
-    } else {
-      for (i = 0; i < REFINE_CANDIDATES; i++) {
-        this_mv.row = br + neighbors[i].row;
-        this_mv.col = bc + neighbors[i].col;
-        if (!is_mv_in(x, &this_mv))
-          continue;
-        this_offset = base_offset + (this_mv.row * in_what_stride) +
-            this_mv.col;
-        thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride,
-                           bestsad);
-        CHECK_BETTER
-      }
-    }
-
-    if (best_site == -1) {
-      break;
-    } else {
-      br += neighbors[best_site].row;
-      bc += neighbors[best_site].col;
-    }
-  }
-
-  best_mv->row = br;
-  best_mv->col = bc;
-
-  return bestsad;
+  return vp9_hex_search(x, ref_mv, MAX(MAX_MVSEARCH_STEPS - 2, search_param),
+                        sad_per_bit, do_init_search, vfp, use_mvcost,
+                        center_mv, best_mv);
 }
 
 #undef CHECK_BETTER
@@ -1045,8 +898,6 @@
   const int what_stride = x->plane[0].src.stride;
   const uint8_t *in_what;
   const int in_what_stride = xd->plane[0].pre[0].stride;
-  const uint8_t *best_address;
-
   MV this_mv;
 
   unsigned int bestsad = INT_MAX;
@@ -1076,7 +927,6 @@
 
   // Work out the start point for the search
   in_what = xd->plane[0].pre[0].buf + ref_row * in_what_stride + ref_col;
-  best_address = in_what;
 
   // Check the starting position
   bestsad = fn_ptr->sdf(what, what_stride, in_what, in_what_stride, 0x7fffffff)
@@ -1134,20 +984,9 @@
       }
     }
   }
-
   best_mv->row += best_tr;
   best_mv->col += best_tc;
-
-  this_mv.row = best_mv->row * 8;
-  this_mv.col = best_mv->col * 8;
-
-  if (bestsad == INT_MAX)
-    return INT_MAX;
-
-  return fn_ptr->vf(what, what_stride, best_address, in_what_stride,
-                    (unsigned int *)(&thissad)) +
-                       mv_err_cost(&this_mv, center_mv,
-                                   mvjcost, mvcost, x->errorperbit);
+  return bestsad;
 }
 
 int vp9_diamond_search_sad_c(const MACROBLOCK *x,
@@ -1272,17 +1111,7 @@
       (*num00)++;
     }
   }
-
-  this_mv.row = best_mv->row * 8;
-  this_mv.col = best_mv->col * 8;
-
-  if (bestsad == INT_MAX)
-    return INT_MAX;
-
-  return fn_ptr->vf(what, what_stride, best_address, in_what_stride,
-                    (unsigned int *)(&thissad)) +
-                       mv_err_cost(&this_mv, center_mv,
-                                   mvjcost, mvcost, x->errorperbit);
+  return bestsad;
 }
 
 int vp9_diamond_search_sadx4(const MACROBLOCK *x,
@@ -1448,24 +1277,14 @@
       (*num00)++;
     }
   }
-
-  this_mv.row = best_mv->row * 8;
-  this_mv.col = best_mv->col * 8;
-
-  if (bestsad == INT_MAX)
-    return INT_MAX;
-
-  return fn_ptr->vf(what, what_stride, best_address, in_what_stride,
-                    (unsigned int *)(&thissad)) +
-                    mv_err_cost(&this_mv, center_mv,
-                                mvjcost, mvcost, x->errorperbit);
+  return bestsad;
 }
 
 /* do_refine: If last step (1-away) of n-step search doesn't pick the center
               point as the best match, we will do a final 1-away diamond
               refining search  */
 
-int vp9_full_pixel_diamond(VP9_COMP *cpi, MACROBLOCK *x,
+int vp9_full_pixel_diamond(const VP9_COMP *cpi, MACROBLOCK *x,
                            MV *mvp_full, int step_param,
                            int sadpb, int further_steps, int do_refine,
                            const vp9_variance_fn_ptr_t *fn_ptr,
@@ -1476,6 +1295,8 @@
                                         step_param, sadpb, &n,
                                         fn_ptr, x->nmvjointcost,
                                         x->mvcost, ref_mv);
+  if (bestsme < INT_MAX)
+    bestsme = vp9_get_mvpred_var(x, &temp_mv, ref_mv, fn_ptr, 1);
   *dst_mv = temp_mv;
 
   // If there won't be more n-step search, check to see if refining search is
@@ -1493,6 +1314,8 @@
                                         step_param + n, sadpb, &num00,
                                         fn_ptr, x->nmvjointcost, x->mvcost,
                                         ref_mv);
+      if (thissme < INT_MAX)
+        thissme = vp9_get_mvpred_var(x, &temp_mv, ref_mv, fn_ptr, 1);
 
       // check to see if refining search is needed.
       if (num00 > further_steps - n)
@@ -1512,12 +1335,13 @@
     thissme = cpi->refining_search_sad(x, &best_mv, sadpb, search_range,
                                        fn_ptr, x->nmvjointcost, x->mvcost,
                                        ref_mv);
+    if (thissme < INT_MAX)
+      thissme = vp9_get_mvpred_var(x, &best_mv, ref_mv, fn_ptr, 1);
     if (thissme < bestsme) {
       bestsme = thissme;
       *dst_mv = best_mv;
     }
   }
-
   return bestsme;
 }
 
@@ -1562,15 +1386,7 @@
       }
     }
   }
-
-  if (best_sad < INT_MAX) {
-    unsigned int unused;
-    const MV mv = {best_mv->row * 8, best_mv->col * 8};
-    return fn_ptr->vf(what, what_stride, best_address, in_what_stride, &unused)
-                + mv_err_cost(&mv, center_mv, mvjcost, mvcost, x->errorperbit);
-  } else {
-    return INT_MAX;
-  }
+  return best_sad;
 }
 
 int vp9_full_search_sadx3(const MACROBLOCK *x, const MV *ref_mv,
@@ -1665,17 +1481,7 @@
       c++;
     }
   }
-
-  this_mv.row = best_mv->row * 8;
-  this_mv.col = best_mv->col * 8;
-
-  if (bestsad < INT_MAX)
-    return fn_ptr->vf(what, what_stride, bestaddress, in_what_stride,
-                      (unsigned int *)(&thissad)) +
-                      mv_err_cost(&this_mv, center_mv,
-                                  mvjcost, mvcost, x->errorperbit);
-  else
-    return INT_MAX;
+  return bestsad;
 }
 
 int vp9_full_search_sadx8(const MACROBLOCK *x, const MV *ref_mv,
@@ -1798,17 +1604,7 @@
       c++;
     }
   }
-
-  this_mv.row = best_mv->row * 8;
-  this_mv.col = best_mv->col * 8;
-
-  if (bestsad < INT_MAX)
-    return fn_ptr->vf(what, what_stride, bestaddress, in_what_stride,
-                      (unsigned int *)(&thissad)) +
-                      mv_err_cost(&this_mv, center_mv,
-                                  mvjcost, mvcost, x->errorperbit);
-  else
-    return INT_MAX;
+  return bestsad;
 }
 
 int vp9_refining_search_sad_c(const MACROBLOCK *x,
@@ -1866,16 +1662,7 @@
       best_address = &in_what[ref_mv->row * in_what_stride + ref_mv->col];
     }
   }
-
-  if (bestsad < INT_MAX) {
-    unsigned int unused;
-    const MV mv = {ref_mv->row * 8, ref_mv->col * 8};
-    return fn_ptr->vf(what, what_stride, best_address, in_what_stride,
-                      &unused) +
-        mv_err_cost(&mv, center_mv, mvjcost, mvcost, x->errorperbit);
-  } else {
-    return INT_MAX;
-  }
+  return bestsad;
 }
 
 int vp9_refining_search_sadx4(const MACROBLOCK *x,
@@ -1977,17 +1764,7 @@
                       neighbors[best_site].col;
     }
   }
-
-  this_mv.row = ref_mv->row * 8;
-  this_mv.col = ref_mv->col * 8;
-
-  if (bestsad < INT_MAX)
-    return fn_ptr->vf(what, what_stride, best_address, in_what_stride,
-                      (unsigned int *)(&thissad)) +
-                      mv_err_cost(&this_mv, center_mv,
-                                  mvjcost, mvcost, x->errorperbit);
-  else
-    return INT_MAX;
+  return bestsad;
 }
 
 // This function is called when we do joint motion search in comp_inter_inter
@@ -2055,18 +1832,5 @@
       best_address = &in_what[ref_mv->row * in_what_stride + ref_mv->col];
     }
   }
-
-  this_mv.row = ref_mv->row * 8;
-  this_mv.col = ref_mv->col * 8;
-
-  if (bestsad < INT_MAX) {
-    // FIXME(rbultje, yunqing): add full-pixel averaging variance functions
-    // so we don't have to use the subpixel with xoff=0,yoff=0 here.
-    return fn_ptr->svaf(best_address, in_what_stride, 0, 0, what, what_stride,
-                        (unsigned int *)(&thissad), second_pred) +
-                        mv_err_cost(&this_mv, center_mv,
-                                    mvjcost, mvcost, x->errorperbit);
-  } else {
-    return INT_MAX;
-  }
+  return bestsad;
 }
diff --git a/vp9/encoder/vp9_mcomp.h b/vp9/encoder/vp9_mcomp.h
index 586a74c..39360f1 100644
--- a/vp9/encoder/vp9_mcomp.h
+++ b/vp9/encoder/vp9_mcomp.h
@@ -35,6 +35,19 @@
 void vp9_set_mv_search_range(MACROBLOCK *x, const MV *mv);
 int vp9_mv_bit_cost(const MV *mv, const MV *ref,
                     const int *mvjcost, int *mvcost[2], int weight);
+
+// Utility to compute variance + MV rate cost for a given MV
+int vp9_get_mvpred_var(const MACROBLOCK *x,
+                       MV *best_mv,
+                       const MV *center_mv,
+                       const vp9_variance_fn_ptr_t *vfp,
+                       int use_mvcost);
+int vp9_get_mvpred_av_var(const MACROBLOCK *x,
+                          MV *best_mv,
+                          const MV *center_mv,
+                          const uint8_t *second_pred,
+                          const vp9_variance_fn_ptr_t *vfp,
+                          int use_mvcost);
 void vp9_init_dsmotion_compensation(MACROBLOCK *x, int stride);
 void vp9_init3smotion_compensation(MACROBLOCK *x,  int stride);
 
@@ -42,47 +55,27 @@
 int vp9_init_search_range(struct VP9_COMP *cpi, int size);
 
 // Runs sequence of diamond searches in smaller steps for RD
-int vp9_full_pixel_diamond(struct VP9_COMP *cpi, MACROBLOCK *x,
+int vp9_full_pixel_diamond(const struct VP9_COMP *cpi, MACROBLOCK *x,
                            MV *mvp_full, int step_param,
                            int sadpb, int further_steps, int do_refine,
                            const vp9_variance_fn_ptr_t *fn_ptr,
                            const MV *ref_mv, MV *dst_mv);
 
-int vp9_hex_search(const MACROBLOCK *x,
-                   MV *ref_mv,
-                   int search_param,
-                   int error_per_bit,
-                   int do_init_search,
-                   const vp9_variance_fn_ptr_t *vf,
-                   int use_mvcost,
-                   const MV *center_mv,
-                   MV *best_mv);
-int vp9_bigdia_search(const MACROBLOCK *x,
-                      MV *ref_mv,
-                      int search_param,
-                      int error_per_bit,
-                      int do_init_search,
-                      const vp9_variance_fn_ptr_t *vf,
-                      int use_mvcost,
-                      const MV *center_mv,
-                      MV *best_mv);
-int vp9_square_search(const MACROBLOCK *x,
-                      MV *ref_mv,
-                      int search_param,
-                      int error_per_bit,
-                      int do_init_search,
-                      const vp9_variance_fn_ptr_t *vf,
-                      int use_mvcost,
-                      const MV *center_mv,
-                      MV *best_mv);
-int vp9_fast_hex_search(const MACROBLOCK *x,
-                        MV *ref_mv,
-                        int search_param,
-                        int sad_per_bit,
-                        const vp9_variance_fn_ptr_t *vfp,
-                        int use_mvcost,
-                        const MV *center_mv,
-                        MV *best_mv);
+typedef int (integer_mv_pattern_search_fn) (
+    const MACROBLOCK *x,
+    MV *ref_mv,
+    int search_param,
+    int error_per_bit,
+    int do_init_search,
+    const vp9_variance_fn_ptr_t *vf,
+    int use_mvcost,
+    const MV *center_mv,
+    MV *best_mv);
+
+integer_mv_pattern_search_fn vp9_hex_search;
+integer_mv_pattern_search_fn vp9_bigdia_search;
+integer_mv_pattern_search_fn vp9_square_search;
+integer_mv_pattern_search_fn vp9_fast_hex_search;
 
 typedef int (fractional_mv_step_fp) (
     const MACROBLOCK *x,
diff --git a/vp9/encoder/vp9_onyx_if.c b/vp9/encoder/vp9_onyx_if.c
index 3921ea8..9405de0 100644
--- a/vp9/encoder/vp9_onyx_if.c
+++ b/vp9/encoder/vp9_onyx_if.c
@@ -263,7 +263,7 @@
     // Clear down the complexity map used for rd
     vpx_memset(cpi->complexity_map, 0, cm->mi_rows * cm->mi_cols);
 
-    vp9_enable_segmentation((VP9_PTR)cpi);
+    vp9_enable_segmentation(seg);
     vp9_clearall_segfeatures(seg);
 
     // Select delta coding method
@@ -297,7 +297,7 @@
     cpi->static_mb_pct = 0;
 
     // Disable segmentation
-    vp9_disable_segmentation((VP9_PTR)cpi);
+    vp9_disable_segmentation(seg);
 
     // Clear down the segment features.
     vp9_clearall_segfeatures(seg);
@@ -310,7 +310,7 @@
     cpi->static_mb_pct = 0;
 
     // Disable segmentation and individual segment features by default
-    vp9_disable_segmentation((VP9_PTR)cpi);
+    vp9_disable_segmentation(seg);
     vp9_clearall_segfeatures(seg);
 
     // Scan frames from current to arf frame.
@@ -363,7 +363,7 @@
         // Disable segmentation and clear down features if alt ref
         // is not active for this group
 
-        vp9_disable_segmentation((VP9_PTR)cpi);
+        vp9_disable_segmentation(seg);
 
         vpx_memset(cpi->segmentation_map, 0, cm->mi_rows * cm->mi_cols);
 
@@ -662,6 +662,7 @@
     sf->use_lp32x32fdct = 1;
     sf->subpel_iters_per_step = 1;
     sf->use_fast_coef_updates = 2;
+    sf->use_fast_coef_costing = 1;
 
     sf->adaptive_rd_thresh = 4;
     sf->mode_skip_start = 6;
@@ -698,6 +699,7 @@
     sf->use_lp32x32fdct = 1;
     sf->subpel_iters_per_step = 1;
     sf->use_fast_coef_updates = 2;
+    sf->use_fast_coef_costing = 1;
 
     sf->adaptive_rd_thresh = 4;
     sf->mode_skip_start = 6;
@@ -730,10 +732,12 @@
       sf->intra_uv_mode_mask[i] = INTRA_DC_ONLY;
     }
     sf->use_fast_coef_updates = 2;
+    sf->use_fast_coef_costing = 1;
     sf->adaptive_rd_thresh = 4;
     sf->mode_skip_start = 6;
   }
 }
+
 static void set_rt_speed_feature(VP9_COMMON *cm,
                                  SPEED_FEATURES *sf,
                                  int speed) {
@@ -741,6 +745,7 @@
   sf->adaptive_rd_thresh = 1;
   sf->recode_loop = ((speed < 1) ? ALLOW_RECODE : ALLOW_RECODE_KFMAXBW);
   sf->encode_breakout_thresh = 1;
+  sf->use_fast_coef_costing = 1;
 
   if (speed == 1) {
     sf->use_square_partition_only = !frame_is_intra_only(cm);
@@ -853,10 +858,17 @@
   }
   if (speed >= 6) {
     sf->partition_search_type = VAR_BASED_FIXED_PARTITION;
+    sf->search_method = HEX;
   }
   if (speed >= 7) {
     sf->partition_search_type = VAR_BASED_FIXED_PARTITION;
     sf->use_nonrd_pick_mode = 1;
+    sf->search_method = FAST_HEX;
+  }
+  if (speed >= 8) {
+    int i;
+    for (i = 0; i < BLOCK_SIZES; ++i)
+      sf->disable_inter_mode_mask[i] = 14;   // only search NEARESTMV (0)
   }
 }
 
@@ -915,9 +927,12 @@
   sf->use_uv_intra_rd_estimate = 0;
   sf->use_fast_lpf_pick = 0;
   sf->use_fast_coef_updates = 0;
+  sf->use_fast_coef_costing = 0;
   sf->mode_skip_start = MAX_MODES;  // Mode index at which mode skip mask set
   sf->use_nonrd_pick_mode = 0;
   sf->encode_breakout_thresh = 0;
+  for (i = 0; i < BLOCK_SIZES; ++i)
+    sf->disable_inter_mode_mask[i] = 0;
 
   switch (cpi->oxcf.mode) {
     case MODE_BESTQUALITY:
@@ -1294,8 +1309,7 @@
   cm->log2_tile_rows = cpi->oxcf.tile_rows;
 }
 
-static void init_config(VP9_PTR ptr, VP9_CONFIG *oxcf) {
-  VP9_COMP *cpi = (VP9_COMP *)(ptr);
+static void init_config(struct VP9_COMP *cpi, VP9_CONFIG *oxcf) {
   VP9_COMMON *const cm = &cpi->common;
   int i;
 
@@ -1320,7 +1334,7 @@
   }
 
   // change includes all joint functionality
-  vp9_change_config(ptr, oxcf);
+  vp9_change_config(cpi, oxcf);
 
   // Initialize active best and worst q and average q values.
   if (cpi->pass == 0 && cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER) {
@@ -1364,8 +1378,7 @@
     cpi->fixed_divide[i] = 0x80000 / i;
 }
 
-void vp9_change_config(VP9_PTR ptr, VP9_CONFIG *oxcf) {
-  VP9_COMP *cpi = (VP9_COMP *)(ptr);
+void vp9_change_config(struct VP9_COMP *cpi, VP9_CONFIG *oxcf) {
   VP9_COMMON *const cm = &cpi->common;
 
   if (!cpi || !oxcf)
@@ -1683,30 +1696,19 @@
   }
 }
 
-VP9_PTR vp9_create_compressor(VP9_CONFIG *oxcf) {
+VP9_COMP *vp9_create_compressor(VP9_CONFIG *oxcf) {
   int i, j;
-  volatile union {
-    VP9_COMP *cpi;
-    VP9_PTR   ptr;
-  } ctx;
+  VP9_COMP *cpi = vpx_memalign(32, sizeof(VP9_COMP));
+  VP9_COMMON *cm = cpi != NULL ? &cpi->common : NULL;
 
-  VP9_COMP *cpi;
-  VP9_COMMON *cm;
-
-  cpi = ctx.cpi = vpx_memalign(32, sizeof(VP9_COMP));
-  // Check that the CPI instance is valid
-  if (!cpi)
-    return 0;
-
-  cm = &cpi->common;
+  if (!cm)
+    return NULL;
 
   vp9_zero(*cpi);
 
   if (setjmp(cm->error.jmp)) {
-    VP9_PTR ptr = ctx.ptr;
-
-    ctx.cpi->common.error.setjmp = 0;
-    vp9_remove_compressor(&ptr);
+    cm->error.setjmp = 0;
+    vp9_remove_compressor(cpi);
     return 0;
   }
 
@@ -1715,15 +1717,14 @@
   CHECK_MEM_ERROR(cm, cpi->mb.ss, vpx_calloc(sizeof(search_site),
                                              (MAX_MVSEARCH_STEPS * 8) + 1));
 
-  vp9_create_common(cm);
+  vp9_rtcd();
 
   cpi->use_svc = 0;
 
-  init_config((VP9_PTR)cpi, oxcf);
-
+  init_config(cpi, oxcf);
   init_pick_mode_context(cpi);
 
-  cm->current_video_frame   = 0;
+  cm->current_video_frame = 0;
 
   // Set reference frame sign bias for ALTREF frame to 1 (for now)
   cm->ref_frame_sign_bias[ALTREF_FRAME] = 1;
@@ -1731,8 +1732,8 @@
   cpi->rc.baseline_gf_interval = DEFAULT_GF_INTERVAL;
 
   cpi->gold_is_last = 0;
-  cpi->alt_is_last  = 0;
-  cpi->gold_is_alt  = 0;
+  cpi->alt_is_last = 0;
+  cpi->gold_is_alt = 0;
 
   // Create the encoder segmentation map and set all entries to 0
   CHECK_MEM_ERROR(cm, cpi->segmentation_map,
@@ -2000,21 +2001,16 @@
   vp9_zero(cpi->mode_test_hits);
 #endif
 
-  return (VP9_PTR) cpi;
+  return cpi;
 }
 
-void vp9_remove_compressor(VP9_PTR *ptr) {
-  VP9_COMP *cpi = (VP9_COMP *)(*ptr);
+void vp9_remove_compressor(VP9_COMP *cpi) {
   int i;
 
   if (!cpi)
     return;
 
   if (cpi && (cpi->common.current_video_frame > 0)) {
-    if (cpi->pass == 2) {
-      vp9_end_second_pass(cpi);
-    }
-
 #if CONFIG_INTERNAL_STATS
 
     vp9_clear_system_state();
@@ -2115,7 +2111,6 @@
 
   vp9_remove_common(&cpi->common);
   vpx_free(cpi);
-  *ptr = 0;
 
 #ifdef OUTPUT_YUV_SRC
   fclose(yuv_file);
@@ -2242,9 +2237,7 @@
   vpx_codec_pkt_list_add(cpi->output_pkt_list, &pkt);
 }
 
-int vp9_use_as_reference(VP9_PTR ptr, int ref_frame_flags) {
-  VP9_COMP *cpi = (VP9_COMP *)(ptr);
-
+int vp9_use_as_reference(VP9_COMP *cpi, int ref_frame_flags) {
   if (ref_frame_flags > 7)
     return -1;
 
@@ -2252,9 +2245,7 @@
   return 0;
 }
 
-int vp9_update_reference(VP9_PTR ptr, int ref_frame_flags) {
-  VP9_COMP *cpi = (VP9_COMP *)(ptr);
-
+int vp9_update_reference(VP9_COMP *cpi, int ref_frame_flags) {
   if (ref_frame_flags > 7)
     return -1;
 
@@ -2288,9 +2279,8 @@
   return ref_frame == NONE ? NULL : get_ref_frame_buffer(cpi, ref_frame);
 }
 
-int vp9_copy_reference_enc(VP9_PTR ptr, VP9_REFFRAME ref_frame_flag,
+int vp9_copy_reference_enc(VP9_COMP *cpi, VP9_REFFRAME ref_frame_flag,
                            YV12_BUFFER_CONFIG *sd) {
-  VP9_COMP *const cpi = (VP9_COMP *)ptr;
   YV12_BUFFER_CONFIG *cfg = get_vp9_ref_frame_buffer(cpi, ref_frame_flag);
   if (cfg) {
     vp8_yv12_copy_frame(cfg, sd);
@@ -2300,8 +2290,7 @@
   }
 }
 
-int vp9_get_reference_enc(VP9_PTR ptr, int index, YV12_BUFFER_CONFIG **fb) {
-  VP9_COMP *cpi = (VP9_COMP *)ptr;
+int vp9_get_reference_enc(VP9_COMP *cpi, int index, YV12_BUFFER_CONFIG **fb) {
   VP9_COMMON *cm = &cpi->common;
 
   if (index < 0 || index >= REF_FRAMES)
@@ -2311,9 +2300,8 @@
   return 0;
 }
 
-int vp9_set_reference_enc(VP9_PTR ptr, VP9_REFFRAME ref_frame_flag,
+int vp9_set_reference_enc(VP9_COMP *cpi, VP9_REFFRAME ref_frame_flag,
                           YV12_BUFFER_CONFIG *sd) {
-  VP9_COMP *cpi = (VP9_COMP *)ptr;
   YV12_BUFFER_CONFIG *cfg = get_vp9_ref_frame_buffer(cpi, ref_frame_flag);
   if (cfg) {
     vp8_yv12_copy_frame(sd, cfg);
@@ -2323,9 +2311,9 @@
   }
 }
 
-int vp9_update_entropy(VP9_PTR comp, int update) {
-  ((VP9_COMP *)comp)->ext_refresh_frame_context = update;
-  ((VP9_COMP *)comp)->ext_refresh_frame_context_pending = 1;
+int vp9_update_entropy(VP9_COMP * cpi, int update) {
+  cpi->ext_refresh_frame_context = update;
+  cpi->ext_refresh_frame_context_pending = 1;
   return 0;
 }
 
@@ -2674,8 +2662,7 @@
     vp9_loop_filter_frame(cm, xd, lf->filter_level, 0, 0);
   }
 
-  vp9_extend_frame_inner_borders(cm->frame_to_show,
-                                 cm->subsampling_x, cm->subsampling_y);
+  vp9_extend_frame_inner_borders(cm->frame_to_show);
 }
 
 static void scale_references(VP9_COMP *cpi) {
@@ -3442,10 +3429,9 @@
 }
 
 
-int vp9_receive_raw_frame(VP9_PTR ptr, unsigned int frame_flags,
+int vp9_receive_raw_frame(VP9_COMP *cpi, unsigned int frame_flags,
                           YV12_BUFFER_CONFIG *sd, int64_t time_stamp,
                           int64_t end_time) {
-  VP9_COMP *cpi = (VP9_COMP *)ptr;
   VP9_COMMON *cm = &cpi->common;
   struct vpx_usec_timer timer;
   int res = 0;
@@ -3454,8 +3440,8 @@
 
   check_initial_width(cpi, subsampling_x, subsampling_y);
   vpx_usec_timer_start(&timer);
-  if (vp9_lookahead_push(cpi->lookahead, sd, time_stamp, end_time, frame_flags,
-                         cpi->active_map_enabled ? cpi->active_map : NULL))
+  if (vp9_lookahead_push(cpi->lookahead,
+                         sd, time_stamp, end_time, frame_flags))
     res = -1;
   vpx_usec_timer_mark(&timer);
   cpi->time_receive_data += vpx_usec_timer_elapsed(&timer);
@@ -3528,10 +3514,9 @@
   cpi->last_end_time_stamp_seen = cpi->source->ts_end;
 }
 
-int vp9_get_compressed_data(VP9_PTR ptr, unsigned int *frame_flags,
+int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags,
                             size_t *size, uint8_t *dest,
                             int64_t *time_stamp, int64_t *time_end, int flush) {
-  VP9_COMP *cpi = (VP9_COMP *) ptr;
   VP9_COMMON *cm = &cpi->common;
   MACROBLOCKD *xd = &cpi->mb.e_mbd;
   struct vpx_usec_timer  cmptimer;
@@ -3583,8 +3568,7 @@
         // TODO(agrange) merge these two functions.
         vp9_configure_arnr_filter(cpi, frames_to_arf, cpi->rc.gfu_boost);
         vp9_temporal_filter_prepare(cpi, frames_to_arf);
-        vp9_extend_frame_borders(&cpi->alt_ref_buffer,
-                                 cm->subsampling_x, cm->subsampling_y);
+        vp9_extend_frame_borders(&cpi->alt_ref_buffer);
         force_src_buffer = &cpi->alt_ref_buffer;
       }
 
@@ -3720,7 +3704,7 @@
                                       cm->width, cm->height);
 
     if (vp9_is_scaled(&ref_buf->sf))
-      vp9_extend_frame_borders(buf, cm->subsampling_x, cm->subsampling_y);
+      vp9_extend_frame_borders(buf);
   }
 
   set_ref_ptrs(cm, xd, LAST_FRAME, LAST_FRAME);
@@ -3842,9 +3826,8 @@
   return 0;
 }
 
-int vp9_get_preview_raw_frame(VP9_PTR comp, YV12_BUFFER_CONFIG *dest,
+int vp9_get_preview_raw_frame(VP9_COMP *cpi, YV12_BUFFER_CONFIG *dest,
                               vp9_ppflags_t *flags) {
-  VP9_COMP *cpi = (VP9_COMP *)comp;
   VP9_COMMON *cm = &cpi->common;
 
   if (!cm->show_frame) {
@@ -3872,11 +3855,10 @@
   }
 }
 
-int vp9_set_roimap(VP9_PTR comp, unsigned char *map, unsigned int rows,
+int vp9_set_roimap(VP9_COMP *cpi, unsigned char *map, unsigned int rows,
                    unsigned int cols, int delta_q[MAX_SEGMENTS],
                    int delta_lf[MAX_SEGMENTS],
                    unsigned int threshold[MAX_SEGMENTS]) {
-  VP9_COMP *cpi = (VP9_COMP *) comp;
   signed char feature_data[SEG_LVL_MAX][MAX_SEGMENTS];
   struct segmentation *seg = &cpi->common.seg;
   int i;
@@ -3885,15 +3867,15 @@
     return -1;
 
   if (!map) {
-    vp9_disable_segmentation((VP9_PTR)cpi);
+    vp9_disable_segmentation(seg);
     return 0;
   }
 
   // Set the segmentation Map
-  vp9_set_segmentation_map((VP9_PTR)cpi, map);
+  vp9_set_segmentation_map(cpi, map);
 
   // Activate segmentation.
-  vp9_enable_segmentation((VP9_PTR)cpi);
+  vp9_enable_segmentation(seg);
 
   // Set up the quant, LF and breakout threshold segment data
   for (i = 0; i < MAX_SEGMENTS; i++) {
@@ -3917,15 +3899,13 @@
 
   // Initialize the feature data structure
   // SEGMENT_DELTADATA    0, SEGMENT_ABSDATA      1
-  vp9_set_segment_data((VP9_PTR)cpi, &feature_data[0][0], SEGMENT_DELTADATA);
+  vp9_set_segment_data(seg, &feature_data[0][0], SEGMENT_DELTADATA);
 
   return 0;
 }
 
-int vp9_set_active_map(VP9_PTR comp, unsigned char *map,
+int vp9_set_active_map(VP9_COMP *cpi, unsigned char *map,
                        unsigned int rows, unsigned int cols) {
-  VP9_COMP *cpi = (VP9_COMP *) comp;
-
   if (rows == cpi->common.mb_rows && cols == cpi->common.mb_cols) {
     if (map) {
       vpx_memcpy(cpi->active_map, map, rows * cols);
@@ -3941,9 +3921,8 @@
   }
 }
 
-int vp9_set_internal_size(VP9_PTR comp,
+int vp9_set_internal_size(VP9_COMP *cpi,
                           VPX_SCALING horiz_mode, VPX_SCALING vert_mode) {
-  VP9_COMP *cpi = (VP9_COMP *) comp;
   VP9_COMMON *cm = &cpi->common;
   int hr = 0, hs = 0, vr = 0, vs = 0;
 
@@ -3963,9 +3942,8 @@
   return 0;
 }
 
-int vp9_set_size_literal(VP9_PTR comp, unsigned int width,
+int vp9_set_size_literal(VP9_COMP *cpi, unsigned int width,
                          unsigned int height) {
-  VP9_COMP *cpi = (VP9_COMP *)comp;
   VP9_COMMON *cm = &cpi->common;
 
   check_initial_width(cpi, 1, 1);
@@ -4000,8 +3978,7 @@
   return 0;
 }
 
-void vp9_set_svc(VP9_PTR comp, int use_svc) {
-  VP9_COMP *cpi = (VP9_COMP *)comp;
+void vp9_set_svc(VP9_COMP *cpi, int use_svc) {
   cpi->use_svc = use_svc;
   return;
 }
@@ -4031,6 +4008,6 @@
 }
 
 
-int vp9_get_quantizer(VP9_PTR c) {
-  return ((VP9_COMP *)c)->common.base_qindex;
+int vp9_get_quantizer(VP9_COMP *cpi) {
+  return cpi->common.base_qindex;
 }
diff --git a/vp9/encoder/vp9_onyx_int.h b/vp9/encoder/vp9_onyx_int.h
index c4b018a..f1e5e3a 100644
--- a/vp9/encoder/vp9_onyx_int.h
+++ b/vp9/encoder/vp9_onyx_int.h
@@ -416,6 +416,14 @@
   // This variable sets the encode_breakout threshold. Currently, it is only
   // enabled in real time mode.
   int encode_breakout_thresh;
+
+  // A binary mask indicating if NEARESTMV, NEARMV, ZEROMV, NEWMV
+  // modes are disabled in order from LSB to MSB for each BLOCK_SIZE.
+  int disable_inter_mode_mask[BLOCK_SIZES];
+
+  // This feature controls whether we do the expensive context update and
+  // calculation in the rd coefficient costing loop.
+  int use_fast_coef_costing;
 } SPEED_FEATURES;
 
 typedef struct {
diff --git a/vp9/encoder/vp9_pickmode.c b/vp9/encoder/vp9_pickmode.c
index 0f2e303..75122bc 100644
--- a/vp9/encoder/vp9_pickmode.c
+++ b/vp9/encoder/vp9_pickmode.c
@@ -8,44 +8,33 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#include <stdio.h>
-#include <math.h>
-#include <limits.h>
 #include <assert.h>
+#include <limits.h>
+#include <math.h>
+#include <stdio.h>
 
-#include "vp9/common/vp9_pragmas.h"
-#include "vp9/encoder/vp9_tokenize.h"
-#include "vp9/encoder/vp9_treewriter.h"
-#include "vp9/encoder/vp9_onyx_int.h"
-#include "vp9/common/vp9_entropymode.h"
+#include "./vp9_rtcd.h"
+
+#include "vpx_mem/vpx_mem.h"
+
+#include "vp9/common/vp9_common.h"
+#include "vp9/common/vp9_mvref_common.h"
 #include "vp9/common/vp9_reconinter.h"
 #include "vp9/common/vp9_reconintra.h"
-#include "vp9/common/vp9_quant_common.h"
-#include "vp9/encoder/vp9_encodemb.h"
-#include "vp9/encoder/vp9_quantize.h"
-#include "vp9/encoder/vp9_variance.h"
-#include "vp9/encoder/vp9_mcomp.h"
-#include "vp9/encoder/vp9_rdopt.h"
+
+#include "vp9/encoder/vp9_onyx_int.h"
 #include "vp9/encoder/vp9_ratectrl.h"
-#include "vpx_mem/vpx_mem.h"
-#include "vp9/common/vp9_systemdependent.h"
-#include "vp9/encoder/vp9_encodemv.h"
-#include "vp9/common/vp9_seg_common.h"
-#include "vp9/common/vp9_pred_common.h"
-#include "vp9/common/vp9_entropy.h"
-#include "./vp9_rtcd.h"
-#include "vp9/common/vp9_mvref_common.h"
-#include "vp9/common/vp9_common.h"
+#include "vp9/encoder/vp9_rdopt.h"
 
 static int full_pixel_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
-                                     const TileInfo *const tile,
-                                     BLOCK_SIZE bsize, int mi_row, int mi_col,
-                                     int_mv *tmp_mv, int *rate_mv) {
+                                    const TileInfo *const tile,
+                                    BLOCK_SIZE bsize, int mi_row, int mi_col,
+                                    int_mv *tmp_mv, int *rate_mv) {
   MACROBLOCKD *xd = &x->e_mbd;
   MB_MODE_INFO *mbmi = &xd->mi_8x8[0]->mbmi;
   struct buf_2d backup_yv12[MAX_MB_PLANE] = {{0}};
   int bestsme = INT_MAX;
-  int further_steps, step_param;
+  int step_param;
   int sadpb = x->sadperbit16;
   MV mvp_full;
   int ref = mbmi->ref_frame[0];
@@ -78,7 +67,6 @@
   // TODO(jingning) exploiting adaptive motion search control in non-RD
   // mode decision too.
   step_param = 6;
-  further_steps = (cpi->sf.max_step_search_steps - 1) - step_param;
 
   for (i = LAST_FRAME; i <= LAST_FRAME && cpi->common.show_frame; ++i) {
     if ((x->pred_mv_sad[ref] >> 3) > x->pred_mv_sad[i]) {
@@ -99,14 +87,33 @@
   mvp_full.row >>= 3;
 
   if (cpi->sf.search_method == FAST_HEX) {
-    vp9_fast_hex_search(x, &mvp_full, step_param, sadpb, &cpi->fn_ptr[bsize],
-                        1, &ref_mv.as_mv, &tmp_mv->as_mv);
+    // NOTE: this returns SAD
+    bestsme = vp9_fast_hex_search(x, &mvp_full, step_param, sadpb, 0,
+                                  &cpi->fn_ptr[bsize], 1,
+                                  &ref_mv.as_mv, &tmp_mv->as_mv);
+  } else if (cpi->sf.search_method == HEX) {
+    // NOTE: this returns SAD
+    bestsme = vp9_hex_search(x, &mvp_full, step_param, sadpb, 1,
+                             &cpi->fn_ptr[bsize], 1,
+                             &ref_mv.as_mv, &tmp_mv->as_mv);
+  } else if (cpi->sf.search_method == SQUARE) {
+    // NOTE: this returns SAD
+    bestsme = vp9_square_search(x, &mvp_full, step_param, sadpb, 1,
+                                &cpi->fn_ptr[bsize], 1,
+                                &ref_mv.as_mv, &tmp_mv->as_mv);
+  } else if (cpi->sf.search_method == BIGDIA) {
+    // NOTE: this returns SAD
+    bestsme = vp9_bigdia_search(x, &mvp_full, step_param, sadpb, 1,
+                                &cpi->fn_ptr[bsize], 1,
+                                &ref_mv.as_mv, &tmp_mv->as_mv);
   } else {
-    vp9_full_pixel_diamond(cpi, x, &mvp_full, step_param, sadpb, further_steps,
-                           1, &cpi->fn_ptr[bsize], &ref_mv.as_mv,
-                           &tmp_mv->as_mv);
+    int further_steps = (cpi->sf.max_step_search_steps - 1) - step_param;
+    // NOTE: this returns variance
+    bestsme = vp9_full_pixel_diamond(cpi, x, &mvp_full, step_param,
+                                     sadpb, further_steps, 1,
+                                     &cpi->fn_ptr[bsize],
+                                     &ref_mv.as_mv, &tmp_mv->as_mv);
   }
-
   x->mv_col_min = tmp_col_min;
   x->mv_col_max = tmp_col_max;
   x->mv_row_min = tmp_row_min;
@@ -140,12 +147,12 @@
 static void sub_pixel_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
                                     const TileInfo *const tile,
                                     BLOCK_SIZE bsize, int mi_row, int mi_col,
-                                    int_mv *tmp_mv) {
+                                    MV *tmp_mv) {
   MACROBLOCKD *xd = &x->e_mbd;
   MB_MODE_INFO *mbmi = &xd->mi_8x8[0]->mbmi;
   struct buf_2d backup_yv12[MAX_MB_PLANE] = {{0}};
   int ref = mbmi->ref_frame[0];
-  int_mv ref_mv = mbmi->ref_mvs[ref][0];
+  MV ref_mv = mbmi->ref_mvs[ref][0].as_mv;
   int dis;
 
   const YV12_BUFFER_CONFIG *scaled_ref_frame = vp9_get_scaled_ref_frame(cpi,
@@ -161,10 +168,10 @@
     setup_pre_planes(xd, 0, scaled_ref_frame, mi_row, mi_col, NULL);
   }
 
-  tmp_mv->as_mv.col >>= 3;
-  tmp_mv->as_mv.row >>= 3;
+  tmp_mv->col >>= 3;
+  tmp_mv->row >>= 3;
 
-  cpi->find_fractional_mv_step(x, &tmp_mv->as_mv, &ref_mv.as_mv,
+  cpi->find_fractional_mv_step(x, tmp_mv, &ref_mv,
                                cpi->common.allow_high_precision_mv,
                                x->errorperbit,
                                &cpi->fn_ptr[bsize],
@@ -200,8 +207,8 @@
   static const int flag_list[4] = { 0, VP9_LAST_FLAG, VP9_GOLD_FLAG,
                                     VP9_ALT_FLAG };
   int64_t best_rd = INT64_MAX;
-  int64_t this_rd;
-  static const int cost[4]= { 0, 50, 75, 100 };
+  int64_t this_rd = INT64_MAX;
+  static const int cost[4]= { 0, 2, 4, 6 };
 
   const int64_t inter_mode_thresh = 300;
   const int64_t intra_mode_cost = 50;
@@ -239,7 +246,6 @@
 
   for (ref_frame = LAST_FRAME; ref_frame <= LAST_FRAME ; ++ref_frame) {
     int rate_mv = 0;
-
     if (!(cpi->ref_frame_flags & flag_list[ref_frame]))
       continue;
 
@@ -252,11 +258,15 @@
     mbmi->ref_frame[0] = ref_frame;
 
     for (this_mode = NEARESTMV; this_mode <= NEWMV; ++this_mode) {
-      int rate = cost[INTER_OFFSET(this_mode)];
+      int rate = cost[INTER_OFFSET(this_mode)]
+          << (num_pels_log2_lookup[bsize] - 4);
       int64_t dist;
+      if (cpi->sf.disable_inter_mode_mask[bsize] &
+          (1 << INTER_OFFSET(this_mode)))
+        continue;
 
       if (this_mode == NEWMV) {
-        if (this_rd < 500)
+        if (this_rd < (int64_t)(1 << num_pels_log2_lookup[bsize]))
           continue;
 
         x->mode_sad[ref_frame][INTER_OFFSET(NEWMV)] =
@@ -267,7 +277,7 @@
           continue;
 
         sub_pixel_motion_search(cpi, x, tile, bsize, mi_row, mi_col,
-                                &frame_mv[NEWMV][ref_frame]);
+                                &frame_mv[NEWMV][ref_frame].as_mv);
       }
 
       if (frame_mv[this_mode][ref_frame].as_int == 0) {
@@ -323,6 +333,5 @@
       }
     }
   }
-
   return INT64_MAX;
 }
diff --git a/vp9/encoder/vp9_ratectrl.c b/vp9/encoder/vp9_ratectrl.c
index 89aa821..8430e4b 100644
--- a/vp9/encoder/vp9_ratectrl.c
+++ b/vp9/encoder/vp9_ratectrl.c
@@ -587,6 +587,7 @@
                                                    q_adj_factor);
     }
   } else if (!rc->is_src_frame_alt_ref &&
+             !cpi->use_svc &&
              (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame)) {
     // Use the lower of active_worst_quality and recent
     // average Q as basis for GF/ARF best Q limit unless last frame was
diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c
index b57b948..a6f54fd 100644
--- a/vp9/encoder/vp9_rdopt.c
+++ b/vp9/encoder/vp9_rdopt.c
@@ -8,35 +8,38 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#include <stdio.h>
-#include <math.h>
-#include <limits.h>
 #include <assert.h>
+#include <limits.h>
+#include <math.h>
+#include <stdio.h>
 
-#include "vp9/common/vp9_pragmas.h"
-#include "vp9/encoder/vp9_tokenize.h"
-#include "vp9/encoder/vp9_treewriter.h"
-#include "vp9/encoder/vp9_onyx_int.h"
+#include "./vp9_rtcd.h"
+
+#include "vpx_mem/vpx_mem.h"
+
+#include "vp9/common/vp9_common.h"
+#include "vp9/common/vp9_entropy.h"
 #include "vp9/common/vp9_entropymode.h"
+#include "vp9/common/vp9_idct.h"
+#include "vp9/common/vp9_mvref_common.h"
+#include "vp9/common/vp9_pragmas.h"
+#include "vp9/common/vp9_pred_common.h"
+#include "vp9/common/vp9_quant_common.h"
 #include "vp9/common/vp9_reconinter.h"
 #include "vp9/common/vp9_reconintra.h"
-#include "vp9/common/vp9_quant_common.h"
-#include "vp9/encoder/vp9_encodemb.h"
-#include "vp9/encoder/vp9_quantize.h"
-#include "vp9/encoder/vp9_variance.h"
-#include "vp9/encoder/vp9_mcomp.h"
-#include "vp9/encoder/vp9_rdopt.h"
-#include "vp9/encoder/vp9_ratectrl.h"
-#include "vpx_mem/vpx_mem.h"
-#include "vp9/common/vp9_systemdependent.h"
-#include "vp9/encoder/vp9_encodemv.h"
 #include "vp9/common/vp9_seg_common.h"
-#include "vp9/common/vp9_pred_common.h"
-#include "vp9/common/vp9_entropy.h"
-#include "./vp9_rtcd.h"
-#include "vp9/common/vp9_mvref_common.h"
-#include "vp9/common/vp9_common.h"
-#include "vp9/common/vp9_idct.h"
+#include "vp9/common/vp9_systemdependent.h"
+
+#include "vp9/encoder/vp9_encodemb.h"
+#include "vp9/encoder/vp9_encodemv.h"
+#include "vp9/encoder/vp9_mcomp.h"
+#include "vp9/encoder/vp9_onyx_int.h"
+#include "vp9/encoder/vp9_quantize.h"
+#include "vp9/encoder/vp9_ratectrl.h"
+#include "vp9/encoder/vp9_rdopt.h"
+#include "vp9/encoder/vp9_tokenize.h"
+#include "vp9/encoder/vp9_treewriter.h"
+#include "vp9/encoder/vp9_variance.h"
 
 /* Factor to weigh the rate for switchable interp filters */
 #define SWITCHABLE_INTERP_RATE_FACTOR 1
@@ -69,6 +72,7 @@
   int64_t this_rd;
   int64_t best_rd;
   int skip;
+  int use_fast_coef_costing;
   const scan_order *so;
 };
 
@@ -142,9 +146,8 @@
 }
 
 static void fill_mode_costs(VP9_COMP *cpi) {
-  VP9_COMMON *const cm = &cpi->common;
   MACROBLOCK *const x = &cpi->mb;
-  FRAME_CONTEXT *const fc = &cm->fc;
+  const FRAME_CONTEXT *const fc = &cpi->common.fc;
   int i, j;
 
   for (i = 0; i < INTRA_MODES; i++)
@@ -161,8 +164,7 @@
 
   for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; ++i)
     vp9_cost_tokens((int *)x->switchable_interp_costs[i],
-                    fc->switchable_interp_prob[i],
-                    vp9_switchable_interp_tree);
+                    fc->switchable_interp_prob[i], vp9_switchable_interp_tree);
 }
 
 static void fill_token_costs(vp9_coeff_cost *c,
@@ -211,7 +213,7 @@
   }
 }
 
-int vp9_compute_rd_mult(VP9_COMP *cpi, int qindex) {
+int vp9_compute_rd_mult(const VP9_COMP *cpi, int qindex) {
   const int q = vp9_dc_quant(qindex, 0);
   // TODO(debargha): Adjust the function below
   int rdmult = 88 * q * q / 25;
@@ -225,12 +227,9 @@
 }
 
 static int compute_rd_thresh_factor(int qindex) {
-  int q;
   // TODO(debargha): Adjust the function below
-  q = (int)(pow(vp9_dc_quant(qindex, 0) / 4.0, RD_THRESH_POW) * 5.12);
-  if (q < 8)
-    q = 8;
-  return q;
+  const int q = (int)(pow(vp9_dc_quant(qindex, 0) / 4.0, RD_THRESH_POW) * 5.12);
+  return MAX(q, 8);
 }
 
 void vp9_initialize_me_consts(VP9_COMP *cpi, int qindex) {
@@ -239,9 +238,9 @@
 }
 
 static void set_block_thresholds(VP9_COMP *cpi) {
+  const VP9_COMMON *const cm = &cpi->common;
+  const SPEED_FEATURES *const sf = &cpi->sf;
   int i, bsize, segment_id;
-  VP9_COMMON *cm = &cpi->common;
-  SPEED_FEATURES *sf = &cpi->sf;
 
   for (segment_id = 0; segment_id < MAX_SEGMENTS; ++segment_id) {
     const int qindex = clamp(vp9_get_qindex(&cm->seg, segment_id,
@@ -271,8 +270,8 @@
 }
 
 void vp9_initialize_rd_consts(VP9_COMP *cpi) {
-  VP9_COMMON *cm = &cpi->common;
-  MACROBLOCK *x = &cpi->mb;
+  VP9_COMMON *const cm = &cpi->common;
+  MACROBLOCK *const x = &cpi->mb;
   int i;
 
   vp9_clear_system_state();
@@ -430,7 +429,7 @@
   int i;
   int64_t rate_sum = 0;
   int64_t dist_sum = 0;
-  int ref = xd->mi_8x8[0]->mbmi.ref_frame[0];
+  const int ref = xd->mi_8x8[0]->mbmi.ref_frame[0];
   unsigned int sse;
 
   for (i = 0; i < MAX_MB_PLANE; ++i) {
@@ -479,8 +478,8 @@
                                  int *out_skip) {
   int j, k;
   BLOCK_SIZE bs;
-  struct macroblock_plane *const p = &x->plane[0];
-  struct macroblockd_plane *const pd = &xd->plane[0];
+  const struct macroblock_plane *const p = &x->plane[0];
+  const struct macroblockd_plane *const pd = &xd->plane[0];
   const int width = 4 * num_4x4_blocks_wide_lookup[bsize];
   const int height = 4 * num_4x4_blocks_high_lookup[bsize];
   int rate_sum = 0;
@@ -546,16 +545,16 @@
   { 1, 2, 3, 4, 11,  256 - 21, 0 },
   { 1, 2, 3, 4, 11, 1024 - 21, 0 },
 };
-
 static INLINE int cost_coeffs(MACROBLOCK *x,
                               int plane, int block,
                               ENTROPY_CONTEXT *A, ENTROPY_CONTEXT *L,
                               TX_SIZE tx_size,
-                              const int16_t *scan, const int16_t *nb) {
+                              const int16_t *scan, const int16_t *nb,
+                              int use_fast_coef_costing) {
   MACROBLOCKD *const xd = &x->e_mbd;
   MB_MODE_INFO *mbmi = &xd->mi_8x8[0]->mbmi;
-  struct macroblock_plane *p = &x->plane[plane];
-  struct macroblockd_plane *pd = &xd->plane[plane];
+  const struct macroblock_plane *p = &x->plane[plane];
+  const struct macroblockd_plane *pd = &xd->plane[plane];
   const PLANE_TYPE type = pd->plane_type;
   const int16_t *band_count = &band_counts[tx_size][1];
   const int eob = p->eobs[block];
@@ -565,7 +564,6 @@
   uint8_t *p_tok = x->token_cache;
   int pt = combine_entropy_contexts(*A, *L);
   int c, cost;
-
   // Check for consistency of tx_size with mode info
   assert(type == PLANE_TYPE_Y ? mbmi->tx_size == tx_size
                               : get_uv_tx_size(mbmi) == tx_size);
@@ -591,9 +589,13 @@
 
       v = qcoeff[rc];
       t = vp9_dct_value_tokens_ptr[v].token;
-      pt = get_coef_context(nb, p_tok, c);
-      cost += (*token_costs)[!prev_t][pt][t] + vp9_dct_value_cost_ptr[v];
-      p_tok[rc] = vp9_pt_energy_class[t];
+      if (use_fast_coef_costing) {
+        cost += (*token_costs)[!prev_t][!prev_t][t] + vp9_dct_value_cost_ptr[v];
+      } else {
+        pt = get_coef_context(nb, p_tok, c);
+        cost += (*token_costs)[!prev_t][pt][t] + vp9_dct_value_cost_ptr[v];
+        p_tok[rc] = vp9_pt_energy_class[t];
+      }
       prev_t = t;
       if (!--band_left) {
         band_left = *band_count++;
@@ -603,8 +605,12 @@
 
     // eob token
     if (band_left) {
-      pt = get_coef_context(nb, p_tok, c);
-      cost += (*token_costs)[0][pt][EOB_TOKEN];
+      if (use_fast_coef_costing) {
+        cost += (*token_costs)[0][!prev_t][EOB_TOKEN];
+      } else {
+        pt = get_coef_context(nb, p_tok, c);
+        cost += (*token_costs)[0][pt][EOB_TOKEN];
+      }
     }
   }
 
@@ -613,14 +619,13 @@
 
   return cost;
 }
-
 static void dist_block(int plane, int block, TX_SIZE tx_size,
                        struct rdcost_block_args* args) {
   const int ss_txfrm_size = tx_size << 1;
   MACROBLOCK* const x = args->x;
   MACROBLOCKD* const xd = &x->e_mbd;
-  struct macroblock_plane *const p = &x->plane[plane];
-  struct macroblockd_plane *const pd = &xd->plane[plane];
+  const struct macroblock_plane *const p = &x->plane[plane];
+  const struct macroblockd_plane *const pd = &xd->plane[plane];
   int64_t this_sse;
   int shift = tx_size == TX_32X32 ? 0 : 2;
   int16_t *const coeff = BLOCK_OFFSET(p->coeff, block);
@@ -645,7 +650,8 @@
 
   args->rate = cost_coeffs(args->x, plane, block, args->t_above + x_idx,
                            args->t_left + y_idx, tx_size,
-                           args->so->scan, args->so->neighbors);
+                           args->so->scan, args->so->neighbors,
+                           args->use_fast_coef_costing);
 }
 
 static void block_rd_txfm(int plane, int block, BLOCK_SIZE plane_bsize,
@@ -653,7 +659,7 @@
   struct rdcost_block_args *args = arg;
   MACROBLOCK *const x = args->x;
   MACROBLOCKD *const xd = &x->e_mbd;
-  MB_MODE_INFO *mbmi = &xd->mi_8x8[0]->mbmi;
+  MB_MODE_INFO *const mbmi = &xd->mi_8x8[0]->mbmi;
   int64_t rd1, rd2, rd;
 
   if (args->skip)
@@ -729,12 +735,14 @@
                              int *rate, int64_t *distortion,
                              int *skippable, int64_t *sse,
                              int64_t ref_best_rd, int plane,
-                             BLOCK_SIZE bsize, TX_SIZE tx_size) {
+                             BLOCK_SIZE bsize, TX_SIZE tx_size,
+                             int use_fast_coef_casting) {
   MACROBLOCKD *const xd = &x->e_mbd;
-  struct macroblockd_plane *const pd = &xd->plane[plane];
+  const struct macroblockd_plane *const pd = &xd->plane[plane];
   struct rdcost_block_args args = { 0 };
   args.x = x;
   args.best_rd = ref_best_rd;
+  args.use_fast_coef_costing = use_fast_coef_casting;
 
   if (plane == 0)
     xd->mi_8x8[0]->mbmi.tx_size = tx_size;
@@ -773,7 +781,7 @@
 
   txfm_rd_in_plane(x, rate, distortion, skip,
                    &sse[mbmi->tx_size], ref_best_rd, 0, bs,
-                   mbmi->tx_size);
+                   mbmi->tx_size, cpi->sf.use_fast_coef_costing);
   cpi->tx_stepdown_count[0]++;
 }
 
@@ -917,7 +925,8 @@
   // Actually encode using the chosen mode if a model was used, but do not
   // update the r, d costs
   txfm_rd_in_plane(x, rate, distortion, skip,
-                   &sse[mbmi->tx_size], ref_best_rd, 0, bs, mbmi->tx_size);
+                   &sse[mbmi->tx_size], ref_best_rd, 0, bs, mbmi->tx_size,
+                   cpi->sf.use_fast_coef_costing);
 
   if (max_tx_size == TX_32X32 && best_tx == TX_32X32) {
     cpi->tx_stepdown_count[0]++;
@@ -965,7 +974,8 @@
     for (tx_size = TX_4X4; tx_size <= max_tx_size; ++tx_size)
       txfm_rd_in_plane(x, &r[tx_size][0], &d[tx_size],
                        &s[tx_size], &sse[tx_size],
-                       ref_best_rd, 0, bs, tx_size);
+                       ref_best_rd, 0, bs, tx_size,
+                       cpi->sf.use_fast_coef_costing);
     choose_txfm_size_from_rd(cpi, x, r, rate, d, distortion, s,
                              skip, txfm_cache, bs);
   }
@@ -994,7 +1004,8 @@
     for (tx_size = TX_4X4; tx_size <= max_txsize_lookup[bs]; ++tx_size)
       txfm_rd_in_plane(x, &r[tx_size][0], &d[tx_size],
                        &s[tx_size], &sse[tx_size],
-                       ref_best_rd, 0, bs, tx_size);
+                       ref_best_rd, 0, bs, tx_size,
+                       cpi->sf.use_fast_coef_costing);
     choose_txfm_size_from_rd(cpi, x, r, rate, d, distortion, s,
                              skip, txfm_cache, bs);
   }
@@ -1097,7 +1108,8 @@
           vp9_fwht4x4(src_diff, coeff, 8);
           vp9_regular_quantize_b_4x4(x, 0, block, so->scan, so->iscan);
           ratey += cost_coeffs(x, 0, block, tempa + idx, templ + idy, TX_4X4,
-                               so->scan, so->neighbors);
+                               so->scan, so->neighbors,
+                               cpi->sf.use_fast_coef_costing);
           if (RDCOST(x->rdmult, x->rddiv, ratey, distortion) >= best_rd)
             goto next;
           vp9_iwht4x4_add(BLOCK_OFFSET(pd->dqcoeff, block), dst, dst_stride,
@@ -1109,7 +1121,8 @@
           vp9_fht4x4(src_diff, coeff, 8, tx_type);
           vp9_regular_quantize_b_4x4(x, 0, block, so->scan, so->iscan);
           ratey += cost_coeffs(x, 0, block, tempa + idx, templ + idy, TX_4X4,
-                               so->scan, so->neighbors);
+                             so->scan, so->neighbors,
+                             cpi->sf.use_fast_coef_costing);
           distortion += vp9_block_error(coeff, BLOCK_OFFSET(pd->dqcoeff, block),
                                         16, &unused) >> 2;
           if (RDCOST(x->rdmult, x->rddiv, ratey, distortion) >= best_rd)
@@ -1293,7 +1306,7 @@
   return best_rd;
 }
 
-static void super_block_uvrd(MACROBLOCK *x,
+static void super_block_uvrd(const VP9_COMP *cpi, MACROBLOCK *x,
                              int *rate, int64_t *distortion, int *skippable,
                              int64_t *sse, BLOCK_SIZE bsize,
                              int64_t ref_best_rd) {
@@ -1320,7 +1333,8 @@
 
   for (plane = 1; plane < MAX_MB_PLANE; ++plane) {
     txfm_rd_in_plane(x, &pnrate, &pndist, &pnskip, &pnsse,
-                     ref_best_rd, plane, bsize, uv_txfm_size);
+                     ref_best_rd, plane, bsize, uv_txfm_size,
+                     cpi->sf.use_fast_coef_costing);
     if (pnrate == INT_MAX)
       goto term;
     *rate += pnrate;
@@ -1356,7 +1370,7 @@
 
     xd->mi_8x8[0]->mbmi.uv_mode = mode;
 
-    super_block_uvrd(x, &this_rate_tokenonly,
+    super_block_uvrd(cpi, x, &this_rate_tokenonly,
                      &this_distortion, &s, &this_sse, bsize, best_rd);
     if (this_rate_tokenonly == INT_MAX)
       continue;
@@ -1399,14 +1413,15 @@
   return best_rd;
 }
 
-static int64_t rd_sbuv_dcpred(const VP9_COMMON *cm, MACROBLOCK *x,
+static int64_t rd_sbuv_dcpred(const VP9_COMP *cpi, MACROBLOCK *x,
                               int *rate, int *rate_tokenonly,
                               int64_t *distortion, int *skippable,
                               BLOCK_SIZE bsize) {
+  const VP9_COMMON *cm = &cpi->common;
   int64_t unused;
 
   x->e_mbd.mi_8x8[0]->mbmi.uv_mode = DC_PRED;
-  super_block_uvrd(x, rate_tokenonly, distortion,
+  super_block_uvrd(cpi, x, rate_tokenonly, distortion,
                    skippable, &unused, bsize, INT64_MAX);
   *rate = *rate_tokenonly + x->intra_uv_mode_cost[cm->frame_type][DC_PRED];
   return RDCOST(x->rdmult, x->rddiv, *rate, *distortion);
@@ -1422,7 +1437,7 @@
   // Use an estimated rd for uv_intra based on DC_PRED if the
   // appropriate speed flag is set.
   if (cpi->sf.use_uv_intra_rd_estimate) {
-    rd_sbuv_dcpred(&cpi->common, x, rate_uv, rate_uv_tokenonly, dist_uv,
+    rd_sbuv_dcpred(cpi, x, rate_uv, rate_uv_tokenonly, dist_uv,
                    skip_uv, bsize < BLOCK_8X8 ? BLOCK_8X8 : bsize);
   // Else do a proper rd search for each possible transform size that may
   // be considered in the main rd loop.
@@ -1585,7 +1600,8 @@
                                         16, &ssz);
       thissse += ssz;
       thisrate += cost_coeffs(x, 0, k, ta + (k & 1), tl + (k >> 1), TX_4X4,
-                              so->scan, so->neighbors);
+                              so->scan, so->neighbors,
+                              cpi->sf.use_fast_coef_costing);
       rd1 = RDCOST(x->rdmult, x->rddiv, thisrate, thisdistortion >> 2);
       rd2 = RDCOST(x->rdmult, x->rddiv, 0, thissse >> 2);
       rd = MIN(rd1, rd2);
@@ -1614,7 +1630,7 @@
 } SEG_RDSTAT;
 
 typedef struct {
-  int_mv *ref_mv, *second_ref_mv;
+  int_mv *ref_mv[2];
   int_mv mvp;
 
   int64_t segment_rd;
@@ -1725,6 +1741,8 @@
 
         mode_idx = INTER_OFFSET(this_mode);
         bsi->rdstat[i][mode_idx].brdcost = INT64_MAX;
+        if (cpi->sf.disable_inter_mode_mask[bsize] & (1 << mode_idx))
+          continue;
 
         // if we're near/nearest and mv == 0,0, compare to zeromv
         if ((this_mode == NEARMV || this_mode == NEARESTMV ||
@@ -1822,30 +1840,42 @@
           // adjust src pointer for this block
           mi_buf_shift(x, i);
 
-          vp9_set_mv_search_range(x, &bsi->ref_mv->as_mv);
+          vp9_set_mv_search_range(x, &bsi->ref_mv[0]->as_mv);
 
           if (cpi->sf.search_method == HEX) {
             bestsme = vp9_hex_search(x, &mvp_full,
                                      step_param,
                                      sadpb, 1, v_fn_ptr, 1,
-                                     &bsi->ref_mv->as_mv,
+                                     &bsi->ref_mv[0]->as_mv,
                                      &new_mv->as_mv);
+            if (bestsme < INT_MAX)
+              bestsme = vp9_get_mvpred_var(x, &new_mv->as_mv,
+                                           &bsi->ref_mv[0]->as_mv,
+                                           v_fn_ptr, 1);
           } else if (cpi->sf.search_method == SQUARE) {
             bestsme = vp9_square_search(x, &mvp_full,
                                         step_param,
                                         sadpb, 1, v_fn_ptr, 1,
-                                        &bsi->ref_mv->as_mv,
+                                        &bsi->ref_mv[0]->as_mv,
                                         &new_mv->as_mv);
+            if (bestsme < INT_MAX)
+              bestsme = vp9_get_mvpred_var(x, &new_mv->as_mv,
+                                           &bsi->ref_mv[0]->as_mv,
+                                           v_fn_ptr, 1);
           } else if (cpi->sf.search_method == BIGDIA) {
             bestsme = vp9_bigdia_search(x, &mvp_full,
                                         step_param,
                                         sadpb, 1, v_fn_ptr, 1,
-                                        &bsi->ref_mv->as_mv,
+                                        &bsi->ref_mv[0]->as_mv,
                                         &new_mv->as_mv);
+            if (bestsme < INT_MAX)
+              bestsme = vp9_get_mvpred_var(x, &new_mv->as_mv,
+                                           &bsi->ref_mv[0]->as_mv,
+                                           v_fn_ptr, 1);
           } else {
             bestsme = vp9_full_pixel_diamond(cpi, x, &mvp_full, step_param,
                                              sadpb, further_steps, 0, v_fn_ptr,
-                                             &bsi->ref_mv->as_mv,
+                                             &bsi->ref_mv[0]->as_mv,
                                              &new_mv->as_mv);
           }
 
@@ -1859,7 +1889,7 @@
             thissme = cpi->full_search_sad(x, &mvp_full,
                                            sadpb, 16, v_fn_ptr,
                                            x->nmvjointcost, x->mvcost,
-                                           &bsi->ref_mv->as_mv,
+                                           &bsi->ref_mv[0]->as_mv,
                                            &best_mv->as_mv);
             if (thissme < bestsme) {
               bestsme = thissme;
@@ -1875,7 +1905,7 @@
             int distortion;
             cpi->find_fractional_mv_step(x,
                                          &new_mv->as_mv,
-                                         &bsi->ref_mv->as_mv,
+                                         &bsi->ref_mv[0]->as_mv,
                                          cm->allow_high_precision_mv,
                                          x->errorperbit, v_fn_ptr,
                                          cpi->sf.subpel_force_stop,
@@ -1922,7 +1952,7 @@
         bsi->rdstat[i][mode_idx].brate =
             labels2mode(x, i, this_mode, &mode_mv[this_mode],
                         &second_mode_mv[this_mode], frame_mv, seg_mvs[i],
-                        bsi->ref_mv, bsi->second_ref_mv, x->nmvjointcost,
+                        bsi->ref_mv[0], bsi->ref_mv[1], x->nmvjointcost,
                         x->mvcost, cpi);
 
 
@@ -2031,7 +2061,7 @@
 
       labels2mode(x, i, mode_selected, &mode_mv[mode_selected],
                   &second_mode_mv[mode_selected], frame_mv, seg_mvs[i],
-                  bsi->ref_mv, bsi->second_ref_mv, x->nmvjointcost,
+                  bsi->ref_mv[0], bsi->ref_mv[1], x->nmvjointcost,
                   x->mvcost, cpi);
 
       br += bsi->rdstat[i][mode_idx].brate;
@@ -2086,8 +2116,8 @@
   vp9_zero(*bsi);
 
   bsi->segment_rd = best_rd;
-  bsi->ref_mv = best_ref_mv;
-  bsi->second_ref_mv = second_best_ref_mv;
+  bsi->ref_mv[0] = best_ref_mv;
+  bsi->ref_mv[1] = second_best_ref_mv;
   bsi->mvp.as_int = best_ref_mv->as_int;
   bsi->mvthresh = mvthresh;
 
@@ -2452,21 +2482,33 @@
   further_steps = (cpi->sf.max_step_search_steps - 1) - step_param;
 
   if (cpi->sf.search_method == FAST_HEX) {
-    bestsme = vp9_fast_hex_search(x, &mvp_full, step_param, sadpb,
+    bestsme = vp9_fast_hex_search(x, &mvp_full, step_param, sadpb, 0,
                                   &cpi->fn_ptr[bsize], 1,
                                   &ref_mv, &tmp_mv->as_mv);
+    if (bestsme < INT_MAX)
+      bestsme = vp9_get_mvpred_var(x, &tmp_mv->as_mv, &ref_mv,
+                                   &cpi->fn_ptr[bsize], 1);
   } else if (cpi->sf.search_method == HEX) {
     bestsme = vp9_hex_search(x, &mvp_full, step_param, sadpb, 1,
                              &cpi->fn_ptr[bsize], 1,
                              &ref_mv, &tmp_mv->as_mv);
+    if (bestsme < INT_MAX)
+      bestsme = vp9_get_mvpred_var(x, &tmp_mv->as_mv, &ref_mv,
+                                   &cpi->fn_ptr[bsize], 1);
   } else if (cpi->sf.search_method == SQUARE) {
     bestsme = vp9_square_search(x, &mvp_full, step_param, sadpb, 1,
                                 &cpi->fn_ptr[bsize], 1,
                                 &ref_mv, &tmp_mv->as_mv);
+    if (bestsme < INT_MAX)
+      bestsme = vp9_get_mvpred_var(x, &tmp_mv->as_mv, &ref_mv,
+                                   &cpi->fn_ptr[bsize], 1);
   } else if (cpi->sf.search_method == BIGDIA) {
     bestsme = vp9_bigdia_search(x, &mvp_full, step_param, sadpb, 1,
                                 &cpi->fn_ptr[bsize], 1,
                                 &ref_mv, &tmp_mv->as_mv);
+    if (bestsme < INT_MAX)
+      bestsme = vp9_get_mvpred_var(x, &tmp_mv->as_mv, &ref_mv,
+                                   &cpi->fn_ptr[bsize], 1);
   } else {
     bestsme = vp9_full_pixel_diamond(cpi, x, &mvp_full, step_param,
                                      sadpb, further_steps, 1,
@@ -2592,6 +2634,9 @@
                                        x->nmvjointcost, x->mvcost,
                                        &ref_mv[id].as_mv, second_pred,
                                        pw, ph);
+    if (bestsme < INT_MAX)
+      bestsme = vp9_get_mvpred_av_var(x, &tmp_mv.as_mv, &ref_mv[id].as_mv,
+                                      second_pred, &cpi->fn_ptr[bsize], 1);
 
     x->mv_col_min = tmp_col_min;
     x->mv_col_max = tmp_col_max;
@@ -2726,42 +2771,6 @@
     }
   }
 
-  // if we're near/nearest and mv == 0,0, compare to zeromv
-  if ((this_mode == NEARMV || this_mode == NEARESTMV || this_mode == ZEROMV) &&
-      frame_mv[refs[0]].as_int == 0 &&
-      !vp9_segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP) &&
-      (num_refs == 1 || frame_mv[refs[1]].as_int == 0)) {
-    int rfc = mbmi->mode_context[refs[0]];
-    int c1 = cost_mv_ref(cpi, NEARMV, rfc);
-    int c2 = cost_mv_ref(cpi, NEARESTMV, rfc);
-    int c3 = cost_mv_ref(cpi, ZEROMV, rfc);
-
-    if (this_mode == NEARMV) {
-      if (c1 > c3)
-        return INT64_MAX;
-    } else if (this_mode == NEARESTMV) {
-      if (c2 > c3)
-        return INT64_MAX;
-    } else {
-      assert(this_mode == ZEROMV);
-      if (num_refs == 1) {
-        if ((c3 >= c2 &&
-             mode_mv[NEARESTMV][refs[0]].as_int == 0) ||
-            (c3 >= c1 &&
-             mode_mv[NEARMV][refs[0]].as_int == 0))
-          return INT64_MAX;
-      } else {
-        if ((c3 >= c2 &&
-             mode_mv[NEARESTMV][refs[0]].as_int == 0 &&
-             mode_mv[NEARESTMV][refs[1]].as_int == 0) ||
-            (c3 >= c1 &&
-             mode_mv[NEARMV][refs[0]].as_int == 0 &&
-             mode_mv[NEARMV][refs[1]].as_int == 0))
-          return INT64_MAX;
-      }
-    }
-  }
-
   for (i = 0; i < num_refs; ++i) {
     cur_mv[i] = frame_mv[refs[i]];
     // Clip "next_nearest" so that it does not extend to far out of image
@@ -3021,7 +3030,7 @@
     rdcosty = RDCOST(x->rdmult, x->rddiv, *rate2, *distortion);
     rdcosty = MIN(rdcosty, RDCOST(x->rdmult, x->rddiv, 0, *psse));
 
-    super_block_uvrd(x, rate_uv, distortion_uv, &skippable_uv, &sseuv,
+    super_block_uvrd(cpi, x, rate_uv, distortion_uv, &skippable_uv, &sseuv,
                      bsize, ref_best_rd - rdcosty);
     if (*rate_uv == INT_MAX) {
       *rate2 = INT_MAX;
@@ -3173,6 +3182,8 @@
   const int *const rd_threshes = cpi->rd_threshes[segment_id][bsize];
   const int *const rd_thresh_freq_fact = cpi->rd_thresh_freq_fact[bsize];
   const int mode_search_skip_flags = cpi->sf.mode_search_skip_flags;
+  const int intra_y_mode_mask =
+      cpi->sf.intra_y_mode_mask[max_txsize_lookup[bsize]];
 
   x->skip_encode = cpi->sf.skip_encode_frame && x->q_index < QIDX_SKIP_THRESH;
 
@@ -3316,6 +3327,9 @@
 
     this_mode = vp9_mode_order[mode_index].mode;
     ref_frame = vp9_mode_order[mode_index].ref_frame[0];
+    if (ref_frame != INTRA_FRAME &&
+        cpi->sf.disable_inter_mode_mask[bsize] & (1 << INTER_OFFSET(this_mode)))
+      continue;
     second_ref_frame = vp9_mode_order[mode_index].ref_frame[1];
 
     comp_pred = second_ref_frame > INTRA_FRAME;
@@ -3327,35 +3341,71 @@
           ref_frame != best_inter_ref_frame &&
           second_ref_frame != best_inter_ref_frame)
         continue;
-      mode_excluded = mode_excluded ?
-            mode_excluded : cm->reference_mode == SINGLE_REFERENCE;
+      mode_excluded = cm->reference_mode == SINGLE_REFERENCE;
     } else {
-      if (ref_frame != INTRA_FRAME && second_ref_frame != INTRA_FRAME)
-        mode_excluded = mode_excluded ?
-            mode_excluded : cm->reference_mode == COMPOUND_REFERENCE;
+      if (ref_frame != INTRA_FRAME)
+        mode_excluded = cm->reference_mode == COMPOUND_REFERENCE;
     }
 
     if (ref_frame == INTRA_FRAME) {
-      // Disable intra modes other than DC_PRED for blocks with low variance
-      // Threshold for intra skipping based on source variance
-      // TODO(debargha): Specialize the threshold for super block sizes
-      static const unsigned int skip_intra_var_thresh[BLOCK_SIZES] = {
-        64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
-      };
-      if ((mode_search_skip_flags & FLAG_SKIP_INTRA_LOWVAR) &&
-          this_mode != DC_PRED &&
-          x->source_variance < skip_intra_var_thresh[bsize])
+      if (!(intra_y_mode_mask & (1 << this_mode)))
         continue;
-      // Only search the oblique modes if the best so far is
-      // one of the neighboring directional modes
-      if ((mode_search_skip_flags & FLAG_SKIP_INTRA_BESTINTER) &&
-          (this_mode >= D45_PRED && this_mode <= TM_PRED)) {
-        if (vp9_mode_order[best_mode_index].ref_frame[0] > INTRA_FRAME)
+      if (this_mode != DC_PRED) {
+        // Disable intra modes other than DC_PRED for blocks with low variance
+        // Threshold for intra skipping based on source variance
+        // TODO(debargha): Specialize the threshold for super block sizes
+        const unsigned int skip_intra_var_thresh = 64;
+        if ((mode_search_skip_flags & FLAG_SKIP_INTRA_LOWVAR) &&
+            x->source_variance < skip_intra_var_thresh)
           continue;
-      }
-      if (mode_search_skip_flags & FLAG_SKIP_INTRA_DIRMISMATCH) {
-        if (conditional_skipintra(this_mode, best_intra_mode))
+        // Only search the oblique modes if the best so far is
+        // one of the neighboring directional modes
+        if ((mode_search_skip_flags & FLAG_SKIP_INTRA_BESTINTER) &&
+            (this_mode >= D45_PRED && this_mode <= TM_PRED)) {
+          if (vp9_mode_order[best_mode_index].ref_frame[0] > INTRA_FRAME)
             continue;
+        }
+        if (mode_search_skip_flags & FLAG_SKIP_INTRA_DIRMISMATCH) {
+          if (conditional_skipintra(this_mode, best_intra_mode))
+              continue;
+        }
+      }
+    } else {
+      // if we're near/nearest and mv == 0,0, compare to zeromv
+      if ((this_mode == NEARMV || this_mode == NEARESTMV ||
+          this_mode == ZEROMV) &&
+          frame_mv[this_mode][ref_frame].as_int == 0 &&
+          !vp9_segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP) &&
+          (!comp_pred || frame_mv[this_mode][second_ref_frame].as_int == 0)) {
+        int rfc = mbmi->mode_context[ref_frame];
+        int c1 = cost_mv_ref(cpi, NEARMV, rfc);
+        int c2 = cost_mv_ref(cpi, NEARESTMV, rfc);
+        int c3 = cost_mv_ref(cpi, ZEROMV, rfc);
+
+        if (this_mode == NEARMV) {
+          if (c1 > c3)
+            continue;
+        } else if (this_mode == NEARESTMV) {
+          if (c2 > c3)
+            continue;
+        } else {
+          assert(this_mode == ZEROMV);
+          if (!comp_pred) {
+            if ((c3 >= c2 &&
+                 frame_mv[NEARESTMV][ref_frame].as_int == 0) ||
+                (c3 >= c1 &&
+                 frame_mv[NEARMV][ref_frame].as_int == 0))
+              continue;
+          } else {
+            if ((c3 >= c2 &&
+                 frame_mv[NEARESTMV][ref_frame].as_int == 0 &&
+                 frame_mv[NEARESTMV][second_ref_frame].as_int == 0) ||
+                (c3 >= c1 &&
+                 frame_mv[NEARMV][ref_frame].as_int == 0 &&
+                 frame_mv[NEARMV][second_ref_frame].as_int == 0))
+              continue;
+          }
+        }
       }
     }
 
@@ -3487,19 +3537,18 @@
       this_rd = RDCOST(x->rdmult, x->rddiv, rate2, distortion2);
     }
 
+    if (ref_frame == INTRA_FRAME) {
     // Keep record of best intra rd
-    if (!is_inter_block(&xd->mi_8x8[0]->mbmi) &&
-        this_rd < best_intra_rd) {
-      best_intra_rd = this_rd;
-      best_intra_mode = xd->mi_8x8[0]->mbmi.mode;
-    }
-
-    // Keep record of best inter rd with single reference
-    if (is_inter_block(&xd->mi_8x8[0]->mbmi) &&
-        !has_second_ref(&xd->mi_8x8[0]->mbmi) &&
-        !mode_excluded && this_rd < best_inter_rd) {
-      best_inter_rd = this_rd;
-      best_inter_ref_frame = ref_frame;
+      if (this_rd < best_intra_rd) {
+        best_intra_rd = this_rd;
+        best_intra_mode = mbmi->mode;
+      }
+    } else {
+      // Keep record of best inter rd with single reference
+      if (!comp_pred && !mode_excluded && this_rd < best_inter_rd) {
+        best_inter_rd = this_rd;
+        best_inter_ref_frame = ref_frame;
+      }
     }
 
     if (!disable_skip && ref_frame == INTRA_FRAME) {
@@ -3572,38 +3621,39 @@
       single_rd = RDCOST(x->rdmult, x->rddiv, single_rate, distortion2);
       hybrid_rd = RDCOST(x->rdmult, x->rddiv, hybrid_rate, distortion2);
 
-      if (second_ref_frame <= INTRA_FRAME &&
-          single_rd < best_pred_rd[SINGLE_REFERENCE]) {
-        best_pred_rd[SINGLE_REFERENCE] = single_rd;
-      } else if (second_ref_frame > INTRA_FRAME &&
-                 single_rd < best_pred_rd[COMPOUND_REFERENCE]) {
-        best_pred_rd[COMPOUND_REFERENCE] = single_rd;
+      if (!comp_pred) {
+        if (single_rd < best_pred_rd[SINGLE_REFERENCE]) {
+          best_pred_rd[SINGLE_REFERENCE] = single_rd;
+        }
+      } else {
+        if (single_rd < best_pred_rd[COMPOUND_REFERENCE]) {
+          best_pred_rd[COMPOUND_REFERENCE] = single_rd;
+        }
       }
       if (hybrid_rd < best_pred_rd[REFERENCE_MODE_SELECT])
         best_pred_rd[REFERENCE_MODE_SELECT] = hybrid_rd;
-    }
 
-    /* keep record of best filter type */
-    if (!mode_excluded && !disable_skip && ref_frame != INTRA_FRAME &&
-        cm->interp_filter != BILINEAR) {
-      int64_t ref = cpi->rd_filter_cache[cm->interp_filter == SWITCHABLE ?
+      /* keep record of best filter type */
+      if (!mode_excluded && cm->interp_filter != BILINEAR) {
+        int64_t ref = cpi->rd_filter_cache[cm->interp_filter == SWITCHABLE ?
                               SWITCHABLE_FILTERS : cm->interp_filter];
 
-      for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++) {
-        int64_t adj_rd;
-        if (ref == INT64_MAX)
-          adj_rd = 0;
-        else if (cpi->rd_filter_cache[i] == INT64_MAX)
-          // when early termination is triggered, the encoder does not have
-          // access to the rate-distortion cost. it only knows that the cost
-          // should be above the maximum valid value. hence it takes the known
-          // maximum plus an arbitrary constant as the rate-distortion cost.
-          adj_rd = cpi->mask_filter_rd - ref + 10;
-        else
-          adj_rd = cpi->rd_filter_cache[i] - ref;
+        for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++) {
+          int64_t adj_rd;
+          if (ref == INT64_MAX)
+            adj_rd = 0;
+          else if (cpi->rd_filter_cache[i] == INT64_MAX)
+            // when early termination is triggered, the encoder does not have
+            // access to the rate-distortion cost. it only knows that the cost
+            // should be above the maximum valid value. hence it takes the known
+            // maximum plus an arbitrary constant as the rate-distortion cost.
+            adj_rd = cpi->mask_filter_rd - ref + 10;
+          else
+            adj_rd = cpi->rd_filter_cache[i] - ref;
 
-        adj_rd += this_rd;
-        best_filter_rd[i] = MIN(best_filter_rd[i], adj_rd);
+          adj_rd += this_rd;
+          best_filter_rd[i] = MIN(best_filter_rd[i], adj_rd);
+        }
       }
     }
 
@@ -3695,11 +3745,6 @@
     }
     if (cm->interp_filter == SWITCHABLE)
       assert(best_filter_diff[SWITCHABLE_FILTERS] == 0);
-  } else {
-    vp9_zero(best_filter_diff);
-  }
-
-  if (!x->skip) {
     for (i = 0; i < TX_MODES; i++) {
       if (best_tx_rd[i] == INT64_MAX)
         best_tx_diff[i] = 0;
@@ -3707,6 +3752,7 @@
         best_tx_diff[i] = best_rd - best_tx_rd[i];
     }
   } else {
+    vp9_zero(best_filter_diff);
     vp9_zero(best_tx_diff);
   }
 
@@ -4146,7 +4192,7 @@
         // then dont bother looking at UV
         vp9_build_inter_predictors_sbuv(&x->e_mbd, mi_row, mi_col,
                                         BLOCK_8X8);
-        super_block_uvrd(x, &rate_uv, &distortion_uv, &uv_skippable,
+        super_block_uvrd(cpi, x, &rate_uv, &distortion_uv, &uv_skippable,
                          &uv_sse, BLOCK_8X8, tmp_best_rdu);
         if (rate_uv == INT_MAX)
           continue;
diff --git a/vp9/encoder/vp9_rdopt.h b/vp9/encoder/vp9_rdopt.h
index 6b85d67..b5baa33 100644
--- a/vp9/encoder/vp9_rdopt.h
+++ b/vp9/encoder/vp9_rdopt.h
@@ -35,7 +35,7 @@
 
 struct TileInfo;
 
-int vp9_compute_rd_mult(VP9_COMP *cpi, int qindex);
+int vp9_compute_rd_mult(const VP9_COMP *cpi, int qindex);
 
 void vp9_initialize_rd_consts(VP9_COMP *cpi);
 
diff --git a/vp9/encoder/vp9_segmentation.c b/vp9/encoder/vp9_segmentation.c
index 509717e..49fd7bb 100644
--- a/vp9/encoder/vp9_segmentation.c
+++ b/vp9/encoder/vp9_segmentation.c
@@ -10,29 +10,25 @@
 
 
 #include <limits.h>
+
 #include "vpx_mem/vpx_mem.h"
-#include "vp9/encoder/vp9_segmentation.h"
+
 #include "vp9/common/vp9_pred_common.h"
 #include "vp9/common/vp9_tile_common.h"
 
-void vp9_enable_segmentation(VP9_PTR ptr) {
-  VP9_COMP *cpi = (VP9_COMP *)ptr;
-  struct segmentation *const seg =  &cpi->common.seg;
+#include "vp9/encoder/vp9_segmentation.h"
 
+void vp9_enable_segmentation(struct segmentation *seg) {
   seg->enabled = 1;
   seg->update_map = 1;
   seg->update_data = 1;
 }
 
-void vp9_disable_segmentation(VP9_PTR ptr) {
-  VP9_COMP *cpi = (VP9_COMP *)ptr;
-  struct segmentation *const seg =  &cpi->common.seg;
+void vp9_disable_segmentation(struct segmentation *seg) {
   seg->enabled = 0;
 }
 
-void vp9_set_segmentation_map(VP9_PTR ptr,
-                              unsigned char *segmentation_map) {
-  VP9_COMP *cpi = (VP9_COMP *)ptr;
+void vp9_set_segmentation_map(VP9_COMP *cpi, unsigned char *segmentation_map) {
   struct segmentation *const seg = &cpi->common.seg;
 
   // Copy in the new segmentation map
@@ -44,12 +40,9 @@
   seg->update_data = 1;
 }
 
-void vp9_set_segment_data(VP9_PTR ptr,
+void vp9_set_segment_data(struct segmentation *seg,
                           signed char *feature_data,
                           unsigned char abs_delta) {
-  VP9_COMP *cpi = (VP9_COMP *)ptr;
-  struct segmentation *const seg = &cpi->common.seg;
-
   seg->abs_delta = abs_delta;
 
   vpx_memcpy(seg->feature_data, feature_data, sizeof(seg->feature_data));
diff --git a/vp9/encoder/vp9_segmentation.h b/vp9/encoder/vp9_segmentation.h
index 8238892..66c51a2 100644
--- a/vp9/encoder/vp9_segmentation.h
+++ b/vp9/encoder/vp9_segmentation.h
@@ -19,8 +19,8 @@
 extern "C" {
 #endif
 
-void vp9_enable_segmentation(VP9_PTR ptr);
-void vp9_disable_segmentation(VP9_PTR ptr);
+void vp9_enable_segmentation(struct segmentation *seg);
+void vp9_disable_segmentation(struct segmentation *seg);
 
 void vp9_disable_segfeature(struct segmentation *seg,
                             int segment_id,
@@ -30,7 +30,7 @@
                        SEG_LVL_FEATURES feature_id);
 // Valid values for a segment are 0 to 3
 // Segmentation map is arrange as [Rows][Columns]
-void vp9_set_segmentation_map(VP9_PTR ptr, unsigned char *segmentation_map);
+void vp9_set_segmentation_map(VP9_COMP *cpi, unsigned char *segmentation_map);
 
 // The values given for each segment can be either deltas (from the default
 // value chosen for the frame) or absolute values.
@@ -42,7 +42,7 @@
 //
 // abs_delta = SEGMENT_DELTADATA (deltas) abs_delta = SEGMENT_ABSDATA (use
 // the absolute values given).
-void vp9_set_segment_data(VP9_PTR ptr, signed char *feature_data,
+void vp9_set_segment_data(struct segmentation *seg, signed char *feature_data,
                           unsigned char abs_delta);
 
 void vp9_choose_segmap_coding_method(VP9_COMP *cpi);
diff --git a/vp9/encoder/vp9_subexp.c b/vp9/encoder/vp9_subexp.c
index 84fb35e..fdc2106 100644
--- a/vp9/encoder/vp9_subexp.c
+++ b/vp9/encoder/vp9_subexp.c
@@ -150,8 +150,7 @@
 int vp9_prob_diff_update_savings_search_model(const unsigned int *ct,
                                               const vp9_prob *oldp,
                                               vp9_prob *bestp,
-                                              vp9_prob upd,
-                                              int b, int r) {
+                                              vp9_prob upd) {
   int i, old_b, new_b, update_b, savings, bestsavings, step;
   int newp;
   vp9_prob bestnewp, newplist[ENTROPY_NODES], oldplist[ENTROPY_NODES];
diff --git a/vp9/encoder/vp9_subexp.h b/vp9/encoder/vp9_subexp.h
index ab5659b..8e9c0c6 100644
--- a/vp9/encoder/vp9_subexp.h
+++ b/vp9/encoder/vp9_subexp.h
@@ -33,8 +33,7 @@
 int vp9_prob_diff_update_savings_search_model(const unsigned int *ct,
                                               const vp9_prob *oldp,
                                               vp9_prob *bestp,
-                                              vp9_prob upd,
-                                              int b, int r);
+                                              vp9_prob upd);
 
 #ifdef __cplusplus
 }  // extern "C"
diff --git a/vp9/encoder/vp9_temporal_filter.c b/vp9/encoder/vp9_temporal_filter.c
index 502e4b6..6233116 100644
--- a/vp9/encoder/vp9_temporal_filter.c
+++ b/vp9/encoder/vp9_temporal_filter.c
@@ -124,8 +124,7 @@
 static int temporal_filter_find_matching_mb_c(VP9_COMP *cpi,
                                               uint8_t *arf_frame_buf,
                                               uint8_t *frame_ptr_buf,
-                                              int stride,
-                                              int error_thresh) {
+                                              int stride) {
   MACROBLOCK *x = &cpi->mb;
   MACROBLOCKD* const xd = &x->e_mbd;
   int step_param;
@@ -267,8 +266,7 @@
                 (cpi,
                  cpi->frames[alt_ref_index]->y_buffer + mb_y_offset,
                  cpi->frames[frame]->y_buffer + mb_y_offset,
-                 cpi->frames[frame]->y_stride,
-                 THRESH_LOW);
+                 cpi->frames[frame]->y_stride);
 #endif
           // Assign higher weight to matching MB if it's error
           // score is lower. If not applying MC default behavior
diff --git a/vp9/encoder/vp9_vaq.c b/vp9/encoder/vp9_vaq.c
index 600029b..c71c171 100644
--- a/vp9/encoder/vp9_vaq.c
+++ b/vp9/encoder/vp9_vaq.c
@@ -83,7 +83,7 @@
   if (cm->frame_type == KEY_FRAME ||
       cpi->refresh_alt_ref_frame ||
       (cpi->refresh_golden_frame && !cpi->rc.is_src_frame_alt_ref)) {
-    vp9_enable_segmentation((VP9_PTR)cpi);
+    vp9_enable_segmentation(seg);
     vp9_clearall_segfeatures(seg);
 
     seg->abs_delta = SEGMENT_DELTADATA;
diff --git a/vp9/encoder/x86/vp9_subpel_variance_impl_intrin_avx2.c b/vp9/encoder/x86/vp9_subpel_variance_impl_intrin_avx2.c
index b8bfa89..34ed186 100644
--- a/vp9/encoder/x86/vp9_subpel_variance_impl_intrin_avx2.c
+++ b/vp9/encoder/x86/vp9_subpel_variance_impl_intrin_avx2.c
@@ -47,6 +47,77 @@
   1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15
 };
 
+#define FILTER_SRC(filter) \
+  /* filter the source */ \
+  exp_src_lo = _mm256_maddubs_epi16(exp_src_lo, filter); \
+  exp_src_hi = _mm256_maddubs_epi16(exp_src_hi, filter); \
+  \
+  /* add 8 to source */ \
+  exp_src_lo = _mm256_add_epi16(exp_src_lo, pw8); \
+  exp_src_hi = _mm256_add_epi16(exp_src_hi, pw8); \
+  \
+  /* divide source by 16 */ \
+  exp_src_lo = _mm256_srai_epi16(exp_src_lo, 4); \
+  exp_src_hi = _mm256_srai_epi16(exp_src_hi, 4);
+
+#define MERGE_WITH_SRC(src_reg, reg) \
+  exp_src_lo = _mm256_unpacklo_epi8(src_reg, reg); \
+  exp_src_hi = _mm256_unpackhi_epi8(src_reg, reg);
+
+#define LOAD_SRC_DST \
+  /* load source and destination */ \
+  src_reg = _mm256_loadu_si256((__m256i const *) (src)); \
+  dst_reg = _mm256_load_si256((__m256i const *) (dst));
+
+#define AVG_NEXT_SRC(src_reg, size_stride) \
+  src_next_reg = _mm256_loadu_si256((__m256i const *) \
+                                   (src + size_stride)); \
+  /* average between current and next stride source */ \
+  src_reg = _mm256_avg_epu8(src_reg, src_next_reg);
+
+#define MERGE_NEXT_SRC(src_reg, size_stride) \
+  src_next_reg = _mm256_loadu_si256((__m256i const *) \
+                                   (src + size_stride)); \
+  MERGE_WITH_SRC(src_reg, src_next_reg)
+
+#define CALC_SUM_SSE_INSIDE_LOOP \
+  /* expand each byte to 2 bytes */ \
+  exp_dst_lo = _mm256_unpacklo_epi8(dst_reg, zero_reg); \
+  exp_dst_hi = _mm256_unpackhi_epi8(dst_reg, zero_reg); \
+  /* source - dest */ \
+  exp_src_lo = _mm256_sub_epi16(exp_src_lo, exp_dst_lo); \
+  exp_src_hi = _mm256_sub_epi16(exp_src_hi, exp_dst_hi); \
+  /* caculate sum */ \
+  sum_reg = _mm256_add_epi16(sum_reg, exp_src_lo); \
+  exp_src_lo = _mm256_madd_epi16(exp_src_lo, exp_src_lo); \
+  sum_reg = _mm256_add_epi16(sum_reg, exp_src_hi); \
+  exp_src_hi = _mm256_madd_epi16(exp_src_hi, exp_src_hi); \
+  /* calculate sse */ \
+  sse_reg = _mm256_add_epi32(sse_reg, exp_src_lo); \
+  sse_reg = _mm256_add_epi32(sse_reg, exp_src_hi);
+
+// final calculation to sum and sse
+#define CALC_SUM_AND_SSE \
+  res_cmp = _mm256_cmpgt_epi16(zero_reg, sum_reg); \
+  sse_reg_hi = _mm256_srli_si256(sse_reg, 8); \
+  sum_reg_lo = _mm256_unpacklo_epi16(sum_reg, res_cmp); \
+  sum_reg_hi = _mm256_unpackhi_epi16(sum_reg, res_cmp); \
+  sse_reg = _mm256_add_epi32(sse_reg, sse_reg_hi); \
+  sum_reg = _mm256_add_epi32(sum_reg_lo, sum_reg_hi); \
+  \
+  sse_reg_hi = _mm256_srli_si256(sse_reg, 4); \
+  sum_reg_hi = _mm256_srli_si256(sum_reg, 8); \
+  \
+  sse_reg = _mm256_add_epi32(sse_reg, sse_reg_hi); \
+  sum_reg = _mm256_add_epi32(sum_reg, sum_reg_hi); \
+  *((int*)sse)= _mm_cvtsi128_si32(_mm256_castsi256_si128(sse_reg)) + \
+                _mm_cvtsi128_si32(_mm256_extractf128_si256(sse_reg, 1)); \
+  sum_reg_hi = _mm256_srli_si256(sum_reg, 4); \
+  sum_reg = _mm256_add_epi32(sum_reg, sum_reg_hi); \
+  sum = _mm_cvtsi128_si32(_mm256_castsi256_si128(sum_reg)) + \
+        _mm_cvtsi128_si32(_mm256_extractf128_si256(sum_reg, 1));
+
+
 unsigned int vp9_sub_pixel_variance32xh_avx2(const uint8_t *src,
                                              int src_stride,
                                              int x_offset,
@@ -55,587 +126,414 @@
                                              int dst_stride,
                                              int height,
                                              unsigned int *sse) {
-  __m256i src_reg,  dst_reg,  exp_src_lo,  exp_src_hi,  exp_dst_lo,  exp_dst_hi;
-  __m256i sse_reg,  sum_reg,  sse_reg_hi,  res_cmp,  sum_reg_lo,  sum_reg_hi;
+  __m256i src_reg, dst_reg, exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi;
+  __m256i sse_reg, sum_reg, sse_reg_hi, res_cmp, sum_reg_lo, sum_reg_hi;
   __m256i zero_reg;
   int i, sum;
   sum_reg = _mm256_set1_epi16(0);
   sse_reg = _mm256_set1_epi16(0);
   zero_reg = _mm256_set1_epi16(0);
 
+  // x_offset = 0 and y_offset = 0
   if (x_offset == 0) {
-    // x_offset = 0 and y_offset = 0
     if (y_offset == 0) {
       for (i = 0; i < height ; i++) {
-        // load source and destination
-        src_reg = _mm256_loadu_si256((__m256i const *) (src));
-        dst_reg = _mm256_load_si256((__m256i const *) (dst));
-
+        LOAD_SRC_DST
         // expend each byte to 2 bytes
-        exp_src_lo = _mm256_unpacklo_epi8(src_reg, zero_reg);
-        exp_src_hi = _mm256_unpackhi_epi8(src_reg, zero_reg);
-
-        exp_dst_lo = _mm256_unpacklo_epi8(dst_reg, zero_reg);
-        exp_dst_hi = _mm256_unpackhi_epi8(dst_reg, zero_reg);
-
-        // source - dest
-        exp_src_lo = _mm256_sub_epi16(exp_src_lo, exp_dst_lo);
-        exp_src_hi = _mm256_sub_epi16(exp_src_hi, exp_dst_hi);
-
-        // calculate sum
-        sum_reg = _mm256_add_epi16(sum_reg, exp_src_lo);
-        exp_src_lo = _mm256_madd_epi16(exp_src_lo, exp_src_lo);
-        sum_reg = _mm256_add_epi16(sum_reg, exp_src_hi);
-        exp_src_hi = _mm256_madd_epi16(exp_src_hi, exp_src_hi);
-
-        // calculate sse
-        sse_reg = _mm256_add_epi32(sse_reg, exp_src_lo);
-        sse_reg = _mm256_add_epi32(sse_reg, exp_src_hi);
-
+        MERGE_WITH_SRC(src_reg, zero_reg)
+        CALC_SUM_SSE_INSIDE_LOOP
         src+= src_stride;
         dst+= dst_stride;
       }
     // x_offset = 0 and y_offset = 8
     } else if (y_offset == 8) {
-        __m256i src_next_reg;
-        for (i = 0; i < height ; i++) {
-          // load source + next source + destination
-          src_reg = _mm256_loadu_si256((__m256i const *) (src));
-          src_next_reg = _mm256_loadu_si256((__m256i const *)
-                                         (src + src_stride));
-          dst_reg = _mm256_load_si256((__m256i const *) (dst));
-          // average between current and next stride source
-          src_reg = _mm256_avg_epu8(src_reg, src_next_reg);
-
-          // expend each byte to 2 bytes
-          exp_src_lo = _mm256_unpacklo_epi8(src_reg, zero_reg);
-          exp_src_hi = _mm256_unpackhi_epi8(src_reg, zero_reg);
-
-          exp_dst_lo = _mm256_unpacklo_epi8(dst_reg, zero_reg);
-          exp_dst_hi = _mm256_unpackhi_epi8(dst_reg, zero_reg);
-
-          // source - dest
-          exp_src_lo = _mm256_sub_epi16(exp_src_lo, exp_dst_lo);
-          exp_src_hi = _mm256_sub_epi16(exp_src_hi, exp_dst_hi);
-
-          // calculate sum
-          sum_reg = _mm256_add_epi16(sum_reg, exp_src_lo);
-          exp_src_lo = _mm256_madd_epi16(exp_src_lo, exp_src_lo);
-          sum_reg = _mm256_add_epi16(sum_reg, exp_src_hi);
-          exp_src_hi = _mm256_madd_epi16(exp_src_hi, exp_src_hi);
-          sse_reg = _mm256_add_epi32(sse_reg, exp_src_lo);
-
-          // calculate sse
-          sse_reg = _mm256_add_epi32(sse_reg, exp_src_hi);
-
-          src+= src_stride;
-          dst+= dst_stride;
-        }
+      __m256i src_next_reg;
+      for (i = 0; i < height ; i++) {
+        LOAD_SRC_DST
+        AVG_NEXT_SRC(src_reg, src_stride)
+        // expend each byte to 2 bytes
+        MERGE_WITH_SRC(src_reg, zero_reg)
+        CALC_SUM_SSE_INSIDE_LOOP
+        src+= src_stride;
+        dst+= dst_stride;
+      }
     // x_offset = 0 and y_offset = bilin interpolation
     } else {
-        __m256i filter, pw8, src_next_reg;
-#if (ARCH_X86_64)
-        int64_t y_offset64;
-        y_offset64 = y_offset;
-        y_offset64 <<= 5;
-        filter = _mm256_load_si256(
-            (__m256i const *)(bilinear_filters_avx2 + y_offset64));
-#else
-        y_offset <<= 5;
-        filter = _mm256_load_si256(
-            (__m256i const *)(bilinear_filters_avx2 + y_offset));
-#endif
-        pw8 = _mm256_set1_epi16(8);
-        for (i = 0; i < height ; i++) {
-          // load current and next source + destination
-          src_reg = _mm256_loadu_si256((__m256i const *) (src));
-          src_next_reg = _mm256_loadu_si256((__m256i const *)
-                          (src + src_stride));
-          dst_reg = _mm256_load_si256((__m256i const *) (dst));
+      __m256i filter, pw8, src_next_reg;
 
-          // merge current and next source
-          exp_src_lo = _mm256_unpacklo_epi8(src_reg, src_next_reg);
-          exp_src_hi = _mm256_unpackhi_epi8(src_reg, src_next_reg);
-
-          // filter the source
-          exp_src_lo = _mm256_maddubs_epi16(exp_src_lo, filter);
-          exp_src_hi = _mm256_maddubs_epi16(exp_src_hi, filter);
-
-          // add 8 to the source
-          exp_src_lo = _mm256_add_epi16(exp_src_lo, pw8);
-          exp_src_hi = _mm256_add_epi16(exp_src_hi, pw8);
-
-          // divide by 16
-          exp_src_lo = _mm256_srai_epi16(exp_src_lo, 4);
-          exp_src_hi = _mm256_srai_epi16(exp_src_hi, 4);
-
-          // expand each byte to 2 byte in the destination
-          exp_dst_lo = _mm256_unpacklo_epi8(dst_reg, zero_reg);
-          exp_dst_hi = _mm256_unpackhi_epi8(dst_reg, zero_reg);
-
-          // source - dest
-          exp_src_lo = _mm256_sub_epi16(exp_src_lo, exp_dst_lo);
-          exp_src_hi = _mm256_sub_epi16(exp_src_hi, exp_dst_hi);
-
-          // calculate sum
-          sum_reg = _mm256_add_epi16(sum_reg, exp_src_lo);
-          exp_src_lo = _mm256_madd_epi16(exp_src_lo, exp_src_lo);
-          sum_reg = _mm256_add_epi16(sum_reg, exp_src_hi);
-          exp_src_hi = _mm256_madd_epi16(exp_src_hi, exp_src_hi);
-
-          // calculate sse
-          sse_reg = _mm256_add_epi32(sse_reg, exp_src_lo);
-          sse_reg = _mm256_add_epi32(sse_reg, exp_src_hi);
-
-          src+= src_stride;
-          dst+= dst_stride;
-        }
+      y_offset <<= 5;
+      filter = _mm256_load_si256((__m256i const *)
+               (bilinear_filters_avx2 + y_offset));
+      pw8 = _mm256_set1_epi16(8);
+      for (i = 0; i < height ; i++) {
+        LOAD_SRC_DST
+        MERGE_NEXT_SRC(src_reg, src_stride)
+        FILTER_SRC(filter)
+        CALC_SUM_SSE_INSIDE_LOOP
+        src+= src_stride;
+        dst+= dst_stride;
+      }
     }
   // x_offset = 8  and y_offset = 0
   } else if (x_offset == 8) {
-      if (y_offset == 0) {
-        __m256i src_next_reg;
-        for (i = 0; i < height ; i++) {
-          // load source and another source starting from the next
-          // following byte + destination
-          src_reg = _mm256_loadu_si256((__m256i const *) (src));
-          src_next_reg = _mm256_loadu_si256((__m256i const *) (src + 1));
-          dst_reg = _mm256_load_si256((__m256i const *) (dst));
-
-          // average between source and the next byte following source
-          src_reg = _mm256_avg_epu8(src_reg, src_next_reg);
-
-          // expand each byte to 2 bytes
-          exp_src_lo = _mm256_unpacklo_epi8(src_reg, zero_reg);
-          exp_src_hi = _mm256_unpackhi_epi8(src_reg, zero_reg);
-
-          exp_dst_lo = _mm256_unpacklo_epi8(dst_reg, zero_reg);
-          exp_dst_hi = _mm256_unpackhi_epi8(dst_reg, zero_reg);
-
-          // source - dest
-          exp_src_lo = _mm256_sub_epi16(exp_src_lo, exp_dst_lo);
-          exp_src_hi = _mm256_sub_epi16(exp_src_hi, exp_dst_hi);
-
-          // calculate sum
-          sum_reg = _mm256_add_epi16(sum_reg, exp_src_lo);
-          exp_src_lo = _mm256_madd_epi16(exp_src_lo, exp_src_lo);
-          sum_reg = _mm256_add_epi16(sum_reg, exp_src_hi);
-          exp_src_hi = _mm256_madd_epi16(exp_src_hi, exp_src_hi);
-
-          // calculate sse
-          sse_reg = _mm256_add_epi32(sse_reg, exp_src_lo);
-          sse_reg = _mm256_add_epi32(sse_reg, exp_src_hi);
-
-          src+= src_stride;
-          dst+= dst_stride;
-        }
-      // x_offset = 8  and y_offset = 8
-      } else if (y_offset == 8) {
-          __m256i src_next_reg, src_avg;
-          // load source and another source starting from the next
-          // following byte
-          src_reg = _mm256_loadu_si256((__m256i const *) (src));
-          src_next_reg = _mm256_loadu_si256((__m256i const *) (src + 1));
-
-          // average between source and the next byte following source
-          src_avg = _mm256_avg_epu8(src_reg, src_next_reg);
-          for (i = 0; i < height ; i++) {
-            src+= src_stride;
-            // load source and another source starting from the next
-            // following byte + destination
-            src_reg = _mm256_loadu_si256((__m256i const *) (src));
-            src_next_reg = _mm256_loadu_si256((__m256i const *) (src + 1));
-            dst_reg = _mm256_load_si256((__m256i const *) (dst));
-            // average between source and the next byte following source
-            src_reg = _mm256_avg_epu8(src_reg, src_next_reg);
-
-            // expand each byte to 2 bytes
-            exp_dst_lo = _mm256_unpacklo_epi8(dst_reg, zero_reg);
-            exp_dst_hi = _mm256_unpackhi_epi8(dst_reg, zero_reg);
-
-            // average between previous average to current average
-            src_avg = _mm256_avg_epu8(src_avg, src_reg);
-            // expand each byte to 2 bytes
-            exp_src_lo = _mm256_unpacklo_epi8(src_avg, zero_reg);
-            exp_src_hi = _mm256_unpackhi_epi8(src_avg, zero_reg);
-
-            // save current source average
-            src_avg = src_reg;
-            // source - dest
-            exp_src_lo = _mm256_sub_epi16(exp_src_lo, exp_dst_lo);
-            exp_src_hi = _mm256_sub_epi16(exp_src_hi, exp_dst_hi);
-
-            // calculate sum
-            sum_reg = _mm256_add_epi16(sum_reg, exp_src_lo);
-            exp_src_lo = _mm256_madd_epi16(exp_src_lo, exp_src_lo);
-            sum_reg = _mm256_add_epi16(sum_reg, exp_src_hi);
-            exp_src_hi = _mm256_madd_epi16(exp_src_hi, exp_src_hi);
-
-            // calculate sse
-            sse_reg = _mm256_add_epi32(sse_reg, exp_src_lo);
-            sse_reg = _mm256_add_epi32(sse_reg, exp_src_hi);
-
-            dst+= dst_stride;
-          }
-      // x_offset = 8  and y_offset = bilin interpolation
-      } else {
-          __m256i filter, pw8, src_next_reg, src_avg;
-#if (ARCH_X86_64)
-          int64_t y_offset64;
-          y_offset64 = y_offset;
-          y_offset64 <<= 5;
-          filter = _mm256_load_si256(
-              (__m256i const *)(bilinear_filters_avx2 + y_offset64));
-#else
-          y_offset <<= 5;
-          filter = _mm256_load_si256(
-              (__m256i const *)(bilinear_filters_avx2 + y_offset));
-#endif
-          pw8 = _mm256_set1_epi16(8);
-          // load source and another source starting from the next
-          // following byte
-          src_reg = _mm256_loadu_si256((__m256i const *) (src));
-          src_next_reg = _mm256_loadu_si256((__m256i const *) (src + 1));
-          // average between source and the next byte following source
-          src_avg = _mm256_avg_epu8(src_reg, src_next_reg);
-          for (i = 0; i < height ; i++) {
-            src+= src_stride;
-            // load source and another source starting from the next
-            // following byte + destination
-            src_reg = _mm256_loadu_si256((__m256i const *) (src));
-            src_next_reg = _mm256_loadu_si256((__m256i const *) (src + 1));
-            dst_reg = _mm256_load_si256((__m256i const *) (dst));
-            // average between source and the next byte following source
-            src_reg = _mm256_avg_epu8(src_reg, src_next_reg);
-
-            // merge previous average and current average
-            exp_src_lo = _mm256_unpacklo_epi8(src_avg, src_reg);
-            exp_src_hi = _mm256_unpackhi_epi8(src_avg, src_reg);
-
-            // filter the source
-            exp_src_lo = _mm256_maddubs_epi16(exp_src_lo, filter);
-            exp_src_hi = _mm256_maddubs_epi16(exp_src_hi, filter);
-
-            // add 8 to the source
-            exp_src_lo = _mm256_add_epi16(exp_src_lo, pw8);
-            exp_src_hi = _mm256_add_epi16(exp_src_hi, pw8);
-
-            // divide the source by 16
-            exp_src_lo = _mm256_srai_epi16(exp_src_lo, 4);
-            exp_src_hi = _mm256_srai_epi16(exp_src_hi, 4);
-
-            // expand each byte to 2 bytes
-            exp_dst_lo = _mm256_unpacklo_epi8(dst_reg, zero_reg);
-            exp_dst_hi = _mm256_unpackhi_epi8(dst_reg, zero_reg);
-
-            // save current source average
-            src_avg = src_reg;
-            // source - dest
-            exp_src_lo = _mm256_sub_epi16(exp_src_lo, exp_dst_lo);
-            exp_src_hi = _mm256_sub_epi16(exp_src_hi, exp_dst_hi);
-
-            // calculate sum
-            sum_reg = _mm256_add_epi16(sum_reg, exp_src_lo);
-            exp_src_lo = _mm256_madd_epi16(exp_src_lo, exp_src_lo);
-            sum_reg = _mm256_add_epi16(sum_reg, exp_src_hi);
-            exp_src_hi = _mm256_madd_epi16(exp_src_hi, exp_src_hi);
-
-            // calculate sse
-            sse_reg = _mm256_add_epi32(sse_reg, exp_src_lo);
-            sse_reg = _mm256_add_epi32(sse_reg, exp_src_hi);
-
-            dst+= dst_stride;
-          }
+    if (y_offset == 0) {
+      __m256i src_next_reg;
+      for (i = 0; i < height ; i++) {
+        LOAD_SRC_DST
+        AVG_NEXT_SRC(src_reg, 1)
+        // expand each byte to 2 bytes
+        MERGE_WITH_SRC(src_reg, zero_reg)
+        CALC_SUM_SSE_INSIDE_LOOP
+        src+= src_stride;
+        dst+= dst_stride;
       }
+    // x_offset = 8  and y_offset = 8
+    } else if (y_offset == 8) {
+      __m256i src_next_reg, src_avg;
+      // load source and another source starting from the next
+      // following byte
+      src_reg = _mm256_loadu_si256((__m256i const *) (src));
+      AVG_NEXT_SRC(src_reg, 1)
+      for (i = 0; i < height ; i++) {
+        src_avg = src_reg;
+        src+= src_stride;
+        LOAD_SRC_DST
+        AVG_NEXT_SRC(src_reg, 1)
+        // average between previous average to current average
+        src_avg = _mm256_avg_epu8(src_avg, src_reg);
+        // expand each byte to 2 bytes
+        MERGE_WITH_SRC(src_avg, zero_reg)
+        // save current source average
+        CALC_SUM_SSE_INSIDE_LOOP
+        dst+= dst_stride;
+      }
+    // x_offset = 8  and y_offset = bilin interpolation
+    } else {
+      __m256i filter, pw8, src_next_reg, src_avg;
+      y_offset <<= 5;
+      filter = _mm256_load_si256((__m256i const *)
+               (bilinear_filters_avx2 + y_offset));
+      pw8 = _mm256_set1_epi16(8);
+      // load source and another source starting from the next
+      // following byte
+      src_reg = _mm256_loadu_si256((__m256i const *) (src));
+      AVG_NEXT_SRC(src_reg, 1)
+      for (i = 0; i < height ; i++) {
+        // save current source average
+        src_avg = src_reg;
+        src+= src_stride;
+        LOAD_SRC_DST
+        AVG_NEXT_SRC(src_reg, 1)
+        MERGE_WITH_SRC(src_avg, src_reg)
+        FILTER_SRC(filter)
+        CALC_SUM_SSE_INSIDE_LOOP
+        dst+= dst_stride;
+      }
+    }
   // x_offset = bilin interpolation and y_offset = 0
   } else {
-      if (y_offset == 0) {
-        __m256i filter, pw8, src_next_reg;
-#if (ARCH_X86_64)
-        int64_t x_offset64;
-        x_offset64 = x_offset;
-        x_offset64 <<= 5;
-        filter = _mm256_load_si256(
-            (__m256i const *)(bilinear_filters_avx2 + x_offset64));
-#else
-        x_offset <<= 5;
-        filter = _mm256_load_si256(
-            (__m256i const *)(bilinear_filters_avx2 + x_offset));
-#endif
-        pw8 = _mm256_set1_epi16(8);
-        for (i = 0; i < height ; i++) {
-          // load source and another source starting from the next
-          // following byte + destination
-          src_reg = _mm256_loadu_si256((__m256i const *) (src));
-          src_next_reg = _mm256_loadu_si256((__m256i const *) (src + 1));
-          dst_reg = _mm256_load_si256((__m256i const *) (dst));
-
-          // merge current and next source
-          exp_src_lo = _mm256_unpacklo_epi8(src_reg, src_next_reg);
-          exp_src_hi = _mm256_unpackhi_epi8(src_reg, src_next_reg);
-
-          // filter the source
-          exp_src_lo = _mm256_maddubs_epi16(exp_src_lo, filter);
-          exp_src_hi = _mm256_maddubs_epi16(exp_src_hi, filter);
-
-          // add 8 to source
-          exp_src_lo = _mm256_add_epi16(exp_src_lo, pw8);
-          exp_src_hi = _mm256_add_epi16(exp_src_hi, pw8);
-
-          // divide the source by 16
-          exp_src_lo = _mm256_srai_epi16(exp_src_lo, 4);
-          exp_src_hi = _mm256_srai_epi16(exp_src_hi, 4);
-
-          // expand each byte to 2 bytes
-          exp_dst_lo = _mm256_unpacklo_epi8(dst_reg, zero_reg);
-          exp_dst_hi = _mm256_unpackhi_epi8(dst_reg, zero_reg);
-
-          // source - dest
-          exp_src_lo = _mm256_sub_epi16(exp_src_lo, exp_dst_lo);
-          exp_src_hi = _mm256_sub_epi16(exp_src_hi, exp_dst_hi);
-
-          // calculate sum
-          sum_reg = _mm256_add_epi16(sum_reg, exp_src_lo);
-          exp_src_lo = _mm256_madd_epi16(exp_src_lo, exp_src_lo);
-          sum_reg = _mm256_add_epi16(sum_reg, exp_src_hi);
-          exp_src_hi = _mm256_madd_epi16(exp_src_hi, exp_src_hi);
-
-          // calculate sse
-          sse_reg = _mm256_add_epi32(sse_reg, exp_src_lo);
-          sse_reg = _mm256_add_epi32(sse_reg, exp_src_hi);
-
-          src+= src_stride;
-          dst+= dst_stride;
-        }
-      // x_offset = bilin interpolation and y_offset = 8
-      } else if (y_offset == 8) {
-          __m256i filter, pw8, src_next_reg, src_pack;
-#if (ARCH_X86_64)
-          int64_t x_offset64;
-          x_offset64 = x_offset;
-          x_offset64 <<= 5;
-          filter = _mm256_load_si256(
-              (__m256i const *)(bilinear_filters_avx2 + x_offset64));
-#else
-          x_offset <<= 5;
-          filter = _mm256_load_si256(
-              (__m256i const *)(bilinear_filters_avx2 + x_offset));
-#endif
-          pw8 = _mm256_set1_epi16(8);
-          // load source and another source starting from the next
-          // following byte
-          src_reg = _mm256_loadu_si256((__m256i const *) (src));
-          src_next_reg = _mm256_loadu_si256((__m256i const *) (src + 1));
-
-          // merge current and next stride source
-          exp_src_lo = _mm256_unpacklo_epi8(src_reg, src_next_reg);
-          exp_src_hi = _mm256_unpackhi_epi8(src_reg, src_next_reg);
-
-          // filter the source
-          exp_src_lo = _mm256_maddubs_epi16(exp_src_lo, filter);
-          exp_src_hi = _mm256_maddubs_epi16(exp_src_hi, filter);
-
-          // add 8 to source
-          exp_src_lo = _mm256_add_epi16(exp_src_lo, pw8);
-          exp_src_hi = _mm256_add_epi16(exp_src_hi, pw8);
-
-          // divide source by 16
-          exp_src_lo = _mm256_srai_epi16(exp_src_lo, 4);
-          exp_src_hi = _mm256_srai_epi16(exp_src_hi, 4);
-
-          // convert each 16 bit to 8 bit to each low and high lane source
-          src_pack =  _mm256_packus_epi16(exp_src_lo, exp_src_hi);
-          for (i = 0; i < height ; i++) {
-            src+= src_stride;
-
-            // load source and another source starting from the next
-            // following byte + destination
-            src_reg = _mm256_loadu_si256((__m256i const *) (src));
-            src_next_reg = _mm256_loadu_si256((__m256i const *) (src + 1));
-            dst_reg = _mm256_load_si256((__m256i const *) (dst));
-
-            // merge current and next stride source
-            exp_src_lo = _mm256_unpacklo_epi8(src_reg, src_next_reg);
-            exp_src_hi = _mm256_unpackhi_epi8(src_reg, src_next_reg);
-
-            // filter the source
-            exp_src_lo = _mm256_maddubs_epi16(exp_src_lo, filter);
-            exp_src_hi = _mm256_maddubs_epi16(exp_src_hi, filter);
-
-            // add 8 to source
-            exp_src_lo = _mm256_add_epi16(exp_src_lo, pw8);
-            exp_src_hi = _mm256_add_epi16(exp_src_hi, pw8);
-
-            // divide source by 16
-            exp_src_lo = _mm256_srai_epi16(exp_src_lo, 4);
-            exp_src_hi = _mm256_srai_epi16(exp_src_hi, 4);
-
-            // convert each 16 bit to 8 bit to each low and high lane source
-            src_reg =  _mm256_packus_epi16(exp_src_lo, exp_src_hi);
-            // average between previous pack to the current
-            src_pack = _mm256_avg_epu8(src_pack, src_reg);
-
-            // expand each byte to 2 bytes
-            exp_dst_lo = _mm256_unpacklo_epi8(dst_reg, zero_reg);
-            exp_dst_hi = _mm256_unpackhi_epi8(dst_reg, zero_reg);
-
-            exp_src_lo = _mm256_unpacklo_epi8(src_pack, zero_reg);
-            exp_src_hi = _mm256_unpackhi_epi8(src_pack, zero_reg);
-
-            // source - dest
-            exp_src_lo = _mm256_sub_epi16(exp_src_lo, exp_dst_lo);
-            exp_src_hi = _mm256_sub_epi16(exp_src_hi, exp_dst_hi);
-
-            // calculate sum
-            sum_reg = _mm256_add_epi16(sum_reg, exp_src_lo);
-            exp_src_lo = _mm256_madd_epi16(exp_src_lo, exp_src_lo);
-            sum_reg = _mm256_add_epi16(sum_reg, exp_src_hi);
-            exp_src_hi = _mm256_madd_epi16(exp_src_hi, exp_src_hi);
-
-            // calculate sse
-            sse_reg = _mm256_add_epi32(sse_reg, exp_src_lo);
-            sse_reg = _mm256_add_epi32(sse_reg, exp_src_hi);
-
-            // save previous pack
-            src_pack = src_reg;
-            dst+= dst_stride;
-          }
-      // x_offset = bilin interpolation and y_offset = bilin interpolation
-      } else {
-          __m256i xfilter, yfilter, pw8, src_next_reg, src_pack;
-#if (ARCH_X86_64)
-          int64_t x_offset64, y_offset64;
-          x_offset64 = x_offset;
-          x_offset64 <<= 5;
-          y_offset64 = y_offset;
-          y_offset64 <<= 5;
-          xfilter = _mm256_load_si256(
-              (__m256i const *)(bilinear_filters_avx2 + x_offset64));
-          yfilter = _mm256_load_si256(
-              (__m256i const *)(bilinear_filters_avx2 + y_offset64));
-#else
-          x_offset <<= 5;
-          xfilter = _mm256_load_si256(
-              (__m256i const *)(bilinear_filters_avx2 + x_offset));
-          y_offset <<= 5;
-          yfilter = _mm256_load_si256(
-              (__m256i const *)(bilinear_filters_avx2 + y_offset));
-#endif
-          pw8 = _mm256_set1_epi16(8);
-          // load source and another source starting from the next
-          // following byte
-          src_reg = _mm256_loadu_si256((__m256i const *) (src));
-          src_next_reg = _mm256_loadu_si256((__m256i const *) (src + 1));
-          // merge current and next stride source
-          exp_src_lo = _mm256_unpacklo_epi8(src_reg, src_next_reg);
-          exp_src_hi = _mm256_unpackhi_epi8(src_reg, src_next_reg);
-
-          // filter the source
-          exp_src_lo = _mm256_maddubs_epi16(exp_src_lo, xfilter);
-          exp_src_hi = _mm256_maddubs_epi16(exp_src_hi, xfilter);
-
-          // add 8 to the source
-          exp_src_lo = _mm256_add_epi16(exp_src_lo, pw8);
-          exp_src_hi = _mm256_add_epi16(exp_src_hi, pw8);
-
-          // divide the source by 16
-          exp_src_lo = _mm256_srai_epi16(exp_src_lo, 4);
-          exp_src_hi = _mm256_srai_epi16(exp_src_hi, 4);
-
-          // convert each 16 bit to 8 bit to each low and high lane source
-          src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
-          for (i = 0; i < height ; i++) {
-            src+= src_stride;
-            // load source and another source starting from the next
-            // following byte + destination
-            src_reg = _mm256_loadu_si256((__m256i const *) (src));
-            src_next_reg = _mm256_loadu_si256((__m256i const *) (src + 1));
-            dst_reg = _mm256_load_si256((__m256i const *) (dst));
-
-            // merge current and next stride source
-            exp_src_lo = _mm256_unpacklo_epi8(src_reg, src_next_reg);
-            exp_src_hi = _mm256_unpackhi_epi8(src_reg, src_next_reg);
-
-            // filter the source
-            exp_src_lo = _mm256_maddubs_epi16(exp_src_lo, xfilter);
-            exp_src_hi = _mm256_maddubs_epi16(exp_src_hi, xfilter);
-
-            // add 8 to source
-            exp_src_lo = _mm256_add_epi16(exp_src_lo, pw8);
-            exp_src_hi = _mm256_add_epi16(exp_src_hi, pw8);
-
-            // divide source by 16
-            exp_src_lo = _mm256_srai_epi16(exp_src_lo, 4);
-            exp_src_hi = _mm256_srai_epi16(exp_src_hi, 4);
-
-            // convert each 16 bit to 8 bit to each low and high lane source
-            src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
-
-            // merge previous pack to current pack source
-            exp_src_lo = _mm256_unpacklo_epi8(src_pack, src_reg);
-            exp_src_hi = _mm256_unpackhi_epi8(src_pack, src_reg);
-
-            // filter the source
-            exp_src_lo = _mm256_maddubs_epi16(exp_src_lo, yfilter);
-            exp_src_hi = _mm256_maddubs_epi16(exp_src_hi, yfilter);
-
-            // expand each byte to 2 bytes
-            exp_dst_lo = _mm256_unpacklo_epi8(dst_reg, zero_reg);
-            exp_dst_hi = _mm256_unpackhi_epi8(dst_reg, zero_reg);
-
-            // add 8 to source
-            exp_src_lo = _mm256_add_epi16(exp_src_lo, pw8);
-            exp_src_hi = _mm256_add_epi16(exp_src_hi, pw8);
-
-            // divide source by 16
-            exp_src_lo = _mm256_srai_epi16(exp_src_lo, 4);
-            exp_src_hi = _mm256_srai_epi16(exp_src_hi, 4);
-
-            // source - dest
-            exp_src_lo = _mm256_sub_epi16(exp_src_lo, exp_dst_lo);
-            exp_src_hi = _mm256_sub_epi16(exp_src_hi, exp_dst_hi);
-
-            // caculate sum
-            sum_reg = _mm256_add_epi16(sum_reg, exp_src_lo);
-            exp_src_lo = _mm256_madd_epi16(exp_src_lo, exp_src_lo);
-            sum_reg = _mm256_add_epi16(sum_reg, exp_src_hi);
-            exp_src_hi = _mm256_madd_epi16(exp_src_hi, exp_src_hi);
-
-            // calculate sse
-            sse_reg = _mm256_add_epi32(sse_reg, exp_src_lo);
-            sse_reg = _mm256_add_epi32(sse_reg, exp_src_hi);
-
-            src_pack = src_reg;
-            dst+= dst_stride;
-          }
+    if (y_offset == 0) {
+      __m256i filter, pw8, src_next_reg;
+      x_offset <<= 5;
+      filter = _mm256_load_si256((__m256i const *)
+               (bilinear_filters_avx2 + x_offset));
+      pw8 = _mm256_set1_epi16(8);
+      for (i = 0; i < height ; i++) {
+        LOAD_SRC_DST
+        MERGE_NEXT_SRC(src_reg, 1)
+        FILTER_SRC(filter)
+        CALC_SUM_SSE_INSIDE_LOOP
+        src+= src_stride;
+        dst+= dst_stride;
       }
+    // x_offset = bilin interpolation and y_offset = 8
+    } else if (y_offset == 8) {
+      __m256i filter, pw8, src_next_reg, src_pack;
+      x_offset <<= 5;
+      filter = _mm256_load_si256((__m256i const *)
+               (bilinear_filters_avx2 + x_offset));
+      pw8 = _mm256_set1_epi16(8);
+      src_reg = _mm256_loadu_si256((__m256i const *) (src));
+      MERGE_NEXT_SRC(src_reg, 1)
+      FILTER_SRC(filter)
+      // convert each 16 bit to 8 bit to each low and high lane source
+      src_pack =  _mm256_packus_epi16(exp_src_lo, exp_src_hi);
+      for (i = 0; i < height ; i++) {
+        src+= src_stride;
+        LOAD_SRC_DST
+        MERGE_NEXT_SRC(src_reg, 1)
+        FILTER_SRC(filter)
+        src_reg =  _mm256_packus_epi16(exp_src_lo, exp_src_hi);
+        // average between previous pack to the current
+        src_pack = _mm256_avg_epu8(src_pack, src_reg);
+        MERGE_WITH_SRC(src_pack, zero_reg)
+        CALC_SUM_SSE_INSIDE_LOOP
+        src_pack = src_reg;
+        dst+= dst_stride;
+      }
+    // x_offset = bilin interpolation and y_offset = bilin interpolation
+    } else {
+      __m256i xfilter, yfilter, pw8, src_next_reg, src_pack;
+      x_offset <<= 5;
+      xfilter = _mm256_load_si256((__m256i const *)
+                (bilinear_filters_avx2 + x_offset));
+      y_offset <<= 5;
+      yfilter = _mm256_load_si256((__m256i const *)
+                (bilinear_filters_avx2 + y_offset));
+      pw8 = _mm256_set1_epi16(8);
+      // load source and another source starting from the next
+      // following byte
+      src_reg = _mm256_loadu_si256((__m256i const *) (src));
+      MERGE_NEXT_SRC(src_reg, 1)
+
+      FILTER_SRC(xfilter)
+      // convert each 16 bit to 8 bit to each low and high lane source
+      src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
+      for (i = 0; i < height ; i++) {
+        src+= src_stride;
+        LOAD_SRC_DST
+        MERGE_NEXT_SRC(src_reg, 1)
+        FILTER_SRC(xfilter)
+        src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
+        // merge previous pack to current pack source
+        MERGE_WITH_SRC(src_pack, src_reg)
+        // filter the source
+        FILTER_SRC(yfilter)
+        src_pack = src_reg;
+        CALC_SUM_SSE_INSIDE_LOOP
+        dst+= dst_stride;
+      }
+    }
   }
-  // sum < 0
-  res_cmp = _mm256_cmpgt_epi16(zero_reg, sum_reg);
-  // save the next 8 bytes of each lane of sse
-  sse_reg_hi = _mm256_srli_si256(sse_reg, 8);
-  // merge the result of sum < 0  with sum to add sign to the next 16 bits
-  sum_reg_lo = _mm256_unpacklo_epi16(sum_reg, res_cmp);
-  sum_reg_hi = _mm256_unpackhi_epi16(sum_reg, res_cmp);
-  // add each 8 bytes from every lane of sse and sum
-  sse_reg = _mm256_add_epi32(sse_reg, sse_reg_hi);
-  sum_reg = _mm256_add_epi32(sum_reg_lo, sum_reg_hi);
+  CALC_SUM_AND_SSE
+  return sum;
+}
 
-  // save the next 4 bytes of each lane sse
-  sse_reg_hi = _mm256_srli_si256(sse_reg, 4);
-  // save the next 8 bytes of each lane of sum
-  sum_reg_hi = _mm256_srli_si256(sum_reg, 8);
+unsigned int vp9_sub_pixel_avg_variance32xh_avx2(const uint8_t *src,
+                                             int src_stride,
+                                             int x_offset,
+                                             int y_offset,
+                                             const uint8_t *dst,
+                                             int dst_stride,
+                                             const uint8_t *sec,
+                                             int sec_stride,
+                                             int height,
+                                             unsigned int *sse) {
+  __m256i sec_reg;
+  __m256i src_reg, dst_reg, exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi;
+  __m256i sse_reg, sum_reg, sse_reg_hi, res_cmp, sum_reg_lo, sum_reg_hi;
+  __m256i zero_reg;
+  int i, sum;
+  sum_reg = _mm256_set1_epi16(0);
+  sse_reg = _mm256_set1_epi16(0);
+  zero_reg = _mm256_set1_epi16(0);
 
-  // add the first 4 bytes to the next 4 bytes sse
-  sse_reg = _mm256_add_epi32(sse_reg, sse_reg_hi);
-  // add the first 8 bytes to the next 8 bytes
-  sum_reg = _mm256_add_epi32(sum_reg, sum_reg_hi);
-  // extract the low lane and the high lane and add the results
-  *((int*)sse)= _mm_cvtsi128_si32(_mm256_castsi256_si128(sse_reg)) +
-                _mm_cvtsi128_si32(_mm256_extractf128_si256(sse_reg, 1));
-  sum_reg_hi = _mm256_srli_si256(sum_reg, 4);
-  sum_reg = _mm256_add_epi32(sum_reg, sum_reg_hi);
-  sum = _mm_cvtsi128_si32(_mm256_castsi256_si128(sum_reg)) +
-        _mm_cvtsi128_si32(_mm256_extractf128_si256(sum_reg, 1));
+  // x_offset = 0 and y_offset = 0
+  if (x_offset == 0) {
+    if (y_offset == 0) {
+      for (i = 0; i < height ; i++) {
+        LOAD_SRC_DST
+        sec_reg = _mm256_load_si256((__m256i const *) (sec));
+        src_reg = _mm256_avg_epu8(src_reg, sec_reg);
+        sec+= sec_stride;
+        // expend each byte to 2 bytes
+        MERGE_WITH_SRC(src_reg, zero_reg)
+        CALC_SUM_SSE_INSIDE_LOOP
+        src+= src_stride;
+        dst+= dst_stride;
+      }
+    } else if (y_offset == 8) {
+      __m256i src_next_reg;
+      for (i = 0; i < height ; i++) {
+        LOAD_SRC_DST
+        AVG_NEXT_SRC(src_reg, src_stride)
+        sec_reg = _mm256_load_si256((__m256i const *) (sec));
+        src_reg = _mm256_avg_epu8(src_reg, sec_reg);
+        sec+= sec_stride;
+        // expend each byte to 2 bytes
+        MERGE_WITH_SRC(src_reg, zero_reg)
+        CALC_SUM_SSE_INSIDE_LOOP
+        src+= src_stride;
+        dst+= dst_stride;
+      }
+    // x_offset = 0 and y_offset = bilin interpolation
+    } else {
+      __m256i filter, pw8, src_next_reg;
+
+      y_offset <<= 5;
+      filter = _mm256_load_si256((__m256i const *)
+                 (bilinear_filters_avx2 + y_offset));
+      pw8 = _mm256_set1_epi16(8);
+      for (i = 0; i < height ; i++) {
+        LOAD_SRC_DST
+        MERGE_NEXT_SRC(src_reg, src_stride)
+        FILTER_SRC(filter)
+        src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
+        sec_reg = _mm256_load_si256((__m256i const *) (sec));
+        src_reg = _mm256_avg_epu8(src_reg, sec_reg);
+        sec+= sec_stride;
+        MERGE_WITH_SRC(src_reg, zero_reg)
+        CALC_SUM_SSE_INSIDE_LOOP
+        src+= src_stride;
+        dst+= dst_stride;
+      }
+    }
+  // x_offset = 8  and y_offset = 0
+  } else if (x_offset == 8) {
+    if (y_offset == 0) {
+      __m256i src_next_reg;
+      for (i = 0; i < height ; i++) {
+        LOAD_SRC_DST
+        AVG_NEXT_SRC(src_reg, 1)
+        sec_reg = _mm256_load_si256((__m256i const *) (sec));
+        src_reg = _mm256_avg_epu8(src_reg, sec_reg);
+        sec+= sec_stride;
+        // expand each byte to 2 bytes
+        MERGE_WITH_SRC(src_reg, zero_reg)
+        CALC_SUM_SSE_INSIDE_LOOP
+        src+= src_stride;
+        dst+= dst_stride;
+      }
+    // x_offset = 8  and y_offset = 8
+    } else if (y_offset == 8) {
+      __m256i src_next_reg, src_avg;
+      // load source and another source starting from the next
+      // following byte
+      src_reg = _mm256_loadu_si256((__m256i const *) (src));
+      AVG_NEXT_SRC(src_reg, 1)
+      for (i = 0; i < height ; i++) {
+        // save current source average
+        src_avg = src_reg;
+        src+= src_stride;
+        LOAD_SRC_DST
+        AVG_NEXT_SRC(src_reg, 1)
+        // average between previous average to current average
+        src_avg = _mm256_avg_epu8(src_avg, src_reg);
+        sec_reg = _mm256_load_si256((__m256i const *) (sec));
+        src_avg = _mm256_avg_epu8(src_avg, sec_reg);
+        sec+= sec_stride;
+        // expand each byte to 2 bytes
+        MERGE_WITH_SRC(src_avg, zero_reg)
+        CALC_SUM_SSE_INSIDE_LOOP
+        dst+= dst_stride;
+      }
+    // x_offset = 8  and y_offset = bilin interpolation
+    } else {
+      __m256i filter, pw8, src_next_reg, src_avg;
+      y_offset <<= 5;
+      filter = _mm256_load_si256((__m256i const *)
+               (bilinear_filters_avx2 + y_offset));
+      pw8 = _mm256_set1_epi16(8);
+      // load source and another source starting from the next
+      // following byte
+      src_reg = _mm256_loadu_si256((__m256i const *) (src));
+      AVG_NEXT_SRC(src_reg, 1)
+      for (i = 0; i < height ; i++) {
+        // save current source average
+        src_avg = src_reg;
+        src+= src_stride;
+        LOAD_SRC_DST
+        AVG_NEXT_SRC(src_reg, 1)
+        MERGE_WITH_SRC(src_avg, src_reg)
+        FILTER_SRC(filter)
+        src_avg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
+        sec_reg = _mm256_load_si256((__m256i const *) (sec));
+        src_avg = _mm256_avg_epu8(src_avg, sec_reg);
+        // expand each byte to 2 bytes
+        MERGE_WITH_SRC(src_avg, zero_reg)
+        sec+= sec_stride;
+        CALC_SUM_SSE_INSIDE_LOOP
+        dst+= dst_stride;
+      }
+    }
+  // x_offset = bilin interpolation and y_offset = 0
+  } else {
+    if (y_offset == 0) {
+      __m256i filter, pw8, src_next_reg;
+      x_offset <<= 5;
+      filter = _mm256_load_si256((__m256i const *)
+               (bilinear_filters_avx2 + x_offset));
+      pw8 = _mm256_set1_epi16(8);
+      for (i = 0; i < height ; i++) {
+        LOAD_SRC_DST
+        MERGE_NEXT_SRC(src_reg, 1)
+        FILTER_SRC(filter)
+        src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
+        sec_reg = _mm256_load_si256((__m256i const *) (sec));
+        src_reg = _mm256_avg_epu8(src_reg, sec_reg);
+        MERGE_WITH_SRC(src_reg, zero_reg)
+        sec+= sec_stride;
+        CALC_SUM_SSE_INSIDE_LOOP
+        src+= src_stride;
+        dst+= dst_stride;
+      }
+    // x_offset = bilin interpolation and y_offset = 8
+    } else if (y_offset == 8) {
+      __m256i filter, pw8, src_next_reg, src_pack;
+      x_offset <<= 5;
+      filter = _mm256_load_si256((__m256i const *)
+               (bilinear_filters_avx2 + x_offset));
+      pw8 = _mm256_set1_epi16(8);
+      src_reg = _mm256_loadu_si256((__m256i const *) (src));
+      MERGE_NEXT_SRC(src_reg, 1)
+      FILTER_SRC(filter)
+      // convert each 16 bit to 8 bit to each low and high lane source
+      src_pack =  _mm256_packus_epi16(exp_src_lo, exp_src_hi);
+      for (i = 0; i < height ; i++) {
+        src+= src_stride;
+        LOAD_SRC_DST
+        MERGE_NEXT_SRC(src_reg, 1)
+        FILTER_SRC(filter)
+        src_reg =  _mm256_packus_epi16(exp_src_lo, exp_src_hi);
+        // average between previous pack to the current
+        src_pack = _mm256_avg_epu8(src_pack, src_reg);
+        sec_reg = _mm256_load_si256((__m256i const *) (sec));
+        src_pack = _mm256_avg_epu8(src_pack, sec_reg);
+        sec+= sec_stride;
+        MERGE_WITH_SRC(src_pack, zero_reg)
+        src_pack = src_reg;
+        CALC_SUM_SSE_INSIDE_LOOP
+        dst+= dst_stride;
+      }
+    // x_offset = bilin interpolation and y_offset = bilin interpolation
+    } else {
+      __m256i xfilter, yfilter, pw8, src_next_reg, src_pack;
+      x_offset <<= 5;
+      xfilter = _mm256_load_si256((__m256i const *)
+                (bilinear_filters_avx2 + x_offset));
+      y_offset <<= 5;
+      yfilter = _mm256_load_si256((__m256i const *)
+                (bilinear_filters_avx2 + y_offset));
+      pw8 = _mm256_set1_epi16(8);
+      // load source and another source starting from the next
+      // following byte
+      src_reg = _mm256_loadu_si256((__m256i const *) (src));
+      MERGE_NEXT_SRC(src_reg, 1)
+
+      FILTER_SRC(xfilter)
+      // convert each 16 bit to 8 bit to each low and high lane source
+      src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
+      for (i = 0; i < height ; i++) {
+        src+= src_stride;
+        LOAD_SRC_DST
+        MERGE_NEXT_SRC(src_reg, 1)
+        FILTER_SRC(xfilter)
+        src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
+        // merge previous pack to current pack source
+        MERGE_WITH_SRC(src_pack, src_reg)
+        // filter the source
+        FILTER_SRC(yfilter)
+        src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
+        sec_reg = _mm256_load_si256((__m256i const *) (sec));
+        src_pack = _mm256_avg_epu8(src_pack, sec_reg);
+        MERGE_WITH_SRC(src_pack, zero_reg)
+        src_pack = src_reg;
+        sec+= sec_stride;
+        CALC_SUM_SSE_INSIDE_LOOP
+        dst+= dst_stride;
+      }
+    }
+  }
+  CALC_SUM_AND_SSE
   return sum;
 }
diff --git a/vp9/encoder/x86/vp9_variance_avx2.c b/vp9/encoder/x86/vp9_variance_avx2.c
index 02007a3..835c519 100644
--- a/vp9/encoder/x86/vp9_variance_avx2.c
+++ b/vp9/encoder/x86/vp9_variance_avx2.c
@@ -54,6 +54,20 @@
   unsigned int *sse
 );
 
+unsigned int vp9_sub_pixel_avg_variance32xh_avx2
+(
+  const uint8_t *src,
+  int src_stride,
+  int x_offset,
+  int y_offset,
+  const uint8_t *dst,
+  int dst_stride,
+  const uint8_t *sec,
+  int sec_stride,
+  int height,
+  unsigned int *sseptr
+);
+
 static void variance_avx2(const unsigned char *src_ptr, int  source_stride,
                         const unsigned char *ref_ptr, int  recon_stride,
                         int  w, int  h, unsigned int *sse, int *sum,
@@ -207,3 +221,48 @@
   *sse_ptr = sse;
   return sse - (((int64_t)se * se) >> 10);
 }
+
+unsigned int vp9_sub_pixel_avg_variance64x64_avx2(const uint8_t *src,
+                                                  int src_stride,
+                                                  int x_offset,
+                                                  int y_offset,
+                                                  const uint8_t *dst,
+                                                  int dst_stride,
+                                                  unsigned int *sseptr,
+                                                  const uint8_t *sec) {
+  // processing 32 elements in parallel
+  unsigned int sse;
+
+  int se = vp9_sub_pixel_avg_variance32xh_avx2(src, src_stride, x_offset,
+                                               y_offset, dst, dst_stride,
+                                               sec, 64, 64, &sse);
+  unsigned int sse2;
+  // processing the next 32 elements in parallel
+  int se2 = vp9_sub_pixel_avg_variance32xh_avx2(src + 32, src_stride, x_offset,
+                                                y_offset, dst + 32, dst_stride,
+                                                sec + 32, 64, 64, &sse2);
+  se += se2;
+  sse += sse2;
+  *sseptr = sse;
+
+  return sse - (((int64_t)se * se) >> 12);
+}
+
+unsigned int vp9_sub_pixel_avg_variance32x32_avx2(const uint8_t *src,
+                                                  int src_stride,
+                                                  int x_offset,
+                                                  int y_offset,
+                                                  const uint8_t *dst,
+                                                  int dst_stride,
+                                                  unsigned int *sseptr,
+                                                  const uint8_t *sec) {
+  // processing 32 element in parallel
+  unsigned int sse;
+  int se = vp9_sub_pixel_avg_variance32xh_avx2(src, src_stride, x_offset,
+                                                 y_offset, dst, dst_stride,
+                                                 sec, 32, 32, &sse);
+  *sseptr = sse;
+  return sse - (((int64_t)se * se) >> 10);
+}
+
+
diff --git a/vp9/vp9_common.mk b/vp9/vp9_common.mk
index 9fb6115..b4ab99b 100644
--- a/vp9/vp9_common.mk
+++ b/vp9/vp9_common.mk
@@ -45,7 +45,7 @@
 VP9_COMMON_SRCS-yes += common/vp9_reconinter.h
 VP9_COMMON_SRCS-yes += common/vp9_reconintra.h
 VP9_COMMON_SRCS-yes += common/vp9_rtcd.c
-VP9_COMMON_SRCS-yes += common/vp9_rtcd_defs.sh
+VP9_COMMON_SRCS-yes += common/vp9_rtcd_defs.pl
 VP9_COMMON_SRCS-yes += common/vp9_scale.h
 VP9_COMMON_SRCS-yes += common/vp9_scale.c
 VP9_COMMON_SRCS-yes += common/vp9_seg_common.h
@@ -145,4 +145,4 @@
 VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_save_reg_neon$(ASM)
 VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_reconintra_neon$(ASM)
 
-$(eval $(call rtcd_h_template,vp9_rtcd,vp9/common/vp9_rtcd_defs.sh))
+$(eval $(call rtcd_h_template,vp9_rtcd,vp9/common/vp9_rtcd_defs.pl))
diff --git a/vp9/vp9_cx_iface.c b/vp9/vp9_cx_iface.c
index d7713fd..32c528a 100644
--- a/vp9/vp9_cx_iface.c
+++ b/vp9/vp9_cx_iface.c
@@ -75,7 +75,7 @@
   vpx_codec_enc_cfg_t     cfg;
   struct vp9_extracfg     vp8_cfg;
   VP9_CONFIG              oxcf;
-  VP9_PTR                 cpi;
+  VP9_COMP               *cpi;
   unsigned char          *cx_data;
   size_t                  cx_data_sz;
   unsigned char          *pending_cx_data;
@@ -359,7 +359,7 @@
     memcpy(oxcf->ss_target_bitrate, cfg.ss_target_bitrate,
            sizeof(cfg.ss_target_bitrate));
   } else if (oxcf->ss_number_layers == 1) {
-    oxcf->ss_target_bitrate[0] = oxcf->target_bandwidth;
+    oxcf->ss_target_bitrate[0] = (int)oxcf->target_bandwidth;
   }
 
   oxcf->ts_number_layers = cfg.ts_number_layers;
@@ -502,8 +502,6 @@
   vpx_codec_enc_cfg_t       *cfg;
   unsigned int               i;
 
-  VP9_PTR optr;
-
   if (ctx->priv == NULL) {
     priv = calloc(1, sizeof(struct vpx_codec_alg_priv));
 
@@ -551,15 +549,15 @@
     res = validate_config(priv, &priv->cfg, &priv->vp8_cfg);
 
     if (res == VPX_CODEC_OK) {
+      VP9_COMP *cpi;
       set_vp9e_config(&ctx->priv->alg_priv->oxcf,
                       ctx->priv->alg_priv->cfg,
                       ctx->priv->alg_priv->vp8_cfg);
-      optr = vp9_create_compressor(&ctx->priv->alg_priv->oxcf);
-
-      if (optr == NULL)
+      cpi = vp9_create_compressor(&ctx->priv->alg_priv->oxcf);
+      if (cpi == NULL)
         res = VPX_CODEC_MEM_ERROR;
       else
-        ctx->priv->alg_priv->cpi = optr;
+        ctx->priv->alg_priv->cpi = cpi;
     }
   }
 
@@ -574,7 +572,7 @@
 
 static vpx_codec_err_t vp9e_destroy(vpx_codec_alg_priv_t *ctx) {
   free(ctx->cx_data);
-  vp9_remove_compressor(&ctx->cpi);
+  vp9_remove_compressor(ctx->cpi);
   free(ctx);
   return VPX_CODEC_OK;
 }
diff --git a/vp9/vp9_dx_iface.c b/vp9/vp9_dx_iface.c
index b85e172..83d64b8 100644
--- a/vp9/vp9_dx_iface.c
+++ b/vp9/vp9_dx_iface.c
@@ -46,7 +46,7 @@
   vp9_stream_info_t       si;
   int                     defer_alloc;
   int                     decoder_init;
-  VP9D_PTR                pbi;
+  struct VP9Decompressor *pbi;
   int                     postproc_cfg_set;
   vp8_postproc_cfg_t      postproc_cfg;
 #if CONFIG_POSTPROC_VISUALIZER
@@ -274,7 +274,7 @@
 
     if (!res) {
       VP9D_CONFIG oxcf;
-      VP9D_PTR optr;
+      struct VP9Decompressor *optr;
 
       vp9_initialize_dec();
 
diff --git a/vpx_scale/generic/yv12extend.c b/vpx_scale/generic/yv12extend.c
index 7896dfe..7b43eec 100644
--- a/vpx_scale/generic/yv12extend.c
+++ b/vpx_scale/generic/yv12extend.c
@@ -81,9 +81,7 @@
 }
 
 #if CONFIG_VP9
-static void extend_frame(YV12_BUFFER_CONFIG *const ybf,
-                         int subsampling_x, int subsampling_y,
-                         int ext_size) {
+static void extend_frame(YV12_BUFFER_CONFIG *const ybf, int ext_size) {
   const int c_w = ybf->uv_crop_width;
   const int c_h = ybf->uv_crop_height;
   const int c_ext_size = ext_size >> 1;
@@ -110,16 +108,14 @@
                c_w, c_h, c_et, c_el, c_eb, c_er);
 }
 
-void vp9_extend_frame_borders_c(YV12_BUFFER_CONFIG *ybf,
-                                int subsampling_x, int subsampling_y) {
-  extend_frame(ybf, subsampling_x, subsampling_y, ybf->border);
+void vp9_extend_frame_borders_c(YV12_BUFFER_CONFIG *ybf) {
+  extend_frame(ybf, ybf->border);
 }
 
-void vp9_extend_frame_inner_borders_c(YV12_BUFFER_CONFIG *ybf,
-                                      int subsampling_x, int subsampling_y) {
+void vp9_extend_frame_inner_borders_c(YV12_BUFFER_CONFIG *ybf) {
   const int inner_bw = (ybf->border > VP9INNERBORDERINPIXELS) ?
                        VP9INNERBORDERINPIXELS : ybf->border;
-  extend_frame(ybf, subsampling_x, subsampling_y, inner_bw);
+  extend_frame(ybf, inner_bw);
 }
 #endif  // CONFIG_VP9
 
diff --git a/vpx_scale/vpx_scale.mk b/vpx_scale/vpx_scale.mk
index 50d3e9d..51a0ec9 100644
--- a/vpx_scale/vpx_scale.mk
+++ b/vpx_scale/vpx_scale.mk
@@ -7,7 +7,7 @@
 SCALE_SRCS-$(CONFIG_SPATIAL_RESAMPLING) += generic/gen_scalers.c
 SCALE_SRCS-yes += vpx_scale_asm_offsets.c
 SCALE_SRCS-yes += vpx_scale_rtcd.c
-SCALE_SRCS-yes += vpx_scale_rtcd.sh
+SCALE_SRCS-yes += vpx_scale_rtcd.pl
 
 #neon
 SCALE_SRCS-$(HAVE_NEON)  += arm/neon/vp8_vpxyv12_copyframe_func_neon$(ASM)
@@ -24,4 +24,4 @@
 $(eval $(call asm_offsets_template,\
 	         vpx_scale_asm_offsets.asm, vpx_scale/vpx_scale_asm_offsets.c))
 
-$(eval $(call rtcd_h_template,vpx_scale_rtcd,vpx_scale/vpx_scale_rtcd.sh))
+$(eval $(call rtcd_h_template,vpx_scale_rtcd,vpx_scale/vpx_scale_rtcd.pl))
diff --git a/vpx_scale/vpx_scale_rtcd.pl b/vpx_scale/vpx_scale_rtcd.pl
new file mode 100644
index 0000000..28e168e
--- /dev/null
+++ b/vpx_scale/vpx_scale_rtcd.pl
@@ -0,0 +1,35 @@
+sub vpx_scale_forward_decls() {
+print <<EOF
+struct yv12_buffer_config;
+EOF
+}
+forward_decls qw/vpx_scale_forward_decls/;
+
+# Scaler functions
+if (vpx_config("CONFIG_SPATIAL_RESAMPLING") eq "yes") {
+    add_proto qw/void vp8_horizontal_line_5_4_scale/, "const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width";
+    add_proto qw/void vp8_vertical_band_5_4_scale/, "unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width";
+    add_proto qw/void vp8_horizontal_line_5_3_scale/, "const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width";
+    add_proto qw/void vp8_vertical_band_5_3_scale/, "unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width";
+    add_proto qw/void vp8_horizontal_line_2_1_scale/, "const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width";
+    add_proto qw/void vp8_vertical_band_2_1_scale/, "unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width";
+    add_proto qw/void vp8_vertical_band_2_1_scale_i/, "unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width";
+}
+
+add_proto qw/void vp8_yv12_extend_frame_borders/, "struct yv12_buffer_config *ybf";
+specialize qw/vp8_yv12_extend_frame_borders neon/;
+
+add_proto qw/void vp8_yv12_copy_frame/, "const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc";
+specialize qw/vp8_yv12_copy_frame neon/;
+
+add_proto qw/void vpx_yv12_copy_y/, "const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc";
+specialize qw/vpx_yv12_copy_y neon/;
+
+if (vpx_config("CONFIG_VP9") eq "yes") {
+    add_proto qw/void vp9_extend_frame_borders/, "struct yv12_buffer_config *ybf";
+    specialize qw/vp9_extend_frame_borders dspr2/;
+
+    add_proto qw/void vp9_extend_frame_inner_borders/, "struct yv12_buffer_config *ybf";
+    specialize qw/vp9_extend_frame_inner_borders dspr2/;
+}
+1;
diff --git a/vpx_scale/vpx_scale_rtcd.sh b/vpx_scale/vpx_scale_rtcd.sh
deleted file mode 100644
index 1d02b69..0000000
--- a/vpx_scale/vpx_scale_rtcd.sh
+++ /dev/null
@@ -1,34 +0,0 @@
-vpx_scale_forward_decls() {
-cat <<EOF
-struct yv12_buffer_config;
-EOF
-}
-forward_decls vpx_scale_forward_decls
-
-# Scaler functions
-if [ "$CONFIG_SPATIAL_RESAMPLING" = "yes" ]; then
-    prototype void vp8_horizontal_line_5_4_scale "const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width"
-    prototype void vp8_vertical_band_5_4_scale "unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width"
-    prototype void vp8_horizontal_line_5_3_scale "const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width"
-    prototype void vp8_vertical_band_5_3_scale "unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width"
-    prototype void vp8_horizontal_line_2_1_scale "const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width"
-    prototype void vp8_vertical_band_2_1_scale "unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width"
-    prototype void vp8_vertical_band_2_1_scale_i "unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width"
-fi
-
-prototype void vp8_yv12_extend_frame_borders "struct yv12_buffer_config *ybf"
-specialize vp8_yv12_extend_frame_borders neon
-
-prototype void vp8_yv12_copy_frame "const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc"
-specialize vp8_yv12_copy_frame neon
-
-prototype void vpx_yv12_copy_y "const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc"
-specialize vpx_yv12_copy_y neon
-
-if [ "$CONFIG_VP9" = "yes" ]; then
-    prototype void vp9_extend_frame_borders "struct yv12_buffer_config *ybf, int subsampling_x, int subsampling_y"
-    specialize vp9_extend_frame_borders dspr2
-
-    prototype void vp9_extend_frame_inner_borders "struct yv12_buffer_config *ybf, int subsampling_x, int subsampling_y"
-    specialize vp9_extend_frame_inner_borders dspr2
-fi