Merge "vp8 - compatibility warning added to changelog"
diff --git a/tools/ftfy.sh b/tools/ftfy.sh
new file mode 100755
index 0000000..de0f0ed
--- /dev/null
+++ b/tools/ftfy.sh
@@ -0,0 +1,153 @@
+#!/bin/sh
+self="$0"
+dirname_self=$(dirname "$self")
+
+usage() {
+  cat <<EOF >&2
+Usage: $self [option]
+
+This script applies a whitespace transformation to the commit at HEAD. If no
+options are given, then the modified files are left in the working tree.
+
+Options:
+  -h, --help     Shows this message
+  -n, --dry-run  Shows a diff of the changes to be made.
+  --amend        Squashes the changes into the commit at HEAD
+                     This option will also reformat the commit message.
+  --commit       Creates a new commit containing only the whitespace changes
+  --msg-only     Reformat the commit message only, ignore the patch itself.
+
+EOF
+  rm -f ${CLEAN_FILES}
+  exit 1
+}
+
+# Writes a message to stderr, prefixed with this script's basename.
+log() {
+  echo "${self##*/}: $*" >&2
+}
+
+# Restyles the given files with astyle, then strips whitespace before commas.
+vpx_style() {
+  astyle --style=bsd --min-conditional-indent=0 --break-blocks \
+         --pad-oper --pad-header --unpad-paren \
+         --align-pointer=name \
+         --indent-preprocessor --convert-tabs --indent-labels \
+         --suffix=none --quiet "$@"
+  sed -i 's/[[:space:]]\{1,\},/,/g' "$@"
+}
+
+# Applies the patch file $1 (patch -p1) only if INTERSECT_RESULT is non-zero.
+apply() {
+  [ $INTERSECT_RESULT -ne 0 ] && patch -p1 < "$1"
+}
+
+# Commits the working-tree changes as a new commit on top of HEAD.
+commit() {
+  LAST_CHANGEID=$(git log -1 --format=%B | awk '/^Change-Id:/{print $2}')
+  if [ -z "$LAST_CHANGEID" ]; then
+    log "HEAD doesn't have a Change-Id, unable to generate a new commit"
+    exit 1
+  fi
+
+  # Build a deterministic Change-Id from the parent's
+  NEW_CHANGEID=${LAST_CHANGEID}-styled
+  NEW_CHANGEID=I$(echo "$NEW_CHANGEID" | git hash-object --stdin)
+
+  # Commit, preserving authorship from the parent commit.
+  git commit -a -C HEAD > /dev/null
+  git commit --amend -F- << EOF
+Cosmetic: Fix whitespace in change $(printf '%.9s' "$LAST_CHANGEID")
+
+Change-Id: ${NEW_CHANGEID}
+EOF
+}
+
+# Logs the commit-message changes as a unified diff if DIFF_MSG_RESULT != 0.
+show_commit_msg_diff() {
+  if [ $DIFF_MSG_RESULT -ne 0 ]; then
+    log "Modified commit message:"
+    diff -u "$ORIG_COMMIT_MSG" "$NEW_COMMIT_MSG" | tail -n +3
+  fi
+}
+
+# Amends HEAD if DIFF_MSG_RESULT or INTERSECT_RESULT is non-zero.
+amend() {
+  show_commit_msg_diff
+  if [ $DIFF_MSG_RESULT -ne 0 ] || [ $INTERSECT_RESULT -ne 0 ]; then
+    git commit -a --amend -F "$NEW_COMMIT_MSG"
+  fi
+}
+
+# Saves HEAD's message and its re-wrapped form; sets DIFF_MSG_RESULT from cmp.
+diff_msg() {
+  git log -1 --format=%B > "$ORIG_COMMIT_MSG"
+  "${dirname_self}"/wrap-commit-msg.py \
+      < "$ORIG_COMMIT_MSG" > "$NEW_COMMIT_MSG"
+  cmp -s "$ORIG_COMMIT_MSG" "$NEW_COMMIT_MSG"
+  DIFF_MSG_RESULT=$?
+}
+
+
+# Temporary files
+ORIG_DIFF=orig.diff.$$
+MODIFIED_DIFF=modified.diff.$$
+FINAL_DIFF=final.diff.$$
+ORIG_COMMIT_MSG=orig.commit-msg.$$
+NEW_COMMIT_MSG=new.commit-msg.$$
+CLEAN_FILES="${ORIG_DIFF} ${MODIFIED_DIFF} ${FINAL_DIFF}"
+CLEAN_FILES="${CLEAN_FILES} ${ORIG_COMMIT_MSG} ${NEW_COMMIT_MSG}"
+
+# Preconditions
+[ $# -lt 2 ] || usage
+
+if ! git diff --quiet HEAD; then
+  log "Working tree is dirty, commit your changes first"
+  exit 1
+fi
+
+# Need to be in the root; bail out rather than run git/patch elsewhere.
+cd "$(git rev-parse --show-toplevel)" || exit 1
+
+# Collect the original diff
+git show > "${ORIG_DIFF}"
+
+# Apply the style guide on the modified files and collect its diff
+for f in $(git diff HEAD^ --name-only | grep '\.[ch]$'); do
+  case "$f" in
+    third_party/*) continue;;
+    nestegg/*) continue;;
+  esac
+  vpx_style "$f"
+done
+git diff --no-color --no-ext-diff > "${MODIFIED_DIFF}"
+
+# Intersect the two diffs
+"${dirname_self}"/intersect-diffs.py \
+    "${ORIG_DIFF}" "${MODIFIED_DIFF}" > "${FINAL_DIFF}"
+INTERSECT_RESULT=$?
+git reset --hard >/dev/null
+
+# Fixup the commit message
+diff_msg
+
+# Handle options
+if [ -n "$1" ]; then
+  case "$1" in
+    -h|--help) usage;;
+    -n|--dry-run) cat "${FINAL_DIFF}"; show_commit_msg_diff;;
+    --commit) apply "${FINAL_DIFF}"; commit;;
+    --amend) apply "${FINAL_DIFF}"; amend;;
+    --msg-only) amend;;
+    *) usage;;
+  esac
+else
+  apply "${FINAL_DIFF}"
+  if ! git diff --quiet; then
+    log "Formatting changes applied, verify and commit."
+    log "See also: http://www.webmproject.org/code/contribute/conventions/"
+    git diff --stat
+  fi
+fi
+
+rm -f ${CLEAN_FILES}
diff --git a/tools/intersect-diffs.py b/tools/intersect-diffs.py
new file mode 100755
index 0000000..be9dea5
--- /dev/null
+++ b/tools/intersect-diffs.py
@@ -0,0 +1,188 @@
+#!/usr/bin/env python
+##  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+##
+##  Use of this source code is governed by a BSD-style license
+##  that can be found in the LICENSE file in the root of the source
+##  tree. An additional intellectual property rights grant can be found
+##  in the file PATENTS.  All contributing project authors may
+##  be found in the AUTHORS file in the root of the source tree.
+##
+"""Calculates the "intersection" of two unified diffs.
+
+Given two diffs, A and B, it finds all hunks in B that had non-context lines
+in A and prints them to stdout. This is useful to determine the hunks in B that
+are relevant to A. The resulting file can be applied with patch(1) on top of A.
+"""
+
+__author__ = "jkoleszar@google.com"
+
+import re
+import sys
+
+
+class DiffLines(object):
+    """A container for one half of a diff."""
+
+    def __init__(self, filename, offset, length):
+        self.filename = filename
+        self.offset = offset
+        self.length = length
+        self.lines = []
+        self.delta_line_nums = []
+
+    def Append(self, line):
+        l = len(self.lines)
+        if line[0] != " ":
+            self.delta_line_nums.append(self.offset + l)
+        self.lines.append(line[1:])
+        assert l+1 <= self.length
+
+    def Complete(self):
+        return len(self.lines) == self.length
+
+    def __contains__(self, item):
+        return item >= self.offset and item <= self.offset + self.length - 1
+
+
+class DiffHunk(object):
+    """A container for one diff hunk, consisting of two DiffLines."""
+
+    def __init__(self, header, file_a, file_b, start_a, len_a, start_b, len_b):
+        self.header = header
+        self.left = DiffLines(file_a, start_a, len_a)
+        self.right = DiffLines(file_b, start_b, len_b)
+        self.lines = []
+
+    def Append(self, line):
+        """Adds a line to the DiffHunk and its DiffLines children."""
+        if line[0] == "-":
+            self.left.Append(line)
+        elif line[0] == "+":
+            self.right.Append(line)
+        elif line[0] == " ":
+            self.left.Append(line)
+            self.right.Append(line)
+        else:
+            assert False, ("Unrecognized character at start of diff line "
+                           "%r" % line[0])
+        self.lines.append(line)
+
+    def Complete(self):
+        return self.left.Complete() and self.right.Complete()
+
+    def __repr__(self):
+        return "DiffHunk(%s, %s, len %d)" % (
+            self.left.filename, self.right.filename,
+            max(self.left.length, self.right.length))
+
+
+def ParseDiffHunks(stream):
+    """Walk a file-like object, yielding DiffHunks as they're parsed."""
+
+    file_regex = re.compile(r"(\+\+\+|---) (\S+)")
+    range_regex = re.compile(r"@@ -(\d+)(,(\d+))? \+(\d+)(,(\d+))?")
+    hunk = None
+    while True:
+        line = stream.readline()
+        if not line:
+            break
+
+        if hunk is None:
+            # Parse file names
+            diff_file = file_regex.match(line)
+            if diff_file:
+                if line.startswith("---"):
+                    a_line = line
+                    a = diff_file.group(2)
+                    continue
+                if line.startswith("+++"):
+                    b_line = line
+                    b = diff_file.group(2)
+                    continue
+
+            # Parse offset/lengths ("N" without ",len" is one line at N)
+            diffrange = range_regex.match(line)
+            if diffrange:
+                if diffrange.group(2):
+                    start_a = int(diffrange.group(1))
+                    len_a = int(diffrange.group(3))
+                else:
+                    start_a = int(diffrange.group(1))
+                    len_a = 1
+
+                if diffrange.group(5):
+                    start_b = int(diffrange.group(4))
+                    len_b = int(diffrange.group(6))
+                else:
+                    start_b = int(diffrange.group(4))
+                    len_b = 1
+
+                header = [a_line, b_line, line]
+                hunk = DiffHunk(header, a, b, start_a, len_a, start_b, len_b)
+        else:
+            # Add the current line to the hunk
+            hunk.Append(line)
+
+            # See if the whole hunk has been parsed. If so, yield it and prepare
+            # for the next hunk.
+            if hunk.Complete():
+                yield hunk
+                hunk = None
+
+    # Partial hunks are a parse error
+    assert hunk is None
+
+
+def FormatDiffHunks(hunks):
+    """Re-serialize a list of DiffHunks, emitting file headers once per file."""
+    r = []
+    last_header = None
+    for hunk in hunks:
+        this_header = hunk.header[0:2]
+        if last_header != this_header:
+            r.extend(hunk.header)
+            last_header = this_header
+        else:
+            r.append(hunk.header[2])
+        r.extend(hunk.lines)
+        r.append("\n")
+    return "".join(r)
+
+
+def ZipHunks(rhs_hunks, lhs_hunks):
+    """Join two hunk lists on filename."""
+    for rhs_hunk in rhs_hunks:
+        rhs_file = rhs_hunk.right.filename.split("/")[1:]
+
+        for lhs_hunk in lhs_hunks:
+            lhs_file = lhs_hunk.left.filename.split("/")[1:]
+            if lhs_file != rhs_file:
+                continue
+            yield (rhs_hunk, lhs_hunk)
+
+
+def main():
+    old_hunks = [x for x in ParseDiffHunks(open(sys.argv[1], "r"))]
+    new_hunks = [x for x in ParseDiffHunks(open(sys.argv[2], "r"))]
+    out_hunks = []
+
+    # Join the right hand side of the older diff with the left hand side of the
+    # newer diff.
+    for old_hunk, new_hunk in ZipHunks(old_hunks, new_hunks):
+        if new_hunk in out_hunks:
+            continue
+        old_lines = old_hunk.right
+        new_lines = new_hunk.left
+
+        # Determine if this hunk overlaps any non-context line from the other
+        for i in old_lines.delta_line_nums:
+            if i in new_lines:
+                out_hunks.append(new_hunk)
+                break
+
+    if out_hunks:
+        print(FormatDiffHunks(out_hunks))
+        sys.exit(1)
+
+if __name__ == "__main__":
+    main()
diff --git a/tools/wrap-commit-msg.py b/tools/wrap-commit-msg.py
new file mode 100755
index 0000000..d5b4b04
--- /dev/null
+++ b/tools/wrap-commit-msg.py
@@ -0,0 +1,70 @@
+#!/usr/bin/env python
+##  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+##
+##  Use of this source code is governed by a BSD-style license
+##  that can be found in the LICENSE file in the root of the source
+##  tree. An additional intellectual property rights grant can be found
+##  in the file PATENTS.  All contributing project authors may
+##  be found in the AUTHORS file in the root of the source tree.
+##
+"""Wraps paragraphs of text, preserving manual formatting
+
+This is like fold(1), but has the special convention of not modifying lines
+that start with whitespace. This allows you to intersperse blocks with
+special formatting, like code blocks, with written prose. The prose will
+be wordwrapped, and the manual formatting will be preserved.
+
+ * This won't handle the case of a bulleted (or ordered) list specially, so
+   manual wrapping must be done.
+
+Occasionally it's useful to put something with explicit formatting that
+doesn't look at all like a block of text inline.
+
+  indicator = has_leading_whitespace(line);
+  if (indicator)
+    preserve_formatting(line);
+
+The intent is that this docstring would make it through the transform
+and still be legible and presented as it is in the source. If additional
+cases are handled, update this doc to describe the effect.
+"""
+
+__author__ = "jkoleszar@google.com"
+import textwrap
+import sys
+
+def wrap(text):
+    if text:
+        return textwrap.fill(text, break_long_words=False) + '\n'
+    return ""
+
+
+def main(fileobj):
+    text = ""
+    output = ""
+    while True:
+        line = fileobj.readline()
+        if not line:
+            break
+
+        if line.lstrip() == line:
+            text += line
+        else:
+            output += wrap(text)
+            text=""
+            output += line
+    output += wrap(text)
+
+    # Replace the file or write to stdout.
+    if fileobj == sys.stdin:
+        fileobj = sys.stdout
+    else:
+        fileobj.seek(0)
+        fileobj.truncate(0)
+    fileobj.write(output)
+
+if __name__ == "__main__":
+    if len(sys.argv) > 1:
+        main(open(sys.argv[1], "r+"))
+    else:
+        main(sys.stdin)
diff --git a/vp8/common/reconintra.c b/vp8/common/reconintra.c
index 4b13777..4067a68 100644
--- a/vp8/common/reconintra.c
+++ b/vp8/common/reconintra.c
@@ -14,140 +14,17 @@
 #include "vpx_mem/vpx_mem.h"
 #include "blockd.h"
 
-/* For skip_recon_mb(), add vp8_build_intra_predictors_mby_s(MACROBLOCKD *x) and
- * vp8_build_intra_predictors_mbuv_s(MACROBLOCKD *x).
- */
-
-void vp8_build_intra_predictors_mby_c(MACROBLOCKD *x)
-{
-
-    unsigned char *yabove_row = x->dst.y_buffer - x->dst.y_stride;
-    unsigned char yleft_col[16];
-    unsigned char ytop_left = yabove_row[-1];
-    unsigned char *ypred_ptr = x->predictor;
-    int r, c, i;
-
-    for (i = 0; i < 16; i++)
-    {
-        yleft_col[i] = x->dst.y_buffer [i* x->dst.y_stride -1];
-    }
-
-    /* for Y */
-    switch (x->mode_info_context->mbmi.mode)
-    {
-    case DC_PRED:
-    {
-        int expected_dc;
-        int i;
-        int shift;
-        int average = 0;
-
-
-        if (x->up_available || x->left_available)
-        {
-            if (x->up_available)
-            {
-                for (i = 0; i < 16; i++)
-                {
-                    average += yabove_row[i];
-                }
-            }
-
-            if (x->left_available)
-            {
-
-                for (i = 0; i < 16; i++)
-                {
-                    average += yleft_col[i];
-                }
-
-            }
-
-
-
-            shift = 3 + x->up_available + x->left_available;
-            expected_dc = (average + (1 << (shift - 1))) >> shift;
-        }
-        else
-        {
-            expected_dc = 128;
-        }
-
-        vpx_memset(ypred_ptr, expected_dc, 256);
-    }
-    break;
-    case V_PRED:
-    {
-
-        for (r = 0; r < 16; r++)
-        {
-
-            ((int *)ypred_ptr)[0] = ((int *)yabove_row)[0];
-            ((int *)ypred_ptr)[1] = ((int *)yabove_row)[1];
-            ((int *)ypred_ptr)[2] = ((int *)yabove_row)[2];
-            ((int *)ypred_ptr)[3] = ((int *)yabove_row)[3];
-            ypred_ptr += 16;
-        }
-    }
-    break;
-    case H_PRED:
-    {
-
-        for (r = 0; r < 16; r++)
-        {
-
-            vpx_memset(ypred_ptr, yleft_col[r], 16);
-            ypred_ptr += 16;
-        }
-
-    }
-    break;
-    case TM_PRED:
-    {
-
-        for (r = 0; r < 16; r++)
-        {
-            for (c = 0; c < 16; c++)
-            {
-                int pred =  yleft_col[r] + yabove_row[ c] - ytop_left;
-
-                if (pred < 0)
-                    pred = 0;
-
-                if (pred > 255)
-                    pred = 255;
-
-                ypred_ptr[c] = pred;
-            }
-
-            ypred_ptr += 16;
-        }
-
-    }
-    break;
-    case B_PRED:
-    case NEARESTMV:
-    case NEARMV:
-    case ZEROMV:
-    case NEWMV:
-    case SPLITMV:
-    case MB_MODE_COUNT:
-        break;
-    }
-}
-
-void vp8_build_intra_predictors_mby_s(MACROBLOCKD *x,
+void vp8_build_intra_predictors_mby_s_c(MACROBLOCKD *x,
                                           unsigned char * yabove_row,
                                           unsigned char * yleft,
                                           int left_stride,
-                                          unsigned char * ypred_ptr)
+                                          unsigned char * ypred_ptr,
+                                          int y_stride)
 {
     unsigned char yleft_col[16];
     unsigned char ytop_left = yabove_row[-1];
     int r, c, i;
 
-    int y_stride = x->dst.y_stride;
-
     for (i = 0; i < 16; i++)
     {
         yleft_col[i] = yleft[i* left_stride];
@@ -198,7 +75,7 @@
         for (r = 0; r < 16; r++)
         {
             vpx_memset(ypred_ptr, expected_dc, 16);
-            ypred_ptr += y_stride; /*16;*/
+            ypred_ptr += y_stride;
         }
     }
     break;
@@ -212,7 +89,7 @@
             ((int *)ypred_ptr)[1] = ((int *)yabove_row)[1];
             ((int *)ypred_ptr)[2] = ((int *)yabove_row)[2];
             ((int *)ypred_ptr)[3] = ((int *)yabove_row)[3];
-            ypred_ptr += y_stride; /*16;*/
+            ypred_ptr += y_stride;
         }
     }
     break;
@@ -223,7 +100,7 @@
         {
 
             vpx_memset(ypred_ptr, yleft_col[r], 16);
-            ypred_ptr += y_stride;  /*16;*/
+            ypred_ptr += y_stride;
         }
 
     }
@@ -246,145 +123,7 @@
                 ypred_ptr[c] = pred;
             }
 
-            ypred_ptr += y_stride;  /*16;*/
-        }
-
-    }
-    break;
-    case B_PRED:
-    case NEARESTMV:
-    case NEARMV:
-    case ZEROMV:
-    case NEWMV:
-    case SPLITMV:
-    case MB_MODE_COUNT:
-        break;
-    }
-}
-
-void vp8_build_intra_predictors_mbuv_c(MACROBLOCKD *x)
-{
-    unsigned char *uabove_row = x->dst.u_buffer - x->dst.uv_stride;
-    unsigned char uleft_col[16];
-    unsigned char utop_left = uabove_row[-1];
-    unsigned char *vabove_row = x->dst.v_buffer - x->dst.uv_stride;
-    unsigned char vleft_col[20];
-    unsigned char vtop_left = vabove_row[-1];
-    unsigned char *upred_ptr = &x->predictor[256];
-    unsigned char *vpred_ptr = &x->predictor[320];
-    int i, j;
-
-    for (i = 0; i < 8; i++)
-    {
-        uleft_col[i] = x->dst.u_buffer [i* x->dst.uv_stride -1];
-        vleft_col[i] = x->dst.v_buffer [i* x->dst.uv_stride -1];
-    }
-
-    switch (x->mode_info_context->mbmi.uv_mode)
-    {
-    case DC_PRED:
-    {
-        int expected_udc;
-        int expected_vdc;
-        int i;
-        int shift;
-        int Uaverage = 0;
-        int Vaverage = 0;
-
-        if (x->up_available)
-        {
-            for (i = 0; i < 8; i++)
-            {
-                Uaverage += uabove_row[i];
-                Vaverage += vabove_row[i];
-            }
-        }
-
-        if (x->left_available)
-        {
-            for (i = 0; i < 8; i++)
-            {
-                Uaverage += uleft_col[i];
-                Vaverage += vleft_col[i];
-            }
-        }
-
-        if (!x->up_available && !x->left_available)
-        {
-            expected_udc = 128;
-            expected_vdc = 128;
-        }
-        else
-        {
-            shift = 2 + x->up_available + x->left_available;
-            expected_udc = (Uaverage + (1 << (shift - 1))) >> shift;
-            expected_vdc = (Vaverage + (1 << (shift - 1))) >> shift;
-        }
-
-
-        vpx_memset(upred_ptr, expected_udc, 64);
-        vpx_memset(vpred_ptr, expected_vdc, 64);
-
-
-    }
-    break;
-    case V_PRED:
-    {
-        int i;
-
-        for (i = 0; i < 8; i++)
-        {
-            vpx_memcpy(upred_ptr, uabove_row, 8);
-            vpx_memcpy(vpred_ptr, vabove_row, 8);
-            upred_ptr += 8;
-            vpred_ptr += 8;
-        }
-
-    }
-    break;
-    case H_PRED:
-    {
-        int i;
-
-        for (i = 0; i < 8; i++)
-        {
-            vpx_memset(upred_ptr, uleft_col[i], 8);
-            vpx_memset(vpred_ptr, vleft_col[i], 8);
-            upred_ptr += 8;
-            vpred_ptr += 8;
-        }
-    }
-
-    break;
-    case TM_PRED:
-    {
-        int i;
-
-        for (i = 0; i < 8; i++)
-        {
-            for (j = 0; j < 8; j++)
-            {
-                int predu = uleft_col[i] + uabove_row[j] - utop_left;
-                int predv = vleft_col[i] + vabove_row[j] - vtop_left;
-
-                if (predu < 0)
-                    predu = 0;
-
-                if (predu > 255)
-                    predu = 255;
-
-                if (predv < 0)
-                    predv = 0;
-
-                if (predv > 255)
-                    predv = 255;
-
-                upred_ptr[j] = predu;
-                vpred_ptr[j] = predv;
-            }
-
-            upred_ptr += 8;
-            vpred_ptr += 8;
+            ypred_ptr += y_stride;
         }
 
     }
@@ -407,13 +146,13 @@
                                          unsigned char * vleft,
                                          int left_stride,
                                          unsigned char * upred_ptr,
-                                         unsigned char * vpred_ptr)
+                                         unsigned char * vpred_ptr,
+                                         int pred_stride)
 {
     unsigned char uleft_col[8];
     unsigned char utop_left = uabove_row[-1];
     unsigned char vleft_col[8];
     unsigned char vtop_left = vabove_row[-1];
-    int uv_stride = x->dst.uv_stride;
 
     int i, j;
 
@@ -471,8 +210,8 @@
         {
             vpx_memset(upred_ptr, expected_udc, 8);
             vpx_memset(vpred_ptr, expected_vdc, 8);
-            upred_ptr += uv_stride; /*8;*/
-            vpred_ptr += uv_stride; /*8;*/
+            upred_ptr += pred_stride;
+            vpred_ptr += pred_stride;
         }
     }
     break;
@@ -484,8 +223,8 @@
         {
             vpx_memcpy(upred_ptr, uabove_row, 8);
             vpx_memcpy(vpred_ptr, vabove_row, 8);
-            upred_ptr += uv_stride; /*8;*/
-            vpred_ptr += uv_stride; /*8;*/
+            upred_ptr += pred_stride;
+            vpred_ptr += pred_stride;
         }
 
     }
@@ -498,8 +237,8 @@
         {
             vpx_memset(upred_ptr, uleft_col[i], 8);
             vpx_memset(vpred_ptr, vleft_col[i], 8);
-            upred_ptr += uv_stride; /*8;*/
-            vpred_ptr += uv_stride; /*8;*/
+            upred_ptr += pred_stride;
+            vpred_ptr += pred_stride;
         }
     }
 
@@ -531,8 +270,8 @@
                 vpred_ptr[j] = predv;
             }
 
-            upred_ptr += uv_stride; /*8;*/
-            vpred_ptr += uv_stride; /*8;*/
+            upred_ptr += pred_stride;
+            vpred_ptr += pred_stride;
         }
 
     }
diff --git a/vp8/common/rtcd_defs.sh b/vp8/common/rtcd_defs.sh
index 0fdb4fa..ab99515 100644
--- a/vp8/common/rtcd_defs.sh
+++ b/vp8/common/rtcd_defs.sh
@@ -122,17 +122,12 @@
 specialize vp8_copy_mem8x4 mmx media neon
 vp8_copy_mem8x4_media=vp8_copy_mem8x4_v6
 
-prototype void vp8_build_intra_predictors_mby "struct macroblockd *x"
-specialize vp8_build_intra_predictors_mby sse2 ssse3 neon
+prototype void vp8_build_intra_predictors_mby_s "struct macroblockd *x, unsigned char * yabove_row, unsigned char * yleft, int left_stride, unsigned char * ypred_ptr, int y_stride"
+specialize vp8_build_intra_predictors_mby_s sse2 ssse3
+#TODO: fix assembly for neon
 
-prototype void vp8_build_intra_predictors_mby_s "struct macroblockd *x, unsigned char * yabove_row, unsigned char * yleft, int left_stride, unsigned char * ypred_ptr"
-#TODO: fix assembly --- specialize vp8_build_intra_predictors_mby_s sse2 ssse3 neon
-
-prototype void vp8_build_intra_predictors_mbuv "struct macroblockd *x"
-specialize vp8_build_intra_predictors_mbuv sse2 ssse3
-
-prototype void vp8_build_intra_predictors_mbuv_s "struct macroblockd *x, unsigned char * uabove_row, unsigned char * vabove_row,  unsigned char *uleft, unsigned char *vleft, int left_stride, unsigned char * upred_ptr, unsigned char * vpred_ptr"
-#TODO: fix assembly --- specialize vp8_build_intra_predictors_mbuv_s sse2 ssse3
+prototype void vp8_build_intra_predictors_mbuv_s "struct macroblockd *x, unsigned char * uabove_row, unsigned char * vabove_row,  unsigned char *uleft, unsigned char *vleft, int left_stride, unsigned char * upred_ptr, unsigned char * vpred_ptr, int pred_stride"
+specialize vp8_build_intra_predictors_mbuv_s sse2 ssse3
 
 prototype void vp8_intra4x4_predict "unsigned char *src, int src_stride, int b_mode, unsigned char *dst, int dst_stride"
 specialize vp8_intra4x4_predict media
diff --git a/vp8/common/x86/recon_sse2.asm b/vp8/common/x86/recon_sse2.asm
index 4b68ef5..7b6e3cf 100644
--- a/vp8/common/x86/recon_sse2.asm
+++ b/vp8/common/x86/recon_sse2.asm
@@ -119,35 +119,37 @@
 ;void vp8_intra_pred_uv_dc_mmx2(
 ;    unsigned char *dst,
 ;    int dst_stride
-;    unsigned char *src,
-;    int src_stride,
+;    unsigned char *above,
+;    unsigned char *left,
+;    int left_stride,
 ;    )
 global sym(vp8_intra_pred_uv_dc_mmx2)
 sym(vp8_intra_pred_uv_dc_mmx2):
     push        rbp
     mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 4
+    SHADOW_ARGS_TO_STACK 5
     push        rsi
     push        rdi
     ; end prolog
 
     ; from top
-    mov         rsi,        arg(2) ;src;
-    movsxd      rax,        dword ptr arg(3) ;src_stride;
-    sub         rsi,        rax
+    mov         rdi,        arg(2) ;above;
+    mov         rsi,        arg(3) ;left;
+    movsxd      rax,        dword ptr arg(4) ;left_stride;
     pxor        mm0,        mm0
-    movq        mm1,        [rsi]
-    psadbw      mm1,        mm0
-
-    ; from left
-    dec         rsi
+    movq        mm1,        [rdi]
     lea         rdi,        [rax*3]
-    movzx       ecx,        byte [rsi+rax]
+    psadbw      mm1,        mm0
+    ; from left
+    movzx       ecx,        byte [rsi]
+    movzx       edx,        byte [rsi+rax*1]
+    add         ecx,        edx
     movzx       edx,        byte [rsi+rax*2]
     add         ecx,        edx
+
     movzx       edx,        byte [rsi+rdi]
-    add         ecx,        edx
     lea         rsi,        [rsi+rax*4]
+    add         ecx,        edx
     movzx       edx,        byte [rsi]
     add         ecx,        edx
     movzx       edx,        byte [rsi+rax]
@@ -156,31 +158,29 @@
     add         ecx,        edx
     movzx       edx,        byte [rsi+rdi]
     add         ecx,        edx
-    movzx       edx,        byte [rsi+rax*4]
-    add         ecx,        edx
 
     ; add up
     pextrw      edx,        mm1, 0x0
     lea         edx,        [edx+ecx+8]
     sar         edx,        4
     movd        mm1,        edx
+    movsxd      rcx,        dword ptr arg(1) ;dst_stride
     pshufw      mm1,        mm1, 0x0
+    mov         rdi,        arg(0) ;dst;
     packuswb    mm1,        mm1
 
     ; write out
-    mov         rdi,        arg(0) ;dst;
-    movsxd      rcx,        dword ptr arg(1) ;dst_stride
     lea         rax,        [rcx*3]
+    lea         rdx,        [rdi+rcx*4]
 
     movq [rdi      ],       mm1
     movq [rdi+rcx  ],       mm1
     movq [rdi+rcx*2],       mm1
     movq [rdi+rax  ],       mm1
-    lea         rdi,        [rdi+rcx*4]
-    movq [rdi      ],       mm1
-    movq [rdi+rcx  ],       mm1
-    movq [rdi+rcx*2],       mm1
-    movq [rdi+rax  ],       mm1
+    movq [rdx      ],       mm1
+    movq [rdx+rcx  ],       mm1
+    movq [rdx+rcx*2],       mm1
+    movq [rdx+rax  ],       mm1
 
     ; begin epilog
     pop         rdi
@@ -192,23 +192,24 @@
 ;void vp8_intra_pred_uv_dctop_mmx2(
 ;    unsigned char *dst,
 ;    int dst_stride
-;    unsigned char *src,
-;    int src_stride,
+;    unsigned char *above,
+;    unsigned char *left,
+;    int left_stride,
 ;    )
 global sym(vp8_intra_pred_uv_dctop_mmx2)
 sym(vp8_intra_pred_uv_dctop_mmx2):
     push        rbp
     mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 4
+    SHADOW_ARGS_TO_STACK 5
     GET_GOT     rbx
     push        rsi
     push        rdi
     ; end prolog
 
+    ;arg(3), arg(4) not used
+
     ; from top
-    mov         rsi,        arg(2) ;src;
-    movsxd      rax,        dword ptr arg(3) ;src_stride;
-    sub         rsi,        rax
+    mov         rsi,        arg(2) ;above;
     pxor        mm0,        mm0
     movq        mm1,        [rsi]
     psadbw      mm1,        mm0
@@ -245,22 +246,24 @@
 ;void vp8_intra_pred_uv_dcleft_mmx2(
 ;    unsigned char *dst,
 ;    int dst_stride
-;    unsigned char *src,
-;    int src_stride,
+;    unsigned char *above,
+;    unsigned char *left,
+;    int left_stride,
 ;    )
 global sym(vp8_intra_pred_uv_dcleft_mmx2)
 sym(vp8_intra_pred_uv_dcleft_mmx2):
     push        rbp
     mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 4
+    SHADOW_ARGS_TO_STACK 5
     push        rsi
     push        rdi
     ; end prolog
 
+    ;arg(2) not used
+
     ; from left
-    mov         rsi,        arg(2) ;src;
-    movsxd      rax,        dword ptr arg(3) ;src_stride;
-    dec         rsi
+    mov         rsi,        arg(3) ;left;
+    movsxd      rax,        dword ptr arg(4) ;left_stride;
     lea         rdi,        [rax*3]
     movzx       ecx,        byte [rsi]
     movzx       edx,        byte [rsi+rax]
@@ -310,17 +313,20 @@
 ;void vp8_intra_pred_uv_dc128_mmx(
 ;    unsigned char *dst,
 ;    int dst_stride
-;    unsigned char *src,
-;    int src_stride,
+;    unsigned char *above,
+;    unsigned char *left,
+;    int left_stride,
 ;    )
 global sym(vp8_intra_pred_uv_dc128_mmx)
 sym(vp8_intra_pred_uv_dc128_mmx):
     push        rbp
     mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 4
+    SHADOW_ARGS_TO_STACK 5
     GET_GOT     rbx
     ; end prolog
 
+    ;arg(2), arg(3), arg(4) not used
+
     ; write out
     movq        mm1,        [GLOBAL(dc_128)]
     mov         rax,        arg(0) ;dst;
@@ -346,15 +352,16 @@
 ;void vp8_intra_pred_uv_tm_sse2(
 ;    unsigned char *dst,
 ;    int dst_stride
-;    unsigned char *src,
-;    int src_stride,
+;    unsigned char *above,
+;    unsigned char *left,
+;    int left_stride,
 ;    )
 %macro vp8_intra_pred_uv_tm 1
 global sym(vp8_intra_pred_uv_tm_%1)
 sym(vp8_intra_pred_uv_tm_%1):
     push        rbp
     mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 4
+    SHADOW_ARGS_TO_STACK 5
     GET_GOT     rbx
     push        rsi
     push        rdi
@@ -362,9 +369,8 @@
 
     ; read top row
     mov         edx,        4
-    mov         rsi,        arg(2) ;src;
-    movsxd      rax,        dword ptr arg(3) ;src_stride;
-    sub         rsi,        rax
+    mov         rsi,        arg(2) ;above
+    movsxd      rax,        dword ptr arg(4) ;left_stride;
     pxor        xmm0,       xmm0
 %ifidn %1, ssse3
     movdqa      xmm2,       [GLOBAL(dc_1024)]
@@ -374,7 +380,7 @@
 
     ; set up left ptrs ans subtract topleft
     movd        xmm3,       [rsi-1]
-    lea         rsi,        [rsi+rax-1]
+    mov         rsi,        arg(3) ;left;
 %ifidn %1, sse2
     punpcklbw   xmm3,       xmm0
     pshuflw     xmm3,       xmm3, 0x0
@@ -427,20 +433,22 @@
 ;void vp8_intra_pred_uv_ve_mmx(
 ;    unsigned char *dst,
 ;    int dst_stride
-;    unsigned char *src,
-;    int src_stride,
+;    unsigned char *above,
+;    unsigned char *left,
+;    int left_stride,
 ;    )
 global sym(vp8_intra_pred_uv_ve_mmx)
 sym(vp8_intra_pred_uv_ve_mmx):
     push        rbp
     mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 4
+    SHADOW_ARGS_TO_STACK 5
     ; end prolog
 
+    ; arg(3), arg(4) not used
+
     ; read from top
     mov         rax,        arg(2) ;src;
-    movsxd      rdx,        dword ptr arg(3) ;src_stride;
-    sub         rax,        rdx
+
     movq        mm1,        [rax]
 
     ; write out
@@ -466,15 +474,16 @@
 ;void vp8_intra_pred_uv_ho_mmx2(
 ;    unsigned char *dst,
 ;    int dst_stride
-;    unsigned char *src,
-;    int src_stride,
+;    unsigned char *above,
+;    unsigned char *left,
+;    int left_stride
 ;    )
 %macro vp8_intra_pred_uv_ho 1
 global sym(vp8_intra_pred_uv_ho_%1)
 sym(vp8_intra_pred_uv_ho_%1):
     push        rbp
     mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 4
+    SHADOW_ARGS_TO_STACK 5
     push        rsi
     push        rdi
 %ifidn %1, ssse3
@@ -485,12 +494,14 @@
 %endif
     ; end prolog
 
+    ;arg(2) not used
+
     ; read from left and write out
 %ifidn %1, mmx2
     mov         edx,        4
 %endif
-    mov         rsi,        arg(2) ;src;
-    movsxd      rax,        dword ptr arg(3) ;src_stride;
+    mov         rsi,        arg(3) ;left
+    movsxd      rax,        dword ptr arg(4) ;left_stride;
     mov         rdi,        arg(0) ;dst;
     movsxd      rcx,        dword ptr arg(1) ;dst_stride
 %ifidn %1, ssse3
@@ -498,7 +509,7 @@
     movdqa      xmm2,       [GLOBAL(dc_00001111)]
     lea         rbx,        [rax*3]
 %endif
-    dec         rsi
+
 %ifidn %1, mmx2
 .vp8_intra_pred_uv_ho_%1_loop:
     movd        mm0,        [rsi]
@@ -562,38 +573,43 @@
 ;void vp8_intra_pred_y_dc_sse2(
 ;    unsigned char *dst,
 ;    int dst_stride
-;    unsigned char *src,
-;    int src_stride,
+;    unsigned char *above,
+;    unsigned char *left,
+;    int left_stride
 ;    )
 global sym(vp8_intra_pred_y_dc_sse2)
 sym(vp8_intra_pred_y_dc_sse2):
     push        rbp
     mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 4
+    SHADOW_ARGS_TO_STACK 5
     push        rsi
     push        rdi
     ; end prolog
 
     ; from top
-    mov         rsi,        arg(2) ;src;
-    movsxd      rax,        dword ptr arg(3) ;src_stride;
-    sub         rsi,        rax
+    mov         rdi,        arg(2) ;above
+    mov         rsi,        arg(3) ;left
+    movsxd      rax,        dword ptr arg(4) ;left_stride;
+
     pxor        xmm0,       xmm0
-    movdqa      xmm1,       [rsi]
+    movdqa      xmm1,       [rdi]
     psadbw      xmm1,       xmm0
     movq        xmm2,       xmm1
     punpckhqdq  xmm1,       xmm1
     paddw       xmm1,       xmm2
 
     ; from left
-    dec         rsi
     lea         rdi,        [rax*3]
-    movzx       ecx,        byte [rsi+rax]
+
+    movzx       ecx,        byte [rsi]
+    movzx       edx,        byte [rsi+rax]
+    add         ecx,        edx
     movzx       edx,        byte [rsi+rax*2]
     add         ecx,        edx
     movzx       edx,        byte [rsi+rdi]
     add         ecx,        edx
     lea         rsi,        [rsi+rax*4]
+
     movzx       edx,        byte [rsi]
     add         ecx,        edx
     movzx       edx,        byte [rsi+rax]
@@ -603,6 +619,7 @@
     movzx       edx,        byte [rsi+rdi]
     add         ecx,        edx
     lea         rsi,        [rsi+rax*4]
+
     movzx       edx,        byte [rsi]
     add         ecx,        edx
     movzx       edx,        byte [rsi+rax]
@@ -612,6 +629,7 @@
     movzx       edx,        byte [rsi+rdi]
     add         ecx,        edx
     lea         rsi,        [rsi+rax*4]
+
     movzx       edx,        byte [rsi]
     add         ecx,        edx
     movzx       edx,        byte [rsi+rax]
@@ -620,8 +638,6 @@
     add         ecx,        edx
     movzx       edx,        byte [rsi+rdi]
     add         ecx,        edx
-    movzx       edx,        byte [rsi+rax*4]
-    add         ecx,        edx
 
     ; add up
     pextrw      edx,        xmm1, 0x0
@@ -663,22 +679,23 @@
 ;void vp8_intra_pred_y_dctop_sse2(
 ;    unsigned char *dst,
 ;    int dst_stride
-;    unsigned char *src,
-;    int src_stride,
+;    unsigned char *above,
+;    unsigned char *left,
+;    int left_stride
 ;    )
 global sym(vp8_intra_pred_y_dctop_sse2)
 sym(vp8_intra_pred_y_dctop_sse2):
     push        rbp
     mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 4
+    SHADOW_ARGS_TO_STACK 5
     push        rsi
     GET_GOT     rbx
     ; end prolog
 
+    ;arg(3), arg(4) not used
+
     ; from top
-    mov         rcx,        arg(2) ;src;
-    movsxd      rax,        dword ptr arg(3) ;src_stride;
-    sub         rcx,        rax
+    mov         rcx,        arg(2) ;above;
     pxor        xmm0,       xmm0
     movdqa      xmm1,       [rcx]
     psadbw      xmm1,       xmm0
@@ -724,22 +741,25 @@
 ;void vp8_intra_pred_y_dcleft_sse2(
 ;    unsigned char *dst,
 ;    int dst_stride
-;    unsigned char *src,
-;    int src_stride,
+;    unsigned char *above,
+;    unsigned char *left,
+;    int left_stride
 ;    )
 global sym(vp8_intra_pred_y_dcleft_sse2)
 sym(vp8_intra_pred_y_dcleft_sse2):
     push        rbp
     mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 4
+    SHADOW_ARGS_TO_STACK 5
     push        rsi
     push        rdi
     ; end prolog
 
+    ;arg(2) not used
+
     ; from left
-    mov         rsi,        arg(2) ;src;
-    movsxd      rax,        dword ptr arg(3) ;src_stride;
-    dec         rsi
+    mov         rsi,        arg(3) ;left;
+    movsxd      rax,        dword ptr arg(4) ;left_stride;
+
     lea         rdi,        [rax*3]
     movzx       ecx,        byte [rsi]
     movzx       edx,        byte [rsi+rax]
@@ -814,18 +834,21 @@
 ;void vp8_intra_pred_y_dc128_sse2(
 ;    unsigned char *dst,
 ;    int dst_stride
-;    unsigned char *src,
-;    int src_stride,
+;    unsigned char *above,
+;    unsigned char *left,
+;    int left_stride
 ;    )
 global sym(vp8_intra_pred_y_dc128_sse2)
 sym(vp8_intra_pred_y_dc128_sse2):
     push        rbp
     mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 4
+    SHADOW_ARGS_TO_STACK 5
     push        rsi
     GET_GOT     rbx
     ; end prolog
 
+    ;arg(2), arg(3), arg(4) not used
+
     ; write out
     mov         rsi,        2
     movdqa      xmm1,       [GLOBAL(dc_128)]
@@ -857,15 +880,16 @@
 ;void vp8_intra_pred_y_tm_sse2(
 ;    unsigned char *dst,
 ;    int dst_stride
-;    unsigned char *src,
-;    int src_stride,
+;    unsigned char *above,
+;    unsigned char *left,
+;    int left_stride
 ;    )
 %macro vp8_intra_pred_y_tm 1
 global sym(vp8_intra_pred_y_tm_%1)
 sym(vp8_intra_pred_y_tm_%1):
     push        rbp
     mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 4
+    SHADOW_ARGS_TO_STACK 5
     push        rsi
     push        rdi
     GET_GOT     rbx
@@ -873,9 +897,8 @@
 
     ; read top row
     mov         edx,        8
-    mov         rsi,        arg(2) ;src;
-    movsxd      rax,        dword ptr arg(3) ;src_stride;
-    sub         rsi,        rax
+    mov         rsi,        arg(2) ;above
+    movsxd      rax,        dword ptr arg(4) ;left_stride;
     pxor        xmm0,       xmm0
 %ifidn %1, ssse3
     movdqa      xmm3,       [GLOBAL(dc_1024)]
@@ -887,7 +910,7 @@
 
     ; set up left ptrs ans subtract topleft
     movd        xmm4,       [rsi-1]
-    lea         rsi,        [rsi+rax-1]
+    mov         rsi,        arg(3) ;left
 %ifidn %1, sse2
     punpcklbw   xmm4,       xmm0
     pshuflw     xmm4,       xmm4, 0x0
@@ -945,27 +968,29 @@
 ;void vp8_intra_pred_y_ve_sse2(
 ;    unsigned char *dst,
 ;    int dst_stride
-;    unsigned char *src,
-;    int src_stride,
+;    unsigned char *above,
+;    unsigned char *left,
+;    int left_stride
 ;    )
 global sym(vp8_intra_pred_y_ve_sse2)
 sym(vp8_intra_pred_y_ve_sse2):
     push        rbp
     mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 4
+    SHADOW_ARGS_TO_STACK 5
     push        rsi
     ; end prolog
 
+    ;arg(3), arg(4) not used
+
+    mov         rax,        arg(2) ;above;
+    mov         rsi,        2
+    movsxd      rdx,        dword ptr arg(1) ;dst_stride
+
     ; read from top
-    mov         rax,        arg(2) ;src;
-    movsxd      rdx,        dword ptr arg(3) ;src_stride;
-    sub         rax,        rdx
     movdqa      xmm1,       [rax]
 
     ; write out
-    mov         rsi,        2
     mov         rax,        arg(0) ;dst;
-    movsxd      rdx,        dword ptr arg(1) ;dst_stride
     lea         rcx,        [rdx*3]
 
 .label
@@ -991,25 +1016,27 @@
 ;void vp8_intra_pred_y_ho_sse2(
 ;    unsigned char *dst,
 ;    int dst_stride
-;    unsigned char *src,
-;    int src_stride,
+;    unsigned char *above,
+;    unsigned char *left,
+;    int left_stride
 ;    )
 global sym(vp8_intra_pred_y_ho_sse2)
 sym(vp8_intra_pred_y_ho_sse2):
     push        rbp
     mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 4
+    SHADOW_ARGS_TO_STACK 5
     push        rsi
     push        rdi
     ; end prolog
 
+    ;arg(2) not used
+
     ; read from left and write out
     mov         edx,        8
-    mov         rsi,        arg(2) ;src;
-    movsxd      rax,        dword ptr arg(3) ;src_stride;
+    mov         rsi,        arg(3) ;left;
+    movsxd      rax,        dword ptr arg(4) ;left_stride;
     mov         rdi,        arg(0) ;dst;
     movsxd      rcx,        dword ptr arg(1) ;dst_stride
-    dec         rsi
 
 vp8_intra_pred_y_ho_sse2_loop:
     movd        xmm0,       [rsi]
diff --git a/vp8/common/x86/recon_wrapper_sse2.c b/vp8/common/x86/recon_wrapper_sse2.c
index cb9ab80..b482faa 100644
--- a/vp8/common/x86/recon_wrapper_sse2.c
+++ b/vp8/common/x86/recon_wrapper_sse2.c
@@ -15,7 +15,8 @@
 
 #define build_intra_predictors_mbuv_prototype(sym) \
     void sym(unsigned char *dst, int dst_stride, \
-             const unsigned char *src, int src_stride)
+             const unsigned char *above, \
+             const unsigned char *left, int left_stride)
 typedef build_intra_predictors_mbuv_prototype((*build_intra_predictors_mbuv_fn_t));
 
 extern build_intra_predictors_mbuv_prototype(vp8_intra_pred_uv_dc_mmx2);
@@ -29,15 +30,19 @@
 extern build_intra_predictors_mbuv_prototype(vp8_intra_pred_uv_tm_ssse3);
 
 static void vp8_build_intra_predictors_mbuv_x86(MACROBLOCKD *x,
+                                                unsigned char * uabove_row,
+                                                unsigned char * vabove_row,
                                                 unsigned char *dst_u,
                                                 unsigned char *dst_v,
                                                 int dst_stride,
+                                                unsigned char * uleft,
+                                                unsigned char * vleft,
+                                                int left_stride,
                                                 build_intra_predictors_mbuv_fn_t tm_func,
                                                 build_intra_predictors_mbuv_fn_t ho_func)
 {
     int mode = x->mode_info_context->mbmi.uv_mode;
     build_intra_predictors_mbuv_fn_t fn;
-    int src_stride = x->dst.uv_stride;
 
     switch (mode) {
         case  V_PRED: fn = vp8_intra_pred_uv_ve_mmx; break;
@@ -59,59 +64,78 @@
         default: return;
     }
 
-    fn(dst_u, dst_stride, x->dst.u_buffer, src_stride);
-    fn(dst_v, dst_stride, x->dst.v_buffer, src_stride);
+    fn(dst_u, dst_stride, uabove_row, uleft, left_stride);
+    fn(dst_v, dst_stride, vabove_row, vleft, left_stride);
 }
 
-void vp8_build_intra_predictors_mbuv_sse2(MACROBLOCKD *x)
+void vp8_build_intra_predictors_mbuv_s_sse2(MACROBLOCKD *x,
+                                            unsigned char * uabove_row,
+                                            unsigned char * vabove_row,
+                                            unsigned char * uleft,
+                                            unsigned char * vleft,
+                                            int left_stride,
+                                            unsigned char * upred_ptr,
+                                            unsigned char * vpred_ptr,
+                                            int pred_stride)
 {
-    vp8_build_intra_predictors_mbuv_x86(x, &x->predictor[256],
-                                        &x->predictor[320], 8,
+    vp8_build_intra_predictors_mbuv_x86(x,
+                                        uabove_row, vabove_row,
+                                        upred_ptr,
+                                        vpred_ptr, pred_stride,
+                                        uleft,
+                                        vleft,
+                                        left_stride,
                                         vp8_intra_pred_uv_tm_sse2,
                                         vp8_intra_pred_uv_ho_mmx2);
 }
 
-void vp8_build_intra_predictors_mbuv_ssse3(MACROBLOCKD *x)
+void vp8_build_intra_predictors_mbuv_s_ssse3(MACROBLOCKD *x,
+                                             unsigned char * uabove_row,
+                                             unsigned char * vabove_row,
+                                             unsigned char * uleft,
+                                             unsigned char * vleft,
+                                             int left_stride,
+                                             unsigned char * upred_ptr,
+                                             unsigned char * vpred_ptr,
+                                             int pred_stride)
 {
-    vp8_build_intra_predictors_mbuv_x86(x, &x->predictor[256],
-                                        &x->predictor[320], 8,
+    vp8_build_intra_predictors_mbuv_x86(x,
+                                        uabove_row, vabove_row,
+                                        upred_ptr,
+                                        vpred_ptr, pred_stride,
+                                        uleft,
+                                        vleft,
+                                        left_stride,
                                         vp8_intra_pred_uv_tm_ssse3,
                                         vp8_intra_pred_uv_ho_ssse3);
 }
 
-void vp8_build_intra_predictors_mbuv_s_sse2(MACROBLOCKD *x)
-{
-    vp8_build_intra_predictors_mbuv_x86(x, x->dst.u_buffer,
-                                        x->dst.v_buffer, x->dst.uv_stride,
-                                        vp8_intra_pred_uv_tm_sse2,
-                                        vp8_intra_pred_uv_ho_mmx2);
-}
+#define build_intra_predictors_mby_prototype(sym) \
+    void sym(unsigned char *dst, int dst_stride, \
+             const unsigned char *above, \
+             const unsigned char *left, int left_stride)
+typedef build_intra_predictors_mby_prototype((*build_intra_predictors_mby_fn_t));
 
-void vp8_build_intra_predictors_mbuv_s_ssse3(MACROBLOCKD *x)
-{
-    vp8_build_intra_predictors_mbuv_x86(x, x->dst.u_buffer,
-                                        x->dst.v_buffer, x->dst.uv_stride,
-                                        vp8_intra_pred_uv_tm_ssse3,
-                                        vp8_intra_pred_uv_ho_ssse3);
-}
-
-extern build_intra_predictors_mbuv_prototype(vp8_intra_pred_y_dc_sse2);
-extern build_intra_predictors_mbuv_prototype(vp8_intra_pred_y_dctop_sse2);
-extern build_intra_predictors_mbuv_prototype(vp8_intra_pred_y_dcleft_sse2);
-extern build_intra_predictors_mbuv_prototype(vp8_intra_pred_y_dc128_sse2);
-extern build_intra_predictors_mbuv_prototype(vp8_intra_pred_y_ho_sse2);
-extern build_intra_predictors_mbuv_prototype(vp8_intra_pred_y_ve_sse2);
-extern build_intra_predictors_mbuv_prototype(vp8_intra_pred_y_tm_sse2);
-extern build_intra_predictors_mbuv_prototype(vp8_intra_pred_y_tm_ssse3);
+extern build_intra_predictors_mby_prototype(vp8_intra_pred_y_dc_sse2);
+extern build_intra_predictors_mby_prototype(vp8_intra_pred_y_dctop_sse2);
+extern build_intra_predictors_mby_prototype(vp8_intra_pred_y_dcleft_sse2);
+extern build_intra_predictors_mby_prototype(vp8_intra_pred_y_dc128_sse2);
+extern build_intra_predictors_mby_prototype(vp8_intra_pred_y_ho_sse2);
+extern build_intra_predictors_mby_prototype(vp8_intra_pred_y_ve_sse2);
+extern build_intra_predictors_mby_prototype(vp8_intra_pred_y_tm_sse2);
+extern build_intra_predictors_mby_prototype(vp8_intra_pred_y_tm_ssse3);
 
 static void vp8_build_intra_predictors_mby_x86(MACROBLOCKD *x,
+                                               unsigned char * yabove_row,
                                                unsigned char *dst_y,
                                                int dst_stride,
-                                               build_intra_predictors_mbuv_fn_t tm_func)
+                                               unsigned char * yleft,
+                                               int left_stride,
+                                               build_intra_predictors_mby_fn_t tm_func)
 {
     int mode = x->mode_info_context->mbmi.mode;
-    build_intra_predictors_mbuv_fn_t fn;
-    int src_stride = x->dst.y_stride;
+    build_intra_predictors_mby_fn_t fn;  /* same type as tm_func and the vp8_intra_pred_y_* externs */
+
     switch (mode) {
         case  V_PRED: fn = vp8_intra_pred_y_ve_sse2; break;
         case  H_PRED: fn = vp8_intra_pred_y_ho_sse2; break;
@@ -132,31 +156,31 @@
         default: return;
     }
 
-    fn(dst_y, dst_stride, x->dst.y_buffer, src_stride);
+    fn(dst_y, dst_stride, yabove_row, yleft, left_stride);
     return;
 }
 
-void vp8_build_intra_predictors_mby_sse2(MACROBLOCKD *x)
+void vp8_build_intra_predictors_mby_s_sse2(MACROBLOCKD *x,
+                                           unsigned char * yabove_row,
+                                           unsigned char * yleft,
+                                           int left_stride,
+                                           unsigned char * ypred_ptr,
+                                           int y_stride)
 {
-    vp8_build_intra_predictors_mby_x86(x, x->predictor, 16,
+    vp8_build_intra_predictors_mby_x86(x, yabove_row, ypred_ptr,
+                                       y_stride, yleft, left_stride,
                                        vp8_intra_pred_y_tm_sse2);
 }
 
-void vp8_build_intra_predictors_mby_ssse3(MACROBLOCKD *x)
+void vp8_build_intra_predictors_mby_s_ssse3(MACROBLOCKD *x,
+                                            unsigned char * yabove_row,
+                                            unsigned char * yleft,
+                                            int left_stride,
+                                            unsigned char * ypred_ptr,
+                                            int y_stride)
 {
-    vp8_build_intra_predictors_mby_x86(x, x->predictor, 16,
-                                       vp8_intra_pred_y_tm_ssse3);
-}
-
-void vp8_build_intra_predictors_mby_s_sse2(MACROBLOCKD *x)
-{
-    vp8_build_intra_predictors_mby_x86(x, x->dst.y_buffer, x->dst.y_stride,
-                                       vp8_intra_pred_y_tm_sse2);
-}
-
-void vp8_build_intra_predictors_mby_s_ssse3(MACROBLOCKD *x)
-{
-    vp8_build_intra_predictors_mby_x86(x, x->dst.y_buffer, x->dst.y_stride,
+    vp8_build_intra_predictors_mby_x86(x, yabove_row, ypred_ptr,
+                                     y_stride, yleft, left_stride,
                                        vp8_intra_pred_y_tm_ssse3);
 
 }
diff --git a/vp8/decoder/decodframe.c b/vp8/decoder/decodframe.c
index 08a0c4b..f75e8ee 100644
--- a/vp8/decoder/decodframe.c
+++ b/vp8/decoder/decodframe.c
@@ -162,7 +162,8 @@
                                           xd->recon_left[1],
                                           xd->recon_left[2],
                                           xd->recon_left_stride[1],
-                                          xd->dst.u_buffer, xd->dst.v_buffer);
+                                          xd->dst.u_buffer, xd->dst.v_buffer,
+                                          xd->dst.uv_stride);
 
         if (mode != B_PRED)
         {
@@ -170,7 +171,8 @@
                                                  xd->recon_above[0],
                                                  xd->recon_left[0],
                                                  xd->recon_left_stride[0],
-                                                 xd->dst.y_buffer);
+                                                 xd->dst.y_buffer,
+                                                 xd->dst.y_stride);
         }
         else
         {
diff --git a/vp8/decoder/detokenize.c b/vp8/decoder/detokenize.c
index ba94c58..c5752ee 100644
--- a/vp8/decoder/detokenize.c
+++ b/vp8/decoder/detokenize.c
@@ -15,58 +15,6 @@
 #include "vpx_ports/mem.h"
 #include "detokenize.h"
 
-#define BOOL_DATA unsigned char
-
-#define OCB_X PREV_COEF_CONTEXTS * ENTROPY_NODES
-DECLARE_ALIGNED(16, static const unsigned char, coef_bands_x[16]) =
-{
-    0 * OCB_X, 1 * OCB_X, 2 * OCB_X, 3 * OCB_X,
-    6 * OCB_X, 4 * OCB_X, 5 * OCB_X, 6 * OCB_X,
-    6 * OCB_X, 6 * OCB_X, 6 * OCB_X, 6 * OCB_X,
-    6 * OCB_X, 6 * OCB_X, 6 * OCB_X, 7 * OCB_X
-};
-#define EOB_CONTEXT_NODE            0
-#define ZERO_CONTEXT_NODE           1
-#define ONE_CONTEXT_NODE            2
-#define LOW_VAL_CONTEXT_NODE        3
-#define TWO_CONTEXT_NODE            4
-#define THREE_CONTEXT_NODE          5
-#define HIGH_LOW_CONTEXT_NODE       6
-#define CAT_ONE_CONTEXT_NODE        7
-#define CAT_THREEFOUR_CONTEXT_NODE  8
-#define CAT_THREE_CONTEXT_NODE      9
-#define CAT_FIVE_CONTEXT_NODE       10
-
-#define CAT1_MIN_VAL    5
-#define CAT2_MIN_VAL    7
-#define CAT3_MIN_VAL   11
-#define CAT4_MIN_VAL   19
-#define CAT5_MIN_VAL   35
-#define CAT6_MIN_VAL   67
-
-#define CAT1_PROB0    159
-#define CAT2_PROB0    145
-#define CAT2_PROB1    165
-
-#define CAT3_PROB0 140
-#define CAT3_PROB1 148
-#define CAT3_PROB2 173
-
-#define CAT4_PROB0 135
-#define CAT4_PROB1 140
-#define CAT4_PROB2 155
-#define CAT4_PROB3 176
-
-#define CAT5_PROB0 130
-#define CAT5_PROB1 134
-#define CAT5_PROB2 141
-#define CAT5_PROB3 157
-#define CAT5_PROB4 180
-
-static const unsigned char cat6_prob[12] =
-{ 129, 130, 133, 140, 153, 177, 196, 230, 243, 254, 254, 0 };
-
-
 void vp8_reset_mb_tokens_context(MACROBLOCKD *x)
 {
     /* Clear entropy contexts for Y2 blocks */
@@ -83,302 +31,216 @@
     }
 }
 
-DECLARE_ALIGNED(16, extern const unsigned char, vp8_norm[256]);
-#define FILL \
-    if(count < 0) \
-        VP8DX_BOOL_DECODER_FILL(count, value, bufptr, bufend);
+/*
+    ------------------------------------------------------------------------------
+    Residual decoding (Paragraph 13.2 / 13.3)
+*/
+static const uint8_t kBands[16 + 1] = {
+  0, 1, 2, 3, 6, 4, 5, 6, 6, 6, 6, 6, 6, 6, 6, 7,
+  0  /* extra entry as sentinel */
+};
 
-#define NORMALIZE \
-    /*if(range < 0x80)*/                            \
-    { \
-        shift = vp8_norm[range]; \
-        range <<= shift; \
-        value <<= shift; \
-        count -= shift; \
+static const uint8_t kCat3[] = { 173, 148, 140, 0 };
+static const uint8_t kCat4[] = { 176, 155, 140, 135, 0 };
+static const uint8_t kCat5[] = { 180, 157, 141, 134, 130, 0 };
+static const uint8_t kCat6[] =
+  { 254, 254, 243, 230, 196, 177, 153, 140, 133, 130, 129, 0 };
+static const uint8_t* const kCat3456[] = { kCat3, kCat4, kCat5, kCat6 };
+static const uint8_t kZigzag[16] = {
+  0, 1, 4, 8,  5, 2, 3, 6,  9, 12, 13, 10,  7, 11, 14, 15
+};
+
+#define VP8GetBit vp8dx_decode_bool
+#define NUM_PROBAS  11
+#define NUM_CTX  3
+
+typedef const uint8_t (*ProbaArray)[NUM_CTX][NUM_PROBAS];  /* pointer to per-band [ctx][node] tables; for const-casting */
+
+static int GetSigned(BOOL_DECODER *br, int value_to_sign)
+{   /* decodes one even-odds bit: returns value_to_sign on 0, its negation on 1 */
+    int split = (br->range + 1) >> 1;  /* half the range: no probability is involved */
+    VP8_BD_VALUE bigsplit = (VP8_BD_VALUE)split << (VP8_BD_VALUE_SIZE - 8);
+    int v;
+
+    if(br->count < 0)  /* refill before value is compared against bigsplit */
+        vp8dx_bool_decoder_fill(br);
+
+    if ( br->value < bigsplit )
+    {
+        br->range = split;
+        v= value_to_sign;
     }
-
-#define DECODE_AND_APPLYSIGN(value_to_sign) \
-    split = (range + 1) >> 1; \
-    bigsplit = (VP8_BD_VALUE)split << (VP8_BD_VALUE_SIZE - 8); \
-    FILL \
-    if ( value < bigsplit ) \
-    { \
-        range = split; \
-        v= value_to_sign; \
-    } \
-    else \
-    { \
-        range = range-split; \
-        value = value-bigsplit; \
-        v = -value_to_sign; \
-    } \
-    range +=range;                   \
-    value +=value;                   \
-    count--;
-
-#define DECODE_AND_BRANCH_IF_ZERO(probability,branch) \
-    { \
-        split = 1 +  ((( probability*(range-1) ) )>> 8); \
-        bigsplit = (VP8_BD_VALUE)split << (VP8_BD_VALUE_SIZE - 8); \
-        FILL \
-        if ( value < bigsplit ) \
-        { \
-            range = split; \
-            NORMALIZE \
-            goto branch; \
-        } \
-        value -= bigsplit; \
-        range = range - split; \
-        NORMALIZE \
+    else
+    {
+        br->range = br->range-split;
+        br->value = br->value-bigsplit;
+        v = -value_to_sign;
     }
+    br->range +=br->range;  /* fixed one-bit renormalization: double range and value, consume one bit */
+    br->value +=br->value;
+    br->count--;
 
-#define DECODE_AND_LOOP_IF_ZERO(probability,branch) \
-    { \
-        split = 1 + ((( probability*(range-1) ) ) >> 8); \
-        bigsplit = (VP8_BD_VALUE)split << (VP8_BD_VALUE_SIZE - 8); \
-        FILL \
-        if ( value < bigsplit ) \
-        { \
-            range = split; \
-            NORMALIZE \
-            Prob = coef_probs; \
-            if(c<15) {\
-            ++c; \
-            Prob += coef_bands_x[c]; \
-            goto branch; \
-            } goto BLOCK_FINISHED; /*for malformed input */\
-        } \
-        value -= bigsplit; \
-        range = range - split; \
-        NORMALIZE \
+    return v;
+}
+/*
+   Returns the position of the last non-zero coeff plus one, 16 if the
+   scan ran to the last position, and 0 if there's no coeff at all.
+*/
+static int GetCoeffs(BOOL_DECODER *br, ProbaArray prob,
+                     int ctx, int n, int16_t* out)
+{   /* prob is indexed [band][ctx][node]; out[] is written through kZigzag */
+    const uint8_t* p = prob[n][ctx];  /* visible callers pass n = 0 or 1, where kBands[n] == n */
+    if (!VP8GetBit(br, p[0]))
+    {   /* first EOB is more a 'CBP' bit. */
+        return 0;
     }
+    while (1)
+    {
+        ++n;  /* n is now one past the scan position being decoded */
+        if (!VP8GetBit(br, p[1]))
+        {   /* zero coeff: next token uses ctx 0 */
+            p = prob[kBands[n]][0];
+        }
+        else
+        {  /* non zero coeff */
+            int v, j;
+            if (!VP8GetBit(br, p[2]))
+            {   /* magnitude 1: next token uses ctx 1 */
+                p = prob[kBands[n]][1];
+                v = 1;
+            }
+            else
+            {
+                if (!VP8GetBit(br, p[3]))
+                {
+                    if (!VP8GetBit(br, p[4]))
+                    {
+                        v = 2;
+                    }
+                    else
+                    {
+                        v = 3 + VP8GetBit(br, p[5]);
+                    }
+                }
+                else
+                {
+                    if (!VP8GetBit(br, p[6]))
+                    {
+                        if (!VP8GetBit(br, p[7]))
+                        {
+                            v = 5 + VP8GetBit(br, 159);  /* 5..6, one extra bit */
+                        } else
+                        {
+                            v = 7 + 2 * VP8GetBit(br, 165);  /* 7..10, two extra bits */
+                            v += VP8GetBit(br, 145);
+                        }
+                    }
+                    else
+                    {
+                        const uint8_t* tab;
+                        const int bit1 = VP8GetBit(br, p[8]);
+                        const int bit0 = VP8GetBit(br, p[9 + bit1]);
+                        const int cat = 2 * bit1 + bit0;
+                        v = 0;
+                        for (tab = kCat3456[cat]; *tab; ++tab)  /* tables are 0-terminated */
+                        {
+                            v += v + VP8GetBit(br, *tab);  /* extra bits, most significant first */
+                        }
+                        v += 3 + (8 << cat);  /* base value: 11, 19, 35 or 67 */
+                    }
+                }
+                p = prob[kBands[n]][2];  /* magnitude > 1: next token uses ctx 2 */
+            }
+            j = kZigzag[n - 1];  /* scan position -> index in out[] */
 
-#define DECODE_SIGN_WRITE_COEFF_AND_CHECK_EXIT(val) \
-    DECODE_AND_APPLYSIGN(val) \
-    Prob = coef_probs + (ENTROPY_NODES*2); \
-    if(c < 15){\
-        qcoeff_ptr [ scan[c] ] = (int16_t) v; \
-        ++c; \
-        goto DO_WHILE; }\
-    qcoeff_ptr [ 15 ] = (int16_t) v; \
-    goto BLOCK_FINISHED;
+            out[j] = GetSigned(br, v);
 
-
-#define DECODE_EXTRABIT_AND_ADJUST_VAL(prob, bits_count)\
-    split = 1 +  (((range-1) * prob) >> 8); \
-    bigsplit = (VP8_BD_VALUE)split << (VP8_BD_VALUE_SIZE - 8); \
-    FILL \
-    if(value >= bigsplit)\
-    {\
-        range = range-split;\
-        value = value-bigsplit;\
-        val += ((uint16_t)1<<bits_count);\
-    }\
-    else\
-    {\
-        range = split;\
-    }\
-    NORMALIZE
+            if (n == 16 || !VP8GetBit(br, p[0]))
+            {   /* EOB */
+                return n;
+            }
+        }
+        if (n == 16)
+        {   /* reached only after a zero coeff at the last position */
+            return 16;
+        }
+    }
+}
 
 int vp8_decode_mb_tokens(VP8D_COMP *dx, MACROBLOCKD *x)
 {
-    ENTROPY_CONTEXT *A = (ENTROPY_CONTEXT *)x->above_context;
-    ENTROPY_CONTEXT *L = (ENTROPY_CONTEXT *)x->left_context;
-    const FRAME_CONTEXT * const fc = &dx->common.fc;
-
     BOOL_DECODER *bc = x->current_bc;
-
+    const FRAME_CONTEXT * const fc = &dx->common.fc;
     char *eobs = x->eobs;
 
-    ENTROPY_CONTEXT *a;
-    ENTROPY_CONTEXT *l;
     int i;
-
+    int nonzeros;
     int eobtotal = 0;
 
-    register int count;
-
-    const BOOL_DATA *bufptr;
-    const BOOL_DATA *bufend;
-    register unsigned int range;
-    VP8_BD_VALUE value;
-    const int *scan;
-    register unsigned int shift;
-    unsigned int split;
-    VP8_BD_VALUE bigsplit;
     short *qcoeff_ptr;
+    ProbaArray coef_probs;
+    ENTROPY_CONTEXT *a_ctx = ((ENTROPY_CONTEXT *)x->above_context);
+    ENTROPY_CONTEXT *l_ctx = ((ENTROPY_CONTEXT *)x->left_context);
+    ENTROPY_CONTEXT *a;
+    ENTROPY_CONTEXT *l;
+    int skip_dc = 0;
 
-    const vp8_prob *coef_probs;
-    int stop;
-    int val, bits_count;
-    int c;
-    int v;
-    const vp8_prob *Prob;
-    int start_coeff;
-
-
-    i = 0;
-    stop = 16;
-
-    scan = vp8_default_zig_zag1d;
     qcoeff_ptr = &x->qcoeff[0];
-    coef_probs = fc->coef_probs [3] [ 0 ] [0];
 
     if (x->mode_info_context->mbmi.mode != B_PRED &&
         x->mode_info_context->mbmi.mode != SPLITMV)
     {
-        i = 24;
-        stop = 24;
-        qcoeff_ptr += 24*16;
-        eobtotal -= 16;
-        coef_probs = fc->coef_probs [1] [ 0 ] [0];
+        a = a_ctx + 8;
+        l = l_ctx + 8;
+
+        coef_probs = fc->coef_probs [1];
+
+        nonzeros = GetCoeffs(bc, coef_probs, (*a + *l), 0, qcoeff_ptr + 24 * 16);
+        *a = *l = (nonzeros > 0);
+
+        eobs[24] = nonzeros;
+        eobtotal += nonzeros - 16;
+
+        coef_probs = fc->coef_probs [0];
+        skip_dc = 1;
     }
-
-    bufend  = bc->user_buffer_end;
-    bufptr  = bc->user_buffer;
-    value   = bc->value;
-    count   = bc->count;
-    range   = bc->range;
-
-    start_coeff = 0;
-
-BLOCK_LOOP:
-    a = A + vp8_block2above[i];
-    l = L + vp8_block2left[i];
-
-    c = start_coeff;
-
-    VP8_COMBINEENTROPYCONTEXTS(v, *a, *l);
-
-    Prob = coef_probs;
-    Prob += v * ENTROPY_NODES;
-    *a = *l = 0;
-
-DO_WHILE:
-    Prob += coef_bands_x[c];
-    DECODE_AND_BRANCH_IF_ZERO(Prob[EOB_CONTEXT_NODE], BLOCK_FINISHED);
-    *a = *l = 1;
-
-CHECK_0_:
-    DECODE_AND_LOOP_IF_ZERO(Prob[ZERO_CONTEXT_NODE], CHECK_0_);
-    DECODE_AND_BRANCH_IF_ZERO(Prob[ONE_CONTEXT_NODE], ONE_CONTEXT_NODE_0_);
-    DECODE_AND_BRANCH_IF_ZERO(Prob[LOW_VAL_CONTEXT_NODE],
-                              LOW_VAL_CONTEXT_NODE_0_);
-    DECODE_AND_BRANCH_IF_ZERO(Prob[HIGH_LOW_CONTEXT_NODE],
-                              HIGH_LOW_CONTEXT_NODE_0_);
-    DECODE_AND_BRANCH_IF_ZERO(Prob[CAT_THREEFOUR_CONTEXT_NODE],
-                              CAT_THREEFOUR_CONTEXT_NODE_0_);
-    DECODE_AND_BRANCH_IF_ZERO(Prob[CAT_FIVE_CONTEXT_NODE],
-                              CAT_FIVE_CONTEXT_NODE_0_);
-
-    val = CAT6_MIN_VAL;
-    bits_count = 10;
-
-    do
+    else
     {
-        DECODE_EXTRABIT_AND_ADJUST_VAL(cat6_prob[bits_count], bits_count);
-        bits_count -- ;
+        coef_probs = fc->coef_probs [3];
+        skip_dc = 0;
     }
-    while (bits_count >= 0);
 
-    DECODE_SIGN_WRITE_COEFF_AND_CHECK_EXIT(val);
-
-CAT_FIVE_CONTEXT_NODE_0_:
-    val = CAT5_MIN_VAL;
-    DECODE_EXTRABIT_AND_ADJUST_VAL(CAT5_PROB4, 4);
-    DECODE_EXTRABIT_AND_ADJUST_VAL(CAT5_PROB3, 3);
-    DECODE_EXTRABIT_AND_ADJUST_VAL(CAT5_PROB2, 2);
-    DECODE_EXTRABIT_AND_ADJUST_VAL(CAT5_PROB1, 1);
-    DECODE_EXTRABIT_AND_ADJUST_VAL(CAT5_PROB0, 0);
-    DECODE_SIGN_WRITE_COEFF_AND_CHECK_EXIT(val);
-
-CAT_THREEFOUR_CONTEXT_NODE_0_:
-    DECODE_AND_BRANCH_IF_ZERO(Prob[CAT_THREE_CONTEXT_NODE],
-                              CAT_THREE_CONTEXT_NODE_0_);
-    val = CAT4_MIN_VAL;
-    DECODE_EXTRABIT_AND_ADJUST_VAL(CAT4_PROB3, 3);
-    DECODE_EXTRABIT_AND_ADJUST_VAL(CAT4_PROB2, 2);
-    DECODE_EXTRABIT_AND_ADJUST_VAL(CAT4_PROB1, 1);
-    DECODE_EXTRABIT_AND_ADJUST_VAL(CAT4_PROB0, 0);
-    DECODE_SIGN_WRITE_COEFF_AND_CHECK_EXIT(val);
-
-CAT_THREE_CONTEXT_NODE_0_:
-    val = CAT3_MIN_VAL;
-    DECODE_EXTRABIT_AND_ADJUST_VAL(CAT3_PROB2, 2);
-    DECODE_EXTRABIT_AND_ADJUST_VAL(CAT3_PROB1, 1);
-    DECODE_EXTRABIT_AND_ADJUST_VAL(CAT3_PROB0, 0);
-    DECODE_SIGN_WRITE_COEFF_AND_CHECK_EXIT(val);
-
-HIGH_LOW_CONTEXT_NODE_0_:
-    DECODE_AND_BRANCH_IF_ZERO(Prob[CAT_ONE_CONTEXT_NODE],
-                              CAT_ONE_CONTEXT_NODE_0_);
-
-    val = CAT2_MIN_VAL;
-    DECODE_EXTRABIT_AND_ADJUST_VAL(CAT2_PROB1, 1);
-    DECODE_EXTRABIT_AND_ADJUST_VAL(CAT2_PROB0, 0);
-    DECODE_SIGN_WRITE_COEFF_AND_CHECK_EXIT(val);
-
-CAT_ONE_CONTEXT_NODE_0_:
-    val = CAT1_MIN_VAL;
-    DECODE_EXTRABIT_AND_ADJUST_VAL(CAT1_PROB0, 0);
-    DECODE_SIGN_WRITE_COEFF_AND_CHECK_EXIT(val);
-
-LOW_VAL_CONTEXT_NODE_0_:
-    DECODE_AND_BRANCH_IF_ZERO(Prob[TWO_CONTEXT_NODE], TWO_CONTEXT_NODE_0_);
-    DECODE_AND_BRANCH_IF_ZERO(Prob[THREE_CONTEXT_NODE], THREE_CONTEXT_NODE_0_);
-    DECODE_SIGN_WRITE_COEFF_AND_CHECK_EXIT(4);
-
-THREE_CONTEXT_NODE_0_:
-    DECODE_SIGN_WRITE_COEFF_AND_CHECK_EXIT(3);
-
-TWO_CONTEXT_NODE_0_:
-    DECODE_SIGN_WRITE_COEFF_AND_CHECK_EXIT(2);
-
-ONE_CONTEXT_NODE_0_:
-    DECODE_AND_APPLYSIGN(1);
-    Prob = coef_probs + ENTROPY_NODES;
-
-    if (c < 15)
+    for (i = 0; i < 16; ++i)
     {
-        qcoeff_ptr [ scan[c] ] = (int16_t) v;
-        ++c;
-        goto DO_WHILE;
+        a = a_ctx + (i&3);
+        l = l_ctx + ((i&0xc)>>2);
+
+        nonzeros = GetCoeffs(bc, coef_probs, (*a + *l), skip_dc, qcoeff_ptr);
+        *a = *l = (nonzeros > 0);
+
+        nonzeros += skip_dc;
+        eobs[i] = nonzeros;
+        eobtotal += nonzeros;
+        qcoeff_ptr += 16;
     }
 
-    qcoeff_ptr [ 15 ] = (int16_t) v;
-BLOCK_FINISHED:
-    eobs[i] = c;
-    eobtotal += c;
-    qcoeff_ptr += 16;
+    coef_probs = fc->coef_probs [2];
 
-    i++;
-
-    if (i < stop)
-        goto BLOCK_LOOP;
-
-    if (i == 25)
+    a_ctx += 4;
+    l_ctx += 4;
+    for (i = 16; i < 24; ++i)
     {
-        start_coeff = 1;
-        i = 0;
-        stop = 16;
-        coef_probs = fc->coef_probs [0] [ 0 ] [0];
-        qcoeff_ptr -= (24*16 + 16);
-        goto BLOCK_LOOP;
+        a = a_ctx + ((i > 19)<<1) + (i&1);
+        l = l_ctx + ((i > 19)<<1) + ((i&3)>1);
+
+        nonzeros = GetCoeffs(bc, coef_probs, (*a + *l), 0, qcoeff_ptr);
+        *a = *l = (nonzeros > 0);
+
+        eobs[i] = nonzeros;
+        eobtotal += nonzeros;
+        qcoeff_ptr += 16;
     }
 
-    if (i == 16)
-    {
-        start_coeff = 0;
-        coef_probs = fc->coef_probs [2] [ 0 ] [0];
-        stop = 24;
-        goto BLOCK_LOOP;
-    }
-
-    FILL
-    bc->user_buffer = bufptr;
-    bc->value = value;
-    bc->count = count;
-    bc->range = range;
     return eobtotal;
-
 }
+
diff --git a/vp8/decoder/threading.c b/vp8/decoder/threading.c
index bc4450d..845228b 100644
--- a/vp8/decoder/threading.c
+++ b/vp8/decoder/threading.c
@@ -150,7 +150,8 @@
                                           xd->recon_left[1],
                                           xd->recon_left[2],
                                           xd->recon_left_stride[1],
-                                          xd->dst.u_buffer, xd->dst.v_buffer);
+                                          xd->dst.u_buffer, xd->dst.v_buffer,
+                                          xd->dst.uv_stride);
 
         if (mode != B_PRED)
         {
@@ -158,7 +159,8 @@
                                                  xd->recon_above[0],
                                                  xd->recon_left[0],
                                                  xd->recon_left_stride[0],
-                                                 xd->dst.y_buffer);
+                                                 xd->dst.y_buffer,
+                                                 xd->dst.y_stride);
         }
         else
         {
@@ -813,15 +815,15 @@
         /* Allocate memory for above_row buffers. */
         CHECK_MEM_ERROR(pbi->mt_yabove_row, vpx_malloc(sizeof(unsigned char *) * pc->mb_rows));
         for (i=0; i< pc->mb_rows; i++)
-            CHECK_MEM_ERROR(pbi->mt_yabove_row[i], vpx_calloc(sizeof(unsigned char) * (width + (VP8BORDERINPIXELS<<1)), 1));
+            CHECK_MEM_ERROR(pbi->mt_yabove_row[i], vpx_memalign(16,sizeof(unsigned char) * (width + (VP8BORDERINPIXELS<<1))));
 
         CHECK_MEM_ERROR(pbi->mt_uabove_row, vpx_malloc(sizeof(unsigned char *) * pc->mb_rows));
         for (i=0; i< pc->mb_rows; i++)
-            CHECK_MEM_ERROR(pbi->mt_uabove_row[i], vpx_calloc(sizeof(unsigned char) * (uv_width + VP8BORDERINPIXELS), 1));
+            CHECK_MEM_ERROR(pbi->mt_uabove_row[i], vpx_memalign(16,sizeof(unsigned char) * (uv_width + VP8BORDERINPIXELS)));
 
         CHECK_MEM_ERROR(pbi->mt_vabove_row, vpx_malloc(sizeof(unsigned char *) * pc->mb_rows));
         for (i=0; i< pc->mb_rows; i++)
-            CHECK_MEM_ERROR(pbi->mt_vabove_row[i], vpx_calloc(sizeof(unsigned char) * (uv_width + VP8BORDERINPIXELS), 1));
+            CHECK_MEM_ERROR(pbi->mt_vabove_row[i], vpx_memalign(16,sizeof(unsigned char) * (uv_width + VP8BORDERINPIXELS)));
 
         /* Allocate memory for left_col buffers. */
         CHECK_MEM_ERROR(pbi->mt_yleft_col, vpx_malloc(sizeof(unsigned char *) * pc->mb_rows));
diff --git a/vp8/encoder/arm/armv5te/boolhuff_armv5te.asm b/vp8/encoder/arm/armv5te/boolhuff_armv5te.asm
index 5b7e8f6..a644a00 100644
--- a/vp8/encoder/arm/armv5te/boolhuff_armv5te.asm
+++ b/vp8/encoder/arm/armv5te/boolhuff_armv5te.asm
@@ -47,7 +47,6 @@
     mvn     r2,  #23
     str     r12, [r0, #vp8_writer_lowvalue]
     str     r3,  [r0, #vp8_writer_range]
-    str     r12, [r0, #vp8_writer_value]
     str     r2,  [r0, #vp8_writer_count]
     str     r12, [r0, #vp8_writer_pos]
     str     r1,  [r0, #vp8_writer_buffer]
diff --git a/vp8/encoder/arm/armv5te/vp8_packtokens_partitions_armv5.asm b/vp8/encoder/arm/armv5te/vp8_packtokens_partitions_armv5.asm
index 3a183aa..90a98fe 100644
--- a/vp8/encoder/arm/armv5te/vp8_packtokens_partitions_armv5.asm
+++ b/vp8/encoder/arm/armv5te/vp8_packtokens_partitions_armv5.asm
@@ -90,7 +90,6 @@
     mov     r5, #255                    ; vp8_writer_range
     mvn     r3, #23                     ; vp8_writer_count
 
-    str     r2,  [r0, #vp8_writer_value]
     str     r2,  [r0, #vp8_writer_pos]
     str     r10, [r0, #vp8_writer_buffer]
 
diff --git a/vp8/encoder/asm_enc_offsets.c b/vp8/encoder/asm_enc_offsets.c
index 09ee6fb..a4169b3 100644
--- a/vp8/encoder/asm_enc_offsets.c
+++ b/vp8/encoder/asm_enc_offsets.c
@@ -45,7 +45,6 @@
 /* pack tokens */
 DEFINE(vp8_writer_lowvalue,                     offsetof(vp8_writer, lowvalue));
 DEFINE(vp8_writer_range,                        offsetof(vp8_writer, range));
-DEFINE(vp8_writer_value,                        offsetof(vp8_writer, value));
 DEFINE(vp8_writer_count,                        offsetof(vp8_writer, count));
 DEFINE(vp8_writer_pos,                          offsetof(vp8_writer, pos));
 DEFINE(vp8_writer_buffer,                       offsetof(vp8_writer, buffer));
diff --git a/vp8/encoder/boolhuff.c b/vp8/encoder/boolhuff.c
index d8ff5f9..74770a2 100644
--- a/vp8/encoder/boolhuff.c
+++ b/vp8/encoder/boolhuff.c
@@ -45,7 +45,6 @@
 
     br->lowvalue   = 0;
     br->range      = 255;
-    br->value      = 0;
     br->count      = -24;
     br->buffer     = source;
     br->buffer_end = source_end;
diff --git a/vp8/encoder/boolhuff.h b/vp8/encoder/boolhuff.h
index 569b779..fb6cbaf 100644
--- a/vp8/encoder/boolhuff.h
+++ b/vp8/encoder/boolhuff.h
@@ -26,7 +26,6 @@
 {
     unsigned int lowvalue;
     unsigned int range;
-    unsigned int value;
     int count;
     unsigned int pos;
     unsigned char *buffer;
diff --git a/vp8/encoder/encodeintra.c b/vp8/encoder/encodeintra.c
index f73bcc5..1f445b7 100644
--- a/vp8/encoder/encodeintra.c
+++ b/vp8/encoder/encodeintra.c
@@ -99,7 +99,8 @@
                                          xd->dst.y_buffer - xd->dst.y_stride,
                                          xd->dst.y_buffer - 1,
                                          xd->dst.y_stride,
-                                         xd->dst.y_buffer);
+                                         xd->dst.y_buffer,
+                                         xd->dst.y_stride);
 
     vp8_subtract_mby(x->src_diff, *(b->base_src),
         b->src_stride, xd->dst.y_buffer, xd->dst.y_stride);
@@ -121,7 +122,8 @@
                                       xd->dst.u_buffer - 1,
                                       xd->dst.v_buffer - 1,
                                       xd->dst.uv_stride,
-                                      xd->dst.u_buffer, xd->dst.v_buffer);
+                                      xd->dst.u_buffer, xd->dst.v_buffer,
+                                      xd->dst.uv_stride);
 
     vp8_subtract_mbuv(x->src_diff, x->src.u_buffer,
         x->src.v_buffer, x->src.uv_stride, xd->dst.u_buffer,
diff --git a/vp8/encoder/pickinter.c b/vp8/encoder/pickinter.c
index 24e041f..dafb645 100644
--- a/vp8/encoder/pickinter.c
+++ b/vp8/encoder/pickinter.c
@@ -735,8 +735,12 @@
         case V_PRED:
         case H_PRED:
         case TM_PRED:
-            vp8_build_intra_predictors_mby
-                (&x->e_mbd);
+            vp8_build_intra_predictors_mby_s(xd,
+                                             xd->dst.y_buffer - xd->dst.y_stride,
+                                             xd->dst.y_buffer - 1,
+                                             xd->dst.y_stride,
+                                             xd->predictor,
+                                             16);
             distortion2 = vp8_variance16x16
                                           (*(b->base_src), b->src_stride,
                                           x->e_mbd.predictor, 16, &sse);
@@ -1130,19 +1134,24 @@
     int this_rd;
     unsigned int sse;
     BLOCK *b = &x->block[0];
+    MACROBLOCKD *xd = &x->e_mbd;
 
-    x->e_mbd.mode_info_context->mbmi.ref_frame = INTRA_FRAME;
+    xd->mode_info_context->mbmi.ref_frame = INTRA_FRAME;
 
     pick_intra_mbuv_mode(x);
 
     for (mode = DC_PRED; mode <= TM_PRED; mode ++)
     {
-        x->e_mbd.mode_info_context->mbmi.mode = mode;
-        vp8_build_intra_predictors_mby
-            (&x->e_mbd);
+        xd->mode_info_context->mbmi.mode = mode;
+        vp8_build_intra_predictors_mby_s(xd,
+                                         xd->dst.y_buffer - xd->dst.y_stride,
+                                         xd->dst.y_buffer - 1,
+                                         xd->dst.y_stride,
+                                         xd->predictor,
+                                         16);
         distortion = vp8_variance16x16
-            (*(b->base_src), b->src_stride, x->e_mbd.predictor, 16, &sse);
-        rate = x->mbmode_cost[x->e_mbd.frame_type][mode];
+            (*(b->base_src), b->src_stride, xd->predictor, 16, &sse);
+        rate = x->mbmode_cost[xd->frame_type][mode];
         this_rd = RDCOST(x->rdmult, x->rddiv, rate, distortion);
 
         if (error16x16 > this_rd)
@@ -1153,13 +1162,13 @@
             best_rate = rate;
         }
     }
-    x->e_mbd.mode_info_context->mbmi.mode = best_mode;
+    xd->mode_info_context->mbmi.mode = best_mode;
 
     error4x4 = pick_intra4x4mby_modes(x, &rate,
                                       &best_sse);
     if (error4x4 < error16x16)
     {
-        x->e_mbd.mode_info_context->mbmi.mode = B_PRED;
+        xd->mode_info_context->mbmi.mode = B_PRED;
         best_rate = rate;
     }
 
diff --git a/vp8/encoder/rdopt.c b/vp8/encoder/rdopt.c
index 8f575e4..2b706ba 100644
--- a/vp8/encoder/rdopt.c
+++ b/vp8/encoder/rdopt.c
@@ -782,18 +782,23 @@
     int distortion;
     int best_rd = INT_MAX;
     int this_rd;
+    MACROBLOCKD *xd = &x->e_mbd;
 
     //Y Search for 16x16 intra prediction mode
     for (mode = DC_PRED; mode <= TM_PRED; mode++)
     {
-        x->e_mbd.mode_info_context->mbmi.mode = mode;
+        xd->mode_info_context->mbmi.mode = mode;
 
-        vp8_build_intra_predictors_mby
-            (&x->e_mbd);
+        vp8_build_intra_predictors_mby_s(xd,
+                                         xd->dst.y_buffer - xd->dst.y_stride,
+                                         xd->dst.y_buffer - 1,
+                                         xd->dst.y_stride,
+                                         xd->predictor,
+                                         16);
 
         macro_block_yrd(x, &ratey, &distortion);
-        rate = ratey + x->mbmode_cost[x->e_mbd.frame_type]
-                                     [x->e_mbd.mode_info_context->mbmi.mode];
+        rate = ratey + x->mbmode_cost[xd->frame_type]
+                                     [xd->mode_info_context->mbmi.mode];
 
         this_rd = RDCOST(x->rdmult, x->rddiv, rate, distortion);
 
@@ -807,7 +812,7 @@
         }
     }
 
-    x->e_mbd.mode_info_context->mbmi.mode = mode_selected;
+    xd->mode_info_context->mbmi.mode = mode_selected;
     return best_rd;
 }
 
@@ -875,6 +880,7 @@
     int best_rd = INT_MAX;
     int UNINITIALIZED_IS_SAFE(d), UNINITIALIZED_IS_SAFE(r);
     int rate_to;
+    MACROBLOCKD *xd = &x->e_mbd;
 
     for (mode = DC_PRED; mode <= TM_PRED; mode++)
     {
@@ -882,17 +888,26 @@
         int distortion;
         int this_rd;
 
-        x->e_mbd.mode_info_context->mbmi.uv_mode = mode;
-        vp8_build_intra_predictors_mbuv
-                     (&x->e_mbd);
+        xd->mode_info_context->mbmi.uv_mode = mode;
+
+        vp8_build_intra_predictors_mbuv_s(xd,
+                                          xd->dst.u_buffer - xd->dst.uv_stride,
+                                          xd->dst.v_buffer - xd->dst.uv_stride,
+                                          xd->dst.u_buffer - 1,
+                                          xd->dst.v_buffer - 1,
+                                          xd->dst.uv_stride,
+                                          &xd->predictor[256], &xd->predictor[320],
+                                          8);
+
+
         vp8_subtract_mbuv(x->src_diff,
                       x->src.u_buffer, x->src.v_buffer, x->src.uv_stride,
-                      &x->e_mbd.predictor[256], &x->e_mbd.predictor[320], 8);
+                      &xd->predictor[256], &xd->predictor[320], 8);
         vp8_transform_mbuv(x);
         vp8_quantize_mbuv(x);
 
         rate_to = rd_cost_mbuv(x);
-        rate = rate_to + x->intra_uv_mode_cost[x->e_mbd.frame_type][x->e_mbd.mode_info_context->mbmi.uv_mode];
+        rate = rate_to + x->intra_uv_mode_cost[xd->frame_type][xd->mode_info_context->mbmi.uv_mode];
 
         distortion = vp8_mbuverror(x) / 4;
 
@@ -911,7 +926,7 @@
     *rate = r;
     *distortion = d;
 
-    x->e_mbd.mode_info_context->mbmi.uv_mode = mode_selected;
+    xd->mode_info_context->mbmi.uv_mode = mode_selected;
 }
 
 int vp8_cost_mv_ref(MB_PREDICTION_MODE m, const int near_mv_ref_ct[4])
@@ -2157,8 +2172,13 @@
         {
             int distortion;
             x->e_mbd.mode_info_context->mbmi.ref_frame = INTRA_FRAME;
-            vp8_build_intra_predictors_mby
-                (&x->e_mbd);
+
+            vp8_build_intra_predictors_mby_s(xd,
+                                             xd->dst.y_buffer - xd->dst.y_stride,
+                                             xd->dst.y_buffer - 1,
+                                             xd->dst.y_stride,
+                                             xd->predictor,
+                                             16);
             macro_block_yrd(x, &rd.rate_y, &distortion) ;
             rd.rate2 += rd.rate_y;
             rd.distortion2 += distortion;
diff --git a/vpxenc.c b/vpxenc.c
index e8b8261..d89c075 100644
--- a/vpxenc.c
+++ b/vpxenc.c
@@ -2496,5 +2496,6 @@
 
     vpx_img_free(&raw);
     free(argv);
+    free(streams);
     return EXIT_SUCCESS;
 }