Merge "Replacing raster_block with block in the encoder."
diff --git a/build/make/ads2gas_apple.pl b/build/make/ads2gas_apple.pl
index 51e6fbc..fdafa98 100755
--- a/build/make/ads2gas_apple.pl
+++ b/build/make/ads2gas_apple.pl
@@ -188,7 +188,7 @@
         $trimmed =~ s/,//g;
 
         # string to array
-        @incoming_array = split(/ /, $trimmed);
+        @incoming_array = split(/\s+/, $trimmed);
 
         print ".macro @incoming_array[0]\n";
 
diff --git a/examples.mk b/examples.mk
index 2cee298..16f3c8f 100644
--- a/examples.mk
+++ b/examples.mk
@@ -41,6 +41,7 @@
 vpxenc.SRCS                 += vpx_ports/mem_ops.h
 vpxenc.SRCS                 += vpx_ports/mem_ops_aligned.h
 vpxenc.SRCS                 += vpx_ports/vpx_timer.h
+vpxenc.SRCS                 += vpxstats.c vpxstats.h
 vpxenc.SRCS                 += third_party/libmkv/EbmlIDs.h
 vpxenc.SRCS                 += third_party/libmkv/EbmlWriter.c
 vpxenc.SRCS                 += third_party/libmkv/EbmlWriter.h
diff --git a/test/android/Android.mk b/test/android/Android.mk
new file mode 100644
index 0000000..8d8ce16
--- /dev/null
+++ b/test/android/Android.mk
@@ -0,0 +1,63 @@
+# Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+#
+# Use of this source code is governed by a BSD-style license
+# that can be found in the LICENSE file in the root of the source
+# tree. An additional intellectual property rights grant can be found
+# in the file PATENTS.  All contributing project authors may
+# be found in the AUTHORS file in the root of the source tree.
+#
+# This makefile builds the vpx_test app for Android.
+# The test app itself runs on the command line through adb shell.
+# The paths are awkward because the libvpx makefile
+# expects to be invoked from a parent directory.
+# TODO(joshualitt)
+# Fix the Android makefiles so they can be built from anywhere; this will
+# require changing the libvpx makefile and this one.
+CUR_WD := $(call my-dir)
+BINDINGS_DIR := $(CUR_WD)/../../..
+LOCAL_PATH := $(CUR_WD)/../../..
+
+#libvpx
+include $(CLEAR_VARS)
+include $(BINDINGS_DIR)/libvpx/build/make/Android.mk
+# Restore path
+# TODO(joshualitt): fix the makefiles so this is no longer needed
+LOCAL_PATH := $(CUR_WD)/../..
+
+#libgtest
+include $(CLEAR_VARS)
+LOCAL_CPP_EXTENSION := .cc
+LOCAL_MODULE := gtest
+LOCAL_C_INCLUDES := $(LOCAL_PATH)/third_party/googletest/src/
+LOCAL_C_INCLUDES += $(LOCAL_PATH)/third_party/googletest/src/include/
+LOCAL_SRC_FILES := ./third_party/googletest/src/src/gtest-all.cc
+include $(BUILD_STATIC_LIBRARY)
+
+#libnestegg
+include $(CLEAR_VARS)
+LOCAL_CPP_EXTENSION := .cc
+LOCAL_MODULE := nestegg
+NESTEGG_PATH := $(LOCAL_PATH)/nestegg
+LOCAL_C_INCLUDES := $(NESTEGG_PATH)/include
+LOCAL_C_INCLUDES += $(LOCAL_PATH)/
+LOCAL_C_INCLUDES += $(NESTEGG_PATH)/halloc/
+LOCAL_SRC_FILES := ./nestegg/halloc/src/halloc.c
+LOCAL_SRC_FILES += ./nestegg/src/nestegg.c
+include $(BUILD_STATIC_LIBRARY)
+
+#libvpx_test
+include $(CLEAR_VARS)
+LOCAL_MODULE := libvpx_test
+LOCAL_STATIC_LIBRARIES := gtest
+LOCAL_STATIC_LIBRARIES += nestegg
+LOCAL_STATIC_LIBRARIES += cpufeatures
+LOCAL_SHARED_LIBRARIES := vpx
+LOCAL_C_INCLUDES := $(LOCAL_PATH)/
+LOCAL_C_INCLUDES += $(BINDINGS_DIR)/
+LOCAL_C_INCLUDES += $(LOCAL_PATH)/third_party/googletest/src/include
+LOCAL_SRC_FILES := ./args.c
+LOCAL_SRC_FILES += ./md5_utils.c
+LOCAL_SRC_FILES += ./test/decode_test_driver.cc
+LOCAL_SRC_FILES += ./test/test_libvpx.cc
+LOCAL_SRC_FILES += ./test/test_vector_test.cc
+include $(BUILD_EXECUTABLE)
diff --git a/test/android/README b/test/android/README
new file mode 100644
index 0000000..8bc1569
--- /dev/null
+++ b/test/android/README
@@ -0,0 +1,24 @@
+Android.mk builds the vpx unit tests on Android.
+1) Configure libvpx from the parent directory:
+./libvpx/configure --target=armv7-android-gcc --enable-external-build --enable-postproc --disable-install-srcs --enable-multi-res-encoding --enable-temporal-denoising --disable-unit-tests --disable-install-docs --disable-examples --disable-runtime-cpu-detect --sdk=$NDK
+
+2) From the parent directory, invoke ndk-build:
+NDK_PROJECT_PATH=. ndk-build APP_BUILD_SCRIPT=./libvpx/test/android/Android.mk APP_ABI=armeabi-v7a APP_PLATFORM=android-18 APP_OPTIM=release APP_STL=gnustl_static APP_CPPFLAGS=-frtti
+
+3) Run get_files.py to download the test files:
+python get_files.py -i /path/to/test-data.sha1 -o /path/to/put/files -u http://libvpx-test-file-url
+
+NOTE: The test files are currently hosted at http://downloads.webmproject.org/test_data/libvpx
+
+4) Transfer the files to the device using adb. Currently these files go in /data/local/tmp:
+
+adb push /path/to/test_files /data/local/tmp
+adb push /path/to/built_libs /data/local/tmp
+
+NOTE: built_libs defaults to parent_dir/libs/armeabi-v7a
+
+5) Run the tests:
+adb shell
+(on device)
+cd /data/local/tmp
+LD_LIBRARY_PATH=. ./vpx_test
diff --git a/test/android/get_files.py b/test/android/get_files.py
new file mode 100644
index 0000000..1c69740
--- /dev/null
+++ b/test/android/get_files.py
@@ -0,0 +1,118 @@
+# Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+#
+# Use of this source code is governed by a BSD-style license
+# that can be found in the LICENSE file in the root of the source
+# tree. An additional intellectual property rights grant can be found
+# in the file PATENTS.  All contributing project authors may
+# be found in the AUTHORS file in the root of the source tree.
+#
+# This simple script pulls test files from the webm homepage.
+# It only downloads a file if
+#   1) the file / test_data folder does not exist, or
+#   2) the SHA of the local file does not match
+
+import pycurl
+import csv
+import hashlib
+import re
+import os.path
+import time
+import itertools
+import sys
+import getopt
+
+# globals
+url = ''
+file_list_path = ''
+local_resource_path = ''
+
+# Helper functions:
+# Returns the SHA-1 hash of a file, in hex.
+def get_file_sha(filename):
+  try:
+    sha_hash = hashlib.sha1()
+    with open(filename, 'rb') as file:
+      buf = file.read(HASH_CHUNK)
+      while len(buf) > 0:
+        sha_hash.update(buf)
+        buf = file.read(HASH_CHUNK)
+      return sha_hash.hexdigest()
+  except IOError:
+    print "Error reading " + filename
+
+# Downloads a file from a url and checks the downloaded file's sha
+# against the passed-in sha.
+def download_and_check_sha(url, filename, sha):
+  path = os.path.join(local_resource_path, filename)
+  fp = open(path, "wb")
+  curl = pycurl.Curl()
+  curl.setopt(pycurl.URL, url + "/" + filename)
+  curl.setopt(pycurl.WRITEDATA, fp)
+  curl.perform()
+  curl.close()
+  fp.close()
+  return get_file_sha(path) == sha
+
+# constants
+ftp_retries = 3
+
+SHA_COL = 0
+NAME_COL = 1
+EXPECTED_COL = 2
+HASH_CHUNK = 65536
+
+# Main script
+try:
+  opts, args = \
+      getopt.getopt(sys.argv[1:], \
+                    "u:i:o:", ["url=", "input_csv=", "output_dir="])
+except:
+  print 'get_files.py -u <url> -i <input_csv> -o <output_dir>'
+  sys.exit(2)
+
+for opt, arg in opts:
+  if opt == '-u':
+    url = arg
+  elif opt in ("-i", "--input_csv"):
+    file_list_path = os.path.join(arg)
+  elif opt in ("-o", "--output_dir"):
+    local_resource_path = os.path.join(arg)
+
+if len(sys.argv) != 7:
+  print "Expects two paths and a url!"
+  exit(1)
+
+if not os.path.isdir(local_resource_path):
+  os.makedirs(local_resource_path)
+
+file_list_csv = open(file_list_path, "rb")
+
+# Our 'csv' file uses multiple spaces as a delimiter; python's
+# csv module only supports single-character delimiters, so we collapse them.
+file_list_reader = csv.reader((re.sub(' +', ' ', line) \
+    for line in file_list_csv), delimiter = ' ')
+
+file_shas = []
+file_names = []
+
+for row in file_list_reader:
+  if len(row) != EXPECTED_COL:
+      continue
+  file_shas.append(row[SHA_COL])
+  file_names.append(row[NAME_COL])
+
+file_list_csv.close()
+
+# Download each file only if it doesn't already exist with the correct sha
+for filename, sha in itertools.izip(file_names, file_shas):
+  path = os.path.join(local_resource_path, filename)
+  if os.path.isfile(path) \
+      and get_file_sha(path) == sha:
+    print path + ' exists, skipping'
+    continue
+  for retry in range(0, ftp_retries):
+    print "Downloading " + path
+    if not download_and_check_sha(url, filename, sha):
+      print "Sha does not match, retrying..."
+    else:
+      break
diff --git a/test/svc_test.cc b/test/svc_test.cc
index 5941cae..98a5d94 100644
--- a/test/svc_test.cc
+++ b/test/svc_test.cc
@@ -31,12 +31,16 @@
   SvcTest()
       : codec_iface_(0),
         test_file_name_("hantro_collage_w352h288.yuv"),
-        decoder_(0) {}
+        codec_initialized_(false),
+        decoder_(0) {
+    memset(&svc_, 0, sizeof(svc_));
+    memset(&codec_, 0, sizeof(codec_));
+    memset(&codec_enc_, 0, sizeof(codec_enc_));
+  }
 
   virtual ~SvcTest() {}
 
   virtual void SetUp() {
-    memset(&svc_, 0, sizeof(svc_));
     svc_.first_frame_full_size = 1;
     svc_.encoding_mode = INTER_LAYER_PREDICTION_IP;
     svc_.log_level = SVC_LOG_DEBUG;
@@ -61,6 +65,8 @@
 
   virtual void TearDown() {
     vpx_svc_release(&svc_);
+    delete(decoder_);
+    if (codec_initialized_) vpx_codec_destroy(&codec_);
   }
 
   SvcContext svc_;
@@ -68,22 +74,16 @@
   struct vpx_codec_enc_cfg codec_enc_;
   vpx_codec_iface_t *codec_iface_;
   std::string test_file_name_;
-
+  bool codec_initialized_;
   Decoder *decoder_;
 };
 
 TEST_F(SvcTest, SvcInit) {
-  svc_.spatial_layers = 0;  // use default layers
-  vpx_codec_err_t res = vpx_svc_init(&svc_, &codec_, codec_iface_, &codec_enc_);
-  EXPECT_EQ(VPX_CODEC_OK, res);
-  EXPECT_EQ(VPX_SS_DEFAULT_LAYERS, svc_.spatial_layers);
-
-  res = vpx_svc_init(NULL, &codec_, codec_iface_, &codec_enc_);
+  // test missing parameters
+  vpx_codec_err_t res = vpx_svc_init(NULL, &codec_, codec_iface_, &codec_enc_);
   EXPECT_EQ(VPX_CODEC_INVALID_PARAM, res);
-
   res = vpx_svc_init(&svc_, NULL, codec_iface_, &codec_enc_);
   EXPECT_EQ(VPX_CODEC_INVALID_PARAM, res);
-
   res = vpx_svc_init(&svc_, &codec_, NULL, &codec_enc_);
   EXPECT_EQ(VPX_CODEC_INVALID_PARAM, res);
 
@@ -94,58 +94,88 @@
   res = vpx_svc_init(&svc_, &codec_, codec_iface_, &codec_enc_);
   EXPECT_EQ(VPX_CODEC_INVALID_PARAM, res);
 
+  svc_.spatial_layers = 0;  // use default layers
+  res = vpx_svc_init(&svc_, &codec_, codec_iface_, &codec_enc_);
+  EXPECT_EQ(VPX_CODEC_OK, res);
+  codec_initialized_ = true;
+  EXPECT_EQ(VPX_SS_DEFAULT_LAYERS, svc_.spatial_layers);
+}
+
+TEST_F(SvcTest, InitTwoLayers) {
   svc_.spatial_layers = 2;
   vpx_svc_set_scale_factors(&svc_, "4/16,16*16");  // invalid scale values
-  res = vpx_svc_init(&svc_, &codec_, codec_iface_, &codec_enc_);
+  vpx_codec_err_t res = vpx_svc_init(&svc_, &codec_, codec_iface_, &codec_enc_);
   EXPECT_EQ(VPX_CODEC_INVALID_PARAM, res);
 
   vpx_svc_set_scale_factors(&svc_, "4/16,16/16");  // valid scale values
   res = vpx_svc_init(&svc_, &codec_, codec_iface_, &codec_enc_);
   EXPECT_EQ(VPX_CODEC_OK, res);
+  codec_initialized_ = true;
 }
 
-TEST_F(SvcTest, SetOptions) {
-  vpx_codec_err_t res = vpx_svc_set_options(NULL, "layers=3");
+TEST_F(SvcTest, InvalidOptions) {
+  vpx_codec_err_t res = vpx_svc_set_options(&svc_, NULL);
   EXPECT_EQ(VPX_CODEC_INVALID_PARAM, res);
 
-  vpx_svc_set_options(&svc_, NULL);
+  res = vpx_svc_set_options(&svc_, "not-an-option=1");
+  EXPECT_EQ(VPX_CODEC_OK, res);
+  res = vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_);
   EXPECT_EQ(VPX_CODEC_INVALID_PARAM, res);
+}
 
-  vpx_svc_set_options(&svc_, "layers=3");
+TEST_F(SvcTest, SetLayersOption) {
+  vpx_codec_err_t res = vpx_svc_set_options(&svc_, "layers=3");
+  EXPECT_EQ(VPX_CODEC_OK, res);
   res = vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_);
   EXPECT_EQ(VPX_CODEC_OK, res);
+  codec_initialized_ = true;
   EXPECT_EQ(3, svc_.spatial_layers);
+}
 
-  vpx_svc_set_options(&svc_, "not-an-option=1");
-  res = vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_);
-  EXPECT_EQ(VPX_CODEC_INVALID_PARAM, res);
-
-  vpx_svc_set_options(&svc_, "encoding-mode=alt-ip");
+TEST_F(SvcTest, SetEncodingMode) {
+  vpx_codec_err_t res = vpx_svc_set_options(&svc_, "encoding-mode=alt-ip");
+  EXPECT_EQ(VPX_CODEC_OK, res);
   res = vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_);
   EXPECT_EQ(VPX_CODEC_OK, res);
+  codec_initialized_ = true;
   EXPECT_EQ(ALT_INTER_LAYER_PREDICTION_IP, svc_.encoding_mode);
+}
 
-  vpx_svc_set_options(&svc_, "layers=2 encoding-mode=ip");
+TEST_F(SvcTest, SetMultipleOptions) {
+  vpx_codec_err_t res = vpx_svc_set_options(&svc_, "layers=2 encoding-mode=ip");
   res = vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_);
   EXPECT_EQ(VPX_CODEC_OK, res);
+  codec_initialized_ = true;
   EXPECT_EQ(2, svc_.spatial_layers);
   EXPECT_EQ(INTER_LAYER_PREDICTION_IP, svc_.encoding_mode);
+}
 
-  vpx_svc_set_options(&svc_, "scale-factors=not-scale-factors");
+TEST_F(SvcTest, SetScaleFactorsOption) {
+  svc_.spatial_layers = 2;
+  vpx_codec_err_t res =
+      vpx_svc_set_options(&svc_, "scale-factors=not-scale-factors");
+  EXPECT_EQ(VPX_CODEC_OK, res);
   res = vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_);
   EXPECT_EQ(VPX_CODEC_INVALID_PARAM, res);
 
-  vpx_svc_set_options(&svc_, "scale-factors=1/3,2/3");
+  res = vpx_svc_set_options(&svc_, "scale-factors=1/3,2/3");
+  EXPECT_EQ(VPX_CODEC_OK, res);
   res = vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_);
   EXPECT_EQ(VPX_CODEC_OK, res);
+  codec_initialized_ = true;
+}
 
-  vpx_svc_set_options(&svc_, "quantizers=not-quantizers");
+TEST_F(SvcTest, SetQuantizersOption) {
+  svc_.spatial_layers = 2;
+  vpx_codec_err_t res = vpx_svc_set_options(&svc_, "quantizers=not-quantizers");
+  EXPECT_EQ(VPX_CODEC_OK, res);
   res = vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_);
   EXPECT_EQ(VPX_CODEC_INVALID_PARAM, res);
 
   vpx_svc_set_options(&svc_, "quantizers=40,45");
   res = vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_);
   EXPECT_EQ(VPX_CODEC_OK, res);
+  codec_initialized_ = true;
 }
 
 TEST_F(SvcTest, SetQuantizers) {
@@ -157,15 +187,16 @@
 
   svc_.first_frame_full_size = 0;
   svc_.spatial_layers = 2;
-  res = vpx_svc_set_quantizers(&svc_, "40,30");
-  EXPECT_EQ(VPX_CODEC_OK, res);
-  res = vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_);
-  EXPECT_EQ(VPX_CODEC_OK, res);
-
   res = vpx_svc_set_quantizers(&svc_, "40");
   EXPECT_EQ(VPX_CODEC_OK, res);
   res = vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_);
   EXPECT_EQ(VPX_CODEC_INVALID_PARAM, res);
+
+  res = vpx_svc_set_quantizers(&svc_, "40,30");
+  EXPECT_EQ(VPX_CODEC_OK, res);
+  res = vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_);
+  EXPECT_EQ(VPX_CODEC_OK, res);
+  codec_initialized_ = true;
 }
 
 TEST_F(SvcTest, SetScaleFactors) {
@@ -177,15 +208,16 @@
 
   svc_.first_frame_full_size = 0;
   svc_.spatial_layers = 2;
-  res = vpx_svc_set_scale_factors(&svc_, "4/16,16/16");
-  EXPECT_EQ(VPX_CODEC_OK, res);
-  res = vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_);
-  EXPECT_EQ(VPX_CODEC_OK, res);
-
   res = vpx_svc_set_scale_factors(&svc_, "4/16");
   EXPECT_EQ(VPX_CODEC_OK, res);
   res = vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_);
   EXPECT_EQ(VPX_CODEC_INVALID_PARAM, res);
+
+  res = vpx_svc_set_scale_factors(&svc_, "4/16,16/16");
+  EXPECT_EQ(VPX_CODEC_OK, res);
+  res = vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_);
+  EXPECT_EQ(VPX_CODEC_OK, res);
+  codec_initialized_ = true;
 }
 
 // test that decoder can handle an SVC frame as the first frame in a sequence
@@ -200,6 +232,7 @@
   vpx_codec_err_t res =
       vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_);
   EXPECT_EQ(VPX_CODEC_OK, res);
+  codec_initialized_ = true;
 
   libvpx_test::I420VideoSource video(test_file_name_, kWidth, kHeight,
                                      codec_enc_.g_timebase.den,
@@ -227,6 +260,7 @@
   vpx_codec_err_t res =
       vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_);
   ASSERT_EQ(VPX_CODEC_OK, res);
+  codec_initialized_ = true;
 
   libvpx_test::I420VideoSource video(test_file_name_, kWidth, kHeight,
                                      codec_enc_.g_timebase.den,
@@ -280,6 +314,7 @@
   vpx_codec_err_t res =
       vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_);
   EXPECT_EQ(VPX_CODEC_OK, res);
+  codec_initialized_ = true;
 
   // ensure that requested layer is a valid layer
   uint32_t layer_width, layer_height;
diff --git a/test/test-data.sha1 b/test/test-data.sha1
index 44220d5..5229d09 100644
--- a/test/test-data.sha1
+++ b/test/test-data.sha1
@@ -538,3 +538,7 @@
 a481fbea465010b57af5a19ebf6d4a5cfe5b9278  vp90-2-08-tile_1x4_frame_parallel.webm.md5
 0203ec456277a01aec401e7fb6c72c9a7e5e3f9d  vp90-2-08-tile_1x4.webm
 c9b237dfcc01c1b414fbcaa481d014a906ef7998  vp90-2-08-tile_1x4.webm.md5
+20c75157e91ab41f82f70ffa73d5d01df8469287  vp90-2-08-tile-4x4.webm
+ae7451810247fd13975cc257aa0301ff17102255  vp90-2-08-tile-4x4.webm.md5
+2ec6e15422ac7a61af072dc5f27fcaf1942ce116  vp90-2-08-tile-4x1.webm
+0094f5ee5e46345017c30e0aa4835b550212d853  vp90-2-08-tile-4x1.webm.md5
diff --git a/test/test.mk b/test/test.mk
index f7a5d15..ac072d0 100644
--- a/test/test.mk
+++ b/test/test.mk
@@ -646,5 +646,9 @@
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-08-tile_1x4_frame_parallel.webm.md5
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-08-tile_1x4.webm
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-08-tile_1x4.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-08-tile-4x4.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-08-tile-4x4.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-08-tile-4x1.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-08-tile-4x1.webm.md5
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp91-2-04-yv444.webm
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp91-2-04-yv444.webm.md5
diff --git a/test/test_vector_test.cc b/test/test_vector_test.cc
index c6ad1c5..08449a5 100644
--- a/test/test_vector_test.cc
+++ b/test/test_vector_test.cc
@@ -164,6 +164,7 @@
   "vp90-2-07-frame_parallel.webm",
   "vp90-2-08-tile_1x2_frame_parallel.webm", "vp90-2-08-tile_1x2.webm",
   "vp90-2-08-tile_1x4_frame_parallel.webm", "vp90-2-08-tile_1x4.webm",
+  "vp90-2-08-tile-4x4.webm", "vp90-2-08-tile-4x1.webm",
 #if CONFIG_NON420
   "vp91-2-04-yv444.webm"
 #endif
diff --git a/vp9/common/arm/neon/vp9_short_idct32x32_add_neon.asm b/vp9/common/arm/neon/vp9_short_idct32x32_add_neon.asm
index f00d027..388a7d7 100644
--- a/vp9/common/arm/neon/vp9_short_idct32x32_add_neon.asm
+++ b/vp9/common/arm/neon/vp9_short_idct32x32_add_neon.asm
@@ -1145,7 +1145,7 @@
 
     ; pass loop processing
     add r5, r5, #1
-    B idct32_pass_loop
+    b idct32_pass_loop
 
 idct32_bands_end_2nd_pass
     STORE_COMBINE_CENTER_RESULTS
diff --git a/vp9/common/mips/dspr2/vp9_intrapred16_dspr2.c b/vp9/common/mips/dspr2/vp9_intrapred16_dspr2.c
new file mode 100644
index 0000000..b0dc496
--- /dev/null
+++ b/vp9/common/mips/dspr2/vp9_intrapred16_dspr2.c
@@ -0,0 +1,332 @@
+/*
+ *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+#include <stdlib.h>
+
+#include "./vp9_rtcd.h"
+#include "vp9/common/vp9_common.h"
+#include "vp9/common/mips/dspr2/vp9_common_dspr2.h"
+
+#if HAVE_DSPR2
+void vp9_h_predictor_16x16_dspr2(uint8_t *dst, ptrdiff_t stride,
+                                 const uint8_t *above, const uint8_t *left) {
+  int32_t  tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8;
+  int32_t  tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16;
+
+  __asm__ __volatile__ (
+      "lb         %[tmp1],      (%[left])                    \n\t"
+      "lb         %[tmp2],      1(%[left])                   \n\t"
+      "lb         %[tmp3],      2(%[left])                   \n\t"
+      "lb         %[tmp4],      3(%[left])                   \n\t"
+      "lb         %[tmp5],      4(%[left])                   \n\t"
+      "lb         %[tmp6],      5(%[left])                   \n\t"
+      "lb         %[tmp7],      6(%[left])                   \n\t"
+      "lb         %[tmp8],      7(%[left])                   \n\t"
+      "lb         %[tmp9],      8(%[left])                   \n\t"
+      "lb         %[tmp10],     9(%[left])                   \n\t"
+      "lb         %[tmp11],     10(%[left])                  \n\t"
+      "lb         %[tmp12],     11(%[left])                  \n\t"
+      "lb         %[tmp13],     12(%[left])                  \n\t"
+      "lb         %[tmp14],     13(%[left])                  \n\t"
+      "lb         %[tmp15],     14(%[left])                  \n\t"
+      "lb         %[tmp16],     15(%[left])                  \n\t"
+
+      "replv.qb   %[tmp1],      %[tmp1]                      \n\t"
+      "replv.qb   %[tmp2],      %[tmp2]                      \n\t"
+      "replv.qb   %[tmp3],      %[tmp3]                      \n\t"
+      "replv.qb   %[tmp4],      %[tmp4]                      \n\t"
+      "replv.qb   %[tmp5],      %[tmp5]                      \n\t"
+      "replv.qb   %[tmp6],      %[tmp6]                      \n\t"
+      "replv.qb   %[tmp7],      %[tmp7]                      \n\t"
+      "replv.qb   %[tmp8],      %[tmp8]                      \n\t"
+      "replv.qb   %[tmp9],      %[tmp9]                      \n\t"
+      "replv.qb   %[tmp10],     %[tmp10]                     \n\t"
+      "replv.qb   %[tmp11],     %[tmp11]                     \n\t"
+      "replv.qb   %[tmp12],     %[tmp12]                     \n\t"
+      "replv.qb   %[tmp13],     %[tmp13]                     \n\t"
+      "replv.qb   %[tmp14],     %[tmp14]                     \n\t"
+      "replv.qb   %[tmp15],     %[tmp15]                     \n\t"
+      "replv.qb   %[tmp16],     %[tmp16]                     \n\t"
+
+      "sw         %[tmp1],      (%[dst])                     \n\t"
+      "sw         %[tmp1],      4(%[dst])                    \n\t"
+      "sw         %[tmp1],      8(%[dst])                    \n\t"
+      "sw         %[tmp1],      12(%[dst])                   \n\t"
+
+      "add        %[dst],       %[dst],         %[stride]    \n\t"
+      "sw         %[tmp2],      (%[dst])                     \n\t"
+      "sw         %[tmp2],      4(%[dst])                    \n\t"
+      "sw         %[tmp2],      8(%[dst])                    \n\t"
+      "sw         %[tmp2],      12(%[dst])                   \n\t"
+
+      "add        %[dst],       %[dst],         %[stride]    \n\t"
+      "sw         %[tmp3],      (%[dst])                     \n\t"
+      "sw         %[tmp3],      4(%[dst])                    \n\t"
+      "sw         %[tmp3],      8(%[dst])                    \n\t"
+      "sw         %[tmp3],      12(%[dst])                   \n\t"
+
+      "add        %[dst],       %[dst],         %[stride]    \n\t"
+      "sw         %[tmp4],      (%[dst])                     \n\t"
+      "sw         %[tmp4],      4(%[dst])                    \n\t"
+      "sw         %[tmp4],      8(%[dst])                    \n\t"
+      "sw         %[tmp4],      12(%[dst])                   \n\t"
+
+      "add        %[dst],       %[dst],         %[stride]    \n\t"
+      "sw         %[tmp5],      (%[dst])                     \n\t"
+      "sw         %[tmp5],      4(%[dst])                    \n\t"
+      "sw         %[tmp5],      8(%[dst])                    \n\t"
+      "sw         %[tmp5],      12(%[dst])                   \n\t"
+
+      "add        %[dst],       %[dst],         %[stride]    \n\t"
+      "sw         %[tmp6],      (%[dst])                     \n\t"
+      "sw         %[tmp6],      4(%[dst])                    \n\t"
+      "sw         %[tmp6],      8(%[dst])                    \n\t"
+      "sw         %[tmp6],      12(%[dst])                   \n\t"
+
+      "add        %[dst],       %[dst],         %[stride]    \n\t"
+      "sw         %[tmp7],      (%[dst])                     \n\t"
+      "sw         %[tmp7],      4(%[dst])                    \n\t"
+      "sw         %[tmp7],      8(%[dst])                    \n\t"
+      "sw         %[tmp7],      12(%[dst])                   \n\t"
+
+      "add        %[dst],       %[dst],         %[stride]    \n\t"
+      "sw         %[tmp8],      (%[dst])                     \n\t"
+      "sw         %[tmp8],      4(%[dst])                    \n\t"
+      "sw         %[tmp8],      8(%[dst])                    \n\t"
+      "sw         %[tmp8],      12(%[dst])                   \n\t"
+
+      "add        %[dst],       %[dst],         %[stride]    \n\t"
+      "sw         %[tmp9],      (%[dst])                     \n\t"
+      "sw         %[tmp9],      4(%[dst])                    \n\t"
+      "sw         %[tmp9],      8(%[dst])                    \n\t"
+      "sw         %[tmp9],      12(%[dst])                   \n\t"
+
+      "add        %[dst],       %[dst],         %[stride]    \n\t"
+      "sw         %[tmp10],     (%[dst])                     \n\t"
+      "sw         %[tmp10],     4(%[dst])                    \n\t"
+      "sw         %[tmp10],     8(%[dst])                    \n\t"
+      "sw         %[tmp10],     12(%[dst])                   \n\t"
+
+      "add        %[dst],       %[dst],         %[stride]    \n\t"
+      "sw         %[tmp11],     (%[dst])                     \n\t"
+      "sw         %[tmp11],     4(%[dst])                    \n\t"
+      "sw         %[tmp11],     8(%[dst])                    \n\t"
+      "sw         %[tmp11],     12(%[dst])                   \n\t"
+
+      "add        %[dst],       %[dst],         %[stride]    \n\t"
+      "sw         %[tmp12],     (%[dst])                     \n\t"
+      "sw         %[tmp12],     4(%[dst])                    \n\t"
+      "sw         %[tmp12],     8(%[dst])                    \n\t"
+      "sw         %[tmp12],     12(%[dst])                   \n\t"
+
+      "add        %[dst],       %[dst],         %[stride]    \n\t"
+      "sw         %[tmp13],     (%[dst])                     \n\t"
+      "sw         %[tmp13],     4(%[dst])                    \n\t"
+      "sw         %[tmp13],     8(%[dst])                    \n\t"
+      "sw         %[tmp13],     12(%[dst])                   \n\t"
+
+      "add        %[dst],       %[dst],         %[stride]    \n\t"
+      "sw         %[tmp14],     (%[dst])                     \n\t"
+      "sw         %[tmp14],     4(%[dst])                    \n\t"
+      "sw         %[tmp14],     8(%[dst])                    \n\t"
+      "sw         %[tmp14],     12(%[dst])                   \n\t"
+
+      "add        %[dst],       %[dst],         %[stride]    \n\t"
+      "sw         %[tmp15],     (%[dst])                     \n\t"
+      "sw         %[tmp15],     4(%[dst])                    \n\t"
+      "sw         %[tmp15],     8(%[dst])                    \n\t"
+      "sw         %[tmp15],     12(%[dst])                   \n\t"
+
+      "add        %[dst],       %[dst],         %[stride]    \n\t"
+      "sw         %[tmp16],     (%[dst])                     \n\t"
+      "sw         %[tmp16],     4(%[dst])                    \n\t"
+      "sw         %[tmp16],     8(%[dst])                    \n\t"
+      "sw         %[tmp16],     12(%[dst])                   \n\t"
+
+      : [tmp1] "=&r" (tmp1),   [tmp2] "=&r" (tmp2),
+        [tmp3] "=&r" (tmp3),   [tmp4] "=&r" (tmp4),
+        [tmp5] "=&r" (tmp5),   [tmp7] "=&r" (tmp7),
+        [tmp6] "=&r" (tmp6),   [tmp8] "=&r" (tmp8),
+        [tmp9] "=&r" (tmp9),   [tmp10] "=&r" (tmp10),
+        [tmp11] "=&r" (tmp11), [tmp12] "=&r" (tmp12),
+        [tmp13] "=&r" (tmp13), [tmp14] "=&r" (tmp14),
+        [tmp15] "=&r" (tmp15), [tmp16] "=&r" (tmp16)
+      : [left] "r" (left), [dst] "r" (dst), [stride] "r" (stride)
+  );
+}
+
+void vp9_dc_predictor_16x16_dspr2(uint8_t *dst, ptrdiff_t stride,
+                                  const uint8_t *above, const uint8_t *left) {
+  int32_t  expected_dc;
+  int32_t  average;
+  int32_t  tmp, above1, above_l1, above_r1, left1, left_r1, left_l1;
+  int32_t  above2, left2;
+
+  __asm__ __volatile__ (
+      "lw              %[above1],           (%[above])                    \n\t"
+      "lw              %[above2],           4(%[above])                   \n\t"
+      "lw              %[left1],            (%[left])                     \n\t"
+      "lw              %[left2],            4(%[left])                    \n\t"
+
+      "preceu.ph.qbl   %[above_l1],         %[above1]                     \n\t"
+      "preceu.ph.qbr   %[above_r1],         %[above1]                     \n\t"
+      "preceu.ph.qbl   %[left_l1],          %[left1]                      \n\t"
+      "preceu.ph.qbr   %[left_r1],          %[left1]                      \n\t"
+
+      "addu.ph         %[average],          %[above_r1],     %[above_l1]  \n\t"
+      "addu.ph         %[average],          %[average],      %[left_l1]   \n\t"
+      "addu.ph         %[average],          %[average],      %[left_r1]   \n\t"
+
+      "preceu.ph.qbl   %[above_l1],         %[above2]                     \n\t"
+      "preceu.ph.qbr   %[above_r1],         %[above2]                     \n\t"
+      "preceu.ph.qbl   %[left_l1],          %[left2]                      \n\t"
+      "preceu.ph.qbr   %[left_r1],          %[left2]                      \n\t"
+
+      "addu.ph         %[average],          %[average],      %[above_l1]  \n\t"
+      "addu.ph         %[average],          %[average],      %[above_r1]  \n\t"
+      "addu.ph         %[average],          %[average],      %[left_l1]   \n\t"
+      "addu.ph         %[average],          %[average],      %[left_r1]   \n\t"
+
+      "lw              %[above1],           8(%[above])                   \n\t"
+      "lw              %[above2],           12(%[above])                  \n\t"
+      "lw              %[left1],            8(%[left])                    \n\t"
+      "lw              %[left2],            12(%[left])                   \n\t"
+
+      "preceu.ph.qbl   %[above_l1],         %[above1]                     \n\t"
+      "preceu.ph.qbr   %[above_r1],         %[above1]                     \n\t"
+      "preceu.ph.qbl   %[left_l1],          %[left1]                      \n\t"
+      "preceu.ph.qbr   %[left_r1],          %[left1]                      \n\t"
+
+      "addu.ph         %[average],          %[average],      %[above_l1]  \n\t"
+      "addu.ph         %[average],          %[average],      %[above_r1]  \n\t"
+      "addu.ph         %[average],          %[average],      %[left_l1]   \n\t"
+      "addu.ph         %[average],          %[average],      %[left_r1]   \n\t"
+
+      "preceu.ph.qbl   %[above_l1],         %[above2]                     \n\t"
+      "preceu.ph.qbr   %[above_r1],         %[above2]                     \n\t"
+      "preceu.ph.qbl   %[left_l1],          %[left2]                      \n\t"
+      "preceu.ph.qbr   %[left_r1],          %[left2]                      \n\t"
+
+      "addu.ph         %[average],          %[average],      %[above_l1]  \n\t"
+      "addu.ph         %[average],          %[average],      %[above_r1]  \n\t"
+      "addu.ph         %[average],          %[average],      %[left_l1]   \n\t"
+      "addu.ph         %[average],          %[average],      %[left_r1]   \n\t"
+
+      "addiu           %[average],          %[average],      16           \n\t"
+      "srl             %[tmp],              %[average],      16           \n\t"
+      "addu.ph         %[average],          %[tmp],          %[average]   \n\t"
+      "srl             %[expected_dc],      %[average],      5            \n\t"
+      "replv.qb        %[expected_dc],      %[expected_dc]                \n\t"
+
+      "sw              %[expected_dc],      (%[dst])                      \n\t"
+      "sw              %[expected_dc],      4(%[dst])                     \n\t"
+      "sw              %[expected_dc],      8(%[dst])                     \n\t"
+      "sw              %[expected_dc],      12(%[dst])                    \n\t"
+
+      "add             %[dst],              %[dst],          %[stride]    \n\t"
+      "sw              %[expected_dc],      (%[dst])                      \n\t"
+      "sw              %[expected_dc],      4(%[dst])                     \n\t"
+      "sw              %[expected_dc],      8(%[dst])                     \n\t"
+      "sw              %[expected_dc],      12(%[dst])                    \n\t"
+
+      "add             %[dst],              %[dst],          %[stride]    \n\t"
+      "sw              %[expected_dc],      (%[dst])                      \n\t"
+      "sw              %[expected_dc],      4(%[dst])                     \n\t"
+      "sw              %[expected_dc],      8(%[dst])                     \n\t"
+      "sw              %[expected_dc],      12(%[dst])                    \n\t"
+
+      "add             %[dst],              %[dst],          %[stride]    \n\t"
+      "sw              %[expected_dc],      (%[dst])                      \n\t"
+      "sw              %[expected_dc],      4(%[dst])                     \n\t"
+      "sw              %[expected_dc],      8(%[dst])                     \n\t"
+      "sw              %[expected_dc],      12(%[dst])                    \n\t"
+
+      "add             %[dst],              %[dst],          %[stride]    \n\t"
+      "sw              %[expected_dc],      (%[dst])                      \n\t"
+      "sw              %[expected_dc],      4(%[dst])                     \n\t"
+      "sw              %[expected_dc],      8(%[dst])                     \n\t"
+      "sw              %[expected_dc],      12(%[dst])                    \n\t"
+
+      "add             %[dst],              %[dst],          %[stride]    \n\t"
+      "sw              %[expected_dc],      (%[dst])                      \n\t"
+      "sw              %[expected_dc],      4(%[dst])                     \n\t"
+      "sw              %[expected_dc],      8(%[dst])                     \n\t"
+      "sw              %[expected_dc],      12(%[dst])                    \n\t"
+
+      "add             %[dst],              %[dst],          %[stride]    \n\t"
+      "sw              %[expected_dc],      (%[dst])                      \n\t"
+      "sw              %[expected_dc],      4(%[dst])                     \n\t"
+      "sw              %[expected_dc],      8(%[dst])                     \n\t"
+      "sw              %[expected_dc],      12(%[dst])                    \n\t"
+
+      "add             %[dst],              %[dst],          %[stride]    \n\t"
+      "sw              %[expected_dc],      (%[dst])                      \n\t"
+      "sw              %[expected_dc],      4(%[dst])                     \n\t"
+      "sw              %[expected_dc],      8(%[dst])                     \n\t"
+      "sw              %[expected_dc],      12(%[dst])                    \n\t"
+
+      "add             %[dst],              %[dst],          %[stride]    \n\t"
+      "sw              %[expected_dc],      (%[dst])                      \n\t"
+      "sw              %[expected_dc],      4(%[dst])                     \n\t"
+      "sw              %[expected_dc],      8(%[dst])                     \n\t"
+      "sw              %[expected_dc],      12(%[dst])                    \n\t"
+
+      "add             %[dst],              %[dst],          %[stride]    \n\t"
+      "sw              %[expected_dc],      (%[dst])                      \n\t"
+      "sw              %[expected_dc],      4(%[dst])                     \n\t"
+      "sw              %[expected_dc],      8(%[dst])                     \n\t"
+      "sw              %[expected_dc],      12(%[dst])                    \n\t"
+
+      "add             %[dst],              %[dst],          %[stride]    \n\t"
+      "sw              %[expected_dc],      (%[dst])                      \n\t"
+      "sw              %[expected_dc],      4(%[dst])                     \n\t"
+      "sw              %[expected_dc],      8(%[dst])                     \n\t"
+      "sw              %[expected_dc],      12(%[dst])                    \n\t"
+
+      "add             %[dst],              %[dst],          %[stride]    \n\t"
+      "sw              %[expected_dc],      (%[dst])                      \n\t"
+      "sw              %[expected_dc],      4(%[dst])                     \n\t"
+      "sw              %[expected_dc],      8(%[dst])                     \n\t"
+      "sw              %[expected_dc],      12(%[dst])                    \n\t"
+
+      "add             %[dst],              %[dst],          %[stride]    \n\t"
+      "sw              %[expected_dc],      (%[dst])                      \n\t"
+      "sw              %[expected_dc],      4(%[dst])                     \n\t"
+      "sw              %[expected_dc],      8(%[dst])                     \n\t"
+      "sw              %[expected_dc],      12(%[dst])                    \n\t"
+
+      "add             %[dst],              %[dst],          %[stride]    \n\t"
+      "sw              %[expected_dc],      (%[dst])                      \n\t"
+      "sw              %[expected_dc],      4(%[dst])                     \n\t"
+      "sw              %[expected_dc],      8(%[dst])                     \n\t"
+      "sw              %[expected_dc],      12(%[dst])                    \n\t"
+
+      "add             %[dst],              %[dst],          %[stride]    \n\t"
+      "sw              %[expected_dc],      (%[dst])                      \n\t"
+      "sw              %[expected_dc],      4(%[dst])                     \n\t"
+      "sw              %[expected_dc],      8(%[dst])                     \n\t"
+      "sw              %[expected_dc],      12(%[dst])                    \n\t"
+
+      "add             %[dst],              %[dst],          %[stride]    \n\t"
+      "sw              %[expected_dc],      (%[dst])                      \n\t"
+      "sw              %[expected_dc],      4(%[dst])                     \n\t"
+      "sw              %[expected_dc],      8(%[dst])                     \n\t"
+      "sw              %[expected_dc],      12(%[dst])                    \n\t"
+
+      : [left1] "=&r" (left1), [above1] "=&r" (above1),
+        [left_l1] "=&r" (left_l1), [above_l1] "=&r" (above_l1),
+        [left_r1] "=&r" (left_r1), [above_r1] "=&r" (above_r1),
+        [above2] "=&r" (above2), [left2] "=&r" (left2),
+        [average] "=&r" (average), [tmp] "=&r" (tmp),
+        [expected_dc] "=&r" (expected_dc)
+      : [above] "r" (above), [left] "r" (left),
+        [dst] "r" (dst), [stride] "r" (stride)
+  );
+}
+#endif  // #if HAVE_DSPR2
diff --git a/vp9/common/mips/dspr2/vp9_intrapred4_dspr2.c b/vp9/common/mips/dspr2/vp9_intrapred4_dspr2.c
new file mode 100644
index 0000000..a53c623
--- /dev/null
+++ b/vp9/common/mips/dspr2/vp9_intrapred4_dspr2.c
@@ -0,0 +1,232 @@
+/*
+ *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+#include <stdlib.h>
+
+#include "./vp9_rtcd.h"
+#include "vp9/common/vp9_common.h"
+#include "vp9/common/mips/dspr2/vp9_common_dspr2.h"
+
+#if HAVE_DSPR2
+void vp9_h_predictor_4x4_dspr2(uint8_t *dst, ptrdiff_t stride,
+                               const uint8_t *above, const uint8_t *left) {
+  int32_t  tmp1, tmp2, tmp3, tmp4;
+
+  __asm__ __volatile__ (
+      "lb         %[tmp1],      (%[left])                    \n\t"
+      "lb         %[tmp2],      1(%[left])                   \n\t"
+      "lb         %[tmp3],      2(%[left])                   \n\t"
+      "lb         %[tmp4],      3(%[left])                   \n\t"
+      "replv.qb   %[tmp1],      %[tmp1]                      \n\t"
+      "replv.qb   %[tmp2],      %[tmp2]                      \n\t"
+      "replv.qb   %[tmp3],      %[tmp3]                      \n\t"
+      "replv.qb   %[tmp4],      %[tmp4]                      \n\t"
+      "sw         %[tmp1],      (%[dst])                     \n\t"
+      "add        %[dst],       %[dst],         %[stride]    \n\t"
+      "sw         %[tmp2],      (%[dst])                     \n\t"
+      "add        %[dst],       %[dst],         %[stride]    \n\t"
+      "sw         %[tmp3],      (%[dst])                     \n\t"
+      "add        %[dst],       %[dst],         %[stride]    \n\t"
+      "sw         %[tmp4],      (%[dst])                     \n\t"
+
+      : [tmp1] "=&r" (tmp1), [tmp2] "=&r" (tmp2),
+        [tmp3] "=&r" (tmp3), [tmp4] "=&r" (tmp4)
+      : [left] "r" (left), [dst] "r" (dst), [stride] "r" (stride)
+  );
+}
+
+void vp9_dc_predictor_4x4_dspr2(uint8_t *dst, ptrdiff_t stride,
+                                const uint8_t *above, const uint8_t *left) {
+  int32_t  expected_dc;
+  int32_t  average;
+  int32_t  tmp, above_c, above_l, above_r, left_c, left_r, left_l;
+
+  __asm__ __volatile__ (
+      "lw              %[above_c],         (%[above])                    \n\t"
+      "lw              %[left_c],          (%[left])                     \n\t"
+
+      "preceu.ph.qbl   %[above_l],         %[above_c]                    \n\t"
+      "preceu.ph.qbr   %[above_r],         %[above_c]                    \n\t"
+      "preceu.ph.qbl   %[left_l],          %[left_c]                     \n\t"
+      "preceu.ph.qbr   %[left_r],          %[left_c]                     \n\t"
+
+      "addu.ph         %[average],         %[above_r],       %[above_l]  \n\t"
+      "addu.ph         %[average],         %[average],       %[left_l]   \n\t"
+      "addu.ph         %[average],         %[average],       %[left_r]   \n\t"
+      "addiu           %[average],         %[average],       4           \n\t"
+      "srl             %[tmp],             %[average],       16          \n\t"
+      "addu.ph         %[average],         %[tmp],           %[average]  \n\t"
+      "srl             %[expected_dc],     %[average],       3           \n\t"
+      "replv.qb        %[expected_dc],     %[expected_dc]                \n\t"
+
+      "sw              %[expected_dc],     (%[dst])                      \n\t"
+      "add             %[dst],              %[dst],          %[stride]   \n\t"
+      "sw              %[expected_dc],     (%[dst])                      \n\t"
+      "add             %[dst],              %[dst],          %[stride]   \n\t"
+      "sw              %[expected_dc],     (%[dst])                      \n\t"
+      "add             %[dst],              %[dst],          %[stride]   \n\t"
+      "sw              %[expected_dc],     (%[dst])                      \n\t"
+
+      : [above_c] "=&r" (above_c), [above_l] "=&r" (above_l),
+        [above_r] "=&r" (above_r), [left_c] "=&r" (left_c),
+        [left_l] "=&r" (left_l), [left_r] "=&r" (left_r),
+        [average] "=&r" (average), [tmp] "=&r" (tmp),
+        [expected_dc] "=&r" (expected_dc)
+      : [above] "r" (above), [left] "r" (left),
+        [dst] "r" (dst), [stride] "r" (stride)
+  );
+}
+
+void vp9_tm_predictor_4x4_dspr2(uint8_t *dst, ptrdiff_t stride,
+                                const uint8_t *above, const uint8_t *left) {
+  int32_t  abovel, abover;
+  int32_t  left0, left1, left2, left3;
+  int32_t  res0, res1;
+  int32_t  resl;
+  int32_t  resr;
+  int32_t  top_left;
+  uint8_t  *cm = vp9_ff_cropTbl;
+
+  __asm__ __volatile__ (
+      "ulw             %[resl],       (%[above])                         \n\t"
+
+      "lbu             %[left0],       (%[left])                         \n\t"
+      "lbu             %[left1],       1(%[left])                        \n\t"
+      "lbu             %[left2],       2(%[left])                        \n\t"
+      "lbu             %[left3],       3(%[left])                        \n\t"
+
+      "lbu             %[top_left],    -1(%[above])                      \n\t"
+
+      "preceu.ph.qbl   %[abovel],      %[resl]                           \n\t"
+      "preceu.ph.qbr   %[abover],      %[resl]                           \n\t"
+
+      "replv.ph        %[left0],       %[left0]                          \n\t"
+      "replv.ph        %[left1],       %[left1]                          \n\t"
+      "replv.ph        %[left2],       %[left2]                          \n\t"
+      "replv.ph        %[left3],       %[left3]                          \n\t"
+
+      "replv.ph        %[top_left],    %[top_left]                       \n\t"
+
+      "addu.ph         %[resl],        %[abovel],         %[left0]       \n\t"
+      "subu.ph         %[resl],        %[resl],           %[top_left]    \n\t"
+
+      "addu.ph         %[resr],        %[abover],         %[left0]       \n\t"
+      "subu.ph         %[resr],        %[resr],           %[top_left]    \n\t"
+
+      "sll             %[res0],        %[resr],           16             \n\t"
+      "sra             %[res0],        %[res0],           16             \n\t"
+      "lbux            %[res0],        %[res0](%[cm])                    \n\t"
+
+      "sra             %[res1],        %[resr],           16             \n\t"
+      "lbux            %[res1],        %[res1](%[cm])                    \n\t"
+      "sb              %[res0],        (%[dst])                          \n\t"
+
+      "sll             %[res0],        %[resl],           16             \n\t"
+      "sra             %[res0],        %[res0],           16             \n\t"
+      "lbux            %[res0],        %[res0](%[cm])                    \n\t"
+      "sb              %[res1],        1(%[dst])                         \n\t"
+
+      "sra             %[res1],        %[resl],           16             \n\t"
+      "lbux            %[res1],        %[res1](%[cm])                    \n\t"
+
+      "addu.ph         %[resl],        %[abovel],         %[left1]       \n\t"
+      "subu.ph         %[resl],        %[resl],           %[top_left]    \n\t"
+
+      "addu.ph         %[resr],        %[abover],         %[left1]       \n\t"
+      "subu.ph         %[resr],        %[resr],           %[top_left]    \n\t"
+
+      "sb              %[res0],        2(%[dst])                         \n\t"
+      "sb              %[res1],        3(%[dst])                         \n\t"
+
+      "add             %[dst],          %[dst],           %[stride]      \n\t"
+
+      "sll             %[res0],        %[resr],           16             \n\t"
+      "sra             %[res0],        %[res0],           16             \n\t"
+      "lbux            %[res0],        %[res0](%[cm])                    \n\t"
+
+      "sra             %[res1],        %[resr],           16             \n\t"
+      "lbux            %[res1],        %[res1](%[cm])                    \n\t"
+      "sb              %[res0],        (%[dst])                          \n\t"
+
+      "sll             %[res0],        %[resl],           16             \n\t"
+      "sra             %[res0],        %[res0],           16             \n\t"
+      "lbux            %[res0],        %[res0](%[cm])                    \n\t"
+
+      "sb              %[res1],        1(%[dst])                         \n\t"
+      "sra             %[res1],        %[resl],           16             \n\t"
+      "lbux            %[res1],        %[res1](%[cm])                    \n\t"
+
+      "addu.ph         %[resl],        %[abovel],         %[left2]       \n\t"
+      "subu.ph         %[resl],        %[resl],           %[top_left]    \n\t"
+
+      "addu.ph         %[resr],        %[abover],         %[left2]       \n\t"
+      "subu.ph         %[resr],        %[resr],           %[top_left]    \n\t"
+
+      "sb              %[res0],        2(%[dst])                         \n\t"
+      "sb              %[res1],        3(%[dst])                         \n\t"
+
+      "add             %[dst],          %[dst],           %[stride]      \n\t"
+
+      "sll             %[res0],        %[resr],           16             \n\t"
+      "sra             %[res0],        %[res0],           16             \n\t"
+      "lbux            %[res0],        %[res0](%[cm])                    \n\t"
+
+
+      "sra             %[res1],        %[resr],           16             \n\t"
+      "lbux            %[res1],        %[res1](%[cm])                    \n\t"
+      "sb              %[res0],        (%[dst])                          \n\t"
+
+      "sll             %[res0],        %[resl],           16             \n\t"
+      "sra             %[res0],        %[res0],           16             \n\t"
+      "lbux            %[res0],        %[res0](%[cm])                    \n\t"
+
+
+      "sb              %[res1],        1(%[dst])                         \n\t"
+      "sra             %[res1],        %[resl],           16             \n\t"
+      "lbux            %[res1],        %[res1](%[cm])                    \n\t"
+
+      "addu.ph         %[resl],        %[abovel],        %[left3]        \n\t"
+      "subu.ph         %[resl],        %[resl],          %[top_left]     \n\t"
+
+      "addu.ph         %[resr],        %[abover],        %[left3]        \n\t"
+      "subu.ph         %[resr],        %[resr],          %[top_left]     \n\t"
+
+      "sb              %[res0],        2(%[dst])                         \n\t"
+      "sb              %[res1],        3(%[dst])                         \n\t"
+
+      "add             %[dst],          %[dst],          %[stride]       \n\t"
+
+      "sll             %[res0],        %[resr],           16             \n\t"
+      "sra             %[res0],        %[res0],           16             \n\t"
+      "lbux            %[res0],        %[res0](%[cm])                    \n\t"
+
+      "sra             %[res1],        %[resr],           16             \n\t"
+      "lbux            %[res1],        %[res1](%[cm])                    \n\t"
+      "sb              %[res0],        (%[dst])                          \n\t"
+
+      "sll             %[res0],        %[resl],           16             \n\t"
+      "sra             %[res0],        %[res0],           16             \n\t"
+      "lbux            %[res0],        %[res0](%[cm])                    \n\t"
+      "sb              %[res1],        1(%[dst])                         \n\t"
+
+      "sra             %[res1],        %[resl],           16             \n\t"
+      "lbux            %[res1],        %[res1](%[cm])                    \n\t"
+
+      "sb              %[res0],        2(%[dst])                         \n\t"
+      "sb              %[res1],        3(%[dst])                         \n\t"
+
+      : [abovel] "=&r" (abovel), [abover] "=&r" (abover),
+        [left0] "=&r" (left0), [left1] "=&r" (left1), [left2] "=&r" (left2),
+        [res0] "=&r" (res0), [res1] "=&r" (res1), [left3] "=&r" (left3),
+        [resl] "=&r" (resl), [resr] "=&r" (resr), [top_left] "=&r" (top_left)
+      : [above] "r" (above), [left] "r" (left),
+        [dst] "r" (dst), [stride] "r" (stride), [cm] "r" (cm)
+  );
+}
+#endif  // #if HAVE_DSPR2
diff --git a/vp9/common/mips/dspr2/vp9_intrapred8_dspr2.c b/vp9/common/mips/dspr2/vp9_intrapred8_dspr2.c
new file mode 100644
index 0000000..40d93ae
--- /dev/null
+++ b/vp9/common/mips/dspr2/vp9_intrapred8_dspr2.c
@@ -0,0 +1,610 @@
+/*
+ *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+#include <stdlib.h>
+
+#include "./vp9_rtcd.h"
+#include "vp9/common/vp9_common.h"
+#include "vp9/common/mips/dspr2/vp9_common_dspr2.h"
+
+#if HAVE_DSPR2
+void vp9_h_predictor_8x8_dspr2(uint8_t *dst, ptrdiff_t stride,
+                               const uint8_t *above, const uint8_t *left) {
+  int32_t  tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8;
+
+  __asm__ __volatile__ (
+      "lb         %[tmp1],      (%[left])                   \n\t"
+      "lb         %[tmp2],      1(%[left])                  \n\t"
+      "lb         %[tmp3],      2(%[left])                  \n\t"
+      "lb         %[tmp4],      3(%[left])                  \n\t"
+      "lb         %[tmp5],      4(%[left])                  \n\t"
+      "lb         %[tmp6],      5(%[left])                  \n\t"
+      "lb         %[tmp7],      6(%[left])                  \n\t"
+      "lb         %[tmp8],      7(%[left])                  \n\t"
+
+      "replv.qb   %[tmp1],      %[tmp1]                     \n\t"
+      "replv.qb   %[tmp2],      %[tmp2]                     \n\t"
+      "replv.qb   %[tmp3],      %[tmp3]                     \n\t"
+      "replv.qb   %[tmp4],      %[tmp4]                     \n\t"
+      "replv.qb   %[tmp5],      %[tmp5]                     \n\t"
+      "replv.qb   %[tmp6],      %[tmp6]                     \n\t"
+      "replv.qb   %[tmp7],      %[tmp7]                     \n\t"
+      "replv.qb   %[tmp8],      %[tmp8]                     \n\t"
+
+      "sw         %[tmp1],      (%[dst])                    \n\t"
+      "sw         %[tmp1],      4(%[dst])                   \n\t"
+      "add        %[dst],       %[dst],         %[stride]   \n\t"
+      "sw         %[tmp2],      (%[dst])                    \n\t"
+      "sw         %[tmp2],      4(%[dst])                   \n\t"
+      "add        %[dst],       %[dst],         %[stride]   \n\t"
+      "sw         %[tmp3],      (%[dst])                    \n\t"
+      "sw         %[tmp3],      4(%[dst])                   \n\t"
+      "add        %[dst],       %[dst],         %[stride]   \n\t"
+      "sw         %[tmp4],      (%[dst])                    \n\t"
+      "sw         %[tmp4],      4(%[dst])                   \n\t"
+      "add        %[dst],       %[dst],         %[stride]   \n\t"
+      "sw         %[tmp5],      (%[dst])                    \n\t"
+      "sw         %[tmp5],      4(%[dst])                   \n\t"
+      "add        %[dst],       %[dst],         %[stride]   \n\t"
+      "sw         %[tmp6],      (%[dst])                    \n\t"
+      "sw         %[tmp6],      4(%[dst])                   \n\t"
+      "add        %[dst],       %[dst],         %[stride]   \n\t"
+      "sw         %[tmp7],      (%[dst])                    \n\t"
+      "sw         %[tmp7],      4(%[dst])                   \n\t"
+      "add        %[dst],       %[dst],         %[stride]   \n\t"
+      "sw         %[tmp8],      (%[dst])                    \n\t"
+      "sw         %[tmp8],      4(%[dst])                   \n\t"
+
+      : [tmp1] "=&r" (tmp1), [tmp2] "=&r" (tmp2),
+        [tmp3] "=&r" (tmp3), [tmp4] "=&r" (tmp4),
+        [tmp5] "=&r" (tmp5), [tmp7] "=&r" (tmp7),
+        [tmp6] "=&r" (tmp6), [tmp8] "=&r" (tmp8)
+      : [left] "r" (left), [dst] "r" (dst),
+        [stride] "r" (stride)
+  );
+}
+
+void vp9_dc_predictor_8x8_dspr2(uint8_t *dst, ptrdiff_t stride,
+                                const uint8_t *above, const uint8_t *left) {
+  int32_t  expected_dc;
+  int32_t  average;
+  int32_t  tmp, above1, above_l1, above_r1, left1, left_r1, left_l1;
+  int32_t  above2, above_l2, above_r2, left2, left_r2, left_l2;
+
+  __asm__ __volatile__ (
+      "lw              %[above1],         (%[above])                      \n\t"
+      "lw              %[above2],         4(%[above])                     \n\t"
+      "lw              %[left1],          (%[left])                       \n\t"
+      "lw              %[left2],          4(%[left])                      \n\t"
+
+      "preceu.ph.qbl   %[above_l1],       %[above1]                       \n\t"
+      "preceu.ph.qbr   %[above_r1],       %[above1]                       \n\t"
+      "preceu.ph.qbl   %[left_l1],        %[left1]                        \n\t"
+      "preceu.ph.qbr   %[left_r1],        %[left1]                        \n\t"
+
+      "preceu.ph.qbl   %[above_l2],       %[above2]                       \n\t"
+      "preceu.ph.qbr   %[above_r2],       %[above2]                       \n\t"
+      "preceu.ph.qbl   %[left_l2],        %[left2]                        \n\t"
+      "preceu.ph.qbr   %[left_r2],        %[left2]                        \n\t"
+
+      "addu.ph         %[average],        %[above_r1],      %[above_l1]   \n\t"
+      "addu.ph         %[average],        %[average],       %[left_l1]    \n\t"
+      "addu.ph         %[average],        %[average],       %[left_r1]    \n\t"
+
+      "addu.ph         %[average],        %[average],       %[above_l2]   \n\t"
+      "addu.ph         %[average],        %[average],       %[above_r2]   \n\t"
+      "addu.ph         %[average],        %[average],       %[left_l2]    \n\t"
+      "addu.ph         %[average],        %[average],       %[left_r2]    \n\t"
+
+      "addiu           %[average],        %[average],       8             \n\t"
+
+      "srl             %[tmp],            %[average],       16            \n\t"
+      "addu.ph         %[average],        %[tmp],           %[average]    \n\t"
+      "srl             %[expected_dc],    %[average],       4             \n\t"
+      "replv.qb        %[expected_dc],    %[expected_dc]                  \n\t"
+
+      "sw              %[expected_dc],    (%[dst])                        \n\t"
+      "sw              %[expected_dc],    4(%[dst])                       \n\t"
+
+      "add             %[dst],             %[dst],          %[stride]     \n\t"
+      "sw              %[expected_dc],    (%[dst])                        \n\t"
+      "sw              %[expected_dc],    4(%[dst])                       \n\t"
+
+      "add             %[dst],             %[dst],          %[stride]     \n\t"
+      "sw              %[expected_dc],    (%[dst])                        \n\t"
+      "sw              %[expected_dc],    4(%[dst])                       \n\t"
+
+      "add             %[dst],             %[dst],          %[stride]     \n\t"
+      "sw              %[expected_dc],    (%[dst])                        \n\t"
+      "sw              %[expected_dc],    4(%[dst])                       \n\t"
+
+      "add             %[dst],             %[dst],          %[stride]     \n\t"
+      "sw              %[expected_dc],    (%[dst])                        \n\t"
+      "sw              %[expected_dc],    4(%[dst])                       \n\t"
+
+      "add             %[dst],             %[dst],          %[stride]     \n\t"
+      "sw              %[expected_dc],    (%[dst])                        \n\t"
+      "sw              %[expected_dc],    4(%[dst])                       \n\t"
+
+      "add             %[dst],             %[dst],          %[stride]     \n\t"
+      "sw              %[expected_dc],    (%[dst])                        \n\t"
+      "sw              %[expected_dc],    4(%[dst])                       \n\t"
+
+      "add             %[dst],             %[dst],          %[stride]     \n\t"
+      "sw              %[expected_dc],    (%[dst])                        \n\t"
+      "sw              %[expected_dc],    4(%[dst])                       \n\t"
+
+      : [above1] "=&r" (above1), [above_l1] "=&r" (above_l1),
+        [above_r1] "=&r" (above_r1), [left1] "=&r" (left1),
+        [left_l1] "=&r" (left_l1), [left_r1] "=&r" (left_r1),
+        [above2] "=&r" (above2), [above_l2] "=&r" (above_l2),
+        [above_r2] "=&r" (above_r2), [left2] "=&r" (left2),
+        [left_l2] "=&r" (left_l2), [left_r2] "=&r" (left_r2),
+        [average] "=&r" (average), [tmp] "=&r" (tmp),
+        [expected_dc] "=&r" (expected_dc)
+      : [above] "r" (above), [left] "r" (left), [dst] "r" (dst),
+        [stride] "r" (stride)
+  );
+}
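The DC predictor above sums the eight above and eight left neighbours in packed halfwords, folds the two halfword sums together, adds a rounding constant of 8, shifts right by 4, and replicates the result across the block. The scalar equivalent, as an illustrative sketch (helper name is not part of the patch; assumes <string.h>):

    /* Scalar sketch of the 8x8 DC predictor: rounded average of the
       8 above and 8 left neighbours, replicated across the block. */
    static void dc_predictor_8x8_sketch(uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *above,
                                        const uint8_t *left) {
      int i, r, sum = 0;
      for (i = 0; i < 8; i++)
        sum += above[i] + left[i];
      for (r = 0; r < 8; r++) {
        memset(dst, (sum + 8) >> 4, 8);  /* rounded average, replicated */
        dst += stride;
      }
    }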
+
+void vp9_tm_predictor_8x8_dspr2(uint8_t *dst, ptrdiff_t stride,
+                                const uint8_t *above, const uint8_t *left) {
+  int32_t   abovel, abover;
+  int32_t   abovel_1, abover_1;
+  int32_t   left0;
+  int32_t   res0, res1, res2, res3;
+  int32_t   reshw;
+  int32_t   top_left;
+  uint8_t   *cm = vp9_ff_cropTbl;
+
+  __asm__ __volatile__ (
+      "ulw             %[reshw],       (%[above])                         \n\t"
+      "ulw             %[top_left],    4(%[above])                        \n\t"
+
+      "lbu             %[left0],       (%[left])                          \n\t"
+
+      "preceu.ph.qbl   %[abovel],      %[reshw]                           \n\t"
+      "preceu.ph.qbr   %[abover],      %[reshw]                           \n\t"
+      "preceu.ph.qbl   %[abovel_1],    %[top_left]                        \n\t"
+      "preceu.ph.qbr   %[abover_1],    %[top_left]                        \n\t"
+
+      "lbu             %[top_left],    -1(%[above])                       \n\t"
+      "replv.ph        %[left0],       %[left0]                           \n\t"
+
+      "replv.ph        %[top_left],    %[top_left]                        \n\t"
+
+      "addu.ph         %[reshw],       %[abovel],           %[left0]      \n\t"
+      "subu.ph         %[reshw],       %[reshw],            %[top_left]   \n\t"
+
+      "sll             %[res2],        %[reshw],            16            \n\t"
+      "sra             %[res2],        %[res2],             16            \n\t"
+      "sra             %[res3],        %[reshw],            16            \n\t"
+
+      "addu.ph         %[reshw],       %[abover],           %[left0]      \n\t"
+      "subu.ph         %[reshw],       %[reshw],            %[top_left]   \n\t"
+
+      "sll             %[res0],        %[reshw],            16            \n\t"
+      "sra             %[res0],        %[res0],             16            \n\t"
+      "sra             %[res1],        %[reshw],            16            \n\t"
+
+      "lbux            %[res0],        %[res0](%[cm])                     \n\t"
+      "lbux            %[res1],        %[res1](%[cm])                     \n\t"
+      "lbux            %[res2],        %[res2](%[cm])                     \n\t"
+      "lbux            %[res3],        %[res3](%[cm])                     \n\t"
+
+      "sb              %[res0],        (%[dst])                           \n\t"
+      "sb              %[res1],        1(%[dst])                          \n\t"
+      "sb              %[res2],        2(%[dst])                          \n\t"
+      "sb              %[res3],        3(%[dst])                          \n\t"
+
+      "addu.ph         %[reshw],       %[abovel_1],         %[left0]      \n\t"
+      "subu.ph         %[reshw],       %[reshw],            %[top_left]   \n\t"
+
+      "sll             %[res2],        %[reshw],            16            \n\t"
+      "sra             %[res2],        %[res2],             16            \n\t"
+      "sra             %[res3],        %[reshw],            16            \n\t"
+
+      "addu.ph         %[reshw],       %[abover_1],         %[left0]      \n\t"
+      "subu.ph         %[reshw],       %[reshw],            %[top_left]   \n\t"
+
+      "sll             %[res0],        %[reshw],            16            \n\t"
+      "sra             %[res0],        %[res0],             16            \n\t"
+      "sra             %[res1],        %[reshw],            16            \n\t"
+
+      "lbu             %[left0],       1(%[left])                         \n\t"
+
+      "lbux            %[res0],        %[res0](%[cm])                     \n\t"
+      "lbux            %[res1],        %[res1](%[cm])                     \n\t"
+      "lbux            %[res2],        %[res2](%[cm])                     \n\t"
+      "lbux            %[res3],        %[res3](%[cm])                     \n\t"
+
+      "sb              %[res0],        4(%[dst])                          \n\t"
+      "sb              %[res1],        5(%[dst])                          \n\t"
+      "sb              %[res2],        6(%[dst])                          \n\t"
+      "sb              %[res3],        7(%[dst])                          \n\t"
+
+      "replv.ph        %[left0],       %[left0]                           \n\t"
+      "add             %[dst],          %[dst],             %[stride]     \n\t"
+
+      "addu.ph         %[reshw],       %[abovel],           %[left0]      \n\t"
+      "subu.ph         %[reshw],       %[reshw],            %[top_left]   \n\t"
+
+      "sll             %[res2],        %[reshw],            16            \n\t"
+      "sra             %[res2],        %[res2],             16            \n\t"
+      "sra             %[res3],        %[reshw],            16            \n\t"
+
+      "addu.ph         %[reshw],       %[abover],           %[left0]      \n\t"
+      "subu.ph         %[reshw],       %[reshw],            %[top_left]   \n\t"
+
+      "sll             %[res0],        %[reshw],            16            \n\t"
+      "sra             %[res0],        %[res0],             16            \n\t"
+      "sra             %[res1],        %[reshw],            16            \n\t"
+
+      "lbux            %[res0],        %[res0](%[cm])                     \n\t"
+      "lbux            %[res1],        %[res1](%[cm])                     \n\t"
+      "lbux            %[res2],        %[res2](%[cm])                     \n\t"
+      "lbux            %[res3],        %[res3](%[cm])                     \n\t"
+
+      "sb              %[res0],        (%[dst])                           \n\t"
+      "sb              %[res1],        1(%[dst])                          \n\t"
+      "sb              %[res2],        2(%[dst])                          \n\t"
+      "sb              %[res3],        3(%[dst])                          \n\t"
+
+      "addu.ph         %[reshw],       %[abovel_1],         %[left0]      \n\t"
+      "subu.ph         %[reshw],       %[reshw],            %[top_left]   \n\t"
+
+      "sll             %[res2],        %[reshw],            16            \n\t"
+      "sra             %[res2],        %[res2],             16            \n\t"
+      "sra             %[res3],        %[reshw],            16            \n\t"
+
+      "addu.ph         %[reshw],       %[abover_1],         %[left0]      \n\t"
+      "subu.ph         %[reshw],       %[reshw],            %[top_left]   \n\t"
+
+      "sll             %[res0],        %[reshw],            16            \n\t"
+      "sra             %[res0],        %[res0],             16            \n\t"
+      "sra             %[res1],        %[reshw],            16            \n\t"
+
+      "lbu             %[left0],       2(%[left])                         \n\t"
+
+      "lbux            %[res0],        %[res0](%[cm])                     \n\t"
+      "lbux            %[res1],        %[res1](%[cm])                     \n\t"
+      "lbux            %[res2],        %[res2](%[cm])                     \n\t"
+      "lbux            %[res3],        %[res3](%[cm])                     \n\t"
+
+      "sb              %[res0],        4(%[dst])                          \n\t"
+      "sb              %[res1],        5(%[dst])                          \n\t"
+      "sb              %[res2],        6(%[dst])                          \n\t"
+      "sb              %[res3],        7(%[dst])                          \n\t"
+
+      "replv.ph        %[left0],       %[left0]                           \n\t"
+      "add             %[dst],          %[dst],             %[stride]     \n\t"
+
+      "addu.ph         %[reshw],       %[abovel],           %[left0]      \n\t"
+      "subu.ph         %[reshw],       %[reshw],            %[top_left]   \n\t"
+
+      "sll             %[res2],        %[reshw],            16            \n\t"
+      "sra             %[res2],        %[res2],             16            \n\t"
+      "sra             %[res3],        %[reshw],            16            \n\t"
+
+      "addu.ph         %[reshw],       %[abover],           %[left0]      \n\t"
+      "subu.ph         %[reshw],       %[reshw],            %[top_left]   \n\t"
+
+      "sll             %[res0],        %[reshw],            16            \n\t"
+      "sra             %[res0],        %[res0],             16            \n\t"
+      "sra             %[res1],        %[reshw],            16            \n\t"
+
+      "lbux            %[res0],        %[res0](%[cm])                     \n\t"
+      "lbux            %[res1],        %[res1](%[cm])                     \n\t"
+      "lbux            %[res2],        %[res2](%[cm])                     \n\t"
+      "lbux            %[res3],        %[res3](%[cm])                     \n\t"
+
+      "sb              %[res0],        (%[dst])                           \n\t"
+      "sb              %[res1],        1(%[dst])                          \n\t"
+      "sb              %[res2],        2(%[dst])                          \n\t"
+      "sb              %[res3],        3(%[dst])                          \n\t"
+
+      "addu.ph         %[reshw],       %[abovel_1],         %[left0]      \n\t"
+      "subu.ph         %[reshw],       %[reshw],            %[top_left]   \n\t"
+
+      "sll             %[res2],        %[reshw],            16            \n\t"
+      "sra             %[res2],        %[res2],             16            \n\t"
+      "sra             %[res3],        %[reshw],            16            \n\t"
+
+      "addu.ph         %[reshw],       %[abover_1],         %[left0]      \n\t"
+      "subu.ph         %[reshw],       %[reshw],            %[top_left]   \n\t"
+
+      "sll             %[res0],        %[reshw],            16            \n\t"
+      "sra             %[res0],        %[res0],             16            \n\t"
+      "sra             %[res1],        %[reshw],            16            \n\t"
+
+      "lbu             %[left0],       3(%[left])                         \n\t"
+
+      "lbux            %[res0],        %[res0](%[cm])                     \n\t"
+      "lbux            %[res1],        %[res1](%[cm])                     \n\t"
+      "lbux            %[res2],        %[res2](%[cm])                     \n\t"
+      "lbux            %[res3],        %[res3](%[cm])                     \n\t"
+
+      "sb              %[res0],        4(%[dst])                          \n\t"
+      "sb              %[res1],        5(%[dst])                          \n\t"
+      "sb              %[res2],        6(%[dst])                          \n\t"
+      "sb              %[res3],        7(%[dst])                          \n\t"
+
+      "replv.ph        %[left0],       %[left0]                           \n\t"
+      "add             %[dst],          %[dst],             %[stride]     \n\t"
+
+      "addu.ph         %[reshw],       %[abovel],           %[left0]      \n\t"
+      "subu.ph         %[reshw],       %[reshw],            %[top_left]   \n\t"
+
+      "sll             %[res2],        %[reshw],            16            \n\t"
+      "sra             %[res2],        %[res2],             16            \n\t"
+      "sra             %[res3],        %[reshw],            16            \n\t"
+
+      "addu.ph         %[reshw],       %[abover],           %[left0]      \n\t"
+      "subu.ph         %[reshw],       %[reshw],            %[top_left]   \n\t"
+
+      "sll             %[res0],        %[reshw],            16            \n\t"
+      "sra             %[res0],        %[res0],             16            \n\t"
+      "sra             %[res1],        %[reshw],            16            \n\t"
+
+      "lbux            %[res0],        %[res0](%[cm])                     \n\t"
+      "lbux            %[res1],        %[res1](%[cm])                     \n\t"
+      "lbux            %[res2],        %[res2](%[cm])                     \n\t"
+      "lbux            %[res3],        %[res3](%[cm])                     \n\t"
+
+      "sb              %[res0],        (%[dst])                           \n\t"
+      "sb              %[res1],        1(%[dst])                          \n\t"
+      "sb              %[res2],        2(%[dst])                          \n\t"
+      "sb              %[res3],        3(%[dst])                          \n\t"
+
+      "addu.ph         %[reshw],       %[abovel_1],         %[left0]      \n\t"
+      "subu.ph         %[reshw],       %[reshw],            %[top_left]   \n\t"
+
+      "sll             %[res2],        %[reshw],            16            \n\t"
+      "sra             %[res2],        %[res2],             16            \n\t"
+      "sra             %[res3],        %[reshw],            16            \n\t"
+
+      "addu.ph         %[reshw],       %[abover_1],         %[left0]      \n\t"
+      "subu.ph         %[reshw],       %[reshw],            %[top_left]   \n\t"
+
+      "sll             %[res0],        %[reshw],            16            \n\t"
+      "sra             %[res0],        %[res0],             16            \n\t"
+      "sra             %[res1],        %[reshw],            16            \n\t"
+
+      "lbu             %[left0],       4(%[left])                         \n\t"
+
+      "lbux            %[res0],        %[res0](%[cm])                     \n\t"
+      "lbux            %[res1],        %[res1](%[cm])                     \n\t"
+      "lbux            %[res2],        %[res2](%[cm])                     \n\t"
+      "lbux            %[res3],        %[res3](%[cm])                     \n\t"
+
+      "sb              %[res0],        4(%[dst])                          \n\t"
+      "sb              %[res1],        5(%[dst])                          \n\t"
+      "sb              %[res2],        6(%[dst])                          \n\t"
+      "sb              %[res3],        7(%[dst])                          \n\t"
+
+      "replv.ph        %[left0],       %[left0]                           \n\t"
+      "add             %[dst],          %[dst],             %[stride]     \n\t"
+
+      "addu.ph         %[reshw],       %[abovel],           %[left0]      \n\t"
+      "subu.ph         %[reshw],       %[reshw],            %[top_left]   \n\t"
+
+      "sll             %[res2],        %[reshw],            16            \n\t"
+      "sra             %[res2],        %[res2],             16            \n\t"
+      "sra             %[res3],        %[reshw],            16            \n\t"
+
+      "addu.ph         %[reshw],       %[abover],           %[left0]      \n\t"
+      "subu.ph         %[reshw],       %[reshw],            %[top_left]   \n\t"
+
+      "sll             %[res0],        %[reshw],            16            \n\t"
+      "sra             %[res0],        %[res0],             16            \n\t"
+      "sra             %[res1],        %[reshw],            16            \n\t"
+
+      "lbux            %[res0],        %[res0](%[cm])                     \n\t"
+      "lbux            %[res1],        %[res1](%[cm])                     \n\t"
+      "lbux            %[res2],        %[res2](%[cm])                     \n\t"
+      "lbux            %[res3],        %[res3](%[cm])                     \n\t"
+
+      "sb              %[res0],        (%[dst])                           \n\t"
+      "sb              %[res1],        1(%[dst])                          \n\t"
+      "sb              %[res2],        2(%[dst])                          \n\t"
+      "sb              %[res3],        3(%[dst])                          \n\t"
+
+      "addu.ph         %[reshw],       %[abovel_1],         %[left0]      \n\t"
+      "subu.ph         %[reshw],       %[reshw],            %[top_left]   \n\t"
+
+      "sll             %[res2],        %[reshw],            16            \n\t"
+      "sra             %[res2],        %[res2],             16            \n\t"
+      "sra             %[res3],        %[reshw],            16            \n\t"
+
+      "addu.ph         %[reshw],       %[abover_1],         %[left0]      \n\t"
+      "subu.ph         %[reshw],       %[reshw],            %[top_left]   \n\t"
+
+      "sll             %[res0],        %[reshw],            16            \n\t"
+      "sra             %[res0],        %[res0],             16            \n\t"
+      "sra             %[res1],        %[reshw],            16            \n\t"
+
+      "lbu             %[left0],       5(%[left])                         \n\t"
+
+      "lbux            %[res0],        %[res0](%[cm])                     \n\t"
+      "lbux            %[res1],        %[res1](%[cm])                     \n\t"
+      "lbux            %[res2],        %[res2](%[cm])                     \n\t"
+      "lbux            %[res3],        %[res3](%[cm])                     \n\t"
+
+      "sb              %[res0],        4(%[dst])                          \n\t"
+      "sb              %[res1],        5(%[dst])                          \n\t"
+      "sb              %[res2],        6(%[dst])                          \n\t"
+      "sb              %[res3],        7(%[dst])                          \n\t"
+
+      "replv.ph        %[left0],       %[left0]                           \n\t"
+      "add             %[dst],          %[dst],             %[stride]     \n\t"
+
+      "addu.ph         %[reshw],       %[abovel],           %[left0]      \n\t"
+      "subu.ph         %[reshw],       %[reshw],            %[top_left]   \n\t"
+
+      "sll             %[res2],        %[reshw],            16            \n\t"
+      "sra             %[res2],        %[res2],             16            \n\t"
+      "sra             %[res3],        %[reshw],            16            \n\t"
+
+      "addu.ph         %[reshw],       %[abover],           %[left0]      \n\t"
+      "subu.ph         %[reshw],       %[reshw],            %[top_left]   \n\t"
+
+      "sll             %[res0],        %[reshw],            16            \n\t"
+      "sra             %[res0],        %[res0],             16            \n\t"
+      "sra             %[res1],        %[reshw],            16            \n\t"
+
+      "lbux            %[res0],        %[res0](%[cm])                     \n\t"
+      "lbux            %[res1],        %[res1](%[cm])                     \n\t"
+      "lbux            %[res2],        %[res2](%[cm])                     \n\t"
+      "lbux            %[res3],        %[res3](%[cm])                     \n\t"
+
+      "sb              %[res0],        (%[dst])                           \n\t"
+      "sb              %[res1],        1(%[dst])                          \n\t"
+      "sb              %[res2],        2(%[dst])                          \n\t"
+      "sb              %[res3],        3(%[dst])                          \n\t"
+
+      "addu.ph         %[reshw],       %[abovel_1],         %[left0]      \n\t"
+      "subu.ph         %[reshw],       %[reshw],            %[top_left]   \n\t"
+
+      "sll             %[res2],        %[reshw],            16            \n\t"
+      "sra             %[res2],        %[res2],             16            \n\t"
+      "sra             %[res3],        %[reshw],            16            \n\t"
+
+      "addu.ph         %[reshw],       %[abover_1],         %[left0]      \n\t"
+      "subu.ph         %[reshw],       %[reshw],            %[top_left]   \n\t"
+
+      "sll             %[res0],        %[reshw],            16            \n\t"
+      "sra             %[res0],        %[res0],             16            \n\t"
+      "sra             %[res1],        %[reshw],            16            \n\t"
+
+      "lbu             %[left0],       6(%[left])                         \n\t"
+
+      "lbux            %[res0],        %[res0](%[cm])                     \n\t"
+      "lbux            %[res1],        %[res1](%[cm])                     \n\t"
+      "lbux            %[res2],        %[res2](%[cm])                     \n\t"
+      "lbux            %[res3],        %[res3](%[cm])                     \n\t"
+
+      "sb              %[res0],        4(%[dst])                          \n\t"
+      "sb              %[res1],        5(%[dst])                          \n\t"
+      "sb              %[res2],        6(%[dst])                          \n\t"
+      "sb              %[res3],        7(%[dst])                          \n\t"
+
+      "replv.ph        %[left0],       %[left0]                           \n\t"
+      "add             %[dst],          %[dst],             %[stride]     \n\t"
+
+      "addu.ph         %[reshw],       %[abovel],           %[left0]      \n\t"
+      "subu.ph         %[reshw],       %[reshw],            %[top_left]   \n\t"
+
+      "sll             %[res2],        %[reshw],            16            \n\t"
+      "sra             %[res2],        %[res2],             16            \n\t"
+      "sra             %[res3],        %[reshw],            16            \n\t"
+
+      "addu.ph         %[reshw],       %[abover],           %[left0]      \n\t"
+      "subu.ph         %[reshw],       %[reshw],            %[top_left]   \n\t"
+
+      "sll             %[res0],        %[reshw],            16            \n\t"
+      "sra             %[res0],        %[res0],             16            \n\t"
+      "sra             %[res1],        %[reshw],            16            \n\t"
+
+      "lbux            %[res0],        %[res0](%[cm])                     \n\t"
+      "lbux            %[res1],        %[res1](%[cm])                     \n\t"
+      "lbux            %[res2],        %[res2](%[cm])                     \n\t"
+      "lbux            %[res3],        %[res3](%[cm])                     \n\t"
+
+      "sb              %[res0],        (%[dst])                           \n\t"
+      "sb              %[res1],        1(%[dst])                          \n\t"
+      "sb              %[res2],        2(%[dst])                          \n\t"
+      "sb              %[res3],        3(%[dst])                          \n\t"
+
+      "addu.ph         %[reshw],       %[abovel_1],         %[left0]      \n\t"
+      "subu.ph         %[reshw],       %[reshw],            %[top_left]   \n\t"
+
+      "sll             %[res2],        %[reshw],            16            \n\t"
+      "sra             %[res2],        %[res2],             16            \n\t"
+      "sra             %[res3],        %[reshw],            16            \n\t"
+
+      "addu.ph         %[reshw],       %[abover_1],         %[left0]      \n\t"
+      "subu.ph         %[reshw],       %[reshw],            %[top_left]   \n\t"
+
+      "sll             %[res0],        %[reshw],            16            \n\t"
+      "sra             %[res0],        %[res0],             16            \n\t"
+      "sra             %[res1],        %[reshw],            16            \n\t"
+
+      "lbu             %[left0],       7(%[left])                         \n\t"
+
+      "lbux            %[res0],        %[res0](%[cm])                     \n\t"
+      "lbux            %[res1],        %[res1](%[cm])                     \n\t"
+      "lbux            %[res2],        %[res2](%[cm])                     \n\t"
+      "lbux            %[res3],        %[res3](%[cm])                     \n\t"
+
+      "sb              %[res0],        4(%[dst])                          \n\t"
+      "sb              %[res1],        5(%[dst])                          \n\t"
+      "sb              %[res2],        6(%[dst])                          \n\t"
+      "sb              %[res3],        7(%[dst])                          \n\t"
+
+      "replv.ph        %[left0],       %[left0]                           \n\t"
+      "add             %[dst],          %[dst],             %[stride]     \n\t"
+
+      "addu.ph         %[reshw],       %[abovel],           %[left0]      \n\t"
+      "subu.ph         %[reshw],       %[reshw],            %[top_left]   \n\t"
+
+      "sll             %[res2],        %[reshw],            16            \n\t"
+      "sra             %[res2],        %[res2],             16            \n\t"
+      "sra             %[res3],        %[reshw],            16            \n\t"
+
+      "addu.ph         %[reshw],       %[abover],           %[left0]      \n\t"
+      "subu.ph         %[reshw],       %[reshw],            %[top_left]   \n\t"
+
+      "sll             %[res0],        %[reshw],            16            \n\t"
+      "sra             %[res0],        %[res0],             16            \n\t"
+      "sra             %[res1],        %[reshw],            16            \n\t"
+
+      "lbux            %[res0],        %[res0](%[cm])                     \n\t"
+      "lbux            %[res1],        %[res1](%[cm])                     \n\t"
+      "lbux            %[res2],        %[res2](%[cm])                     \n\t"
+      "lbux            %[res3],        %[res3](%[cm])                     \n\t"
+
+      "sb              %[res0],        (%[dst])                           \n\t"
+      "sb              %[res1],        1(%[dst])                          \n\t"
+      "sb              %[res2],        2(%[dst])                          \n\t"
+      "sb              %[res3],        3(%[dst])                          \n\t"
+
+      "addu.ph         %[reshw],       %[abovel_1],         %[left0]      \n\t"
+      "subu.ph         %[reshw],       %[reshw],            %[top_left]   \n\t"
+
+      "sll             %[res2],        %[reshw],            16            \n\t"
+      "sra             %[res2],        %[res2],             16            \n\t"
+      "sra             %[res3],        %[reshw],            16            \n\t"
+
+      "addu.ph         %[reshw],       %[abover_1],         %[left0]      \n\t"
+      "subu.ph         %[reshw],       %[reshw],            %[top_left]   \n\t"
+
+      "sll             %[res0],        %[reshw],            16            \n\t"
+      "sra             %[res0],        %[res0],             16            \n\t"
+      "sra             %[res1],        %[reshw],            16            \n\t"
+
+      "lbux            %[res0],        %[res0](%[cm])                     \n\t"
+      "lbux            %[res1],        %[res1](%[cm])                     \n\t"
+      "lbux            %[res2],        %[res2](%[cm])                     \n\t"
+      "lbux            %[res3],        %[res3](%[cm])                     \n\t"
+
+      "sb              %[res0],        4(%[dst])                          \n\t"
+      "sb              %[res1],        5(%[dst])                          \n\t"
+      "sb              %[res2],        6(%[dst])                          \n\t"
+      "sb              %[res3],        7(%[dst])                          \n\t"
+
+      : [abovel] "=&r" (abovel), [abover] "=&r" (abover),
+        [abovel_1] "=&r" (abovel_1), [abover_1] "=&r" (abover_1),
+        [left0] "=&r" (left0), [res2] "=&r" (res2), [res3] "=&r" (res3),
+        [res0] "=&r" (res0), [res1] "=&r" (res1),
+        [reshw] "=&r" (reshw), [top_left] "=&r" (top_left)
+      : [above] "r" (above), [left] "r" (left),
+        [dst] "r" (dst), [stride] "r" (stride), [cm] "r" (cm)
+  );
+}
+#endif  // #if HAVE_DSPR2
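vp9_tm_predictor_8x8_dspr2 above is the true-motion predictor: every output pixel is the clamped value left[r] + above[c] - above[-1], with the clamp done through the vp9_ff_cropTbl lookups (lbux). As a scalar reference sketch (helper names are illustrative, not part of the patch):

    /* Scalar sketch of the 8x8 TM predictor. */
    static uint8_t clip_pixel_sketch(int v) {
      return (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
    }

    static void tm_predictor_8x8_sketch(uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *above,
                                        const uint8_t *left) {
      const int top_left = above[-1];  /* lbu -1(%[above]) in the asm */
      int r, c;
      for (r = 0; r < 8; r++) {
        for (c = 0; c < 8; c++)
          dst[c] = clip_pixel_sketch(left[r] + above[c] - top_left);
        dst += stride;
      }
    }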
diff --git a/vp9/common/mips/dspr2/vp9_itrans32_dspr2.c b/vp9/common/mips/dspr2/vp9_itrans32_dspr2.c
index d3aee73..bc67594 100644
--- a/vp9/common/mips/dspr2/vp9_itrans32_dspr2.c
+++ b/vp9/common/mips/dspr2/vp9_itrans32_dspr2.c
@@ -19,7 +19,8 @@
 #include "vp9/common/mips/dspr2/vp9_common_dspr2.h"
 
 #if HAVE_DSPR2
-static void idct32_1d_rows_dspr2(const int16_t *input, int16_t *output) {
+static void idct32_1d_rows_dspr2(const int16_t *input, int16_t *output,
+                                 uint32_t no_rows) {
   int16_t step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6;
   int16_t step1_7, step1_8, step1_9, step1_10, step1_11, step1_12, step1_13;
   int16_t step1_14, step1_15, step1_16, step1_17, step1_18, step1_19, step1_20;
@@ -42,7 +43,7 @@
   const int const_2_power_13 = 8192;
   const int32_t *input_int;
 
-  for (i = 32; i--; ) {
+  for (i = no_rows; i--; ) {
     input_int = (const int32_t *)input;
 
     if (!(input_int[0]  | input_int[1]  | input_int[2]  | input_int[3]  |
@@ -881,12 +882,74 @@
   );
 
   // Rows
-  idct32_1d_rows_dspr2(input, outptr);
+  idct32_1d_rows_dspr2(input, outptr, 32);
 
   // Columns
   vp9_idct32_1d_cols_add_blk_dspr2(out, dest, dest_stride);
 }
 
+void vp9_idct32x32_34_add_dspr2(const int16_t *input, uint8_t *dest,
+                                int stride) {
+  DECLARE_ALIGNED(32, int16_t,  out[32 * 32]);
+  int16_t *outptr = out;
+  uint32_t i;
+  uint32_t pos = 45;
+
+  /* bit position for extract from acc */
+  __asm__ __volatile__ (
+    "wrdsp      %[pos],     1           \n\t"
+    :
+    : [pos] "r" (pos)
+  );
+
+  // Rows
+  idct32_1d_rows_dspr2(input, outptr, 8);
+
+  outptr += 8;
+  __asm__ __volatile__ (
+      "sw     $zero,      0(%[outptr])     \n\t"
+      "sw     $zero,      4(%[outptr])     \n\t"
+      "sw     $zero,      8(%[outptr])     \n\t"
+      "sw     $zero,     12(%[outptr])     \n\t"
+      "sw     $zero,     16(%[outptr])     \n\t"
+      "sw     $zero,     20(%[outptr])     \n\t"
+      "sw     $zero,     24(%[outptr])     \n\t"
+      "sw     $zero,     28(%[outptr])     \n\t"
+      "sw     $zero,     32(%[outptr])     \n\t"
+      "sw     $zero,     36(%[outptr])     \n\t"
+      "sw     $zero,     40(%[outptr])     \n\t"
+      "sw     $zero,     44(%[outptr])     \n\t"
+
+      :
+      : [outptr] "r" (outptr)
+  );
+
+  for (i = 0; i < 31; ++i) {
+    outptr += 32;
+
+    __asm__ __volatile__ (
+        "sw     $zero,      0(%[outptr])     \n\t"
+        "sw     $zero,      4(%[outptr])     \n\t"
+        "sw     $zero,      8(%[outptr])     \n\t"
+        "sw     $zero,     12(%[outptr])     \n\t"
+        "sw     $zero,     16(%[outptr])     \n\t"
+        "sw     $zero,     20(%[outptr])     \n\t"
+        "sw     $zero,     24(%[outptr])     \n\t"
+        "sw     $zero,     28(%[outptr])     \n\t"
+        "sw     $zero,     32(%[outptr])     \n\t"
+        "sw     $zero,     36(%[outptr])     \n\t"
+        "sw     $zero,     40(%[outptr])     \n\t"
+        "sw     $zero,     44(%[outptr])     \n\t"
+
+        :
+        : [outptr] "r" (outptr)
+    );
+  }
+
+  // Columns
+  vp9_idct32_1d_cols_add_blk_dspr2(out, dest, stride);
+}
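vp9_idct32x32_34_add_dspr2 handles the case where at most the first 34 coefficients are non-zero, so the row transform is run on just the top 8 rows; the sw $zero blocks then clear columns 8..31 of every row of the intermediate buffer before the full column pass. The zeroing is equivalent to this plain-C loop (assuming <string.h>):

    /* Clear columns 8..31 (24 int16 values, i.e. the twelve word
       stores per row in the asm) for all 32 rows of out[]. */
    for (i = 0; i < 32; ++i)
      memset(out + i * 32 + 8, 0, 24 * sizeof(out[0]));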
+
 void vp9_idct32x32_1_add_dspr2(const int16_t *input, uint8_t *dest,
                                int stride) {
   int       r, out;
diff --git a/vp9/common/mips/dspr2/vp9_loopfilter_filters_dspr2.c b/vp9/common/mips/dspr2/vp9_loopfilter_filters_dspr2.c
new file mode 100644
index 0000000..36cfc83
--- /dev/null
+++ b/vp9/common/mips/dspr2/vp9_loopfilter_filters_dspr2.c
@@ -0,0 +1,309 @@
+/*
+ *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stdlib.h>
+
+#include "./vp9_rtcd.h"
+#include "vp9/common/vp9_common.h"
+#include "vp9/common/vp9_loopfilter.h"
+#include "vp9/common/vp9_onyxc_int.h"
+#include "vp9/common/mips/dspr2/vp9_common_dspr2.h"
+#include "vp9/common/mips/dspr2/vp9_loopfilter_macros_dspr2.h"
+#include "vp9/common/mips/dspr2/vp9_loopfilter_masks_dspr2.h"
+#include "vp9/common/mips/dspr2/vp9_loopfilter_filters_dspr2.h"
+
+#if HAVE_DSPR2
+void vp9_loop_filter_horizontal_edge_dspr2(unsigned char *s,
+                                           int pitch,
+                                           const uint8_t *blimit,
+                                           const uint8_t *limit,
+                                           const uint8_t *thresh,
+                                           int count) {
+  uint8_t   i;
+  uint32_t  mask;
+  uint32_t  hev;
+  uint32_t  pm1, p0, p1, p2, p3, p4, p5, p6;
+  uint8_t   *sm1, *s0, *s1, *s2, *s3, *s4, *s5, *s6;
+  uint32_t  thresh_vec, flimit_vec, limit_vec;
+  uint32_t  uflimit, ulimit, uthresh;
+
+  uflimit = *blimit;
+  ulimit = *limit;
+  uthresh = *thresh;
+
+  /* create quad-byte */
+  __asm__ __volatile__ (
+      "replv.qb       %[thresh_vec],    %[uthresh]    \n\t"
+      "replv.qb       %[flimit_vec],    %[uflimit]    \n\t"
+      "replv.qb       %[limit_vec],     %[ulimit]     \n\t"
+
+      : [thresh_vec] "=&r" (thresh_vec), [flimit_vec] "=&r" (flimit_vec),
+        [limit_vec] "=r" (limit_vec)
+      : [uthresh] "r" (uthresh), [uflimit] "r" (uflimit), [ulimit] "r" (ulimit)
+  );
+
+  /* prefetch data for store */
+  vp9_prefetch_store(s);
+
+  /* loop filter designed to work using chars so that we can make maximum use
+     of 8-bit SIMD instructions. */
+  for (i = 0; i < 2; i++) {
+    sm1 = s - (pitch << 2);
+    s0 = sm1 + pitch;
+    s1 = s0 + pitch;
+    s2 = s - pitch;
+    s3 = s;
+    s4 = s + pitch;
+    s5 = s4 + pitch;
+    s6 = s5 + pitch;
+
+    __asm__ __volatile__ (
+        "lw     %[p1],  (%[s1])    \n\t"
+        "lw     %[p2],  (%[s2])    \n\t"
+        "lw     %[p3],  (%[s3])    \n\t"
+        "lw     %[p4],  (%[s4])    \n\t"
+
+        : [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4)
+        : [s1] "r" (s1), [s2] "r" (s2), [s3] "r" (s3), [s4] "r" (s4)
+    );
+
+    /* if (p1 - p4 == 0) and (p2 - p3 == 0)
+       mask will be zero and filtering is not needed */
+    if (!(((p1 - p4) == 0) && ((p2 - p3) == 0))) {
+      __asm__ __volatile__ (
+          "lw       %[pm1], (%[sm1])   \n\t"
+          "lw       %[p0],  (%[s0])    \n\t"
+          "lw       %[p5],  (%[s5])    \n\t"
+          "lw       %[p6],  (%[s6])    \n\t"
+
+          : [pm1] "=&r" (pm1), [p0] "=&r" (p0), [p5] "=&r" (p5),
+            [p6] "=&r" (p6)
+          : [sm1] "r" (sm1), [s0] "r" (s0), [s5] "r" (s5), [s6] "r" (s6)
+      );
+
+      vp9_filter_hev_mask_dspr2(limit_vec, flimit_vec, p1, p2,
+                                pm1, p0, p3, p4, p5, p6,
+                                thresh_vec, &hev, &mask);
+
+      /* if mask == 0, filtering is not needed */
+      if (mask) {
+        /* filtering */
+        vp9_filter_dspr2(mask, hev, &p1, &p2, &p3, &p4);
+
+        __asm__ __volatile__ (
+            "sw     %[p1],  (%[s1])    \n\t"
+            "sw     %[p2],  (%[s2])    \n\t"
+            "sw     %[p3],  (%[s3])    \n\t"
+            "sw     %[p4],  (%[s4])    \n\t"
+
+            :
+            : [p1] "r" (p1), [p2] "r" (p2), [p3] "r" (p3), [p4] "r" (p4),
+              [s1] "r" (s1), [s2] "r" (s2), [s3] "r" (s3), [s4] "r" (s4)
+        );
+      }
+    }
+
+    s = s + 4;
+  }
+}
+
+void vp9_loop_filter_vertical_edge_dspr2(unsigned char *s,
+                                         int pitch,
+                                         const uint8_t *blimit,
+                                         const uint8_t *limit,
+                                         const uint8_t *thresh,
+                                         int count) {
+  uint8_t   i;
+  uint32_t  mask, hev;
+  uint32_t  pm1, p0, p1, p2, p3, p4, p5, p6;
+  uint8_t   *s1, *s2, *s3, *s4;
+  uint32_t  prim1, prim2, sec3, sec4, prim3, prim4;
+  uint32_t  thresh_vec, flimit_vec, limit_vec;
+  uint32_t  uflimit, ulimit, uthresh;
+
+  uflimit = *blimit;
+  ulimit = *limit;
+  uthresh = *thresh;
+
+  /* create quad-byte */
+  __asm__ __volatile__ (
+      "replv.qb       %[thresh_vec],    %[uthresh]    \n\t"
+      "replv.qb       %[flimit_vec],    %[uflimit]    \n\t"
+      "replv.qb       %[limit_vec],     %[ulimit]     \n\t"
+
+      : [thresh_vec] "=&r" (thresh_vec), [flimit_vec] "=&r" (flimit_vec),
+        [limit_vec] "=r" (limit_vec)
+      : [uthresh] "r" (uthresh), [uflimit] "r" (uflimit), [ulimit] "r" (ulimit)
+  );
+
+  /* prefetch data for store */
+  vp9_prefetch_store(s + pitch);
+
+  for (i = 0; i < 2; i++) {
+    s1 = s;
+    s2 = s + pitch;
+    s3 = s2 + pitch;
+    s4 = s3 + pitch;
+    s  = s4 + pitch;
+
+    /* load quad-byte vectors
+     * memory is 4 byte aligned
+     */
+    p2  = *((uint32_t *)(s1 - 4));
+    p6  = *((uint32_t *)(s1));
+    p1  = *((uint32_t *)(s2 - 4));
+    p5  = *((uint32_t *)(s2));
+    p0  = *((uint32_t *)(s3 - 4));
+    p4  = *((uint32_t *)(s3));
+    pm1 = *((uint32_t *)(s4 - 4));
+    p3  = *((uint32_t *)(s4));
+
+    /* transpose pm1, p0, p1, p2 */
+    __asm__ __volatile__ (
+        "precrq.qb.ph   %[prim1],   %[p2],      %[p1]       \n\t"
+        "precr.qb.ph    %[prim2],   %[p2],      %[p1]       \n\t"
+        "precrq.qb.ph   %[prim3],   %[p0],      %[pm1]      \n\t"
+        "precr.qb.ph    %[prim4],   %[p0],      %[pm1]      \n\t"
+
+        "precrq.qb.ph   %[p1],      %[prim1],   %[prim2]    \n\t"
+        "precr.qb.ph    %[pm1],     %[prim1],   %[prim2]    \n\t"
+        "precrq.qb.ph   %[sec3],    %[prim3],   %[prim4]    \n\t"
+        "precr.qb.ph    %[sec4],    %[prim3],   %[prim4]    \n\t"
+
+        "precrq.ph.w    %[p2],      %[p1],      %[sec3]     \n\t"
+        "precrq.ph.w    %[p0],      %[pm1],     %[sec4]     \n\t"
+        "append         %[p1],      %[sec3],    16          \n\t"
+        "append         %[pm1],     %[sec4],    16          \n\t"
+
+        : [prim1] "=&r" (prim1), [prim2] "=&r" (prim2),
+          [prim3] "=&r" (prim3), [prim4] "=&r" (prim4),
+          [p2] "+r" (p2), [p1] "+r" (p1), [p0] "+r" (p0), [pm1] "+r" (pm1),
+          [sec3] "=&r" (sec3), [sec4] "=&r" (sec4)
+        :
+    );
+
+    /* transpose p3, p4, p5, p6 */
+    __asm__ __volatile__ (
+        "precrq.qb.ph   %[prim1],   %[p6],      %[p5]       \n\t"
+        "precr.qb.ph    %[prim2],   %[p6],      %[p5]       \n\t"
+        "precrq.qb.ph   %[prim3],   %[p4],      %[p3]       \n\t"
+        "precr.qb.ph    %[prim4],   %[p4],      %[p3]       \n\t"
+
+        "precrq.qb.ph   %[p5],      %[prim1],   %[prim2]    \n\t"
+        "precr.qb.ph    %[p3],      %[prim1],   %[prim2]    \n\t"
+        "precrq.qb.ph   %[sec3],    %[prim3],   %[prim4]    \n\t"
+        "precr.qb.ph    %[sec4],    %[prim3],   %[prim4]    \n\t"
+
+        "precrq.ph.w    %[p6],      %[p5],      %[sec3]     \n\t"
+        "precrq.ph.w    %[p4],      %[p3],      %[sec4]     \n\t"
+        "append         %[p5],      %[sec3],    16          \n\t"
+        "append         %[p3],      %[sec4],    16          \n\t"
+
+        : [prim1] "=&r" (prim1), [prim2] "=&r" (prim2),
+          [prim3] "=&r" (prim3), [prim4] "=&r" (prim4),
+          [p6] "+r" (p6), [p5] "+r" (p5), [p4] "+r" (p4), [p3] "+r" (p3),
+          [sec3] "=&r" (sec3), [sec4] "=&r" (sec4)
+        :
+    );
+
+    /* if (p1 - p4 == 0) and (p2 - p3 == 0)
+     * mask will be zero and filtering is not needed
+     */
+    if (!(((p1 - p4) == 0) && ((p2 - p3) == 0))) {
+      vp9_filter_hev_mask_dspr2(limit_vec, flimit_vec, p1, p2, pm1,
+                                p0, p3, p4, p5, p6, thresh_vec,
+                                &hev, &mask);
+
+      /* if mask == 0, filtering is not needed */
+      if (mask) {
+        /* filtering */
+        vp9_filter_dspr2(mask, hev, &p1, &p2, &p3, &p4);
+
+        /* unpack processed 4x4 neighborhood
+         * don't use transpose on output data
+         * because memory isn't aligned
+         */
+        __asm__ __volatile__ (
+            "sb     %[p4],   1(%[s4])    \n\t"
+            "sb     %[p3],   0(%[s4])    \n\t"
+            "sb     %[p2],  -1(%[s4])    \n\t"
+            "sb     %[p1],  -2(%[s4])    \n\t"
+
+            :
+            : [p4] "r" (p4), [p3] "r" (p3), [p2] "r" (p2), [p1] "r" (p1),
+              [s4] "r" (s4)
+        );
+
+        __asm__ __volatile__ (
+            "srl    %[p4],  %[p4],  8     \n\t"
+            "srl    %[p3],  %[p3],  8     \n\t"
+            "srl    %[p2],  %[p2],  8     \n\t"
+            "srl    %[p1],  %[p1],  8     \n\t"
+
+            : [p4] "+r" (p4), [p3] "+r" (p3), [p2] "+r" (p2), [p1] "+r" (p1)
+            :
+        );
+
+        __asm__ __volatile__ (
+            "sb     %[p4],   1(%[s3])    \n\t"
+            "sb     %[p3],   0(%[s3])    \n\t"
+            "sb     %[p2],  -1(%[s3])    \n\t"
+            "sb     %[p1],  -2(%[s3])    \n\t"
+
+            : [p1] "+r" (p1)
+            : [p4] "r" (p4), [p3] "r" (p3), [p2] "r" (p2), [s3] "r" (s3)
+        );
+
+        __asm__ __volatile__ (
+            "srl    %[p4],  %[p4],  8     \n\t"
+            "srl    %[p3],  %[p3],  8     \n\t"
+            "srl    %[p2],  %[p2],  8     \n\t"
+            "srl    %[p1],  %[p1],  8     \n\t"
+
+            : [p4] "+r" (p4), [p3] "+r" (p3), [p2] "+r" (p2), [p1] "+r" (p1)
+            :
+        );
+
+        __asm__ __volatile__ (
+            "sb     %[p4],   1(%[s2])    \n\t"
+            "sb     %[p3],   0(%[s2])    \n\t"
+            "sb     %[p2],  -1(%[s2])    \n\t"
+            "sb     %[p1],  -2(%[s2])    \n\t"
+
+            :
+            : [p4] "r" (p4), [p3] "r" (p3), [p2] "r" (p2), [p1] "r" (p1),
+              [s2] "r" (s2)
+        );
+
+        __asm__ __volatile__ (
+            "srl    %[p4],  %[p4],  8     \n\t"
+            "srl    %[p3],  %[p3],  8     \n\t"
+            "srl    %[p2],  %[p2],  8     \n\t"
+            "srl    %[p1],  %[p1],  8     \n\t"
+
+            : [p4] "+r" (p4), [p3] "+r" (p3), [p2] "+r" (p2), [p1] "+r" (p1)
+            :
+        );
+
+        __asm__ __volatile__ (
+            "sb     %[p4],   1(%[s1])    \n\t"
+            "sb     %[p3],   0(%[s1])    \n\t"
+            "sb     %[p2],  -1(%[s1])    \n\t"
+            "sb     %[p1],  -2(%[s1])    \n\t"
+
+            :
+            : [p4] "r" (p4), [p3] "r" (p3), [p2] "r" (p2), [p1] "r" (p1),
+              [s1] "r" (s1)
+        );
+      }
+    }
+  }
+}
+#endif  // #if HAVE_DSPR2
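In vp9_loop_filter_vertical_edge_dspr2 the pixels on either side of the vertical edge are loaded as rows, transposed into packed words with the precrq/precr/append sequences so the same packed filter kernel can be applied, and then written back one byte at a time because the output columns are not word-aligned. For reference, the scalar analogue of those transpose sequences is a plain 4x4 byte transpose (illustrative only, not part of the patch):

    /* Generic 4x4 byte transpose; scalar analogue of the
       precrq.qb.ph / precr.qb.ph / append sequences above. */
    static void transpose_4x4_sketch(const uint8_t *src, int src_stride,
                                     uint8_t *dst, int dst_stride) {
      int r, c;
      for (r = 0; r < 4; r++)
        for (c = 0; c < 4; c++)
          dst[c * dst_stride + r] = src[r * src_stride + c];
    }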
diff --git a/vp9/common/mips/dspr2/vp9_loopfilter_filters_dspr2.h b/vp9/common/mips/dspr2/vp9_loopfilter_filters_dspr2.h
new file mode 100644
index 0000000..98bfcfa
--- /dev/null
+++ b/vp9/common/mips/dspr2/vp9_loopfilter_filters_dspr2.h
@@ -0,0 +1,755 @@
+/*
+ *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP9_COMMON_MIPS_DSPR2_VP9_LOOPFILTER_FILTERS_DSPR2_H_
+#define VP9_COMMON_MIPS_DSPR2_VP9_LOOPFILTER_FILTERS_DSPR2_H_
+
+#include <stdlib.h>
+
+#include "./vp9_rtcd.h"
+#include "vp9/common/vp9_common.h"
+#include "vp9/common/vp9_onyxc_int.h"
+
+#if HAVE_DSPR2
+/* inputs & outputs are quad-byte vectors */
+static INLINE void vp9_filter_dspr2(uint32_t mask, uint32_t hev,
+                                    uint32_t *ps1, uint32_t *ps0,
+                                    uint32_t *qs0, uint32_t *qs1) {
+  int32_t   vp9_filter_l, vp9_filter_r;
+  int32_t   Filter1_l, Filter1_r, Filter2_l, Filter2_r;
+  int32_t   subr_r, subr_l;
+  uint32_t  t1, t2, HWM, t3;
+  uint32_t  hev_l, hev_r, mask_l, mask_r, invhev_l, invhev_r;
+  int32_t   vps1, vps0, vqs0, vqs1;
+  int32_t   vps1_l, vps1_r, vps0_l, vps0_r, vqs0_l, vqs0_r, vqs1_l, vqs1_r;
+  uint32_t  N128;
+
+  N128 = 0x80808080;
+  t1  = 0x03000300;
+  t2  = 0x04000400;
+  t3  = 0x01000100;
+  HWM = 0xFF00FF00;
+
+  vps0 = (*ps0) ^ N128;
+  vps1 = (*ps1) ^ N128;
+  vqs0 = (*qs0) ^ N128;
+  vqs1 = (*qs1) ^ N128;
+
+  /* use halfword pairs instead of quad-bytes for accuracy */
+  vps0_l = vps0 & HWM;
+  vps0_r = vps0 << 8;
+  vps0_r = vps0_r & HWM;
+
+  vps1_l = vps1 & HWM;
+  vps1_r = vps1 << 8;
+  vps1_r = vps1_r & HWM;
+
+  vqs0_l = vqs0 & HWM;
+  vqs0_r = vqs0 << 8;
+  vqs0_r = vqs0_r & HWM;
+
+  vqs1_l = vqs1 & HWM;
+  vqs1_r = vqs1 << 8;
+  vqs1_r = vqs1_r & HWM;
+
+  mask_l = mask & HWM;
+  mask_r = mask << 8;
+  mask_r = mask_r & HWM;
+
+  hev_l = hev & HWM;
+  hev_r = hev << 8;
+  hev_r = hev_r & HWM;
+
+  __asm__ __volatile__ (
+      /* vp9_filter = vp8_signed_char_clamp(ps1 - qs1); */
+      "subq_s.ph    %[vp9_filter_l], %[vps1_l],       %[vqs1_l]       \n\t"
+      "subq_s.ph    %[vp9_filter_r], %[vps1_r],       %[vqs1_r]       \n\t"
+
+      /* qs0 - ps0 */
+      "subq_s.ph    %[subr_l],       %[vqs0_l],       %[vps0_l]       \n\t"
+      "subq_s.ph    %[subr_r],       %[vqs0_r],       %[vps0_r]       \n\t"
+
+      /* vp9_filter &= hev; */
+      "and          %[vp9_filter_l], %[vp9_filter_l], %[hev_l]        \n\t"
+      "and          %[vp9_filter_r], %[vp9_filter_r], %[hev_r]        \n\t"
+
+      /* vp9_filter = vp8_signed_char_clamp(vp9_filter + 3 * (qs0 - ps0)); */
+      "addq_s.ph    %[vp9_filter_l], %[vp9_filter_l], %[subr_l]       \n\t"
+      "addq_s.ph    %[vp9_filter_r], %[vp9_filter_r], %[subr_r]       \n\t"
+      "xor          %[invhev_l],     %[hev_l],        %[HWM]          \n\t"
+      "addq_s.ph    %[vp9_filter_l], %[vp9_filter_l], %[subr_l]       \n\t"
+      "addq_s.ph    %[vp9_filter_r], %[vp9_filter_r], %[subr_r]       \n\t"
+      "xor          %[invhev_r],     %[hev_r],        %[HWM]          \n\t"
+      "addq_s.ph    %[vp9_filter_l], %[vp9_filter_l], %[subr_l]       \n\t"
+      "addq_s.ph    %[vp9_filter_r], %[vp9_filter_r], %[subr_r]       \n\t"
+
+      /* vp9_filter &= mask; */
+      "and          %[vp9_filter_l], %[vp9_filter_l], %[mask_l]       \n\t"
+      "and          %[vp9_filter_r], %[vp9_filter_r], %[mask_r]       \n\t"
+
+      : [vp9_filter_l] "=&r" (vp9_filter_l),
+        [vp9_filter_r] "=&r" (vp9_filter_r),
+        [subr_l] "=&r" (subr_l), [subr_r] "=&r" (subr_r),
+        [invhev_l] "=&r" (invhev_l), [invhev_r] "=&r" (invhev_r)
+      : [vps0_l] "r" (vps0_l), [vps0_r] "r" (vps0_r), [vps1_l] "r" (vps1_l),
+        [vps1_r] "r" (vps1_r), [vqs0_l] "r" (vqs0_l), [vqs0_r] "r" (vqs0_r),
+        [vqs1_l] "r" (vqs1_l), [vqs1_r] "r" (vqs1_r),
+        [mask_l] "r" (mask_l), [mask_r] "r" (mask_r),
+        [hev_l] "r" (hev_l), [hev_r] "r" (hev_r),
+        [HWM] "r" (HWM)
+  );
+
+  /* save bottom 3 bits so that we round one side +4 and the other +3 */
+  __asm__ __volatile__ (
+      /* Filter1 = vp8_signed_char_clamp(vp9_filter + 4) >>= 3; */
+      "addq_s.ph    %[Filter1_l],    %[vp9_filter_l], %[t2]           \n\t"
+      "addq_s.ph    %[Filter1_r],    %[vp9_filter_r], %[t2]           \n\t"
+
+      /* Filter2 = vp8_signed_char_clamp(vp9_filter + 3) >>= 3; */
+      "addq_s.ph    %[Filter2_l],    %[vp9_filter_l], %[t1]           \n\t"
+      "addq_s.ph    %[Filter2_r],    %[vp9_filter_r], %[t1]           \n\t"
+      "shra.ph      %[Filter1_r],    %[Filter1_r],    3               \n\t"
+      "shra.ph      %[Filter1_l],    %[Filter1_l],    3               \n\t"
+
+      "shra.ph      %[Filter2_l],    %[Filter2_l],    3               \n\t"
+      "shra.ph      %[Filter2_r],    %[Filter2_r],    3               \n\t"
+
+      "and          %[Filter1_l],    %[Filter1_l],    %[HWM]          \n\t"
+      "and          %[Filter1_r],    %[Filter1_r],    %[HWM]          \n\t"
+
+      /* vps0 = vp8_signed_char_clamp(ps0 + Filter2); */
+      "addq_s.ph    %[vps0_l],       %[vps0_l],       %[Filter2_l]    \n\t"
+      "addq_s.ph    %[vps0_r],       %[vps0_r],       %[Filter2_r]    \n\t"
+
+      /* vqs0 = vp8_signed_char_clamp(qs0 - Filter1); */
+      "subq_s.ph    %[vqs0_l],       %[vqs0_l],       %[Filter1_l]    \n\t"
+      "subq_s.ph    %[vqs0_r],       %[vqs0_r],       %[Filter1_r]    \n\t"
+
+      : [Filter1_l] "=&r" (Filter1_l), [Filter1_r] "=&r" (Filter1_r),
+        [Filter2_l] "=&r" (Filter2_l), [Filter2_r] "=&r" (Filter2_r),
+        [vps0_l] "+r" (vps0_l), [vps0_r] "+r" (vps0_r),
+        [vqs0_l] "+r" (vqs0_l), [vqs0_r] "+r" (vqs0_r)
+      : [t1] "r" (t1), [t2] "r" (t2), [HWM] "r" (HWM),
+        [vp9_filter_l] "r" (vp9_filter_l), [vp9_filter_r] "r" (vp9_filter_r)
+  );
+
+  __asm__ __volatile__ (
+      /* (vp9_filter += 1) >>= 1 */
+      "addqh.ph    %[Filter1_l],    %[Filter1_l],     %[t3]           \n\t"
+      "addqh.ph    %[Filter1_r],    %[Filter1_r],     %[t3]           \n\t"
+
+      /* vp9_filter &= ~hev; */
+      "and          %[Filter1_l],    %[Filter1_l],    %[invhev_l]     \n\t"
+      "and          %[Filter1_r],    %[Filter1_r],    %[invhev_r]     \n\t"
+
+      /* vps1 = vp8_signed_char_clamp(ps1 + vp9_filter); */
+      "addq_s.ph    %[vps1_l],       %[vps1_l],       %[Filter1_l]    \n\t"
+      "addq_s.ph    %[vps1_r],       %[vps1_r],       %[Filter1_r]    \n\t"
+
+      /* vqs1 = vp8_signed_char_clamp(qs1 - vp9_filter); */
+      "subq_s.ph    %[vqs1_l],       %[vqs1_l],       %[Filter1_l]    \n\t"
+      "subq_s.ph    %[vqs1_r],       %[vqs1_r],       %[Filter1_r]    \n\t"
+
+      : [Filter1_l] "+r" (Filter1_l), [Filter1_r] "+r" (Filter1_r),
+        [vps1_l] "+r" (vps1_l), [vps1_r] "+r" (vps1_r),
+        [vqs1_l] "+r" (vqs1_l), [vqs1_r] "+r" (vqs1_r)
+      : [t3] "r" (t3), [invhev_l] "r" (invhev_l), [invhev_r] "r" (invhev_r)
+  );
+
+  /* Create quad-bytes from halfword pairs */
+  vqs0_l = vqs0_l & HWM;
+  vqs1_l = vqs1_l & HWM;
+  vps0_l = vps0_l & HWM;
+  vps1_l = vps1_l & HWM;
+
+  __asm__ __volatile__ (
+      "shrl.ph      %[vqs0_r],       %[vqs0_r],       8   \n\t"
+      "shrl.ph      %[vps0_r],       %[vps0_r],       8   \n\t"
+      "shrl.ph      %[vqs1_r],       %[vqs1_r],       8   \n\t"
+      "shrl.ph      %[vps1_r],       %[vps1_r],       8   \n\t"
+
+      : [vps1_r] "+r" (vps1_r), [vqs1_r] "+r" (vqs1_r),
+        [vps0_r] "+r" (vps0_r), [vqs0_r] "+r" (vqs0_r)
+      :
+  );
+
+  vqs0 = vqs0_l | vqs0_r;
+  vqs1 = vqs1_l | vqs1_r;
+  vps0 = vps0_l | vps0_r;
+  vps1 = vps1_l | vps1_r;
+
+  *ps0 = vps0 ^ N128;
+  *ps1 = vps1 ^ N128;
+  *qs0 = vqs0 ^ N128;
+  *qs1 = vqs1 ^ N128;
+}
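vp9_filter_dspr2 above applies the standard 4-tap loop-filter update to four pixels at once, split into left/right halfword pairs so the saturating .ph arithmetic preserves 8-bit precision (the ^0x80808080 bias converts to and from signed values). Per pixel, the arithmetic mirrors the comments in the asm; a scalar sketch, where mask and hev are 0 or -1 per pixel (helper names illustrative, not part of the patch):

    static int8_t signed_char_clamp_sketch(int t) {
      return (int8_t)(t < -128 ? -128 : (t > 127 ? 127 : t));
    }

    /* Scalar, per-pixel sketch of the packed filter above. */
    static void filter4_sketch(int mask, int hev, int8_t *ps1, int8_t *ps0,
                               int8_t *qs0, int8_t *qs1) {
      int8_t filter, filter1, filter2;

      filter = signed_char_clamp_sketch(*ps1 - *qs1) & hev;
      filter = signed_char_clamp_sketch(filter + 3 * (*qs0 - *ps0)) & mask;

      /* round one side with +4 and the other with +3 */
      filter1 = signed_char_clamp_sketch(filter + 4) >> 3;
      filter2 = signed_char_clamp_sketch(filter + 3) >> 3;

      *qs0 = signed_char_clamp_sketch(*qs0 - filter1);
      *ps0 = signed_char_clamp_sketch(*ps0 + filter2);

      /* the outer taps only move where hev is not set */
      filter = (int8_t)(((filter1 + 1) >> 1) & ~hev);
      *qs1 = signed_char_clamp_sketch(*qs1 - filter);
      *ps1 = signed_char_clamp_sketch(*ps1 + filter);
    }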
+
+static INLINE void vp9_filter1_dspr2(uint32_t mask, uint32_t hev,
+                                     uint32_t ps1, uint32_t ps0,
+                                     uint32_t qs0, uint32_t qs1,
+                                     uint32_t *p1_f0, uint32_t *p0_f0,
+                                     uint32_t *q0_f0, uint32_t *q1_f0) {
+  int32_t   vp9_filter_l, vp9_filter_r;
+  int32_t   Filter1_l, Filter1_r, Filter2_l, Filter2_r;
+  int32_t   subr_r, subr_l;
+  uint32_t  t1, t2, HWM, t3;
+  uint32_t  hev_l, hev_r, mask_l, mask_r, invhev_l, invhev_r;
+  int32_t   vps1, vps0, vqs0, vqs1;
+  int32_t   vps1_l, vps1_r, vps0_l, vps0_r, vqs0_l, vqs0_r, vqs1_l, vqs1_r;
+  uint32_t  N128;
+
+  N128 = 0x80808080;
+  t1  = 0x03000300;
+  t2  = 0x04000400;
+  t3  = 0x01000100;
+  HWM = 0xFF00FF00;
+
+  vps0 = (ps0) ^ N128;
+  vps1 = (ps1) ^ N128;
+  vqs0 = (qs0) ^ N128;
+  vqs1 = (qs1) ^ N128;
+
+  /* use halfword pairs instead of quad-bytes for accuracy */
+  vps0_l = vps0 & HWM;
+  vps0_r = vps0 << 8;
+  vps0_r = vps0_r & HWM;
+
+  vps1_l = vps1 & HWM;
+  vps1_r = vps1 << 8;
+  vps1_r = vps1_r & HWM;
+
+  vqs0_l = vqs0 & HWM;
+  vqs0_r = vqs0 << 8;
+  vqs0_r = vqs0_r & HWM;
+
+  vqs1_l = vqs1 & HWM;
+  vqs1_r = vqs1 << 8;
+  vqs1_r = vqs1_r & HWM;
+
+  mask_l = mask & HWM;
+  mask_r = mask << 8;
+  mask_r = mask_r & HWM;
+
+  hev_l = hev & HWM;
+  hev_r = hev << 8;
+  hev_r = hev_r & HWM;
+
+  __asm__ __volatile__ (
+      /* vp9_filter = vp8_signed_char_clamp(ps1 - qs1); */
+      "subq_s.ph    %[vp9_filter_l], %[vps1_l],       %[vqs1_l]       \n\t"
+      "subq_s.ph    %[vp9_filter_r], %[vps1_r],       %[vqs1_r]       \n\t"
+
+      /* qs0 - ps0 */
+      "subq_s.ph    %[subr_l],       %[vqs0_l],       %[vps0_l]       \n\t"
+      "subq_s.ph    %[subr_r],       %[vqs0_r],       %[vps0_r]       \n\t"
+
+      /* vp9_filter &= hev; */
+      "and          %[vp9_filter_l], %[vp9_filter_l], %[hev_l]        \n\t"
+      "and          %[vp9_filter_r], %[vp9_filter_r], %[hev_r]        \n\t"
+
+      /* vp9_filter = vp8_signed_char_clamp(vp9_filter + 3 * (qs0 - ps0)); */
+      "addq_s.ph    %[vp9_filter_l], %[vp9_filter_l], %[subr_l]       \n\t"
+      "addq_s.ph    %[vp9_filter_r], %[vp9_filter_r], %[subr_r]       \n\t"
+      "xor          %[invhev_l],     %[hev_l],        %[HWM]          \n\t"
+      "addq_s.ph    %[vp9_filter_l], %[vp9_filter_l], %[subr_l]       \n\t"
+      "addq_s.ph    %[vp9_filter_r], %[vp9_filter_r], %[subr_r]       \n\t"
+      "xor          %[invhev_r],     %[hev_r],        %[HWM]          \n\t"
+      "addq_s.ph    %[vp9_filter_l], %[vp9_filter_l], %[subr_l]       \n\t"
+      "addq_s.ph    %[vp9_filter_r], %[vp9_filter_r], %[subr_r]       \n\t"
+
+      /* vp9_filter &= mask; */
+      "and          %[vp9_filter_l], %[vp9_filter_l], %[mask_l]       \n\t"
+      "and          %[vp9_filter_r], %[vp9_filter_r], %[mask_r]       \n\t"
+
+      : [vp9_filter_l] "=&r" (vp9_filter_l),
+        [vp9_filter_r] "=&r" (vp9_filter_r),
+        [subr_l] "=&r" (subr_l), [subr_r] "=&r" (subr_r),
+        [invhev_l] "=&r" (invhev_l), [invhev_r] "=&r" (invhev_r)
+      : [vps0_l] "r" (vps0_l), [vps0_r] "r" (vps0_r), [vps1_l] "r" (vps1_l),
+        [vps1_r] "r" (vps1_r), [vqs0_l] "r" (vqs0_l), [vqs0_r] "r" (vqs0_r),
+        [vqs1_l] "r" (vqs1_l), [vqs1_r] "r" (vqs1_r),
+        [mask_l] "r" (mask_l), [mask_r] "r" (mask_r),
+        [hev_l] "r" (hev_l), [hev_r] "r" (hev_r), [HWM] "r" (HWM)
+  );
+
+  /* save bottom 3 bits so that we round one side +4 and the other +3 */
+  __asm__ __volatile__ (
+      /* Filter1 = vp8_signed_char_clamp(vp9_filter + 4) >>= 3; */
+      "addq_s.ph    %[Filter1_l],    %[vp9_filter_l], %[t2]           \n\t"
+      "addq_s.ph    %[Filter1_r],    %[vp9_filter_r], %[t2]           \n\t"
+
+      /* Filter2 = vp8_signed_char_clamp(vp9_filter + 3) >>= 3; */
+      "addq_s.ph    %[Filter2_l],    %[vp9_filter_l], %[t1]           \n\t"
+      "addq_s.ph    %[Filter2_r],    %[vp9_filter_r], %[t1]           \n\t"
+      "shra.ph      %[Filter1_r],    %[Filter1_r],    3               \n\t"
+      "shra.ph      %[Filter1_l],    %[Filter1_l],    3               \n\t"
+
+      "shra.ph      %[Filter2_l],    %[Filter2_l],    3               \n\t"
+      "shra.ph      %[Filter2_r],    %[Filter2_r],    3               \n\t"
+
+      "and          %[Filter1_l],    %[Filter1_l],    %[HWM]          \n\t"
+      "and          %[Filter1_r],    %[Filter1_r],    %[HWM]          \n\t"
+
+      /* vps0 = vp8_signed_char_clamp(ps0 + Filter2); */
+      "addq_s.ph    %[vps0_l],       %[vps0_l],       %[Filter2_l]    \n\t"
+      "addq_s.ph    %[vps0_r],       %[vps0_r],       %[Filter2_r]    \n\t"
+
+      /* vqs0 = vp8_signed_char_clamp(qs0 - Filter1); */
+      "subq_s.ph    %[vqs0_l],       %[vqs0_l],       %[Filter1_l]    \n\t"
+      "subq_s.ph    %[vqs0_r],       %[vqs0_r],       %[Filter1_r]    \n\t"
+
+      : [Filter1_l] "=&r" (Filter1_l), [Filter1_r] "=&r" (Filter1_r),
+        [Filter2_l] "=&r" (Filter2_l), [Filter2_r] "=&r" (Filter2_r),
+        [vps0_l] "+r" (vps0_l), [vps0_r] "+r" (vps0_r),
+        [vqs0_l] "+r" (vqs0_l), [vqs0_r] "+r" (vqs0_r)
+      : [t1] "r" (t1), [t2] "r" (t2), [HWM] "r" (HWM),
+        [vp9_filter_l] "r" (vp9_filter_l), [vp9_filter_r] "r" (vp9_filter_r)
+  );
+
+  __asm__ __volatile__ (
+      /* vp9_filter = (Filter1 + 1) >> 1 */
+      "addqh.ph    %[Filter1_l],    %[Filter1_l],     %[t3]           \n\t"
+      "addqh.ph    %[Filter1_r],    %[Filter1_r],     %[t3]           \n\t"
+
+      /* vp9_filter &= ~hev; */
+      "and          %[Filter1_l],    %[Filter1_l],    %[invhev_l]     \n\t"
+      "and          %[Filter1_r],    %[Filter1_r],    %[invhev_r]     \n\t"
+
+      /* vps1 = vp8_signed_char_clamp(ps1 + vp9_filter); */
+      "addq_s.ph    %[vps1_l],       %[vps1_l],       %[Filter1_l]    \n\t"
+      "addq_s.ph    %[vps1_r],       %[vps1_r],       %[Filter1_r]    \n\t"
+
+      /* vqs1 = vp8_signed_char_clamp(qs1 - vp9_filter); */
+      "subq_s.ph    %[vqs1_l],       %[vqs1_l],       %[Filter1_l]    \n\t"
+      "subq_s.ph    %[vqs1_r],       %[vqs1_r],       %[Filter1_r]    \n\t"
+
+      : [Filter1_l] "+r" (Filter1_l), [Filter1_r] "+r" (Filter1_r),
+        [vps1_l] "+r" (vps1_l), [vps1_r] "+r" (vps1_r),
+        [vqs1_l] "+r" (vqs1_l), [vqs1_r] "+r" (vqs1_r)
+      : [t3] "r" (t3), [invhev_l] "r" (invhev_l), [invhev_r] "r" (invhev_r)
+  );
+
+  /* Create quad-bytes from halfword pairs */
+  vqs0_l = vqs0_l & HWM;
+  vqs1_l = vqs1_l & HWM;
+  vps0_l = vps0_l & HWM;
+  vps1_l = vps1_l & HWM;
+
+  __asm__ __volatile__ (
+      "shrl.ph      %[vqs0_r],       %[vqs0_r],       8   \n\t"
+      "shrl.ph      %[vps0_r],       %[vps0_r],       8   \n\t"
+      "shrl.ph      %[vqs1_r],       %[vqs1_r],       8   \n\t"
+      "shrl.ph      %[vps1_r],       %[vps1_r],       8   \n\t"
+
+      : [vps1_r] "+r" (vps1_r), [vqs1_r] "+r" (vqs1_r),
+        [vps0_r] "+r" (vps0_r), [vqs0_r] "+r" (vqs0_r)
+      :
+  );
+
+  vqs0 = vqs0_l | vqs0_r;
+  vqs1 = vqs1_l | vqs1_r;
+  vps0 = vps0_l | vps0_r;
+  vps1 = vps1_l | vps1_r;
+
+  *p0_f0 = vps0 ^ N128;
+  *p1_f0 = vps1 ^ N128;
+  *q0_f0 = vqs0 ^ N128;
+  *q1_f0 = vqs1 ^ N128;
+}
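+
+/* For reference, the halfword-pair arithmetic above appears to implement the
+   usual 4-tap loop-filter update on pixels biased by 0x80 (the N128 xor),
+   two pixels per register:
+
+     vp9_filter = clamp(ps1 - qs1) & hev;
+     vp9_filter = clamp(vp9_filter + 3 * (qs0 - ps0)) & mask;
+     Filter1 = clamp(vp9_filter + 4) >> 3;    qs0 = clamp(qs0 - Filter1);
+     Filter2 = clamp(vp9_filter + 3) >> 3;    ps0 = clamp(ps0 + Filter2);
+     vp9_filter = ((Filter1 + 1) >> 1) & ~hev;
+     ps1 = clamp(ps1 + vp9_filter);           qs1 = clamp(qs1 - vp9_filter);
+
+   where clamp() stands for signed-char saturation (scalar sketch only). */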
+
+static INLINE void vp9_mbfilter_dspr2(uint32_t *op3, uint32_t *op2,
+                                      uint32_t *op1, uint32_t *op0,
+                                      uint32_t *oq0, uint32_t *oq1,
+                                      uint32_t *oq2, uint32_t *oq3) {
+  /* use a 7 tap filter [1, 1, 1, 2, 1, 1, 1] for flat line */
+  const uint32_t p3 = *op3, p2 = *op2, p1 = *op1, p0 = *op0;
+  const uint32_t q0 = *oq0, q1 = *oq1, q2 = *oq2, q3 = *oq3;
+  uint32_t       res_op2, res_op1, res_op0;
+  uint32_t       res_oq0, res_oq1, res_oq2;
+  uint32_t       tmp;
+  uint32_t       add_p210_q012;
+  uint32_t       u32Four = 0x00040004;
+
+  /* *op2 = ROUND_POWER_OF_TWO(p3 + p3 + p3 + p2 + p2 + p1 + p0 + q0, 3)  1 */
+  /* *op1 = ROUND_POWER_OF_TWO(p3 + p3 + p2 + p1 + p1 + p0 + q0 + q1, 3)  2 */
+  /* *op0 = ROUND_POWER_OF_TWO(p3 + p2 + p1 + p0 + p0 + q0 + q1 + q2, 3)  3 */
+  /* *oq0 = ROUND_POWER_OF_TWO(p2 + p1 + p0 + q0 + q0 + q1 + q2 + q3, 3)  4 */
+  /* *oq1 = ROUND_POWER_OF_TWO(p1 + p0 + q0 + q1 + q1 + q2 + q3 + q3, 3)  5 */
+  /* *oq2 = ROUND_POWER_OF_TWO(p0 + q0 + q1 + q2 + q2 + q3 + q3 + q3, 3)  6 */
+
+  __asm__ __volatile__ (
+      "addu.ph    %[add_p210_q012],  %[p2],             %[p1]            \n\t"
+      "addu.ph    %[add_p210_q012],  %[add_p210_q012],  %[p0]            \n\t"
+      "addu.ph    %[add_p210_q012],  %[add_p210_q012],  %[q0]            \n\t"
+      "addu.ph    %[add_p210_q012],  %[add_p210_q012],  %[q1]            \n\t"
+      "addu.ph    %[add_p210_q012],  %[add_p210_q012],  %[q2]            \n\t"
+      "addu.ph    %[add_p210_q012],  %[add_p210_q012],  %[u32Four]       \n\t"
+
+      "shll.ph    %[tmp],            %[p3],             1                \n\t"
+      "addu.ph    %[res_op2],        %[tmp],            %[p3]            \n\t"
+      "addu.ph    %[res_op1],        %[p3],             %[p3]            \n\t"
+      "addu.ph    %[res_op2],        %[res_op2],        %[p2]            \n\t"
+      "addu.ph    %[res_op1],        %[res_op1],        %[p1]            \n\t"
+      "addu.ph    %[res_op2],        %[res_op2],        %[add_p210_q012] \n\t"
+      "addu.ph    %[res_op1],        %[res_op1],        %[add_p210_q012] \n\t"
+      "subu.ph    %[res_op2],        %[res_op2],        %[q1]            \n\t"
+      "subu.ph    %[res_op1],        %[res_op1],        %[q2]            \n\t"
+      "subu.ph    %[res_op2],        %[res_op2],        %[q2]            \n\t"
+      "shrl.ph    %[res_op1],        %[res_op1],        3                \n\t"
+      "shrl.ph    %[res_op2],        %[res_op2],        3                \n\t"
+      "addu.ph    %[res_op0],        %[p3],             %[p0]            \n\t"
+      "addu.ph    %[res_oq0],        %[q0],             %[q3]            \n\t"
+      "addu.ph    %[res_op0],        %[res_op0],        %[add_p210_q012] \n\t"
+      "addu.ph    %[res_oq0],        %[res_oq0],        %[add_p210_q012] \n\t"
+      "addu.ph    %[res_oq1],        %[q3],             %[q3]            \n\t"
+      "shll.ph    %[tmp],            %[q3],             1                \n\t"
+      "addu.ph    %[res_oq1],        %[res_oq1],        %[q1]            \n\t"
+      "addu.ph    %[res_oq2],        %[tmp],            %[q3]            \n\t"
+      "addu.ph    %[res_oq1],        %[res_oq1],        %[add_p210_q012] \n\t"
+      "addu.ph    %[res_oq2],        %[res_oq2],        %[add_p210_q012] \n\t"
+      "subu.ph    %[res_oq1],        %[res_oq1],        %[p2]            \n\t"
+      "addu.ph    %[res_oq2],        %[res_oq2],        %[q2]            \n\t"
+      "shrl.ph    %[res_oq1],        %[res_oq1],        3                \n\t"
+      "subu.ph    %[res_oq2],        %[res_oq2],        %[p2]            \n\t"
+      "shrl.ph    %[res_oq0],        %[res_oq0],        3                \n\t"
+      "subu.ph    %[res_oq2],        %[res_oq2],        %[p1]            \n\t"
+      "shrl.ph    %[res_op0],        %[res_op0],        3                \n\t"
+      "shrl.ph    %[res_oq2],        %[res_oq2],        3                \n\t"
+
+      : [add_p210_q012] "=&r" (add_p210_q012),
+        [tmp] "=&r" (tmp), [res_op2] "=&r" (res_op2),
+        [res_op1] "=&r" (res_op1), [res_op0] "=&r" (res_op0),
+        [res_oq0] "=&r" (res_oq0), [res_oq1] "=&r" (res_oq1),
+        [res_oq2] "=&r" (res_oq2)
+      : [p0] "r" (p0), [q0] "r" (q0), [p1] "r" (p1), [q1] "r" (q1),
+        [p2] "r" (p2), [q2] "r" (q2), [p3] "r" (p3), [q3] "r" (q3),
+        [u32Four] "r" (u32Four)
+  );
+
+  *op2 = res_op2;
+  *op1 = res_op1;
+  *op0 = res_op0;
+  *oq0 = res_oq0;
+  *oq1 = res_oq1;
+  *oq2 = res_oq2;
+}
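+
+/* The 7-tap outputs above are built from the shared sum
+   add_p210_q012 = p2 + p1 + p0 + q0 + q1 + q2 + 4 (rounding included), so
+   each result only adds/subtracts the taps that differ from that sum, e.g.
+     res_op2 = (3 * p3 + p2 + add_p210_q012 - q1 - q2) >> 3
+             = ROUND_POWER_OF_TWO(p3 + p3 + p3 + p2 + p2 + p1 + p0 + q0, 3). */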
+
+static INLINE void vp9_mbfilter1_dspr2(uint32_t p3, uint32_t p2,
+                                       uint32_t p1, uint32_t p0,
+                                       uint32_t q0, uint32_t q1,
+                                       uint32_t q2, uint32_t q3,
+                                       uint32_t *op2_f1,
+                                       uint32_t *op1_f1, uint32_t *op0_f1,
+                                       uint32_t *oq0_f1, uint32_t *oq1_f1,
+                                       uint32_t *oq2_f1) {
+  /* use a 7 tap filter [1, 1, 1, 2, 1, 1, 1] for flat line */
+  uint32_t  res_op2, res_op1, res_op0;
+  uint32_t  res_oq0, res_oq1, res_oq2;
+  uint32_t  tmp;
+  uint32_t  add_p210_q012;
+  uint32_t  u32Four = 0x00040004;
+
+  /* *op2 = ROUND_POWER_OF_TWO(p3 + p3 + p3 + p2 + p2 + p1 + p0 + q0, 3)   1 */
+  /* *op1 = ROUND_POWER_OF_TWO(p3 + p3 + p2 + p1 + p1 + p0 + q0 + q1, 3)   2 */
+  /* *op0 = ROUND_POWER_OF_TWO(p3 + p2 + p1 + p0 + p0 + q0 + q1 + q2, 3)   3 */
+  /* *oq0 = ROUND_POWER_OF_TWO(p2 + p1 + p0 + q0 + q0 + q1 + q2 + q3, 3)   4 */
+  /* *oq1 = ROUND_POWER_OF_TWO(p1 + p0 + q0 + q1 + q1 + q2 + q3 + q3, 3)   5 */
+  /* *oq2 = ROUND_POWER_OF_TWO(p0 + q0 + q1 + q2 + q2 + q3 + q3 + q3, 3)   6 */
+
+  __asm__ __volatile__ (
+      "addu.ph    %[add_p210_q012],  %[p2],             %[p1]             \n\t"
+      "addu.ph    %[add_p210_q012],  %[add_p210_q012],  %[p0]             \n\t"
+      "addu.ph    %[add_p210_q012],  %[add_p210_q012],  %[q0]             \n\t"
+      "addu.ph    %[add_p210_q012],  %[add_p210_q012],  %[q1]             \n\t"
+      "addu.ph    %[add_p210_q012],  %[add_p210_q012],  %[q2]             \n\t"
+      "addu.ph    %[add_p210_q012],  %[add_p210_q012],  %[u32Four]        \n\t"
+
+      "shll.ph    %[tmp],            %[p3],             1                 \n\t"
+      "addu.ph    %[res_op2],        %[tmp],            %[p3]             \n\t"
+      "addu.ph    %[res_op1],        %[p3],             %[p3]             \n\t"
+      "addu.ph    %[res_op2],        %[res_op2],        %[p2]             \n\t"
+      "addu.ph    %[res_op1],        %[res_op1],        %[p1]             \n\t"
+      "addu.ph    %[res_op2],        %[res_op2],        %[add_p210_q012]  \n\t"
+      "addu.ph    %[res_op1],        %[res_op1],        %[add_p210_q012]  \n\t"
+      "subu.ph    %[res_op2],        %[res_op2],        %[q1]             \n\t"
+      "subu.ph    %[res_op1],        %[res_op1],        %[q2]             \n\t"
+      "subu.ph    %[res_op2],        %[res_op2],        %[q2]             \n\t"
+      "shrl.ph    %[res_op1],        %[res_op1],        3                 \n\t"
+      "shrl.ph    %[res_op2],        %[res_op2],        3                 \n\t"
+      "addu.ph    %[res_op0],        %[p3],             %[p0]             \n\t"
+      "addu.ph    %[res_oq0],        %[q0],             %[q3]             \n\t"
+      "addu.ph    %[res_op0],        %[res_op0],        %[add_p210_q012]  \n\t"
+      "addu.ph    %[res_oq0],        %[res_oq0],        %[add_p210_q012]  \n\t"
+      "addu.ph    %[res_oq1],        %[q3],             %[q3]             \n\t"
+      "shll.ph    %[tmp],            %[q3],             1                 \n\t"
+      "addu.ph    %[res_oq1],        %[res_oq1],        %[q1]             \n\t"
+      "addu.ph    %[res_oq2],        %[tmp],            %[q3]             \n\t"
+      "addu.ph    %[res_oq1],        %[res_oq1],        %[add_p210_q012]  \n\t"
+      "addu.ph    %[res_oq2],        %[res_oq2],        %[add_p210_q012]  \n\t"
+      "subu.ph    %[res_oq1],        %[res_oq1],        %[p2]             \n\t"
+      "addu.ph    %[res_oq2],        %[res_oq2],        %[q2]             \n\t"
+      "shrl.ph    %[res_oq1],        %[res_oq1],        3                 \n\t"
+      "subu.ph    %[res_oq2],        %[res_oq2],        %[p2]             \n\t"
+      "shrl.ph    %[res_oq0],        %[res_oq0],        3                 \n\t"
+      "subu.ph    %[res_oq2],        %[res_oq2],        %[p1]             \n\t"
+      "shrl.ph    %[res_op0],        %[res_op0],        3                 \n\t"
+      "shrl.ph    %[res_oq2],        %[res_oq2],        3                 \n\t"
+
+      : [add_p210_q012] "=&r" (add_p210_q012), [tmp] "=&r" (tmp),
+        [res_op2] "=&r" (res_op2), [res_op1] "=&r" (res_op1),
+        [res_op0] "=&r" (res_op0), [res_oq0] "=&r" (res_oq0),
+        [res_oq1] "=&r" (res_oq1), [res_oq2] "=&r" (res_oq2)
+      : [p0] "r" (p0), [q0] "r" (q0), [p1] "r" (p1), [q1] "r" (q1),
+        [p2] "r" (p2), [q2] "r" (q2), [p3] "r" (p3), [q3] "r" (q3),
+        [u32Four] "r" (u32Four)
+  );
+
+  *op2_f1 = res_op2;
+  *op1_f1 = res_op1;
+  *op0_f1 = res_op0;
+  *oq0_f1 = res_oq0;
+  *oq1_f1 = res_oq1;
+  *oq2_f1 = res_oq2;
+}
+
+static INLINE void vp9_wide_mbfilter_dspr2(uint32_t *op7, uint32_t *op6,
+                                           uint32_t *op5, uint32_t *op4,
+                                           uint32_t *op3, uint32_t *op2,
+                                           uint32_t *op1, uint32_t *op0,
+                                           uint32_t *oq0, uint32_t *oq1,
+                                           uint32_t *oq2, uint32_t *oq3,
+                                           uint32_t *oq4, uint32_t *oq5,
+                                           uint32_t *oq6, uint32_t *oq7) {
+  const uint32_t p7 = *op7, p6 = *op6, p5 = *op5, p4 = *op4;
+  const uint32_t p3 = *op3, p2 = *op2, p1 = *op1, p0 = *op0;
+  const uint32_t q0 = *oq0, q1 = *oq1, q2 = *oq2, q3 = *oq3;
+  const uint32_t q4 = *oq4, q5 = *oq5, q6 = *oq6, q7 = *oq7;
+  uint32_t       res_op6, res_op5, res_op4, res_op3, res_op2, res_op1, res_op0;
+  uint32_t       res_oq0, res_oq1, res_oq2, res_oq3, res_oq4, res_oq5, res_oq6;
+  uint32_t       tmp;
+  uint32_t       add_p6toq6;
+  uint32_t       u32Eight = 0x00080008;
+
+  __asm__ __volatile__ (
+      /* running sum of p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6
+         plus the rounding constant; reused by every output below */
+      "addu.ph      %[add_p6toq6],     %[p6],              %[p5]         \n\t"
+      "addu.ph      %[add_p6toq6],     %[add_p6toq6],      %[p4]         \n\t"
+      "addu.ph      %[add_p6toq6],     %[add_p6toq6],      %[p3]         \n\t"
+      "addu.ph      %[add_p6toq6],     %[add_p6toq6],      %[p2]         \n\t"
+      "addu.ph      %[add_p6toq6],     %[add_p6toq6],      %[p1]         \n\t"
+      "addu.ph      %[add_p6toq6],     %[add_p6toq6],      %[p0]         \n\t"
+      "addu.ph      %[add_p6toq6],     %[add_p6toq6],      %[q0]         \n\t"
+      "addu.ph      %[add_p6toq6],     %[add_p6toq6],      %[q1]         \n\t"
+      "addu.ph      %[add_p6toq6],     %[add_p6toq6],      %[q2]         \n\t"
+      "addu.ph      %[add_p6toq6],     %[add_p6toq6],      %[q3]         \n\t"
+      "addu.ph      %[add_p6toq6],     %[add_p6toq6],      %[q4]         \n\t"
+      "addu.ph      %[add_p6toq6],     %[add_p6toq6],      %[q5]         \n\t"
+      "addu.ph      %[add_p6toq6],     %[add_p6toq6],      %[q6]         \n\t"
+      "addu.ph      %[add_p6toq6],     %[add_p6toq6],      %[u32Eight]   \n\t"
+
+      : [add_p6toq6] "=&r" (add_p6toq6)
+      : [p6] "r" (p6), [p5] "r" (p5), [p4] "r" (p4),
+        [p3] "r" (p3), [p2] "r" (p2), [p1] "r" (p1), [p0] "r" (p0),
+        [q0] "r" (q0), [q1] "r" (q1), [q2] "r" (q2), [q3] "r" (q3),
+        [q4] "r" (q4), [q5] "r" (q5), [q6] "r" (q6),
+        [u32Eight] "r" (u32Eight)
+  );
+
+  __asm__ __volatile__ (
+      /* *op6 = ROUND_POWER_OF_TWO(p7 * 7 + p6 * 2 + p5 + p4 +
+                                   p3 + p2 + p1 + p0 + q0, 4) */
+      "shll.ph       %[tmp],            %[p7],            3               \n\t"
+      "subu.ph       %[res_op6],        %[tmp],           %[p7]           \n\t"
+      "addu.ph       %[res_op6],        %[res_op6],       %[p6]           \n\t"
+      "addu.ph       %[res_op6],        %[res_op6],       %[add_p6toq6]   \n\t"
+      "subu.ph       %[res_op6],        %[res_op6],       %[q1]           \n\t"
+      "subu.ph       %[res_op6],        %[res_op6],       %[q2]           \n\t"
+      "subu.ph       %[res_op6],        %[res_op6],       %[q3]           \n\t"
+      "subu.ph       %[res_op6],        %[res_op6],       %[q4]           \n\t"
+      "subu.ph       %[res_op6],        %[res_op6],       %[q5]           \n\t"
+      "subu.ph       %[res_op6],        %[res_op6],       %[q6]           \n\t"
+      "shrl.ph       %[res_op6],        %[res_op6],       4               \n\t"
+
+      /* *op5 = ROUND_POWER_OF_TWO(p7 * 6 + p6 + p5 * 2 + p4 + p3 +
+                                   p2 + p1 + p0 + q0 + q1, 4) */
+      "shll.ph       %[tmp],            %[p7],            2               \n\t"
+      "addu.ph       %[res_op5],        %[tmp],           %[p7]           \n\t"
+      "addu.ph       %[res_op5],        %[res_op5],       %[p7]           \n\t"
+      "addu.ph       %[res_op5],        %[res_op5],       %[p5]           \n\t"
+      "addu.ph       %[res_op5],        %[res_op5],       %[add_p6toq6]   \n\t"
+      "subu.ph       %[res_op5],        %[res_op5],       %[q2]           \n\t"
+      "subu.ph       %[res_op5],        %[res_op5],       %[q3]           \n\t"
+      "subu.ph       %[res_op5],        %[res_op5],       %[q4]           \n\t"
+      "subu.ph       %[res_op5],        %[res_op5],       %[q5]           \n\t"
+      "subu.ph       %[res_op5],        %[res_op5],       %[q6]           \n\t"
+      "shrl.ph       %[res_op5],        %[res_op5],       4               \n\t"
+
+      /* *op4 = ROUND_POWER_OF_TWO(p7 * 5 + p6 + p5 + p4 * 2 + p3 + p2 +
+                                   p1 + p0 + q0 + q1 + q2, 4) */
+      "shll.ph       %[tmp],            %[p7],            2               \n\t"
+      "addu.ph       %[res_op4],        %[tmp],           %[p7]           \n\t"
+      "addu.ph       %[res_op4],        %[res_op4],       %[p4]           \n\t"
+      "addu.ph       %[res_op4],        %[res_op4],       %[add_p6toq6]   \n\t"
+      "subu.ph       %[res_op4],        %[res_op4],       %[q3]           \n\t"
+      "subu.ph       %[res_op4],        %[res_op4],       %[q4]           \n\t"
+      "subu.ph       %[res_op4],        %[res_op4],       %[q5]           \n\t"
+      "subu.ph       %[res_op4],        %[res_op4],       %[q6]           \n\t"
+      "shrl.ph       %[res_op4],        %[res_op4],       4               \n\t"
+
+      /* *op3 = ROUND_POWER_OF_TWO(p7 * 4 + p6 + p5 + p4 + p3 * 2 + p2 +
+                                   p1 + p0 + q0 + q1 + q2 + q3, 4) */
+      "shll.ph       %[tmp],            %[p7],            2               \n\t"
+      "addu.ph       %[res_op3],        %[tmp],           %[p3]           \n\t"
+      "addu.ph       %[res_op3],        %[res_op3],       %[add_p6toq6]   \n\t"
+      "subu.ph       %[res_op3],        %[res_op3],       %[q4]           \n\t"
+      "subu.ph       %[res_op3],        %[res_op3],       %[q5]           \n\t"
+      "subu.ph       %[res_op3],        %[res_op3],       %[q6]           \n\t"
+      "shrl.ph       %[res_op3],        %[res_op3],       4               \n\t"
+
+      /* *op2 = ROUND_POWER_OF_TWO(p7 * 3 + p6 + p5 + p4 + p3 + p2 * 2 + p1 +
+                                   p0 + q0 + q1 + q2 + q3 + q4, 4) */
+      "shll.ph       %[tmp],            %[p7],            1               \n\t"
+      "addu.ph       %[res_op2],        %[tmp],           %[p7]           \n\t"
+      "addu.ph       %[res_op2],        %[res_op2],       %[p2]           \n\t"
+      "addu.ph       %[res_op2],        %[res_op2],       %[add_p6toq6]   \n\t"
+      "subu.ph       %[res_op2],        %[res_op2],       %[q5]           \n\t"
+      "subu.ph       %[res_op2],        %[res_op2],       %[q6]           \n\t"
+      "shrl.ph       %[res_op2],        %[res_op2],       4               \n\t"
+
+      /* *op1 = ROUND_POWER_OF_TWO(p7 * 2 + p6 + p5 + p4 + p3 + p2 + p1 * 2 +
+                                   p0 + q0 + q1 + q2 + q3 + q4 + q5, 4); */
+      "shll.ph       %[tmp],            %[p7],            1               \n\t"
+      "addu.ph       %[res_op1],        %[tmp],           %[p1]           \n\t"
+      "addu.ph       %[res_op1],        %[res_op1],       %[add_p6toq6]   \n\t"
+      "subu.ph       %[res_op1],        %[res_op1],       %[q6]           \n\t"
+      "shrl.ph       %[res_op1],        %[res_op1],       4               \n\t"
+
+      /* *op0 = ROUND_POWER_OF_TWO(p7 + p6 + p5 + p4 + p3 + p2 + p1 + p0 * 2 +
+                                  q0 + q1 + q2 + q3 + q4 + q5 + q6, 4) */
+      "addu.ph       %[res_op0],        %[p7],            %[p0]           \n\t"
+      "addu.ph       %[res_op0],        %[res_op0],       %[add_p6toq6]   \n\t"
+      "shrl.ph       %[res_op0],        %[res_op0],       4               \n\t"
+
+      : [res_op6] "=&r" (res_op6), [res_op5] "=&r" (res_op5),
+        [res_op4] "=&r" (res_op4), [res_op3] "=&r" (res_op3),
+        [res_op2] "=&r" (res_op2), [res_op1] "=&r" (res_op1),
+        [res_op0] "=&r" (res_op0), [tmp] "=&r" (tmp)
+      : [p7] "r" (p7), [p6] "r" (p6), [p5] "r" (p5), [p4] "r" (p4),
+        [p3] "r" (p3), [p2] "r" (p2), [p1] "r" (p1), [p0] "r" (p0),
+        [q2] "r" (q2), [q1] "r" (q1),
+        [q3] "r" (q3), [q4] "r" (q4), [q5] "r" (q5), [q6] "r" (q6),
+        [add_p6toq6] "r" (add_p6toq6)
+  );
+
+  *op6 = res_op6;
+  *op5 = res_op5;
+  *op4 = res_op4;
+  *op3 = res_op3;
+  *op2 = res_op2;
+  *op1 = res_op1;
+  *op0 = res_op0;
+
+  __asm__ __volatile__ (
+      /* *oq0 = ROUND_POWER_OF_TWO(p6 + p5 + p4 + p3 + p2 + p1 + p0 + q0 * 2 +
+                                   q1 + q2 + q3 + q4 + q5 + q6 + q7, 4); */
+      "addu.ph       %[res_oq0],        %[q7],            %[q0]           \n\t"
+      "addu.ph       %[res_oq0],        %[res_oq0],       %[add_p6toq6]   \n\t"
+      "shrl.ph       %[res_oq0],        %[res_oq0],       4               \n\t"
+
+      /* *oq1 = ROUND_POWER_OF_TWO(p5 + p4 + p3 + p2 + p1 + p0 + q0 + q1 * 2 +
+                                   q2 + q3 + q4 + q5 + q6 + q7 * 2, 4) */
+      "shll.ph       %[tmp],            %[q7],            1               \n\t"
+      "addu.ph       %[res_oq1],        %[tmp],           %[q1]           \n\t"
+      "addu.ph       %[res_oq1],        %[res_oq1],       %[add_p6toq6]   \n\t"
+      "subu.ph       %[res_oq1],        %[res_oq1],       %[p6]           \n\t"
+      "shrl.ph       %[res_oq1],        %[res_oq1],       4               \n\t"
+
+      /* *oq2 = ROUND_POWER_OF_TWO(p4 + p3 + p2 + p1 + p0 + q0 + q1 + q2 * 2 +
+                                   q3 + q4 + q5 + q6 + q7 * 3, 4) */
+      "shll.ph       %[tmp],            %[q7],            1               \n\t"
+      "addu.ph       %[res_oq2],        %[tmp],           %[q7]           \n\t"
+      "addu.ph       %[res_oq2],        %[res_oq2],       %[q2]           \n\t"
+      "addu.ph       %[res_oq2],        %[res_oq2],       %[add_p6toq6]   \n\t"
+      "subu.ph       %[res_oq2],        %[res_oq2],       %[p5]           \n\t"
+      "subu.ph       %[res_oq2],        %[res_oq2],       %[p6]           \n\t"
+      "shrl.ph       %[res_oq2],        %[res_oq2],       4               \n\t"
+
+      /* *oq3 = ROUND_POWER_OF_TWO(p3 + p2 + p1 + p0 + q0 + q1 + q2 +
+                                   q3 * 2 + q4 + q5 + q6 + q7 * 4, 4) */
+      "shll.ph       %[tmp],            %[q7],            2               \n\t"
+      "addu.ph       %[res_oq3],        %[tmp],           %[q3]           \n\t"
+      "addu.ph       %[res_oq3],        %[res_oq3],       %[add_p6toq6]   \n\t"
+      "subu.ph       %[res_oq3],        %[res_oq3],       %[p4]           \n\t"
+      "subu.ph       %[res_oq3],        %[res_oq3],       %[p5]           \n\t"
+      "subu.ph       %[res_oq3],        %[res_oq3],       %[p6]           \n\t"
+      "shrl.ph       %[res_oq3],        %[res_oq3],       4               \n\t"
+
+      /* *oq4 = ROUND_POWER_OF_TWO(p2 + p1 + p0 + q0 + q1 + q2 + q3 +
+                                   q4 * 2 + q5 + q6 + q7 * 5, 4) */
+      "shll.ph       %[tmp],            %[q7],            2               \n\t"
+      "addu.ph       %[res_oq4],        %[tmp],           %[q7]           \n\t"
+      "addu.ph       %[res_oq4],        %[res_oq4],       %[q4]           \n\t"
+      "addu.ph       %[res_oq4],        %[res_oq4],       %[add_p6toq6]   \n\t"
+      "subu.ph       %[res_oq4],        %[res_oq4],       %[p3]           \n\t"
+      "subu.ph       %[res_oq4],        %[res_oq4],       %[p4]           \n\t"
+      "subu.ph       %[res_oq4],        %[res_oq4],       %[p5]           \n\t"
+      "subu.ph       %[res_oq4],        %[res_oq4],       %[p6]           \n\t"
+      "shrl.ph       %[res_oq4],        %[res_oq4],       4               \n\t"
+
+      /* *oq5 = ROUND_POWER_OF_TWO(p1 + p0 + q0 + q1 + q2 + q3 + q4 +
+                                   q5 * 2 + q6 + q7 * 6, 4) */
+      "shll.ph       %[tmp],            %[q7],            2               \n\t"
+      "addu.ph       %[res_oq5],        %[tmp],           %[q7]           \n\t"
+      "addu.ph       %[res_oq5],        %[res_oq5],       %[q7]           \n\t"
+      "addu.ph       %[res_oq5],        %[res_oq5],       %[q5]           \n\t"
+      "addu.ph       %[res_oq5],        %[res_oq5],       %[add_p6toq6]   \n\t"
+      "subu.ph       %[res_oq5],        %[res_oq5],       %[p2]           \n\t"
+      "subu.ph       %[res_oq5],        %[res_oq5],       %[p3]           \n\t"
+      "subu.ph       %[res_oq5],        %[res_oq5],       %[p4]           \n\t"
+      "subu.ph       %[res_oq5],        %[res_oq5],       %[p5]           \n\t"
+      "subu.ph       %[res_oq5],        %[res_oq5],       %[p6]           \n\t"
+      "shrl.ph       %[res_oq5],        %[res_oq5],       4               \n\t"
+
+      /* *oq6 = ROUND_POWER_OF_TWO(p0 + q0 + q1 + q2 + q3 +
+                                   q4 + q5 + q6 * 2 + q7 * 7, 4) */
+      "shll.ph       %[tmp],            %[q7],            3               \n\t"
+      "subu.ph       %[res_oq6],        %[tmp],           %[q7]           \n\t"
+      "addu.ph       %[res_oq6],        %[res_oq6],       %[q6]           \n\t"
+      "addu.ph       %[res_oq6],        %[res_oq6],       %[add_p6toq6]   \n\t"
+      "subu.ph       %[res_oq6],        %[res_oq6],       %[p1]           \n\t"
+      "subu.ph       %[res_oq6],        %[res_oq6],       %[p2]           \n\t"
+      "subu.ph       %[res_oq6],        %[res_oq6],       %[p3]           \n\t"
+      "subu.ph       %[res_oq6],        %[res_oq6],       %[p4]           \n\t"
+      "subu.ph       %[res_oq6],        %[res_oq6],       %[p5]           \n\t"
+      "subu.ph       %[res_oq6],        %[res_oq6],       %[p6]           \n\t"
+      "shrl.ph       %[res_oq6],        %[res_oq6],       4               \n\t"
+
+      : [res_oq6] "=&r" (res_oq6), [res_oq5] "=&r" (res_oq5),
+        [res_oq4] "=&r" (res_oq4), [res_oq3] "=&r" (res_oq3),
+        [res_oq2] "=&r" (res_oq2), [res_oq1] "=&r" (res_oq1),
+        [res_oq0] "=&r" (res_oq0), [tmp] "=&r" (tmp)
+      : [q7] "r" (q7), [q6] "r" (q6), [q5] "r" (q5), [q4] "r" (q4),
+        [q3] "r" (q3), [q2] "r" (q2), [q1] "r" (q1), [q0] "r" (q0),
+        [p1] "r" (p1), [p2] "r" (p2),
+        [p3] "r" (p3), [p4] "r" (p4), [p5] "r" (p5), [p6] "r" (p6),
+        [add_p6toq6] "r" (add_p6toq6)
+  );
+
+  *oq0 = res_oq0;
+  *oq1 = res_oq1;
+  *oq2 = res_oq2;
+  *oq3 = res_oq3;
+  *oq4 = res_oq4;
+  *oq5 = res_oq5;
+  *oq6 = res_oq6;
+}
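+
+/* The 15-tap outputs above reuse add_p6toq6 = p6 + ... + q6 + 8 the same way:
+   each filtered pixel adds its extra copies of p7/q7 and its doubled centre
+   tap, subtracts the taps outside its window, then shifts by 4, e.g.
+     res_op6 = (7 * p7 + p6 + add_p6toq6 - q1 - q2 - q3 - q4 - q5 - q6) >> 4
+             = ROUND_POWER_OF_TWO(p7 * 7 + p6 * 2 + p5 + p4 + p3 + p2 + p1 +
+                                  p0 + q0, 4). */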
+#endif  // #if HAVE_DSPR2
+#endif  // VP9_COMMON_MIPS_DSPR2_VP9_LOOPFILTER_FILTERS_DSPR2_H_
diff --git a/vp9/common/mips/dspr2/vp9_loopfilter_macros_dspr2.h b/vp9/common/mips/dspr2/vp9_loopfilter_macros_dspr2.h
new file mode 100644
index 0000000..4cb2ebb
--- /dev/null
+++ b/vp9/common/mips/dspr2/vp9_loopfilter_macros_dspr2.h
@@ -0,0 +1,470 @@
+/*
+ *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP9_COMMON_MIPS_DSPR2_VP9_LOOPFILTER_MACROS_DSPR2_H_
+#define VP9_COMMON_MIPS_DSPR2_VP9_LOOPFILTER_MACROS_DSPR2_H_
+
+#include <stdlib.h>
+
+#include "./vp9_rtcd.h"
+#include "vp9/common/vp9_common.h"
+#include "vp9/common/vp9_onyxc_int.h"
+
+#if HAVE_DSPR2
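+/* The STORE_F* macros below write packed filter results back to four adjacent
+   rows (the pointers s1..s4 are assumed to be defined by the calling
+   function).  Each byte lane of a packed word belongs to one row: the low
+   byte is stored at s4 and the word is shifted right (by 8 for the packed f0
+   results, by 16 for the halfword-pair results) to reach the next row. */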
+#define STORE_F0() {                                                    \
+    __asm__ __volatile__ (                                              \
+        "sb     %[q1_f0],    1(%[s4])           \n\t"                   \
+        "sb     %[q0_f0],    0(%[s4])           \n\t"                   \
+        "sb     %[p0_f0],   -1(%[s4])           \n\t"                   \
+        "sb     %[p1_f0],   -2(%[s4])           \n\t"                   \
+                                                                        \
+        :                                                               \
+        : [q1_f0] "r" (q1_f0), [q0_f0] "r" (q0_f0),                     \
+          [p0_f0] "r" (p0_f0), [p1_f0] "r" (p1_f0),                     \
+          [s4] "r" (s4)                                                 \
+    );                                                                  \
+                                                                        \
+    __asm__ __volatile__ (                                              \
+        "srl    %[q1_f0],   %[q1_f0],   8       \n\t"                   \
+        "srl    %[q0_f0],   %[q0_f0],   8       \n\t"                   \
+        "srl    %[p0_f0],   %[p0_f0],   8       \n\t"                   \
+        "srl    %[p1_f0],   %[p1_f0],   8       \n\t"                   \
+                                                                        \
+        : [q1_f0] "+r" (q1_f0), [q0_f0] "+r" (q0_f0),                   \
+          [p0_f0] "+r" (p0_f0), [p1_f0] "+r" (p1_f0)                    \
+        :                                                               \
+    );                                                                  \
+                                                                        \
+    __asm__ __volatile__ (                                              \
+        "sb     %[q1_f0],    1(%[s3])           \n\t"                   \
+        "sb     %[q0_f0],    0(%[s3])           \n\t"                   \
+        "sb     %[p0_f0],   -1(%[s3])           \n\t"                   \
+        "sb     %[p1_f0],   -2(%[s3])           \n\t"                   \
+                                                                        \
+        : [p1_f0] "+r" (p1_f0)                                          \
+        : [q1_f0] "r" (q1_f0), [q0_f0] "r" (q0_f0),                     \
+          [s3] "r" (s3), [p0_f0] "r" (p0_f0)                            \
+    );                                                                  \
+                                                                        \
+    __asm__ __volatile__ (                                              \
+        "srl    %[q1_f0],   %[q1_f0],   8       \n\t"                   \
+        "srl    %[q0_f0],   %[q0_f0],   8       \n\t"                   \
+        "srl    %[p0_f0],   %[p0_f0],   8       \n\t"                   \
+        "srl    %[p1_f0],   %[p1_f0],   8       \n\t"                   \
+                                                                        \
+        : [q1_f0] "+r" (q1_f0), [q0_f0] "+r" (q0_f0),                   \
+          [p0_f0] "+r" (p0_f0), [p1_f0] "+r" (p1_f0)                    \
+        :                                                               \
+    );                                                                  \
+                                                                        \
+    __asm__ __volatile__ (                                              \
+        "sb     %[q1_f0],    1(%[s2])           \n\t"                   \
+        "sb     %[q0_f0],    0(%[s2])           \n\t"                   \
+        "sb     %[p0_f0],   -1(%[s2])           \n\t"                   \
+        "sb     %[p1_f0],   -2(%[s2])           \n\t"                   \
+                                                                        \
+        :                                                               \
+        : [q1_f0] "r" (q1_f0), [q0_f0] "r" (q0_f0),                     \
+          [p0_f0] "r" (p0_f0), [p1_f0] "r" (p1_f0),                     \
+          [s2] "r" (s2)                                                 \
+    );                                                                  \
+                                                                        \
+    __asm__ __volatile__ (                                              \
+        "srl    %[q1_f0],   %[q1_f0],   8       \n\t"                   \
+        "srl    %[q0_f0],   %[q0_f0],   8       \n\t"                   \
+        "srl    %[p0_f0],   %[p0_f0],   8       \n\t"                   \
+        "srl    %[p1_f0],   %[p1_f0],   8       \n\t"                   \
+                                                                        \
+        : [q1_f0] "+r" (q1_f0), [q0_f0] "+r" (q0_f0),                   \
+          [p0_f0] "+r" (p0_f0), [p1_f0] "+r" (p1_f0)                    \
+        :                                                               \
+    );                                                                  \
+                                                                        \
+    __asm__ __volatile__ (                                              \
+        "sb     %[q1_f0],    1(%[s1])           \n\t"                   \
+        "sb     %[q0_f0],    0(%[s1])           \n\t"                   \
+        "sb     %[p0_f0],   -1(%[s1])           \n\t"                   \
+        "sb     %[p1_f0],   -2(%[s1])           \n\t"                   \
+                                                                        \
+        :                                                               \
+        : [q1_f0] "r" (q1_f0), [q0_f0] "r" (q0_f0),                     \
+          [p0_f0] "r" (p0_f0), [p1_f0] "r" (p1_f0),                     \
+          [s1] "r" (s1)                                                 \
+    );                                                                  \
+}
+
+#define STORE_F1() {                                                    \
+    __asm__ __volatile__ (                                              \
+        "sb     %[q2_r],     2(%[s4])           \n\t"                   \
+        "sb     %[q1_r],     1(%[s4])           \n\t"                   \
+        "sb     %[q0_r],     0(%[s4])           \n\t"                   \
+        "sb     %[p0_r],    -1(%[s4])           \n\t"                   \
+        "sb     %[p1_r],    -2(%[s4])           \n\t"                   \
+        "sb     %[p2_r],    -3(%[s4])           \n\t"                   \
+                                                                        \
+        :                                                               \
+        : [q2_r] "r" (q2_r), [q1_r] "r" (q1_r), [q0_r] "r" (q0_r),      \
+          [p0_r] "r" (p0_r), [p1_r] "r" (p1_r), [p2_r] "r" (p2_r),      \
+          [s4] "r" (s4)                                                 \
+    );                                                                  \
+                                                                        \
+    __asm__ __volatile__ (                                              \
+        "srl    %[q2_r],    %[q2_r],    16      \n\t"                   \
+        "srl    %[q1_r],    %[q1_r],    16      \n\t"                   \
+        "srl    %[q0_r],    %[q0_r],    16      \n\t"                   \
+        "srl    %[p0_r],    %[p0_r],    16      \n\t"                   \
+        "srl    %[p1_r],    %[p1_r],    16      \n\t"                   \
+        "srl    %[p2_r],    %[p2_r],    16      \n\t"                   \
+                                                                        \
+        : [q2_r] "+r" (q2_r), [q1_r] "+r" (q1_r), [q0_r] "+r" (q0_r),   \
+          [p0_r] "+r" (p0_r), [p1_r] "+r" (p1_r), [p2_r] "+r" (p2_r)    \
+        :                                                               \
+    );                                                                  \
+                                                                        \
+    __asm__ __volatile__ (                                              \
+        "sb     %[q2_r],     2(%[s3])           \n\t"                   \
+        "sb     %[q1_r],     1(%[s3])           \n\t"                   \
+        "sb     %[q0_r],     0(%[s3])           \n\t"                   \
+        "sb     %[p0_r],    -1(%[s3])           \n\t"                   \
+        "sb     %[p1_r],    -2(%[s3])           \n\t"                   \
+        "sb     %[p2_r],    -3(%[s3])           \n\t"                   \
+                                                                        \
+        :                                                               \
+        : [q2_r] "r" (q2_r), [q1_r] "r" (q1_r), [q0_r] "r" (q0_r),      \
+          [p0_r] "r" (p0_r), [p1_r] "r" (p1_r), [p2_r] "r" (p2_r),      \
+          [s3] "r" (s3)                                                 \
+    );                                                                  \
+                                                                        \
+    __asm__ __volatile__ (                                              \
+        "sb     %[q2_l],     2(%[s2])           \n\t"                   \
+        "sb     %[q1_l],     1(%[s2])           \n\t"                   \
+        "sb     %[q0_l],     0(%[s2])           \n\t"                   \
+        "sb     %[p0_l],    -1(%[s2])           \n\t"                   \
+        "sb     %[p1_l],    -2(%[s2])           \n\t"                   \
+        "sb     %[p2_l],    -3(%[s2])           \n\t"                   \
+                                                                        \
+        :                                                               \
+        : [q2_l] "r" (q2_l), [q1_l] "r" (q1_l), [q0_l] "r" (q0_l),      \
+          [p0_l] "r" (p0_l), [p1_l] "r" (p1_l), [p2_l] "r" (p2_l),      \
+          [s2] "r" (s2)                                                 \
+    );                                                                  \
+                                                                        \
+    __asm__ __volatile__ (                                              \
+        "srl    %[q2_l],    %[q2_l],    16      \n\t"                   \
+        "srl    %[q1_l],    %[q1_l],    16      \n\t"                   \
+        "srl    %[q0_l],    %[q0_l],    16      \n\t"                   \
+        "srl    %[p0_l],    %[p0_l],    16      \n\t"                   \
+        "srl    %[p1_l],    %[p1_l],    16      \n\t"                   \
+        "srl    %[p2_l],    %[p2_l],    16      \n\t"                   \
+                                                                        \
+        : [q2_l] "+r" (q2_l), [q1_l] "+r" (q1_l), [q0_l] "+r" (q0_l),   \
+          [p0_l] "+r" (p0_l), [p1_l] "+r" (p1_l), [p2_l] "+r" (p2_l)    \
+        :                                                               \
+    );                                                                  \
+                                                                        \
+    __asm__ __volatile__ (                                              \
+        "sb     %[q2_l],     2(%[s1])           \n\t"                   \
+        "sb     %[q1_l],     1(%[s1])           \n\t"                   \
+        "sb     %[q0_l],     0(%[s1])           \n\t"                   \
+        "sb     %[p0_l],    -1(%[s1])           \n\t"                   \
+        "sb     %[p1_l],    -2(%[s1])           \n\t"                   \
+        "sb     %[p2_l],    -3(%[s1])           \n\t"                   \
+                                                                        \
+        :                                                               \
+        : [q2_l] "r" (q2_l), [q1_l] "r" (q1_l), [q0_l] "r" (q0_l),      \
+          [p0_l] "r" (p0_l), [p1_l] "r" (p1_l), [p2_l] "r" (p2_l),      \
+          [s1] "r" (s1)                                                 \
+    );                                                                  \
+}
+
+#define STORE_F2() {                                                    \
+    __asm__ __volatile__ (                                              \
+        "sb     %[q6_r],     6(%[s4])           \n\t"                   \
+        "sb     %[q5_r],     5(%[s4])           \n\t"                   \
+        "sb     %[q4_r],     4(%[s4])           \n\t"                   \
+        "sb     %[q3_r],     3(%[s4])           \n\t"                   \
+        "sb     %[q2_r],     2(%[s4])           \n\t"                   \
+        "sb     %[q1_r],     1(%[s4])           \n\t"                   \
+        "sb     %[q0_r],     0(%[s4])           \n\t"                   \
+        "sb     %[p0_r],    -1(%[s4])           \n\t"                   \
+        "sb     %[p1_r],    -2(%[s4])           \n\t"                   \
+        "sb     %[p2_r],    -3(%[s4])           \n\t"                   \
+        "sb     %[p3_r],    -4(%[s4])           \n\t"                   \
+        "sb     %[p4_r],    -5(%[s4])           \n\t"                   \
+        "sb     %[p5_r],    -6(%[s4])           \n\t"                   \
+        "sb     %[p6_r],    -7(%[s4])           \n\t"                   \
+                                                                        \
+        :                                                               \
+        : [q6_r] "r" (q6_r), [q5_r] "r" (q5_r), [q4_r] "r" (q4_r),      \
+          [q3_r] "r" (q3_r), [q2_r] "r" (q2_r), [q1_r] "r" (q1_r),      \
+          [q0_r] "r" (q0_r),                                            \
+          [p0_r] "r" (p0_r), [p1_r] "r" (p1_r), [p2_r] "r" (p2_r),      \
+          [p3_r] "r" (p3_r), [p4_r] "r" (p4_r), [p5_r] "r" (p5_r),      \
+          [p6_r] "r" (p6_r),                                            \
+          [s4] "r" (s4)                                                 \
+    );                                                                  \
+                                                                        \
+    __asm__ __volatile__ (                                              \
+        "srl    %[q6_r],    %[q6_r],    16      \n\t"                   \
+        "srl    %[q5_r],    %[q5_r],    16      \n\t"                   \
+        "srl    %[q4_r],    %[q4_r],    16      \n\t"                   \
+        "srl    %[q3_r],    %[q3_r],    16      \n\t"                   \
+        "srl    %[q2_r],    %[q2_r],    16      \n\t"                   \
+        "srl    %[q1_r],    %[q1_r],    16      \n\t"                   \
+        "srl    %[q0_r],    %[q0_r],    16      \n\t"                   \
+        "srl    %[p0_r],    %[p0_r],    16      \n\t"                   \
+        "srl    %[p1_r],    %[p1_r],    16      \n\t"                   \
+        "srl    %[p2_r],    %[p2_r],    16      \n\t"                   \
+        "srl    %[p3_r],    %[p3_r],    16      \n\t"                   \
+        "srl    %[p4_r],    %[p4_r],    16      \n\t"                   \
+        "srl    %[p5_r],    %[p5_r],    16      \n\t"                   \
+        "srl    %[p6_r],    %[p6_r],    16      \n\t"                   \
+                                                                        \
+        : [q6_r] "+r" (q6_r), [q5_r] "+r" (q5_r), [q4_r] "+r" (q4_r),   \
+          [q3_r] "+r" (q3_r), [q2_r] "+r" (q2_r), [q1_r] "+r" (q1_r),   \
+          [q0_r] "+r" (q0_r),                                           \
+          [p0_r] "+r" (p0_r), [p1_r] "+r" (p1_r), [p2_r] "+r" (p2_r),   \
+          [p3_r] "+r" (p3_r), [p4_r] "+r" (p4_r), [p5_r] "+r" (p5_r),   \
+          [p6_r] "+r" (p6_r)                                            \
+        :                                                               \
+    );                                                                  \
+                                                                        \
+    __asm__ __volatile__ (                                              \
+        "sb     %[q6_r],     6(%[s3])           \n\t"                   \
+        "sb     %[q5_r],     5(%[s3])           \n\t"                   \
+        "sb     %[q4_r],     4(%[s3])           \n\t"                   \
+        "sb     %[q3_r],     3(%[s3])           \n\t"                   \
+        "sb     %[q2_r],     2(%[s3])           \n\t"                   \
+        "sb     %[q1_r],     1(%[s3])           \n\t"                   \
+        "sb     %[q0_r],     0(%[s3])           \n\t"                   \
+        "sb     %[p0_r],    -1(%[s3])           \n\t"                   \
+        "sb     %[p1_r],    -2(%[s3])           \n\t"                   \
+        "sb     %[p2_r],    -3(%[s3])           \n\t"                   \
+        "sb     %[p3_r],    -4(%[s3])           \n\t"                   \
+        "sb     %[p4_r],    -5(%[s3])           \n\t"                   \
+        "sb     %[p5_r],    -6(%[s3])           \n\t"                   \
+        "sb     %[p6_r],    -7(%[s3])           \n\t"                   \
+                                                                        \
+        :                                                               \
+        : [q6_r] "r" (q6_r), [q5_r] "r" (q5_r), [q4_r] "r" (q4_r),      \
+          [q3_r] "r" (q3_r), [q2_r] "r" (q2_r), [q1_r] "r" (q1_r),      \
+          [q0_r] "r" (q0_r),                                            \
+          [p0_r] "r" (p0_r), [p1_r] "r" (p1_r), [p2_r] "r" (p2_r),      \
+          [p3_r] "r" (p3_r), [p4_r] "r" (p4_r), [p5_r] "r" (p5_r),      \
+          [p6_r] "r" (p6_r),                                            \
+          [s3] "r" (s3)                                                 \
+    );                                                                  \
+                                                                        \
+    __asm__ __volatile__ (                                              \
+        "sb     %[q6_l],     6(%[s2])           \n\t"                   \
+        "sb     %[q5_l],     5(%[s2])           \n\t"                   \
+        "sb     %[q4_l],     4(%[s2])           \n\t"                   \
+        "sb     %[q3_l],     3(%[s2])           \n\t"                   \
+        "sb     %[q2_l],     2(%[s2])           \n\t"                   \
+        "sb     %[q1_l],     1(%[s2])           \n\t"                   \
+        "sb     %[q0_l],     0(%[s2])           \n\t"                   \
+        "sb     %[p0_l],    -1(%[s2])           \n\t"                   \
+        "sb     %[p1_l],    -2(%[s2])           \n\t"                   \
+        "sb     %[p2_l],    -3(%[s2])           \n\t"                   \
+        "sb     %[p3_l],    -4(%[s2])           \n\t"                   \
+        "sb     %[p4_l],    -5(%[s2])           \n\t"                   \
+        "sb     %[p5_l],    -6(%[s2])           \n\t"                   \
+        "sb     %[p6_l],    -7(%[s2])           \n\t"                   \
+                                                                        \
+        :                                                               \
+        : [q6_l] "r" (q6_l), [q5_l] "r" (q5_l), [q4_l] "r" (q4_l),      \
+          [q3_l] "r" (q3_l), [q2_l] "r" (q2_l), [q1_l] "r" (q1_l),      \
+          [q0_l] "r" (q0_l),                                            \
+          [p0_l] "r" (p0_l), [p1_l] "r" (p1_l), [p2_l] "r" (p2_l),      \
+          [p3_l] "r" (p3_l), [p4_l] "r" (p4_l), [p5_l] "r" (p5_l),      \
+          [p6_l] "r" (p6_l),                                            \
+          [s2] "r" (s2)                                                 \
+    );                                                                  \
+                                                                        \
+    __asm__ __volatile__ (                                              \
+        "srl    %[q6_l],    %[q6_l],    16     \n\t"                    \
+        "srl    %[q5_l],    %[q5_l],    16     \n\t"                    \
+        "srl    %[q4_l],    %[q4_l],    16     \n\t"                    \
+        "srl    %[q3_l],    %[q3_l],    16     \n\t"                    \
+        "srl    %[q2_l],    %[q2_l],    16     \n\t"                    \
+        "srl    %[q1_l],    %[q1_l],    16     \n\t"                    \
+        "srl    %[q0_l],    %[q0_l],    16     \n\t"                    \
+        "srl    %[p0_l],    %[p0_l],    16     \n\t"                    \
+        "srl    %[p1_l],    %[p1_l],    16     \n\t"                    \
+        "srl    %[p2_l],    %[p2_l],    16     \n\t"                    \
+        "srl    %[p3_l],    %[p3_l],    16     \n\t"                    \
+        "srl    %[p4_l],    %[p4_l],    16     \n\t"                    \
+        "srl    %[p5_l],    %[p5_l],    16     \n\t"                    \
+        "srl    %[p6_l],    %[p6_l],    16     \n\t"                    \
+                                                                        \
+        : [q6_l] "+r" (q6_l), [q5_l] "+r" (q5_l), [q4_l] "+r" (q4_l),   \
+          [q3_l] "+r" (q3_l), [q2_l] "+r" (q2_l), [q1_l] "+r" (q1_l),   \
+          [q0_l] "+r" (q0_l),                                           \
+          [p0_l] "+r" (p0_l), [p1_l] "+r" (p1_l), [p2_l] "+r" (p2_l),   \
+          [p3_l] "+r" (p3_l), [p4_l] "+r" (p4_l), [p5_l] "+r" (p5_l),   \
+          [p6_l] "+r" (p6_l)                                            \
+        :                                                               \
+    );                                                                  \
+                                                                        \
+    __asm__ __volatile__ (                                              \
+        "sb     %[q6_l],     6(%[s1])           \n\t"                   \
+        "sb     %[q5_l],     5(%[s1])           \n\t"                   \
+        "sb     %[q4_l],     4(%[s1])           \n\t"                   \
+        "sb     %[q3_l],     3(%[s1])           \n\t"                   \
+        "sb     %[q2_l],     2(%[s1])           \n\t"                   \
+        "sb     %[q1_l],     1(%[s1])           \n\t"                   \
+        "sb     %[q0_l],     0(%[s1])           \n\t"                   \
+        "sb     %[p0_l],    -1(%[s1])           \n\t"                   \
+        "sb     %[p1_l],    -2(%[s1])           \n\t"                   \
+        "sb     %[p2_l],    -3(%[s1])           \n\t"                   \
+        "sb     %[p3_l],    -4(%[s1])           \n\t"                   \
+        "sb     %[p4_l],    -5(%[s1])           \n\t"                   \
+        "sb     %[p5_l],    -6(%[s1])           \n\t"                   \
+        "sb     %[p6_l],    -7(%[s1])           \n\t"                   \
+                                                                        \
+        :                                                               \
+        : [q6_l] "r" (q6_l), [q5_l] "r" (q5_l), [q4_l] "r" (q4_l),      \
+          [q3_l] "r" (q3_l), [q2_l] "r" (q2_l), [q1_l] "r" (q1_l),      \
+          [q0_l] "r" (q0_l),                                            \
+          [p0_l] "r" (p0_l), [p1_l] "r" (p1_l), [p2_l] "r" (p2_l),      \
+          [p3_l] "r" (p3_l), [p4_l] "r" (p4_l), [p5_l] "r" (p5_l),      \
+          [p6_l] "r" (p6_l),                                            \
+          [s1] "r" (s1)                                                 \
+    );                                                                  \
+}
+
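+/* The PACK_* macros expand packed bytes into halfword pairs so the .ph
+   arithmetic above can operate on them: preceu.ph.qbl zero-extends the two
+   upper bytes of a word, preceu.ph.qbr the two lower bytes.  The COMBINE_*
+   macros reverse this with precr.qb.ph, merging the left/right halfword
+   pairs back into one packed word of four pixels. */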
+#define PACK_LEFT_0TO3() {                                              \
+    __asm__ __volatile__ (                                              \
+        "preceu.ph.qbl   %[p3_l],   %[p3]   \n\t"                       \
+        "preceu.ph.qbl   %[p2_l],   %[p2]   \n\t"                       \
+        "preceu.ph.qbl   %[p1_l],   %[p1]   \n\t"                       \
+        "preceu.ph.qbl   %[p0_l],   %[p0]   \n\t"                       \
+        "preceu.ph.qbl   %[q0_l],   %[q0]   \n\t"                       \
+        "preceu.ph.qbl   %[q1_l],   %[q1]   \n\t"                       \
+        "preceu.ph.qbl   %[q2_l],   %[q2]   \n\t"                       \
+        "preceu.ph.qbl   %[q3_l],   %[q3]   \n\t"                       \
+                                                                        \
+        : [p3_l] "=&r" (p3_l), [p2_l] "=&r" (p2_l),                     \
+          [p1_l] "=&r" (p1_l), [p0_l] "=&r" (p0_l),                     \
+          [q0_l] "=&r" (q0_l), [q1_l] "=&r" (q1_l),                     \
+          [q2_l] "=&r" (q2_l), [q3_l] "=&r" (q3_l)                      \
+        : [p3] "r" (p3), [p2] "r" (p2), [p1] "r" (p1), [p0] "r" (p0),   \
+          [q0] "r" (q0), [q1] "r" (q1), [q2] "r" (q2), [q3] "r" (q3)    \
+    );                                                                  \
+}
+
+#define PACK_LEFT_4TO7() {                                              \
+    __asm__ __volatile__ (                                              \
+        "preceu.ph.qbl   %[p7_l],   %[p7]   \n\t"                       \
+        "preceu.ph.qbl   %[p6_l],   %[p6]   \n\t"                       \
+        "preceu.ph.qbl   %[p5_l],   %[p5]   \n\t"                       \
+        "preceu.ph.qbl   %[p4_l],   %[p4]   \n\t"                       \
+        "preceu.ph.qbl   %[q4_l],   %[q4]   \n\t"                       \
+        "preceu.ph.qbl   %[q5_l],   %[q5]   \n\t"                       \
+        "preceu.ph.qbl   %[q6_l],   %[q6]   \n\t"                       \
+        "preceu.ph.qbl   %[q7_l],   %[q7]   \n\t"                       \
+                                                                        \
+        : [p7_l] "=&r" (p7_l), [p6_l] "=&r" (p6_l),                     \
+          [p5_l] "=&r" (p5_l), [p4_l] "=&r" (p4_l),                     \
+          [q4_l] "=&r" (q4_l), [q5_l] "=&r" (q5_l),                     \
+          [q6_l] "=&r" (q6_l), [q7_l] "=&r" (q7_l)                      \
+        : [p7] "r" (p7), [p6] "r" (p6), [p5] "r" (p5), [p4] "r" (p4),   \
+          [q4] "r" (q4), [q5] "r" (q5), [q6] "r" (q6), [q7] "r" (q7)    \
+    );                                                                  \
+}
+
+#define PACK_RIGHT_0TO3() {                                             \
+    __asm__ __volatile__ (                                              \
+        "preceu.ph.qbr   %[p3_r],   %[p3]  \n\t"                        \
+        "preceu.ph.qbr   %[p2_r],   %[p2]   \n\t"                       \
+        "preceu.ph.qbr   %[p1_r],   %[p1]   \n\t"                       \
+        "preceu.ph.qbr   %[p0_r],   %[p0]   \n\t"                       \
+        "preceu.ph.qbr   %[q0_r],   %[q0]   \n\t"                       \
+        "preceu.ph.qbr   %[q1_r],   %[q1]   \n\t"                       \
+        "preceu.ph.qbr   %[q2_r],   %[q2]   \n\t"                       \
+        "preceu.ph.qbr   %[q3_r],   %[q3]   \n\t"                       \
+                                                                        \
+        : [p3_r] "=&r" (p3_r), [p2_r] "=&r" (p2_r),                     \
+          [p1_r] "=&r" (p1_r), [p0_r] "=&r" (p0_r),                     \
+          [q0_r] "=&r" (q0_r), [q1_r] "=&r" (q1_r),                     \
+          [q2_r] "=&r" (q2_r), [q3_r] "=&r" (q3_r)                      \
+        : [p3] "r" (p3), [p2] "r" (p2), [p1] "r" (p1), [p0] "r" (p0),   \
+          [q0] "r" (q0), [q1] "r" (q1), [q2] "r" (q2), [q3] "r" (q3)    \
+    );                                                                  \
+}
+
+#define PACK_RIGHT_4TO7() {                                             \
+    __asm__ __volatile__ (                                              \
+        "preceu.ph.qbr   %[p7_r],   %[p7]   \n\t"                       \
+        "preceu.ph.qbr   %[p6_r],   %[p6]   \n\t"                       \
+        "preceu.ph.qbr   %[p5_r],   %[p5]   \n\t"                       \
+        "preceu.ph.qbr   %[p4_r],   %[p4]   \n\t"                       \
+        "preceu.ph.qbr   %[q4_r],   %[q4]   \n\t"                       \
+        "preceu.ph.qbr   %[q5_r],   %[q5]   \n\t"                       \
+        "preceu.ph.qbr   %[q6_r],   %[q6]   \n\t"                       \
+        "preceu.ph.qbr   %[q7_r],   %[q7]   \n\t"                       \
+                                                                        \
+        : [p7_r] "=&r" (p7_r), [p6_r] "=&r" (p6_r),                     \
+          [p5_r] "=&r" (p5_r), [p4_r] "=&r" (p4_r),                     \
+          [q4_r] "=&r" (q4_r), [q5_r] "=&r" (q5_r),                     \
+          [q6_r] "=&r" (q6_r), [q7_r] "=&r" (q7_r)                      \
+        : [p7] "r" (p7), [p6] "r" (p6), [p5] "r" (p5), [p4] "r" (p4),   \
+          [q4] "r" (q4), [q5] "r" (q5), [q6] "r" (q6), [q7] "r" (q7)    \
+    );                                                                  \
+}
+
+#define COMBINE_LEFT_RIGHT_0TO2() {                                     \
+    __asm__ __volatile__ (                                              \
+        "precr.qb.ph    %[p2],  %[p2_l],    %[p2_r]    \n\t"            \
+        "precr.qb.ph    %[p1],  %[p1_l],    %[p1_r]    \n\t"            \
+        "precr.qb.ph    %[p0],  %[p0_l],    %[p0_r]    \n\t"            \
+        "precr.qb.ph    %[q0],  %[q0_l],    %[q0_r]    \n\t"            \
+        "precr.qb.ph    %[q1],  %[q1_l],    %[q1_r]    \n\t"            \
+        "precr.qb.ph    %[q2],  %[q2_l],    %[q2_r]    \n\t"            \
+                                                                        \
+        : [p2] "=&r" (p2), [p1] "=&r" (p1), [p0] "=&r" (p0),            \
+          [q0] "=&r" (q0), [q1] "=&r" (q1), [q2] "=&r" (q2)             \
+        : [p2_l] "r" (p2_l), [p2_r] "r" (p2_r),                         \
+          [p1_l] "r" (p1_l), [p1_r] "r" (p1_r),                         \
+          [p0_l] "r" (p0_l), [p0_r] "r" (p0_r),                         \
+          [q0_l] "r" (q0_l), [q0_r] "r" (q0_r),                         \
+          [q1_l] "r" (q1_l), [q1_r] "r" (q1_r),                         \
+          [q2_l] "r" (q2_l), [q2_r] "r" (q2_r)                          \
+    );                                                                  \
+}
+
+#define COMBINE_LEFT_RIGHT_3TO6() {                                     \
+    __asm__ __volatile__ (                                              \
+        "precr.qb.ph    %[p6],  %[p6_l],    %[p6_r]    \n\t"            \
+        "precr.qb.ph    %[p5],  %[p5_l],    %[p5_r]    \n\t"            \
+        "precr.qb.ph    %[p4],  %[p4_l],    %[p4_r]    \n\t"            \
+        "precr.qb.ph    %[p3],  %[p3_l],    %[p3_r]    \n\t"            \
+        "precr.qb.ph    %[q3],  %[q3_l],    %[q3_r]    \n\t"            \
+        "precr.qb.ph    %[q4],  %[q4_l],    %[q4_r]    \n\t"            \
+        "precr.qb.ph    %[q5],  %[q5_l],    %[q5_r]    \n\t"            \
+        "precr.qb.ph    %[q6],  %[q6_l],    %[q6_r]    \n\t"            \
+                                                                        \
+        : [p6] "=&r" (p6),[p5] "=&r" (p5),                              \
+          [p4] "=&r" (p4),[p3] "=&r" (p3),                              \
+          [q3] "=&r" (q3),[q4] "=&r" (q4),                              \
+          [q5] "=&r" (q5),[q6] "=&r" (q6)                               \
+        : [p6_l] "r" (p6_l), [p5_l] "r" (p5_l),                         \
+          [p4_l] "r" (p4_l), [p3_l] "r" (p3_l),                         \
+          [p6_r] "r" (p6_r), [p5_r] "r" (p5_r),                         \
+          [p4_r] "r" (p4_r), [p3_r] "r" (p3_r),                         \
+          [q3_l] "r" (q3_l), [q4_l] "r" (q4_l),                         \
+          [q5_l] "r" (q5_l), [q6_l] "r" (q6_l),                         \
+          [q3_r] "r" (q3_r), [q4_r] "r" (q4_r),                         \
+          [q5_r] "r" (q5_r), [q6_r] "r" (q6_r)                          \
+    );                                                                  \
+}
+
+#endif  // #if HAVE_DSPR2
+#endif  // VP9_COMMON_MIPS_DSPR2_VP9_LOOPFILTER_MACROS_DSPR2_H_
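For reference (not part of the patch): the PACK_* macros above zero-extend the four packed bytes of a 32-bit register into two half-word lanes (preceu.ph.qbl takes the upper two bytes, preceu.ph.qbr the lower two), and the COMBINE_* macros pack the low byte of each half-word lane back into a single four-byte word with precr.qb.ph. A rough scalar C sketch of that round trip, with helper names of our own choosing, is:

#include <stdint.h>

/* Rough equivalent of preceu.ph.qbl: upper two bytes -> two 16-bit lanes. */
static uint32_t unpack_left(uint32_t v) {
  return ((v >> 16) & 0xFF) | (((v >> 24) & 0xFF) << 16);
}

/* Rough equivalent of preceu.ph.qbr: lower two bytes -> two 16-bit lanes. */
static uint32_t unpack_right(uint32_t v) {
  return (v & 0xFF) | (((v >> 8) & 0xFF) << 16);
}

/* Rough equivalent of precr.qb.ph left, right: repack the low byte of each
 * 16-bit lane into one four-byte word. */
static uint32_t combine(uint32_t left, uint32_t right) {
  return (right & 0xFF) | (((right >> 16) & 0xFF) << 8) |
         ((left & 0xFF) << 16) | (((left >> 16) & 0xFF) << 24);
}

With these, combine(unpack_left(x), unpack_right(x)) == x, which is the invariant the filters rely on when they widen pixels to 16-bit lanes, filter them, and narrow the results back into packed bytes.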
diff --git a/vp9/common/mips/dspr2/vp9_loopfilter_masks_dspr2.h b/vp9/common/mips/dspr2/vp9_loopfilter_masks_dspr2.h
new file mode 100644
index 0000000..b9e0aca
--- /dev/null
+++ b/vp9/common/mips/dspr2/vp9_loopfilter_masks_dspr2.h
@@ -0,0 +1,365 @@
+/*
+ *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP9_COMMON_MIPS_DSPR2_VP9_LOOPFILTER_MASKS_DSPR2_H_
+#define VP9_COMMON_MIPS_DSPR2_VP9_LOOPFILTER_MASKS_DSPR2_H_
+
+#include <stdlib.h>
+
+#include "./vp9_rtcd.h"
+#include "vp9/common/vp9_common.h"
+#include "vp9/common/vp9_onyxc_int.h"
+
+#if HAVE_DSPR2
+/* processing 4 pixels at the same time
+ * compute hev and mask in the same function */
+static INLINE void vp9_filter_hev_mask_dspr2(uint32_t limit, uint32_t flimit,
+                                             uint32_t p1, uint32_t p0,
+                                             uint32_t p3, uint32_t p2,
+                                             uint32_t q0, uint32_t q1,
+                                             uint32_t q2, uint32_t q3,
+                                             uint32_t thresh, uint32_t *hev,
+                                             uint32_t *mask) {
+  uint32_t  c, r, r3, r_k;
+  uint32_t  s1, s2, s3;
+  uint32_t  ones = 0xFFFFFFFF;
+  uint32_t  hev1;
+
+  __asm__ __volatile__ (
+      /* mask |= (abs(p3 - p2) > limit) */
+      "subu_s.qb      %[c],   %[p3],     %[p2]        \n\t"
+      "subu_s.qb      %[r_k], %[p2],     %[p3]        \n\t"
+      "or             %[r_k], %[r_k],    %[c]         \n\t"
+      "cmpgu.lt.qb    %[c],   %[limit],  %[r_k]       \n\t"
+      "or             %[r],   $0,        %[c]         \n\t"
+
+      /* mask |= (abs(p2 - p1) > limit) */
+      "subu_s.qb      %[c],   %[p2],     %[p1]        \n\t"
+      "subu_s.qb      %[r_k], %[p1],     %[p2]        \n\t"
+      "or             %[r_k], %[r_k],    %[c]         \n\t"
+      "cmpgu.lt.qb    %[c],   %[limit],  %[r_k]       \n\t"
+      "or             %[r],   %[r],      %[c]         \n\t"
+
+      /* mask |= (abs(p1 - p0) > limit)
+       * hev  |= (abs(p1 - p0) > thresh)
+       */
+      "subu_s.qb      %[c],   %[p1],     %[p0]        \n\t"
+      "subu_s.qb      %[r_k], %[p0],     %[p1]        \n\t"
+      "or             %[r_k], %[r_k],    %[c]         \n\t"
+      "cmpgu.lt.qb    %[c],   %[thresh], %[r_k]       \n\t"
+      "or             %[r3],  $0,        %[c]         \n\t"
+      "cmpgu.lt.qb    %[c],   %[limit],  %[r_k]       \n\t"
+      "or             %[r],   %[r],      %[c]         \n\t"
+
+      /* mask |= (abs(q1 - q0) > limit)
+       * hev  |= (abs(q1 - q0) > thresh)
+       */
+      "subu_s.qb      %[c],   %[q1],     %[q0]        \n\t"
+      "subu_s.qb      %[r_k], %[q0],     %[q1]        \n\t"
+      "or             %[r_k], %[r_k],    %[c]         \n\t"
+      "cmpgu.lt.qb    %[c],   %[thresh], %[r_k]       \n\t"
+      "or             %[r3],  %[r3],     %[c]         \n\t"
+      "cmpgu.lt.qb    %[c],   %[limit],  %[r_k]       \n\t"
+      "or             %[r],   %[r],      %[c]         \n\t"
+
+      /* mask |= (abs(q2 - q1) > limit) */
+      "subu_s.qb      %[c],   %[q2],     %[q1]        \n\t"
+      "subu_s.qb      %[r_k], %[q1],     %[q2]        \n\t"
+      "or             %[r_k], %[r_k],    %[c]         \n\t"
+      "cmpgu.lt.qb    %[c],   %[limit],  %[r_k]       \n\t"
+      "or             %[r],   %[r],      %[c]         \n\t"
+      "sll            %[r3],    %[r3],    24          \n\t"
+
+      /* mask |= (abs(q3 - q2) > limit) */
+      "subu_s.qb      %[c],   %[q3],     %[q2]        \n\t"
+      "subu_s.qb      %[r_k], %[q2],     %[q3]        \n\t"
+      "or             %[r_k], %[r_k],    %[c]         \n\t"
+      "cmpgu.lt.qb    %[c],   %[limit],  %[r_k]       \n\t"
+      "or             %[r],   %[r],      %[c]         \n\t"
+
+      : [c] "=&r" (c), [r_k] "=&r" (r_k),
+        [r] "=&r" (r), [r3] "=&r" (r3)
+      : [limit] "r" (limit), [p3] "r" (p3), [p2] "r" (p2),
+        [p1] "r" (p1), [p0] "r" (p0), [q1] "r" (q1), [q0] "r" (q0),
+        [q2] "r" (q2), [q3] "r" (q3), [thresh] "r" (thresh)
+  );
+
+  __asm__ __volatile__ (
+      /* abs(p0 - q0) */
+      "subu_s.qb      %[c],   %[p0],     %[q0]        \n\t"
+      "subu_s.qb      %[r_k], %[q0],     %[p0]        \n\t"
+      "wrdsp          %[r3]                           \n\t"
+      "or             %[s1],  %[r_k],    %[c]         \n\t"
+
+      /* abs(p1 - q1) */
+      "subu_s.qb      %[c],    %[p1],    %[q1]        \n\t"
+      "addu_s.qb      %[s3],   %[s1],    %[s1]        \n\t"
+      "pick.qb        %[hev1], %[ones],  $0           \n\t"
+      "subu_s.qb      %[r_k],  %[q1],    %[p1]        \n\t"
+      "or             %[s2],   %[r_k],   %[c]         \n\t"
+
+      /* abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > flimit * 2 + limit */
+      "shrl.qb        %[s2],   %[s2],     1           \n\t"
+      "addu_s.qb      %[s1],   %[s2],     %[s3]       \n\t"
+      "cmpgu.lt.qb    %[c],    %[flimit], %[s1]       \n\t"
+      "or             %[r],    %[r],      %[c]        \n\t"
+      "sll            %[r],    %[r],      24          \n\t"
+
+      "wrdsp          %[r]                            \n\t"
+      "pick.qb        %[s2],  $0,         %[ones]     \n\t"
+
+      : [c] "=&r" (c), [r_k] "=&r" (r_k), [s1] "=&r" (s1), [hev1] "=&r" (hev1),
+        [s2] "=&r" (s2), [r] "+r" (r), [s3] "=&r" (s3)
+      : [p0] "r" (p0), [q0] "r" (q0), [p1] "r" (p1), [r3] "r" (r3),
+        [q1] "r" (q1), [ones] "r" (ones), [flimit] "r" (flimit)
+  );
+
+  *hev = hev1;
+  *mask = s2;
+}
+
+static INLINE void vp9_filter_hev_mask_flatmask4_dspr2(uint32_t limit,
+                                                       uint32_t flimit,
+                                                       uint32_t thresh,
+                                                       uint32_t p1, uint32_t p0,
+                                                       uint32_t p3, uint32_t p2,
+                                                       uint32_t q0, uint32_t q1,
+                                                       uint32_t q2, uint32_t q3,
+                                                       uint32_t *hev,
+                                                       uint32_t *mask,
+                                                       uint32_t *flat) {
+  uint32_t  c, r, r3, r_k, r_flat;
+  uint32_t  s1, s2, s3;
+  uint32_t  ones = 0xFFFFFFFF;
+  uint32_t  flat_thresh = 0x01010101;
+  uint32_t  hev1;
+  uint32_t  flat1;
+
+  __asm__ __volatile__ (
+      /* mask |= (abs(p3 - p2) > limit) */
+      "subu_s.qb      %[c],       %[p3],          %[p2]        \n\t"
+      "subu_s.qb      %[r_k],     %[p2],          %[p3]        \n\t"
+      "or             %[r_k],     %[r_k],         %[c]         \n\t"
+      "cmpgu.lt.qb    %[c],       %[limit],       %[r_k]       \n\t"
+      "or             %[r],       $0,             %[c]         \n\t"
+
+      /* mask |= (abs(p2 - p1) > limit) */
+      "subu_s.qb      %[c],       %[p2],          %[p1]        \n\t"
+      "subu_s.qb      %[r_k],     %[p1],          %[p2]        \n\t"
+      "or             %[r_k],     %[r_k],         %[c]         \n\t"
+      "cmpgu.lt.qb    %[c],       %[limit],       %[r_k]       \n\t"
+      "or             %[r],       %[r],           %[c]         \n\t"
+
+      /* mask |= (abs(p1 - p0) > limit)
+       * hev  |= (abs(p1 - p0) > thresh)
+       * flat |= (abs(p1 - p0) > thresh)
+       */
+      "subu_s.qb      %[c],       %[p1],          %[p0]        \n\t"
+      "subu_s.qb      %[r_k],     %[p0],          %[p1]        \n\t"
+      "or             %[r_k],     %[r_k],         %[c]         \n\t"
+      "cmpgu.lt.qb    %[c],       %[thresh],      %[r_k]       \n\t"
+      "or             %[r3],      $0,             %[c]         \n\t"
+      "cmpgu.lt.qb    %[c],       %[limit],       %[r_k]       \n\t"
+      "or             %[r],       %[r],           %[c]         \n\t"
+      "cmpgu.lt.qb    %[c],       %[flat_thresh], %[r_k]       \n\t"
+      "or             %[r_flat],  $0,             %[c]         \n\t"
+
+      /* mask |= (abs(q1 - q0) > limit)
+       * hev  |= (abs(q1 - q0) > thresh)
+       * flat |= (abs(q1 - q0) > thresh)
+       */
+      "subu_s.qb      %[c],       %[q1],          %[q0]        \n\t"
+      "subu_s.qb      %[r_k],     %[q0],          %[q1]        \n\t"
+      "or             %[r_k],     %[r_k],         %[c]         \n\t"
+      "cmpgu.lt.qb    %[c],       %[thresh],      %[r_k]       \n\t"
+      "or             %[r3],      %[r3],          %[c]         \n\t"
+      "cmpgu.lt.qb    %[c],       %[limit],       %[r_k]       \n\t"
+      "or             %[r],       %[r],           %[c]         \n\t"
+      "cmpgu.lt.qb    %[c],       %[flat_thresh], %[r_k]       \n\t"
+      "or             %[r_flat],  %[r_flat],      %[c]         \n\t"
+
+      /* flat |= (abs(p0 - p2) > thresh) */
+      "subu_s.qb      %[c],       %[p0],          %[p2]        \n\t"
+      "subu_s.qb      %[r_k],     %[p2],          %[p0]        \n\t"
+      "or             %[r_k],     %[r_k],         %[c]         \n\t"
+      "cmpgu.lt.qb    %[c],       %[flat_thresh], %[r_k]       \n\t"
+      "or             %[r_flat],  %[r_flat],      %[c]         \n\t"
+
+      /* flat |= (abs(q0 - q2) > thresh) */
+      "subu_s.qb      %[c],       %[q0],          %[q2]        \n\t"
+      "subu_s.qb      %[r_k],     %[q2],          %[q0]        \n\t"
+      "or             %[r_k],     %[r_k],         %[c]         \n\t"
+      "cmpgu.lt.qb    %[c],       %[flat_thresh], %[r_k]       \n\t"
+      "or             %[r_flat],  %[r_flat],      %[c]         \n\t"
+
+      /* flat |= (abs(p3 - p0) > thresh) */
+      "subu_s.qb      %[c],       %[p3],          %[p0]        \n\t"
+      "subu_s.qb      %[r_k],     %[p0],          %[p3]        \n\t"
+      "or             %[r_k],     %[r_k],         %[c]         \n\t"
+      "cmpgu.lt.qb    %[c],       %[flat_thresh], %[r_k]       \n\t"
+      "or             %[r_flat],  %[r_flat],      %[c]         \n\t"
+
+      /* flat |= (abs(q3 - q0) > thresh) */
+      "subu_s.qb      %[c],       %[q3],          %[q0]        \n\t"
+      "subu_s.qb      %[r_k],     %[q0],          %[q3]        \n\t"
+      "or             %[r_k],     %[r_k],         %[c]         \n\t"
+      "cmpgu.lt.qb    %[c],       %[flat_thresh], %[r_k]       \n\t"
+      "or             %[r_flat],  %[r_flat],      %[c]         \n\t"
+      "sll            %[r_flat],  %[r_flat],      24           \n\t"
+      /* look at stall here */
+      "wrdsp          %[r_flat]                                \n\t"
+      "pick.qb        %[flat1],   $0,             %[ones]      \n\t"
+
+      /* mask |= (abs(q2 - q1) > limit) */
+      "subu_s.qb      %[c],       %[q2],          %[q1]        \n\t"
+      "subu_s.qb      %[r_k],     %[q1],          %[q2]        \n\t"
+      "or             %[r_k],     %[r_k],         %[c]         \n\t"
+      "cmpgu.lt.qb    %[c],       %[limit],       %[r_k]       \n\t"
+      "or             %[r],       %[r],           %[c]         \n\t"
+      "sll            %[r3],      %[r3],          24           \n\t"
+
+      /* mask |= (abs(q3 - q2) > limit) */
+      "subu_s.qb      %[c],       %[q3],          %[q2]        \n\t"
+      "subu_s.qb      %[r_k],     %[q2],          %[q3]        \n\t"
+      "or             %[r_k],     %[r_k],         %[c]         \n\t"
+      "cmpgu.lt.qb    %[c],       %[limit],       %[r_k]       \n\t"
+      "or             %[r],       %[r],           %[c]         \n\t"
+
+      : [c] "=&r" (c), [r_k] "=&r" (r_k), [r] "=&r" (r), [r3] "=&r" (r3),
+        [r_flat] "=&r" (r_flat), [flat1] "=&r" (flat1)
+      : [limit] "r" (limit), [p3] "r" (p3), [p2] "r" (p2),
+        [p1] "r" (p1), [p0] "r" (p0), [q1] "r" (q1), [q0] "r" (q0),
+        [q2] "r" (q2), [q3] "r" (q3), [thresh] "r" (thresh),
+        [flat_thresh] "r" (flat_thresh), [ones] "r" (ones)
+  );
+
+  __asm__ __volatile__ (
+      /* abs(p0 - q0) */
+      "subu_s.qb      %[c],   %[p0],     %[q0]        \n\t"
+      "subu_s.qb      %[r_k], %[q0],     %[p0]        \n\t"
+      "wrdsp          %[r3]                           \n\t"
+      "or             %[s1],  %[r_k],    %[c]         \n\t"
+
+      /* abs(p1 - q1) */
+      "subu_s.qb      %[c],    %[p1],    %[q1]        \n\t"
+      "addu_s.qb      %[s3],   %[s1],    %[s1]        \n\t"
+      "pick.qb        %[hev1], %[ones],  $0           \n\t"
+      "subu_s.qb      %[r_k],  %[q1],    %[p1]        \n\t"
+      "or             %[s2],   %[r_k],   %[c]         \n\t"
+
+      /* abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > flimit * 2 + limit */
+      "shrl.qb        %[s2],   %[s2],     1           \n\t"
+      "addu_s.qb      %[s1],   %[s2],     %[s3]       \n\t"
+      "cmpgu.lt.qb    %[c],    %[flimit], %[s1]       \n\t"
+      "or             %[r],    %[r],      %[c]        \n\t"
+      "sll            %[r],    %[r],      24          \n\t"
+
+      "wrdsp          %[r]                            \n\t"
+      "pick.qb        %[s2],   $0,        %[ones]     \n\t"
+
+      : [c] "=&r" (c), [r_k] "=&r" (r_k), [s1] "=&r" (s1), [hev1] "=&r" (hev1),
+        [s2] "=&r" (s2), [r] "+r" (r), [s3] "=&r" (s3)
+      : [p0] "r" (p0), [q0] "r" (q0), [p1] "r" (p1), [r3] "r" (r3),
+        [q1] "r" (q1), [ones] "r" (ones), [flimit] "r" (flimit)
+  );
+
+  *hev = hev1;
+  *mask = s2;
+  *flat = flat1;
+}
+
+static INLINE void vp9_flatmask5(uint32_t p4, uint32_t p3,
+                                 uint32_t p2, uint32_t p1,
+                                 uint32_t p0, uint32_t q0,
+                                 uint32_t q1, uint32_t q2,
+                                 uint32_t q3, uint32_t q4,
+                                 uint32_t *flat2) {
+  uint32_t  c, r, r_k, r_flat;
+  uint32_t  ones = 0xFFFFFFFF;
+  uint32_t  flat_thresh = 0x01010101;
+  uint32_t  flat1, flat3;
+
+  __asm__ __volatile__ (
+      /* flat |= (abs(p4 - p0) > thresh) */
+      "subu_s.qb      %[c],   %[p4],           %[p0]        \n\t"
+      "subu_s.qb      %[r_k], %[p0],           %[p4]        \n\t"
+      "or             %[r_k], %[r_k],          %[c]         \n\t"
+      "cmpgu.lt.qb    %[c],   %[flat_thresh],  %[r_k]       \n\t"
+      "or             %[r],   $0,              %[c]         \n\t"
+
+      /* flat |= (abs(q4 - q0) > thresh) */
+      "subu_s.qb      %[c],     %[q4],           %[q0]     \n\t"
+      "subu_s.qb      %[r_k],   %[q0],           %[q4]     \n\t"
+      "or             %[r_k],   %[r_k],          %[c]      \n\t"
+      "cmpgu.lt.qb    %[c],     %[flat_thresh],  %[r_k]    \n\t"
+      "or             %[r],     %[r],            %[c]      \n\t"
+      "sll            %[r],     %[r],            24        \n\t"
+      "wrdsp          %[r]                                 \n\t"
+      "pick.qb        %[flat3], $0,           %[ones]      \n\t"
+
+      /* flat |= (abs(p1 - p0) > thresh) */
+      "subu_s.qb      %[c],       %[p1],          %[p0]        \n\t"
+      "subu_s.qb      %[r_k],     %[p0],          %[p1]        \n\t"
+      "or             %[r_k],     %[r_k],         %[c]         \n\t"
+      "cmpgu.lt.qb    %[c],       %[flat_thresh], %[r_k]       \n\t"
+      "or             %[r_flat],  $0,             %[c]         \n\t"
+
+      /* flat |= (abs(q1 - q0) > thresh) */
+      "subu_s.qb      %[c],      %[q1],           %[q0]        \n\t"
+      "subu_s.qb      %[r_k],    %[q0],           %[q1]        \n\t"
+      "or             %[r_k],    %[r_k],          %[c]         \n\t"
+      "cmpgu.lt.qb    %[c],      %[flat_thresh],  %[r_k]       \n\t"
+      "or             %[r_flat], %[r_flat],       %[c]         \n\t"
+
+      /* flat |= (abs(p0 - p2) > thresh) */
+      "subu_s.qb      %[c],       %[p0],          %[p2]        \n\t"
+      "subu_s.qb      %[r_k],     %[p2],          %[p0]        \n\t"
+      "or             %[r_k],     %[r_k],         %[c]         \n\t"
+      "cmpgu.lt.qb    %[c],       %[flat_thresh], %[r_k]       \n\t"
+      "or             %[r_flat],  %[r_flat],      %[c]         \n\t"
+
+      /* flat |= (abs(q0 - q2) > thresh) */
+      "subu_s.qb      %[c],       %[q0],          %[q2]        \n\t"
+      "subu_s.qb      %[r_k],     %[q2],          %[q0]        \n\t"
+      "or             %[r_k],     %[r_k],         %[c]         \n\t"
+      "cmpgu.lt.qb    %[c],       %[flat_thresh], %[r_k]       \n\t"
+      "or             %[r_flat],  %[r_flat],      %[c]         \n\t"
+
+      /* flat |= (abs(p3 - p0) > thresh) */
+      "subu_s.qb      %[c],       %[p3],          %[p0]        \n\t"
+      "subu_s.qb      %[r_k],     %[p0],          %[p3]        \n\t"
+      "or             %[r_k],     %[r_k],         %[c]         \n\t"
+      "cmpgu.lt.qb    %[c],       %[flat_thresh], %[r_k]       \n\t"
+      "or             %[r_flat],  %[r_flat],      %[c]         \n\t"
+
+      /* flat |= (abs(q3 - q0) > thresh) */
+      "subu_s.qb      %[c],       %[q3],          %[q0]        \n\t"
+      "subu_s.qb      %[r_k],     %[q0],          %[q3]        \n\t"
+      "or             %[r_k],     %[r_k],         %[c]         \n\t"
+      "cmpgu.lt.qb    %[c],       %[flat_thresh], %[r_k]       \n\t"
+      "or             %[r_flat],  %[r_flat],      %[c]         \n\t"
+      "sll            %[r_flat],  %[r_flat],      24           \n\t"
+      "wrdsp          %[r_flat]                                \n\t"
+      "pick.qb        %[flat1],   $0,             %[ones]      \n\t"
+      /* flat & flatmask4(thresh, p3, p2, p1, p0, q0, q1, q2, q3) */
+      "and            %[flat1],  %[flat3],        %[flat1]     \n\t"
+
+      : [c] "=&r" (c), [r_k] "=&r" (r_k), [r] "=&r" (r), 
+        [r_flat] "=&r" (r_flat), [flat1] "=&r" (flat1), [flat3] "=&r" (flat3)
+      : [p4] "r" (p4), [p3] "r" (p3), [p2] "r" (p2),
+        [p1] "r" (p1), [p0] "r" (p0), [q0] "r" (q0), [q1] "r" (q1),
+        [q2] "r" (q2), [q3] "r" (q3), [q4] "r" (q4),
+        [flat_thresh] "r" (flat_thresh), [ones] "r" (ones)
+  );
+
+  *flat2 = flat1;
+}
+#endif  // #if HAVE_DSPR2
+#endif  // VP9_COMMON_MIPS_DSPR2_VP9_LOOPFILTER_MASKS_DSPR2_H_
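For reference (not part of the patch): the packed routines in this header evaluate, for four pixels at once, the per-pixel conditions spelled out in the inline comments; each output byte lane of *mask, *hev and *flat is 0xFF or 0x00. A hedged one-pixel C sketch of those conditions, returning the equivalent booleans (the helper name is ours, and flimit is the replicated *blimit value passed in by the callers):

#include <stdint.h>
#include <stdlib.h>

/* One pixel's worth of the conditions the packed code computes per byte lane. */
static void hev_mask_flat_1px(uint8_t limit, uint8_t flimit, uint8_t thresh,
                              uint8_t p3, uint8_t p2, uint8_t p1, uint8_t p0,
                              uint8_t q0, uint8_t q1, uint8_t q2, uint8_t q3,
                              int *hev, int *mask, int *flat) {
  int over = 0;
  over |= (abs(p3 - p2) > limit);
  over |= (abs(p2 - p1) > limit);
  over |= (abs(p1 - p0) > limit);
  over |= (abs(q1 - q0) > limit);
  over |= (abs(q2 - q1) > limit);
  over |= (abs(q3 - q2) > limit);
  over |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > flimit);
  *mask = !over;  /* filter this pixel only if no difference exceeds its limit */

  *hev = (abs(p1 - p0) > thresh) || (abs(q1 - q0) > thresh);

  /* flat uses a fixed threshold of 1 (flat_thresh = 0x01010101 above) */
  *flat = !((abs(p1 - p0) > 1) || (abs(q1 - q0) > 1) ||
            (abs(p0 - p2) > 1) || (abs(q0 - q2) > 1) ||
            (abs(p3 - p0) > 1) || (abs(q3 - q0) > 1));
}

vp9_flatmask5 applies the same "> 1" test to p4/q4 against p0/q0 and then ANDs the result with the p1..p3 / q1..q3 flatness test, matching the final "and %[flat1], %[flat3], %[flat1]" above.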
diff --git a/vp9/common/mips/dspr2/vp9_mbloop_loopfilter_dspr2.c b/vp9/common/mips/dspr2/vp9_mbloop_loopfilter_dspr2.c
new file mode 100644
index 0000000..adfd755
--- /dev/null
+++ b/vp9/common/mips/dspr2/vp9_mbloop_loopfilter_dspr2.c
@@ -0,0 +1,652 @@
+/*
+ *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stdlib.h>
+
+#include "./vp9_rtcd.h"
+#include "vp9/common/vp9_common.h"
+#include "vp9/common/vp9_loopfilter.h"
+#include "vp9/common/vp9_onyxc_int.h"
+#include "vp9/common/mips/dspr2/vp9_common_dspr2.h"
+#include "vp9/common/mips/dspr2/vp9_loopfilter_macros_dspr2.h"
+#include "vp9/common/mips/dspr2/vp9_loopfilter_masks_dspr2.h"
+#include "vp9/common/mips/dspr2/vp9_loopfilter_filters_dspr2.h"
+
+#if HAVE_DSPR2
+void vp9_mbloop_filter_horizontal_edge_dspr2(unsigned char *s,
+                                             int pitch,
+                                             const uint8_t *blimit,
+                                             const uint8_t *limit,
+                                             const uint8_t *thresh,
+                                             int count) {
+  uint32_t  mask;
+  uint32_t  hev, flat;
+  uint8_t   i;
+  uint8_t   *sp3, *sp2, *sp1, *sp0, *sq0, *sq1, *sq2, *sq3;
+  uint32_t  thresh_vec, flimit_vec, limit_vec;
+  uint32_t  uflimit, ulimit, uthresh;
+  uint32_t  p1_f0, p0_f0, q0_f0, q1_f0;
+  uint32_t  p3, p2, p1, p0, q0, q1, q2, q3;
+  uint32_t  p0_l, p1_l, p2_l, p3_l, q0_l, q1_l, q2_l, q3_l;
+  uint32_t  p0_r, p1_r, p2_r, p3_r, q0_r, q1_r, q2_r, q3_r;
+
+  uflimit = *blimit;
+  ulimit  = *limit;
+  uthresh = *thresh;
+
+  /* create quad-byte */
+  __asm__ __volatile__ (
+      "replv.qb       %[thresh_vec],    %[uthresh]    \n\t"
+      "replv.qb       %[flimit_vec],    %[uflimit]    \n\t"
+      "replv.qb       %[limit_vec],     %[ulimit]     \n\t"
+
+      : [thresh_vec] "=&r" (thresh_vec), [flimit_vec] "=&r" (flimit_vec),
+        [limit_vec] "=r" (limit_vec)
+      : [uthresh] "r" (uthresh), [uflimit] "r" (uflimit), [ulimit] "r" (ulimit)
+  );
+
+  /* prefetch data for store */
+  vp9_prefetch_store(s);
+
+  for (i = 0; i < 2; i++) {
+    sp3 = s - (pitch << 2);
+    sp2 = sp3 + pitch;
+    sp1 = sp2 + pitch;
+    sp0 = sp1 + pitch;
+    sq0 = s;
+    sq1 = s + pitch;
+    sq2 = sq1 + pitch;
+    sq3 = sq2 + pitch;
+
+    __asm__ __volatile__ (
+        "lw     %[p3],      (%[sp3])    \n\t"
+        "lw     %[p2],      (%[sp2])    \n\t"
+        "lw     %[p1],      (%[sp1])    \n\t"
+        "lw     %[p0],      (%[sp0])    \n\t"
+        "lw     %[q0],      (%[sq0])    \n\t"
+        "lw     %[q1],      (%[sq1])    \n\t"
+        "lw     %[q2],      (%[sq2])    \n\t"
+        "lw     %[q3],      (%[sq3])    \n\t"
+
+        : [p3] "=&r" (p3), [p2] "=&r" (p2), [p1] "=&r" (p1), [p0] "=&r" (p0),
+          [q3] "=&r" (q3), [q2] "=&r" (q2), [q1] "=&r" (q1), [q0] "=&r" (q0)
+        : [sp3] "r" (sp3), [sp2] "r" (sp2), [sp1] "r" (sp1), [sp0] "r" (sp0),
+          [sq3] "r" (sq3), [sq2] "r" (sq2), [sq1] "r" (sq1), [sq0] "r" (sq0)
+    );
+
+    vp9_filter_hev_mask_flatmask4_dspr2(limit_vec, flimit_vec, thresh_vec,
+                                        p1, p0, p3, p2, q0, q1, q2, q3,
+                                        &hev, &mask, &flat);
+
+    if ((flat == 0) && (mask != 0)) {
+      vp9_filter1_dspr2(mask, hev, p1, p0, q0, q1,
+                        &p1_f0, &p0_f0, &q0_f0, &q1_f0);
+
+      __asm__ __volatile__ (
+          "sw       %[p1_f0],   (%[sp1])    \n\t"
+          "sw       %[p0_f0],   (%[sp0])    \n\t"
+          "sw       %[q0_f0],   (%[sq0])    \n\t"
+          "sw       %[q1_f0],   (%[sq1])    \n\t"
+
+          :
+          : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0),
+            [q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0),
+            [sp1] "r" (sp1), [sp0] "r" (sp0),
+            [sq0] "r" (sq0), [sq1] "r" (sq1)
+      );
+    } else if ((mask & flat) == 0xFFFFFFFF) {
+      /* left 2 element operation */
+      PACK_LEFT_0TO3()
+      vp9_mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l,
+                         &q0_l, &q1_l, &q2_l, &q3_l);
+
+      /* right 2 element operation */
+      PACK_RIGHT_0TO3()
+      vp9_mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r,
+                         &q0_r, &q1_r, &q2_r, &q3_r);
+
+      COMBINE_LEFT_RIGHT_0TO2()
+
+      __asm__ __volatile__ (
+          "sw       %[p2],      (%[sp2])    \n\t"
+          "sw       %[p1],      (%[sp1])    \n\t"
+          "sw       %[p0],      (%[sp0])    \n\t"
+          "sw       %[q0],      (%[sq0])    \n\t"
+          "sw       %[q1],      (%[sq1])    \n\t"
+          "sw       %[q2],      (%[sq2])    \n\t"
+
+          :
+          : [p2] "r" (p2), [p1] "r" (p1), [p0] "r" (p0),
+            [q0] "r" (q0), [q1] "r" (q1), [q2] "r" (q2),
+            [sp2] "r" (sp2), [sp1] "r" (sp1), [sp0] "r" (sp0),
+            [sq0] "r" (sq0), [sq1] "r" (sq1), [sq2] "r" (sq2)
+      );
+    } else if ((flat != 0) && (mask != 0)) {
+      /* filtering */
+      vp9_filter1_dspr2(mask, hev, p1, p0, q0, q1,
+                        &p1_f0, &p0_f0, &q0_f0, &q1_f0);
+
+      /* left 2 element operation */
+      PACK_LEFT_0TO3()
+      vp9_mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l,
+                         &q0_l, &q1_l, &q2_l, &q3_l);
+
+      /* right 2 element operation */
+      PACK_RIGHT_0TO3()
+      vp9_mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r,
+                         &q0_r, &q1_r, &q2_r, &q3_r);
+
+      if (mask & flat & 0x000000FF) {
+        __asm__ __volatile__ (
+            "sb     %[p2_r],    (%[sp2])    \n\t"
+            "sb     %[p1_r],    (%[sp1])    \n\t"
+            "sb     %[p0_r],    (%[sp0])    \n\t"
+            "sb     %[q0_r],    (%[sq0])    \n\t"
+            "sb     %[q1_r],    (%[sq1])    \n\t"
+            "sb     %[q2_r],    (%[sq2])    \n\t"
+
+            :
+            : [p2_r] "r" (p2_r), [p1_r] "r" (p1_r), [p0_r] "r" (p0_r),
+              [q0_r] "r" (q0_r), [q1_r] "r" (q1_r), [q2_r] "r" (q2_r),
+              [sp2] "r" (sp2), [sp1] "r" (sp1), [sp0] "r" (sp0),
+              [sq0] "r" (sq0), [sq1] "r" (sq1), [sq2] "r" (sq2)
+        );
+      } else if (mask & 0x000000FF) {
+        __asm__ __volatile__ (
+            "sb         %[p1_f0],  (%[sp1])    \n\t"
+            "sb         %[p0_f0],  (%[sp0])    \n\t"
+            "sb         %[q0_f0],  (%[sq0])    \n\t"
+            "sb         %[q1_f0],  (%[sq1])    \n\t"
+
+            :
+            : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0),
+              [q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0),
+              [sp1] "r" (sp1), [sp0] "r" (sp0),
+              [sq0] "r" (sq0), [sq1] "r" (sq1)
+        );
+      }
+
+      __asm__ __volatile__ (
+          "srl      %[p2_r],    %[p2_r],    16      \n\t"
+          "srl      %[p1_r],    %[p1_r],    16      \n\t"
+          "srl      %[p0_r],    %[p0_r],    16      \n\t"
+          "srl      %[q0_r],    %[q0_r],    16      \n\t"
+          "srl      %[q1_r],    %[q1_r],    16      \n\t"
+          "srl      %[q2_r],    %[q2_r],    16      \n\t"
+          "srl      %[p1_f0],   %[p1_f0],   8       \n\t"
+          "srl      %[p0_f0],   %[p0_f0],   8       \n\t"
+          "srl      %[q0_f0],   %[q0_f0],   8       \n\t"
+          "srl      %[q1_f0],   %[q1_f0],   8       \n\t"
+
+          : [p2_r] "+r" (p2_r), [p1_r] "+r" (p1_r), [p0_r] "+r" (p0_r),
+            [q0_r] "+r" (q0_r), [q1_r] "+r" (q1_r), [q2_r] "+r" (q2_r),
+            [p1_f0] "+r" (p1_f0), [p0_f0] "+r" (p0_f0),
+            [q0_f0] "+r" (q0_f0), [q1_f0] "+r" (q1_f0)
+          :
+      );
+
+      if (mask & flat & 0x0000FF00) {
+        __asm__ __volatile__ (
+            "sb     %[p2_r],    +1(%[sp2])    \n\t"
+            "sb     %[p1_r],    +1(%[sp1])    \n\t"
+            "sb     %[p0_r],    +1(%[sp0])    \n\t"
+            "sb     %[q0_r],    +1(%[sq0])    \n\t"
+            "sb     %[q1_r],    +1(%[sq1])    \n\t"
+            "sb     %[q2_r],    +1(%[sq2])    \n\t"
+
+            :
+            : [p2_r] "r" (p2_r), [p1_r] "r" (p1_r), [p0_r] "r" (p0_r),
+              [q0_r] "r" (q0_r), [q1_r] "r" (q1_r), [q2_r] "r" (q2_r),
+              [sp2] "r" (sp2), [sp1] "r" (sp1), [sp0] "r" (sp0),
+              [sq0] "r" (sq0), [sq1] "r" (sq1), [sq2] "r" (sq2)
+        );
+      } else if (mask & 0x0000FF00) {
+        __asm__ __volatile__ (
+            "sb     %[p1_f0],   +1(%[sp1])    \n\t"
+            "sb     %[p0_f0],   +1(%[sp0])    \n\t"
+            "sb     %[q0_f0],   +1(%[sq0])    \n\t"
+            "sb     %[q1_f0],   +1(%[sq1])    \n\t"
+
+            :
+            : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0),
+              [q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0),
+              [sp1] "r" (sp1), [sp0] "r" (sp0),
+              [sq0] "r" (sq0), [sq1] "r" (sq1)
+        );
+      }
+
+      __asm__ __volatile__ (
+          "srl      %[p1_f0],   %[p1_f0],   8     \n\t"
+          "srl      %[p0_f0],   %[p0_f0],   8     \n\t"
+          "srl      %[q0_f0],   %[q0_f0],   8     \n\t"
+          "srl      %[q1_f0],   %[q1_f0],   8     \n\t"
+
+          : [p2] "+r" (p2), [p1] "+r" (p1), [p0] "+r" (p0),
+            [q0] "+r" (q0), [q1] "+r" (q1), [q2] "+r" (q2),
+            [p1_f0] "+r" (p1_f0), [p0_f0] "+r" (p0_f0),
+            [q0_f0] "+r" (q0_f0), [q1_f0] "+r" (q1_f0)
+          :
+      );
+
+      if (mask & flat & 0x00FF0000) {
+        __asm__ __volatile__ (
+            "sb     %[p2_l],    +2(%[sp2])    \n\t"
+            "sb     %[p1_l],    +2(%[sp1])    \n\t"
+            "sb     %[p0_l],    +2(%[sp0])    \n\t"
+            "sb     %[q0_l],    +2(%[sq0])    \n\t"
+            "sb     %[q1_l],    +2(%[sq1])    \n\t"
+            "sb     %[q2_l],    +2(%[sq2])    \n\t"
+
+            :
+            : [p2_l] "r" (p2_l), [p1_l] "r" (p1_l), [p0_l] "r" (p0_l),
+              [q0_l] "r" (q0_l), [q1_l] "r" (q1_l), [q2_l] "r" (q2_l),
+              [sp2] "r" (sp2), [sp1] "r" (sp1), [sp0] "r" (sp0),
+              [sq0] "r" (sq0), [sq1] "r" (sq1), [sq2] "r" (sq2)
+        );
+      } else if (mask & 0x00FF0000) {
+        __asm__ __volatile__ (
+            "sb     %[p1_f0],   +2(%[sp1])    \n\t"
+            "sb     %[p0_f0],   +2(%[sp0])    \n\t"
+            "sb     %[q0_f0],   +2(%[sq0])    \n\t"
+            "sb     %[q1_f0],   +2(%[sq1])    \n\t"
+
+            :
+            : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0),
+              [q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0),
+              [sp1] "r" (sp1), [sp0] "r" (sp0),
+              [sq0] "r" (sq0), [sq1] "r" (sq1)
+        );
+      }
+
+      __asm__ __volatile__ (
+          "srl      %[p2_l],    %[p2_l],    16      \n\t"
+          "srl      %[p1_l],    %[p1_l],    16      \n\t"
+          "srl      %[p0_l],    %[p0_l],    16      \n\t"
+          "srl      %[q0_l],    %[q0_l],    16      \n\t"
+          "srl      %[q1_l],    %[q1_l],    16      \n\t"
+          "srl      %[q2_l],    %[q2_l],    16      \n\t"
+          "srl      %[p1_f0],   %[p1_f0],   8       \n\t"
+          "srl      %[p0_f0],   %[p0_f0],   8       \n\t"
+          "srl      %[q0_f0],   %[q0_f0],   8       \n\t"
+          "srl      %[q1_f0],   %[q1_f0],   8       \n\t"
+
+          : [p2_l] "+r" (p2_l), [p1_l] "+r" (p1_l), [p0_l] "+r" (p0_l),
+            [q0_l] "+r" (q0_l), [q1_l] "+r" (q1_l), [q2_l] "+r" (q2_l),
+            [p1_f0] "+r" (p1_f0), [p0_f0] "+r" (p0_f0),
+            [q0_f0] "+r" (q0_f0), [q1_f0] "+r" (q1_f0)
+          :
+      );
+
+      if (mask & flat & 0xFF000000) {
+        __asm__ __volatile__ (
+            "sb     %[p2_l],    +3(%[sp2])    \n\t"
+            "sb     %[p1_l],    +3(%[sp1])    \n\t"
+            "sb     %[p0_l],    +3(%[sp0])    \n\t"
+            "sb     %[q0_l],    +3(%[sq0])    \n\t"
+            "sb     %[q1_l],    +3(%[sq1])    \n\t"
+            "sb     %[q2_l],    +3(%[sq2])    \n\t"
+
+            :
+            : [p2_l] "r" (p2_l), [p1_l] "r" (p1_l), [p0_l] "r" (p0_l),
+              [q0_l] "r" (q0_l), [q1_l] "r" (q1_l), [q2_l] "r" (q2_l),
+              [sp2] "r" (sp2), [sp1] "r" (sp1), [sp0] "r" (sp0),
+              [sq0] "r" (sq0), [sq1] "r" (sq1), [sq2] "r" (sq2)
+        );
+      } else if (mask & 0xFF000000) {
+        __asm__ __volatile__ (
+            "sb     %[p1_f0],   +3(%[sp1])    \n\t"
+            "sb     %[p0_f0],   +3(%[sp0])    \n\t"
+            "sb     %[q0_f0],   +3(%[sq0])    \n\t"
+            "sb     %[q1_f0],   +3(%[sq1])    \n\t"
+
+            :
+            : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0),
+              [q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0),
+              [sp1] "r" (sp1), [sp0] "r" (sp0),
+              [sq0] "r" (sq0), [sq1] "r" (sq1)
+        );
+      }
+    }
+
+    s = s + 4;
+  }
+}
+
+void vp9_mbloop_filter_vertical_edge_dspr2(unsigned char *s,
+                                           int pitch,
+                                           const uint8_t *blimit,
+                                           const uint8_t *limit,
+                                           const uint8_t *thresh,
+                                           int count) {
+  uint8_t   i;
+  uint32_t  mask, hev, flat;
+  uint8_t   *s1, *s2, *s3, *s4;
+  uint32_t  prim1, prim2, sec3, sec4, prim3, prim4;
+  uint32_t  thresh_vec, flimit_vec, limit_vec;
+  uint32_t  uflimit, ulimit, uthresh;
+  uint32_t  p3, p2, p1, p0, q3, q2, q1, q0;
+  uint32_t  p1_f0, p0_f0, q0_f0, q1_f0;
+  uint32_t  p0_l, p1_l, p2_l, p3_l, q0_l, q1_l, q2_l, q3_l;
+  uint32_t  p0_r, p1_r, p2_r, p3_r, q0_r, q1_r, q2_r, q3_r;
+
+  uflimit = *blimit;
+  ulimit  = *limit;
+  uthresh = *thresh;
+
+  /* create quad-byte */
+  __asm__ __volatile__ (
+      "replv.qb     %[thresh_vec],  %[uthresh]    \n\t"
+      "replv.qb     %[flimit_vec],  %[uflimit]    \n\t"
+      "replv.qb     %[limit_vec],   %[ulimit]     \n\t"
+
+      : [thresh_vec] "=&r" (thresh_vec), [flimit_vec] "=&r" (flimit_vec),
+        [limit_vec] "=r" (limit_vec)
+      : [uthresh] "r" (uthresh), [uflimit] "r" (uflimit), [ulimit] "r" (ulimit)
+  );
+
+  vp9_prefetch_store(s + pitch);
+
+  for (i = 0; i < 2; i++) {
+    s1 = s;
+    s2 = s + pitch;
+    s3 = s2 + pitch;
+    s4 = s3 + pitch;
+    s  = s4 + pitch;
+
+    __asm__ __volatile__ (
+        "lw     %[p0],  -4(%[s1])    \n\t"
+        "lw     %[p1],  -4(%[s2])    \n\t"
+        "lw     %[p2],  -4(%[s3])    \n\t"
+        "lw     %[p3],  -4(%[s4])    \n\t"
+        "lw     %[q3],    (%[s1])    \n\t"
+        "lw     %[q2],    (%[s2])    \n\t"
+        "lw     %[q1],    (%[s3])    \n\t"
+        "lw     %[q0],    (%[s4])    \n\t"
+
+        : [p3] "=&r" (p3), [p2] "=&r" (p2), [p1] "=&r" (p1), [p0] "=&r" (p0),
+          [q0] "=&r" (q0), [q1] "=&r" (q1), [q2] "=&r" (q2), [q3] "=&r" (q3)
+        : [s1] "r" (s1), [s2] "r" (s2), [s3] "r" (s3), [s4] "r" (s4)
+    );
+
+    /* transpose p3, p2, p1, p0
+       original (when loaded from memory)
+       register       -4    -3   -2     -1
+         p0         p0_0  p0_1  p0_2  p0_3
+         p1         p1_0  p1_1  p1_2  p1_3
+         p2         p2_0  p2_1  p2_2  p2_3
+         p3         p3_0  p3_1  p3_2  p3_3
+
+       after transpose
+       register
+         p0         p3_3  p2_3  p1_3  p0_3
+         p1         p3_2  p2_2  p1_2  p0_2
+         p2         p3_1  p2_1  p1_1  p0_1
+         p3         p3_0  p2_0  p1_0  p0_0
+    */
+    __asm__ __volatile__ (
+        "precrq.qb.ph   %[prim1],   %[p0],      %[p1]       \n\t"
+        "precr.qb.ph    %[prim2],   %[p0],      %[p1]       \n\t"
+        "precrq.qb.ph   %[prim3],   %[p2],      %[p3]       \n\t"
+        "precr.qb.ph    %[prim4],   %[p2],      %[p3]       \n\t"
+
+        "precrq.qb.ph   %[p1],      %[prim1],   %[prim2]    \n\t"
+        "precr.qb.ph    %[p3],      %[prim1],   %[prim2]    \n\t"
+        "precrq.qb.ph   %[sec3],    %[prim3],   %[prim4]    \n\t"
+        "precr.qb.ph    %[sec4],    %[prim3],   %[prim4]    \n\t"
+
+        "precrq.ph.w    %[p0],      %[p1],      %[sec3]     \n\t"
+        "precrq.ph.w    %[p2],      %[p3],      %[sec4]     \n\t"
+        "append         %[p1],      %[sec3],    16          \n\t"
+        "append         %[p3],      %[sec4],    16          \n\t"
+
+        : [prim1] "=&r" (prim1), [prim2] "=&r" (prim2),
+          [prim3] "=&r" (prim3), [prim4] "=&r" (prim4),
+          [p0] "+r" (p0), [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3),
+          [sec3] "=&r" (sec3), [sec4] "=&r" (sec4)
+        :
+    );
+
+    /* transpose q0, q1, q2, q3
+       original (when loaded from memory)
+       register       +1    +2    +3    +4
+         q3         q3_0  q3_1  q3_2  q3_3
+         q2         q2_0  q2_1  q2_2  q2_3
+         q1         q1_0  q1_1  q1_2  q1_3
+         q0         q0_0  q0_1  q0_2  q0_3
+
+       after transpose
+       register
+         q3         q0_3  q1_3  q2_3  q3_3
+         q2         q0_2  q1_2  q2_2  q3_2
+         q1         q0_1  q1_1  q2_1  q3_1
+         q0         q0_0  q1_0  q2_0  q3_0
+    */
+    __asm__ __volatile__ (
+        "precrq.qb.ph   %[prim1],   %[q3],      %[q2]       \n\t"
+        "precr.qb.ph    %[prim2],   %[q3],      %[q2]       \n\t"
+        "precrq.qb.ph   %[prim3],   %[q1],      %[q0]       \n\t"
+        "precr.qb.ph    %[prim4],   %[q1],      %[q0]       \n\t"
+
+        "precrq.qb.ph   %[q2],      %[prim1],   %[prim2]    \n\t"
+        "precr.qb.ph    %[q0],      %[prim1],   %[prim2]    \n\t"
+        "precrq.qb.ph   %[sec3],    %[prim3],   %[prim4]    \n\t"
+        "precr.qb.ph    %[sec4],    %[prim3],   %[prim4]    \n\t"
+
+        "precrq.ph.w    %[q3],      %[q2],      %[sec3]     \n\t"
+        "precrq.ph.w    %[q1],      %[q0],      %[sec4]     \n\t"
+        "append         %[q2],      %[sec3],    16          \n\t"
+        "append         %[q0],      %[sec4],    16          \n\t"
+
+        : [prim1] "=&r" (prim1), [prim2] "=&r" (prim2),
+          [prim3] "=&r" (prim3), [prim4] "=&r" (prim4),
+          [q3] "+r" (q3), [q2] "+r" (q2), [q1] "+r" (q1), [q0] "+r" (q0),
+          [sec3] "=&r" (sec3), [sec4] "=&r" (sec4)
+        :
+    );
+
+    vp9_filter_hev_mask_flatmask4_dspr2(limit_vec, flimit_vec, thresh_vec,
+                                        p1, p0, p3, p2, q0, q1, q2, q3,
+                                        &hev, &mask, &flat);
+
+    if ((flat == 0) && (mask != 0)) {
+      vp9_filter1_dspr2(mask, hev, p1, p0, q0, q1,
+                        &p1_f0, &p0_f0, &q0_f0, &q1_f0);
+      STORE_F0()
+    } else if ((mask & flat) == 0xFFFFFFFF) {
+      /* left 2 element operation */
+      PACK_LEFT_0TO3()
+      vp9_mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l,
+                         &q0_l, &q1_l, &q2_l, &q3_l);
+
+      /* right 2 element operation */
+      PACK_RIGHT_0TO3()
+      vp9_mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r,
+                         &q0_r, &q1_r, &q2_r, &q3_r);
+
+      STORE_F1()
+    } else if ((flat != 0) && (mask != 0)) {
+      vp9_filter1_dspr2(mask, hev, p1, p0, q0, q1,
+                        &p1_f0, &p0_f0, &q0_f0, &q1_f0);
+
+      /* left 2 element operation */
+      PACK_LEFT_0TO3()
+      vp9_mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l,
+                         &q0_l, &q1_l, &q2_l, &q3_l);
+
+      /* right 2 element operation */
+      PACK_RIGHT_0TO3()
+      vp9_mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r,
+                         &q0_r, &q1_r, &q2_r, &q3_r);
+
+      if (mask & flat & 0x000000FF) {
+        __asm__ __volatile__ (
+            "sb         %[p2_r],  -3(%[s4])    \n\t"
+            "sb         %[p1_r],  -2(%[s4])    \n\t"
+            "sb         %[p0_r],  -1(%[s4])    \n\t"
+            "sb         %[q0_r],    (%[s4])    \n\t"
+            "sb         %[q1_r],  +1(%[s4])    \n\t"
+            "sb         %[q2_r],  +2(%[s4])    \n\t"
+
+            :
+            : [p2_r] "r" (p2_r), [p1_r] "r" (p1_r), [p0_r] "r" (p0_r),
+              [q0_r] "r" (q0_r), [q1_r] "r" (q1_r), [q2_r] "r" (q2_r),
+              [s4] "r" (s4)
+        );
+      } else if (mask & 0x000000FF) {
+        __asm__ __volatile__ (
+            "sb         %[p1_f0],  -2(%[s4])    \n\t"
+            "sb         %[p0_f0],  -1(%[s4])    \n\t"
+            "sb         %[q0_f0],    (%[s4])    \n\t"
+            "sb         %[q1_f0],  +1(%[s4])    \n\t"
+
+            :
+            : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0),
+              [q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0),
+              [s4] "r" (s4)
+        );
+      }
+
+      __asm__ __volatile__ (
+          "srl      %[p2_r],    %[p2_r],    16      \n\t"
+          "srl      %[p1_r],    %[p1_r],    16      \n\t"
+          "srl      %[p0_r],    %[p0_r],    16      \n\t"
+          "srl      %[q0_r],    %[q0_r],    16      \n\t"
+          "srl      %[q1_r],    %[q1_r],    16      \n\t"
+          "srl      %[q2_r],    %[q2_r],    16      \n\t"
+          "srl      %[p1_f0],   %[p1_f0],   8       \n\t"
+          "srl      %[p0_f0],   %[p0_f0],   8       \n\t"
+          "srl      %[q0_f0],   %[q0_f0],   8       \n\t"
+          "srl      %[q1_f0],   %[q1_f0],   8       \n\t"
+
+          : [p2_r] "+r" (p2_r), [p1_r] "+r" (p1_r), [p0_r] "+r" (p0_r),
+            [q0_r] "+r" (q0_r), [q1_r] "+r" (q1_r), [q2_r] "+r" (q2_r),
+            [p1_f0] "+r" (p1_f0), [p0_f0] "+r" (p0_f0),
+            [q0_f0] "+r" (q0_f0), [q1_f0] "+r" (q1_f0)
+          :
+      );
+
+      if (mask & flat & 0x0000FF00) {
+        __asm__ __volatile__ (
+            "sb         %[p2_r],  -3(%[s3])    \n\t"
+            "sb         %[p1_r],  -2(%[s3])    \n\t"
+            "sb         %[p0_r],  -1(%[s3])    \n\t"
+            "sb         %[q0_r],    (%[s3])    \n\t"
+            "sb         %[q1_r],  +1(%[s3])    \n\t"
+            "sb         %[q2_r],  +2(%[s3])    \n\t"
+
+            :
+            : [p2_r] "r" (p2_r), [p1_r] "r" (p1_r), [p0_r] "r" (p0_r),
+              [q0_r] "r" (q0_r), [q1_r] "r" (q1_r), [q2_r] "r" (q2_r),
+              [s3] "r" (s3)
+        );
+      } else if (mask & 0x0000FF00) {
+        __asm__ __volatile__ (
+            "sb         %[p1_f0],  -2(%[s3])    \n\t"
+            "sb         %[p0_f0],  -1(%[s3])    \n\t"
+            "sb         %[q0_f0],    (%[s3])    \n\t"
+            "sb         %[q1_f0],  +1(%[s3])    \n\t"
+
+            :
+            : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0),
+              [q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0),
+              [s3] "r" (s3)
+        );
+      }
+
+      __asm__ __volatile__ (
+          "srl      %[p1_f0],   %[p1_f0],   8     \n\t"
+          "srl      %[p0_f0],   %[p0_f0],   8     \n\t"
+          "srl      %[q0_f0],   %[q0_f0],   8     \n\t"
+          "srl      %[q1_f0],   %[q1_f0],   8     \n\t"
+
+          : [p2] "+r" (p2), [p1] "+r" (p1), [p0] "+r" (p0),
+            [q0] "+r" (q0), [q1] "+r" (q1), [q2] "+r" (q2),
+            [p1_f0] "+r" (p1_f0), [p0_f0] "+r" (p0_f0),
+            [q0_f0] "+r" (q0_f0), [q1_f0] "+r" (q1_f0)
+          :
+      );
+
+      if (mask & flat & 0x00FF0000) {
+        __asm__ __volatile__ (
+          "sb         %[p2_l],  -3(%[s2])    \n\t"
+          "sb         %[p1_l],  -2(%[s2])    \n\t"
+          "sb         %[p0_l],  -1(%[s2])    \n\t"
+          "sb         %[q0_l],    (%[s2])    \n\t"
+          "sb         %[q1_l],  +1(%[s2])    \n\t"
+          "sb         %[q2_l],  +2(%[s2])    \n\t"
+
+          :
+          : [p2_l] "r" (p2_l), [p1_l] "r" (p1_l), [p0_l] "r" (p0_l),
+            [q0_l] "r" (q0_l), [q1_l] "r" (q1_l), [q2_l] "r" (q2_l),
+            [s2] "r" (s2)
+        );
+      } else if (mask & 0x00FF0000) {
+        __asm__ __volatile__ (
+            "sb         %[p1_f0],  -2(%[s2])    \n\t"
+            "sb         %[p0_f0],  -1(%[s2])    \n\t"
+            "sb         %[q0_f0],    (%[s2])    \n\t"
+            "sb         %[q1_f0],  +1(%[s2])    \n\t"
+
+            :
+            : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0),
+              [q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0),
+              [s2] "r" (s2)
+        );
+      }
+
+      __asm__ __volatile__ (
+          "srl      %[p2_l],    %[p2_l],    16      \n\t"
+          "srl      %[p1_l],    %[p1_l],    16      \n\t"
+          "srl      %[p0_l],    %[p0_l],    16      \n\t"
+          "srl      %[q0_l],    %[q0_l],    16      \n\t"
+          "srl      %[q1_l],    %[q1_l],    16      \n\t"
+          "srl      %[q2_l],    %[q2_l],    16      \n\t"
+          "srl      %[p1_f0],   %[p1_f0],   8       \n\t"
+          "srl      %[p0_f0],   %[p0_f0],   8       \n\t"
+          "srl      %[q0_f0],   %[q0_f0],   8       \n\t"
+          "srl      %[q1_f0],   %[q1_f0],   8       \n\t"
+
+          : [p2_l] "+r" (p2_l), [p1_l] "+r" (p1_l), [p0_l] "+r" (p0_l),
+            [q0_l] "+r" (q0_l), [q1_l] "+r" (q1_l), [q2_l] "+r" (q2_l),
+            [p1_f0] "+r" (p1_f0), [p0_f0] "+r" (p0_f0),
+            [q0_f0] "+r" (q0_f0), [q1_f0] "+r" (q1_f0)
+          :
+      );
+
+      if (mask & flat & 0xFF000000) {
+        __asm__ __volatile__ (
+            "sb         %[p2_l],  -3(%[s1])    \n\t"
+            "sb         %[p1_l],  -2(%[s1])    \n\t"
+            "sb         %[p0_l],  -1(%[s1])    \n\t"
+            "sb         %[q0_l],    (%[s1])    \n\t"
+            "sb         %[q1_l],  +1(%[s1])    \n\t"
+            "sb         %[q2_l],  +2(%[s1])    \n\t"
+
+            :
+            : [p2_l] "r" (p2_l), [p1_l] "r" (p1_l), [p0_l] "r" (p0_l),
+              [q0_l] "r" (q0_l), [q1_l] "r" (q1_l), [q2_l] "r" (q2_l),
+              [s1] "r" (s1)
+        );
+      } else if (mask & 0xFF000000) {
+        __asm__ __volatile__ (
+            "sb         %[p1_f0],  -2(%[s1])    \n\t"
+            "sb         %[p0_f0],  -1(%[s1])    \n\t"
+            "sb         %[q0_f0],    (%[s1])    \n\t"
+            "sb         %[q1_f0],  +1(%[s1])    \n\t"
+
+            :
+            : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0), [q0_f0] "r" (q0_f0),
+              [q1_f0] "r" (q1_f0), [s1] "r" (s1)
+        );
+      }
+    }
+  }
+}
+#endif  // #if HAVE_DSPR2
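For reference (not part of the patch): in the mixed case of the functions above, where flat and mask are each nonzero but not all-ones, results are committed one byte lane at a time: the corresponding byte of mask and flat is tested, a byte is stored with sb, and the srl sequences then move the next lane of the packed _r/_l results and of p1_f0..q1_f0 into the low byte before the next conditional store. A hedged scalar sketch of that per-lane selection (the helper name is ours):

#include <stdint.h>

/* Pick the value to store for one byte lane (lane 0..3): the mbfilter result
 * where both mask and flat are set for that lane, the 4-tap filter result
 * where only mask is set, and the original pixel otherwise. */
static uint8_t select_lane(uint32_t mask, uint32_t flat, int lane,
                           uint8_t mb_out, uint8_t f0_out, uint8_t orig) {
  const uint32_t lane_bits = 0xFFu << (8 * lane);
  if (mask & flat & lane_bits)
    return mb_out;
  if (mask & lane_bits)
    return f0_out;
  return orig;
}

The all-ones and all-zero cases are handled separately above with whole-word sw stores, so this lane-by-lane path only runs when the filter decision actually differs between the four pixels of the word.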
diff --git a/vp9/common/mips/dspr2/vp9_mblpf_horiz_loopfilter_dspr2.c b/vp9/common/mips/dspr2/vp9_mblpf_horiz_loopfilter_dspr2.c
new file mode 100644
index 0000000..0759755
--- /dev/null
+++ b/vp9/common/mips/dspr2/vp9_mblpf_horiz_loopfilter_dspr2.c
@@ -0,0 +1,795 @@
+/*
+ *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stdlib.h>
+
+#include "./vp9_rtcd.h"
+#include "vp9/common/vp9_common.h"
+#include "vp9/common/vp9_loopfilter.h"
+#include "vp9/common/vp9_onyxc_int.h"
+#include "vp9/common/mips/dspr2/vp9_common_dspr2.h"
+#include "vp9/common/mips/dspr2/vp9_loopfilter_macros_dspr2.h"
+#include "vp9/common/mips/dspr2/vp9_loopfilter_masks_dspr2.h"
+#include "vp9/common/mips/dspr2/vp9_loopfilter_filters_dspr2.h"
+
+#if HAVE_DSPR2
+void vp9_mb_lpf_horizontal_edge_w_dspr2(unsigned char *s,
+                                        int pitch,
+                                        const uint8_t *blimit,
+                                        const uint8_t *limit,
+                                        const uint8_t *thresh,
+                                        int count) {
+  uint32_t  mask;
+  uint32_t  hev, flat, flat2;
+  uint8_t   i;
+  uint8_t   *sp7, *sp6, *sp5, *sp4, *sp3, *sp2, *sp1, *sp0;
+  uint8_t   *sq0, *sq1, *sq2, *sq3, *sq4, *sq5, *sq6, *sq7;
+  uint32_t  thresh_vec, flimit_vec, limit_vec;
+  uint32_t  uflimit, ulimit, uthresh;
+  uint32_t  p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
+  uint32_t  p1_f0, p0_f0, q0_f0, q1_f0;
+  uint32_t  p7_l, p6_l, p5_l, p4_l, p3_l, p2_l, p1_l, p0_l;
+  uint32_t  q0_l, q1_l, q2_l, q3_l, q4_l, q5_l, q6_l, q7_l;
+  uint32_t  p7_r, p6_r, p5_r, p4_r, p3_r, p2_r, p1_r, p0_r;
+  uint32_t  q0_r, q1_r, q2_r, q3_r, q4_r, q5_r, q6_r, q7_r;
+  uint32_t  p2_l_f1, p1_l_f1, p0_l_f1, p2_r_f1, p1_r_f1, p0_r_f1;
+  uint32_t  q0_l_f1, q1_l_f1, q2_l_f1, q0_r_f1, q1_r_f1, q2_r_f1;
+
+  uflimit = *blimit;
+  ulimit  = *limit;
+  uthresh = *thresh;
+
+  /* create quad-byte */
+  __asm__ __volatile__ (
+      "replv.qb       %[thresh_vec],    %[uthresh]      \n\t"
+      "replv.qb       %[flimit_vec],    %[uflimit]      \n\t"
+      "replv.qb       %[limit_vec],     %[ulimit]       \n\t"
+
+      : [thresh_vec] "=&r" (thresh_vec), [flimit_vec] "=&r" (flimit_vec),
+        [limit_vec] "=r" (limit_vec)
+      : [uthresh] "r" (uthresh), [uflimit] "r" (uflimit), [ulimit] "r" (ulimit)
+  );
+
+  /* prefetch data for store */
+  vp9_prefetch_store(s);
+
+  for (i = 0; i < (2 * count); i++) {
+    sp7 = s - (pitch << 3);
+    sp6 = sp7 + pitch;
+    sp5 = sp6 + pitch;
+    sp4 = sp5 + pitch;
+    sp3 = sp4 + pitch;
+    sp2 = sp3 + pitch;
+    sp1 = sp2 + pitch;
+    sp0 = sp1 + pitch;
+    sq0 = s;
+    sq1 = s + pitch;
+    sq2 = sq1 + pitch;
+    sq3 = sq2 + pitch;
+    sq4 = sq3 + pitch;
+    sq5 = sq4 + pitch;
+    sq6 = sq5 + pitch;
+    sq7 = sq6 + pitch;
+
+    __asm__ __volatile__ (
+        "lw     %[p7],      (%[sp7])            \n\t"
+        "lw     %[p6],      (%[sp6])            \n\t"
+        "lw     %[p5],      (%[sp5])            \n\t"
+        "lw     %[p4],      (%[sp4])            \n\t"
+        "lw     %[p3],      (%[sp3])            \n\t"
+        "lw     %[p2],      (%[sp2])            \n\t"
+        "lw     %[p1],      (%[sp1])            \n\t"
+        "lw     %[p0],      (%[sp0])            \n\t"
+
+        : [p3] "=&r" (p3), [p2] "=&r" (p2), [p1] "=&r" (p1), [p0] "=&r" (p0),
+          [p7] "=&r" (p7), [p6] "=&r" (p6), [p5] "=&r" (p5), [p4] "=&r" (p4)
+        : [sp3] "r" (sp3), [sp2] "r" (sp2), [sp1] "r" (sp1), [sp0] "r" (sp0),
+          [sp4] "r" (sp4), [sp5] "r" (sp5), [sp6] "r" (sp6), [sp7] "r" (sp7)
+    );
+
+    __asm__ __volatile__ (
+        "lw     %[q0],      (%[sq0])            \n\t"
+        "lw     %[q1],      (%[sq1])            \n\t"
+        "lw     %[q2],      (%[sq2])            \n\t"
+        "lw     %[q3],      (%[sq3])            \n\t"
+        "lw     %[q4],      (%[sq4])            \n\t"
+        "lw     %[q5],      (%[sq5])            \n\t"
+        "lw     %[q6],      (%[sq6])            \n\t"
+        "lw     %[q7],      (%[sq7])            \n\t"
+
+        : [q3] "=&r" (q3), [q2] "=&r" (q2), [q1] "=&r" (q1), [q0] "=&r" (q0),
+          [q7] "=&r" (q7), [q6] "=&r" (q6), [q5] "=&r" (q5), [q4] "=&r" (q4)
+        : [sq3] "r" (sq3), [sq2] "r" (sq2), [sq1] "r" (sq1), [sq0] "r" (sq0),
+          [sq4] "r" (sq4), [sq5] "r" (sq5), [sq6] "r" (sq6), [sq7] "r" (sq7)
+    );
+
+    vp9_filter_hev_mask_flatmask4_dspr2(limit_vec, flimit_vec, thresh_vec,
+                                        p1, p0, p3, p2, q0, q1, q2, q3,
+                                        &hev, &mask, &flat);
+
+    vp9_flatmask5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, &flat2);
+
+    /* f0 */
+    if (((flat2 == 0) && (flat == 0) && (mask != 0)) ||
+        ((flat2 != 0) && (flat == 0) && (mask != 0))) {
+      vp9_filter1_dspr2(mask, hev, p1, p0, q0, q1,
+                        &p1_f0, &p0_f0, &q0_f0, &q1_f0);
+
+      __asm__ __volatile__ (
+          "sw       %[p1_f0],   (%[sp1])            \n\t"
+          "sw       %[p0_f0],   (%[sp0])            \n\t"
+          "sw       %[q0_f0],   (%[sq0])            \n\t"
+          "sw       %[q1_f0],   (%[sq1])            \n\t"
+
+          :
+          : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0),
+            [q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0),
+            [sp1] "r" (sp1), [sp0] "r" (sp0),
+            [sq0] "r" (sq0), [sq1] "r" (sq1)
+      );
+    } else if ((flat2 == 0xFFFFFFFF) && (flat == 0xFFFFFFFF) &&
+               (mask == 0xFFFFFFFF)) {
+      /* f2 */
+      PACK_LEFT_0TO3()
+      PACK_LEFT_4TO7()
+      vp9_wide_mbfilter_dspr2(&p7_l, &p6_l, &p5_l, &p4_l,
+                              &p3_l, &p2_l, &p1_l, &p0_l,
+                              &q0_l, &q1_l, &q2_l, &q3_l,
+                              &q4_l, &q5_l, &q6_l, &q7_l);
+
+      PACK_RIGHT_0TO3()
+      PACK_RIGHT_4TO7()
+      vp9_wide_mbfilter_dspr2(&p7_r, &p6_r, &p5_r, &p4_r,
+                              &p3_r, &p2_r, &p1_r, &p0_r,
+                              &q0_r, &q1_r, &q2_r, &q3_r,
+                              &q4_r, &q5_r, &q6_r, &q7_r);
+
+      COMBINE_LEFT_RIGHT_0TO2()
+      COMBINE_LEFT_RIGHT_3TO6()
+
+      __asm__ __volatile__ (
+          "sw         %[p6], (%[sp6])    \n\t"
+          "sw         %[p5], (%[sp5])    \n\t"
+          "sw         %[p4], (%[sp4])    \n\t"
+          "sw         %[p3], (%[sp3])    \n\t"
+          "sw         %[p2], (%[sp2])    \n\t"
+          "sw         %[p1], (%[sp1])    \n\t"
+          "sw         %[p0], (%[sp0])    \n\t"
+
+          :
+          : [p6] "r" (p6), [p5] "r" (p5), [p4] "r" (p4), [p3] "r" (p3),
+            [p2] "r" (p2), [p1] "r" (p1), [p0] "r" (p0),
+            [sp6] "r" (sp6), [sp5] "r" (sp5), [sp4] "r" (sp4), [sp3] "r" (sp3),
+            [sp2] "r" (sp2), [sp1] "r" (sp1), [sp0] "r" (sp0)
+      );
+
+      __asm__ __volatile__ (
+          "sw         %[q6], (%[sq6])    \n\t"
+          "sw         %[q5], (%[sq5])    \n\t"
+          "sw         %[q4], (%[sq4])    \n\t"
+          "sw         %[q3], (%[sq3])    \n\t"
+          "sw         %[q2], (%[sq2])    \n\t"
+          "sw         %[q1], (%[sq1])    \n\t"
+          "sw         %[q0], (%[sq0])    \n\t"
+
+          :
+          : [q6] "r" (q6), [q5] "r" (q5), [q4] "r" (q4), [q3] "r" (q3),
+            [q2] "r" (q2), [q1] "r" (q1), [q0] "r" (q0),
+            [sq6] "r" (sq6), [sq5] "r" (sq5), [sq4] "r" (sq4), [sq3] "r" (sq3),
+            [sq2] "r" (sq2), [sq1] "r" (sq1), [sq0] "r" (sq0)
+      );
+    } else if ((flat2 == 0) && (flat == 0xFFFFFFFF) && (mask == 0xFFFFFFFF)) {
+      /* f1 */
+      /* left 2 element operation */
+      PACK_LEFT_0TO3()
+      vp9_mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l,
+                         &q0_l, &q1_l, &q2_l, &q3_l);
+
+      /* right 2 element operation */
+      PACK_RIGHT_0TO3()
+      vp9_mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r,
+                         &q0_r, &q1_r, &q2_r, &q3_r);
+
+      COMBINE_LEFT_RIGHT_0TO2()
+
+      __asm__ __volatile__ (
+          "sw         %[p2], (%[sp2])    \n\t"
+          "sw         %[p1], (%[sp1])    \n\t"
+          "sw         %[p0], (%[sp0])    \n\t"
+          "sw         %[q0], (%[sq0])    \n\t"
+          "sw         %[q1], (%[sq1])    \n\t"
+          "sw         %[q2], (%[sq2])    \n\t"
+
+          :
+          : [p2] "r" (p2), [p1] "r" (p1), [p0] "r" (p0),
+            [q0] "r" (q0), [q1] "r" (q1), [q2] "r" (q2),
+            [sp2] "r" (sp2), [sp1] "r" (sp1), [sp0] "r" (sp0),
+            [sq0] "r" (sq0), [sq1] "r" (sq1), [sq2] "r" (sq2)
+      );
+    } else if ((flat2 == 0) && (flat != 0) && (mask != 0)) {
+      /* f0+f1 */
+      vp9_filter1_dspr2(mask, hev, p1, p0, q0, q1,
+                        &p1_f0, &p0_f0, &q0_f0, &q1_f0);
+
+      /* left 2 element operation */
+      PACK_LEFT_0TO3()
+      vp9_mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l,
+                         &q0_l, &q1_l, &q2_l, &q3_l);
+
+      /* right 2 element operation */
+      PACK_RIGHT_0TO3()
+      vp9_mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r,
+                         &q0_r, &q1_r, &q2_r, &q3_r);
+
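+      /* Store pixel by pixel: each byte lane of mask/flat selects either the
+         f1 output (the *_r words for the two low lanes, *_l for the two high
+         lanes) or the f0 output, and the srl blocks in between shift the next
+         pixel down into the low byte before it is stored. */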
+      if (mask & flat & 0x000000FF) {
+        __asm__ __volatile__ (
+            "sb         %[p2_r],  (%[sp2])    \n\t"
+            "sb         %[p1_r],  (%[sp1])    \n\t"
+            "sb         %[p0_r],  (%[sp0])    \n\t"
+            "sb         %[q0_r],  (%[sq0])    \n\t"
+            "sb         %[q1_r],  (%[sq1])    \n\t"
+            "sb         %[q2_r],  (%[sq2])    \n\t"
+
+            :
+            : [p2_r] "r" (p2_r), [p1_r] "r" (p1_r), [p0_r] "r" (p0_r),
+              [q0_r] "r" (q0_r), [q1_r] "r" (q1_r), [q2_r] "r" (q2_r),
+              [sp2] "r" (sp2), [sp1] "r" (sp1), [sp0] "r" (sp0),
+              [sq0] "r" (sq0), [sq1] "r" (sq1), [sq2] "r" (sq2)
+        );
+      } else if (mask & 0x000000FF) {
+        __asm__ __volatile__ (
+            "sb         %[p1_f0],  (%[sp1])    \n\t"
+            "sb         %[p0_f0],  (%[sp0])    \n\t"
+            "sb         %[q0_f0],  (%[sq0])    \n\t"
+            "sb         %[q1_f0],  (%[sq1])    \n\t"
+
+            :
+            : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0),
+              [q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0),
+              [sp1] "r" (sp1), [sp0] "r" (sp0),
+              [sq0] "r" (sq0), [sq1] "r" (sq1)
+        );
+      }
+
+      __asm__ __volatile__ (
+          "srl      %[p2_r],    %[p2_r],    16      \n\t"
+          "srl      %[p1_r],    %[p1_r],    16      \n\t"
+          "srl      %[p0_r],    %[p0_r],    16      \n\t"
+          "srl      %[q0_r],    %[q0_r],    16      \n\t"
+          "srl      %[q1_r],    %[q1_r],    16      \n\t"
+          "srl      %[q2_r],    %[q2_r],    16      \n\t"
+          "srl      %[p1_f0],   %[p1_f0],   8       \n\t"
+          "srl      %[p0_f0],   %[p0_f0],   8       \n\t"
+          "srl      %[q0_f0],   %[q0_f0],   8       \n\t"
+          "srl      %[q1_f0],   %[q1_f0],   8       \n\t"
+
+          : [p2_r] "+r" (p2_r), [p1_r] "+r" (p1_r), [p0_r] "+r" (p0_r),
+            [q0_r] "+r" (q0_r), [q1_r] "+r" (q1_r), [q2_r] "+r" (q2_r),
+            [p1_f0] "+r" (p1_f0), [p0_f0] "+r" (p0_f0),
+            [q0_f0] "+r" (q0_f0), [q1_f0] "+r" (q1_f0)
+          :
+      );
+
+      if (mask & flat & 0x0000FF00) {
+        __asm__ __volatile__ (
+            "sb         %[p2_r],  +1(%[sp2])    \n\t"
+            "sb         %[p1_r],  +1(%[sp1])    \n\t"
+            "sb         %[p0_r],  +1(%[sp0])    \n\t"
+            "sb         %[q0_r],  +1(%[sq0])    \n\t"
+            "sb         %[q1_r],  +1(%[sq1])    \n\t"
+            "sb         %[q2_r],  +1(%[sq2])    \n\t"
+
+            :
+            : [p2_r] "r" (p2_r), [p1_r] "r" (p1_r), [p0_r] "r" (p0_r),
+              [q0_r] "r" (q0_r), [q1_r] "r" (q1_r), [q2_r] "r" (q2_r),
+              [sp2] "r" (sp2), [sp1] "r" (sp1), [sp0] "r" (sp0),
+              [sq0] "r" (sq0), [sq1] "r" (sq1), [sq2] "r" (sq2)
+        );
+      } else if (mask & 0x0000FF00) {
+        __asm__ __volatile__ (
+            "sb         %[p1_f0],  +1(%[sp1])    \n\t"
+            "sb         %[p0_f0],  +1(%[sp0])    \n\t"
+            "sb         %[q0_f0],  +1(%[sq0])    \n\t"
+            "sb         %[q1_f0],  +1(%[sq1])    \n\t"
+
+            :
+            : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0),
+              [q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0),
+              [sp1] "r" (sp1), [sp0] "r" (sp0),
+              [sq0] "r" (sq0), [sq1] "r" (sq1)
+        );
+      }
+
+      __asm__ __volatile__ (
+          "srl      %[p1_f0],   %[p1_f0],   8     \n\t"
+          "srl      %[p0_f0],   %[p0_f0],   8     \n\t"
+          "srl      %[q0_f0],   %[q0_f0],   8     \n\t"
+          "srl      %[q1_f0],   %[q1_f0],   8     \n\t"
+
+          : [p1_f0] "+r" (p1_f0), [p0_f0] "+r" (p0_f0),
+            [q0_f0] "+r" (q0_f0), [q1_f0] "+r" (q1_f0)
+          :
+      );
+
+      if (mask & flat & 0x00FF0000) {
+        __asm__ __volatile__ (
+            "sb         %[p2_l],  +2(%[sp2])    \n\t"
+            "sb         %[p1_l],  +2(%[sp1])    \n\t"
+            "sb         %[p0_l],  +2(%[sp0])    \n\t"
+            "sb         %[q0_l],  +2(%[sq0])    \n\t"
+            "sb         %[q1_l],  +2(%[sq1])    \n\t"
+            "sb         %[q2_l],  +2(%[sq2])    \n\t"
+
+            :
+            : [p2_l] "r" (p2_l), [p1_l] "r" (p1_l), [p0_l] "r" (p0_l),
+              [q0_l] "r" (q0_l), [q1_l] "r" (q1_l), [q2_l] "r" (q2_l),
+              [sp2] "r" (sp2), [sp1] "r" (sp1), [sp0] "r" (sp0),
+              [sq0] "r" (sq0), [sq1] "r" (sq1), [sq2] "r" (sq2)
+        );
+      } else if (mask & 0x00FF0000) {
+        __asm__ __volatile__ (
+            "sb         %[p1_f0],  +2(%[sp1])    \n\t"
+            "sb         %[p0_f0],  +2(%[sp0])    \n\t"
+            "sb         %[q0_f0],  +2(%[sq0])    \n\t"
+            "sb         %[q1_f0],  +2(%[sq1])    \n\t"
+
+            :
+            : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0),
+              [q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0),
+              [sp1] "r" (sp1), [sp0] "r" (sp0),
+              [sq0] "r" (sq0), [sq1] "r" (sq1)
+        );
+      }
+
+      __asm__ __volatile__ (
+          "srl      %[p2_l],    %[p2_l],    16      \n\t"
+          "srl      %[p1_l],    %[p1_l],    16      \n\t"
+          "srl      %[p0_l],    %[p0_l],    16      \n\t"
+          "srl      %[q0_l],    %[q0_l],    16      \n\t"
+          "srl      %[q1_l],    %[q1_l],    16      \n\t"
+          "srl      %[q2_l],    %[q2_l],    16      \n\t"
+          "srl      %[p1_f0],   %[p1_f0],   8       \n\t"
+          "srl      %[p0_f0],   %[p0_f0],   8       \n\t"
+          "srl      %[q0_f0],   %[q0_f0],   8       \n\t"
+          "srl      %[q1_f0],   %[q1_f0],   8       \n\t"
+
+          : [p2_l] "+r" (p2_l), [p1_l] "+r" (p1_l), [p0_l] "+r" (p0_l),
+            [q0_l] "+r" (q0_l), [q1_l] "+r" (q1_l), [q2_l] "+r" (q2_l),
+            [p1_f0] "+r" (p1_f0), [p0_f0] "+r" (p0_f0),
+            [q0_f0] "+r" (q0_f0), [q1_f0] "+r" (q1_f0)
+          :
+      );
+
+      if (mask & flat & 0xFF000000) {
+        __asm__ __volatile__ (
+            "sb         %[p2_l],  +3(%[sp2])    \n\t"
+            "sb         %[p1_l],  +3(%[sp1])    \n\t"
+            "sb         %[p0_l],  +3(%[sp0])    \n\t"
+            "sb         %[q0_l],  +3(%[sq0])    \n\t"
+            "sb         %[q1_l],  +3(%[sq1])    \n\t"
+            "sb         %[q2_l],  +3(%[sq2])    \n\t"
+
+            :
+            : [p2_l] "r" (p2_l), [p1_l] "r" (p1_l), [p0_l] "r" (p0_l),
+              [q0_l] "r" (q0_l), [q1_l] "r" (q1_l), [q2_l] "r" (q2_l),
+              [sp2] "r" (sp2), [sp1] "r" (sp1), [sp0] "r" (sp0),
+              [sq0] "r" (sq0), [sq1] "r" (sq1), [sq2] "r" (sq2)
+        );
+      } else if (mask & 0xFF000000) {
+        __asm__ __volatile__ (
+            "sb         %[p1_f0],  +3(%[sp1])    \n\t"
+            "sb         %[p0_f0],  +3(%[sp0])    \n\t"
+            "sb         %[q0_f0],  +3(%[sq0])    \n\t"
+            "sb         %[q1_f0],  +3(%[sq1])    \n\t"
+
+            :
+            : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0),
+              [q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0),
+              [sp1] "r" (sp1), [sp0] "r" (sp0),
+              [sq0] "r" (sq0), [sq1] "r" (sq1)
+        );
+      }
+    } else if ((flat2 != 0) && (flat != 0) && (mask != 0)) {
+      /* f0 + f1 + f2 */
+      /* f0  function */
+      vp9_filter1_dspr2(mask, hev, p1, p0, q0, q1,
+                        &p1_f0, &p0_f0, &q0_f0, &q1_f0);
+
+      /* f1  function */
+      /* left 2 element operation */
+      PACK_LEFT_0TO3()
+      vp9_mbfilter1_dspr2(p3_l, p2_l, p1_l, p0_l,
+                          q0_l, q1_l, q2_l, q3_l,
+                          &p2_l_f1, &p1_l_f1, &p0_l_f1,
+                          &q0_l_f1, &q1_l_f1, &q2_l_f1);
+
+      /* right 2 element operation */
+      PACK_RIGHT_0TO3()
+      vp9_mbfilter1_dspr2(p3_r, p2_r, p1_r, p0_r,
+                          q0_r, q1_r, q2_r, q3_r,
+                          &p2_r_f1, &p1_r_f1, &p0_r_f1,
+                          &q0_r_f1, &q1_r_f1, &q2_r_f1);
+
+      /* f2  function */
+      PACK_LEFT_4TO7()
+      vp9_wide_mbfilter_dspr2(&p7_l, &p6_l, &p5_l, &p4_l,
+                              &p3_l, &p2_l, &p1_l, &p0_l,
+                              &q0_l, &q1_l, &q2_l, &q3_l,
+                              &q4_l, &q5_l, &q6_l, &q7_l);
+
+      PACK_RIGHT_4TO7()
+      vp9_wide_mbfilter_dspr2(&p7_r, &p6_r, &p5_r, &p4_r,
+                              &p3_r, &p2_r, &p1_r, &p0_r,
+                              &q0_r, &q1_r, &q2_r, &q3_r,
+                              &q4_r, &q5_r, &q6_r, &q7_r);
+
+      if (mask & flat & flat2 & 0x000000FF) {
+        __asm__ __volatile__ (
+            "sb         %[p6_r],  (%[sp6])    \n\t"
+            "sb         %[p5_r],  (%[sp5])    \n\t"
+            "sb         %[p4_r],  (%[sp4])    \n\t"
+            "sb         %[p3_r],  (%[sp3])    \n\t"
+            "sb         %[p2_r],  (%[sp2])    \n\t"
+            "sb         %[p1_r],  (%[sp1])    \n\t"
+            "sb         %[p0_r],  (%[sp0])    \n\t"
+
+            :
+            : [p6_r] "r" (p6_r), [p5_r] "r" (p5_r), [p4_r] "r" (p4_r),
+              [p3_r] "r" (p3_r), [p2_r] "r" (p2_r), [p1_r] "r" (p1_r),
+              [sp6] "r" (sp6), [sp5] "r" (sp5), [sp4] "r" (sp4),
+              [sp3] "r" (sp3), [sp2] "r" (sp2), [sp1] "r" (sp1),
+              [p0_r] "r" (p0_r), [sp0] "r" (sp0)
+        );
+
+        __asm__ __volatile__ (
+            "sb         %[q0_r],  (%[sq0])    \n\t"
+            "sb         %[q1_r],  (%[sq1])    \n\t"
+            "sb         %[q2_r],  (%[sq2])    \n\t"
+            "sb         %[q3_r],  (%[sq3])    \n\t"
+            "sb         %[q4_r],  (%[sq4])    \n\t"
+            "sb         %[q5_r],  (%[sq5])    \n\t"
+            "sb         %[q6_r],  (%[sq6])    \n\t"
+
+            :
+            : [q0_r] "r" (q0_r), [q1_r] "r" (q1_r), [q2_r] "r" (q2_r),
+              [q3_r] "r" (q3_r), [q4_r] "r" (q4_r), [q5_r] "r" (q5_r),
+              [q6_r] "r" (q6_r),
+              [sq0] "r" (sq0), [sq1] "r" (sq1), [sq2] "r" (sq2),
+              [sq3] "r" (sq3), [sq4] "r" (sq4), [sq5] "r" (sq5),
+              [sq6] "r" (sq6)
+        );
+      } else if (mask & flat & 0x000000FF) {
+        __asm__ __volatile__ (
+            "sb         %[p2_r_f1],  (%[sp2])    \n\t"
+            "sb         %[p1_r_f1],  (%[sp1])    \n\t"
+            "sb         %[p0_r_f1],  (%[sp0])    \n\t"
+            "sb         %[q0_r_f1],  (%[sq0])    \n\t"
+            "sb         %[q1_r_f1],  (%[sq1])    \n\t"
+            "sb         %[q2_r_f1],  (%[sq2])    \n\t"
+
+            :
+            : [p2_r_f1] "r" (p2_r_f1), [p1_r_f1] "r" (p1_r_f1),
+              [p0_r_f1] "r" (p0_r_f1), [q0_r_f1] "r" (q0_r_f1),
+              [q1_r_f1] "r" (q1_r_f1), [q2_r_f1] "r" (q2_r_f1),
+              [sp2] "r" (sp2), [sp1] "r" (sp1), [sp0] "r" (sp0),
+              [sq0] "r" (sq0), [sq1] "r" (sq1), [sq2] "r" (sq2)
+        );
+      } else if (mask & 0x000000FF) {
+        __asm__ __volatile__ (
+            "sb         %[p1_f0],  (%[sp1])    \n\t"
+            "sb         %[p0_f0],  (%[sp0])    \n\t"
+            "sb         %[q0_f0],  (%[sq0])    \n\t"
+            "sb         %[q1_f0],  (%[sq1])    \n\t"
+
+            :
+            : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0), [q0_f0] "r" (q0_f0),
+              [q1_f0] "r" (q1_f0), [sp1] "r" (sp1), [sp0] "r" (sp0),
+              [sq0] "r" (sq0), [sq1] "r" (sq1)
+        );
+      }
+
+      __asm__ __volatile__ (
+          "srl        %[p6_r], %[p6_r], 16     \n\t"
+          "srl        %[p5_r], %[p5_r], 16     \n\t"
+          "srl        %[p4_r], %[p4_r], 16     \n\t"
+          "srl        %[p3_r], %[p3_r], 16     \n\t"
+          "srl        %[p2_r], %[p2_r], 16     \n\t"
+          "srl        %[p1_r], %[p1_r], 16     \n\t"
+          "srl        %[p0_r], %[p0_r], 16     \n\t"
+          "srl        %[q0_r], %[q0_r], 16     \n\t"
+          "srl        %[q1_r], %[q1_r], 16     \n\t"
+          "srl        %[q2_r], %[q2_r], 16     \n\t"
+          "srl        %[q3_r], %[q3_r], 16     \n\t"
+          "srl        %[q4_r], %[q4_r], 16     \n\t"
+          "srl        %[q5_r], %[q5_r], 16     \n\t"
+          "srl        %[q6_r], %[q6_r], 16     \n\t"
+
+          : [q0_r] "+r" (q0_r), [q1_r] "+r" (q1_r), [q2_r] "+r" (q2_r),
+            [q3_r] "+r" (q3_r), [q4_r] "+r" (q4_r), [q5_r] "+r" (q5_r),
+            [p6_r] "+r" (p6_r), [p5_r] "+r" (p5_r), [p4_r] "+r" (p4_r),
+            [p3_r] "+r" (p3_r), [p2_r] "+r" (p2_r), [p1_r] "+r" (p1_r),
+            [q6_r] "+r" (q6_r), [p0_r] "+r" (p0_r)
+          :
+      );
+
+      __asm__ __volatile__ (
+          "srl        %[p2_r_f1], %[p2_r_f1], 16     \n\t"
+          "srl        %[p1_r_f1], %[p1_r_f1], 16     \n\t"
+          "srl        %[p0_r_f1], %[p0_r_f1], 16     \n\t"
+          "srl        %[q0_r_f1], %[q0_r_f1], 16     \n\t"
+          "srl        %[q1_r_f1], %[q1_r_f1], 16     \n\t"
+          "srl        %[q2_r_f1], %[q2_r_f1], 16     \n\t"
+          "srl        %[p1_f0],   %[p1_f0],   8      \n\t"
+          "srl        %[p0_f0],   %[p0_f0],   8      \n\t"
+          "srl        %[q0_f0],   %[q0_f0],   8      \n\t"
+          "srl        %[q1_f0],   %[q1_f0],   8      \n\t"
+
+          : [p2_r_f1] "+r" (p2_r_f1), [p1_r_f1] "+r" (p1_r_f1),
+            [p0_r_f1] "+r" (p0_r_f1), [q0_r_f1] "+r" (q0_r_f1),
+            [q1_r_f1] "+r" (q1_r_f1), [q2_r_f1] "+r" (q2_r_f1),
+            [p1_f0] "+r" (p1_f0), [p0_f0] "+r" (p0_f0),
+            [q0_f0] "+r" (q0_f0), [q1_f0] "+r" (q1_f0)
+          :
+      );
+
+      if (mask & flat & flat2 & 0x0000FF00) {
+        __asm__ __volatile__ (
+            "sb         %[p6_r],  +1(%[sp6])    \n\t"
+            "sb         %[p5_r],  +1(%[sp5])    \n\t"
+            "sb         %[p4_r],  +1(%[sp4])    \n\t"
+            "sb         %[p3_r],  +1(%[sp3])    \n\t"
+            "sb         %[p2_r],  +1(%[sp2])    \n\t"
+            "sb         %[p1_r],  +1(%[sp1])    \n\t"
+            "sb         %[p0_r],  +1(%[sp0])    \n\t"
+
+            :
+            : [p6_r] "r" (p6_r), [p5_r] "r" (p5_r), [p4_r] "r" (p4_r),
+              [p3_r] "r" (p3_r), [p2_r] "r" (p2_r), [p1_r] "r" (p1_r),
+              [p0_r] "r" (p0_r), [sp6] "r" (sp6), [sp5] "r" (sp5),
+              [sp4] "r" (sp4), [sp3] "r" (sp3),
+              [sp2] "r" (sp2), [sp1] "r" (sp1), [sp0] "r" (sp0)
+        );
+
+        __asm__ __volatile__ (
+            "sb         %[q0_r],  +1(%[sq0])    \n\t"
+            "sb         %[q1_r],  +1(%[sq1])    \n\t"
+            "sb         %[q2_r],  +1(%[sq2])    \n\t"
+            "sb         %[q3_r],  +1(%[sq3])    \n\t"
+            "sb         %[q4_r],  +1(%[sq4])    \n\t"
+            "sb         %[q5_r],  +1(%[sq5])    \n\t"
+            "sb         %[q6_r],  +1(%[sq6])    \n\t"
+
+            :
+            : [q0_r] "r" (q0_r), [q1_r] "r" (q1_r), [q2_r] "r" (q2_r),
+              [q3_r] "r" (q3_r), [q4_r] "r" (q4_r), [q5_r] "r" (q5_r),
+              [q6_r] "r" (q6_r), [sq0] "r" (sq0), [sq1] "r" (sq1),
+              [sq2] "r" (sq2), [sq3] "r" (sq3),
+              [sq4] "r" (sq4), [sq5] "r" (sq5), [sq6] "r" (sq6)
+        );
+      } else if (mask & flat & 0x0000FF00) {
+        __asm__ __volatile__ (
+            "sb         %[p2_r_f1],  +1(%[sp2])    \n\t"
+            "sb         %[p1_r_f1],  +1(%[sp1])    \n\t"
+            "sb         %[p0_r_f1],  +1(%[sp0])    \n\t"
+            "sb         %[q0_r_f1],  +1(%[sq0])    \n\t"
+            "sb         %[q1_r_f1],  +1(%[sq1])    \n\t"
+            "sb         %[q2_r_f1],  +1(%[sq2])    \n\t"
+
+            :
+            : [p2_r_f1] "r" (p2_r_f1), [p1_r_f1] "r" (p1_r_f1),
+              [p0_r_f1] "r" (p0_r_f1), [q0_r_f1] "r" (q0_r_f1),
+              [q1_r_f1] "r" (q1_r_f1), [q2_r_f1] "r" (q2_r_f1),
+              [sp2] "r" (sp2), [sp1] "r" (sp1), [sp0] "r" (sp0),
+              [sq0] "r" (sq0), [sq1] "r" (sq1), [sq2] "r" (sq2)
+        );
+      } else if (mask & 0x0000FF00) {
+        __asm__ __volatile__ (
+            "sb         %[p1_f0],  +1(%[sp1])    \n\t"
+            "sb         %[p0_f0],  +1(%[sp0])    \n\t"
+            "sb         %[q0_f0],  +1(%[sq0])    \n\t"
+            "sb         %[q1_f0],  +1(%[sq1])    \n\t"
+
+            :
+            : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0), [q0_f0] "r" (q0_f0),
+              [q1_f0] "r" (q1_f0), [sp1] "r" (sp1), [sp0] "r" (sp0),
+              [sq0] "r" (sq0), [sq1] "r" (sq1)
+        );
+      }
+
+      __asm__ __volatile__ (
+          "srl        %[p1_f0], %[p1_f0], 8     \n\t"
+          "srl        %[p0_f0], %[p0_f0], 8     \n\t"
+          "srl        %[q0_f0], %[q0_f0], 8     \n\t"
+          "srl        %[q1_f0], %[q1_f0], 8     \n\t"
+
+          : [p1_f0] "+r" (p1_f0), [p0_f0] "+r" (p0_f0),
+            [q0_f0] "+r" (q0_f0), [q1_f0] "+r" (q1_f0)
+          :
+      );
+
+      if (mask & flat & flat2 & 0x00FF0000) {
+        __asm__ __volatile__ (
+            "sb         %[p6_l],  +2(%[sp6])    \n\t"
+            "sb         %[p5_l],  +2(%[sp5])    \n\t"
+            "sb         %[p4_l],  +2(%[sp4])    \n\t"
+            "sb         %[p3_l],  +2(%[sp3])    \n\t"
+            "sb         %[p2_l],  +2(%[sp2])    \n\t"
+            "sb         %[p1_l],  +2(%[sp1])    \n\t"
+            "sb         %[p0_l],  +2(%[sp0])    \n\t"
+
+            :
+            : [p6_l] "r" (p6_l), [p5_l] "r" (p5_l), [p4_l] "r" (p4_l),
+              [p3_l] "r" (p3_l), [p2_l] "r" (p2_l), [p1_l] "r" (p1_l),
+              [p0_l] "r" (p0_l), [sp6] "r" (sp6), [sp5] "r" (sp5),
+              [sp4] "r" (sp4), [sp3] "r" (sp3),
+              [sp2] "r" (sp2), [sp1] "r" (sp1), [sp0] "r" (sp0)
+        );
+
+        __asm__ __volatile__ (
+            "sb         %[q0_l],  +2(%[sq0])    \n\t"
+            "sb         %[q1_l],  +2(%[sq1])    \n\t"
+            "sb         %[q2_l],  +2(%[sq2])    \n\t"
+            "sb         %[q3_l],  +2(%[sq3])    \n\t"
+            "sb         %[q4_l],  +2(%[sq4])    \n\t"
+            "sb         %[q5_l],  +2(%[sq5])    \n\t"
+            "sb         %[q6_l],  +2(%[sq6])    \n\t"
+
+            :
+            : [q0_l] "r" (q0_l), [q1_l] "r" (q1_l), [q2_l] "r" (q2_l),
+              [q3_l] "r" (q3_l), [q4_l] "r" (q4_l), [q5_l] "r" (q5_l),
+              [q6_l] "r" (q6_l), [sq0] "r" (sq0), [sq1] "r" (sq1),
+              [sq2] "r" (sq2), [sq3] "r" (sq3),
+              [sq4] "r" (sq4), [sq5] "r" (sq5), [sq6] "r" (sq6)
+        );
+      } else if (mask & flat & 0x00FF0000) {
+        __asm__ __volatile__ (
+            "sb         %[p2_l_f1],  +2(%[sp2])    \n\t"
+            "sb         %[p1_l_f1],  +2(%[sp1])    \n\t"
+            "sb         %[p0_l_f1],  +2(%[sp0])    \n\t"
+            "sb         %[q0_l_f1],  +2(%[sq0])    \n\t"
+            "sb         %[q1_l_f1],  +2(%[sq1])    \n\t"
+            "sb         %[q2_l_f1],  +2(%[sq2])    \n\t"
+
+            :
+            : [p2_l_f1] "r" (p2_l_f1), [p1_l_f1] "r" (p1_l_f1),
+              [p0_l_f1] "r" (p0_l_f1), [q0_l_f1] "r" (q0_l_f1),
+              [q1_l_f1] "r" (q1_l_f1), [q2_l_f1] "r" (q2_l_f1),
+              [sp2] "r" (sp2), [sp1] "r" (sp1), [sp0] "r" (sp0),
+              [sq0] "r" (sq0), [sq1] "r" (sq1), [sq2] "r" (sq2)
+        );
+      } else if (mask & 0x00FF0000) {
+        __asm__ __volatile__ (
+            "sb         %[p1_f0],  +2(%[sp1])    \n\t"
+            "sb         %[p0_f0],  +2(%[sp0])    \n\t"
+            "sb         %[q0_f0],  +2(%[sq0])    \n\t"
+            "sb         %[q1_f0],  +2(%[sq1])    \n\t"
+
+            :
+            : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0), [q0_f0] "r" (q0_f0),
+              [q1_f0] "r" (q1_f0), [sp1] "r" (sp1), [sp0] "r" (sp0),
+              [sq0] "r" (sq0), [sq1] "r" (sq1)
+        );
+      }
+
+      __asm__ __volatile__ (
+          "srl      %[p6_l],    %[p6_l],    16   \n\t"
+          "srl      %[p5_l],    %[p5_l],    16   \n\t"
+          "srl      %[p4_l],    %[p4_l],    16   \n\t"
+          "srl      %[p3_l],    %[p3_l],    16   \n\t"
+          "srl      %[p2_l],    %[p2_l],    16   \n\t"
+          "srl      %[p1_l],    %[p1_l],    16   \n\t"
+          "srl      %[p0_l],    %[p0_l],    16   \n\t"
+          "srl      %[q0_l],    %[q0_l],    16   \n\t"
+          "srl      %[q1_l],    %[q1_l],    16   \n\t"
+          "srl      %[q2_l],    %[q2_l],    16   \n\t"
+          "srl      %[q3_l],    %[q3_l],    16   \n\t"
+          "srl      %[q4_l],    %[q4_l],    16   \n\t"
+          "srl      %[q5_l],    %[q5_l],    16   \n\t"
+          "srl      %[q6_l],    %[q6_l],    16   \n\t"
+
+          : [q0_l] "+r" (q0_l), [q1_l] "+r" (q1_l), [q2_l] "+r" (q2_l),
+            [q3_l] "+r" (q3_l), [q4_l] "+r" (q4_l), [q5_l] "+r" (q5_l),
+            [q6_l] "+r" (q6_l), [p6_l] "+r" (p6_l), [p5_l] "+r" (p5_l),
+            [p4_l] "+r" (p4_l), [p3_l] "+r" (p3_l), [p2_l] "+r" (p2_l),
+            [p1_l] "+r" (p1_l), [p0_l] "+r" (p0_l)
+          :
+      );
+
+      __asm__ __volatile__ (
+          "srl      %[p2_l_f1],   %[p2_l_f1],   16   \n\t"
+          "srl      %[p1_l_f1],   %[p1_l_f1],   16   \n\t"
+          "srl      %[p0_l_f1],   %[p0_l_f1],   16   \n\t"
+          "srl      %[q0_l_f1],   %[q0_l_f1],   16   \n\t"
+          "srl      %[q1_l_f1],   %[q1_l_f1],   16   \n\t"
+          "srl      %[q2_l_f1],   %[q2_l_f1],   16   \n\t"
+          "srl      %[p1_f0],     %[p1_f0],     8    \n\t"
+          "srl      %[p0_f0],     %[p0_f0],     8    \n\t"
+          "srl      %[q0_f0],     %[q0_f0],     8    \n\t"
+          "srl      %[q1_f0],     %[q1_f0],     8    \n\t"
+
+          : [p2_l_f1] "+r" (p2_l_f1), [p1_l_f1] "+r" (p1_l_f1),
+            [p0_l_f1] "+r" (p0_l_f1), [q0_l_f1] "+r" (q0_l_f1),
+            [q1_l_f1] "+r" (q1_l_f1), [q2_l_f1] "+r" (q2_l_f1),
+            [p1_f0] "+r" (p1_f0), [p0_f0] "+r" (p0_f0),
+            [q0_f0] "+r" (q0_f0), [q1_f0] "+r" (q1_f0)
+          :
+      );
+
+      if (mask & flat & flat2 & 0xFF000000) {
+        __asm__ __volatile__ (
+            "sb     %[p6_l],    +3(%[sp6])    \n\t"
+            "sb     %[p5_l],    +3(%[sp5])    \n\t"
+            "sb     %[p4_l],    +3(%[sp4])    \n\t"
+            "sb     %[p3_l],    +3(%[sp3])    \n\t"
+            "sb     %[p2_l],    +3(%[sp2])    \n\t"
+            "sb     %[p1_l],    +3(%[sp1])    \n\t"
+            "sb     %[p0_l],    +3(%[sp0])    \n\t"
+
+            :
+            : [p6_l] "r" (p6_l), [p5_l] "r" (p5_l), [p4_l] "r" (p4_l),
+              [p3_l] "r" (p3_l), [p2_l] "r" (p2_l), [p1_l] "r" (p1_l),
+              [p0_l] "r" (p0_l), [sp6] "r" (sp6), [sp5] "r" (sp5),
+              [sp4] "r" (sp4), [sp3] "r" (sp3), [sp2] "r" (sp2),
+              [sp1] "r" (sp1), [sp0] "r" (sp0)
+        );
+
+        __asm__ __volatile__ (
+            "sb     %[q0_l],    +3(%[sq0])    \n\t"
+            "sb     %[q1_l],    +3(%[sq1])    \n\t"
+            "sb     %[q2_l],    +3(%[sq2])    \n\t"
+            "sb     %[q3_l],    +3(%[sq3])    \n\t"
+            "sb     %[q4_l],    +3(%[sq4])    \n\t"
+            "sb     %[q5_l],    +3(%[sq5])    \n\t"
+            "sb     %[q6_l],    +3(%[sq6])    \n\t"
+
+            :
+            : [q0_l] "r" (q0_l), [q1_l] "r" (q1_l),
+              [q2_l] "r" (q2_l), [q3_l] "r" (q3_l),
+              [q4_l] "r" (q4_l), [q5_l] "r" (q5_l),
+              [sq0] "r" (sq0), [sq1] "r" (sq1), [sq2] "r" (sq2),
+              [sq3] "r" (sq3), [sq4] "r" (sq4), [sq5] "r" (sq5),
+              [q6_l] "r" (q6_l), [sq6] "r" (sq6)
+        );
+      } else if (mask & flat & 0xFF000000) {
+        __asm__ __volatile__ (
+            "sb     %[p2_l_f1],     +3(%[sp2])    \n\t"
+            "sb     %[p1_l_f1],     +3(%[sp1])    \n\t"
+            "sb     %[p0_l_f1],     +3(%[sp0])    \n\t"
+            "sb     %[q0_l_f1],     +3(%[sq0])    \n\t"
+            "sb     %[q1_l_f1],     +3(%[sq1])    \n\t"
+            "sb     %[q2_l_f1],     +3(%[sq2])    \n\t"
+
+            :
+            : [p2_l_f1] "r" (p2_l_f1), [p1_l_f1] "r" (p1_l_f1),
+              [p0_l_f1] "r" (p0_l_f1), [q0_l_f1] "r" (q0_l_f1),
+              [q1_l_f1] "r" (q1_l_f1), [q2_l_f1] "r" (q2_l_f1),
+              [sp2] "r" (sp2), [sp1] "r" (sp1), [sp0] "r" (sp0),
+              [sq0] "r" (sq0), [sq1] "r" (sq1), [sq2] "r" (sq2)
+        );
+      } else if (mask & 0xFF000000) {
+        __asm__ __volatile__ (
+            "sb     %[p1_f0],   +3(%[sp1])    \n\t"
+            "sb     %[p0_f0],   +3(%[sp0])    \n\t"
+            "sb     %[q0_f0],   +3(%[sq0])    \n\t"
+            "sb     %[q1_f0],   +3(%[sq1])    \n\t"
+
+            :
+            : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0),
+              [q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0),
+              [sp1] "r" (sp1), [sp0] "r" (sp0),
+              [sq0] "r" (sq0), [sq1] "r" (sq1)
+        );
+      }
+    }
+
+    s = s + 4;
+  }
+}
+#endif  // #if HAVE_DSPR2
diff --git a/vp9/common/mips/dspr2/vp9_mblpf_vert_loopfilter_dspr2.c b/vp9/common/mips/dspr2/vp9_mblpf_vert_loopfilter_dspr2.c
new file mode 100644
index 0000000..9e9171c
--- /dev/null
+++ b/vp9/common/mips/dspr2/vp9_mblpf_vert_loopfilter_dspr2.c
@@ -0,0 +1,840 @@
+/*
+ *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stdlib.h>
+
+#include "./vp9_rtcd.h"
+#include "vp9/common/vp9_common.h"
+#include "vp9/common/vp9_loopfilter.h"
+#include "vp9/common/vp9_onyxc_int.h"
+#include "vp9/common/mips/dspr2/vp9_common_dspr2.h"
+#include "vp9/common/mips/dspr2/vp9_loopfilter_macros_dspr2.h"
+#include "vp9/common/mips/dspr2/vp9_loopfilter_masks_dspr2.h"
+#include "vp9/common/mips/dspr2/vp9_loopfilter_filters_dspr2.h"
+
+#if HAVE_DSPR2
+void vp9_mb_lpf_vertical_edge_w_dspr2(uint8_t *s,
+                                      int pitch,
+                                      const uint8_t *blimit,
+                                      const uint8_t *limit,
+                                      const uint8_t *thresh) {
+  uint8_t   i;
+  uint32_t  mask, hev, flat, flat2;
+  uint8_t   *s1, *s2, *s3, *s4;
+  uint32_t  prim1, prim2, sec3, sec4, prim3, prim4;
+  uint32_t  thresh_vec, flimit_vec, limit_vec;
+  uint32_t  uflimit, ulimit, uthresh;
+  uint32_t  p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
+  uint32_t  p1_f0, p0_f0, q0_f0, q1_f0;
+  uint32_t  p7_l, p6_l, p5_l, p4_l, p3_l, p2_l, p1_l, p0_l;
+  uint32_t  q0_l, q1_l, q2_l, q3_l, q4_l, q5_l, q6_l, q7_l;
+  uint32_t  p7_r, p6_r, p5_r, p4_r, p3_r, p2_r, p1_r, p0_r;
+  uint32_t  q0_r, q1_r, q2_r, q3_r, q4_r, q5_r, q6_r, q7_r;
+  uint32_t  p2_l_f1, p1_l_f1, p0_l_f1, p2_r_f1, p1_r_f1, p0_r_f1;
+  uint32_t  q0_l_f1, q1_l_f1, q2_l_f1, q0_r_f1, q1_r_f1, q2_r_f1;
+
+  uflimit = *blimit;
+  ulimit = *limit;
+  uthresh = *thresh;
+
+  /* create quad-byte */
+  __asm__ __volatile__ (
+      "replv.qb     %[thresh_vec],     %[uthresh]    \n\t"
+      "replv.qb     %[flimit_vec],     %[uflimit]    \n\t"
+      "replv.qb     %[limit_vec],      %[ulimit]     \n\t"
+
+      : [thresh_vec] "=&r" (thresh_vec), [flimit_vec] "=&r" (flimit_vec),
+        [limit_vec] "=r" (limit_vec)
+      : [uthresh] "r" (uthresh), [uflimit] "r" (uflimit), [ulimit] "r" (ulimit)
+  );
+
+  vp9_prefetch_store(s + pitch);
+
+  for (i = 0; i < 2; i++) {
+    s1 = s;
+    s2 = s + pitch;
+    s3 = s2 + pitch;
+    s4 = s3 + pitch;
+    s  = s4 + pitch;
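+    /* Each pass filters four rows of the vertical edge; s1..s4 point at the
+       edge position in consecutive rows. */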
+
+    __asm__ __volatile__ (
+        "lw     %[p0],  -4(%[s1])    \n\t"
+        "lw     %[p1],  -4(%[s2])    \n\t"
+        "lw     %[p2],  -4(%[s3])    \n\t"
+        "lw     %[p3],  -4(%[s4])    \n\t"
+        "lw     %[p4],  -8(%[s1])    \n\t"
+        "lw     %[p5],  -8(%[s2])    \n\t"
+        "lw     %[p6],  -8(%[s3])    \n\t"
+        "lw     %[p7],  -8(%[s4])    \n\t"
+
+        : [p3] "=&r" (p3), [p2] "=&r" (p2), [p1] "=&r" (p1),
+          [p0] "=&r" (p0), [p7] "=&r" (p7), [p6] "=&r" (p6),
+          [p5] "=&r" (p5), [p4] "=&r" (p4)
+        : [s1] "r" (s1), [s2] "r" (s2), [s3] "r" (s3), [s4] "r" (s4)
+    );
+
+    __asm__ __volatile__ (
+        "lw     %[q3],  (%[s1])     \n\t"
+        "lw     %[q2],  (%[s2])     \n\t"
+        "lw     %[q1],  (%[s3])     \n\t"
+        "lw     %[q0],  (%[s4])     \n\t"
+        "lw     %[q7],  +4(%[s1])   \n\t"
+        "lw     %[q6],  +4(%[s2])   \n\t"
+        "lw     %[q5],  +4(%[s3])   \n\t"
+        "lw     %[q4],  +4(%[s4])   \n\t"
+
+        : [q3] "=&r" (q3), [q2] "=&r" (q2), [q1] "=&r" (q1),
+          [q0] "=&r" (q0), [q7] "=&r" (q7), [q6] "=&r" (q6),
+          [q5] "=&r" (q5), [q4] "=&r" (q4)
+        : [s1] "r" (s1), [s2] "r" (s2), [s3] "r" (s3), [s4] "r" (s4)
+    );
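+    /* The loads above fetch whole rows, so each register holds four
+       neighbouring pixels of one row.  The transposes below rearrange them so
+       that, as in the horizontal filter, each register holds the pixel at one
+       distance from the edge for all four rows. */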
+
+    /* transpose p3, p2, p1, p0
+       original (when loaded from memory)
+       register       -4    -3   -2     -1
+         p0         p0_0  p0_1  p0_2  p0_3
+         p1         p1_0  p1_1  p1_2  p1_3
+         p2         p2_0  p2_1  p2_2  p2_3
+         p3         p3_0  p3_1  p3_2  p3_3
+
+       after transpose
+       register
+         p0         p3_3  p2_3  p1_3  p0_3
+         p1         p3_2  p2_2  p1_2  p0_2
+         p2         p3_1  p2_1  p1_1  p0_1
+         p3         p3_0  p2_0  p1_0  p0_0
+    */
+    __asm__ __volatile__ (
+        "precrq.qb.ph   %[prim1],   %[p0],      %[p1]       \n\t"
+        "precr.qb.ph    %[prim2],   %[p0],      %[p1]       \n\t"
+        "precrq.qb.ph   %[prim3],   %[p2],      %[p3]       \n\t"
+        "precr.qb.ph    %[prim4],   %[p2],      %[p3]       \n\t"
+
+        "precrq.qb.ph   %[p1],      %[prim1],   %[prim2]    \n\t"
+        "precr.qb.ph    %[p3],      %[prim1],   %[prim2]    \n\t"
+        "precrq.qb.ph   %[sec3],    %[prim3],   %[prim4]    \n\t"
+        "precr.qb.ph    %[sec4],    %[prim3],   %[prim4]    \n\t"
+
+        "precrq.ph.w    %[p0],      %[p1],      %[sec3]     \n\t"
+        "precrq.ph.w    %[p2],      %[p3],      %[sec4]     \n\t"
+        "append         %[p1],      %[sec3],    16          \n\t"
+        "append         %[p3],      %[sec4],    16          \n\t"
+
+        : [prim1] "=&r" (prim1), [prim2] "=&r" (prim2),
+          [prim3] "=&r" (prim3), [prim4] "=&r" (prim4),
+          [p0] "+r" (p0), [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3),
+          [sec3] "=&r" (sec3), [sec4] "=&r" (sec4)
+        :
+    );
+
+    /* transpose q0, q1, q2, q3
+       original (when loaded from memory)
+       register       +1    +2    +3    +4
+         q3         q3_0  q3_1  q3_2  q3_3
+         q2         q2_0  q2_1  q2_2  q2_3
+         q1         q1_0  q1_1  q1_2  q1_3
+         q0         q0_0  q0_1  q0_2  q0_3
+
+       after transpose
+       register
+         q3         q0_3  q1_3  q2_3  q3_3
+         q2         q0_2  q1_2  q2_2  q3_2
+         q1         q0_1  q1_1  q2_1  q3_1
+         q0         q0_0  q1_0  q2_0  q3_0
+    */
+    __asm__ __volatile__ (
+        "precrq.qb.ph   %[prim1],   %[q3],      %[q2]       \n\t"
+        "precr.qb.ph    %[prim2],   %[q3],      %[q2]       \n\t"
+        "precrq.qb.ph   %[prim3],   %[q1],      %[q0]       \n\t"
+        "precr.qb.ph    %[prim4],   %[q1],      %[q0]       \n\t"
+
+        "precrq.qb.ph   %[q2],      %[prim1],   %[prim2]    \n\t"
+        "precr.qb.ph    %[q0],      %[prim1],   %[prim2]    \n\t"
+        "precrq.qb.ph   %[sec3],    %[prim3],   %[prim4]    \n\t"
+        "precr.qb.ph    %[sec4],    %[prim3],   %[prim4]    \n\t"
+
+        "precrq.ph.w    %[q3],      %[q2],      %[sec3]     \n\t"
+        "precrq.ph.w    %[q1],      %[q0],      %[sec4]     \n\t"
+        "append         %[q2],      %[sec3],    16          \n\t"
+        "append         %[q0],      %[sec4],    16          \n\t"
+
+        : [prim1] "=&r" (prim1), [prim2] "=&r" (prim2),
+          [prim3] "=&r" (prim3), [prim4] "=&r" (prim4),
+          [q3] "+r" (q3), [q2] "+r" (q2), [q1] "+r" (q1), [q0] "+r" (q0),
+          [sec3] "=&r" (sec3), [sec4] "=&r" (sec4)
+        :
+    );
+
+    /* transpose p7, p6, p5, p4
+       original (when loaded from memory)
+       register      -8    -7   -6     -5
+         p4         p4_0  p4_1  p4_2  p4_3
+         p5         p5_0  p5_1  p5_2  p5_3
+         p6         p6_0  p6_1  p6_2  p6_3
+         p7         p7_0  p7_1  p7_2  p7_3
+
+       after transpose
+       register
+         p4         p7_3  p6_3  p5_3  p4_3
+         p5         p7_2  p6_2  p5_2  p4_2
+         p6         p7_1  p6_1  p5_1  p4_1
+         p7         p7_0  p6_0  p5_0  p4_0
+    */
+    __asm__ __volatile__ (
+        "precrq.qb.ph   %[prim1],   %[p4],      %[p5]       \n\t"
+        "precr.qb.ph    %[prim2],   %[p4],      %[p5]       \n\t"
+        "precrq.qb.ph   %[prim3],   %[p6],      %[p7]       \n\t"
+        "precr.qb.ph    %[prim4],   %[p6],      %[p7]       \n\t"
+
+        "precrq.qb.ph   %[p5],      %[prim1],   %[prim2]    \n\t"
+        "precr.qb.ph    %[p7],      %[prim1],   %[prim2]    \n\t"
+        "precrq.qb.ph   %[sec3],    %[prim3],   %[prim4]    \n\t"
+        "precr.qb.ph    %[sec4],    %[prim3],   %[prim4]    \n\t"
+
+        "precrq.ph.w    %[p4],      %[p5],      %[sec3]     \n\t"
+        "precrq.ph.w    %[p6],      %[p7],      %[sec4]     \n\t"
+        "append         %[p5],      %[sec3],    16          \n\t"
+        "append         %[p7],      %[sec4],    16          \n\t"
+
+        : [prim1] "=&r" (prim1), [prim2] "=&r" (prim2),
+          [prim3] "=&r" (prim3), [prim4] "=&r" (prim4),
+          [p4] "+r" (p4), [p5] "+r" (p5), [p6] "+r" (p6), [p7] "+r" (p7),
+          [sec3] "=&r" (sec3), [sec4] "=&r" (sec4)
+        :
+    );
+
+    /* transpose q4, q5, q6, q7
+       original (when loaded from memory)
+       register      +5    +6    +7    +8
+         q7         q7_0  q7_1  q7_2  q7_3
+         q6         q6_0  q6_1  q6_2  q6_3
+         q5         q5_0  q5_1  q5_2  q5_3
+         q4         q4_0  q4_1  q4_2  q4_3
+
+       after transpose
+       register
+         q7         q4_3  q5_3  q6_3  q7_3
+         q6         q4_2  q5_2  q6_2  q7_2
+         q5         q4_1  q5_1  q6_1  q7_1
+         q4         q4_0  q5_0  q6_0  q7_0
+    */
+    __asm__ __volatile__ (
+        "precrq.qb.ph   %[prim1],   %[q7],      %[q6]       \n\t"
+        "precr.qb.ph    %[prim2],   %[q7],      %[q6]       \n\t"
+        "precrq.qb.ph   %[prim3],   %[q5],      %[q4]       \n\t"
+        "precr.qb.ph    %[prim4],   %[q5],      %[q4]       \n\t"
+
+        "precrq.qb.ph   %[q6],      %[prim1],   %[prim2]    \n\t"
+        "precr.qb.ph    %[q4],      %[prim1],   %[prim2]    \n\t"
+        "precrq.qb.ph   %[sec3],    %[prim3],   %[prim4]    \n\t"
+        "precr.qb.ph    %[sec4],    %[prim3],   %[prim4]    \n\t"
+
+        "precrq.ph.w    %[q7],      %[q6],      %[sec3]     \n\t"
+        "precrq.ph.w    %[q5],      %[q4],      %[sec4]     \n\t"
+        "append         %[q6],      %[sec3],    16          \n\t"
+        "append         %[q4],      %[sec4],    16          \n\t"
+
+        : [prim1] "=&r" (prim1), [prim2] "=&r" (prim2),
+          [prim3] "=&r" (prim3), [prim4] "=&r" (prim4),
+          [q7] "+r" (q7), [q6] "+r" (q6), [q5] "+r" (q5), [q4] "+r" (q4),
+          [sec3] "=&r" (sec3), [sec4] "=&r" (sec4)
+        :
+    );
+
+    vp9_filter_hev_mask_flatmask4_dspr2(limit_vec, flimit_vec, thresh_vec,
+                                        p1, p0, p3, p2, q0, q1, q2, q3,
+                                        &hev, &mask, &flat);
+
+    vp9_flatmask5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, &flat2);
+
+    /* f0 */
+    if (((flat2 == 0) && (flat == 0) && (mask != 0)) ||
+        ((flat2 != 0) && (flat == 0) && (mask != 0))) {
+      vp9_filter1_dspr2(mask, hev, p1, p0, q0, q1,
+                        &p1_f0, &p0_f0, &q0_f0, &q1_f0);
+      STORE_F0()
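+      /* STORE_F0() above and STORE_F1()/STORE_F2() below presumably come from
+         vp9_loopfilter_macros_dspr2.h and expand to the per-row store
+         sequences written out explicitly in the horizontal filter. */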
+    } else if ((flat2 == 0xFFFFFFFF) && (flat == 0xFFFFFFFF) &&
+               (mask == 0xFFFFFFFF)) {
+      /* f2 */
+      PACK_LEFT_0TO3()
+      PACK_LEFT_4TO7()
+      vp9_wide_mbfilter_dspr2(&p7_l, &p6_l, &p5_l, &p4_l,
+                              &p3_l, &p2_l, &p1_l, &p0_l,
+                              &q0_l, &q1_l, &q2_l, &q3_l,
+                              &q4_l, &q5_l, &q6_l, &q7_l);
+
+      PACK_RIGHT_0TO3()
+      PACK_RIGHT_4TO7()
+      vp9_wide_mbfilter_dspr2(&p7_r, &p6_r, &p5_r, &p4_r,
+                              &p3_r, &p2_r, &p1_r, &p0_r,
+                              &q0_r, &q1_r, &q2_r, &q3_r,
+                              &q4_r, &q5_r, &q6_r, &q7_r);
+
+      STORE_F2()
+    } else if ((flat2 == 0) && (flat == 0xFFFFFFFF) && (mask == 0xFFFFFFFF)) {
+      /* f1 */
+      PACK_LEFT_0TO3()
+      vp9_mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l,
+                         &q0_l, &q1_l, &q2_l, &q3_l);
+
+      PACK_RIGHT_0TO3()
+      vp9_mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r,
+                         &q0_r, &q1_r, &q2_r, &q3_r);
+
+      STORE_F1()
+    } else if ((flat2 == 0) && (flat != 0) && (mask != 0)) {
+      /* f0 + f1 */
+      vp9_filter1_dspr2(mask, hev, p1, p0, q0, q1,
+                        &p1_f0, &p0_f0, &q0_f0, &q1_f0);
+
+      /* left 2 element operation */
+      PACK_LEFT_0TO3()
+      vp9_mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l,
+                         &q0_l, &q1_l, &q2_l, &q3_l);
+
+      /* right 2 element operation */
+      PACK_RIGHT_0TO3()
+      vp9_mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r,
+                         &q0_r, &q1_r, &q2_r, &q3_r);
+
+      if (mask & flat & 0x000000FF) {
+        __asm__ __volatile__ (
+            "sb     %[p2_r],    -3(%[s4])    \n\t"
+            "sb     %[p1_r],    -2(%[s4])    \n\t"
+            "sb     %[p0_r],    -1(%[s4])    \n\t"
+            "sb     %[q0_r],      (%[s4])    \n\t"
+            "sb     %[q1_r],    +1(%[s4])    \n\t"
+            "sb     %[q2_r],    +2(%[s4])    \n\t"
+
+            :
+            : [p2_r] "r" (p2_r), [p1_r] "r" (p1_r), [p0_r] "r" (p0_r),
+              [q0_r] "r" (q0_r), [q1_r] "r" (q1_r), [q2_r] "r" (q2_r),
+              [s4] "r" (s4)
+        );
+      } else if (mask & 0x000000FF) {
+        __asm__ __volatile__ (
+            "sb         %[p1_f0],  -2(%[s4])    \n\t"
+            "sb         %[p0_f0],  -1(%[s4])    \n\t"
+            "sb         %[q0_f0],    (%[s4])    \n\t"
+            "sb         %[q1_f0],  +1(%[s4])    \n\t"
+
+            :
+            : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0),
+              [q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0),
+              [s4] "r" (s4)
+        );
+      }
+
+      __asm__ __volatile__ (
+          "srl      %[p2_r],    %[p2_r],    16      \n\t"
+          "srl      %[p1_r],    %[p1_r],    16      \n\t"
+          "srl      %[p0_r],    %[p0_r],    16      \n\t"
+          "srl      %[q0_r],    %[q0_r],    16      \n\t"
+          "srl      %[q1_r],    %[q1_r],    16      \n\t"
+          "srl      %[q2_r],    %[q2_r],    16      \n\t"
+          "srl      %[p1_f0],   %[p1_f0],   8       \n\t"
+          "srl      %[p0_f0],   %[p0_f0],   8       \n\t"
+          "srl      %[q0_f0],   %[q0_f0],   8       \n\t"
+          "srl      %[q1_f0],   %[q1_f0],   8       \n\t"
+
+          : [p2_r] "+r" (p2_r), [p1_r] "+r" (p1_r), [p0_r] "+r" (p0_r),
+            [q0_r] "+r" (q0_r), [q1_r] "+r" (q1_r), [q2_r] "+r" (q2_r),
+            [p1_f0] "+r" (p1_f0), [p0_f0] "+r" (p0_f0),
+            [q0_f0] "+r" (q0_f0), [q1_f0] "+r" (q1_f0)
+          :
+      );
+
+      if (mask & flat & 0x0000FF00) {
+        __asm__ __volatile__ (
+            "sb     %[p2_r],    -3(%[s3])    \n\t"
+            "sb     %[p1_r],    -2(%[s3])    \n\t"
+            "sb     %[p0_r],    -1(%[s3])    \n\t"
+            "sb     %[q0_r],      (%[s3])    \n\t"
+            "sb     %[q1_r],    +1(%[s3])    \n\t"
+            "sb     %[q2_r],    +2(%[s3])    \n\t"
+
+            :
+            : [p2_r] "r" (p2_r), [p1_r] "r" (p1_r), [p0_r] "r" (p0_r),
+              [q0_r] "r" (q0_r), [q1_r] "r" (q1_r), [q2_r] "r" (q2_r),
+              [s3] "r" (s3)
+        );
+      } else if (mask & 0x0000FF00) {
+        __asm__ __volatile__ (
+            "sb     %[p1_f0],   -2(%[s3])    \n\t"
+            "sb     %[p0_f0],   -1(%[s3])    \n\t"
+            "sb     %[q0_f0],     (%[s3])    \n\t"
+            "sb     %[q1_f0],   +1(%[s3])    \n\t"
+
+            :
+            : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0),
+              [q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0),
+              [s3] "r" (s3)
+        );
+      }
+
+      __asm__ __volatile__ (
+          "srl      %[p1_f0],   %[p1_f0],   8     \n\t"
+          "srl      %[p0_f0],   %[p0_f0],   8     \n\t"
+          "srl      %[q0_f0],   %[q0_f0],   8     \n\t"
+          "srl      %[q1_f0],   %[q1_f0],   8     \n\t"
+
+          : [p1_f0] "+r" (p1_f0), [p0_f0] "+r" (p0_f0),
+            [q0_f0] "+r" (q0_f0), [q1_f0] "+r" (q1_f0)
+          :
+      );
+
+      if (mask & flat & 0x00FF0000) {
+        __asm__ __volatile__ (
+          "sb       %[p2_l],    -3(%[s2])    \n\t"
+          "sb       %[p1_l],    -2(%[s2])    \n\t"
+          "sb       %[p0_l],    -1(%[s2])    \n\t"
+          "sb       %[q0_l],      (%[s2])    \n\t"
+          "sb       %[q1_l],    +1(%[s2])    \n\t"
+          "sb       %[q2_l],    +2(%[s2])    \n\t"
+
+          :
+          : [p2_l] "r" (p2_l), [p1_l] "r" (p1_l), [p0_l] "r" (p0_l),
+            [q0_l] "r" (q0_l), [q1_l] "r" (q1_l), [q2_l] "r" (q2_l),
+            [s2] "r" (s2)
+        );
+      } else if (mask & 0x00FF0000) {
+        __asm__ __volatile__ (
+            "sb     %[p1_f0],   -2(%[s2])    \n\t"
+            "sb     %[p0_f0],   -1(%[s2])    \n\t"
+            "sb     %[q0_f0],     (%[s2])    \n\t"
+            "sb     %[q1_f0],   +1(%[s2])    \n\t"
+
+            :
+            : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0),
+              [q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0),
+              [s2] "r" (s2)
+        );
+      }
+
+      __asm__ __volatile__ (
+          "srl      %[p2_l],    %[p2_l],    16      \n\t"
+          "srl      %[p1_l],    %[p1_l],    16      \n\t"
+          "srl      %[p0_l],    %[p0_l],    16      \n\t"
+          "srl      %[q0_l],    %[q0_l],    16      \n\t"
+          "srl      %[q1_l],    %[q1_l],    16      \n\t"
+          "srl      %[q2_l],    %[q2_l],    16      \n\t"
+          "srl      %[p1_f0],   %[p1_f0],   8       \n\t"
+          "srl      %[p0_f0],   %[p0_f0],   8       \n\t"
+          "srl      %[q0_f0],   %[q0_f0],   8       \n\t"
+          "srl      %[q1_f0],   %[q1_f0],   8       \n\t"
+
+          : [p2_l] "+r" (p2_l), [p1_l] "+r" (p1_l), [p0_l] "+r" (p0_l),
+            [q0_l] "+r" (q0_l), [q1_l] "+r" (q1_l), [q2_l] "+r" (q2_l),
+            [p1_f0] "+r" (p1_f0), [p0_f0] "+r" (p0_f0),
+            [q0_f0] "+r" (q0_f0), [q1_f0] "+r" (q1_f0)
+          :
+      );
+
+      if (mask & flat & 0xFF000000) {
+        __asm__ __volatile__ (
+            "sb     %[p2_l],    -3(%[s1])    \n\t"
+            "sb     %[p1_l],    -2(%[s1])    \n\t"
+            "sb     %[p0_l],    -1(%[s1])    \n\t"
+            "sb     %[q0_l],      (%[s1])    \n\t"
+            "sb     %[q1_l],    +1(%[s1])    \n\t"
+            "sb     %[q2_l],    +2(%[s1])    \n\t"
+
+            :
+            : [p2_l] "r" (p2_l), [p1_l] "r" (p1_l), [p0_l] "r" (p0_l),
+              [q0_l] "r" (q0_l), [q1_l] "r" (q1_l), [q2_l] "r" (q2_l),
+              [s1] "r" (s1)
+        );
+      } else if (mask & 0xFF000000) {
+        __asm__ __volatile__ (
+            "sb     %[p1_f0],   -2(%[s1])    \n\t"
+            "sb     %[p0_f0],   -1(%[s1])    \n\t"
+            "sb     %[q0_f0],     (%[s1])    \n\t"
+            "sb     %[q1_f0],   +1(%[s1])    \n\t"
+
+            :
+            : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0),
+              [q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0),
+              [s1] "r" (s1)
+        );
+      }
+    } else if ((flat2 != 0) && (flat != 0) && (mask != 0)) {
+      /* f0+f1+f2 */
+      vp9_filter1_dspr2(mask, hev, p1, p0, q0, q1,
+                        &p1_f0, &p0_f0, &q0_f0, &q1_f0);
+
+      PACK_LEFT_0TO3()
+      vp9_mbfilter1_dspr2(p3_l, p2_l, p1_l, p0_l,
+                          q0_l, q1_l, q2_l, q3_l,
+                          &p2_l_f1, &p1_l_f1, &p0_l_f1,
+                          &q0_l_f1, &q1_l_f1, &q2_l_f1);
+
+      PACK_RIGHT_0TO3()
+      vp9_mbfilter1_dspr2(p3_r, p2_r, p1_r, p0_r,
+                          q0_r, q1_r, q2_r, q3_r,
+                          &p2_r_f1, &p1_r_f1, &p0_r_f1,
+                          &q0_r_f1, &q1_r_f1, &q2_r_f1);
+
+      PACK_LEFT_4TO7()
+      vp9_wide_mbfilter_dspr2(&p7_l, &p6_l, &p5_l, &p4_l,
+                              &p3_l, &p2_l, &p1_l, &p0_l,
+                              &q0_l, &q1_l, &q2_l, &q3_l,
+                              &q4_l, &q5_l, &q6_l, &q7_l);
+
+      PACK_RIGHT_4TO7()
+      vp9_wide_mbfilter_dspr2(&p7_r, &p6_r, &p5_r, &p4_r,
+                              &p3_r, &p2_r, &p1_r, &p0_r,
+                              &q0_r, &q1_r, &q2_r, &q3_r,
+                              &q4_r, &q5_r, &q6_r, &q7_r);
+
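+      /* Per-row store: byte 0 of mask/flat/flat2 controls row s4, byte 1 row
+         s3, byte 2 row s2 and byte 3 row s1; each row takes the wide-filter,
+         f1 or f0 output depending on which masks are set for its lane. */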
+      if (mask & flat & flat2 & 0x000000FF) {
+        __asm__ __volatile__ (
+            "sb     %[p6_r],    -7(%[s4])    \n\t"
+            "sb     %[p5_r],    -6(%[s4])    \n\t"
+            "sb     %[p4_r],    -5(%[s4])    \n\t"
+            "sb     %[p3_r],    -4(%[s4])    \n\t"
+            "sb     %[p2_r],    -3(%[s4])    \n\t"
+            "sb     %[p1_r],    -2(%[s4])    \n\t"
+            "sb     %[p0_r],    -1(%[s4])    \n\t"
+
+            :
+            : [p6_r] "r" (p6_r), [p5_r] "r" (p5_r),
+              [p4_r] "r" (p4_r), [p3_r] "r" (p3_r),
+              [p2_r] "r" (p2_r), [p1_r] "r" (p1_r),
+              [p0_r] "r" (p0_r), [s4] "r" (s4)
+        );
+
+        __asm__ __volatile__ (
+            "sb     %[q0_r],      (%[s4])    \n\t"
+            "sb     %[q1_r],    +1(%[s4])    \n\t"
+            "sb     %[q2_r],    +2(%[s4])    \n\t"
+            "sb     %[q3_r],    +3(%[s4])    \n\t"
+            "sb     %[q4_r],    +4(%[s4])    \n\t"
+            "sb     %[q5_r],    +5(%[s4])    \n\t"
+            "sb     %[q6_r],    +6(%[s4])    \n\t"
+
+            :
+            : [q0_r] "r" (q0_r), [q1_r] "r" (q1_r),
+              [q2_r] "r" (q2_r), [q3_r] "r" (q3_r),
+              [q4_r] "r" (q4_r), [q5_r] "r" (q5_r),
+              [q6_r] "r" (q6_r), [s4] "r" (s4)
+        );
+      } else if (mask & flat & 0x000000FF) {
+        __asm__ __volatile__ (
+            "sb     %[p2_r_f1],     -3(%[s4])    \n\t"
+            "sb     %[p1_r_f1],     -2(%[s4])    \n\t"
+            "sb     %[p0_r_f1],     -1(%[s4])    \n\t"
+            "sb     %[q0_r_f1],       (%[s4])    \n\t"
+            "sb     %[q1_r_f1],     +1(%[s4])    \n\t"
+            "sb     %[q2_r_f1],     +2(%[s4])    \n\t"
+
+            :
+            : [p2_r_f1] "r" (p2_r_f1), [p1_r_f1] "r" (p1_r_f1),
+              [p0_r_f1] "r" (p0_r_f1), [q0_r_f1] "r" (q0_r_f1),
+              [q1_r_f1] "r" (q1_r_f1), [q2_r_f1] "r" (q2_r_f1),
+              [s4] "r" (s4)
+        );
+      } else if (mask & 0x000000FF) {
+        __asm__ __volatile__ (
+            "sb     %[p1_f0],   -2(%[s4])    \n\t"
+            "sb     %[p0_f0],   -1(%[s4])    \n\t"
+            "sb     %[q0_f0],     (%[s4])    \n\t"
+            "sb     %[q1_f0],   +1(%[s4])    \n\t"
+
+            :
+            : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0),
+              [q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0),
+              [s4] "r" (s4)
+        );
+      }
+
+      __asm__ __volatile__ (
+          "srl      %[p6_r],        %[p6_r],        16     \n\t"
+          "srl      %[p5_r],        %[p5_r],        16     \n\t"
+          "srl      %[p4_r],        %[p4_r],        16     \n\t"
+          "srl      %[p3_r],        %[p3_r],        16     \n\t"
+          "srl      %[p2_r],        %[p2_r],        16     \n\t"
+          "srl      %[p1_r],        %[p1_r],        16     \n\t"
+          "srl      %[p0_r],        %[p0_r],        16     \n\t"
+          "srl      %[q0_r],        %[q0_r],        16     \n\t"
+          "srl      %[q1_r],        %[q1_r],        16     \n\t"
+          "srl      %[q2_r],        %[q2_r],        16     \n\t"
+          "srl      %[q3_r],        %[q3_r],        16     \n\t"
+          "srl      %[q4_r],        %[q4_r],        16     \n\t"
+          "srl      %[q5_r],        %[q5_r],        16     \n\t"
+          "srl      %[q6_r],        %[q6_r],        16     \n\t"
+
+          : [q0_r] "+r" (q0_r), [q1_r] "+r" (q1_r),
+            [q2_r] "+r" (q2_r), [q3_r] "+r" (q3_r),
+            [q4_r] "+r" (q4_r), [q5_r] "+r" (q5_r),
+            [q6_r] "+r" (q6_r), [p6_r] "+r" (p6_r),
+            [p5_r] "+r" (p5_r), [p4_r] "+r" (p4_r),
+            [p3_r] "+r" (p3_r), [p2_r] "+r" (p2_r),
+            [p1_r] "+r" (p1_r), [p0_r] "+r" (p0_r)
+          :
+      );
+
+      __asm__ __volatile__ (
+          "srl      %[p2_r_f1],     %[p2_r_f1],     16      \n\t"
+          "srl      %[p1_r_f1],     %[p1_r_f1],     16      \n\t"
+          "srl      %[p0_r_f1],     %[p0_r_f1],     16      \n\t"
+          "srl      %[q0_r_f1],     %[q0_r_f1],     16      \n\t"
+          "srl      %[q1_r_f1],     %[q1_r_f1],     16      \n\t"
+          "srl      %[q2_r_f1],     %[q2_r_f1],     16      \n\t"
+          "srl      %[p1_f0],       %[p1_f0],       8       \n\t"
+          "srl      %[p0_f0],       %[p0_f0],       8       \n\t"
+          "srl      %[q0_f0],       %[q0_f0],       8       \n\t"
+          "srl      %[q1_f0],       %[q1_f0],       8       \n\t"
+
+          : [p2_r_f1] "+r" (p2_r_f1), [p1_r_f1] "+r" (p1_r_f1),
+            [p0_r_f1] "+r" (p0_r_f1), [q0_r_f1] "+r" (q0_r_f1),
+            [q1_r_f1] "+r" (q1_r_f1), [q2_r_f1] "+r" (q2_r_f1),
+            [p1_f0] "+r" (p1_f0), [p0_f0] "+r" (p0_f0),
+            [q0_f0] "+r" (q0_f0), [q1_f0] "+r" (q1_f0)
+          :
+      );
+
+      if (mask & flat & flat2 & 0x0000FF00) {
+        __asm__ __volatile__ (
+            "sb     %[p6_r],    -7(%[s3])    \n\t"
+            "sb     %[p5_r],    -6(%[s3])    \n\t"
+            "sb     %[p4_r],    -5(%[s3])    \n\t"
+            "sb     %[p3_r],    -4(%[s3])    \n\t"
+            "sb     %[p2_r],    -3(%[s3])    \n\t"
+            "sb     %[p1_r],    -2(%[s3])    \n\t"
+            "sb     %[p0_r],    -1(%[s3])    \n\t"
+
+            :
+            : [p6_r] "r" (p6_r), [p5_r] "r" (p5_r), [p4_r] "r" (p4_r),
+              [p3_r] "r" (p3_r), [p2_r] "r" (p2_r), [p1_r] "r" (p1_r),
+              [p0_r] "r" (p0_r), [s3] "r" (s3)
+        );
+
+        __asm__ __volatile__ (
+            "sb     %[q0_r],      (%[s3])    \n\t"
+            "sb     %[q1_r],    +1(%[s3])    \n\t"
+            "sb     %[q2_r],    +2(%[s3])    \n\t"
+            "sb     %[q3_r],    +3(%[s3])    \n\t"
+            "sb     %[q4_r],    +4(%[s3])    \n\t"
+            "sb     %[q5_r],    +5(%[s3])    \n\t"
+            "sb     %[q6_r],    +6(%[s3])    \n\t"
+
+            :
+            : [q0_r] "r" (q0_r), [q1_r] "r" (q1_r),
+              [q2_r] "r" (q2_r), [q3_r] "r" (q3_r),
+              [q4_r] "r" (q4_r), [q5_r] "r" (q5_r),
+              [q6_r] "r" (q6_r), [s3] "r" (s3)
+        );
+      } else if (mask & flat & 0x0000FF00) {
+        __asm__ __volatile__ (
+            "sb     %[p2_r_f1],     -3(%[s3])    \n\t"
+            "sb     %[p1_r_f1],     -2(%[s3])    \n\t"
+            "sb     %[p0_r_f1],     -1(%[s3])    \n\t"
+            "sb     %[q0_r_f1],       (%[s3])    \n\t"
+            "sb     %[q1_r_f1],     +1(%[s3])    \n\t"
+            "sb     %[q2_r_f1],     +2(%[s3])    \n\t"
+
+            :
+            : [p2_r_f1] "r" (p2_r_f1), [p1_r_f1] "r" (p1_r_f1),
+              [p0_r_f1] "r" (p0_r_f1), [q0_r_f1] "r" (q0_r_f1),
+              [q1_r_f1] "r" (q1_r_f1), [q2_r_f1] "r" (q2_r_f1),
+              [s3] "r" (s3)
+        );
+      } else if (mask & 0x0000FF00) {
+        __asm__ __volatile__ (
+            "sb     %[p1_f0],   -2(%[s3])    \n\t"
+            "sb     %[p0_f0],   -1(%[s3])    \n\t"
+            "sb     %[q0_f0],     (%[s3])    \n\t"
+            "sb     %[q1_f0],   +1(%[s3])    \n\t"
+
+            :
+            : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0),
+              [q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0),
+              [s3] "r" (s3)
+        );
+      }
+
+      __asm__ __volatile__ (
+          "srl      %[p1_f0],   %[p1_f0],   8     \n\t"
+          "srl      %[p0_f0],   %[p0_f0],   8     \n\t"
+          "srl      %[q0_f0],   %[q0_f0],   8     \n\t"
+          "srl      %[q1_f0],   %[q1_f0],   8     \n\t"
+
+          : [p1_f0] "+r" (p1_f0), [p0_f0] "+r" (p0_f0),
+            [q0_f0] "+r" (q0_f0), [q1_f0] "+r" (q1_f0)
+          :
+      );
+
+      if (mask & flat & flat2 & 0x00FF0000) {
+        __asm__ __volatile__ (
+            "sb     %[p6_l],    -7(%[s2])    \n\t"
+            "sb     %[p5_l],    -6(%[s2])    \n\t"
+            "sb     %[p4_l],    -5(%[s2])    \n\t"
+            "sb     %[p3_l],    -4(%[s2])    \n\t"
+            "sb     %[p2_l],    -3(%[s2])    \n\t"
+            "sb     %[p1_l],    -2(%[s2])    \n\t"
+            "sb     %[p0_l],    -1(%[s2])    \n\t"
+
+            :
+            : [p6_l] "r" (p6_l), [p5_l] "r" (p5_l), [p4_l] "r" (p4_l),
+              [p3_l] "r" (p3_l), [p2_l] "r" (p2_l), [p1_l] "r" (p1_l),
+              [p0_l] "r" (p0_l), [s2] "r" (s2)
+        );
+
+        __asm__ __volatile__ (
+            "sb     %[q0_l],      (%[s2])    \n\t"
+            "sb     %[q1_l],    +1(%[s2])    \n\t"
+            "sb     %[q2_l],    +2(%[s2])    \n\t"
+            "sb     %[q3_l],    +3(%[s2])    \n\t"
+            "sb     %[q4_l],    +4(%[s2])    \n\t"
+            "sb     %[q5_l],    +5(%[s2])    \n\t"
+            "sb     %[q6_l],    +6(%[s2])    \n\t"
+
+            :
+            : [q0_l] "r" (q0_l), [q1_l] "r" (q1_l), [q2_l] "r" (q2_l),
+              [q3_l] "r" (q3_l), [q4_l] "r" (q4_l), [q5_l] "r" (q5_l),
+              [q6_l] "r" (q6_l), [s2] "r" (s2)
+        );
+      } else if (mask & flat & 0x00FF0000) {
+        __asm__ __volatile__ (
+            "sb     %[p2_l_f1],     -3(%[s2])    \n\t"
+            "sb     %[p1_l_f1],     -2(%[s2])    \n\t"
+            "sb     %[p0_l_f1],     -1(%[s2])    \n\t"
+            "sb     %[q0_l_f1],       (%[s2])    \n\t"
+            "sb     %[q1_l_f1],     +1(%[s2])    \n\t"
+            "sb     %[q2_l_f1],     +2(%[s2])    \n\t"
+
+            :
+            : [p2_l_f1] "r" (p2_l_f1), [p1_l_f1] "r" (p1_l_f1),
+              [p0_l_f1] "r" (p0_l_f1), [q0_l_f1] "r" (q0_l_f1),
+              [q1_l_f1] "r" (q1_l_f1), [q2_l_f1] "r" (q2_l_f1),
+              [s2] "r" (s2)
+        );
+      } else if (mask & 0x00FF0000) {
+        __asm__ __volatile__ (
+            "sb     %[p1_f0],   -2(%[s2])    \n\t"
+            "sb     %[p0_f0],   -1(%[s2])    \n\t"
+            "sb     %[q0_f0],     (%[s2])    \n\t"
+            "sb     %[q1_f0],   +1(%[s2])    \n\t"
+
+            :
+            : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0),
+              [q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0),
+              [s2] "r" (s2)
+        );
+      }
+
+      __asm__ __volatile__ (
+          "srl      %[p6_l],        %[p6_l],        16     \n\t"
+          "srl      %[p5_l],        %[p5_l],        16     \n\t"
+          "srl      %[p4_l],        %[p4_l],        16     \n\t"
+          "srl      %[p3_l],        %[p3_l],        16     \n\t"
+          "srl      %[p2_l],        %[p2_l],        16     \n\t"
+          "srl      %[p1_l],        %[p1_l],        16     \n\t"
+          "srl      %[p0_l],        %[p0_l],        16     \n\t"
+          "srl      %[q0_l],        %[q0_l],        16     \n\t"
+          "srl      %[q1_l],        %[q1_l],        16     \n\t"
+          "srl      %[q2_l],        %[q2_l],        16     \n\t"
+          "srl      %[q3_l],        %[q3_l],        16     \n\t"
+          "srl      %[q4_l],        %[q4_l],        16     \n\t"
+          "srl      %[q5_l],        %[q5_l],        16     \n\t"
+          "srl      %[q6_l],        %[q6_l],        16     \n\t"
+
+          : [q0_l] "+r" (q0_l), [q1_l] "+r" (q1_l), [q2_l] "+r" (q2_l),
+            [q3_l] "+r" (q3_l), [q4_l] "+r" (q4_l), [q5_l] "+r" (q5_l),
+            [q6_l] "+r" (q6_l), [p6_l] "+r" (p6_l), [p5_l] "+r" (p5_l),
+            [p4_l] "+r" (p4_l), [p3_l] "+r" (p3_l), [p2_l] "+r" (p2_l),
+            [p1_l] "+r" (p1_l), [p0_l] "+r" (p0_l)
+          :
+      );
+
+      __asm__ __volatile__ (
+          "srl      %[p2_l_f1],     %[p2_l_f1],     16      \n\t"
+          "srl      %[p1_l_f1],     %[p1_l_f1],     16      \n\t"
+          "srl      %[p0_l_f1],     %[p0_l_f1],     16      \n\t"
+          "srl      %[q0_l_f1],     %[q0_l_f1],     16      \n\t"
+          "srl      %[q1_l_f1],     %[q1_l_f1],     16      \n\t"
+          "srl      %[q2_l_f1],     %[q2_l_f1],     16      \n\t"
+          "srl      %[p1_f0],       %[p1_f0],       8       \n\t"
+          "srl      %[p0_f0],       %[p0_f0],       8       \n\t"
+          "srl      %[q0_f0],       %[q0_f0],       8       \n\t"
+          "srl      %[q1_f0],       %[q1_f0],       8       \n\t"
+
+          : [p2_l_f1] "+r" (p2_l_f1), [p1_l_f1] "+r" (p1_l_f1),
+            [p0_l_f1] "+r" (p0_l_f1), [q0_l_f1] "+r" (q0_l_f1),
+            [q1_l_f1] "+r" (q1_l_f1), [q2_l_f1] "+r" (q2_l_f1),
+            [p1_f0] "+r" (p1_f0), [p0_f0] "+r" (p0_f0),
+            [q0_f0] "+r" (q0_f0), [q1_f0] "+r" (q1_f0)
+          :
+      );
+
+      if (mask & flat & flat2 & 0xFF000000) {
+        __asm__ __volatile__ (
+            "sb     %[p6_l],    -7(%[s1])    \n\t"
+            "sb     %[p5_l],    -6(%[s1])    \n\t"
+            "sb     %[p4_l],    -5(%[s1])    \n\t"
+            "sb     %[p3_l],    -4(%[s1])    \n\t"
+            "sb     %[p2_l],    -3(%[s1])    \n\t"
+            "sb     %[p1_l],    -2(%[s1])    \n\t"
+            "sb     %[p0_l],    -1(%[s1])    \n\t"
+
+            :
+            : [p6_l] "r" (p6_l), [p5_l] "r" (p5_l), [p4_l] "r" (p4_l),
+              [p3_l] "r" (p3_l), [p2_l] "r" (p2_l), [p1_l] "r" (p1_l),
+              [p0_l] "r" (p0_l),
+              [s1] "r" (s1)
+        );
+
+        __asm__ __volatile__ (
+            "sb     %[q0_l],     (%[s1])    \n\t"
+            "sb     %[q1_l],    1(%[s1])    \n\t"
+            "sb     %[q2_l],    2(%[s1])    \n\t"
+            "sb     %[q3_l],    3(%[s1])    \n\t"
+            "sb     %[q4_l],    4(%[s1])    \n\t"
+            "sb     %[q5_l],    5(%[s1])    \n\t"
+            "sb     %[q6_l],    6(%[s1])    \n\t"
+
+            :
+            : [q0_l] "r" (q0_l), [q1_l] "r" (q1_l), [q2_l] "r" (q2_l),
+              [q3_l] "r" (q3_l), [q4_l] "r" (q4_l), [q5_l] "r" (q5_l),
+              [q6_l] "r" (q6_l),
+              [s1] "r" (s1)
+        );
+      } else if (mask & flat & 0xFF000000) {
+        __asm__ __volatile__ (
+            "sb     %[p2_l_f1],     -3(%[s1])    \n\t"
+            "sb     %[p1_l_f1],     -2(%[s1])    \n\t"
+            "sb     %[p0_l_f1],     -1(%[s1])    \n\t"
+            "sb     %[q0_l_f1],       (%[s1])    \n\t"
+            "sb     %[q1_l_f1],     +1(%[s1])    \n\t"
+            "sb     %[q2_l_f1],     +2(%[s1])    \n\t"
+
+            :
+            : [p2_l_f1] "r" (p2_l_f1), [p1_l_f1] "r" (p1_l_f1),
+              [p0_l_f1] "r" (p0_l_f1), [q0_l_f1] "r" (q0_l_f1),
+              [q1_l_f1] "r" (q1_l_f1), [q2_l_f1] "r" (q2_l_f1),
+              [s1] "r" (s1)
+        );
+      } else if (mask & 0xFF000000) {
+        __asm__ __volatile__ (
+            "sb     %[p1_f0],   -2(%[s1])    \n\t"
+            "sb     %[p0_f0],   -1(%[s1])    \n\t"
+            "sb     %[q0_f0],     (%[s1])    \n\t"
+            "sb     %[q1_f0],   +1(%[s1])    \n\t"
+
+            :
+            : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0),
+              [q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0),
+              [s1] "r" (s1)
+        );
+      }
+    }
+  }
+}
+#endif  // #if HAVE_DSPR2
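The stores above pick, per byte lane of the packed words, between the widest filter output (mask & flat & flat2), the flat-filter output (mask & flat, the *_f1 values) and the narrow output (mask only, the *_f0 values), then use srl by 8 or 16 to move the next lane into position before the 0x00FF0000 / 0xFF000000 tests. A minimal scalar sketch of that selection, with hypothetical names and the packed 32-bit mask words reduced to single-lane flags:

    /* Sketch only: per-pixel equivalent of the per-byte-lane stores above. */
    static unsigned char pick_lane_output(int mask, int flat, int flat2,
                                          unsigned char wide_out,
                                          unsigned char flat_out,
                                          unsigned char narrow_out,
                                          unsigned char unfiltered) {
      if (mask && flat && flat2)
        return wide_out;     /* widest filter: p6..q6 values */
      if (mask && flat)
        return flat_out;     /* flat but not flat2: *_f1 values */
      if (mask)
        return narrow_out;   /* edge-only filter: *_f0 values */
      return unfiltered;     /* no store; pixel left untouched */
    }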
diff --git a/vp9/common/vp9_blockd.h b/vp9/common/vp9_blockd.h
index f52adfc..c5da375 100644
--- a/vp9/common/vp9_blockd.h
+++ b/vp9/common/vp9_blockd.h
@@ -170,9 +170,9 @@
 };
 
 struct macroblockd_plane {
-  DECLARE_ALIGNED(16, int16_t,  qcoeff[64 * 64]);
-  DECLARE_ALIGNED(16, int16_t,  dqcoeff[64 * 64]);
-  DECLARE_ALIGNED(16, uint16_t, eobs[256]);
+  int16_t *qcoeff;
+  int16_t *dqcoeff;
+  uint16_t *eobs;
   PLANE_TYPE plane_type;
   int subsampling_x;
   int subsampling_y;
@@ -215,13 +215,6 @@
 
   int corrupted;
 
-  unsigned char sb_index;   // index of 32x32 block inside the 64x64 block
-  unsigned char mb_index;   // index of 16x16 block inside the 32x32 block
-  unsigned char b_index;    // index of 8x8 block inside the 16x16 block
-  unsigned char ab_index;   // index of 4x4 block inside the 8x8 block
-
-  int q_index;
-
   /* Y,U,V,(A) */
   ENTROPY_CONTEXT *above_context[MAX_MB_PLANE];
   ENTROPY_CONTEXT left_context[MAX_MB_PLANE][16];
@@ -456,57 +449,46 @@
     }
   }
 }
-static void set_contexts_on_border(const MACROBLOCKD *xd,
-                                   struct macroblockd_plane *pd,
-                                   BLOCK_SIZE plane_bsize,
-                                   int tx_size_in_blocks, int has_eob,
-                                   int aoff, int loff,
-                                   ENTROPY_CONTEXT *A, ENTROPY_CONTEXT *L) {
-  int mi_blocks_wide = num_4x4_blocks_wide_lookup[plane_bsize];
-  int mi_blocks_high = num_4x4_blocks_high_lookup[plane_bsize];
-  int above_contexts = tx_size_in_blocks;
-  int left_contexts = tx_size_in_blocks;
-  int pt;
-
-  // xd->mb_to_right_edge is in units of pixels * 8.  This converts
-  // it to 4x4 block sizes.
-  if (xd->mb_to_right_edge < 0)
-    mi_blocks_wide += (xd->mb_to_right_edge >> (5 + pd->subsampling_x));
-
-  if (xd->mb_to_bottom_edge < 0)
-    mi_blocks_high += (xd->mb_to_bottom_edge >> (5 + pd->subsampling_y));
-
-  // this code attempts to avoid copying into contexts that are outside
-  // our border.  Any blocks that do are set to 0...
-  if (above_contexts + aoff > mi_blocks_wide)
-    above_contexts = mi_blocks_wide - aoff;
-
-  if (left_contexts + loff > mi_blocks_high)
-    left_contexts = mi_blocks_high - loff;
-
-  for (pt = 0; pt < above_contexts; pt++)
-    A[pt] = has_eob;
-  for (pt = above_contexts; pt < tx_size_in_blocks; pt++)
-    A[pt] = 0;
-  for (pt = 0; pt < left_contexts; pt++)
-    L[pt] = has_eob;
-  for (pt = left_contexts; pt < tx_size_in_blocks; pt++)
-    L[pt] = 0;
-}
 
 static void set_contexts(const MACROBLOCKD *xd, struct macroblockd_plane *pd,
                          BLOCK_SIZE plane_bsize, TX_SIZE tx_size,
                          int has_eob, int aoff, int loff) {
-  ENTROPY_CONTEXT *const A = pd->above_context + aoff;
-  ENTROPY_CONTEXT *const L = pd->left_context + loff;
+  ENTROPY_CONTEXT *const a = pd->above_context + aoff;
+  ENTROPY_CONTEXT *const l = pd->left_context + loff;
   const int tx_size_in_blocks = 1 << tx_size;
 
-  if (xd->mb_to_right_edge < 0 || xd->mb_to_bottom_edge < 0) {
-    set_contexts_on_border(xd, pd, plane_bsize, tx_size_in_blocks, has_eob,
-                           aoff, loff, A, L);
+  // above
+  if (has_eob && xd->mb_to_right_edge < 0) {
+    int i;
+    const int blocks_wide = num_4x4_blocks_wide_lookup[plane_bsize] +
+                            (xd->mb_to_right_edge >> (5 + pd->subsampling_x));
+    int above_contexts = tx_size_in_blocks;
+    if (above_contexts + aoff > blocks_wide)
+      above_contexts = blocks_wide - aoff;
+
+    for (i = 0; i < above_contexts; ++i)
+      a[i] = has_eob;
+    for (i = above_contexts; i < tx_size_in_blocks; ++i)
+      a[i] = 0;
   } else {
-    vpx_memset(A, has_eob, sizeof(ENTROPY_CONTEXT) * tx_size_in_blocks);
-    vpx_memset(L, has_eob, sizeof(ENTROPY_CONTEXT) * tx_size_in_blocks);
+    vpx_memset(a, has_eob, sizeof(ENTROPY_CONTEXT) * tx_size_in_blocks);
+  }
+
+  // left
+  if (has_eob && xd->mb_to_bottom_edge < 0) {
+    int i;
+    const int blocks_high = num_4x4_blocks_high_lookup[plane_bsize] +
+                            (xd->mb_to_bottom_edge >> (5 + pd->subsampling_y));
+    int left_contexts = tx_size_in_blocks;
+    if (left_contexts + loff > blocks_high)
+      left_contexts = blocks_high - loff;
+
+    for (i = 0; i < left_contexts; ++i)
+      l[i] = has_eob;
+    for (i = left_contexts; i < tx_size_in_blocks; ++i)
+      l[i] = 0;
+  } else {
+    vpx_memset(l, has_eob, sizeof(ENTROPY_CONTEXT) * tx_size_in_blocks);
   }
 }
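The border handling now folded into set_contexts() clamps the run of context entries to the number of 4x4 blocks that actually lie inside the frame; mb_to_right_edge / mb_to_bottom_edge are negative pixel distances scaled by 8, so shifting by (5 + subsampling) converts them to 4x4-block units. A small sketch of the clamp for the above-context row, using a hypothetical helper name:

    /* Sketch only: mirrors the "above" clamp inside set_contexts(). */
    static int usable_above_contexts(int tx_size_in_blocks, int aoff,
                                     int full_blocks_wide,
                                     int mb_to_right_edge, int subsampling_x) {
      /* mb_to_right_edge is in units of pixels * 8; >> (5 + ss_x) yields a
       * (negative) count of 4x4 blocks hanging past the right frame edge. */
      const int blocks_wide =
          full_blocks_wide + (mb_to_right_edge >> (5 + subsampling_x));
      return (tx_size_in_blocks + aoff > blocks_wide) ? blocks_wide - aoff
                                                      : tx_size_in_blocks;
    }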
 
diff --git a/vp9/common/vp9_entropy.h b/vp9/common/vp9_entropy.h
index c58e852..67b1669 100644
--- a/vp9/common/vp9_entropy.h
+++ b/vp9/common/vp9_entropy.h
@@ -51,7 +51,7 @@
 extern struct vp9_token vp9_coef_encodings[MAX_ENTROPY_TOKENS];
 
 typedef struct {
-  vp9_tree_index *tree;
+  const vp9_tree_index *tree;
   const vp9_prob *prob;
   int len;
   int base_val;
@@ -127,12 +127,6 @@
 extern const uint8_t vp9_coefband_trans_8x8plus[MAXBAND_INDEX + 1];
 extern const uint8_t vp9_coefband_trans_4x4[MAXBAND_INDEX + 1];
 
-
-static int get_coef_band(const uint8_t * band_translate, int coef_index) {
-  return (coef_index > MAXBAND_INDEX)
-    ? (COEF_BANDS-1) : band_translate[coef_index];
-}
-
 // 128 lists of probabilities are stored for the following ONE node probs:
 // 1, 3, 5, 7, ..., 253, 255
 // In between probabilities are interpolated linearly
@@ -181,11 +175,6 @@
   return combine_entropy_contexts(above_ec, left_ec);
 }
 
-static const uint8_t *get_band_translate(TX_SIZE tx_size) {
-  return tx_size == TX_4X4 ? vp9_coefband_trans_4x4
-                           : vp9_coefband_trans_8x8plus;
-}
-
 static void get_scan(const MACROBLOCKD *xd, TX_SIZE tx_size,
                      PLANE_TYPE type, int block_idx,
                      const int16_t **scan, const int16_t **scan_nb) {
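get_coef_band() and get_band_translate() go away because decode_coefs() is now handed a band table already padded out to the longest scan (see the pbi->coefband_trans_8x8plus setup in vp9_decodframe.c below), so the band is read with a plain pointer increment rather than clamping the index on every coefficient. A hedged sketch of that padding step, with illustrative names and sizes:

    /* Sketch only: pre-pad the band table so `band = *band_translate++` works
     * at every coefficient position without a MAXBAND_INDEX check. */
    #include <stddef.h>
    #include <stdint.h>
    #include <string.h>

    static void build_padded_band_table(uint8_t *padded, size_t padded_size,
                                        const uint8_t *trans, size_t trans_size,
                                        uint8_t last_band) {
      memset(padded, last_band, padded_size);  /* positions past MAXBAND_INDEX */
      memcpy(padded, trans, trans_size);       /* exact bands for early coeffs */
    }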
diff --git a/vp9/common/vp9_loopfilter.c b/vp9/common/vp9_loopfilter.c
index 218e12e..ff504a1 100644
--- a/vp9/common/vp9_loopfilter.c
+++ b/vp9/common/vp9_loopfilter.c
@@ -32,6 +32,8 @@
   uint16_t left_uv[TX_SIZES];
   uint16_t above_uv[TX_SIZES];
   uint16_t int_4x4_uv;
+  uint8_t lfl_y[64];
+  uint8_t lfl_uv[16];
 } LOOP_FILTER_MASK;
 
 // 64 bit masks for left transform size.  Each 1 represents a position where
@@ -322,20 +324,14 @@
   }
 }
 
-static int build_lfi(const loop_filter_info_n *lfi_n,
-                     const MB_MODE_INFO *mbmi,
-                     const loop_filter_thresh **lfi) {
+static uint8_t build_lfi(const loop_filter_info_n *lfi_n,
+                         const MB_MODE_INFO *mbmi) {
   const int seg = mbmi->segment_id;
   const int ref = mbmi->ref_frame[0];
   const int mode = lfi_n->mode_lf_lut[mbmi->mode];
   const int filter_level = lfi_n->lvl[seg][ref][mode];
 
-  if (filter_level > 0) {
-    *lfi = &lfi_n->lfthr[filter_level];
-    return 1;
-  } else {
-    return 0;
-  }
+  return filter_level;
 }
 
 static void filter_selectively_vert(uint8_t *s, int pitch,
@@ -343,12 +339,13 @@
                                     unsigned int mask_8x8,
                                     unsigned int mask_4x4,
                                     unsigned int mask_4x4_int,
-                                    const loop_filter_thresh **p_lfi) {
+                                    const loop_filter_info_n *lfi_n,
+                                    const uint8_t *lfl) {
   unsigned int mask;
 
   for (mask = mask_16x16 | mask_8x8 | mask_4x4 | mask_4x4_int;
        mask; mask >>= 1) {
-    const loop_filter_thresh *lfi = *p_lfi;
+    const loop_filter_thresh *lfi = lfi_n->lfthr + *lfl;
 
     if (mask & 1) {
       if (mask_16x16 & 1) {
@@ -373,7 +370,7 @@
       vp9_loop_filter_vertical_edge(s + 4, pitch, lfi->mblim, lfi->lim,
                                     lfi->hev_thr, 1);
     s += 8;
-    p_lfi++;
+    lfl += 1;
     mask_16x16 >>= 1;
     mask_8x8 >>= 1;
     mask_4x4 >>= 1;
@@ -386,49 +383,112 @@
                                      unsigned int mask_8x8,
                                      unsigned int mask_4x4,
                                      unsigned int mask_4x4_int,
-                                     int only_4x4_1,
-                                     const loop_filter_thresh **p_lfi) {
+                                     const loop_filter_info_n *lfi_n,
+                                     const uint8_t *lfl) {
   unsigned int mask;
   int count;
 
   for (mask = mask_16x16 | mask_8x8 | mask_4x4 | mask_4x4_int;
        mask; mask >>= count) {
-    const loop_filter_thresh *lfi = *p_lfi;
+    const loop_filter_thresh *lfi = lfi_n->lfthr + *lfl;
 
     count = 1;
     if (mask & 1) {
-      if (!only_4x4_1) {
-        if (mask_16x16 & 1) {
-          if ((mask_16x16 & 3) == 3) {
-            vp9_mb_lpf_horizontal_edge_w(s, pitch, lfi->mblim, lfi->lim,
-                                         lfi->hev_thr, 2);
-            count = 2;
-          } else {
-            vp9_mb_lpf_horizontal_edge_w(s, pitch, lfi->mblim, lfi->lim,
-                                         lfi->hev_thr, 1);
-          }
-          assert(!(mask_8x8 & 1));
-          assert(!(mask_4x4 & 1));
-          assert(!(mask_4x4_int & 1));
-        } else if (mask_8x8 & 1) {
+      if (mask_16x16 & 1) {
+        if ((mask_16x16 & 3) == 3) {
+          vp9_mb_lpf_horizontal_edge_w(s, pitch, lfi->mblim, lfi->lim,
+                                       lfi->hev_thr, 2);
+          count = 2;
+        } else {
+          vp9_mb_lpf_horizontal_edge_w(s, pitch, lfi->mblim, lfi->lim,
+                                       lfi->hev_thr, 1);
+        }
+        assert(!(mask_8x8 & 1));
+        assert(!(mask_4x4 & 1));
+        assert(!(mask_4x4_int & 1));
+      } else if (mask_8x8 & 1) {
+        if ((mask_8x8 & 3) == 3) {
+          // Next block's thresholds
+          const loop_filter_thresh *lfin = lfi_n->lfthr + *(lfl + 1);
+
+          // TODO(yunqingwang): Combine next 2 calls as 1 wide filtering.
           vp9_mbloop_filter_horizontal_edge(s, pitch, lfi->mblim, lfi->lim,
                                             lfi->hev_thr, 1);
-          assert(!(mask_16x16 & 1));
-          assert(!(mask_4x4 & 1));
-        } else if (mask_4x4 & 1) {
-          vp9_loop_filter_horizontal_edge(s, pitch, lfi->mblim, lfi->lim,
-                                          lfi->hev_thr, 1);
-          assert(!(mask_16x16 & 1));
-          assert(!(mask_8x8 & 1));
-        }
-      }
+          vp9_mbloop_filter_horizontal_edge(s + 8, pitch, lfin->mblim,
+                                            lfin->lim, lfin->hev_thr, 1);
 
-      if (mask_4x4_int & 1)
+          if ((mask_4x4_int & 3) == 3) {
+            // TODO(yunqingwang): Combine next 2 calls as 1 wide filtering.
+            vp9_loop_filter_horizontal_edge(s + 4 * pitch, pitch, lfi->mblim,
+                                            lfi->lim, lfi->hev_thr, 1);
+            vp9_loop_filter_horizontal_edge(s + 8 + 4 * pitch, pitch,
+                                            lfin->mblim, lfin->lim,
+                                            lfin->hev_thr, 1);
+          } else {
+            if (mask_4x4_int & 1)
+              vp9_loop_filter_horizontal_edge(s + 4 * pitch, pitch, lfi->mblim,
+                                              lfi->lim, lfi->hev_thr, 1);
+            else if (mask_4x4_int & 2)
+              vp9_loop_filter_horizontal_edge(s + 8 + 4 * pitch, pitch,
+                                              lfin->mblim, lfin->lim,
+                                              lfin->hev_thr, 1);
+          }
+          count = 2;
+        } else {
+          vp9_mbloop_filter_horizontal_edge(s, pitch, lfi->mblim, lfi->lim,
+                                            lfi->hev_thr, 1);
+
+          if (mask_4x4_int & 1)
+            vp9_loop_filter_horizontal_edge(s + 4 * pitch, pitch, lfi->mblim,
+                                            lfi->lim, lfi->hev_thr, 1);
+        }
+        assert(!(mask_16x16 & 1));
+        assert(!(mask_4x4 & 1));
+      } else if (mask_4x4 & 1) {
+        if ((mask_4x4 & 3) == 3) {
+          // Next block's thresholds
+          const loop_filter_thresh *lfin = lfi_n->lfthr + *(lfl + 1);
+
+          // TODO(yunqingwang): Combine next 2 calls as 1 wide filtering.
+          vp9_loop_filter_horizontal_edge(s, pitch, lfi->mblim, lfi->lim,
+                                          lfi->hev_thr, 1);
+          vp9_loop_filter_horizontal_edge(s + 8, pitch, lfin->mblim, lfin->lim,
+                                          lfin->hev_thr, 1);
+
+          if ((mask_4x4_int & 3) == 3) {
+            // TODO(yunqingwang): Combine next 2 calls as 1 wide filtering.
+            vp9_loop_filter_horizontal_edge(s + 4 * pitch, pitch, lfi->mblim,
+                                            lfi->lim, lfi->hev_thr, 1);
+            vp9_loop_filter_horizontal_edge(s + 8 + 4 * pitch, pitch,
+                                            lfin->mblim, lfin->lim,
+                                            lfin->hev_thr, 1);
+          } else {
+            if (mask_4x4_int & 1)
+              vp9_loop_filter_horizontal_edge(s + 4 * pitch, pitch, lfi->mblim,
+                                              lfi->lim, lfi->hev_thr, 1);
+            else if (mask_4x4_int & 2)
+              vp9_loop_filter_horizontal_edge(s + 8 + 4 * pitch, pitch,
+                                              lfin->mblim, lfin->lim,
+                                              lfin->hev_thr, 1);
+          }
+          count = 2;
+        } else {
+          vp9_loop_filter_horizontal_edge(s, pitch, lfi->mblim, lfi->lim,
+                                          lfi->hev_thr, 1);
+
+          if (mask_4x4_int & 1)
+            vp9_loop_filter_horizontal_edge(s + 4 * pitch, pitch, lfi->mblim,
+                                            lfi->lim, lfi->hev_thr, 1);
+        }
+        assert(!(mask_16x16 & 1));
+        assert(!(mask_8x8 & 1));
+      } else if (mask_4x4_int & 1) {
         vp9_loop_filter_horizontal_edge(s + 4 * pitch, pitch, lfi->mblim,
                                         lfi->lim, lfi->hev_thr, 1);
+      }
     }
     s += 8 * count;
-    p_lfi += count;
+    lfl += count;
     mask_16x16 >>= count;
     mask_8x8 >>= count;
     mask_4x4 >>= count;
@@ -461,10 +521,20 @@
   uint16_t *left_uv = &lfm->left_uv[tx_size_uv];
   uint16_t *above_uv = &lfm->above_uv[tx_size_uv];
   uint16_t *int_4x4_uv = &lfm->int_4x4_uv;
+  int i;
+  int w = num_8x8_blocks_wide_lookup[block_size];
+  int h = num_8x8_blocks_high_lookup[block_size];
 
   // If filter level is 0 we don't loop filter.
-  if (!filter_level)
+  if (!filter_level) {
     return;
+  } else {
+    int index = shift_y;
+    for (i = 0; i < h; i++) {
+      vpx_memset(&lfm->lfl_y[index], filter_level, w);
+      index += 8;
+    }
+  }
 
   // These set 1 in the current block size for the block size edges.
   // For instance if the block size is 32x16,   we'll set :
@@ -530,9 +600,19 @@
   uint64_t *left_y = &lfm->left_y[tx_size_y];
   uint64_t *above_y = &lfm->above_y[tx_size_y];
   uint64_t *int_4x4_y = &lfm->int_4x4_y;
+  int i;
+  int w = num_8x8_blocks_wide_lookup[block_size];
+  int h = num_8x8_blocks_high_lookup[block_size];
 
-  if (!filter_level)
+  if (!filter_level) {
     return;
+  } else {
+    int index = shift_y;
+    for (i = 0; i < h; i++) {
+      vpx_memset(&lfm->lfl_y[index], filter_level, w);
+      index += 8;
+    }
+  }
 
   *above_y |= above_prediction_mask[block_size] << shift_y;
   *left_y |= left_prediction_mask[block_size] << shift_y;
@@ -785,6 +865,7 @@
     }
   }
 }
+
 #if CONFIG_NON420
 static void filter_block_plane_non420(VP9_COMMON *cm,
                                       struct macroblockd_plane *plane,
@@ -801,7 +882,7 @@
   unsigned int mask_8x8[MI_BLOCK_SIZE] = {0};
   unsigned int mask_4x4[MI_BLOCK_SIZE] = {0};
   unsigned int mask_4x4_int[MI_BLOCK_SIZE] = {0};
-  const loop_filter_thresh *lfi[MI_BLOCK_SIZE][MI_BLOCK_SIZE];
+  uint8_t lfl[MI_BLOCK_SIZE * MI_BLOCK_SIZE];
   int r, c;
 
   for (r = 0; r < MI_BLOCK_SIZE && mi_row + r < cm->mi_rows; r += row_step) {
@@ -830,7 +911,8 @@
       const int skip_border_4x4_r = ss_y && mi_row + r == cm->mi_rows - 1;
 
       // Filter level can vary per MI
-      if (!build_lfi(&cm->lf_info, &mi[0].mbmi, &lfi[r][c >> ss_x]))
+      if (!(lfl[(r << 3) + (c >> ss_x)] =
+          build_lfi(&cm->lf_info, &mi[0].mbmi)))
         continue;
 
       // Build masks based on the transform size of each block
@@ -887,7 +969,8 @@
                             mask_16x16_c & border_mask,
                             mask_8x8_c & border_mask,
                             mask_4x4_c & border_mask,
-                            mask_4x4_int[r], lfi[r]);
+                            mask_4x4_int[r],
+                            &cm->lf_info, &lfl[r << 3]);
     dst->buf += 8 * dst->stride;
     mi_8x8 += row_step_stride;
   }
@@ -898,11 +981,26 @@
     const int skip_border_4x4_r = ss_y && mi_row + r == cm->mi_rows - 1;
     const unsigned int mask_4x4_int_r = skip_border_4x4_r ? 0 : mask_4x4_int[r];
 
+    unsigned int mask_16x16_r;
+    unsigned int mask_8x8_r;
+    unsigned int mask_4x4_r;
+
+    if (mi_row + r == 0) {
+      mask_16x16_r = 0;
+      mask_8x8_r = 0;
+      mask_4x4_r = 0;
+    } else {
+      mask_16x16_r = mask_16x16[r];
+      mask_8x8_r = mask_8x8[r];
+      mask_4x4_r = mask_4x4[r];
+    }
+
     filter_selectively_horiz(dst->buf, dst->stride,
-                             mask_16x16[r],
-                             mask_8x8[r],
-                             mask_4x4[r],
-                             mask_4x4_int_r, mi_row + r == 0, lfi[r]);
+                             mask_16x16_r,
+                             mask_8x8_r,
+                             mask_4x4_r,
+                             mask_4x4_int_r,
+                             &cm->lf_info, &lfl[r << 3]);
     dst->buf += 8 * dst->stride;
   }
 }
@@ -910,81 +1008,137 @@
 
 static void filter_block_plane(VP9_COMMON *const cm,
                                struct macroblockd_plane *const plane,
-                               MODE_INFO **mi_8x8,
-                               int mi_row, int mi_col,
+                               int mi_row,
                                LOOP_FILTER_MASK *lfm) {
-  const int ss_x = plane->subsampling_x;
-  const int ss_y = plane->subsampling_y;
-  const int row_step = 1 << ss_x;
-  const int col_step = 1 << ss_y;
-  const int row_step_stride = cm->mode_info_stride * row_step;
   struct buf_2d *const dst = &plane->dst;
   uint8_t* const dst0 = dst->buf;
-  unsigned int mask_4x4_int[MI_BLOCK_SIZE] = {0};
-  const loop_filter_thresh *lfi[MI_BLOCK_SIZE][MI_BLOCK_SIZE];
+  unsigned int mask_4x4_int_row[MI_BLOCK_SIZE] = {0};
   int r, c;
-  int row_shift = 3 - ss_x;
-  int row_mask = 0xff >> (ss_x << 2);
 
-#define MASK_ROW(value) ((value >> (r_sampled << row_shift)) & row_mask)
+  if (!plane->plane_type) {
+    uint64_t mask_16x16 = lfm->left_y[TX_16X16];
+    uint64_t mask_8x8 = lfm->left_y[TX_8X8];
+    uint64_t mask_4x4 = lfm->left_y[TX_4X4];
+    uint64_t mask_4x4_int = lfm->int_4x4_y;
 
-  for (r = 0; r < MI_BLOCK_SIZE && mi_row + r < cm->mi_rows; r += row_step) {
-    int r_sampled = r >> ss_x;
+    // Vertical pass
+    for (r = 0; r < MI_BLOCK_SIZE && mi_row + r < cm->mi_rows; r++) {
+      mask_4x4_int_row[r] = mask_4x4_int & 0xff;
 
-    // Determine the vertical edges that need filtering
-    for (c = 0; c < MI_BLOCK_SIZE && mi_col + c < cm->mi_cols; c += col_step) {
-      const MODE_INFO *mi = mi_8x8[c];
-
-      build_lfi(&cm->lf_info, &mi[0].mbmi, &lfi[r][c >> ss_x]);
-    }
-    if (!plane->plane_type) {
-      mask_4x4_int[r] = MASK_ROW(lfm->int_4x4_y);
       // Disable filtering on the leftmost column
       filter_selectively_vert(dst->buf, dst->stride,
-                              MASK_ROW(lfm->left_y[TX_16X16]),
-                              MASK_ROW(lfm->left_y[TX_8X8]),
-                              MASK_ROW(lfm->left_y[TX_4X4]),
-                              MASK_ROW(lfm->int_4x4_y),
-                              lfi[r]);
-    } else {
-      mask_4x4_int[r] = MASK_ROW(lfm->int_4x4_uv);
+                              mask_16x16 & 0xff,
+                              mask_8x8 & 0xff,
+                              mask_4x4 & 0xff,
+                              mask_4x4_int_row[r],
+                              &cm->lf_info, &lfm->lfl_y[r << 3]);
+
+      dst->buf += 8 * dst->stride;
+      mask_16x16 >>= 8;
+      mask_8x8 >>= 8;
+      mask_4x4 >>= 8;
+      mask_4x4_int >>= 8;
+    }
+
+    // Horizontal pass
+    dst->buf = dst0;
+    mask_16x16 = lfm->above_y[TX_16X16];
+    mask_8x8 = lfm->above_y[TX_8X8];
+    mask_4x4 = lfm->above_y[TX_4X4];
+
+    for (r = 0; r < MI_BLOCK_SIZE && mi_row + r < cm->mi_rows; r++) {
+      unsigned int mask_16x16_r;
+      unsigned int mask_8x8_r;
+      unsigned int mask_4x4_r;
+
+      if (mi_row + r == 0) {
+        mask_16x16_r = 0;
+        mask_8x8_r = 0;
+        mask_4x4_r = 0;
+      } else {
+        mask_16x16_r = mask_16x16 & 0xff;
+        mask_8x8_r = mask_8x8 & 0xff;
+        mask_4x4_r = mask_4x4 & 0xff;
+      }
+
+      filter_selectively_horiz(dst->buf, dst->stride,
+                               mask_16x16_r,
+                               mask_8x8_r,
+                               mask_4x4_r,
+                               mask_4x4_int_row[r],
+                               &cm->lf_info, &lfm->lfl_y[r << 3]);
+
+      dst->buf += 8 * dst->stride;
+      mask_16x16 >>= 8;
+      mask_8x8 >>= 8;
+      mask_4x4 >>= 8;
+    }
+  } else {
+    uint16_t mask_16x16 = lfm->left_uv[TX_16X16];
+    uint16_t mask_8x8 = lfm->left_uv[TX_8X8];
+    uint16_t mask_4x4 = lfm->left_uv[TX_4X4];
+    uint16_t mask_4x4_int = lfm->int_4x4_uv;
+
+    // Vertical pass
+    for (r = 0; r < MI_BLOCK_SIZE && mi_row + r < cm->mi_rows; r += 2) {
+      if (plane->plane_type == 1) {
+        for (c = 0; c < (MI_BLOCK_SIZE >> 1); c++)
+          lfm->lfl_uv[(r << 1) + c] = lfm->lfl_y[(r << 3) + (c << 1)];
+      }
+
+      mask_4x4_int_row[r] = mask_4x4_int & 0xf;
       // Disable filtering on the leftmost column
       filter_selectively_vert(dst->buf, dst->stride,
-                              MASK_ROW(lfm->left_uv[TX_16X16]),
-                              MASK_ROW(lfm->left_uv[TX_8X8]),
-                              MASK_ROW(lfm->left_uv[TX_4X4]),
-                              MASK_ROW(lfm->int_4x4_uv),
-                              lfi[r]);
+                              mask_16x16 & 0xf,
+                              mask_8x8 & 0xf,
+                              mask_4x4 & 0xf,
+                              mask_4x4_int_row[r],
+                              &cm->lf_info, &lfm->lfl_uv[r << 1]);
+
+      dst->buf += 8 * dst->stride;
+      mask_16x16 >>= 4;
+      mask_8x8 >>= 4;
+      mask_4x4 >>= 4;
+      mask_4x4_int >>= 4;
     }
-    dst->buf += 8 * dst->stride;
-    mi_8x8 += row_step_stride;
-  }
 
-  // Now do horizontal pass
-  dst->buf = dst0;
-  for (r = 0; r < MI_BLOCK_SIZE && mi_row + r < cm->mi_rows; r += row_step) {
-    const int skip_border_4x4_r = ss_y && mi_row + r == cm->mi_rows - 1;
-    const unsigned int mask_4x4_int_r = skip_border_4x4_r ? 0 : mask_4x4_int[r];
-    int r_sampled = r >> ss_x;
+    // Horizontal pass
+    dst->buf = dst0;
+    mask_16x16 = lfm->above_uv[TX_16X16];
+    mask_8x8 = lfm->above_uv[TX_8X8];
+    mask_4x4 = lfm->above_uv[TX_4X4];
 
-    if (!plane->plane_type) {
+    for (r = 0; r < MI_BLOCK_SIZE && mi_row + r < cm->mi_rows; r += 2) {
+      const int skip_border_4x4_r = mi_row + r == cm->mi_rows - 1;
+      const unsigned int mask_4x4_int_r = skip_border_4x4_r ?
+          0 : (mask_4x4_int_row[r]);
+      unsigned int mask_16x16_r;
+      unsigned int mask_8x8_r;
+      unsigned int mask_4x4_r;
+
+      if (mi_row + r == 0) {
+        mask_16x16_r = 0;
+        mask_8x8_r = 0;
+        mask_4x4_r = 0;
+      } else {
+        mask_16x16_r = mask_16x16 & 0xf;
+        mask_8x8_r = mask_8x8 & 0xf;
+        mask_4x4_r = mask_4x4 & 0xf;
+      }
+
       filter_selectively_horiz(dst->buf, dst->stride,
-                               MASK_ROW(lfm->above_y[TX_16X16]),
-                               MASK_ROW(lfm->above_y[TX_8X8]),
-                               MASK_ROW(lfm->above_y[TX_4X4]),
-                               MASK_ROW(lfm->int_4x4_y),
-                               mi_row + r == 0, lfi[r]);
-    } else {
-      filter_selectively_horiz(dst->buf, dst->stride,
-                               MASK_ROW(lfm->above_uv[TX_16X16]),
-                               MASK_ROW(lfm->above_uv[TX_8X8]),
-                               MASK_ROW(lfm->above_uv[TX_4X4]),
+                               mask_16x16_r,
+                               mask_8x8_r,
+                               mask_4x4_r,
                                mask_4x4_int_r,
-                               mi_row + r == 0, lfi[r]);
+                               &cm->lf_info, &lfm->lfl_uv[r << 1]);
+
+      dst->buf += 8 * dst->stride;
+      mask_16x16 >>= 4;
+      mask_8x8 >>= 4;
+      mask_4x4 >>= 4;
     }
-    dst->buf += 8 * dst->stride;
   }
-#undef MASK_ROW
 }
 
 void vp9_loop_filter_rows(const YV12_BUFFER_CONFIG *frame_buffer,
@@ -1017,8 +1171,7 @@
 #if CONFIG_NON420
         if (use_420)
 #endif
-          filter_block_plane(cm, &xd->plane[plane], mi_8x8 + mi_col, mi_row,
-                             mi_col, &lfm);
+          filter_block_plane(cm, &xd->plane[plane], mi_row, &lfm);
 #if CONFIG_NON420
         else
           filter_block_plane_non420(cm, &xd->plane[plane], mi_8x8 + mi_col,
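The new lfl_y / lfl_uv arrays carry one filter level per 8x8 block of the 64x64 superblock: an 8x8 grid for luma and a 4x4 grid for 4:2:0 chroma. Luma rows are addressed with a stride of 8 via (r << 3); chroma rows, indexed by the even luma row r, with (r << 1), and the chroma levels are copied from the co-located luma positions. A small sketch of the addressing, assuming 4:2:0 and hypothetical helper names:

    /* Sketch only: addressing into the per-superblock filter-level arrays. */
    #include <stdint.h>

    static uint8_t lfl_y_at(const uint8_t lfl_y[64], int r, int c) {
      return lfl_y[(r << 3) + c];           /* 8x8 grid, row stride 8 */
    }

    static uint8_t lfl_uv_at(const uint8_t lfl_uv[16], int r_luma, int c_uv) {
      /* r_luma is the even luma 8x8 row, so (r_luma << 1) == (r_luma / 2) * 4 */
      return lfl_uv[(r_luma << 1) + c_uv];  /* 4x4 grid, row stride 4 */
    }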
diff --git a/vp9/common/vp9_pred_common.h b/vp9/common/vp9_pred_common.h
index 19032bf..9190930 100644
--- a/vp9/common/vp9_pred_common.h
+++ b/vp9/common/vp9_pred_common.h
@@ -109,32 +109,40 @@
 
 unsigned char vp9_get_pred_context_tx_size(const MACROBLOCKD *xd);
 
-static const vp9_prob *get_tx_probs(BLOCK_SIZE bsize, uint8_t context,
+static const vp9_prob *get_tx_probs(TX_SIZE max_tx_size, int ctx,
                                     const struct tx_probs *tx_probs) {
-  if (bsize < BLOCK_16X16)
-    return tx_probs->p8x8[context];
-  else if (bsize < BLOCK_32X32)
-    return tx_probs->p16x16[context];
-  else
-    return tx_probs->p32x32[context];
+  switch (max_tx_size) {
+    case TX_8X8:
+      return tx_probs->p8x8[ctx];
+    case TX_16X16:
+      return tx_probs->p16x16[ctx];
+    case TX_32X32:
+      return tx_probs->p32x32[ctx];
+    default:
+      assert(!"Invalid max_tx_size.");
+      return NULL;
+  }
 }
 
-static const vp9_prob *get_tx_probs2(const MACROBLOCKD *xd,
-                                     const struct tx_probs *tx_probs,
-                                     const MODE_INFO *m) {
-  const BLOCK_SIZE bsize = m->mbmi.sb_type;
-  const int context = vp9_get_pred_context_tx_size(xd);
-  return get_tx_probs(bsize, context, tx_probs);
+static const vp9_prob *get_tx_probs2(TX_SIZE max_tx_size, const MACROBLOCKD *xd,
+                                     const struct tx_probs *tx_probs) {
+  const int ctx = vp9_get_pred_context_tx_size(xd);
+  return get_tx_probs(max_tx_size, ctx, tx_probs);
 }
 
-static unsigned int *get_tx_counts(BLOCK_SIZE bsize, uint8_t context,
+static unsigned int *get_tx_counts(TX_SIZE max_tx_size, int ctx,
                                    struct tx_counts *tx_counts) {
-  if (bsize < BLOCK_16X16)
-    return tx_counts->p8x8[context];
-  else if (bsize < BLOCK_32X32)
-    return tx_counts->p16x16[context];
-  else
-    return tx_counts->p32x32[context];
+  switch (max_tx_size) {
+    case TX_8X8:
+      return tx_counts->p8x8[ctx];
+    case TX_16X16:
+      return tx_counts->p16x16[ctx];
+    case TX_32X32:
+      return tx_counts->p32x32[ctx];
+    default:
+      assert(!"Invalid max_tx_size.");
+      return NULL;
+  }
 }
 
 #endif  // VP9_COMMON_VP9_PRED_COMMON_H_
diff --git a/vp9/common/vp9_reconinter.c b/vp9/common/vp9_reconinter.c
index 1c96788..7cc66c8 100644
--- a/vp9/common/vp9_reconinter.c
+++ b/vp9/common/vp9_reconinter.c
@@ -117,16 +117,13 @@
   return clamped_mv;
 }
 
-struct build_inter_predictors_args {
-  MACROBLOCKD *xd;
-  int x, y;
-};
 
-static void build_inter_predictors(int plane, int block, BLOCK_SIZE bsize,
-                                   int pred_w, int pred_h,
-                                   void *argv) {
-  const struct build_inter_predictors_args* const arg = argv;
-  MACROBLOCKD *const xd = arg->xd;
+// TODO(jkoleszar): In principle, pred_w, pred_h are unnecessary, as we could
+// calculate the subsampled BLOCK_SIZE, but that type isn't defined for
+// sizes smaller than 16x16 yet.
+static void build_inter_predictors(MACROBLOCKD *xd, int plane, int block,
+                                   BLOCK_SIZE bsize, int pred_w, int pred_h,
+                                   int mi_x, int mi_y) {
   struct macroblockd_plane *const pd = &xd->plane[plane];
   const int bwl = b_width_log2(bsize) - pd->subsampling_x;
   const int bw = 4 << bwl;
@@ -172,7 +169,7 @@
 
     if (vp9_is_scaled(scale->sfc)) {
       pre = pre_buf->buf + scaled_buffer_offset(x, y, pre_buf->stride, scale);
-      scale->sfc->set_scaled_offsets(scale, arg->y + y, arg->x + x);
+      scale->sfc->set_scaled_offsets(scale, mi_y + y, mi_x + x);
       scaled_mv = scale->sfc->scale_mv(&mv_q4, scale);
       xs = scale->sfc->x_step_q4;
       ys = scale->sfc->y_step_q4;
@@ -190,40 +187,25 @@
   }
 }
 
-// TODO(jkoleszar): In principle, pred_w, pred_h are unnecessary, as we could
-// calculate the subsampled BLOCK_SIZE, but that type isn't defined for
-// sizes smaller than 16x16 yet.
-typedef void (*foreach_predicted_block_visitor)(int plane, int block,
-                                                BLOCK_SIZE bsize,
-                                                int pred_w, int pred_h,
-                                                void *arg);
-static INLINE void foreach_predicted_block_in_plane(
-    const MACROBLOCKD* const xd, BLOCK_SIZE bsize, int plane,
-    foreach_predicted_block_visitor visit, void *arg) {
-  const int bwl = b_width_log2(bsize) - xd->plane[plane].subsampling_x;
-  const int bhl = b_height_log2(bsize) - xd->plane[plane].subsampling_y;
-
-  if (xd->mi_8x8[0]->mbmi.sb_type < BLOCK_8X8) {
-    int i = 0, x, y;
-    assert(bsize == BLOCK_8X8);
-    for (y = 0; y < 1 << bhl; ++y)
-      for (x = 0; x < 1 << bwl; ++x)
-        visit(plane, i++, bsize, 0, 0, arg);
-  } else {
-    visit(plane, 0, bsize, bwl, bhl, arg);
-  }
-}
-
 static void build_inter_predictors_for_planes(MACROBLOCKD *xd, BLOCK_SIZE bsize,
                                               int mi_row, int mi_col,
                                               int plane_from, int plane_to) {
   int plane;
   for (plane = plane_from; plane <= plane_to; ++plane) {
-    struct build_inter_predictors_args args = {
-      xd, mi_col * MI_SIZE, mi_row * MI_SIZE,
-    };
-    foreach_predicted_block_in_plane(xd, bsize, plane, build_inter_predictors,
-                                     &args);
+    const int mi_x = mi_col * MI_SIZE;
+    const int mi_y = mi_row * MI_SIZE;
+    const int bwl = b_width_log2(bsize) - xd->plane[plane].subsampling_x;
+    const int bhl = b_height_log2(bsize) - xd->plane[plane].subsampling_y;
+
+    if (xd->mi_8x8[0]->mbmi.sb_type < BLOCK_8X8) {
+      int i = 0, x, y;
+      assert(bsize == BLOCK_8X8);
+      for (y = 0; y < 1 << bhl; ++y)
+        for (x = 0; x < 1 << bwl; ++x)
+          build_inter_predictors(xd, plane, i++, bsize, 0, 0, mi_x, mi_y);
+    } else {
+      build_inter_predictors(xd, plane, 0, bsize, bwl, bhl, mi_x, mi_y);
+    }
   }
 }
 
diff --git a/vp9/common/vp9_rtcd_defs.sh b/vp9/common/vp9_rtcd_defs.sh
index 6313f33..2c0864e 100644
--- a/vp9/common/vp9_rtcd_defs.sh
+++ b/vp9/common/vp9_rtcd_defs.sh
@@ -41,7 +41,7 @@
 specialize vp9_d63_predictor_4x4 $ssse3_x86inc
 
 prototype void vp9_h_predictor_4x4 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"
-specialize vp9_h_predictor_4x4 $ssse3_x86inc
+specialize vp9_h_predictor_4x4 $ssse3_x86inc dspr2
 
 prototype void vp9_d117_predictor_4x4 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"
 specialize vp9_d117_predictor_4x4
@@ -56,10 +56,10 @@
 specialize vp9_v_predictor_4x4 $sse_x86inc
 
 prototype void vp9_tm_predictor_4x4 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"
-specialize vp9_tm_predictor_4x4 $sse_x86inc
+specialize vp9_tm_predictor_4x4 $sse_x86inc dspr2
 
 prototype void vp9_dc_predictor_4x4 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"
-specialize vp9_dc_predictor_4x4 $sse_x86inc
+specialize vp9_dc_predictor_4x4 $sse_x86inc dspr2
 
 prototype void vp9_dc_top_predictor_4x4 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"
 specialize vp9_dc_top_predictor_4x4
@@ -80,7 +80,7 @@
 specialize vp9_d63_predictor_8x8 $ssse3_x86inc
 
 prototype void vp9_h_predictor_8x8 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"
-specialize vp9_h_predictor_8x8 $ssse3_x86inc
+specialize vp9_h_predictor_8x8 $ssse3_x86inc dspr2
 
 prototype void vp9_d117_predictor_8x8 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"
 specialize vp9_d117_predictor_8x8
@@ -95,10 +95,10 @@
 specialize vp9_v_predictor_8x8 $sse_x86inc
 
 prototype void vp9_tm_predictor_8x8 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"
-specialize vp9_tm_predictor_8x8 $sse2_x86inc
+specialize vp9_tm_predictor_8x8 $sse2_x86inc dspr2
 
 prototype void vp9_dc_predictor_8x8 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"
-specialize vp9_dc_predictor_8x8 $sse_x86inc
+specialize vp9_dc_predictor_8x8 $sse_x86inc dspr2
 
 prototype void vp9_dc_top_predictor_8x8 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"
 specialize vp9_dc_top_predictor_8x8
@@ -119,7 +119,7 @@
 specialize vp9_d63_predictor_16x16 $ssse3_x86inc
 
 prototype void vp9_h_predictor_16x16 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"
-specialize vp9_h_predictor_16x16 $ssse3_x86inc
+specialize vp9_h_predictor_16x16 $ssse3_x86inc dspr2
 
 prototype void vp9_d117_predictor_16x16 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"
 specialize vp9_d117_predictor_16x16
@@ -137,7 +137,7 @@
 specialize vp9_tm_predictor_16x16 $sse2_x86inc
 
 prototype void vp9_dc_predictor_16x16 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"
-specialize vp9_dc_predictor_16x16 $sse2_x86inc
+specialize vp9_dc_predictor_16x16 $sse2_x86inc dspr2
 
 prototype void vp9_dc_top_predictor_16x16 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"
 specialize vp9_dc_top_predictor_16x16
@@ -191,22 +191,22 @@
 # Loopfilter
 #
 prototype void vp9_mb_lpf_vertical_edge_w "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh"
-specialize vp9_mb_lpf_vertical_edge_w sse2 neon
+specialize vp9_mb_lpf_vertical_edge_w sse2 neon dspr2
 
 prototype void vp9_mbloop_filter_vertical_edge "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count"
-specialize vp9_mbloop_filter_vertical_edge sse2 neon
+specialize vp9_mbloop_filter_vertical_edge sse2 neon dspr2
 
 prototype void vp9_loop_filter_vertical_edge "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count"
-specialize vp9_loop_filter_vertical_edge mmx neon
+specialize vp9_loop_filter_vertical_edge mmx neon dspr2
 
 prototype void vp9_mb_lpf_horizontal_edge_w "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count"
-specialize vp9_mb_lpf_horizontal_edge_w sse2 avx2 neon
+specialize vp9_mb_lpf_horizontal_edge_w sse2 avx2 neon dspr2
 
 prototype void vp9_mbloop_filter_horizontal_edge "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count"
-specialize vp9_mbloop_filter_horizontal_edge sse2 neon
+specialize vp9_mbloop_filter_horizontal_edge sse2 neon dspr2
 
 prototype void vp9_loop_filter_horizontal_edge "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count"
-specialize vp9_loop_filter_horizontal_edge mmx neon
+specialize vp9_loop_filter_horizontal_edge mmx neon dspr2
 
 #
 # post proc
@@ -296,7 +296,7 @@
 specialize vp9_idct32x32_1024_add sse2 neon dspr2
 
 prototype void vp9_idct32x32_34_add "const int16_t *input, uint8_t *dest, int dest_stride"
-specialize vp9_idct32x32_34_add sse2
+specialize vp9_idct32x32_34_add sse2 dspr2
 
 prototype void vp9_idct32x32_1_add "const int16_t *input, uint8_t *dest, int dest_stride"
 specialize vp9_idct32x32_1_add sse2 neon dspr2
diff --git a/vp9/decoder/vp9_decodemv.c b/vp9/decoder/vp9_decodemv.c
index 14f2ce5..b948429 100644
--- a/vp9/decoder/vp9_decodemv.c
+++ b/vp9/decoder/vp9_decodemv.c
@@ -62,31 +62,28 @@
 }
 
 static TX_SIZE read_selected_tx_size(VP9_COMMON *cm, MACROBLOCKD *xd,
-                                     BLOCK_SIZE bsize, vp9_reader *r) {
-  const uint8_t context = vp9_get_pred_context_tx_size(xd);
-  const vp9_prob *tx_probs = get_tx_probs(bsize, context, &cm->fc.tx_probs);
+                                     TX_SIZE max_tx_size, vp9_reader *r) {
+  const int ctx = vp9_get_pred_context_tx_size(xd);
+  const vp9_prob *tx_probs = get_tx_probs(max_tx_size, ctx, &cm->fc.tx_probs);
   TX_SIZE tx_size = vp9_read(r, tx_probs[0]);
-  if (tx_size != TX_4X4 && bsize >= BLOCK_16X16) {
+  if (tx_size != TX_4X4 && max_tx_size >= TX_16X16) {
     tx_size += vp9_read(r, tx_probs[1]);
-    if (tx_size != TX_8X8 && bsize >= BLOCK_32X32)
+    if (tx_size != TX_8X8 && max_tx_size >= TX_32X32)
       tx_size += vp9_read(r, tx_probs[2]);
   }
 
   if (!cm->frame_parallel_decoding_mode)
-    ++get_tx_counts(bsize, context, &cm->counts.tx)[tx_size];
+    ++get_tx_counts(max_tx_size, ctx, &cm->counts.tx)[tx_size];
   return tx_size;
 }
 
-static TX_SIZE read_tx_size(VP9_COMMON *const cm, MACROBLOCKD *const xd,
-                            TX_MODE tx_mode, BLOCK_SIZE bsize, int allow_select,
-                            vp9_reader *r) {
-  if (allow_select && tx_mode == TX_MODE_SELECT && bsize >= BLOCK_8X8) {
-    return read_selected_tx_size(cm, xd, bsize, r);
-  } else {
-    const TX_SIZE max_tx_size_block = max_txsize_lookup[bsize];
-    const TX_SIZE max_tx_size_txmode = tx_mode_to_biggest_tx_size[tx_mode];
-    return MIN(max_tx_size_block, max_tx_size_txmode);
-  }
+static TX_SIZE read_tx_size(VP9_COMMON *cm, MACROBLOCKD *xd, TX_MODE tx_mode,
+                            BLOCK_SIZE bsize, int allow_select, vp9_reader *r) {
+  const TX_SIZE max_tx_size = max_txsize_lookup[bsize];
+  if (allow_select && tx_mode == TX_MODE_SELECT && bsize >= BLOCK_8X8)
+    return read_selected_tx_size(cm, xd, max_tx_size, r);
+  else
+    return MIN(max_tx_size, tx_mode_to_biggest_tx_size[tx_mode]);
 }
 
 static void set_segment_id(VP9_COMMON *cm, BLOCK_SIZE bsize,
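read_selected_tx_size() now decodes the transform size as a truncated code of at most three bins, with the number of usable bins capped by the largest transform the block allows. A hedged restatement of that control flow, with read_bit standing in for vp9_read(r, prob) and the TX_* values assumed to be the usual 0..3 enum:

    /* Sketch only: mirrors the bin structure read by read_selected_tx_size(). */
    enum { SK_TX_4X4, SK_TX_8X8, SK_TX_16X16, SK_TX_32X32 };

    static int read_tx_size_sketch(int max_tx_size,
                                   int (*read_bit)(void *ctx, unsigned char p),
                                   void *ctx, const unsigned char probs[3]) {
      int tx_size = read_bit(ctx, probs[0]);
      if (tx_size != SK_TX_4X4 && max_tx_size >= SK_TX_16X16) {
        tx_size += read_bit(ctx, probs[1]);
        if (tx_size != SK_TX_8X8 && max_tx_size >= SK_TX_32X32)
          tx_size += read_bit(ctx, probs[2]);
      }
      return tx_size;
    }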
diff --git a/vp9/decoder/vp9_decodframe.c b/vp9/decoder/vp9_decodframe.c
index aad400a..75c6384 100644
--- a/vp9/decoder/vp9_decodframe.c
+++ b/vp9/decoder/vp9_decodframe.c
@@ -42,6 +42,10 @@
   vp9_reader bit_reader;
   DECLARE_ALIGNED(16, MACROBLOCKD, xd);
   DECLARE_ALIGNED(16, unsigned char, token_cache[1024]);
+  DECLARE_ALIGNED(16, int16_t,  qcoeff[MAX_MB_PLANE][64 * 64]);
+  DECLARE_ALIGNED(16, int16_t,  dqcoeff[MAX_MB_PLANE][64 * 64]);
+  DECLARE_ALIGNED(16, uint16_t, eobs[MAX_MB_PLANE][256]);
+  const uint8_t *band_translate[2];
 } TileWorkerData;
 
 static int read_be32(const uint8_t *p) {
@@ -239,31 +243,30 @@
 }
 
 static void inverse_transform_block(MACROBLOCKD* xd, int plane, int block,
-                                    BLOCK_SIZE plane_bsize, TX_SIZE tx_size) {
+                                    TX_SIZE tx_size, int x, int y) {
   struct macroblockd_plane *const pd = &xd->plane[plane];
-  int16_t* const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
-  const int stride = pd->dst.stride;
   const int eob = pd->eobs[block];
   if (eob > 0) {
     TX_TYPE tx_type;
-    const int raster_block = txfrm_block_to_raster_block(plane_bsize, tx_size,
-                                                         block);
-    uint8_t* const dst = raster_block_offset_uint8(plane_bsize, raster_block,
-                                                   pd->dst.buf, stride);
+    const int plane_type = pd->plane_type;
+    const int stride = pd->dst.stride;
+    int16_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
+    uint8_t *const dst = &pd->dst.buf[4 * y * stride + 4 * x];
+
     switch (tx_size) {
       case TX_4X4:
-        tx_type = get_tx_type_4x4(pd->plane_type, xd, raster_block);
+        tx_type = get_tx_type_4x4(plane_type, xd, block);
         if (tx_type == DCT_DCT)
           xd->itxm_add(dqcoeff, dst, stride, eob);
         else
           vp9_iht4x4_16_add(dqcoeff, dst, stride, tx_type);
         break;
       case TX_8X8:
-        tx_type = get_tx_type_8x8(pd->plane_type, xd);
+        tx_type = get_tx_type_8x8(plane_type, xd);
         vp9_iht8x8_add(tx_type, dqcoeff, dst, stride, eob);
         break;
       case TX_16X16:
-        tx_type = get_tx_type_16x16(pd->plane_type, xd);
+        tx_type = get_tx_type_16x16(plane_type, xd);
         vp9_iht16x16_add(tx_type, dqcoeff, dst, stride, eob);
         break;
       case TX_32X32:
@@ -291,7 +294,8 @@
   VP9_COMMON *cm;
   MACROBLOCKD *xd;
   vp9_reader *r;
-  unsigned char* token_cache;
+  uint8_t *token_cache;
+  const uint8_t *band_translate[2];
 };
 
 static void predict_and_reconstruct_intra_block(int plane, int block,
@@ -300,17 +304,19 @@
   struct intra_args *const args = arg;
   VP9_COMMON *const cm = args->cm;
   MACROBLOCKD *const xd = args->xd;
-
+  const uint8_t *band_translate[2] = {
+    args->band_translate[0], args->band_translate[1]
+  };
   struct macroblockd_plane *const pd = &xd->plane[plane];
   MODE_INFO *const mi = xd->mi_8x8[0];
-  const int raster_block = txfrm_block_to_raster_block(plane_bsize, tx_size,
-                                                       block);
-  uint8_t* const dst = raster_block_offset_uint8(plane_bsize, raster_block,
-                                                 pd->dst.buf, pd->dst.stride);
   const MB_PREDICTION_MODE mode = (plane == 0)
-        ? ((mi->mbmi.sb_type < BLOCK_8X8) ? mi->bmi[raster_block].as_mode
-                                          : mi->mbmi.mode)
-        : mi->mbmi.uv_mode;
+          ? ((mi->mbmi.sb_type < BLOCK_8X8) ? mi->bmi[block].as_mode
+                                            : mi->mbmi.mode)
+          : mi->mbmi.uv_mode;
+  int x, y;
+  uint8_t *dst;
+  txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &x, &y);
+  dst = &pd->dst.buf[4 * y * pd->dst.stride + 4 * x];
 
   if (xd->mb_to_right_edge < 0 || xd->mb_to_bottom_edge < 0)
     extend_for_intra(xd, plane_bsize, plane, block, tx_size);
@@ -320,9 +326,9 @@
                           dst, pd->dst.stride, dst, pd->dst.stride);
 
   if (!mi->mbmi.skip_coeff) {
-    vp9_decode_block_tokens(cm, xd, plane, block, plane_bsize, tx_size,
-                            args->r, args->token_cache);
-    inverse_transform_block(xd, plane, block, plane_bsize, tx_size);
+    vp9_decode_block_tokens(cm, xd, plane, block, plane_bsize, x, y, tx_size,
+                            args->r, args->token_cache, band_translate);
+    inverse_transform_block(xd, plane, block, tx_size, x, y);
   }
 }
 
@@ -331,7 +337,8 @@
   MACROBLOCKD *xd;
   vp9_reader *r;
   int *eobtotal;
-  unsigned char* token_cache;
+  uint8_t *token_cache;
+  const uint8_t *band_translate[2];
 };
 
 static void reconstruct_inter_block(int plane, int block,
@@ -340,11 +347,17 @@
   struct inter_args *args = arg;
   VP9_COMMON *const cm = args->cm;
   MACROBLOCKD *const xd = args->xd;
+  const uint8_t *band_translate[2] = {
+    args->band_translate[0], args->band_translate[1]
+  };
+  int x, y;
+  txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &x, &y);
 
   *args->eobtotal += vp9_decode_block_tokens(cm, xd, plane, block,
-                                             plane_bsize, tx_size,
-                                             args->r, args->token_cache);
-  inverse_transform_block(xd, plane, block, plane_bsize, tx_size);
+                                             plane_bsize, x, y, tx_size,
+                                             args->r, args->token_cache,
+                                             band_translate);
+  inverse_transform_block(xd, plane, block, tx_size, x, y);
 }
 
 static void set_offsets(VP9_COMMON *const cm, MACROBLOCKD *const xd,
@@ -395,7 +408,8 @@
                            const TileInfo *const tile,
                            int mi_row, int mi_col,
                            vp9_reader *r, BLOCK_SIZE bsize,
-                           unsigned char *token_cache) {
+                           uint8_t *token_cache,
+                           const uint8_t *band_translate[2]) {
   const int less8x8 = bsize < BLOCK_8X8;
   MB_MODE_INFO *mbmi;
 
@@ -417,7 +431,9 @@
   }
 
   if (!is_inter_block(mbmi)) {
-    struct intra_args arg = { cm, xd, r, token_cache };
+    struct intra_args arg = {
+      cm, xd, r, token_cache, {band_translate[0], band_translate[1]}
+    };
     foreach_transformed_block(xd, bsize, predict_and_reconstruct_intra_block,
                               &arg);
   } else {
@@ -435,7 +451,10 @@
     // Reconstruction
     if (!mbmi->skip_coeff) {
       int eobtotal = 0;
-      struct inter_args arg = { cm, xd, r, &eobtotal, token_cache };
+      struct inter_args arg = {
+        cm, xd, r, &eobtotal, token_cache,
+        {band_translate[0], band_translate[1]}
+      };
       foreach_transformed_block(xd, bsize, reconstruct_inter_block, &arg);
       if (!less8x8 && eobtotal == 0)
         mbmi->skip_coeff = 1;  // skip loopfilter
@@ -475,7 +494,8 @@
                             const TileInfo *const tile,
                             int mi_row, int mi_col,
                             vp9_reader* r, BLOCK_SIZE bsize,
-                            unsigned char *token_cache) {
+                            uint8_t *token_cache,
+                            const uint8_t *band_translate[2]) {
   const int hbs = num_8x8_blocks_wide_lookup[bsize] / 2;
   PARTITION_TYPE partition;
   BLOCK_SIZE subsize;
@@ -486,33 +506,37 @@
   partition = read_partition(cm, xd, hbs, mi_row, mi_col, bsize, r);
   subsize = get_subsize(bsize, partition);
   if (subsize < BLOCK_8X8) {
-    decode_modes_b(cm, xd, tile, mi_row, mi_col, r, subsize, token_cache);
+    decode_modes_b(cm, xd, tile, mi_row, mi_col, r, subsize, token_cache,
+                   band_translate);
   } else {
     switch (partition) {
       case PARTITION_NONE:
-        decode_modes_b(cm, xd, tile, mi_row, mi_col, r, subsize, token_cache);
+        decode_modes_b(cm, xd, tile, mi_row, mi_col, r, subsize, token_cache,
+                       band_translate);
         break;
       case PARTITION_HORZ:
-        decode_modes_b(cm, xd, tile, mi_row, mi_col, r, subsize, token_cache);
+        decode_modes_b(cm, xd, tile, mi_row, mi_col, r, subsize, token_cache,
+                       band_translate);
         if (mi_row + hbs < cm->mi_rows)
           decode_modes_b(cm, xd, tile, mi_row + hbs, mi_col, r, subsize,
-                         token_cache);
+                         token_cache, band_translate);
         break;
       case PARTITION_VERT:
-        decode_modes_b(cm, xd, tile, mi_row, mi_col, r, subsize, token_cache);
+        decode_modes_b(cm, xd, tile, mi_row, mi_col, r, subsize, token_cache,
+                       band_translate);
         if (mi_col + hbs < cm->mi_cols)
           decode_modes_b(cm, xd, tile, mi_row, mi_col + hbs, r, subsize,
-                         token_cache);
+                         token_cache, band_translate);
         break;
       case PARTITION_SPLIT:
         decode_modes_sb(cm, xd, tile, mi_row, mi_col, r, subsize,
-                        token_cache);
+                        token_cache, band_translate);
         decode_modes_sb(cm, xd, tile, mi_row, mi_col + hbs, r, subsize,
-                        token_cache);
+                        token_cache, band_translate);
         decode_modes_sb(cm, xd, tile, mi_row + hbs, mi_col, r, subsize,
-                        token_cache);
+                        token_cache, band_translate);
         decode_modes_sb(cm, xd, tile, mi_row + hbs, mi_col + hbs, r, subsize,
-                        token_cache);
+                        token_cache, band_translate);
         break;
       default:
         assert(!"Invalid partition type");
@@ -795,9 +819,13 @@
     vp9_zero(xd->left_context);
     vp9_zero(xd->left_seg_context);
     for (mi_col = tile->mi_col_start; mi_col < tile->mi_col_end;
-         mi_col += MI_BLOCK_SIZE)
+         mi_col += MI_BLOCK_SIZE) {
+      const uint8_t *band_translate[2] = {
+        vp9_coefband_trans_4x4, pbi->coefband_trans_8x8plus
+      };
       decode_modes_sb(cm, xd, tile, mi_row, mi_col, r, BLOCK_64X64,
-                      pbi->token_cache);
+                      pbi->token_cache, band_translate);
+    }
 
     if (pbi->do_loopfilter_inline) {
       const int lf_start = mi_row - MI_BLOCK_SIZE;
@@ -931,8 +959,21 @@
   return end;
 }
 
+static void setup_tile_macroblockd(TileWorkerData *const tile_data) {
+  MACROBLOCKD *xd = &tile_data->xd;
+  struct macroblockd_plane *const pd = xd->plane;
+  int i;
+
+  for (i = 0; i < MAX_MB_PLANE; ++i) {
+    pd[i].qcoeff  = tile_data->qcoeff[i];
+    pd[i].dqcoeff = tile_data->dqcoeff[i];
+    pd[i].eobs    = tile_data->eobs[i];
+    vpx_memset(xd->plane[i].dqcoeff, 0, 64 * 64 * sizeof(int16_t));
+  }
+}
+
 static int tile_worker_hook(void *arg1, void *arg2) {
-  TileWorkerData *tile_data = (TileWorkerData*)arg1;
+  TileWorkerData *const tile_data = (TileWorkerData*)arg1;
   const TileInfo *const tile = (TileInfo*)arg2;
   int mi_row, mi_col;
 
@@ -944,7 +985,8 @@
          mi_col += MI_BLOCK_SIZE) {
       decode_modes_sb(tile_data->cm, &tile_data->xd, tile,
                       mi_row, mi_col, &tile_data->bit_reader, BLOCK_64X64,
-                      tile_data->token_cache);
+                      tile_data->token_cache,
+                      tile_data->band_translate);
     }
   }
   return !tile_data->xd.corrupted;
@@ -1003,11 +1045,14 @@
       tile_data->cm = cm;
       tile_data->xd = pbi->mb;
       tile_data->xd.corrupted = 0;
+      tile_data->band_translate[0] = vp9_coefband_trans_4x4;
+      tile_data->band_translate[1] = pbi->coefband_trans_8x8plus;
       vp9_tile_init(tile, tile_data->cm, 0, tile_col);
 
       setup_token_decoder(data, data_end, size, &cm->error,
                           &tile_data->bit_reader);
       setup_tile_context(pbi, &tile_data->xd, 0, tile_col);
+      setup_tile_macroblockd(tile_data);
 
       worker->had_error = 0;
       if (i == num_workers - 1 || tile_col == tile_cols - 1) {
@@ -1282,6 +1327,13 @@
   const int tile_cols = 1 << cm->log2_tile_cols;
   YV12_BUFFER_CONFIG *const new_fb = get_frame_new_buffer(cm);
 
+  vpx_memset(pbi->coefband_trans_8x8plus,
+             (COEF_BANDS - 1),
+             sizeof(pbi->coefband_trans_8x8plus));
+  vpx_memcpy(pbi->coefband_trans_8x8plus,
+             vp9_coefband_trans_8x8plus,
+             sizeof(vp9_coefband_trans_8x8plus));
+
   if (!first_partition_size) {
       // showing a frame directly
       *p_data_end = data + 1;
@@ -1309,7 +1361,6 @@
 
   alloc_tile_storage(pbi, tile_rows, tile_cols);
 
-  xd->mi_8x8 = cm->mi_grid_visible;
   xd->mode_info_stride = cm->mode_info_stride;
   set_prev_mi(cm);
 
@@ -1319,7 +1370,7 @@
   cm->fc = cm->frame_contexts[cm->frame_context_idx];
   vp9_zero(cm->counts);
   for (i = 0; i < MAX_MB_PLANE; ++i)
-    vp9_zero(xd->plane[i].dqcoeff);
+    vpx_memset(xd->plane[i].dqcoeff, 0, 64 * 64 * sizeof(int16_t));
 
   xd->corrupted = 0;
   new_fb->corrupted = read_compressed_header(pbi, data, first_partition_size);
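With qcoeff/dqcoeff/eobs turned into pointers in macroblockd_plane, each tile worker binds its own aligned scratch before decoding, so parallel tiles never share coefficient buffers. A self-contained sketch of that binding, mirroring setup_tile_macroblockd() with illustrative types and names:

    /* Sketch only: bind per-tile coefficient scratch to the plane pointers. */
    #include <stdint.h>
    #include <string.h>

    #define SK_PLANES 3
    #define SK_COEFFS (64 * 64)

    struct sk_plane { int16_t *qcoeff, *dqcoeff; uint16_t *eobs; };
    struct sk_tile_scratch {
      int16_t  qcoeff[SK_PLANES][SK_COEFFS];
      int16_t  dqcoeff[SK_PLANES][SK_COEFFS];
      uint16_t eobs[SK_PLANES][256];
    };

    static void sk_bind_tile_scratch(struct sk_plane pd[SK_PLANES],
                                     struct sk_tile_scratch *ts) {
      int i;
      for (i = 0; i < SK_PLANES; ++i) {
        pd[i].qcoeff  = ts->qcoeff[i];
        pd[i].dqcoeff = ts->dqcoeff[i];
        pd[i].eobs    = ts->eobs[i];
        memset(pd[i].dqcoeff, 0, SK_COEFFS * sizeof(int16_t));
      }
    }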
diff --git a/vp9/decoder/vp9_detokenize.c b/vp9/decoder/vp9_detokenize.c
index b8d670b..d5ad303 100644
--- a/vp9/decoder/vp9_detokenize.c
+++ b/vp9/decoder/vp9_detokenize.c
@@ -93,7 +93,8 @@
                         vp9_reader *r, int block_idx,
                         PLANE_TYPE type, int seg_eob, int16_t *dqcoeff_ptr,
                         TX_SIZE tx_size, const int16_t *dq, int pt,
-                        uint8_t *token_cache) {
+                        uint8_t *token_cache,
+                        const uint8_t *band_translate) {
   const FRAME_CONTEXT *const fc = &cm->fc;
   FRAME_COUNTS *const counts = &cm->counts;
   const int ref = is_inter_block(&xd->mi_8x8[0]->mbmi);
@@ -108,31 +109,30 @@
   unsigned int (*eob_branch_count)[PREV_COEF_CONTEXTS] =
       counts->eob_branch[tx_size][type][ref];
   const int16_t *scan, *nb;
-  const uint8_t *const band_translate = get_band_translate(tx_size);
+  const uint8_t *cat6;
   get_scan(xd, tx_size, type, block_idx, &scan, &nb);
 
-  while (1) {
+  while (c < seg_eob) {
     int val;
-    const uint8_t *cat6 = cat6_prob;
-    if (c >= seg_eob)
-      break;
     if (c)
       pt = get_coef_context(nb, token_cache, c);
-    band = get_coef_band(band_translate, c);
+    band = *band_translate++;
     prob = coef_probs[band][pt];
     if (!cm->frame_parallel_decoding_mode)
       ++eob_branch_count[band][pt];
     if (!vp9_read(r, prob[EOB_CONTEXT_NODE]))
       break;
+    goto DECODE_ZERO;
 
   SKIP_START:
     if (c >= seg_eob)
       break;
     if (c)
       pt = get_coef_context(nb, token_cache, c);
-    band = get_coef_band(band_translate, c);
+    band = *band_translate++;
     prob = coef_probs[band][pt];
 
+  DECODE_ZERO:
     if (!vp9_read(r, prob[ZERO_CONTEXT_NODE])) {
       INCREMENT_COUNT(ZERO_TOKEN);
       token_cache[scan[c]] = vp9_pt_energy_class[ZERO_TOKEN];
@@ -200,6 +200,7 @@
       WRITE_COEF_CONTINUE(val, DCT_VAL_CATEGORY5);
     }
     val = 0;
+    cat6 = cat6_prob;
     while (*cat6) {
       val = (val << 1) | vp9_read(r, *cat6++);
     }
@@ -217,22 +218,19 @@
 
 int vp9_decode_block_tokens(VP9_COMMON *cm, MACROBLOCKD *xd,
                             int plane, int block, BLOCK_SIZE plane_bsize,
-                            TX_SIZE tx_size, vp9_reader *r,
-                            uint8_t *token_cache) {
+                            int x, int y, TX_SIZE tx_size, vp9_reader *r,
+                            uint8_t *token_cache,
+                            const uint8_t *band_translate[2]) {
   struct macroblockd_plane *const pd = &xd->plane[plane];
   const int seg_eob = get_tx_eob(&cm->seg, xd->mi_8x8[0]->mbmi.segment_id,
                                  tx_size);
-  int aoff, loff, eob, pt;
-  txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &aoff, &loff);
-  pt = get_entropy_context(tx_size, pd->above_context + aoff,
-                                    pd->left_context + loff);
-
-  eob = decode_coefs(cm, xd, r, block,
-                     pd->plane_type, seg_eob, BLOCK_OFFSET(pd->dqcoeff, block),
-                     tx_size, pd->dequant, pt, token_cache);
-
-  set_contexts(xd, pd, plane_bsize, tx_size, eob > 0, aoff, loff);
-
+  const int pt = get_entropy_context(tx_size, pd->above_context + x,
+                                              pd->left_context + y);
+  const int eob = decode_coefs(cm, xd, r, block, pd->plane_type, seg_eob,
+                               BLOCK_OFFSET(pd->dqcoeff, block), tx_size,
+                               pd->dequant, pt, token_cache,
+                               band_translate[tx_size != TX_4X4]);
+  set_contexts(xd, pd, plane_bsize, tx_size, eob > 0, x, y);
   pd->eobs[block] = eob;
   return eob;
 }
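
Note: the restructured decode_coefs() loop keeps a single seg_eob check in the while condition and uses the goto only to skip re-deriving band and context after the EOB probe: when the EOB node says the block continues, the zero-node read reuses the prob entry that was just looked up. The control-flow shape in isolation, with the arithmetic bit reader replaced by a plain token array (everything below is illustrative, not libvpx code):

#include <stdio.h>

/* Toy model of the loop shape after the change: the EOB probe happens
 * only at the top of the loop; once it fails (the block continues),
 * execution jumps straight to the zero-token probe.  -1 stands in for
 * an end-of-block token, 0 for a zero coefficient. */
static int count_nonzero(const int *tokens, int n) {
  int c = 0, nonzero = 0;
  while (c < n) {
    /* ...band/context would be derived here... */
    if (tokens[c] < 0)
      break;                     /* EOB: stop decoding this block        */
    goto decode_zero;

  skip_start:
    if (c >= n)
      break;
    /* ...band/context rederived here, but no EOB probe on this path... */

  decode_zero:
    if (tokens[c] == 0) {
      ++c;                       /* zero coefficient: keep scanning      */
      goto skip_start;
    }
    ++nonzero;                   /* non-zero coefficient decoded here    */
    ++c;
  }
  return nonzero;
}

int main(void) {
  const int tokens[] = { 3, 0, 0, 5, -1 };
  printf("%d\n", count_nonzero(tokens, 5));  /* prints 2 */
  return 0;
}
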
diff --git a/vp9/decoder/vp9_detokenize.h b/vp9/decoder/vp9_detokenize.h
index 04939ea..7522c97 100644
--- a/vp9/decoder/vp9_detokenize.h
+++ b/vp9/decoder/vp9_detokenize.h
@@ -17,7 +17,8 @@
 
 int vp9_decode_block_tokens(VP9_COMMON *cm, MACROBLOCKD *xd,
                             int plane, int block, BLOCK_SIZE plane_bsize,
-                            TX_SIZE tx_size, vp9_reader *r,
-                            uint8_t *token_cache);
+                            int x, int y, TX_SIZE tx_size, vp9_reader *r,
+                            uint8_t *token_cache,
+                            const uint8_t *band_translate[2]);
 
 #endif  // VP9_DECODER_VP9_DETOKENIZE_H_
diff --git a/vp9/decoder/vp9_onyxd_if.c b/vp9/decoder/vp9_onyxd_if.c
index 5f970a3..cb45d37 100644
--- a/vp9/decoder/vp9_onyxd_if.c
+++ b/vp9/decoder/vp9_onyxd_if.c
@@ -107,6 +107,18 @@
   }
 }
 
+static void init_macroblockd(VP9D_COMP *const pbi) {
+  MACROBLOCKD *xd = &pbi->mb;
+  struct macroblockd_plane *const pd = xd->plane;
+  int i;
+
+  for (i = 0; i < MAX_MB_PLANE; ++i) {
+    pd[i].qcoeff  = pbi->qcoeff[i];
+    pd[i].dqcoeff = pbi->dqcoeff[i];
+    pd[i].eobs    = pbi->eobs[i];
+  }
+}
+
 VP9D_PTR vp9_create_decompressor(VP9D_CONFIG *oxcf) {
   VP9D_COMP *const pbi = vpx_memalign(32, sizeof(VP9D_COMP));
   VP9_COMMON *const cm = pbi ? &pbi->common : NULL;
@@ -141,6 +153,8 @@
   cm->error.setjmp = 0;
   pbi->decoded_key_frame = 0;
 
+  init_macroblockd(pbi);
+
   vp9_worker_init(&pbi->lf_worker);
 
   return pbi;
diff --git a/vp9/decoder/vp9_onyxd_int.h b/vp9/decoder/vp9_onyxd_int.h
index 7c4c9db..e29b453 100644
--- a/vp9/decoder/vp9_onyxd_int.h
+++ b/vp9/decoder/vp9_onyxd_int.h
@@ -22,6 +22,10 @@
 
   DECLARE_ALIGNED(16, VP9_COMMON, common);
 
+  DECLARE_ALIGNED(16, int16_t,  qcoeff[MAX_MB_PLANE][64 * 64]);
+  DECLARE_ALIGNED(16, int16_t,  dqcoeff[MAX_MB_PLANE][64 * 64]);
+  DECLARE_ALIGNED(16, uint16_t, eobs[MAX_MB_PLANE][256]);
+
   VP9D_CONFIG oxcf;
 
   const uint8_t *source;
@@ -50,7 +54,8 @@
   ENTROPY_CONTEXT *above_context[MAX_MB_PLANE];
   PARTITION_CONTEXT *above_seg_context;
 
-  DECLARE_ALIGNED(16, unsigned char, token_cache[1024]);
+  DECLARE_ALIGNED(16, uint8_t, token_cache[1024]);
+  DECLARE_ALIGNED(16, uint8_t, coefband_trans_8x8plus[1024]);
 } VP9D_COMP;
 
 #endif  // VP9_DECODER_VP9_ONYXD_INT_H_
diff --git a/vp9/encoder/vp9_bitstream.c b/vp9/encoder/vp9_bitstream.c
index e1978d4..efbadba 100644
--- a/vp9/encoder/vp9_bitstream.c
+++ b/vp9/encoder/vp9_bitstream.c
@@ -189,12 +189,14 @@
 static void write_selected_tx_size(const VP9_COMP *cpi, MODE_INFO *m,
                                    TX_SIZE tx_size, BLOCK_SIZE bsize,
                                    vp9_writer *w) {
+  const TX_SIZE max_tx_size = max_txsize_lookup[bsize];
   const MACROBLOCKD *const xd = &cpi->mb.e_mbd;
-  const vp9_prob *tx_probs = get_tx_probs2(xd, &cpi->common.fc.tx_probs, m);
+  const vp9_prob *const tx_probs = get_tx_probs2(max_tx_size, xd,
+                                                 &cpi->common.fc.tx_probs);
   vp9_write(w, tx_size != TX_4X4, tx_probs[0]);
-  if (bsize >= BLOCK_16X16 && tx_size != TX_4X4) {
+  if (tx_size != TX_4X4 && max_tx_size >= TX_16X16) {
     vp9_write(w, tx_size != TX_8X8, tx_probs[1]);
-    if (bsize >= BLOCK_32X32 && tx_size != TX_8X8)
+    if (tx_size != TX_8X8 && max_tx_size >= TX_32X32)
       vp9_write(w, tx_size != TX_16X16, tx_probs[2]);
   }
 }
@@ -256,15 +258,15 @@
   }
 }
 
-static void pack_mb_tokens(vp9_writer* const bc,
+static void pack_mb_tokens(vp9_writer* const w,
                            TOKENEXTRA **tp,
                            const TOKENEXTRA *const stop) {
   TOKENEXTRA *p = *tp;
 
   while (p < stop && p->token != EOSB_TOKEN) {
     const int t = p->token;
-    const struct vp9_token *const a = vp9_coef_encodings + t;
-    const vp9_extra_bit *const b = vp9_extra_bits + t;
+    const struct vp9_token *const a = &vp9_coef_encodings[t];
+    const vp9_extra_bit *const b = &vp9_extra_bits[t];
     int i = 0;
     const vp9_prob *pp;
     int v = a->value;
@@ -287,7 +289,7 @@
 
     do {
       const int bb = (v >> --n) & 1;
-      vp9_write(bc, bb, pp[i >> 1]);
+      vp9_write(w, bb, pp[i >> 1]);
       i = vp9_coef_tree[i + bb];
     } while (n);
 
@@ -302,12 +304,12 @@
 
         do {
           const int bb = (v >> --n) & 1;
-          vp9_write(bc, bb, pb[i >> 1]);
+          vp9_write(w, bb, pb[i >> 1]);
           i = b->tree[i + bb];
         } while (n);
       }
 
-      vp9_write_bit(bc, e & 1);
+      vp9_write_bit(w, e & 1);
     }
     ++p;
   }
@@ -543,37 +545,33 @@
 }
 
 static void write_modes_b(VP9_COMP *cpi, const TileInfo *const tile,
-                          MODE_INFO **mi_8x8, vp9_writer *bc,
-                          TOKENEXTRA **tok, TOKENEXTRA *tok_end,
-                          int mi_row, int mi_col, int index) {
+                          vp9_writer *w, TOKENEXTRA **tok, TOKENEXTRA *tok_end,
+                          int mi_row, int mi_col) {
   VP9_COMMON *const cm = &cpi->common;
   MACROBLOCKD *const xd = &cpi->mb.e_mbd;
-  MODE_INFO *m = mi_8x8[0];
+  MODE_INFO *m;
 
-  if (m->mbmi.sb_type < BLOCK_8X8)
-    if (index > 0)
-      return;
-
-  xd->mi_8x8 = mi_8x8;
+  xd->mi_8x8 = cm->mi_grid_visible + (mi_row * cm->mode_info_stride + mi_col);
+  m = xd->mi_8x8[0];
 
   set_mi_row_col(xd, tile,
                  mi_row, num_8x8_blocks_high_lookup[m->mbmi.sb_type],
                  mi_col, num_8x8_blocks_wide_lookup[m->mbmi.sb_type],
                  cm->mi_rows, cm->mi_cols);
   if (frame_is_intra_only(cm)) {
-    write_mb_modes_kf(cpi, mi_8x8, bc);
+    write_mb_modes_kf(cpi, xd->mi_8x8, w);
 #ifdef ENTROPY_STATS
     active_section = 8;
 #endif
   } else {
-    pack_inter_mode_mvs(cpi, m, bc);
+    pack_inter_mode_mvs(cpi, m, w);
 #ifdef ENTROPY_STATS
     active_section = 1;
 #endif
   }
 
   assert(*tok < tok_end);
-  pack_mb_tokens(bc, tok, tok_end);
+  pack_mb_tokens(w, tok, tok_end);
 }
 
 static void write_partition(VP9_COMP *cpi, int hbs, int mi_row, int mi_col,
@@ -600,59 +598,50 @@
 }
 
 static void write_modes_sb(VP9_COMP *cpi, const TileInfo *const tile,
-                           MODE_INFO **mi_8x8, vp9_writer *bc,
-                           TOKENEXTRA **tok, TOKENEXTRA *tok_end,
-                           int mi_row, int mi_col, BLOCK_SIZE bsize,
-                           int index) {
+                           vp9_writer *w, TOKENEXTRA **tok, TOKENEXTRA *tok_end,
+                           int mi_row, int mi_col, BLOCK_SIZE bsize) {
   VP9_COMMON *const cm = &cpi->common;
-  const int mis = cm->mode_info_stride;
-  int bsl = b_width_log2(bsize);
-  int bs = (1 << bsl) / 4;  // mode_info step for subsize
-  int n;
-  PARTITION_TYPE partition = PARTITION_NONE;
+  const int bsl = b_width_log2(bsize);
+  const int bs = (1 << bsl) / 4;
+  PARTITION_TYPE partition;
   BLOCK_SIZE subsize;
-  MODE_INFO *m = mi_8x8[0];
+  MODE_INFO *m = cm->mi_grid_visible[mi_row * cm->mode_info_stride + mi_col];
 
   if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols)
     return;
 
   partition = partition_lookup[bsl][m->mbmi.sb_type];
-
-  if (bsize < BLOCK_8X8) {
-    if (index > 0)
-      return;
-  } else {
-    write_partition(cpi, bs, mi_row, mi_col, partition, bsize, bc);
-  }
-
+  write_partition(cpi, bs, mi_row, mi_col, partition, bsize, w);
   subsize = get_subsize(bsize, partition);
-
-  switch (partition) {
-    case PARTITION_NONE:
-      write_modes_b(cpi, tile, mi_8x8, bc, tok, tok_end, mi_row, mi_col, 0);
-      break;
-    case PARTITION_HORZ:
-      write_modes_b(cpi, tile, mi_8x8, bc, tok, tok_end, mi_row, mi_col, 0);
-      if ((mi_row + bs) < cm->mi_rows)
-        write_modes_b(cpi, tile, mi_8x8 + bs * mis, bc, tok, tok_end,
-                      mi_row + bs, mi_col, 1);
-      break;
-    case PARTITION_VERT:
-      write_modes_b(cpi, tile, mi_8x8, bc, tok, tok_end, mi_row, mi_col, 0);
-      if ((mi_col + bs) < cm->mi_cols)
-        write_modes_b(cpi, tile, mi_8x8 + bs, bc, tok, tok_end,
-                      mi_row, mi_col + bs, 1);
-      break;
-    case PARTITION_SPLIT:
-      for (n = 0; n < 4; n++) {
-        const int j = n >> 1, i = n & 1;
-        write_modes_sb(cpi, tile, mi_8x8 + j * bs * mis + i * bs, bc,
-                       tok, tok_end,
-                       mi_row + j * bs, mi_col + i * bs, subsize, n);
-      }
-      break;
-    default:
-      assert(0);
+  if (subsize < BLOCK_8X8) {
+    write_modes_b(cpi, tile, w, tok, tok_end, mi_row, mi_col);
+  } else {
+    switch (partition) {
+      case PARTITION_NONE:
+        write_modes_b(cpi, tile, w, tok, tok_end, mi_row, mi_col);
+        break;
+      case PARTITION_HORZ:
+        write_modes_b(cpi, tile, w, tok, tok_end, mi_row, mi_col);
+        if (mi_row + bs < cm->mi_rows)
+          write_modes_b(cpi, tile, w, tok, tok_end, mi_row + bs, mi_col);
+        break;
+      case PARTITION_VERT:
+        write_modes_b(cpi, tile, w, tok, tok_end, mi_row, mi_col);
+        if (mi_col + bs < cm->mi_cols)
+          write_modes_b(cpi, tile, w, tok, tok_end, mi_row, mi_col + bs);
+        break;
+      case PARTITION_SPLIT:
+        write_modes_sb(cpi, tile, w, tok, tok_end, mi_row, mi_col, subsize);
+        write_modes_sb(cpi, tile, w, tok, tok_end, mi_row, mi_col + bs,
+                       subsize);
+        write_modes_sb(cpi, tile, w, tok, tok_end, mi_row + bs, mi_col,
+                       subsize);
+        write_modes_sb(cpi, tile, w, tok, tok_end, mi_row + bs, mi_col + bs,
+                       subsize);
+        break;
+      default:
+        assert(0);
+    }
   }
 
   // update partition context
@@ -663,25 +652,15 @@
 }
 
 static void write_modes(VP9_COMP *cpi, const TileInfo *const tile,
-                        vp9_writer* const bc,
-                        TOKENEXTRA **tok, TOKENEXTRA *tok_end) {
-  VP9_COMMON *const cm = &cpi->common;
-  const int mis = cm->mode_info_stride;
+                        vp9_writer *w, TOKENEXTRA **tok, TOKENEXTRA *tok_end) {
   int mi_row, mi_col;
-  MODE_INFO **mi_8x8 = cm->mi_grid_visible;
-  MODE_INFO **m_8x8;
-
-  mi_8x8 += tile->mi_col_start + tile->mi_row_start * mis;
 
   for (mi_row = tile->mi_row_start; mi_row < tile->mi_row_end;
-       mi_row += 8, mi_8x8 += 8 * mis) {
-    m_8x8 = mi_8x8;
-    vp9_zero(cpi->left_seg_context);
+       mi_row += MI_BLOCK_SIZE) {
+    vp9_zero(cpi->left_seg_context);
     for (mi_col = tile->mi_col_start; mi_col < tile->mi_col_end;
-         mi_col += MI_BLOCK_SIZE, m_8x8 += MI_BLOCK_SIZE) {
-      write_modes_sb(cpi, tile, m_8x8, bc, tok, tok_end, mi_row, mi_col,
-                     BLOCK_64X64, 0);
-    }
+         mi_col += MI_BLOCK_SIZE)
+      write_modes_sb(cpi, tile, w, tok, tok_end, mi_row, mi_col, BLOCK_64X64);
   }
 }
 
diff --git a/vp9/encoder/vp9_block.h b/vp9/encoder/vp9_block.h
index 8033a4d..6427f7f 100644
--- a/vp9/encoder/vp9_block.h
+++ b/vp9/encoder/vp9_block.h
@@ -27,6 +27,18 @@
 typedef struct {
   MODE_INFO mic;
   uint8_t *zcoeff_blk;
+  int16_t *coeff[MAX_MB_PLANE][2];
+  int16_t *qcoeff[MAX_MB_PLANE][2];
+  int16_t *dqcoeff[MAX_MB_PLANE][2];
+  uint16_t *eobs[MAX_MB_PLANE][2];
+
+  // dual buffer pointers, 0: in use, 1: best in store
+  int16_t *coeff_pbuf[MAX_MB_PLANE][2];
+  int16_t *qcoeff_pbuf[MAX_MB_PLANE][2];
+  int16_t *dqcoeff_pbuf[MAX_MB_PLANE][2];
+  uint16_t *eobs_pbuf[MAX_MB_PLANE][2];
+
+  int is_coded;
   int num_4x4_blk;
   int skip;
   int_mv best_ref_mv;
@@ -57,7 +69,7 @@
 
 struct macroblock_plane {
   DECLARE_ALIGNED(16, int16_t, src_diff[64 * 64]);
-  DECLARE_ALIGNED(16, int16_t, coeff[64 * 64]);
+  int16_t *coeff;
   struct buf_2d src;
 
   // Quantizer setings
@@ -81,6 +93,9 @@
 
   MACROBLOCKD e_mbd;
   int skip_block;
+  int select_txfm_size;
+  int skip_optimize;
+  int q_index;
 
   search_site *ss;
   int ss_count;
@@ -120,6 +135,11 @@
   int y_mode_costs[INTRA_MODES][INTRA_MODES][INTRA_MODES];
   int switchable_interp_costs[SWITCHABLE_FILTER_CONTEXTS][SWITCHABLE_FILTERS];
 
+  unsigned char sb_index;   // index of 32x32 block inside the 64x64 block
+  unsigned char mb_index;   // index of 16x16 block inside the 32x32 block
+  unsigned char b_index;    // index of 8x8 block inside the 16x16 block
+  unsigned char ab_index;   // index of 4x4 block inside the 8x8 block
+
   // These define limits to motion vector components to prevent them
   // from extending outside the UMV borders
   int mv_col_min;
@@ -173,41 +193,42 @@
   BLOCK_SIZE sb64_partitioning;
 
   void (*fwd_txm4x4)(const int16_t *input, int16_t *output, int stride);
+
+  // band cache
+  DECLARE_ALIGNED(16, uint8_t, coefband_trans_8x8plus[1024]);
 };
 
 // TODO(jingning): the variables used here are little complicated. need further
 // refactoring on organizing the temporary buffers, when recursive
 // partition down to 4x4 block size is enabled.
 static PICK_MODE_CONTEXT *get_block_context(MACROBLOCK *x, BLOCK_SIZE bsize) {
-  MACROBLOCKD *const xd = &x->e_mbd;
-
   switch (bsize) {
     case BLOCK_64X64:
       return &x->sb64_context;
     case BLOCK_64X32:
-      return &x->sb64x32_context[xd->sb_index];
+      return &x->sb64x32_context[x->sb_index];
     case BLOCK_32X64:
-      return &x->sb32x64_context[xd->sb_index];
+      return &x->sb32x64_context[x->sb_index];
     case BLOCK_32X32:
-      return &x->sb32_context[xd->sb_index];
+      return &x->sb32_context[x->sb_index];
     case BLOCK_32X16:
-      return &x->sb32x16_context[xd->sb_index][xd->mb_index];
+      return &x->sb32x16_context[x->sb_index][x->mb_index];
     case BLOCK_16X32:
-      return &x->sb16x32_context[xd->sb_index][xd->mb_index];
+      return &x->sb16x32_context[x->sb_index][x->mb_index];
     case BLOCK_16X16:
-      return &x->mb_context[xd->sb_index][xd->mb_index];
+      return &x->mb_context[x->sb_index][x->mb_index];
     case BLOCK_16X8:
-      return &x->sb16x8_context[xd->sb_index][xd->mb_index][xd->b_index];
+      return &x->sb16x8_context[x->sb_index][x->mb_index][x->b_index];
     case BLOCK_8X16:
-      return &x->sb8x16_context[xd->sb_index][xd->mb_index][xd->b_index];
+      return &x->sb8x16_context[x->sb_index][x->mb_index][x->b_index];
     case BLOCK_8X8:
-      return &x->sb8x8_context[xd->sb_index][xd->mb_index][xd->b_index];
+      return &x->sb8x8_context[x->sb_index][x->mb_index][x->b_index];
     case BLOCK_8X4:
-      return &x->sb8x4_context[xd->sb_index][xd->mb_index][xd->b_index];
+      return &x->sb8x4_context[x->sb_index][x->mb_index][x->b_index];
     case BLOCK_4X8:
-      return &x->sb4x8_context[xd->sb_index][xd->mb_index][xd->b_index];
+      return &x->sb4x8_context[x->sb_index][x->mb_index][x->b_index];
     case BLOCK_4X4:
-      return &x->ab4x4_context[xd->sb_index][xd->mb_index][xd->b_index];
+      return &x->ab4x4_context[x->sb_index][x->mb_index][x->b_index];
     default:
       assert(0);
       return NULL;
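
Note: the sb_index/mb_index/b_index/ab_index recursion indices now live on the encoder-side MACROBLOCK rather than on MACROBLOCKD, since only the encoder's partition search uses them; get_block_context() walks them to select one PICK_MODE_CONTEXT per block size and position (four 32x32 per 64x64, four 16x16 per 32x32, and so on). A hypothetical flattened index over that nesting, not part of the patch, just to make the hierarchy concrete:

/* Hypothetical helper: flatten the four 2-bit recursion indices into one
 * slot, mirroring the nesting that get_block_context() walks via
 * x->sb_index / x->mb_index / x->b_index / x->ab_index. */
static int flat_block_slot(int sb_index, int mb_index,
                           int b_index, int ab_index) {
  return ((sb_index * 4 + mb_index) * 4 + b_index) * 4 + ab_index;  /* 0..255 */
}
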
diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c
index 702fc70..e9b68cc 100644
--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c
@@ -50,25 +50,25 @@
 int enc_debug = 0;
 #endif
 
-static INLINE uint8_t *get_sb_index(MACROBLOCKD *xd, BLOCK_SIZE subsize) {
+static INLINE uint8_t *get_sb_index(MACROBLOCK *x, BLOCK_SIZE subsize) {
   switch (subsize) {
     case BLOCK_64X64:
     case BLOCK_64X32:
     case BLOCK_32X64:
     case BLOCK_32X32:
-      return &xd->sb_index;
+      return &x->sb_index;
     case BLOCK_32X16:
     case BLOCK_16X32:
     case BLOCK_16X16:
-      return &xd->mb_index;
+      return &x->mb_index;
     case BLOCK_16X8:
     case BLOCK_8X16:
     case BLOCK_8X8:
-      return &xd->b_index;
+      return &x->b_index;
     case BLOCK_8X4:
     case BLOCK_4X8:
     case BLOCK_4X4:
-      return &xd->ab_index;
+      return &x->ab_index;
     default:
       assert(0);
       return NULL;
@@ -367,6 +367,8 @@
   VP9_COMMON *const cm = &cpi->common;
   MACROBLOCK *const x = &cpi->mb;
   MACROBLOCKD *const xd = &x->e_mbd;
+  struct macroblock_plane *const p = x->plane;
+  struct macroblockd_plane *const pd = xd->plane;
   MODE_INFO *mi = &ctx->mic;
   MB_MODE_INFO *const mbmi = &xd->mi_8x8[0]->mbmi;
   MODE_INFO *mi_addr = xd->mi_8x8[0];
@@ -383,6 +385,13 @@
 
   *mi_addr = *mi;
 
+  for (i = 0; i < MAX_MB_PLANE; ++i) {
+    p[i].coeff = ctx->coeff_pbuf[i][1];
+    pd[i].qcoeff = ctx->qcoeff_pbuf[i][1];
+    pd[i].dqcoeff = ctx->dqcoeff_pbuf[i][1];
+    pd[i].eobs = ctx->eobs_pbuf[i][1];
+  }
+
   // Restore the coding context of the MB to that that was in place
   // when the mode was picked for it
   for (y = 0; y < mi_height; y++)
@@ -578,6 +587,9 @@
   VP9_COMMON *const cm = &cpi->common;
   MACROBLOCK *const x = &cpi->mb;
   MACROBLOCKD *const xd = &x->e_mbd;
+  struct macroblock_plane *const p = x->plane;
+  struct macroblockd_plane *const pd = xd->plane;
+  int i;
   int orig_rdmult = x->rdmult;
   double rdmult_ratio;
 
@@ -590,7 +602,7 @@
   if (bsize < BLOCK_8X8) {
     // When ab_index = 0 all sub-blocks are handled, so for ab_index != 0
     // there is nothing to be done.
-    if (xd->ab_index != 0) {
+    if (x->ab_index != 0) {
       *totalrate = 0;
       *totaldist = 0;
       return;
@@ -600,6 +612,14 @@
   set_offsets(cpi, tile, mi_row, mi_col, bsize);
   xd->mi_8x8[0]->mbmi.sb_type = bsize;
 
+  for (i = 0; i < MAX_MB_PLANE; ++i) {
+    p[i].coeff = ctx->coeff_pbuf[i][0];
+    pd[i].qcoeff = ctx->qcoeff_pbuf[i][0];
+    pd[i].dqcoeff = ctx->dqcoeff_pbuf[i][0];
+    pd[i].eobs = ctx->eobs_pbuf[i][0];
+  }
+  ctx->is_coded = 0;
+
   // Set to zero to make sure we do not use the previous encoded frame stats
   xd->mi_8x8[0]->mbmi.skip_coeff = 0;
 
@@ -687,16 +707,15 @@
 }
 
 static BLOCK_SIZE *get_sb_partitioning(MACROBLOCK *x, BLOCK_SIZE bsize) {
-  MACROBLOCKD *const xd = &x->e_mbd;
   switch (bsize) {
     case BLOCK_64X64:
       return &x->sb64_partitioning;
     case BLOCK_32X32:
-      return &x->sb_partitioning[xd->sb_index];
+      return &x->sb_partitioning[x->sb_index];
     case BLOCK_16X16:
-      return &x->mb_partitioning[xd->sb_index][xd->mb_index];
+      return &x->mb_partitioning[x->sb_index][x->mb_index];
     case BLOCK_8X8:
-      return &x->b_partitioning[xd->sb_index][xd->mb_index][xd->b_index];
+      return &x->b_partitioning[x->sb_index][x->mb_index][x->b_index];
     default:
       assert(0);
       return NULL;
@@ -769,20 +788,19 @@
 static void encode_b(VP9_COMP *cpi, const TileInfo *const tile,
                      TOKENEXTRA **tp, int mi_row, int mi_col,
                      int output_enabled, BLOCK_SIZE bsize, int sub_index) {
-  VP9_COMMON * const cm = &cpi->common;
-  MACROBLOCK * const x = &cpi->mb;
-  MACROBLOCKD * const xd = &x->e_mbd;
+  VP9_COMMON *const cm = &cpi->common;
+  MACROBLOCK *const x = &cpi->mb;
 
   if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols)
     return;
 
   if (sub_index != -1)
-    *get_sb_index(xd, bsize) = sub_index;
+    *get_sb_index(x, bsize) = sub_index;
 
   if (bsize < BLOCK_8X8) {
     // When ab_index = 0 all sub-blocks are handled, so for ab_index != 0
     // there is nothing to be done.
-    if (xd->ab_index > 0)
+    if (x->ab_index > 0)
       return;
   }
   set_offsets(cpi, tile, mi_row, mi_col, bsize);
@@ -800,9 +818,8 @@
 static void encode_sb(VP9_COMP *cpi, const TileInfo *const tile,
                       TOKENEXTRA **tp, int mi_row, int mi_col,
                       int output_enabled, BLOCK_SIZE bsize) {
-  VP9_COMMON * const cm = &cpi->common;
-  MACROBLOCK * const x = &cpi->mb;
-  MACROBLOCKD * const xd = &x->e_mbd;
+  VP9_COMMON *const cm = &cpi->common;
+  MACROBLOCK *const x = &cpi->mb;
   BLOCK_SIZE c1 = BLOCK_8X8;
   const int bsl = b_width_log2(bsize), bs = (1 << bsl) / 4;
   int pl = 0;
@@ -848,7 +865,7 @@
       for (i = 0; i < 4; i++) {
         const int x_idx = i & 1, y_idx = i >> 1;
 
-        *get_sb_index(xd, subsize) = i;
+        *get_sb_index(x, subsize) = i;
         encode_sb(cpi, tile, tp, mi_row + y_idx * bs, mi_col + x_idx * bs,
                   output_enabled, subsize);
       }
@@ -975,9 +992,8 @@
                              TOKENEXTRA **tp, int mi_row, int mi_col,
                              BLOCK_SIZE bsize, int *rate, int64_t *dist,
                              int do_recon) {
-  VP9_COMMON * const cm = &cpi->common;
-  MACROBLOCK * const x = &cpi->mb;
-  MACROBLOCKD *xd = &cpi->mb.e_mbd;
+  VP9_COMMON *const cm = &cpi->common;
+  MACROBLOCK *const x = &cpi->mb;
   const int mis = cm->mode_info_stride;
   int bsl = b_width_log2(bsize);
   const int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[bsize];
@@ -1012,7 +1028,7 @@
   if (bsize < BLOCK_8X8) {
     // When ab_index = 0 all sub-blocks are handled, so for ab_index != 0
     // there is nothing to be done.
-    if (xd->ab_index != 0) {
+    if (x->ab_index != 0) {
       *rate = 0;
       *dist = 0;
       return;
@@ -1070,7 +1086,7 @@
                     bsize, get_block_context(x, bsize), INT64_MAX);
       break;
     case PARTITION_HORZ:
-      *get_sb_index(xd, subsize) = 0;
+      *get_sb_index(x, subsize) = 0;
       pick_sb_modes(cpi, tile, mi_row, mi_col, &last_part_rate, &last_part_dist,
                     subsize, get_block_context(x, subsize), INT64_MAX);
       if (last_part_rate != INT_MAX &&
@@ -1079,7 +1095,7 @@
         int64_t dt = 0;
         update_state(cpi, get_block_context(x, subsize), subsize, 0);
         encode_superblock(cpi, tp, 0, mi_row, mi_col, subsize);
-        *get_sb_index(xd, subsize) = 1;
+        *get_sb_index(x, subsize) = 1;
         pick_sb_modes(cpi, tile, mi_row + (ms >> 1), mi_col, &rt, &dt, subsize,
                       get_block_context(x, subsize), INT64_MAX);
         if (rt == INT_MAX || dt == INT_MAX) {
@@ -1093,7 +1109,7 @@
       }
       break;
     case PARTITION_VERT:
-      *get_sb_index(xd, subsize) = 0;
+      *get_sb_index(x, subsize) = 0;
       pick_sb_modes(cpi, tile, mi_row, mi_col, &last_part_rate, &last_part_dist,
                     subsize, get_block_context(x, subsize), INT64_MAX);
       if (last_part_rate != INT_MAX &&
@@ -1102,7 +1118,7 @@
         int64_t dt = 0;
         update_state(cpi, get_block_context(x, subsize), subsize, 0);
         encode_superblock(cpi, tp, 0, mi_row, mi_col, subsize);
-        *get_sb_index(xd, subsize) = 1;
+        *get_sb_index(x, subsize) = 1;
         pick_sb_modes(cpi, tile, mi_row, mi_col + (ms >> 1), &rt, &dt, subsize,
                       get_block_context(x, subsize), INT64_MAX);
         if (rt == INT_MAX || dt == INT_MAX) {
@@ -1128,7 +1144,7 @@
         if ((mi_row + y_idx >= cm->mi_rows) || (mi_col + x_idx >= cm->mi_cols))
           continue;
 
-        *get_sb_index(xd, subsize) = i;
+        *get_sb_index(x, subsize) = i;
 
         rd_use_partition(cpi, tile, mi_8x8 + jj * bss * mis + ii * bss, tp,
                          mi_row + y_idx, mi_col + x_idx, subsize, &rt, &dt,
@@ -1169,11 +1185,10 @@
       ENTROPY_CONTEXT l[16 * MAX_MB_PLANE], a[16 * MAX_MB_PLANE];
       PARTITION_CONTEXT sl[8], sa[8];
 
-      if ((mi_row + y_idx >= cm->mi_rows)
-          || (mi_col + x_idx >= cm->mi_cols))
+      if ((mi_row + y_idx >= cm->mi_rows) || (mi_col + x_idx >= cm->mi_cols))
         continue;
 
-      *get_sb_index(xd, split_subsize) = i;
+      *get_sb_index(x, split_subsize) = i;
       *get_sb_partitioning(x, bsize) = split_subsize;
       *get_sb_partitioning(x, split_subsize) = split_subsize;
 
@@ -1353,7 +1368,6 @@
 static void compute_fast_motion_search_level(VP9_COMP *cpi, BLOCK_SIZE bsize) {
   VP9_COMMON *const cm = &cpi->common;
   MACROBLOCK *const x = &cpi->mb;
-  MACROBLOCKD *const xd = &x->e_mbd;
 
   // Only use 8x8 result for non HD videos.
   // int use_8x8 = (MIN(cpi->common.width, cpi->common.height) < 720) ? 1 : 0;
@@ -1366,9 +1380,9 @@
     PICK_MODE_CONTEXT *block_context = NULL;
 
     if (bsize == BLOCK_16X16) {
-      block_context = x->sb8x8_context[xd->sb_index][xd->mb_index];
+      block_context = x->sb8x8_context[x->sb_index][x->mb_index];
     } else if (bsize == BLOCK_32X32) {
-      block_context = x->mb_context[xd->sb_index];
+      block_context = x->mb_context[x->sb_index];
     } else if (bsize == BLOCK_64X64) {
       block_context = x->sb32_context;
     }
@@ -1456,9 +1470,8 @@
                               TOKENEXTRA **tp, int mi_row,
                               int mi_col, BLOCK_SIZE bsize, int *rate,
                               int64_t *dist, int do_recon, int64_t best_rd) {
-  VP9_COMMON * const cm = &cpi->common;
-  MACROBLOCK * const x = &cpi->mb;
-  MACROBLOCKD * const xd = &x->e_mbd;
+  VP9_COMMON *const cm = &cpi->common;
+  MACROBLOCK *const x = &cpi->mb;
   const int ms = num_8x8_blocks_wide_lookup[bsize] / 2;
   ENTROPY_CONTEXT l[16 * MAX_MB_PLANE], a[16 * MAX_MB_PLANE];
   PARTITION_CONTEXT sl[8], sa[8];
@@ -1484,7 +1497,7 @@
   if (bsize < BLOCK_8X8) {
     // When ab_index = 0 all sub-blocks are handled, so for ab_index != 0
     // there is nothing to be done.
-    if (xd->ab_index != 0) {
+    if (x->ab_index != 0) {
       *rate = 0;
       *dist = 0;
       return;
@@ -1582,7 +1595,7 @@
       if (mi_row + y_idx >= cm->mi_rows || mi_col + x_idx >= cm->mi_cols)
         continue;
 
-      *get_sb_index(xd, subsize) = i;
+      *get_sb_index(x, subsize) = i;
       if (cpi->sf.adaptive_motion_search)
         load_pred_mv(x, get_block_context(x, bsize));
       rd_pick_partition(cpi, tile, tp, mi_row + y_idx, mi_col + x_idx, subsize,
@@ -1629,7 +1642,7 @@
   // PARTITION_HORZ
   if (partition_horz_allowed && do_rect) {
     subsize = get_subsize(bsize, PARTITION_HORZ);
-    *get_sb_index(xd, subsize) = 0;
+    *get_sb_index(x, subsize) = 0;
     if (cpi->sf.adaptive_motion_search)
       load_pred_mv(x, get_block_context(x, bsize));
     pick_sb_modes(cpi, tile, mi_row, mi_col, &sum_rate, &sum_dist, subsize,
@@ -1640,7 +1653,7 @@
       update_state(cpi, get_block_context(x, subsize), subsize, 0);
       encode_superblock(cpi, tp, 0, mi_row, mi_col, subsize);
 
-      *get_sb_index(xd, subsize) = 1;
+      *get_sb_index(x, subsize) = 1;
       if (cpi->sf.adaptive_motion_search)
         load_pred_mv(x, get_block_context(x, bsize));
       pick_sb_modes(cpi, tile, mi_row + ms, mi_col, &this_rate,
@@ -1674,7 +1687,7 @@
   if (partition_vert_allowed && do_rect) {
     subsize = get_subsize(bsize, PARTITION_VERT);
 
-    *get_sb_index(xd, subsize) = 0;
+    *get_sb_index(x, subsize) = 0;
     if (cpi->sf.adaptive_motion_search)
       load_pred_mv(x, get_block_context(x, bsize));
     pick_sb_modes(cpi, tile, mi_row, mi_col, &sum_rate, &sum_dist, subsize,
@@ -1684,7 +1697,7 @@
       update_state(cpi, get_block_context(x, subsize), subsize, 0);
       encode_superblock(cpi, tp, 0, mi_row, mi_col, subsize);
 
-      *get_sb_index(xd, subsize) = 1;
+      *get_sb_index(x, subsize) = 1;
       if (cpi->sf.adaptive_motion_search)
         load_pred_mv(x, get_block_context(x, bsize));
       pick_sb_modes(cpi, tile, mi_row, mi_col + ms, &this_rate,
@@ -2388,13 +2401,16 @@
   MODE_INFO **mi_8x8 = xd->mi_8x8;
   MODE_INFO *mi = mi_8x8[0];
   MB_MODE_INFO *mbmi = &mi->mbmi;
+  PICK_MODE_CONTEXT *ctx = get_block_context(x, bsize);
   unsigned int segment_id = mbmi->segment_id;
   const int mis = cm->mode_info_stride;
   const int mi_width = num_8x8_blocks_wide_lookup[bsize];
   const int mi_height = num_8x8_blocks_high_lookup[bsize];
+  x->skip_optimize = ctx->is_coded;
+  ctx->is_coded = 1;
   x->use_lp32x32fdct = cpi->sf.use_lp32x32fdct;
   x->skip_encode = (!output_enabled && cpi->sf.skip_encode_frame &&
-                    xd->q_index < QIDX_SKIP_THRESH);
+                    x->q_index < QIDX_SKIP_THRESH);
   if (x->skip_encode)
     return;
 
@@ -2480,7 +2496,8 @@
             (mbmi->skip_coeff ||
              vp9_segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP)))) {
       const uint8_t context = vp9_get_pred_context_tx_size(xd);
-      ++get_tx_counts(bsize, context, &cm->counts.tx)[mbmi->tx_size];
+      ++get_tx_counts(max_txsize_lookup[bsize],
+                      context, &cm->counts.tx)[mbmi->tx_size];
     } else {
       int x, y;
       TX_SIZE sz = tx_mode_to_biggest_tx_size[cm->tx_mode];
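
Note: the new is_coded flag is a small per-context latch: it is cleared where the block's mode context is set up for the RD search, and in the superblock-encode path above it is copied into x->skip_optimize before being set, so only the first encode after mode selection pays for the coefficient optimization again. Reduced to its essentials (the struct below is illustrative, not the real PICK_MODE_CONTEXT):

#include <assert.h>
#include <stdbool.h>

/* Stand-in for the per-block latch; names follow the patch but the
 * struct itself is only illustrative. */
typedef struct { bool is_coded; } mode_ctx;

static void start_rd_search(mode_ctx *ctx) { ctx->is_coded = false; }

static bool encode_once(mode_ctx *ctx) {
  bool skip_optimize = ctx->is_coded;  /* true on every re-encode */
  ctx->is_coded = true;
  return skip_optimize;
}

int main(void) {
  mode_ctx ctx;
  start_rd_search(&ctx);
  assert(encode_once(&ctx) == false);  /* first encode: full work          */
  assert(encode_once(&ctx) == true);   /* re-encode: optimization skipped  */
  return 0;
}
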
diff --git a/vp9/encoder/vp9_encodemb.c b/vp9/encoder/vp9_encodemb.c
index c43e96d..b709848 100644
--- a/vp9/encoder/vp9_encodemb.c
+++ b/vp9/encoder/vp9_encodemb.c
@@ -137,7 +137,9 @@
   const int mul = 1 + (tx_size == TX_32X32);
   uint8_t token_cache[1024];
   const int16_t *dequant_ptr = pd->dequant;
-  const uint8_t *const band_translate = get_band_translate(tx_size);
+  const uint8_t *const band_translate = (tx_size == TX_4X4 ?
+                                         vp9_coefband_trans_4x4 :
+                                         mb->coefband_trans_8x8plus);
 
   assert((!type && !plane) || (type && plane));
   dqcoeff_ptr = BLOCK_OFFSET(pd->dqcoeff, block);
@@ -178,7 +180,7 @@
       t0 = (vp9_dct_value_tokens_ptr + x)->token;
       /* Consider both possible successor states. */
       if (next < default_eob) {
-        band = get_coef_band(band_translate, i + 1);
+        band = band_translate[i + 1];
         pt = trellis_get_coeff_context(scan, nb, i, t0, token_cache);
         rate0 +=
           mb->token_costs[tx_size][type][ref][band][0][pt]
@@ -229,7 +231,7 @@
         t0 = t1 = (vp9_dct_value_tokens_ptr + x)->token;
       }
       if (next < default_eob) {
-        band = get_coef_band(band_translate, i + 1);
+        band = band_translate[i + 1];
         if (t0 != DCT_EOB_TOKEN) {
           pt = trellis_get_coeff_context(scan, nb, i, t0, token_cache);
           rate0 += mb->token_costs[tx_size][type][ref][band][!x][pt]
@@ -263,7 +265,7 @@
       /* There's no choice to make for a zero coefficient, so we don't
        *  add a new trellis node, but we do need to update the costs.
        */
-      band = get_coef_band(band_translate, i + 1);
+      band = band_translate[i + 1];
       t0 = tokens[next][0].token;
       t1 = tokens[next][1].token;
       /* Update the cost of each path if we're past the EOB token. */
@@ -283,7 +285,7 @@
   }
 
   /* Now pick the best path through the whole trellis. */
-  band = get_coef_band(band_translate, i + 1);
+  band = band_translate[i + 1];
   pt = combine_entropy_contexts(*a, *l);
   rate0 = tokens[next][0].rate;
   rate1 = tokens[next][1].rate;
@@ -429,18 +431,26 @@
   // TODO(jingning): per transformed block zero forcing only enabled for
   // luma component. will integrate chroma components as well.
   if (x->zcoeff_blk[tx_size][block] && plane == 0) {
-    int i, j;
+    int i, k;
     pd->eobs[block] = 0;
-    txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &i, &j);
+    txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &i, &k);
     ctx->ta[plane][i] = 0;
-    ctx->tl[plane][j] = 0;
+    ctx->tl[plane][k] = 0;
     return;
   }
 
-  vp9_xform_quant(plane, block, plane_bsize, tx_size, arg);
+  if (x->select_txfm_size || xd->mi_8x8[0]->mbmi.sb_type < BLOCK_8X8)
+    vp9_xform_quant(plane, block, plane_bsize, tx_size, arg);
 
-  if (x->optimize)
+  if (x->optimize && (x->select_txfm_size ||
+      xd->mi_8x8[0]->mbmi.sb_type < BLOCK_8X8 || !x->skip_optimize)) {
     vp9_optimize_b(plane, block, plane_bsize, tx_size, x, ctx);
+  } else {
+    int i, k;
+    txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &i, &k);
+    ctx->ta[plane][i] = pd->eobs[block] > 0;
+    ctx->tl[plane][k] = pd->eobs[block] > 0;
+  }
 
   if (x->skip_encode || pd->eobs[block] == 0)
     return;
@@ -504,7 +514,8 @@
   struct optimize_ctx ctx;
   struct encode_b_args arg = {x, &ctx};
 
-  vp9_subtract_sb(x, bsize);
+  if (x->select_txfm_size || xd->mi_8x8[0]->mbmi.sb_type < BLOCK_8X8)
+    vp9_subtract_sb(x, bsize);
 
   if (x->optimize) {
     int i;
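
Note: with the guards added to vp9_encodemb.c, blocks revisited under a fixed transform size skip the repeated subtract/transform/quantize work and the trellis optimization, and only refresh the entropy contexts from the eobs already stored. A condensed view of the two conditions, with the struct plumbing stripped down to plain flags (helper names are illustrative, not libvpx API):

#include <stdbool.h>

/* Re-run subtract + transform + quantize only when the transform size is
 * still being searched, or for sub-8x8 blocks. */
static bool should_xform_quant(bool select_txfm_size, bool sub8x8) {
  return select_txfm_size || sub8x8;
}

/* Trellis-optimize unless this block was already coded once with the same
 * coefficients (skip_optimize comes from ctx->is_coded). */
static bool should_optimize_b(bool optimize, bool select_txfm_size,
                              bool sub8x8, bool skip_optimize) {
  return optimize && (select_txfm_size || sub8x8 || !skip_optimize);
}
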
diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c
index 6a3555d..b5428df 100644
--- a/vp9/encoder/vp9_firstpass.c
+++ b/vp9/encoder/vp9_firstpass.c
@@ -482,6 +482,10 @@
   VP9_COMMON *const cm = &cpi->common;
   MACROBLOCKD *const xd = &x->e_mbd;
   TileInfo tile;
+  struct macroblock_plane *const p = x->plane;
+  struct macroblockd_plane *const pd = xd->plane;
+  PICK_MODE_CONTEXT *ctx = &x->sb64_context;
+  int i;
 
   int recon_yoffset, recon_uvoffset;
   const int lst_yv12_idx = cm->ref_frame_map[cpi->lst_fb_idx];
@@ -525,6 +529,14 @@
 
   vp9_frame_init_quantizer(cpi);
 
+  for (i = 0; i < MAX_MB_PLANE; ++i) {
+    p[i].coeff = ctx->coeff_pbuf[i][1];
+    pd[i].qcoeff = ctx->qcoeff_pbuf[i][1];
+    pd[i].dqcoeff = ctx->dqcoeff_pbuf[i][1];
+    pd[i].eobs = ctx->eobs_pbuf[i][1];
+  }
+
+
   // Initialise the MV cost table to the defaults
   // if( cm->current_video_frame == 0)
   // if ( 0 )
diff --git a/vp9/encoder/vp9_onyx_if.c b/vp9/encoder/vp9_onyx_if.c
index f922f90..8a98343 100644
--- a/vp9/encoder/vp9_onyx_if.c
+++ b/vp9/encoder/vp9_onyx_if.c
@@ -834,6 +834,7 @@
 
         sf->adaptive_rd_thresh = 2;
         sf->recode_loop = 2;
+        sf->use_lp32x32fdct = 1;
         sf->mode_skip_start = 11;
         sf->intra_y_mode_mask[TX_32X32] = INTRA_DC_H_V;
         sf->intra_y_mode_mask[TX_16X16] = INTRA_DC_H_V;
@@ -1223,6 +1224,13 @@
   cpi->fixed_divide[0] = 0;
   for (i = 1; i < 512; i++)
     cpi->fixed_divide[i] = 0x80000 / i;
+
+  vpx_memset(cpi->mb.coefband_trans_8x8plus,
+             (COEF_BANDS - 1),
+             sizeof(cpi->mb.coefband_trans_8x8plus));
+  vpx_memcpy(cpi->mb.coefband_trans_8x8plus,
+             vp9_coefband_trans_8x8plus,
+             sizeof(vp9_coefband_trans_8x8plus));
 }
 
 
@@ -1436,90 +1444,121 @@
   } while (++i <= MV_MAX);
 }
 
+static void alloc_mode_context(VP9_COMMON *cm, int num_4x4_blk,
+                               PICK_MODE_CONTEXT *ctx) {
+  int num_pix = num_4x4_blk << 4;
+  int i, k;
+  ctx->num_4x4_blk = num_4x4_blk;
+  CHECK_MEM_ERROR(cm, ctx->zcoeff_blk,
+                  vpx_calloc(num_4x4_blk, sizeof(uint8_t)));
+  for (i = 0; i < MAX_MB_PLANE; ++i) {
+    for (k = 0; k < 2; ++k) {
+      CHECK_MEM_ERROR(cm, ctx->coeff[i][k],
+                      vpx_memalign(16, num_pix * sizeof(int16_t)));
+      CHECK_MEM_ERROR(cm, ctx->qcoeff[i][k],
+                      vpx_memalign(16, num_pix * sizeof(int16_t)));
+      CHECK_MEM_ERROR(cm, ctx->dqcoeff[i][k],
+                      vpx_memalign(16, num_pix * sizeof(int16_t)));
+      CHECK_MEM_ERROR(cm, ctx->eobs[i][k],
+                      vpx_memalign(16, num_pix * sizeof(uint16_t)));
+      ctx->coeff_pbuf[i][k]   = ctx->coeff[i][k];
+      ctx->qcoeff_pbuf[i][k]  = ctx->qcoeff[i][k];
+      ctx->dqcoeff_pbuf[i][k] = ctx->dqcoeff[i][k];
+      ctx->eobs_pbuf[i][k]    = ctx->eobs[i][k];
+    }
+  }
+}
+
+static void free_mode_context(PICK_MODE_CONTEXT *ctx) {
+  int i, k;
+  vpx_free(ctx->zcoeff_blk);
+  ctx->zcoeff_blk = 0;
+  for (i = 0; i < MAX_MB_PLANE; ++i) {
+    for (k = 0; k < 2; ++k) {
+      vpx_free(ctx->coeff[i][k]);
+      ctx->coeff[i][k] = 0;
+      vpx_free(ctx->qcoeff[i][k]);
+      ctx->qcoeff[i][k] = 0;
+      vpx_free(ctx->dqcoeff[i][k]);
+      ctx->dqcoeff[i][k] = 0;
+      vpx_free(ctx->eobs[i][k]);
+      ctx->eobs[i][k] = 0;
+    }
+  }
+}
+
 static void init_pick_mode_context(VP9_COMP *cpi) {
   int i;
-  MACROBLOCK  *x  = &cpi->mb;
-  MACROBLOCKD *xd = &x->e_mbd;
-  VP9_COMMON  *cm = &cpi->common;
+  VP9_COMMON *const cm = &cpi->common;
+  MACROBLOCK *const x  = &cpi->mb;
+
 
   for (i = 0; i < BLOCK_SIZES; ++i) {
     const int num_4x4_w = num_4x4_blocks_wide_lookup[i];
     const int num_4x4_h = num_4x4_blocks_high_lookup[i];
     const int num_4x4_blk = MAX(4, num_4x4_w * num_4x4_h);
     if (i < BLOCK_16X16) {
-      for (xd->sb_index = 0; xd->sb_index < 4; ++xd->sb_index) {
-        for (xd->mb_index = 0; xd->mb_index < 4; ++xd->mb_index) {
-          for (xd->b_index = 0; xd->b_index < 16 / num_4x4_blk; ++xd->b_index) {
+      for (x->sb_index = 0; x->sb_index < 4; ++x->sb_index) {
+        for (x->mb_index = 0; x->mb_index < 4; ++x->mb_index) {
+          for (x->b_index = 0; x->b_index < 16 / num_4x4_blk; ++x->b_index) {
             PICK_MODE_CONTEXT *ctx = get_block_context(x, i);
-            ctx->num_4x4_blk = num_4x4_blk;
-            CHECK_MEM_ERROR(cm, ctx->zcoeff_blk,
-                            vpx_calloc(num_4x4_blk, sizeof(uint8_t)));
+            alloc_mode_context(cm, num_4x4_blk, ctx);
           }
         }
       }
     } else if (i < BLOCK_32X32) {
-      for (xd->sb_index = 0; xd->sb_index < 4; ++xd->sb_index) {
-        for (xd->mb_index = 0; xd->mb_index < 64 / num_4x4_blk;
-                               ++xd->mb_index) {
+      for (x->sb_index = 0; x->sb_index < 4; ++x->sb_index) {
+        for (x->mb_index = 0; x->mb_index < 64 / num_4x4_blk; ++x->mb_index) {
           PICK_MODE_CONTEXT *ctx = get_block_context(x, i);
           ctx->num_4x4_blk = num_4x4_blk;
-          CHECK_MEM_ERROR(cm, ctx->zcoeff_blk,
-                          vpx_calloc(num_4x4_blk, sizeof(uint8_t)));
+          alloc_mode_context(cm, num_4x4_blk, ctx);
         }
       }
     } else if (i < BLOCK_64X64) {
-      for (xd->sb_index = 0; xd->sb_index < 256 / num_4x4_blk; ++xd->sb_index) {
+      for (x->sb_index = 0; x->sb_index < 256 / num_4x4_blk; ++x->sb_index) {
         PICK_MODE_CONTEXT *ctx = get_block_context(x, i);
         ctx->num_4x4_blk = num_4x4_blk;
-        CHECK_MEM_ERROR(cm, ctx->zcoeff_blk,
-                        vpx_calloc(num_4x4_blk, sizeof(uint8_t)));
+        alloc_mode_context(cm, num_4x4_blk, ctx);
       }
     } else {
       PICK_MODE_CONTEXT *ctx = get_block_context(x, i);
       ctx->num_4x4_blk = num_4x4_blk;
-      CHECK_MEM_ERROR(cm, ctx->zcoeff_blk,
-                      vpx_calloc(num_4x4_blk, sizeof(uint8_t)));
+      alloc_mode_context(cm, num_4x4_blk, ctx);
     }
   }
 }
 
 static void free_pick_mode_context(MACROBLOCK *x) {
   int i;
-  MACROBLOCKD *xd = &x->e_mbd;
 
   for (i = 0; i < BLOCK_SIZES; ++i) {
     const int num_4x4_w = num_4x4_blocks_wide_lookup[i];
     const int num_4x4_h = num_4x4_blocks_high_lookup[i];
     const int num_4x4_blk = MAX(4, num_4x4_w * num_4x4_h);
     if (i < BLOCK_16X16) {
-      for (xd->sb_index = 0; xd->sb_index < 4; ++xd->sb_index) {
-        for (xd->mb_index = 0; xd->mb_index < 4; ++xd->mb_index) {
-          for (xd->b_index = 0; xd->b_index < 16 / num_4x4_blk; ++xd->b_index) {
+      for (x->sb_index = 0; x->sb_index < 4; ++x->sb_index) {
+        for (x->mb_index = 0; x->mb_index < 4; ++x->mb_index) {
+          for (x->b_index = 0; x->b_index < 16 / num_4x4_blk; ++x->b_index) {
             PICK_MODE_CONTEXT *ctx = get_block_context(x, i);
-            vpx_free(ctx->zcoeff_blk);
-            ctx->zcoeff_blk = 0;
+            free_mode_context(ctx);
           }
         }
       }
     } else if (i < BLOCK_32X32) {
-      for (xd->sb_index = 0; xd->sb_index < 4; ++xd->sb_index) {
-        for (xd->mb_index = 0; xd->mb_index < 64 / num_4x4_blk;
-                               ++xd->mb_index) {
+      for (x->sb_index = 0; x->sb_index < 4; ++x->sb_index) {
+        for (x->mb_index = 0; x->mb_index < 64 / num_4x4_blk; ++x->mb_index) {
           PICK_MODE_CONTEXT *ctx = get_block_context(x, i);
-          vpx_free(ctx->zcoeff_blk);
-          ctx->zcoeff_blk = 0;
+          free_mode_context(ctx);
         }
       }
     } else if (i < BLOCK_64X64) {
-      for (xd->sb_index = 0; xd->sb_index < 256 / num_4x4_blk; ++xd->sb_index) {
+      for (x->sb_index = 0; x->sb_index < 256 / num_4x4_blk; ++x->sb_index) {
         PICK_MODE_CONTEXT *ctx = get_block_context(x, i);
-        vpx_free(ctx->zcoeff_blk);
-        ctx->zcoeff_blk = 0;
+        free_mode_context(ctx);
       }
     } else {
       PICK_MODE_CONTEXT *ctx = get_block_context(x, i);
-      vpx_free(ctx->zcoeff_blk);
-      ctx->zcoeff_blk = 0;
+      free_mode_context(ctx);
     }
   }
 }
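
Note: alloc_mode_context() sizes every buffer to the pixel count of the block its context serves (num_4x4_blk << 4 coefficients) and allocates each buffer twice per plane, so one copy can hold the candidate currently being coded and the other the best result found so far. A quick standalone check of that sizing, matching the 64 * 64 static arrays used on the decoder side:

#include <assert.h>

/* Buffer sizing used by alloc_mode_context(): one int16_t coefficient per
 * pixel, i.e. 16 coefficients per 4x4 block. */
static int coeffs_for(int num_4x4_blk) {
  return num_4x4_blk << 4;
}

int main(void) {
  assert(coeffs_for(256) == 64 * 64);  /* BLOCK_64X64                      */
  assert(coeffs_for(4)   == 8 * 8);    /* BLOCK_8X8 (minimum of 4 blocks)  */
  return 0;
}
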
diff --git a/vp9/encoder/vp9_onyx_int.h b/vp9/encoder/vp9_onyx_int.h
index 9429c7f..9e80212 100644
--- a/vp9/encoder/vp9_onyx_int.h
+++ b/vp9/encoder/vp9_onyx_int.h
@@ -312,7 +312,6 @@
   VP9_COMMON common;
   VP9_CONFIG oxcf;
   struct rdcost_block_args rdcost_stack;
-
   struct lookahead_ctx    *lookahead;
   struct lookahead_entry  *source;
 #if CONFIG_MULTIPLE_ARF
diff --git a/vp9/encoder/vp9_quantize.c b/vp9/encoder/vp9_quantize.c
index 6222394..d24be96 100644
--- a/vp9/encoder/vp9_quantize.c
+++ b/vp9/encoder/vp9_quantize.c
@@ -301,17 +301,17 @@
                                         SEG_LVL_SKIP);
 
   /* save this macroblock QIndex for vp9_update_zbin_extra() */
-  x->e_mbd.q_index = qindex;
+  x->q_index = qindex;
 
   /* R/D setup */
   cpi->mb.errorperbit = rdmult >> 6;
   cpi->mb.errorperbit += (cpi->mb.errorperbit == 0);
 
-  vp9_initialize_me_consts(cpi, xd->q_index);
+  vp9_initialize_me_consts(cpi, x->q_index);
 }
 
 void vp9_update_zbin_extra(VP9_COMP *cpi, MACROBLOCK *x) {
-  const int qindex = x->e_mbd.q_index;
+  const int qindex = x->q_index;
   const int y_zbin_extra = (cpi->common.y_dequant[qindex][1] *
                 (cpi->zbin_mode_boost + x->act_zbin_adj)) >> 7;
   const int uv_zbin_extra = (cpi->common.uv_dequant[qindex][1] *
diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c
index 6aaa2c7..695a2e2 100644
--- a/vp9/encoder/vp9_rdopt.c
+++ b/vp9/encoder/vp9_rdopt.c
@@ -246,6 +246,9 @@
 
   vp9_set_speed_features(cpi);
 
+  cpi->mb.select_txfm_size = cpi->sf.tx_size_search_method == USE_LARGESTALL ?
+                             0 : 1;
+
   set_block_thresholds(cpi);
 
   fill_token_costs(cpi->mb.token_costs, cm->fc.coef_probs);
@@ -740,7 +743,7 @@
   int n, m;
   int s0, s1;
 
-  const vp9_prob *tx_probs = get_tx_probs2(xd, &cm->fc.tx_probs, xd->mi_8x8[0]);
+  const vp9_prob *tx_probs = get_tx_probs2(max_tx_size, xd, &cm->fc.tx_probs);
 
   for (n = TX_4X4; n <= max_tx_size; n++) {
     r[n][1] = r[n][0];
@@ -845,7 +848,7 @@
   double scale_rd[TX_SIZES] = {1.73, 1.44, 1.20, 1.00};
   // double scale_r[TX_SIZES] = {2.82, 2.00, 1.41, 1.00};
 
-  const vp9_prob *tx_probs = get_tx_probs2(xd, &cm->fc.tx_probs, xd->mi_8x8[0]);
+  const vp9_prob *tx_probs = get_tx_probs2(max_tx_size, xd, &cm->fc.tx_probs);
 
   // for (n = TX_4X4; n <= max_txfm_size; n++)
   //   r[n][0] = (r[n][0] * scale_r[n]);
@@ -1901,6 +1904,7 @@
                         bsi->ref_mv, bsi->second_ref_mv, x->nmvjointcost,
                         x->mvcost, cpi);
 
+
         bsi->rdstat[i][mode_idx].mvs[0].as_int = mode_mv[this_mode].as_int;
         if (num_4x4_blocks_wide > 1)
           bsi->rdstat[i + 1][mode_idx].mvs[0].as_int =
@@ -2477,54 +2481,41 @@
   int pw = 4 << b_width_log2(bsize), ph = 4 << b_height_log2(bsize);
   MACROBLOCKD *xd = &x->e_mbd;
   MB_MODE_INFO *mbmi = &xd->mi_8x8[0]->mbmi;
-  int refs[2] = { mbmi->ref_frame[0],
-    (mbmi->ref_frame[1] < 0 ? 0 : mbmi->ref_frame[1]) };
+  const int refs[2] = { mbmi->ref_frame[0],
+                        mbmi->ref_frame[1] < 0 ? 0 : mbmi->ref_frame[1] };
   int_mv ref_mv[2];
   const BLOCK_SIZE block_size = get_plane_block_size(bsize, &xd->plane[0]);
-  int ite;
+  int ite, ref;
   // Prediction buffer from second frame.
   uint8_t *second_pred = vpx_memalign(16, pw * ph * sizeof(uint8_t));
 
   // Do joint motion search in compound mode to get more accurate mv.
-  struct buf_2d backup_yv12[MAX_MB_PLANE] = {{0}};
-  struct buf_2d backup_second_yv12[MAX_MB_PLANE] = {{0}};
-  struct buf_2d scaled_first_yv12;
+  struct buf_2d backup_yv12[2][MAX_MB_PLANE];
+  struct buf_2d scaled_first_yv12 = xd->plane[0].pre[0];
   int last_besterr[2] = {INT_MAX, INT_MAX};
-  YV12_BUFFER_CONFIG *scaled_ref_frame[2] = {NULL, NULL};
-  scaled_ref_frame[0] = get_scaled_ref_frame(cpi, mbmi->ref_frame[0]);
-  scaled_ref_frame[1] = get_scaled_ref_frame(cpi, mbmi->ref_frame[1]);
+  YV12_BUFFER_CONFIG *const scaled_ref_frame[2] = {
+    get_scaled_ref_frame(cpi, mbmi->ref_frame[0]),
+    get_scaled_ref_frame(cpi, mbmi->ref_frame[1])
+  };
 
-  ref_mv[0] = mbmi->ref_mvs[refs[0]][0];
-  ref_mv[1] = mbmi->ref_mvs[refs[1]][0];
+  for (ref = 0; ref < 2; ++ref) {
+    ref_mv[ref] = mbmi->ref_mvs[refs[ref]][0];
 
-  if (scaled_ref_frame[0]) {
-    int i;
-    // Swap out the reference frame for a version that's been scaled to
-    // match the resolution of the current frame, allowing the existing
-    // motion search code to be used without additional modifications.
-    for (i = 0; i < MAX_MB_PLANE; i++)
-      backup_yv12[i] = xd->plane[i].pre[0];
-    setup_pre_planes(xd, 0, scaled_ref_frame[0], mi_row, mi_col, NULL);
+    if (scaled_ref_frame[ref]) {
+      int i;
+      // Swap out the reference frame for a version that's been scaled to
+      // match the resolution of the current frame, allowing the existing
+      // motion search code to be used without additional modifications.
+      for (i = 0; i < MAX_MB_PLANE; i++)
+        backup_yv12[ref][i] = xd->plane[i].pre[ref];
+      setup_pre_planes(xd, ref, scaled_ref_frame[ref], mi_row, mi_col, NULL);
+    }
+
+    xd->scale_factor[ref].sfc->set_scaled_offsets(&xd->scale_factor[ref],
+                                                  mi_row, mi_col);
+    frame_mv[refs[ref]].as_int = single_newmv[refs[ref]].as_int;
   }
 
-  if (scaled_ref_frame[1]) {
-    int i;
-    for (i = 0; i < MAX_MB_PLANE; i++)
-      backup_second_yv12[i] = xd->plane[i].pre[1];
-
-    setup_pre_planes(xd, 1, scaled_ref_frame[1], mi_row, mi_col, NULL);
-  }
-
-  xd->scale_factor[0].sfc->set_scaled_offsets(&xd->scale_factor[0],
-                                         mi_row, mi_col);
-  xd->scale_factor[1].sfc->set_scaled_offsets(&xd->scale_factor[1],
-                                         mi_row, mi_col);
-  scaled_first_yv12 = xd->plane[0].pre[0];
-
-  // Initialize mv using single prediction mode result.
-  frame_mv[refs[0]].as_int = single_newmv[refs[0]].as_int;
-  frame_mv[refs[1]].as_int = single_newmv[refs[1]].as_int;
-
   // Allow joint search multiple times iteratively for each ref frame
   // and break out the search loop if it couldn't find better mv.
   for (ite = 0; ite < 4; ite++) {
@@ -2604,24 +2595,20 @@
     }
   }
 
-  // restore the predictor
-  if (scaled_ref_frame[0]) {
-    int i;
-    for (i = 0; i < MAX_MB_PLANE; i++)
-      xd->plane[i].pre[0] = backup_yv12[i];
-  }
+  *rate_mv = 0;
 
-  if (scaled_ref_frame[1]) {
-    int i;
-    for (i = 0; i < MAX_MB_PLANE; i++)
-      xd->plane[i].pre[1] = backup_second_yv12[i];
+  for (ref = 0; ref < 2; ++ref) {
+    if (scaled_ref_frame[ref]) {
+      // restore the predictor
+      int i;
+      for (i = 0; i < MAX_MB_PLANE; i++)
+        xd->plane[i].pre[ref] = backup_yv12[ref][i];
+    }
+
+    *rate_mv += vp9_mv_bit_cost(&frame_mv[refs[ref]].as_mv,
+                                &mbmi->ref_mvs[refs[ref]][0].as_mv,
+                                x->nmvjointcost, x->mvcost, MV_COST_WEIGHT);
   }
-  *rate_mv  = vp9_mv_bit_cost(&frame_mv[refs[0]].as_mv,
-                              &mbmi->ref_mvs[refs[0]][0].as_mv,
-                              x->nmvjointcost, x->mvcost, MV_COST_WEIGHT);
-  *rate_mv += vp9_mv_bit_cost(&frame_mv[refs[1]].as_mv,
-                              &mbmi->ref_mvs[refs[1]][0].as_mv,
-                              x->nmvjointcost, x->mvcost, MV_COST_WEIGHT);
 
   vpx_free(second_pred);
 }
@@ -3046,6 +3033,29 @@
   return this_rd;  // if 0, this will be re-calculated by caller
 }
 
+static void swap_block_ptr(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx) {
+  int i;
+  struct macroblock_plane *const p = x->plane;
+  struct macroblockd_plane *const pd = x->e_mbd.plane;
+
+  for (i = 0; i < MAX_MB_PLANE; ++i) {
+    p[i].coeff    = ctx->coeff_pbuf[i][1];
+    pd[i].qcoeff  = ctx->qcoeff_pbuf[i][1];
+    pd[i].dqcoeff = ctx->dqcoeff_pbuf[i][1];
+    pd[i].eobs    = ctx->eobs_pbuf[i][1];
+
+    ctx->coeff_pbuf[i][1]   = ctx->coeff_pbuf[i][0];
+    ctx->qcoeff_pbuf[i][1]  = ctx->qcoeff_pbuf[i][0];
+    ctx->dqcoeff_pbuf[i][1] = ctx->dqcoeff_pbuf[i][0];
+    ctx->eobs_pbuf[i][1]    = ctx->eobs_pbuf[i][0];
+
+    ctx->coeff_pbuf[i][0]   = p[i].coeff;
+    ctx->qcoeff_pbuf[i][0]  = pd[i].qcoeff;
+    ctx->dqcoeff_pbuf[i][0] = pd[i].dqcoeff;
+    ctx->eobs_pbuf[i][0]    = pd[i].eobs;
+  }
+}
+
 void vp9_rd_pick_intra_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
                                int *returnrate, int64_t *returndist,
                                BLOCK_SIZE bsize,
@@ -3157,7 +3167,7 @@
   const int bhs = num_8x8_blocks_high_lookup[bsize] / 2;
   int best_skip2 = 0;
 
-  x->skip_encode = cpi->sf.skip_encode_frame && xd->q_index < QIDX_SKIP_THRESH;
+  x->skip_encode = cpi->sf.skip_encode_frame && x->q_index < QIDX_SKIP_THRESH;
 
   // Everywhere the flag is set the error is much higher than its neighbors.
   ctx->frames_with_high_error = 0;
@@ -3196,8 +3206,8 @@
       case BLOCK_32X32:
         for (i = 0; i < 4; i++) {
           ref_frame_mask |=
-              x->mb_context[xd->sb_index][i].frames_with_high_error;
-          mode_mask |= x->mb_context[xd->sb_index][i].modes_with_high_error;
+              x->mb_context[x->sb_index][i].frames_with_high_error;
+          mode_mask |= x->mb_context[x->sb_index][i].modes_with_high_error;
         }
         break;
       default:
@@ -3588,6 +3598,8 @@
         best_rd = this_rd;
         best_mbmode = *mbmi;
         best_skip2 = this_skip2;
+        if (!x->select_txfm_size)
+          swap_block_ptr(x, ctx);
         vpx_memcpy(ctx->zcoeff_blk, x->zcoeff_blk[mbmi->tx_size],
                    sizeof(uint8_t) * ctx->num_4x4_blk);
 
@@ -3850,7 +3862,7 @@
   b_mode_info best_bmodes[4];
   int best_skip2 = 0;
 
-  x->skip_encode = cpi->sf.skip_encode_frame && xd->q_index < QIDX_SKIP_THRESH;
+  x->skip_encode = cpi->sf.skip_encode_frame && x->q_index < QIDX_SKIP_THRESH;
   vpx_memset(x->zcoeff_blk[TX_4X4], 0, 4);
 
   for (i = 0; i < 4; i++) {
@@ -4332,6 +4344,8 @@
                    RDCOST(x->rdmult, x->rddiv, rate_uv, distortion_uv);
         best_mbmode = *mbmi;
         best_skip2 = this_skip2;
+        if (!x->select_txfm_size)
+          swap_block_ptr(x, ctx);
         vpx_memcpy(ctx->zcoeff_blk, x->zcoeff_blk[mbmi->tx_size],
                    sizeof(uint8_t) * ctx->num_4x4_blk);
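
Note: swap_block_ptr() is a pointer ping-pong rather than a copy: when a new best mode is found and the transform size is fixed (so the coefficients will not be regenerated), the buffers currently wired into the macroblock become the stored best copy and the previous best buffers become the new working set. The pattern in isolation, on a simplified two-slot context (illustrative only):

/* Minimal sketch of the double-buffer swap: buf[0] is the working buffer,
 * buf[1] holds the best result so far. */
typedef struct {
  short *buf[2];
} dual_buf;

static void keep_as_best(short **active, dual_buf *ctx) {
  short *old_best = ctx->buf[1];
  ctx->buf[1] = *active;   /* current working buffer becomes the best copy */
  ctx->buf[0] = old_best;  /* old best is recycled as the new working slot */
  *active = old_best;      /* the macroblock now writes into the recycled one */
}
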
 
diff --git a/vp9/encoder/vp9_tokenize.c b/vp9/encoder/vp9_tokenize.c
index 7d4676e..3d21ea8 100644
--- a/vp9/encoder/vp9_tokenize.c
+++ b/vp9/encoder/vp9_tokenize.c
@@ -57,7 +57,7 @@
     // initialize the cost for extra bits for all possible coefficient value.
     {
       int cost = 0;
-      const vp9_extra_bit *p = vp9_extra_bits + t[i].token;
+      const vp9_extra_bit *p = &vp9_extra_bits[t[i].token];
 
       if (p->base_val) {
         const int extra = t[i].extra;
@@ -73,7 +73,7 @@
   } while (++i < DCT_MAX_VALUE);
 
   vp9_dct_value_tokens_ptr = dct_value_tokens + DCT_MAX_VALUE;
-  vp9_dct_value_cost_ptr   = dct_value_cost + DCT_MAX_VALUE;
+  vp9_dct_value_cost_ptr = dct_value_cost + DCT_MAX_VALUE;
 }
 
 struct tokenize_b_args {
@@ -115,7 +115,9 @@
   vp9_coeff_count *const counts = cpi->coef_counts[tx_size];
   vp9_coeff_probs_model *const coef_probs = cpi->common.fc.coef_probs[tx_size];
   const int ref = is_inter_block(mbmi);
-  const uint8_t *const band_translate = get_band_translate(tx_size);
+  const uint8_t *const band_translate = (tx_size == TX_4X4 ?
+                                         vp9_coefband_trans_4x4 :
+                                         cpi->mb.coefband_trans_8x8plus);
   const int seg_eob = get_tx_eob(&cpi->common.seg, segment_id, tx_size);
   int aoff, loff;
   txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &aoff, &loff);
@@ -127,7 +129,7 @@
   get_scan(xd, tx_size, type, block, &scan, &nb);
   c = 0;
   do {
-    const int band = get_coef_band(band_translate, c);
+    const int band = band_translate[c];
     int token;
     int v = 0;
     rc = scan[c];
diff --git a/vp9/vp9_common.mk b/vp9/vp9_common.mk
index db36506..2dd2bf0 100644
--- a/vp9/vp9_common.mk
+++ b/vp9/vp9_common.mk
@@ -103,11 +103,21 @@
 VP9_COMMON_SRCS-$(HAVE_DSPR2)  += common/mips/dspr2/vp9_convolve8_dspr2.c
 VP9_COMMON_SRCS-$(HAVE_DSPR2)  += common/mips/dspr2/vp9_convolve8_horiz_dspr2.c
 VP9_COMMON_SRCS-$(HAVE_DSPR2)  += common/mips/dspr2/vp9_convolve8_vert_dspr2.c
+VP9_COMMON_SRCS-$(HAVE_DSPR2)  += common/mips/dspr2/vp9_intrapred4_dspr2.c
+VP9_COMMON_SRCS-$(HAVE_DSPR2)  += common/mips/dspr2/vp9_intrapred8_dspr2.c
+VP9_COMMON_SRCS-$(HAVE_DSPR2)  += common/mips/dspr2/vp9_intrapred16_dspr2.c
 VP9_COMMON_SRCS-$(HAVE_DSPR2)  += common/mips/dspr2/vp9_itrans4_dspr2.c
 VP9_COMMON_SRCS-$(HAVE_DSPR2)  += common/mips/dspr2/vp9_itrans8_dspr2.c
 VP9_COMMON_SRCS-$(HAVE_DSPR2)  += common/mips/dspr2/vp9_itrans16_dspr2.c
 VP9_COMMON_SRCS-$(HAVE_DSPR2)  += common/mips/dspr2/vp9_itrans32_cols_dspr2.c
 VP9_COMMON_SRCS-$(HAVE_DSPR2)  += common/mips/dspr2/vp9_itrans32_dspr2.c
+VP9_COMMON_SRCS-$(HAVE_DSPR2)  += common/mips/dspr2/vp9_loopfilter_filters_dspr2.c
+VP9_COMMON_SRCS-$(HAVE_DSPR2)  += common/mips/dspr2/vp9_loopfilter_filters_dspr2.h
+VP9_COMMON_SRCS-$(HAVE_DSPR2)  += common/mips/dspr2/vp9_loopfilter_macros_dspr2.h
+VP9_COMMON_SRCS-$(HAVE_DSPR2)  += common/mips/dspr2/vp9_loopfilter_masks_dspr2.h
+VP9_COMMON_SRCS-$(HAVE_DSPR2)  += common/mips/dspr2/vp9_mbloop_loopfilter_dspr2.c
+VP9_COMMON_SRCS-$(HAVE_DSPR2)  += common/mips/dspr2/vp9_mblpf_horiz_loopfilter_dspr2.c
+VP9_COMMON_SRCS-$(HAVE_DSPR2)  += common/mips/dspr2/vp9_mblpf_vert_loopfilter_dspr2.c
 
 VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_idct_intrin_sse2.c
 
diff --git a/vpxenc.c b/vpxenc.c
index f0f0df3..b7897db 100644
--- a/vpxenc.c
+++ b/vpxenc.c
@@ -8,7 +8,7 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#include "vpx_config.h"
+#include "./vpx_config.h"
 
 #if defined(_WIN32) || defined(__OS2__) || !CONFIG_OS_SUPPORT
 #define USE_POSIX_MMAP 0
@@ -16,6 +16,7 @@
 #define USE_POSIX_MMAP 1
 #endif
 
+#include <math.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <stdarg.h>
@@ -43,11 +44,12 @@
 #include "vpx/vp8dx.h"
 #endif
 
+#include "./tools_common.h"
 #include "vpx_ports/mem_ops.h"
 #include "vpx_ports/vpx_timer.h"
-#include "tools_common.h"
-#include "webmenc.h"
-#include "y4minput.h"
+#include "./vpxstats.h"
+#include "./webmenc.h"
+#include "./y4minput.h"
 
 /* Swallow warnings about unused results of fread/fwrite */
 static size_t wrap_fread(void *ptr, size_t size, size_t nmemb,
@@ -116,126 +118,6 @@
   va_end(ap);
 }
 
-/* This structure is used to abstract the different ways of handling
- * first pass statistics.
- */
-typedef struct {
-  vpx_fixed_buf_t buf;
-  int             pass;
-  FILE           *file;
-  char           *buf_ptr;
-  size_t          buf_alloc_sz;
-} stats_io_t;
-
-int stats_open_file(stats_io_t *stats, const char *fpf, int pass) {
-  int res;
-
-  stats->pass = pass;
-
-  if (pass == 0) {
-    stats->file = fopen(fpf, "wb");
-    stats->buf.sz = 0;
-    stats->buf.buf = NULL,
-               res = (stats->file != NULL);
-  } else {
-#if 0
-#elif USE_POSIX_MMAP
-    struct stat stat_buf;
-    int fd;
-
-    fd = open(fpf, O_RDONLY);
-    stats->file = fdopen(fd, "rb");
-    fstat(fd, &stat_buf);
-    stats->buf.sz = stat_buf.st_size;
-    stats->buf.buf = mmap(NULL, stats->buf.sz, PROT_READ, MAP_PRIVATE,
-                          fd, 0);
-    res = (stats->buf.buf != NULL);
-#else
-    size_t nbytes;
-
-    stats->file = fopen(fpf, "rb");
-
-    if (fseek(stats->file, 0, SEEK_END))
-      fatal("First-pass stats file must be seekable!");
-
-    stats->buf.sz = stats->buf_alloc_sz = ftell(stats->file);
-    rewind(stats->file);
-
-    stats->buf.buf = malloc(stats->buf_alloc_sz);
-
-    if (!stats->buf.buf)
-      fatal("Failed to allocate first-pass stats buffer (%lu bytes)",
-            (unsigned long)stats->buf_alloc_sz);
-
-    nbytes = fread(stats->buf.buf, 1, stats->buf.sz, stats->file);
-    res = (nbytes == stats->buf.sz);
-#endif
-  }
-
-  return res;
-}
-
-int stats_open_mem(stats_io_t *stats, int pass) {
-  int res;
-  stats->pass = pass;
-
-  if (!pass) {
-    stats->buf.sz = 0;
-    stats->buf_alloc_sz = 64 * 1024;
-    stats->buf.buf = malloc(stats->buf_alloc_sz);
-  }
-
-  stats->buf_ptr = stats->buf.buf;
-  res = (stats->buf.buf != NULL);
-  return res;
-}
-
-
-void stats_close(stats_io_t *stats, int last_pass) {
-  if (stats->file) {
-    if (stats->pass == last_pass) {
-#if 0
-#elif USE_POSIX_MMAP
-      munmap(stats->buf.buf, stats->buf.sz);
-#else
-      free(stats->buf.buf);
-#endif
-    }
-
-    fclose(stats->file);
-    stats->file = NULL;
-  } else {
-    if (stats->pass == last_pass)
-      free(stats->buf.buf);
-  }
-}
-
-void stats_write(stats_io_t *stats, const void *pkt, size_t len) {
-  if (stats->file) {
-    (void) fwrite(pkt, 1, len, stats->file);
-  } else {
-    if (stats->buf.sz + len > stats->buf_alloc_sz) {
-      size_t  new_sz = stats->buf_alloc_sz + 64 * 1024;
-      char   *new_ptr = realloc(stats->buf.buf, new_sz);
-
-      if (new_ptr) {
-        stats->buf_ptr = new_ptr + (stats->buf_ptr - (char *)stats->buf.buf);
-        stats->buf.buf = new_ptr;
-        stats->buf_alloc_sz = new_sz;
-      } else
-        fatal("Failed to realloc firstpass stats buffer.");
-    }
-
-    memcpy(stats->buf_ptr, pkt, len);
-    stats->buf.sz += len;
-    stats->buf_ptr += len;
-  }
-}
-
-vpx_fixed_buf_t stats_get(stats_io_t *stats) {
-  return stats->buf;
-}
-
 enum video_file_type {
   FILE_TYPE_RAW,
   FILE_TYPE_IVF,
@@ -263,7 +145,6 @@
   int                   only_i420;
 };
 
-
 #define IVF_FRAME_HDR_SZ (4+8) /* 4 byte size + 8 byte timestamp */
 static int read_frame(struct input_state *input, vpx_image_t *img) {
   FILE *f = input->file;
@@ -478,22 +359,6 @@
   return h;
 }
 
-#include "math.h"
-#define MAX_PSNR 100
-static double vp8_mse2psnr(double Samples, double Peak, double Mse) {
-  double psnr;
-
-  if ((double)Mse > 0.0)
-    psnr = 10.0 * log10(Peak * Peak * Samples / Mse);
-  else
-    psnr = MAX_PSNR;      /* Limit to prevent / 0 */
-
-  if (psnr > MAX_PSNR)
-    psnr = MAX_PSNR;
-
-  return psnr;
-}
-
 
 #include "args.h"
 static const arg_def_t debugmode = ARG_DEF("D", "debug", 0,
diff --git a/vpxstats.c b/vpxstats.c
new file mode 100644
index 0000000..70cea3e
--- /dev/null
+++ b/vpxstats.c
@@ -0,0 +1,135 @@
+/*
+ *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vpxstats.h"
+
+#include <math.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "./tools_common.h"
+
+int stats_open_file(stats_io_t *stats, const char *fpf, int pass) {
+  int res;
+  stats->pass = pass;
+
+  if (pass == 0) {
+    stats->file = fopen(fpf, "wb");
+    stats->buf.sz = 0;
+    stats->buf.buf = NULL;
+    res = (stats->file != NULL);
+  } else {
+#if USE_POSIX_MMAP
+    struct stat stat_buf;
+    int fd;
+
+    fd = open(fpf, O_RDONLY);
+    stats->file = fdopen(fd, "rb");
+    fstat(fd, &stat_buf);
+    stats->buf.sz = stat_buf.st_size;
+    stats->buf.buf = mmap(NULL, stats->buf.sz, PROT_READ, MAP_PRIVATE, fd, 0);
+    res = (stats->buf.buf != NULL);
+#else
+    size_t nbytes;
+
+    stats->file = fopen(fpf, "rb");
+
+    if (fseek(stats->file, 0, SEEK_END))
+      fatal("First-pass stats file must be seekable!");
+
+    stats->buf.sz = stats->buf_alloc_sz = ftell(stats->file);
+    rewind(stats->file);
+
+    stats->buf.buf = malloc(stats->buf_alloc_sz);
+
+    if (!stats->buf.buf)
+      fatal("Failed to allocate first-pass stats buffer (%lu bytes)",
+            (unsigned int)stats->buf_alloc_sz);
+
+    nbytes = fread(stats->buf.buf, 1, stats->buf.sz, stats->file);
+    res = (nbytes == stats->buf.sz);
+#endif  /* USE_POSIX_MMAP */
+  }
+
+  return res;
+}
+
+int stats_open_mem(stats_io_t *stats, int pass) {
+  int res;
+  stats->pass = pass;
+
+  if (!pass) {
+    stats->buf.sz = 0;
+    stats->buf_alloc_sz = 64 * 1024;
+    stats->buf.buf = malloc(stats->buf_alloc_sz);
+  }
+
+  stats->buf_ptr = stats->buf.buf;
+  res = (stats->buf.buf != NULL);
+  return res;
+}
+
+void stats_close(stats_io_t *stats, int last_pass) {
+  if (stats->file) {
+    if (stats->pass == last_pass) {
+#if USE_POSIX_MMAP
+      munmap(stats->buf.buf, stats->buf.sz);
+#else
+      free(stats->buf.buf);
+#endif  /* USE_POSIX_MMAP */
+    }
+
+    fclose(stats->file);
+    stats->file = NULL;
+  } else {
+    if (stats->pass == last_pass)
+      free(stats->buf.buf);
+  }
+}
+
+void stats_write(stats_io_t *stats, const void *pkt, size_t len) {
+  if (stats->file) {
+    (void) fwrite(pkt, 1, len, stats->file);
+  } else {
+    if (stats->buf.sz + len > stats->buf_alloc_sz) {
+      size_t  new_sz = stats->buf_alloc_sz + 64 * 1024;
+      char   *new_ptr = realloc(stats->buf.buf, new_sz);
+
+      if (new_ptr) {
+        stats->buf_ptr = new_ptr + (stats->buf_ptr - (char *)stats->buf.buf);
+        stats->buf.buf = new_ptr;
+        stats->buf_alloc_sz = new_sz;
+      } else {
+        fatal("Failed to realloc firstpass stats buffer.");
+      }
+    }
+
+    memcpy(stats->buf_ptr, pkt, len);
+    stats->buf.sz += len;
+    stats->buf_ptr += len;
+  }
+}
+
+vpx_fixed_buf_t stats_get(stats_io_t *stats) {
+  return stats->buf;
+}
+
+double vp8_mse2psnr(double samples, double peak, double mse) {
+  const int kMaxPSNR = 100;
+  double psnr = kMaxPSNR;
+
+  if (mse > 0.0)
+    psnr = 10.0 * log10(peak * peak * samples / mse);
+
+  if (psnr > kMaxPSNR)
+    psnr = kMaxPSNR;
+
+  return psnr;
+}
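
A minimal two-pass usage sketch of the stats_io API added above, assuming an encoder loop that emits VPX_CODEC_STATS_PKT packets; the packet handling and pass bookkeeping shown here are illustrative rather than the exact vpxenc flow:

    /* Sketch only: pass 0 writes first-pass stats to fpf, pass 1 reads
     * them back and hands the buffer to the rate-control config.
     * Error handling trimmed. */
    #include "vpx/vpx_encoder.h"
    #include "./vpxstats.h"

    static void two_pass_sketch(const char *fpf, vpx_codec_enc_cfg_t *cfg) {
      stats_io_t stats;
      int pass;

      for (pass = 0; pass < 2; ++pass) {
        if (!stats_open_file(&stats, fpf, pass))
          return;  /* the real tool calls fatal() here */

        if (pass == 0) {
          cfg->g_pass = VPX_RC_FIRST_PASS;
          /* encode frames; for each VPX_CODEC_STATS_PKT packet pkt:
           *   stats_write(&stats, pkt->data.twopass_stats.buf,
           *               pkt->data.twopass_stats.sz);
           */
        } else {
          cfg->g_pass = VPX_RC_LAST_PASS;
          cfg->rc_twopass_stats_in = stats_get(&stats);
          /* encode frames again, writing compressed output */
        }

        stats_close(&stats, 1 /* index of the last pass */);
      }
    }
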
diff --git a/vpxstats.h b/vpxstats.h
new file mode 100644
index 0000000..18b3acd
--- /dev/null
+++ b/vpxstats.h
@@ -0,0 +1,37 @@
+/*
+ *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPXSTATS_H_
+#define VPXSTATS_H_
+
+#include <stdio.h>
+
+#include "vpx/vpx_encoder.h"
+
+/* This structure is used to abstract the different ways of handling
+ * first pass statistics
+ */
+typedef struct {
+  vpx_fixed_buf_t buf;
+  int pass;
+  FILE *file;
+  char *buf_ptr;
+  size_t buf_alloc_sz;
+} stats_io_t;
+
+int stats_open_file(stats_io_t *stats, const char *fpf, int pass);
+int stats_open_mem(stats_io_t *stats, int pass);
+void stats_close(stats_io_t *stats, int last_pass);
+void stats_write(stats_io_t *stats, const void *pkt, size_t len);
+vpx_fixed_buf_t stats_get(stats_io_t *stats);
+
+double vp8_mse2psnr(double samples, double peak, double mse);
+
+#endif  // VPXSTATS_H_
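
A note on the vp8_mse2psnr() helper declared above: despite the parameter name, the third argument is effectively a total squared-error sum over `samples` samples, so 10 * log10(peak * peak * samples / mse) reduces to the usual PSNR = 10 * log10(peak^2 / MSE), capped at 100 dB (kMaxPSNR). A small self-contained check with made-up numbers:

    /* Illustrative check of the PSNR formula, not part of the tool. */
    #include <math.h>
    #include <stdio.h>

    int main(void) {
      const double samples = 640.0 * 480.0;  /* pixels in one 640x480 frame */
      const double peak = 255.0;             /* 8-bit peak sample value */
      const double sse = 25.0 * samples;     /* assume MSE of 25 per pixel */
      const double psnr = 10.0 * log10(peak * peak * samples / sse);
      printf("PSNR = %.2f dB\n", psnr);      /* prints about 34.15 dB */
      return 0;
    }
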