Merge "Adding encode_loopfilter function." into experimental
diff --git a/build/make/Makefile b/build/make/Makefile
index 4ac5bcf..de71c61 100644
--- a/build/make/Makefile
+++ b/build/make/Makefile
@@ -103,6 +103,18 @@
 .PHONY: testdata
 testdata::
 
+# Add compiler flags for intrinsic files
+$(BUILD_PFX)%_mmx.c.d: CFLAGS += -mmmx
+$(BUILD_PFX)%_mmx.c.o: CFLAGS += -mmmx
+$(BUILD_PFX)%_sse2.c.d: CFLAGS += -msse2
+$(BUILD_PFX)%_sse2.c.o: CFLAGS += -msse2
+$(BUILD_PFX)%_sse3.c.d: CFLAGS += -msse3
+$(BUILD_PFX)%_sse3.c.o: CFLAGS += -msse3
+$(BUILD_PFX)%_ssse3.c.d: CFLAGS += -mssse3
+$(BUILD_PFX)%_ssse3.c.o: CFLAGS += -mssse3
+$(BUILD_PFX)%_sse4.c.d: CFLAGS += -msse4.1
+$(BUILD_PFX)%_sse4.c.o: CFLAGS += -msse4.1
+
 $(BUILD_PFX)%.c.d: %.c
 	$(if $(quiet),@echo "    [DEP] $@")
 	$(qexec)mkdir -p $(dir $@)
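
The new pattern rules above attach an ISA flag only to objects whose source names carry the matching `_mmx`/`_sse2`/.../`_sse4` suffix, so an intrinsics translation unit can be built with, say, -msse2 without raising the baseline for the rest of the tree. A minimal sketch of the kind of file this targets (the file and function names are illustrative, not from libvpx, and real targets are C sources matching %_sse2.c):

    // hypothetical_add_sse2.cc -- would need the -msse2 flag the rules add.
    #include <emmintrin.h>   // SSE2 intrinsics
    #include <cstdint>

    // Saturating add of 16 bytes per iteration; assumes n is a multiple of 16.
    void add_bytes_sse2(const uint8_t *a, const uint8_t *b, uint8_t *dst, int n) {
      for (int i = 0; i < n; i += 16) {
        const __m128i va = _mm_loadu_si128(reinterpret_cast<const __m128i *>(a + i));
        const __m128i vb = _mm_loadu_si128(reinterpret_cast<const __m128i *>(b + i));
        _mm_storeu_si128(reinterpret_cast<__m128i *>(dst + i), _mm_adds_epu8(va, vb));
      }
    }

    int main() {
      uint8_t a[16] = {250}, b[16] = {10}, out[16];
      add_bytes_sse2(a, b, out, 16);
      return out[0] == 255 ? 0 : 1;  // 250 + 10 saturates to 255
    }
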
diff --git a/build/make/configure.sh b/build/make/configure.sh
index ac86d50..c50ef58 100755
--- a/build/make/configure.sh
+++ b/build/make/configure.sh
@@ -265,12 +265,13 @@
 fi
 TMP_H="${TMPDIRx}/vpx-conf-$$-${RANDOM}.h"
 TMP_C="${TMPDIRx}/vpx-conf-$$-${RANDOM}.c"
+TMP_CC="${TMPDIRx}/vpx-conf-$$-${RANDOM}.cc"
 TMP_O="${TMPDIRx}/vpx-conf-$$-${RANDOM}.o"
 TMP_X="${TMPDIRx}/vpx-conf-$$-${RANDOM}.x"
 TMP_ASM="${TMPDIRx}/vpx-conf-$$-${RANDOM}.asm"
 
 clean_temp_files() {
-    rm -f ${TMP_C} ${TMP_H} ${TMP_O} ${TMP_X} ${TMP_ASM}
+    rm -f ${TMP_C} ${TMP_CC} ${TMP_H} ${TMP_O} ${TMP_X} ${TMP_ASM}
 }
 
 #
@@ -291,9 +292,9 @@
 
 check_cxx() {
     log check_cxx "$@"
-    cat >${TMP_C}
-    log_file ${TMP_C}
-    check_cmd ${CXX} ${CXXFLAGS} "$@" -c -o ${TMP_O} ${TMP_C}
+    cat >${TMP_CC}
+    log_file ${TMP_CC}
+    check_cmd ${CXX} ${CXXFLAGS} "$@" -c -o ${TMP_O} ${TMP_CC}
 }
 
 check_cpp() {
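
check_cxx now writes its probe into a .cc temporary instead of reusing the .c one, since most compiler drivers choose the front end from the file extension; feeding C++ through a .c name can make the probe silently compile as C or fail outright. A probe of roughly this shape (mine, not taken from configure) only succeeds when the file is genuinely treated as C++:

    // Valid C++ but not valid C: compiles only when the driver runs its
    // C++ front end, which it typically selects from the .cc extension.
    class Probe { public: virtual ~Probe() {} };
    template <typename T> T twice(T v) { return v + v; }
    int main() { Probe p; (void)p; return twice(2) - 4; }
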
diff --git a/configure b/configure
index a799be0..372e259 100755
--- a/configure
+++ b/configure
@@ -249,7 +249,7 @@
     oneshotq
     multiple_arf
     code_zerogroup
-    sb8x8
+    non420
 "
 CONFIG_LIST="
     external_build
@@ -601,7 +601,10 @@
         check_add_cflags -Wimplicit-function-declaration
         check_add_cflags -Wuninitialized
         check_add_cflags -Wunused-variable
-        check_add_cflags -Wunused-but-set-variable
+        case ${CC} in
+          *clang*) ;;
+          *) check_add_cflags -Wunused-but-set-variable ;;
+        esac
         enabled extra_warnings || check_add_cflags -Wno-unused-function
     fi
 
diff --git a/test/convolve_test.cc b/test/convolve_test.cc
index a8139cb..151a38b 100644
--- a/test/convolve_test.cc
+++ b/test/convolve_test.cc
@@ -8,6 +8,10 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
+#include "test/acm_random.h"
+#include "test/register_state_check.h"
+#include "test/util.h"
+#include "third_party/googletest/src/include/gtest/gtest.h"
 
 extern "C" {
 #include "./vpx_config.h"
@@ -16,10 +20,6 @@
 #include "vpx_mem/vpx_mem.h"
 #include "vpx_ports/mem.h"
 }
-#include "third_party/googletest/src/include/gtest/gtest.h"
-#include "test/acm_random.h"
-#include "test/register_state_check.h"
-#include "test/util.h"
 
 namespace {
 typedef void (*convolve_fn_t)(const uint8_t *src, int src_stride,
@@ -46,27 +46,27 @@
 // Reference 8-tap subpixel filter, slightly modified to fit into this test.
 #define VP9_FILTER_WEIGHT 128
 #define VP9_FILTER_SHIFT 7
-static uint8_t clip_pixel(int x) {
+uint8_t clip_pixel(int x) {
   return x < 0 ? 0 :
          x > 255 ? 255 :
          x;
 }
 
-static void filter_block2d_8_c(const uint8_t *src_ptr,
-                               const unsigned int src_stride,
-                               const int16_t *HFilter,
-                               const int16_t *VFilter,
-                               uint8_t *dst_ptr,
-                               unsigned int dst_stride,
-                               unsigned int output_width,
-                               unsigned int output_height) {
+void filter_block2d_8_c(const uint8_t *src_ptr,
+                        const unsigned int src_stride,
+                        const int16_t *HFilter,
+                        const int16_t *VFilter,
+                        uint8_t *dst_ptr,
+                        unsigned int dst_stride,
+                        unsigned int output_width,
+                        unsigned int output_height) {
   // Between passes, we use an intermediate buffer whose height is extended to
   // have enough horizontally filtered values as input for the vertical pass.
   // This buffer is allocated to be big enough for the largest block type we
   // support.
   const int kInterp_Extend = 4;
   const unsigned int intermediate_height =
-    (kInterp_Extend - 1) + output_height + kInterp_Extend;
+      (kInterp_Extend - 1) + output_height + kInterp_Extend;
 
   /* Size of intermediate_buffer is max_intermediate_height * filter_max_width,
    * where max_intermediate_height = (kInterp_Extend - 1) + filter_max_height
@@ -87,15 +87,15 @@
     for (i = 0; i < intermediate_height; ++i) {
       for (j = 0; j < output_width; ++j) {
         // Apply filter...
-        int temp = ((int)src_ptr[0] * HFilter[0]) +
-                   ((int)src_ptr[1] * HFilter[1]) +
-                   ((int)src_ptr[2] * HFilter[2]) +
-                   ((int)src_ptr[3] * HFilter[3]) +
-                   ((int)src_ptr[4] * HFilter[4]) +
-                   ((int)src_ptr[5] * HFilter[5]) +
-                   ((int)src_ptr[6] * HFilter[6]) +
-                   ((int)src_ptr[7] * HFilter[7]) +
-                   (VP9_FILTER_WEIGHT >> 1);  // Rounding
+        const int temp = (src_ptr[0] * HFilter[0]) +
+                         (src_ptr[1] * HFilter[1]) +
+                         (src_ptr[2] * HFilter[2]) +
+                         (src_ptr[3] * HFilter[3]) +
+                         (src_ptr[4] * HFilter[4]) +
+                         (src_ptr[5] * HFilter[5]) +
+                         (src_ptr[6] * HFilter[6]) +
+                         (src_ptr[7] * HFilter[7]) +
+                         (VP9_FILTER_WEIGHT >> 1);  // Rounding
 
         // Normalize back to 0-255...
         *output_ptr = clip_pixel(temp >> VP9_FILTER_SHIFT);
@@ -115,15 +115,15 @@
     for (i = 0; i < output_height; ++i) {
       for (j = 0; j < output_width; ++j) {
         // Apply filter...
-        int temp = ((int)src_ptr[0] * VFilter[0]) +
-                   ((int)src_ptr[1] * VFilter[1]) +
-                   ((int)src_ptr[2] * VFilter[2]) +
-                   ((int)src_ptr[3] * VFilter[3]) +
-                   ((int)src_ptr[4] * VFilter[4]) +
-                   ((int)src_ptr[5] * VFilter[5]) +
-                   ((int)src_ptr[6] * VFilter[6]) +
-                   ((int)src_ptr[7] * VFilter[7]) +
-                   (VP9_FILTER_WEIGHT >> 1);  // Rounding
+        const int temp = (src_ptr[0] * VFilter[0]) +
+                         (src_ptr[1] * VFilter[1]) +
+                         (src_ptr[2] * VFilter[2]) +
+                         (src_ptr[3] * VFilter[3]) +
+                         (src_ptr[4] * VFilter[4]) +
+                         (src_ptr[5] * VFilter[5]) +
+                         (src_ptr[6] * VFilter[6]) +
+                         (src_ptr[7] * VFilter[7]) +
+                         (VP9_FILTER_WEIGHT >> 1);  // Rounding
 
         // Normalize back to 0-255...
         *dst_ptr++ = clip_pixel(temp >> VP9_FILTER_SHIFT);
@@ -135,12 +135,12 @@
   }
 }
 
-static void block2d_average_c(uint8_t *src,
-                              unsigned int src_stride,
-                              uint8_t *output_ptr,
-                              unsigned int output_stride,
-                              unsigned int output_width,
-                              unsigned int output_height) {
+void block2d_average_c(uint8_t *src,
+                       unsigned int src_stride,
+                       uint8_t *output_ptr,
+                       unsigned int output_stride,
+                       unsigned int output_width,
+                       unsigned int output_height) {
   unsigned int i, j;
   for (i = 0; i < output_height; ++i) {
     for (j = 0; j < output_width; ++j) {
@@ -150,15 +150,15 @@
   }
 }
 
-static void filter_average_block2d_8_c(const uint8_t *src_ptr,
-                                       const unsigned int src_stride,
-                                       const int16_t *HFilter,
-                                       const int16_t *VFilter,
-                                       uint8_t *dst_ptr,
-                                       unsigned int dst_stride,
-                                       unsigned int output_width,
-                                       unsigned int output_height) {
-  uint8_t tmp[64*64];
+void filter_average_block2d_8_c(const uint8_t *src_ptr,
+                                const unsigned int src_stride,
+                                const int16_t *HFilter,
+                                const int16_t *VFilter,
+                                uint8_t *dst_ptr,
+                                unsigned int dst_stride,
+                                unsigned int output_width,
+                                unsigned int output_height) {
+  uint8_t tmp[64 * 64];
 
   assert(output_width <= 64);
   assert(output_height <= 64);
@@ -173,10 +173,9 @@
   static void SetUpTestCase() {
     // Force input_ to be unaligned, output to be 16 byte aligned.
     input_ = reinterpret_cast<uint8_t*>(
-        vpx_memalign(kDataAlignment, kOuterBlockSize * kOuterBlockSize + 1))
-        + 1;
+        vpx_memalign(kDataAlignment, kInputBufferSize + 1)) + 1;
     output_ = reinterpret_cast<uint8_t*>(
-        vpx_memalign(kDataAlignment, kOuterBlockSize * kOuterBlockSize));
+        vpx_memalign(kDataAlignment, kOutputBufferSize));
   }
 
   static void TearDownTestCase() {
@@ -186,62 +185,63 @@
     output_ = NULL;
   }
 
-  protected:
-    static const int kDataAlignment = 16;
-    static const int kOuterBlockSize = 128;
-    static const int kInputStride = kOuterBlockSize;
-    static const int kOutputStride = kOuterBlockSize;
-    static const int kMaxDimension = 64;
+ protected:
+  static const int kDataAlignment = 16;
+  static const int kOuterBlockSize = 128;
+  static const int kInputStride = kOuterBlockSize;
+  static const int kOutputStride = kOuterBlockSize;
+  static const int kMaxDimension = 64;
+  static const int kInputBufferSize = kOuterBlockSize * kOuterBlockSize;
+  static const int kOutputBufferSize = kOuterBlockSize * kOuterBlockSize;
 
-    int Width() const { return GET_PARAM(0); }
-    int Height() const { return GET_PARAM(1); }
-    int BorderLeft() const {
-      const int center = (kOuterBlockSize - Width()) / 2;
-      return (center + (kDataAlignment - 1)) & ~(kDataAlignment - 1);
-    }
-    int BorderTop() const { return (kOuterBlockSize - Height()) / 2; }
+  int Width() const { return GET_PARAM(0); }
+  int Height() const { return GET_PARAM(1); }
+  int BorderLeft() const {
+    const int center = (kOuterBlockSize - Width()) / 2;
+    return (center + (kDataAlignment - 1)) & ~(kDataAlignment - 1);
+  }
+  int BorderTop() const { return (kOuterBlockSize - Height()) / 2; }
 
-    bool IsIndexInBorder(int i) {
-      return (i < BorderTop() * kOuterBlockSize ||
-              i >= (BorderTop() + Height()) * kOuterBlockSize ||
-              i % kOuterBlockSize < BorderLeft() ||
-              i % kOuterBlockSize >= (BorderLeft() + Width()));
+  bool IsIndexInBorder(int i) {
+    return (i < BorderTop() * kOuterBlockSize ||
+            i >= (BorderTop() + Height()) * kOuterBlockSize ||
+            i % kOuterBlockSize < BorderLeft() ||
+            i % kOuterBlockSize >= (BorderLeft() + Width()));
+  }
+
+  virtual void SetUp() {
+    UUT_ = GET_PARAM(2);
+    /* Set up guard blocks for an inner block centered in the outer block */
+    for (int i = 0; i < kOutputBufferSize; ++i) {
+      if (IsIndexInBorder(i))
+        output_[i] = 255;
+      else
+        output_[i] = 0;
     }
 
-    virtual void SetUp() {
-      UUT_ = GET_PARAM(2);
-      memset(input_, 0, sizeof(input_));
-      /* Set up guard blocks for an inner block cetered in the outer block */
-      for (int i = 0; i < kOuterBlockSize * kOuterBlockSize; ++i) {
-        if (IsIndexInBorder(i))
-          output_[i] = 255;
-        else
-          output_[i] = 0;
-      }
+    ::libvpx_test::ACMRandom prng;
+    for (int i = 0; i < kInputBufferSize; ++i)
+      input_[i] = prng.Rand8Extremes();
+  }
 
-      ::libvpx_test::ACMRandom prng;
-      for (int i = 0; i < kOuterBlockSize * kOuterBlockSize; ++i)
-        input_[i] = prng.Rand8Extremes();
+  void CheckGuardBlocks() {
+    for (int i = 0; i < kOutputBufferSize; ++i) {
+      if (IsIndexInBorder(i))
+        EXPECT_EQ(255, output_[i]);
     }
+  }
 
-    void CheckGuardBlocks() {
-      for (int i = 0; i < kOuterBlockSize * kOuterBlockSize; ++i) {
-        if (IsIndexInBorder(i))
-          EXPECT_EQ(255, output_[i]);
-      }
-    }
+  uint8_t* input() const {
+    return input_ + BorderTop() * kOuterBlockSize + BorderLeft();
+  }
 
-    uint8_t* input() {
-      return input_ + BorderTop() * kOuterBlockSize + BorderLeft();
-    }
+  uint8_t* output() const {
+    return output_ + BorderTop() * kOuterBlockSize + BorderLeft();
+  }
 
-    uint8_t* output() {
-      return output_ + BorderTop() * kOuterBlockSize + BorderLeft();
-    }
-
-    const ConvolveFunctions* UUT_;
-    static uint8_t* input_;
-    static uint8_t* output_;
+  const ConvolveFunctions* UUT_;
+  static uint8_t* input_;
+  static uint8_t* output_;
 };
 uint8_t* ConvolveTest::input_ = NULL;
 uint8_t* ConvolveTest::output_ = NULL;
@@ -309,7 +309,7 @@
   vp9_sub_pel_filters_8lp
 };
 const int kNumFilterBanks = sizeof(kTestFilterList) /
-    sizeof(kTestFilterList[0]);
+                            sizeof(kTestFilterList[0]);
 const int kNumFilters = 16;
 
 TEST(ConvolveTest, FiltersWontSaturateWhenAddedPairwise) {
@@ -525,7 +525,6 @@
     make_tuple(64, 32, &convolve8_c),
     make_tuple(32, 64, &convolve8_c),
     make_tuple(64, 64, &convolve8_c)));
-}
 
 #if HAVE_SSSE3
 const ConvolveFunctions convolve8_ssse3(
@@ -548,3 +547,4 @@
     make_tuple(32, 64, &convolve8_ssse3),
     make_tuple(64, 64, &convolve8_ssse3)));
 #endif
+}  // namespace
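
The reshuffled fixture keeps the same guard-block idea: the Width() x Height() block under test sits centered in a 128x128 outer buffer, the surrounding border is primed with 255 before each run, and CheckGuardBlocks() verifies afterwards that the border is untouched, which catches convolution code writing outside its destination block. A stripped-down, self-contained sketch of the mechanism (sizes and the plain asserts are mine, not the fixture's):

    #include <cassert>
    #include <cstdint>
    #include <cstring>
    #include <vector>

    // Minimal guard-block harness: 0 marks the writable inner region,
    // 255 marks the border the code under test must never touch.
    int main() {
      const int kOuter = 32, kWidth = 8, kHeight = 8;
      const int left = (kOuter - kWidth) / 2, top = (kOuter - kHeight) / 2;
      std::vector<uint8_t> buf(kOuter * kOuter, 255);
      for (int y = 0; y < kHeight; ++y)
        memset(&buf[(top + y) * kOuter + left], 0, kWidth);

      // ... run the function under test on &buf[top * kOuter + left] here ...

      for (int i = 0; i < kOuter * kOuter; ++i) {
        const bool in_border = i / kOuter < top || i / kOuter >= top + kHeight ||
                               i % kOuter < left || i % kOuter >= left + kWidth;
        if (in_border) assert(buf[i] == 255);  // border must be untouched
      }
      return 0;
    }
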
diff --git a/test/encode_test_driver.h b/test/encode_test_driver.h
index 5a37816..6aeb96b 100644
--- a/test/encode_test_driver.h
+++ b/test/encode_test_driver.h
@@ -10,9 +10,10 @@
 #ifndef TEST_ENCODE_TEST_DRIVER_H_
 #define TEST_ENCODE_TEST_DRIVER_H_
 
-#include "./vpx_config.h"
 #include <string>
 #include <vector>
+
+#include "./vpx_config.h"
 #include "third_party/googletest/src/include/gtest/gtest.h"
 #include "vpx/vpx_encoder.h"
 
@@ -46,7 +47,7 @@
 class CxDataIterator {
  public:
   explicit CxDataIterator(vpx_codec_ctx_t *encoder)
-    : encoder_(encoder), iter_(NULL) {}
+      : encoder_(encoder), iter_(NULL) {}
 
   const vpx_codec_cx_pkt_t *Next() {
     return vpx_codec_get_cx_data(encoder_, &iter_);
@@ -92,7 +93,7 @@
     memset(&encoder_, 0, sizeof(encoder_));
   }
 
-  ~Encoder() {
+  virtual ~Encoder() {
     vpx_codec_destroy(&encoder_);
   }
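
Making ~Encoder() virtual matters because the test code derives codec-specific encoders from Encoder and may destroy them through a base pointer; without a virtual destructor that is undefined behaviour and the derived class's cleanup never runs. The general shape, with made-up class names:

    #include <cstdio>

    struct Base {
      virtual ~Base() { printf("~Base\n"); }          // virtual: derived dtor runs too
    };

    struct Derived : Base {
      ~Derived() override { printf("~Derived\n"); }   // skipped if ~Base were non-virtual
    };

    int main() {
      Base *p = new Derived;
      delete p;   // calls ~Derived then ~Base; UB without the virtual destructor
      return 0;
    }
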
 
diff --git a/test/error_resilience_test.cc b/test/error_resilience_test.cc
index 1eee0f5..deacbc0 100644
--- a/test/error_resilience_test.cc
+++ b/test/error_resilience_test.cc
@@ -182,7 +182,7 @@
 TEST_P(ErrorResilienceTest, DropFramesWithoutRecovery) {
   const vpx_rational timebase = { 33333333, 1000000000 };
   cfg_.g_timebase = timebase;
-  cfg_.rc_target_bitrate = 500;
+  cfg_.rc_target_bitrate = 1500;
 
   init_flags_ = VPX_CODEC_USE_PSNR;
 
diff --git a/test/superframe_test.cc b/test/superframe_test.cc
index 52faddb..062ec6c 100644
--- a/test/superframe_test.cc
+++ b/test/superframe_test.cc
@@ -30,7 +30,7 @@
   }
 
   virtual void TearDown() {
-    delete modified_buf_;
+    delete[] modified_buf_;
   }
 
   virtual bool Continue() const {
@@ -59,7 +59,7 @@
         buffer[pkt->data.frame.sz - index_sz] == marker) {
       // frame is a superframe. strip off the index.
       if (modified_buf_)
-        delete modified_buf_;
+        delete[] modified_buf_;
       modified_buf_ = new uint8_t[pkt->data.frame.sz - index_sz];
       memcpy(modified_buf_, pkt->data.frame.buf,
              pkt->data.frame.sz - index_sz);
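
modified_buf_ is allocated with new uint8_t[...], so it must be released with delete[]; plain delete on an array is undefined behaviour. The pairing in isolation:

    #include <cstdint>

    int main() {
      uint8_t *buf = new uint8_t[64];  // array new ...
      delete[] buf;                    // ... must be matched by array delete, not delete
      return 0;
    }
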
diff --git a/test/tile_independence_test.cc b/test/tile_independence_test.cc
index 711d0bd..9633ed7 100644
--- a/test/tile_independence_test.cc
+++ b/test/tile_independence_test.cc
@@ -56,7 +56,13 @@
 
   void UpdateMD5(::libvpx_test::Decoder *dec, const vpx_codec_cx_pkt_t *pkt,
                  ::libvpx_test::MD5 *md5) {
-    dec->DecodeFrame((uint8_t *) pkt->data.frame.buf, pkt->data.frame.sz);
+    const vpx_codec_err_t res =
+        dec->DecodeFrame(reinterpret_cast<uint8_t*>(pkt->data.frame.buf),
+                         pkt->data.frame.sz);
+    if (res != VPX_CODEC_OK) {
+      abort_ = true;
+      ASSERT_EQ(VPX_CODEC_OK, res);
+    }
     const vpx_image_t *img = dec->GetDxData().Next();
     md5->Add(img);
   }
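
UpdateMD5() now checks the decoder's return status and aborts the test instead of blindly pulling a frame out of a failed decode, which previously turned a decode error into a crash or a bogus MD5. A minimal sketch of that guard pattern outside gtest (all types and names hypothetical):

    #include <cstddef>
    #include <cstdio>

    enum Status { kOk = 0, kError = 1 };

    // Stand-in for a decode call; real code would hand the data to a codec.
    static Status DecodeFrame(const void *buf, size_t size) {
      return (buf != nullptr && size > 0) ? kOk : kError;
    }

    // Returns false (and skips the hash update) when decoding fails,
    // mirroring the abort_/ASSERT_EQ early-out above.
    static bool DecodeAndHash(const void *buf, size_t size) {
      if (DecodeFrame(buf, size) != kOk) {
        fprintf(stderr, "decode failed, aborting test\n");
        return false;
      }
      // ... fetch the decoded image and feed it to the MD5 here ...
      return true;
    }

    int main() { return DecodeAndHash("frame", 5) ? 0 : 1; }
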
diff --git a/tools/cpplint.py b/tools/cpplint.py
index 526b955..159dbbb 100755
--- a/tools/cpplint.py
+++ b/tools/cpplint.py
@@ -53,12 +53,8 @@
 #  - Check for 0 in char context (should be '\0')
 #  - Check for camel-case method name conventions for methods
 #    that are not simple inline getters and setters
-#  - Check that base classes have virtual destructors
-#    put "  // namespace" after } that closes a namespace, with
-#    namespace's name after 'namespace' if it is named.
 #  - Do not indent namespace contents
 #  - Avoid inlining non-trivial constructors in header files
-#    include base/basictypes.h if DISALLOW_EVIL_CONSTRUCTORS is used
 #  - Check for old-school (void) cast for call-sites of functions
 #    ignored return value
 #  - Check gUnit usage of anonymous namespace
@@ -80,6 +76,7 @@
 """
 
 import codecs
+import copy
 import getopt
 import math  # for log
 import os
@@ -139,6 +136,22 @@
       the top-level categories like 'build' and 'whitespace' will
       also be printed. If 'detailed' is provided, then a count
       is provided for each category like 'build/class'.
+
+    root=subdir
+      The root directory used for deriving header guard CPP variable.
+      By default, the header guard CPP variable is calculated as the relative
+      path to the directory that contains .git, .hg, or .svn.  When this flag
+      is specified, the relative path is calculated from the specified
+      directory. If the specified directory does not exist, this flag is
+      ignored.
+
+      Examples:
+        Assuming that src/.git exists, the header guard CPP variables for
+        src/chrome/browser/ui/browser.h are:
+
+        No flag => CHROME_BROWSER_UI_BROWSER_H_
+        --root=chrome => BROWSER_UI_BROWSER_H_
+        --root=chrome/browser => UI_BROWSER_H_
 """
 
 # We categorize each error message we print.  Here are the categories.
@@ -161,6 +174,7 @@
   'build/printf_format',
   'build/storage_class',
   'legal/copyright',
+  'readability/alt_tokens',
   'readability/braces',
   'readability/casting',
   'readability/check',
@@ -169,6 +183,7 @@
   'readability/function',
   'readability/multiline_comment',
   'readability/multiline_string',
+  'readability/namespace',
   'readability/nolint',
   'readability/streams',
   'readability/todo',
@@ -189,13 +204,14 @@
   'runtime/sizeof',
   'runtime/string',
   'runtime/threadsafe_fn',
-  'runtime/virtual',
   'whitespace/blank_line',
   'whitespace/braces',
   'whitespace/comma',
   'whitespace/comments',
+  'whitespace/empty_loop_body',
   'whitespace/end_of_line',
   'whitespace/ending_newline',
+  'whitespace/forcolon',
   'whitespace/indent',
   'whitespace/labels',
   'whitespace/line_length',
@@ -278,6 +294,34 @@
   _CHECK_REPLACEMENT['EXPECT_FALSE_M'][op] = 'EXPECT_%s_M' % inv_replacement
   _CHECK_REPLACEMENT['ASSERT_FALSE_M'][op] = 'ASSERT_%s_M' % inv_replacement
 
+# Alternative tokens and their replacements.  For full list, see section 2.5
+# Alternative tokens [lex.digraph] in the C++ standard.
+#
+# Digraphs (such as '%:') are not included here since it's a mess to
+# match those on a word boundary.
+_ALT_TOKEN_REPLACEMENT = {
+    'and': '&&',
+    'bitor': '|',
+    'or': '||',
+    'xor': '^',
+    'compl': '~',
+    'bitand': '&',
+    'and_eq': '&=',
+    'or_eq': '|=',
+    'xor_eq': '^=',
+    'not': '!',
+    'not_eq': '!='
+    }
+
+# Compile regular expression that matches all the above keywords.  The "[ =()]"
+# bit is meant to avoid matching these keywords outside of boolean expressions.
+#
+# False positives include C-style multi-line comments (http://go/nsiut )
+# and multi-line strings (http://go/beujw ), but those have always been
+# troublesome for cpplint.
+_ALT_TOKEN_REPLACEMENT_PATTERN = re.compile(
+    r'[ =()](' + ('|'.join(_ALT_TOKEN_REPLACEMENT.keys())) + r')(?=[ (]|$)')
+
 
 # These constants define types of headers for use with
 # _IncludeState.CheckNextIncludeOrder().
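
The readability/alt_tokens category added in this hunk targets the ISO C++ alternative operator spellings listed in _ALT_TOKEN_REPLACEMENT; they are legal but rarely intended in this style of code. Roughly, the first test below would be flagged and the second accepted (example mine, not from cpplint's test suite):

    #include <cstdio>

    int main() {
      const bool a = true, b = false;
      if (a and not b) printf("alt tokens\n");   // flagged: 'and', 'not'
      if (a && !b) printf("plain operators\n");  // preferred spellings
      return 0;
    }
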
@@ -287,6 +331,17 @@
 _POSSIBLE_MY_HEADER = 4
 _OTHER_HEADER = 5
 
+# These constants define the current inline assembly state
+_NO_ASM = 0       # Outside of inline assembly block
+_INSIDE_ASM = 1   # Inside inline assembly block
+_END_ASM = 2      # Last line of inline assembly block
+_BLOCK_ASM = 3    # The whole block is an inline assembly block
+
+# Match start of assembly blocks
+_MATCH_ASM = re.compile(r'^\s*(?:asm|_asm|__asm|__asm__)'
+                        r'(?:\s+(volatile|__volatile__))?'
+                        r'\s*[{(]')
+
 
 _regexp_compile_cache = {}
 
@@ -297,6 +352,10 @@
 # on which those errors are expected and should be suppressed.
 _error_suppressions = {}
 
+# The root directory used for deriving header guard CPP variable.
+# This is set by --root flag.
+_root = None
+
 def ParseNolintSuppressions(filename, raw_line, linenum, error):
   """Updates the global list of error-suppressions.
 
@@ -925,7 +984,7 @@
 
   1) elided member contains lines without strings and comments,
   2) lines member contains lines without comments, and
-  3) raw member contains all the lines without processing.
+  3) raw_lines member contains all the lines without processing.
   All these three members are of <type 'list'>, and of the same length.
   """
 
@@ -965,6 +1024,29 @@
     return elided
 
 
+def FindEndOfExpressionInLine(line, startpos, depth, startchar, endchar):
+  """Find the position just after the matching endchar.
+
+  Args:
+    line: a CleansedLines line.
+    startpos: start searching at this position.
+    depth: nesting level at startpos.
+    startchar: expression opening character.
+    endchar: expression closing character.
+
+  Returns:
+    Index just after endchar.
+  """
+  for i in xrange(startpos, len(line)):
+    if line[i] == startchar:
+      depth += 1
+    elif line[i] == endchar:
+      depth -= 1
+      if depth == 0:
+        return i + 1
+  return -1
+
+
 def CloseExpression(clean_lines, linenum, pos):
   """If input points to ( or { or [, finds the position that closes it.
 
@@ -991,18 +1073,23 @@
   if startchar == '[': endchar = ']'
   if startchar == '{': endchar = '}'
 
-  num_open = line.count(startchar) - line.count(endchar)
-  while linenum < clean_lines.NumLines() and num_open > 0:
+  # Check first line
+  end_pos = FindEndOfExpressionInLine(line, pos, 0, startchar, endchar)
+  if end_pos > -1:
+    return (line, linenum, end_pos)
+  tail = line[pos:]
+  num_open = tail.count(startchar) - tail.count(endchar)
+  while linenum < clean_lines.NumLines() - 1:
     linenum += 1
     line = clean_lines.elided[linenum]
-    num_open += line.count(startchar) - line.count(endchar)
-  # OK, now find the endchar that actually got us back to even
-  endpos = len(line)
-  while num_open >= 0:
-    endpos = line.rfind(')', 0, endpos)
-    num_open -= 1                 # chopped off another )
-  return (line, linenum, endpos + 1)
+    delta = line.count(startchar) - line.count(endchar)
+    if num_open + delta <= 0:
+      return (line, linenum,
+              FindEndOfExpressionInLine(line, 0, num_open, startchar, endchar))
+    num_open += delta
 
+  # Did not find endchar before end of file, give up
+  return (line, clean_lines.NumLines(), -1)
 
 def CheckForCopyright(filename, lines, error):
   """Logs an error if no Copyright message appears at the top of the file."""
@@ -1032,9 +1119,13 @@
   # Restores original filename in case that cpplint is invoked from Emacs's
   # flymake.
   filename = re.sub(r'_flymake\.h$', '.h', filename)
+  filename = re.sub(r'/\.flymake/([^/]*)$', r'/\1', filename)
 
   fileinfo = FileInfo(filename)
-  return re.sub(r'[-./\s]', '_', fileinfo.RepositoryName()).upper() + '_'
+  file_path_from_root = fileinfo.RepositoryName()
+  if _root:
+    file_path_from_root = re.sub('^' + _root + os.sep, '', file_path_from_root)
+  return re.sub(r'[-./\s]', '_', file_path_from_root).upper() + '_'
 
 
 def CheckForHeaderGuard(filename, lines, error):
@@ -1259,17 +1350,55 @@
           'Changing pointer instead of value (or unused value of operator*).')
 
 
-class _ClassInfo(object):
+class _BlockInfo(object):
+  """Stores information about a generic block of code."""
+
+  def __init__(self, seen_open_brace):
+    self.seen_open_brace = seen_open_brace
+    self.open_parentheses = 0
+    self.inline_asm = _NO_ASM
+
+  def CheckBegin(self, filename, clean_lines, linenum, error):
+    """Run checks that applies to text up to the opening brace.
+
+    This is mostly for checking the text after the class identifier
+    and the "{", usually where the base class is specified.  For other
+    blocks, there isn't much to check, so we always pass.
+
+    Args:
+      filename: The name of the current file.
+      clean_lines: A CleansedLines instance containing the file.
+      linenum: The number of the line to check.
+      error: The function to call with any errors found.
+    """
+    pass
+
+  def CheckEnd(self, filename, clean_lines, linenum, error):
+    """Run checks that applies to text after the closing brace.
+
+    This is mostly used for checking end of namespace comments.
+
+    Args:
+      filename: The name of the current file.
+      clean_lines: A CleansedLines instance containing the file.
+      linenum: The number of the line to check.
+      error: The function to call with any errors found.
+    """
+    pass
+
+
+class _ClassInfo(_BlockInfo):
   """Stores information about a class."""
 
-  def __init__(self, name, clean_lines, linenum):
+  def __init__(self, name, class_or_struct, clean_lines, linenum):
+    _BlockInfo.__init__(self, False)
     self.name = name
-    self.linenum = linenum
-    self.seen_open_brace = False
+    self.starting_linenum = linenum
     self.is_derived = False
-    self.virtual_method_linenumber = None
-    self.has_virtual_destructor = False
-    self.brace_depth = 0
+    if class_or_struct == 'struct':
+      self.access = 'public'
+    else:
+      self.access = 'private'
 
     # Try to find the end of the class.  This will be confused by things like:
     #   class A {
@@ -1279,26 +1408,324 @@
     self.last_line = 0
     depth = 0
     for i in range(linenum, clean_lines.NumLines()):
-      line = clean_lines.lines[i]
+      line = clean_lines.elided[i]
       depth += line.count('{') - line.count('}')
       if not depth:
         self.last_line = i
         break
 
+  def CheckBegin(self, filename, clean_lines, linenum, error):
+    # Look for a bare ':'
+    if Search('(^|[^:]):($|[^:])', clean_lines.elided[linenum]):
+      self.is_derived = True
 
-class _ClassState(object):
-  """Holds the current state of the parse relating to class declarations.
 
-  It maintains a stack of _ClassInfos representing the parser's guess
-  as to the current nesting of class declarations. The innermost class
-  is at the top (back) of the stack. Typically, the stack will either
-  be empty or have exactly one entry.
-  """
+class _NamespaceInfo(_BlockInfo):
+  """Stores information about a namespace."""
+
+  def __init__(self, name, linenum):
+    _BlockInfo.__init__(self, False)
+    self.name = name or ''
+    self.starting_linenum = linenum
+
+  def CheckEnd(self, filename, clean_lines, linenum, error):
+    """Check end of namespace comments."""
+    line = clean_lines.raw_lines[linenum]
+
+    # Check how many lines are enclosed in this namespace.  Don't issue
+    # warning for missing namespace comments if there aren't enough
+    # lines.  However, do apply checks if there is already an end of
+    # namespace comment and it's incorrect.
+    #
+    # TODO(unknown): We always want to check end of namespace comments
+    # if a namespace is large, but sometimes we also want to apply the
+    # check if a short namespace contained nontrivial things (something
+    # other than forward declarations).  There is currently no logic on
+    # deciding what these nontrivial things are, so this check is
+    # triggered by namespace size only, which works most of the time.
+    if (linenum - self.starting_linenum < 10
+        and not Match(r'};*\s*(//|/\*).*\bnamespace\b', line)):
+      return
+
+    # Look for matching comment at end of namespace.
+    #
+    # Note that we accept C style "/* */" comments for terminating
+    # namespaces, so that code that terminate namespaces inside
+    # preprocessor macros can be cpplint clean.  Example: http://go/nxpiz
+    #
+    # We also accept stuff like "// end of namespace <name>." with the
+    # period at the end.
+    #
+    # Besides these, we don't accept anything else, otherwise we might
+    # get false negatives when existing comment is a substring of the
+    # expected namespace.  Example: http://go/ldkdc, http://cl/23548205
+    if self.name:
+      # Named namespace
+      if not Match((r'};*\s*(//|/\*).*\bnamespace\s+' + re.escape(self.name) +
+                    r'[\*/\.\\\s]*$'),
+                   line):
+        error(filename, linenum, 'readability/namespace', 5,
+              'Namespace should be terminated with "// namespace %s"' %
+              self.name)
+    else:
+      # Anonymous namespace
+      if not Match(r'};*\s*(//|/\*).*\bnamespace[\*/\.\\\s]*$', line):
+        error(filename, linenum, 'readability/namespace', 5,
+              'Namespace should be terminated with "// namespace"')
+
+
+class _PreprocessorInfo(object):
+  """Stores checkpoints of nesting stacks when #if/#else is seen."""
+
+  def __init__(self, stack_before_if):
+    # The entire nesting stack before #if
+    self.stack_before_if = stack_before_if
+
+    # The entire nesting stack up to #else
+    self.stack_before_else = []
+
+    # Whether we have already seen #else or #elif
+    self.seen_else = False
+
+
+class _NestingState(object):
+  """Holds states related to parsing braces."""
 
   def __init__(self):
-    self.classinfo_stack = []
+    # Stack for tracking all braces.  An object is pushed whenever we
+    # see a "{", and popped when we see a "}".  Only 3 types of
+    # objects are possible:
+    # - _ClassInfo: a class or struct.
+    # - _NamespaceInfo: a namespace.
+    # - _BlockInfo: some other type of block.
+    self.stack = []
 
-  def CheckFinished(self, filename, error):
+    # Stack of _PreprocessorInfo objects.
+    self.pp_stack = []
+
+  def SeenOpenBrace(self):
+    """Check if we have seen the opening brace for the innermost block.
+
+    Returns:
+      True if we have seen the opening brace, False if the innermost
+      block is still expecting an opening brace.
+    """
+    return (not self.stack) or self.stack[-1].seen_open_brace
+
+  def InNamespaceBody(self):
+    """Check if we are currently one level inside a namespace body.
+
+    Returns:
+      True if top of the stack is a namespace block, False otherwise.
+    """
+    return self.stack and isinstance(self.stack[-1], _NamespaceInfo)
+
+  def UpdatePreprocessor(self, line):
+    """Update preprocessor stack.
+
+    We need to handle preprocessors due to classes like this:
+      #ifdef SWIG
+      struct ResultDetailsPageElementExtensionPoint {
+      #else
+      struct ResultDetailsPageElementExtensionPoint : public Extension {
+      #endif
+    (see http://go/qwddn for original example)
+
+    We make the following assumptions (good enough for most files):
+    - Preprocessor condition evaluates to true from #if up to first
+      #else/#elif/#endif.
+
+    - Preprocessor condition evaluates to false from #else/#elif up
+      to #endif.  We still perform lint checks on these lines, but
+      these do not affect nesting stack.
+
+    Args:
+      line: current line to check.
+    """
+    if Match(r'^\s*#\s*(if|ifdef|ifndef)\b', line):
+      # Beginning of #if block, save the nesting stack here.  The saved
+      # stack will allow us to restore the parsing state in the #else case.
+      self.pp_stack.append(_PreprocessorInfo(copy.deepcopy(self.stack)))
+    elif Match(r'^\s*#\s*(else|elif)\b', line):
+      # Beginning of #else block
+      if self.pp_stack:
+        if not self.pp_stack[-1].seen_else:
+          # This is the first #else or #elif block.  Remember the
+          # whole nesting stack up to this point.  This is what we
+          # keep after the #endif.
+          self.pp_stack[-1].seen_else = True
+          self.pp_stack[-1].stack_before_else = copy.deepcopy(self.stack)
+
+        # Restore the stack to how it was before the #if
+        self.stack = copy.deepcopy(self.pp_stack[-1].stack_before_if)
+      else:
+        # TODO(unknown): unexpected #else, issue warning?
+        pass
+    elif Match(r'^\s*#\s*endif\b', line):
+      # End of #if or #else blocks.
+      if self.pp_stack:
+        # If we saw an #else, we will need to restore the nesting
+        # stack to its former state before the #else, otherwise we
+        # will just continue from where we left off.
+        if self.pp_stack[-1].seen_else:
+          # Here we can just use a shallow copy since we are the last
+          # reference to it.
+          self.stack = self.pp_stack[-1].stack_before_else
+        # Drop the corresponding #if
+        self.pp_stack.pop()
+      else:
+        # TODO(unknown): unexpected #endif, issue warning?
+        pass
+
+  def Update(self, filename, clean_lines, linenum, error):
+    """Update nesting state with current line.
+
+    Args:
+      filename: The name of the current file.
+      clean_lines: A CleansedLines instance containing the file.
+      linenum: The number of the line to check.
+      error: The function to call with any errors found.
+    """
+    line = clean_lines.elided[linenum]
+
+    # Update pp_stack first
+    self.UpdatePreprocessor(line)
+
+    # Count parentheses.  This is to avoid adding struct arguments to
+    # the nesting stack.
+    if self.stack:
+      inner_block = self.stack[-1]
+      depth_change = line.count('(') - line.count(')')
+      inner_block.open_parentheses += depth_change
+
+      # Also check if we are starting or ending an inline assembly block.
+      if inner_block.inline_asm in (_NO_ASM, _END_ASM):
+        if (depth_change != 0 and
+            inner_block.open_parentheses == 1 and
+            _MATCH_ASM.match(line)):
+          # Enter assembly block
+          inner_block.inline_asm = _INSIDE_ASM
+        else:
+          # Not entering assembly block.  If previous line was _END_ASM,
+          # we will now shift to _NO_ASM state.
+          inner_block.inline_asm = _NO_ASM
+      elif (inner_block.inline_asm == _INSIDE_ASM and
+            inner_block.open_parentheses == 0):
+        # Exit assembly block
+        inner_block.inline_asm = _END_ASM
+
+    # Consume namespace declaration at the beginning of the line.  Do
+    # this in a loop so that we catch same line declarations like this:
+    #   namespace proto2 { namespace bridge { class MessageSet; } }
+    while True:
+      # Match start of namespace.  The "\b\s*" below catches namespace
+      # declarations even if it weren't followed by a whitespace, this
+      # is so that we don't confuse our namespace checker.  The
+      # missing spaces will be flagged by CheckSpacing.
+      namespace_decl_match = Match(r'^\s*namespace\b\s*([:\w]+)?(.*)$', line)
+      if not namespace_decl_match:
+        break
+
+      new_namespace = _NamespaceInfo(namespace_decl_match.group(1), linenum)
+      self.stack.append(new_namespace)
+
+      line = namespace_decl_match.group(2)
+      if line.find('{') != -1:
+        new_namespace.seen_open_brace = True
+        line = line[line.find('{') + 1:]
+
+    # Look for a class declaration in whatever is left of the line
+    # after parsing namespaces.  The regexp accounts for decorated classes
+    # such as in:
+    #   class LOCKABLE API Object {
+    #   };
+    #
+    # Templates with class arguments may confuse the parser, for example:
+    #   template <class T
+    #             class Comparator = less<T>,
+    #             class Vector = vector<T> >
+    #   class HeapQueue {
+    #
+    # Because this parser has no nesting state about templates, by the
+    # time it saw "class Comparator", it may think that it's a new class.
+    # Nested templates have a similar problem:
+    #   template <
+    #       typename ExportedType,
+    #       typename TupleType,
+    #       template <typename, typename> class ImplTemplate>
+    #
+    # To avoid these cases, we ignore classes that are followed by '=' or '>'
+    class_decl_match = Match(
+        r'\s*(template\s*<[\w\s<>,:]*>\s*)?'
+        '(class|struct)\s+([A-Z_]+\s+)*(\w+(?:::\w+)*)'
+        '(([^=>]|<[^<>]*>)*)$', line)
+    if (class_decl_match and
+        (not self.stack or self.stack[-1].open_parentheses == 0)):
+      self.stack.append(_ClassInfo(
+          class_decl_match.group(4), class_decl_match.group(2),
+          clean_lines, linenum))
+      line = class_decl_match.group(5)
+
+    # If we have not yet seen the opening brace for the innermost block,
+    # run checks here.
+    if not self.SeenOpenBrace():
+      self.stack[-1].CheckBegin(filename, clean_lines, linenum, error)
+
+    # Update access control if we are inside a class/struct
+    if self.stack and isinstance(self.stack[-1], _ClassInfo):
+      access_match = Match(r'\s*(public|private|protected)\s*:', line)
+      if access_match:
+        self.stack[-1].access = access_match.group(1)
+
+    # Consume braces or semicolons from what's left of the line
+    while True:
+      # Match first brace, semicolon, or closed parenthesis.
+      matched = Match(r'^[^{;)}]*([{;)}])(.*)$', line)
+      if not matched:
+        break
+
+      token = matched.group(1)
+      if token == '{':
+        # If namespace or class hasn't seen an opening brace yet, mark
+        # namespace/class head as complete.  Push a new block onto the
+        # stack otherwise.
+        if not self.SeenOpenBrace():
+          self.stack[-1].seen_open_brace = True
+        else:
+          self.stack.append(_BlockInfo(True))
+          if _MATCH_ASM.match(line):
+            self.stack[-1].inline_asm = _BLOCK_ASM
+      elif token == ';' or token == ')':
+        # If we haven't seen an opening brace yet, but we already saw
+        # a semicolon, this is probably a forward declaration.  Pop
+        # the stack for these.
+        #
+        # Similarly, if we haven't seen an opening brace yet, but we
+        # already saw a closing parenthesis, then these are probably
+        # function arguments with extra "class" or "struct" keywords.
+        # Also pop the stack for these.
+        if not self.SeenOpenBrace():
+          self.stack.pop()
+      else:  # token == '}'
+        # Perform end of block checks and pop the stack.
+        if self.stack:
+          self.stack[-1].CheckEnd(filename, clean_lines, linenum, error)
+          self.stack.pop()
+      line = matched.group(2)
+
+  def InnermostClass(self):
+    """Get class info on the top of the stack.
+
+    Returns:
+      A _ClassInfo object if we are inside a class, or None otherwise.
+    """
+    for i in range(len(self.stack), 0, -1):
+      classinfo = self.stack[i - 1]
+      if isinstance(classinfo, _ClassInfo):
+        return classinfo
+    return None
+
+  def CheckClassFinished(self, filename, error):
     """Checks that all classes have been completely parsed.
 
     Call this when all lines in a file have been processed.
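
The _NamespaceInfo.CheckEnd hook added earlier in this hunk enforces the closing "// namespace <name>" comment on namespaces longer than about ten lines (readability/namespace). The accepted shape, illustrated on a made-up namespace:

    namespace media {

    // Imagine ten or more lines of declarations here; very short
    // namespaces are exempt unless they already carry a wrong comment.
    inline int DecodeWidth() { return 640; }
    inline int DecodeHeight() { return 480; }

    }  // namespace media

    namespace {

    inline int Helper() { return 0; }

    }  // namespace

    int main() { return media::DecodeWidth() > media::DecodeHeight() ? Helper() : 1; }
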
@@ -1306,17 +1733,18 @@
       filename: The name of the current file.
       error: The function to call with any errors found.
     """
-    if self.classinfo_stack:
-      # Note: This test can result in false positives if #ifdef constructs
-      # get in the way of brace matching. See the testBuildClass test in
-      # cpplint_unittest.py for an example of this.
-      error(filename, self.classinfo_stack[0].linenum, 'build/class', 5,
-            'Failed to find complete declaration of class %s' %
-            self.classinfo_stack[0].name)
+    # Note: This test can result in false positives if #ifdef constructs
+    # get in the way of brace matching. See the testBuildClass test in
+    # cpplint_unittest.py for an example of this.
+    for obj in self.stack:
+      if isinstance(obj, _ClassInfo):
+        error(filename, obj.starting_linenum, 'build/class', 5,
+              'Failed to find complete declaration of class %s' %
+              obj.name)
 
 
 def CheckForNonStandardConstructs(filename, clean_lines, linenum,
-                                  class_state, error):
+                                  nesting_state, error):
   """Logs an error if we see certain non-ANSI constructs ignored by gcc-2.
 
   Complain about several constructs which gcc-2 accepts, but which are
@@ -1329,8 +1757,6 @@
   - text after #endif is not allowed.
   - invalid inner-style forward declaration.
   - >? and <? operators, and their >?= and <?= cousins.
-  - classes with virtual methods need virtual destructors (compiler warning
-    available, but not turned on yet.)
 
   Additionally, check for constructor/destructor style violations and reference
   members, as it is very convenient to do so while checking for
@@ -1340,8 +1766,8 @@
     filename: The name of the current file.
     clean_lines: A CleansedLines instance containing the file.
     linenum: The number of the line to check.
-    class_state: A _ClassState instance which maintains information about
-                 the current stack of nested class declarations being parsed.
+    nesting_state: A _NestingState instance which maintains information about
+                   the current stack of nested blocks being parsed.
     error: A callable to which errors are reported, which takes 4 arguments:
            filename, line number, error level, and message
   """
@@ -1370,7 +1796,7 @@
   if Search(r'\b(const|volatile|void|char|short|int|long'
             r'|float|double|signed|unsigned'
             r'|schar|u?int8|u?int16|u?int32|u?int64)'
-            r'\s+(auto|register|static|extern|typedef)\b',
+            r'\s+(register|static|extern|typedef)\b',
             line):
     error(filename, linenum, 'build/storage_class', 5,
           'Storage class (static, extern, typedef, etc) should be first.')
@@ -1400,45 +1826,13 @@
           'const string& members are dangerous. It is much better to use '
           'alternatives, such as pointers or simple constants.')
 
-  # Track class entry and exit, and attempt to find cases within the
-  # class declaration that don't meet the C++ style
-  # guidelines. Tracking is very dependent on the code matching Google
-  # style guidelines, but it seems to perform well enough in testing
-  # to be a worthwhile addition to the checks.
-  classinfo_stack = class_state.classinfo_stack
-  # Look for a class declaration. The regexp accounts for decorated classes
-  # such as in:
-  # class LOCKABLE API Object {
-  # };
-  class_decl_match = Match(
-      r'\s*(template\s*<[\w\s<>,:]*>\s*)?'
-      '(class|struct)\s+([A-Z_]+\s+)*(\w+(::\w+)*)', line)
-  if class_decl_match:
-    classinfo_stack.append(_ClassInfo(
-        class_decl_match.group(4), clean_lines, linenum))
-
-  # Everything else in this function uses the top of the stack if it's
-  # not empty.
-  if not classinfo_stack:
+  # Everything else in this function operates on class declarations.
+  # Return early if the top of the nesting stack is not a class, or if
+  # the class head is not completed yet.
+  classinfo = nesting_state.InnermostClass()
+  if not classinfo or not classinfo.seen_open_brace:
     return
 
-  classinfo = classinfo_stack[-1]
-
-  # If the opening brace hasn't been seen look for it and also
-  # parent class declarations.
-  if not classinfo.seen_open_brace:
-    # If the line has a ';' in it, assume it's a forward declaration or
-    # a single-line class declaration, which we won't process.
-    if line.find(';') != -1:
-      classinfo_stack.pop()
-      return
-    classinfo.seen_open_brace = (line.find('{') != -1)
-    # Look for a bare ':'
-    if Search('(^|[^:]):($|[^:])', line):
-      classinfo.is_derived = True
-    if not classinfo.seen_open_brace:
-      return  # Everything else in this function is for after open brace
-
   # The class may have been declared with namespace or classname qualifiers.
   # The constructor and destructor will not have those qualifiers.
   base_classname = classinfo.name.split('::')[-1]
@@ -1455,35 +1849,6 @@
     error(filename, linenum, 'runtime/explicit', 5,
           'Single-argument constructors should be marked explicit.')
 
-  # Look for methods declared virtual.
-  if Search(r'\bvirtual\b', line):
-    classinfo.virtual_method_linenumber = linenum
-    # Only look for a destructor declaration on the same line. It would
-    # be extremely unlikely for the destructor declaration to occupy
-    # more than one line.
-    if Search(r'~%s\s*\(' % base_classname, line):
-      classinfo.has_virtual_destructor = True
-
-  # Look for class end.
-  brace_depth = classinfo.brace_depth
-  brace_depth = brace_depth + line.count('{') - line.count('}')
-  if brace_depth <= 0:
-    classinfo = classinfo_stack.pop()
-    # Try to detect missing virtual destructor declarations.
-    # For now, only warn if a non-derived class with virtual methods lacks
-    # a virtual destructor. This is to make it less likely that people will
-    # declare derived virtual destructors without declaring the base
-    # destructor virtual.
-    if ((classinfo.virtual_method_linenumber is not None) and
-        (not classinfo.has_virtual_destructor) and
-        (not classinfo.is_derived)):  # Only warn for base classes
-      error(filename, classinfo.linenum, 'runtime/virtual', 4,
-            'The class %s probably needs a virtual destructor due to '
-            'having virtual method(s), one declared at line %d.'
-            % (classinfo.name, classinfo.virtual_method_linenumber))
-  else:
-    classinfo.brace_depth = brace_depth
-
 
 def CheckSpacingForFunctionCall(filename, line, linenum, error):
   """Checks for the correctness of various spacing around function calls.
@@ -1535,7 +1900,8 @@
       error(filename, linenum, 'whitespace/parens', 2,
             'Extra space after (')
     if (Search(r'\w\s+\(', fncall) and
-        not Search(r'#\s*define|typedef', fncall)):
+        not Search(r'#\s*define|typedef', fncall) and
+        not Search(r'\w\s+\((\w+::)?\*\w+\)\(', fncall)):
       error(filename, linenum, 'whitespace/parens', 4,
             'Extra space before ( in function call')
     # If the ) is followed only by a newline or a { + newline, assume it's
@@ -1668,8 +2034,165 @@
       error(filename, linenum, 'whitespace/todo', 2,
             'TODO(my_username) should be followed by a space')
 
+def CheckAccess(filename, clean_lines, linenum, nesting_state, error):
+  """Checks for improper use of DISALLOW* macros.
 
-def CheckSpacing(filename, clean_lines, linenum, error):
+  Args:
+    filename: The name of the current file.
+    clean_lines: A CleansedLines instance containing the file.
+    linenum: The number of the line to check.
+    nesting_state: A _NestingState instance which maintains information about
+                   the current stack of nested blocks being parsed.
+    error: The function to call with any errors found.
+  """
+  line = clean_lines.elided[linenum]  # get rid of comments and strings
+
+  matched = Match((r'\s*(DISALLOW_COPY_AND_ASSIGN|'
+                   r'DISALLOW_EVIL_CONSTRUCTORS|'
+                   r'DISALLOW_IMPLICIT_CONSTRUCTORS)'), line)
+  if not matched:
+    return
+  if nesting_state.stack and isinstance(nesting_state.stack[-1], _ClassInfo):
+    if nesting_state.stack[-1].access != 'private':
+      error(filename, linenum, 'readability/constructors', 3,
+            '%s must be in the private: section' % matched.group(1))
+
+  else:
+    # Found DISALLOW* macro outside a class declaration, or perhaps it
+    # was used inside a function when it should have been part of the
+    # class declaration.  We could issue a warning here, but it
+    # probably resulted in a compiler error already.
+    pass
+
+
+def FindNextMatchingAngleBracket(clean_lines, linenum, init_suffix):
+  """Find the corresponding > to close a template.
+
+  Args:
+    clean_lines: A CleansedLines instance containing the file.
+    linenum: Current line number.
+    init_suffix: Remainder of the current line after the initial <.
+
+  Returns:
+    True if a matching bracket exists.
+  """
+  line = init_suffix
+  nesting_stack = ['<']
+  while True:
+    # Find the next operator that can tell us whether < is used as an
+    # opening bracket or as a less-than operator.  We only want to
+    # warn on the latter case.
+    #
+    # We could also check all other operators and terminate the search
+    # early, e.g. if we got something like this "a<b+c", the "<" is
+    # most likely a less-than operator, but then we will get false
+    # positives for default arguments (e.g. http://go/prccd) and
+    # other template expressions (e.g. http://go/oxcjq).
+    match = Search(r'^[^<>(),;\[\]]*([<>(),;\[\]])(.*)$', line)
+    if match:
+      # Found an operator, update nesting stack
+      operator = match.group(1)
+      line = match.group(2)
+
+      if nesting_stack[-1] == '<':
+        # Expecting closing angle bracket
+        if operator in ('<', '(', '['):
+          nesting_stack.append(operator)
+        elif operator == '>':
+          nesting_stack.pop()
+          if not nesting_stack:
+            # Found matching angle bracket
+            return True
+        elif operator == ',':
+          # Got a comma after a bracket, this is most likely a template
+          # argument.  We have not seen a closing angle bracket yet, but
+          # it's probably a few lines later if we look for it, so just
+          # return early here.
+          return True
+        else:
+          # Got some other operator.
+          return False
+
+      else:
+        # Expecting closing parenthesis or closing bracket
+        if operator in ('<', '(', '['):
+          nesting_stack.append(operator)
+        elif operator in (')', ']'):
+          # We don't bother checking for matching () or [].  If we got
+          # something like (] or [), it would have been a syntax error.
+          nesting_stack.pop()
+
+    else:
+      # Scan the next line
+      linenum += 1
+      if linenum >= len(clean_lines.elided):
+        break
+      line = clean_lines.elided[linenum]
+
+  # Exhausted all remaining lines and still no matching angle bracket.
+  # Most likely the input was incomplete, otherwise we should have
+  # seen a semicolon and returned early.
+  return True
+
+
+def FindPreviousMatchingAngleBracket(clean_lines, linenum, init_prefix):
+  """Find the corresponding < that started a template.
+
+  Args:
+    clean_lines: A CleansedLines instance containing the file.
+    linenum: Current line number.
+    init_prefix: Part of the current line before the initial >.
+
+  Returns:
+    True if a matching bracket exists.
+  """
+  line = init_prefix
+  nesting_stack = ['>']
+  while True:
+    # Find the previous operator
+    match = Search(r'^(.*)([<>(),;\[\]])[^<>(),;\[\]]*$', line)
+    if match:
+      # Found an operator, update nesting stack
+      operator = match.group(2)
+      line = match.group(1)
+
+      if nesting_stack[-1] == '>':
+        # Expecting opening angle bracket
+        if operator in ('>', ')', ']'):
+          nesting_stack.append(operator)
+        elif operator == '<':
+          nesting_stack.pop()
+          if not nesting_stack:
+            # Found matching angle bracket
+            return True
+        elif operator == ',':
+          # Got a comma before a bracket, this is most likely a
+          # template argument.  The opening angle bracket is probably
+          # there if we look for it, so just return early here.
+          return True
+        else:
+          # Got some other operator.
+          return False
+
+      else:
+        # Expecting opening parenthesis or opening bracket
+        if operator in ('>', ')', ']'):
+          nesting_stack.append(operator)
+        elif operator in ('(', '['):
+          nesting_stack.pop()
+
+    else:
+      # Scan the previous line
+      linenum -= 1
+      if linenum < 0:
+        break
+      line = clean_lines.elided[linenum]
+
+  # Exhausted all earlier lines and still no matching angle bracket.
+  return False
+
+
+def CheckSpacing(filename, clean_lines, linenum, nesting_state, error):
   """Checks for the correctness of various spacing issues in the code.
 
   Things we check for: spaces around operators, spaces after
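
CheckAccess(), added in this hunk, uses the new per-class access tracking to require that DISALLOW_COPY_AND_ASSIGN-style macros sit in the private: section (readability/constructors). With a hypothetical macro definition shown only to keep the example self-contained, the accepted layout is:

    #define DISALLOW_COPY_AND_ASSIGN(TypeName) \
      TypeName(const TypeName&);               \
      void operator=(const TypeName&)

    class Widget {
     public:
      Widget() {}

     private:
      DISALLOW_COPY_AND_ASSIGN(Widget);  // must live under private:
    };

    int main() { Widget w; (void)w; return 0; }
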
@@ -1682,6 +2205,8 @@
     filename: The name of the current file.
     clean_lines: A CleansedLines instance containing the file.
     linenum: The number of the line to check.
+    nesting_state: A _NestingState instance which maintains information about
+                   the current stack of nested blocks being parsed.
     error: The function to call with any errors found.
   """
 
@@ -1691,7 +2216,16 @@
   # Before nixing comments, check if the line is blank for no good
   # reason.  This includes the first line after a block is opened, and
   # blank lines at the end of a function (ie, right before a line like '}'
-  if IsBlankLine(line):
+  #
+  # Skip all the blank line checks if we are immediately inside a
+  # namespace body.  In other words, don't issue blank line warnings
+  # for this block:
+  #   namespace {
+  #
+  #   }
+  #
+  # A warning about missing end of namespace comments will be issued instead.
+  if IsBlankLine(line) and not nesting_state.InNamespaceBody():
     elided = clean_lines.elided
     prev_line = elided[linenum - 1]
     prevbrace = prev_line.rfind('{')
@@ -1699,8 +2233,7 @@
     #                both start with alnums and are indented the same amount.
     #                This ignores whitespace at the start of a namespace block
     #                because those are not usually indented.
-    if (prevbrace != -1 and prev_line[prevbrace:].find('}') == -1
-        and prev_line[:prevbrace].find('namespace') == -1):
+    if prevbrace != -1 and prev_line[prevbrace:].find('}') == -1:
       # OK, we have a blank line at the start of a code block.  Before we
       # complain, we check if it is an exception to the rule: The previous
       # non-empty line has the parameters of a function header that are indented
@@ -1732,12 +2265,7 @@
       if not exception:
         error(filename, linenum, 'whitespace/blank_line', 2,
               'Blank line at the start of a code block.  Is this needed?')
-    # This doesn't ignore whitespace at the end of a namespace block
-    # because that is too hard without pairing open/close braces;
-    # however, a special exception is made for namespace closing
-    # brackets which have a comment containing "namespace".
-    #
-    # Also, ignore blank lines at the end of a block in a long if-else
+    # Ignore blank lines at the end of a block in a long if-else
     # chain, like this:
     #   if (condition1) {
     #     // Something followed by a blank line
@@ -1749,7 +2277,6 @@
       next_line = raw[linenum + 1]
       if (next_line
           and Match(r'\s*}', next_line)
-          and next_line.find('namespace') == -1
           and next_line.find('} else ') == -1):
         error(filename, linenum, 'whitespace/blank_line', 3,
               'Blank line at the end of a code block.  Is this needed?')
@@ -1810,26 +2337,59 @@
   # though, so we punt on this one for now.  TODO.
 
   # You should always have whitespace around binary operators.
-  # Alas, we can't test < or > because they're legitimately used sans spaces
-  # (a->b, vector<int> a).  The only time we can tell is a < with no >, and
-  # only if it's not template params list spilling into the next line.
+  #
+  # Check <= and >= first to avoid false positives with < and >, then
+  # check non-include lines for spacing around < and >.
   match = Search(r'[^<>=!\s](==|!=|<=|>=)[^<>=!\s]', line)
-  if not match:
-    # Note that while it seems that the '<[^<]*' term in the following
-    # regexp could be simplified to '<.*', which would indeed match
-    # the same class of strings, the [^<] means that searching for the
-    # regexp takes linear rather than quadratic time.
-    if not Search(r'<[^<]*,\s*$', line):  # template params spill
-      match = Search(r'[^<>=!\s](<)[^<>=!\s]([^>]|->)*$', line)
   if match:
     error(filename, linenum, 'whitespace/operators', 3,
           'Missing spaces around %s' % match.group(1))
-  # We allow no-spaces around << and >> when used like this: 10<<20, but
+  # We allow no-spaces around << when used like this: 10<<20, but
   # not otherwise (particularly, not when used as streams)
-  match = Search(r'[^0-9\s](<<|>>)[^0-9\s]', line)
+  match = Search(r'(\S)(?:L|UL|ULL|l|ul|ull)?<<(\S)', line)
+  if match and not (match.group(1).isdigit() and match.group(2).isdigit()):
+    error(filename, linenum, 'whitespace/operators', 3,
+          'Missing spaces around <<')
+  elif not Match(r'#.*include', line):
+    # Avoid false positives on ->
+    reduced_line = line.replace('->', '')
+
+    # Look for < that is not surrounded by spaces.  This is only
+    # triggered if both sides are missing spaces, even though
+    # technically we should flag if at least one side is missing a
+    # space.  This is done to avoid some false positives with shifts.
+    match = Search(r'[^\s<]<([^\s=<].*)', reduced_line)
+    if (match and
+        not FindNextMatchingAngleBracket(clean_lines, linenum, match.group(1))):
+      error(filename, linenum, 'whitespace/operators', 3,
+            'Missing spaces around <')
+
+    # Look for > that is not surrounded by spaces.  Similar to the
+    # above, we only trigger if both sides are missing spaces to avoid
+    # false positives with shifts.
+    match = Search(r'^(.*[^\s>])>[^\s=>]', reduced_line)
+    if (match and
+        not FindPreviousMatchingAngleBracket(clean_lines, linenum,
+                                             match.group(1))):
+      error(filename, linenum, 'whitespace/operators', 3,
+            'Missing spaces around >')
+
+  # We allow no-spaces around >> for almost anything.  This is because
+  # C++11 allows ">>" to close nested templates, which accounts for
+  # most cases when ">>" is not followed by a space.
+  #
+  # We still warn when ">>" is followed by an alphabetic character, because
+  # that is likely a right shift, e.g.:
+  #   value >> alpha
+  #
+  # When ">>" closes nested templates, the alphanumeric character that
+  # follows would be part of an identifier, and there should still be
+  # a space separating the template type from the identifier:
+  #   type<type<type>> alpha
+  match = Search(r'>>[a-zA-Z_]', line)
   if match:
     error(filename, linenum, 'whitespace/operators', 3,
-          'Missing spaces around %s' % match.group(1))
+          'Missing spaces around >>')
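
Purely for illustration (not part of the patch): a single-line Python approximation of the heuristic described in the comments above. Treating "a '>' appears later on the same line" as evidence of template brackets is an assumed stand-in for FindNextMatchingAngleBracket, which really scans subsequent lines as well.

import re

def missing_space_candidates(line):
    # Toy, single-line approximation of the operator-spacing heuristic above.
    hits = []
    # ==, !=, <= and >= with no space on either side.
    if re.search(r'[^<>=!\s](==|!=|<=|>=)[^<>=!\s]', line):
        hits.append('comparison')
    # '<<' needs spaces unless both operands are bare digits (10<<20).
    m = re.search(r'(\S)(?:L|UL|ULL|l|ul|ull)?<<(\S)', line)
    if m and not (m.group(1).isdigit() and m.group(2).isdigit()):
        hits.append('<<')
    elif not re.match(r'#.*include', line):
        reduced = line.replace('->', '')  # avoid false positives on ->
        # Flag '<' only when both sides lack spaces and no '>' follows on the
        # same line (a crude stand-in for FindNextMatchingAngleBracket).
        m = re.search(r'[^\s<]<([^\s=<].*)', reduced)
        if m and '>' not in m.group(1):
            hits.append('<')
    # '>>' may close nested C++11 templates; only '>>' followed by an
    # identifier character looks like an unspaced right shift.
    if re.search(r'>>[a-zA-Z_]', line):
        hits.append('>>')
    return hits

assert missing_space_candidates('if (a<b) {') == ['<']
assert missing_space_candidates('std::map<int, std::vector<int>> m;') == []
assert missing_space_candidates('x = value>>shift;') == ['>>']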
 
   # There shouldn't be space around unary operators
   match = Search(r'(!\s|~\s|[\s]--[\s;]|[\s]\+\+[\s;])', line)
@@ -1903,16 +2463,23 @@
   # the semicolon there.
   if Search(r':\s*;\s*$', line):
     error(filename, linenum, 'whitespace/semicolon', 5,
-          'Semicolon defining empty statement. Use { } instead.')
+          'Semicolon defining empty statement. Use {} instead.')
   elif Search(r'^\s*;\s*$', line):
     error(filename, linenum, 'whitespace/semicolon', 5,
           'Line contains only semicolon. If this should be an empty statement, '
-          'use { } instead.')
+          'use {} instead.')
   elif (Search(r'\s+;\s*$', line) and
         not Search(r'\bfor\b', line)):
     error(filename, linenum, 'whitespace/semicolon', 5,
           'Extra space before last semicolon. If this should be an empty '
-          'statement, use { } instead.')
+          'statement, use {} instead.')
+
+  # In a range-based for loop, we want spaces before and after the colon, but
+  # not around "::" tokens that might appear.
+  if (Search('for *\(.*[^:]:[^: ]', line) or
+      Search('for *\(.*[^: ]:[^:]', line)):
+    error(filename, linenum, 'whitespace/forcolon', 2,
+          'Missing space around colon in range-based for loop')
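
For illustration only, a small sketch of how the two range-based-for patterns above behave on sample inputs; the behaviour shown is inferred from the regexes themselves, not taken from cpplint's test suite.

import re

def range_for_colon_ok(line):
    # Mirrors the pair of patterns above: flag a ':' in a for header that
    # lacks a space on either side, while leaving '::' scope tokens alone.
    bad = (re.search(r'for *\(.*[^:]:[^: ]', line) or
           re.search(r'for *\(.*[^: ]:[^:]', line))
    return not bad

assert range_for_colon_ok('for (const auto& x : values) {')
assert not range_for_colon_ok('for (const auto& x: values) {')
assert not range_for_colon_ok('for (const auto& x :values) {')
assert range_for_colon_ok('for (auto x : ns::values) {')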
 
 
 def CheckSectionSpacing(filename, clean_lines, class_info, linenum, error):
@@ -1938,8 +2505,8 @@
   #
   # If we didn't find the end of the class, last_line would be zero,
   # and the check will be skipped by the first condition.
-  if (class_info.last_line - class_info.linenum <= 24 or
-      linenum <= class_info.linenum):
+  if (class_info.last_line - class_info.starting_linenum <= 24 or
+      linenum <= class_info.starting_linenum):
     return
 
   matched = Match(r'\s*(public|protected|private):', clean_lines.lines[linenum])
@@ -1950,15 +2517,18 @@
     #  - We are at the beginning of the class.
     #  - We are forward-declaring an inner class that is semantically
     #    private, but needed to be public for implementation reasons.
+    # Also ignores cases where the previous line ends with a backslash, as is
+    # common when defining classes in C macros.
     prev_line = clean_lines.lines[linenum - 1]
     if (not IsBlankLine(prev_line) and
-        not Search(r'\b(class|struct)\b', prev_line)):
+        not Search(r'\b(class|struct)\b', prev_line) and
+        not Search(r'\\$', prev_line)):
       # Try a bit harder to find the beginning of the class.  This is to
       # account for multi-line base-specifier lists, e.g.:
       #   class Derived
       #       : public Base {
-      end_class_head = class_info.linenum
-      for i in range(class_info.linenum, linenum):
+      end_class_head = class_info.starting_linenum
+      for i in range(class_info.starting_linenum, linenum):
         if Search(r'\{\s*$', clean_lines.lines[i]):
           end_class_head = i
           break
@@ -2008,9 +2578,11 @@
     # which is commonly used to control the lifetime of
     # stack-allocated variables.  We don't detect this perfectly: we
     # just don't complain if the last non-whitespace character on the
-    # previous non-blank line is ';', ':', '{', or '}'.
+    # previous non-blank line is ';', ':', '{', or '}', or if the previous
+    # line starts a preprocessor block.
     prevline = GetPreviousNonBlankLine(clean_lines, linenum)[0]
-    if not Search(r'[;:}{]\s*$', prevline):
+    if (not Search(r'[;:}{]\s*$', prevline) and
+        not Match(r'\s*#', prevline)):
       error(filename, linenum, 'whitespace/braces', 4,
             '{ should almost always be at the end of the previous line')
 
@@ -2064,6 +2636,33 @@
           "You don't need a ; after a }")
 
 
+def CheckEmptyLoopBody(filename, clean_lines, linenum, error):
+  """Loop for empty loop body with only a single semicolon.
+
+  Args:
+    filename: The name of the current file.
+    clean_lines: A CleansedLines instance containing the file.
+    linenum: The number of the line to check.
+    error: The function to call with any errors found.
+  """
+
+  # Search for loop keywords at the beginning of the line.  Because only
+  # whitespace is allowed before the keywords, this will also ignore most
+  # do-while loops, since those lines should start with a closing brace.
+  line = clean_lines.elided[linenum]
+  if Match(r'\s*(for|while)\s*\(', line):
+    # Find the end of the conditional expression
+    (end_line, end_linenum, end_pos) = CloseExpression(
+        clean_lines, linenum, line.find('('))
+
+    # Output warning if what follows the condition expression is a semicolon.
+    # No warning for all other cases, including whitespace or newline, since we
+    # have a separate check for semicolons preceded by whitespace.
+    if end_pos >= 0 and Match(r';', end_line[end_pos:]):
+      error(filename, end_linenum, 'whitespace/empty_loop_body', 5,
+            'Empty loop bodies should use {} or continue')
+
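
A hedged, single-line-only sketch of what CheckEmptyLoopBody amounts to; the real CloseExpression can cross line boundaries, so this is a simplification rather than the actual implementation.

import re

def has_empty_loop_body(line):
    # Find a 'for'/'while' at the start of the line, walk to the ')' that
    # closes its condition, and report if the very next token is a ';'
    # (i.e. the loop body is just an empty statement).
    if not re.match(r'\s*(for|while)\s*\(', line):
        return False
    depth = 0
    for pos, ch in enumerate(line):
        if ch == '(':
            depth += 1
        elif ch == ')':
            depth -= 1
            if depth == 0:
                return bool(re.match(r';', line[pos + 1:]))
    return False  # condition continues on the next line; not handled here

assert has_empty_loop_body('while (KeepSpinning());')
assert not has_empty_loop_body('while (KeepSpinning()) ;')  # separate check
assert not has_empty_loop_body('for (i = 0; i < n; ++i) {')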
+
 def ReplaceableCheck(operator, macro, line):
   """Determine whether a basic CHECK can be replaced with a more specific one.
 
@@ -2132,6 +2731,38 @@
       break
 
 
+def CheckAltTokens(filename, clean_lines, linenum, error):
+  """Check alternative keywords being used in boolean expressions.
+
+  Args:
+    filename: The name of the current file.
+    clean_lines: A CleansedLines instance containing the file.
+    linenum: The number of the line to check.
+    error: The function to call with any errors found.
+  """
+  line = clean_lines.elided[linenum]
+
+  # Avoid preprocessor lines
+  if Match(r'^\s*#', line):
+    return
+
+  # Last ditch effort to avoid multi-line comments.  This will not help
+  # if the comment started before the current line or ended after the
+  # current line, but it catches most of the false positives.  At least,
+  # it provides a way to work around this warning for people who use
+  # multi-line comments in preprocessor macros.
+  #
+  # TODO(unknown): remove this once cpplint has better support for
+  # multi-line comments.
+  if line.find('/*') >= 0 or line.find('*/') >= 0:
+    return
+
+  for match in _ALT_TOKEN_REPLACEMENT_PATTERN.finditer(line):
+    error(filename, linenum, 'readability/alt_tokens', 2,
+          'Use operator %s instead of %s' % (
+              _ALT_TOKEN_REPLACEMENT[match.group(1)], match.group(1)))
+
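
_ALT_TOKEN_REPLACEMENT and its pattern are defined outside this hunk, so the sketch below uses an assumed, cut-down table and a plain word-boundary regex just to show the shape of the warning; the real table covers more tokens (bitand, bitor, xor, compl, and so on).

import re

# Assumed subset of the alternative-token table; not the full list.
ALT_TOKENS = {'and': '&&', 'or': '||', 'not': '!', 'not_eq': '!='}

# Word-boundary pattern so identifiers such as 'android' are not matched.
ALT_TOKEN_PATTERN = re.compile(
    r'\b(' + '|'.join(map(re.escape, ALT_TOKENS)) + r')\b')

def alt_token_warnings(line):
    return ['Use operator %s instead of %s' % (ALT_TOKENS[m.group(1)],
                                               m.group(1))
            for m in ALT_TOKEN_PATTERN.finditer(line)]

assert alt_token_warnings('if (a and not b) {') == [
    'Use operator && instead of and', 'Use operator ! instead of not']
assert alt_token_warnings('int android_api = 19;') == []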
+
 def GetLineWidth(line):
   """Determines the width of the line in column positions.
 
@@ -2154,7 +2785,7 @@
     return len(line)
 
 
-def CheckStyle(filename, clean_lines, linenum, file_extension, class_state,
+def CheckStyle(filename, clean_lines, linenum, file_extension, nesting_state,
                error):
   """Checks rules from the 'C++ style rules' section of cppguide.html.
 
@@ -2167,6 +2798,8 @@
     clean_lines: A CleansedLines instance containing the file.
     linenum: The number of the line to check.
     file_extension: The extension (without the dot) of the filename.
+    nesting_state: A _NestingState instance which maintains information about
+                   the current stack of nested blocks being parsed.
     error: The function to call with any errors found.
   """
 
@@ -2248,16 +2881,19 @@
       not ((cleansed_line.find('case ') != -1 or
             cleansed_line.find('default:') != -1) and
            cleansed_line.find('break;') != -1)):
-    error(filename, linenum, 'whitespace/newline', 4,
+    error(filename, linenum, 'whitespace/newline', 0,
           'More than one command on the same line')
 
   # Some more style checks
   CheckBraces(filename, clean_lines, linenum, error)
-  CheckSpacing(filename, clean_lines, linenum, error)
+  CheckEmptyLoopBody(filename, clean_lines, linenum, error)
+  CheckAccess(filename, clean_lines, linenum, nesting_state, error)
+  CheckSpacing(filename, clean_lines, linenum, nesting_state, error)
   CheckCheck(filename, clean_lines, linenum, error)
-  if class_state and class_state.classinfo_stack:
-    CheckSectionSpacing(filename, clean_lines,
-                        class_state.classinfo_stack[-1], linenum, error)
+  CheckAltTokens(filename, clean_lines, linenum, error)
+  classinfo = nesting_state.InnermostClass()
+  if classinfo:
+    CheckSectionSpacing(filename, clean_lines, classinfo, linenum, error)
 
 
 _RE_PATTERN_INCLUDE_NEW_STYLE = re.compile(r'#include +"[^/]+\.h"')
@@ -2554,9 +3190,11 @@
                      fnline))):
 
     # We allow non-const references in a few standard places, like functions
-    # called "swap()" or iostream operators like "<<" or ">>".
+    # called "swap()" or iostream operators like "<<" or ">>". We also filter
+    # out for loops, which lint otherwise mistakenly thinks are functions.
     if not Search(
-        r'(swap|Swap|operator[<>][<>])\s*\(\s*(?:[\w:]|<.*>)+\s*&',
+        r'(for|swap|Swap|operator[<>][<>])\s*\(\s*'
+        r'(?:(?:typename\s*)?[\w:]|<.*>)+\s*&',
         fnline):
       error(filename, linenum, 'runtime/references', 2,
             'Is this a non-const reference? '
@@ -2578,10 +3216,19 @@
     if (match.group(1) is None and  # If new operator, then this isn't a cast
         not (Match(r'^\s*MOCK_(CONST_)?METHOD\d+(_T)?\(', line) or
              Match(r'^\s*MockCallback<.*>', line))):
-      error(filename, linenum, 'readability/casting', 4,
-            'Using deprecated casting style.  '
-            'Use static_cast<%s>(...) instead' %
-            match.group(2))
+      # Try a bit harder to catch gmock lines: the only place where
+      # something looks like an old-style cast is where we declare the
+      # return type of the mocked method, and the only time when we
+      # are missing context is if MOCK_METHOD was split across
+      # multiple lines (for example http://go/hrfhr ), so we only need
+      # to check the previous line for MOCK_METHOD.
+      if (linenum == 0 or
+          not Match(r'^\s*MOCK_(CONST_)?METHOD\d+(_T)?\(\S+,\s*$',
+                    clean_lines.elided[linenum - 1])):
+        error(filename, linenum, 'readability/casting', 4,
+              'Using deprecated casting style.  '
+              'Use static_cast<%s>(...) instead' %
+              match.group(2))
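
For context only: the gmock false positive being suppressed here is the wrapped return type of a MOCK_METHOD declaration. A tiny sketch of the previous-line test; the example MOCK_METHOD line is an assumption for illustration, not something taken from this patch.

import re

MOCK_PREV = re.compile(r'^\s*MOCK_(CONST_)?METHOD\d+(_T)?\(\S+,\s*$')

def looks_like_wrapped_mock(prev_line):
    # True when the previous line is a MOCK_METHOD split before its return
    # type, e.g. 'MOCK_METHOD1(Poll,' followed by 'int(int fd));', in which
    # case '(int' on the current line is not an old-style cast.
    return bool(MOCK_PREV.match(prev_line))

assert looks_like_wrapped_mock('  MOCK_METHOD1(Poll,')
assert not looks_like_wrapped_mock('  int x = (int)f;')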
 
   CheckCStyleCast(filename, linenum, line, clean_lines.raw_lines[linenum],
                   'static_cast',
@@ -2703,7 +3350,7 @@
   printf_args = _GetTextInside(line, r'(?i)\b(string)?printf\s*\(')
   if printf_args:
     match = Match(r'([\w.\->()]+)$', printf_args)
-    if match:
+    if match and match.group(1) != '__VA_ARGS__':
       function_name = re.search(r'\b((?:string)?printf)\s*\(',
                                 line, re.I).group(1)
       error(filename, linenum, 'runtime/printf', 4,
@@ -2824,6 +3471,11 @@
           'Using sizeof(type).  Use sizeof(varname) instead if possible')
     return True
 
+  # operator++(int) and operator--(int)
+  if (line[0:match.start(1) - 1].endswith(' operator++') or
+      line[0:match.start(1) - 1].endswith(' operator--')):
+    return False
+
   remainder = line[match.end(0):]
 
   # The close paren is for function pointers as arguments to a function.
@@ -3112,13 +3764,13 @@
   if match:
     error(filename, linenum, 'build/explicit_make_pair',
           4,  # 4 = high confidence
-          'Omit template arguments from make_pair OR use pair directly OR'
-          ' if appropriate, construct a pair directly')
+          'For C++11-compatibility, omit template arguments from make_pair'
+          ' OR use pair directly OR if appropriate, construct a pair directly')
 
 
-def ProcessLine(filename, file_extension,
-                clean_lines, line, include_state, function_state,
-                class_state, error, extra_check_functions=[]):
+def ProcessLine(filename, file_extension, clean_lines, line,
+                include_state, function_state, nesting_state, error,
+                extra_check_functions=[]):
   """Processes a single line in the file.
 
   Args:
@@ -3129,8 +3781,8 @@
     line: Number of line being processed.
     include_state: An _IncludeState instance in which the headers are inserted.
     function_state: A _FunctionState instance which counts function lines, etc.
-    class_state: A _ClassState instance which maintains information about
-                 the current stack of nested class declarations being parsed.
+    nesting_state: A _NestingState instance which maintains information about
+                   the current stack of nested blocks being parsed.
     error: A callable to which errors are reported, which takes 4 arguments:
            filename, line number, error level, and message
     extra_check_functions: An array of additional check functions that will be
@@ -3139,13 +3791,16 @@
   """
   raw_lines = clean_lines.raw_lines
   ParseNolintSuppressions(filename, raw_lines[line], line, error)
+  nesting_state.Update(filename, clean_lines, line, error)
+  if nesting_state.stack and nesting_state.stack[-1].inline_asm != _NO_ASM:
+    return
   CheckForFunctionLengths(filename, clean_lines, line, function_state, error)
   CheckForMultilineCommentsAndStrings(filename, clean_lines, line, error)
-  CheckStyle(filename, clean_lines, line, file_extension, class_state, error)
+  CheckStyle(filename, clean_lines, line, file_extension, nesting_state, error)
   CheckLanguage(filename, clean_lines, line, file_extension, include_state,
                 error)
   CheckForNonStandardConstructs(filename, clean_lines, line,
-                                class_state, error)
+                                nesting_state, error)
   CheckPosixThreading(filename, clean_lines, line, error)
   CheckInvalidIncrement(filename, clean_lines, line, error)
   CheckMakePairUsesDeduction(filename, clean_lines, line, error)
@@ -3172,7 +3827,7 @@
 
   include_state = _IncludeState()
   function_state = _FunctionState()
-  class_state = _ClassState()
+  nesting_state = _NestingState()
 
   ResetNolintSuppressions()
 
@@ -3185,9 +3840,9 @@
   clean_lines = CleansedLines(lines)
   for line in xrange(clean_lines.NumLines()):
     ProcessLine(filename, file_extension, clean_lines, line,
-                include_state, function_state, class_state, error,
+                include_state, function_state, nesting_state, error,
                 extra_check_functions)
-  class_state.CheckFinished(filename, error)
+  nesting_state.CheckClassFinished(filename, error)
 
   CheckForIncludeWhatYouUse(filename, clean_lines, include_state, error)
 
@@ -3301,7 +3956,8 @@
   try:
     (opts, filenames) = getopt.getopt(args, '', ['help', 'output=', 'verbose=',
                                                  'counting=',
-                                                 'filter='])
+                                                 'filter=',
+                                                 'root='])
   except getopt.GetoptError:
     PrintUsage('Invalid arguments.')
 
@@ -3327,6 +3983,9 @@
       if val not in ('total', 'toplevel', 'detailed'):
         PrintUsage('Valid counting options are total, toplevel, and detailed')
       counting_style = val
+    elif opt == '--root':
+      global _root
+      _root = val
 
   if not filenames:
     PrintUsage('No files were specified.')
diff --git a/vp8/vp8cx.mk b/vp8/vp8cx.mk
index 7d1904a..cd091f3 100644
--- a/vp8/vp8cx.mk
+++ b/vp8/vp8cx.mk
@@ -91,18 +91,8 @@
 VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/fwalsh_sse2.asm
 VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/quantize_sse2.c
 
-# TODO(johann) make this generic
-ifeq ($(HAVE_SSE2),yes)
-vp8/encoder/x86/quantize_sse2.c.o: CFLAGS += -msse2
-vp8/encoder/x86/quantize_sse2.c.d: CFLAGS += -msse2
-endif
-
 ifeq ($(CONFIG_TEMPORAL_DENOISING),yes)
 VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/denoising_sse2.c
-ifeq ($(HAVE_SSE2),yes)
-vp8/encoder/x86/denoising_sse2.c.o: CFLAGS += -msse2
-vp8/encoder/x86/denoising_sse2.c.d: CFLAGS += -msse2
-endif
 endif
 
 VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/subtract_sse2.asm
diff --git a/vp9/common/vp9_blockd.h b/vp9/common/vp9_blockd.h
index 7d323eed..97bb33e 100644
--- a/vp9/common/vp9_blockd.h
+++ b/vp9/common/vp9_blockd.h
@@ -83,9 +83,6 @@
   D27_PRED,           /* Directional 22 deg prediction  [anti-clockwise from 0 deg hor] */
   D63_PRED,           /* Directional 67 deg prediction  [anti-clockwise from 0 deg hor] */
   TM_PRED,            /* Truemotion prediction */
-#if !CONFIG_SB8X8
-  I8X8_PRED,          /* 8x8 based prediction, each 8x8 has its own mode */
-#endif
   I4X4_PRED,          /* 4x4 based prediction, each 4x4 has its own mode */
   NEARESTMV,
   NEARMV,
@@ -128,9 +125,6 @@
 
 #define VP9_YMODES  (I4X4_PRED + 1)
 #define VP9_UV_MODES (TM_PRED + 1)
-#if !CONFIG_SB8X8
-#define VP9_I8X8_MODES (TM_PRED + 1)
-#endif
 #define VP9_I32X32_MODES (TM_PRED + 1)
 
 #define VP9_MVREFS (1 + SPLITMV - NEARESTMV)
@@ -173,16 +167,6 @@
 #define VP9_NKF_BINTRAMODES (VP9_BINTRAMODES)  /* 10 */
 #endif
 
-#if !CONFIG_SB8X8
-typedef enum {
-  PARTITIONING_16X8 = 0,
-  PARTITIONING_8X16,
-  PARTITIONING_8X8,
-  PARTITIONING_4X4,
-  NB_PARTITIONINGS,
-} SPLITMV_PARTITIONING_TYPE;
-#endif
-
 /* For keyframes, intra block modes are predicted by the (already decoded)
    modes for the Y blocks to the left and above us; for interframes, there
    is a single probability table. */
@@ -209,11 +193,9 @@
 static INLINE int b_width_log2(BLOCK_SIZE_TYPE sb_type) {
   switch (sb_type) {
     case BLOCK_SIZE_AB4X4: return 0;
-#if CONFIG_SB8X8
     case BLOCK_SIZE_SB8X8:
     case BLOCK_SIZE_SB8X16: return 1;
     case BLOCK_SIZE_SB16X8:
-#endif
     case BLOCK_SIZE_MB16X16:
     case BLOCK_SIZE_SB16X32: return 2;
     case BLOCK_SIZE_SB32X16:
@@ -228,11 +210,9 @@
 static INLINE int b_height_log2(BLOCK_SIZE_TYPE sb_type) {
   switch (sb_type) {
     case BLOCK_SIZE_AB4X4: return 0;
-#if CONFIG_SB8X8
     case BLOCK_SIZE_SB8X8:
     case BLOCK_SIZE_SB16X8: return 1;
     case BLOCK_SIZE_SB8X16:
-#endif
     case BLOCK_SIZE_MB16X16:
     case BLOCK_SIZE_SB32X16: return 2;
     case BLOCK_SIZE_SB16X32:
@@ -245,21 +225,13 @@
 }
 
 static INLINE int mi_width_log2(BLOCK_SIZE_TYPE sb_type) {
-#if CONFIG_SB8X8
   int a = b_width_log2(sb_type) - 1;
-#else
-  int a = b_width_log2(sb_type) - 2;
-#endif
   assert(a >= 0);
   return a;
 }
 
 static INLINE int mi_height_log2(BLOCK_SIZE_TYPE sb_type) {
-#if CONFIG_SB8X8
   int a = b_height_log2(sb_type) - 1;
-#else
-  int a = b_height_log2(sb_type) - 2;
-#endif
   assert(a >= 0);
   return a;
 }
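
Illustrative sketch only (block-size widths assumed to mirror b_width_log2 above, everything else invented for the example): with the 8x8 mode-info unit now unconditional, mi_width_log2 is simply b_width_log2 minus one, i.e. widths counted in 8x8 units instead of 4x4 units.

# Width of each block size in 4x4 units, log2 (assumed values).
B_WIDTH_LOG2 = {
    'SB8X8': 1, 'SB8X16': 1, 'SB16X8': 2, 'MB16X16': 2,
    'SB16X32': 2, 'SB32X16': 3, 'SB32X32': 3, 'SB64X64': 4,
}

def mi_width_log2(sb_type):
    # One 8x8 mode-info unit covers two 4x4 blocks horizontally,
    # so the mode-info width is half the 4x4-block width.
    a = B_WIDTH_LOG2[sb_type] - 1
    assert a >= 0
    return a

assert mi_width_log2('SB8X8') == 0    # one MI wide
assert mi_width_log2('SB64X64') == 3  # eight MI wide
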
@@ -277,9 +249,6 @@
 
   int mb_mode_context[MAX_REF_FRAMES];
 
-#if !CONFIG_SB8X8
-  SPLITMV_PARTITIONING_TYPE partitioning;
-#endif
   unsigned char mb_skip_coeff;                                /* does this mb have coefficients at all, 1=no coefficients, 0=need decode tokens */
   unsigned char need_to_clamp_mvs;
   unsigned char need_to_clamp_secondmv;
@@ -301,7 +270,7 @@
 
 typedef struct {
   MB_MODE_INFO mbmi;
-  union b_mode_info bmi[16 >> (CONFIG_SB8X8 * 2)];
+  union b_mode_info bmi[4];
 } MODE_INFO;
 
 struct scale_factors {
@@ -443,9 +412,7 @@
 
   int sb_index;   // index of 32x32 block inside the 64x64 block
   int mb_index;   // index of 16x16 block inside the 32x32 block
-#if CONFIG_SB8X8
   int b_index;    // index of 8x8 block inside the 16x16 block
-#endif
   int q_index;
 
 } MACROBLOCKD;
@@ -462,11 +429,7 @@
   if (bsl == 0)
     return;
 
-#if CONFIG_SB8X8
   bs = 1 << (bsl - 1);
-#else
-  bs = 1 << bsl;
-#endif
 
   // update the partition context at the end notes. set partition bits
   // of block sizes larger than the current one to be one, and partition
@@ -502,21 +465,13 @@
   int above = 0, left = 0, i;
   int boffset = mi_width_log2(BLOCK_SIZE_SB64X64) - bsl;
 
-#if CONFIG_SB8X8
   bs = 1 << (bsl - 1);
-#else
-  bs = 1 << bsl;
-#endif
 
   assert(mi_width_log2(sb_type) == mi_height_log2(sb_type));
   assert(bsl >= 0);
   assert(boffset >= 0);
 
-#if CONFIG_SB8X8
   bs = 1 << (bsl - 1);
-#else
-  bs = 1 << bsl;
-#endif
 
   for (i = 0; i < bs; i++)
     above |= (xd->above_seg_context[i] & (1 << boffset));
@@ -541,10 +496,8 @@
         subsize = BLOCK_SIZE_SB64X32;
       else if (bsize == BLOCK_SIZE_SB32X32)
         subsize = BLOCK_SIZE_SB32X16;
-#if CONFIG_SB8X8
       else if (bsize == BLOCK_SIZE_MB16X16)
         subsize = BLOCK_SIZE_SB16X8;
-#endif
       else
         assert(0);
       break;
@@ -553,10 +506,8 @@
         subsize = BLOCK_SIZE_SB32X64;
       else if (bsize == BLOCK_SIZE_SB32X32)
         subsize = BLOCK_SIZE_SB16X32;
-#if CONFIG_SB8X8
       else if (bsize == BLOCK_SIZE_MB16X16)
         subsize = BLOCK_SIZE_SB8X16;
-#endif
       else
         assert(0);
       break;
@@ -565,10 +516,8 @@
         subsize = BLOCK_SIZE_SB32X32;
       else if (bsize == BLOCK_SIZE_SB32X32)
         subsize = BLOCK_SIZE_MB16X16;
-#if CONFIG_SB8X8
       else if (bsize == BLOCK_SIZE_MB16X16)
         subsize = BLOCK_SIZE_SB8X8;
-#endif
       else
         assert(0);
       break;
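
A compact restatement of the subsize mapping after this change; the partition-type labels are inferred from the shapes, since the case labels themselves fall outside these hunks.

# (assumed) square size -> subsize for each partition type, 16x16 and up
SUBSIZE = {
    'HORZ':  {'64X64': '64X32', '32X32': '32X16', '16X16': '16X8'},
    'VERT':  {'64X64': '32X64', '32X32': '16X32', '16X16': '8X16'},
    'SPLIT': {'64X64': '32X32', '32X32': '16X16', '16X16': '8X8'},
}

def get_subsize(partition, bsize):
    return SUBSIZE[partition][bsize]

assert get_subsize('SPLIT', '16X16') == '8X8'  # newly reachable with 8x8 MI
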
@@ -659,42 +608,6 @@
           xd->mode_info_context->bmi[ib].as_mode.context :
 #endif
         xd->mode_info_context->bmi[ib].as_mode.first);
-#if !CONFIG_SB8X8
-  } else if (xd->mode_info_context->mbmi.mode == I8X8_PRED &&
-             xd->q_index < ACTIVE_HT) {
-    const int ic = (ib & 10);
-#if USE_ADST_FOR_I8X8_4X4
-#if USE_ADST_PERIPHERY_ONLY
-    // Use ADST for periphery blocks only
-    const int inner = ib & 5;
-    tx_type = txfm_map(pred_mode_conv(
-        (MB_PREDICTION_MODE)xd->mode_info_context->bmi[ic].as_mode.first));
-
-#if USE_ADST_FOR_REMOTE_EDGE
-    if (inner == 5)
-      tx_type = DCT_DCT;
-#else
-    if (inner == 1) {
-      if (tx_type == ADST_ADST) tx_type = ADST_DCT;
-      else if (tx_type == DCT_ADST) tx_type = DCT_DCT;
-    } else if (inner == 4) {
-      if (tx_type == ADST_ADST) tx_type = DCT_ADST;
-      else if (tx_type == ADST_DCT) tx_type = DCT_DCT;
-    } else if (inner == 5) {
-      tx_type = DCT_DCT;
-    }
-#endif
-#else
-    // Use ADST
-    b += ic - ib;
-    tx_type = txfm_map(pred_mode_conv(
-        (MB_PREDICTION_MODE)b->bmi.as_mode.first));
-#endif
-#else
-    // Use 2D DCT
-    tx_type = DCT_DCT;
-#endif
-#endif  // !CONFIG_SB8X8
   } else if (xd->mode_info_context->mbmi.mode <= TM_PRED &&
              xd->q_index < ACTIVE_HT) {
 #if USE_ADST_FOR_I16X16_4X4
@@ -739,15 +652,6 @@
 #endif
   if (ib >= (1 << (wb + hb)))  // no chroma adst
     return tx_type;
-#if !CONFIG_SB8X8
-  if (xd->mode_info_context->mbmi.mode == I8X8_PRED &&
-      xd->q_index < ACTIVE_HT8) {
-    // TODO(rbultje): MB_PREDICTION_MODE / B_PREDICTION_MODE should be merged
-    // or the relationship otherwise modified to address this type conversion.
-    tx_type = txfm_map(pred_mode_conv(
-           (MB_PREDICTION_MODE)xd->mode_info_context->bmi[ib].as_mode.first));
-  } else
-#endif  // CONFIG_SB8X8
   if (xd->mode_info_context->mbmi.mode <= TM_PRED &&
       xd->q_index < ACTIVE_HT8) {
 #if USE_ADST_FOR_I16X16_8X8
@@ -821,9 +725,6 @@
 static TX_SIZE get_uv_tx_size(const MACROBLOCKD *xd) {
   MB_MODE_INFO *mbmi = &xd->mode_info_context->mbmi;
   const TX_SIZE size = mbmi->txfm_size;
-#if !CONFIG_SB8X8
-  const MB_PREDICTION_MODE mode = mbmi->mode;
-#endif  // !CONFIG_SB8X8
 
   switch (mbmi->sb_type) {
     case BLOCK_SIZE_SB64X64:
@@ -835,7 +736,6 @@
         return TX_16X16;
       else
         return size;
-#if CONFIG_SB8X8
     case BLOCK_SIZE_SB32X16:
     case BLOCK_SIZE_SB16X32:
     case BLOCK_SIZE_MB16X16:
@@ -845,15 +745,6 @@
         return size;
     default:
       return TX_4X4;
-#else  // CONFIG_SB8X8
-    default:
-      if (size == TX_16X16)
-        return TX_8X8;
-      else if (size == TX_8X8 && (mode == I8X8_PRED || mode == SPLITMV))
-        return TX_4X4;
-      else
-        return size;
-#endif  // CONFIG_SB8X8
   }
 
   return size;
@@ -885,33 +776,12 @@
   return res;
 }
 
-/* TODO(jkoleszar): Probably best to remove instances that require this,
- * as the data likely becomes per-plane and stored in the per-plane structures.
- * This is a stub to work with the existing code.
- */
-static INLINE int old_block_idx_4x4(MACROBLOCKD* const xd, int block_size_b,
-                                    int plane, int i) {
-  const int luma_blocks = 1 << block_size_b;
-  assert(xd->plane[0].subsampling_x == 0);
-  assert(xd->plane[0].subsampling_y == 0);
-  assert(xd->plane[1].subsampling_x == 1);
-  assert(xd->plane[1].subsampling_y == 1);
-  assert(xd->plane[2].subsampling_x == 1);
-  assert(xd->plane[2].subsampling_y == 1);
-  return plane == 0 ? i :
-         plane == 1 ? luma_blocks + i :
-                      luma_blocks * 5 / 4 + i;
-}
-
 typedef void (*foreach_transformed_block_visitor)(int plane, int block,
                                                   BLOCK_SIZE_TYPE bsize,
                                                   int ss_txfrm_size,
                                                   void *arg);
 static INLINE void foreach_transformed_block_in_plane(
     const MACROBLOCKD* const xd, BLOCK_SIZE_TYPE bsize, int plane,
-#if !CONFIG_SB8X8
-    int is_split,
-#endif  // !CONFIG_SB8X8
     foreach_transformed_block_visitor visit, void *arg) {
   const int bw = b_width_log2(bsize), bh = b_height_log2(bsize);
 
@@ -931,9 +801,6 @@
   const int ss_max = MAX(xd->plane[plane].subsampling_x,
                          xd->plane[plane].subsampling_y);
   const int ss_txfrm_size = txfrm_size_b > ss_block_size
-#if !CONFIG_SB8X8
-                            || is_split
-#endif  // !CONFIG_SB8X8
                                 ? txfrm_size_b - ss_max * 2
                                 : txfrm_size_b;
   const int step = 1 << ss_txfrm_size;
@@ -950,24 +817,10 @@
 static INLINE void foreach_transformed_block(
     const MACROBLOCKD* const xd, BLOCK_SIZE_TYPE bsize,
     foreach_transformed_block_visitor visit, void *arg) {
-#if !CONFIG_SB8X8
-  const MB_PREDICTION_MODE mode = xd->mode_info_context->mbmi.mode;
-  const int is_split =
-      xd->mode_info_context->mbmi.txfm_size == TX_8X8 &&
-      (mode == I8X8_PRED || mode == SPLITMV);
-#endif  // !CONFIG_SB8X8
   int plane;
 
   for (plane = 0; plane < MAX_MB_PLANE; plane++) {
-#if !CONFIG_SB8X8
-    const int is_split_chroma = is_split &&
-         xd->plane[plane].plane_type == PLANE_TYPE_UV;
-#endif  // !CONFIG_SB8X8
-
     foreach_transformed_block_in_plane(xd, bsize, plane,
-#if !CONFIG_SB8X8
-                                       is_split_chroma,
-#endif  // !CONFIG_SB8X8
                                        visit, arg);
   }
 }
@@ -975,19 +828,10 @@
 static INLINE void foreach_transformed_block_uv(
     const MACROBLOCKD* const xd, BLOCK_SIZE_TYPE bsize,
     foreach_transformed_block_visitor visit, void *arg) {
-#if !CONFIG_SB8X8
-  const MB_PREDICTION_MODE mode = xd->mode_info_context->mbmi.mode;
-  const int is_split =
-      xd->mode_info_context->mbmi.txfm_size == TX_8X8 &&
-      (mode == I8X8_PRED || mode == SPLITMV);
-#endif  // !CONFIG_SB8X8
   int plane;
 
   for (plane = 1; plane < MAX_MB_PLANE; plane++) {
     foreach_transformed_block_in_plane(xd, bsize, plane,
-#if !CONFIG_SB8X8
-                                       is_split,
-#endif  // !CONFIG_SB8X8
                                        visit, arg);
   }
 }
@@ -1015,16 +859,8 @@
   int pred_w, pred_h;
 
   if (mode == SPLITMV) {
-#if CONFIG_SB8X8
     pred_w = 0;
     pred_h = 0;
-#else
-    // 4x4 or 8x8
-    const int is_4x4 =
-        (xd->mode_info_context->mbmi.partitioning == PARTITIONING_4X4);
-    pred_w = is_4x4 ? 0 : 1 >> xd->plane[plane].subsampling_x;
-    pred_h = is_4x4 ? 0 : 1 >> xd->plane[plane].subsampling_y;
-#endif
   } else {
     pred_w = bw;
     pred_h = bh;
@@ -1117,13 +953,6 @@
     return xd->mode_info_context->mbmi.txfm_size;
   } else {
     const int bw = b_width_log2(bsize), bh = b_height_log2(bsize);
-#if !CONFIG_SB8X8
-    const MB_PREDICTION_MODE mode = xd->mode_info_context->mbmi.mode;
-    const int is_split =
-        xd->mode_info_context->mbmi.txfm_size == TX_8X8 &&
-        (mode == I8X8_PRED || mode == SPLITMV);
-#endif
-
     // block and transform sizes, in number of 4x4 blocks log 2 ("*_b")
     // 4x4=0, 8x8=2, 16x16=4, 32x32=6, 64x64=8
     const TX_SIZE tx_size = xd->mode_info_context->mbmi.txfm_size;
@@ -1140,9 +969,6 @@
     const int ss_max = MAX(xd->plane[plane].subsampling_x,
                            xd->plane[plane].subsampling_y);
     const int ss_txfrm_size = txfrm_size_b > ss_block_size
-#if !CONFIG_SB8X8
-                            || is_split
-#endif  // !CONFIG_SB8X8
                                   ? txfrm_size_b - ss_max * 2
                                   : txfrm_size_b;
     return (TX_SIZE)(ss_txfrm_size / 2);
diff --git a/vp9/common/vp9_entropymode.c b/vp9/common/vp9_entropymode.c
index ed5441c..31c1a52 100644
--- a/vp9/common/vp9_entropymode.c
+++ b/vp9/common/vp9_entropymode.c
@@ -16,7 +16,6 @@
 #include "vpx_mem/vpx_mem.h"
 
 static const unsigned int kf_y_mode_cts[8][VP9_YMODES] = {
-#if CONFIG_SB8X8
   /* DC V   H  D45 135 117 153 D27 D63 TM i4X4 */
   {12,  6,  5,  5,  5,  5,  5,  5,  5,  2, 200},
   {25, 13, 13,  7,  7,  7,  7,  7,  7,  6, 160},
@@ -26,27 +25,11 @@
   {68, 33, 35,  8,  8,  8,  8,  8,  8, 17,  68},
   {78, 38, 38,  8,  8,  8,  8,  8,  8, 19,  52},
   {89, 42, 42,  8,  8,  8,  8,  8,  8, 21,  34},
-#else
-  /* DC V   H  D45 135 117 153 D27 D63 TM i8x8 i4X4 */
-  {12,  6,  5,  5,  5,  5,  5,  5,  5,  2, 22, 200},
-  {25, 13, 13,  7,  7,  7,  7,  7,  7,  6, 27, 160},
-  {31, 17, 18,  8,  8,  8,  8,  8,  8,  9, 26, 139},
-  {40, 22, 23,  8,  8,  8,  8,  8,  8, 12, 27, 116},
-  {53, 26, 28,  8,  8,  8,  8,  8,  8, 13, 26,  94},
-  {68, 33, 35,  8,  8,  8,  8,  8,  8, 17, 20,  68},
-  {78, 38, 38,  8,  8,  8,  8,  8,  8, 19, 16,  52},
-  {89, 42, 42,  8,  8,  8,  8,  8,  8, 21, 12,  34},
-#endif
 };
 
 static const unsigned int y_mode_cts  [VP9_YMODES] = {
-#if CONFIG_SB8X8
   /* DC V   H  D45 135 117 153 D27 D63 TM i4X4 */
   98, 19, 15, 14, 14, 14, 14, 12, 12, 13, 70
-#else
-  /* DC V   H  D45 135 117 153 D27 D63 TM i8x8 i4X4 */
-  98, 19, 15, 14, 14, 14, 14, 12, 12, 13, 16, 70
-#endif
 };
 
 static const unsigned int uv_mode_cts [VP9_YMODES] [VP9_UV_MODES] = {
@@ -61,19 +44,9 @@
   { 150, 15, 10, 10, 10, 10, 10, 75, 10,  6}, /* D27 */
   { 150, 15, 10, 10, 10, 10, 10, 10, 75,  6}, /* D63 */
   { 160, 30, 30, 10, 10, 10, 10, 10, 10, 16}, /* TM */
-#if !CONFIG_SB8X8
-  { 132, 46, 40, 10, 10, 10, 10, 10, 10, 18}, /* i8x8 - never used */
-#endif
   { 150, 35, 41, 10, 10, 10, 10, 10, 10, 10}, /* i4X4 */
 };
 
-#if !CONFIG_SB8X8
-static const unsigned int i8x8_mode_cts  [VP9_I8X8_MODES] = {
-  /* DC V  H D45 135 117 153 D27 D63  TM */
-  73, 49, 61, 30, 30, 30, 30, 30, 30, 13
-};
-#endif
-
 static const unsigned int kf_uv_mode_cts [VP9_YMODES] [VP9_UV_MODES] = {
   // DC   V   H  D45 135 117 153 D27 D63 TM
   { 160, 24, 24, 20, 20, 20, 20, 20, 20,  8}, /* DC */
@@ -86,9 +59,6 @@
   { 102, 33, 20, 20, 20, 20, 20, 64, 20, 14}, /* D27 */
   { 102, 33, 20, 20, 20, 20, 20, 20, 64, 14}, /* D63 */
   { 132, 36, 30, 20, 20, 20, 20, 20, 20, 18}, /* TM */
-#if !CONFIG_SB8X8
-  { 122, 41, 35, 20, 20, 20, 20, 20, 20, 18}, /* i8x8 - never used */
-#endif
   { 122, 41, 35, 20, 20, 20, 20, 20, 20, 18}, /* I4X4 */
 };
 
@@ -146,45 +116,13 @@
   { 208, 1, 1  }
 };
 
-#if !CONFIG_SB8X8
-vp9_mbsplit vp9_mbsplits [VP9_NUMMBSPLITS] = {
-  {
-    0,  0,  0,  0,
-    0,  0,  0,  0,
-    1,  1,  1,  1,
-    1,  1,  1,  1,
-  }, {
-    0,  0,  1,  1,
-    0,  0,  1,  1,
-    0,  0,  1,  1,
-    0,  0,  1,  1,
-  }, {
-    0,  0,  1,  1,
-    0,  0,  1,  1,
-    2,  2,  3,  3,
-    2,  2,  3,  3,
-  }, {
-    0,  1,  2,  3,
-    4,  5,  6,  7,
-    8,  9,  10, 11,
-    12, 13, 14, 15,
-  },
-};
-
-const int vp9_mbsplit_count [VP9_NUMMBSPLITS] = { 2, 2, 4, 16};
-
-const vp9_prob vp9_mbsplit_probs [VP9_NUMMBSPLITS - 1] = { 110, 111, 150};
-#endif
-
 const vp9_prob vp9_partition_probs[NUM_PARTITION_CONTEXTS]
                                   [PARTITION_TYPES - 1] = {
-#if CONFIG_SB8X8
   // FIXME(jingning,rbultje) put real probabilities here
   {202, 162, 107},
   {16,  2,   169},
   {3,   246,  19},
   {104, 90,  134},
-#endif
   {202, 162, 107},
   {16,  2,   169},
   {3,   246,  19},
@@ -260,12 +198,7 @@
   -D27_PRED, -D63_PRED,
   16, 18,
   -V_PRED, -H_PRED,
-#if CONFIG_SB8X8
   -TM_PRED, -I4X4_PRED
-#else
-  -TM_PRED, 20,
-  -I4X4_PRED, -I8X8_PRED
-#endif
 };
 
 const vp9_tree_index vp9_kf_ymode_tree[VP9_YMODES * 2 - 2] = {
@@ -278,28 +211,9 @@
   -D27_PRED, -D63_PRED,
   16, 18,
   -V_PRED, -H_PRED,
-#if CONFIG_SB8X8
   -TM_PRED, -I4X4_PRED
-#else
-  -TM_PRED, 20,
-  -I4X4_PRED, -I8X8_PRED
-#endif
 };
 
-#if !CONFIG_SB8X8
-const vp9_tree_index vp9_i8x8_mode_tree[VP9_I8X8_MODES * 2 - 2] = {
-  2, 14,
-  -DC_PRED, 4,
-  6, 8,
-  -D45_PRED, -D135_PRED,
-  10, 12,
-  -D117_PRED, -D153_PRED,
-  -D27_PRED, -D63_PRED,
-  -V_PRED, 16,
-  -H_PRED, -TM_PRED
-};
-#endif
-
 const vp9_tree_index vp9_uv_mode_tree[VP9_UV_MODES * 2 - 2] = {
   2, 14,
   -DC_PRED, 4,
@@ -312,14 +226,6 @@
   -H_PRED, -TM_PRED
 };
 
-#if !CONFIG_SB8X8
-const vp9_tree_index vp9_mbsplit_tree[6] = {
-  -PARTITIONING_4X4,   2,
-  -PARTITIONING_8X8,   4,
-  -PARTITIONING_16X8, -PARTITIONING_8X16,
-};
-#endif
-
 const vp9_tree_index vp9_mv_ref_tree[8] = {
   -ZEROMV, 2,
   -NEARESTMV, 4,
@@ -352,10 +258,6 @@
 struct vp9_token vp9_sb_kf_ymode_encodings[VP9_I32X32_MODES];
 struct vp9_token vp9_kf_ymode_encodings[VP9_YMODES];
 struct vp9_token vp9_uv_mode_encodings[VP9_UV_MODES];
-#if !CONFIG_SB8X8
-struct vp9_token vp9_i8x8_mode_encodings[VP9_I8X8_MODES];
-struct vp9_token vp9_mbsplit_encodings[VP9_NUMMBSPLITS];
-#endif
 
 struct vp9_token vp9_mv_ref_encoding_array[VP9_MVREFS];
 struct vp9_token vp9_sb_mv_ref_encoding_array[VP9_MVREFS];
@@ -386,16 +288,8 @@
                                      bct, uv_mode_cts[i], 0);
   }
 
-#if !CONFIG_SB8X8
-  vp9_tree_probs_from_distribution(vp9_i8x8_mode_tree, x->fc.i8x8_mode_prob,
-                                   bct, i8x8_mode_cts, 0);
-#endif
-
   vpx_memcpy(x->fc.sub_mv_ref_prob, vp9_sub_mv_ref_prob2,
              sizeof(vp9_sub_mv_ref_prob2));
-#if !CONFIG_SB8X8
-  vpx_memcpy(x->fc.mbsplit_prob, vp9_mbsplit_probs, sizeof(vp9_mbsplit_probs));
-#endif
   vpx_memcpy(x->fc.switchable_interp_prob, vp9_switchable_interp_prob,
              sizeof(vp9_switchable_interp_prob));
 
@@ -499,10 +393,6 @@
   vp9_tokens_from_tree(vp9_sb_ymode_encodings, vp9_sb_ymode_tree);
   vp9_tokens_from_tree(vp9_sb_kf_ymode_encodings, vp9_sb_kf_ymode_tree);
   vp9_tokens_from_tree(vp9_uv_mode_encodings,  vp9_uv_mode_tree);
-#if !CONFIG_SB8X8
-  vp9_tokens_from_tree(vp9_i8x8_mode_encodings,  vp9_i8x8_mode_tree);
-  vp9_tokens_from_tree(vp9_mbsplit_encodings, vp9_mbsplit_tree);
-#endif
   vp9_tokens_from_tree(vp9_switchable_interp_encodings,
                        vp9_switchable_interp_tree);
   vp9_tokens_from_tree(vp9_partition_encodings, vp9_partition_tree);
@@ -681,11 +571,6 @@
   update_mode_probs(VP9_NKF_BINTRAMODES, vp9_bmode_tree,
                     fc->bmode_counts, fc->pre_bmode_prob,
                     fc->bmode_prob, 0);
-#if !CONFIG_SB8X8
-  update_mode_probs(VP9_I8X8_MODES,
-                    vp9_i8x8_mode_tree, fc->i8x8_mode_counts,
-                    fc->pre_i8x8_mode_prob, fc->i8x8_mode_prob, 0);
-#endif
 
   for (i = 0; i < SUBMVREF_COUNT; ++i)
     update_mode_probs(VP9_SUBMVREFS,
@@ -693,11 +578,6 @@
                       fc->pre_sub_mv_ref_prob[i], fc->sub_mv_ref_prob[i],
                       LEFT4X4);
 
-#if !CONFIG_SB8X8
-  update_mode_probs(VP9_NUMMBSPLITS, vp9_mbsplit_tree,
-                    fc->mbsplit_counts, fc->pre_mbsplit_prob,
-                    fc->mbsplit_prob, 0);
-#endif
 #if CONFIG_COMP_INTERINTRA_PRED
   if (cm->use_interintra) {
     int factor, interintra_prob, count;
@@ -771,8 +651,7 @@
   vp9_update_mode_info_border(cm, cm->prev_mip);
   vp9_update_mode_info_in_image(cm, cm->prev_mi);
 
-  cm->ref_frame_sign_bias[GOLDEN_FRAME] = 0;
-  cm->ref_frame_sign_bias[ALTREF_FRAME] = 0;
+  vpx_memset(cm->ref_frame_sign_bias, 0, sizeof(cm->ref_frame_sign_bias));
 
   cm->frame_context_idx = 0;
 }
diff --git a/vp9/common/vp9_entropymode.h b/vp9/common/vp9_entropymode.h
index 24f988f..f49bb3b 100644
--- a/vp9/common/vp9_entropymode.h
+++ b/vp9/common/vp9_entropymode.h
@@ -15,9 +15,6 @@
 #include "vp9/common/vp9_treecoder.h"
 
 #define SUBMVREF_COUNT 5
-#if !CONFIG_SB8X8
-#define VP9_NUMMBSPLITS 4
-#endif
 
 #if CONFIG_COMP_INTERINTRA_PRED
 #define VP9_DEF_INTERINTRA_PROB 248
@@ -26,16 +23,6 @@
 #define SEPARATE_INTERINTRA_UV  0
 #endif
 
-#if !CONFIG_SB8X8
-typedef const int vp9_mbsplit[16];
-
-extern vp9_mbsplit vp9_mbsplits[VP9_NUMMBSPLITS];
-
-extern const int vp9_mbsplit_count[VP9_NUMMBSPLITS];    /* # of subsets */
-
-extern const vp9_prob vp9_mbsplit_probs[VP9_NUMMBSPLITS - 1];
-#endif
-
 extern int vp9_mv_cont(const int_mv *l, const int_mv *a);
 
 extern const vp9_prob vp9_sub_mv_ref_prob2[SUBMVREF_COUNT][VP9_SUBMVREFS - 1];
@@ -52,10 +39,6 @@
 extern const vp9_tree_index  vp9_uv_mode_tree[];
 #define vp9_sb_ymode_tree vp9_uv_mode_tree
 #define vp9_sb_kf_ymode_tree vp9_uv_mode_tree
-#if !CONFIG_SB8X8
-extern const vp9_tree_index  vp9_i8x8_mode_tree[];
-extern const vp9_tree_index  vp9_mbsplit_tree[];
-#endif
 extern const vp9_tree_index  vp9_mv_ref_tree[];
 extern const vp9_tree_index  vp9_sb_mv_ref_tree[];
 extern const vp9_tree_index  vp9_sub_mv_ref_tree[];
@@ -67,10 +50,6 @@
 extern struct vp9_token vp9_sb_kf_ymode_encodings[VP9_I32X32_MODES];
 extern struct vp9_token vp9_kf_ymode_encodings[VP9_YMODES];
 extern struct vp9_token vp9_uv_mode_encodings[VP9_UV_MODES];
-#if !CONFIG_SB8X8
-extern struct vp9_token vp9_i8x8_mode_encodings[VP9_I8X8_MODES];
-extern struct vp9_token vp9_mbsplit_encodings[VP9_NUMMBSPLITS];
-#endif
 
 /* Inter mode values do not start at zero */
 
diff --git a/vp9/common/vp9_enums.h b/vp9/common/vp9_enums.h
index 3f00ba4..51454c1 100644
--- a/vp9/common/vp9_enums.h
+++ b/vp9/common/vp9_enums.h
@@ -13,22 +13,16 @@
 
 #include "./vpx_config.h"
 
-#if CONFIG_SB8X8
 #define LOG2_MI_SIZE 3
-#else
-#define LOG2_MI_SIZE 4
-#endif
 
 #define MI_SIZE (1 << LOG2_MI_SIZE)
 #define MI_UV_SIZE (1 << (LOG2_MI_SIZE - 1))
 
 typedef enum BLOCK_SIZE_TYPE {
   BLOCK_SIZE_AB4X4,
-#if CONFIG_SB8X8
   BLOCK_SIZE_SB8X8,
   BLOCK_SIZE_SB8X16,
   BLOCK_SIZE_SB16X8,
-#endif
   BLOCK_SIZE_MB16X16,
   BLOCK_SIZE_SB16X32,
   BLOCK_SIZE_SB32X16,
@@ -47,6 +41,6 @@
 } PARTITION_TYPE;
 
 #define PARTITION_PLOFFSET   4  // number of probability models per block size
-#define NUM_PARTITION_CONTEXTS ((2 + CONFIG_SB8X8) * PARTITION_PLOFFSET)
+#define NUM_PARTITION_CONTEXTS (3 * PARTITION_PLOFFSET)
 
 #endif  // VP9_COMMON_VP9_ENUMS_H_
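
Quick cross-check, using only values visible in this patch: three splittable square sizes (64x64, 32x32, 16x16) times PARTITION_PLOFFSET contexts each gives the 12 rows now carried by vp9_partition_probs.

PARTITION_PLOFFSET = 4  # number of probability models per block size
SPLITTABLE_SQUARE_SIZES = ['64X64', '32X32', '16X16']  # assumed ordering

NUM_PARTITION_CONTEXTS = len(SPLITTABLE_SQUARE_SIZES) * PARTITION_PLOFFSET
assert NUM_PARTITION_CONTEXTS == 12  # 3 * PARTITION_PLOFFSET in the header
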
diff --git a/vp9/common/vp9_findnearmv.h b/vp9/common/vp9_findnearmv.h
index df1ab73..0a1c413 100644
--- a/vp9/common/vp9_findnearmv.h
+++ b/vp9/common/vp9_findnearmv.h
@@ -74,13 +74,9 @@
                            vp9_prob p[VP9_MVREFS - 1],
                            const int context);
 
-#if !CONFIG_SB8X8
-extern const uint8_t vp9_mbsplit_offset[4][16];
-#endif
-
 static int left_block_mv(const MACROBLOCKD *xd,
                          const MODE_INFO *cur_mb, int b) {
-  if (!(b & (3 >> CONFIG_SB8X8))) {
+  if (!(b & 1)) {
     if (!xd->left_available)
       return 0;
 
@@ -90,7 +86,7 @@
     if (cur_mb->mbmi.mode != SPLITMV)
       return cur_mb->mbmi.mv[0].as_int;
 
-    b += 4 >> CONFIG_SB8X8;
+    b += 2;
   }
 
   return (cur_mb->bmi + b - 1)->as_mv[0].as_int;
@@ -98,7 +94,7 @@
 
 static int left_block_second_mv(const MACROBLOCKD *xd,
                                 const MODE_INFO *cur_mb, int b) {
-  if (!(b & (3 >> CONFIG_SB8X8))) {
+  if (!(b & 1)) {
     if (!xd->left_available)
       return 0;
 
@@ -108,7 +104,7 @@
     if (cur_mb->mbmi.mode != SPLITMV)
       return cur_mb->mbmi.second_ref_frame > 0 ?
           cur_mb->mbmi.mv[1].as_int : cur_mb->mbmi.mv[0].as_int;
-    b += 4 >> CONFIG_SB8X8;
+    b += 2;
   }
 
   return cur_mb->mbmi.second_ref_frame > 0 ?
@@ -117,85 +113,69 @@
 }
 
 static int above_block_mv(const MODE_INFO *cur_mb, int b, int mi_stride) {
-  if (!(b >> (2 >> CONFIG_SB8X8))) {
+  if (!(b >> 1)) {
     /* On top edge, get from MB above us */
     cur_mb -= mi_stride;
 
     if (cur_mb->mbmi.mode != SPLITMV)
       return cur_mb->mbmi.mv[0].as_int;
-    b += 16 >> (2 * CONFIG_SB8X8);
+    b += 4;
   }
 
-  return (cur_mb->bmi + b - (4 >> CONFIG_SB8X8))->as_mv[0].as_int;
+  return (cur_mb->bmi + b - 2)->as_mv[0].as_int;
 }
 
 static int above_block_second_mv(const MODE_INFO *cur_mb, int b, int mi_stride) {
-  if (!(b >> (2 >> CONFIG_SB8X8))) {
+  if (!(b >> 1)) {
     /* On top edge, get from MB above us */
     cur_mb -= mi_stride;
 
     if (cur_mb->mbmi.mode != SPLITMV)
       return cur_mb->mbmi.second_ref_frame > 0 ?
           cur_mb->mbmi.mv[1].as_int : cur_mb->mbmi.mv[0].as_int;
-    b += 16 >> (2 * CONFIG_SB8X8);
+    b += 4;
   }
 
   return cur_mb->mbmi.second_ref_frame > 0 ?
-      (cur_mb->bmi + b - (4 >> CONFIG_SB8X8))->as_mv[1].as_int :
-      (cur_mb->bmi + b - (4 >> CONFIG_SB8X8))->as_mv[0].as_int;
+      (cur_mb->bmi + b - 2)->as_mv[1].as_int :
+      (cur_mb->bmi + b - 2)->as_mv[0].as_int;
 }
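
Illustrative sketch of the new neighbour indexing implied by the +2/-2 and +4 offsets above, assuming the four bmi entries are laid out 0 1 on the top row and 2 3 on the bottom.

def left_block_index(b):
    # Return (use_left_mi, index) for the 4x4 block to the left of b.
    if not (b & 1):             # b is in the left column of its 8x8 MI
        return True, b + 2 - 1  # right column of the MI to the left
    return False, b - 1         # same MI, one block to the left

def above_block_index(b):
    # Return (use_above_mi, index) for the 4x4 block above b.
    if not (b >> 1):            # b is in the top row of its 8x8 MI
        return True, b + 4 - 2  # bottom row of the MI above
    return False, b - 2         # same MI, one block up

assert left_block_index(0) == (True, 1)    # left MI, top-right block
assert left_block_index(3) == (False, 2)   # same MI, block 2
assert above_block_index(1) == (True, 3)   # MI above, bottom-right block
assert above_block_index(2) == (False, 0)  # same MI, block 0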
 
 static B_PREDICTION_MODE left_block_mode(const MODE_INFO *cur_mb, int b) {
-#if CONFIG_SB8X8
   // FIXME(rbultje, jingning): temporary hack because jenkins doesn't
   // understand this condition. This will go away soon.
   if (b == 0 || b == 2) {
-#else
-  if (!(b & (3 >> CONFIG_SB8X8))) {
-#endif
     /* On L edge, get from MB to left of us */
     --cur_mb;
 
     if (cur_mb->mbmi.mode <= TM_PRED) {
       return pred_mode_conv(cur_mb->mbmi.mode);
-#if !CONFIG_SB8X8
-    } else if (cur_mb->mbmi.mode == I8X8_PRED) {
-      return pred_mode_conv(
-          (MB_PREDICTION_MODE)(cur_mb->bmi + 3 + b)->as_mode.first);
-#endif  // !CONFIG_SB8X8
     } else if (cur_mb->mbmi.mode == I4X4_PRED) {
-      return ((cur_mb->bmi + (3 >> CONFIG_SB8X8) + b)->as_mode.first);
+      return ((cur_mb->bmi + 1 + b)->as_mode.first);
     } else {
       return B_DC_PRED;
     }
   }
-#if CONFIG_SB8X8
   assert(b == 1 || b == 3);
-#endif
   return (cur_mb->bmi + b - 1)->as_mode.first;
 }
 
 static B_PREDICTION_MODE above_block_mode(const MODE_INFO *cur_mb,
                                           int b, int mi_stride) {
-  if (!(b >> (2 >> CONFIG_SB8X8))) {
+  if (!(b >> 1)) {
     /* On top edge, get from MB above us */
     cur_mb -= mi_stride;
 
     if (cur_mb->mbmi.mode <= TM_PRED) {
       return pred_mode_conv(cur_mb->mbmi.mode);
-#if !CONFIG_SB8X8
-    } else if (cur_mb->mbmi.mode == I8X8_PRED) {
-      return pred_mode_conv(
-          (MB_PREDICTION_MODE)(cur_mb->bmi + 12 + b)->as_mode.first);
-#endif
     } else if (cur_mb->mbmi.mode == I4X4_PRED) {
-      return ((cur_mb->bmi + (CONFIG_SB8X8 ? 2 : 12) + b)->as_mode.first);
+      return ((cur_mb->bmi + 2 + b)->as_mode.first);
     } else {
       return B_DC_PRED;
     }
   }
 
-  return (cur_mb->bmi + b - (4 >> CONFIG_SB8X8))->as_mode.first;
+  return (cur_mb->bmi + b - 2)->as_mode.first;
 }
 
 #endif  // VP9_COMMON_VP9_FINDNEARMV_H_
diff --git a/vp9/common/vp9_invtrans.c b/vp9/common/vp9_invtrans.c
index 458f135..01859df 100644
--- a/vp9/common/vp9_invtrans.c
+++ b/vp9/common/vp9_invtrans.c
@@ -19,153 +19,3 @@
   else
     xd->inv_txm4x4(dqcoeff, diff, pitch);
 }
-
-void vp9_inverse_transform_b_8x8(int16_t *input_dqcoeff, int16_t *output_coeff,
-                                 int pitch) {
-  vp9_short_idct8x8(input_dqcoeff, output_coeff, pitch);
-}
-
-void vp9_inverse_transform_b_16x16(int16_t *input_dqcoeff,
-                                   int16_t *output_coeff, int pitch) {
-  vp9_short_idct16x16(input_dqcoeff, output_coeff, pitch);
-}
-
-void vp9_inverse_transform_sby_32x32(MACROBLOCKD *xd, BLOCK_SIZE_TYPE bsize) {
-  const int bwl = b_width_log2(bsize) - 3, bw = 1 << bwl;
-  const int bh = 1 << (b_height_log2(bsize) - 3);
-  const int stride = 32 << bwl;
-  int n;
-
-  for (n = 0; n < bw * bh; n++) {
-    const int x_idx = n & (bw - 1), y_idx = n >> bwl;
-    const int offset = x_idx * 32 + y_idx * 32 * stride;
-
-    vp9_short_idct32x32(BLOCK_OFFSET(xd->plane[0].dqcoeff, n, 1024),
-                        xd->plane[0].diff + offset, stride * 2);
-  }
-}
-
-void vp9_inverse_transform_sby_16x16(MACROBLOCKD *xd, BLOCK_SIZE_TYPE bsize) {
-  const int bwl = b_width_log2(bsize) - 2, bw = 1 << bwl;
-  const int bh = 1 << (b_height_log2(bsize) - 2);
-  const int stride = 16 << bwl, bstride = 4 << bwl;
-  int n;
-
-  for (n = 0; n < bw * bh; n++) {
-    const int x_idx = n & (bw - 1), y_idx = n >> bwl;
-    const TX_TYPE tx_type = get_tx_type_16x16(xd,
-                                              (y_idx * bstride + x_idx) * 4);
-    const int offset = x_idx * 16 + y_idx * 16 * stride;
-
-    if (tx_type == DCT_DCT) {
-      vp9_inverse_transform_b_16x16(BLOCK_OFFSET(xd->plane[0].dqcoeff, n, 256),
-                                    xd->plane[0].diff + offset, stride * 2);
-    } else {
-      vp9_short_iht16x16(BLOCK_OFFSET(xd->plane[0].dqcoeff, n, 256),
-                         xd->plane[0].diff + offset, stride, tx_type);
-    }
-  }
-}
-
-void vp9_inverse_transform_sby_8x8(MACROBLOCKD *xd, BLOCK_SIZE_TYPE bsize) {
-  const int bwl = b_width_log2(bsize) - 1, bw = 1 << bwl;
-  const int bh = 1 << (b_height_log2(bsize) - 1);
-  const int stride = 8 << bwl, bstride = 2 << bwl;
-  int n;
-
-  for (n = 0; n < bw * bh; n++) {
-    const int x_idx = n & (bw - 1), y_idx = n >> bwl;
-    const TX_TYPE tx_type = get_tx_type_8x8(xd, (y_idx * bstride + x_idx) * 2);
-    const int offset = x_idx * 8 + y_idx * 8 * stride;
-
-    if (tx_type == DCT_DCT) {
-      vp9_inverse_transform_b_8x8(BLOCK_OFFSET(xd->plane[0].dqcoeff, n, 64),
-                                  xd->plane[0].diff + offset, stride * 2);
-    } else {
-      vp9_short_iht8x8(BLOCK_OFFSET(xd->plane[0].dqcoeff, n, 64),
-                       xd->plane[0].diff + offset, stride, tx_type);
-    }
-  }
-}
-
-void vp9_inverse_transform_sby_4x4(MACROBLOCKD *xd, BLOCK_SIZE_TYPE bsize) {
-  const int bwl = b_width_log2(bsize), bw = 1 << bwl;
-  const int bh = 1 << b_height_log2(bsize);
-  const int stride = 4 << bwl, bstride = 1 << bwl;
-  int n;
-
-  for (n = 0; n < bw * bh; n++) {
-    const int x_idx = n & (bw - 1), y_idx = n >> bwl;
-    const TX_TYPE tx_type = get_tx_type_4x4(xd, y_idx * bstride + x_idx);
-    const int offset = x_idx * 4 + y_idx * 4 * stride;
-
-    if (tx_type == DCT_DCT) {
-      vp9_inverse_transform_b_4x4(xd, xd->plane[0].eobs[n],
-                                  BLOCK_OFFSET(xd->plane[0].dqcoeff, n, 16),
-                                  xd->plane[0].diff + offset, stride * 2);
-    } else {
-      vp9_short_iht4x4(BLOCK_OFFSET(xd->plane[0].dqcoeff, n, 16),
-                       xd->plane[0].diff + offset, stride, tx_type);
-    }
-  }
-}
-
-void vp9_inverse_transform_sbuv_32x32(MACROBLOCKD *xd, BLOCK_SIZE_TYPE bsize) {
-  assert(bsize == BLOCK_SIZE_SB64X64);
-
-  vp9_short_idct32x32(xd->plane[1].dqcoeff, xd->plane[1].diff, 64);
-  vp9_short_idct32x32(xd->plane[2].dqcoeff, xd->plane[2].diff, 64);
-}
-
-void vp9_inverse_transform_sbuv_16x16(MACROBLOCKD *xd, BLOCK_SIZE_TYPE bsize) {
-  const int bwl = b_width_log2(bsize) - 2, bhl = b_height_log2(bsize) - 2;
-  const int bw = 1 << (bwl - 1), bh = 1 << (bhl - 1);
-  const int stride = 16 << (bwl - 1);
-  int n;
-
-  for (n = 0; n < bw * bh; n++) {
-    const int x_idx = n & (bw - 1), y_idx = n >> (bwl - 1);
-    const int off = x_idx * 16 + y_idx * stride * 16;
-
-    vp9_inverse_transform_b_16x16(BLOCK_OFFSET(xd->plane[1].dqcoeff, n, 256),
-                                  xd->plane[1].diff + off, stride * 2);
-    vp9_inverse_transform_b_16x16(BLOCK_OFFSET(xd->plane[2].dqcoeff, n, 256),
-                                  xd->plane[2].diff + off, stride * 2);
-  }
-}
-
-void vp9_inverse_transform_sbuv_8x8(MACROBLOCKD *xd, BLOCK_SIZE_TYPE bsize) {
-  const int bwl = b_width_log2(bsize) - 1, bhl = b_height_log2(bsize) - 1;
-  const int bw = 1 << (bwl - 1), bh = 1 << (bhl - 1);
-  const int stride = 8 << (bwl - 1);
-  int n;
-
-  for (n = 0; n < bw * bh; n++) {
-    const int x_idx = n & (bw - 1), y_idx = n >> (bwl - 1);
-    const int off = x_idx * 8 + y_idx * stride * 8;
-
-    vp9_inverse_transform_b_8x8(BLOCK_OFFSET(xd->plane[1].dqcoeff, n, 64),
-                                xd->plane[1].diff + off, stride * 2);
-    vp9_inverse_transform_b_8x8(BLOCK_OFFSET(xd->plane[2].dqcoeff, n, 64),
-                                xd->plane[2].diff + off, stride * 2);
-  }
-}
-
-void vp9_inverse_transform_sbuv_4x4(MACROBLOCKD *xd, BLOCK_SIZE_TYPE bsize) {
-  const int bwl = b_width_log2(bsize), bhl = b_height_log2(bsize);
-  const int bw = 1 << (bwl - 1), bh = 1 << (bhl - 1);
-  const int stride = 4 << (bwl - 1);
-  int n;
-
-  for (n = 0; n < bw * bh; n++) {
-    const int x_idx = n & (bw - 1), y_idx = n >> (bwl - 1);
-    const int off = x_idx * 4 + y_idx * stride * 4;
-
-    vp9_inverse_transform_b_4x4(xd, xd->plane[1].eobs[n],
-                                BLOCK_OFFSET(xd->plane[1].dqcoeff, n, 16),
-                                xd->plane[1].diff + off, stride * 2);
-    vp9_inverse_transform_b_4x4(xd, xd->plane[2].eobs[n],
-                                BLOCK_OFFSET(xd->plane[2].dqcoeff, n, 16),
-                                xd->plane[2].diff + off, stride * 2);
-  }
-}
diff --git a/vp9/common/vp9_invtrans.h b/vp9/common/vp9_invtrans.h
index aeac9a0..2aeb584 100644
--- a/vp9/common/vp9_invtrans.h
+++ b/vp9/common/vp9_invtrans.h
@@ -18,20 +18,4 @@
 void vp9_inverse_transform_b_4x4(MACROBLOCKD *xd, int eob,
                                  int16_t *dqcoeff, int16_t *diff,
                                  int pitch);
-
-void vp9_inverse_transform_b_8x8(int16_t *input_dqcoeff,
-                                 int16_t *output_coeff, int pitch);
-
-void vp9_inverse_transform_b_16x16(int16_t *input_dqcoeff,
-                                   int16_t *output_coeff, int pitch);
-
-void vp9_inverse_transform_sby_32x32(MACROBLOCKD *xd, BLOCK_SIZE_TYPE bsize);
-void vp9_inverse_transform_sby_16x16(MACROBLOCKD *xd, BLOCK_SIZE_TYPE bsize);
-void vp9_inverse_transform_sby_8x8(MACROBLOCKD *xd, BLOCK_SIZE_TYPE bsize);
-void vp9_inverse_transform_sby_4x4(MACROBLOCKD *xd, BLOCK_SIZE_TYPE bsize);
-void vp9_inverse_transform_sbuv_32x32(MACROBLOCKD *xd, BLOCK_SIZE_TYPE bsize);
-void vp9_inverse_transform_sbuv_16x16(MACROBLOCKD *xd, BLOCK_SIZE_TYPE bsize);
-void vp9_inverse_transform_sbuv_8x8(MACROBLOCKD *xd, BLOCK_SIZE_TYPE bsize);
-void vp9_inverse_transform_sbuv_4x4(MACROBLOCKD *xd, BLOCK_SIZE_TYPE bsize);
-
 #endif  // VP9_COMMON_VP9_INVTRANS_H_
diff --git a/vp9/common/vp9_loopfilter.c b/vp9/common/vp9_loopfilter.c
index edb0c54..022abb8 100644
--- a/vp9/common/vp9_loopfilter.c
+++ b/vp9/common/vp9_loopfilter.c
@@ -27,9 +27,6 @@
   lfi->mode_lf_lut[H_PRED] = 1;
   lfi->mode_lf_lut[TM_PRED] = 1;
   lfi->mode_lf_lut[I4X4_PRED]  = 0;
-#if !CONFIG_SB8X8
-  lfi->mode_lf_lut[I8X8_PRED] = 0;
-#endif
   lfi->mode_lf_lut[ZEROMV]  = 1;
   lfi->mode_lf_lut[NEARESTMV] = 2;
   lfi->mode_lf_lut[NEARMV] = 2;
@@ -169,12 +166,7 @@
 static int mb_lf_skip(const MB_MODE_INFO *const mbmi) {
   const int skip_coef = mbmi->mb_skip_coeff;
   const int tx_size = mbmi->txfm_size;
-#if CONFIG_SB8X8
   return mbmi->sb_type >= BLOCK_SIZE_MB16X16 &&
-#else
-  const MB_PREDICTION_MODE mode = mbmi->mode;
-  return mode != I4X4_PRED && mode != I8X8_PRED && mode != SPLITMV &&
-#endif
          (tx_size >= TX_16X16 || skip_coef);
 }
 
@@ -227,11 +219,7 @@
       if (!skip_lf) {
         if (tx_size >= TX_8X8) {
           if (tx_size == TX_8X8 &&
-#if CONFIG_SB8X8
               (mi->mbmi.sb_type < BLOCK_SIZE_MB16X16)
-#else
-              (mode == I8X8_PRED || mode == SPLITMV)
-#endif
               )
             vp9_loop_filter_bh8x8(y_ptr, u_ptr, v_ptr,
                                   y_stride, uv_stride, &lfi);
@@ -257,12 +245,7 @@
       if (!skip_lf) {
         if (tx_size >= TX_8X8) {
           if (tx_size == TX_8X8 &&
-#if CONFIG_SB8X8
-              (mi->mbmi.sb_type < BLOCK_SIZE_MB16X16)
-#else
-              (mode == I8X8_PRED || mode == SPLITMV)
-#endif
-              )
+              (mi->mbmi.sb_type < BLOCK_SIZE_MB16X16))
             vp9_loop_filter_bv8x8(y_ptr, u_ptr, v_ptr,
                                   y_stride, uv_stride, &lfi);
           else
@@ -322,7 +305,7 @@
       y_only? 0 : v_ptr,
       y_stride, uv_stride, dering);
   // process 2nd MB top-right
-  mi = mode_info_context + (1 << CONFIG_SB8X8);
+  mi = mode_info_context + 2;
   do_left_v = !(wbl >= 3 /* 32x16 or >=32x32 */ && (tx_size >= TX_32X32 ||
       sb_mb_lf_skip(mode_info_context, mi)));
   do_above_h = (mb_row > 0);
@@ -338,7 +321,7 @@
       y_stride, uv_stride, dering);
 
   // process 3rd MB bottom-left
-  mi = mode_info_context + (mis << CONFIG_SB8X8);
+  mi = mode_info_context + (mis << 1);
   do_left_v = (mb_col > 0);
   do_above_h = !(hbl >= 3 /* 16x32 or >=32x32 */ && (tx_size >= TX_32X32 ||
       sb_mb_lf_skip(mode_info_context, mi)));
@@ -354,15 +337,15 @@
       y_stride, uv_stride, dering);
 
   // process 4th MB bottom right
-  mi = mode_info_context + ((mis + 1) << CONFIG_SB8X8);
+  mi = mode_info_context + ((mis + 1) << 1);
   do_left_v = !(wbl >= 3 /* 32x16 or >=32x32 */ && (tx_size >= TX_32X32 ||
-      sb_mb_lf_skip(mi - (1 << CONFIG_SB8X8), mi)));
+      sb_mb_lf_skip(mi - 2, mi)));
   do_above_h = !(hbl >= 3 /* 16x32 or >=32x32 */ && (tx_size >= TX_32X32 ||
-      sb_mb_lf_skip(mode_info_context + (1 << CONFIG_SB8X8), mi)));
+      sb_mb_lf_skip(mode_info_context + 2, mi)));
   do_left_v_mbuv = (wbl >= 3 /* 32x16 or >=32x32 */ && (tx_size >= TX_16X16 ||
-      sb_mb_lf_skip(mi - (1 << CONFIG_SB8X8), mi)));
+      sb_mb_lf_skip(mi - 2, mi)));
   do_above_h_mbuv = !(hbl >= 3 /* 16x32 or >=32x32 */ && (tx_size >= TX_16X16 ||
-      sb_mb_lf_skip(mode_info_context + (1 << CONFIG_SB8X8), mi)));
+      sb_mb_lf_skip(mode_info_context + 2, mi)));
   lpf_mb(cm, mi, do_left_v, do_above_h,
       do_left_v_mbuv, do_above_h_mbuv,
       y_ptr + 16 * y_stride + 16,
@@ -379,17 +362,16 @@
   lpf_sb32(cm, mode_info_context, mb_row, mb_col,
       y_ptr, u_ptr, v_ptr,
       y_stride, uv_stride, y_only, dering);
-  lpf_sb32(cm, mode_info_context + (2 << CONFIG_SB8X8), mb_row, mb_col + 2,
+  lpf_sb32(cm, mode_info_context + 4, mb_row, mb_col + 2,
       y_ptr + 32, u_ptr + 16, v_ptr + 16,
       y_stride, uv_stride, y_only, dering);
-  lpf_sb32(cm, mode_info_context + cm->mode_info_stride * (2 << CONFIG_SB8X8),
+  lpf_sb32(cm, mode_info_context + cm->mode_info_stride * 4,
       mb_row + 2, mb_col,
       y_ptr + 32 * y_stride,
       u_ptr + 16 * uv_stride,
       v_ptr + 16 * uv_stride,
       y_stride, uv_stride, y_only, dering);
-  lpf_sb32(cm, mode_info_context + cm->mode_info_stride *
-      (2 << CONFIG_SB8X8) + (2 << CONFIG_SB8X8),
+  lpf_sb32(cm, mode_info_context + cm->mode_info_stride * 4 + 4,
       mb_row + 2, mb_col + 2,
       y_ptr + 32 * y_stride + 32,
       u_ptr + 16 * uv_stride + 16,
@@ -459,14 +441,14 @@
       y_ptr += 64;
       u_ptr = y_only? 0 : u_ptr + 32;
       v_ptr = y_only? 0 : v_ptr + 32;
-      mode_info_context += 4 << CONFIG_SB8X8;       // step to next SB64
+      mode_info_context += 8;       // step to next SB64
     }
     if (extra_sb32_col) {
       // process 2 SB32s in the extra SB32 col
       lpf_sb32(cm, mode_info_context, mb_row, mb_col,
                y_ptr, u_ptr, v_ptr,
                y_stride, uv_stride, y_only, dering);
-      lpf_sb32(cm, mode_info_context + mis * (2 << CONFIG_SB8X8),
+      lpf_sb32(cm, mode_info_context + mis * 4,
                mb_row + 2, mb_col,
                y_ptr + 32 * y_stride,
                u_ptr + 16 * uv_stride,
@@ -475,7 +457,7 @@
       y_ptr += 32;
       u_ptr = y_only? 0 : u_ptr + 16;
       v_ptr = y_only? 0 : v_ptr + 16;
-      mode_info_context += 2 << CONFIG_SB8X8;       // step to next SB32
+      mode_info_context += 4;       // step to next SB32
       mb_col += 2;
     }
     if (extra_mb_col) {
@@ -493,7 +475,7 @@
              y_only? 0 : v_ptr,
              y_stride, uv_stride, dering);
       // process 2nd MB
-      mi = mode_info_context + (mis << CONFIG_SB8X8);
+      mi = mode_info_context + (mis << 1);
       do_left_v = (mb_col > 0);
       do_above_h = 1;
       do_left_v_mbuv =  1;
@@ -505,7 +487,7 @@
              y_only ? 0 : (v_ptr + 8 * uv_stride),
              y_stride, uv_stride, dering);
       // process 3rd MB
-      mi = mode_info_context + (mis << CONFIG_SB8X8) * 2;
+      mi = mode_info_context + (mis << 1) * 2;
       do_left_v = (mb_col > 0);
       do_above_h = 1;
       do_left_v_mbuv =  1;
@@ -517,7 +499,7 @@
              y_only ? 0 : (v_ptr + 16 * uv_stride),
              y_stride, uv_stride, dering);
       // process 4th MB
-      mi = mode_info_context + (mis << CONFIG_SB8X8) * 3;
+      mi = mode_info_context + (mis << 1) * 3;
       do_left_v = (mb_col > 0);
       do_above_h = 1;
       do_left_v_mbuv =  1;
@@ -531,7 +513,7 @@
       y_ptr += 16;
       u_ptr = y_only? 0 : u_ptr + 8;
       v_ptr = y_only? 0 : v_ptr + 8;
-      mode_info_context += 1 << CONFIG_SB8X8;       // step to next MB
+      mode_info_context += 2;       // step to next MB
     }
     // move pointers to the beginning of next sb64 row
     y_ptr += y_stride  * 64 - post->y_width;
@@ -540,7 +522,7 @@
       v_ptr += uv_stride *  32 - post->uv_width;
     }
     /* skip to next SB64 row */
-    mode_info_context += mis * (4 << CONFIG_SB8X8) - cm->mi_cols;
+    mode_info_context += mis * 8 - cm->mi_cols;
   }
   if (extra_sb32_row) {
     const int sb32_cols = sb64_cols * 2 + extra_sb32_col;
@@ -551,7 +533,7 @@
       y_ptr += 32;
       u_ptr = y_only? 0 : u_ptr + 16;
       v_ptr = y_only? 0 : v_ptr + 16;
-      mode_info_context += 2 << CONFIG_SB8X8;       // step to next SB32
+      mode_info_context += 4;       // step to next SB32
     }
     if (extra_mb_col) {
       // process 1st MB
@@ -567,7 +549,7 @@
              y_only? NULL : v_ptr,
              y_stride, uv_stride, dering);
       // process 2nd MB
-      mi = mode_info_context + (mis << CONFIG_SB8X8);
+      mi = mode_info_context + (mis << 1);
       do_left_v = (mb_col > 0);
       do_above_h = 1;
       do_left_v_mbuv =  1;
@@ -581,14 +563,14 @@
       y_ptr += 16;
       u_ptr = y_only? 0 : u_ptr + 8;
       v_ptr = y_only? 0 : v_ptr + 8;
-      mode_info_context += 1 << CONFIG_SB8X8;       /* step to next MB */
+      mode_info_context += 2;       /* step to next MB */
     }
     // move pointers to the beginning of next sb64 row
     y_ptr += y_stride * 32 - post->y_width;
     u_ptr += y_only? 0 : uv_stride *  16 - post->uv_width;
     v_ptr += y_only? 0 : uv_stride *  16 - post->uv_width;
     // skip to the next MB row if it exists
-    mode_info_context += mis * (2 << CONFIG_SB8X8) - cm->mi_cols;
+    mode_info_context += mis * 4 - cm->mi_cols;
     mb_row += 2;
   }
   if (extra_mb_row) {
@@ -607,7 +589,7 @@
       y_ptr += 16;
       u_ptr = y_only? 0 : u_ptr + 8;
       v_ptr = y_only? 0 : v_ptr + 8;
-      mode_info_context += 1 << CONFIG_SB8X8;     // step to next MB
+      mode_info_context += 2;     // step to next MB
     }
   }
 }
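
With MODE_INFO fixed at 8x8-pixel granularity, the pointer steps that were
previously written as shifts by CONFIG_SB8X8 collapse to constants. A minimal
sketch of the mapping used throughout this file (the enum names below are
illustrative, not identifiers from the tree):

    /* One MODE_INFO covers an 8x8 pixel block. */
    enum {
      MI_PER_MB   = 2,  /* a 16x16 macroblock spans 2 MI columns */
      MI_PER_SB32 = 4,  /* a 32x32 superblock spans 4 MI columns */
      MI_PER_SB64 = 8   /* a 64x64 superblock spans 8 MI columns */
    };

    /* e.g. the MB to the right:  mi = mode_info_context + MI_PER_MB;
     *      the MB below:         mi = mode_info_context + mis * MI_PER_MB; */
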
diff --git a/vp9/common/vp9_mvref_common.c b/vp9/common/vp9_mvref_common.c
index 7a7ebe6..3f18c69 100644
--- a/vp9/common/vp9_mvref_common.c
+++ b/vp9/common/vp9_mvref_common.c
@@ -12,7 +12,6 @@
 
 #define MVREF_NEIGHBOURS 8
 
-#if CONFIG_SB8X8
 static int b_mv_ref_search[MVREF_NEIGHBOURS][2] = {
   {0, -1}, {-1, 0}, {-1, -1}, {0, -2},
   {-2, 0}, {-1, -2}, {-2, -1}, {-2, -2}
@@ -32,22 +31,6 @@
     {0, -1}, {-1, 0}, {2, -1}, {-1,  2},
     {4, -1}, {-1, 4}, {6, -1}, {-1, -1}
 };
-#else
-static int mb_mv_ref_search[MVREF_NEIGHBOURS][2] = {
-  {0, -1}, {-1, 0}, {-1, -1}, {0, -2},
-  {-2, 0}, {-1, -2}, {-2, -1}, {-2, -2}
-};
-
-static int sb_mv_ref_search[MVREF_NEIGHBOURS][2] = {
-  {0, -1}, {-1, 0}, {1, -1}, {-1, 1},
-  {-1, -1}, {0, -2}, {-2, 0}, {-1, -2}
-};
-
-static int sb64_mv_ref_search[MVREF_NEIGHBOURS][2] = {
-  {0, -1}, {-1, 0}, {1, -1}, {-1,  1},
-  {2, -1}, {-1, 2}, {3, -1}, {-1, -1}
-};
-#endif
 
 // clamp_mv_ref
 #define MV_BORDER (16 << 3) // Allow 16 pels in 1/8th pel units
@@ -190,15 +173,10 @@
     mv_ref_search = sb64_mv_ref_search;
   } else if (mbmi->sb_type >= BLOCK_SIZE_SB32X32) {
     mv_ref_search = sb_mv_ref_search;
-#if CONFIG_SB8X8
   } else if (mbmi->sb_type >= BLOCK_SIZE_MB16X16) {
     mv_ref_search = mb_mv_ref_search;
   } else {
     mv_ref_search = b_mv_ref_search;
-#else
-  } else {
-    mv_ref_search = mb_mv_ref_search;
-#endif
   }
 
   // We first scan for candidate vectors that match the current reference frame
@@ -208,7 +186,7 @@
 
     if ((mi_search_col >= cm->cur_tile_mi_col_start) &&
         (mi_search_col < cm->cur_tile_mi_col_end) &&
-        ((mv_ref_search[i][1] << (7 - CONFIG_SB8X8)) >= xd->mb_to_top_edge)) {
+        ((mv_ref_search[i][1] << 6) >= xd->mb_to_top_edge)) {
 
       candidate_mi = here + mv_ref_search[i][0] +
                      (mv_ref_search[i][1] * xd->mode_info_stride);
@@ -228,7 +206,7 @@
 
     if ((mi_search_col >= cm->cur_tile_mi_col_start) &&
         (mi_search_col < cm->cur_tile_mi_col_end) &&
-        ((mv_ref_search[i][1] << (7 - CONFIG_SB8X8)) >= xd->mb_to_top_edge)) {
+        ((mv_ref_search[i][1] << 6) >= xd->mb_to_top_edge)) {
       candidate_mi = here + mv_ref_search[i][0] +
                      (mv_ref_search[i][1] * xd->mode_info_stride);
 
@@ -258,7 +236,7 @@
 
     if ((mi_search_col >= cm->cur_tile_mi_col_start) &&
         (mi_search_col < cm->cur_tile_mi_col_end) &&
-        ((mv_ref_search[i][1] << (7 - CONFIG_SB8X8)) >= xd->mb_to_top_edge)) {
+        ((mv_ref_search[i][1] << 6) >= xd->mb_to_top_edge)) {
       candidate_mi = here + mv_ref_search[i][0] +
                      (mv_ref_search[i][1] * xd->mode_info_stride);
 
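
The "<< 6" in the edge tests above converts a MODE_INFO row offset into the
1/8th-pel units used for mb_to_top_edge: one MI row is 8 pixels tall and each
pixel is 8 eighth-pels, so the factor is 64. A small sketch of that
conversion, using a hypothetical helper name:

    /* Hypothetical helper; mirrors the "(mv_ref_search[i][1] << 6)" test. */
    static int mi_rows_to_eighth_pels(int mi_rows) {
      return mi_rows * 8 /* pixels per MI row */
                     * 8 /* eighth-pels per pixel */;
    }
    /* usage: mi_rows_to_eighth_pels(mv_ref_search[i][1]) >= xd->mb_to_top_edge */
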
diff --git a/vp9/common/vp9_onyxc_int.h b/vp9/common/vp9_onyxc_int.h
index bb873c1..fe6e14a 100644
--- a/vp9/common/vp9_onyxc_int.h
+++ b/vp9/common/vp9_onyxc_int.h
@@ -32,13 +32,9 @@
 
 void vp9_initialize_common(void);
 
-#if CONFIG_MULTIPLE_ARF
+// Define the number of candidate reference buffers.
 #define NUM_REF_FRAMES 8
 #define NUM_REF_FRAMES_LG2 3
-#else
-#define NUM_REF_FRAMES 3
-#define NUM_REF_FRAMES_LG2 2
-#endif
 
 #define ALLOWED_REFS_PER_FRAME 3
 
@@ -59,13 +55,7 @@
   vp9_prob ymode_prob[VP9_YMODES - 1]; /* interframe intra mode probs */
   vp9_prob sb_ymode_prob[VP9_I32X32_MODES - 1];
   vp9_prob uv_mode_prob[VP9_YMODES][VP9_UV_MODES - 1];
-#if !CONFIG_SB8X8
-  vp9_prob i8x8_mode_prob[VP9_I8X8_MODES - 1];
-#endif
   vp9_prob sub_mv_ref_prob[SUBMVREF_COUNT][VP9_SUBMVREFS - 1];
-#if !CONFIG_SB8X8
-  vp9_prob mbsplit_prob[VP9_NUMMBSPLITS - 1];
-#endif
   vp9_prob partition_prob[NUM_PARTITION_CONTEXTS][PARTITION_TYPES - 1];
 
   vp9_coeff_probs coef_probs_4x4[BLOCK_TYPES];
@@ -85,25 +75,13 @@
   vp9_prob pre_ymode_prob[VP9_YMODES - 1]; /* interframe intra mode probs */
   vp9_prob pre_sb_ymode_prob[VP9_I32X32_MODES - 1];
   vp9_prob pre_uv_mode_prob[VP9_YMODES][VP9_UV_MODES - 1];
-#if !CONFIG_SB8X8
-  vp9_prob pre_i8x8_mode_prob[VP9_I8X8_MODES - 1];
-#endif
   vp9_prob pre_sub_mv_ref_prob[SUBMVREF_COUNT][VP9_SUBMVREFS - 1];
-#if !CONFIG_SB8X8
-  vp9_prob pre_mbsplit_prob[VP9_NUMMBSPLITS - 1];
-#endif
   vp9_prob pre_partition_prob[NUM_PARTITION_CONTEXTS][PARTITION_TYPES - 1];
   unsigned int bmode_counts[VP9_NKF_BINTRAMODES];
   unsigned int ymode_counts[VP9_YMODES];   /* interframe intra mode probs */
   unsigned int sb_ymode_counts[VP9_I32X32_MODES];
   unsigned int uv_mode_counts[VP9_YMODES][VP9_UV_MODES];
-#if !CONFIG_SB8X8
-  unsigned int i8x8_mode_counts[VP9_I8X8_MODES];   /* interframe intra probs */
-#endif
   unsigned int sub_mv_ref_counts[SUBMVREF_COUNT][VP9_SUBMVREFS];
-#if !CONFIG_SB8X8
-  unsigned int mbsplit_counts[VP9_NUMMBSPLITS];
-#endif
   unsigned int partition_counts[NUM_PARTITION_CONTEXTS][PARTITION_TYPES];
 
   vp9_coeff_probs pre_coef_probs_4x4[BLOCK_TYPES];
@@ -168,8 +146,8 @@
 typedef struct VP9Common {
   struct vpx_internal_error_info  error;
 
-  DECLARE_ALIGNED(16, int16_t, y_dequant[QINDEX_RANGE][16]);
-  DECLARE_ALIGNED(16, int16_t, uv_dequant[QINDEX_RANGE][16]);
+  DECLARE_ALIGNED(16, int16_t, y_dequant[QINDEX_RANGE][2]);
+  DECLARE_ALIGNED(16, int16_t, uv_dequant[QINDEX_RANGE][2]);
 
   int width;
   int height;
@@ -208,8 +186,7 @@
 
   int frame_flags;
   // MBs, mb_rows/cols is in 16-pixel units; mi_rows/cols is in
-  // MODE_INFO units (depending on CONFIG_SB8X8, that is either
-  // 16-pixel or 8-pixel)
+  // MODE_INFO (8-pixel) units.
   int MBs;
   int mb_rows, mi_rows;
   int mb_cols, mi_cols;
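
The dequant tables shrink from 16 entries per q index to 2 because all AC
coefficients share a single quantizer and only the DC value differs. A hedged
sketch of a lookup against the new layout (the accessor is hypothetical):

    #include <stdint.h>

    /* DC lives at [0], every AC coefficient uses [1]. */
    static int16_t dequant_value(const int16_t dq[2], int coeff_idx) {
      return dq[coeff_idx == 0 ? 0 : 1];
    }
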
diff --git a/vp9/common/vp9_recon.c b/vp9/common/vp9_recon.c
index 4ab4f39..69a4720 100644
--- a/vp9/common/vp9_recon.c
+++ b/vp9/common/vp9_recon.c
@@ -28,32 +28,12 @@
 }
 
 
-void vp9_recon_b_c(uint8_t *pred_ptr, int16_t *diff_ptr, uint8_t *dst_ptr,
-                   int stride) {
+void vp9_recon_b_c(uint8_t *pred_ptr, int16_t *diff_ptr, int diff_stride,
+                   uint8_t *dst_ptr, int stride) {
   assert(pred_ptr == dst_ptr);
-  recon(4, 4, diff_ptr, 16 >> CONFIG_SB8X8, dst_ptr, stride);
+  recon(4, 4, diff_ptr, diff_stride, dst_ptr, stride);
 }
 
-#if !CONFIG_SB8X8
-void vp9_recon_uv_b_c(uint8_t *pred_ptr, int16_t *diff_ptr, uint8_t *dst_ptr,
-                      int stride) {
-  assert(pred_ptr == dst_ptr);
-  recon(4, 4, diff_ptr, 8, dst_ptr, stride);
-}
-
-void vp9_recon4b_c(uint8_t *pred_ptr, int16_t *diff_ptr, uint8_t *dst_ptr,
-                   int stride) {
-  assert(pred_ptr == dst_ptr);
-  recon(4, 16, diff_ptr, 16, dst_ptr, stride);
-}
-
-void vp9_recon2b_c(uint8_t *pred_ptr, int16_t *diff_ptr, uint8_t *dst_ptr,
-                   int stride) {
-  assert(pred_ptr == dst_ptr);
-  recon(4, 8, diff_ptr, 8, dst_ptr, stride);
-}
-#endif
-
 static void recon_plane(MACROBLOCKD *xd, BLOCK_SIZE_TYPE bsize, int plane) {
   const int bw = 4 << (b_width_log2(bsize) - xd->plane[plane].subsampling_x);
   const int bh = 4 << (b_height_log2(bsize) - xd->plane[plane].subsampling_y);
diff --git a/vp9/common/vp9_reconinter.c b/vp9/common/vp9_reconinter.c
index 0420063..e39e050 100644
--- a/vp9/common/vp9_reconinter.c
+++ b/vp9/common/vp9_reconinter.c
@@ -265,13 +265,8 @@
   return (value < 0 ? value - 2 : value + 2) / 4;
 }
 
-#if CONFIG_SB8X8
 #define IDX1 2
 #define IDX2 3
-#else
-#define IDX1 4
-#define IDX2 5
-#endif
 
 static int mi_mv_pred_row_q4(MACROBLOCKD *mb, int off, int idx) {
   const int temp = mb->mode_info_context->bmi[off + 0].as_mv[idx].as_mv.row +
diff --git a/vp9/common/vp9_reconintra.c b/vp9/common/vp9_reconintra.c
index daeb6b5..d01cfa4 100644
--- a/vp9/common/vp9_reconintra.c
+++ b/vp9/common/vp9_reconintra.c
@@ -573,49 +573,19 @@
                              xd->left_available, 0 /*xd->right_available*/);
 }
 
-#if !CONFIG_SB8X8
-void vp9_intra8x8_predict(MACROBLOCKD *xd,
-                          int block4x4_idx,
-                          int mode,
-                          uint8_t *predictor, int pre_stride) {
-  const int block_idx = (block4x4_idx >> 2) | !!(block4x4_idx & 2);
-  const int have_top = (block_idx >> 1) || xd->up_available;
-  const int have_left = (block_idx & 1) || xd->left_available;
-  const int have_right = !(block_idx & 1) || xd->right_available;
-
-  vp9_build_intra_predictors(predictor, pre_stride,
-                             predictor, pre_stride,
-                             mode, 8, 8, have_top, have_left,
-                             have_right);
-}
-#endif
 #if !CONFIG_NEWBINTRAMODES
 void vp9_intra4x4_predict(MACROBLOCKD *xd,
                           int block_idx,
+                          BLOCK_SIZE_TYPE bsize,
                           int mode,
                           uint8_t *predictor, int pre_stride) {
+  const int bwl = b_width_log2(bsize);
+  const int wmask = (1 << bwl) - 1;
   const int have_top =
-      (block_idx >> (2 >> CONFIG_SB8X8)) || xd->up_available;
+      (block_idx >> bwl) || xd->up_available;
   const int have_left =
-      (block_idx & (3 >> CONFIG_SB8X8)) || xd->left_available;
-  const int have_right =
-      ((block_idx & (3 >> CONFIG_SB8X8)) != (3 >> CONFIG_SB8X8));
-
-  vp9_build_intra_predictors(predictor, pre_stride,
-                             predictor, pre_stride,
-                             mode, 4, 4, have_top, have_left,
-                             have_right);
-}
-#endif
-#if !CONFIG_SB8X8
-void vp9_intra_uv4x4_predict(MACROBLOCKD *xd,
-                             int block4x4_idx,
-                             int mode,
-                             uint8_t *predictor, int pre_stride) {
-  const int block_idx = block4x4_idx & 3;
-  const int have_top = (block_idx >> 1) || xd->up_available;
-  const int have_left = (block_idx & 1) || xd->left_available;
-  const int have_right = !(block_idx & 1);
+      (block_idx & wmask) || xd->left_available;
+  const int have_right = ((block_idx & wmask) != wmask);
 
   vp9_build_intra_predictors(predictor, pre_stride,
                              predictor, pre_stride,
diff --git a/vp9/common/vp9_reconintra4x4.c b/vp9/common/vp9_reconintra4x4.c
index 2a7c7f3..ce33aa5 100644
--- a/vp9/common/vp9_reconintra4x4.c
+++ b/vp9/common/vp9_reconintra4x4.c
@@ -160,13 +160,16 @@
 
 void vp9_intra4x4_predict(MACROBLOCKD *xd,
                           int block_idx,
+                          BLOCK_SIZE_TYPE bsize,
                           int b_mode,
                           uint8_t *predictor,
                           int ps) {
+  const int bwl = b_width_log2(bsize);
+  const int wmask = (1 << bwl) - 1;
   int i, r, c;
-  const int have_top = (block_idx >> 2) || xd->up_available;
-  const int have_left = (block_idx & 3)  || xd->left_available;
-  const int have_right = (block_idx & 3) != 3 || xd->right_available;
+  const int have_top = (block_idx >> bwl) || xd->up_available;
+  const int have_left = (block_idx & wmask)  || xd->left_available;
+  const int have_right = (block_idx & wmask) != wmask || xd->right_available;
   uint8_t left[4], above[8], top_left;
   /*
    * 127 127 127 .. 127 127 127 127 127 127
@@ -197,8 +200,8 @@
     above[1] = above_ptr[1];
     above[2] = above_ptr[2];
     above[3] = above_ptr[3];
-    if (((block_idx & 3) != 3) ||
-        (have_right && block_idx == 3 &&
+    if (((block_idx & wmask) != wmask) ||
+        (have_right && block_idx == wmask &&
          ((xd->mb_index != 3 && xd->sb_index != 3) ||
           ((xd->mb_index & 1) == 0 && xd->sb_index == 3)))) {
       above[4] = above_ptr[4];
@@ -212,7 +215,7 @@
         above_right -= 32 * ps;
       if (xd->mb_index == 3)
         above_right -= 16 * ps;
-      above_right -= (block_idx & ~3) * ps;
+      above_right -= 4 * (block_idx >> bwl) * ps;
 
       /* use a more distant above-right (from closest available top-right
        * corner), but with a "localized DC" (similar'ish to TM-pred):
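
The bwl/wmask form generalizes the old hard-coded 4x4-within-16x16 tests to
any block size: for an 8x8 block, bwl is 1 and block_idx runs 0..3 in raster
order. A sketch of the same availability logic as a hypothetical standalone
helper:

    static void sub_block_availability(int block_idx, int bwl,
                                       int up, int left, int right,
                                       int *have_top, int *have_left,
                                       int *have_right) {
      const int wmask = (1 << bwl) - 1;
      *have_top   = (block_idx >> bwl) || up;      /* any row but the first */
      *have_left  = (block_idx & wmask) || left;   /* any column but the first */
      *have_right = ((block_idx & wmask) != wmask) || right;  /* not last column */
    }
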
diff --git a/vp9/common/vp9_rtcd_defs.sh b/vp9/common/vp9_rtcd_defs.sh
index b1acc04..75e3604 100644
--- a/vp9/common/vp9_rtcd_defs.sh
+++ b/vp9/common/vp9_rtcd_defs.sh
@@ -60,26 +60,9 @@
 prototype void vp9_copy_mem8x4 "const uint8_t *src, int src_pitch, uint8_t *dst, int dst_pitch"
 specialize vp9_copy_mem8x4 mmx
 
-prototype void vp9_recon_b "uint8_t *pred_ptr, int16_t *diff_ptr, uint8_t *dst_ptr, int stride"
+prototype void vp9_recon_b "uint8_t *pred_ptr, int16_t *diff_ptr, int diff_stride, uint8_t *dst_ptr, int stride"
 specialize vp9_recon_b
 
-if [ "$CONFIG_SB8X8" != "yes" ]; then
-
-prototype void vp9_recon_uv_b "uint8_t *pred_ptr, int16_t *diff_ptr, uint8_t *dst_ptr, int stride"
-specialize vp9_recon_uv_b
-
-# TODO(jingning): The prototype functions in c are modified to enable block-size configurable
-# operations. Need to change the sse2 accrodingly.
-prototype void vp9_recon2b "uint8_t *pred_ptr, int16_t *diff_ptr, uint8_t *dst_ptr, int stride"
-specialize vp9_recon2b
-# specialize vp9_recon2b sse2
-
-prototype void vp9_recon4b "uint8_t *pred_ptr, int16_t *diff_ptr, uint8_t *dst_ptr, int stride"
-specialize vp9_recon4b
-# specialize vp9_recon4b sse2
-
-fi
-
 prototype void vp9_recon_sb "struct macroblockd *x, enum BLOCK_SIZE_TYPE bsize"
 specialize vp9_recon_sb
 
@@ -98,19 +81,9 @@
 prototype void vp9_build_intra_predictors_sbuv_s "struct macroblockd *x, enum BLOCK_SIZE_TYPE bsize"
 specialize vp9_build_intra_predictors_sbuv_s
 
-prototype void vp9_intra4x4_predict "struct macroblockd *xd, int block, int b_mode, uint8_t *predictor, int pre_stride"
+prototype void vp9_intra4x4_predict "struct macroblockd *xd, int block, enum BLOCK_SIZE_TYPE bsize, int b_mode, uint8_t *predictor, int pre_stride"
 specialize vp9_intra4x4_predict;
 
-if [ "$CONFIG_SB8X8" != "yes" ]; then
-
-prototype void vp9_intra8x8_predict "struct macroblockd *xd, int block, int b_mode, uint8_t *predictor, int pre_stride"
-specialize vp9_intra8x8_predict;
-
-prototype void vp9_intra_uv4x4_predict "struct macroblockd *xd, int block, int b_mode, uint8_t *predictor, int pre_stride"
-specialize vp9_intra_uv4x4_predict;
-
-fi
-
 if [ "$CONFIG_VP9_DECODER" = "yes" ]; then
 prototype void vp9_add_residual_4x4 "const int16_t *diff, uint8_t *dest, int stride"
 specialize vp9_add_residual_4x4 sse2
diff --git a/vp9/common/vp9_setupintrarecon.c b/vp9/common/vp9_setupintrarecon.c
deleted file mode 100644
index 6784103..0000000
--- a/vp9/common/vp9_setupintrarecon.c
+++ /dev/null
@@ -1,30 +0,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "vp9/common/vp9_setupintrarecon.h"
-#include "vpx_mem/vpx_mem.h"
-
-void vp9_setup_intra_recon(YV12_BUFFER_CONFIG *ybf) {
-  int i;
-
-  // luma
-  vpx_memset(ybf->y_buffer - 1 - ybf->y_stride, 127, ybf->y_width + 5);
-  for (i = 0; i < ybf->y_height; i++)
-    ybf->y_buffer[ybf->y_stride * i - 1] = 129;
-
-  // chroma
-  vpx_memset(ybf->u_buffer - 1 - ybf->uv_stride, 127, ybf->uv_width + 5);
-  vpx_memset(ybf->v_buffer - 1 - ybf->uv_stride, 127, ybf->uv_width + 5);
-  for (i = 0; i < ybf->uv_height; i++) {
-    ybf->u_buffer[ybf->uv_stride * i - 1] = 129;
-    ybf->v_buffer[ybf->uv_stride * i - 1] = 129;
-  }
-}
diff --git a/vp9/common/vp9_setupintrarecon.h b/vp9/common/vp9_setupintrarecon.h
deleted file mode 100644
index e389f3c..0000000
--- a/vp9/common/vp9_setupintrarecon.h
+++ /dev/null
@@ -1,18 +0,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#ifndef VP9_COMMON_VP9_SETUPINTRARECON_H_
-#define VP9_COMMON_VP9_SETUPINTRARECON_H_
-
-#include "vpx_scale/yv12config.h"
-
-void vp9_setup_intra_recon(YV12_BUFFER_CONFIG *ybf);
-
-#endif  // VP9_COMMON_VP9_SETUPINTRARECON_H_
diff --git a/vp9/common/vp9_tile_common.c b/vp9/common/vp9_tile_common.c
index a9d8cf0..ea26289 100644
--- a/vp9/common/vp9_tile_common.c
+++ b/vp9/common/vp9_tile_common.c
@@ -18,16 +18,12 @@
 static void vp9_get_tile_offsets(VP9_COMMON *cm, int *min_tile_off,
                                  int *max_tile_off, int tile_idx,
                                  int log2_n_tiles, int n_mis) {
-#if CONFIG_SB8X8
   const int n_sbs = (n_mis + 7) >> 3;
-#else
-  const int n_sbs = (n_mis + 3) >> 2;
-#endif
   const int sb_off1 =  (tile_idx      * n_sbs) >> log2_n_tiles;
   const int sb_off2 = ((tile_idx + 1) * n_sbs) >> log2_n_tiles;
 
-  *min_tile_off = MIN(sb_off1 << (2 + CONFIG_SB8X8), n_mis);
-  *max_tile_off = MIN(sb_off2 << (2 + CONFIG_SB8X8), n_mis);
+  *min_tile_off = MIN(sb_off1 << 3, n_mis);
+  *max_tile_off = MIN(sb_off2 << 3, n_mis);
 }
 
 void vp9_get_tile_col_offsets(VP9_COMMON *cm, int tile_col_idx) {
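
Tile boundaries are computed in 64x64-superblock units and then converted back
to MODE_INFO units, where one superblock spans eight 8x8 MIs. A standalone
sketch of the arithmetic with MIN expanded (the function name is illustrative):

    static void tile_offsets_sketch(int tile_idx, int log2_n_tiles, int n_mis,
                                    int *min_off, int *max_off) {
      const int n_sbs = (n_mis + 7) >> 3;                  /* ceil(n_mis / 8) */
      const int sb_off1 = (tile_idx * n_sbs) >> log2_n_tiles;
      const int sb_off2 = ((tile_idx + 1) * n_sbs) >> log2_n_tiles;
      const int off1 = sb_off1 << 3, off2 = sb_off2 << 3;  /* back to MI units */
      *min_off = off1 < n_mis ? off1 : n_mis;
      *max_off = off2 < n_mis ? off2 : n_mis;
    }
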
diff --git a/vp9/decoder/vp9_decodemv.c b/vp9/decoder/vp9_decodemv.c
index a290c55..6eaa4d5 100644
--- a/vp9/decoder/vp9_decodemv.c
+++ b/vp9/decoder/vp9_decodemv.c
@@ -65,12 +65,6 @@
   return (MB_PREDICTION_MODE)treed_read(r, vp9_kf_ymode_tree, p);
 }
 
-#if !CONFIG_SB8X8
-static int read_i8x8_mode(vp9_reader *r, const vp9_prob *p) {
-  return treed_read(r, vp9_i8x8_mode_tree, p);
-}
-#endif
-
 static MB_PREDICTION_MODE read_uv_mode(vp9_reader *r, const vp9_prob *p) {
   return (MB_PREDICTION_MODE)treed_read(r, vp9_uv_mode_tree, p);
 }
@@ -130,11 +124,7 @@
     m->mbmi.mb_skip_coeff = vp9_read(r, vp9_get_pred_prob(cm, xd, PRED_MBSKIP));
 
   // luma mode
-#if CONFIG_SB8X8
   m->mbmi.mode = m->mbmi.sb_type > BLOCK_SIZE_SB8X8 ?
-#else
-  m->mbmi.mode = m->mbmi.sb_type > BLOCK_SIZE_MB16X16 ?
-#endif
       read_kf_sb_ymode(r, cm->sb_kf_ymode_prob[cm->kf_ymode_probs_index]):
       read_kf_mb_ymode(r, cm->kf_ymode_prob[cm->kf_ymode_probs_index]);
 
@@ -142,57 +132,27 @@
 
   if (m->mbmi.mode == I4X4_PRED) {
     int i;
-    for (i = 0; i < (16 >> (2 * CONFIG_SB8X8)); ++i) {
+    for (i = 0; i < 4; ++i) {
       const B_PREDICTION_MODE a = above_block_mode(m, i, mis);
-      const B_PREDICTION_MODE l = xd->left_available || (i & 3) ?
+      const B_PREDICTION_MODE l = xd->left_available ||
+                                  (i & 1) ?
                                   left_block_mode(m, i) : B_DC_PRED;
       m->bmi[i].as_mode.first = read_kf_bmode(r, cm->kf_bmode_prob[a][l]);
     }
   }
 
-#if !CONFIG_SB8X8
-  if (m->mbmi.mode == I8X8_PRED) {
-    int i;
-    for (i = 0; i < 4; ++i) {
-      const int ib = vp9_i8x8_block[i];
-      const int mode8x8 = read_i8x8_mode(r, cm->fc.i8x8_mode_prob);
-
-      m->bmi[ib + 0].as_mode.first = mode8x8;
-      m->bmi[ib + 1].as_mode.first = mode8x8;
-      m->bmi[ib + 4].as_mode.first = mode8x8;
-      m->bmi[ib + 5].as_mode.first = mode8x8;
-    }
-  }
-
-  // chroma mode
-  if (m->mbmi.mode != I8X8_PRED)
-#endif
-  {
-    m->mbmi.uv_mode = read_uv_mode(r, cm->kf_uv_mode_prob[m->mbmi.mode]);
-  }
+  m->mbmi.uv_mode = read_uv_mode(r, cm->kf_uv_mode_prob[m->mbmi.mode]);
 
   if (cm->txfm_mode == TX_MODE_SELECT &&
-      !m->mbmi.mb_skip_coeff &&
-#if CONFIG_SB8X8
-      m->mbmi.mode != I4X4_PRED
-#else
-      m->mbmi.mode <= I8X8_PRED
-#endif
-      ) {
-#if CONFIG_SB8X8
+      !m->mbmi.mb_skip_coeff && m->mbmi.mode != I4X4_PRED) {
     const int allow_16x16 = m->mbmi.sb_type >= BLOCK_SIZE_MB16X16;
-#else
-    const int allow_16x16 = m->mbmi.mode != I8X8_PRED;
-#endif
     const int allow_32x32 = m->mbmi.sb_type >= BLOCK_SIZE_SB32X32;
     m->mbmi.txfm_size = select_txfm_size(cm, r, allow_16x16, allow_32x32);
   } else if (cm->txfm_mode >= ALLOW_32X32 &&
              m->mbmi.sb_type >= BLOCK_SIZE_SB32X32) {
     m->mbmi.txfm_size = TX_32X32;
   } else if (cm->txfm_mode >= ALLOW_16X16 &&
-#if CONFIG_SB8X8
              m->mbmi.sb_type >= BLOCK_SIZE_MB16X16 &&
-#endif
              m->mbmi.mode <= TM_PRED) {
     m->mbmi.txfm_size = TX_16X16;
   } else if (cm->txfm_mode >= ALLOW_8X8 && m->mbmi.mode != I4X4_PRED) {
@@ -676,7 +636,7 @@
       if (vp9_segfeature_active(xd, mbmi->segment_id, SEG_LVL_SKIP)) {
         mbmi->mode = ZEROMV;
       } else {
-        mbmi->mode = mbmi->sb_type > BLOCK_SIZE_MB16X16 ?
+        mbmi->mode = mbmi->sb_type > BLOCK_SIZE_SB8X8 ?
                                      read_sb_mv_ref(r, mv_ref_p)
                                    : read_mv_ref(r, mv_ref_p);
         vp9_accum_mv_refs(cm, mbmi->mode, mbmi->mb_mode_context[ref_frame]);
@@ -698,11 +658,9 @@
 #endif
     }
 
-    if (is_inter_mode(mbmi->mode)) {
-      mbmi->interp_filter = cm->mcomp_filter_type == SWITCHABLE
-                                ? read_switchable_filter_type(pbi, r)
-                                : cm->mcomp_filter_type;
-    }
+    mbmi->interp_filter = cm->mcomp_filter_type == SWITCHABLE
+                              ? read_switchable_filter_type(pbi, r)
+                              : cm->mcomp_filter_type;
 
     if (cm->comp_pred_mode == COMP_PREDICTION_ONLY ||
         (cm->comp_pred_mode == HYBRID_PREDICTION &&
@@ -772,29 +730,16 @@
     mbmi->uv_mode = DC_PRED;
     switch (mbmi->mode) {
       case SPLITMV: {
-#if CONFIG_SB8X8
         const int num_p = 4;
-#else
-        const int s = treed_read(r, vp9_mbsplit_tree, cm->fc.mbsplit_prob);
-        const int num_p = vp9_mbsplit_count[s];
-#endif
         int j = 0;
 
-#if !CONFIG_SB8X8
-        cm->fc.mbsplit_counts[s]++;
-        mbmi->partitioning = s;
-#endif
         mbmi->need_to_clamp_mvs = 0;
         do {  // for each subset j
           int_mv leftmv, abovemv, second_leftmv, second_abovemv;
           int_mv blockmv, secondmv;
           int mv_contz;
           int blockmode;
-#if CONFIG_SB8X8
           int k = j;
-#else
-          int k = vp9_mbsplit_offset[s][j];  // first block in subset j
-#endif
 
           leftmv.as_int = left_block_mv(xd, mi, k);
           abovemv.as_int = above_block_mv(mi, k, mis);
@@ -848,46 +793,14 @@
             default:
               break;
           }
-
-          /*  Commenting this section out, not sure why this was needed, and
-           *  there are mismatches with this section in rare cases since it is
-           *  not done in the encoder at all.
-          mbmi->need_to_clamp_mvs |= check_mv_bounds(&blockmv,
-                                                     mb_to_left_edge,
-                                                     mb_to_right_edge,
-                                                     mb_to_top_edge,
-                                                     mb_to_bottom_edge);
-          if (mbmi->second_ref_frame > 0) {
-            mbmi->need_to_clamp_mvs |= check_mv_bounds(&secondmv,
-                                                       mb_to_left_edge,
-                                                       mb_to_right_edge,
-                                                       mb_to_top_edge,
-                                                       mb_to_bottom_edge);
-          }
-          */
-
-#if !CONFIG_SB8X8
-          {
-            /* Fill (uniform) modes, mvs of jth subset.
-             Must do it here because ensuing subsets can
-             refer back to us via "left" or "above". */
-            unsigned int fill_count = mbsplit_fill_count[s];
-            const uint8_t *fill_offset =
-                &mbsplit_fill_offset[s][j * fill_count];
-
-            do {
-              mi->bmi[*fill_offset].as_mv[0].as_int = blockmv.as_int;
-              if (mbmi->second_ref_frame > 0)
-                mi->bmi[*fill_offset].as_mv[1].as_int = secondmv.as_int;
-              fill_offset++;
-            } while (--fill_count);
-          }
-#endif
+          mi->bmi[j].as_mv[0].as_int = blockmv.as_int;
+          if (mbmi->second_ref_frame > 0)
+            mi->bmi[j].as_mv[1].as_int = secondmv.as_int;
         } while (++j < num_p);
       }
 
-      mv0->as_int = mi->bmi[15 >> (2 * CONFIG_SB8X8)].as_mv[0].as_int;
-      mv1->as_int = mi->bmi[15 >> (2 * CONFIG_SB8X8)].as_mv[1].as_int;
+      mv0->as_int = mi->bmi[3].as_mv[0].as_int;
+      mv1->as_int = mi->bmi[3].as_mv[1].as_int;
 
       break;  /* done with SPLITMV */
 
@@ -952,7 +865,7 @@
     // required for left and above block mv
     mv0->as_int = 0;
 
-    if (mbmi->sb_type > BLOCK_SIZE_MB16X16) {
+    if (mbmi->sb_type > BLOCK_SIZE_SB8X8) {
       mbmi->mode = read_sb_ymode(r, cm->fc.sb_ymode_prob);
       cm->fc.sb_ymode_counts[mbmi->mode]++;
     } else {
@@ -970,28 +883,11 @@
         if (m == B_CONTEXT_PRED) m -= CONTEXT_PRED_REPLACEMENTS;
 #endif
         cm->fc.bmode_counts[m]++;
-      } while (++j < 16);
+      } while (++j < 4);
     }
 
-#if !CONFIG_SB8X8
-    if (mbmi->mode == I8X8_PRED) {
-      int i;
-      for (i = 0; i < 4; i++) {
-        const int ib = vp9_i8x8_block[i];
-        const int mode8x8 = read_i8x8_mode(r, cm->fc.i8x8_mode_prob);
-
-        mi->bmi[ib + 0].as_mode.first = mode8x8;
-        mi->bmi[ib + 1].as_mode.first = mode8x8;
-        mi->bmi[ib + 4].as_mode.first = mode8x8;
-        mi->bmi[ib + 5].as_mode.first = mode8x8;
-        cm->fc.i8x8_mode_counts[mode8x8]++;
-      }
-    } else
-#endif
-    {
-      mbmi->uv_mode = read_uv_mode(r, cm->fc.uv_mode_prob[mbmi->mode]);
-      cm->fc.uv_mode_counts[mbmi->mode][mbmi->uv_mode]++;
-    }
+    mbmi->uv_mode = read_uv_mode(r, cm->fc.uv_mode_prob[mbmi->mode]);
+    cm->fc.uv_mode_counts[mbmi->mode][mbmi->uv_mode]++;
   }
   /*
   if (cm->current_video_frame == 1)
@@ -999,44 +895,22 @@
     */
 
   if (cm->txfm_mode == TX_MODE_SELECT && mbmi->mb_skip_coeff == 0 &&
-      ((mbmi->ref_frame == INTRA_FRAME &&
-#if CONFIG_SB8X8
-        mbmi->mode != I4X4_PRED
-#else
-        mbmi->mode <= I8X8_PRED
-#endif
-        ) ||
-       (mbmi->ref_frame != INTRA_FRAME &&
-#if CONFIG_SB8X8
-        mbmi->mode != SPLITMV
-#else
-        !(mbmi->mode == SPLITMV && mbmi->partitioning == PARTITIONING_4X4)
-#endif
-        ))) {
-#if CONFIG_SB8X8
+      ((mbmi->ref_frame == INTRA_FRAME && mbmi->mode != I4X4_PRED) ||
+       (mbmi->ref_frame != INTRA_FRAME && mbmi->mode != SPLITMV))) {
     const int allow_16x16 = mbmi->sb_type >= BLOCK_SIZE_MB16X16;
-#else
-    const int allow_16x16 = mbmi->mode != I8X8_PRED && mbmi->mode != SPLITMV;
-#endif
     const int allow_32x32 = mbmi->sb_type >= BLOCK_SIZE_SB32X32;
     mbmi->txfm_size = select_txfm_size(cm, r, allow_16x16, allow_32x32);
   } else if (mbmi->sb_type >= BLOCK_SIZE_SB32X32 &&
              cm->txfm_mode >= ALLOW_32X32) {
     mbmi->txfm_size = TX_32X32;
   } else if (cm->txfm_mode >= ALLOW_16X16 &&
-#if CONFIG_SB8X8
              mbmi->sb_type >= BLOCK_SIZE_MB16X16 &&
-#endif
       ((mbmi->ref_frame == INTRA_FRAME && mbmi->mode <= TM_PRED) ||
        (mbmi->ref_frame != INTRA_FRAME && mbmi->mode != SPLITMV))) {
     mbmi->txfm_size = TX_16X16;
   } else if (cm->txfm_mode >= ALLOW_8X8 &&
       (!(mbmi->ref_frame == INTRA_FRAME && mbmi->mode == I4X4_PRED) &&
-       !(mbmi->ref_frame != INTRA_FRAME && mbmi->mode == SPLITMV
-#if !CONFIG_SB8X8
-         && mbmi->partitioning == PARTITIONING_4X4
-#endif
-         ))) {
+       !(mbmi->ref_frame != INTRA_FRAME && mbmi->mode == SPLITMV))) {
     mbmi->txfm_size = TX_8X8;
   } else {
     mbmi->txfm_size = TX_4X4;
diff --git a/vp9/decoder/vp9_decodframe.c b/vp9/decoder/vp9_decodframe.c
index ac421f3..15ff16c 100644
--- a/vp9/decoder/vp9_decodframe.c
+++ b/vp9/decoder/vp9_decodframe.c
@@ -24,7 +24,6 @@
 #include "vp9/common/vp9_entropymode.h"
 #include "vp9/common/vp9_quant_common.h"
 #include "vpx_scale/vpx_scale.h"
-#include "vp9/common/vp9_setupintrarecon.h"
 
 #include "vp9/decoder/vp9_decodemv.h"
 #include "vp9/common/vp9_extend.h"
@@ -164,7 +163,7 @@
 }
 
 void vp9_init_dequantizer(VP9_COMMON *pc) {
-  int q, i;
+  int q;
 
   for (q = 0; q < QINDEX_RANGE; q++) {
     // DC value
@@ -172,12 +171,8 @@
     pc->uv_dequant[q][0] = vp9_dc_quant(q, pc->uv_dc_delta_q);
 
     // AC values
-    for (i = 1; i < 16; i++) {
-      const int rc = vp9_default_zig_zag1d_4x4[i];
-
-      pc->y_dequant[q][rc] = vp9_ac_quant(q, 0);
-      pc->uv_dequant[q][rc] = vp9_ac_quant(q, pc->uv_ac_delta_q);
-    }
+    pc->y_dequant[q][1] = vp9_ac_quant(q, 0);
+    pc->uv_dequant[q][1] = vp9_ac_quant(q, pc->uv_ac_delta_q);
   }
 }
 
@@ -191,61 +186,10 @@
     xd->plane[i].dequant = pc->uv_dequant[xd->q_index];
 }
 
-#if !CONFIG_SB8X8
-static void decode_8x8(MACROBLOCKD *xd) {
-  const MB_PREDICTION_MODE mode = xd->mode_info_context->mbmi.mode;
-  // luma
-  // if the first one is DCT_DCT assume all the rest are as well
-  TX_TYPE tx_type = get_tx_type_8x8(xd, 0);
-  int i;
-  assert(mode == I8X8_PRED);
-  for (i = 0; i < 4; i++) {
-    int ib = vp9_i8x8_block[i];
-    int idx = (ib & 0x02) ? (ib + 2) : ib;
-    int16_t *q  = BLOCK_OFFSET(xd->plane[0].qcoeff, idx, 16);
-    uint8_t* const dst =
-          raster_block_offset_uint8(xd, BLOCK_SIZE_MB16X16, 0, ib,
-                                    xd->plane[0].dst.buf,
-                                    xd->plane[0].dst.stride);
-    int stride = xd->plane[0].dst.stride;
-    if (mode == I8X8_PRED) {
-      int i8x8mode = xd->mode_info_context->bmi[ib].as_mode.first;
-      vp9_intra8x8_predict(xd, ib, i8x8mode, dst, stride);
-    }
-    tx_type = get_tx_type_8x8(xd, ib);
-    vp9_iht_add_8x8_c(tx_type, q, dst, stride, xd->plane[0].eobs[idx]);
-  }
-
-  // chroma
-  for (i = 0; i < 4; i++) {
-    int ib = vp9_i8x8_block[i];
-    int i8x8mode = xd->mode_info_context->bmi[ib].as_mode.first;
-    uint8_t* dst;
-
-    dst = raster_block_offset_uint8(xd, BLOCK_SIZE_MB16X16, 1, i,
-                                    xd->plane[1].dst.buf,
-                                    xd->plane[1].dst.stride);
-    vp9_intra_uv4x4_predict(xd, 16 + i, i8x8mode,
-                            dst, xd->plane[1].dst.stride);
-    xd->itxm_add(BLOCK_OFFSET(xd->plane[1].qcoeff, i, 16),
-                 dst, xd->plane[1].dst.stride,
-                 xd->plane[1].eobs[i]);
-
-    dst = raster_block_offset_uint8(xd, BLOCK_SIZE_MB16X16, 2, i,
-                                    xd->plane[2].dst.buf,
-                                    xd->plane[1].dst.stride);
-    vp9_intra_uv4x4_predict(xd, 20 + i, i8x8mode,
-                            dst, xd->plane[1].dst.stride);
-    xd->itxm_add(BLOCK_OFFSET(xd->plane[2].qcoeff, i, 16),
-                 dst, xd->plane[1].dst.stride,
-                 xd->plane[2].eobs[i]);
-  }
-}
-#endif
-
-static INLINE void dequant_add_y(MACROBLOCKD *xd, TX_TYPE tx_type, int idx) {
+static INLINE void dequant_add_y(MACROBLOCKD *xd, TX_TYPE tx_type, int idx,
+                                 BLOCK_SIZE_TYPE bsize) {
   struct macroblockd_plane *const y = &xd->plane[0];
-  uint8_t* const dst = raster_block_offset_uint8(xd, BLOCK_SIZE_MB16X16, 0, idx,
+  uint8_t* const dst = raster_block_offset_uint8(xd, bsize, 0, idx,
                                                  xd->plane[0].dst.buf,
                                                  xd->plane[0].dst.stride);
   if (tx_type != DCT_DCT) {
@@ -257,47 +201,6 @@
   }
 }
 
-#if !CONFIG_SB8X8
-static void decode_4x4(VP9D_COMP *pbi, MACROBLOCKD *xd, vp9_reader *r) {
-  TX_TYPE tx_type;
-  int i = 0;
-  const MB_PREDICTION_MODE mode = xd->mode_info_context->mbmi.mode;
-  assert(mode == I8X8_PRED);
-  for (i = 0; i < 4; i++) {
-    int ib = vp9_i8x8_block[i];
-    const int iblock[4] = {0, 1, 4, 5};
-    int j;
-    uint8_t* dst;
-    int i8x8mode = xd->mode_info_context->bmi[ib].as_mode.first;
-
-    dst = raster_block_offset_uint8(xd, BLOCK_SIZE_MB16X16, 0, ib,
-                                    xd->plane[0].dst.buf,
-                                    xd->plane[0].dst.stride);
-    vp9_intra8x8_predict(xd, ib, i8x8mode, dst, xd->plane[0].dst.stride);
-    for (j = 0; j < 4; j++) {
-      tx_type = get_tx_type_4x4(xd, ib + iblock[j]);
-      dequant_add_y(xd, tx_type, ib + iblock[j]);
-    }
-    dst = raster_block_offset_uint8(xd, BLOCK_SIZE_MB16X16, 1, i,
-                                    xd->plane[1].dst.buf,
-                                    xd->plane[1].dst.stride);
-    vp9_intra_uv4x4_predict(xd, 16 + i, i8x8mode,
-                            dst, xd->plane[1].dst.stride);
-    xd->itxm_add(BLOCK_OFFSET(xd->plane[1].qcoeff, i, 16),
-                 dst, xd->plane[1].dst.stride,
-                 xd->plane[1].eobs[i]);
-    dst = raster_block_offset_uint8(xd, BLOCK_SIZE_MB16X16, 2, i,
-                                    xd->plane[2].dst.buf,
-                                    xd->plane[2].dst.stride);
-    vp9_intra_uv4x4_predict(xd, 20 + i, i8x8mode,
-                            dst, xd->plane[1].dst.stride);
-    xd->itxm_add(BLOCK_OFFSET(xd->plane[2].qcoeff, i, 16),
-                 dst, xd->plane[1].dst.stride,
-                 xd->plane[2].eobs[i]);
-  }
-}
-#endif
-
 static void decode_block(int plane, int block, BLOCK_SIZE_TYPE bsize,
                          int ss_txfrm_size, void *arg) {
   MACROBLOCKD* const xd = arg;
@@ -357,10 +260,10 @@
     if (!xd->mode_info_context->mbmi.mb_skip_coeff)
       vp9_decode_coefs_4x4(pbi, xd, r, PLANE_TYPE_Y_WITH_DC, i);
 #endif
-    vp9_intra4x4_predict(xd, i, b_mode, dst, xd->plane[0].dst.stride);
+    vp9_intra4x4_predict(xd, i, bsize, b_mode, dst, xd->plane[0].dst.stride);
     // TODO(jingning): refactor to use foreach_transformed_block_in_plane_
     tx_type = get_tx_type_4x4(xd, i);
-    dequant_add_y(xd, tx_type, i);
+    dequant_add_y(xd, tx_type, i, bsize);
   }
 #if CONFIG_NEWBINTRAMODES
   if (!xd->mode_info_context->mbmi.mb_skip_coeff)
@@ -449,35 +352,6 @@
   }
 }
 
-#if !CONFIG_SB8X8
-// TODO(jingning): This only performs I8X8_PRED decoding process, which will be
-// automatically covered by decode_sb, when SB8X8 is on.
-static void decode_mb(VP9D_COMP *pbi, MACROBLOCKD *xd,
-                     int mi_row, int mi_col,
-                     vp9_reader *r) {
-  MB_MODE_INFO *const mbmi = &xd->mode_info_context->mbmi;
-  const int tx_size = mbmi->txfm_size;
-
-  assert(mbmi->sb_type == BLOCK_SIZE_MB16X16);
-
-  if (mbmi->mb_skip_coeff) {
-    vp9_reset_sb_tokens_context(xd, BLOCK_SIZE_MB16X16);
-  } else {
-    // re-initialize macroblock dequantizer before detokenization
-    if (xd->segmentation_enabled)
-      mb_init_dequantizer(&pbi->common, xd);
-
-    if (!vp9_reader_has_error(r))
-      vp9_decode_tokens(pbi, xd, r, BLOCK_SIZE_MB16X16);
-  }
-
-  if (tx_size == TX_8X8)
-    decode_8x8(xd);
-  else
-    decode_4x4(pbi, xd, r);
-}
-#endif
-
 static int get_delta_q(vp9_reader *r, int *dq) {
   const int old_value = *dq;
 
@@ -496,14 +370,8 @@
   const int bw = 1 << mi_width_log2(bsize);
   VP9_COMMON *const cm = &pbi->common;
   MACROBLOCKD *const xd = &pbi->mb;
-  int i;
-
   const int mi_idx = mi_row * cm->mode_info_stride + mi_col;
-  const YV12_BUFFER_CONFIG *dst_fb = &cm->yv12_fb[cm->new_fb_idx];
-  const int recon_yoffset =
-      (MI_SIZE * mi_row) * dst_fb->y_stride + (MI_SIZE * mi_col);
-  const int recon_uvoffset =
-      (MI_UV_SIZE * mi_row) * dst_fb->uv_stride + (MI_UV_SIZE * mi_col);
+  int i;
 
   xd->mode_info_context = cm->mi + mi_idx;
   xd->mode_info_context->mbmi.sb_type = bsize;
@@ -511,20 +379,18 @@
 
   for (i = 0; i < MAX_MB_PLANE; i++) {
     xd->plane[i].above_context = cm->above_context[i] +
-        (mi_col * 4 >> (xd->plane[i].subsampling_x + CONFIG_SB8X8));
+        (mi_col * 2 >> xd->plane[i].subsampling_x);
     xd->plane[i].left_context = cm->left_context[i] +
-        (((mi_row * 4 >> CONFIG_SB8X8) & 15) >> xd->plane[i].subsampling_y);
+        (((mi_row * 2) & 15) >> xd->plane[i].subsampling_y);
   }
-  xd->above_seg_context = cm->above_seg_context + (mi_col >> CONFIG_SB8X8);
-  xd->left_seg_context  = cm->left_seg_context + ((mi_row >> CONFIG_SB8X8) & 3);
+  xd->above_seg_context = cm->above_seg_context + (mi_col >> 1);
+  xd->left_seg_context  = cm->left_seg_context + ((mi_row >> 1) & 3);
 
   // Distance of Mb to the various image edges. These are specified to 8th pel
   // as they are always compared to values that are in 1/8th pel units
   set_mi_row_col(cm, xd, mi_row, bh, mi_col, bw);
 
-  xd->plane[0].dst.buf = dst_fb->y_buffer + recon_yoffset;
-  xd->plane[1].dst.buf = dst_fb->u_buffer + recon_uvoffset;
-  xd->plane[2].dst.buf = dst_fb->v_buffer + recon_uvoffset;
+  setup_dst_planes(xd, &cm->yv12_fb[cm->new_fb_idx], mi_row, mi_col);
 }
 
 static void set_refs(VP9D_COMP *pbi, int mi_row, int mi_col) {
@@ -563,33 +429,12 @@
   vp9_decode_mb_mode_mv(pbi, xd, mi_row, mi_col, r);
   set_refs(pbi, mi_row, mi_col);
 
-#if CONFIG_SB8X8
   if (bsize == BLOCK_SIZE_SB8X8 &&
       (xd->mode_info_context->mbmi.mode == SPLITMV ||
        xd->mode_info_context->mbmi.mode == I4X4_PRED))
     decode_atom(pbi, xd, mi_row, mi_col, r, bsize);
   else
     decode_sb(pbi, xd, mi_row, mi_col, r, bsize);
-#else
-  // TODO(jingning): merge decode_sb_ and decode_mb_
-  if (bsize > BLOCK_SIZE_MB16X16) {
-    decode_sb(pbi, xd, mi_row, mi_col, r, bsize);
-  } else {
-    // TODO(jingning): In transition of separating functionalities of decode_mb
-    // into decode_sb and decode_atom. Will remove decode_mb and clean this up
-    // when SB8X8 is on.
-    if (xd->mode_info_context->mbmi.mode == I4X4_PRED ||
-        (xd->mode_info_context->mbmi.mode == SPLITMV &&
-         xd->mode_info_context->mbmi.partitioning == PARTITIONING_4X4))
-      decode_atom(pbi, xd, mi_row, mi_col, r, bsize);
-    else if (xd->mode_info_context->mbmi.mode != I8X8_PRED)
-      decode_sb(pbi, xd, mi_row, mi_col, r, bsize);
-    else
-      // TODO(jingning): decode_mb still carries deocding process of I8X8_PRED.
-      // This will be covered by decode_sb when SB8X8 is on.
-      decode_mb(pbi, xd, mi_row, mi_col, r);
-  }
-#endif
 
   xd->corrupted |= vp9_reader_has_error(r);
 }
@@ -606,16 +451,12 @@
   if (mi_row >= pc->mi_rows || mi_col >= pc->mi_cols)
     return;
 
-#if CONFIG_SB8X8
   if (bsize > BLOCK_SIZE_SB8X8) {
-#else
-  if (bsize > BLOCK_SIZE_MB16X16) {
-#endif
     int pl;
     // read the partition information
     xd->left_seg_context =
-        pc->left_seg_context + ((mi_row >> CONFIG_SB8X8) & 3);
-    xd->above_seg_context = pc->above_seg_context + (mi_col >> CONFIG_SB8X8);
+        pc->left_seg_context + ((mi_row >> 1) & 3);
+    xd->above_seg_context = pc->above_seg_context + (mi_col >> 1);
     pl = partition_plane_context(xd, bsize);
     partition = treed_read(r, vp9_partition_tree,
                            pc->fc.partition_prob[pl]);
@@ -642,15 +483,10 @@
         int j = n >> 1, i = n & 0x01;
         if (subsize == BLOCK_SIZE_SB32X32)
           xd->sb_index = n;
-#if CONFIG_SB8X8
         else if (subsize == BLOCK_SIZE_MB16X16)
           xd->mb_index = n;
         else
           xd->b_index = n;
-#else
-        else
-          xd->mb_index = n;
-#endif
         decode_modes_sb(pbi, mi_row + j * bs, mi_col + i * bs, r, subsize);
       }
       break;
@@ -658,15 +494,11 @@
       assert(0);
   }
   // update partition context
-#if CONFIG_SB8X8
   if ((partition == PARTITION_SPLIT) && (bsize > BLOCK_SIZE_MB16X16))
-#else
-  if ((partition == PARTITION_SPLIT) && (bsize > BLOCK_SIZE_SB32X32))
-#endif
     return;
 
-  xd->left_seg_context = pc->left_seg_context + ((mi_row >> CONFIG_SB8X8) & 3);
-  xd->above_seg_context = pc->above_seg_context + (mi_col >> CONFIG_SB8X8);
+  xd->left_seg_context = pc->left_seg_context + ((mi_row >> 1) & 3);
+  xd->above_seg_context = pc->above_seg_context + (mi_col >> 1);
   update_partition_context(xd, subsize, bsize);
 }
 
@@ -1017,13 +849,7 @@
   vp9_copy(fc->pre_sb_ymode_prob, fc->sb_ymode_prob);
   vp9_copy(fc->pre_uv_mode_prob, fc->uv_mode_prob);
   vp9_copy(fc->pre_bmode_prob, fc->bmode_prob);
-#if !CONFIG_SB8X8
-  vp9_copy(fc->pre_i8x8_mode_prob, fc->i8x8_mode_prob);
-#endif
   vp9_copy(fc->pre_sub_mv_ref_prob, fc->sub_mv_ref_prob);
-#if !CONFIG_SB8X8
-  vp9_copy(fc->pre_mbsplit_prob, fc->mbsplit_prob);
-#endif
   vp9_copy(fc->pre_partition_prob, fc->partition_prob);
   fc->pre_nmvc = fc->nmvc;
 
@@ -1036,13 +862,7 @@
   vp9_zero(fc->sb_ymode_counts);
   vp9_zero(fc->uv_mode_counts);
   vp9_zero(fc->bmode_counts);
-#if !CONFIG_SB8X8
-  vp9_zero(fc->i8x8_mode_counts);
-#endif
   vp9_zero(fc->sub_mv_ref_counts);
-#if !CONFIG_SB8X8
-  vp9_zero(fc->mbsplit_counts);
-#endif
   vp9_zero(fc->NMVcount);
   vp9_zero(fc->mv_ref_ct);
   vp9_zero(fc->partition_counts);
@@ -1070,12 +890,12 @@
   int mi_row, mi_col;
 
   for (mi_row = pc->cur_tile_mi_row_start;
-       mi_row < pc->cur_tile_mi_row_end; mi_row += (4 << CONFIG_SB8X8)) {
+       mi_row < pc->cur_tile_mi_row_end; mi_row += 8) {
     // For a SB there are 2 left contexts, each pertaining to a MB row within
     vpx_memset(&pc->left_context, 0, sizeof(pc->left_context));
     vpx_memset(pc->left_seg_context, 0, sizeof(pc->left_seg_context));
     for (mi_col = pc->cur_tile_mi_col_start;
-         mi_col < pc->cur_tile_mi_col_end; mi_col += (4 << CONFIG_SB8X8)) {
+         mi_col < pc->cur_tile_mi_col_end; mi_col += 8) {
       decode_modes_sb(pbi, mi_row, mi_col, r, BLOCK_SIZE_SB64X64);
     }
   }
@@ -1249,8 +1069,6 @@
 
   setup_loopfilter(pc, xd, &header_bc);
 
-  vp9_read_literal(&header_bc, 2);  // unused
-
   setup_quantization(pbi, &header_bc);
 
   // Determine if the golden frame or ARF buffer should be updated and how.
@@ -1277,8 +1095,11 @@
         vp9_setup_scale_factors_for_frame(sf, fb, pc->width, pc->height);
     }
 
-    pc->ref_frame_sign_bias[GOLDEN_FRAME] = vp9_read_bit(&header_bc);
-    pc->ref_frame_sign_bias[ALTREF_FRAME] = vp9_read_bit(&header_bc);
+    // Read the sign bias for each reference frame buffer.
+    for (i = 0; i < ALLOWED_REFS_PER_FRAME; ++i) {
+      pc->ref_frame_sign_bias[i + 1] = vp9_read_bit(&header_bc);
+    }
+
     xd->allow_high_precision_mv = vp9_read_bit(&header_bc);
     pc->mcomp_filter_type = read_mcomp_filter_type(&header_bc);
 
@@ -1337,17 +1158,11 @@
     CHECK_MEM_ERROR(pc->last_frame_seg_map,
                     vpx_calloc((pc->mi_rows * pc->mi_cols), 1));
 
-  // set up frame new frame for intra coded blocks
-  vp9_setup_intra_recon(new_fb);
-
   vp9_setup_block_dptrs(xd);
 
   // clear out the coeff buffer
-  vpx_memset(xd->plane[0].qcoeff, 0, sizeof(xd->plane[0].qcoeff));
-  vpx_memset(xd->plane[1].qcoeff, 0, sizeof(xd->plane[1].qcoeff));
-  vpx_memset(xd->plane[2].qcoeff, 0, sizeof(xd->plane[2].qcoeff));
-
-  vp9_read_bit(&header_bc);  // unused
+  for (i = 0; i < MAX_MB_PLANE; ++i)
+    vp9_zero(xd->plane[i].qcoeff);
 
   vp9_decode_mode_mvs_init(pbi, &header_bc);
 
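
The context indices set up above follow from the new MI geometry: an 8x8
MODE_INFO spans two 4x4 transform columns, so the entropy contexts are indexed
by mi_col * 2 (scaled by chroma subsampling), while the segmentation contexts
are tracked per pair of MI rows/columns. A sketch with hypothetical helper
names:

    static int above_ctx_index(int mi_col, int subsampling_x) {
      return (mi_col * 2) >> subsampling_x;        /* 4x4 columns, per plane */
    }
    static int left_ctx_index(int mi_row, int subsampling_y) {
      return ((mi_row * 2) & 15) >> subsampling_y; /* wraps within a 64x64 SB */
    }
    static int seg_ctx_col(int mi_col) {
      return mi_col >> 1;                          /* 16x16 MB columns */
    }
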
diff --git a/vp9/decoder/vp9_detokenize.c b/vp9/decoder/vp9_detokenize.c
index 650defd..0ef25ba 100644
--- a/vp9/decoder/vp9_detokenize.c
+++ b/vp9/decoder/vp9_detokenize.c
@@ -356,9 +356,7 @@
                          int ss_txfrm_size,
                          void *argv) {
   const struct decode_block_args* const arg = argv;
-  const int bw = b_width_log2(bsize), bh = b_height_log2(bsize);
-  const int old_block_idx = old_block_idx_4x4(arg->xd, bw + bh,
-                                              plane, block);
+  const int bw = b_width_log2(bsize);
 
   // find the maximum eob for this transform size, adjusted by segment
   const int segment_id = arg->xd->mode_info_context->mbmi.segment_id;
@@ -370,7 +368,7 @@
   const int aoff = (off & ((1 << mod) - 1)) << ss_tx_size;
   const int loff = (off >> mod) << ss_tx_size;
 
-  const int eob = decode_coefs(arg->pbi, arg->xd, arg->r, old_block_idx,
+  const int eob = decode_coefs(arg->pbi, arg->xd, arg->r, block,
                                arg->xd->plane[plane].plane_type, seg_eob,
                                BLOCK_OFFSET(qcoeff_base, block, 16),
                                ss_tx_size, arg->xd->plane[plane].dequant,
diff --git a/vp9/encoder/vp9_bitstream.c b/vp9/encoder/vp9_bitstream.c
index 73af5ca..6624f07 100644
--- a/vp9/encoder/vp9_bitstream.c
+++ b/vp9/encoder/vp9_bitstream.c
@@ -281,12 +281,6 @@
   write_token(bc, vp9_uv_mode_tree, p, vp9_sb_kf_ymode_encodings + m);
 }
 
-#if !CONFIG_SB8X8
-static void write_i8x8_mode(vp9_writer *bc, int m, const vp9_prob *p) {
-  write_token(bc, vp9_i8x8_mode_tree, p, vp9_i8x8_mode_encodings + m);
-}
-#endif
-
 static void write_uv_mode(vp9_writer *bc, int m, const vp9_prob *p) {
   write_token(bc, vp9_uv_mode_tree, p, vp9_uv_mode_encodings + m);
 }
@@ -304,12 +298,6 @@
   write_token(bc, vp9_kf_bmode_tree, p, vp9_kf_bmode_encodings + m);
 }
 
-#if !CONFIG_SB8X8
-static void write_split(vp9_writer *bc, int x, const vp9_prob *p) {
-  write_token(bc, vp9_mbsplit_tree, p, vp9_mbsplit_encodings + x);
-}
-#endif
-
 static int prob_update_savings(const unsigned int *ct,
                                const vp9_prob oldp, const vp9_prob newp,
                                const vp9_prob upd) {
@@ -671,7 +659,7 @@
     active_section = 6;
 #endif
 
-    if (m->mbmi.sb_type > BLOCK_SIZE_MB16X16)
+    if (m->mbmi.sb_type > BLOCK_SIZE_SB8X8)
       write_sb_ymode(bc, mode, pc->fc.sb_ymode_prob);
     else
       write_ymode(bc, mode, pc->fc.ymode_prob);
@@ -681,24 +669,10 @@
       do {
         write_bmode(bc, m->bmi[j].as_mode.first,
                     pc->fc.bmode_prob);
-      } while (++j < (16 >> (CONFIG_SB8X8 * 2)));
+      } while (++j < 4);
     }
-#if !CONFIG_SB8X8
-    if (mode == I8X8_PRED) {
-      write_i8x8_mode(bc, m->bmi[0].as_mode.first,
-                      pc->fc.i8x8_mode_prob);
-      write_i8x8_mode(bc, m->bmi[2].as_mode.first,
-                      pc->fc.i8x8_mode_prob);
-      write_i8x8_mode(bc, m->bmi[8].as_mode.first,
-                      pc->fc.i8x8_mode_prob);
-      write_i8x8_mode(bc, m->bmi[10].as_mode.first,
-                      pc->fc.i8x8_mode_prob);
-    } else
-#endif
-    {
-      write_uv_mode(bc, mi->uv_mode,
-                    pc->fc.uv_mode_prob[mode]);
-    }
+    write_uv_mode(bc, mi->uv_mode,
+                  pc->fc.uv_mode_prob[mode]);
   } else {
     vp9_prob mv_ref_p[VP9_MVREFS - 1];
 
@@ -710,7 +684,7 @@
 
     // If segment skip is not enabled code the mode.
     if (!vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP)) {
-      if (mi->sb_type > BLOCK_SIZE_MB16X16) {
+      if (mi->sb_type > BLOCK_SIZE_SB8X8) {
         write_sb_mv_ref(bc, mode, mv_ref_p);
       } else {
         write_mv_ref(bc, mode, mv_ref_p);
@@ -780,34 +754,16 @@
         ++count_mb_seg[mi->partitioning];
 #endif
 
-#if !CONFIG_SB8X8
-        write_split(bc, mi->partitioning, cpi->common.fc.mbsplit_prob);
-        cpi->mbsplit_count[mi->partitioning]++;
-#endif
-
         do {
           B_PREDICTION_MODE blockmode;
           int_mv blockmv;
-#if !CONFIG_SB8X8
-          const int *const  L = vp9_mbsplits[mi->partitioning];
-#endif
           int k = -1;  /* first block in subset j */
           int mv_contz;
           int_mv leftmv, abovemv;
 
           blockmode = cpi->mb.partition_info->bmi[j].mode;
           blockmv = cpi->mb.partition_info->bmi[j].mv;
-#if CONFIG_SB8X8
           k = j;
-#else
-#if CONFIG_DEBUG
-          while (j != L[++k])
-            if (k >= 16)
-              assert(0);
-#else
-          while (j != L[++k]);
-#endif
-#endif
           leftmv.as_int = left_block_mv(xd, m, k);
           abovemv.as_int = above_block_mv(m, k, mis);
           mv_contz = vp9_mv_cont(&leftmv, &abovemv);
@@ -839,7 +795,6 @@
     }
   }
 
-#if CONFIG_SB8X8
   if (((rf == INTRA_FRAME && mode != I4X4_PRED) ||
        (rf != INTRA_FRAME && mode != SPLITMV)) &&
       pc->txfm_mode == TX_MODE_SELECT &&
@@ -854,23 +809,6 @@
         vp9_write(bc, sz != TX_16X16, pc->prob_tx[2]);
     }
   }
-#else
-  if (((rf == INTRA_FRAME && mode <= I8X8_PRED) ||
-       (rf != INTRA_FRAME && !(mode == SPLITMV &&
-                               mi->partitioning == PARTITIONING_4X4))) &&
-      pc->txfm_mode == TX_MODE_SELECT &&
-          !(skip_coeff || vp9_segfeature_active(xd, segment_id,
-                                                SEG_LVL_SKIP))) {
-    TX_SIZE sz = mi->txfm_size;
-    // FIXME(rbultje) code ternary symbol once all experiments are merged
-    vp9_write(bc, sz != TX_4X4, pc->prob_tx[0]);
-    if (sz != TX_4X4 && mode != I8X8_PRED && mode != SPLITMV) {
-      vp9_write(bc, sz != TX_8X8, pc->prob_tx[1]);
-      if (mi->sb_type >= BLOCK_SIZE_SB32X32 && sz != TX_8X8)
-        vp9_write(bc, sz != TX_16X16, pc->prob_tx[2]);
-    }
-  }
-#endif
 }
 
 static void write_mb_modes_kf(const VP9_COMP *cpi,
@@ -893,11 +831,7 @@
     vp9_write(bc, skip_coeff, vp9_get_pred_prob(c, xd, PRED_MBSKIP));
   }
 
-#if CONFIG_SB8X8
   if (m->mbmi.sb_type > BLOCK_SIZE_SB8X8)
-#else
-  if (m->mbmi.sb_type > BLOCK_SIZE_MB16X16)
-#endif
     sb_kfwrite_ymode(bc, ym, c->sb_kf_ymode_prob[c->kf_ymode_probs_index]);
   else
     kfwrite_ymode(bc, ym, c->kf_ymode_prob[c->kf_ymode_probs_index]);
@@ -906,7 +840,8 @@
     int i = 0;
     do {
       const B_PREDICTION_MODE a = above_block_mode(m, i, mis);
-      const B_PREDICTION_MODE l = (xd->left_available || (i & 3)) ?
+      const B_PREDICTION_MODE l = (xd->left_available ||
+                                  (i & 1)) ?
                                   left_block_mode(m, i) : B_DC_PRED;
       const int bm = m->bmi[i].as_mode.first;
 
@@ -914,23 +849,11 @@
       ++intra_mode_stats [A] [L] [bm];
 #endif
       write_kf_bmode(bc, bm, c->kf_bmode_prob[a][l]);
-    } while (++i < (16 >> (CONFIG_SB8X8 * 2)));
+    } while (++i < 4);
   }
-#if !CONFIG_SB8X8
-  if (ym == I8X8_PRED) {
-    write_i8x8_mode(bc, m->bmi[0].as_mode.first, c->fc.i8x8_mode_prob);
-    // printf("    mode: %d\n", m->bmi[0].as_mode.first); fflush(stdout);
-    write_i8x8_mode(bc, m->bmi[2].as_mode.first, c->fc.i8x8_mode_prob);
-    // printf("    mode: %d\n", m->bmi[2].as_mode.first); fflush(stdout);
-    write_i8x8_mode(bc, m->bmi[8].as_mode.first, c->fc.i8x8_mode_prob);
-    // printf("    mode: %d\n", m->bmi[8].as_mode.first); fflush(stdout);
-    write_i8x8_mode(bc, m->bmi[10].as_mode.first, c->fc.i8x8_mode_prob);
-    // printf("    mode: %d\n", m->bmi[10].as_mode.first); fflush(stdout);
-  } else
-#endif
-    write_uv_mode(bc, m->mbmi.uv_mode, c->kf_uv_mode_prob[ym]);
 
-#if CONFIG_SB8X8
+  write_uv_mode(bc, m->mbmi.uv_mode, c->kf_uv_mode_prob[ym]);
+
   if (ym != I4X4_PRED && c->txfm_mode == TX_MODE_SELECT &&
       !(skip_coeff || vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP))) {
     TX_SIZE sz = m->mbmi.txfm_size;
@@ -942,19 +865,6 @@
         vp9_write(bc, sz != TX_16X16, c->prob_tx[2]);
     }
   }
-#else
-  if (ym <= I8X8_PRED && c->txfm_mode == TX_MODE_SELECT &&
-      !(skip_coeff || vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP))) {
-    TX_SIZE sz = m->mbmi.txfm_size;
-    // FIXME(rbultje) code ternary symbol once all experiments are merged
-    vp9_write(bc, sz != TX_4X4, c->prob_tx[0]);
-    if (sz != TX_4X4 && ym <= TM_PRED) {
-      vp9_write(bc, sz != TX_8X8, c->prob_tx[1]);
-      if (m->mbmi.sb_type >= BLOCK_SIZE_SB32X32 && sz != TX_8X8)
-        vp9_write(bc, sz != TX_16X16, c->prob_tx[2]);
-    }
-  }
-#endif
 }
 
 
@@ -1162,15 +1072,11 @@
   else
     assert(0);
 
-#if CONFIG_SB8X8
   if (bsize > BLOCK_SIZE_SB8X8) {
-#else
-  if (bsize > BLOCK_SIZE_MB16X16) {
-#endif
     int pl;
     xd->left_seg_context =
-        cm->left_seg_context + ((mi_row >> CONFIG_SB8X8) & 3);
-    xd->above_seg_context = cm->above_seg_context + (mi_col >> CONFIG_SB8X8);
+        cm->left_seg_context + ((mi_row >> 1) & 3);
+    xd->above_seg_context = cm->above_seg_context + (mi_col >> 1);
     pl = partition_plane_context(xd, bsize);
     // encode the partition information
     write_token(bc, vp9_partition_tree, cm->fc.partition_prob[pl],
@@ -1205,15 +1111,11 @@
   }
 
   // update partition context
-#if CONFIG_SB8X8
   if ((partition == PARTITION_SPLIT) && (bsize > BLOCK_SIZE_MB16X16))
-#else
-  if ((partition == PARTITION_SPLIT) && (bsize > BLOCK_SIZE_SB32X32))
-#endif
     return;
 
-  xd->left_seg_context = cm->left_seg_context + ((mi_row >> CONFIG_SB8X8) & 3);
-  xd->above_seg_context = cm->above_seg_context + (mi_col >> CONFIG_SB8X8);
+  xd->left_seg_context = cm->left_seg_context + ((mi_row >> 1) & 3);
+  xd->above_seg_context = cm->above_seg_context + (mi_col >> 1);
   update_partition_context(xd, subsize, bsize);
 }
 
@@ -1230,12 +1132,12 @@
 
   for (mi_row = c->cur_tile_mi_row_start;
        mi_row < c->cur_tile_mi_row_end;
-       mi_row += (4 << CONFIG_SB8X8), m_ptr += (4 << CONFIG_SB8X8) * mis) {
+       mi_row += 8, m_ptr += 8 * mis) {
     m = m_ptr;
     vpx_memset(c->left_seg_context, 0, sizeof(c->left_seg_context));
     for (mi_col = c->cur_tile_mi_col_start;
          mi_col < c->cur_tile_mi_col_end;
-         mi_col += (4 << CONFIG_SB8X8), m += (4 << CONFIG_SB8X8))
+         mi_col += 8, m += 8)
       write_modes_sb(cpi, m, bc, tok, tok_end, mi_row, mi_col,
                      BLOCK_SIZE_SB64X64);
   }
@@ -1928,9 +1830,6 @@
 
   encode_loopfilter(xd, &header_bc);
 
-  // TODO(jkoleszar): remove these unused bits
-  vp9_write_literal(&header_bc, 0, 2);
-
   // Frame Q baseline quantizer index
   vp9_write_literal(&header_bc, pc->base_qindex, QINDEX_BITS);
 
@@ -1982,9 +1881,10 @@
     vp9_write_literal(&header_bc, cpi->gld_fb_idx, NUM_REF_FRAMES_LG2);
     vp9_write_literal(&header_bc, cpi->alt_fb_idx, NUM_REF_FRAMES_LG2);
 
-    // Indicate reference frame sign bias for Golden and ARF frames (always 0 for last frame buffer)
-    vp9_write_bit(&header_bc, pc->ref_frame_sign_bias[GOLDEN_FRAME]);
-    vp9_write_bit(&header_bc, pc->ref_frame_sign_bias[ALTREF_FRAME]);
+    // Indicate the sign bias for each reference frame buffer.
+    for (i = 0; i < ALLOWED_REFS_PER_FRAME; ++i) {
+      vp9_write_bit(&header_bc, pc->ref_frame_sign_bias[LAST_FRAME + i]);
+    }
 
     // Signal whether to allow high MV precision
     vp9_write_bit(&header_bc, (xd->allow_high_precision_mv) ? 1 : 0);
@@ -2151,19 +2051,12 @@
   vp9_copy(cpi->common.fc.pre_uv_mode_prob, cpi->common.fc.uv_mode_prob);
   vp9_copy(cpi->common.fc.pre_bmode_prob, cpi->common.fc.bmode_prob);
   vp9_copy(cpi->common.fc.pre_sub_mv_ref_prob, cpi->common.fc.sub_mv_ref_prob);
-#if !CONFIG_SB8X8
-  vp9_copy(cpi->common.fc.pre_mbsplit_prob, cpi->common.fc.mbsplit_prob);
-  vp9_copy(cpi->common.fc.pre_i8x8_mode_prob, cpi->common.fc.i8x8_mode_prob);
-#endif
   vp9_copy(cpi->common.fc.pre_partition_prob, cpi->common.fc.partition_prob);
   cpi->common.fc.pre_nmvc = cpi->common.fc.nmvc;
 #if CONFIG_COMP_INTERINTRA_PRED
   cpi->common.fc.pre_interintra_prob = cpi->common.fc.interintra_prob;
 #endif
   vp9_zero(cpi->sub_mv_ref_count);
-#if !CONFIG_SB8X8
-  vp9_zero(cpi->mbsplit_count);
-#endif
   vp9_zero(cpi->common.fc.mv_ref_ct);
 
   update_coef_probs(cpi, &header_bc);
@@ -2175,9 +2068,6 @@
   active_section = 2;
 #endif
 
-  // TODO(jkoleszar): remove this unused bit
-  vp9_write_bit(&header_bc, 1);
-
   vp9_update_skip_probs(cpi);
   for (i = 0; i < MBSKIP_CONTEXTS; ++i) {
     vp9_write_prob(&header_bc, pc->mbskip_pred_probs[i]);
diff --git a/vp9/encoder/vp9_block.h b/vp9/encoder/vp9_block.h
index 83c1102..6bc42c7 100644
--- a/vp9/encoder/vp9_block.h
+++ b/vp9/encoder/vp9_block.h
@@ -29,7 +29,7 @@
     B_PREDICTION_MODE mode;
     int_mv mv;
     int_mv second_mv;
-  } bmi[16 >> (2 * CONFIG_SB8X8)];
+  } bmi[4];
 } PARTITION_INFO;
 
 // Structure to hold snapshot of coding context during the mode picking process
@@ -117,9 +117,6 @@
   int mbmode_cost[2][MB_MODE_COUNT];
   int intra_uv_mode_cost[2][MB_MODE_COUNT];
   int bmode_costs[VP9_KF_BINTRAMODES][VP9_KF_BINTRAMODES][VP9_KF_BINTRAMODES];
-#if !CONFIG_SB8X8
-  int i8x8_mode_costs[MB_MODE_COUNT];
-#endif
   int inter_bmode_costs[B_MODE_COUNT];
   int switchable_interp_costs[VP9_SWITCHABLE_FILTERS + 1]
                              [VP9_SWITCHABLE_FILTERS];
@@ -143,11 +140,9 @@
 
   // Structure to hold context for each of the 4 MBs within a SB:
   // when encoded as 4 independent MBs:
-#if CONFIG_SB8X8
   PICK_MODE_CONTEXT sb8_context[4][4][4];
   PICK_MODE_CONTEXT sb8x16_context[4][4][2];
   PICK_MODE_CONTEXT sb16x8_context[4][4][2];
-#endif
   PICK_MODE_CONTEXT mb_context[4][4];
   PICK_MODE_CONTEXT sb32x16_context[4][2];
   PICK_MODE_CONTEXT sb16x32_context[4][2];
@@ -164,12 +159,6 @@
   void (*fwd_txm16x16)(int16_t *input, int16_t *output, int pitch);
   void (*quantize_b_4x4)(MACROBLOCK *x, int b_idx, TX_TYPE tx_type,
                          int y_blocks);
-  void (*quantize_b_4x4_pair)(MACROBLOCK *x, int b_idx1, int b_idx2,
-                              int y_blocks);
-  void (*quantize_b_16x16)(MACROBLOCK *x, int b_idx, TX_TYPE tx_type,
-                           int y_blocks);
-  void (*quantize_b_8x8)(MACROBLOCK *x, int b_idx, TX_TYPE tx_type,
-                         int y_blocks);
 };
 
 #endif  // VP9_ENCODER_VP9_BLOCK_H_
diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c
index 2586d44..6366d38 100644
--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c
@@ -20,7 +20,6 @@
 #include "vp9/common/vp9_entropymode.h"
 #include "vp9/common/vp9_quant_common.h"
 #include "vp9/encoder/vp9_segmentation.h"
-#include "vp9/common/vp9_setupintrarecon.h"
 #include "vp9/encoder/vp9_encodeintra.h"
 #include "vp9/common/vp9_reconinter.h"
 #include "vp9/common/vp9_invtrans.h"
@@ -47,11 +46,6 @@
 
 void vp9_select_interp_filter_type(VP9_COMP *cpi);
 
-#if !CONFIG_SB8X8
-static void encode_macroblock(VP9_COMP *cpi, TOKENEXTRA **t,
-                              int output_enabled, int mi_row, int mi_col);
-#endif
-
 static void encode_superblock(VP9_COMP *cpi, TOKENEXTRA **t,
                               int output_enabled, int mi_row, int mi_col,
                               BLOCK_SIZE_TYPE bsize);
@@ -392,9 +386,9 @@
                sizeof(PARTITION_INFO));
 
     mbmi->mv[0].as_int =
-        x->partition_info->bmi[15 >> (CONFIG_SB8X8 * 2)].mv.as_int;
+        x->partition_info->bmi[3].mv.as_int;
     mbmi->mv[1].as_int =
-        x->partition_info->bmi[15 >> (CONFIG_SB8X8 * 2)].second_mv.as_int;
+        x->partition_info->bmi[3].second_mv.as_int;
   }
 
   x->skip = ctx->skip;
@@ -448,9 +442,6 @@
       THR_D27_PRED /*D27_PRED*/,
       THR_D63_PRED /*D63_PRED*/,
       THR_TM /*TM_PRED*/,
-#if !CONFIG_SB8X8
-      THR_I8X8_PRED /*I8X8_PRED*/,
-#endif
       THR_B_PRED /*I4X4_PRED*/,
     };
     cpi->mode_chosen_counts[kf_mode_index[mb_mode]]++;
@@ -488,11 +479,13 @@
       mbmi->best_mv.as_int = best_mv.as_int;
       mbmi->best_second_mv.as_int = best_second_mv.as_int;
       vp9_update_nmv_count(cpi, x, &best_mv, &best_second_mv);
-#if CONFIG_SB8X8
-      xd->mode_info_context[1].mbmi =
-      xd->mode_info_context[mis].mbmi =
-      xd->mode_info_context[1 + mis].mbmi = *mbmi;
-#endif
+    }
+
+    if (bsize > BLOCK_SIZE_SB8X8 && mbmi->mode == NEWMV) {
+      int i, j;
+      for (j = 0; j < bh; ++j)
+        for (i = 0; i < bw; ++i)
+          xd->mode_info_context[mis * j + i].mbmi = *mbmi;
     }
 #if CONFIG_COMP_INTERINTRA_PRED
     if (mbmi->mode >= NEARESTMV && mbmi->mode < SPLITMV &&
@@ -564,8 +557,8 @@
   VP9_COMMON *const cm = &cpi->common;
   MACROBLOCKD *const xd = &cpi->mb.e_mbd;
 
-  xd->above_seg_context = cm->above_seg_context + (mi_col >> CONFIG_SB8X8);
-  xd->left_seg_context  = cm->left_seg_context + ((mi_row >> CONFIG_SB8X8) & 3);
+  xd->above_seg_context = cm->above_seg_context + (mi_col >> 1);
+  xd->left_seg_context  = cm->left_seg_context + ((mi_row >> 1) & 3);
 }
 
 static void set_offsets(VP9_COMP *cpi,
@@ -577,17 +570,17 @@
   const int dst_fb_idx = cm->new_fb_idx;
   const int idx_str = xd->mode_info_stride * mi_row + mi_col;
   const int bw = 1 << mi_width_log2(bsize), bh = 1 << mi_height_log2(bsize);
-  const int mb_row = mi_row >> CONFIG_SB8X8;
-  const int mb_col = mi_col >> CONFIG_SB8X8;
+  const int mb_row = mi_row >> 1;
+  const int mb_col = mi_col >> 1;
   const int idx_map = mb_row * cm->mb_cols + mb_col;
   int i;
 
   // entropy context structures
   for (i = 0; i < MAX_MB_PLANE; i++) {
     xd->plane[i].above_context = cm->above_context[i] +
-        (mi_col * 4 >> (CONFIG_SB8X8 + xd->plane[i].subsampling_x));
+        (mi_col * 2 >>  xd->plane[i].subsampling_x);
     xd->plane[i].left_context = cm->left_context[i] +
-        (((mi_row * 4 >> CONFIG_SB8X8) & 15) >> xd->plane[i].subsampling_y);
+        (((mi_row * 2) & 15) >> xd->plane[i].subsampling_y);
   }
 
   // partition contexts
@@ -650,9 +643,9 @@
       const int p16 = ((mb_row & 1) << 1) +  (mb_col & 1);
       const int p32 = ((mb_row & 2) << 2) + ((mb_col & 2) << 1);
       const int tile_progress =
-          cm->cur_tile_mi_col_start * cm->mb_rows >> CONFIG_SB8X8;
+          cm->cur_tile_mi_col_start * cm->mb_rows >> 1;
       const int mb_cols =
-          (cm->cur_tile_mi_col_end - cm->cur_tile_mi_col_start) >> CONFIG_SB8X8;
+          (cm->cur_tile_mi_col_end - cm->cur_tile_mi_col_start) >> 1;
 
       cpi->seg0_progress =
           ((y * mb_cols + x * 4 + p32 + p16 + tile_progress) << 16) / cm->MBs;
@@ -662,49 +655,6 @@
   }
 }
 
-#if !CONFIG_SB8X8
-static int pick_mb_mode(VP9_COMP *cpi,
-                        int mi_row,
-                        int mi_col,
-                        TOKENEXTRA **tp,
-                        int *totalrate,
-                        int *totaldist) {
-  VP9_COMMON *const cm = &cpi->common;
-  MACROBLOCK *const x = &cpi->mb;
-  MACROBLOCKD *const xd = &x->e_mbd;
-  int splitmodes_used = 0;
-  MB_MODE_INFO *mbmi;
-
-  set_offsets(cpi, mi_row, mi_col, BLOCK_SIZE_MB16X16);
-
-  if (cpi->oxcf.tuning == VP8_TUNE_SSIM)
-    vp9_activity_masking(cpi, x);
-
-  mbmi = &xd->mode_info_context->mbmi;
-  mbmi->sb_type = BLOCK_SIZE_MB16X16;
-
-  // Find best coding mode & reconstruct the MB so it is available
-  // as a predictor for MBs that follow in the SB
-  if (cm->frame_type == KEY_FRAME) {
-    vp9_rd_pick_intra_mode(cpi, x, totalrate, totaldist);
-
-    // Save the coding context
-    vpx_memcpy(&x->mb_context[xd->sb_index][xd->mb_index].mic,
-               xd->mode_info_context, sizeof(MODE_INFO));
-  } else {
-    vp9_pick_mode_inter_macroblock(cpi, x, mi_row, mi_col,
-                                   totalrate, totaldist);
-    splitmodes_used += (mbmi->mode == SPLITMV);
-
-    if (cpi->mb.e_mbd.segmentation_enabled && mbmi->segment_id == 0) {
-      cpi->seg0_idx++;
-    }
-  }
-
-  return splitmodes_used;
-}
-#endif
-
 static void pick_sb_modes(VP9_COMP *cpi, int mi_row, int mi_col,
                           TOKENEXTRA **tp, int *totalrate, int *totaldist,
                           BLOCK_SIZE_TYPE bsize, PICK_MODE_CONTEXT *ctx) {
@@ -787,15 +737,10 @@
                             BLOCK_SIZE_TYPE bsize) {
   if (bsize >= BLOCK_SIZE_SB32X32) {
     xd->sb_index = idx;
-#if CONFIG_SB8X8
   } else if (bsize >= BLOCK_SIZE_MB16X16) {
     xd->mb_index = idx;
   } else {
     xd->b_index = idx;
-#else
-  } else {
-    xd->mb_index = idx;
-#endif
   }
 }
 
@@ -818,14 +763,12 @@
       return &x->sb16x32_context[xd->sb_index][xd->mb_index];
     case BLOCK_SIZE_MB16X16:
       return &x->mb_context[xd->sb_index][xd->mb_index];
-#if CONFIG_SB8X8
     case BLOCK_SIZE_SB16X8:
       return &x->sb16x8_context[xd->sb_index][xd->mb_index][xd->b_index];
     case BLOCK_SIZE_SB8X16:
       return &x->sb8x16_context[xd->sb_index][xd->mb_index][xd->b_index];
     case BLOCK_SIZE_SB8X8:
       return &x->sb8_context[xd->sb_index][xd->mb_index][xd->b_index];
-#endif
     default:
       assert(0);
       return NULL;
@@ -846,17 +789,7 @@
     set_block_index(xd, sub_index, bsize);
   set_offsets(cpi, mi_row, mi_col, bsize);
   update_state(cpi, get_block_context(x, bsize), bsize, output_enabled);
-#if !CONFIG_SB8X8
-  if (bsize == BLOCK_SIZE_MB16X16) {
-    if (cpi->oxcf.tuning == VP8_TUNE_SSIM)
-      vp9_activity_masking(cpi, x);
-
-    encode_macroblock(cpi, tp, output_enabled, mi_row, mi_col);
-  } else
-#endif
-  {
-    encode_superblock(cpi, tp, output_enabled, mi_row, mi_col, bsize);
-  }
+  encode_superblock(cpi, tp, output_enabled, mi_row, mi_col, bsize);
 
   if (output_enabled) {
     update_stats(cpi, mi_row, mi_col);
@@ -869,10 +802,8 @@
 static void encode_sb(VP9_COMP *cpi, TOKENEXTRA **tp,
                       int mi_row, int mi_col, int output_enabled,
                       BLOCK_SIZE_TYPE level,
-                      BLOCK_SIZE_TYPE c1, BLOCK_SIZE_TYPE c2[4]
-#if CONFIG_SB8X8
-                      , BLOCK_SIZE_TYPE c3[4][4]
-#endif
+                      BLOCK_SIZE_TYPE c1, BLOCK_SIZE_TYPE c2[4],
+                      BLOCK_SIZE_TYPE c3[4][4]
                       ) {
   VP9_COMMON *const cm = &cpi->common;
   MACROBLOCK *const x = &cpi->mb;
@@ -884,23 +815,13 @@
   if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols)
     return;
 
-#if CONFIG_SB8X8
   if (level > BLOCK_SIZE_SB8X8) {
-#endif
     set_partition_seg_context(cpi, mi_row, mi_col);
     pl = partition_plane_context(xd, level);
-#if CONFIG_SB8X8
   }
-#endif
 
   if (bsl == bwl && bsl == bhl) {
-    if (output_enabled &&
-#if CONFIG_SB8X8
-        level > BLOCK_SIZE_SB8X8
-#else
-        level > BLOCK_SIZE_MB16X16
-#endif
-        )
+    if (output_enabled && level > BLOCK_SIZE_SB8X8)
       cpi->partition_count[pl][PARTITION_NONE]++;
     encode_b(cpi, tp, mi_row, mi_col, output_enabled, c1, -1);
   } else if (bsl == bhl && bsl > bwl) {
@@ -920,17 +841,11 @@
     assert(bwl < bsl && bhl < bsl);
     if (level == BLOCK_SIZE_SB64X64) {
       subsize = BLOCK_SIZE_SB32X32;
-#if CONFIG_SB8X8
     } else if (level == BLOCK_SIZE_SB32X32) {
       subsize = BLOCK_SIZE_MB16X16;
     } else {
       assert(level == BLOCK_SIZE_MB16X16);
       subsize = BLOCK_SIZE_SB8X8;
-#else
-    } else {
-      assert(level == BLOCK_SIZE_SB32X32);
-      subsize = BLOCK_SIZE_MB16X16;
-#endif
     }
 
     if (output_enabled)
@@ -942,22 +857,12 @@
       set_block_index(xd, i, subsize);
       encode_sb(cpi, tp, mi_row + y_idx * bs, mi_col + x_idx * bs,
                 output_enabled, subsize,
-#if CONFIG_SB8X8
                 c2 ? c2[i] : c1, c3 ? c3[i] : NULL, NULL);
-#else
-                c2 ? c2[i] : c1, NULL);
-#endif
     }
   }
 
-#if CONFIG_SB8X8
   if (level > BLOCK_SIZE_SB8X8 &&
-      (level == BLOCK_SIZE_MB16X16 || bsl == bwl || bsl == bhl))
-#else
-  if (level > BLOCK_SIZE_MB16X16 &&
-      (level == BLOCK_SIZE_SB32X32 || bsl == bwl || bsl == bhl))
-#endif
-  {
+      (level == BLOCK_SIZE_MB16X16 || bsl == bwl || bsl == bhl)) {
     set_partition_seg_context(cpi, mi_row, mi_col);
     update_partition_context(xd, c1, level);
   }
@@ -978,11 +883,9 @@
 
   // Code each SB in the row
   for (mi_col = cm->cur_tile_mi_col_start;
-       mi_col < cm->cur_tile_mi_col_end; mi_col += (4 << CONFIG_SB8X8)) {
+       mi_col < cm->cur_tile_mi_col_end; mi_col += 8) {
     int i, p;
-#if CONFIG_SB8X8
     BLOCK_SIZE_TYPE mb_partitioning[4][4];
-#endif
     BLOCK_SIZE_TYPE sb_partitioning[4];
     BLOCK_SIZE_TYPE sb64_partitioning = BLOCK_SIZE_SB32X32;
     int sb64_rate = 0, sb64_dist = 0;
@@ -993,26 +896,27 @@
 
     for (p = 0; p < MAX_MB_PLANE; p++) {
       memcpy(a + 16 * p, cm->above_context[p] +
-                 (mi_col * 4 >> (CONFIG_SB8X8 + xd->plane[p].subsampling_x)),
+                 (mi_col * 2 >> xd->plane[p].subsampling_x),
              sizeof(ENTROPY_CONTEXT) * 16 >> xd->plane[p].subsampling_x);
       memcpy(l + 16 * p, cm->left_context[p],
              sizeof(ENTROPY_CONTEXT) * 16 >> xd->plane[p].subsampling_y);
     }
-    memcpy(&seg_a, cm->above_seg_context + (mi_col >> CONFIG_SB8X8),
+    memcpy(&seg_a, cm->above_seg_context + (mi_col >> 1),
            sizeof(seg_a));
     memcpy(&seg_l, cm->left_seg_context, sizeof(seg_l));
 
     // FIXME(rbultje): this function should probably be rewritten to be
     // recursive at some point in the future.
     for (i = 0; i < 4; i++) {
-      const int x_idx = (i & 1) << (1 + CONFIG_SB8X8);
-      const int y_idx = (i & 2) << CONFIG_SB8X8;
+      const int x_idx = (i & 1) << 2;
+      const int y_idx = (i & 2) << 1;
       int sb32_rate = 0, sb32_dist = 0;
       int splitmodes_used = 0;
       int sb32_skip = 0;
       int j;
       ENTROPY_CONTEXT l2[8 * MAX_MB_PLANE], a2[8 * MAX_MB_PLANE];
 
+      sb_partitioning[i] = BLOCK_SIZE_MB16X16;
       if (mi_row + y_idx >= cm->mi_rows || mi_col + x_idx >= cm->mi_cols)
         continue;
 
@@ -1022,26 +926,23 @@
       for (p = 0; p < MAX_MB_PLANE; p++) {
         vpx_memcpy(l2 + 8 * p,
                    cm->left_context[p] +
-                       (y_idx * 4 >> (CONFIG_SB8X8 +
-                                      xd->plane[p].subsampling_y)),
+                       (y_idx * 2 >> xd->plane[p].subsampling_y),
                    sizeof(ENTROPY_CONTEXT) * 8 >> xd->plane[p].subsampling_y);
         vpx_memcpy(a2 + 8 * p,
                    cm->above_context[p] +
-                       ((mi_col + x_idx) * 4 >> (CONFIG_SB8X8 +
-                                                 xd->plane[p].subsampling_x)),
+                       ((mi_col + x_idx) * 2 >> xd->plane[p].subsampling_x),
                    sizeof(ENTROPY_CONTEXT) * 8 >> xd->plane[p].subsampling_x);
       }
 
       /* Encode MBs in raster order within the SB */
-      sb_partitioning[i] = BLOCK_SIZE_MB16X16;
       for (j = 0; j < 4; j++) {
-        const int x_idx_m = x_idx + ((j & 1) << CONFIG_SB8X8);
-        const int y_idx_m = y_idx + ((j >> 1) << CONFIG_SB8X8);
+        const int x_idx_m = x_idx + ((j & 1) << 1);
+        const int y_idx_m = y_idx + ((j >> 1) << 1);
         int r, d;
-#if CONFIG_SB8X8
         int r2, d2, mb16_rate = 0, mb16_dist = 0, k;
         ENTROPY_CONTEXT l3[4 * MAX_MB_PLANE], a3[4 * MAX_MB_PLANE];
-#endif
+
+        mb_partitioning[i][j] = BLOCK_SIZE_SB8X8;
 
         if (mi_row + y_idx_m >= cm->mi_rows ||
             mi_col + x_idx_m >= cm->mi_cols) {
@@ -1052,21 +953,17 @@
         // Index of the MB in the SB 0..3
         xd->mb_index = j;
 
-#if CONFIG_SB8X8
         for (p = 0; p < MAX_MB_PLANE; p++) {
           vpx_memcpy(l3 + 4 * p,
                      cm->left_context[p] +
-                         (y_idx_m * 4 >> (CONFIG_SB8X8 +
-                                          xd->plane[p].subsampling_y)),
+                         (y_idx_m * 2 >> xd->plane[p].subsampling_y),
                      sizeof(ENTROPY_CONTEXT) * 4 >> xd->plane[p].subsampling_y);
           vpx_memcpy(a3 + 4 * p,
                      cm->above_context[p] +
-                         ((mi_col + x_idx_m) * 4 >> (CONFIG_SB8X8 +
-                                                   xd->plane[p].subsampling_x)),
+                         ((mi_col + x_idx_m) * 2 >> xd->plane[p].subsampling_x),
                      sizeof(ENTROPY_CONTEXT) * 4 >> xd->plane[p].subsampling_x);
         }
 
-        mb_partitioning[i][j] = BLOCK_SIZE_SB8X8;
         for (k = 0; k < 4; k++) {
           xd->b_index = k;
 
@@ -1091,13 +988,11 @@
         mb16_rate += x->partition_cost[pl][PARTITION_SPLIT];
         for (p = 0; p < MAX_MB_PLANE; p++) {
           vpx_memcpy(cm->left_context[p] +
-                         (y_idx_m * 4 >> (CONFIG_SB8X8 +
-                                          xd->plane[p].subsampling_y)),
+                         (y_idx_m * 2 >> xd->plane[p].subsampling_y),
                      l3 + 4 * p,
                      sizeof(ENTROPY_CONTEXT) * 4 >> xd->plane[p].subsampling_y);
           vpx_memcpy(cm->above_context[p] +
-                         ((mi_col + x_idx_m) * 4 >> (CONFIG_SB8X8 +
-                                                   xd->plane[p].subsampling_x)),
+                         ((mi_col + x_idx_m) * 2 >> xd->plane[p].subsampling_x),
                      a3 + 4 * p,
                      sizeof(ENTROPY_CONTEXT) * 4 >> xd->plane[p].subsampling_x);
         }
@@ -1130,19 +1025,17 @@
         r2 += x->partition_cost[pl][PARTITION_VERT];
         if (RDCOST(x->rdmult, x->rddiv, r2, d2) <
                 RDCOST(x->rdmult, x->rddiv, mb16_rate, mb16_dist)) {
-          mb16_rate = r;
-          mb16_dist = d;
+          mb16_rate = r2;
+          mb16_dist = d2;
           mb_partitioning[i][j] = BLOCK_SIZE_SB8X16;
         }
         for (p = 0; p < MAX_MB_PLANE; p++) {
           vpx_memcpy(cm->left_context[p] +
-                         (y_idx_m * 4 >> (CONFIG_SB8X8 +
-                                          xd->plane[p].subsampling_y)),
+                         (y_idx_m * 2 >> xd->plane[p].subsampling_y),
                      l3 + 4 * p,
                      sizeof(ENTROPY_CONTEXT) * 4 >> xd->plane[p].subsampling_y);
           vpx_memcpy(cm->above_context[p] +
-                         ((mi_col + x_idx_m) * 4 >> (CONFIG_SB8X8 +
-                                                   xd->plane[p].subsampling_x)),
+                         ((mi_col + x_idx_m) * 2 >> xd->plane[p].subsampling_x),
                      a3 + 4 * p,
                      sizeof(ENTROPY_CONTEXT) * 4 >> xd->plane[p].subsampling_x);
         }
@@ -1175,19 +1068,17 @@
         r2 += x->partition_cost[pl][PARTITION_HORZ];
         if (RDCOST(x->rdmult, x->rddiv, r2, d2) <
                 RDCOST(x->rdmult, x->rddiv, mb16_rate, mb16_dist)) {
-          mb16_rate = r;
-          mb16_dist = d;
+          mb16_rate = r2;
+          mb16_dist = d2;
           mb_partitioning[i][j] = BLOCK_SIZE_SB16X8;
         }
         for (p = 0; p < MAX_MB_PLANE; p++) {
           vpx_memcpy(cm->left_context[p] +
-                         (y_idx_m * 4 >> (CONFIG_SB8X8 +
-                                          xd->plane[p].subsampling_y)),
+                         (y_idx_m * 2 >> xd->plane[p].subsampling_y),
                      l3 + 4 * p,
                      sizeof(ENTROPY_CONTEXT) * 4 >> xd->plane[p].subsampling_y);
           vpx_memcpy(cm->above_context[p] +
-                         ((mi_col + x_idx_m) * 4 >> (CONFIG_SB8X8 +
-                                                   xd->plane[p].subsampling_x)),
+                         ((mi_col + x_idx_m) * 2 >> xd->plane[p].subsampling_x),
                      a3 + 4 * p,
                      sizeof(ENTROPY_CONTEXT) * 4 >> xd->plane[p].subsampling_x);
         }
@@ -1207,33 +1098,20 @@
         }
         sb32_rate += mb16_rate;
         sb32_dist += mb16_dist;
-#else
-        splitmodes_used += pick_mb_mode(cpi, mi_row + y_idx_m,
-                                        mi_col + x_idx_m, tp, &r, &d);
-        sb32_rate += r;
-        sb32_dist += d;
-#endif
 
         // Dummy encode, do not do the tokenization
-#if CONFIG_SB8X8
-        encode_sb(cpi, tp, mi_row + y_idx, mi_col + x_idx, 0,
+        encode_sb(cpi, tp, mi_row + y_idx_m, mi_col + x_idx_m, 0,
                   BLOCK_SIZE_MB16X16, mb_partitioning[i][j], NULL, NULL);
-#else
-        encode_macroblock(cpi, tp, 0, mi_row + y_idx_m,
-                          mi_col + x_idx_m);
-#endif
       }
 
       /* Restore L & A coding context to those in place on entry */
       for (p = 0; p < MAX_MB_PLANE; p++) {
         vpx_memcpy(cm->left_context[p] +
-                       (y_idx * 4 >> (CONFIG_SB8X8 +
-                                      xd->plane[p].subsampling_y)),
+                       (y_idx * 2 >> xd->plane[p].subsampling_y),
                    l2 + 8 * p,
                    sizeof(ENTROPY_CONTEXT) * 8 >> xd->plane[p].subsampling_y);
         vpx_memcpy(cm->above_context[p] +
-                       ((mi_col + x_idx) * 4 >> (CONFIG_SB8X8 +
-                                                 xd->plane[p].subsampling_x)),
+                       ((mi_col + x_idx) * 2 >> xd->plane[p].subsampling_x),
                    a2 + 8 * p,
                    sizeof(ENTROPY_CONTEXT) * 8 >> xd->plane[p].subsampling_x);
       }
@@ -1248,14 +1126,14 @@
       }
 
       // check 32x16
-      if (mi_col + x_idx + (2 << CONFIG_SB8X8) <= cm->mi_cols) {
+      if (mi_col + x_idx + 4 <= cm->mi_cols) {
         int r, d;
 
         xd->mb_index = 0;
         pick_sb_modes(cpi, mi_row + y_idx, mi_col + x_idx,
                       tp, &r, &d, BLOCK_SIZE_SB32X16,
                       &x->sb32x16_context[xd->sb_index][xd->mb_index]);
-        if (mi_row + y_idx + (1 << CONFIG_SB8X8) < cm->mi_rows) {
+        if (mi_row + y_idx + 2 < cm->mi_rows) {
           int r2, d2;
 
           update_state(cpi, &x->sb32x16_context[xd->sb_index][xd->mb_index],
@@ -1264,7 +1142,7 @@
                             0, mi_row + y_idx, mi_col + x_idx,
                             BLOCK_SIZE_SB32X16);
           xd->mb_index = 1;
-          pick_sb_modes(cpi, mi_row + y_idx + (1 << CONFIG_SB8X8),
+          pick_sb_modes(cpi, mi_row + y_idx + 2,
                         mi_col + x_idx, tp, &r2, &d2, BLOCK_SIZE_SB32X16,
                         &x->sb32x16_context[xd->sb_index][xd->mb_index]);
           r += r2;
@@ -1285,27 +1163,25 @@
 
         for (p = 0; p < MAX_MB_PLANE; p++) {
           vpx_memcpy(cm->left_context[p] +
-                         (y_idx * 4 >> (CONFIG_SB8X8 +
-                                        xd->plane[p].subsampling_y)),
+                         (y_idx * 2 >> xd->plane[p].subsampling_y),
                      l2 + 8 * p,
                      sizeof(ENTROPY_CONTEXT) * 8 >> xd->plane[p].subsampling_y);
           vpx_memcpy(cm->above_context[p] +
-                         ((mi_col + x_idx) * 4 >> (CONFIG_SB8X8 +
-                                                   xd->plane[p].subsampling_x)),
+                         ((mi_col + x_idx) * 2 >> xd->plane[p].subsampling_x),
                      a2 + 8 * p,
                      sizeof(ENTROPY_CONTEXT) * 8 >> xd->plane[p].subsampling_x);
         }
       }
 
       // check 16x32
-      if (mi_row + y_idx + (2 << CONFIG_SB8X8) <= cm->mi_rows) {
+      if (mi_row + y_idx + 4 <= cm->mi_rows) {
         int r, d;
 
         xd->mb_index = 0;
         pick_sb_modes(cpi, mi_row + y_idx, mi_col + x_idx,
                       tp, &r, &d, BLOCK_SIZE_SB16X32,
                       &x->sb16x32_context[xd->sb_index][xd->mb_index]);
-        if (mi_col + x_idx + (1 << CONFIG_SB8X8) < cm->mi_cols) {
+        if (mi_col + x_idx + 2 < cm->mi_cols) {
           int r2, d2;
 
           update_state(cpi, &x->sb16x32_context[xd->sb_index][xd->mb_index],
@@ -1315,7 +1191,7 @@
                             BLOCK_SIZE_SB16X32);
           xd->mb_index = 1;
           pick_sb_modes(cpi, mi_row + y_idx,
-                        mi_col + x_idx + (1 << CONFIG_SB8X8),
+                        mi_col + x_idx + 2,
                         tp, &r2, &d2, BLOCK_SIZE_SB16X32,
                         &x->sb16x32_context[xd->sb_index][xd->mb_index]);
           r += r2;
@@ -1336,21 +1212,19 @@
 
         for (p = 0; p < MAX_MB_PLANE; p++) {
           vpx_memcpy(cm->left_context[p] +
-                         (y_idx * 4 >> (CONFIG_SB8X8 +
-                                        xd->plane[p].subsampling_y)),
+                         (y_idx * 2 >> xd->plane[p].subsampling_y),
                      l2 + 8 * p,
                      sizeof(ENTROPY_CONTEXT) * 8 >> xd->plane[p].subsampling_y);
           vpx_memcpy(cm->above_context[p] +
-                         ((mi_col + x_idx) * 4 >> (CONFIG_SB8X8 +
-                                                   xd->plane[p].subsampling_x)),
+                         ((mi_col + x_idx) * 2 >> xd->plane[p].subsampling_x),
                      a2 + 8 * p,
                      sizeof(ENTROPY_CONTEXT) * 8 >> xd->plane[p].subsampling_x);
         }
       }
 
       if (!sb32_skip &&
-          mi_col + x_idx + (2 << CONFIG_SB8X8) <= cm->mi_cols &&
-          mi_row + y_idx + (2 << CONFIG_SB8X8) <= cm->mi_rows) {
+          mi_col + x_idx + 4 <= cm->mi_cols &&
+          mi_row + y_idx + 4 <= cm->mi_rows) {
         int r, d;
 
         /* Pick a mode assuming that it applies to all 4 of the MBs in the SB */
@@ -1385,23 +1259,19 @@
       // instead of small->big) means we can use as threshold for small, which
       // may enable breakouts if RD is not good enough (i.e. faster)
       encode_sb(cpi, tp, mi_row + y_idx, mi_col + x_idx, 0,
-#if CONFIG_SB8X8
                 BLOCK_SIZE_SB32X32, sb_partitioning[i], mb_partitioning[i],
                 NULL);
-#else
-                BLOCK_SIZE_SB32X32, sb_partitioning[i], NULL);
-#endif
     }
 
     for (p = 0; p < MAX_MB_PLANE; p++) {
       memcpy(cm->above_context[p] +
-                 (mi_col * 4 >> (CONFIG_SB8X8 + xd->plane[p].subsampling_x)),
+                 (mi_col * 2 >> xd->plane[p].subsampling_x),
              a + 16 * p,
              sizeof(ENTROPY_CONTEXT) * 16 >> xd->plane[p].subsampling_x);
       memcpy(cm->left_context[p], l + 16 * p,
              sizeof(ENTROPY_CONTEXT) * 16 >> xd->plane[p].subsampling_y);
     }
-    memcpy(cm->above_seg_context + (mi_col >> CONFIG_SB8X8), &seg_a,
+    memcpy(cm->above_seg_context + (mi_col >> 1), &seg_a,
            sizeof(seg_a));
     memcpy(cm->left_seg_context, &seg_l, sizeof(seg_l));
 
@@ -1410,14 +1280,14 @@
     sb64_rate += x->partition_cost[pl][PARTITION_SPLIT];
 
     // check 64x32
-    if (mi_col + (4 << CONFIG_SB8X8) <= cm->mi_cols && !(cm->mb_rows & 1)) {
+    if (mi_col + 8 <= cm->mi_cols && !(cm->mb_rows & 1)) {
       int r, d;
 
       xd->sb_index = 0;
       pick_sb_modes(cpi, mi_row, mi_col,
                     tp, &r, &d, BLOCK_SIZE_SB64X32,
                     &x->sb64x32_context[xd->sb_index]);
-      if (mi_row + (2 << CONFIG_SB8X8) != cm->mi_rows) {
+      if (mi_row + 4 != cm->mi_rows) {
         int r2, d2;
 
         update_state(cpi, &x->sb64x32_context[xd->sb_index],
@@ -1425,7 +1295,7 @@
         encode_superblock(cpi, tp,
                           0, mi_row, mi_col, BLOCK_SIZE_SB64X32);
         xd->sb_index = 1;
-        pick_sb_modes(cpi, mi_row + (2 << CONFIG_SB8X8), mi_col,
+        pick_sb_modes(cpi, mi_row + 4, mi_col,
                       tp, &r2, &d2, BLOCK_SIZE_SB64X32,
                       &x->sb64x32_context[xd->sb_index]);
         r += r2;
@@ -1446,7 +1316,7 @@
 
       for (p = 0; p < MAX_MB_PLANE; p++) {
         memcpy(cm->above_context[p] +
-                   (mi_col * 4 >> (CONFIG_SB8X8 + xd->plane[p].subsampling_x)),
+                   (mi_col * 2 >> xd->plane[p].subsampling_x),
                a + 16 * p,
                sizeof(ENTROPY_CONTEXT) * 16 >> xd->plane[p].subsampling_x);
         memcpy(cm->left_context[p], l + 16 * p,
@@ -1455,14 +1325,14 @@
     }
 
     // check 32x64
-    if (mi_row + (4 << CONFIG_SB8X8) <= cm->mi_rows && !(cm->mb_cols & 1)) {
+    if (mi_row + 8 <= cm->mi_rows && !(cm->mb_cols & 1)) {
       int r, d;
 
       xd->sb_index = 0;
       pick_sb_modes(cpi, mi_row, mi_col,
                     tp, &r, &d, BLOCK_SIZE_SB32X64,
                     &x->sb32x64_context[xd->sb_index]);
-      if (mi_col + (2 << CONFIG_SB8X8) != cm->mi_cols) {
+      if (mi_col + 4 != cm->mi_cols) {
         int r2, d2;
 
         update_state(cpi, &x->sb32x64_context[xd->sb_index],
@@ -1470,7 +1340,7 @@
         encode_superblock(cpi, tp,
                           0, mi_row, mi_col, BLOCK_SIZE_SB32X64);
         xd->sb_index = 1;
-        pick_sb_modes(cpi, mi_row, mi_col + (2 << CONFIG_SB8X8),
+        pick_sb_modes(cpi, mi_row, mi_col + 4,
                       tp, &r2, &d2, BLOCK_SIZE_SB32X64,
                       &x->sb32x64_context[xd->sb_index]);
         r += r2;
@@ -1491,7 +1361,7 @@
 
       for (p = 0; p < MAX_MB_PLANE; p++) {
         memcpy(cm->above_context[p] +
-                   (mi_col * 4 >> (CONFIG_SB8X8 + xd->plane[p].subsampling_x)),
+                   (mi_col * 2 >> xd->plane[p].subsampling_x),
                a + 16 * p,
                sizeof(ENTROPY_CONTEXT) * 16 >> xd->plane[p].subsampling_x);
         memcpy(cm->left_context[p], l + 16 * p,
@@ -1500,8 +1370,8 @@
     }
 
     if (!sb64_skip &&
-        mi_col + (4 << CONFIG_SB8X8) <= cm->mi_cols &&
-        mi_row + (4 << CONFIG_SB8X8) <= cm->mi_rows) {
+        mi_col + 8 <= cm->mi_cols &&
+        mi_row + 8 <= cm->mi_rows) {
       int r, d;
 
       pick_sb_modes(cpi, mi_row, mi_col, tp, &r, &d,
@@ -1521,11 +1391,7 @@
 
     assert(tp_orig == *tp);
     encode_sb(cpi, tp, mi_row, mi_col, 1, BLOCK_SIZE_SB64X64,
-#if CONFIG_SB8X8
               sb64_partitioning, sb_partitioning, mb_partitioning);
-#else
-              sb64_partitioning, sb_partitioning);
-#endif
     assert(tp_orig < *tp);
   }
 }
@@ -1557,9 +1423,6 @@
                    0, 0, NULL, NULL);
   setup_dst_planes(xd, &cm->yv12_fb[cm->new_fb_idx], 0, 0);
 
-  // set up frame for intra coded blocks
-  vp9_setup_intra_recon(&cm->yv12_fb[cm->new_fb_idx]);
-
   vp9_build_block_offsets(x);
 
   vp9_setup_block_dptrs(&x->e_mbd);
@@ -1570,14 +1433,8 @@
   vp9_zero(cpi->count_mb_ref_frame_usage)
   vp9_zero(cpi->bmode_count)
   vp9_zero(cpi->ymode_count)
-#if !CONFIG_SB8X8
-  vp9_zero(cpi->i8x8_mode_count)
-#endif
   vp9_zero(cpi->y_uv_mode_count)
   vp9_zero(cpi->sub_mv_ref_count)
-#if !CONFIG_SB8X8
-  vp9_zero(cpi->mbsplit_count)
-#endif
   vp9_zero(cpi->common.fc.mv_ref_ct)
   vp9_zero(cpi->sb_ymode_count)
   vp9_zero(cpi->partition_count);
@@ -1715,7 +1572,7 @@
           vp9_get_tile_col_offsets(cm, tile_col);
           for (mi_row = cm->cur_tile_mi_row_start;
                mi_row < cm->cur_tile_mi_row_end;
-               mi_row += (4 << CONFIG_SB8X8)) {
+               mi_row += 8) {
             encode_sb_row(cpi, mi_row, &tp, &totalrate);
           }
           cpi->tok_count[tile_col] = (unsigned int)(tp - tp_old);
@@ -1844,17 +1701,11 @@
     assert(bwl < bsl && bhl < bsl);
     if (bsize == BLOCK_SIZE_SB64X64) {
       subsize = BLOCK_SIZE_SB32X32;
-#if CONFIG_SB8X8
     } else if (bsize == BLOCK_SIZE_SB32X32) {
       subsize = BLOCK_SIZE_MB16X16;
     } else {
       assert(bsize == BLOCK_SIZE_MB16X16);
       subsize = BLOCK_SIZE_SB8X8;
-#else
-    } else {
-      assert(bsize == BLOCK_SIZE_SB32X32);
-      subsize = BLOCK_SIZE_MB16X16;
-#endif
     }
 
     for (n = 0; n < 4; n++) {
@@ -1874,10 +1725,10 @@
   MODE_INFO *mi, *mi_ptr = cm->mi;
 
   for (mi_row = 0; mi_row < cm->mi_rows;
-       mi_row += (4 << CONFIG_SB8X8), mi_ptr += (4 << CONFIG_SB8X8) * mis) {
+       mi_row += 8, mi_ptr += 8 * mis) {
     mi = mi_ptr;
     for (mi_col = 0; mi_col < cm->mi_cols;
-         mi_col += (4 << CONFIG_SB8X8), mi += (4 << CONFIG_SB8X8)) {
+         mi_col += 8, mi += 8) {
       reset_skip_txfm_size_sb(cpi, mi, txfm_max,
                               mi_row, mi_col, BLOCK_SIZE_SB64X64);
     }
@@ -2059,36 +1910,16 @@
 
     do {
       ++ bct[xd->block[b].bmi.as_mode.first];
-    } while (++b < (16 >> (CONFIG_SB8X8 * 2)));
-  }
-
-#if !CONFIG_SB8X8
-  if (m == I8X8_PRED) {
-    i8x8_modes[xd->block[0].bmi.as_mode.first]++;
-    i8x8_modes[xd->block[2].bmi.as_mode.first]++;
-    i8x8_modes[xd->block[8].bmi.as_mode.first]++;
-    i8x8_modes[xd->block[10].bmi.as_mode.first]++;
+    } while (++b < 4);
   }
 #endif
-#endif
 
-  if (xd->mode_info_context->mbmi.sb_type > BLOCK_SIZE_MB16X16) {
+  if (xd->mode_info_context->mbmi.sb_type > BLOCK_SIZE_SB8X8) {
     ++cpi->sb_ymode_count[m];
   } else {
     ++cpi->ymode_count[m];
   }
-#if !CONFIG_SB8X8
-  if (m != I8X8_PRED)
-#endif
     ++cpi->y_uv_mode_count[m][uvm];
-#if !CONFIG_SB8X8
-  else {
-    cpi->i8x8_mode_count[xd->mode_info_context->bmi[0].as_mode.first]++;
-    cpi->i8x8_mode_count[xd->mode_info_context->bmi[2].as_mode.first]++;
-    cpi->i8x8_mode_count[xd->mode_info_context->bmi[8].as_mode.first]++;
-    cpi->i8x8_mode_count[xd->mode_info_context->bmi[10].as_mode.first]++;
-  }
-#endif
   if (m == I4X4_PRED) {
     int b = 0;
     do {
@@ -2097,7 +1928,7 @@
       if (m == B_CONTEXT_PRED) m -= CONTEXT_PRED_REPLACEMENTS;
 #endif
       ++cpi->bmode_count[m];
-    } while (++b < (16 >> (CONFIG_SB8X8 * 2)));
+    } while (++b < 4);
   }
 }
 
@@ -2122,256 +1953,6 @@
 #endif
 }
 
-#if !CONFIG_SB8X8
-static void encode_macroblock(VP9_COMP *cpi, TOKENEXTRA **t,
-                              int output_enabled,
-                              int mi_row, int mi_col) {
-  VP9_COMMON *const cm = &cpi->common;
-  MACROBLOCK *const x = &cpi->mb;
-  MACROBLOCKD *const xd = &x->e_mbd;
-  MODE_INFO *mi = xd->mode_info_context;
-  MB_MODE_INFO *const mbmi = &mi->mbmi;
-  const int mis = cm->mode_info_stride;
-#if CONFIG_SB8X8
-  int n;
-#endif
-
-  assert(xd->mode_info_context->mbmi.sb_type == BLOCK_SIZE_MB16X16);
-
-#ifdef ENC_DEBUG
-  enc_debug = (cpi->common.current_video_frame == 11 && cm->show_frame &&
-               mb_row == 8 && mb_col == 0 && output_enabled);
-  if (enc_debug)
-    printf("Encode MB %d %d output %d\n", mb_row, mb_col, output_enabled);
-#endif
-  if (cm->frame_type == KEY_FRAME) {
-    if (cpi->oxcf.tuning == VP8_TUNE_SSIM && output_enabled) {
-      // Adjust the zbin based on this MB rate.
-      adjust_act_zbin(cpi, x);
-      vp9_update_zbin_extra(cpi, x);
-    }
-  } else {
-    vp9_setup_interp_filters(xd, mbmi->interp_filter, cm);
-
-    if (cpi->oxcf.tuning == VP8_TUNE_SSIM) {
-      // Adjust the zbin based on this MB rate.
-      adjust_act_zbin(cpi, x);
-    }
-
-    // Experimental code. Special case for gf and arf zeromv modes.
-    // Increase zbin size to suppress noise
-    cpi->zbin_mode_boost = 0;
-    if (cpi->zbin_mode_boost_enabled) {
-      if (mbmi->ref_frame != INTRA_FRAME) {
-        if (mbmi->mode == ZEROMV) {
-          if (mbmi->ref_frame != LAST_FRAME)
-            cpi->zbin_mode_boost = GF_ZEROMV_ZBIN_BOOST;
-          else
-            cpi->zbin_mode_boost = LF_ZEROMV_ZBIN_BOOST;
-        } else if (mbmi->mode == SPLITMV)
-          cpi->zbin_mode_boost = SPLIT_MV_ZBIN_BOOST;
-        else
-          cpi->zbin_mode_boost = MV_ZBIN_BOOST;
-      } else {
-        cpi->zbin_mode_boost = INTRA_ZBIN_BOOST;
-      }
-    }
-
-    vp9_update_zbin_extra(cpi, x);
-  }
-
-  if (mbmi->ref_frame == INTRA_FRAME) {
-#if 0  // def ENC_DEBUG
-    if (enc_debug) {
-      printf("Mode %d skip %d tx_size %d\n", mbmi->mode, x->skip,
-             mbmi->txfm_size);
-    }
-#endif
-    if (mbmi->mode == I4X4_PRED) {
-      vp9_encode_intra16x16mbuv(cm, x);
-      vp9_encode_intra4x4mby(x, BLOCK_SIZE_MB16X16);
-    } else if (mbmi->mode == I8X8_PRED) {
-      vp9_encode_intra8x8mby(x);
-      vp9_encode_intra8x8mbuv(x);
-    } else {
-      vp9_encode_intra16x16mbuv(cm, x);
-      vp9_encode_intra16x16mby(cm, x);
-    }
-
-    if (output_enabled)
-      sum_intra_stats(cpi, x);
-  } else {
-    int ref_fb_idx, second_ref_fb_idx;
-#ifdef ENC_DEBUG
-    if (enc_debug)
-      printf("Mode %d skip %d tx_size %d ref %d ref2 %d mv %d %d interp %d\n",
-             mbmi->mode, x->skip, mbmi->txfm_size,
-             mbmi->ref_frame, mbmi->second_ref_frame,
-             mbmi->mv[0].as_mv.row, mbmi->mv[0].as_mv.col,
-             mbmi->interp_filter);
-#endif
-
-    assert(cm->frame_type != KEY_FRAME);
-
-    if (mbmi->ref_frame == LAST_FRAME)
-      ref_fb_idx = cpi->common.ref_frame_map[cpi->lst_fb_idx];
-    else if (mbmi->ref_frame == GOLDEN_FRAME)
-      ref_fb_idx = cpi->common.ref_frame_map[cpi->gld_fb_idx];
-    else
-      ref_fb_idx = cpi->common.ref_frame_map[cpi->alt_fb_idx];
-
-    if (mbmi->second_ref_frame > 0) {
-      if (mbmi->second_ref_frame == LAST_FRAME)
-        second_ref_fb_idx = cpi->common.ref_frame_map[cpi->lst_fb_idx];
-      else if (mbmi->second_ref_frame == GOLDEN_FRAME)
-        second_ref_fb_idx = cpi->common.ref_frame_map[cpi->gld_fb_idx];
-      else
-        second_ref_fb_idx = cpi->common.ref_frame_map[cpi->alt_fb_idx];
-    }
-
-    setup_pre_planes(xd,
-        &cpi->common.yv12_fb[ref_fb_idx],
-        mbmi->second_ref_frame > 0 ? &cpi->common.yv12_fb[second_ref_fb_idx]
-                                   : NULL,
-        mi_row, mi_col, xd->scale_factor, xd->scale_factor_uv);
-
-    if (!x->skip) {
-      vp9_encode_inter16x16(cm, x, mi_row, mi_col);
-    } else {
-      vp9_build_inter_predictors_sb(xd, mi_row, mi_col, BLOCK_SIZE_MB16X16);
-#if CONFIG_COMP_INTERINTRA_PRED
-      if (xd->mode_info_context->mbmi.second_ref_frame == INTRA_FRAME) {
-        vp9_build_interintra_predictors(xd,
-                                        xd->plane[0].dst.buf,
-                                        xd->plane[1].dst.buf,
-                                        xd->plane[2].dst.buf,
-                                        xd->plane[0].dst.stride,
-                                        xd->plane[1].dst.stride,
-                                        BLOCK_SIZE_MB16X16);
-      }
-#endif
-    }
-  }
-
-  if (!x->skip) {
-#ifdef ENC_DEBUG
-    if (enc_debug) {
-      int i, j;
-      printf("\n");
-      printf("qcoeff\n");
-      for (i = 0; i < 384; i++) {
-        printf("%3d ", xd->qcoeff[i]);
-        if (i % 16 == 15) printf("\n");
-      }
-      printf("\n");
-      printf("predictor\n");
-      for (i = 0; i < 384; i++) {
-        printf("%3d ", xd->predictor[i]);
-        if (i % 16 == 15) printf("\n");
-      }
-      printf("\n");
-      printf("src_diff\n");
-      for (i = 0; i < 384; i++) {
-        printf("%3d ", x->src_diff[i]);
-        if (i % 16 == 15) printf("\n");
-      }
-      printf("\n");
-      printf("diff\n");
-      for (i = 0; i < 384; i++) {
-        printf("%3d ", xd->block[0].diff[i]);
-        if (i % 16 == 15) printf("\n");
-      }
-      printf("\n");
-      printf("final y\n");
-      for (i = 0; i < 16; i++) {
-        for (j = 0; j < 16; j++)
-          printf("%3d ", xd->plane[0].dst.buf[i * xd->plane[0].dst.stride + j]);
-        printf("\n");
-      }
-      printf("\n");
-      printf("final u\n");
-      for (i = 0; i < 8; i++) {
-        for (j = 0; j < 8; j++)
-          printf("%3d ", xd->plane[1].dst.buf[i * xd->plane[1].dst.stride + j]);
-        printf("\n");
-      }
-      printf("\n");
-      printf("final v\n");
-      for (i = 0; i < 8; i++) {
-        for (j = 0; j < 8; j++)
-          printf("%3d ", xd->plane[2].dst.buf[i * xd->plane[1].dst.stride + j]);
-        printf("\n");
-      }
-      fflush(stdout);
-    }
-#endif
-
-    vp9_tokenize_sb(cpi, xd, t, !output_enabled, BLOCK_SIZE_MB16X16);
-  } else {
-    // FIXME(rbultje): not tile-aware (mi - 1)
-    int mb_skip_context =
-      (mi - 1)->mbmi.mb_skip_coeff + (mi - mis)->mbmi.mb_skip_coeff;
-
-    mbmi->mb_skip_coeff = 1;
-    if (output_enabled)
-      cpi->skip_true_count[mb_skip_context]++;
-    vp9_reset_sb_tokens_context(xd, BLOCK_SIZE_MB16X16);
-  }
-
-#if CONFIG_SB8X8
-  // copy skip flag on all mb_mode_info contexts in this SB
-  // if this was a skip at this txfm size
-  for (n = 1; n < 4; n++) {
-    const int x_idx = n & 1, y_idx = n >> 1;
-    if (mi_col + x_idx < cm->mi_cols && mi_row + y_idx < cm->mi_rows)
-      mi[x_idx + y_idx * mis].mbmi.mb_skip_coeff = mi->mbmi.mb_skip_coeff;
-  }
-#endif
-
-  if (output_enabled) {
-    int segment_id = mbmi->segment_id;
-    if (cpi->common.txfm_mode == TX_MODE_SELECT &&
-        !(mbmi->mb_skip_coeff ||
-          vp9_segfeature_active(&x->e_mbd, segment_id, SEG_LVL_SKIP))) {
-      assert(mbmi->txfm_size <= TX_16X16);
-      if (mbmi->mode != I4X4_PRED && mbmi->mode != I8X8_PRED &&
-          mbmi->mode != SPLITMV) {
-        cpi->txfm_count_16x16p[mbmi->txfm_size]++;
-      } else if (mbmi->mode == I8X8_PRED ||
-                 (mbmi->mode == SPLITMV &&
-                  mbmi->partitioning != PARTITIONING_4X4)) {
-        cpi->txfm_count_8x8p[mbmi->txfm_size]++;
-      }
-    } else {
-#if CONFIG_SB8X8
-      int y, x;
-#endif
-      if (mbmi->mode != I4X4_PRED && mbmi->mode != I8X8_PRED &&
-          mbmi->mode != SPLITMV && cpi->common.txfm_mode >= ALLOW_16X16) {
-        mbmi->txfm_size = TX_16X16;
-      } else if (mbmi->mode != I4X4_PRED &&
-                 !(mbmi->mode == SPLITMV &&
-                   mbmi->partitioning == PARTITIONING_4X4) &&
-                 cpi->common.txfm_mode >= ALLOW_8X8) {
-        mbmi->txfm_size = TX_8X8;
-      } else {
-        mbmi->txfm_size = TX_4X4;
-      }
-
-#if CONFIG_SB8X8
-      for (y = 0; y < 2; y++) {
-        for (x = !y; x < 2; x++) {
-          if (mi_col + x < cm->mi_cols && mi_row + y < cm->mi_rows) {
-            mi[mis * y + x].mbmi.txfm_size = mbmi->txfm_size;
-          }
-        }
-      }
-#endif
-    }
-  }
-}
-#endif
-
 static void encode_superblock(VP9_COMP *cpi, TOKENEXTRA **t,
                               int output_enabled, int mi_row, int mi_col,
                               BLOCK_SIZE_TYPE bsize) {
@@ -2421,26 +2002,17 @@
     vp9_update_zbin_extra(cpi, x);
   }
 
-#if CONFIG_SB8X8
   if (xd->mode_info_context->mbmi.mode == I4X4_PRED) {
     assert(bsize == BLOCK_SIZE_SB8X8 &&
            xd->mode_info_context->mbmi.txfm_size == TX_4X4);
 
     vp9_encode_intra4x4mby(x, bsize);
     vp9_build_intra_predictors_sbuv_s(&x->e_mbd, bsize);
-    vp9_subtract_sbuv(x, bsize);
-    vp9_transform_sbuv_4x4(x, bsize);
-    vp9_quantize_sbuv_4x4(x, bsize);
-    if (x->optimize)
-      vp9_optimize_sbuv(cm, x, bsize);
-    vp9_inverse_transform_sbuv_4x4(xd, bsize);
-    vp9_recon_sbuv(xd, bsize);
+    vp9_encode_sbuv(cm, x, bsize);
 
     if (output_enabled)
       sum_intra_stats(cpi, x);
-  } else
-#endif
-  if (xd->mode_info_context->mbmi.ref_frame == INTRA_FRAME) {
+  } else if (xd->mode_info_context->mbmi.ref_frame == INTRA_FRAME) {
     vp9_build_intra_predictors_sby_s(&x->e_mbd, bsize);
     vp9_build_intra_predictors_sbuv_s(&x->e_mbd, bsize);
     if (output_enabled)
@@ -2475,97 +2047,11 @@
     vp9_build_inter_predictors_sb(xd, mi_row, mi_col, bsize);
   }
 
-#if CONFIG_SB8X8
   if (xd->mode_info_context->mbmi.mode == I4X4_PRED) {
     assert(bsize == BLOCK_SIZE_SB8X8);
     vp9_tokenize_sb(cpi, &x->e_mbd, t, !output_enabled, bsize);
-  } else
-#endif
-  if (!x->skip) {
-    vp9_subtract_sb(x, bsize);
-
-    switch (xd->mode_info_context->mbmi.txfm_size) {
-      case TX_32X32:
-        vp9_transform_sby_32x32(x, bsize);
-        vp9_quantize_sby_32x32(x, bsize);
-        if (bsize == BLOCK_SIZE_SB64X64) {
-          vp9_transform_sbuv_32x32(x, bsize);
-          vp9_quantize_sbuv_32x32(x, bsize);
-        } else {
-          vp9_transform_sbuv_16x16(x, bsize);
-          vp9_quantize_sbuv_16x16(x, bsize);
-        }
-        if (x->optimize) {
-          vp9_optimize_sby(cm, x, bsize);
-          if (bsize == BLOCK_SIZE_SB64X64)
-            vp9_optimize_sbuv(cm, x, bsize);
-          else
-            vp9_optimize_sbuv(cm, x, bsize);
-        }
-        vp9_inverse_transform_sby_32x32(xd, bsize);
-        if (bsize == BLOCK_SIZE_SB64X64)
-          vp9_inverse_transform_sbuv_32x32(xd, bsize);
-        else
-          vp9_inverse_transform_sbuv_16x16(xd, bsize);
-        break;
-      case TX_16X16:
-        vp9_transform_sby_16x16(x, bsize);
-        vp9_quantize_sby_16x16(x, bsize);
-        if (bsize >= BLOCK_SIZE_SB32X32) {
-          vp9_transform_sbuv_16x16(x, bsize);
-          vp9_quantize_sbuv_16x16(x, bsize);
-        } else {
-          vp9_transform_sbuv_8x8(x, bsize);
-          vp9_quantize_sbuv_8x8(x, bsize);
-        }
-        if (x->optimize) {
-          vp9_optimize_sby(cm, x, bsize);
-          if (bsize >= BLOCK_SIZE_SB32X32)
-            vp9_optimize_sbuv(cm, x, bsize);
-          else
-            vp9_optimize_sbuv(cm, x, bsize);
-        }
-        vp9_inverse_transform_sby_16x16(xd, bsize);
-        if (bsize >= BLOCK_SIZE_SB32X32)
-          vp9_inverse_transform_sbuv_16x16(xd, bsize);
-        else
-          vp9_inverse_transform_sbuv_8x8(xd, bsize);
-        break;
-      case TX_8X8:
-        vp9_transform_sby_8x8(x, bsize);
-        vp9_quantize_sby_8x8(x, bsize);
-        if (x->optimize)
-          vp9_optimize_sby(cm, x, bsize);
-        vp9_inverse_transform_sby_8x8(xd, bsize);
-        if (bsize >= BLOCK_SIZE_MB16X16) {
-          vp9_transform_sbuv_8x8(x, bsize);
-          vp9_quantize_sbuv_8x8(x, bsize);
-          if (x->optimize)
-            vp9_optimize_sbuv(cm, x, bsize);
-          vp9_inverse_transform_sbuv_8x8(xd, bsize);
-        } else {
-          vp9_transform_sbuv_4x4(x, bsize);
-          vp9_quantize_sbuv_4x4(x, bsize);
-          if (x->optimize)
-            vp9_optimize_sbuv(cm, x, bsize);
-          vp9_inverse_transform_sbuv_4x4(xd, bsize);
-        }
-        break;
-      case TX_4X4:
-        vp9_transform_sby_4x4(x, bsize);
-        vp9_transform_sbuv_4x4(x, bsize);
-        vp9_quantize_sby_4x4(x, bsize);
-        vp9_quantize_sbuv_4x4(x, bsize);
-        if (x->optimize) {
-          vp9_optimize_sby(cm, x, bsize);
-          vp9_optimize_sbuv(cm, x, bsize);
-        }
-        vp9_inverse_transform_sby_4x4(xd, bsize);
-        vp9_inverse_transform_sbuv_4x4(xd, bsize);
-        break;
-      default: assert(0);
-    }
-    vp9_recon_sb_c(xd, bsize);
+  } else if (!x->skip) {
+    vp9_encode_sb(cm, x, bsize);
     vp9_tokenize_sb(cpi, &x->e_mbd, t, !output_enabled, bsize);
   } else {
     // FIXME(rbultje): not tile-aware (mi - 1)
@@ -2605,6 +2091,9 @@
         sz = TX_16X16;
       if (sz == TX_16X16 && bsize < BLOCK_SIZE_MB16X16)
         sz = TX_8X8;
+      if (sz == TX_8X8 && (xd->mode_info_context->mbmi.mode == SPLITMV ||
+                           xd->mode_info_context->mbmi.mode == I4X4_PRED))
+        sz = TX_4X4;
 
       for (y = 0; y < bh; y++) {
         for (x = 0; x < bw; x++) {
diff --git a/vp9/encoder/vp9_encodeintra.c b/vp9/encoder/vp9_encodeintra.c
index c5f29fe..72a6603 100644
--- a/vp9/encoder/vp9_encodeintra.c
+++ b/vp9/encoder/vp9_encodeintra.c
@@ -22,15 +22,12 @@
   MB_MODE_INFO * mbmi = &x->e_mbd.mode_info_context->mbmi;
   (void) cpi;
 
-#if !CONFIG_SB8X8
   if (use_16x16_pred) {
-#endif
     mbmi->mode = DC_PRED;
     mbmi->uv_mode = DC_PRED;
     mbmi->ref_frame = INTRA_FRAME;
 
     vp9_encode_intra16x16mby(&cpi->common, x);
-#if !CONFIG_SB8X8
   } else {
     int i;
 
@@ -39,7 +36,6 @@
       encode_intra4x4block(x, i, BLOCK_SIZE_MB16X16);
     }
   }
-#endif
 
   return vp9_get_mb_ss(x->plane[0].src_diff);
 }
@@ -61,36 +57,37 @@
       raster_block_offset_int16(xd, bsize, 0, ib,
                                 xd->plane[0].diff);
   int16_t* const coeff = BLOCK_OFFSET(x->plane[0].coeff, ib, 16);
+  const int bwl = b_width_log2(bsize), bhl = b_height_log2(bsize);
 
-  assert(ib < (16 >> (2 * CONFIG_SB8X8)));
+  assert(ib < (1 << (bwl + bhl)));
 
 #if CONFIG_NEWBINTRAMODES
   xd->mode_info_context->bmi[ib].as_mode.context =
     vp9_find_bpred_context(&x->e_mbd, ib, dst, xd->plane[0].dst.stride);
 #endif
 
-  vp9_intra4x4_predict(&x->e_mbd, ib,
+  vp9_intra4x4_predict(&x->e_mbd, ib, bsize,
                        xd->mode_info_context->bmi[ib].as_mode.first,
                        dst, xd->plane[0].dst.stride);
-  vp9_subtract_block(4, 4, src_diff, 16 >> CONFIG_SB8X8,
+  vp9_subtract_block(4, 4, src_diff, 4 << bwl,
                      src, x->plane[0].src.stride,
                      dst, xd->plane[0].dst.stride);
 
   tx_type = get_tx_type_4x4(&x->e_mbd, ib);
   if (tx_type != DCT_DCT) {
-    vp9_short_fht4x4(src_diff, coeff, 16 >> CONFIG_SB8X8, tx_type);
+    vp9_short_fht4x4(src_diff, coeff, 4 << bwl, tx_type);
     x->quantize_b_4x4(x, ib, tx_type, 16);
     vp9_short_iht4x4(BLOCK_OFFSET(xd->plane[0].dqcoeff, ib, 16),
-                     diff, 16 >> CONFIG_SB8X8, tx_type);
+                     diff, 4 << bwl, tx_type);
   } else {
-    x->fwd_txm4x4(src_diff, coeff, 32 >> CONFIG_SB8X8);
+    x->fwd_txm4x4(src_diff, coeff, 8 << bwl);
     x->quantize_b_4x4(x, ib, tx_type, 16);
     vp9_inverse_transform_b_4x4(&x->e_mbd, xd->plane[0].eobs[ib],
                                 BLOCK_OFFSET(xd->plane[0].dqcoeff, ib, 16),
-                                diff, 32 >> CONFIG_SB8X8);
+                                diff, 8 << bwl);
   }
 
-  vp9_recon_b(dst, diff, dst, xd->plane[0].dst.stride);
+  vp9_recon_b(dst, diff, 4 << bwl, dst, xd->plane[0].dst.stride);
 }
 
 void vp9_encode_intra4x4mby(MACROBLOCK *mb, BLOCK_SIZE_TYPE bsize) {
@@ -104,209 +101,14 @@
 
 void vp9_encode_intra16x16mby(VP9_COMMON *const cm, MACROBLOCK *x) {
   MACROBLOCKD *xd = &x->e_mbd;
-  TX_SIZE tx_size = xd->mode_info_context->mbmi.txfm_size;
 
   vp9_build_intra_predictors_sby_s(xd, BLOCK_SIZE_MB16X16);
-  vp9_subtract_sby(x, BLOCK_SIZE_MB16X16);
-
-  switch (tx_size) {
-    case TX_16X16:
-      vp9_transform_sby_16x16(x, BLOCK_SIZE_MB16X16);
-      vp9_quantize_sby_16x16(x, BLOCK_SIZE_MB16X16);
-      if (x->optimize)
-        vp9_optimize_sby(cm, x, BLOCK_SIZE_MB16X16);
-      vp9_inverse_transform_sby_16x16(xd, BLOCK_SIZE_MB16X16);
-      break;
-    case TX_8X8:
-      vp9_transform_sby_8x8(x, BLOCK_SIZE_MB16X16);
-      vp9_quantize_sby_8x8(x, BLOCK_SIZE_MB16X16);
-      if (x->optimize)
-        vp9_optimize_sby(cm, x, BLOCK_SIZE_MB16X16);
-      vp9_inverse_transform_sby_8x8(xd, BLOCK_SIZE_MB16X16);
-      break;
-    default:
-      vp9_transform_sby_4x4(x, BLOCK_SIZE_MB16X16);
-      vp9_quantize_sby_4x4(x, BLOCK_SIZE_MB16X16);
-      if (x->optimize)
-        vp9_optimize_sby(cm, x, BLOCK_SIZE_MB16X16);
-      vp9_inverse_transform_sby_4x4(xd, BLOCK_SIZE_MB16X16);
-      break;
-  }
-
-  vp9_recon_sby(xd, BLOCK_SIZE_MB16X16);
+  vp9_encode_sby(cm, x, BLOCK_SIZE_MB16X16);
 }
 
 void vp9_encode_intra16x16mbuv(VP9_COMMON *const cm, MACROBLOCK *x) {
   MACROBLOCKD *xd = &x->e_mbd;
-  TX_SIZE tx_size = xd->mode_info_context->mbmi.txfm_size;
 
   vp9_build_intra_predictors_sbuv_s(xd, BLOCK_SIZE_MB16X16);
-  vp9_subtract_sbuv(x, BLOCK_SIZE_MB16X16);
-
-  switch (tx_size) {
-    case TX_4X4:
-      vp9_transform_sbuv_4x4(x, BLOCK_SIZE_MB16X16);
-      vp9_quantize_sbuv_4x4(x, BLOCK_SIZE_MB16X16);
-      if (x->optimize)
-        vp9_optimize_sbuv(cm, x, BLOCK_SIZE_MB16X16);
-      vp9_inverse_transform_sbuv_4x4(xd, BLOCK_SIZE_MB16X16);
-      break;
-    default:  // 16x16 or 8x8
-      vp9_transform_sbuv_8x8(x, BLOCK_SIZE_MB16X16);
-      vp9_quantize_sbuv_8x8(x, BLOCK_SIZE_MB16X16);
-      if (x->optimize)
-        vp9_optimize_sbuv(cm, x, BLOCK_SIZE_MB16X16);
-      vp9_inverse_transform_sbuv_8x8(xd, BLOCK_SIZE_MB16X16);
-      break;
-    }
-
-  vp9_recon_sbuv(xd, BLOCK_SIZE_MB16X16);
+  vp9_encode_sbuv(cm, x, BLOCK_SIZE_MB16X16);
 }
-
-#if !CONFIG_SB8X8
-void vp9_encode_intra8x8(MACROBLOCK *x, int ib) {
-  MACROBLOCKD *xd = &x->e_mbd;
-  uint8_t* const src =
-      raster_block_offset_uint8(xd, BLOCK_SIZE_MB16X16, 0, ib,
-                                x->plane[0].src.buf, x->plane[0].src.stride);
-  int16_t* const src_diff =
-      raster_block_offset_int16(xd, BLOCK_SIZE_MB16X16, 0, ib,
-                                x->plane[0].src_diff);
-  int16_t* const diff =
-      raster_block_offset_int16(xd, BLOCK_SIZE_MB16X16, 0, ib,
-                                xd->plane[0].diff);
-  uint8_t* const dst =
-      raster_block_offset_uint8(xd, BLOCK_SIZE_MB16X16, 0, ib,
-                                xd->plane[0].dst.buf, xd->plane[0].dst.stride);
-  const int iblock[4] = {0, 1, 4, 5};
-  int i;
-  TX_TYPE tx_type;
-
-  vp9_intra8x8_predict(xd, ib, xd->mode_info_context->bmi[ib].as_mode.first,
-                       dst, xd->plane[0].dst.stride);
-  // generate residual blocks
-  vp9_subtract_block(8, 8, src_diff, 16,
-                     src, x->plane[0].src.stride,
-                     dst, xd->plane[0].dst.stride);
-
-  if (xd->mode_info_context->mbmi.txfm_size == TX_8X8) {
-    int idx = (ib & 0x02) ? (ib + 2) : ib;
-    int16_t* const dqcoeff = BLOCK_OFFSET(xd->plane[0].dqcoeff, idx, 16);
-    int16_t* const coeff = BLOCK_OFFSET(x->plane[0].coeff, idx, 16);
-
-    assert(idx < 16);
-    tx_type = get_tx_type_8x8(xd, ib);
-    if (tx_type != DCT_DCT) {
-      vp9_short_fht8x8(src_diff, coeff, 16, tx_type);
-      x->quantize_b_8x8(x, idx, tx_type, 16);
-      vp9_short_iht8x8(dqcoeff, diff, 16, tx_type);
-    } else {
-      x->fwd_txm8x8(src_diff, coeff, 32);
-      x->quantize_b_8x8(x, idx, DCT_DCT, 16);
-      vp9_short_idct8x8(dqcoeff, diff, 32);
-    }
-  } else {
-    for (i = 0; i < 4; i++) {
-      int idx = ib + iblock[i];
-      int16_t* const dqcoeff = BLOCK_OFFSET(xd->plane[0].dqcoeff, idx, 16);
-      int16_t* const coeff = BLOCK_OFFSET(x->plane[0].coeff, idx, 16);
-      int16_t* const src_diff =
-          raster_block_offset_int16(xd, BLOCK_SIZE_MB16X16, 0, idx,
-                                    x->plane[0].src_diff);
-      int16_t* const diff =
-          raster_block_offset_int16(xd, BLOCK_SIZE_MB16X16, 0, idx,
-                                    xd->plane[0].diff);
-
-      assert(idx < 16);
-      tx_type = get_tx_type_4x4(xd, ib + iblock[i]);
-      if (tx_type != DCT_DCT) {
-        vp9_short_fht4x4(src_diff, coeff, 16, tx_type);
-        x->quantize_b_4x4(x, ib + iblock[i], tx_type, 16);
-        vp9_short_iht4x4(dqcoeff, diff, 16, tx_type);
-      } else if (!(i & 1) &&
-                 get_tx_type_4x4(xd, ib + iblock[i] + 1) == DCT_DCT) {
-        x->fwd_txm8x4(src_diff, coeff, 32);
-        x->quantize_b_4x4_pair(x, ib + iblock[i], ib + iblock[i] + 1, 16);
-        vp9_inverse_transform_b_4x4(xd, xd->plane[0].eobs[ib + iblock[i]],
-                                    dqcoeff, diff, 32);
-        vp9_inverse_transform_b_4x4(xd, xd->plane[0].eobs[ib + iblock[i] + 1],
-                                    dqcoeff + 16, diff + 4, 32);
-        i++;
-      } else {
-        x->fwd_txm4x4(src_diff, coeff, 32);
-        x->quantize_b_4x4(x, ib + iblock[i], tx_type, 16);
-        vp9_inverse_transform_b_4x4(xd, xd->plane[0].eobs[ib + iblock[i]],
-                                    dqcoeff, diff, 32);
-      }
-    }
-  }
-
-  // reconstruct submacroblock
-  for (i = 0; i < 4; i++) {
-    int16_t* const diff =
-        raster_block_offset_int16(xd, BLOCK_SIZE_MB16X16, 0, ib + iblock[i],
-                                  xd->plane[0].diff);
-    uint8_t* const dst =
-        raster_block_offset_uint8(xd, BLOCK_SIZE_MB16X16, 0, ib + iblock[i],
-                                  xd->plane[0].dst.buf,
-                                  xd->plane[0].dst.stride);
-    vp9_recon_b_c(dst, diff, dst, xd->plane[0].dst.stride);
-  }
-}
-
-void vp9_encode_intra8x8mby(MACROBLOCK *x) {
-  int i;
-
-  for (i = 0; i < 4; i++)
-    vp9_encode_intra8x8(x, vp9_i8x8_block[i]);
-}
-
-static void encode_intra_uv4x4(MACROBLOCK *x, int ib, int mode) {
-  MACROBLOCKD * const xd = &x->e_mbd;
-  int16_t * const dqcoeff = MB_SUBBLOCK_FIELD(xd, dqcoeff, ib);
-  int16_t* const coeff = MB_SUBBLOCK_FIELD(x, coeff, ib);
-  const int plane = ib < 20 ? 1 : 2;
-  const int block = ib < 20 ? ib - 16 : ib - 20;
-  uint8_t* const src =
-      raster_block_offset_uint8(xd, BLOCK_SIZE_MB16X16, plane, block,
-                                x->plane[plane].src.buf,
-                                x->plane[plane].src.stride);
-  int16_t* const src_diff =
-      raster_block_offset_int16(xd, BLOCK_SIZE_MB16X16, plane, block,
-                                x->plane[plane].src_diff);
-  int16_t* const diff =
-      raster_block_offset_int16(xd, BLOCK_SIZE_MB16X16, plane, block,
-                                xd->plane[plane].diff);
-  uint8_t* const dst =
-      raster_block_offset_uint8(xd, BLOCK_SIZE_MB16X16, plane, block,
-                                xd->plane[plane].dst.buf,
-                                xd->plane[plane].dst.stride);
-
-  assert(ib >= 16 && ib < 24);
-  vp9_intra_uv4x4_predict(&x->e_mbd, ib, mode,
-                          dst, xd->plane[plane].dst.stride);
-
-  assert(xd->plane[1].subsampling_x == 1);
-  vp9_subtract_block(4, 4, src_diff, 8,
-                     src, x->plane[plane].src.stride,
-                     dst, xd->plane[plane].dst.stride);
-
-  x->fwd_txm4x4(src_diff, coeff, 16);
-  x->quantize_b_4x4(x, ib, DCT_DCT, 16);
-  vp9_inverse_transform_b_4x4(&x->e_mbd, xd->plane[plane].eobs[block],
-                              dqcoeff, diff, 16);
-
-  vp9_recon_uv_b_c(dst, diff, dst, xd->plane[plane].dst.stride);
-}
-
-void vp9_encode_intra8x8mbuv(MACROBLOCK *x) {
-  int i;
-
-  for (i = 0; i < 4; i++) {
-    int mode = x->e_mbd.mode_info_context->bmi[vp9_i8x8_block[i]].as_mode.first;
-
-    encode_intra_uv4x4(x, i + 16, mode);  // u
-    encode_intra_uv4x4(x, i + 20, mode);  // v
-  }
-}
-#endif
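
With the transform-size switches deleted, both intra16x16 helpers reduce to the same two-step shape: build the whole-block intra prediction, then hand subtraction, transform, quantization, optional trellis optimization, inverse transform and reconstruction to the consolidated encode helpers. A minimal sketch of that call sequence follows; the luma predictor name is assumed by analogy with vp9_build_intra_predictors_sbuv_s used above.

/* Sketch only: assumes the vp9 encoder headers, and that
 * vp9_build_intra_predictors_sby_s is the luma counterpart of the
 * chroma predictor called in vp9_encode_intra16x16mbuv above. */
static void encode_intra_mb_sketch(VP9_COMMON *cm, MACROBLOCK *x) {
  MACROBLOCKD *const xd = &x->e_mbd;

  vp9_build_intra_predictors_sby_s(xd, BLOCK_SIZE_MB16X16);  /* assumed name */
  vp9_encode_sby(cm, x, BLOCK_SIZE_MB16X16);                 /* y plane */

  vp9_build_intra_predictors_sbuv_s(xd, BLOCK_SIZE_MB16X16);
  vp9_encode_sbuv(cm, x, BLOCK_SIZE_MB16X16);                /* u/v planes */
}
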
diff --git a/vp9/encoder/vp9_encodeintra.h b/vp9/encoder/vp9_encodeintra.h
index a4f4c18..c262004 100644
--- a/vp9/encoder/vp9_encodeintra.h
+++ b/vp9/encoder/vp9_encodeintra.h
@@ -17,10 +17,4 @@
 void vp9_encode_intra16x16mby(VP9_COMMON *const cm, MACROBLOCK *x);
 void vp9_encode_intra16x16mbuv(VP9_COMMON *const cm, MACROBLOCK *x);
 void vp9_encode_intra4x4mby(MACROBLOCK *mb, BLOCK_SIZE_TYPE bs);
-#if !CONFIG_SB8X8
-void vp9_encode_intra8x8mby(MACROBLOCK *x);
-void vp9_encode_intra8x8mbuv(MACROBLOCK *x);
-void vp9_encode_intra8x8(MACROBLOCK *x, int ib);
-#endif
-
 #endif  // VP9_ENCODER_VP9_ENCODEINTRA_H_
diff --git a/vp9/encoder/vp9_encodemb.c b/vp9/encoder/vp9_encodemb.c
index 15fd4f1..4210527 100644
--- a/vp9/encoder/vp9_encodemb.c
+++ b/vp9/encoder/vp9_encodemb.c
@@ -67,143 +67,6 @@
 }
 
 
-void vp9_transform_sby_32x32(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize) {
-  const int bwl = b_width_log2(bsize) - 3, bw = 1 << bwl;
-  const int bh = 1 << (b_height_log2(bsize) - 3);
-  const int stride = 32 << bwl;
-  int n;
-
-  for (n = 0; n < bw * bh; n++) {
-    const int x_idx = n & (bw - 1), y_idx = n >> bwl;
-
-    vp9_short_fdct32x32(x->plane[0].src_diff + y_idx * stride * 32 + x_idx * 32,
-                        x->plane[0].coeff + n * 1024, stride * 2);
-  }
-}
-
-void vp9_transform_sby_16x16(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize) {
-  const int bwl = b_width_log2(bsize) - 2, bw = 1 << bwl;
-  const int bh = 1 << (b_height_log2(bsize) - 2);
-  const int stride = 16 << bwl, bstride = 4 << bwl;
-  MACROBLOCKD *const xd = &x->e_mbd;
-  int n;
-
-  for (n = 0; n < bw * bh; n++) {
-    const int x_idx = n & (bw - 1), y_idx = n >> bwl;
-    const TX_TYPE tx_type = get_tx_type_16x16(xd,
-                                              (y_idx * bstride + x_idx) * 4);
-
-    if (tx_type != DCT_DCT) {
-      vp9_short_fht16x16(x->plane[0].src_diff +
-                             y_idx * stride * 16 + x_idx * 16,
-                         x->plane[0].coeff + n * 256, stride, tx_type);
-    } else {
-      x->fwd_txm16x16(x->plane[0].src_diff + y_idx * stride * 16 + x_idx * 16,
-                      x->plane[0].coeff + n * 256, stride * 2);
-    }
-  }
-}
-
-void vp9_transform_sby_8x8(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize) {
-  const int bwl = b_width_log2(bsize) - 1, bw = 1 << bwl;
-  const int bh = 1 << (b_height_log2(bsize) - 1);
-  const int stride = 8 << bwl, bstride = 2 << bwl;
-  MACROBLOCKD *const xd = &x->e_mbd;
-  int n;
-
-  for (n = 0; n < bw * bh; n++) {
-    const int x_idx = n & (bw - 1), y_idx = n >> bwl;
-    const TX_TYPE tx_type = get_tx_type_8x8(xd, (y_idx * bstride + x_idx) * 2);
-
-    if (tx_type != DCT_DCT) {
-      vp9_short_fht8x8(x->plane[0].src_diff + y_idx * stride * 8 + x_idx * 8,
-                       x->plane[0].coeff + n * 64, stride, tx_type);
-    } else {
-      x->fwd_txm8x8(x->plane[0].src_diff + y_idx * stride * 8 + x_idx * 8,
-                    x->plane[0].coeff + n * 64, stride * 2);
-    }
-  }
-}
-
-void vp9_transform_sby_4x4(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize) {
-  const int bwl = b_width_log2(bsize), bw = 1 << bwl;
-  const int bh = 1 << b_height_log2(bsize);
-  const int stride = 4 << bwl;
-  MACROBLOCKD *const xd = &x->e_mbd;
-  int n;
-
-  for (n = 0; n < bw * bh; n++) {
-    const int x_idx = n & (bw - 1), y_idx = n >> bwl;
-    const TX_TYPE tx_type = get_tx_type_4x4(xd, n);
-
-    if (tx_type != DCT_DCT) {
-      vp9_short_fht4x4(x->plane[0].src_diff + y_idx * stride * 4 + x_idx * 4,
-                       x->plane[0].coeff + n * 16, stride, tx_type);
-    } else {
-      x->fwd_txm4x4(x->plane[0].src_diff + y_idx * stride * 4 + x_idx * 4,
-                    x->plane[0].coeff + n * 16, stride * 2);
-    }
-  }
-}
-
-void vp9_transform_sbuv_32x32(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize) {
-  assert(bsize == BLOCK_SIZE_SB64X64);
-  vp9_clear_system_state();
-  vp9_short_fdct32x32(x->plane[1].src_diff, x->plane[1].coeff, 64);
-  vp9_short_fdct32x32(x->plane[2].src_diff, x->plane[2].coeff, 64);
-}
-
-void vp9_transform_sbuv_16x16(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize) {
-  const int bwl = b_width_log2(bsize) - 2, bhl = b_height_log2(bsize) - 2;
-  const int bw = 1 << (bwl - 1), bh = 1 << (bhl - 1);
-  const int stride = 16 << (bwl - 1);
-  int n;
-
-  vp9_clear_system_state();
-  for (n = 0; n < bw * bh; n++) {
-    const int x_idx = n & (bw - 1), y_idx = n >> (bwl - 1);
-
-    x->fwd_txm16x16(x->plane[1].src_diff + y_idx * stride * 16 + x_idx * 16,
-                    x->plane[1].coeff + n * 256, stride * 2);
-    x->fwd_txm16x16(x->plane[2].src_diff + y_idx * stride * 16 + x_idx * 16,
-                    x->plane[2].coeff + n * 256, stride * 2);
-  }
-}
-
-void vp9_transform_sbuv_8x8(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize) {
-  const int bwl = b_width_log2(bsize) - 1, bhl = b_height_log2(bsize) - 1;
-  const int bw = 1 << (bwl - 1), bh = 1 << (bhl - 1);
-  const int stride = 8 << (bwl - 1);
-  int n;
-
-  vp9_clear_system_state();
-  for (n = 0; n < bw * bh; n++) {
-    const int x_idx = n & (bw - 1), y_idx = n >> (bwl - 1);
-
-    x->fwd_txm8x8(x->plane[1].src_diff + y_idx * stride * 8 + x_idx * 8,
-                  x->plane[1].coeff + n * 64, stride * 2);
-    x->fwd_txm8x8(x->plane[2].src_diff + y_idx * stride * 8 + x_idx * 8,
-                  x->plane[2].coeff + n * 64, stride * 2);
-  }
-}
-
-void vp9_transform_sbuv_4x4(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize) {
-  const int bwl = b_width_log2(bsize), bhl = b_height_log2(bsize);
-  const int bw = 1 << (bwl - 1), bh = 1 << (bhl - 1);
-  const int stride = 4 << (bwl - 1);
-  int n;
-
-  vp9_clear_system_state();
-  for (n = 0; n < bw * bh; n++) {
-    const int x_idx = n & (bw - 1), y_idx = n >> (bwl - 1);
-
-    x->fwd_txm4x4(x->plane[1].src_diff + y_idx * stride * 4 + x_idx * 4,
-                  x->plane[1].coeff + n * 16, stride * 2);
-    x->fwd_txm4x4(x->plane[2].src_diff + y_idx * stride * 4 + x_idx * 4,
-                  x->plane[2].coeff + n * 16, stride * 2);
-  }
-}
-
 #define RDTRUNC(RM,DM,R,D) ( (128+(R)*(RM)) & 0xFF )
 #define RDTRUNC_8x8(RM,DM,R,D) ( (128+(R)*(RM)) & 0xFF )
 typedef struct vp9_token_state vp9_token_state;
@@ -480,31 +343,33 @@
   *a = *l = (final_eob > 0);
 }
 
-struct optimize_ctx {
-  ENTROPY_CONTEXT ta[MAX_MB_PLANE][16];
-  ENTROPY_CONTEXT tl[MAX_MB_PLANE][16];
-};
-
 struct optimize_block_args {
   VP9_COMMON *cm;
   MACROBLOCK *x;
   struct optimize_ctx *ctx;
 };
 
-static void optimize_block(int plane, int block, BLOCK_SIZE_TYPE bsize,
-                           int ss_txfrm_size, void *arg) {
-  const struct optimize_block_args* const args = arg;
-  MACROBLOCKD* const xd = &args->x->e_mbd;
+void vp9_optimize_b(int plane, int block, BLOCK_SIZE_TYPE bsize,
+                    int ss_txfrm_size, VP9_COMMON *cm, MACROBLOCK *mb,
+                    struct optimize_ctx *ctx) {
+  MACROBLOCKD* const xd = &mb->e_mbd;
   int x, y;
 
   // find current entropy context
   txfrm_block_to_raster_xy(xd, bsize, plane, block, ss_txfrm_size, &x, &y);
 
-  optimize_b(args->cm, args->x, plane, block, bsize,
-             &args->ctx->ta[plane][x], &args->ctx->tl[plane][y],
+  optimize_b(cm, mb, plane, block, bsize,
+             &ctx->ta[plane][x], &ctx->tl[plane][y],
              ss_txfrm_size / 2);
 }
 
+static void optimize_block(int plane, int block, BLOCK_SIZE_TYPE bsize,
+                           int ss_txfrm_size, void *arg) {
+  const struct optimize_block_args* const args = arg;
+  vp9_optimize_b(plane, block, bsize, ss_txfrm_size, args->cm, args->x,
+                 args->ctx);
+}
+
 void vp9_optimize_init(MACROBLOCKD *xd, BLOCK_SIZE_TYPE bsize,
                        struct optimize_ctx *ctx) {
   int p;
@@ -539,9 +404,6 @@
   struct optimize_block_args arg = {cm, x, &ctx};
   vp9_optimize_init(&x->e_mbd, bsize, &ctx);
   foreach_transformed_block_in_plane(&x->e_mbd, bsize, 0,
-#if !CONFIG_SB8X8
-  0,
-#endif
                                      optimize_block, &arg);
 }
 
@@ -553,77 +415,192 @@
   foreach_transformed_block_uv(&x->e_mbd, bsize, optimize_block, &arg);
 }
 
-#if !CONFIG_SB8X8
-void vp9_fidct_mb(VP9_COMMON *const cm, MACROBLOCK *x) {
-  MACROBLOCKD *const xd = &x->e_mbd;
-  const TX_SIZE tx_size = xd->mode_info_context->mbmi.txfm_size;
+struct encode_b_args {
+  VP9_COMMON *cm;
+  MACROBLOCK *x;
+  struct optimize_ctx *ctx;
+};
 
-  if (tx_size == TX_16X16) {
-    vp9_transform_sby_16x16(x, BLOCK_SIZE_MB16X16);
-    vp9_transform_sbuv_8x8(x, BLOCK_SIZE_MB16X16);
-    vp9_quantize_sby_16x16(x, BLOCK_SIZE_MB16X16);
-    vp9_quantize_sbuv_8x8(x, BLOCK_SIZE_MB16X16);
-    if (x->optimize) {
-      vp9_optimize_sby(cm, x, BLOCK_SIZE_MB16X16);
-      vp9_optimize_sbuv(cm, x, BLOCK_SIZE_MB16X16);
-    }
-    vp9_inverse_transform_sby_16x16(xd, BLOCK_SIZE_MB16X16);
-    vp9_inverse_transform_sbuv_8x8(xd, BLOCK_SIZE_MB16X16);
-  } else if (tx_size == TX_8X8) {
-    vp9_transform_sby_8x8(x, BLOCK_SIZE_MB16X16);
-    vp9_quantize_sby_8x8(x, BLOCK_SIZE_MB16X16);
-    if (x->optimize)
-      vp9_optimize_sby(cm, x, BLOCK_SIZE_MB16X16);
-    vp9_inverse_transform_sby_8x8(xd, BLOCK_SIZE_MB16X16);
-    if (xd->mode_info_context->mbmi.mode == SPLITMV) {
-      assert(xd->mode_info_context->mbmi.partitioning != PARTITIONING_4X4);
-      vp9_transform_sbuv_4x4(x, BLOCK_SIZE_MB16X16);
-      vp9_quantize_sbuv_4x4(x, BLOCK_SIZE_MB16X16);
-      if (x->optimize)
-        vp9_optimize_sbuv(cm, x, BLOCK_SIZE_MB16X16);
-      vp9_inverse_transform_sbuv_4x4(xd, BLOCK_SIZE_MB16X16);
-    } else {
-      vp9_transform_sbuv_8x8(x, BLOCK_SIZE_MB16X16);
-      vp9_quantize_sbuv_8x8(x, BLOCK_SIZE_MB16X16);
-      if (x->optimize)
-        vp9_optimize_sbuv(cm, x, BLOCK_SIZE_MB16X16);
-      vp9_inverse_transform_sbuv_8x8(xd, BLOCK_SIZE_MB16X16);
-    }
-  } else {
-    vp9_transform_sby_4x4(x, BLOCK_SIZE_MB16X16);
-    vp9_transform_sbuv_4x4(x, BLOCK_SIZE_MB16X16);
-    vp9_quantize_sby_4x4(x, BLOCK_SIZE_MB16X16);
-    vp9_quantize_sbuv_4x4(x, BLOCK_SIZE_MB16X16);
-    if (x->optimize) {
-      vp9_optimize_sby(cm, x, BLOCK_SIZE_MB16X16);
-      vp9_optimize_sbuv(cm, x, BLOCK_SIZE_MB16X16);
-    }
-    vp9_inverse_transform_sby_4x4(xd, BLOCK_SIZE_MB16X16);
-    vp9_inverse_transform_sbuv_4x4(xd, BLOCK_SIZE_MB16X16);
+static void xform_quant(int plane, int block, BLOCK_SIZE_TYPE bsize,
+                         int ss_txfrm_size, void *arg) {
+  struct encode_b_args* const args = arg;
+  MACROBLOCK* const x = args->x;
+  MACROBLOCKD* const xd = &x->e_mbd;
+  const int bw = 4 << (b_width_log2(bsize) - xd->plane[plane].subsampling_x);
+  const int raster_block = txfrm_block_to_raster_block(xd, bsize, plane,
+                                                       block, ss_txfrm_size);
+  int16_t* const src_diff = raster_block_offset_int16(xd, bsize, plane,
+                                                      raster_block,
+                                                      x->plane[plane].src_diff);
+  TX_TYPE tx_type = DCT_DCT;
+
+  switch (ss_txfrm_size / 2) {
+    case TX_32X32:
+      vp9_short_fdct32x32(src_diff,
+                          BLOCK_OFFSET(x->plane[plane].coeff, block, 16),
+                          bw * 2);
+      break;
+    case TX_16X16:
+      tx_type = plane == 0 ? get_tx_type_16x16(xd, raster_block) : DCT_DCT;
+      if (tx_type != DCT_DCT) {
+        vp9_short_fht16x16(src_diff,
+                           BLOCK_OFFSET(x->plane[plane].coeff, block, 16),
+                           bw, tx_type);
+      } else {
+        x->fwd_txm16x16(src_diff,
+                        BLOCK_OFFSET(x->plane[plane].coeff, block, 16),
+                        bw * 2);
+      }
+      break;
+    case TX_8X8:
+      tx_type = plane == 0 ? get_tx_type_8x8(xd, raster_block) : DCT_DCT;
+      if (tx_type != DCT_DCT) {
+        vp9_short_fht8x8(src_diff,
+                         BLOCK_OFFSET(x->plane[plane].coeff, block, 16),
+                         bw, tx_type);
+      } else {
+        x->fwd_txm8x8(src_diff,
+                      BLOCK_OFFSET(x->plane[plane].coeff, block, 16),
+                      bw * 2);
+      }
+      break;
+    case TX_4X4:
+      tx_type = plane == 0 ? get_tx_type_4x4(xd, raster_block) : DCT_DCT;
+      if (tx_type != DCT_DCT) {
+        vp9_short_fht4x4(src_diff,
+                         BLOCK_OFFSET(x->plane[plane].coeff, block, 16),
+                         bw, tx_type);
+      } else {
+        x->fwd_txm4x4(src_diff,
+                      BLOCK_OFFSET(x->plane[plane].coeff, block, 16),
+                      bw * 2);
+      }
+      break;
+    default:
+      assert(0);
+  }
+
+  vp9_quantize(x, plane, block, 16 << ss_txfrm_size, tx_type);
+}
+
+static void encode_block(int plane, int block, BLOCK_SIZE_TYPE bsize,
+                         int ss_txfrm_size, void *arg) {
+  struct encode_b_args* const args = arg;
+  MACROBLOCK* const x = args->x;
+  MACROBLOCKD* const xd = &x->e_mbd;
+  const int bw = 4 << (b_width_log2(bsize) - xd->plane[plane].subsampling_x);
+  const int raster_block = txfrm_block_to_raster_block(xd, bsize, plane,
+                                                       block, ss_txfrm_size);
+  int16_t* const diff = raster_block_offset_int16(xd, bsize, plane,
+                                                  raster_block,
+                                                  xd->plane[plane].diff);
+  TX_TYPE tx_type = DCT_DCT;
+
+  xform_quant(plane, block, bsize, ss_txfrm_size, arg);
+
+  if (x->optimize)
+    vp9_optimize_b(plane, block, bsize, ss_txfrm_size, args->cm, x, args->ctx);
+
+  switch (ss_txfrm_size / 2) {
+    case TX_32X32:
+      vp9_short_idct32x32(BLOCK_OFFSET(xd->plane[plane].dqcoeff, block, 16),
+                          diff, bw * 2);
+      break;
+    case TX_16X16:
+      tx_type = plane == 0 ? get_tx_type_16x16(xd, raster_block) : DCT_DCT;
+      if (tx_type == DCT_DCT) {
+        vp9_short_idct16x16(BLOCK_OFFSET(xd->plane[plane].dqcoeff, block, 16),
+                            diff, bw * 2);
+      } else {
+        vp9_short_iht16x16(BLOCK_OFFSET(xd->plane[plane].dqcoeff, block, 16),
+                           diff, bw, tx_type);
+      }
+      break;
+    case TX_8X8:
+      tx_type = plane == 0 ? get_tx_type_8x8(xd, raster_block) : DCT_DCT;
+      if (tx_type == DCT_DCT) {
+        vp9_short_idct8x8(BLOCK_OFFSET(xd->plane[plane].dqcoeff, block, 16),
+                          diff, bw * 2);
+      } else {
+        vp9_short_iht8x8(BLOCK_OFFSET(xd->plane[plane].dqcoeff, block, 16),
+                         diff, bw, tx_type);
+      }
+      break;
+    case TX_4X4:
+      tx_type = plane == 0 ? get_tx_type_4x4(xd, raster_block) : DCT_DCT;
+      if (tx_type == DCT_DCT) {
+        // this is like vp9_short_idct4x4 but has a special case around eob<=1
+        // which is significant (not just an optimization) for the lossless
+        // case.
+        vp9_inverse_transform_b_4x4(xd, xd->plane[plane].eobs[block],
+            BLOCK_OFFSET(xd->plane[plane].dqcoeff, block, 16), diff, bw * 2);
+      } else {
+        vp9_short_iht4x4(BLOCK_OFFSET(xd->plane[plane].dqcoeff, block, 16),
+                         diff, bw, tx_type);
+      }
+      break;
   }
 }
 
-void vp9_encode_inter16x16(VP9_COMMON *const cm, MACROBLOCK *x,
-                           int mi_row, int mi_col) {
-  MACROBLOCKD *const xd = &x->e_mbd;
+void vp9_xform_quant_sby(VP9_COMMON *const cm, MACROBLOCK *x,
+                         BLOCK_SIZE_TYPE bsize) {
+  MACROBLOCKD* const xd = &x->e_mbd;
+  struct encode_b_args arg = {cm, x, NULL};
 
-  vp9_build_inter_predictors_sb(xd, mi_row, mi_col, BLOCK_SIZE_MB16X16);
-  vp9_subtract_sb(x, BLOCK_SIZE_MB16X16);
-  vp9_fidct_mb(cm, x);
-  vp9_recon_sb(xd, BLOCK_SIZE_MB16X16);
+  foreach_transformed_block_in_plane(xd, bsize, 0,
+                                     xform_quant, &arg);
 }
-#endif
 
-/* this function is used by first pass only */
-void vp9_encode_inter16x16y(MACROBLOCK *x, int mi_row, int mi_col) {
-  MACROBLOCKD *xd = &x->e_mbd;
+void vp9_xform_quant_sbuv(VP9_COMMON *const cm, MACROBLOCK *x,
+                         BLOCK_SIZE_TYPE bsize) {
+  MACROBLOCKD* const xd = &x->e_mbd;
+  struct encode_b_args arg = {cm, x, NULL};
 
-  vp9_build_inter_predictors_sby(xd, mi_row, mi_col, BLOCK_SIZE_MB16X16);
-  vp9_subtract_sby(x, BLOCK_SIZE_MB16X16);
+  foreach_transformed_block_uv(xd, bsize, xform_quant, &arg);
+}
 
-  vp9_transform_sby_4x4(x, BLOCK_SIZE_MB16X16);
-  vp9_quantize_sby_4x4(x, BLOCK_SIZE_MB16X16);
-  vp9_inverse_transform_sby_4x4(xd, BLOCK_SIZE_MB16X16);
+void vp9_encode_sby(VP9_COMMON *const cm, MACROBLOCK *x,
+                    BLOCK_SIZE_TYPE bsize) {
+  MACROBLOCKD* const xd = &x->e_mbd;
+  struct optimize_ctx ctx;
+  struct encode_b_args arg = {cm, x, &ctx};
 
-  vp9_recon_sby(xd, BLOCK_SIZE_MB16X16);
+  vp9_subtract_sby(x, bsize);
+  if (x->optimize)
+    vp9_optimize_init(xd, bsize, &ctx);
+
+  foreach_transformed_block_in_plane(xd, bsize, 0,
+                                     encode_block, &arg);
+
+  vp9_recon_sby(xd, bsize);
+}
+
+void vp9_encode_sbuv(VP9_COMMON *const cm, MACROBLOCK *x,
+                     BLOCK_SIZE_TYPE bsize) {
+  MACROBLOCKD* const xd = &x->e_mbd;
+  struct optimize_ctx ctx;
+  struct encode_b_args arg = {cm, x, &ctx};
+
+  vp9_subtract_sbuv(x, bsize);
+  if (x->optimize)
+    vp9_optimize_init(xd, bsize, &ctx);
+
+  foreach_transformed_block_uv(xd, bsize, encode_block, &arg);
+
+  vp9_recon_sbuv(xd, bsize);
+}
+
+void vp9_encode_sb(VP9_COMMON *const cm, MACROBLOCK *x,
+                   BLOCK_SIZE_TYPE bsize) {
+  MACROBLOCKD* const xd = &x->e_mbd;
+  struct optimize_ctx ctx;
+  struct encode_b_args arg = {cm, x, &ctx};
+
+  vp9_subtract_sb(x, bsize);
+  if (x->optimize)
+    vp9_optimize_init(xd, bsize, &ctx);
+
+  foreach_transformed_block(xd, bsize, encode_block, &arg);
+
+  vp9_recon_sb(xd, bsize);
 }
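
encode_block() and xform_quant() above lean on two identities of the iterator's ss_txfrm_size argument that are easy to miss when reading the switch statements: the transform size is ss_txfrm_size / 2, and the coefficient count passed to vp9_quantize() is 16 << ss_txfrm_size. A small illustrative helper (not part of the tree) makes that arithmetic explicit.

/* Illustrative only: mirrors the arithmetic used by xform_quant()/encode_block(). */
static INLINE TX_SIZE ss_to_tx_size(int ss_txfrm_size) {
  return (TX_SIZE)(ss_txfrm_size / 2);   /* 0,2,4,6 -> TX_4X4 .. TX_32X32 */
}

static INLINE int ss_to_num_coeffs(int ss_txfrm_size) {
  return 16 << ss_txfrm_size;            /* 16, 64, 256, 1024 */
}
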
diff --git a/vp9/encoder/vp9_encodemb.h b/vp9/encoder/vp9_encodemb.h
index b1d8771..afbe446 100644
--- a/vp9/encoder/vp9_encodemb.h
+++ b/vp9/encoder/vp9_encodemb.h
@@ -22,32 +22,29 @@
   MV_REFERENCE_FRAME second_ref_frame;
 } MODE_DEFINITION;
 
-
-struct VP9_ENCODER_RTCD;
-#if !CONFIG_SB8X8
-void vp9_encode_inter16x16(VP9_COMMON *const cm, MACROBLOCK *x,
-                           int mb_row, int mb_col);
-#endif
-
-void vp9_encode_inter16x16y(MACROBLOCK *x, int mb_row, int mb_col);
-
-void vp9_transform_sby_32x32(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize);
-void vp9_transform_sby_16x16(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize);
-void vp9_transform_sby_8x8(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize);
-void vp9_transform_sby_4x4(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize);
-void vp9_transform_sbuv_32x32(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize);
-void vp9_transform_sbuv_16x16(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize);
-void vp9_transform_sbuv_8x8(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize);
-void vp9_transform_sbuv_4x4(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize);
-
+struct optimize_ctx {
+  ENTROPY_CONTEXT ta[MAX_MB_PLANE][16];
+  ENTROPY_CONTEXT tl[MAX_MB_PLANE][16];
+};
+void vp9_optimize_init(MACROBLOCKD *xd, BLOCK_SIZE_TYPE bsize,
+                       struct optimize_ctx *ctx);
+void vp9_optimize_b(int plane, int block, BLOCK_SIZE_TYPE bsize,
+                    int ss_txfrm_size, VP9_COMMON *cm, MACROBLOCK *x,
+                    struct optimize_ctx *ctx);
 void vp9_optimize_sby(VP9_COMMON *const cm, MACROBLOCK *x,
                       BLOCK_SIZE_TYPE bsize);
 void vp9_optimize_sbuv(VP9_COMMON *const cm, MACROBLOCK *x,
                        BLOCK_SIZE_TYPE bsize);
 
-#if !CONFIG_SB8X8
-void vp9_fidct_mb(VP9_COMMON *const cm, MACROBLOCK *x);
-#endif
+void vp9_encode_sb(VP9_COMMON *const cm, MACROBLOCK *x, BLOCK_SIZE_TYPE bsize);
+void vp9_encode_sby(VP9_COMMON *const cm, MACROBLOCK *x, BLOCK_SIZE_TYPE bsize);
+void vp9_encode_sbuv(VP9_COMMON *const cm, MACROBLOCK *x,
+                     BLOCK_SIZE_TYPE bsize);
+
+void vp9_xform_quant_sby(VP9_COMMON *const cm, MACROBLOCK *x,
+                        BLOCK_SIZE_TYPE bsize);
+void vp9_xform_quant_sbuv(VP9_COMMON *const cm, MACROBLOCK *x,
+                          BLOCK_SIZE_TYPE bsize);
 
 void vp9_subtract_block(int rows, int cols,
                         int16_t *diff_ptr, int diff_stride,
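
Moving struct optimize_ctx into the header and exporting vp9_optimize_init()/vp9_optimize_b() lets callers outside vp9_encodemb.c run the trellis on a single transform block. A hedged usage sketch, with plane/block/ss_txfrm_size assumed to come from the same block iterator the encoder uses internally:

/* Sketch only: assumes vp9_encodemb.h and a valid (plane, block, ss_txfrm_size)
 * triple produced by foreach_transformed_block_in_plane(). */
static void optimize_one_block_sketch(VP9_COMMON *cm, MACROBLOCK *x,
                                      BLOCK_SIZE_TYPE bsize, int plane,
                                      int block, int ss_txfrm_size) {
  struct optimize_ctx ctx;

  vp9_optimize_init(&x->e_mbd, bsize, &ctx);  /* seed above/left entropy contexts */
  vp9_optimize_b(plane, block, bsize, ss_txfrm_size, cm, x, &ctx);
}
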
diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c
index e4d6863..a1898af 100644
--- a/vp9/encoder/vp9_firstpass.c
+++ b/vp9/encoder/vp9_firstpass.c
@@ -14,7 +14,6 @@
 #include "vp9/encoder/vp9_onyx_int.h"
 #include "vp9/encoder/vp9_variance.h"
 #include "vp9/encoder/vp9_encodeintra.h"
-#include "vp9/common/vp9_setupintrarecon.h"
 #include "vp9/encoder/vp9_mcomp.h"
 #include "vp9/encoder/vp9_firstpass.h"
 #include "vpx_scale/vpx_scale.h"
@@ -247,8 +246,8 @@
 
 // Calculate a modified Error used in distributing bits between easier and harder frames
 static double calculate_modified_err(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
-  double av_err = (cpi->twopass.total_stats->ssim_weighted_pred_err /
-                   cpi->twopass.total_stats->count);
+  double av_err = (cpi->twopass.total_stats.ssim_weighted_pred_err /
+                   cpi->twopass.total_stats.count);
   double this_err = this_frame->ssim_weighted_pred_err;
   double modified_err;
 
@@ -328,7 +327,7 @@
   // For VBR base this on the bits and frames left plus the
   // two_pass_vbrmax_section rate passed in by the user.
   max_bits = (int) (((double) cpi->twopass.bits_left
-      / (cpi->twopass.total_stats->count - (double) cpi->common
+      / (cpi->twopass.total_stats.count - (double) cpi->common
              .current_video_frame))
                     * ((double) cpi->oxcf.two_pass_vbrmax_section / 100.0));
 
@@ -340,11 +339,11 @@
 }
 
 void vp9_init_first_pass(VP9_COMP *cpi) {
-  zero_stats(cpi->twopass.total_stats);
+  zero_stats(&cpi->twopass.total_stats);
 }
 
 void vp9_end_first_pass(VP9_COMP *cpi) {
-  output_stats(cpi, cpi->output_pkt_list, cpi->twopass.total_stats);
+  output_stats(cpi, cpi->output_pkt_list, &cpi->twopass.total_stats);
 }
 
 static void zz_motion_search(VP9_COMP *cpi, MACROBLOCK *x, YV12_BUFFER_CONFIG *recon_buffer, int *best_motion_err, int recon_yoffset) {
@@ -486,8 +485,6 @@
 
   vp9_setup_block_dptrs(&x->e_mbd);
 
-  // set up frame new frame for intra coded blocks
-  vp9_setup_intra_recon(new_yv12);
   vp9_frame_init_quantizer(cpi);
 
   // Initialise the MV cost table to the defaults
@@ -521,9 +518,9 @@
       int use_dc_pred = (mb_col || mb_row) && (!mb_col || !mb_row);
 
       set_mi_row_col(cm, xd,
-                     mb_row << CONFIG_SB8X8,
+                     mb_row << 1,
                      1 << mi_height_log2(BLOCK_SIZE_MB16X16),
-                     mb_col << CONFIG_SB8X8,
+                     mb_col << 1,
                      1 << mi_height_log2(BLOCK_SIZE_MB16X16));
 
       xd->plane[0].dst.buf = new_yv12->y_buffer + recon_yoffset;
@@ -626,7 +623,10 @@
           this_error = motion_error;
           vp9_set_mbmode_and_mvs(x, NEWMV, &mv);
           xd->mode_info_context->mbmi.txfm_size = TX_4X4;
-          vp9_encode_inter16x16y(x, mb_row, mb_col);
+          vp9_build_inter_predictors_sby(xd, mb_row << 1,
+                                         mb_col << 1,
+                                         BLOCK_SIZE_MB16X16);
+          vp9_encode_sb(cm, x, BLOCK_SIZE_MB16X16);
           sum_mvr += mv.as_mv.row;
           sum_mvr_abs += abs(mv.as_mv.row);
           sum_mvc += mv.as_mv.col;
@@ -751,20 +751,20 @@
                             - cpi->source->ts_start);
 
     // don't want to do output stats with a stack variable!
-    memcpy(cpi->twopass.this_frame_stats,
+    memcpy(&cpi->twopass.this_frame_stats,
            &fps,
            sizeof(FIRSTPASS_STATS));
-    output_stats(cpi, cpi->output_pkt_list, cpi->twopass.this_frame_stats);
-    accumulate_stats(cpi->twopass.total_stats, &fps);
+    output_stats(cpi, cpi->output_pkt_list, &cpi->twopass.this_frame_stats);
+    accumulate_stats(&cpi->twopass.total_stats, &fps);
   }
 
   // Copy the previous Last Frame back into gf and arf buffers if
   // the prediction is good enough... but also don't allow it to lag too far
   if ((cpi->twopass.sr_update_lag > 3) ||
       ((cm->current_video_frame > 0) &&
-       (cpi->twopass.this_frame_stats->pcnt_inter > 0.20) &&
-       ((cpi->twopass.this_frame_stats->intra_error /
-         DOUBLE_DIVIDE_CHECK(cpi->twopass.this_frame_stats->coded_error)) >
+       (cpi->twopass.this_frame_stats.pcnt_inter > 0.20) &&
+       ((cpi->twopass.this_frame_stats.intra_error /
+         DOUBLE_DIVIDE_CHECK(cpi->twopass.this_frame_stats.coded_error)) >
         2.0))) {
     vp8_yv12_copy_frame(lst_yv12, gld_yv12);
     cpi->twopass.sr_update_lag = 1;
@@ -995,7 +995,7 @@
   // Give average a chance to settle though.
   // PGW TODO.. This code is broken for the extended Q range
   if ((cpi->ni_frames >
-       ((int)cpi->twopass.total_stats->count >> 8)) &&
+       ((int)cpi->twopass.total_stats.count >> 8)) &&
       (cpi->ni_frames > 25)) {
     adjust_maxq_qrange(cpi);
   }
@@ -1052,8 +1052,8 @@
   }
 
   // II ratio correction factor for clip as a whole
-  clip_iiratio = cpi->twopass.total_stats->intra_error /
-                 DOUBLE_DIVIDE_CHECK(cpi->twopass.total_stats->coded_error);
+  clip_iiratio = cpi->twopass.total_stats.intra_error /
+                 DOUBLE_DIVIDE_CHECK(cpi->twopass.total_stats.coded_error);
   clip_iifactor = 1.0 - ((clip_iiratio - 10.0) * 0.025);
   if (clip_iifactor < 0.80)
     clip_iifactor = 0.80;
@@ -1098,14 +1098,14 @@
   if (two_pass_min_rate < lower_bounds_min_rate)
     two_pass_min_rate = lower_bounds_min_rate;
 
-  zero_stats(cpi->twopass.total_stats);
-  zero_stats(cpi->twopass.total_left_stats);
+  zero_stats(&cpi->twopass.total_stats);
+  zero_stats(&cpi->twopass.total_left_stats);
 
   if (!cpi->twopass.stats_in_end)
     return;
 
-  *cpi->twopass.total_stats = *cpi->twopass.stats_in_end;
-  *cpi->twopass.total_left_stats = *cpi->twopass.total_stats;
+  cpi->twopass.total_stats = *cpi->twopass.stats_in_end;
+  cpi->twopass.total_left_stats = cpi->twopass.total_stats;
 
   // each frame can have a different duration, as the frame rate in the source
   // isn't guaranteed to be constant.   The frame rate prior to the first frame
@@ -1113,13 +1113,13 @@
   // It's calculated based on the actual durations of all frames from the first
   // pass.
   vp9_new_frame_rate(cpi,
-                     10000000.0 * cpi->twopass.total_stats->count /
-                     cpi->twopass.total_stats->duration);
+                     10000000.0 * cpi->twopass.total_stats.count /
+                     cpi->twopass.total_stats.duration);
 
   cpi->output_frame_rate = cpi->oxcf.frame_rate;
-  cpi->twopass.bits_left = (int64_t)(cpi->twopass.total_stats->duration *
+  cpi->twopass.bits_left = (int64_t)(cpi->twopass.total_stats.duration *
                                      cpi->oxcf.target_bandwidth / 10000000.0);
-  cpi->twopass.bits_left -= (int64_t)(cpi->twopass.total_stats->duration *
+  cpi->twopass.bits_left -= (int64_t)(cpi->twopass.total_stats.duration *
                                       two_pass_min_rate / 10000000.0);
 
   // Calculate a minimum intra value to be used in determining the IIratio
@@ -1145,7 +1145,8 @@
       sum_iiratio += IIRatio;
     }
 
-    cpi->twopass.avg_iiratio = sum_iiratio / DOUBLE_DIVIDE_CHECK((double)cpi->twopass.total_stats->count);
+    cpi->twopass.avg_iiratio = sum_iiratio /
+        DOUBLE_DIVIDE_CHECK((double)cpi->twopass.total_stats.count);
 
     // Reset file position
     reset_fpf_position(cpi, start_pos);
@@ -1828,7 +1829,7 @@
   // where cpi->twopass.kf_group_bits is tied to cpi->twopass.bits_left.
   // This is also important for short clips where there may only be one
   // key frame.
-  if (cpi->twopass.frames_to_key >= (int)(cpi->twopass.total_stats->count -
+  if (cpi->twopass.frames_to_key >= (int)(cpi->twopass.total_stats.count -
                                           cpi->common.current_video_frame)) {
     cpi->twopass.kf_group_bits =
       (cpi->twopass.bits_left > 0) ? cpi->twopass.bits_left : 0;
@@ -2096,7 +2097,7 @@
 
 void vp9_second_pass(VP9_COMP *cpi) {
   int tmp_q;
-  int frames_left = (int)(cpi->twopass.total_stats->count -
+  int frames_left = (int)(cpi->twopass.total_stats.count -
                           cpi->common.current_video_frame);
 
   FIRSTPASS_STATS this_frame;
@@ -2121,7 +2122,7 @@
 
       est_cq =
         estimate_cq(cpi,
-                    cpi->twopass.total_left_stats,
+                    &cpi->twopass.total_left_stats,
                     (int)(cpi->twopass.bits_left / frames_left));
 
       cpi->cq_target_quality = cpi->oxcf.cq_level;
@@ -2135,7 +2136,7 @@
 
     tmp_q = estimate_max_q(
               cpi,
-              cpi->twopass.total_left_stats,
+              &cpi->twopass.total_left_stats,
               (int)(cpi->twopass.bits_left / frames_left));
 
     cpi->active_worst_quality         = tmp_q;
@@ -2158,15 +2159,15 @@
   // radical adjustments to the allowed quantizer range just to use up a
   // few surplus bits or get beneath the target rate.
   else if ((cpi->common.current_video_frame <
-            (((unsigned int)cpi->twopass.total_stats->count * 255) >> 8)) &&
+            (((unsigned int)cpi->twopass.total_stats.count * 255) >> 8)) &&
            ((cpi->common.current_video_frame + cpi->baseline_gf_interval) <
-            (unsigned int)cpi->twopass.total_stats->count)) {
+            (unsigned int)cpi->twopass.total_stats.count)) {
     if (frames_left < 1)
       frames_left = 1;
 
     tmp_q = estimate_max_q(
               cpi,
-              cpi->twopass.total_left_stats,
+              &cpi->twopass.total_left_stats,
               (int)(cpi->twopass.bits_left / frames_left));
 
     // Make a damped adjustment to active max Q
@@ -2245,7 +2246,7 @@
   cpi->twopass.frames_to_key--;
 
   // Update the total stats remaining structure
-  subtract_stats(cpi->twopass.total_left_stats, &this_frame);
+  subtract_stats(&cpi->twopass.total_left_stats, &this_frame);
 }
 
 static int test_candidate_kf(VP9_COMP *cpi,
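
Two patterns recur in the first-pass changes: the FIRSTPASS_STATS members of cpi->twopass are now embedded values rather than heap allocations (hence the switch from -> to . and the added address-of operators), and macroblock coordinates are converted to mode-info units with << 1, since a 16x16 macroblock spans a 2x2 grid of 8x8 mode-info units. A condensed sketch of the replacement for vp9_encode_inter16x16y(), with mv assumed to be the winner of the motion search just above it:

/* Sketch of the new first-pass inter path (mirrors the hunk above). */
vp9_set_mbmode_and_mvs(x, NEWMV, &mv);
xd->mode_info_context->mbmi.txfm_size = TX_4X4;
vp9_build_inter_predictors_sby(xd, mb_row << 1, mb_col << 1,  /* MB -> MI units */
                               BLOCK_SIZE_MB16X16);
vp9_encode_sb(cm, x, BLOCK_SIZE_MB16X16);  /* subtract, transform, quantize, recon */
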
diff --git a/vp9/encoder/vp9_mbgraph.c b/vp9/encoder/vp9_mbgraph.c
index af62ec3..018c86c 100644
--- a/vp9/encoder/vp9_mbgraph.c
+++ b/vp9/encoder/vp9_mbgraph.c
@@ -11,7 +11,6 @@
 #include <limits.h>
 #include <vp9/encoder/vp9_encodeintra.h>
 #include <vp9/encoder/vp9_rdopt.h>
-#include <vp9/common/vp9_setupintrarecon.h>
 #include <vp9/common/vp9_blockd.h>
 #include <vp9/common/vp9_reconinter.h>
 #include <vp9/common/vp9_systemdependent.h>
@@ -386,7 +385,6 @@
       // goes in segment 0
       if (arf_not_zz[offset + mb_col]) {
         ncnt[0]++;
-#if CONFIG_SB8X8
         cpi->segmentation_map[offset * 4 + 2 * mb_col] = 0;
         cpi->segmentation_map[offset * 4 + 2 * mb_col + 1] = 0;
         cpi->segmentation_map[offset * 4 + 2 * mb_col + cm->mi_cols] = 0;
@@ -396,11 +394,6 @@
         cpi->segmentation_map[offset * 4 + 2 * mb_col + 1] = 1;
         cpi->segmentation_map[offset * 4 + 2 * mb_col + cm->mi_cols] = 1;
         cpi->segmentation_map[offset * 4 + 2 * mb_col + cm->mi_cols + 1] = 1;
-#else
-        cpi->segmentation_map[offset + mb_col] = 0;
-      } else {
-        cpi->segmentation_map[offset + mb_col] = 1;
-#endif
         ncnt[1]++;
       }
     }
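
The mbgraph change is the same macroblock-to-mode-info conversion: each 16x16 macroblock now owns a 2x2 patch of the mi-resolution segmentation map, so one ARF-usage flag fans out to four entries. A sketch of that fan-out, where seg and mi_stride are hypothetical stand-ins for cpi->segmentation_map and cm->mi_cols:

#include <stdint.h>

/* Sketch only: seg/mi_stride are placeholders for the fields used above. */
static void set_mb_segment_sketch(uint8_t *seg, int mi_stride,
                                  int mb_row, int mb_col, uint8_t seg_id) {
  const int mi_row = mb_row * 2, mi_col = mb_col * 2;

  seg[mi_row * mi_stride + mi_col]           = seg_id;
  seg[mi_row * mi_stride + mi_col + 1]       = seg_id;
  seg[(mi_row + 1) * mi_stride + mi_col]     = seg_id;
  seg[(mi_row + 1) * mi_stride + mi_col + 1] = seg_id;
}
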
diff --git a/vp9/encoder/vp9_modecosts.c b/vp9/encoder/vp9_modecosts.c
index 88cd1f4..e26daf0 100644
--- a/vp9/encoder/vp9_modecosts.c
+++ b/vp9/encoder/vp9_modecosts.c
@@ -41,10 +41,6 @@
                   x->fc.uv_mode_prob[VP9_YMODES - 1], vp9_uv_mode_tree);
   vp9_cost_tokens(c->mb.intra_uv_mode_cost[0],
                   x->kf_uv_mode_prob[VP9_YMODES - 1], vp9_uv_mode_tree);
-#if !CONFIG_SB8X8
-  vp9_cost_tokens(c->mb.i8x8_mode_costs,
-                  x->fc.i8x8_mode_prob, vp9_i8x8_mode_tree);
-#endif
 
   for (i = 0; i <= VP9_SWITCHABLE_FILTERS; ++i)
     vp9_cost_tokens((int *)c->mb.switchable_interp_costs[i],
diff --git a/vp9/encoder/vp9_onyx_if.c b/vp9/encoder/vp9_onyx_if.c
index 738d6e6..ffee34e 100644
--- a/vp9/encoder/vp9_onyx_if.c
+++ b/vp9/encoder/vp9_onyx_if.c
@@ -332,15 +332,6 @@
 
   vpx_free(cpi->mb.pip);
   cpi->mb.pip = 0;
-
-  vpx_free(cpi->twopass.total_stats);
-  cpi->twopass.total_stats = 0;
-
-  vpx_free(cpi->twopass.total_left_stats);
-  cpi->twopass.total_left_stats = 0;
-
-  vpx_free(cpi->twopass.this_frame_stats);
-  cpi->twopass.this_frame_stats = 0;
 }
 
 // Computes a q delta (in "q index" terms) to get from a starting q value
@@ -626,9 +617,6 @@
   sf->thresh_mult[THR_D63_PRED ] += speed_multiplier * 1500;
 
   sf->thresh_mult[THR_B_PRED   ] += speed_multiplier * 2500;
-#if !CONFIG_SB8X8
-  sf->thresh_mult[THR_I8X8_PRED] += speed_multiplier * 2500;
-#endif
 
   sf->thresh_mult[THR_NEWMV    ] += speed_multiplier * 1000;
   sf->thresh_mult[THR_NEWG     ] += speed_multiplier * 1000;
@@ -867,9 +855,6 @@
   }
 
   cpi->mb.quantize_b_4x4      = vp9_regular_quantize_b_4x4;
-  cpi->mb.quantize_b_4x4_pair = vp9_regular_quantize_b_4x4_pair;
-  cpi->mb.quantize_b_8x8      = vp9_regular_quantize_b_8x8;
-  cpi->mb.quantize_b_16x16    = vp9_regular_quantize_b_16x16;
 
   vp9_init_quantizer(cpi);
 
@@ -959,23 +944,6 @@
   CHECK_MEM_ERROR(cpi->mb_norm_activity_map,
                   vpx_calloc(sizeof(unsigned int),
                              cm->mb_rows * cm->mb_cols));
-
-  vpx_free(cpi->twopass.total_stats);
-
-  cpi->twopass.total_stats = vpx_calloc(1, sizeof(FIRSTPASS_STATS));
-
-  vpx_free(cpi->twopass.total_left_stats);
-  cpi->twopass.total_left_stats = vpx_calloc(1, sizeof(FIRSTPASS_STATS));
-
-  vpx_free(cpi->twopass.this_frame_stats);
-
-  cpi->twopass.this_frame_stats = vpx_calloc(1, sizeof(FIRSTPASS_STATS));
-
-  if (!cpi->twopass.total_stats ||
-      !cpi->twopass.total_left_stats ||
-      !cpi->twopass.this_frame_stats)
-    vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR,
-                       "Failed to allocate firstpass stats");
 }
 
 
@@ -1647,12 +1615,12 @@
   BFP(BLOCK_8X8, vp9_sad8x8, vp9_variance8x8, vp9_sub_pixel_variance8x8,
       NULL, NULL, NULL, vp9_sad8x8x3, vp9_sad8x8x8, vp9_sad8x8x4d)
 
-#if CONFIG_SB8X8
   BFP(BLOCK_4X8, NULL, vp9_variance4x8, NULL,
       NULL, NULL, NULL, NULL, NULL, NULL)
+
   BFP(BLOCK_8X4, NULL, vp9_variance8x4, NULL,
       NULL, NULL, NULL, NULL, NULL, NULL)
-#endif
+
   BFP(BLOCK_4X4, vp9_sad4x4, vp9_variance4x4, vp9_sub_pixel_variance4x4,
       NULL, NULL, NULL, vp9_sad4x4x3, vp9_sad4x4x8, vp9_sad4x4x4d)
 
@@ -3332,13 +3300,7 @@
     vp9_copy(cpi->common.fc.ymode_counts, cpi->ymode_count);
     vp9_copy(cpi->common.fc.uv_mode_counts, cpi->y_uv_mode_count);
     vp9_copy(cpi->common.fc.bmode_counts, cpi->bmode_count);
-#if !CONFIG_SB8X8
-    vp9_copy(cpi->common.fc.i8x8_mode_counts, cpi->i8x8_mode_count);
-#endif
     vp9_copy(cpi->common.fc.sub_mv_ref_counts, cpi->sub_mv_ref_count);
-#if !CONFIG_SB8X8
-    vp9_copy(cpi->common.fc.mbsplit_counts, cpi->mbsplit_count);
-#endif
     vp9_copy(cpi->common.fc.partition_counts, cpi->partition_count);
 #if CONFIG_COMP_INTERINTRA_PRED
     vp9_copy(cpi->common.fc.interintra_counts, cpi->interintra_count);
diff --git a/vp9/encoder/vp9_onyx_int.h b/vp9/encoder/vp9_onyx_int.h
index 9d1e984..cc91ba5 100644
--- a/vp9/encoder/vp9_onyx_int.h
+++ b/vp9/encoder/vp9_onyx_int.h
@@ -48,9 +48,9 @@
 #define KEY_FRAME_CONTEXT 5
 
 #if CONFIG_COMP_INTERINTRA_PRED
-#define MAX_MODES 54 - CONFIG_SB8X8
+#define MAX_MODES 53
 #else
-#define MAX_MODES 42 - CONFIG_SB8X8
+#define MAX_MODES 41
 #endif
 
 #define MIN_THRESHMULT  32
@@ -72,9 +72,6 @@
   // Stats
   int y_modes[VP9_YMODES];
   int uv_modes[VP9_UV_MODES];
-#if !CONFIG_SB8X8
-  int i8x8_modes[VP9_I8X8_MODES];
-#endif
   int b_modes[B_MODE_COUNT];
   int inter_y_modes[MB_MODE_COUNT];
   int inter_uv_modes[VP9_UV_MODES];
@@ -102,13 +99,7 @@
   vp9_prob ymode_prob[VP9_YMODES - 1]; /* interframe intra mode probs */
   vp9_prob uv_mode_prob[VP9_YMODES][VP9_UV_MODES - 1];
   vp9_prob bmode_prob[VP9_NKF_BINTRAMODES - 1];
-#if !CONFIG_SB8X8
-  vp9_prob i8x8_mode_prob[VP9_I8X8_MODES - 1];
-#endif
   vp9_prob sub_mv_ref_prob[SUBMVREF_COUNT][VP9_SUBMVREFS - 1];
-#if !CONFIG_SB8X8
-  vp9_prob mbsplit_prob[VP9_NUMMBSPLITS - 1];
-#endif
   vp9_prob partition_prob[NUM_PARTITION_CONTEXTS][PARTITION_TYPES - 1];
 
   vp9_prob switchable_interp_prob[VP9_SWITCHABLE_FILTERS + 1]
@@ -213,9 +204,6 @@
   THR_SPLITA,
 
   THR_B_PRED,
-#if !CONFIG_SB8X8
-  THR_I8X8_PRED,
-#endif
 
   THR_COMP_ZEROLG,
   THR_COMP_NEARESTLG,
@@ -281,19 +269,12 @@
 } SPEED_FEATURES;
 
 enum BlockSize {
-#if CONFIG_SB8X8
   BLOCK_4X4,
   BLOCK_4X8,
   BLOCK_8X4,
   BLOCK_8X8,
   BLOCK_8X16,
   BLOCK_16X8,
-#else
-  BLOCK_16X8 = PARTITIONING_16X8,
-  BLOCK_8X16 = PARTITIONING_8X16,
-  BLOCK_8X8 = PARTITIONING_8X8,
-  BLOCK_4X4 = PARTITIONING_4X4,
-#endif
   BLOCK_16X16,
   BLOCK_MAX_SEGMENTS,
   BLOCK_32X32 = BLOCK_MAX_SEGMENTS,
@@ -468,13 +449,7 @@
   int sb_ymode_count [VP9_I32X32_MODES];
   int ymode_count[VP9_YMODES];        /* intra MB type cts this frame */
   int bmode_count[VP9_NKF_BINTRAMODES];
-#if !CONFIG_SB8X8
-  int i8x8_mode_count[VP9_I8X8_MODES];
-#endif
   int sub_mv_ref_count[SUBMVREF_COUNT][VP9_SUBMVREFS];
-#if !CONFIG_SB8X8
-  int mbsplit_count[VP9_NUMMBSPLITS];
-#endif
   int y_uv_mode_count[VP9_YMODES][VP9_UV_MODES];
   unsigned int partition_count[NUM_PARTITION_CONTEXTS][PARTITION_TYPES];
 #if CONFIG_COMP_INTERINTRA_PRED
@@ -578,10 +553,10 @@
     unsigned int section_intra_rating;
     unsigned int next_iiratio;
     unsigned int this_iiratio;
-    FIRSTPASS_STATS *total_stats;
-    FIRSTPASS_STATS *this_frame_stats;
+    FIRSTPASS_STATS total_stats;
+    FIRSTPASS_STATS this_frame_stats;
     FIRSTPASS_STATS *stats_in, *stats_in_end, *stats_in_start;
-    FIRSTPASS_STATS *total_left_stats;
+    FIRSTPASS_STATS total_left_stats;
     int first_pass_done;
     int64_t bits_left;
     int64_t clip_bits_total;
diff --git a/vp9/encoder/vp9_quantize.c b/vp9/encoder/vp9_quantize.c
index fe8ba4b..4ed8f63 100644
--- a/vp9/encoder/vp9_quantize.c
+++ b/vp9/encoder/vp9_quantize.c
@@ -75,6 +75,43 @@
   *eob_ptr = eob + 1;
 }
 
+void vp9_quantize(MACROBLOCK *mb, int plane, int block, int n_coeffs,
+                  TX_TYPE tx_type) {
+  MACROBLOCKD *const xd = &mb->e_mbd;
+  const int mul = n_coeffs == 1024 ? 2 : 1;
+  const int *scan;
+
+  // These contexts may be available in the caller
+  switch (n_coeffs) {
+    case 4 * 4:
+      scan = get_scan_4x4(tx_type);
+      break;
+    case 8 * 8:
+      scan = get_scan_8x8(tx_type);
+      break;
+    case 16 * 16:
+      scan = get_scan_16x16(tx_type);
+      break;
+    default:
+      scan = vp9_default_zig_zag1d_32x32;
+      break;
+  }
+
+  quantize(mb->plane[plane].zrun_zbin_boost,
+           BLOCK_OFFSET(mb->plane[plane].coeff, block, 16),
+           n_coeffs, mb->skip_block,
+           mb->plane[plane].zbin,
+           mb->plane[plane].round,
+           mb->plane[plane].quant,
+           mb->plane[plane].quant_shift,
+           BLOCK_OFFSET(xd->plane[plane].qcoeff, block, 16),
+           BLOCK_OFFSET(xd->plane[plane].dqcoeff, block, 16),
+           xd->plane[plane].dequant,
+           mb->plane[plane].zbin_extra,
+           &xd->plane[plane].eobs[block],
+           scan, mul);
+}
+
 void vp9_regular_quantize_b_4x4(MACROBLOCK *mb, int b_idx, TX_TYPE tx_type,
                                 int y_blocks) {
   MACROBLOCKD *const xd = &mb->e_mbd;
@@ -96,162 +133,6 @@
            pt_scan, 1);
 }
 
-void vp9_regular_quantize_b_8x8(MACROBLOCK *mb, int b_idx, TX_TYPE tx_type,
-                                int y_blocks) {
-  MACROBLOCKD *const xd = &mb->e_mbd;
-  const struct plane_block_idx pb_idx = plane_block_idx(y_blocks, b_idx);
-  const int *pt_scan = get_scan_8x8(tx_type);
-
-  quantize(mb->plane[pb_idx.plane].zrun_zbin_boost,
-           BLOCK_OFFSET(mb->plane[pb_idx.plane].coeff, pb_idx.block, 16),
-           64, mb->skip_block,
-           mb->plane[pb_idx.plane].zbin,
-           mb->plane[pb_idx.plane].round,
-           mb->plane[pb_idx.plane].quant,
-           mb->plane[pb_idx.plane].quant_shift,
-           BLOCK_OFFSET(xd->plane[pb_idx.plane].qcoeff, pb_idx.block, 16),
-           BLOCK_OFFSET(xd->plane[pb_idx.plane].dqcoeff, pb_idx.block, 16),
-           xd->plane[pb_idx.plane].dequant,
-           mb->plane[pb_idx.plane].zbin_extra,
-           &xd->plane[pb_idx.plane].eobs[pb_idx.block],
-           pt_scan, 1);
-}
-
-void vp9_regular_quantize_b_16x16(MACROBLOCK *mb, int b_idx, TX_TYPE tx_type,
-                                  int y_blocks) {
-  MACROBLOCKD *const xd = &mb->e_mbd;
-  const struct plane_block_idx pb_idx = plane_block_idx(y_blocks, b_idx);
-  const int *pt_scan = get_scan_16x16(tx_type);
-
-  quantize(mb->plane[pb_idx.plane].zrun_zbin_boost,
-           BLOCK_OFFSET(mb->plane[pb_idx.plane].coeff, pb_idx.block, 16),
-           256, mb->skip_block,
-           mb->plane[pb_idx.plane].zbin,
-           mb->plane[pb_idx.plane].round,
-           mb->plane[pb_idx.plane].quant,
-           mb->plane[pb_idx.plane].quant_shift,
-           BLOCK_OFFSET(xd->plane[pb_idx.plane].qcoeff, pb_idx.block, 16),
-           BLOCK_OFFSET(xd->plane[pb_idx.plane].dqcoeff, pb_idx.block, 16),
-           xd->plane[pb_idx.plane].dequant,
-           mb->plane[pb_idx.plane].zbin_extra,
-           &xd->plane[pb_idx.plane].eobs[pb_idx.block],
-           pt_scan, 1);
-}
-
-void vp9_regular_quantize_b_32x32(MACROBLOCK *mb, int b_idx, int y_blocks) {
-  MACROBLOCKD *const xd = &mb->e_mbd;
-  const struct plane_block_idx pb_idx = plane_block_idx(y_blocks, b_idx);
-
-  quantize(mb->plane[pb_idx.plane].zrun_zbin_boost,
-           BLOCK_OFFSET(mb->plane[pb_idx.plane].coeff, pb_idx.block, 16),
-           1024, mb->skip_block,
-           mb->plane[pb_idx.plane].zbin,
-           mb->plane[pb_idx.plane].round,
-           mb->plane[pb_idx.plane].quant,
-           mb->plane[pb_idx.plane].quant_shift,
-           BLOCK_OFFSET(xd->plane[pb_idx.plane].qcoeff, pb_idx.block, 16),
-           BLOCK_OFFSET(xd->plane[pb_idx.plane].dqcoeff, pb_idx.block, 16),
-           xd->plane[pb_idx.plane].dequant,
-           mb->plane[pb_idx.plane].zbin_extra,
-           &xd->plane[pb_idx.plane].eobs[pb_idx.block],
-           vp9_default_zig_zag1d_32x32, 2);
-}
-
-void vp9_quantize_sby_32x32(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize) {
-  const int bw = 1 << (b_width_log2(bsize) - 3);
-  const int bh = 1 << (b_height_log2(bsize) - 3);
-  int n;
-
-  for (n = 0; n < bw * bh; n++)
-    vp9_regular_quantize_b_32x32(x, n * 64, bw * bh * 64);
-}
-
-void vp9_quantize_sby_16x16(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize) {
-  const int bwl = b_width_log2(bsize) - 2, bw = 1 << bwl;
-  const int bh = 1 << (b_height_log2(bsize) - 2);
-  const int bstride = 16 << bwl;
-  int n;
-
-  for (n = 0; n < bw * bh; n++) {
-    const int x_idx = n & (bw - 1), y_idx = n >> bwl;
-    TX_TYPE tx_type = get_tx_type_16x16(&x->e_mbd,
-                                        4 * x_idx + y_idx * bstride);
-    x->quantize_b_16x16(x, n * 16, tx_type, 16 * bw * bh);
-  }
-}
-
-void vp9_quantize_sby_8x8(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize) {
-  const int bwl = b_width_log2(bsize) - 1, bw = 1 << bwl;
-  const int bh = 1 << (b_height_log2(bsize) - 1);
-  const int bstride = 4 << bwl;
-  int n;
-
-  for (n = 0; n < bw * bh; n++) {
-    const int x_idx = n & (bw - 1), y_idx = n >> bwl;
-    TX_TYPE tx_type = get_tx_type_8x8(&x->e_mbd,
-                                      2 * x_idx + y_idx * bstride);
-    x->quantize_b_8x8(x, n * 4, tx_type, 4 * bw * bh);
-  }
-}
-
-void vp9_quantize_sby_4x4(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize) {
-  const int bwl = b_width_log2(bsize), bw = 1 << bwl;
-  const int bh = 1 << b_height_log2(bsize);
-  MACROBLOCKD *const xd = &x->e_mbd;
-  int n;
-
-  for (n = 0; n < bw * bh; n++) {
-    const TX_TYPE tx_type = get_tx_type_4x4(xd, n);
-    x->quantize_b_4x4(x, n, tx_type, bw * bh);
-  }
-}
-
-void vp9_quantize_sbuv_32x32(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize) {
-  assert(bsize == BLOCK_SIZE_SB64X64);
-  vp9_regular_quantize_b_32x32(x, 256, 256);
-  vp9_regular_quantize_b_32x32(x, 320, 256);
-}
-
-void vp9_quantize_sbuv_16x16(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize) {
-  const int bwl = b_width_log2(bsize) - 2;
-  const int bhl = b_height_log2(bsize) - 2;
-  const int uoff = 16 << (bhl + bwl);
-  int i;
-
-  for (i = uoff; i < ((uoff * 3) >> 1); i += 16)
-    x->quantize_b_16x16(x, i, DCT_DCT, uoff);
-}
-
-void vp9_quantize_sbuv_8x8(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize) {
-  const int bwl = b_width_log2(bsize) - 1;
-  const int bhl = b_height_log2(bsize) - 1;
-  const int uoff = 4 << (bhl + bwl);
-  int i;
-
-  for (i = uoff; i < ((uoff * 3) >> 1); i += 4)
-    x->quantize_b_8x8(x, i, DCT_DCT, uoff);
-}
-
-void vp9_quantize_sbuv_4x4(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize) {
-  const int bwl = b_width_log2(bsize);
-  const int bhl = b_height_log2(bsize);
-  const int uoff = 1 << (bhl + bwl);
-  int i;
-
-  for (i = uoff; i < ((uoff * 3) >> 1); i++)
-    x->quantize_b_4x4(x, i, DCT_DCT, uoff);
-}
-
-/* quantize_b_pair function pointer in MACROBLOCK structure is set to one of
- * these two C functions if corresponding optimized routine is not available.
- * NEON optimized version implements currently the fast quantization for pair
- * of blocks. */
-void vp9_regular_quantize_b_4x4_pair(MACROBLOCK *x, int b_idx1, int b_idx2,
-                                     int y_blocks) {
-  vp9_regular_quantize_b_4x4(x, b_idx1, DCT_DCT, y_blocks);
-  vp9_regular_quantize_b_4x4(x, b_idx2, DCT_DCT, y_blocks);
-}
-
 static void invert_quant(int16_t *quant, uint8_t *shift, int d) {
   unsigned t;
   int l;
@@ -266,6 +147,7 @@
 void vp9_init_quantizer(VP9_COMP *cpi) {
   int i;
   int quant_val;
+  int quant_uv_val;
   int q;
 
   static const int zbin_boost[16] = { 0,  0,  0,  8,  8,  8, 10, 12,
@@ -293,25 +175,26 @@
     cpi->common.uv_dequant[q][0] = quant_val;
     cpi->zrun_zbin_boost_uv[q][0] = (quant_val * zbin_boost[0]) >> 7;
 
+    quant_val = vp9_ac_quant(q, 0);
+    cpi->common.y_dequant[q][1] = quant_val;
+    quant_uv_val = vp9_ac_quant(q, cpi->common.uv_ac_delta_q);
+    cpi->common.uv_dequant[q][1] = quant_uv_val;
     // all the 4x4 ac values =;
     for (i = 1; i < 16; i++) {
       int rc = vp9_default_zig_zag1d_4x4[i];
 
-      quant_val = vp9_ac_quant(q, 0);
       invert_quant(cpi->Y1quant[q] + rc, cpi->Y1quant_shift[q] + rc, quant_val);
       cpi->Y1zbin[q][rc] = ROUND_POWER_OF_TWO(qzbin_factor * quant_val, 7);
       cpi->Y1round[q][rc] = (qrounding_factor * quant_val) >> 7;
-      cpi->common.y_dequant[q][rc] = quant_val;
       cpi->zrun_zbin_boost_y1[q][i] =
           ROUND_POWER_OF_TWO(quant_val * zbin_boost[i], 7);
 
-      quant_val = vp9_ac_quant(q, cpi->common.uv_ac_delta_q);
-      invert_quant(cpi->UVquant[q] + rc, cpi->UVquant_shift[q] + rc, quant_val);
-      cpi->UVzbin[q][rc] = ROUND_POWER_OF_TWO(qzbin_factor * quant_val, 7);
-      cpi->UVround[q][rc] = (qrounding_factor * quant_val) >> 7;
-      cpi->common.uv_dequant[q][rc] = quant_val;
+      invert_quant(cpi->UVquant[q] + rc, cpi->UVquant_shift[q] + rc,
+                   quant_uv_val);
+      cpi->UVzbin[q][rc] = ROUND_POWER_OF_TWO(qzbin_factor * quant_uv_val, 7);
+      cpi->UVround[q][rc] = (qrounding_factor * quant_uv_val) >> 7;
       cpi->zrun_zbin_boost_uv[q][i] =
-          ROUND_POWER_OF_TWO(quant_val * zbin_boost[i], 7);
+          ROUND_POWER_OF_TWO(quant_uv_val * zbin_boost[i], 7);
     }
   }
 }
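
The new vp9_quantize() entry point keys everything off the coefficient count: it selects the scan order (4x4, 8x8, 16x16, or the 32x32 zig-zag) and applies the doubled dequant multiplier only to 1024-coefficient blocks. The intended call pattern, matching xform_quant() in vp9_encodemb.c, is simply:

/* Sketch: quantize one transform block the way xform_quant() does.
 * 16 << ss_txfrm_size yields 16/64/256/1024 coefficients, which is all
 * vp9_quantize() needs to choose the scan and the 32x32 multiplier. */
static void quantize_block_sketch(MACROBLOCK *x, int plane, int block,
                                  int ss_txfrm_size, TX_TYPE tx_type) {
  vp9_quantize(x, plane, block, 16 << ss_txfrm_size, tx_type);
}
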
diff --git a/vp9/encoder/vp9_quantize.h b/vp9/encoder/vp9_quantize.h
index fb74cbd..2b1eeab 100644
--- a/vp9/encoder/vp9_quantize.h
+++ b/vp9/encoder/vp9_quantize.h
@@ -22,26 +22,15 @@
 #define prototype_quantize_mb(sym) \
   void (sym)(MACROBLOCK *x)
 
+void vp9_quantize(MACROBLOCK *mb, int plane, int block, int n_coefs,
+                  TX_TYPE tx_type);
+
 void vp9_regular_quantize_b_4x4_pair(MACROBLOCK *mb, int b_idx1, int b_idx2,
                                      int y_blocks);
 void vp9_regular_quantize_b_4x4(MACROBLOCK *mb, int b_idx, TX_TYPE tx_type,
                                 int y_blocks);
 void vp9_regular_quantize_b_8x8(MACROBLOCK *mb, int b_idx, TX_TYPE tx_type,
                                 int y_blocks);
-void vp9_regular_quantize_b_16x16(MACROBLOCK *mb, int b_idx, TX_TYPE tx_type,
-                                  int y_blocks);
-void vp9_regular_quantize_b_32x32(MACROBLOCK *mb, int b_idx,
-                                  int y_blocks);
-
-void vp9_quantize_sby_32x32(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize);
-void vp9_quantize_sby_16x16(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize);
-void vp9_quantize_sby_8x8(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize);
-void vp9_quantize_sby_4x4(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize);
-void vp9_quantize_sbuv_32x32(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize);
-void vp9_quantize_sbuv_16x16(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize);
-void vp9_quantize_sbuv_8x8(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize);
-void vp9_quantize_sbuv_4x4(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize);
-
 struct VP9_COMP;
 
 extern void vp9_set_quantizer(struct VP9_COMP *cpi, int Q);
diff --git a/vp9/encoder/vp9_ratectrl.c b/vp9/encoder/vp9_ratectrl.c
index 42d339d..0f84b1a 100644
--- a/vp9/encoder/vp9_ratectrl.c
+++ b/vp9/encoder/vp9_ratectrl.c
@@ -138,13 +138,7 @@
   vp9_copy(cc->sb_ymode_prob, cm->fc.sb_ymode_prob);
   vp9_copy(cc->bmode_prob, cm->fc.bmode_prob);
   vp9_copy(cc->uv_mode_prob, cm->fc.uv_mode_prob);
-#if !CONFIG_SB8X8
-  vp9_copy(cc->i8x8_mode_prob, cm->fc.i8x8_mode_prob);
-#endif
   vp9_copy(cc->sub_mv_ref_prob, cm->fc.sub_mv_ref_prob);
-#if !CONFIG_SB8X8
-  vp9_copy(cc->mbsplit_prob, cm->fc.mbsplit_prob);
-#endif
   vp9_copy(cc->partition_prob, cm->fc.partition_prob);
 
   // Stats
@@ -202,14 +196,8 @@
   vp9_copy(cm->fc.ymode_prob, cc->ymode_prob);
   vp9_copy(cm->fc.sb_ymode_prob, cc->sb_ymode_prob);
   vp9_copy(cm->fc.bmode_prob, cc->bmode_prob);
-#if !CONFIG_SB8X8
-  vp9_copy(cm->fc.i8x8_mode_prob, cc->i8x8_mode_prob);
-#endif
   vp9_copy(cm->fc.uv_mode_prob, cc->uv_mode_prob);
   vp9_copy(cm->fc.sub_mv_ref_prob, cc->sub_mv_ref_prob);
-#if !CONFIG_SB8X8
-  vp9_copy(cm->fc.mbsplit_prob, cc->mbsplit_prob);
-#endif
   vp9_copy(cm->fc.partition_prob, cc->partition_prob);
 
   // Stats
diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c
index cf4b1e8..90d56b2 100644
--- a/vp9/encoder/vp9_rdopt.c
+++ b/vp9/encoder/vp9_rdopt.c
@@ -13,8 +13,8 @@
 #include <math.h>
 #include <limits.h>
 #include <assert.h>
-#include "vp9/common/vp9_pragmas.h"
 
+#include "vp9/common/vp9_pragmas.h"
 #include "vp9/encoder/vp9_tokenize.h"
 #include "vp9/encoder/vp9_treewriter.h"
 #include "vp9/encoder/vp9_onyx_int.h"
@@ -34,7 +34,6 @@
 #include "vpx_mem/vpx_mem.h"
 #include "vp9/common/vp9_systemdependent.h"
 #include "vp9/encoder/vp9_encodemv.h"
-
 #include "vp9/common/vp9_seg_common.h"
 #include "vp9/common/vp9_pred_common.h"
 #include "vp9/common/vp9_entropy.h"
@@ -42,8 +41,6 @@
 #include "vp9/common/vp9_mvref_common.h"
 #include "vp9/common/vp9_common.h"
 
-#define MAXF(a,b)            (((a) > (b)) ? (a) : (b))
-
 #define INVALID_MV 0x80008000
 
 /* Factor to weigh the rate for switchable interp filters */
@@ -105,9 +102,6 @@
   {SPLITMV,   ALTREF_FRAME, NONE},
 
   {I4X4_PRED,    INTRA_FRAME,  NONE},
-#if !CONFIG_SB8X8
-  {I8X8_PRED, INTRA_FRAME,  NONE},
-#endif
 
   /* compound prediction modes */
   {ZEROMV,    LAST_FRAME,   GOLDEN_FRAME},
@@ -157,11 +151,9 @@
   for (i = 0; i < BLOCK_TYPES; i++)
     for (j = 0; j < REF_TYPES; j++)
       for (k = 0; k < COEF_BANDS; k++)
-        for (l = 0; l < PREV_COEF_CONTEXTS; l++) {
-          vp9_cost_tokens_skip((int *)(c[i][j][k][l]),
-                               p[i][j][k][l],
+        for (l = 0; l < PREV_COEF_CONTEXTS; l++)
+          vp9_cost_tokens_skip((int *)c[i][j][k][l], p[i][j][k][l],
                                vp9_coef_tree);
-        }
 }
 
 static int rd_iifactor[32] =  { 4, 4, 3, 2, 1, 0, 0, 0,
@@ -184,7 +176,7 @@
   for (i = 0; i < QINDEX_RANGE; i++) {
     sad_per_bit16lut[i] =
       (int)((0.0418 * vp9_convert_qindex_to_q(i)) + 2.4107);
-    sad_per_bit4lut[i] = (int)((0.063 * vp9_convert_qindex_to_q(i)) + 2.742);
+    sad_per_bit4lut[i] = (int)(0.063 * vp9_convert_qindex_to_q(i) + 2.742);
   }
 }
 
@@ -208,7 +200,7 @@
   // for key frames, golden frames and arf frames.
   // if (cpi->common.refresh_golden_frame ||
   //     cpi->common.refresh_alt_ref_frame)
-  qindex = (qindex < 0) ? 0 : ((qindex > MAXQ) ? MAXQ : qindex);
+  qindex = clamp(qindex, 0, MAXQ);
 
   cpi->RDMULT = compute_rd_mult(qindex);
   if (cpi->pass == 2 && (cpi->common.frame_type != KEY_FRAME)) {
@@ -293,7 +285,7 @@
 }
 
 static INLINE int cost_coeffs(VP9_COMMON *const cm, MACROBLOCK *mb,
-                              int ib, PLANE_TYPE type,
+                              int plane, int block, PLANE_TYPE type,
                               ENTROPY_CONTEXT *A,
                               ENTROPY_CONTEXT *L,
                               TX_SIZE tx_size,
@@ -304,10 +296,9 @@
   int c = 0;
   int cost = 0, pad;
   const int *scan, *nb;
-  const struct plane_block_idx pb_idx = plane_block_idx(y_blocks, ib);
-  const int eob = xd->plane[pb_idx.plane].eobs[pb_idx.block];
-  const int16_t *qcoeff_ptr = BLOCK_OFFSET(xd->plane[pb_idx.plane].qcoeff,
-                                           pb_idx.block, 16);
+  const int eob = xd->plane[plane].eobs[block];
+  const int16_t *qcoeff_ptr = BLOCK_OFFSET(xd->plane[plane].qcoeff,
+                                           block, 16);
   const int ref = mbmi->ref_frame != INTRA_FRAME;
   unsigned int (*token_costs)[PREV_COEF_CONTEXTS][MAX_ENTROPY_TOKENS] =
       mb->token_costs[tx_size][type][ref];
@@ -334,7 +325,7 @@
 #endif
 
   // Check for consistency of tx_size with mode info
-  assert((!type && !pb_idx.plane) || (type && pb_idx.plane));
+  assert((!type && !plane) || (type && plane));
   if (type == PLANE_TYPE_Y_WITH_DC) {
     assert(xd->mode_info_context->mbmi.txfm_size == tx_size);
   } else {
@@ -345,7 +336,7 @@
   switch (tx_size) {
     case TX_4X4: {
       tx_type = (type == PLANE_TYPE_Y_WITH_DC) ?
-          get_tx_type_4x4(xd, ib) : DCT_DCT;
+          get_tx_type_4x4(xd, block) : DCT_DCT;
       above_ec = A[0] != 0;
       left_ec = L[0] != 0;
       coef_probs = cm->fc.coef_probs_4x4;
@@ -359,7 +350,7 @@
     case TX_8X8: {
       const BLOCK_SIZE_TYPE sb_type = xd->mode_info_context->mbmi.sb_type;
       const int sz = 1 + b_width_log2(sb_type);
-      const int x = ib & ((1 << sz) - 1), y = ib - x;
+      const int x = block & ((1 << sz) - 1), y = block - x;
       TX_TYPE tx_type = (type == PLANE_TYPE_Y_WITH_DC) ?
           get_tx_type_8x8(xd, y + (x >> 1)) : DCT_DCT;
       above_ec = (A[0] + A[1]) != 0;
@@ -375,7 +366,7 @@
     case TX_16X16: {
       const BLOCK_SIZE_TYPE sb_type = xd->mode_info_context->mbmi.sb_type;
       const int sz = 2 + b_width_log2(sb_type);
-      const int x = ib & ((1 << sz) - 1), y = ib - x;
+      const int x = block & ((1 << sz) - 1), y = block - x;
       TX_TYPE tx_type = (type == PLANE_TYPE_Y_WITH_DC) ?
           get_tx_type_16x16(xd, y + (x >> 2)) : DCT_DCT;
       scan = get_scan_16x16(tx_type);
@@ -608,16 +599,17 @@
 
   for (i = 0; i < block_size; i++) {
     int this_diff = coeff[i] - dqcoeff[i];
-    error += this_diff * this_diff;
+    error += (unsigned)this_diff * this_diff;
   }
   error >>= shift;
 
   return error > INT_MAX ? INT_MAX : (int)error;
 }
 
-static int block_error_sby(MACROBLOCK *x, int block_size, int shift) {
+static int block_error_sby(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize, int shift) {
+  const int bwl = b_width_log2(bsize), bhl = b_height_log2(bsize);
   return block_error(x->plane[0].coeff, x->e_mbd.plane[0].dqcoeff,
-                     block_size, shift);
+                     16 << (bwl + bhl), shift);
 }
 
 static int block_error_sbuv(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize, int shift) {
@@ -635,155 +627,54 @@
   return sum > INT_MAX ? INT_MAX : (int)sum;
 }
 
-static int rdcost_sby_4x4(VP9_COMMON *const cm, MACROBLOCK *x,
-                          BLOCK_SIZE_TYPE bsize) {
-  const int bwl = b_width_log2(bsize), bw = 1 << bwl;
-  const int bh = 1 << b_height_log2(bsize);
-  int cost = 0, b;
+static int rdcost_plane(VP9_COMMON *const cm, MACROBLOCK *x,
+                        int plane, BLOCK_SIZE_TYPE bsize, TX_SIZE tx_size) {
   MACROBLOCKD *const xd = &x->e_mbd;
+  const int bwl = b_width_log2(bsize) - xd->plane[plane].subsampling_x;
+  const int bhl = b_height_log2(bsize) - xd->plane[plane].subsampling_y;
+  const int bw = 1 << bwl, bh = 1 << bhl;
   ENTROPY_CONTEXT t_above[16], t_left[16];
+  int block, cost;
 
-  vpx_memcpy(&t_above, xd->plane[0].above_context,
+  vpx_memcpy(&t_above, xd->plane[plane].above_context,
              sizeof(ENTROPY_CONTEXT) * bw);
-  vpx_memcpy(&t_left,  xd->plane[0].left_context,
+  vpx_memcpy(&t_left,  xd->plane[plane].left_context,
              sizeof(ENTROPY_CONTEXT) * bh);
 
-  for (b = 0; b < bw * bh; b++) {
-    const int x_idx = b & (bw - 1), y_idx = b >> bwl;
-    cost += cost_coeffs(cm, x, b, PLANE_TYPE_Y_WITH_DC,
+  cost = 0;
+  for (block = 0; block < bw * bh; block += 1 << (tx_size * 2)) {
+    int x_idx, y_idx;
+
+    txfrm_block_to_raster_xy(xd, bsize, plane, block, tx_size * 2,
+                             &x_idx, &y_idx);
+
+    cost += cost_coeffs(cm, x, plane, block, xd->plane[plane].plane_type,
                         t_above + x_idx, t_left + y_idx,
-                        TX_4X4, bw * bh);
+                        tx_size, bw * bh);
   }
 
   return cost;
 }
 
-static void super_block_yrd_4x4(VP9_COMMON *const cm, MACROBLOCK *x,
-                                int *rate, int *distortion, int *skippable,
-                                BLOCK_SIZE_TYPE bsize) {
-  const int bwl = b_width_log2(bsize), bhl = b_height_log2(bsize);
-  MACROBLOCKD *const xd = &x->e_mbd;
+static int rdcost_uv(VP9_COMMON *const cm, MACROBLOCK *x,
+                     BLOCK_SIZE_TYPE bsize, TX_SIZE tx_size) {
+  int cost = 0, plane;
 
-  xd->mode_info_context->mbmi.txfm_size = TX_4X4;
-  vp9_transform_sby_4x4(x, bsize);
-  vp9_quantize_sby_4x4(x, bsize);
-
-  *distortion = block_error_sby(x, 16 << (bwl + bhl), 2);
-  *rate       = rdcost_sby_4x4(cm, x, bsize);
-  *skippable  = vp9_sby_is_skippable(xd, bsize);
-}
-
-static int rdcost_sby_8x8(VP9_COMMON *const cm, MACROBLOCK *x,
-                          BLOCK_SIZE_TYPE bsize) {
-  const int bwl = b_width_log2(bsize) - 1, bw = 1 << bwl;
-  const int bh = 1 << (b_height_log2(bsize) - 1);
-  int cost = 0, b;
-  MACROBLOCKD *const xd = &x->e_mbd;
-  ENTROPY_CONTEXT t_above[16], t_left[16];
-
-  vpx_memcpy(&t_above, xd->plane[0].above_context,
-             sizeof(ENTROPY_CONTEXT) * 2 * bw);
-  vpx_memcpy(&t_left,  xd->plane[0].left_context,
-             sizeof(ENTROPY_CONTEXT) * 2 * bh);
-
-  for (b = 0; b < bw * bh; b++) {
-    const int x_idx = b & (bw - 1), y_idx = b >> bwl;
-    cost += cost_coeffs(cm, x, b * 4, PLANE_TYPE_Y_WITH_DC,
-                        t_above + x_idx * 2, t_left + y_idx * 2,
-                        TX_8X8, 4 * bw * bh);
+  for (plane = 1; plane < MAX_MB_PLANE; plane++) {
+    cost += rdcost_plane(cm, x, plane, bsize, tx_size);
   }
-
   return cost;
 }
 
-static void super_block_yrd_8x8(VP9_COMMON *const cm, MACROBLOCK *x,
-                                int *rate, int *distortion, int *skippable,
-                                BLOCK_SIZE_TYPE bsize) {
-  const int bwl = b_width_log2(bsize), bhl = b_height_log2(bsize);
+static void super_block_yrd_for_txfm(VP9_COMMON *const cm, MACROBLOCK *x,
+                                     int *rate, int *distortion, int *skippable,
+                                     BLOCK_SIZE_TYPE bsize, TX_SIZE tx_size) {
   MACROBLOCKD *const xd = &x->e_mbd;
+  xd->mode_info_context->mbmi.txfm_size = tx_size;
+  vp9_xform_quant_sby(cm, x, bsize);
 
-  xd->mode_info_context->mbmi.txfm_size = TX_8X8;
-  vp9_transform_sby_8x8(x, bsize);
-  vp9_quantize_sby_8x8(x, bsize);
-
-  *distortion = block_error_sby(x, 16 << (bhl + bwl), 2);
-  *rate       = rdcost_sby_8x8(cm, x, bsize);
-  *skippable  = vp9_sby_is_skippable(xd, bsize);
-}
-
-static int rdcost_sby_16x16(VP9_COMMON *const cm, MACROBLOCK *x,
-                            BLOCK_SIZE_TYPE bsize) {
-  const int bwl = b_width_log2(bsize) - 2, bw = 1 << bwl;
-  const int bh = 1 << (b_height_log2(bsize) - 2);
-  int cost = 0, b;
-  MACROBLOCKD *const xd = &x->e_mbd;
-  ENTROPY_CONTEXT t_above[16], t_left[16];
-
-  vpx_memcpy(&t_above, xd->plane[0].above_context,
-             sizeof(ENTROPY_CONTEXT) * 4 * bw);
-  vpx_memcpy(&t_left,  xd->plane[0].left_context,
-             sizeof(ENTROPY_CONTEXT) * 4 * bh);
-
-  for (b = 0; b < bw * bh; b++) {
-    const int x_idx = b & (bw - 1), y_idx = b >> bwl;
-    cost += cost_coeffs(cm, x, b * 16, PLANE_TYPE_Y_WITH_DC,
-                        t_above + x_idx * 4, t_left + y_idx * 4,
-                        TX_16X16, bw * bh * 16);
-  }
-
-  return cost;
-}
-
-static void super_block_yrd_16x16(VP9_COMMON *const cm, MACROBLOCK *x,
-                                  int *rate, int *distortion, int *skippable,
-                                  BLOCK_SIZE_TYPE bsize) {
-  const int bwl = b_width_log2(bsize), bhl = b_height_log2(bsize);
-  MACROBLOCKD *const xd = &x->e_mbd;
-
-  xd->mode_info_context->mbmi.txfm_size = TX_16X16;
-  vp9_transform_sby_16x16(x, bsize);
-  vp9_quantize_sby_16x16(x, bsize);
-
-  *distortion = block_error_sby(x, 16 << (bwl + bhl), 2);
-  *rate       = rdcost_sby_16x16(cm, x, bsize);
-  *skippable  = vp9_sby_is_skippable(xd, bsize);
-}
-
-static int rdcost_sby_32x32(VP9_COMMON *const cm, MACROBLOCK *x,
-                            BLOCK_SIZE_TYPE bsize) {
-  const int bwl = b_width_log2(bsize) - 3, bw = 1 << bwl;
-  const int bh = 1 << (b_height_log2(bsize) - 3);
-  int cost = 0, b;
-  MACROBLOCKD * const xd = &x->e_mbd;
-  ENTROPY_CONTEXT t_above[16], t_left[16];
-
-  vpx_memcpy(&t_above, xd->plane[0].above_context,
-             sizeof(ENTROPY_CONTEXT) * 8 * bw);
-  vpx_memcpy(&t_left,  xd->plane[0].left_context,
-             sizeof(ENTROPY_CONTEXT) * 8 * bh);
-
-  for (b = 0; b < bw * bh; b++) {
-    const int x_idx = b & (bw - 1), y_idx = b >> bwl;
-    cost += cost_coeffs(cm, x, b * 64, PLANE_TYPE_Y_WITH_DC,
-                        t_above + x_idx * 8, t_left + y_idx * 8,
-                        TX_32X32, bw * bh * 64);
-  }
-
-  return cost;
-}
-
-static void super_block_yrd_32x32(VP9_COMMON *const cm, MACROBLOCK *x,
-                                  int *rate, int *distortion, int *skippable,
-                                  BLOCK_SIZE_TYPE bsize) {
-  const int bwl = b_width_log2(bsize), bhl = b_height_log2(bsize);
-  MACROBLOCKD *const xd = &x->e_mbd;
-
-  xd->mode_info_context->mbmi.txfm_size = TX_32X32;
-  vp9_transform_sby_32x32(x, bsize);
-  vp9_quantize_sby_32x32(x, bsize);
-
-  *distortion = block_error_sby(x, 16 << (bwl + bhl), 0);
-  *rate       = rdcost_sby_32x32(cm, x, bsize);
+  *distortion = block_error_sby(x, bsize, tx_size == TX_32X32 ? 0 : 2);
+  *rate       = rdcost_plane(cm, x, 0, bsize, tx_size);
   *skippable  = vp9_sby_is_skippable(xd, bsize);
 }
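
rdcost_plane above walks a plane in 4x4 units, advancing 1 << (tx_size * 2) units per transform block so one cost_coeffs() call covers each transform; a hypothetical standalone helper mirroring just that loop structure (count_tx_blocks and the local enum are stand-ins, not vp9 API):

#include <stdio.h>

/* 4x4 units per transform: a transform of size n covers 1 << (2 * n) units. */
enum { TX_4X4 = 0, TX_8X8 = 1, TX_16X16 = 2, TX_32X32 = 3 };

static int count_tx_blocks(int bwl, int bhl, int tx_size) {
  const int bw = 1 << bwl, bh = 1 << bhl;
  int block, n = 0;
  for (block = 0; block < bw * bh; block += 1 << (tx_size * 2))
    ++n;  /* one cost_coeffs() call per transform block in the real loop */
  return n;
}

int main(void) {
  /* A 64x64 luma block is 16x16 4x4 units: 256 / 64 = 4 32x32 transforms. */
  printf("%d\n", count_tx_blocks(4, 4, TX_32X32));
  return 0;
}
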
 
@@ -797,20 +688,19 @@
   vp9_subtract_sby(x, bs);
 
   if (bs >= BLOCK_SIZE_SB32X32)
-    super_block_yrd_32x32(cm, x, &r[TX_32X32][0], &d[TX_32X32], &s[TX_32X32],
-                          bs);
+    super_block_yrd_for_txfm(cm, x, &r[TX_32X32][0], &d[TX_32X32], &s[TX_32X32],
+                             bs, TX_32X32);
   if (bs >= BLOCK_SIZE_MB16X16)
-    super_block_yrd_16x16(cm, x, &r[TX_16X16][0], &d[TX_16X16], &s[TX_16X16],
-                          bs);
-  super_block_yrd_8x8(cm, x,   &r[TX_8X8][0],   &d[TX_8X8],   &s[TX_8X8],   bs);
-  super_block_yrd_4x4(cm, x,   &r[TX_4X4][0],   &d[TX_4X4],   &s[TX_4X4],   bs);
+    super_block_yrd_for_txfm(cm, x, &r[TX_16X16][0], &d[TX_16X16], &s[TX_16X16],
+                             bs, TX_16X16);
+  super_block_yrd_for_txfm(cm, x, &r[TX_8X8][0], &d[TX_8X8], &s[TX_8X8], bs,
+                           TX_8X8);
+  super_block_yrd_for_txfm(cm, x, &r[TX_4X4][0], &d[TX_4X4], &s[TX_4X4], bs,
+                           TX_4X4);
 
   choose_txfm_size_from_rd(cpi, x, r, rate, d, distortion, s, skip, txfm_cache,
                            TX_32X32 - (bs < BLOCK_SIZE_SB32X32)
-#if CONFIG_SB8X8
-                           - (bs < BLOCK_SIZE_MB16X16)
-#endif
-                           );
+                           - (bs < BLOCK_SIZE_MB16X16));
 }
 
 static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int ib,
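
The last argument to choose_txfm_size_from_rd above derives the largest transform to consider arithmetically from the block size: each failed size comparison steps the cap down one TX_SIZE. A small sketch of that mapping, with local stand-in enums in place of the vp9 ones:

#include <assert.h>

enum { TX_4X4, TX_8X8, TX_16X16, TX_32X32 };
enum { BLOCK_SIZE_SB8X8, BLOCK_SIZE_MB16X16,
       BLOCK_SIZE_SB32X32, BLOCK_SIZE_SB64X64 };

static int max_txfm_size(int bs) {
  return TX_32X32 - (bs < BLOCK_SIZE_SB32X32) - (bs < BLOCK_SIZE_MB16X16);
}

int main(void) {
  assert(max_txfm_size(BLOCK_SIZE_SB64X64) == TX_32X32);
  assert(max_txfm_size(BLOCK_SIZE_MB16X16) == TX_16X16);
  assert(max_txfm_size(BLOCK_SIZE_SB8X8) == TX_8X8);
  return 0;
}
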
@@ -828,39 +718,23 @@
   const int src_stride = x->plane[0].src.stride;
   uint8_t* const src =
       raster_block_offset_uint8(xd,
-#if CONFIG_SB8X8
                                 BLOCK_SIZE_SB8X8,
-#else
-                                BLOCK_SIZE_MB16X16,
-#endif
                                 0, ib,
                                 x->plane[0].src.buf, src_stride);
   int16_t* const src_diff =
       raster_block_offset_int16(xd,
-#if CONFIG_SB8X8
                                 BLOCK_SIZE_SB8X8,
-#else
-                                BLOCK_SIZE_MB16X16,
-#endif
                                 0, ib,
                                 x->plane[0].src_diff);
   int16_t* const diff =
       raster_block_offset_int16(xd,
-#if CONFIG_SB8X8
                                 BLOCK_SIZE_SB8X8,
-#else
-                                BLOCK_SIZE_MB16X16,
-#endif
                                 0, ib,
                                 xd->plane[0].diff);
   int16_t* const coeff = BLOCK_OFFSET(x->plane[0].coeff, ib, 16);
   uint8_t* const dst =
       raster_block_offset_uint8(xd,
-#if CONFIG_SB8X8
                                 BLOCK_SIZE_SB8X8,
-#else
-                                BLOCK_SIZE_MB16X16,
-#endif
                                 0, ib,
                                 xd->plane[0].dst.buf, xd->plane[0].dst.stride);
   ENTROPY_CONTEXT ta = *a, tempa = *a;
@@ -874,7 +748,7 @@
    * */
   DECLARE_ALIGNED_ARRAY(16, int16_t, best_dqcoeff, 16);
 
-  assert(ib < (16 >> (2 * CONFIG_SB8X8)));
+  assert(ib < 4);
 #if CONFIG_NEWBINTRAMODES
   xd->mode_info_context->bmi[ib].as_mode.context =
     vp9_find_bpred_context(xd, ib, dst, xd->plane[0].dst.stride);
@@ -902,25 +776,27 @@
     rate = bmode_costs[mode];
 #endif
 
-    vp9_intra4x4_predict(xd, ib, mode, dst, xd->plane[0].dst.stride);
-    vp9_subtract_block(4, 4, src_diff, 16 >> CONFIG_SB8X8,
+    vp9_intra4x4_predict(xd, ib,
+                         BLOCK_SIZE_SB8X8,
+                         mode, dst, xd->plane[0].dst.stride);
+    vp9_subtract_block(4, 4, src_diff, 8,
                        src, src_stride,
                        dst, xd->plane[0].dst.stride);
 
     xd->mode_info_context->bmi[ib].as_mode.first = mode;
     tx_type = get_tx_type_4x4(xd, ib);
     if (tx_type != DCT_DCT) {
-      vp9_short_fht4x4(src_diff, coeff, 16 >> CONFIG_SB8X8, tx_type);
+      vp9_short_fht4x4(src_diff, coeff, 8, tx_type);
       x->quantize_b_4x4(x, ib, tx_type, 16);
     } else {
-      x->fwd_txm4x4(src_diff, coeff, 32 >> CONFIG_SB8X8);
+      x->fwd_txm4x4(src_diff, coeff, 16);
       x->quantize_b_4x4(x, ib, tx_type, 16);
     }
 
     tempa = ta;
     templ = tl;
 
-    ratey = cost_coeffs(cm, x, ib,
+    ratey = cost_coeffs(cm, x, 0, ib,
                         PLANE_TYPE_Y_WITH_DC, &tempa, &templ, TX_4X4, 16);
     rate += ratey;
     distortion = vp9_block_error(coeff,
@@ -946,13 +822,15 @@
 
   // inverse transform
   if (best_tx_type != DCT_DCT)
-    vp9_short_iht4x4(best_dqcoeff, diff, 16 >> CONFIG_SB8X8, best_tx_type);
+    vp9_short_iht4x4(best_dqcoeff, diff, 8, best_tx_type);
   else
-    xd->inv_txm4x4(best_dqcoeff, diff, 32 >> CONFIG_SB8X8);
+    xd->inv_txm4x4(best_dqcoeff, diff, 16);
 
-  vp9_intra4x4_predict(xd, ib, *best_mode,
+  vp9_intra4x4_predict(xd, ib,
+                       BLOCK_SIZE_SB8X8,
+                       *best_mode,
                        dst, xd->plane[0].dst.stride);
-  vp9_recon_b(dst, diff,
+  vp9_recon_b(dst, diff, 8,
               dst, xd->plane[0].dst.stride);
 
   return best_rd;
@@ -967,7 +845,7 @@
   int distortion = 0;
   int tot_rate_y = 0;
   int64_t total_rd = 0;
-  ENTROPY_CONTEXT t_above[4 >> CONFIG_SB8X8], t_left[4 >> CONFIG_SB8X8];
+  ENTROPY_CONTEXT t_above[2], t_left[2];
   int *bmode_costs;
 
   vpx_memcpy(t_above, xd->plane[0].above_context, sizeof(t_above));
@@ -976,8 +854,8 @@
   xd->mode_info_context->mbmi.mode = I4X4_PRED;
   bmode_costs = mb->inter_bmode_costs;
 
-  for (i = 0; i < (16 >> (2 * CONFIG_SB8X8)); i++) {
-    const int x_idx = i & (3 >> CONFIG_SB8X8), y_idx = i >> (2 >> CONFIG_SB8X8);
+  for (i = 0; i < 4; i++) {
+    const int x_idx = i & 1, y_idx = i >> 1;
     MODE_INFO *const mic = xd->mode_info_context;
     const int mis = xd->mode_info_stride;
     B_PREDICTION_MODE UNINITIALIZED_IS_SAFE(best_mode);
@@ -985,11 +863,7 @@
 #if CONFIG_NEWBINTRAMODES
     uint8_t* const dst =
         raster_block_offset_uint8(xd,
-#if CONFIG_SB8X8
                                   BLOCK_SIZE_SB8X8,
-#else
-                                  BLOCK_SIZE_MB16X16,
-#endif
                                   0, i,
                                   xd->plane[0].dst.buf,
                                   xd->plane[0].dst.stride);
@@ -1087,405 +961,16 @@
   return best_rd;
 }
 
-#if !CONFIG_SB8X8
-static int64_t rd_pick_intra8x8block(VP9_COMP *cpi, MACROBLOCK *x, int ib,
-                                     B_PREDICTION_MODE *best_mode,
-                                     int *mode_costs,
-                                     ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l,
-                                     int *bestrate, int *bestratey,
-                                     int *bestdistortion) {
-  VP9_COMMON *const cm = &cpi->common;
-  MB_PREDICTION_MODE mode;
-  MACROBLOCKD *xd = &x->e_mbd;
-  int64_t best_rd = INT64_MAX;
-  int distortion = 0, rate = 0;
-  ENTROPY_CONTEXT ta[2], tl[2], ta_temp[2], tl_temp[2];
-  // perform transformation of dimension 8x8
-  // note the input and output index mapping
-  int idx = (ib & 0x02) ? (ib + 2) : ib;
-  const int src_stride = x->plane[0].src.stride;
-  uint8_t* const src =
-      raster_block_offset_uint8(xd, BLOCK_SIZE_MB16X16, 0, ib,
-                                x->plane[0].src.buf, src_stride);
-  int16_t* const src_diff =
-      raster_block_offset_int16(xd, BLOCK_SIZE_MB16X16, 0, ib,
-                                x->plane[0].src_diff);
-  int16_t* const coeff = BLOCK_OFFSET(x->plane[0].coeff, idx, 16);
-  uint8_t* const dst =
-      raster_block_offset_uint8(xd, BLOCK_SIZE_MB16X16, 0, ib,
-                                xd->plane[0].dst.buf, xd->plane[0].dst.stride);
-
-  assert(ib < 16);
-  vpx_memcpy(ta, a, sizeof(ta));
-  vpx_memcpy(tl, l, sizeof(tl));
-  for (mode = DC_PRED; mode <= TM_PRED; mode++) {
-    int64_t this_rd;
-    int rate_t = 0;
-
-    // FIXME rate for compound mode and second intrapred mode
-    rate = mode_costs[mode];
-    xd->mode_info_context->bmi[ib].as_mode.first = mode;
-
-    vp9_intra8x8_predict(xd, ib, mode, dst, xd->plane[0].dst.stride);
-
-    vp9_subtract_block(8, 8, src_diff, 16,
-                       src, src_stride,
-                       dst, xd->plane[0].dst.stride);
-
-    vpx_memcpy(ta_temp, ta, sizeof(ta));
-    vpx_memcpy(tl_temp, tl, sizeof(tl));
-
-    if (xd->mode_info_context->mbmi.txfm_size == TX_8X8) {
-      TX_TYPE tx_type = get_tx_type_8x8(xd, ib);
-      if (tx_type != DCT_DCT)
-        vp9_short_fht8x8(src_diff, coeff, 16, tx_type);
-      else
-        x->fwd_txm8x8(src_diff, coeff, 32);
-      x->quantize_b_8x8(x, idx, tx_type, 16);
-
-      // compute quantization mse of 8x8 block
-      distortion = vp9_block_error_c(coeff,
-          BLOCK_OFFSET(xd->plane[0].dqcoeff, idx, 16), 64);
-
-      rate_t = cost_coeffs(cm, x, idx, PLANE_TYPE_Y_WITH_DC,
-                           ta_temp, tl_temp, TX_8X8, 16);
-
-      rate += rate_t;
-    } else {
-      static const int iblock[4] = {0, 1, 4, 5};
-      TX_TYPE tx_type;
-      int i;
-
-      distortion = 0;
-      rate_t = 0;
-      for (i = 0; i < 4; ++i) {
-        int16_t* const src_diff =
-            raster_block_offset_int16(xd, BLOCK_SIZE_MB16X16,
-                                      0, ib + iblock[i],
-                                      x->plane[0].src_diff);
-        int16_t* const coeff = BLOCK_OFFSET(x->plane[0].coeff,
-                                            ib + iblock[i], 16);
-        int do_two = 0;
-        tx_type = get_tx_type_4x4(xd, ib + iblock[i]);
-        if (tx_type != DCT_DCT) {
-          vp9_short_fht4x4(src_diff, coeff, 16, tx_type);
-          x->quantize_b_4x4(x, ib + iblock[i], tx_type, 16);
-        } else if (!(i & 1) &&
-                   get_tx_type_4x4(xd, ib + iblock[i] + 1) == DCT_DCT) {
-          x->fwd_txm8x4(src_diff, coeff, 32);
-          x->quantize_b_4x4_pair(x, ib + iblock[i], ib + iblock[i] + 1, 16);
-          do_two = 1;
-        } else {
-          x->fwd_txm4x4(src_diff, coeff, 32);
-          x->quantize_b_4x4(x, ib + iblock[i], tx_type, 16);
-        }
-        distortion += vp9_block_error_c(coeff,
-            BLOCK_OFFSET(xd->plane[0].dqcoeff, ib + iblock[i], 16),
-            16 << do_two);
-        rate_t += cost_coeffs(cm, x, ib + iblock[i], PLANE_TYPE_Y_WITH_DC,
-                              &ta_temp[i & 1], &tl_temp[i >> 1],
-                              TX_4X4, 16);
-        if (do_two) {
-          i++;
-          rate_t += cost_coeffs(cm, x, ib + iblock[i], PLANE_TYPE_Y_WITH_DC,
-                                &ta_temp[i & 1], &tl_temp[i >> 1],
-                                TX_4X4, 16);
-        }
-      }
-      rate += rate_t;
-    }
-
-    distortion >>= 2;
-    this_rd = RDCOST(x->rdmult, x->rddiv, rate, distortion);
-    if (this_rd < best_rd) {
-      *bestrate = rate;
-      *bestratey = rate_t;
-      *bestdistortion = distortion;
-      vpx_memcpy(a, ta_temp, sizeof(ta_temp));
-      vpx_memcpy(l, tl_temp, sizeof(tl_temp));
-      best_rd = this_rd;
-      *best_mode = mode;
-    }
-  }
-  xd->mode_info_context->bmi[ib].as_mode.first = (*best_mode);
-  vp9_encode_intra8x8(x, ib);
-
-  return best_rd;
-}
-
-static int64_t rd_pick_intra8x8mby_modes(VP9_COMP *cpi, MACROBLOCK *mb,
-                                         int *Rate, int *rate_y,
-                                         int *Distortion, int64_t best_rd) {
-  MACROBLOCKD *const xd = &mb->e_mbd;
-  int i, ib;
-  int cost = mb->mbmode_cost [xd->frame_type] [I8X8_PRED];
-  int distortion = 0;
-  int tot_rate_y = 0;
-  int64_t total_rd = 0;
-  ENTROPY_CONTEXT t_above[4], t_left[4];
-  int *i8x8mode_costs;
-
-  vpx_memcpy(t_above, xd->plane[0].above_context, sizeof(t_above));
-  vpx_memcpy(t_left, xd->plane[0].left_context, sizeof(t_left));
-
-  xd->mode_info_context->mbmi.mode = I8X8_PRED;
-  i8x8mode_costs  = mb->i8x8_mode_costs;
-
-  for (i = 0; i < 4; i++) {
-    const int x_idx = i & 1, y_idx = i >> 1;
-    MODE_INFO *const mic = xd->mode_info_context;
-    B_PREDICTION_MODE UNINITIALIZED_IS_SAFE(best_mode);
-    int UNINITIALIZED_IS_SAFE(r), UNINITIALIZED_IS_SAFE(ry), UNINITIALIZED_IS_SAFE(d);
-
-    ib = vp9_i8x8_block[i];
-    total_rd += rd_pick_intra8x8block(cpi, mb, ib, &best_mode, i8x8mode_costs,
-                                      t_above + x_idx * 2, t_left + y_idx * 2,
-                                      &r, &ry, &d);
-    cost += r;
-    distortion += d;
-    tot_rate_y += ry;
-    mic->bmi[ib].as_mode.first = best_mode;
-  }
-
-  *Rate = cost;
-  *rate_y = tot_rate_y;
-  *Distortion = distortion;
-  return RDCOST(mb->rdmult, mb->rddiv, cost, distortion);
-}
-
-static int64_t rd_pick_intra8x8mby_modes_and_txsz(VP9_COMP *cpi, MACROBLOCK *x,
-                                                  int *rate, int *rate_y,
-                                                  int *distortion,
-                                                  int *mode8x8,
-                                                  int64_t best_yrd,
-                                                  int64_t *txfm_cache) {
-  VP9_COMMON *const cm = &cpi->common;
+static void super_block_uvrd_for_txfm(VP9_COMMON *const cm, MACROBLOCK *x,
+                                      int *rate, int *distortion,
+                                      int *skippable, BLOCK_SIZE_TYPE bsize,
+                                      TX_SIZE uv_tx_size) {
   MACROBLOCKD *const xd = &x->e_mbd;
-  MB_MODE_INFO *mbmi = &xd->mode_info_context->mbmi;
-  int cost0 = vp9_cost_bit(cm->prob_tx[0], 0);
-  int cost1 = vp9_cost_bit(cm->prob_tx[0], 1);
-  int64_t tmp_rd_4x4s, tmp_rd_8x8s;
-  int64_t tmp_rd_4x4, tmp_rd_8x8, tmp_rd;
-  int r4x4, tok4x4, d4x4, r8x8, tok8x8, d8x8;
+  vp9_xform_quant_sbuv(cm, x, bsize);
 
-  mbmi->txfm_size = TX_4X4;
-  tmp_rd_4x4 = rd_pick_intra8x8mby_modes(cpi, x, &r4x4, &tok4x4,
-                                         &d4x4, best_yrd);
-  mode8x8[0] = xd->mode_info_context->bmi[0].as_mode.first;
-  mode8x8[1] = xd->mode_info_context->bmi[2].as_mode.first;
-  mode8x8[2] = xd->mode_info_context->bmi[8].as_mode.first;
-  mode8x8[3] = xd->mode_info_context->bmi[10].as_mode.first;
-  mbmi->txfm_size = TX_8X8;
-  tmp_rd_8x8 = rd_pick_intra8x8mby_modes(cpi, x, &r8x8, &tok8x8,
-                                         &d8x8, best_yrd);
-  txfm_cache[ONLY_4X4]  = tmp_rd_4x4;
-  txfm_cache[ALLOW_8X8] = tmp_rd_8x8;
-  txfm_cache[ALLOW_16X16] = tmp_rd_8x8;
-  tmp_rd_4x4s = tmp_rd_4x4 + RDCOST(x->rdmult, x->rddiv, cost0, 0);
-  tmp_rd_8x8s = tmp_rd_8x8 + RDCOST(x->rdmult, x->rddiv, cost1, 0);
-  txfm_cache[TX_MODE_SELECT] = tmp_rd_4x4s < tmp_rd_8x8s ?
-                               tmp_rd_4x4s : tmp_rd_8x8s;
-  if (cm->txfm_mode == TX_MODE_SELECT) {
-    if (tmp_rd_4x4s < tmp_rd_8x8s) {
-      *rate = r4x4 + cost0;
-      *rate_y = tok4x4 + cost0;
-      *distortion = d4x4;
-      mbmi->txfm_size = TX_4X4;
-      tmp_rd = tmp_rd_4x4s;
-    } else {
-      *rate = r8x8 + cost1;
-      *rate_y = tok8x8 + cost1;
-      *distortion = d8x8;
-      mbmi->txfm_size = TX_8X8;
-      tmp_rd = tmp_rd_8x8s;
-
-      mode8x8[0] = xd->mode_info_context->bmi[0].as_mode.first;
-      mode8x8[1] = xd->mode_info_context->bmi[2].as_mode.first;
-      mode8x8[2] = xd->mode_info_context->bmi[8].as_mode.first;
-      mode8x8[3] = xd->mode_info_context->bmi[10].as_mode.first;
-    }
-  } else if (cm->txfm_mode == ONLY_4X4) {
-    *rate = r4x4;
-    *rate_y = tok4x4;
-    *distortion = d4x4;
-    mbmi->txfm_size = TX_4X4;
-    tmp_rd = tmp_rd_4x4;
-  } else {
-    *rate = r8x8;
-    *rate_y = tok8x8;
-    *distortion = d8x8;
-    mbmi->txfm_size = TX_8X8;
-    tmp_rd = tmp_rd_8x8;
-
-    mode8x8[0] = xd->mode_info_context->bmi[0].as_mode.first;
-    mode8x8[1] = xd->mode_info_context->bmi[2].as_mode.first;
-    mode8x8[2] = xd->mode_info_context->bmi[8].as_mode.first;
-    mode8x8[3] = xd->mode_info_context->bmi[10].as_mode.first;
-  }
-
-  return tmp_rd;
-}
-#endif  // !CONFIG_SB8X8
-
-static int rd_cost_sbuv_4x4(VP9_COMMON *const cm, MACROBLOCK *x,
-                            BLOCK_SIZE_TYPE bsize) {
-  const int bwl = b_width_log2(bsize) - 1, bw = 1 << bwl;
-  const int bh = 1 << (b_height_log2(bsize) - 1);
-  int yoff = 4 * bw * bh;
-  int p, b, cost = 0;
-  MACROBLOCKD *const xd = &x->e_mbd;
-
-  for (p = 1; p < MAX_MB_PLANE; p++) {
-    ENTROPY_CONTEXT t_above[8], t_left[8];
-
-    vpx_memcpy(t_above, xd->plane[p].above_context,
-               sizeof(ENTROPY_CONTEXT) * 2 * bw >> xd->plane[p].subsampling_x);
-    vpx_memcpy(t_left, xd->plane[p].left_context,
-               sizeof(ENTROPY_CONTEXT) * 2 * bh >> xd->plane[p].subsampling_y);
-    for (b = 0; b < bw * bh; b++) {
-      const int x_idx = b & (bw - 1), y_idx = b >> bwl;
-      cost += cost_coeffs(cm, x, yoff + b, PLANE_TYPE_UV,
-                          t_above + x_idx, t_left + y_idx,
-                          TX_4X4, bw * bh * 4);
-    }
-    yoff = (yoff * 5) >> 2;  // u -> v
-  }
-
-  return cost;
-}
-
-static void super_block_uvrd_4x4(VP9_COMMON *const cm, MACROBLOCK *x,
-                                 int *rate, int *distortion, int *skip,
-                                 BLOCK_SIZE_TYPE bsize) {
-  MACROBLOCKD *const xd = &x->e_mbd;
-
-  vp9_transform_sbuv_4x4(x, bsize);
-  vp9_quantize_sbuv_4x4(x, bsize);
-
-  *rate       = rd_cost_sbuv_4x4(cm, x, bsize);
-  *distortion = block_error_sbuv(x, bsize, 2);
-  *skip       = vp9_sbuv_is_skippable(xd, bsize);
-}
-
-static int rd_cost_sbuv_8x8(VP9_COMMON *const cm, MACROBLOCK *x,
-                            BLOCK_SIZE_TYPE bsize) {
-  const int bwl = b_width_log2(bsize) - 2, bw = 1 << bwl;
-  const int bh = 1 << (b_height_log2(bsize) - 2);
-  int yoff = 16 * bw * bh;
-  int p, b, cost = 0;
-  MACROBLOCKD *const xd = &x->e_mbd;
-
-  for (p = 1; p < MAX_MB_PLANE; p++) {
-    ENTROPY_CONTEXT t_above[8], t_left[8];
-
-    vpx_memcpy(t_above, xd->plane[p].above_context,
-               sizeof(ENTROPY_CONTEXT) * 4 * bw >> xd->plane[p].subsampling_x);
-    vpx_memcpy(t_left, xd->plane[p].left_context,
-               sizeof(ENTROPY_CONTEXT) * 4 * bh >> xd->plane[p].subsampling_y);
-    for (b = 0; b < bw * bh; b++) {
-      const int x_idx = b & (bw - 1), y_idx = b >> bwl;
-      cost += cost_coeffs(cm, x, yoff + b * 4, PLANE_TYPE_UV,
-                          t_above + x_idx * 2, t_left + y_idx * 2,
-                          TX_8X8, bw * bh * 16);
-    }
-    yoff = (yoff * 5) >> 2;  // u -> v
-  }
-
-  return cost;
-}
-
-static void super_block_uvrd_8x8(VP9_COMMON *const cm, MACROBLOCK *x,
-                                 int *rate, int *distortion, int *skip,
-                                 BLOCK_SIZE_TYPE bsize) {
-  MACROBLOCKD *const xd = &x->e_mbd;
-
-  vp9_transform_sbuv_8x8(x, bsize);
-  vp9_quantize_sbuv_8x8(x, bsize);
-
-  *rate       = rd_cost_sbuv_8x8(cm, x, bsize);
-  *distortion = block_error_sbuv(x, bsize, 2);
-  *skip       = vp9_sbuv_is_skippable(xd, bsize);
-}
-
-static int rd_cost_sbuv_16x16(VP9_COMMON *const cm, MACROBLOCK *x,
-                              BLOCK_SIZE_TYPE bsize) {
-  const int bwl = b_width_log2(bsize) - 3, bw = 1 << bwl;
-  const int bh = 1 << (b_height_log2(bsize) - 3);
-  int yoff = 64 * bw * bh;
-  int p, b, cost = 0;
-  MACROBLOCKD *const xd = &x->e_mbd;
-
-  for (p = 1; p < MAX_MB_PLANE; p++) {
-    ENTROPY_CONTEXT t_above[8], t_left[8];
-
-    vpx_memcpy(t_above, xd->plane[p].above_context,
-               sizeof(ENTROPY_CONTEXT) * 8 * bw >> xd->plane[p].subsampling_x);
-    vpx_memcpy(t_left, xd->plane[p].left_context,
-               sizeof(ENTROPY_CONTEXT) * 8 * bh >> xd->plane[p].subsampling_y);
-    for (b = 0; b < bw * bh; b++) {
-      const int x_idx = b & (bw - 1), y_idx = b >> bwl;
-      cost += cost_coeffs(cm, x, yoff + b * 16, PLANE_TYPE_UV,
-                          t_above + x_idx * 4, t_left + y_idx * 4,
-                          TX_16X16, bw * bh * 64);
-    }
-    yoff = (yoff * 5) >> 2;  // u -> v
-  }
-
-  return cost;
-}
-
-static void super_block_uvrd_16x16(VP9_COMMON *const cm, MACROBLOCK *x,
-                                   int *rate, int *distortion, int *skip,
-                                   BLOCK_SIZE_TYPE bsize) {
-  MACROBLOCKD *const xd = &x->e_mbd;
-
-  vp9_transform_sbuv_16x16(x, bsize);
-  vp9_quantize_sbuv_16x16(x, bsize);
-
-  *rate       = rd_cost_sbuv_16x16(cm, x, bsize);
-  *distortion = block_error_sbuv(x, bsize, 2);
-  *skip       = vp9_sbuv_is_skippable(xd, bsize);
-}
-
-static int rd_cost_sbuv_32x32(VP9_COMMON *const cm, MACROBLOCK *x,
-                              BLOCK_SIZE_TYPE bsize) {
-  const int bwl = b_width_log2(bsize) - 4, bw = 1 << bwl;
-  const int bh = 1 << (b_height_log2(bsize) - 4);
-  int yoff = 256 * bh * bw;
-  int p, b, cost = 0;
-  MACROBLOCKD *const xd = &x->e_mbd;
-
-  for (p = 1; p < MAX_MB_PLANE; p++) {
-    ENTROPY_CONTEXT t_above[8], t_left[8];
-
-    vpx_memcpy(t_above, xd->plane[p].above_context,
-               sizeof(ENTROPY_CONTEXT) * 16 * bw >> xd->plane[p].subsampling_x);
-    vpx_memcpy(t_left, xd->plane[p].left_context,
-               sizeof(ENTROPY_CONTEXT) * 16 * bh >> xd->plane[p].subsampling_y);
-    for (b = 0; b < bw * bh; b++) {
-      const int x_idx = b * (bw - 1), y_idx = b >> bwl;
-      cost += cost_coeffs(cm, x, yoff + b * 64, PLANE_TYPE_UV,
-                          t_above + x_idx * 8, t_left + y_idx * 8,
-                          TX_32X32, 256 * bh * bw);
-    }
-    yoff = (yoff * 5) >> 2;  // u -> v
-  }
-
-  return cost;
-}
-#undef UVCTX
-
-static void super_block_uvrd_32x32(VP9_COMMON *const cm, MACROBLOCK *x,
-                                   int *rate, int *distortion, int *skip,
-                                   BLOCK_SIZE_TYPE bsize) {
-  MACROBLOCKD *const xd = &x->e_mbd;
-
-  vp9_transform_sbuv_32x32(x, bsize);
-  vp9_quantize_sbuv_32x32(x, bsize);
-
-  *rate       = rd_cost_sbuv_32x32(cm, x, bsize);
-  *distortion = block_error_sbuv(x, bsize, 0);
-  *skip       = vp9_sbuv_is_skippable(xd, bsize);
+  *distortion = block_error_sbuv(x, bsize, uv_tx_size == TX_32X32 ? 0 : 2);
+  *rate       = rdcost_uv(cm, x, bsize, uv_tx_size);
+  *skippable  = vp9_sbuv_is_skippable(xd, bsize);
 }
 
 static void super_block_uvrd(VP9_COMMON *const cm, MACROBLOCK *x,
@@ -1497,13 +982,17 @@
   vp9_subtract_sbuv(x, bsize);
 
   if (mbmi->txfm_size >= TX_32X32 && bsize >= BLOCK_SIZE_SB64X64) {
-    super_block_uvrd_32x32(cm, x, rate, distortion, skippable, bsize);
+    super_block_uvrd_for_txfm(cm, x, rate, distortion, skippable, bsize,
+                              TX_32X32);
   } else if (mbmi->txfm_size >= TX_16X16 && bsize >= BLOCK_SIZE_SB32X32) {
-    super_block_uvrd_16x16(cm, x, rate, distortion, skippable, bsize);
+    super_block_uvrd_for_txfm(cm, x, rate, distortion, skippable, bsize,
+                              TX_16X16);
   } else if (mbmi->txfm_size >= TX_8X8 && bsize >= BLOCK_SIZE_MB16X16) {
-    super_block_uvrd_8x8(cm, x, rate, distortion, skippable, bsize);
+    super_block_uvrd_for_txfm(cm, x, rate, distortion, skippable, bsize,
+                              TX_8X8);
   } else {
-    super_block_uvrd_4x4(cm, x, rate, distortion, skippable, bsize);
+    super_block_uvrd_for_txfm(cm, x, rate, distortion, skippable, bsize,
+                              TX_4X4);
   }
 }
 
@@ -1566,7 +1055,6 @@
   x->e_mbd.mode_info_context->mbmi.mv[0].as_int = mv->as_int;
 }
 
-#if CONFIG_SB8X8
 static int labels2mode(MACROBLOCK *x,
                        int const *labelings, int which_label,
                        B_PREDICTION_MODE this_mode,
@@ -1740,7 +1228,7 @@
                                        BLOCK_OFFSET(xd->plane[0].dqcoeff,
                                                     i, 16), 16);
       *distortion += thisdistortion;
-      *labelyrate += cost_coeffs(cm, x, i, PLANE_TYPE_Y_WITH_DC,
+      *labelyrate += cost_coeffs(cm, x, 0, i, PLANE_TYPE_Y_WITH_DC,
                                  ta + (i & 1),
                                  tl + (i >> 1), TX_4X4, 16);
     }
@@ -1764,7 +1252,6 @@
   int mvthresh;
   int *mdcounts;
 } BEST_SEG_INFO;
-#endif  // CONFIG_SB8X8
 
 static INLINE int mv_check_bounds(MACROBLOCK *x, int_mv *mv) {
   int r = 0;
@@ -1775,7 +1262,6 @@
   return r;
 }
 
-#if CONFIG_SB8X8
 static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x,
                                     BEST_SEG_INFO *bsi,
                                     int_mv seg_mvs[4][MAX_REF_FRAMES - 1]) {
@@ -2072,905 +1558,6 @@
   return (int)(bsi.segment_rd);
 }
 
-#else  // !CONFIG_SB8X8
-
-static int labels2mode(
-  MACROBLOCK *x,
-  int const *labelings, int which_label,
-  B_PREDICTION_MODE this_mode,
-  int_mv *this_mv, int_mv *this_second_mv,
-  int_mv seg_mvs[MAX_REF_FRAMES - 1],
-  int_mv *best_ref_mv,
-  int_mv *second_best_ref_mv,
-  int *mvjcost, int *mvcost[2], VP9_COMP *cpi) {
-  MACROBLOCKD *const xd = &x->e_mbd;
-  MODE_INFO *const mic = xd->mode_info_context;
-  MB_MODE_INFO * mbmi = &mic->mbmi;
-  const int mis = xd->mode_info_stride;
-
-  int i, cost = 0, thismvcost = 0;
-
-  /* We have to be careful retrieving previously-encoded motion vectors.
-     Ones from this macroblock have to be pulled from the BLOCKD array
-     as they have not yet made it to the bmi array in our MB_MODE_INFO. */
-  for (i = 0; i < 16; ++i) {
-    const int row = i >> 2,  col = i & 3;
-
-    B_PREDICTION_MODE m;
-
-    if (labelings[i] != which_label)
-      continue;
-
-    if (col  &&  labelings[i] == labelings[i - 1])
-      m = LEFT4X4;
-    else if (row  &&  labelings[i] == labelings[i - 4])
-      m = ABOVE4X4;
-    else {
-      // the only time we should do costing for new motion vector or mode
-      // is when we are on a new label  (jbb May 08, 2007)
-      switch (m = this_mode) {
-        case NEW4X4 :
-          if (mbmi->second_ref_frame > 0) {
-            this_mv->as_int = seg_mvs[mbmi->ref_frame - 1].as_int;
-            this_second_mv->as_int =
-              seg_mvs[mbmi->second_ref_frame - 1].as_int;
-          }
-
-          thismvcost  = vp9_mv_bit_cost(this_mv, best_ref_mv, mvjcost, mvcost,
-                                        102, xd->allow_high_precision_mv);
-          if (mbmi->second_ref_frame > 0) {
-            thismvcost += vp9_mv_bit_cost(this_second_mv, second_best_ref_mv,
-                                          mvjcost, mvcost, 102,
-                                          xd->allow_high_precision_mv);
-          }
-          break;
-        case LEFT4X4:
-          this_mv->as_int = col ? mic->bmi[i - 1].as_mv[0].as_int :
-                                  left_block_mv(xd, mic, i);
-          if (mbmi->second_ref_frame > 0)
-            this_second_mv->as_int = col ? mic->bmi[i - 1].as_mv[1].as_int :
-                                           left_block_second_mv(xd, mic, i);
-          break;
-        case ABOVE4X4:
-          this_mv->as_int = row ? mic->bmi[i - 4].as_mv[0].as_int :
-                                  above_block_mv(mic, i, mis);
-          if (mbmi->second_ref_frame > 0)
-            this_second_mv->as_int = row ? mic->bmi[i - 4].as_mv[1].as_int :
-                                           above_block_second_mv(mic, i, mis);
-          break;
-        case ZERO4X4:
-          this_mv->as_int = 0;
-          if (mbmi->second_ref_frame > 0)
-            this_second_mv->as_int = 0;
-          break;
-        default:
-          break;
-      }
-
-      if (m == ABOVE4X4) { // replace above with left if same
-        int_mv left_mv, left_second_mv;
-
-        left_second_mv.as_int = 0;
-        left_mv.as_int = col ? mic->bmi[i - 1].as_mv[0].as_int :
-                         left_block_mv(xd, mic, i);
-        if (mbmi->second_ref_frame > 0)
-          left_second_mv.as_int = col ? mic->bmi[i - 1].as_mv[1].as_int :
-                                  left_block_second_mv(xd, mic, i);
-
-        if (left_mv.as_int == this_mv->as_int &&
-            (mbmi->second_ref_frame <= 0 ||
-             left_second_mv.as_int == this_second_mv->as_int))
-          m = LEFT4X4;
-      }
-
-#if CONFIG_NEWBINTRAMODES
-      cost = x->inter_bmode_costs[
-          m == B_CONTEXT_PRED ? m - CONTEXT_PRED_REPLACEMENTS : m];
-#else
-      cost = x->inter_bmode_costs[m];
-#endif
-    }
-
-    mic->bmi[i].as_mv[0].as_int = this_mv->as_int;
-    if (mbmi->second_ref_frame > 0)
-      mic->bmi[i].as_mv[1].as_int = this_second_mv->as_int;
-
-    x->partition_info->bmi[i].mode = m;
-    x->partition_info->bmi[i].mv.as_int = this_mv->as_int;
-    if (mbmi->second_ref_frame > 0)
-      x->partition_info->bmi[i].second_mv.as_int = this_second_mv->as_int;
-  }
-
-  cost += thismvcost;
-  return cost;
-}
-
-static int64_t encode_inter_mb_segment(VP9_COMMON *const cm,
-                                       MACROBLOCK *x,
-                                       int const *labels,
-                                       int which_label,
-                                       int *labelyrate,
-                                       int *distortion,
-                                       ENTROPY_CONTEXT *ta,
-                                       ENTROPY_CONTEXT *tl) {
-  int i;
-  MACROBLOCKD *xd = &x->e_mbd;
-
-  *labelyrate = 0;
-  *distortion = 0;
-  for (i = 0; i < 16; i++) {
-    if (labels[i] == which_label) {
-      const int src_stride = x->plane[0].src.stride;
-      uint8_t* const src =
-          raster_block_offset_uint8(xd, BLOCK_SIZE_MB16X16, 0, i,
-                                    x->plane[0].src.buf, src_stride);
-      int16_t* const src_diff =
-          raster_block_offset_int16(xd, BLOCK_SIZE_MB16X16, 0, i,
-                                    x->plane[0].src_diff);
-      int16_t* const coeff = BLOCK_OFFSET(x->plane[0].coeff, 16, i);
-      uint8_t* const pre =
-          raster_block_offset_uint8(xd, BLOCK_SIZE_MB16X16, 0, i,
-                                    xd->plane[0].pre[0].buf,
-                                    xd->plane[0].pre[0].stride);
-      uint8_t* const dst =
-          raster_block_offset_uint8(xd, BLOCK_SIZE_MB16X16, 0, i,
-                                    xd->plane[0].dst.buf,
-                                    xd->plane[0].dst.stride);
-      int thisdistortion;
-
-      vp9_build_inter_predictor(pre,
-                                xd->plane[0].pre[0].stride,
-                                dst,
-                                xd->plane[0].dst.stride,
-                                &xd->mode_info_context->bmi[i].as_mv[0],
-                                &xd->scale_factor[0],
-                                4, 4, 0 /* no avg */, &xd->subpix);
-
-      // TODO(debargha): Make this work properly with the
-      // implicit-compoundinter-weight experiment when implicit
-      // weighting for splitmv modes is turned on.
-      if (xd->mode_info_context->mbmi.second_ref_frame > 0) {
-        uint8_t* const second_pre =
-          raster_block_offset_uint8(xd, BLOCK_SIZE_MB16X16, 0, i,
-                                    xd->plane[0].pre[1].buf,
-                                    xd->plane[0].pre[1].stride);
-        vp9_build_inter_predictor(
-            second_pre, xd->plane[0].pre[1].stride,
-            dst, xd->plane[0].dst.stride,
-            &xd->mode_info_context->bmi[i].as_mv[1],
-            &xd->scale_factor[1], 4, 4, 1,
-            &xd->subpix);
-      }
-
-      vp9_subtract_block(4, 4, src_diff, 16,
-                         src, src_stride,
-                         dst, xd->plane[0].dst.stride);
-      x->fwd_txm4x4(src_diff, coeff, 32);
-      x->quantize_b_4x4(x, i, DCT_DCT, 16);
-      thisdistortion = vp9_block_error(coeff,
-          BLOCK_OFFSET(xd->plane[0].dqcoeff, i, 16), 16);
-      *distortion += thisdistortion;
-      *labelyrate += cost_coeffs(cm, x, i, PLANE_TYPE_Y_WITH_DC,
-                                 ta + (i & 3),
-                                 tl + (i >> 2), TX_4X4, 16);
-    }
-  }
-  *distortion >>= 2;
-  return RDCOST(x->rdmult, x->rddiv, *labelyrate, *distortion);
-}
-
-static int64_t encode_inter_mb_segment_8x8(VP9_COMMON *const cm,
-                                           MACROBLOCK *x,
-                                           int const *labels,
-                                           int which_label,
-                                           int *labelyrate,
-                                           int *distortion,
-                                           int64_t *otherrd,
-                                           ENTROPY_CONTEXT *ta,
-                                           ENTROPY_CONTEXT *tl) {
-  int i, j;
-  MACROBLOCKD *xd = &x->e_mbd;
-  const int iblock[4] = { 0, 1, 4, 5 };
-  int othercost = 0, otherdist = 0;
-  ENTROPY_CONTEXT tac[4], tlc[4];
-
-  if (otherrd) {
-    memcpy(&tac, ta, sizeof(tac));
-    memcpy(&tlc, tl, sizeof(tlc));
-  }
-
-  *distortion = 0;
-  *labelyrate = 0;
-  for (i = 0; i < 4; i++) {
-    int ib = vp9_i8x8_block[i];
-
-    if (labels[ib] == which_label) {
-      const int use_second_ref =
-          xd->mode_info_context->mbmi.second_ref_frame > 0;
-      int which_mv;
-      const int idx = (ib & 8) + ((ib & 2) << 1);
-      const int src_stride = x->plane[0].src.stride;
-      uint8_t* const src =
-          raster_block_offset_uint8(xd, BLOCK_SIZE_MB16X16, 0, ib,
-                                    x->plane[0].src.buf, src_stride);
-      int16_t* const src_diff =
-          raster_block_offset_int16(xd, BLOCK_SIZE_MB16X16, 0, ib,
-                                    x->plane[0].src_diff);
-      int16_t* const coeff = BLOCK_OFFSET(x->plane[0].coeff, idx, 16);
-      int thisdistortion;
-      uint8_t* const dst =
-          raster_block_offset_uint8(xd, BLOCK_SIZE_MB16X16, 0, ib,
-                                    xd->plane[0].dst.buf,
-                                    xd->plane[0].dst.stride);
-
-      assert(idx < 16);
-      for (which_mv = 0; which_mv < 1 + use_second_ref; ++which_mv) {
-        uint8_t* const pre =
-            raster_block_offset_uint8(xd, BLOCK_SIZE_MB16X16, 0, ib,
-                                      xd->plane[0].pre[which_mv].buf,
-                                      xd->plane[0].pre[which_mv].stride);
-
-        // TODO(debargha): Make this work properly with the
-        // implicit-compoundinter-weight experiment when implicit
-        // weighting for splitmv modes is turned on.
-        vp9_build_inter_predictor(
-            pre, xd->plane[0].pre[which_mv].stride,
-            dst, xd->plane[0].dst.stride,
-            &xd->mode_info_context->bmi[ib].as_mv[which_mv],
-            &xd->scale_factor[which_mv], 8, 8,
-            which_mv, &xd->subpix);
-      }
-
-      vp9_subtract_block(8, 8, src_diff, 16,
-                         src, src_stride,
-                         dst, xd->plane[0].dst.stride);
-
-      if (xd->mode_info_context->mbmi.txfm_size == TX_4X4) {
-        if (otherrd) {
-          x->fwd_txm8x8(src_diff, coeff, 32);
-          x->quantize_b_8x8(x, idx, DCT_DCT, 16);
-          thisdistortion = vp9_block_error_c(coeff,
-              BLOCK_OFFSET(xd->plane[0].dqcoeff, idx, 16), 64);
-          otherdist += thisdistortion;
-          xd->mode_info_context->mbmi.txfm_size = TX_8X8;
-          othercost += cost_coeffs(cm, x, idx, PLANE_TYPE_Y_WITH_DC,
-                                   tac + (i & 1) * 2,
-                                   tlc + (i & 2),
-                                   TX_8X8, 16);
-          xd->mode_info_context->mbmi.txfm_size = TX_4X4;
-        }
-        for (j = 0; j < 4; j += 2) {
-          int16_t* const src_diff =
-              raster_block_offset_int16(xd, BLOCK_SIZE_MB16X16,
-                                        0, ib + iblock[j],
-                                        x->plane[0].src_diff);
-          int16_t* const coeff = BLOCK_OFFSET(x->plane[0].coeff,
-                                              ib + iblock[j], 16);
-          x->fwd_txm8x4(src_diff, coeff, 32);
-          x->quantize_b_4x4_pair(x, ib + iblock[j], ib + iblock[j] + 1, 16);
-          thisdistortion = vp9_block_error_c(coeff,
-              BLOCK_OFFSET(xd->plane[0].dqcoeff, ib + iblock[j], 16), 32);
-          *distortion += thisdistortion;
-          *labelyrate +=
-              cost_coeffs(cm, x, ib + iblock[j], PLANE_TYPE_Y_WITH_DC,
-                          ta + (i & 1) * 2,
-                          tl + (i & 2) + ((j & 2) >> 1),
-                          TX_4X4, 16);
-          *labelyrate +=
-              cost_coeffs(cm, x, ib + iblock[j] + 1,
-                          PLANE_TYPE_Y_WITH_DC,
-                          ta + (i & 1) * 2 + 1,
-                          tl + (i & 2) + ((j & 2) >> 1),
-                          TX_4X4, 16);
-        }
-      } else /* 8x8 */ {
-        if (otherrd) {
-          for (j = 0; j < 4; j += 2) {
-            int16_t* const src_diff =
-                raster_block_offset_int16(xd, BLOCK_SIZE_MB16X16,
-                                          0, ib + iblock[j],
-                                          x->plane[0].src_diff);
-            int16_t* const coeff = BLOCK_OFFSET(x->plane[0].coeff,
-                                                ib + iblock[j], 16);
-            x->fwd_txm8x4(src_diff, coeff, 32);
-            x->quantize_b_4x4_pair(x, ib + iblock[j], ib + iblock[j] + 1, 16);
-            thisdistortion = vp9_block_error_c(coeff,
-                BLOCK_OFFSET(xd->plane[0].dqcoeff, ib + iblock[j], 16), 32);
-            otherdist += thisdistortion;
-            xd->mode_info_context->mbmi.txfm_size = TX_4X4;
-            othercost +=
-                cost_coeffs(cm, x, ib + iblock[j], PLANE_TYPE_Y_WITH_DC,
-                            tac + (i & 1) * 2,
-                            tlc + (i & 2) + ((j & 2) >> 1),
-                            TX_4X4, 16);
-            othercost +=
-                cost_coeffs(cm, x, ib + iblock[j] + 1,
-                            PLANE_TYPE_Y_WITH_DC,
-                            tac + (i & 1) * 2 + 1,
-                            tlc + (i & 2) + ((j & 2) >> 1),
-                            TX_4X4, 16);
-            xd->mode_info_context->mbmi.txfm_size = TX_8X8;
-          }
-        }
-        x->fwd_txm8x8(src_diff, coeff, 32);
-        x->quantize_b_8x8(x, idx, DCT_DCT, 16);
-        thisdistortion = vp9_block_error_c(coeff,
-            BLOCK_OFFSET(xd->plane[0].dqcoeff, idx, 16), 64);
-        *distortion += thisdistortion;
-        *labelyrate += cost_coeffs(cm, x, idx, PLANE_TYPE_Y_WITH_DC,
-                                   ta + (i & 1) * 2,
-                                   tl + (i & 2),
-                                   TX_8X8, 16);
-      }
-    }
-  }
-  *distortion >>= 2;
-  if (otherrd) {
-    otherdist >>= 2;
-    *otherrd = RDCOST(x->rdmult, x->rddiv, othercost, otherdist);
-  }
-  return RDCOST(x->rdmult, x->rddiv, *labelyrate, *distortion);
-}
-
-static const unsigned int segmentation_to_sseshift[4] = {3, 3, 2, 0};
-
-
-typedef struct {
-  int_mv *ref_mv, *second_ref_mv;
-  int_mv mvp;
-
-  int64_t segment_rd;
-  SPLITMV_PARTITIONING_TYPE segment_num;
-  TX_SIZE txfm_size;
-  int r;
-  int d;
-  int segment_yrate;
-  B_PREDICTION_MODE modes[16];
-  int_mv mvs[16], second_mvs[16];
-  int eobs[16];
-
-  int mvthresh;
-  int *mdcounts;
-
-  int_mv sv_mvp[4];     // save 4 mvp from 8x8
-  int sv_istep[2];  // save 2 initial step_param for 16x8/8x16
-
-} BEST_SEG_INFO;
-
-static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x,
-                                    BEST_SEG_INFO *bsi,
-                                    SPLITMV_PARTITIONING_TYPE segmentation,
-                                    TX_SIZE tx_size, int64_t *otherrds,
-                                    int64_t *rds, int *completed,
-                                    /* 16 = n_blocks */
-                                    int_mv seg_mvs[16 /* n_blocks */]
-                                                  [MAX_REF_FRAMES - 1]) {
-  int i, j;
-  int const *labels;
-  int br = 0, bd = 0;
-  B_PREDICTION_MODE this_mode;
-  MB_MODE_INFO * mbmi = &x->e_mbd.mode_info_context->mbmi;
-
-  int label_count;
-  int64_t this_segment_rd = 0, other_segment_rd;
-  int label_mv_thresh;
-  int rate = 0;
-  int sbr = 0, sbd = 0;
-  int segmentyrate = 0;
-  int best_eobs[16] = { 0 };
-
-  vp9_variance_fn_ptr_t *v_fn_ptr;
-
-  ENTROPY_CONTEXT t_above[4], t_left[4];
-  ENTROPY_CONTEXT t_above_b[4], t_left_b[4];
-
-  vpx_memcpy(t_above, x->e_mbd.plane[0].above_context, sizeof(t_above));
-  vpx_memcpy(t_left, x->e_mbd.plane[0].left_context, sizeof(t_left));
-
-  v_fn_ptr = &cpi->fn_ptr[segmentation];
-  labels = vp9_mbsplits[segmentation];
-  label_count = vp9_mbsplit_count[segmentation];
-
-  // 64 makes this threshold really big effectively
-  // making it so that we very rarely check mvs on
-  // segments.   setting this to 1 would make mv thresh
-  // roughly equal to what it is for macroblocks
-  label_mv_thresh = 1 * bsi->mvthresh / label_count;
-
-  // Segmentation method overheads
-  rate = cost_token(vp9_mbsplit_tree, vp9_mbsplit_probs,
-                    vp9_mbsplit_encodings + segmentation);
-  rate += vp9_cost_mv_ref(cpi, SPLITMV,
-                          mbmi->mb_mode_context[mbmi->ref_frame]);
-  this_segment_rd += RDCOST(x->rdmult, x->rddiv, rate, 0);
-  br += rate;
-  other_segment_rd = this_segment_rd;
-
-  mbmi->txfm_size = tx_size;
-  for (i = 0; i < label_count && this_segment_rd < bsi->segment_rd; i++) {
-    int_mv mode_mv[B_MODE_COUNT], second_mode_mv[B_MODE_COUNT];
-    int64_t best_label_rd = INT64_MAX, best_other_rd = INT64_MAX;
-    B_PREDICTION_MODE mode_selected = ZERO4X4;
-    int bestlabelyrate = 0;
-
-    // search for the best motion vector on this segment
-    for (this_mode = LEFT4X4; this_mode <= NEW4X4; this_mode ++) {
-      int64_t this_rd, other_rd;
-      int distortion;
-      int labelyrate;
-      ENTROPY_CONTEXT t_above_s[4], t_left_s[4];
-
-      vpx_memcpy(t_above_s, t_above, sizeof(t_above_s));
-      vpx_memcpy(t_left_s, t_left, sizeof(t_left_s));
-
-      // motion search for newmv (single predictor case only)
-      if (mbmi->second_ref_frame <= 0 && this_mode == NEW4X4) {
-        int sseshift, n;
-        int step_param = 0;
-        int further_steps;
-        int thissme, bestsme = INT_MAX;
-        const struct buf_2d orig_src = x->plane[0].src;
-        const struct buf_2d orig_pre = x->e_mbd.plane[0].pre[0];
-
-        /* Is the best so far sufficiently good that we cant justify doing
-         * and new motion search. */
-        if (best_label_rd < label_mv_thresh)
-          break;
-
-        if (cpi->compressor_speed) {
-          if (segmentation == PARTITIONING_8X16 ||
-              segmentation == PARTITIONING_16X8) {
-            bsi->mvp.as_int = bsi->sv_mvp[i].as_int;
-            if (i == 1 && segmentation == PARTITIONING_16X8)
-              bsi->mvp.as_int = bsi->sv_mvp[2].as_int;
-
-            step_param = bsi->sv_istep[i];
-          }
-
-          // use previous block's result as next block's MV predictor.
-          if (segmentation == PARTITIONING_4X4 && i > 0) {
-            bsi->mvp.as_int =
-              x->e_mbd.mode_info_context->bmi[i - 1].as_mv[0].as_int;
-            if (i == 4 || i == 8 || i == 12)
-              bsi->mvp.as_int =
-                x->e_mbd.mode_info_context->bmi[i - 4].as_mv[0].as_int;
-            step_param = 2;
-          }
-        }
-
-        further_steps = (MAX_MVSEARCH_STEPS - 1) - step_param;
-
-        {
-          int sadpb = x->sadperbit4;
-          int_mv mvp_full;
-
-          mvp_full.as_mv.row = bsi->mvp.as_mv.row >> 3;
-          mvp_full.as_mv.col = bsi->mvp.as_mv.col >> 3;
-
-          // find first label
-          n = vp9_mbsplit_offset[segmentation][i];
-
-          // adjust src pointer for this segment
-          x->plane[0].src.buf =
-              raster_block_offset_uint8(&x->e_mbd, BLOCK_SIZE_MB16X16, 0, n,
-                                        x->plane[0].src.buf,
-                                        x->plane[0].src.stride);
-          assert(((intptr_t)x->e_mbd.plane[0].pre[0].buf & 0xf) == 0);
-          x->e_mbd.plane[0].pre[0].buf =
-              raster_block_offset_uint8(&x->e_mbd, BLOCK_SIZE_MB16X16, 0, n,
-                                        x->e_mbd.plane[0].pre[0].buf,
-                                        x->e_mbd.plane[0].pre[0].stride);
-
-          bestsme = vp9_full_pixel_diamond(cpi, x, &mvp_full, step_param,
-                                           sadpb, further_steps, 0, v_fn_ptr,
-                                           bsi->ref_mv, &mode_mv[NEW4X4]);
-
-          sseshift = segmentation_to_sseshift[segmentation];
-
-          // Should we do a full search (best quality only)
-          if ((cpi->compressor_speed == 0) && (bestsme >> sseshift) > 4000) {
-            /* Check if mvp_full is within the range. */
-            clamp_mv(&mvp_full, x->mv_col_min, x->mv_col_max,
-                     x->mv_row_min, x->mv_row_max);
-
-            thissme = cpi->full_search_sad(x, &mvp_full,
-                                           sadpb, 16, v_fn_ptr,
-                                           x->nmvjointcost, x->mvcost,
-                                           bsi->ref_mv,
-                                           n);
-
-            if (thissme < bestsme) {
-              bestsme = thissme;
-              mode_mv[NEW4X4].as_int =
-                x->e_mbd.mode_info_context->bmi[n].as_mv[0].as_int;
-            } else {
-              /* The full search result is actually worse so re-instate the
-               * previous best vector */
-              x->e_mbd.mode_info_context->bmi[n].as_mv[0].as_int =
-                mode_mv[NEW4X4].as_int;
-            }
-          }
-        }
-
-        if (bestsme < INT_MAX) {
-          int distortion;
-          unsigned int sse;
-          cpi->find_fractional_mv_step(x, &mode_mv[NEW4X4],
-                                       bsi->ref_mv, x->errorperbit, v_fn_ptr,
-                                       x->nmvjointcost, x->mvcost,
-                                       &distortion, &sse);
-
-          // safe motion search result for use in compound prediction
-          seg_mvs[i][mbmi->ref_frame - 1].as_int = mode_mv[NEW4X4].as_int;
-        }
-
-        // restore src pointers
-        x->plane[0].src = orig_src;
-        x->e_mbd.plane[0].pre[0] = orig_pre;
-      } else if (mbmi->second_ref_frame > 0 && this_mode == NEW4X4) {
-        /* NEW4X4 */
-        /* motion search not completed? Then skip newmv for this block with
-         * comppred */
-        if (seg_mvs[i][mbmi->second_ref_frame - 1].as_int == INVALID_MV ||
-            seg_mvs[i][mbmi->ref_frame        - 1].as_int == INVALID_MV) {
-          continue;
-        }
-      }
-
-      rate = labels2mode(x, labels, i, this_mode, &mode_mv[this_mode],
-                         &second_mode_mv[this_mode], seg_mvs[i],
-                         bsi->ref_mv, bsi->second_ref_mv, x->nmvjointcost,
-                         x->mvcost, cpi);
-
-      // Trap vectors that reach beyond the UMV borders
-      if (((mode_mv[this_mode].as_mv.row >> 3) < x->mv_row_min) ||
-          ((mode_mv[this_mode].as_mv.row >> 3) > x->mv_row_max) ||
-          ((mode_mv[this_mode].as_mv.col >> 3) < x->mv_col_min) ||
-          ((mode_mv[this_mode].as_mv.col >> 3) > x->mv_col_max)) {
-        continue;
-      }
-      if (mbmi->second_ref_frame > 0 &&
-          mv_check_bounds(x, &second_mode_mv[this_mode]))
-        continue;
-
-      if (segmentation == PARTITIONING_4X4) {
-        this_rd = encode_inter_mb_segment(&cpi->common,
-                                          x, labels, i, &labelyrate,
-                                          &distortion, t_above_s, t_left_s);
-        other_rd = this_rd;
-      } else {
-        this_rd = encode_inter_mb_segment_8x8(&cpi->common,
-                                              x, labels, i, &labelyrate,
-                                              &distortion, &other_rd,
-                                              t_above_s, t_left_s);
-      }
-      this_rd += RDCOST(x->rdmult, x->rddiv, rate, 0);
-      rate += labelyrate;
-
-      if (this_rd < best_label_rd) {
-        sbr = rate;
-        sbd = distortion;
-        bestlabelyrate = labelyrate;
-        mode_selected = this_mode;
-        best_label_rd = this_rd;
-        if (x->e_mbd.mode_info_context->mbmi.txfm_size == TX_4X4) {
-          for (j = 0; j < 16; j++)
-            if (labels[j] == i)
-              best_eobs[j] = x->e_mbd.plane[0].eobs[j];
-        } else {
-          for (j = 0; j < 4; j++) {
-            int ib = vp9_i8x8_block[j], idx = j * 4;
-
-            if (labels[ib] == i)
-              best_eobs[idx] = x->e_mbd.plane[0].eobs[idx];
-          }
-        }
-        if (other_rd < best_other_rd)
-          best_other_rd = other_rd;
-
-        vpx_memcpy(t_above_b, t_above_s, sizeof(t_above_s));
-        vpx_memcpy(t_left_b, t_left_s, sizeof(t_left_s));
-
-      }
-    } /*for each 4x4 mode*/
-
-    vpx_memcpy(t_above, t_above_b, sizeof(t_above));
-    vpx_memcpy(t_left, t_left_b, sizeof(t_left));
-
-    labels2mode(x, labels, i, mode_selected, &mode_mv[mode_selected],
-                &second_mode_mv[mode_selected], seg_mvs[i],
-                bsi->ref_mv, bsi->second_ref_mv, x->nmvjointcost,
-                x->mvcost, cpi);
-
-    br += sbr;
-    bd += sbd;
-    segmentyrate += bestlabelyrate;
-    this_segment_rd += best_label_rd;
-    other_segment_rd += best_other_rd;
-    if (rds)
-      rds[i] = this_segment_rd;
-    if (otherrds)
-      otherrds[i] = other_segment_rd;
-  } /* for each label */
-
-  if (this_segment_rd < bsi->segment_rd) {
-    bsi->r = br;
-    bsi->d = bd;
-    bsi->segment_yrate = segmentyrate;
-    bsi->segment_rd = this_segment_rd;
-    bsi->segment_num = segmentation;
-    bsi->txfm_size = mbmi->txfm_size;
-
-    // store everything needed to come back to this!!
-    for (i = 0; i < 16; i++) {
-      bsi->mvs[i].as_mv = x->partition_info->bmi[i].mv.as_mv;
-      if (mbmi->second_ref_frame > 0)
-        bsi->second_mvs[i].as_mv = x->partition_info->bmi[i].second_mv.as_mv;
-      bsi->modes[i] = x->partition_info->bmi[i].mode;
-      bsi->eobs[i] = best_eobs[i];
-    }
-  }
-
-  if (completed) {
-    *completed = i;
-  }
-}
-
-static void rd_check_segment(VP9_COMP *cpi, MACROBLOCK *x,
-                             BEST_SEG_INFO *bsi,
-                             unsigned int segmentation,
-                             /* 16 = n_blocks */
-                             int_mv seg_mvs[16][MAX_REF_FRAMES - 1],
-                             int64_t txfm_cache[NB_TXFM_MODES]) {
-  int i, n, c = vp9_mbsplit_count[segmentation];
-
-  if (segmentation == PARTITIONING_4X4) {
-    int64_t rd[16];
-
-    rd_check_segment_txsize(cpi, x, bsi, segmentation, TX_4X4, NULL,
-                            rd, &n, seg_mvs);
-    if (n == c) {
-      for (i = 0; i < NB_TXFM_MODES; i++) {
-        if (rd[c - 1] < txfm_cache[i])
-          txfm_cache[i] = rd[c - 1];
-      }
-    }
-  } else {
-    int64_t diff, base_rd;
-    int cost4x4 = vp9_cost_bit(cpi->common.prob_tx[0], 0);
-    int cost8x8 = vp9_cost_bit(cpi->common.prob_tx[0], 1);
-
-    if (cpi->common.txfm_mode == TX_MODE_SELECT) {
-      int64_t rd4x4[4], rd8x8[4];
-      int n4x4, n8x8, nmin;
-      BEST_SEG_INFO bsi4x4, bsi8x8;
-
-      /* factor in cost of cost4x4/8x8 in decision */
-      vpx_memcpy(&bsi4x4, bsi, sizeof(*bsi));
-      vpx_memcpy(&bsi8x8, bsi, sizeof(*bsi));
-      rd_check_segment_txsize(cpi, x, &bsi4x4, segmentation,
-                              TX_4X4, NULL, rd4x4, &n4x4, seg_mvs);
-      rd_check_segment_txsize(cpi, x, &bsi8x8, segmentation,
-                              TX_8X8, NULL, rd8x8, &n8x8, seg_mvs);
-      if (bsi4x4.segment_num == segmentation) {
-        bsi4x4.segment_rd += RDCOST(x->rdmult, x->rddiv, cost4x4, 0);
-        if (bsi4x4.segment_rd < bsi->segment_rd)
-          vpx_memcpy(bsi, &bsi4x4, sizeof(*bsi));
-      }
-      if (bsi8x8.segment_num == segmentation) {
-        bsi8x8.segment_rd += RDCOST(x->rdmult, x->rddiv, cost8x8, 0);
-        if (bsi8x8.segment_rd < bsi->segment_rd)
-          vpx_memcpy(bsi, &bsi8x8, sizeof(*bsi));
-      }
-      n = n4x4 > n8x8 ? n4x4 : n8x8;
-      if (n == c) {
-        nmin = n4x4 < n8x8 ? n4x4 : n8x8;
-        diff = rd8x8[nmin - 1] - rd4x4[nmin - 1];
-        if (n == n4x4) {
-          base_rd = rd4x4[c - 1];
-        } else {
-          base_rd = rd8x8[c - 1] - diff;
-        }
-      }
-    } else {
-      int64_t rd[4], otherrd[4];
-
-      if (cpi->common.txfm_mode == ONLY_4X4) {
-        rd_check_segment_txsize(cpi, x, bsi, segmentation, TX_4X4, otherrd,
-                                rd, &n, seg_mvs);
-        if (n == c) {
-          base_rd = rd[c - 1];
-          diff = otherrd[c - 1] - rd[c - 1];
-        }
-      } else /* use 8x8 transform */ {
-        rd_check_segment_txsize(cpi, x, bsi, segmentation, TX_8X8, otherrd,
-                                rd, &n, seg_mvs);
-        if (n == c) {
-          diff = rd[c - 1] - otherrd[c - 1];
-          base_rd = otherrd[c - 1];
-        }
-      }
-    }
-
-    if (n == c) {
-      if (base_rd < txfm_cache[ONLY_4X4]) {
-        txfm_cache[ONLY_4X4] = base_rd;
-      }
-      if (base_rd + diff < txfm_cache[ALLOW_8X8]) {
-        txfm_cache[ALLOW_8X8] = txfm_cache[ALLOW_16X16] =
-            txfm_cache[ALLOW_32X32] = base_rd + diff;
-      }
-      if (diff < 0) {
-        base_rd += diff + RDCOST(x->rdmult, x->rddiv, cost8x8, 0);
-      } else {
-        base_rd += RDCOST(x->rdmult, x->rddiv, cost4x4, 0);
-      }
-      if (base_rd < txfm_cache[TX_MODE_SELECT]) {
-        txfm_cache[TX_MODE_SELECT] = base_rd;
-      }
-    }
-  }
-}
-
-static INLINE void cal_step_param(int sr, int *sp) {
-  int step = 0;
-
-  if (sr > MAX_FIRST_STEP) sr = MAX_FIRST_STEP;
-  else if (sr < 1) sr = 1;
-
-  while (sr >>= 1)
-    step++;
-
-  *sp = MAX_MVSEARCH_STEPS - 1 - step;
-}
-
-static int rd_pick_best_mbsegmentation(VP9_COMP *cpi, MACROBLOCK *x,
-                                       int_mv *best_ref_mv,
-                                       int_mv *second_best_ref_mv,
-                                       int64_t best_rd,
-                                       int *mdcounts,
-                                       int *returntotrate,
-                                       int *returnyrate,
-                                       int *returndistortion,
-                                       int *skippable, int mvthresh,
-                                       int_mv seg_mvs[NB_PARTITIONINGS]
-                                                     [16 /* n_blocks */]
-                                                     [MAX_REF_FRAMES - 1],
-                                       int64_t txfm_cache[NB_TXFM_MODES]) {
-  int i;
-  BEST_SEG_INFO bsi;
-  MB_MODE_INFO * mbmi = &x->e_mbd.mode_info_context->mbmi;
-
-  vpx_memset(&bsi, 0, sizeof(bsi));
-  for (i = 0; i < NB_TXFM_MODES; i++)
-    txfm_cache[i] = INT64_MAX;
-
-  bsi.segment_rd = best_rd;
-  bsi.ref_mv = best_ref_mv;
-  bsi.second_ref_mv = second_best_ref_mv;
-  bsi.mvp.as_int = best_ref_mv->as_int;
-  bsi.mvthresh = mvthresh;
-  bsi.mdcounts = mdcounts;
-  bsi.txfm_size = TX_4X4;
-
-  for (i = 0; i < 16; i++)
-    bsi.modes[i] = ZERO4X4;
-
-  if (cpi->compressor_speed == 0) {
-    /* for now, we will keep the original segmentation order
-       when in best quality mode */
-    rd_check_segment(cpi, x, &bsi, PARTITIONING_16X8,
-                     seg_mvs[PARTITIONING_16X8], txfm_cache);
-    rd_check_segment(cpi, x, &bsi, PARTITIONING_8X16,
-                     seg_mvs[PARTITIONING_8X16], txfm_cache);
-    rd_check_segment(cpi, x, &bsi, PARTITIONING_8X8,
-                     seg_mvs[PARTITIONING_8X8], txfm_cache);
-    rd_check_segment(cpi, x, &bsi, PARTITIONING_4X4,
-                     seg_mvs[PARTITIONING_4X4], txfm_cache);
-  } else {
-    int sr;
-
-    rd_check_segment(cpi, x, &bsi, PARTITIONING_8X8,
-                     seg_mvs[PARTITIONING_8X8], txfm_cache);
-
-    if (bsi.segment_rd < best_rd) {
-      int tmp_col_min = x->mv_col_min;
-      int tmp_col_max = x->mv_col_max;
-      int tmp_row_min = x->mv_row_min;
-      int tmp_row_max = x->mv_row_max;
-
-      vp9_clamp_mv_min_max(x, best_ref_mv);
-
-      /* Get 8x8 result */
-      bsi.sv_mvp[0].as_int = bsi.mvs[0].as_int;
-      bsi.sv_mvp[1].as_int = bsi.mvs[2].as_int;
-      bsi.sv_mvp[2].as_int = bsi.mvs[8].as_int;
-      bsi.sv_mvp[3].as_int = bsi.mvs[10].as_int;
-
-      /* Use 8x8 result as 16x8/8x16's predictor MV. Adjust search range
-       * according to the closeness of the two MVs. */
-      /* block 8X16 */
-      sr = MAXF((abs(bsi.sv_mvp[0].as_mv.row - bsi.sv_mvp[2].as_mv.row)) >> 3,
-                (abs(bsi.sv_mvp[0].as_mv.col - bsi.sv_mvp[2].as_mv.col)) >> 3);
-      cal_step_param(sr, &bsi.sv_istep[0]);
-
-      sr = MAXF((abs(bsi.sv_mvp[1].as_mv.row - bsi.sv_mvp[3].as_mv.row)) >> 3,
-                (abs(bsi.sv_mvp[1].as_mv.col - bsi.sv_mvp[3].as_mv.col)) >> 3);
-      cal_step_param(sr, &bsi.sv_istep[1]);
-
-      rd_check_segment(cpi, x, &bsi, PARTITIONING_8X16,
-                       seg_mvs[PARTITIONING_8X16], txfm_cache);
-
-      /* block 16X8 */
-      sr = MAXF((abs(bsi.sv_mvp[0].as_mv.row - bsi.sv_mvp[1].as_mv.row)) >> 3,
-                (abs(bsi.sv_mvp[0].as_mv.col - bsi.sv_mvp[1].as_mv.col)) >> 3);
-      cal_step_param(sr, &bsi.sv_istep[0]);
-
-      sr = MAXF((abs(bsi.sv_mvp[2].as_mv.row - bsi.sv_mvp[3].as_mv.row)) >> 3,
-                (abs(bsi.sv_mvp[2].as_mv.col - bsi.sv_mvp[3].as_mv.col)) >> 3);
-      cal_step_param(sr, &bsi.sv_istep[1]);
-
-      rd_check_segment(cpi, x, &bsi, PARTITIONING_16X8,
-                       seg_mvs[PARTITIONING_16X8], txfm_cache);
-
-      /* If 8x8 is better than 16x8/8x16, then do 4x4 search */
-      /* Don't skip 4x4 if speed=0 (good quality) */
-      if (cpi->sf.no_skip_block4x4_search ||
-          bsi.segment_num == PARTITIONING_8X8) {
-        /* || (sv_segment_rd8x8-bsi.segment_rd) < sv_segment_rd8x8>>5) */
-        bsi.mvp.as_int = bsi.sv_mvp[0].as_int;
-        rd_check_segment(cpi, x, &bsi, PARTITIONING_4X4,
-                         seg_mvs[PARTITIONING_4X4], txfm_cache);
-      }
-
-      /* restore UMV window */
-      x->mv_col_min = tmp_col_min;
-      x->mv_col_max = tmp_col_max;
-      x->mv_row_min = tmp_row_min;
-      x->mv_row_max = tmp_row_max;
-    }
-  }
-
-  /* set it to the best */
-  for (i = 0; i < 16; i++) {
-    x->e_mbd.mode_info_context->bmi[i].as_mv[0].as_int = bsi.mvs[i].as_int;
-    if (mbmi->second_ref_frame > 0)
-      x->e_mbd.mode_info_context->bmi[i].as_mv[1].as_int =
-        bsi.second_mvs[i].as_int;
-    x->e_mbd.plane[0].eobs[i] = bsi.eobs[i];
-  }
-
-  /* save partitions */
-  mbmi->txfm_size = bsi.txfm_size;
-  mbmi->partitioning = bsi.segment_num;
-  x->partition_info->count = vp9_mbsplit_count[bsi.segment_num];
-
-  for (i = 0; i < x->partition_info->count; i++) {
-    int j;
-
-    j = vp9_mbsplit_offset[bsi.segment_num][i];
-
-    x->partition_info->bmi[i].mode = bsi.modes[j];
-    x->partition_info->bmi[i].mv.as_mv = bsi.mvs[j].as_mv;
-    if (mbmi->second_ref_frame > 0)
-      x->partition_info->bmi[i].second_mv.as_mv = bsi.second_mvs[j].as_mv;
-  }
-  /*
-   * used to set mbmi->mv.as_int
-   */
-  x->partition_info->bmi[15].mv.as_int = bsi.mvs[15].as_int;
-  if (mbmi->second_ref_frame > 0)
-    x->partition_info->bmi[15].second_mv.as_int = bsi.second_mvs[15].as_int;
-
-  *returntotrate = bsi.r;
-  *returndistortion = bsi.d;
-  *returnyrate = bsi.segment_yrate;
-  *skippable = vp9_sby_is_skippable(&x->e_mbd, BLOCK_SIZE_MB16X16);
-
-  return (int)(bsi.segment_rd);
-}
-#endif  // !CONFIG_SB8X8
-
 static void mv_pred(VP9_COMP *cpi, MACROBLOCK *x,
                     uint8_t *ref_y_buffer, int ref_y_stride,
                     int ref_frame, enum BlockSize block_size ) {
@@ -3016,24 +1603,10 @@
   x->mv_best_ref_index[ref_frame] = best_index;
 }
 
-#if !CONFIG_SB8X8
-static void set_i8x8_block_modes(MACROBLOCK *x, int modes[4]) {
-  int i;
-  MACROBLOCKD *xd = &x->e_mbd;
-  for (i = 0; i < 4; i++) {
-    int ib = vp9_i8x8_block[i];
-    xd->mode_info_context->bmi[ib + 0].as_mode.first = modes[i];
-    xd->mode_info_context->bmi[ib + 1].as_mode.first = modes[i];
-    xd->mode_info_context->bmi[ib + 4].as_mode.first = modes[i];
-    xd->mode_info_context->bmi[ib + 5].as_mode.first = modes[i];
-    // printf("%d,%d,%d,%d\n",
-    //       modes[0], modes[1], modes[2], modes[3]);
-  }
-}
-#endif
-
 extern void vp9_calc_ref_probs(int *count, vp9_prob *probs);
-static void estimate_curframe_refprobs(VP9_COMP *cpi, vp9_prob mod_refprobs[3], int pred_ref) {
+static void estimate_curframe_refprobs(VP9_COMP *cpi,
+                                       vp9_prob mod_refprobs[3],
+                                       int pred_ref) {
   int norm_cnt[MAX_REF_FRAMES];
   const int *const rfct = cpi->count_mb_ref_frame_usage;
   int intra_count = rfct[INTRA_FRAME];
@@ -3083,7 +1656,8 @@
   return (0x8000 + weight * cost1 + (0x10000 - weight) * cost0) >> 16;
 }
 
-static void estimate_ref_frame_costs(VP9_COMP *cpi, int segment_id, unsigned int *ref_costs) {
+static void estimate_ref_frame_costs(VP9_COMP *cpi, int segment_id,
+                                     unsigned int *ref_costs) {
   VP9_COMMON *cm = &cpi->common;
   MACROBLOCKD *xd = &cpi->mb.e_mbd;
   vp9_prob *mod_refprobs;
@@ -3132,10 +1706,10 @@
       // Get the prediction for the current mb
       cost = weighted_cost(&pred_prob, &new_pred_prob, 0,
                            pred_flag, cpi->seg0_progress);
-      if (cost > 1024) cost = 768; // i.e. account for 4 bits max.
+      if (cost > 1024) cost = 768;  // i.e. account for 4 bits max.
 
       // for incorrectly predicted cases
-      if (! pred_flag) {
+      if (!pred_flag) {
         vp9_prob curframe_mod_refprobs[3];
 
         if (cpi->seg0_progress) {
@@ -3243,6 +1817,51 @@
             frame_type, block_size);
 }
 
+
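+// Editor's note (not part of the original change): this helper maps a
+// block's pixel dimensions to the enum BlockSize index used to select the
+// matching variance function in cpi->fn_ptr[], e.g. get_block_size(16, 16)
+// returns BLOCK_16X16.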
+static enum BlockSize get_block_size(int bw, int bh) {
+  if (bw == 4 && bh == 4)
+    return BLOCK_4X4;
+
+  if (bw == 4 && bh == 8)
+    return BLOCK_4X8;
+
+  if (bw == 8 && bh == 4)
+    return BLOCK_8X4;
+
+  if (bw == 8 && bh == 8)
+    return BLOCK_8X8;
+
+  if (bw == 8 && bh == 16)
+    return BLOCK_8X16;
+
+  if (bw == 16 && bh == 8)
+    return BLOCK_16X8;
+
+  if (bw == 16 && bh == 16)
+    return BLOCK_16X16;
+
+  if (bw == 32 && bh == 32)
+    return BLOCK_32X32;
+
+  if (bw == 32 && bh == 16)
+    return BLOCK_32X16;
+
+  if (bw == 16 && bh == 32)
+    return BLOCK_16X32;
+
+  if (bw == 64 && bh == 32)
+    return BLOCK_64X32;
+
+  if (bw == 32 && bh == 64)
+    return BLOCK_32X64;
+
+  if (bw == 64 && bh == 64)
+    return BLOCK_64X64;
+
+  assert(0);
+  return -1;
+}
+
 static void model_rd_from_var_lapndz(int var, int n, int qstep,
                                      int *rate, int *dist) {
   // This function models the rate and distortion for a Laplacian
@@ -3286,6 +1905,36 @@
   vp9_clear_system_state();
 }
 
+static void model_rd_for_sb(VP9_COMP *cpi, BLOCK_SIZE_TYPE bsize,
+                            MACROBLOCK *x, MACROBLOCKD *xd,
+                            int *out_rate_sum, int *out_dist_sum) {
+  // Note our transform coeffs are 8 times an orthogonal transform.
+  // Hence quantizer step is also 8 times. To get effective quantizer
+  // we need to divide by 8 before sending to modeling function.
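+  // Editor's note (illustrative, not part of the original change): for a
+  // 16x16 luma plane bwl == bhl == 2 below, so the model is fed
+  // 16 << (2 + 2) = 256 pixels and an effective quantizer step of
+  // pd->dequant[1] / 8.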
+  unsigned int sse, var;
+  int i, rate_sum = 0, dist_sum = 0;
+
+  for (i = 0; i < MAX_MB_PLANE; ++i) {
+    struct macroblock_plane *const p = &x->plane[i];
+    struct macroblockd_plane *const pd = &xd->plane[i];
+
+    const int bwl = b_width_log2(bsize) - pd->subsampling_x;
+    const int bhl = b_height_log2(bsize) - pd->subsampling_y;
+    const enum BlockSize bs = get_block_size(4 << bwl, 4 << bhl);
+    int rate, dist;
+    var = cpi->fn_ptr[bs].vf(p->src.buf, p->src.stride,
+                             pd->dst.buf, pd->dst.stride, &sse);
+    model_rd_from_var_lapndz(var, 16 << (bwl + bhl),
+                             pd->dequant[1] >> 3, &rate, &dist);
+
+    rate_sum += rate;
+    dist_sum += dist;
+  }
+
+  *out_rate_sum = rate_sum;
+  *out_dist_sum = dist_sum;
+}
+
 static enum BlockSize y_to_uv_block_size(enum BlockSize bs) {
   switch (bs) {
     case BLOCK_64X64: return BLOCK_32X32;
@@ -3295,11 +1944,9 @@
     case BLOCK_32X16: return BLOCK_16X8;
     case BLOCK_16X32: return BLOCK_8X16;
     case BLOCK_16X16: return BLOCK_8X8;
-#if CONFIG_SB8X8
     case BLOCK_16X8:  return BLOCK_8X4;
     case BLOCK_8X16:  return BLOCK_4X8;
     case BLOCK_8X8:   return BLOCK_4X4;
-#endif
     default:
       assert(0);
       return -1;
@@ -3315,11 +1962,9 @@
     case BLOCK_SIZE_SB32X16: return BLOCK_32X16;
     case BLOCK_SIZE_SB16X32: return BLOCK_16X32;
     case BLOCK_SIZE_MB16X16: return BLOCK_16X16;
-#if CONFIG_SB8X8
     case BLOCK_SIZE_SB16X8:  return BLOCK_16X8;
     case BLOCK_SIZE_SB8X16:  return BLOCK_8X16;
     case BLOCK_SIZE_SB8X8:   return BLOCK_8X8;
-#endif
     default:
       assert(0);
       return -1;
@@ -3520,76 +2165,41 @@
   // Search for best switchable filter by checking the variance of
   // pred error irrespective of whether the filter will be used
   if (1) {
-    int switchable_filter_index, newbest;
-    int tmp_rate_y_i = 0, tmp_rate_u_i = 0, tmp_rate_v_i = 0;
-    int tmp_dist_y_i = 0, tmp_dist_u_i = 0, tmp_dist_v_i = 0;
-    for (switchable_filter_index = 0;
-         switchable_filter_index < VP9_SWITCHABLE_FILTERS;
-         ++switchable_filter_index) {
+    int i, newbest;
+    int tmp_rate_sum = 0, tmp_dist_sum = 0;
+    for (i = 0; i < VP9_SWITCHABLE_FILTERS; ++i) {
       int rs = 0;
-      mbmi->interp_filter = vp9_switchable_interp[switchable_filter_index];
-      vp9_setup_interp_filters(xd, mbmi->interp_filter, &cpi->common);
+      const INTERPOLATIONFILTERTYPE filter = vp9_switchable_interp[i];
+      const int is_intpel_interp = intpel_mv &&
+                                   vp9_is_interpolating_filter[filter];
+      mbmi->interp_filter = filter;
+      vp9_setup_interp_filters(xd, mbmi->interp_filter, cm);
 
-      if (cpi->common.mcomp_filter_type == SWITCHABLE) {
+      if (cm->mcomp_filter_type == SWITCHABLE) {
         const int c = vp9_get_pred_context(cm, xd, PRED_SWITCHABLE_INTERP);
         const int m = vp9_switchable_interp_map[mbmi->interp_filter];
         rs = SWITCHABLE_INTERP_RATE_FACTOR * x->switchable_interp_costs[c][m];
       }
-      if (interpolating_intpel_seen && intpel_mv &&
-          vp9_is_interpolating_filter[mbmi->interp_filter]) {
-        rd = RDCOST(x->rdmult, x->rddiv,
-                    rs + tmp_rate_y_i + tmp_rate_u_i + tmp_rate_v_i,
-                    tmp_dist_y_i + tmp_dist_u_i + tmp_dist_v_i);
+
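+      // Editor's note (not part of the original change): the reuse below
+      // assumes that, for integer-pel motion vectors, every interpolating
+      // filter yields the same prediction, so the rate/distortion modelled
+      // for the first such filter (tmp_rate_sum/tmp_dist_sum) can be reused
+      // instead of rebuilding the predictor.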
+      if (interpolating_intpel_seen && is_intpel_interp) {
+        rd = RDCOST(x->rdmult, x->rddiv, rs + tmp_rate_sum, tmp_dist_sum);
       } else {
-        unsigned int sse, var;
-        int tmp_rate_y, tmp_rate_u, tmp_rate_v;
-        int tmp_dist_y, tmp_dist_u, tmp_dist_v;
+        int rate_sum = 0, dist_sum = 0;
         vp9_build_inter_predictors_sb(xd, mi_row, mi_col, bsize);
-        var = cpi->fn_ptr[block_size].vf(x->plane[0].src.buf,
-                                         x->plane[0].src.stride,
-                                         xd->plane[0].dst.buf,
-                                         xd->plane[0].dst.stride,
-                                         &sse);
-        // Note our transform coeffs are 8 times an orthogonal transform.
-        // Hence quantizer step is also 8 times. To get effective quantizer
-        // we need to divide by 8 before sending to modeling function.
-        model_rd_from_var_lapndz(var, MI_SIZE * bw * MI_SIZE * bh,
-                                 xd->plane[0].dequant[1] >> 3,
-                                 &tmp_rate_y, &tmp_dist_y);
-        var = cpi->fn_ptr[uv_block_size].vf(x->plane[1].src.buf,
-                                            x->plane[1].src.stride,
-                                            xd->plane[1].dst.buf,
-                                            xd->plane[1].dst.stride,
-                                            &sse);
-        model_rd_from_var_lapndz(var, MI_UV_SIZE * bw * MI_UV_SIZE * bh,
-                                 xd->plane[1].dequant[1] >> 3,
-                                 &tmp_rate_u, &tmp_dist_u);
-        var = cpi->fn_ptr[uv_block_size].vf(x->plane[2].src.buf,
-                                            x->plane[1].src.stride,
-                                            xd->plane[2].dst.buf,
-                                            xd->plane[1].dst.stride,
-                                            &sse);
-        model_rd_from_var_lapndz(var, MI_UV_SIZE * bw * MI_UV_SIZE * bh,
-                                 xd->plane[2].dequant[1] >> 3,
-                                 &tmp_rate_v, &tmp_dist_v);
-        rd = RDCOST(x->rdmult, x->rddiv,
-                    rs + tmp_rate_y + tmp_rate_u + tmp_rate_v,
-                    tmp_dist_y + tmp_dist_u + tmp_dist_v);
-        if (!interpolating_intpel_seen && intpel_mv &&
-            vp9_is_interpolating_filter[mbmi->interp_filter]) {
-          tmp_rate_y_i = tmp_rate_y;
-          tmp_rate_u_i = tmp_rate_u;
-          tmp_rate_v_i = tmp_rate_v;
-          tmp_dist_y_i = tmp_dist_y;
-          tmp_dist_u_i = tmp_dist_u;
-          tmp_dist_v_i = tmp_dist_v;
+        model_rd_for_sb(cpi, bsize, x, xd, &rate_sum, &dist_sum);
+        rd = RDCOST(x->rdmult, x->rddiv, rs + rate_sum, dist_sum);
+        if (!interpolating_intpel_seen && is_intpel_interp) {
+          tmp_rate_sum = rate_sum;
+          tmp_dist_sum = dist_sum;
         }
       }
-      newbest = (switchable_filter_index == 0 || rd < best_rd);
+      newbest = i == 0 || rd < best_rd;
+
       if (newbest) {
         best_rd = rd;
         *best_filter = mbmi->interp_filter;
       }
+
       if ((cm->mcomp_filter_type == SWITCHABLE && newbest) ||
           (cm->mcomp_filter_type != SWITCHABLE &&
            cm->mcomp_filter_type == mbmi->interp_filter)) {
@@ -3604,21 +2214,19 @@
                      sizeof(unsigned char) * MI_UV_SIZE * bw);
         for (i = 0; i < MI_UV_SIZE * bh; ++i)
           vpx_memcpy(tmp_vbuf + i * MI_UV_SIZE * bw,
-                     xd->plane[2].dst.buf + i * xd->plane[1].dst.stride,
+                     xd->plane[2].dst.buf + i * xd->plane[2].dst.stride,
                      sizeof(unsigned char) * MI_UV_SIZE * bw);
         pred_exists = 1;
       }
-      interpolating_intpel_seen |=
-        intpel_mv && vp9_is_interpolating_filter[mbmi->interp_filter];
+      interpolating_intpel_seen |= is_intpel_interp;
     }
   }
 
   // Set the appropriate filter
-  if (cm->mcomp_filter_type != SWITCHABLE)
-    mbmi->interp_filter = cm->mcomp_filter_type;
-  else
-    mbmi->interp_filter = *best_filter;
-  vp9_setup_interp_filters(xd, mbmi->interp_filter, &cpi->common);
+  mbmi->interp_filter = cm->mcomp_filter_type != SWITCHABLE ?
+                             cm->mcomp_filter_type : *best_filter;
+  vp9_setup_interp_filters(xd, mbmi->interp_filter, cm);
+
 
   if (pred_exists) {
     // FIXME(rbultje): mb code still predicts into xd->predictor
@@ -3631,7 +2239,7 @@
                  tmp_ubuf + i * bw * MI_UV_SIZE,
                  sizeof(unsigned char) * bw * MI_UV_SIZE);
     for (i = 0; i < bh * MI_UV_SIZE; ++i)
-      vpx_memcpy(xd->plane[2].dst.buf + i * xd->plane[1].dst.stride,
+      vpx_memcpy(xd->plane[2].dst.buf + i * xd->plane[2].dst.stride,
                  tmp_vbuf + i * bw * MI_UV_SIZE,
                  sizeof(unsigned char) * bw * MI_UV_SIZE);
   } else {
@@ -3747,869 +2355,6 @@
   return this_rd;  // if 0, this will be re-calculated by caller
 }
 
-#if !CONFIG_SB8X8
-static void rd_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
-                               int mi_row, int mi_col,
-                               int *returnrate, int *returndistortion,
-                               int64_t *returnintra) {
-  static const int flag_list[4] = { 0, VP9_LAST_FLAG, VP9_GOLD_FLAG,
-    VP9_ALT_FLAG };
-  VP9_COMMON *cm = &cpi->common;
-  MACROBLOCKD *xd = &x->e_mbd;
-  union b_mode_info best_bmodes[16];
-  MB_MODE_INFO best_mbmode;
-  PARTITION_INFO best_partition;
-  int_mv best_ref_mv, second_best_ref_mv;
-  MB_PREDICTION_MODE this_mode;
-  MB_PREDICTION_MODE best_mode = DC_PRED;
-  MB_MODE_INFO * mbmi = &xd->mode_info_context->mbmi;
-  int i, best_mode_index = 0;
-  int mode8x8[4];
-  unsigned char segment_id = mbmi->segment_id;
-
-  int mode_index;
-  int mdcounts[4];
-  int rate, distortion;
-  int rate2, distortion2;
-  int64_t best_txfm_rd[NB_TXFM_MODES];
-  int64_t best_txfm_diff[NB_TXFM_MODES];
-  int64_t best_pred_diff[NB_PREDICTION_TYPES];
-  int64_t best_pred_rd[NB_PREDICTION_TYPES];
-  int64_t best_rd = INT64_MAX, best_intra_rd = INT64_MAX;
-#if CONFIG_COMP_INTERINTRA_PRED
-  int is_best_interintra = 0;
-  int64_t best_intra16_rd = INT64_MAX;
-  int best_intra16_mode = DC_PRED;
-#if SEPARATE_INTERINTRA_UV
-  int best_intra16_uv_mode = DC_PRED;
-#endif
-#endif
-  int64_t best_overall_rd = INT64_MAX;
-  INTERPOLATIONFILTERTYPE best_filter = SWITCHABLE;
-  INTERPOLATIONFILTERTYPE tmp_best_filter = SWITCHABLE;
-  int uv_intra_rate[2], uv_intra_distortion[2], uv_intra_rate_tokenonly[2];
-  int uv_intra_skippable[2];
-  MB_PREDICTION_MODE uv_intra_mode[2];
-  int rate_y, UNINITIALIZED_IS_SAFE(rate_uv);
-  int distortion_uv = INT_MAX;
-  int64_t best_yrd = INT64_MAX;
-
-  int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES];
-  int frame_mdcounts[4][4];
-  YV12_BUFFER_CONFIG yv12_mb[4];
-
-  unsigned int ref_costs[MAX_REF_FRAMES];
-  int_mv seg_mvs[NB_PARTITIONINGS][16 /* n_blocks */][MAX_REF_FRAMES - 1];
-
-  int intra_cost_penalty = 20 * vp9_dc_quant(cpi->common.base_qindex,
-                                             cpi->common.y_dc_delta_q);
-  int64_t mode_distortions[MB_MODE_COUNT] = {-1};
-  int64_t frame_distortions[MAX_REF_FRAMES] = {-1};
-  int ref_frame;
-
-  struct scale_factors scale_factor[4];
-
-  vpx_memset(mode8x8, 0, sizeof(mode8x8));
-  vpx_memset(&frame_mv, 0, sizeof(frame_mv));
-  vpx_memset(&best_mbmode, 0, sizeof(best_mbmode));
-  vpx_memset(&best_bmodes, 0, sizeof(best_bmodes));
-  vpx_memset(&x->mb_context[xd->sb_index][xd->mb_index], 0,
-             sizeof(PICK_MODE_CONTEXT));
-
-  x->mb_context[xd->sb_index][xd->mb_index].frames_with_high_error = 0;
-  x->mb_context[xd->sb_index][xd->mb_index].modes_with_high_error = 0;
-
-  for (i = 0; i < MAX_REF_FRAMES; i++)
-    frame_mv[NEWMV][i].as_int = INVALID_MV;
-  for (i = 0; i < NB_PREDICTION_TYPES; ++i)
-    best_pred_rd[i] = INT64_MAX;
-  for (i = 0; i < NB_TXFM_MODES; i++)
-    best_txfm_rd[i] = INT64_MAX;
-
-  for (i = 0; i < NB_PARTITIONINGS; i++) {
-    int j, k;
-
-    for (j = 0; j < 16; j++)
-      for (k = 0; k < MAX_REF_FRAMES - 1; k++)
-        seg_mvs[i][j][k].as_int = INVALID_MV;
-  }
-
-  if (cpi->ref_frame_flags & VP9_LAST_FLAG) {
-    setup_buffer_inter(cpi, x, cpi->lst_fb_idx,
-                       LAST_FRAME, BLOCK_16X16, mi_row, mi_col,
-                       frame_mv[NEARESTMV], frame_mv[NEARMV],
-                       frame_mdcounts, yv12_mb, scale_factor);
-  }
-
-  if (cpi->ref_frame_flags & VP9_GOLD_FLAG) {
-    setup_buffer_inter(cpi, x, cpi->gld_fb_idx,
-                       GOLDEN_FRAME, BLOCK_16X16, mi_row, mi_col,
-                       frame_mv[NEARESTMV], frame_mv[NEARMV],
-                       frame_mdcounts, yv12_mb, scale_factor);
-  }
-
-  if (cpi->ref_frame_flags & VP9_ALT_FLAG) {
-    setup_buffer_inter(cpi, x, cpi->alt_fb_idx,
-                       ALTREF_FRAME, BLOCK_16X16, mi_row, mi_col,
-                       frame_mv[NEARESTMV], frame_mv[NEARMV],
-                       frame_mdcounts, yv12_mb, scale_factor);
-  }
-
-  *returnintra = INT64_MAX;
-
-  mbmi->ref_frame = INTRA_FRAME;
-
-  /* Initialize zbin mode boost for uv costing */
-  cpi->zbin_mode_boost = 0;
-  vp9_update_zbin_extra(cpi, x);
-
-  xd->mode_info_context->mbmi.mode = DC_PRED;
-
-  for (i = 0; i <= TX_8X8; i++) {
-    mbmi->txfm_size = i;
-    rd_pick_intra_sbuv_mode(cpi, x, &uv_intra_rate[i],
-                            &uv_intra_rate_tokenonly[i],
-                            &uv_intra_distortion[i],
-                            &uv_intra_skippable[i],
-                            BLOCK_SIZE_MB16X16);
-    uv_intra_mode[i] = mbmi->uv_mode;
-  }
-
-  // Get estimates of reference frame costs for each reference frame
-  // that depend on the current prediction etc.
-  estimate_ref_frame_costs(cpi, segment_id, ref_costs);
-
-  for (mode_index = 0; mode_index < MAX_MODES; ++mode_index) {
-    int64_t this_rd = INT64_MAX;
-    int disable_skip = 0, skippable = 0;
-    int other_cost = 0;
-    int compmode_cost = 0;
-#if CONFIG_COMP_INTERINTRA_PRED
-    int compmode_interintra_cost = 0;
-#endif
-    int mode_excluded = 0;
-    int64_t txfm_cache[NB_TXFM_MODES] = { 0 };
-    YV12_BUFFER_CONFIG *scaled_ref_frame;
-
-    // These variables hold our rolling total cost and distortion for this mode
-    rate2 = 0;
-    distortion2 = 0;
-    rate_y = 0;
-    rate_uv = 0;
-
-    x->skip = 0;
-
-    this_mode = vp9_mode_order[mode_index].mode;
-    mbmi->mode = this_mode;
-    mbmi->uv_mode = DC_PRED;
-    mbmi->ref_frame = vp9_mode_order[mode_index].ref_frame;
-    mbmi->second_ref_frame = vp9_mode_order[mode_index].second_ref_frame;
-
-    mbmi->interp_filter = cm->mcomp_filter_type;
-
-    set_scale_factors(xd, mbmi->ref_frame, mbmi->second_ref_frame,
-                      scale_factor);
-
-    vp9_setup_interp_filters(xd, mbmi->interp_filter, &cpi->common);
-
-    // Test best rd so far against threshold for trying this mode.
-    if (best_rd <= cpi->rd_threshes[mode_index])
-      continue;
-
-    // Ensure that the references used by this mode are available.
-    if (mbmi->ref_frame &&
-        !(cpi->ref_frame_flags & flag_list[mbmi->ref_frame]))
-      continue;
-
-    if (mbmi->second_ref_frame > 0 &&
-        !(cpi->ref_frame_flags & flag_list[mbmi->second_ref_frame]))
-      continue;
-
-    // only scale on zeromv.
-    if (mbmi->ref_frame > 0 &&
-          (yv12_mb[mbmi->ref_frame].y_width != cm->mb_cols * 16 ||
-           yv12_mb[mbmi->ref_frame].y_height != cm->mb_rows * 16) &&
-        this_mode != ZEROMV)
-      continue;
-
-    if (mbmi->second_ref_frame > 0 &&
-          (yv12_mb[mbmi->second_ref_frame].y_width != cm->mb_cols * 16 ||
-           yv12_mb[mbmi->second_ref_frame].y_height != cm->mb_rows * 16) &&
-        this_mode != ZEROMV)
-      continue;
-
-    // current coding mode under rate-distortion optimization test loop
-#if CONFIG_COMP_INTERINTRA_PRED
-    mbmi->interintra_mode = (MB_PREDICTION_MODE)(DC_PRED - 1);
-    mbmi->interintra_uv_mode = (MB_PREDICTION_MODE)(DC_PRED - 1);
-#endif
-
-    // If the segment reference frame feature is enabled....
-    // then do nothing if the current ref frame is not allowed..
-    if (vp9_segfeature_active(xd, segment_id, SEG_LVL_REF_FRAME) &&
-        !vp9_check_segref(xd, segment_id, mbmi->ref_frame)) {
-      continue;
-    // If the segment skip feature is enabled....
-    // then do nothing if the current mode is not allowed..
-    } else if (vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP) &&
-               (this_mode != ZEROMV)) {
-      continue;
-    // Disable this drop out case if the ref frame segment
-    // level feature is enabled for this segment. This is to
-    // prevent the possibility that we end up unable to pick any mode.
-    } else if (!vp9_segfeature_active(xd, segment_id, SEG_LVL_REF_FRAME)) {
-      // Only consider ZEROMV/ALTREF_FRAME for alt ref frame overlay,
-      // unless ARNR filtering is enabled in which case we want
-      // an unfiltered alternative
-      if (cpi->is_src_frame_alt_ref && (cpi->oxcf.arnr_max_frames == 0)) {
-        if (this_mode != ZEROMV ||
-            mbmi->ref_frame != ALTREF_FRAME) {
-          continue;
-        }
-      }
-    }
-
-    /* everything but intra */
-    scaled_ref_frame = NULL;
-    if (mbmi->ref_frame) {
-      int ref = mbmi->ref_frame;
-      int fb;
-
-      best_ref_mv = mbmi->ref_mvs[ref][0];
-      vpx_memcpy(mdcounts, frame_mdcounts[ref], sizeof(mdcounts));
-
-      if (mbmi->ref_frame == LAST_FRAME) {
-        fb = cpi->lst_fb_idx;
-      } else if (mbmi->ref_frame == GOLDEN_FRAME) {
-        fb = cpi->gld_fb_idx;
-      } else {
-        fb = cpi->alt_fb_idx;
-      }
-
-      if (cpi->scaled_ref_idx[fb] != cm->ref_frame_map[fb])
-        scaled_ref_frame = &cm->yv12_fb[cpi->scaled_ref_idx[fb]];
-    }
-
-    if (mbmi->second_ref_frame > 0) {
-      int ref = mbmi->second_ref_frame;
-
-      second_best_ref_mv = mbmi->ref_mvs[ref][0];
-    }
-
-    // TODO(jkoleszar) scaling/translation handled during creation of yv12_mb
-    // currently.
-    setup_pre_planes(xd, &yv12_mb[mbmi->ref_frame],
-        mbmi->second_ref_frame > 0 ? &yv12_mb[mbmi->second_ref_frame] : NULL,
-        0, 0, NULL, NULL);
-
-    // Experimental code. Special case for gf and arf zeromv modes.
-    // Increase zbin size to suppress noise
-    if (cpi->zbin_mode_boost_enabled) {
-      if (vp9_mode_order[mode_index].ref_frame == INTRA_FRAME)
-        cpi->zbin_mode_boost = 0;
-      else {
-        if (vp9_mode_order[mode_index].mode == ZEROMV) {
-          if (vp9_mode_order[mode_index].ref_frame != LAST_FRAME)
-            cpi->zbin_mode_boost = GF_ZEROMV_ZBIN_BOOST;
-          else
-            cpi->zbin_mode_boost = LF_ZEROMV_ZBIN_BOOST;
-        } else if (vp9_mode_order[mode_index].mode == SPLITMV)
-          cpi->zbin_mode_boost = 0;
-        else
-          cpi->zbin_mode_boost = MV_ZBIN_BOOST;
-      }
-
-      vp9_update_zbin_extra(cpi, x);
-    }
-
-    // Intra
-    if (!mbmi->ref_frame) {
-      switch (this_mode) {
-        default:
-        case V_PRED:
-        case H_PRED:
-        case D45_PRED:
-        case D135_PRED:
-        case D117_PRED:
-        case D153_PRED:
-        case D27_PRED:
-        case D63_PRED:
-          rate2 += intra_cost_penalty;
-        case DC_PRED:
-        case TM_PRED:
-          mbmi->ref_frame = INTRA_FRAME;
-          // FIXME compound intra prediction
-          vp9_build_intra_predictors_sby_s(&x->e_mbd, BLOCK_SIZE_MB16X16);
-          // vp9_build_intra_predictors_mby(&x->e_mbd);
-          super_block_yrd(cpi, x, &rate_y, &distortion, &skippable,
-                          BLOCK_SIZE_MB16X16, txfm_cache);
-          rate2 += rate_y;
-          distortion2 += distortion;
-          rate2 += x->mbmode_cost[xd->frame_type][mbmi->mode];
-
-          rate2 += uv_intra_rate[mbmi->txfm_size != TX_4X4];
-          rate_uv = uv_intra_rate_tokenonly[mbmi->txfm_size != TX_4X4];
-          distortion2 += uv_intra_distortion[mbmi->txfm_size != TX_4X4];
-          distortion_uv = uv_intra_distortion[mbmi->txfm_size != TX_4X4];
-          skippable = skippable &&
-                      uv_intra_skippable[mbmi->txfm_size != TX_4X4];
-          break;
-        case I4X4_PRED: {
-          int64_t tmp_rd;
-
-          // Note the rate value returned here includes the cost of coding
-          // the I4X4_PRED mode : x->mbmode_cost[xd->frame_type][I4X4_PRED];
-          mbmi->txfm_size = TX_4X4;
-          tmp_rd = rd_pick_intra4x4mby_modes(cpi, x, &rate, &rate_y,
-                                             &distortion, best_yrd);
-          rate2 += rate;
-          rate2 += intra_cost_penalty;
-          distortion2 += distortion;
-
-          if (tmp_rd < best_yrd) {
-            rate2 += uv_intra_rate[TX_4X4];
-            rate_uv = uv_intra_rate_tokenonly[TX_4X4];
-            distortion2 += uv_intra_distortion[TX_4X4];
-            distortion_uv = uv_intra_distortion[TX_4X4];
-          } else {
-            this_rd = INT64_MAX;
-            disable_skip = 1;
-          }
-        }
-        break;
-        case I8X8_PRED: {
-          int64_t tmp_rd;
-
-          tmp_rd = rd_pick_intra8x8mby_modes_and_txsz(cpi, x, &rate, &rate_y,
-                                                      &distortion, mode8x8,
-                                                      best_yrd, txfm_cache);
-          rate2 += rate;
-          rate2 += intra_cost_penalty;
-          distortion2 += distortion;
-
-          /* TODO: uv rate may be over-estimated here since there is UV intra
-                   mode coded in I8X8_PRED prediction */
-          if (tmp_rd < best_yrd) {
-            rate2 += uv_intra_rate[TX_4X4];
-            rate_uv = uv_intra_rate_tokenonly[TX_4X4];
-            distortion2 += uv_intra_distortion[TX_4X4];
-            distortion_uv = uv_intra_distortion[TX_4X4];
-          } else {
-            this_rd = INT64_MAX;
-            disable_skip = 1;
-          }
-        }
-        break;
-      }
-    }
-    // Split MV. The code is very different from the other inter modes so
-    // special case it.
-    else if (this_mode == SPLITMV) {
-      const int is_comp_pred = mbmi->second_ref_frame > 0;
-      int64_t this_rd_thresh;
-      int64_t tmp_rd, tmp_best_rd = INT64_MAX, tmp_best_rdu = INT64_MAX;
-      int tmp_best_rate = INT_MAX, tmp_best_ratey = INT_MAX;
-      int tmp_best_distortion = INT_MAX, tmp_best_skippable = 0;
-      int switchable_filter_index;
-      int_mv *second_ref = is_comp_pred ? &second_best_ref_mv : NULL;
-      union b_mode_info tmp_best_bmodes[16];
-      MB_MODE_INFO tmp_best_mbmode;
-      PARTITION_INFO tmp_best_partition;
-      int pred_exists = 0;
-
-      this_rd_thresh =
-          (mbmi->ref_frame == LAST_FRAME) ?
-          cpi->rd_threshes[THR_NEWMV] : cpi->rd_threshes[THR_NEWA];
-      this_rd_thresh =
-          (mbmi->ref_frame == GOLDEN_FRAME) ?
-          cpi->rd_threshes[THR_NEWG] : this_rd_thresh;
-      xd->mode_info_context->mbmi.txfm_size = TX_4X4;
-
-      for (switchable_filter_index = 0;
-           switchable_filter_index < VP9_SWITCHABLE_FILTERS;
-           ++switchable_filter_index) {
-        int newbest;
-        mbmi->interp_filter =
-            vp9_switchable_interp[switchable_filter_index];
-        vp9_setup_interp_filters(xd, mbmi->interp_filter, &cpi->common);
-
-        tmp_rd = rd_pick_best_mbsegmentation(cpi, x, &best_ref_mv,
-                                             second_ref, best_yrd, mdcounts,
-                                             &rate, &rate_y, &distortion,
-                                             &skippable,
-                                             (int)this_rd_thresh, seg_mvs,
-                                             txfm_cache);
-        if (cpi->common.mcomp_filter_type == SWITCHABLE) {
-          int rs = SWITCHABLE_INTERP_RATE_FACTOR * x->switchable_interp_costs
-                   [vp9_get_pred_context(&cpi->common, xd,
-                                         PRED_SWITCHABLE_INTERP)]
-                   [vp9_switchable_interp_map[mbmi->interp_filter]];
-          tmp_rd += RDCOST(x->rdmult, x->rddiv, rs, 0);
-        }
-        newbest = (tmp_rd < tmp_best_rd);
-        if (newbest) {
-          tmp_best_filter = mbmi->interp_filter;
-          tmp_best_rd = tmp_rd;
-        }
-        if ((newbest && cm->mcomp_filter_type == SWITCHABLE) ||
-            (mbmi->interp_filter == cm->mcomp_filter_type &&
-             cm->mcomp_filter_type != SWITCHABLE)) {
-          tmp_best_rdu = tmp_rd;
-          tmp_best_rate = rate;
-          tmp_best_ratey = rate_y;
-          tmp_best_distortion = distortion;
-          tmp_best_skippable = skippable;
-          vpx_memcpy(&tmp_best_mbmode, mbmi, sizeof(MB_MODE_INFO));
-          vpx_memcpy(&tmp_best_partition, x->partition_info,
-                     sizeof(PARTITION_INFO));
-          for (i = 0; i < 16; i++) {
-            tmp_best_bmodes[i] = xd->mode_info_context->bmi[i];
-          }
-          pred_exists = 1;
-        }
-      }  // switchable_filter_index loop
-
-      mbmi->interp_filter = (cm->mcomp_filter_type == SWITCHABLE ?
-                             tmp_best_filter : cm->mcomp_filter_type);
-      vp9_setup_interp_filters(xd, mbmi->interp_filter, &cpi->common);
-      if (!pred_exists) {
-        // Handles the special case when a filter that is not in the
-        // switchable list (bilinear, 6-tap) is indicated at the frame level
-        tmp_rd = rd_pick_best_mbsegmentation(cpi, x, &best_ref_mv,
-                                             second_ref, best_yrd, mdcounts,
-                                             &rate, &rate_y, &distortion,
-                                             &skippable,
-                                             (int)this_rd_thresh, seg_mvs,
-                                             txfm_cache);
-      } else {
-        if (cpi->common.mcomp_filter_type == SWITCHABLE) {
-          int rs = SWITCHABLE_INTERP_RATE_FACTOR * x->switchable_interp_costs
-                   [vp9_get_pred_context(&cpi->common, xd,
-                                         PRED_SWITCHABLE_INTERP)]
-                   [vp9_switchable_interp_map[mbmi->interp_filter]];
-          tmp_best_rdu -= RDCOST(x->rdmult, x->rddiv, rs, 0);
-        }
-        tmp_rd = tmp_best_rdu;
-        rate = tmp_best_rate;
-        rate_y = tmp_best_ratey;
-        distortion = tmp_best_distortion;
-        skippable = tmp_best_skippable;
-        vpx_memcpy(mbmi, &tmp_best_mbmode, sizeof(MB_MODE_INFO));
-        vpx_memcpy(x->partition_info, &tmp_best_partition,
-                   sizeof(PARTITION_INFO));
-        for (i = 0; i < 16; i++) {
-          xd->mode_info_context->bmi[i] = tmp_best_bmodes[i];
-        }
-      }
-
-      rate2 += rate;
-      distortion2 += distortion;
-
-      if (cpi->common.mcomp_filter_type == SWITCHABLE)
-        rate2 += SWITCHABLE_INTERP_RATE_FACTOR * x->switchable_interp_costs
-            [vp9_get_pred_context(&cpi->common, xd, PRED_SWITCHABLE_INTERP)]
-            [vp9_switchable_interp_map[mbmi->interp_filter]];
-
-      // If even the 'Y' rd value of split is higher than best so far
-      // then don't bother looking at UV
-      if (tmp_rd < best_yrd) {
-        int uv_skippable;
-
-        vp9_build_inter_predictors_sbuv(&x->e_mbd, mi_row, mi_col,
-                                        BLOCK_SIZE_MB16X16);
-
-        vp9_subtract_sbuv(x, BLOCK_SIZE_MB16X16);
-
-        super_block_uvrd_4x4(cm, x, &rate_uv, &distortion_uv,
-                             &uv_skippable, BLOCK_SIZE_MB16X16);
-        rate2 += rate_uv;
-        distortion2 += distortion_uv;
-        skippable = skippable && uv_skippable;
-      } else {
-        this_rd = INT64_MAX;
-        disable_skip = 1;
-      }
-
-      if (!mode_excluded) {
-        if (is_comp_pred)
-          mode_excluded = cpi->common.comp_pred_mode == SINGLE_PREDICTION_ONLY;
-        else
-          mode_excluded = cpi->common.comp_pred_mode == COMP_PREDICTION_ONLY;
-      }
-
-      compmode_cost =
-        vp9_cost_bit(vp9_get_pred_prob(cm, xd, PRED_COMP), is_comp_pred);
-      mbmi->mode = this_mode;
-    }
-    else {
-#if CONFIG_COMP_INTERINTRA_PRED
-      if (mbmi->second_ref_frame == INTRA_FRAME) {
-        if (best_intra16_mode == DC_PRED - 1) continue;
-        mbmi->interintra_mode = best_intra16_mode;
-#if SEPARATE_INTERINTRA_UV
-        mbmi->interintra_uv_mode = best_intra16_uv_mode;
-#else
-        mbmi->interintra_uv_mode = best_intra16_mode;
-#endif
-      }
-#endif
-      this_rd = handle_inter_mode(cpi, x, BLOCK_SIZE_MB16X16,
-                                  mdcounts, txfm_cache,
-                                  &rate2, &distortion2, &skippable,
-                                  &compmode_cost,
-#if CONFIG_COMP_INTERINTRA_PRED
-                                  &compmode_interintra_cost,
-#endif
-                                  &rate_y, &distortion,
-                                  &rate_uv, &distortion_uv,
-                                  &mode_excluded, &disable_skip,
-                                  mode_index, &tmp_best_filter, frame_mv,
-                                  scaled_ref_frame, mi_row, mi_col);
-      if (this_rd == INT64_MAX)
-        continue;
-    }
-
-#if CONFIG_COMP_INTERINTRA_PRED
-    if (cpi->common.use_interintra)
-      rate2 += compmode_interintra_cost;
-#endif
-
-    if (cpi->common.comp_pred_mode == HYBRID_PREDICTION)
-      rate2 += compmode_cost;
-
-    // Estimate the reference frame signaling cost and add it
-    // to the rolling cost variable.
-    rate2 += ref_costs[mbmi->ref_frame];
-
-    if (!disable_skip) {
-      // Test for the condition where skip block will be activated
-      // because there are no non-zero coefficients and make any
-      // necessary adjustment for rate. Ignore if skip is coded at
-      // segment level as the cost won't have been added in.
-      int mb_skip_allowed;
-
-      // Is Mb level skip allowed (i.e. not coded at segment level).
-      mb_skip_allowed = !vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP);
-
-      if (skippable) {
-        mbmi->mb_skip_coeff = 1;
-
-        // Back out the coefficient coding costs
-        rate2 -= (rate_y + rate_uv);
-        // for best_yrd calculation
-        rate_uv = 0;
-
-        if (mb_skip_allowed) {
-          int prob_skip_cost;
-
-          // Cost the skip mb case
-          vp9_prob skip_prob =
-            vp9_get_pred_prob(cm, &x->e_mbd, PRED_MBSKIP);
-
-          if (skip_prob) {
-            prob_skip_cost = vp9_cost_bit(skip_prob, 1);
-            rate2 += prob_skip_cost;
-            other_cost += prob_skip_cost;
-          }
-        }
-      } else {
-        // Add in the cost of the no skip flag.
-        mbmi->mb_skip_coeff = 0;
-        if (mb_skip_allowed) {
-          int prob_skip_cost = vp9_cost_bit(
-                 vp9_get_pred_prob(cm, &x->e_mbd, PRED_MBSKIP), 0);
-          rate2 += prob_skip_cost;
-          other_cost += prob_skip_cost;
-        }
-      }
-
-      // Calculate the final RD estimate for this mode.
-      this_rd = RDCOST(x->rdmult, x->rddiv, rate2, distortion2);
-    }
-
-    // Keep record of best intra distortion
-    if ((mbmi->ref_frame == INTRA_FRAME) &&
-        (this_rd < best_intra_rd)) {
-      best_intra_rd = this_rd;
-      *returnintra = distortion2;
-    }
-#if CONFIG_COMP_INTERINTRA_PRED
-    if ((mbmi->ref_frame == INTRA_FRAME) &&
-        (this_mode <= TM_PRED) &&
-        (this_rd < best_intra16_rd)) {
-      best_intra16_rd = this_rd;
-      best_intra16_mode = this_mode;
-#if SEPARATE_INTERINTRA_UV
-      best_intra16_uv_mode = uv_intra_mode[mbmi->txfm_size != TX_4X4];
-#endif
-    }
-#endif
-
-    if (!disable_skip && mbmi->ref_frame == INTRA_FRAME)
-      for (i = 0; i < NB_PREDICTION_TYPES; ++i)
-        best_pred_rd[i] = MIN(best_pred_rd[i], this_rd);
-
-    if (this_rd < best_overall_rd) {
-      best_overall_rd = this_rd;
-      best_filter = tmp_best_filter;
-      best_mode = this_mode;
-#if CONFIG_COMP_INTERINTRA_PRED
-      is_best_interintra = (mbmi->second_ref_frame == INTRA_FRAME);
-#endif
-    }
-
-    // Store the respective mode distortions for later use.
-    if (mode_distortions[this_mode] == -1
-        || distortion2 < mode_distortions[this_mode]) {
-      mode_distortions[this_mode] = distortion2;
-    }
-    if (frame_distortions[mbmi->ref_frame] == -1 ||
-        distortion2 < frame_distortions[mbmi->ref_frame]) {
-       frame_distortions[mbmi->ref_frame] = distortion2;
-    }
-
-    // Did this mode help, i.e. is it the new best mode?
-    if (this_rd < best_rd || x->skip) {
-      if (!mode_excluded) {
-        /*
-        if (mbmi->second_ref_frame == INTRA_FRAME) {
-          printf("rd %d best %d bestintra16 %d\n", this_rd, best_rd, best_intra16_rd);
-        }
-        */
-        // Note index of best mode so far
-        best_mode_index = mode_index;
-
-        if (this_mode <= I4X4_PRED) {
-          if (mbmi->txfm_size != TX_4X4
-              && this_mode != I4X4_PRED
-              && this_mode != I8X8_PRED)
-            mbmi->uv_mode = uv_intra_mode[TX_8X8];
-          else
-            mbmi->uv_mode = uv_intra_mode[TX_4X4];
-          /* required for left and above block mv */
-          mbmi->mv[0].as_int = 0;
-        }
-
-        other_cost += ref_costs[mbmi->ref_frame];
-
-        /* Calculate the final y RD estimate for this mode */
-        best_yrd = RDCOST(x->rdmult, x->rddiv, (rate2 - rate_uv - other_cost),
-                          (distortion2 - distortion_uv));
-
-        *returnrate = rate2;
-        *returndistortion = distortion2;
-        best_rd = this_rd;
-        vpx_memcpy(&best_mbmode, mbmi, sizeof(MB_MODE_INFO));
-        vpx_memcpy(&best_partition, x->partition_info, sizeof(PARTITION_INFO));
-
-        if ((this_mode == I4X4_PRED)
-            || (this_mode == I8X8_PRED)
-            || (this_mode == SPLITMV))
-          for (i = 0; i < 16; i++) {
-            best_bmodes[i] = xd->mode_info_context->bmi[i];
-          }
-      }
-
-      // Testing this mode gave rise to an improvement in best error score.
-      // Lower threshold a bit for next time
-      cpi->rd_thresh_mult[mode_index] =
-          (cpi->rd_thresh_mult[mode_index] >= (MIN_THRESHMULT + 2)) ?
-          cpi->rd_thresh_mult[mode_index] - 2 : MIN_THRESHMULT;
-      cpi->rd_threshes[mode_index] =
-          (cpi->rd_baseline_thresh[mode_index] >> 7) *
-          cpi->rd_thresh_mult[mode_index];
-    } else {
-      // If the mode did not help improve the best error case then raise the
-      // threshold for testing that mode next time around.
-      cpi->rd_thresh_mult[mode_index] += 4;
-
-      if (cpi->rd_thresh_mult[mode_index] > MAX_THRESHMULT)
-        cpi->rd_thresh_mult[mode_index] = MAX_THRESHMULT;
-
-      cpi->rd_threshes[mode_index] = (cpi->rd_baseline_thresh[mode_index] >> 7)
-          * cpi->rd_thresh_mult[mode_index];
-    }
-
-    /* keep record of best compound/single-only prediction */
-    if (!disable_skip && mbmi->ref_frame != INTRA_FRAME) {
-      int64_t single_rd, hybrid_rd;
-      int single_rate, hybrid_rate;
-
-      if (cpi->common.comp_pred_mode == HYBRID_PREDICTION) {
-        single_rate = rate2 - compmode_cost;
-        hybrid_rate = rate2;
-      } else {
-        single_rate = rate2;
-        hybrid_rate = rate2 + compmode_cost;
-      }
-
-      single_rd = RDCOST(x->rdmult, x->rddiv, single_rate, distortion2);
-      hybrid_rd = RDCOST(x->rdmult, x->rddiv, hybrid_rate, distortion2);
-
-      if (mbmi->second_ref_frame <= INTRA_FRAME &&
-          single_rd < best_pred_rd[SINGLE_PREDICTION_ONLY]) {
-        best_pred_rd[SINGLE_PREDICTION_ONLY] = single_rd;
-      } else if (mbmi->second_ref_frame > INTRA_FRAME &&
-                 single_rd < best_pred_rd[COMP_PREDICTION_ONLY]) {
-        best_pred_rd[COMP_PREDICTION_ONLY] = single_rd;
-      }
-      if (hybrid_rd < best_pred_rd[HYBRID_PREDICTION])
-        best_pred_rd[HYBRID_PREDICTION] = hybrid_rd;
-    }
-
-    /* keep record of best txfm size */
-    if (!mode_excluded && this_rd != INT64_MAX) {
-      for (i = 0; i < NB_TXFM_MODES; i++) {
-        int64_t adj_rd;
-        if (this_mode != I4X4_PRED) {
-          const int64_t txfm_mode_diff =
-              txfm_cache[i] - txfm_cache[cm->txfm_mode];
-          adj_rd = this_rd + txfm_mode_diff;
-        } else {
-          adj_rd = this_rd;
-        }
-        if (adj_rd < best_txfm_rd[i])
-          best_txfm_rd[i] = adj_rd;
-      }
-    }
-
-    if (x->skip && !mode_excluded)
-      break;
-  }
-
-  assert((cm->mcomp_filter_type == SWITCHABLE) ||
-         (cm->mcomp_filter_type == best_mbmode.interp_filter) ||
-         (best_mbmode.mode <= I4X4_PRED));
-
-#if CONFIG_COMP_INTERINTRA_PRED
-  ++cpi->interintra_select_count[is_best_interintra];
-#endif
-
-  // Accumulate filter usage stats
-  // TODO(agrange): Use RD criteria to select interpolation filter mode.
-  if (is_inter_mode(best_mode))
-    ++cpi->best_switchable_interp_count[vp9_switchable_interp_map[best_filter]];
-
-  // Reduce the activation RD thresholds for the best choice mode
-  if ((cpi->rd_baseline_thresh[best_mode_index] > 0) &&
-      (cpi->rd_baseline_thresh[best_mode_index] < (INT_MAX >> 2))) {
-    int best_adjustment = (cpi->rd_thresh_mult[best_mode_index] >> 2);
-
-    cpi->rd_thresh_mult[best_mode_index] =
-        (cpi->rd_thresh_mult[best_mode_index] >=
-         (MIN_THRESHMULT + best_adjustment)) ?
-        cpi->rd_thresh_mult[best_mode_index] - best_adjustment : MIN_THRESHMULT;
-    cpi->rd_threshes[best_mode_index] =
-        (cpi->rd_baseline_thresh[best_mode_index] >> 7) *
-        cpi->rd_thresh_mult[best_mode_index];
-  }
-
-  // This code forces Altref,0,0 and skip for the frame that overlays an
-  // altref unless Altref is filtered. However, this is unsafe if
-  // segment level coding of ref frame is enabled for this
-  // segment.
-  if (!vp9_segfeature_active(xd, segment_id, SEG_LVL_REF_FRAME) &&
-      cpi->is_src_frame_alt_ref &&
-      (cpi->oxcf.arnr_max_frames == 0) &&
-      (best_mbmode.mode != ZEROMV || best_mbmode.ref_frame != ALTREF_FRAME)) {
-    mbmi->mode = ZEROMV;
-    if (cm->txfm_mode <= ALLOW_8X8)
-      mbmi->txfm_size = cm->txfm_mode;
-    else
-      mbmi->txfm_size = TX_16X16;
-    mbmi->ref_frame = ALTREF_FRAME;
-    mbmi->mv[0].as_int = 0;
-    mbmi->uv_mode = DC_PRED;
-    mbmi->mb_skip_coeff = 1;
-    mbmi->partitioning = 0;
-    set_scale_factors(xd, mbmi->ref_frame, mbmi->second_ref_frame,
-                      scale_factor);
-
-    vpx_memset(best_pred_diff, 0, sizeof(best_pred_diff));
-    vpx_memset(best_txfm_diff, 0, sizeof(best_txfm_diff));
-    goto end;
-  }
-
-  // macroblock modes
-  vpx_memcpy(mbmi, &best_mbmode, sizeof(MB_MODE_INFO));
-  if (best_mbmode.mode == I4X4_PRED) {
-    for (i = 0; i < 16; i++) {
-      xd->mode_info_context->bmi[i].as_mode = best_bmodes[i].as_mode;
-    }
-  }
-
-  if (best_mbmode.mode == I8X8_PRED)
-    set_i8x8_block_modes(x, mode8x8);
-
-  if (best_mbmode.mode == SPLITMV) {
-    for (i = 0; i < 16; i++)
-      xd->mode_info_context->bmi[i].as_mv[0].as_int =
-          best_bmodes[i].as_mv[0].as_int;
-    if (mbmi->second_ref_frame > 0)
-      for (i = 0; i < 16; i++)
-        xd->mode_info_context->bmi[i].as_mv[1].as_int =
-            best_bmodes[i].as_mv[1].as_int;
-
-    vpx_memcpy(x->partition_info, &best_partition, sizeof(PARTITION_INFO));
-
-    mbmi->mv[0].as_int = x->partition_info->bmi[15].mv.as_int;
-    mbmi->mv[1].as_int = x->partition_info->bmi[15].second_mv.as_int;
-  }
-
-  for (i = 0; i < NB_PREDICTION_TYPES; ++i) {
-    if (best_pred_rd[i] == INT64_MAX)
-      best_pred_diff[i] = INT_MIN;
-    else
-      best_pred_diff[i] = best_rd - best_pred_rd[i];
-  }
-
-  if (!x->skip) {
-    for (i = 0; i < NB_TXFM_MODES; i++) {
-      if (best_txfm_rd[i] == INT64_MAX)
-        best_txfm_diff[i] = 0;
-      else
-        best_txfm_diff[i] = best_rd - best_txfm_rd[i];
-    }
-  } else {
-    vpx_memset(best_txfm_diff, 0, sizeof(best_txfm_diff));
-  }
-
-end:
-
-  // Flag all modes that have a distortion that's > 2x the best we found at
-  // this level.
-  for (mode_index = 0; mode_index < MB_MODE_COUNT; ++mode_index) {
-    if (mode_index == NEARESTMV || mode_index == NEARMV || mode_index == NEWMV
-        || mode_index == SPLITMV)
-      continue;
-
-    if (mode_distortions[mode_index] > 2 * *returndistortion) {
-      x->mb_context[xd->sb_index][xd->mb_index].modes_with_high_error |= (1
-          << mode_index);
-    }
-  }
-
-  // Flag all ref frames that have a distortion that's > 2x the best we found at
-  // this level.
-  for (ref_frame = INTRA_FRAME; ref_frame <= ALTREF_FRAME; ref_frame++) {
-    if (frame_distortions[ref_frame] > 2 * *returndistortion) {
-      x->mb_context[xd->sb_index][xd->mb_index].frames_with_high_error |= (1
-          << ref_frame);
-    }
-  }
-
-  set_scale_factors(xd, mbmi->ref_frame, mbmi->second_ref_frame,
-                    scale_factor);
-  store_coding_context(x, &x->mb_context[xd->sb_index][xd->mb_index],
-                       best_mode_index, &best_partition,
-                       &mbmi->ref_mvs[mbmi->ref_frame][0],
-                       &mbmi->ref_mvs[mbmi->second_ref_frame < 0 ? 0 :
-                                      mbmi->second_ref_frame][0],
-                       best_pred_diff, best_txfm_diff);
-}
-#endif  // !CONFIG_SB8X8
-
 void vp9_rd_pick_intra_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
                                int *returnrate, int *returndist,
                                BLOCK_SIZE_TYPE bsize,
@@ -4621,30 +2366,24 @@
   int dist_y = 0, dist_uv;
   int y_skip = 0, uv_skip;
   int64_t txfm_cache[NB_TXFM_MODES], err;
-#if CONFIG_SB8X8
   MB_PREDICTION_MODE mode;
   TX_SIZE txfm_size;
   int rate4x4_y, rate4x4_y_tokenonly, dist4x4_y;
   int64_t err4x4 = INT64_MAX;
-#endif
   int i;
 
   ctx->skip = 0;
   xd->mode_info_context->mbmi.mode = DC_PRED;
   err = rd_pick_intra_sby_mode(cpi, x, &rate_y, &rate_y_tokenonly,
                                &dist_y, &y_skip, bsize, txfm_cache);
-#if CONFIG_SB8X8
   mode = xd->mode_info_context->mbmi.mode;
   txfm_size = xd->mode_info_context->mbmi.txfm_size;
-#endif
   rd_pick_intra_sbuv_mode(cpi, x, &rate_uv, &rate_uv_tokenonly,
                           &dist_uv, &uv_skip, bsize);
-#if CONFIG_SB8X8
   if (bsize == BLOCK_SIZE_SB8X8)
     err4x4 = rd_pick_intra4x4mby_modes(cpi, x, &rate4x4_y,
                                        &rate4x4_y_tokenonly,
                                        &dist4x4_y, err);
-#endif
 
   if (y_skip && uv_skip) {
     *returnrate = rate_y + rate_uv - rate_y_tokenonly - rate_uv_tokenonly +
@@ -4652,7 +2391,6 @@
     *returndist = dist_y + (dist_uv >> 2);
     memset(ctx->txfm_rd_diff, 0,
            sizeof(x->sb32_context[xd->sb_index].txfm_rd_diff));
-#if CONFIG_SB8X8
     xd->mode_info_context->mbmi.mode = mode;
     xd->mode_info_context->mbmi.txfm_size = txfm_size;
   } else if (bsize == BLOCK_SIZE_SB8X8 && err4x4 < err) {
@@ -4663,156 +2401,20 @@
       ctx->txfm_rd_diff[i] = MIN(err4x4, err - txfm_cache[i]);
     }
     xd->mode_info_context->mbmi.txfm_size = TX_4X4;
-#endif
   } else {
     *returnrate = rate_y + rate_uv +
         vp9_cost_bit(vp9_get_pred_prob(cm, xd, PRED_MBSKIP), 0);
     *returndist = dist_y + (dist_uv >> 2);
     for (i = 0; i < NB_TXFM_MODES; i++) {
-#if CONFIG_SB8X8
       ctx->txfm_rd_diff[i] = MIN(err4x4, err - txfm_cache[i]);
-#else
-      ctx->txfm_rd_diff[i] = err - txfm_cache[i];
-#endif
     }
-#if CONFIG_SB8X8
     xd->mode_info_context->mbmi.txfm_size = txfm_size;
     xd->mode_info_context->mbmi.mode = mode;
-#endif
   }
 
   vpx_memcpy(&ctx->mic, xd->mode_info_context, sizeof(MODE_INFO));
 }
 
-#if !CONFIG_SB8X8
-void vp9_rd_pick_intra_mode(VP9_COMP *cpi, MACROBLOCK *x,
-                            int *returnrate, int *returndist) {
-  VP9_COMMON *cm = &cpi->common;
-  MACROBLOCKD *xd = &x->e_mbd;
-  MB_MODE_INFO * mbmi = &x->e_mbd.mode_info_context->mbmi;
-  int64_t error4x4, error16x16;
-  int rate4x4, rate16x16 = 0, rateuv[2];
-  int dist4x4 = 0, dist16x16 = 0, distuv[2];
-  int rate;
-  int rate4x4_tokenonly = 0;
-  int rate16x16_tokenonly = 0;
-  int rateuv_tokenonly[2];
-  int64_t error8x8;
-  int rate8x8_tokenonly=0;
-  int rate8x8, dist8x8;
-  int mode16x16;
-  int mode8x8[4];
-  int dist;
-  int modeuv[2], uv_intra_skippable[2];
-  int y_intra16x16_skippable = 0;
-  int64_t txfm_cache[2][NB_TXFM_MODES];
-  TX_SIZE txfm_size_16x16, txfm_size_8x8;
-  int i;
-
-  x->mb_context[xd->sb_index][xd->mb_index].skip = 0;
-  mbmi->ref_frame = INTRA_FRAME;
-  mbmi->mode = DC_PRED;
-  for (i = 0; i <= TX_8X8; i++) {
-    mbmi->txfm_size = i;
-    rd_pick_intra_sbuv_mode(cpi, x, &rateuv[i], &rateuv_tokenonly[i],
-                            &distuv[i], &uv_intra_skippable[i],
-                            BLOCK_SIZE_MB16X16);
-    modeuv[i] = mbmi->uv_mode;
-  }
-
-  // current macroblock under rate-distortion optimization test loop
-  error16x16 = rd_pick_intra_sby_mode(cpi, x, &rate16x16,
-                                      &rate16x16_tokenonly, &dist16x16,
-                                      &y_intra16x16_skippable,
-                                      BLOCK_SIZE_MB16X16, txfm_cache[1]);
-  mode16x16 = mbmi->mode;
-  txfm_size_16x16 = mbmi->txfm_size;
-  if (y_intra16x16_skippable &&
-      ((cm->txfm_mode == ONLY_4X4 && uv_intra_skippable[TX_4X4]) ||
-       (cm->txfm_mode != ONLY_4X4 && uv_intra_skippable[TX_8X8]))) {
-    error16x16 -= RDCOST(x->rdmult, x->rddiv, rate16x16_tokenonly, 0);
-    rate16x16 -= rate16x16_tokenonly;
-  }
-  for (i = 0; i < NB_TXFM_MODES; i++) {
-    txfm_cache[0][i] = error16x16 - txfm_cache[1][cm->txfm_mode] +
-                       txfm_cache[1][i];
-  }
-
-  error8x8 = rd_pick_intra8x8mby_modes_and_txsz(cpi, x, &rate8x8,
-                                                &rate8x8_tokenonly,
-                                                &dist8x8, mode8x8,
-                                                error16x16, txfm_cache[1]);
-  txfm_size_8x8 = mbmi->txfm_size;
-  for (i = 0; i < NB_TXFM_MODES; i++) {
-    int64_t tmp_rd = error8x8 - txfm_cache[1][cm->txfm_mode] + txfm_cache[1][i];
-    if (tmp_rd < txfm_cache[0][i])
-      txfm_cache[0][i] = tmp_rd;
-  }
-
-  mbmi->txfm_size = TX_4X4;
-  error4x4 = rd_pick_intra4x4mby_modes(cpi, x,
-                                       &rate4x4, &rate4x4_tokenonly,
-                                       &dist4x4, error16x16);
-  for (i = 0; i < NB_TXFM_MODES; i++) {
-    if (error4x4 < txfm_cache[0][i])
-      txfm_cache[0][i] = error4x4;
-  }
-
-  mbmi->mb_skip_coeff = 0;
-  if (y_intra16x16_skippable &&
-      ((cm->txfm_mode == ONLY_4X4 && uv_intra_skippable[TX_4X4]) ||
-       (cm->txfm_mode != ONLY_4X4 && uv_intra_skippable[TX_8X8]))) {
-    mbmi->mb_skip_coeff = 1;
-    mbmi->mode = mode16x16;
-    mbmi->uv_mode = modeuv[cm->txfm_mode != ONLY_4X4];
-    rate = rate16x16 + vp9_cost_bit(vp9_get_pred_prob(cm, xd, PRED_MBSKIP), 1);
-    dist = dist16x16;
-    rate += rateuv[cm->txfm_mode != ONLY_4X4] -
-            rateuv_tokenonly[cm->txfm_mode != ONLY_4X4];
-    dist += (distuv[cm->txfm_mode != ONLY_4X4] >> 2);
-    mbmi->txfm_size = txfm_size_16x16;
-  } else if (error8x8 > error16x16) {
-    if (error4x4 < error16x16) {
-      rate = rateuv[TX_4X4] + rate4x4;
-      mbmi->mode = I4X4_PRED;
-      mbmi->txfm_size = TX_4X4;
-      dist = dist4x4 + (distuv[TX_4X4] >> 2);
-      mbmi->uv_mode = modeuv[TX_4X4];
-    } else {
-      mbmi->txfm_size = txfm_size_16x16;
-      mbmi->mode = mode16x16;
-      rate = rate16x16 + rateuv[mbmi->txfm_size != TX_4X4];
-      dist = dist16x16 + (distuv[mbmi->txfm_size != TX_4X4] >> 2);
-      mbmi->uv_mode = modeuv[mbmi->txfm_size != TX_4X4];
-    }
-    rate += vp9_cost_bit(vp9_get_pred_prob(cm, xd, PRED_MBSKIP), 0);
-  } else {
-    if (error4x4 < error8x8) {
-      rate = rateuv[TX_4X4] + rate4x4;
-      mbmi->mode = I4X4_PRED;
-      mbmi->txfm_size = TX_4X4;
-      dist = dist4x4 + (distuv[TX_4X4] >> 2);
-      mbmi->uv_mode = modeuv[TX_4X4];
-    } else {
-      mbmi->mode = I8X8_PRED;
-      mbmi->txfm_size = txfm_size_8x8;
-      set_i8x8_block_modes(x, mode8x8);
-      rate = rate8x8 + rateuv[TX_4X4];
-      dist = dist8x8 + (distuv[TX_4X4] >> 2);
-    }
-    rate += vp9_cost_bit(vp9_get_pred_prob(cm, xd, PRED_MBSKIP), 0);
-  }
-
-  for (i = 0; i < NB_TXFM_MODES; i++) {
-    x->mb_context[xd->sb_index][xd->mb_index].txfm_rd_diff[i] =
-        txfm_cache[0][cm->txfm_mode] - txfm_cache[0][i];
-  }
-
-  *returnrate = rate;
-  *returndist = dist;
-}
-#endif
-
 int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
                                   int mi_row, int mi_col,
                                   int *returnrate,
@@ -4868,20 +2470,16 @@
   int64_t frame_distortions[MAX_REF_FRAMES] = {-1};
   int intra_cost_penalty = 20 * vp9_dc_quant(cpi->common.base_qindex,
                                              cpi->common.y_dc_delta_q);
-#if CONFIG_SB8X8
   int_mv seg_mvs[4][MAX_REF_FRAMES - 1];
   union b_mode_info best_bmodes[4];
   PARTITION_INFO best_partition;
-#endif
 
-#if CONFIG_SB8X8
   for (i = 0; i < 4; i++) {
     int j;
 
     for (j = 0; j < MAX_REF_FRAMES - 1; j++)
       seg_mvs[i][j].as_int = INVALID_MV;
   }
-#endif
   // Everywhere the flag is set the error is much higher than its neighbors.
   ctx->frames_with_high_error = 0;
   ctx->modes_with_high_error = 0;
@@ -4973,6 +2571,7 @@
     x->skip = 0;
     this_mode = vp9_mode_order[mode_index].mode;
     ref_frame = vp9_mode_order[mode_index].ref_frame;
+
     if (!(ref_frame == INTRA_FRAME
         || (cpi->ref_frame_flags & flag_list[ref_frame]))) {
       continue;
@@ -4993,6 +2592,21 @@
 
     mbmi->ref_frame = ref_frame;
     mbmi->second_ref_frame = vp9_mode_order[mode_index].second_ref_frame;
+
+    // TODO(jingning, jkoleszar): scaled reference frames are not supported
+    // for SPLITMV.
+    if (mbmi->ref_frame > 0 &&
+          (yv12_mb[mbmi->ref_frame].y_width != cm->mb_cols * 16 ||
+           yv12_mb[mbmi->ref_frame].y_height != cm->mb_rows * 16) &&
+        this_mode == SPLITMV)
+      continue;
+
+    if (mbmi->second_ref_frame > 0 &&
+          (yv12_mb[mbmi->second_ref_frame].y_width != cm->mb_cols * 16 ||
+           yv12_mb[mbmi->second_ref_frame].y_height != cm->mb_rows * 16) &&
+        this_mode == SPLITMV)
+      continue;
+
     set_scale_factors(xd, mbmi->ref_frame, mbmi->second_ref_frame,
                       scale_factor);
     comp_pred = mbmi->second_ref_frame > INTRA_FRAME;
@@ -5011,16 +2625,8 @@
     // if (!(cpi->ref_frame_flags & flag_list[ref_frame]))
     //  continue;
 
-    if (
-#if CONFIG_SB8X8
-        bsize != BLOCK_SIZE_SB8X8 &&
-        (this_mode == I4X4_PRED || this_mode == SPLITMV)
-#else
-        this_mode == I4X4_PRED ||
-        this_mode == I8X8_PRED ||
-        this_mode == SPLITMV
-#endif
-        )
+    if (bsize != BLOCK_SIZE_SB8X8 &&
+        (this_mode == I4X4_PRED || this_mode == SPLITMV))
       continue;
     //  if (vp9_mode_order[mode_index].second_ref_frame == INTRA_FRAME)
     //  continue;
@@ -5083,7 +2689,6 @@
       }
     }
 
-#if CONFIG_SB8X8
     if (this_mode == I4X4_PRED) {
       int rate;
 
@@ -5102,9 +2707,7 @@
       distortion2 += dist_uv[TX_4X4];
       distortion_uv = dist_uv[TX_4X4];
       mbmi->uv_mode = mode_uv[TX_4X4];
-    } else
-#endif
-    if (ref_frame == INTRA_FRAME) {
+    } else if (ref_frame == INTRA_FRAME) {
       TX_SIZE uv_tx;
       vp9_build_intra_predictors_sby_s(xd, bsize);
       super_block_yrd(cpi, x, &rate_y, &distortion_y, &skippable,
@@ -5127,7 +2730,6 @@
       if (mbmi->mode != DC_PRED && mbmi->mode != TM_PRED)
         rate2 += intra_cost_penalty;
       distortion2 = distortion_y + distortion_uv;
-#if CONFIG_SB8X8
     } else if (this_mode == SPLITMV) {
       const int is_comp_pred = mbmi->second_ref_frame > 0;
       int rate, distortion;
@@ -5240,8 +2842,8 @@
       vp9_build_inter_predictors_sbuv(&x->e_mbd, mi_row, mi_col,
                                       bsize);
       vp9_subtract_sbuv(x, bsize);
-      super_block_uvrd_4x4(cm, x, &rate_uv, &distortion_uv,
-                           &uv_skippable, bsize);
+      super_block_uvrd_for_txfm(cm, x, &rate_uv, &distortion_uv,
+                                &uv_skippable, bsize, TX_4X4);
       rate2 += rate_uv;
       distortion2 += distortion_uv;
       skippable = skippable && uv_skippable;
@@ -5256,7 +2858,6 @@
       compmode_cost =
           vp9_cost_bit(vp9_get_pred_prob(cm, xd, PRED_COMP), is_comp_pred);
       mbmi->mode = this_mode;
-#endif
     } else {
       YV12_BUFFER_CONFIG *scaled_ref_frame = NULL;
       int fb;
@@ -5413,7 +3014,6 @@
         *returndistortion = distortion2;
         best_rd = this_rd;
         vpx_memcpy(&best_mbmode, mbmi, sizeof(MB_MODE_INFO));
-#if CONFIG_SB8X8
         vpx_memcpy(&best_partition, x->partition_info, sizeof(PARTITION_INFO));
 
         if (this_mode == I4X4_PRED || this_mode == SPLITMV) {
@@ -5421,7 +3021,6 @@
             best_bmodes[i] = xd->mode_info_context->bmi[i];
           }
         }
-#endif
       }
 #if 0
       // Testing this mode gave rise to an improvement in best error score.
@@ -5563,22 +3162,13 @@
     mbmi->mv[0].as_int = 0;
     mbmi->uv_mode = DC_PRED;
     mbmi->mb_skip_coeff = 1;
-#if !CONFIG_SB8X8
-    mbmi->partitioning = 0;
-#endif
     if (cm->txfm_mode == TX_MODE_SELECT) {
       if (bsize >= BLOCK_SIZE_SB32X32)
         mbmi->txfm_size = TX_32X32;
-#if CONFIG_SB8X8
       else if (bsize >= BLOCK_SIZE_MB16X16)
-#else
-      else
-#endif
         mbmi->txfm_size = TX_16X16;
-#if CONFIG_SB8X8
       else
         mbmi->txfm_size = TX_8X8;
-#endif
     }
 
     vpx_memset(best_txfm_diff, 0, sizeof(best_txfm_diff));
@@ -5588,7 +3178,6 @@
 
   // macroblock modes
   vpx_memcpy(mbmi, &best_mbmode, sizeof(MB_MODE_INFO));
-#if CONFIG_SB8X8
   if (best_mbmode.mode == I4X4_PRED) {
     for (i = 0; i < 4; i++) {
       xd->mode_info_context->bmi[i].as_mode = best_bmodes[i].as_mode;
@@ -5609,7 +3198,7 @@
     mbmi->mv[0].as_int = x->partition_info->bmi[3].mv.as_int;
     mbmi->mv[1].as_int = x->partition_info->bmi[3].second_mv.as_int;
   }
-#endif
+
   for (i = 0; i < NB_PREDICTION_TYPES; ++i) {
     if (best_pred_rd[i] == INT64_MAX)
       best_pred_diff[i] = INT_MIN;
@@ -5632,11 +3221,7 @@
   set_scale_factors(xd, mbmi->ref_frame, mbmi->second_ref_frame,
                     scale_factor);
   store_coding_context(x, ctx, best_mode_index,
-#if CONFIG_SB8X8
                        &best_partition,
-#else
-                       NULL,
-#endif
                        &mbmi->ref_mvs[mbmi->ref_frame][0],
                        &mbmi->ref_mvs[mbmi->second_ref_frame < 0 ? 0 :
                                       mbmi->second_ref_frame][0],
@@ -5644,42 +3229,3 @@
 
   return best_rd;
 }
-
-#if !CONFIG_SB8X8
-void vp9_pick_mode_inter_macroblock(VP9_COMP *cpi, MACROBLOCK *x,
-                                    int mi_row, int mi_col,
-                                    int *totalrate, int *totaldist) {
-  MACROBLOCKD *const xd = &x->e_mbd;
-  MB_MODE_INFO * mbmi = &x->e_mbd.mode_info_context->mbmi;
-  int rate, distortion;
-  int64_t intra_error = 0;
-  unsigned char *segment_id = &mbmi->segment_id;
-
-  x->encode_breakout = xd->segmentation_enabled ?
-                         cpi->segment_encode_breakout[*segment_id] :
-                         cpi->oxcf.encode_breakout;
-
-  // if (cpi->sf.RD)
-  // For now this codebase is limited to a single rd encode path
-  {
-    int zbin_mode_boost_enabled = cpi->zbin_mode_boost_enabled;
-
-    rd_pick_inter_mode(cpi, x, mi_row, mi_col, &rate,
-                       &distortion, &intra_error);
-
-    /* restore cpi->zbin_mode_boost_enabled */
-    cpi->zbin_mode_boost_enabled = zbin_mode_boost_enabled;
-  }
-  // else
-  // The non rd encode path has been deleted from this code base
-  // to simplify development
-  //    vp9_pick_inter_mode
-
-  // Store metrics so they can be added in to totals if this mode is picked
-  x->mb_context[xd->sb_index][xd->mb_index].distortion  = distortion;
-  x->mb_context[xd->sb_index][xd->mb_index].intra_error = intra_error;
-
-  *totalrate = rate;
-  *totaldist = distortion;
-}
-#endif
diff --git a/vp9/encoder/vp9_rdopt.h b/vp9/encoder/vp9_rdopt.h
index 6533a82..dcf5d00 100644
--- a/vp9/encoder/vp9_rdopt.h
+++ b/vp9/encoder/vp9_rdopt.h
@@ -19,21 +19,10 @@
 
 void vp9_initialize_me_consts(VP9_COMP *cpi, int qindex);
 
-#if !CONFIG_SB8X8
-void vp9_rd_pick_intra_mode(VP9_COMP *cpi, MACROBLOCK *x,
-                            int *r, int *d);
-#endif
-
 void vp9_rd_pick_intra_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
                                int *r, int *d, BLOCK_SIZE_TYPE bsize,
                                PICK_MODE_CONTEXT *ctx);
 
-#if !CONFIG_SB8X8
-void vp9_pick_mode_inter_macroblock(VP9_COMP *cpi, MACROBLOCK *x,
-                                    int mi_row, int mi_col,
-                                    int *r, int *d);
-#endif
-
 int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
                                   int mi_row, int mi_col,
                                   int *r, int *d, BLOCK_SIZE_TYPE bsize,
diff --git a/vp9/encoder/vp9_segmentation.c b/vp9/encoder/vp9_segmentation.c
index 86af268..fe995ad 100644
--- a/vp9/encoder/vp9_segmentation.c
+++ b/vp9/encoder/vp9_segmentation.c
@@ -123,11 +123,12 @@
                        int bw, int bh, int mi_row, int mi_col) {
   VP9_COMMON *const cm = &cpi->common;
   MACROBLOCKD *const xd = &cpi->mb.e_mbd;
-  const int segment_id = mi->mbmi.segment_id;
+  int segment_id;
 
   if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols)
     return;
 
+  segment_id = mi->mbmi.segment_id;
   xd->mode_info_context = mi;
   set_mi_row_col(cm, xd, mi_row, bh, mi_col, bw);
 
@@ -192,17 +193,11 @@
     assert(bwl < bsl && bhl < bsl);
     if (bsize == BLOCK_SIZE_SB64X64) {
       subsize = BLOCK_SIZE_SB32X32;
-#if CONFIG_SB8X8
     } else if (bsize == BLOCK_SIZE_SB32X32) {
       subsize = BLOCK_SIZE_MB16X16;
     } else {
       assert(bsize == BLOCK_SIZE_MB16X16);
       subsize = BLOCK_SIZE_SB8X8;
-#else
-    } else {
-      assert(bsize == BLOCK_SIZE_SB32X32);
-      subsize = BLOCK_SIZE_MB16X16;
-#endif
     }
 
     for (n = 0; n < 4; n++) {
@@ -252,11 +247,11 @@
     vp9_get_tile_col_offsets(cm, tile_col);
     mi_ptr = cm->mi + cm->cur_tile_mi_col_start;
     for (mi_row = 0; mi_row < cm->mi_rows;
-         mi_row += (4 << CONFIG_SB8X8), mi_ptr += (4 << CONFIG_SB8X8) * mis) {
+         mi_row += 8, mi_ptr += 8 * mis) {
       mi = mi_ptr;
       for (mi_col = cm->cur_tile_mi_col_start;
            mi_col < cm->cur_tile_mi_col_end;
-           mi_col += (4 << CONFIG_SB8X8), mi += (4 << CONFIG_SB8X8)) {
+           mi_col += 8, mi += 8) {
         count_segs_sb(cpi, mi, no_pred_segcounts, temporal_predictor_count,
                       t_unpred_seg_counts, mi_row, mi_col, BLOCK_SIZE_SB64X64);
       }
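
For reference, a minimal sketch of why the segment-counting scan above now advances by a fixed 8 mode-info units: with sb8x8 always enabled, each mode_info entry is assumed to cover an 8x8 pixel area, so a 64x64 superblock spans 64 / 8 = 8 entries per side. The macro names below are illustrative only, not part of the tree:

    /* Illustration only: assumed 8x8-pixel mode-info units. */
    #define MI_SIZE_LOG2  3                      /* log2(8) */
    #define SB64_MI_STEP  (64 >> MI_SIZE_LOG2)   /* == 8, the loop step above */
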
diff --git a/vp9/encoder/vp9_temporal_filter.c b/vp9/encoder/vp9_temporal_filter.c
index 1e6b984..6bd8b50 100644
--- a/vp9/encoder/vp9_temporal_filter.c
+++ b/vp9/encoder/vp9_temporal_filter.c
@@ -481,7 +481,7 @@
   // Note: this_frame->frame has been updated in the loop
   // so it now points at the ARF frame.
   half_gf_int = cpi->baseline_gf_interval >> 1;
-  frames_after_arf = (int)(cpi->twopass.total_stats->count - this_frame - 1);
+  frames_after_arf = (int)(cpi->twopass.total_stats.count - this_frame - 1);
 
   switch (cpi->oxcf.arnr_type) {
     case 1:  // Backward filter
diff --git a/vp9/encoder/vp9_tokenize.c b/vp9/encoder/vp9_tokenize.c
index 9756e6e..9a65985 100644
--- a/vp9/encoder/vp9_tokenize.c
+++ b/vp9/encoder/vp9_tokenize.c
@@ -112,8 +112,6 @@
   PLANE_TYPE type = plane ? PLANE_TYPE_UV : PLANE_TYPE_Y_WITH_DC;
   TX_SIZE tx_size = ss_txfrm_size / 2;
   int dry_run = args->dry_run;
-  int ib = old_block_idx_4x4(xd, b_width_log2(bsize) + b_height_log2(bsize),
-                             plane, block);
 
   MB_MODE_INFO *mbmi = &xd->mode_info_context->mbmi;
   int pt; /* near block/prev token context index */
@@ -158,7 +156,7 @@
     default:
     case TX_4X4: {
       tx_type = (type == PLANE_TYPE_Y_WITH_DC) ?
-          get_tx_type_4x4(xd, ib) : DCT_DCT;
+          get_tx_type_4x4(xd, block) : DCT_DCT;
       above_ec = A[0] != 0;
       left_ec = L[0] != 0;
       seg_eob = 16;
@@ -173,7 +171,7 @@
     }
     case TX_8X8: {
       const int sz = 1 + b_width_log2(sb_type);
-      const int x = ib & ((1 << sz) - 1), y = ib - x;
+      const int x = block & ((1 << sz) - 1), y = block - x;
       tx_type = (type == PLANE_TYPE_Y_WITH_DC) ?
           get_tx_type_8x8(xd, y + (x >> 1)) : DCT_DCT;
       above_ec = (A[0] + A[1]) != 0;
@@ -190,7 +188,7 @@
     }
     case TX_16X16: {
       const int sz = 2 + b_width_log2(sb_type);
-      const int x = ib & ((1 << sz) - 1), y = ib - x;
+      const int x = block & ((1 << sz) - 1), y = block - x;
       tx_type = (type == PLANE_TYPE_Y_WITH_DC) ?
           get_tx_type_16x16(xd, y + (x >> 2)) : DCT_DCT;
       above_ec = (A[0] + A[1] + A[2] + A[3]) != 0;
@@ -377,9 +375,6 @@
   int result = 1;
   struct is_skippable_args args = {xd, &result};
   foreach_transformed_block_in_plane(xd, bsize, 0,
-#if !CONFIG_SB8X8
-                                     0,
-#endif
                                      is_skippable, &args);
   return result;
 }
diff --git a/vp9/vp9_common.mk b/vp9/vp9_common.mk
index 1ddd4f0..9e3ec15 100644
--- a/vp9/vp9_common.mk
+++ b/vp9/vp9_common.mk
@@ -56,7 +56,6 @@
 VP9_COMMON_SRCS-yes += common/vp9_subpelvar.h
 VP9_COMMON_SRCS-yes += common/vp9_seg_common.h
 VP9_COMMON_SRCS-yes += common/vp9_seg_common.c
-VP9_COMMON_SRCS-yes += common/vp9_setupintrarecon.h
 VP9_COMMON_SRCS-yes += common/vp9_swapyv12buffer.h
 VP9_COMMON_SRCS-yes += common/vp9_systemdependent.h
 VP9_COMMON_SRCS-yes += common/vp9_textblit.h
@@ -76,7 +75,6 @@
 VP9_COMMON_SRCS-yes += common/vp9_reconinter.c
 VP9_COMMON_SRCS-yes += common/vp9_reconintra.c
 VP9_COMMON_SRCS-yes += common/vp9_reconintra4x4.c
-VP9_COMMON_SRCS-yes += common/vp9_setupintrarecon.c
 VP9_COMMON_SRCS-yes += common/vp9_swapyv12buffer.c
 VP9_COMMON_SRCS-$(CONFIG_POSTPROC_VISUALIZER) += common/vp9_textblit.c
 VP9_COMMON_SRCS-yes += common/vp9_treecoder.c
@@ -113,14 +111,6 @@
 
 VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_idct_intrin_sse2.c
 VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_sadmxn_sse2.c
-ifeq ($(HAVE_SSE2),yes)
-vp9/common/x86/vp9_idct_intrin_sse2.c.o: CFLAGS += -msse2
-vp9/common/x86/vp9_loopfilter_intrin_sse2.c.o: CFLAGS += -msse2
-vp9/common/x86/vp9_sadmxn_sse2.c.o: CFLAGS += -msse2
-vp9/common/x86/vp9_idct_intrin_sse2.c.d: CFLAGS += -msse2
-vp9/common/x86/vp9_loopfilter_intrin_sse2.c.d: CFLAGS += -msse2
-vp9/common/x86/vp9_sadmxn_sse2.c.d: CFLAGS += -msse2
-endif
 
 $(eval $(call asm_offsets_template,\
          vp9_asm_com_offsets.asm, $(VP9_PREFIX)common/vp9_asm_com_offsets.c))
diff --git a/vp9/vp9cx.mk b/vp9/vp9cx.mk
index 39f836f..42ab02d 100644
--- a/vp9/vp9cx.mk
+++ b/vp9/vp9cx.mk
@@ -95,10 +95,5 @@
 VP9_CX_SRCS-$(ARCH_X86_64) += encoder/x86/vp9_ssim_opt.asm
 
 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_dct_sse2.c
-ifeq ($(HAVE_SSE2),yes)
-vp9/encoder/x86/vp9_dct_sse2.c.d: CFLAGS += -msse2
-vp9/encoder/x86/vp9_dct_sse2.c.o: CFLAGS += -msse2
-endif
-
 
 VP9_CX_SRCS-yes := $(filter-out $(VP9_CX_SRCS_REMOVE-yes),$(VP9_CX_SRCS-yes))
diff --git a/vp9/vp9dx.mk b/vp9/vp9dx.mk
index babdebb..72cdfeb 100644
--- a/vp9/vp9dx.mk
+++ b/vp9/vp9dx.mk
@@ -38,10 +38,6 @@
 VP9_DX_SRCS-$(HAVE_SSE2) += decoder/x86/vp9_idct_blk_sse2.c
 
 VP9_DX_SRCS-$(HAVE_SSE2) += decoder/x86/vp9_dequantize_sse2.c
-ifeq ($(HAVE_SSE2),yes)
-vp9/decoder/x86/vp9_dequantize_sse2.c.o: CFLAGS += -msse2
-vp9/decoder/x86/vp9_dequantize_sse2.c.d: CFLAGS += -msse2
-endif
 
 $(eval $(call asm_offsets_template,\
          vp9_asm_dec_offsets.asm, $(VP9_PREFIX)decoder/vp9_asm_dec_offsets.c))
diff --git a/vpx/vpx_image.h b/vpx/vpx_image.h
index 809fa38..c304bac 100644
--- a/vpx/vpx_image.h
+++ b/vpx/vpx_image.h
@@ -55,9 +55,11 @@
     VPX_IMG_FMT_YV12    = VPX_IMG_FMT_PLANAR | VPX_IMG_FMT_UV_FLIP | 1, /**< planar YVU */
     VPX_IMG_FMT_I420    = VPX_IMG_FMT_PLANAR | 2,
     VPX_IMG_FMT_VPXYV12 = VPX_IMG_FMT_PLANAR | VPX_IMG_FMT_UV_FLIP | 3, /** < planar 4:2:0 format with vpx color space */
-    VPX_IMG_FMT_VPXI420 = VPX_IMG_FMT_PLANAR | 4   /** < planar 4:2:0 format with vpx color space */
-  }
-                        vpx_img_fmt_t; /**< alias for enum vpx_img_fmt */
+    VPX_IMG_FMT_VPXI420 = VPX_IMG_FMT_PLANAR | 4,
+    VPX_IMG_FMT_I422    = VPX_IMG_FMT_PLANAR | 5,
+    VPX_IMG_FMT_I444    = VPX_IMG_FMT_PLANAR | 6,
+    VPX_IMG_FMT_444A    = VPX_IMG_FMT_PLANAR | VPX_IMG_FMT_HAS_ALPHA | 7
+  } vpx_img_fmt_t; /**< alias for enum vpx_img_fmt */
 
 #if !defined(VPX_CODEC_DISABLE_COMPAT) || !VPX_CODEC_DISABLE_COMPAT
 #define IMG_FMT_PLANAR         VPX_IMG_FMT_PLANAR     /**< \deprecated Use #VPX_IMG_FMT_PLANAR */
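
For reference, a minimal sketch of how the newly added planar formats are expected to map to chroma shifts and bits per sample, consistent with the vpx_bps and dst_c_dec values the y4m reader sets further below; the helper name is hypothetical and not part of this change:

    #include "vpx/vpx_image.h"

    /* Hypothetical helper: expected chroma shifts and bits per sample
     * for the planar formats declared above. */
    static void describe_img_fmt(vpx_img_fmt_t fmt,
                                 int *x_shift, int *y_shift, int *bps) {
      switch (fmt) {
        case VPX_IMG_FMT_I422: *x_shift = 1; *y_shift = 0; *bps = 16; break;
        case VPX_IMG_FMT_I444: *x_shift = 0; *y_shift = 0; *bps = 24; break;
        case VPX_IMG_FMT_444A: *x_shift = 0; *y_shift = 0; *bps = 32; break;
        default:               *x_shift = 1; *y_shift = 1; *bps = 12; break;  /* 4:2:0 */
      }
    }
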
diff --git a/vpxenc.c b/vpxenc.c
index 95c6cf2..33a56a4 100644
--- a/vpxenc.c
+++ b/vpxenc.c
@@ -326,6 +326,7 @@
   unsigned int          h;
   struct vpx_rational   framerate;
   int                   use_i420;
+  int                   only_i420;
 };
 
 
@@ -1793,7 +1794,8 @@
 
   if (input->detect.buf_read == 4
       && file_is_y4m(input->file, &input->y4m, input->detect.buf)) {
-    if (y4m_input_open(&input->y4m, input->file, input->detect.buf, 4) >= 0) {
+    if (y4m_input_open(&input->y4m, input->file, input->detect.buf, 4,
+                       input->only_i420) >= 0) {
       input->file_type = FILE_TYPE_Y4M;
       input->w = input->y4m.pic_w;
       input->h = input->y4m.pic_h;
@@ -2517,6 +2519,7 @@
   input.framerate.num = 30;
   input.framerate.den = 1;
   input.use_i420 = 1;
+  input.only_i420 = 1;
 
   /* First parse the global configuration values, because we want to apply
    * other parameters on top of the default configuration provided by the
@@ -2551,6 +2554,12 @@
   if (!input.fn)
     usage_exit();
 
+#if CONFIG_NON420
+  /* Decide whether chroma subsamplings other than 4:2:0 are supported */
+  if (global.codec->fourcc == VP9_FOURCC)
+    input.only_i420 = 0;
+#endif
+
   for (pass = global.pass ? global.pass - 1 : 0; pass < global.passes; pass++) {
     int frames_in = 0, seen_frames = 0;
     int64_t estimated_time_left = -1;
diff --git a/y4minput.c b/y4minput.c
index 24f0c15..47f005a 100644
--- a/y4minput.c
+++ b/y4minput.c
@@ -659,7 +659,8 @@
                              unsigned char *_aux) {
 }
 
-int y4m_input_open(y4m_input *_y4m, FILE *_fin, char *_skip, int _nskip) {
+int y4m_input_open(y4m_input *_y4m, FILE *_fin, char *_skip, int _nskip,
+                   int only_420) {
   char buffer[80];
   int  ret;
   int  i;
@@ -701,6 +702,8 @@
             "Only progressive scan handled.\n");
     return -1;
   }
+  _y4m->vpx_fmt = VPX_IMG_FMT_I420;
+  _y4m->vpx_bps = 12;
   if (strcmp(_y4m->chroma_type, "420") == 0 ||
       strcmp(_y4m->chroma_type, "420jpeg") == 0) {
     _y4m->src_c_dec_h = _y4m->dst_c_dec_h = _y4m->src_c_dec_v = _y4m->dst_c_dec_v = 2;
@@ -734,16 +737,30 @@
     _y4m->aux_buf_sz = _y4m->aux_buf_read_sz = 2 * ((_y4m->pic_w + 1) / 2) * _y4m->pic_h;
     _y4m->convert = y4m_convert_422jpeg_420jpeg;
   } else if (strcmp(_y4m->chroma_type, "422") == 0) {
-    _y4m->src_c_dec_h = _y4m->dst_c_dec_h = 2;
+    _y4m->src_c_dec_h = 2;
     _y4m->src_c_dec_v = 1;
-    _y4m->dst_c_dec_v = 2;
-    _y4m->dst_buf_read_sz = _y4m->pic_w * _y4m->pic_h;
-    /*Chroma filter required: read into the aux buf first.
-      We need to make two filter passes, so we need some extra space in the
-       aux buffer.*/
-    _y4m->aux_buf_read_sz = 2 * ((_y4m->pic_w + 1) / 2) * _y4m->pic_h;
-    _y4m->aux_buf_sz = _y4m->aux_buf_read_sz + ((_y4m->pic_w + 1) / 2) * _y4m->pic_h;
-    _y4m->convert = y4m_convert_422_420jpeg;
+    if (only_420) {
+      _y4m->dst_c_dec_h = 2;
+      _y4m->dst_c_dec_v = 2;
+      _y4m->dst_buf_read_sz = _y4m->pic_w * _y4m->pic_h;
+      /*Chroma filter required: read into the aux buf first.
+        We need to make two filter passes, so we need some extra space in the
+         aux buffer.*/
+      _y4m->aux_buf_read_sz = 2 * ((_y4m->pic_w + 1) / 2) * _y4m->pic_h;
+      _y4m->aux_buf_sz = _y4m->aux_buf_read_sz +
+          ((_y4m->pic_w + 1) / 2) * _y4m->pic_h;
+      _y4m->convert = y4m_convert_422_420jpeg;
+    } else {
+      _y4m->vpx_fmt = VPX_IMG_FMT_I422;
+      _y4m->vpx_bps = 16;
+      _y4m->dst_c_dec_h = _y4m->src_c_dec_h;
+      _y4m->dst_c_dec_v = _y4m->src_c_dec_v;
+      _y4m->dst_buf_read_sz = _y4m->pic_w * _y4m->pic_h
+                              + 2 * ((_y4m->pic_w + 1) / 2) * _y4m->pic_h;
+      /*Natively supported: no conversion required.*/
+      _y4m->aux_buf_sz = _y4m->aux_buf_read_sz = 0;
+      _y4m->convert = y4m_convert_null;
+    }
   } else if (strcmp(_y4m->chroma_type, "411") == 0) {
     _y4m->src_c_dec_h = 4;
     _y4m->dst_c_dec_h = 2;
@@ -758,29 +775,52 @@
     _y4m->convert = y4m_convert_411_420jpeg;
   } else if (strcmp(_y4m->chroma_type, "444") == 0) {
     _y4m->src_c_dec_h = 1;
-    _y4m->dst_c_dec_h = 2;
     _y4m->src_c_dec_v = 1;
-    _y4m->dst_c_dec_v = 2;
-    _y4m->dst_buf_read_sz = _y4m->pic_w * _y4m->pic_h;
-    /*Chroma filter required: read into the aux buf first.
-      We need to make two filter passes, so we need some extra space in the
-       aux buffer.*/
-    _y4m->aux_buf_read_sz = 2 * _y4m->pic_w * _y4m->pic_h;
-    _y4m->aux_buf_sz = _y4m->aux_buf_read_sz + ((_y4m->pic_w + 1) / 2) * _y4m->pic_h;
-    _y4m->convert = y4m_convert_444_420jpeg;
+    if (only_420) {
+      _y4m->dst_c_dec_h = 2;
+      _y4m->dst_c_dec_v = 2;
+      _y4m->dst_buf_read_sz = _y4m->pic_w * _y4m->pic_h;
+      /*Chroma filter required: read into the aux buf first.
+        We need to make two filter passes, so we need some extra space in the
+         aux buffer.*/
+      _y4m->aux_buf_read_sz = 2 * _y4m->pic_w * _y4m->pic_h;
+      _y4m->aux_buf_sz = _y4m->aux_buf_read_sz +
+          ((_y4m->pic_w + 1) / 2) * _y4m->pic_h;
+      _y4m->convert = y4m_convert_444_420jpeg;
+    } else {
+      _y4m->vpx_fmt = VPX_IMG_FMT_I444;
+      _y4m->vpx_bps = 24;
+      _y4m->dst_c_dec_h = _y4m->src_c_dec_h;
+      _y4m->dst_c_dec_v = _y4m->src_c_dec_v;
+      _y4m->dst_buf_read_sz = 3 * _y4m->pic_w * _y4m->pic_h;
+      /*Natively supported: no conversion required.*/
+      _y4m->aux_buf_sz = _y4m->aux_buf_read_sz = 0;
+      _y4m->convert = y4m_convert_null;
+    }
   } else if (strcmp(_y4m->chroma_type, "444alpha") == 0) {
     _y4m->src_c_dec_h = 1;
-    _y4m->dst_c_dec_h = 2;
     _y4m->src_c_dec_v = 1;
-    _y4m->dst_c_dec_v = 2;
-    _y4m->dst_buf_read_sz = _y4m->pic_w * _y4m->pic_h;
-    /*Chroma filter required: read into the aux buf first.
-      We need to make two filter passes, so we need some extra space in the
-       aux buffer.
-      The extra plane also gets read into the aux buf.
-      It will be discarded.*/
-    _y4m->aux_buf_sz = _y4m->aux_buf_read_sz = 3 * _y4m->pic_w * _y4m->pic_h;
-    _y4m->convert = y4m_convert_444_420jpeg;
+    if (only_420) {
+      _y4m->dst_c_dec_h = 2;
+      _y4m->dst_c_dec_v = 2;
+      _y4m->dst_buf_read_sz = _y4m->pic_w * _y4m->pic_h;
+      /*Chroma filter required: read into the aux buf first.
+        We need to make two filter passes, so we need some extra space in the
+         aux buffer.
+        The extra plane also gets read into the aux buf.
+        It will be discarded.*/
+      _y4m->aux_buf_sz = _y4m->aux_buf_read_sz = 3 * _y4m->pic_w * _y4m->pic_h;
+      _y4m->convert = y4m_convert_444_420jpeg;
+    } else {
+      _y4m->vpx_fmt = VPX_IMG_FMT_444A;
+      _y4m->vpx_bps = 32;
+      _y4m->dst_c_dec_h = _y4m->src_c_dec_h;
+      _y4m->dst_c_dec_v = _y4m->src_c_dec_v;
+      _y4m->dst_buf_read_sz = 4 * _y4m->pic_w * _y4m->pic_h;
+      /*Natively supported: no conversion required.*/
+      _y4m->aux_buf_sz = _y4m->aux_buf_read_sz = 0;
+      _y4m->convert = y4m_convert_null;
+    }
   } else if (strcmp(_y4m->chroma_type, "mono") == 0) {
     _y4m->src_c_dec_h = _y4m->src_c_dec_v = 0;
     _y4m->dst_c_dec_h = _y4m->dst_c_dec_v = 2;
@@ -847,22 +887,23 @@
      sizes, which would require a separate fread call for every row.*/
   memset(_img, 0, sizeof(*_img));
   /*Y4M has the planes in Y'CbCr order, which libvpx calls Y, U, and V.*/
-  _img->fmt = IMG_FMT_I420;
+  _img->fmt = _y4m->vpx_fmt;
   _img->w = _img->d_w = _y4m->pic_w;
   _img->h = _img->d_h = _y4m->pic_h;
-  /*This is hard-coded to 4:2:0 for now, as that's all VP8 supports.*/
-  _img->x_chroma_shift = 1;
-  _img->y_chroma_shift = 1;
-  _img->bps = 12;
+  _img->x_chroma_shift = _y4m->dst_c_dec_h >> 1;
+  _img->y_chroma_shift = _y4m->dst_c_dec_v >> 1;
+  _img->bps = _y4m->vpx_bps;
+
   /*Set up the buffer pointers.*/
   pic_sz = _y4m->pic_w * _y4m->pic_h;
   c_w = (_y4m->pic_w + _y4m->dst_c_dec_h - 1) / _y4m->dst_c_dec_h;
   c_h = (_y4m->pic_h + _y4m->dst_c_dec_v - 1) / _y4m->dst_c_dec_v;
   c_sz = c_w * c_h;
-  _img->stride[PLANE_Y] = _y4m->pic_w;
+  _img->stride[PLANE_Y] = _img->stride[PLANE_ALPHA] = _y4m->pic_w;
   _img->stride[PLANE_U] = _img->stride[PLANE_V] = c_w;
   _img->planes[PLANE_Y] = _y4m->dst_buf;
   _img->planes[PLANE_U] = _y4m->dst_buf + pic_sz;
   _img->planes[PLANE_V] = _y4m->dst_buf + pic_sz + c_sz;
+  _img->planes[PLANE_ALPHA] = _y4m->dst_buf + pic_sz + 2 * c_sz;
   return 1;
 }
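
As a sanity check on the buffer sizes set in y4m_input_open above: in the pass-through (only_420 == 0) paths the whole frame is read into dst_buf in one go, so for even frame dimensions dst_buf_read_sz reduces to pic_w * pic_h * vpx_bps / 8. A small worked sketch, illustration only (the helper is not part of the tree):

    #include <stddef.h>

    /* E.g. a 1920x1080 4:2:2 frame:
     *   Y       = 1920 * 1080             = 2073600 bytes
     *   Cb = Cr = ((1920 + 1) / 2) * 1080 = 1036800 bytes each
     *   total   = 4147200 bytes           = 1920 * 1080 * 16 / 8         */
    static size_t y4m_frame_read_sz(int pic_w, int pic_h, int vpx_bps) {
      return (size_t)pic_w * pic_h * vpx_bps / 8;  /* exact for even w, h */
    }
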
diff --git a/y4minput.h b/y4minput.h
index 2fa3767..b2a390c 100644
--- a/y4minput.h
+++ b/y4minput.h
@@ -51,9 +51,12 @@
   y4m_convert_func  convert;
   unsigned char    *dst_buf;
   unsigned char    *aux_buf;
+  enum vpx_img_fmt  vpx_fmt;
+  int               vpx_bps;
 };
 
-int y4m_input_open(y4m_input *_y4m, FILE *_fin, char *_skip, int _nskip);
+int y4m_input_open(y4m_input *_y4m, FILE *_fin, char *_skip, int _nskip,
+                   int only_420);
 void y4m_input_close(y4m_input *_y4m);
 int y4m_input_fetch_frame(y4m_input *_y4m, FILE *_fin, vpx_image_t *img);
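
A minimal usage sketch of the updated reader API, assuming the caller has already read the leading 4-byte "YUV4" signature (as vpxenc does) and hands it back via the skip buffer; error handling is abbreviated and the function name is hypothetical:

    #include <stdio.h>
    #include <string.h>
    #include "y4minput.h"

    /* Sketch: fetch frames in their native chroma subsampling (only_420 == 0),
     * as a VP9 build configured with --enable-non420 would. */
    static int read_y4m_frames(const char *path) {
      FILE *fin = fopen(path, "rb");
      char sig[4];
      y4m_input y4m;
      vpx_image_t img;

      if (!fin)
        return -1;
      if (fread(sig, 1, 4, fin) != 4 || memcmp(sig, "YUV4", 4) != 0 ||
          y4m_input_open(&y4m, fin, sig, 4, 0 /* only_420 */) < 0) {
        fclose(fin);
        return -1;
      }

      while (y4m_input_fetch_frame(&y4m, fin, &img) > 0) {
        /* img.fmt now reflects y4m.vpx_fmt, e.g. VPX_IMG_FMT_I422 for C422. */
      }

      y4m_input_close(&y4m);
      fclose(fin);
      return 0;
    }
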