Merge "vp9/count_segs: fix out of bounds read" into experimental

diff --git a/test/convolve_test.cc b/test/convolve_test.cc
index a8139cb..151a38b 100644
--- a/test/convolve_test.cc
+++ b/test/convolve_test.cc

@@ -8,6 +8,10 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
+#include "test/acm_random.h"
+#include "test/register_state_check.h"
+#include "test/util.h"
+#include "third_party/googletest/src/include/gtest/gtest.h"
 
 extern "C" {
 #include "./vpx_config.h"
@@ -16,10 +20,6 @@
 #include "vpx_mem/vpx_mem.h"
 #include "vpx_ports/mem.h"
 }
-#include "third_party/googletest/src/include/gtest/gtest.h"
-#include "test/acm_random.h"
-#include "test/register_state_check.h"
-#include "test/util.h"
 
 namespace {
 typedef void (*convolve_fn_t)(const uint8_t *src, int src_stride,
@@ -46,27 +46,27 @@
 // Reference 8-tap subpixel filter, slightly modified to fit into this test.
 #define VP9_FILTER_WEIGHT 128
 #define VP9_FILTER_SHIFT 7
-static uint8_t clip_pixel(int x) {
+uint8_t clip_pixel(int x) {
   return x < 0 ? 0 :
          x > 255 ? 255 :
          x;
 }
 
-static void filter_block2d_8_c(const uint8_t *src_ptr,
-                               const unsigned int src_stride,
-                               const int16_t *HFilter,
-                               const int16_t *VFilter,
-                               uint8_t *dst_ptr,
-                               unsigned int dst_stride,
-                               unsigned int output_width,
-                               unsigned int output_height) {
+void filter_block2d_8_c(const uint8_t *src_ptr,
+                        const unsigned int src_stride,
+                        const int16_t *HFilter,
+                        const int16_t *VFilter,
+                        uint8_t *dst_ptr,
+                        unsigned int dst_stride,
+                        unsigned int output_width,
+                        unsigned int output_height) {
   // Between passes, we use an intermediate buffer whose height is extended to
   // have enough horizontally filtered values as input for the vertical pass.
   // This buffer is allocated to be big enough for the largest block type we
   // support.
   const int kInterp_Extend = 4;
   const unsigned int intermediate_height =
-    (kInterp_Extend - 1) + output_height + kInterp_Extend;
+      (kInterp_Extend - 1) + output_height + kInterp_Extend;
 
   /* Size of intermediate_buffer is max_intermediate_height * filter_max_width,
    * where max_intermediate_height = (kInterp_Extend - 1) + filter_max_height
@@ -87,15 +87,15 @@
     for (i = 0; i < intermediate_height; ++i) {
       for (j = 0; j < output_width; ++j) {
         // Apply filter...
-        int temp = ((int)src_ptr[0] * HFilter[0]) +
-                   ((int)src_ptr[1] * HFilter[1]) +
-                   ((int)src_ptr[2] * HFilter[2]) +
-                   ((int)src_ptr[3] * HFilter[3]) +
-                   ((int)src_ptr[4] * HFilter[4]) +
-                   ((int)src_ptr[5] * HFilter[5]) +
-                   ((int)src_ptr[6] * HFilter[6]) +
-                   ((int)src_ptr[7] * HFilter[7]) +
-                   (VP9_FILTER_WEIGHT >> 1);  // Rounding
+        const int temp = (src_ptr[0] * HFilter[0]) +
+                         (src_ptr[1] * HFilter[1]) +
+                         (src_ptr[2] * HFilter[2]) +
+                         (src_ptr[3] * HFilter[3]) +
+                         (src_ptr[4] * HFilter[4]) +
+                         (src_ptr[5] * HFilter[5]) +
+                         (src_ptr[6] * HFilter[6]) +
+                         (src_ptr[7] * HFilter[7]) +
+                         (VP9_FILTER_WEIGHT >> 1);  // Rounding
 
         // Normalize back to 0-255...
         *output_ptr = clip_pixel(temp >> VP9_FILTER_SHIFT);
@@ -115,15 +115,15 @@
     for (i = 0; i < output_height; ++i) {
       for (j = 0; j < output_width; ++j) {
         // Apply filter...
-        int temp = ((int)src_ptr[0] * VFilter[0]) +
-                   ((int)src_ptr[1] * VFilter[1]) +
-                   ((int)src_ptr[2] * VFilter[2]) +
-                   ((int)src_ptr[3] * VFilter[3]) +
-                   ((int)src_ptr[4] * VFilter[4]) +
-                   ((int)src_ptr[5] * VFilter[5]) +
-                   ((int)src_ptr[6] * VFilter[6]) +
-                   ((int)src_ptr[7] * VFilter[7]) +
-                   (VP9_FILTER_WEIGHT >> 1);  // Rounding
+        const int temp = (src_ptr[0] * VFilter[0]) +
+                         (src_ptr[1] * VFilter[1]) +
+                         (src_ptr[2] * VFilter[2]) +
+                         (src_ptr[3] * VFilter[3]) +
+                         (src_ptr[4] * VFilter[4]) +
+                         (src_ptr[5] * VFilter[5]) +
+                         (src_ptr[6] * VFilter[6]) +
+                         (src_ptr[7] * VFilter[7]) +
+                         (VP9_FILTER_WEIGHT >> 1);  // Rounding
 
         // Normalize back to 0-255...
         *dst_ptr++ = clip_pixel(temp >> VP9_FILTER_SHIFT);
@@ -135,12 +135,12 @@
   }
 }
 
-static void block2d_average_c(uint8_t *src,
-                              unsigned int src_stride,
-                              uint8_t *output_ptr,
-                              unsigned int output_stride,
-                              unsigned int output_width,
-                              unsigned int output_height) {
+void block2d_average_c(uint8_t *src,
+                       unsigned int src_stride,
+                       uint8_t *output_ptr,
+                       unsigned int output_stride,
+                       unsigned int output_width,
+                       unsigned int output_height) {
   unsigned int i, j;
   for (i = 0; i < output_height; ++i) {
     for (j = 0; j < output_width; ++j) {
@@ -150,15 +150,15 @@
   }
 }
 
-static void filter_average_block2d_8_c(const uint8_t *src_ptr,
-                                       const unsigned int src_stride,
-                                       const int16_t *HFilter,
-                                       const int16_t *VFilter,
-                                       uint8_t *dst_ptr,
-                                       unsigned int dst_stride,
-                                       unsigned int output_width,
-                                       unsigned int output_height) {
-  uint8_t tmp[64*64];
+void filter_average_block2d_8_c(const uint8_t *src_ptr,
+                                const unsigned int src_stride,
+                                const int16_t *HFilter,
+                                const int16_t *VFilter,
+                                uint8_t *dst_ptr,
+                                unsigned int dst_stride,
+                                unsigned int output_width,
+                                unsigned int output_height) {
+  uint8_t tmp[64 * 64];
 
   assert(output_width <= 64);
   assert(output_height <= 64);
@@ -173,10 +173,9 @@
   static void SetUpTestCase() {
     // Force input_ to be unaligned, output to be 16 byte aligned.
     input_ = reinterpret_cast<uint8_t*>(
-        vpx_memalign(kDataAlignment, kOuterBlockSize * kOuterBlockSize + 1))
-        + 1;
+        vpx_memalign(kDataAlignment, kInputBufferSize + 1)) + 1;
     output_ = reinterpret_cast<uint8_t*>(
-        vpx_memalign(kDataAlignment, kOuterBlockSize * kOuterBlockSize));
+        vpx_memalign(kDataAlignment, kOutputBufferSize));
   }
 
   static void TearDownTestCase() {
@@ -186,62 +185,63 @@
     output_ = NULL;
   }
 
-  protected:
-    static const int kDataAlignment = 16;
-    static const int kOuterBlockSize = 128;
-    static const int kInputStride = kOuterBlockSize;
-    static const int kOutputStride = kOuterBlockSize;
-    static const int kMaxDimension = 64;
+ protected:
+  static const int kDataAlignment = 16;
+  static const int kOuterBlockSize = 128;
+  static const int kInputStride = kOuterBlockSize;
+  static const int kOutputStride = kOuterBlockSize;
+  static const int kMaxDimension = 64;
+  static const int kInputBufferSize = kOuterBlockSize * kOuterBlockSize;
+  static const int kOutputBufferSize = kOuterBlockSize * kOuterBlockSize;
 
-    int Width() const { return GET_PARAM(0); }
-    int Height() const { return GET_PARAM(1); }
-    int BorderLeft() const {
-      const int center = (kOuterBlockSize - Width()) / 2;
-      return (center + (kDataAlignment - 1)) & ~(kDataAlignment - 1);
-    }
-    int BorderTop() const { return (kOuterBlockSize - Height()) / 2; }
+  int Width() const { return GET_PARAM(0); }
+  int Height() const { return GET_PARAM(1); }
+  int BorderLeft() const {
+    const int center = (kOuterBlockSize - Width()) / 2;
+    return (center + (kDataAlignment - 1)) & ~(kDataAlignment - 1);
+  }
+  int BorderTop() const { return (kOuterBlockSize - Height()) / 2; }
 
-    bool IsIndexInBorder(int i) {
-      return (i < BorderTop() * kOuterBlockSize ||
-              i >= (BorderTop() + Height()) * kOuterBlockSize ||
-              i % kOuterBlockSize < BorderLeft() ||
-              i % kOuterBlockSize >= (BorderLeft() + Width()));
+  bool IsIndexInBorder(int i) {
+    return (i < BorderTop() * kOuterBlockSize ||
+            i >= (BorderTop() + Height()) * kOuterBlockSize ||
+            i % kOuterBlockSize < BorderLeft() ||
+            i % kOuterBlockSize >= (BorderLeft() + Width()));
+  }
+
+  virtual void SetUp() {
+    UUT_ = GET_PARAM(2);
+    /* Set up guard blocks for an inner block cetered in the outer block */
+    for (int i = 0; i < kOutputBufferSize; ++i) {
+      if (IsIndexInBorder(i))
+        output_[i] = 255;
+      else
+        output_[i] = 0;
     }
 
-    virtual void SetUp() {
-      UUT_ = GET_PARAM(2);
-      memset(input_, 0, sizeof(input_));
-      /* Set up guard blocks for an inner block cetered in the outer block */
-      for (int i = 0; i < kOuterBlockSize * kOuterBlockSize; ++i) {
-        if (IsIndexInBorder(i))
-          output_[i] = 255;
-        else
-          output_[i] = 0;
-      }
+    ::libvpx_test::ACMRandom prng;
+    for (int i = 0; i < kInputBufferSize; ++i)
+      input_[i] = prng.Rand8Extremes();
+  }
 
-      ::libvpx_test::ACMRandom prng;
-      for (int i = 0; i < kOuterBlockSize * kOuterBlockSize; ++i)
-        input_[i] = prng.Rand8Extremes();
+  void CheckGuardBlocks() {
+    for (int i = 0; i < kOutputBufferSize; ++i) {
+      if (IsIndexInBorder(i))
+        EXPECT_EQ(255, output_[i]);
     }
+  }
 
-    void CheckGuardBlocks() {
-      for (int i = 0; i < kOuterBlockSize * kOuterBlockSize; ++i) {
-        if (IsIndexInBorder(i))
-          EXPECT_EQ(255, output_[i]);
-      }
-    }
+  uint8_t* input() const {
+    return input_ + BorderTop() * kOuterBlockSize + BorderLeft();
+  }
 
-    uint8_t* input() {
-      return input_ + BorderTop() * kOuterBlockSize + BorderLeft();
-    }
+  uint8_t* output() const {
+    return output_ + BorderTop() * kOuterBlockSize + BorderLeft();
+  }
 
-    uint8_t* output() {
-      return output_ + BorderTop() * kOuterBlockSize + BorderLeft();
-    }
-
-    const ConvolveFunctions* UUT_;
-    static uint8_t* input_;
-    static uint8_t* output_;
+  const ConvolveFunctions* UUT_;
+  static uint8_t* input_;
+  static uint8_t* output_;
 };
 uint8_t* ConvolveTest::input_ = NULL;
 uint8_t* ConvolveTest::output_ = NULL;
@@ -309,7 +309,7 @@
   vp9_sub_pel_filters_8lp
 };
 const int kNumFilterBanks = sizeof(kTestFilterList) /
-    sizeof(kTestFilterList[0]);
+                            sizeof(kTestFilterList[0]);
 const int kNumFilters = 16;
 
 TEST(ConvolveTest, FiltersWontSaturateWhenAddedPairwise) {
@@ -525,7 +525,6 @@
     make_tuple(64, 32, &convolve8_c),
     make_tuple(32, 64, &convolve8_c),
     make_tuple(64, 64, &convolve8_c)));
-}
 
 #if HAVE_SSSE3
 const ConvolveFunctions convolve8_ssse3(
@@ -548,3 +547,4 @@
     make_tuple(32, 64, &convolve8_ssse3),
     make_tuple(64, 64, &convolve8_ssse3)));
 #endif
+}  // namespace

diff --git a/test/superframe_test.cc b/test/superframe_test.cc
index 52faddb..062ec6c 100644
--- a/test/superframe_test.cc
+++ b/test/superframe_test.cc

@@ -30,7 +30,7 @@
   }
 
   virtual void TearDown() {
-    delete modified_buf_;
+    delete[] modified_buf_;
   }
 
   virtual bool Continue() const {
@@ -59,7 +59,7 @@
         buffer[pkt->data.frame.sz - index_sz] == marker) {
       // frame is a superframe. strip off the index.
       if (modified_buf_)
-        delete modified_buf_;
+        delete[] modified_buf_;
       modified_buf_ = new uint8_t[pkt->data.frame.sz - index_sz];
       memcpy(modified_buf_, pkt->data.frame.buf,
              pkt->data.frame.sz - index_sz);

diff --git a/vp9/common/vp9_entropymode.c b/vp9/common/vp9_entropymode.c
index ed5441c..f0a5d97 100644
--- a/vp9/common/vp9_entropymode.c
+++ b/vp9/common/vp9_entropymode.c

@@ -771,8 +771,7 @@
   vp9_update_mode_info_border(cm, cm->prev_mip);
   vp9_update_mode_info_in_image(cm, cm->prev_mi);
 
-  cm->ref_frame_sign_bias[GOLDEN_FRAME] = 0;
-  cm->ref_frame_sign_bias[ALTREF_FRAME] = 0;
+  vpx_memset(cm->ref_frame_sign_bias, 0, sizeof(cm->ref_frame_sign_bias));
 
   cm->frame_context_idx = 0;
 }

diff --git a/vp9/common/vp9_onyxc_int.h b/vp9/common/vp9_onyxc_int.h
index bb873c1..1538a00 100644
--- a/vp9/common/vp9_onyxc_int.h
+++ b/vp9/common/vp9_onyxc_int.h

@@ -32,13 +32,9 @@
 
 void vp9_initialize_common(void);
 
-#if CONFIG_MULTIPLE_ARF
+// Define the number of candidate reference buffers.
 #define NUM_REF_FRAMES 8
 #define NUM_REF_FRAMES_LG2 3
-#else
-#define NUM_REF_FRAMES 3
-#define NUM_REF_FRAMES_LG2 2
-#endif
 
 #define ALLOWED_REFS_PER_FRAME 3
 
@@ -168,8 +164,8 @@
 typedef struct VP9Common {
   struct vpx_internal_error_info  error;
 
-  DECLARE_ALIGNED(16, int16_t, y_dequant[QINDEX_RANGE][16]);
-  DECLARE_ALIGNED(16, int16_t, uv_dequant[QINDEX_RANGE][16]);
+  DECLARE_ALIGNED(16, int16_t, y_dequant[QINDEX_RANGE][2]);
+  DECLARE_ALIGNED(16, int16_t, uv_dequant[QINDEX_RANGE][2]);
 
   int width;
   int height;

diff --git a/vp9/decoder/vp9_decodemv.c b/vp9/decoder/vp9_decodemv.c
index f1b214d..408573f 100644
--- a/vp9/decoder/vp9_decodemv.c
+++ b/vp9/decoder/vp9_decodemv.c

@@ -703,11 +703,9 @@
 #endif
     }
 
-    if (is_inter_mode(mbmi->mode)) {
-      mbmi->interp_filter = cm->mcomp_filter_type == SWITCHABLE
-                                ? read_switchable_filter_type(pbi, r)
-                                : cm->mcomp_filter_type;
-    }
+    mbmi->interp_filter = cm->mcomp_filter_type == SWITCHABLE
+                              ? read_switchable_filter_type(pbi, r)
+                              : cm->mcomp_filter_type;
 
     if (cm->comp_pred_mode == COMP_PREDICTION_ONLY ||
         (cm->comp_pred_mode == HYBRID_PREDICTION &&

diff --git a/vp9/decoder/vp9_decodframe.c b/vp9/decoder/vp9_decodframe.c
index f0948ba..0f87a21 100644
--- a/vp9/decoder/vp9_decodframe.c
+++ b/vp9/decoder/vp9_decodframe.c

@@ -164,7 +164,7 @@
 }
 
 void vp9_init_dequantizer(VP9_COMMON *pc) {
-  int q, i;
+  int q;
 
   for (q = 0; q < QINDEX_RANGE; q++) {
     // DC value
@@ -172,12 +172,8 @@
     pc->uv_dequant[q][0] = vp9_dc_quant(q, pc->uv_dc_delta_q);
 
     // AC values
-    for (i = 1; i < 16; i++) {
-      const int rc = vp9_default_zig_zag1d_4x4[i];
-
-      pc->y_dequant[q][rc] = vp9_ac_quant(q, 0);
-      pc->uv_dequant[q][rc] = vp9_ac_quant(q, pc->uv_ac_delta_q);
-    }
+    pc->y_dequant[q][1] = vp9_ac_quant(q, 0);
+    pc->uv_dequant[q][1] = vp9_ac_quant(q, pc->uv_ac_delta_q);
   }
 }
 
@@ -1278,8 +1274,11 @@
         vp9_setup_scale_factors_for_frame(sf, fb, pc->width, pc->height);
     }
 
-    pc->ref_frame_sign_bias[GOLDEN_FRAME] = vp9_read_bit(&header_bc);
-    pc->ref_frame_sign_bias[ALTREF_FRAME] = vp9_read_bit(&header_bc);
+    // Read the sign bias for each reference frame buffer.
+    for (i = 0; i < ALLOWED_REFS_PER_FRAME; ++i) {
+      pc->ref_frame_sign_bias[i + 1] = vp9_read_bit(&header_bc);
+    }
+
     xd->allow_high_precision_mv = vp9_read_bit(&header_bc);
     pc->mcomp_filter_type = read_mcomp_filter_type(&header_bc);
 

diff --git a/vp9/encoder/vp9_bitstream.c b/vp9/encoder/vp9_bitstream.c
index 7eb35da..7152ac9 100644
--- a/vp9/encoder/vp9_bitstream.c
+++ b/vp9/encoder/vp9_bitstream.c

@@ -1984,9 +1984,10 @@
     vp9_write_literal(&header_bc, cpi->gld_fb_idx, NUM_REF_FRAMES_LG2);
     vp9_write_literal(&header_bc, cpi->alt_fb_idx, NUM_REF_FRAMES_LG2);
 
-    // Indicate reference frame sign bias for Golden and ARF frames (always 0 for last frame buffer)
-    vp9_write_bit(&header_bc, pc->ref_frame_sign_bias[GOLDEN_FRAME]);
-    vp9_write_bit(&header_bc, pc->ref_frame_sign_bias[ALTREF_FRAME]);
+    // Indicate the sign bias for each reference frame buffer.
+    for (i = 0; i < ALLOWED_REFS_PER_FRAME; ++i) {
+      vp9_write_bit(&header_bc, pc->ref_frame_sign_bias[LAST_FRAME + i]);
+    }
 
     // Signal whether to allow high MV precision
     vp9_write_bit(&header_bc, (xd->allow_high_precision_mv) ? 1 : 0);

diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c
index 1dd2800..22d11e4 100644
--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c

@@ -1134,8 +1134,8 @@
         r2 += x->partition_cost[pl][PARTITION_VERT];
         if (RDCOST(x->rdmult, x->rddiv, r2, d2) <
                 RDCOST(x->rdmult, x->rddiv, mb16_rate, mb16_dist)) {
-          mb16_rate = r;
-          mb16_dist = d;
+          mb16_rate = r2;
+          mb16_dist = d2;
           mb_partitioning[i][j] = BLOCK_SIZE_SB8X16;
         }
         for (p = 0; p < MAX_MB_PLANE; p++) {
@@ -1179,8 +1179,8 @@
         r2 += x->partition_cost[pl][PARTITION_HORZ];
         if (RDCOST(x->rdmult, x->rddiv, r2, d2) <
                 RDCOST(x->rdmult, x->rddiv, mb16_rate, mb16_dist)) {
-          mb16_rate = r;
-          mb16_dist = d;
+          mb16_rate = r2;
+          mb16_dist = d2;
           mb_partitioning[i][j] = BLOCK_SIZE_SB16X8;
         }
         for (p = 0; p < MAX_MB_PLANE; p++) {
@@ -1220,7 +1220,7 @@
 
         // Dummy encode, do not do the tokenization
 #if CONFIG_SB8X8
-        encode_sb(cpi, tp, mi_row + y_idx, mi_col + x_idx, 0,
+        encode_sb(cpi, tp, mi_row + y_idx_m, mi_col + x_idx_m, 0,
                   BLOCK_SIZE_MB16X16, mb_partitioning[i][j], NULL, NULL);
 #else
         encode_macroblock(cpi, tp, 0, mi_row + y_idx_m,
@@ -2245,7 +2245,8 @@
         mi_row, mi_col, xd->scale_factor, xd->scale_factor_uv);
 
     if (!x->skip) {
-      vp9_encode_inter16x16(cm, x, mi_row, mi_col);
+      vp9_build_inter_predictors_sb(xd, mi_row, mi_col, BLOCK_SIZE_MB16X16);
+      vp9_encode_sb(cm, x, BLOCK_SIZE_MB16X16);
     } else {
       vp9_build_inter_predictors_sb(xd, mi_row, mi_col, BLOCK_SIZE_MB16X16);
 #if CONFIG_COMP_INTERINTRA_PRED
@@ -2491,90 +2492,7 @@
   } else
 #endif
   if (!x->skip) {
-    vp9_subtract_sb(x, bsize);
-
-    switch (xd->mode_info_context->mbmi.txfm_size) {
-      case TX_32X32:
-        vp9_transform_sby_32x32(x, bsize);
-        vp9_quantize_sby_32x32(x, bsize);
-        if (bsize == BLOCK_SIZE_SB64X64) {
-          vp9_transform_sbuv_32x32(x, bsize);
-          vp9_quantize_sbuv_32x32(x, bsize);
-        } else {
-          vp9_transform_sbuv_16x16(x, bsize);
-          vp9_quantize_sbuv_16x16(x, bsize);
-        }
-        if (x->optimize) {
-          vp9_optimize_sby(cm, x, bsize);
-          if (bsize == BLOCK_SIZE_SB64X64)
-            vp9_optimize_sbuv(cm, x, bsize);
-          else
-            vp9_optimize_sbuv(cm, x, bsize);
-        }
-        vp9_inverse_transform_sby_32x32(xd, bsize);
-        if (bsize == BLOCK_SIZE_SB64X64)
-          vp9_inverse_transform_sbuv_32x32(xd, bsize);
-        else
-          vp9_inverse_transform_sbuv_16x16(xd, bsize);
-        break;
-      case TX_16X16:
-        vp9_transform_sby_16x16(x, bsize);
-        vp9_quantize_sby_16x16(x, bsize);
-        if (bsize >= BLOCK_SIZE_SB32X32) {
-          vp9_transform_sbuv_16x16(x, bsize);
-          vp9_quantize_sbuv_16x16(x, bsize);
-        } else {
-          vp9_transform_sbuv_8x8(x, bsize);
-          vp9_quantize_sbuv_8x8(x, bsize);
-        }
-        if (x->optimize) {
-          vp9_optimize_sby(cm, x, bsize);
-          if (bsize >= BLOCK_SIZE_SB32X32)
-            vp9_optimize_sbuv(cm, x, bsize);
-          else
-            vp9_optimize_sbuv(cm, x, bsize);
-        }
-        vp9_inverse_transform_sby_16x16(xd, bsize);
-        if (bsize >= BLOCK_SIZE_SB32X32)
-          vp9_inverse_transform_sbuv_16x16(xd, bsize);
-        else
-          vp9_inverse_transform_sbuv_8x8(xd, bsize);
-        break;
-      case TX_8X8:
-        vp9_transform_sby_8x8(x, bsize);
-        vp9_quantize_sby_8x8(x, bsize);
-        if (x->optimize)
-          vp9_optimize_sby(cm, x, bsize);
-        vp9_inverse_transform_sby_8x8(xd, bsize);
-        if (bsize >= BLOCK_SIZE_MB16X16) {
-          vp9_transform_sbuv_8x8(x, bsize);
-          vp9_quantize_sbuv_8x8(x, bsize);
-          if (x->optimize)
-            vp9_optimize_sbuv(cm, x, bsize);
-          vp9_inverse_transform_sbuv_8x8(xd, bsize);
-        } else {
-          vp9_transform_sbuv_4x4(x, bsize);
-          vp9_quantize_sbuv_4x4(x, bsize);
-          if (x->optimize)
-            vp9_optimize_sbuv(cm, x, bsize);
-          vp9_inverse_transform_sbuv_4x4(xd, bsize);
-        }
-        break;
-      case TX_4X4:
-        vp9_transform_sby_4x4(x, bsize);
-        vp9_transform_sbuv_4x4(x, bsize);
-        vp9_quantize_sby_4x4(x, bsize);
-        vp9_quantize_sbuv_4x4(x, bsize);
-        if (x->optimize) {
-          vp9_optimize_sby(cm, x, bsize);
-          vp9_optimize_sbuv(cm, x, bsize);
-        }
-        vp9_inverse_transform_sby_4x4(xd, bsize);
-        vp9_inverse_transform_sbuv_4x4(xd, bsize);
-        break;
-      default: assert(0);
-    }
-    vp9_recon_sb_c(xd, bsize);
+    vp9_encode_sb(cm, x, bsize);
     vp9_tokenize_sb(cpi, &x->e_mbd, t, !output_enabled, bsize);
   } else {
     // FIXME(rbultje): not tile-aware (mi - 1)

diff --git a/vp9/encoder/vp9_encodemb.c b/vp9/encoder/vp9_encodemb.c
index 15fd4f1..5f00b70 100644
--- a/vp9/encoder/vp9_encodemb.c
+++ b/vp9/encoder/vp9_encodemb.c

@@ -480,31 +480,33 @@
   *a = *l = (final_eob > 0);
 }
 
-struct optimize_ctx {
-  ENTROPY_CONTEXT ta[MAX_MB_PLANE][16];
-  ENTROPY_CONTEXT tl[MAX_MB_PLANE][16];
-};
-
 struct optimize_block_args {
   VP9_COMMON *cm;
   MACROBLOCK *x;
   struct optimize_ctx *ctx;
 };
 
-static void optimize_block(int plane, int block, BLOCK_SIZE_TYPE bsize,
-                           int ss_txfrm_size, void *arg) {
-  const struct optimize_block_args* const args = arg;
-  MACROBLOCKD* const xd = &args->x->e_mbd;
+void vp9_optimize_b(int plane, int block, BLOCK_SIZE_TYPE bsize,
+                    int ss_txfrm_size, VP9_COMMON *cm, MACROBLOCK *mb,
+                    struct optimize_ctx *ctx) {
+  MACROBLOCKD* const xd = &mb->e_mbd;
   int x, y;
 
   // find current entropy context
   txfrm_block_to_raster_xy(xd, bsize, plane, block, ss_txfrm_size, &x, &y);
 
-  optimize_b(args->cm, args->x, plane, block, bsize,
-             &args->ctx->ta[plane][x], &args->ctx->tl[plane][y],
+  optimize_b(cm, mb, plane, block, bsize,
+             &ctx->ta[plane][x], &ctx->tl[plane][y],
              ss_txfrm_size / 2);
 }
 
+static void optimize_block(int plane, int block, BLOCK_SIZE_TYPE bsize,
+                           int ss_txfrm_size, void *arg) {
+  const struct optimize_block_args* const args = arg;
+  vp9_optimize_b(plane, block, bsize, ss_txfrm_size, args->cm, args->x,
+                 args->ctx);
+}
+
 void vp9_optimize_init(MACROBLOCKD *xd, BLOCK_SIZE_TYPE bsize,
                        struct optimize_ctx *ctx) {
   int p;
@@ -553,77 +555,127 @@
   foreach_transformed_block_uv(&x->e_mbd, bsize, optimize_block, &arg);
 }
 
-#if !CONFIG_SB8X8
-void vp9_fidct_mb(VP9_COMMON *const cm, MACROBLOCK *x) {
-  MACROBLOCKD *const xd = &x->e_mbd;
-  const TX_SIZE tx_size = xd->mode_info_context->mbmi.txfm_size;
+struct encode_b_args {
+  VP9_COMMON *cm;
+  MACROBLOCK *x;
+  struct optimize_ctx *ctx;
+};
 
-  if (tx_size == TX_16X16) {
-    vp9_transform_sby_16x16(x, BLOCK_SIZE_MB16X16);
-    vp9_transform_sbuv_8x8(x, BLOCK_SIZE_MB16X16);
-    vp9_quantize_sby_16x16(x, BLOCK_SIZE_MB16X16);
-    vp9_quantize_sbuv_8x8(x, BLOCK_SIZE_MB16X16);
-    if (x->optimize) {
-      vp9_optimize_sby(cm, x, BLOCK_SIZE_MB16X16);
-      vp9_optimize_sbuv(cm, x, BLOCK_SIZE_MB16X16);
-    }
-    vp9_inverse_transform_sby_16x16(xd, BLOCK_SIZE_MB16X16);
-    vp9_inverse_transform_sbuv_8x8(xd, BLOCK_SIZE_MB16X16);
-  } else if (tx_size == TX_8X8) {
-    vp9_transform_sby_8x8(x, BLOCK_SIZE_MB16X16);
-    vp9_quantize_sby_8x8(x, BLOCK_SIZE_MB16X16);
-    if (x->optimize)
-      vp9_optimize_sby(cm, x, BLOCK_SIZE_MB16X16);
-    vp9_inverse_transform_sby_8x8(xd, BLOCK_SIZE_MB16X16);
-    if (xd->mode_info_context->mbmi.mode == SPLITMV) {
-      assert(xd->mode_info_context->mbmi.partitioning != PARTITIONING_4X4);
-      vp9_transform_sbuv_4x4(x, BLOCK_SIZE_MB16X16);
-      vp9_quantize_sbuv_4x4(x, BLOCK_SIZE_MB16X16);
-      if (x->optimize)
-        vp9_optimize_sbuv(cm, x, BLOCK_SIZE_MB16X16);
-      vp9_inverse_transform_sbuv_4x4(xd, BLOCK_SIZE_MB16X16);
-    } else {
-      vp9_transform_sbuv_8x8(x, BLOCK_SIZE_MB16X16);
-      vp9_quantize_sbuv_8x8(x, BLOCK_SIZE_MB16X16);
-      if (x->optimize)
-        vp9_optimize_sbuv(cm, x, BLOCK_SIZE_MB16X16);
-      vp9_inverse_transform_sbuv_8x8(xd, BLOCK_SIZE_MB16X16);
-    }
-  } else {
-    vp9_transform_sby_4x4(x, BLOCK_SIZE_MB16X16);
-    vp9_transform_sbuv_4x4(x, BLOCK_SIZE_MB16X16);
-    vp9_quantize_sby_4x4(x, BLOCK_SIZE_MB16X16);
-    vp9_quantize_sbuv_4x4(x, BLOCK_SIZE_MB16X16);
-    if (x->optimize) {
-      vp9_optimize_sby(cm, x, BLOCK_SIZE_MB16X16);
-      vp9_optimize_sbuv(cm, x, BLOCK_SIZE_MB16X16);
-    }
-    vp9_inverse_transform_sby_4x4(xd, BLOCK_SIZE_MB16X16);
-    vp9_inverse_transform_sbuv_4x4(xd, BLOCK_SIZE_MB16X16);
+static void encode_block(int plane, int block, BLOCK_SIZE_TYPE bsize,
+                         int ss_txfrm_size, void *arg) {
+  struct encode_b_args* const args = arg;
+  MACROBLOCK* const x = args->x;
+  MACROBLOCKD* const xd = &x->e_mbd;
+  const int bw = 4 << (b_width_log2(bsize) - xd->plane[plane].subsampling_x);
+  const int raster_block = txfrm_block_to_raster_block(xd, bsize, plane,
+                                                       block, ss_txfrm_size);
+  int16_t* const src_diff = raster_block_offset_int16(xd, bsize, plane,
+                                                      raster_block,
+                                                      x->plane[plane].src_diff);
+  int16_t* const diff = raster_block_offset_int16(xd, bsize, plane,
+                                                  raster_block,
+                                                  xd->plane[plane].diff);
+  TX_TYPE tx_type = DCT_DCT;
+
+  switch (ss_txfrm_size / 2) {
+    case TX_32X32:
+      vp9_short_fdct32x32(src_diff,
+                          BLOCK_OFFSET(x->plane[plane].coeff, block, 16),
+                          bw * 2);
+      break;
+    case TX_16X16:
+      tx_type = plane == 0 ? get_tx_type_16x16(xd, raster_block) : DCT_DCT;
+      if (tx_type != DCT_DCT) {
+        vp9_short_fht16x16(src_diff,
+                           BLOCK_OFFSET(x->plane[plane].coeff, block, 16),
+                           bw, tx_type);
+      } else {
+        x->fwd_txm16x16(src_diff,
+                        BLOCK_OFFSET(x->plane[plane].coeff, block, 16),
+                        bw * 2);
+      }
+      break;
+    case TX_8X8:
+      tx_type = plane == 0 ? get_tx_type_8x8(xd, raster_block) : DCT_DCT;
+      if (tx_type != DCT_DCT) {
+        vp9_short_fht8x8(src_diff,
+                           BLOCK_OFFSET(x->plane[plane].coeff, block, 16),
+                           bw, tx_type);
+      } else {
+        x->fwd_txm8x8(src_diff,
+                      BLOCK_OFFSET(x->plane[plane].coeff, block, 16),
+                      bw * 2);
+      }
+      break;
+    case TX_4X4:
+      tx_type = plane == 0 ? get_tx_type_4x4(xd, raster_block) : DCT_DCT;
+      if (tx_type != DCT_DCT) {
+        vp9_short_fht4x4(src_diff,
+                           BLOCK_OFFSET(x->plane[plane].coeff, block, 16),
+                           bw, tx_type);
+      } else {
+        x->fwd_txm4x4(src_diff,
+                      BLOCK_OFFSET(x->plane[plane].coeff, block, 16),
+                      bw * 2);
+      }
+      break;
+    default:
+      assert(0);
+  }
+
+  vp9_quantize(x, plane, block, 16 << ss_txfrm_size, tx_type);
+  if (x->optimize)
+    vp9_optimize_b(plane, block, bsize, ss_txfrm_size, args->cm, x, args->ctx);
+
+  switch (ss_txfrm_size / 2) {
+    case TX_32X32:
+      vp9_short_idct32x32(BLOCK_OFFSET(xd->plane[plane].dqcoeff, block, 16),
+                          diff, bw * 2);
+      break;
+    case TX_16X16:
+      if (tx_type == DCT_DCT) {
+        vp9_short_idct16x16(BLOCK_OFFSET(xd->plane[plane].dqcoeff, block, 16),
+                            diff, bw * 2);
+      } else {
+        vp9_short_iht16x16(BLOCK_OFFSET(xd->plane[plane].dqcoeff, block, 16),
+                           diff, bw, tx_type);
+      }
+      break;
+    case TX_8X8:
+      if (tx_type == DCT_DCT) {
+        vp9_short_idct8x8(BLOCK_OFFSET(xd->plane[plane].dqcoeff, block, 16),
+                          diff, bw * 2);
+      } else {
+        vp9_short_iht8x8(BLOCK_OFFSET(xd->plane[plane].dqcoeff, block, 16),
+                         diff, bw, tx_type);
+      }
+      break;
+    case TX_4X4:
+      if (tx_type == DCT_DCT) {
+        // this is like vp9_short_idct4x4 but has a special case around eob<=1
+        // which is significant (not just an optimization) for the lossless
+        // case.
+        vp9_inverse_transform_b_4x4(xd, xd->plane[plane].eobs[block],
+            BLOCK_OFFSET(xd->plane[plane].dqcoeff, block, 16), diff, bw * 2);
+      } else {
+        vp9_short_iht4x4(BLOCK_OFFSET(xd->plane[plane].dqcoeff, block, 16),
+                         diff, bw, tx_type);
+      }
+      break;
   }
 }
 
-void vp9_encode_inter16x16(VP9_COMMON *const cm, MACROBLOCK *x,
-                           int mi_row, int mi_col) {
-  MACROBLOCKD *const xd = &x->e_mbd;
+void vp9_encode_sb(VP9_COMMON *const cm, MACROBLOCK *x,
+                   BLOCK_SIZE_TYPE bsize) {
+  MACROBLOCKD* const xd = &x->e_mbd;
+  struct optimize_ctx ctx;
+  struct encode_b_args arg = {cm, x, &ctx};
 
-  vp9_build_inter_predictors_sb(xd, mi_row, mi_col, BLOCK_SIZE_MB16X16);
-  vp9_subtract_sb(x, BLOCK_SIZE_MB16X16);
-  vp9_fidct_mb(cm, x);
-  vp9_recon_sb(xd, BLOCK_SIZE_MB16X16);
-}
-#endif
+  vp9_subtract_sb(x, bsize);
+  if (x->optimize)
+    vp9_optimize_init(xd, bsize, &ctx);
 
-/* this function is used by first pass only */
-void vp9_encode_inter16x16y(MACROBLOCK *x, int mi_row, int mi_col) {
-  MACROBLOCKD *xd = &x->e_mbd;
+  foreach_transformed_block(xd, bsize, encode_block, &arg);
 
-  vp9_build_inter_predictors_sby(xd, mi_row, mi_col, BLOCK_SIZE_MB16X16);
-  vp9_subtract_sby(x, BLOCK_SIZE_MB16X16);
-
-  vp9_transform_sby_4x4(x, BLOCK_SIZE_MB16X16);
-  vp9_quantize_sby_4x4(x, BLOCK_SIZE_MB16X16);
-  vp9_inverse_transform_sby_4x4(xd, BLOCK_SIZE_MB16X16);
-
-  vp9_recon_sby(xd, BLOCK_SIZE_MB16X16);
+  vp9_recon_sb(xd, bsize);
 }

diff --git a/vp9/encoder/vp9_encodemb.h b/vp9/encoder/vp9_encodemb.h
index b1d8771..8322479 100644
--- a/vp9/encoder/vp9_encodemb.h
+++ b/vp9/encoder/vp9_encodemb.h

@@ -23,14 +23,8 @@
 } MODE_DEFINITION;
 
 
-struct VP9_ENCODER_RTCD;
 #if !CONFIG_SB8X8
-void vp9_encode_inter16x16(VP9_COMMON *const cm, MACROBLOCK *x,
-                           int mb_row, int mb_col);
 #endif
-
-void vp9_encode_inter16x16y(MACROBLOCK *x, int mb_row, int mb_col);
-
 void vp9_transform_sby_32x32(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize);
 void vp9_transform_sby_16x16(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize);
 void vp9_transform_sby_8x8(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize);
@@ -40,14 +34,21 @@
 void vp9_transform_sbuv_8x8(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize);
 void vp9_transform_sbuv_4x4(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize);
 
+struct optimize_ctx {
+  ENTROPY_CONTEXT ta[MAX_MB_PLANE][16];
+  ENTROPY_CONTEXT tl[MAX_MB_PLANE][16];
+};
+void vp9_optimize_init(MACROBLOCKD *xd, BLOCK_SIZE_TYPE bsize,
+                       struct optimize_ctx *ctx);
+void vp9_optimize_b(int plane, int block, BLOCK_SIZE_TYPE bsize,
+                    int ss_txfrm_size, VP9_COMMON *cm, MACROBLOCK *x,
+                    struct optimize_ctx *ctx);
 void vp9_optimize_sby(VP9_COMMON *const cm, MACROBLOCK *x,
                       BLOCK_SIZE_TYPE bsize);
 void vp9_optimize_sbuv(VP9_COMMON *const cm, MACROBLOCK *x,
                        BLOCK_SIZE_TYPE bsize);
 
-#if !CONFIG_SB8X8
-void vp9_fidct_mb(VP9_COMMON *const cm, MACROBLOCK *x);
-#endif
+void vp9_encode_sb(VP9_COMMON *const cm, MACROBLOCK *x, BLOCK_SIZE_TYPE bsize);
 
 void vp9_subtract_block(int rows, int cols,
                         int16_t *diff_ptr, int diff_stride,

diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c
index e4d6863..4d28f1b 100644
--- a/vp9/encoder/vp9_firstpass.c
+++ b/vp9/encoder/vp9_firstpass.c

@@ -626,7 +626,10 @@
           this_error = motion_error;
           vp9_set_mbmode_and_mvs(x, NEWMV, &mv);
           xd->mode_info_context->mbmi.txfm_size = TX_4X4;
-          vp9_encode_inter16x16y(x, mb_row, mb_col);
+          vp9_build_inter_predictors_sby(xd, mb_row << CONFIG_SB8X8,
+                                         mb_col << CONFIG_SB8X8,
+                                         BLOCK_SIZE_MB16X16);
+          vp9_encode_sb(cm, x, BLOCK_SIZE_MB16X16);
           sum_mvr += mv.as_mv.row;
           sum_mvr_abs += abs(mv.as_mv.row);
           sum_mvc += mv.as_mv.col;

diff --git a/vp9/encoder/vp9_quantize.c b/vp9/encoder/vp9_quantize.c
index fe8ba4b..b70041e 100644
--- a/vp9/encoder/vp9_quantize.c
+++ b/vp9/encoder/vp9_quantize.c

@@ -75,6 +75,43 @@
   *eob_ptr = eob + 1;
 }
 
+void vp9_quantize(MACROBLOCK *mb, int plane, int block, int n_coeffs,
+                  TX_TYPE tx_type) {
+  MACROBLOCKD *const xd = &mb->e_mbd;
+  const int mul = n_coeffs == 1024 ? 2 : 1;
+  const int *scan;
+
+  // These contexts may be available in the caller
+  switch (n_coeffs) {
+    case 4 * 4:
+      scan = get_scan_4x4(tx_type);
+      break;
+    case 8 * 8:
+      scan = get_scan_8x8(tx_type);
+      break;
+    case 16 * 16:
+      scan = get_scan_16x16(tx_type);
+      break;
+    default:
+      scan = vp9_default_zig_zag1d_32x32;
+      break;
+  }
+
+  quantize(mb->plane[plane].zrun_zbin_boost,
+           BLOCK_OFFSET(mb->plane[plane].coeff, block, 16),
+           n_coeffs, mb->skip_block,
+           mb->plane[plane].zbin,
+           mb->plane[plane].round,
+           mb->plane[plane].quant,
+           mb->plane[plane].quant_shift,
+           BLOCK_OFFSET(xd->plane[plane].qcoeff, block, 16),
+           BLOCK_OFFSET(xd->plane[plane].dqcoeff, block, 16),
+           xd->plane[plane].dequant,
+           mb->plane[plane].zbin_extra,
+           &xd->plane[plane].eobs[block],
+           scan, mul);
+}
+
 void vp9_regular_quantize_b_4x4(MACROBLOCK *mb, int b_idx, TX_TYPE tx_type,
                                 int y_blocks) {
   MACROBLOCKD *const xd = &mb->e_mbd;
@@ -266,6 +303,7 @@
 void vp9_init_quantizer(VP9_COMP *cpi) {
   int i;
   int quant_val;
+  int quant_uv_val;
   int q;
 
   static const int zbin_boost[16] = { 0,  0,  0,  8,  8,  8, 10, 12,
@@ -293,25 +331,26 @@
     cpi->common.uv_dequant[q][0] = quant_val;
     cpi->zrun_zbin_boost_uv[q][0] = (quant_val * zbin_boost[0]) >> 7;
 
+    quant_val = vp9_ac_quant(q, 0);
+    cpi->common.y_dequant[q][1] = quant_val;
+    quant_uv_val = vp9_ac_quant(q, cpi->common.uv_ac_delta_q);
+    cpi->common.uv_dequant[q][1] = quant_uv_val;
     // all the 4x4 ac values =;
     for (i = 1; i < 16; i++) {
       int rc = vp9_default_zig_zag1d_4x4[i];
 
-      quant_val = vp9_ac_quant(q, 0);
       invert_quant(cpi->Y1quant[q] + rc, cpi->Y1quant_shift[q] + rc, quant_val);
       cpi->Y1zbin[q][rc] = ROUND_POWER_OF_TWO(qzbin_factor * quant_val, 7);
       cpi->Y1round[q][rc] = (qrounding_factor * quant_val) >> 7;
-      cpi->common.y_dequant[q][rc] = quant_val;
       cpi->zrun_zbin_boost_y1[q][i] =
           ROUND_POWER_OF_TWO(quant_val * zbin_boost[i], 7);
 
-      quant_val = vp9_ac_quant(q, cpi->common.uv_ac_delta_q);
-      invert_quant(cpi->UVquant[q] + rc, cpi->UVquant_shift[q] + rc, quant_val);
-      cpi->UVzbin[q][rc] = ROUND_POWER_OF_TWO(qzbin_factor * quant_val, 7);
-      cpi->UVround[q][rc] = (qrounding_factor * quant_val) >> 7;
-      cpi->common.uv_dequant[q][rc] = quant_val;
+      invert_quant(cpi->UVquant[q] + rc, cpi->UVquant_shift[q] + rc,
+        quant_uv_val);
+      cpi->UVzbin[q][rc] = ROUND_POWER_OF_TWO(qzbin_factor * quant_uv_val, 7);
+      cpi->UVround[q][rc] = (qrounding_factor * quant_uv_val) >> 7;
       cpi->zrun_zbin_boost_uv[q][i] =
-          ROUND_POWER_OF_TWO(quant_val * zbin_boost[i], 7);
+          ROUND_POWER_OF_TWO(quant_uv_val * zbin_boost[i], 7);
     }
   }
 }

diff --git a/vp9/encoder/vp9_quantize.h b/vp9/encoder/vp9_quantize.h
index fb74cbd..718a127 100644
--- a/vp9/encoder/vp9_quantize.h
+++ b/vp9/encoder/vp9_quantize.h

@@ -22,6 +22,9 @@
 #define prototype_quantize_mb(sym) \
   void (sym)(MACROBLOCK *x)
 
+void vp9_quantize(MACROBLOCK *mb, int plane, int block, int n_coefs,
+                  TX_TYPE tx_type);
+
 void vp9_regular_quantize_b_4x4_pair(MACROBLOCK *mb, int b_idx1, int b_idx2,
                                      int y_blocks);
 void vp9_regular_quantize_b_4x4(MACROBLOCK *mb, int b_idx, TX_TYPE tx_type,

diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c
index cf4b1e8..f971d91 100644
--- a/vp9/encoder/vp9_rdopt.c
+++ b/vp9/encoder/vp9_rdopt.c

@@ -13,8 +13,8 @@
 #include <math.h>
 #include <limits.h>
 #include <assert.h>
-#include "vp9/common/vp9_pragmas.h"
 
+#include "vp9/common/vp9_pragmas.h"
 #include "vp9/encoder/vp9_tokenize.h"
 #include "vp9/encoder/vp9_treewriter.h"
 #include "vp9/encoder/vp9_onyx_int.h"
@@ -34,7 +34,6 @@
 #include "vpx_mem/vpx_mem.h"
 #include "vp9/common/vp9_systemdependent.h"
 #include "vp9/encoder/vp9_encodemv.h"
-
 #include "vp9/common/vp9_seg_common.h"
 #include "vp9/common/vp9_pred_common.h"
 #include "vp9/common/vp9_entropy.h"
@@ -42,8 +41,6 @@
 #include "vp9/common/vp9_mvref_common.h"
 #include "vp9/common/vp9_common.h"
 
-#define MAXF(a,b)            (((a) > (b)) ? (a) : (b))
-
 #define INVALID_MV 0x80008000
 
 /* Factor to weigh the rate for switchable interp filters */
@@ -157,11 +154,9 @@
   for (i = 0; i < BLOCK_TYPES; i++)
     for (j = 0; j < REF_TYPES; j++)
       for (k = 0; k < COEF_BANDS; k++)
-        for (l = 0; l < PREV_COEF_CONTEXTS; l++) {
-          vp9_cost_tokens_skip((int *)(c[i][j][k][l]),
-                               p[i][j][k][l],
+        for (l = 0; l < PREV_COEF_CONTEXTS; l++)
+          vp9_cost_tokens_skip((int *)c[i][j][k][l], p[i][j][k][l],
                                vp9_coef_tree);
-        }
 }
 
 static int rd_iifactor[32] =  { 4, 4, 3, 2, 1, 0, 0, 0,
@@ -184,7 +179,7 @@
   for (i = 0; i < QINDEX_RANGE; i++) {
     sad_per_bit16lut[i] =
       (int)((0.0418 * vp9_convert_qindex_to_q(i)) + 2.4107);
-    sad_per_bit4lut[i] = (int)((0.063 * vp9_convert_qindex_to_q(i)) + 2.742);
+    sad_per_bit4lut[i] = (int)(0.063 * vp9_convert_qindex_to_q(i) + 2.742);
   }
 }
 
@@ -208,7 +203,7 @@
   // for key frames, golden frames and arf frames.
   // if (cpi->common.refresh_golden_frame ||
   //     cpi->common.refresh_alt_ref_frame)
-  qindex = (qindex < 0) ? 0 : ((qindex > MAXQ) ? MAXQ : qindex);
+  qindex = clamp(qindex, 0, MAXQ);
 
   cpi->RDMULT = compute_rd_mult(qindex);
   if (cpi->pass == 2 && (cpi->common.frame_type != KEY_FRAME)) {
@@ -2890,24 +2885,24 @@
       /* Use 8x8 result as 16x8/8x16's predictor MV. Adjust search range
        * according to the closeness of 2 MV. */
       /* block 8X16 */
-      sr = MAXF((abs(bsi.sv_mvp[0].as_mv.row - bsi.sv_mvp[2].as_mv.row)) >> 3,
-                (abs(bsi.sv_mvp[0].as_mv.col - bsi.sv_mvp[2].as_mv.col)) >> 3);
+      sr = MAX(abs(bsi.sv_mvp[0].as_mv.row - bsi.sv_mvp[2].as_mv.row) >> 3,
+               abs(bsi.sv_mvp[0].as_mv.col - bsi.sv_mvp[2].as_mv.col) >> 3);
       cal_step_param(sr, &bsi.sv_istep[0]);
 
-      sr = MAXF((abs(bsi.sv_mvp[1].as_mv.row - bsi.sv_mvp[3].as_mv.row)) >> 3,
-                (abs(bsi.sv_mvp[1].as_mv.col - bsi.sv_mvp[3].as_mv.col)) >> 3);
+      sr = MAX(abs(bsi.sv_mvp[1].as_mv.row - bsi.sv_mvp[3].as_mv.row) >> 3,
+               abs(bsi.sv_mvp[1].as_mv.col - bsi.sv_mvp[3].as_mv.col) >> 3);
       cal_step_param(sr, &bsi.sv_istep[1]);
 
       rd_check_segment(cpi, x, &bsi, PARTITIONING_8X16,
                        seg_mvs[PARTITIONING_8X16], txfm_cache);
 
       /* block 16X8 */
-      sr = MAXF((abs(bsi.sv_mvp[0].as_mv.row - bsi.sv_mvp[1].as_mv.row)) >> 3,
-                (abs(bsi.sv_mvp[0].as_mv.col - bsi.sv_mvp[1].as_mv.col)) >> 3);
+      sr = MAX(abs(bsi.sv_mvp[0].as_mv.row - bsi.sv_mvp[1].as_mv.row) >> 3,
+               abs(bsi.sv_mvp[0].as_mv.col - bsi.sv_mvp[1].as_mv.col) >> 3);
       cal_step_param(sr, &bsi.sv_istep[0]);
 
-      sr = MAXF((abs(bsi.sv_mvp[2].as_mv.row - bsi.sv_mvp[3].as_mv.row)) >> 3,
-                (abs(bsi.sv_mvp[2].as_mv.col - bsi.sv_mvp[3].as_mv.col)) >> 3);
+      sr = MAX(abs(bsi.sv_mvp[2].as_mv.row - bsi.sv_mvp[3].as_mv.row) >> 3,
+               abs(bsi.sv_mvp[2].as_mv.col - bsi.sv_mvp[3].as_mv.col) >> 3);
       cal_step_param(sr, &bsi.sv_istep[1]);
 
       rd_check_segment(cpi, x, &bsi, PARTITIONING_16X8,