Merge "Fix vp9_build_intra_predictors_sbuv_s for non-4:2:0" into experimental
diff --git a/configure b/configure
index 5cbf070..cc8c581 100755
--- a/configure
+++ b/configure
@@ -247,6 +247,7 @@
     multiple_arf
     non420
     ab4x4
+    comp_inter_joint_search
 "
 CONFIG_LIST="
     external_build
diff --git a/test/dct16x16_test.cc b/test/dct16x16_test.cc
index f6d2d59..9fb45d6 100644
--- a/test/dct16x16_test.cc
+++ b/test/dct16x16_test.cc
@@ -17,6 +17,7 @@
 extern "C" {
 #include "vp9/common/vp9_entropy.h"
 #include "vp9_rtcd.h"
+void vp9_short_idct16x16_add_c(int16_t *input, uint8_t *dest, int dest_stride);
 }
 
 #include "acm_random.h"
@@ -269,19 +270,23 @@
   const int count_test_block = 1000;
   for (int i = 0; i < count_test_block; ++i) {
     int16_t in[256], coeff[256];
-    int16_t out_c[256];
+    uint8_t dst[256], src[256];
     double out_r[256];
 
+    for (int j = 0; j < 256; ++j) {
+      src[j] = rnd.Rand8();
+      dst[j] = rnd.Rand8();
+    }
     // Initialize a test block with input range [-255, 255].
     for (int j = 0; j < 256; ++j)
-      in[j] = rnd.Rand8() - rnd.Rand8();
+      in[j] = src[j] - dst[j];
 
     reference_16x16_dct_2d(in, out_r);
     for (int j = 0; j < 256; j++)
       coeff[j] = round(out_r[j]);
-    vp9_short_idct16x16_c(coeff, out_c, 32);
+    vp9_short_idct16x16_add_c(coeff, dst, 16);
     for (int j = 0; j < 256; ++j) {
-      const int diff = out_c[j] - in[j];
+      const int diff = dst[j] - src[j];
       const int error = diff * diff;
       EXPECT_GE(1, error)
           << "Error: 16x16 IDCT has error " << error
@@ -289,7 +294,7 @@
     }
   }
 }
-#if 1
+
 // We need to enable the fdct test once we redo the 16-point fdct.
 TEST(VP9Fdct16x16Test, AccuracyCheck) {
   ACMRandom rnd(ACMRandom::DeterministicSeed());
@@ -299,18 +304,22 @@
   for (int i = 0; i < count_test_block; ++i) {
     int16_t test_input_block[256];
     int16_t test_temp_block[256];
-    int16_t test_output_block[256];
+    uint8_t dst[256], src[256];
 
+    for (int j = 0; j < 256; ++j) {
+      src[j] = rnd.Rand8();
+      dst[j] = rnd.Rand8();
+    }
     // Initialize a test block with input range [-255, 255].
     for (int j = 0; j < 256; ++j)
-      test_input_block[j] = rnd.Rand8() - rnd.Rand8();
+      test_input_block[j] = src[j] - dst[j];
 
     const int pitch = 32;
     vp9_short_fdct16x16_c(test_input_block, test_temp_block, pitch);
-    vp9_short_idct16x16_c(test_temp_block, test_output_block, pitch);
+    vp9_short_idct16x16_add_c(test_temp_block, dst, 16);
 
     for (int j = 0; j < 256; ++j) {
-      const int diff = test_input_block[j] - test_output_block[j];
+      const int diff = dst[j] - src[j];
       const int error = diff * diff;
       if (max_error < error)
         max_error = error;
@@ -354,6 +363,4 @@
     }
   }
 }
-#endif
-
 }  // namespace
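
The reworked tests above exercise the fused transform: dst acts as a random prediction, src as the source block, the forward transform consumes the residual src - dst, and the _add inverse accumulates the reconstruction back into dst, which should then match src to within one count per pixel. In outline (a sketch, not part of the patch; the same scheme is used by the 32x32 tests in the next file):

    int16_t in[256], coeff[256];
    uint8_t dst[256], src[256];
    /* ... fill src and dst with random bytes ... */
    for (int j = 0; j < 256; ++j)
      in[j] = src[j] - dst[j];                  /* residual in [-255, 255] */
    vp9_short_fdct16x16_c(in, coeff, 32);       /* pitch counted in bytes  */
    vp9_short_idct16x16_add_c(coeff, dst, 16);  /* dst += idct(coeff), clipped */
    for (int j = 0; j < 256; ++j)
      assert(abs(dst[j] - src[j]) <= 1);        /* round-trip error bound  */
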
diff --git a/test/dct32x32_test.cc b/test/dct32x32_test.cc
index a565270..e05d482 100644
--- a/test/dct32x32_test.cc
+++ b/test/dct32x32_test.cc
@@ -18,7 +18,7 @@
 #include "vp9/common/vp9_entropy.h"
 #include "./vp9_rtcd.h"
   void vp9_short_fdct32x32_c(int16_t *input, int16_t *out, int pitch);
-  void vp9_short_idct32x32_c(short *input, short *output, int pitch);
+  void vp9_short_idct32x32_add_c(int16_t *input, uint8_t *output, int dest_stride);
 }
 
 #include "test/acm_random.h"
@@ -91,28 +91,31 @@
   }
 }
 
-
 TEST(VP9Idct32x32Test, AccuracyCheck) {
   ACMRandom rnd(ACMRandom::DeterministicSeed());
   const int count_test_block = 1000;
   for (int i = 0; i < count_test_block; ++i) {
     int16_t in[1024], coeff[1024];
-    int16_t out_c[1024];
+    uint8_t dst[1024], src[1024];
     double out_r[1024];
 
+    for (int j = 0; j < 1024; ++j) {
+      src[j] = rnd.Rand8();
+      dst[j] = rnd.Rand8();
+    }
     // Initialize a test block with input range [-255, 255].
     for (int j = 0; j < 1024; ++j)
-      in[j] = rnd.Rand8() - rnd.Rand8();
+      in[j] = src[j] - dst[j];
 
     reference_32x32_dct_2d(in, out_r);
     for (int j = 0; j < 1024; j++)
       coeff[j] = round(out_r[j]);
-    vp9_short_idct32x32_c(coeff, out_c, 64);
+    vp9_short_idct32x32_add_c(coeff, dst, 32);
     for (int j = 0; j < 1024; ++j) {
-      const int diff = out_c[j] - in[j];
+      const int diff = dst[j] - src[j];
       const int error = diff * diff;
       EXPECT_GE(1, error)
-          << "Error: 3x32 IDCT has error " << error
+          << "Error: 32x32 IDCT has error " << error
           << " at index " << j;
     }
   }
@@ -126,18 +129,22 @@
   for (int i = 0; i < count_test_block; ++i) {
     int16_t test_input_block[1024];
     int16_t test_temp_block[1024];
-    int16_t test_output_block[1024];
+    uint8_t dst[1024], src[1024];
 
+    for (int j = 0; j < 1024; ++j) {
+      src[j] = rnd.Rand8();
+      dst[j] = rnd.Rand8();
+    }
     // Initialize a test block with input range [-255, 255].
     for (int j = 0; j < 1024; ++j)
-      test_input_block[j] = rnd.Rand8() - rnd.Rand8();
+      test_input_block[j] = src[j] - dst[j];
 
     const int pitch = 64;
     vp9_short_fdct32x32_c(test_input_block, test_temp_block, pitch);
-    vp9_short_idct32x32_c(test_temp_block, test_output_block, pitch);
+    vp9_short_idct32x32_add_c(test_temp_block, dst, 32);
 
     for (int j = 0; j < 1024; ++j) {
-      const unsigned diff = test_input_block[j] - test_output_block[j];
+      const unsigned diff = dst[j] - src[j];
       const unsigned error = diff * diff;
       if (max_error < error)
         max_error = error;
diff --git a/vp9/common/vp9_blockd.h b/vp9/common/vp9_blockd.h
index ab9e28d..07607d8 100644
--- a/vp9/common/vp9_blockd.h
+++ b/vp9/common/vp9_blockd.h
@@ -70,17 +70,17 @@
 } INTERPOLATIONFILTERTYPE;
 
 typedef enum {
-  DC_PRED,            /* average of above and left pixels */
-  V_PRED,             /* vertical prediction */
-  H_PRED,             /* horizontal prediction */
-  D45_PRED,           /* Directional 45 deg prediction  [anti-clockwise from 0 deg hor] */
-  D135_PRED,          /* Directional 135 deg prediction [anti-clockwise from 0 deg hor] */
-  D117_PRED,          /* Directional 112 deg prediction [anti-clockwise from 0 deg hor] */
-  D153_PRED,          /* Directional 157 deg prediction [anti-clockwise from 0 deg hor] */
-  D27_PRED,           /* Directional 22 deg prediction  [anti-clockwise from 0 deg hor] */
-  D63_PRED,           /* Directional 67 deg prediction  [anti-clockwise from 0 deg hor] */
-  TM_PRED,            /* Truemotion prediction */
-  I4X4_PRED,          /* 4x4 based prediction, each 4x4 has its own mode */
+  DC_PRED,         // Average of above and left pixels
+  V_PRED,          // Vertical
+  H_PRED,          // Horizontal
+  D45_PRED,        // Directional 45  deg = round(arctan(1/1) * 180/pi)
+  D135_PRED,       // Directional 135 deg = 180 - 45
+  D117_PRED,       // Directional 117 deg = 180 - 63
+  D153_PRED,       // Directional 153 deg = 180 - 27
+  D27_PRED,        // Directional 27  deg = round(arctan(1/2) * 180/pi)
+  D63_PRED,        // Directional 63  deg = round(arctan(2/1) * 180/pi)
+  TM_PRED,         // True-motion
+  I4X4_PRED,       // Each 4x4 subblock has its own mode
   NEARESTMV,
   NEARMV,
   ZEROMV,
@@ -222,12 +222,21 @@
 
 static INLINE int mi_width_log2(BLOCK_SIZE_TYPE sb_type) {
   int a = b_width_log2(sb_type) - 1;
+#if CONFIG_AB4X4
+  // align 4x4 block to the 8x8 mode_info unit
+  if (a < 0)
+    a = 0;
+#endif
   assert(a >= 0);
   return a;
 }
 
 static INLINE int mi_height_log2(BLOCK_SIZE_TYPE sb_type) {
   int a = b_height_log2(sb_type) - 1;
+#if CONFIG_AB4X4
+  if (a < 0)
+    a = 0;
+#endif
   assert(a >= 0);
   return a;
 }
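
With CONFIG_AB4X4, block sizes narrower or shorter than 8 samples would make b_width_log2(sb_type) - 1 negative, since mode_info units are 8x8; the new clamp pins the result to zero. A hedged illustration, assuming b_width_log2() returns 0 for 4-wide sizes, consistent with the rest of this patch:

    int a = b_width_log2(BLOCK_SIZE_AB4X4) - 1;  /* -1 before the clamp */
    #if CONFIG_AB4X4
    if (a < 0)
      a = 0;            /* a 4x4 block still spans one 8x8 mode_info unit */
    #endif
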
@@ -401,10 +410,39 @@
   int sb_index;   // index of 32x32 block inside the 64x64 block
   int mb_index;   // index of 16x16 block inside the 32x32 block
   int b_index;    // index of 8x8 block inside the 16x16 block
+#if CONFIG_AB4X4
+  int ab_index;   // index of 4x4 block inside the 8x8 block
+#endif
   int q_index;
 
 } MACROBLOCKD;
 
+static int *get_sb_index(MACROBLOCKD *xd, BLOCK_SIZE_TYPE subsize) {
+  switch (subsize) {
+    case BLOCK_SIZE_SB64X32:
+    case BLOCK_SIZE_SB32X64:
+    case BLOCK_SIZE_SB32X32:
+      return &xd->sb_index;
+    case BLOCK_SIZE_SB32X16:
+    case BLOCK_SIZE_SB16X32:
+    case BLOCK_SIZE_MB16X16:
+      return &xd->mb_index;
+    case BLOCK_SIZE_SB16X8:
+    case BLOCK_SIZE_SB8X16:
+    case BLOCK_SIZE_SB8X8:
+      return &xd->b_index;
+#if CONFIG_AB4X4
+    case BLOCK_SIZE_SB8X4:
+    case BLOCK_SIZE_SB4X8:
+    case BLOCK_SIZE_AB4X4:
+      return &xd->ab_index;
+#endif
+    default:
+      assert(0);
+      return NULL;
+  }
+}
+
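
get_sb_index() maps a child block size onto the recursion counter that tracks which quadrant of the parent block is being coded; the new ab_index extends this one level deeper, to 4x4. A hypothetical call site (the loop shape and the encode_block helper are illustrative only, not part of this patch):

    BLOCK_SIZE_TYPE subsize = get_subsize(bsize, PARTITION_SPLIT);
    int i;
    for (i = 0; i < 4; ++i) {
      *get_sb_index(xd, subsize) = i;   /* e.g. xd->ab_index for 4x4 */
      encode_block(cpi, xd, mi_row, mi_col, subsize);  /* hypothetical */
    }
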
 static INLINE void update_partition_context(MACROBLOCKD *xd,
                                             BLOCK_SIZE_TYPE sb_type,
                                             BLOCK_SIZE_TYPE sb_size) {
@@ -413,9 +451,12 @@
   int bhl = mi_height_log2(sb_type);
   int boffset = mi_width_log2(BLOCK_SIZE_SB64X64) - bsl;
   int i;
-  // skip macroblock partition
+
+#if !CONFIG_AB4X4
+  // skip 8x8 block partition
   if (bsl == 0)
     return;
+#endif
 
   // update the partition context at the end nodes. set partition bits
   // of block sizes larger than the current one to be one, and partition
@@ -463,7 +504,11 @@
   above = (above > 0);
   left  = (left > 0);
 
+#if CONFIG_AB4X4
+  return (left * 2 + above) + bsl * PARTITION_PLOFFSET;
+#else
   return (left * 2 + above) + (bsl - 1) * PARTITION_PLOFFSET;
+#endif
 }
 
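Under CONFIG_AB4X4 the 8x8 level keeps its own probability models instead of being skipped, so the context index uses bsl directly rather than bsl - 1; together with the extra row added to vp9_partition_probs and the enlarged NUM_PARTITION_CONTEXTS below, this gives four size levels times PARTITION_PLOFFSET contexts. A worked example, assuming mi_width_log2(BLOCK_SIZE_SB32X32) == 2:

    /* 32x32 block, left neighbour partitioned, above neighbour not: */
    /*   (left * 2 + above) + bsl * PARTITION_PLOFFSET               */
    /* = (1 * 2 + 0)        + 2   * 4                 = 10           */
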
 static BLOCK_SIZE_TYPE get_subsize(BLOCK_SIZE_TYPE bsize,
@@ -480,6 +525,10 @@
         subsize = BLOCK_SIZE_SB32X16;
       else if (bsize == BLOCK_SIZE_MB16X16)
         subsize = BLOCK_SIZE_SB16X8;
+#if CONFIG_AB4X4
+      else if (bsize == BLOCK_SIZE_SB8X8)
+        subsize = BLOCK_SIZE_SB8X4;
+#endif
       else
         assert(0);
       break;
@@ -490,6 +539,10 @@
         subsize = BLOCK_SIZE_SB16X32;
       else if (bsize == BLOCK_SIZE_MB16X16)
         subsize = BLOCK_SIZE_SB8X16;
+#if CONFIG_AB4X4
+      else if (bsize == BLOCK_SIZE_SB8X8)
+        subsize = BLOCK_SIZE_SB4X8;
+#endif
       else
         assert(0);
       break;
@@ -500,6 +553,10 @@
         subsize = BLOCK_SIZE_MB16X16;
       else if (bsize == BLOCK_SIZE_MB16X16)
         subsize = BLOCK_SIZE_SB8X8;
+#if CONFIG_AB4X4
+      else if (bsize == BLOCK_SIZE_SB8X8)
+        subsize = BLOCK_SIZE_AB4X4;
+#endif
       else
         assert(0);
       break;
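
With CONFIG_AB4X4 the partition ladder now bottoms out at 4x4. The case labels are elided by the hunk context, but the size mappings identify them:

    BLOCK_SIZE_TYPE s;
    s = get_subsize(BLOCK_SIZE_SB8X8, PARTITION_HORZ);   /* BLOCK_SIZE_SB8X4 */
    s = get_subsize(BLOCK_SIZE_SB8X8, PARTITION_VERT);   /* BLOCK_SIZE_SB4X8 */
    s = get_subsize(BLOCK_SIZE_SB8X8, PARTITION_SPLIT);  /* BLOCK_SIZE_AB4X4 */
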
@@ -543,6 +600,7 @@
 
     case B_V_PRED :
     case B_D117_PRED :
+    case B_D63_PRED:
       return ADST_DCT;
 
     case B_H_PRED :
diff --git a/vp9/common/vp9_common.h b/vp9/common/vp9_common.h
index dbfb9ed..b6252d9 100644
--- a/vp9/common/vp9_common.h
+++ b/vp9/common/vp9_common.h
@@ -52,6 +52,10 @@
   return value < low ? low : (value > high ? high : value);
 }
 
+static INLINE double fclamp(double value, double low, double high) {
+  return value < low ? low : (value > high ? high : value);
+}
+
 static INLINE int multiple16(int value) {
   return (value + 15) & ~15;
 }
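
fclamp() is the floating-point twin of the integer clamp() above, e.g.:

    fclamp(0.35, 0.0, 1.0);   /* 0.35 (in range)     */
    fclamp(-2.0, 0.0, 1.0);   /* 0.0  (clipped low)  */
    fclamp(9.99, 0.0, 1.0);   /* 1.0  (clipped high) */
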
diff --git a/vp9/common/vp9_entropy.c b/vp9/common/vp9_entropy.c
index aef6871..532e5d3 100644
--- a/vp9/common/vp9_entropy.c
+++ b/vp9/common/vp9_entropy.c
@@ -46,6 +46,13 @@
   5, 5, 5, 5, 5, 5, 5, 5,
   5, 5, 5, 5, 5, 5, 5, 5
 };
+
+DECLARE_ALIGNED(16, const uint8_t,
+                vp9_coefband_trans_8x8plus[MAXBAND_INDEX + 1]) = {
+  0, 1, 1, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4,
+  4, 4, 4, 4, 4, 5
+};
+
 DECLARE_ALIGNED(16, const int, vp9_coef_bands4x4[16]) = {
   0, 1, 2, 3,
   1, 2, 3, 4,
@@ -53,6 +60,12 @@
   3, 4, 5, 5
 };
 
+DECLARE_ALIGNED(16, const uint8_t,
+                vp9_coefband_trans_4x4[MAXBAND_INDEX + 1]) = {
+  0, 1, 1, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4,
+  4, 4, 4, 4, 4, 5
+};
+
 DECLARE_ALIGNED(16, const uint8_t, vp9_pt_energy_class[MAX_ENTROPY_TOKENS]) = {
   0, 1, 2, 3, 3, 4, 4, 5, 5, 5, 5, 5
 };
diff --git a/vp9/common/vp9_entropy.h b/vp9/common/vp9_entropy.h
index 579313f..9352bf6 100644
--- a/vp9/common/vp9_entropy.h
+++ b/vp9/common/vp9_entropy.h
@@ -133,20 +133,20 @@
 
 extern const int vp9_coef_bands8x8[64];
 extern const int vp9_coef_bands4x4[16];
+extern const uint8_t vp9_coefband_trans_8x8plus[22];
+extern const uint8_t vp9_coefband_trans_4x4[22];
 
-static int get_coef_band(const int *scan, TX_SIZE tx_size, int coef_index) {
-  if (tx_size == TX_4X4) {
-    return vp9_coef_bands4x4[scan[coef_index]];
-  } else {
-    const int pos = scan[coef_index];
-    const int sz = 1 << (2 + tx_size);
-    const int x = pos & (sz - 1), y = pos >> (2 + tx_size);
-    if (x >= 8 || y >= 8)
-      return 5;
-    else
-      return vp9_coef_bands8x8[y * 8 + x];
-  }
+// This is the index in the scan order beyond which all coefficients for
+// 8x8 transforms and above are in the top band.
+// For 4x4 blocks the index is lower, but to keep things common the lookup
+// table for 4x4 is padded out to this index.
+#define MAXBAND_INDEX 21
+
+static int get_coef_band(const uint8_t *band_translate, int coef_index) {
+  return (coef_index > MAXBAND_INDEX)
+      ? (COEF_BANDS - 1) : band_translate[coef_index];
 }
+
 extern int vp9_get_coef_context(const int *scan, const int *neighbors,
                                 int nb_pad, uint8_t *token_cache, int c, int l);
 const int *vp9_get_coef_neighbors_handle(const int *scan, int *pad);
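
Callers now pick the translation table once per block and index it by scan position, instead of recomputing the band from transform-space coordinates for every coefficient. A sketch of the intended call pattern (the tx_size selection is an assumption based on the two tables added in vp9_entropy.c; eob and tx_size come from the surrounding context):

    const uint8_t *band_translate = (tx_size == TX_4X4)
        ? vp9_coefband_trans_4x4 : vp9_coefband_trans_8x8plus;
    int c, band;
    for (c = 0; c < eob; ++c)
      band = get_coef_band(band_translate, c);  /* band of c-th coefficient */
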
diff --git a/vp9/common/vp9_entropymode.c b/vp9/common/vp9_entropymode.c
index dcee62f..577aab5 100644
--- a/vp9/common/vp9_entropymode.c
+++ b/vp9/common/vp9_entropymode.c
@@ -106,6 +106,12 @@
 const vp9_prob vp9_partition_probs[NUM_PARTITION_CONTEXTS]
                                   [PARTITION_TYPES - 1] = {
   // FIXME(jingning,rbultje) put real probabilities here
+#if CONFIG_AB4X4
+  {202, 162, 107},
+  {16,  2,   169},
+  {3,   246,  19},
+  {104, 90,  134},
+#endif
   {202, 162, 107},
   {16,  2,   169},
   {3,   246,  19},
@@ -513,6 +519,7 @@
                       vp9_sub_mv_ref_tree, fc->sub_mv_ref_counts[i],
                       fc->pre_sub_mv_ref_prob[i], fc->sub_mv_ref_prob[i],
                       LEFT4X4);
+
   for (i = 0; i < NUM_PARTITION_CONTEXTS; i++)
     update_mode_probs(PARTITION_TYPES, vp9_partition_tree,
                       fc->partition_counts[i], fc->pre_partition_prob[i],
diff --git a/vp9/common/vp9_enums.h b/vp9/common/vp9_enums.h
index 1663195..626f0b9 100644
--- a/vp9/common/vp9_enums.h
+++ b/vp9/common/vp9_enums.h
@@ -36,6 +36,7 @@
   BLOCK_SIZE_SB32X64,
   BLOCK_SIZE_SB64X32,
   BLOCK_SIZE_SB64X64,
+  BLOCK_SIZE_TYPES
 } BLOCK_SIZE_TYPE;
 
 typedef enum PARTITION_TYPE {
@@ -47,6 +48,10 @@
 } PARTITION_TYPE;
 
 #define PARTITION_PLOFFSET   4  // number of probability models per block size
+#if CONFIG_AB4X4
+#define NUM_PARTITION_CONTEXTS (4 * PARTITION_PLOFFSET)
+#else
 #define NUM_PARTITION_CONTEXTS (3 * PARTITION_PLOFFSET)
+#endif
 
 #endif  // VP9_COMMON_VP9_ENUMS_H_
diff --git a/vp9/common/vp9_idct.c b/vp9/common/vp9_idct.c
index 3ec093f..b166fcb 100644
--- a/vp9/common/vp9_idct.c
+++ b/vp9/common/vp9_idct.c
@@ -621,10 +621,9 @@
   output[15] = step2[0] - step2[15];
 }
 
-void vp9_short_idct16x16_c(int16_t *input, int16_t *output, int pitch) {
+void vp9_short_idct16x16_add_c(int16_t *input, uint8_t *dest, int dest_stride) {
   int16_t out[16 * 16];
   int16_t *outptr = out;
-  const int half_pitch = pitch >> 1;
   int i, j;
   int16_t temp_in[16], temp_out[16];
 
@@ -641,7 +640,8 @@
       temp_in[j] = out[j * 16 + i];
     idct16_1d(temp_in, temp_out);
     for (j = 0; j < 16; ++j)
-      output[j * half_pitch + i] = ROUND_POWER_OF_TWO(temp_out[j], 6);
+      dest[j * dest_stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6)
+                                  + dest[j * dest_stride + i]);
   }
 }
 
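The _add variants fold what used to be a separate add-residual pass into the inverse transform itself: each 1D column output is rounded, added to the 8-bit prediction already in dest, and saturated, so no 16-bit intermediate block is written out. Schematically (the two-pass form matches the vp9_add_residual_16x16 prototype removed from vp9_rtcd_defs.sh below):

    /* before: two passes through memory, 16-bit intermediate */
    vp9_short_idct16x16_c(coeff, residual, 32);      /* pitch in bytes */
    vp9_add_residual_16x16(residual, dest, stride);  /* add + clip     */

    /* after: one pass, accumulate straight into the 8-bit frame buffer */
    vp9_short_idct16x16_add_c(coeff, dest, stride);
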
@@ -823,8 +823,8 @@
   { iadst16_1d, iadst16_1d }   // ADST_ADST = 3
 };
 
-void vp9_short_iht16x16_c(int16_t *input, int16_t *output,
-                          int pitch, int tx_type) {
+void vp9_short_iht16x16_add_c(int16_t *input, uint8_t *dest, int dest_stride,
+                              int tx_type) {
   int i, j;
   int16_t out[16 * 16];
   int16_t *outptr = out;
@@ -844,38 +844,38 @@
       temp_in[j] = out[j * 16 + i];
     ht.cols(temp_in, temp_out);
     for (j = 0; j < 16; ++j)
-      output[j * pitch + i] = ROUND_POWER_OF_TWO(temp_out[j], 6);
+      dest[j * dest_stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6)
+                                  + dest[j * dest_stride + i]);
+  }
+}
+
+void vp9_short_idct10_16x16_add_c(int16_t *input, uint8_t *dest,
+                                  int dest_stride) {
+  int16_t out[16 * 16];
+  int16_t *outptr = out;
+  int i, j;
+  int16_t temp_in[16], temp_out[16];
+
+  /* First transform rows. Since all non-zero dct coefficients are in
+   * upper-left 4x4 area, we only need to calculate first 4 rows here.
+   */
+  vpx_memset(out, 0, sizeof(out));
+  for (i = 0; i < 4; ++i) {
+    idct16_1d(input, outptr);
+    input += 16;
+    outptr += 16;
+  }
+
+  // Then transform columns
+  for (i = 0; i < 16; ++i) {
+    for (j = 0; j < 16; ++j)
+      temp_in[j] = out[j*16 + i];
+    idct16_1d(temp_in, temp_out);
+    for (j = 0; j < 16; ++j)
+      dest[j * dest_stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6)
+                                  + dest[j * dest_stride + i]);
   }
 }
 
-void vp9_short_idct10_16x16_c(int16_t *input, int16_t *output, int pitch) {
-    int16_t out[16 * 16];
-    int16_t *outptr = out;
-    const int half_pitch = pitch >> 1;
-    int i, j;
-    int16_t temp_in[16], temp_out[16];
-
-    /* First transform rows. Since all non-zero dct coefficients are in
-     * upper-left 4x4 area, we only need to calculate first 4 rows here.
-     */
-    vpx_memset(out, 0, sizeof(out));
-    for (i = 0; i < 4; ++i) {
-      idct16_1d(input, outptr);
-      input += 16;
-      outptr += 16;
-    }
-
-    // Then transform columns
-    for (i = 0; i < 16; ++i) {
-      for (j = 0; j < 16; ++j)
-        temp_in[j] = out[j*16 + i];
-      idct16_1d(temp_in, temp_out);
-      for (j = 0; j < 16; ++j)
-        output[j * half_pitch + i] = ROUND_POWER_OF_TWO(temp_out[j], 6);
-    }
-}
-
-
 void vp9_short_idct1_16x16_c(int16_t *input, int16_t *output) {
   int16_t out = dct_const_round_shift(input[0] * cospi_16_64);
   out = dct_const_round_shift(out * cospi_16_64);
@@ -1249,10 +1249,9 @@
   output[31] = step1[0] - step1[31];
 }
 
-void vp9_short_idct32x32_c(int16_t *input, int16_t *output, int pitch) {
+void vp9_short_idct32x32_add_c(int16_t *input, uint8_t *dest, int dest_stride) {
   int16_t out[32 * 32];
   int16_t *outptr = out;
-  const int half_pitch = pitch >> 1;
   int i, j;
   int16_t temp_in[32], temp_out[32];
 
@@ -1269,7 +1268,8 @@
       temp_in[j] = out[j * 32 + i];
     idct32_1d(temp_in, temp_out);
     for (j = 0; j < 32; ++j)
-      output[j * half_pitch + i] = ROUND_POWER_OF_TWO(temp_out[j], 6);
+      dest[j * dest_stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6)
+                                  + dest[j * dest_stride + i]);
   }
 }
 
@@ -1279,10 +1279,10 @@
   output[0] = ROUND_POWER_OF_TWO(out, 6);
 }
 
-void vp9_short_idct10_32x32_c(int16_t *input, int16_t *output, int pitch) {
+void vp9_short_idct10_32x32_add_c(int16_t *input, uint8_t *dest,
+                                  int dest_stride) {
   int16_t out[32 * 32];
   int16_t *outptr = out;
-  const int half_pitch = pitch >> 1;
   int i, j;
   int16_t temp_in[32], temp_out[32];
 
@@ -1302,6 +1302,7 @@
       temp_in[j] = out[j * 32 + i];
     idct32_1d(temp_in, temp_out);
     for (j = 0; j < 32; ++j)
-      output[j * half_pitch + i] = ROUND_POWER_OF_TWO(temp_out[j], 6);
+      dest[j * dest_stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6)
+                                  + dest[j * dest_stride + i]);
   }
 }
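
Every destination write in these routines uses the same idiom: round the column output by 2^6, add the prediction pixel, saturate to 8 bits. For reference, the helpers behave as sketched below (matching their definitions in vp9/common/vp9_common.h):

    #define ROUND_POWER_OF_TWO(value, n) (((value) + (1 << ((n) - 1))) >> (n))

    static INLINE uint8_t clip_pixel(int val) {
      return (val > 255) ? 255u : (val < 0) ? 0u : (uint8_t)val;
    }

    /* so: dest[x] = clip_pixel(((col_out + 32) >> 6) + dest[x]); */
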
diff --git a/vp9/common/vp9_loopfilter.h b/vp9/common/vp9_loopfilter.h
index 80fccd5..589984f 100644
--- a/vp9/common/vp9_loopfilter.h
+++ b/vp9/common/vp9_loopfilter.h
@@ -49,9 +49,6 @@
   void sym(uint8_t *y, uint8_t *u, uint8_t *v, \
            int ystride, int uv_stride, struct loop_filter_info *lfi)
 
-#define prototype_simple_loopfilter(sym) \
-  void sym(uint8_t *y, int ystride, const unsigned char *blimit)
-
 #if ARCH_X86 || ARCH_X86_64
 #include "x86/vp9_loopfilter_x86.h"
 #endif
diff --git a/vp9/common/vp9_loopfilter_filters.c b/vp9/common/vp9_loopfilter_filters.c
index 15785f5..fc7fbc4 100644
--- a/vp9/common/vp9_loopfilter_filters.c
+++ b/vp9/common/vp9_loopfilter_filters.c
@@ -8,15 +8,13 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#include <stdlib.h>
 #include "vpx_config.h"
+#include "vp9/common/vp9_common.h"
 #include "vp9/common/vp9_loopfilter.h"
 #include "vp9/common/vp9_onyxc_int.h"
 
 static INLINE int8_t signed_char_clamp(int t) {
-  t = (t < -128 ? -128 : t);
-  t = (t > 127 ? 127 : t);
-  return (int8_t) t;
+  return (int8_t)clamp(t, -128, 127);
 }
 
 // should we apply any filter at all: 11111111 yes, 00000000 no
@@ -36,7 +34,7 @@
   return ~mask;
 }
 
-// is there high variance internal edge: 11111111 yes, 00000000 no
+// is there high edge variance internal edge: 11111111 yes, 00000000 no
 static INLINE int8_t hevmask(uint8_t thresh, uint8_t p1, uint8_t p0,
                              uint8_t q0, uint8_t q1) {
   int8_t hev = 0;
@@ -68,12 +66,9 @@
 
   *oq0 = signed_char_clamp(qs0 - filter1) ^ 0x80;
   *op0 = signed_char_clamp(ps0 + filter2) ^ 0x80;
-  filter = filter1;
 
   // outer tap adjustments
-  filter += 1;
-  filter >>= 1;
-  filter &= ~hev;
+  filter = ((filter1 + 1) >> 1) & ~hev;
 
   *oq1 = signed_char_clamp(qs1 - filter) ^ 0x80;
   *op1 = signed_char_clamp(ps1 + filter) ^ 0x80;
@@ -84,23 +79,19 @@
                                        const uint8_t *limit,
                                        const uint8_t *thresh,
                                        int count) {
-  int i = 0;
+  int i;
 
   // loop filter designed to work using chars so that we can make maximum use
   // of 8 bit simd instructions.
-  do {
-    const int8_t mask = filter_mask(limit[0], blimit[0],
-                                    s[-4 * p], s[-3 * p], s[-2 * p], s[-1 * p],
-                                    s[0 * p],  s[1 * p],  s[2 * p],  s[3 * p]);
-
-    // high edge variance
-    const int8_t hev = hevmask(thresh[0],
-                               s[-2 * p], s[-1 * p], s[0 * p], s[1 * p]);
-
+  for (i = 0; i < 8 * count; ++i) {
+    const uint8_t p3 = s[-4 * p], p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p];
+    const uint8_t q0 = s[0 * p],  q1 = s[1 * p],  q2 = s[2 * p],  q3 = s[3 * p];
+    const int8_t mask = filter_mask(*limit, *blimit,
+                                    p3, p2, p1, p0, q0, q1, q2, q3);
+    const int8_t hev = hevmask(*thresh, p1, p0, q0, q1);
     filter(mask, hev, s - 2 * p, s - 1 * p, s, s + 1 * p);
-
     ++s;
-  } while (++i < count * 8);
+  }
 }
 
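The loops here were restructured from do/while into plain for loops over 8 * count pixels, with the eight taps hoisted into named p3..p0/q0..q3 locals before feeding the predicates. For orientation, filter_mask (whose tail, "return ~mask;", is visible in the hunk above) accumulates one all-ones veto per tap pair that exceeds limit, plus the blimit edge test, roughly:

    static INLINE int8_t filter_mask(uint8_t limit, uint8_t blimit,
                                     uint8_t p3, uint8_t p2,
                                     uint8_t p1, uint8_t p0,
                                     uint8_t q0, uint8_t q1,
                                     uint8_t q2, uint8_t q3) {
      int8_t mask = 0;
      mask |= (abs(p3 - p2) > limit) * -1;   /* -1 == 11111111: veto */
      mask |= (abs(p2 - p1) > limit) * -1;
      mask |= (abs(p1 - p0) > limit) * -1;
      mask |= (abs(q1 - q0) > limit) * -1;
      mask |= (abs(q2 - q1) > limit) * -1;
      mask |= (abs(q3 - q2) > limit) * -1;
      mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
      return ~mask;                          /* 11111111 = apply filter */
    }
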
 void vp9_loop_filter_vertical_edge_c(uint8_t *s, int pitch,
@@ -108,21 +99,21 @@
                                      const uint8_t *limit,
                                      const uint8_t *thresh,
                                      int count) {
-  int i = 0;
+  int i;
 
   // loop filter designed to work using chars so that we can make maximum use
   // of 8 bit simd instructions.
-  do {
-    const int8_t mask = filter_mask(limit[0], blimit[0],
-                                    s[-4], s[-3], s[-2], s[-1],
-                                    s[0],  s[1],  s[2],  s[3]);
-
-    // high edge variance
-    const int8_t hev = hevmask(thresh[0], s[-2], s[-1], s[0], s[1]);
+  for (i = 0; i < 8 * count; ++i) {
+    const uint8_t p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 = s[-1];
+    const uint8_t q0 = s[0],  q1 = s[1],  q2 = s[2],  q3 = s[3];
+    const int8_t mask = filter_mask(*limit, *blimit,
+                                    p3, p2, p1, p0, q0, q1, q2, q3);
+    const int8_t hev = hevmask(*thresh, p1, p0, q0, q1);
     filter(mask, hev, s - 2, s - 1, s, s + 1);
     s += pitch;
-  } while (++i < count * 8);
+  }
 }
+
 static INLINE int8_t flatmask4(uint8_t thresh,
                                uint8_t p3, uint8_t p2,
                                uint8_t p1, uint8_t p0,
@@ -157,14 +148,8 @@
                             uint8_t *oq2, uint8_t *oq3) {
   // use a 7 tap filter [1, 1, 1, 2, 1, 1, 1] for flat line
   if (flat && mask) {
-    const uint8_t p3 = *op3;
-    const uint8_t p2 = *op2;
-    const uint8_t p1 = *op1;
-    const uint8_t p0 = *op0;
-    const uint8_t q0 = *oq0;
-    const uint8_t q1 = *oq1;
-    const uint8_t q2 = *oq2;
-    const uint8_t q3 = *oq3;
+    const uint8_t p3 = *op3, p2 = *op2, p1 = *op1, p0 = *op0;
+    const uint8_t q0 = *oq0, q1 = *oq1, q2 = *oq2, q3 = *oq3;
 
     *op2 = ROUND_POWER_OF_TWO(p3 + p3 + p3 + p2 + p2 + p1 + p0 + q0, 3);
     *op1 = ROUND_POWER_OF_TWO(p3 + p3 + p2 + p1 + p1 + p0 + q0 + q1, 3);
@@ -173,33 +158,7 @@
     *oq1 = ROUND_POWER_OF_TWO(p1 + p0 + q0 + q1 + q1 + q2 + q3 + q3, 3);
     *oq2 = ROUND_POWER_OF_TWO(p0 + q0 + q1 + q2 + q2 + q3 + q3 + q3, 3);
   } else {
-    int8_t filter1, filter2;
-
-    const int8_t ps1 = (int8_t) *op1 ^ 0x80;
-    const int8_t ps0 = (int8_t) *op0 ^ 0x80;
-    const int8_t qs0 = (int8_t) *oq0 ^ 0x80;
-    const int8_t qs1 = (int8_t) *oq1 ^ 0x80;
-
-    // add outer taps if we have high edge variance
-    int8_t filter = signed_char_clamp(ps1 - qs1) & hev;
-
-    // inner taps
-    filter = signed_char_clamp(filter + 3 * (qs0 - ps0)) & mask;
-
-    filter1 = signed_char_clamp(filter + 4) >> 3;
-    filter2 = signed_char_clamp(filter + 3) >> 3;
-
-    *oq0 = signed_char_clamp(qs0 - filter1) ^ 0x80;
-    *op0 = signed_char_clamp(ps0 + filter2) ^ 0x80;
-    filter = filter1;
-
-    // outer tap adjustments
-    filter += 1;
-    filter >>= 1;
-    filter &= ~hev;
-
-    *oq1 = signed_char_clamp(qs1 - filter) ^ 0x80;
-    *op1 = signed_char_clamp(ps1 + filter) ^ 0x80;
+    filter(mask, hev, op1,  op0, oq0, oq1);
   }
 }
 
@@ -208,28 +167,23 @@
                                          const uint8_t *limit,
                                          const uint8_t *thresh,
                                          int count) {
-  int i = 0;
+  int i;
 
   // loop filter designed to work using chars so that we can make maximum use
   // of 8 bit simd instructions.
-  do {
-    const int8_t mask = filter_mask(limit[0], blimit[0],
-                                    s[-4 * p], s[-3 * p], s[-2 * p], s[-1 * p],
-                                    s[ 0 * p], s[ 1 * p], s[ 2 * p], s[ 3 * p]);
+  for (i = 0; i < 8 * count; ++i) {
+    const uint8_t p3 = s[-4 * p], p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p];
+    const uint8_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p], q3 = s[3 * p];
 
-    const int8_t hev = hevmask(thresh[0],
-                               s[-2 * p], s[-1 * p], s[0 * p], s[1 * p]);
-
-    const int8_t flat = flatmask4(1,
-                                  s[-4 * p], s[-3 * p], s[-2 * p], s[-1 * p],
-                                  s[ 0 * p], s[ 1 * p], s[ 2 * p], s[ 3 * p]);
+    const int8_t mask = filter_mask(*limit, *blimit,
+                                    p3, p2, p1, p0, q0, q1, q2, q3);
+    const int8_t hev = hevmask(*thresh, p1, p0, q0, q1);
+    const int8_t flat = flatmask4(1, p3, p2, p1, p0, q0, q1, q2, q3);
     mbfilter(mask, hev, flat,
              s - 4 * p, s - 3 * p, s - 2 * p, s - 1 * p,
              s,         s + 1 * p, s + 2 * p, s + 3 * p);
-
     ++s;
-  } while (++i < count * 8);
-
+  }
 }
 
 void vp9_mbloop_filter_vertical_edge_c(uint8_t *s, int pitch,
@@ -237,72 +191,19 @@
                                        const uint8_t *limit,
                                        const uint8_t *thresh,
                                        int count) {
-  int i = 0;
+  int i;
 
-  do {
-    const int8_t mask = filter_mask(limit[0], blimit[0],
-                                    s[-4], s[-3], s[-2], s[-1],
-                                    s[0],  s[1],  s[2],  s[3]);
-
-    const int8_t hev = hevmask(thresh[0], s[-2], s[-1], s[0], s[1]);
-    const int8_t flat = flatmask4(1, s[-4], s[-3], s[-2], s[-1],
-                                     s[ 0], s[ 1], s[ 2], s[ 3]);
+  for (i = 0; i < 8 * count; ++i) {
+    const uint8_t p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 = s[-1];
+    const uint8_t q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3];
+    const int8_t mask = filter_mask(*limit, *blimit,
+                                    p3, p2, p1, p0, q0, q1, q2, q3);
+    const int8_t hev = hevmask(*thresh, p1, p0, q0, q1);
+    const int8_t flat = flatmask4(1, p3, p2, p1, p0, q0, q1, q2, q3);
     mbfilter(mask, hev, flat, s - 4, s - 3, s - 2, s - 1,
                               s,     s + 1, s + 2, s + 3);
     s += pitch;
-  } while (++i < count * 8);
-
-}
-
-// should we apply any filter at all: 11111111 yes, 00000000 no
-static INLINE int8_t simple_filter_mask(uint8_t blimit,
-                                        uint8_t p1, uint8_t p0,
-                                        uint8_t q0, uint8_t q1) {
-  return (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  <= blimit) * -1;
-}
-
-static INLINE void simple_filter(int8_t mask,
-                                 uint8_t *op1, uint8_t *op0,
-                                 uint8_t *oq0, uint8_t *oq1) {
-  int8_t filter1, filter2;
-  const int8_t p1 = (int8_t) *op1 ^ 0x80;
-  const int8_t p0 = (int8_t) *op0 ^ 0x80;
-  const int8_t q0 = (int8_t) *oq0 ^ 0x80;
-  const int8_t q1 = (int8_t) *oq1 ^ 0x80;
-
-  int8_t filter = signed_char_clamp(p1 - q1);
-  filter = signed_char_clamp(filter + 3 * (q0 - p0));
-  filter &= mask;
-
-  // save bottom 3 bits so that we round one side +4 and the other +3
-  filter1 = signed_char_clamp(filter + 4) >> 3;
-  *oq0  = signed_char_clamp(q0 - filter1) ^ 0x80;
-
-  filter2 = signed_char_clamp(filter + 3) >> 3;
-  *op0 = signed_char_clamp(p0 + filter2) ^ 0x80;
-}
-
-void vp9_loop_filter_simple_horizontal_edge_c(uint8_t *s, int p,
-                                              const uint8_t *blimit) {
-  int i = 0;
-
-  do {
-    const int8_t mask = simple_filter_mask(blimit[0], s[-2 * p], s[-1 * p],
-                                                      s[0 * p],  s[1 * p]);
-    simple_filter(mask, s - 2 * p, s - 1 * p, s, s + 1 * p);
-    ++s;
-  } while (++i < 16);
-}
-
-void vp9_loop_filter_simple_vertical_edge_c(uint8_t *s, int p,
-                                            const uint8_t *blimit) {
-  int i = 0;
-
-  do {
-    const int8_t mask = simple_filter_mask(blimit[0], s[-2], s[-1], s[0], s[1]);
-    simple_filter(mask, s - 2, s - 1, s, s + 1);
-    s += p;
-  } while (++i < 16);
+  }
 }
 
 /* Vertical MB Filtering */
@@ -392,11 +293,6 @@
                                       lfi->blim, lfi->lim, lfi->hev_thr, 1);
 }
 
-void vp9_loop_filter_bhs_c(uint8_t *y, int y_stride, const uint8_t *blimit) {
-  vp9_loop_filter_simple_horizontal_edge_c(y + 4 * y_stride, y_stride, blimit);
-  vp9_loop_filter_simple_horizontal_edge_c(y + 8 * y_stride, y_stride, blimit);
-  vp9_loop_filter_simple_horizontal_edge_c(y + 12 * y_stride, y_stride, blimit);
-}
 
 void vp9_loop_filter_bv8x8_c(uint8_t *y, uint8_t *u, uint8_t *v,
                              int y_stride, int uv_stride,
@@ -413,12 +309,6 @@
                                     lfi->blim, lfi->lim, lfi->hev_thr, 1);
 }
 
-void vp9_loop_filter_bvs_c(uint8_t *y, int y_stride, const uint8_t *blimit) {
-  vp9_loop_filter_simple_vertical_edge_c(y + 4, y_stride, blimit);
-  vp9_loop_filter_simple_vertical_edge_c(y + 8, y_stride, blimit);
-  vp9_loop_filter_simple_vertical_edge_c(y + 12, y_stride, blimit);
-}
-
 static INLINE void wide_mbfilter(int8_t mask, uint8_t hev,
                                  uint8_t flat, uint8_t flat2,
                                  uint8_t *op7, uint8_t *op6, uint8_t *op5,
@@ -429,22 +319,11 @@
                                  uint8_t *oq7) {
   // use a 15 tap filter [1,1,1,1,1,1,1,2,1,1,1,1,1,1,1] for flat line
   if (flat2 && flat && mask) {
-    const uint8_t p7 = *op7;
-    const uint8_t p6 = *op6;
-    const uint8_t p5 = *op5;
-    const uint8_t p4 = *op4;
-    const uint8_t p3 = *op3;
-    const uint8_t p2 = *op2;
-    const uint8_t p1 = *op1;
-    const uint8_t p0 = *op0;
-    const uint8_t q0 = *oq0;
-    const uint8_t q1 = *oq1;
-    const uint8_t q2 = *oq2;
-    const uint8_t q3 = *oq3;
-    const uint8_t q4 = *oq4;
-    const uint8_t q5 = *oq5;
-    const uint8_t q6 = *oq6;
-    const uint8_t q7 = *oq7;
+    const uint8_t p7 = *op7, p6 = *op6, p5 = *op5, p4 = *op4,
+                  p3 = *op3, p2 = *op2, p1 = *op1, p0 = *op0;
+
+    const uint8_t q0 = *oq0, q1 = *oq1, q2 = *oq2, q3 = *oq3,
+                  q4 = *oq4, q5 = *oq5, q6 = *oq6, q7 = *oq7;
 
     *op6 = ROUND_POWER_OF_TWO(p7 * 7 + p6 * 2 + p5 + p4 + p3 + p2 + p1 + p0 +
                               q0, 4);
@@ -474,49 +353,8 @@
                               q0 + q1 + q2 + q3 + q4 + q5 * 2 + q6 + q7 * 6, 4);
     *oq6 = ROUND_POWER_OF_TWO(p0 +
                               q0 + q1 + q2 + q3 + q4 + q5 + q6 * 2 + q7 * 7, 4);
-  } else if (flat && mask) {
-    const uint8_t p3 = *op3;
-    const uint8_t p2 = *op2;
-    const uint8_t p1 = *op1;
-    const uint8_t p0 = *op0;
-    const uint8_t q0 = *oq0;
-    const uint8_t q1 = *oq1;
-    const uint8_t q2 = *oq2;
-    const uint8_t q3 = *oq3;
-
-    *op2 = ROUND_POWER_OF_TWO(p3 + p3 + p3 + p2 + p2 + p1 + p0 + q0, 3);
-    *op1 = ROUND_POWER_OF_TWO(p3 + p3 + p2 + p1 + p1 + p0 + q0 + q1, 3);
-    *op0 = ROUND_POWER_OF_TWO(p3 + p2 + p1 + p0 + p0 + q0 + q1 + q2, 3);
-    *oq0 = ROUND_POWER_OF_TWO(p2 + p1 + p0 + q0 + q0 + q1 + q2 + q3, 3);
-    *oq1 = ROUND_POWER_OF_TWO(p1 + p0 + q0 + q1 + q1 + q2 + q3 + q3, 3);
-    *oq2 = ROUND_POWER_OF_TWO(p0 + q0 + q1 + q2 + q2 + q3 + q3 + q3, 3);
   } else {
-    int8_t filter1, filter2;
-
-    const int8_t ps1 = (int8_t) * op1 ^ 0x80;
-    const int8_t ps0 = (int8_t) * op0 ^ 0x80;
-    const int8_t qs0 = (int8_t) * oq0 ^ 0x80;
-    const int8_t qs1 = (int8_t) * oq1 ^ 0x80;
-
-    // add outer taps if we have high edge variance
-    int8_t filter = signed_char_clamp(ps1 - qs1) & hev;
-
-    // inner taps
-    filter = signed_char_clamp(filter + 3 * (qs0 - ps0)) & mask;
-    filter1 = signed_char_clamp(filter + 4) >> 3;
-    filter2 = signed_char_clamp(filter + 3) >> 3;
-
-    *oq0 = signed_char_clamp(qs0 - filter1) ^ 0x80;
-    *op0 = signed_char_clamp(ps0 + filter2) ^ 0x80;
-    filter = filter1;
-
-    // outer tap adjustments
-    filter += 1;
-    filter >>= 1;
-    filter &= ~hev;
-
-    *oq1 = signed_char_clamp(qs1 - filter) ^ 0x80;
-    *op1 = signed_char_clamp(ps1 + filter) ^ 0x80;
+    mbfilter(mask, hev, flat, op3, op2, op1, op0, oq0, oq1, oq2, oq3);
   }
 }
 
@@ -525,25 +363,20 @@
                                  const uint8_t *limit,
                                  const uint8_t *thresh,
                                  int count) {
-  int i = 0;
+  int i;
 
   // loop filter designed to work using chars so that we can make maximum use
   // of 8 bit simd instructions.
-  do {
-    const int8_t mask = filter_mask(limit[0], blimit[0],
-                                    s[-4 * p], s[-3 * p], s[-2 * p], s[-1 * p],
-                                    s[ 0 * p], s[ 1 * p], s[ 2 * p], s[ 3 * p]);
-
-    const int8_t hev = hevmask(thresh[0],
-                               s[-2 * p], s[-1 * p], s[0 * p], s[1 * p]);
-
-    const int8_t flat = flatmask4(1,
-                                  s[-4 * p], s[-3 * p], s[-2 * p], s[-1 * p],
-                                  s[ 0 * p], s[ 1 * p], s[ 2 * p], s[ 3 * p]);
-
+  for (i = 0; i < 8 * count; ++i) {
+    const uint8_t p3 = s[-4 * p], p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p];
+    const uint8_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p], q3 = s[3 * p];
+    const int8_t mask = filter_mask(*limit, *blimit,
+                                    p3, p2, p1, p0, q0, q1, q2, q3);
+    const int8_t hev = hevmask(*thresh, p1, p0, q0, q1);
+    const int8_t flat = flatmask4(1, p3, p2, p1, p0, q0, q1, q2, q3);
     const int8_t flat2 = flatmask5(1,
-                         s[-8 * p], s[-7 * p], s[-6 * p], s[-5 * p], s[-1 * p],
-                         s[ 0 * p], s[ 4 * p], s[ 5 * p], s[ 6 * p], s[ 7 * p]);
+                             s[-8 * p], s[-7 * p], s[-6 * p], s[-5 * p], p0,
+                             q0, s[4 * p], s[5 * p], s[6 * p], s[7 * p]);
 
     wide_mbfilter(mask, hev, flat, flat2,
                   s - 8 * p, s - 7 * p, s - 6 * p, s - 5 * p,
@@ -552,33 +385,31 @@
                   s + 4 * p, s + 5 * p, s + 6 * p, s + 7 * p);
 
     ++s;
-  } while (++i < count * 8);
+  }
 }
+
 void vp9_mb_lpf_vertical_edge_w(uint8_t *s, int p,
                                 const uint8_t *blimit,
                                 const uint8_t *limit,
                                 const uint8_t *thresh,
                                 int count) {
-  int i = 0;
+  int i;
 
-  do {
-    const int8_t mask = filter_mask(limit[0], blimit[0],
-                                    s[-4], s[-3], s[-2], s[-1],
-                                    s[0],  s[1],  s[2],  s[3]);
-
-    const int8_t hev = hevmask(thresh[0], s[-2], s[-1], s[0], s[1]);
-    const int8_t flat = flatmask4(1, s[-4], s[-3], s[-2], s[-1],
-                                     s[ 0], s[ 1], s[ 2], s[ 3]);
-    const int8_t flat2 = flatmask5(1, s[-8], s[-7], s[-6], s[-5], s[-1],
-                                      s[ 0], s[ 4], s[ 5], s[ 6], s[ 7]);
+  for (i = 0; i < 8 * count; ++i) {
+    const uint8_t p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 = s[-1];
+    const uint8_t q0 = s[0], q1 = s[1],  q2 = s[2], q3 = s[3];
+    const int8_t mask = filter_mask(*limit, *blimit,
+                                    p3, p2, p1, p0, q0, q1, q2, q3);
+    const int8_t hev = hevmask(*thresh, p1, p0, q0, q1);
+    const int8_t flat = flatmask4(1, p3, p2, p1, p0, q0, q1, q2, q3);
+    const int8_t flat2 = flatmask5(1, s[-8], s[-7], s[-6], s[-5], p0,
+                                   q0, s[4], s[5], s[6], s[7]);
 
     wide_mbfilter(mask, hev, flat, flat2,
-                  s - 8, s - 7, s - 6, s - 5,
-                  s - 4, s - 3, s - 2, s - 1,
-                  s,     s + 1, s + 2, s + 3,
-                  s + 4, s + 5, s + 6, s + 7);
+                  s - 8, s - 7, s - 6, s - 5, s - 4, s - 3, s - 2, s - 1,
+                  s,     s + 1, s + 2, s + 3, s + 4, s + 5, s + 6, s + 7);
     s += p;
-  } while (++i < count * 8);
+  }
 }
 
 void vp9_lpf_mbv_w_c(uint8_t *y, uint8_t *u, uint8_t *v,
diff --git a/vp9/common/vp9_onyxc_int.h b/vp9/common/vp9_onyxc_int.h
index 57f9978..2d4cd30 100644
--- a/vp9/common/vp9_onyxc_int.h
+++ b/vp9/common/vp9_onyxc_int.h
@@ -306,6 +306,13 @@
   return 2 * ((cm->mb_cols + 3) & ~3);
 }
 
+static INLINE void set_partition_seg_context(VP9_COMMON *cm,
+                                             MACROBLOCKD *xd,
+                                             int mi_row, int mi_col) {
+  xd->above_seg_context = cm->above_seg_context + mi_col;
+  xd->left_seg_context  = cm->left_seg_context + (mi_row & MI_MASK);
+}
+
 static void set_mi_row_col(VP9_COMMON *cm, MACROBLOCKD *xd,
                        int mi_row, int bh,
                        int mi_col, int bw) {
diff --git a/vp9/common/vp9_postproc.c b/vp9/common/vp9_postproc.c
index f81690a..8001adb 100644
--- a/vp9/common/vp9_postproc.c
+++ b/vp9/common/vp9_postproc.c
@@ -132,14 +132,15 @@
 
 /****************************************************************************
  */
-void vp9_post_proc_down_and_across_c(uint8_t *src_ptr,
+void vp9_post_proc_down_and_across_c(const uint8_t *src_ptr,
                                      uint8_t *dst_ptr,
                                      int src_pixels_per_line,
                                      int dst_pixels_per_line,
                                      int rows,
                                      int cols,
                                      int flimit) {
-  uint8_t *p_src, *p_dst;
+  uint8_t const *p_src;
+  uint8_t *p_dst;
   int row;
   int col;
   int i;
@@ -313,51 +314,52 @@
                                 source->uv_height, source->uv_width, ppl);
 }
 
-void vp9_deblock(YV12_BUFFER_CONFIG         *source,
-                 YV12_BUFFER_CONFIG         *post,
-                 int                         q,
-                 int                         low_var_thresh,
-                 int                         flag) {
-  double level = 6.0e-05 * q * q * q - .0067 * q * q + .306 * q + .0065;
-  int ppl = (int)(level + .5);
-  (void) low_var_thresh;
-  (void) flag;
+void vp9_deblock(const YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst,
+                 int q) {
+  const int ppl = (int)(6.0e-05 * q * q * q - 0.0067 * q * q + 0.306 * q
+                        + 0.0065 + 0.5);
+  int i;
 
-  vp9_post_proc_down_and_across(source->y_buffer, post->y_buffer,
-                                source->y_stride, post->y_stride,
-                                source->y_height, source->y_width, ppl);
+  const uint8_t *const srcs[3] = {src->y_buffer, src->u_buffer, src->v_buffer};
+  const int src_strides[3] = {src->y_stride, src->uv_stride, src->uv_stride};
+  const int src_widths[3] = {src->y_width, src->uv_width, src->uv_width};
+  const int src_heights[3] = {src->y_height, src->uv_height, src->uv_height};
 
-  vp9_post_proc_down_and_across(source->u_buffer, post->u_buffer,
-                                source->uv_stride, post->uv_stride,
-                                source->uv_height, source->uv_width, ppl);
+  uint8_t *const dsts[3] = {dst->y_buffer, dst->u_buffer, dst->v_buffer};
+  const int dst_strides[3] = {dst->y_stride, dst->uv_stride, dst->uv_stride};
 
-  vp9_post_proc_down_and_across(source->v_buffer, post->v_buffer,
-                                source->uv_stride, post->uv_stride,
-                                source->uv_height, source->uv_width, ppl);
+  for (i = 0; i < MAX_MB_PLANE; ++i)
+    vp9_post_proc_down_and_across(srcs[i], dsts[i],
+                                  src_strides[i], dst_strides[i],
+                                  src_heights[i], src_widths[i], ppl);
 }
 
-void vp9_denoise(YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *post,
-                 int q, int low_var_thresh, int flag) {
-  double level = 6.0e-05 * q * q * q - .0067 * q * q + .306 * q + .0065;
-  int ppl = (int)(level + .5);
-  (void) post;
-  (void) low_var_thresh;
-  (void) flag;
+void vp9_denoise(const YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst,
+                 int q) {
+  const int ppl = (int)(6.0e-05 * q * q * q - 0.0067 * q * q + 0.306 * q
+                        + 0.0065 + 0.5);
+  int i;
 
-  vp9_post_proc_down_and_across(src->y_buffer + 2 * src->y_stride + 2,
-                                src->y_buffer + 2 * src->y_stride + 2,
-                                src->y_stride, src->y_stride, src->y_height - 4,
-                                src->y_width - 4, ppl);
+  const uint8_t *const srcs[3] = {src->y_buffer, src->u_buffer, src->v_buffer};
+  const int src_strides[3] = {src->y_stride, src->uv_stride, src->uv_stride};
+  const int src_widths[3] = {src->y_width, src->uv_width, src->uv_width};
+  const int src_heights[3] = {src->y_height, src->uv_height, src->uv_height};
 
-  vp9_post_proc_down_and_across(src->u_buffer + 2 * src->uv_stride + 2,
-                                src->u_buffer + 2 * src->uv_stride + 2,
-                                src->uv_stride, src->uv_stride,
-                                src->uv_height - 4, src->uv_width - 4, ppl);
+  uint8_t *const dsts[3] = {dst->y_buffer, dst->u_buffer, dst->v_buffer};
+  const int dst_strides[3] = {dst->y_stride, dst->uv_stride, dst->uv_stride};
 
-  vp9_post_proc_down_and_across(src->v_buffer + 2 * src->uv_stride + 2,
-                                src->v_buffer + 2 * src->uv_stride + 2,
-                                src->uv_stride, src->uv_stride,
-                                src->uv_height - 4, src->uv_width - 4, ppl);
+  for (i = 0; i < MAX_MB_PLANE; ++i) {
+    const int src_stride = src_strides[i];
+    const uint8_t *const src = srcs[i] + 2 * src_stride + 2;
+    const int src_width = src_widths[i] - 4;
+    const int src_height = src_heights[i] - 4;
+
+    const int dst_stride = dst_strides[i];
+    uint8_t *const dst = dsts[i] + 2 * dst_stride + 2;
+
+    vp9_post_proc_down_and_across(src, dst, src_stride, dst_stride,
+                                  src_height, src_width, ppl);
+  }
 }
 
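Both rewritten functions fold the old level double into a single rounded expression; the polynomial itself is unchanged. Worked numbers for q == 32:

    /*   6.0e-05 * 32768 =  1.966                     */
    /* - 0.0067  * 1024  = -6.861                     */
    /* + 0.306   * 32    =  9.792                     */
    /* + 0.0065 + 0.5    =  0.5065                    */
    /* total 5.404 -> ppl == 5 after truncation to int */
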
 double vp9_gaussian(double sigma, double mu, double x) {
@@ -642,7 +644,7 @@
     deblock_and_de_macro_block(oci->frame_to_show, &oci->post_proc_buffer,
                                q + (deblock_level - 5) * 10, 1, 0);
   } else if (flags & VP9D_DEBLOCK) {
-    vp9_deblock(oci->frame_to_show, &oci->post_proc_buffer, q, 1, 0);
+    vp9_deblock(oci->frame_to_show, &oci->post_proc_buffer, q);
   } else {
     vp8_yv12_copy_frame(oci->frame_to_show, &oci->post_proc_buffer);
   }
diff --git a/vp9/common/vp9_postproc.h b/vp9/common/vp9_postproc.h
index c2f556e..2c0d333 100644
--- a/vp9/common/vp9_postproc.h
+++ b/vp9/common/vp9_postproc.h
@@ -29,10 +29,8 @@
 int vp9_post_proc_frame(struct VP9Common *oci, YV12_BUFFER_CONFIG *dest,
                         vp9_ppflags_t *flags);
 
-void vp9_denoise(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *post,
-                 int q, int low_var_thresh, int flag);
+void vp9_denoise(const YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst, int q);
 
-void vp9_deblock(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *post,
-                 int q, int low_var_thresh, int flag);
+void vp9_deblock(const YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst, int q);
 
 #endif  // VP9_COMMON_VP9_POSTPROC_H_
diff --git a/vp9/common/vp9_reconinter.c b/vp9/common/vp9_reconinter.c
index 3668fcd..e7303f1 100644
--- a/vp9/common/vp9_reconinter.c
+++ b/vp9/common/vp9_reconinter.c
@@ -17,6 +17,78 @@
 #include "vp9/common/vp9_reconinter.h"
 #include "vp9/common/vp9_reconintra.h"
 
+static int scale_value_x_with_scaling(int val,
+                                      const struct scale_factors *scale) {
+  return val * scale->x_num / scale->x_den;
+}
+
+static int scale_value_y_with_scaling(int val,
+                                      const struct scale_factors *scale) {
+  return val * scale->y_num / scale->y_den;
+}
+
+static int unscaled_value(int val, const struct scale_factors *scale) {
+  (void) scale;
+  return val;
+}
+
+static int_mv32 mv_q3_to_q4_with_scaling(const int_mv *src_mv,
+                                         const struct scale_factors *scale) {
+  // returns mv * scale + offset
+  int_mv32 result;
+  const int32_t mv_row_q4 = src_mv->as_mv.row << 1;
+  const int32_t mv_col_q4 = src_mv->as_mv.col << 1;
+
+  /* TODO(jkoleszar): make fixed point, or as a second multiply? */
+  result.as_mv.row =  mv_row_q4 * scale->y_num / scale->y_den
+                      + scale->y_offset_q4;
+  result.as_mv.col =  mv_col_q4 * scale->x_num / scale->x_den
+                      + scale->x_offset_q4;
+  return result;
+}
+
+static int_mv32 mv_q3_to_q4_without_scaling(const int_mv *src_mv,
+                                            const struct scale_factors *scale) {
+  // returns mv converted from q3 to q4; no scaling or offset is applied
+  int_mv32 result;
+
+  result.as_mv.row = src_mv->as_mv.row << 1;
+  result.as_mv.col = src_mv->as_mv.col << 1;
+  return result;
+}
+
+static int32_t mv_component_q4_with_scaling(int mv_q4, int num, int den,
+                                            int offset_q4) {
+  // returns the scaled and offset value of the mv component.
+
+  /* TODO(jkoleszar): make fixed point, or as a second multiply? */
+  return mv_q4 * num / den + offset_q4;
+}
+
+static int32_t mv_component_q4_without_scaling(int mv_q4, int num, int den,
+                                               int offset_q4) {
+  // returns the mv component unchanged; no scaling or offset is applied.
+  (void)num;
+  (void)den;
+  (void)offset_q4;
+  return mv_q4;
+}
+
+static void set_offsets_with_scaling(struct scale_factors *scale,
+                                     int row, int col) {
+  const int x_q4 = 16 * col;
+  const int y_q4 = 16 * row;
+
+  scale->x_offset_q4 = (x_q4 * scale->x_num / scale->x_den) & 0xf;
+  scale->y_offset_q4 = (y_q4 * scale->y_num / scale->y_den) & 0xf;
+}
+
+static void set_offsets_without_scaling(struct scale_factors *scale,
+                                        int row, int col) {
+  scale->x_offset_q4 = 0;
+  scale->y_offset_q4 = 0;
+}
+
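
These helpers were moved here from vp9_reconinter.h (see the deletions further down) and renamed; their behaviour is unchanged. A hedged illustration of the q3 -> q4 arithmetic, assuming a reference frame twice the coded size so that both num/den ratios are 2/1:

    /* mv = (3, -5) in 1/8-pel (q3) units                                 */
    /* without scaling: (3 << 1, -5 << 1) = (6, -10) in 1/16-pel (q4)     */
    /* with scaling:    (6 * 2 / 1 + y_offset_q4,                         */
    /*                   -10 * 2 / 1 + x_offset_q4) = (12 + oy, -20 + ox) */
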
 void vp9_setup_scale_factors_for_frame(struct scale_factors *scale,
                                        int other_w, int other_h,
                                        int this_w, int this_h) {
@@ -34,18 +106,14 @@
     scale->scale_value_x = unscaled_value;
     scale->scale_value_y = unscaled_value;
     scale->set_scaled_offsets = set_offsets_without_scaling;
-    scale->scale_motion_vector_q3_to_q4 =
-        motion_vector_q3_to_q4_without_scaling;
-    scale->scale_motion_vector_component_q4 =
-        motion_vector_component_q4_without_scaling;
+    scale->scale_motion_vector_q3_to_q4 = mv_q3_to_q4_without_scaling;
+    scale->scale_motion_vector_component_q4 = mv_component_q4_without_scaling;
   } else {
     scale->scale_value_x = scale_value_x_with_scaling;
     scale->scale_value_y = scale_value_y_with_scaling;
     scale->set_scaled_offsets = set_offsets_with_scaling;
-    scale->scale_motion_vector_q3_to_q4 =
-        motion_vector_q3_to_q4_with_scaling;
-    scale->scale_motion_vector_component_q4 =
-        motion_vector_component_q4_with_scaling;
+    scale->scale_motion_vector_q3_to_q4 = mv_q3_to_q4_with_scaling;
+    scale->scale_motion_vector_component_q4 = mv_component_q4_with_scaling;
   }
 
   // TODO(agrange): Investigate the best choice of functions to use here
@@ -424,3 +492,18 @@
   vp9_build_inter_predictors_sbuv(xd, mb_row, mb_col,
                                   BLOCK_SIZE_MB16X16);
 }
+
+// TODO(dkovalev): find a better place for this function
+void vp9_setup_scale_factors(VP9_COMMON *cm, int i) {
+  const int ref = cm->active_ref_idx[i];
+  struct scale_factors *const sf = &cm->active_ref_scale[i];
+  if (ref >= NUM_YV12_BUFFERS) {
+    memset(sf, 0, sizeof(*sf));
+  } else {
+    YV12_BUFFER_CONFIG *const fb = &cm->yv12_fb[ref];
+    vp9_setup_scale_factors_for_frame(sf,
+                                      fb->y_crop_width, fb->y_crop_height,
+                                      cm->width, cm->height);
+  }
+}
+
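
A hypothetical call site for the new helper, refreshing every active reference once the frame dimensions are known (ALLOWED_REFS_PER_FRAME is the usual bound in this tree):

    int i;
    for (i = 0; i < ALLOWED_REFS_PER_FRAME; ++i)
      vp9_setup_scale_factors(cm, i);
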
diff --git a/vp9/common/vp9_reconinter.h b/vp9/common/vp9_reconinter.h
index faf018c..8f76195 100644
--- a/vp9/common/vp9_reconinter.h
+++ b/vp9/common/vp9_reconinter.h
@@ -52,21 +52,6 @@
                                   int w, int h, int do_avg,
                                   const struct subpix_fn_table *subpix);
 
-static int scale_value_x_with_scaling(int val,
-                                      const struct scale_factors *scale) {
-  return val * scale->x_num / scale->x_den;
-}
-
-static int scale_value_y_with_scaling(int val,
-                                      const struct scale_factors *scale) {
-  return val * scale->y_num / scale->y_den;
-}
-
-static int unscaled_value(int val, const struct scale_factors *scale) {
-  (void) scale;
-  return val;
-}
-
 static int scaled_buffer_offset(int x_offset, int y_offset, int stride,
                                 const struct scale_factors *scale) {
   const int x = scale ? scale->scale_value_x(x_offset, scale) : x_offset;
@@ -137,66 +122,6 @@
   xd->scale_factor_uv[1] = xd->scale_factor[1];
 }
 
-static void set_offsets_with_scaling(struct scale_factors *scale,
-                                     int row, int col) {
-  const int x_q4 = 16 * col;
-  const int y_q4 = 16 * row;
+void vp9_setup_scale_factors(VP9_COMMON *cm, int i);
 
-  scale->x_offset_q4 = (x_q4 * scale->x_num / scale->x_den) & 0xf;
-  scale->y_offset_q4 = (y_q4 * scale->y_num / scale->y_den) & 0xf;
-}
-
-static void set_offsets_without_scaling(struct scale_factors *scale,
-                                        int row, int col) {
-  scale->x_offset_q4 = 0;
-  scale->y_offset_q4 = 0;
-}
-
-static int_mv32 motion_vector_q3_to_q4_with_scaling(
-    const int_mv *src_mv,
-    const struct scale_factors *scale) {
-  // returns mv * scale + offset
-  int_mv32 result;
-  const int32_t mv_row_q4 = src_mv->as_mv.row << 1;
-  const int32_t mv_col_q4 = src_mv->as_mv.col << 1;
-
-  /* TODO(jkoleszar): make fixed point, or as a second multiply? */
-  result.as_mv.row =  mv_row_q4 * scale->y_num / scale->y_den
-                      + scale->y_offset_q4;
-  result.as_mv.col =  mv_col_q4 * scale->x_num / scale->x_den
-                      + scale->x_offset_q4;
-  return result;
-}
-
-static int_mv32 motion_vector_q3_to_q4_without_scaling(
-    const int_mv *src_mv,
-    const struct scale_factors *scale) {
-  // returns mv * scale + offset
-  int_mv32 result;
-
-  result.as_mv.row = src_mv->as_mv.row << 1;
-  result.as_mv.col = src_mv->as_mv.col << 1;
-  return result;
-}
-
-static int32_t motion_vector_component_q4_with_scaling(int mv_q4,
-                                                       int num,
-                                                       int den,
-                                                       int offset_q4) {
-  // returns the scaled and offset value of the mv component.
-
-  /* TODO(jkoleszar): make fixed point, or as a second multiply? */
-  return mv_q4 * num / den + offset_q4;
-}
-
-static int32_t motion_vector_component_q4_without_scaling(int mv_q4,
-                                                          int num,
-                                                          int den,
-                                                          int offset_q4) {
-  // returns the scaled and offset value of the mv component.
-  (void)num;
-  (void)den;
-  (void)offset_q4;
-  return mv_q4;
-}
 #endif  // VP9_COMMON_VP9_RECONINTER_H_
diff --git a/vp9/common/vp9_rtcd_defs.sh b/vp9/common/vp9_rtcd_defs.sh
index 75e3604..48ce7db 100644
--- a/vp9/common/vp9_rtcd_defs.sh
+++ b/vp9/common/vp9_rtcd_defs.sh
@@ -91,12 +91,6 @@
 prototype void vp9_add_residual_8x8 "const int16_t *diff, uint8_t *dest, int stride"
 specialize vp9_add_residual_8x8 sse2
 
-prototype void vp9_add_residual_16x16 "const int16_t *diff, uint8_t *dest, int stride"
-specialize vp9_add_residual_16x16 sse2
-
-prototype void vp9_add_residual_32x32 "const int16_t *diff, uint8_t *dest, int stride"
-specialize vp9_add_residual_32x32 sse2
-
 prototype void vp9_add_constant_residual_8x8 "const int16_t diff, uint8_t *dest, int stride"
 specialize vp9_add_constant_residual_8x8 sse2
 
@@ -128,30 +122,6 @@
 prototype void vp9_loop_filter_bh8x8 "uint8_t *y, uint8_t *u, uint8_t *v, int ystride, int uv_stride, struct loop_filter_info *lfi"
 specialize vp9_loop_filter_bh8x8 sse2
 
-prototype void vp9_loop_filter_simple_mbv "uint8_t *y, int ystride, const uint8_t *blimit"
-specialize vp9_loop_filter_simple_mbv mmx sse2
-vp9_loop_filter_simple_mbv_c=vp9_loop_filter_simple_vertical_edge_c
-vp9_loop_filter_simple_mbv_mmx=vp9_loop_filter_simple_vertical_edge_mmx
-vp9_loop_filter_simple_mbv_sse2=vp9_loop_filter_simple_vertical_edge_sse2
-
-prototype void vp9_loop_filter_simple_mbh "uint8_t *y, int ystride, const uint8_t *blimit"
-specialize vp9_loop_filter_simple_mbh mmx sse2
-vp9_loop_filter_simple_mbh_c=vp9_loop_filter_simple_horizontal_edge_c
-vp9_loop_filter_simple_mbh_mmx=vp9_loop_filter_simple_horizontal_edge_mmx
-vp9_loop_filter_simple_mbh_sse2=vp9_loop_filter_simple_horizontal_edge_sse2
-
-prototype void vp9_loop_filter_simple_bv "uint8_t *y, int ystride, const uint8_t *blimit"
-specialize vp9_loop_filter_simple_bv mmx sse2
-vp9_loop_filter_simple_bv_c=vp9_loop_filter_bvs_c
-vp9_loop_filter_simple_bv_mmx=vp9_loop_filter_bvs_mmx
-vp9_loop_filter_simple_bv_sse2=vp9_loop_filter_bvs_sse2
-
-prototype void vp9_loop_filter_simple_bh "uint8_t *y, int ystride, const uint8_t *blimit"
-specialize vp9_loop_filter_simple_bh mmx sse2
-vp9_loop_filter_simple_bh_c=vp9_loop_filter_bhs_c
-vp9_loop_filter_simple_bh_mmx=vp9_loop_filter_bhs_mmx
-vp9_loop_filter_simple_bh_sse2=vp9_loop_filter_bhs_sse2
-
 prototype void vp9_lpf_mbh_w "unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi"
 specialize vp9_lpf_mbh_w sse2
 
@@ -170,7 +140,7 @@
 specialize vp9_mbpost_proc_across_ip sse2
 vp9_mbpost_proc_across_ip_sse2=vp9_mbpost_proc_across_ip_xmm
 
-prototype void vp9_post_proc_down_and_across "uint8_t *src_ptr, uint8_t *dst_ptr, int src_pixels_per_line, int dst_pixels_per_line, int rows, int cols, int flimit"
+prototype void vp9_post_proc_down_and_across "const uint8_t *src_ptr, uint8_t *dst_ptr, int src_pixels_per_line, int dst_pixels_per_line, int rows, int cols, int flimit"
 specialize vp9_post_proc_down_and_across mmx sse2
 vp9_post_proc_down_and_across_sse2=vp9_post_proc_down_and_across_xmm
 
@@ -227,24 +197,23 @@
 prototype void vp9_short_idct1_8x8 "int16_t *input, int16_t *output"
 specialize vp9_short_idct1_8x8
 
-prototype void vp9_short_idct16x16 "int16_t *input, int16_t *output, int pitch"
-specialize vp9_short_idct16x16 sse2
+prototype void vp9_short_idct16x16_add "int16_t *input, uint8_t *dest, int dest_stride"
+specialize vp9_short_idct16x16_add sse2
 
-prototype void vp9_short_idct10_16x16 "int16_t *input, int16_t *output, int pitch"
-specialize vp9_short_idct10_16x16 sse2
+prototype void vp9_short_idct10_16x16_add "int16_t *input, uint8_t *dest, int dest_stride"
+specialize vp9_short_idct10_16x16_add sse2
 
 prototype void vp9_short_idct1_16x16 "int16_t *input, int16_t *output"
 specialize vp9_short_idct1_16x16
 
-
-prototype void vp9_short_idct32x32 "int16_t *input, int16_t *output, int pitch"
-specialize vp9_short_idct32x32 sse2
+prototype void vp9_short_idct32x32_add "int16_t *input, uint8_t *dest, int dest_stride"
+specialize vp9_short_idct32x32_add sse2
 
 prototype void vp9_short_idct1_32x32 "int16_t *input, int16_t *output"
 specialize vp9_short_idct1_32x32
 
-prototype void vp9_short_idct10_32x32 "int16_t *input, int16_t *output, int pitch"
-specialize vp9_short_idct10_32x32
+prototype void vp9_short_idct10_32x32_add "int16_t *input, uint8_t *dest, int dest_stride"
+specialize vp9_short_idct10_32x32_add
 
 prototype void vp9_short_iht8x8 "int16_t *input, int16_t *output, int pitch, int tx_type"
 specialize vp9_short_iht8x8
@@ -252,8 +221,8 @@
 prototype void vp9_short_iht4x4 "int16_t *input, int16_t *output, int pitch, int tx_type"
 specialize vp9_short_iht4x4
 
-prototype void vp9_short_iht16x16 "int16_t *input, int16_t *output, int pitch, int tx_type"
-specialize vp9_short_iht16x16
+prototype void vp9_short_iht16x16_add "int16_t *input, uint8_t *output, int pitch, int tx_type"
+specialize vp9_short_iht16x16_add
 
 prototype void vp9_idct4_1d "int16_t *input, int16_t *output"
 specialize vp9_idct4_1d sse2
@@ -337,41 +306,74 @@
 prototype unsigned int vp9_sub_pixel_variance64x64 "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
 specialize vp9_sub_pixel_variance64x64 sse2
 
+prototype unsigned int vp9_sub_pixel_avg_variance64x64 "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
+specialize vp9_sub_pixel_avg_variance64x64
+
 prototype unsigned int vp9_sub_pixel_variance32x64 "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
 specialize vp9_sub_pixel_variance32x64
 
+prototype unsigned int vp9_sub_pixel_avg_variance32x64 "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
+specialize vp9_sub_pixel_avg_variance32x64
+
 prototype unsigned int vp9_sub_pixel_variance64x32 "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
 specialize vp9_sub_pixel_variance64x32
 
+prototype unsigned int vp9_sub_pixel_avg_variance64x32 "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
+specialize vp9_sub_pixel_avg_variance64x32
+
 prototype unsigned int vp9_sub_pixel_variance32x16 "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
 specialize vp9_sub_pixel_variance32x16
 
+prototype unsigned int vp9_sub_pixel_avg_variance32x16 "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
+specialize vp9_sub_pixel_avg_variance32x16
+
 prototype unsigned int vp9_sub_pixel_variance16x32 "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
 specialize vp9_sub_pixel_variance16x32
 
+prototype unsigned int vp9_sub_pixel_avg_variance16x32 "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
+specialize vp9_sub_pixel_avg_variance16x32
+
 prototype unsigned int vp9_sub_pixel_variance32x32 "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
 specialize vp9_sub_pixel_variance32x32 sse2
 
+prototype unsigned int vp9_sub_pixel_avg_variance32x32 "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
+specialize vp9_sub_pixel_avg_variance32x32
+
 prototype unsigned int vp9_sub_pixel_variance16x16 "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
 specialize vp9_sub_pixel_variance16x16 sse2 mmx ssse3
 
+prototype unsigned int vp9_sub_pixel_avg_variance16x16 "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
+specialize vp9_sub_pixel_avg_variance16x16
+
 prototype unsigned int vp9_sub_pixel_variance8x16 "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
 specialize vp9_sub_pixel_variance8x16 sse2 mmx
 vp9_sub_pixel_variance8x16_sse2=vp9_sub_pixel_variance8x16_wmt
 
+prototype unsigned int vp9_sub_pixel_avg_variance8x16 "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
+specialize vp9_sub_pixel_avg_variance8x16
+
 prototype unsigned int vp9_sub_pixel_variance16x8 "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
 specialize vp9_sub_pixel_variance16x8 sse2 mmx ssse3
 vp9_sub_pixel_variance16x8_sse2=vp9_sub_pixel_variance16x8_ssse3;
 vp9_sub_pixel_variance16x8_sse2=vp9_sub_pixel_variance16x8_wmt
 
+prototype unsigned int vp9_sub_pixel_avg_variance16x8 "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
+specialize vp9_sub_pixel_avg_variance16x8
+
 prototype unsigned int vp9_sub_pixel_variance8x8 "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
 specialize vp9_sub_pixel_variance8x8 sse2 mmx
 vp9_sub_pixel_variance8x8_sse2=vp9_sub_pixel_variance8x8_wmt
 
+prototype unsigned int vp9_sub_pixel_avg_variance8x8 "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
+specialize vp9_sub_pixel_avg_variance8x8
+
 prototype unsigned int vp9_sub_pixel_variance4x4 "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
 specialize vp9_sub_pixel_variance4x4 sse2 mmx
 vp9_sub_pixel_variance4x4_sse2=vp9_sub_pixel_variance4x4_wmt
 
+prototype unsigned int vp9_sub_pixel_avg_variance4x4 "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
+specialize vp9_sub_pixel_avg_variance4x4
+
 prototype unsigned int vp9_sad64x64 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, unsigned int max_sad"
 specialize vp9_sad64x64 sse2
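
Each prototype line in vp9_rtcd_defs.sh records a signature and each
specialize line lists the SIMD flavors; the build turns the pair into
per-flavor declarations plus a run-time-dispatched entry point in the
generated vp9_rtcd.h. For the renamed 16x16 inverse transform the output has
roughly the following shape (a paraphrase of the generator output, not a
verbatim copy; RTCD_EXTERN resolves differently per translation unit):

    /* Approximate generated vp9_rtcd.h entry for:
     *   prototype void vp9_short_idct16x16_add "int16_t *input, uint8_t *dest, int dest_stride"
     *   specialize vp9_short_idct16x16_add sse2
     */
    void vp9_short_idct16x16_add_c(int16_t *input, uint8_t *dest,
                                   int dest_stride);
    void vp9_short_idct16x16_add_sse2(int16_t *input, uint8_t *dest,
                                      int dest_stride);
    RTCD_EXTERN void (*vp9_short_idct16x16_add)(int16_t *input, uint8_t *dest,
                                                int dest_stride);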
 
diff --git a/vp9/common/x86/vp9_idct_intrin_sse2.c b/vp9/common/x86/vp9_idct_intrin_sse2.c
index dd7e68a..667da33 100644
--- a/vp9/common/x86/vp9_idct_intrin_sse2.c
+++ b/vp9/common/x86/vp9_idct_intrin_sse2.c
@@ -752,8 +752,17 @@
                            stp2_10, stp2_13, stp2_11, stp2_12) \
   }
 
-void vp9_short_idct16x16_sse2(int16_t *input, int16_t *output, int pitch) {
-  const int half_pitch = pitch >> 1;
+#define RECON_AND_STORE(dest, in_x) \
+  {                                                  \
+    __m128i d0 = _mm_loadl_epi64((__m128i *)(dest)); \
+    d0 = _mm_unpacklo_epi8(d0, zero);                \
+    in_x = _mm_add_epi16(in_x, d0);                  \
+    in_x = _mm_packus_epi16(in_x, in_x);             \
+    _mm_storel_epi64((__m128i *)(dest), in_x);       \
+    dest += stride;                                  \
+  }
+
+void vp9_short_idct16x16_add_sse2(int16_t *input, uint8_t *dest, int stride) {
   const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
   const __m128i final_rounding = _mm_set1_epi16(1<<5);
   const __m128i zero = _mm_setzero_si128();
@@ -938,31 +947,30 @@
       in14 = _mm_srai_epi16(in14, 6);
       in15 = _mm_srai_epi16(in15, 6);
 
-      // Store results
-      _mm_store_si128((__m128i *)output, in0);
-      _mm_store_si128((__m128i *)(output + half_pitch * 1), in1);
-      _mm_store_si128((__m128i *)(output + half_pitch * 2), in2);
-      _mm_store_si128((__m128i *)(output + half_pitch * 3), in3);
-      _mm_store_si128((__m128i *)(output + half_pitch * 4), in4);
-      _mm_store_si128((__m128i *)(output + half_pitch * 5), in5);
-      _mm_store_si128((__m128i *)(output + half_pitch * 6), in6);
-      _mm_store_si128((__m128i *)(output + half_pitch * 7), in7);
-      _mm_store_si128((__m128i *)(output + half_pitch * 8), in8);
-      _mm_store_si128((__m128i *)(output + half_pitch * 9), in9);
-      _mm_store_si128((__m128i *)(output + half_pitch * 10), in10);
-      _mm_store_si128((__m128i *)(output + half_pitch * 11), in11);
-      _mm_store_si128((__m128i *)(output + half_pitch * 12), in12);
-      _mm_store_si128((__m128i *)(output + half_pitch * 13), in13);
-      _mm_store_si128((__m128i *)(output + half_pitch * 14), in14);
-      _mm_store_si128((__m128i *)(output + half_pitch * 15), in15);
+      RECON_AND_STORE(dest, in0);
+      RECON_AND_STORE(dest, in1);
+      RECON_AND_STORE(dest, in2);
+      RECON_AND_STORE(dest, in3);
+      RECON_AND_STORE(dest, in4);
+      RECON_AND_STORE(dest, in5);
+      RECON_AND_STORE(dest, in6);
+      RECON_AND_STORE(dest, in7);
+      RECON_AND_STORE(dest, in8);
+      RECON_AND_STORE(dest, in9);
+      RECON_AND_STORE(dest, in10);
+      RECON_AND_STORE(dest, in11);
+      RECON_AND_STORE(dest, in12);
+      RECON_AND_STORE(dest, in13);
+      RECON_AND_STORE(dest, in14);
+      RECON_AND_STORE(dest, in15);
 
-      output += 8;
+      dest += 8 - (stride * 16);
     }
   }
 }
 
-void vp9_short_idct10_16x16_sse2(int16_t *input, int16_t *output, int pitch) {
-  const int half_pitch = pitch >> 1;
+void vp9_short_idct10_16x16_add_sse2(int16_t *input, uint8_t *dest,
+                                     int stride) {
   const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
   const __m128i final_rounding = _mm_set1_epi16(1<<5);
   const __m128i zero = _mm_setzero_si128();
@@ -1007,7 +1015,6 @@
           stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15;
   __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
   int i;
-
   // 1-D idct. Load input data.
   in0 = _mm_load_si128((__m128i *)input);
   in8 = _mm_load_si128((__m128i *)(input + 8 * 1));
@@ -1298,29 +1305,28 @@
     in14 = _mm_srai_epi16(in14, 6);
     in15 = _mm_srai_epi16(in15, 6);
 
-    // Store results
-    _mm_store_si128((__m128i *)output, in0);
-    _mm_store_si128((__m128i *)(output + half_pitch * 1), in1);
-    _mm_store_si128((__m128i *)(output + half_pitch * 2), in2);
-    _mm_store_si128((__m128i *)(output + half_pitch * 3), in3);
-    _mm_store_si128((__m128i *)(output + half_pitch * 4), in4);
-    _mm_store_si128((__m128i *)(output + half_pitch * 5), in5);
-    _mm_store_si128((__m128i *)(output + half_pitch * 6), in6);
-    _mm_store_si128((__m128i *)(output + half_pitch * 7), in7);
-    _mm_store_si128((__m128i *)(output + half_pitch * 8), in8);
-    _mm_store_si128((__m128i *)(output + half_pitch * 9), in9);
-    _mm_store_si128((__m128i *)(output + half_pitch * 10), in10);
-    _mm_store_si128((__m128i *)(output + half_pitch * 11), in11);
-    _mm_store_si128((__m128i *)(output + half_pitch * 12), in12);
-    _mm_store_si128((__m128i *)(output + half_pitch * 13), in13);
-    _mm_store_si128((__m128i *)(output + half_pitch * 14), in14);
-    _mm_store_si128((__m128i *)(output + half_pitch * 15), in15);
-    output += 8;
+    RECON_AND_STORE(dest, in0);
+    RECON_AND_STORE(dest, in1);
+    RECON_AND_STORE(dest, in2);
+    RECON_AND_STORE(dest, in3);
+    RECON_AND_STORE(dest, in4);
+    RECON_AND_STORE(dest, in5);
+    RECON_AND_STORE(dest, in6);
+    RECON_AND_STORE(dest, in7);
+    RECON_AND_STORE(dest, in8);
+    RECON_AND_STORE(dest, in9);
+    RECON_AND_STORE(dest, in10);
+    RECON_AND_STORE(dest, in11);
+    RECON_AND_STORE(dest, in12);
+    RECON_AND_STORE(dest, in13);
+    RECON_AND_STORE(dest, in14);
+    RECON_AND_STORE(dest, in15);
+
+    dest += 8 - (stride * 16);
   }
 }
 
-void vp9_short_idct32x32_sse2(int16_t *input, int16_t *output, int pitch) {
-  const int half_pitch = pitch >> 1;
+void vp9_short_idct32x32_add_sse2(int16_t *input, uint8_t *dest, int stride) {
   const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
   const __m128i final_rounding = _mm_set1_epi16(1<<5);
 
@@ -1832,6 +1838,8 @@
       col[i * 32 + 30] = _mm_sub_epi16(stp1_1, stp1_30);
       col[i * 32 + 31] = _mm_sub_epi16(stp1_0, stp1_31);
     } else {
+      const __m128i zero = _mm_setzero_si128();
+
       // 2-D: Calculate the results and store them to the destination.
       in0 = _mm_add_epi16(stp1_0, stp1_31);
       in1 = _mm_add_epi16(stp1_1, stp1_30);
@@ -1933,41 +1941,40 @@
       in30 = _mm_srai_epi16(in30, 6);
       in31 = _mm_srai_epi16(in31, 6);
 
-      // Store results
-      _mm_store_si128((__m128i *)output, in0);
-      _mm_store_si128((__m128i *)(output + half_pitch * 1), in1);
-      _mm_store_si128((__m128i *)(output + half_pitch * 2), in2);
-      _mm_store_si128((__m128i *)(output + half_pitch * 3), in3);
-      _mm_store_si128((__m128i *)(output + half_pitch * 4), in4);
-      _mm_store_si128((__m128i *)(output + half_pitch * 5), in5);
-      _mm_store_si128((__m128i *)(output + half_pitch * 6), in6);
-      _mm_store_si128((__m128i *)(output + half_pitch * 7), in7);
-      _mm_store_si128((__m128i *)(output + half_pitch * 8), in8);
-      _mm_store_si128((__m128i *)(output + half_pitch * 9), in9);
-      _mm_store_si128((__m128i *)(output + half_pitch * 10), in10);
-      _mm_store_si128((__m128i *)(output + half_pitch * 11), in11);
-      _mm_store_si128((__m128i *)(output + half_pitch * 12), in12);
-      _mm_store_si128((__m128i *)(output + half_pitch * 13), in13);
-      _mm_store_si128((__m128i *)(output + half_pitch * 14), in14);
-      _mm_store_si128((__m128i *)(output + half_pitch * 15), in15);
-      _mm_store_si128((__m128i *)(output + half_pitch * 16), in16);
-      _mm_store_si128((__m128i *)(output + half_pitch * 17), in17);
-      _mm_store_si128((__m128i *)(output + half_pitch * 18), in18);
-      _mm_store_si128((__m128i *)(output + half_pitch * 19), in19);
-      _mm_store_si128((__m128i *)(output + half_pitch * 20), in20);
-      _mm_store_si128((__m128i *)(output + half_pitch * 21), in21);
-      _mm_store_si128((__m128i *)(output + half_pitch * 22), in22);
-      _mm_store_si128((__m128i *)(output + half_pitch * 23), in23);
-      _mm_store_si128((__m128i *)(output + half_pitch * 24), in24);
-      _mm_store_si128((__m128i *)(output + half_pitch * 25), in25);
-      _mm_store_si128((__m128i *)(output + half_pitch * 26), in26);
-      _mm_store_si128((__m128i *)(output + half_pitch * 27), in27);
-      _mm_store_si128((__m128i *)(output + half_pitch * 28), in28);
-      _mm_store_si128((__m128i *)(output + half_pitch * 29), in29);
-      _mm_store_si128((__m128i *)(output + half_pitch * 30), in30);
-      _mm_store_si128((__m128i *)(output + half_pitch * 31), in31);
+      RECON_AND_STORE(dest, in0);
+      RECON_AND_STORE(dest, in1);
+      RECON_AND_STORE(dest, in2);
+      RECON_AND_STORE(dest, in3);
+      RECON_AND_STORE(dest, in4);
+      RECON_AND_STORE(dest, in5);
+      RECON_AND_STORE(dest, in6);
+      RECON_AND_STORE(dest, in7);
+      RECON_AND_STORE(dest, in8);
+      RECON_AND_STORE(dest, in9);
+      RECON_AND_STORE(dest, in10);
+      RECON_AND_STORE(dest, in11);
+      RECON_AND_STORE(dest, in12);
+      RECON_AND_STORE(dest, in13);
+      RECON_AND_STORE(dest, in14);
+      RECON_AND_STORE(dest, in15);
+      RECON_AND_STORE(dest, in16);
+      RECON_AND_STORE(dest, in17);
+      RECON_AND_STORE(dest, in18);
+      RECON_AND_STORE(dest, in19);
+      RECON_AND_STORE(dest, in20);
+      RECON_AND_STORE(dest, in21);
+      RECON_AND_STORE(dest, in22);
+      RECON_AND_STORE(dest, in23);
+      RECON_AND_STORE(dest, in24);
+      RECON_AND_STORE(dest, in25);
+      RECON_AND_STORE(dest, in26);
+      RECON_AND_STORE(dest, in27);
+      RECON_AND_STORE(dest, in28);
+      RECON_AND_STORE(dest, in29);
+      RECON_AND_STORE(dest, in30);
+      RECON_AND_STORE(dest, in31);
 
-      output += 8;
+      dest += 8 - (stride * 32);
     }
   }
 }
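
The rewritten SSE2 transforms no longer spill 16-bit residuals to a scratch
buffer: RECON_AND_STORE adds one residual row to eight destination pixels
with unsigned saturation and advances one stride, and the
dest += 8 - (stride * N) lines rewind to the top of the next eight-column
group after N rows. Note the macro reads zero and stride from the expansion
site, which is why the 32x32 path gains its local const __m128i zero above.
Per row, the macro is equivalent to this scalar sketch:

    /* Scalar equivalent of one RECON_AND_STORE invocation (sketch; the
     * clamp mirrors the unsigned saturation of _mm_packus_epi16). */
    static void recon_and_store_row(uint8_t *dest, const int16_t *residual) {
      int x;
      for (x = 0; x < 8; ++x) {
        const int v = dest[x] + residual[x];
        dest[x] = (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
      }
    }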
diff --git a/vp9/common/x86/vp9_loopfilter_intrin_mmx.c b/vp9/common/x86/vp9_loopfilter_intrin_mmx.c
index 2be9e31..7e6c4be 100644
--- a/vp9/common/x86/vp9_loopfilter_intrin_mmx.c
+++ b/vp9/common/x86/vp9_loopfilter_intrin_mmx.c
@@ -35,16 +35,6 @@
 
 }
 
-void vp9_loop_filter_bhs_mmx(unsigned char *y_ptr, int y_stride,
-                             const unsigned char *blimit) {
-  vp9_loop_filter_simple_horizontal_edge_mmx(y_ptr + 4 * y_stride,
-                                             y_stride, blimit);
-  vp9_loop_filter_simple_horizontal_edge_mmx(y_ptr + 8 * y_stride,
-                                             y_stride, blimit);
-  vp9_loop_filter_simple_horizontal_edge_mmx(y_ptr + 12 * y_stride,
-                                             y_stride, blimit);
-}
-
 /* Vertical B Filtering */
 void vp9_loop_filter_bv_mmx(unsigned char *y_ptr,
                             unsigned char *u_ptr, unsigned char *v_ptr,
@@ -66,9 +56,3 @@
                                       lfi->blim, lfi->lim, lfi->hev_thr, 1);
 }
 
-void vp9_loop_filter_bvs_mmx(unsigned char *y_ptr, int y_stride,
-                             const unsigned char *blimit) {
-  vp9_loop_filter_simple_vertical_edge_mmx(y_ptr + 4, y_stride, blimit);
-  vp9_loop_filter_simple_vertical_edge_mmx(y_ptr + 8, y_stride, blimit);
-  vp9_loop_filter_simple_vertical_edge_mmx(y_ptr + 12, y_stride, blimit);
-}
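
The deleted _bhs/_bvs wrappers only fanned the simple edge filter out to the
three interior block boundaries of a 16-pixel macroblock (offsets 4, 8, 12 in
rows or columns); with the simple filter dropped from the rtcd table, the
wrappers go too. The pattern, as a sketch with simple_h_edge standing in for
the removed vp9_loop_filter_simple_horizontal_edge_* routines:

    /* Stand-in declaration for the removed edge routine. */
    void simple_h_edge(unsigned char *src, int stride,
                       const unsigned char *blimit);

    /* Sketch of the removed horizontal wrapper. */
    static void loop_filter_bhs_sketch(unsigned char *y_ptr, int y_stride,
                                       const unsigned char *blimit) {
      int row;
      for (row = 4; row <= 12; row += 4)  /* interior edges of a 16x16 MB */
        simple_h_edge(y_ptr + row * y_stride, y_stride, blimit);
    }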
diff --git a/vp9/common/x86/vp9_loopfilter_intrin_sse2.c b/vp9/common/x86/vp9_loopfilter_intrin_sse2.c
index 08447a6..7982ca6 100644
--- a/vp9/common/x86/vp9_loopfilter_intrin_sse2.c
+++ b/vp9/common/x86/vp9_loopfilter_intrin_sse2.c
@@ -1115,16 +1115,6 @@
                                             v_ptr + 4 * uv_stride);
 }
 
-void vp9_loop_filter_bhs_sse2(unsigned char *y_ptr, int y_stride,
-                              const unsigned char *blimit) {
-  vp9_loop_filter_simple_horizontal_edge_sse2(y_ptr + 4 * y_stride,
-                                              y_stride, blimit);
-  vp9_loop_filter_simple_horizontal_edge_sse2(y_ptr + 8 * y_stride,
-                                              y_stride, blimit);
-  vp9_loop_filter_simple_horizontal_edge_sse2(y_ptr + 12 * y_stride,
-                                              y_stride, blimit);
-}
-
 /* Vertical B Filtering */
 void vp9_loop_filter_bv_sse2(unsigned char *y_ptr,
                              unsigned char *u_ptr, unsigned char *v_ptr,
@@ -1143,9 +1133,3 @@
                                           v_ptr + 4);
 }
 
-void vp9_loop_filter_bvs_sse2(unsigned char *y_ptr, int y_stride,
-                              const unsigned char *blimit) {
-  vp9_loop_filter_simple_vertical_edge_sse2(y_ptr + 4, y_stride, blimit);
-  vp9_loop_filter_simple_vertical_edge_sse2(y_ptr + 8, y_stride, blimit);
-  vp9_loop_filter_simple_vertical_edge_sse2(y_ptr + 12, y_stride, blimit);
-}
diff --git a/vp9/common/x86/vp9_loopfilter_mmx.asm b/vp9/common/x86/vp9_loopfilter_mmx.asm
index ceffdf5..4ebb51b 100644
--- a/vp9/common/x86/vp9_loopfilter_mmx.asm
+++ b/vp9/common/x86/vp9_loopfilter_mmx.asm
@@ -593,349 +593,6 @@
     pop         rbp
     ret
 
-
-;void vp9_loop_filter_simple_horizontal_edge_mmx
-;(
-;    unsigned char *src_ptr,
-;    int  src_pixel_step,
-;    const char *blimit
-;)
-global sym(vp9_loop_filter_simple_horizontal_edge_mmx) PRIVATE
-sym(vp9_loop_filter_simple_horizontal_edge_mmx):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 3
-    GET_GOT     rbx
-    push        rsi
-    push        rdi
-    ; end prolog
-
-        mov         rsi, arg(0) ;src_ptr
-        movsxd      rax, dword ptr arg(1) ;src_pixel_step     ; destination pitch?
-
-        mov         rcx, 2                ; count
-.nexts8_h:
-        mov         rdx, arg(2) ;blimit           ; get blimit
-        movq        mm3, [rdx]            ;
-
-        mov         rdi, rsi              ; rdi points to row +1 for indirect addressing
-        add         rdi, rax
-        neg         rax
-
-        ; calculate mask
-        movq        mm1, [rsi+2*rax]      ; p1
-        movq        mm0, [rdi]            ; q1
-        movq        mm2, mm1
-        movq        mm7, mm0
-        movq        mm4, mm0
-        psubusb     mm0, mm1              ; q1-=p1
-        psubusb     mm1, mm4              ; p1-=q1
-        por         mm1, mm0              ; abs(p1-q1)
-        pand        mm1, [GLOBAL(tfe)]    ; set lsb of each byte to zero
-        psrlw       mm1, 1                ; abs(p1-q1)/2
-
-        movq        mm5, [rsi+rax]        ; p0
-        movq        mm4, [rsi]            ; q0
-        movq        mm0, mm4              ; q0
-        movq        mm6, mm5              ; p0
-        psubusb     mm5, mm4              ; p0-=q0
-        psubusb     mm4, mm6              ; q0-=p0
-        por         mm5, mm4              ; abs(p0 - q0)
-        paddusb     mm5, mm5              ; abs(p0-q0)*2
-        paddusb     mm5, mm1              ; abs (p0 - q0) *2 + abs(p1-q1)/2
-
-        psubusb     mm5, mm3              ; abs(p0 - q0) *2 + abs(p1-q1)/2  > blimit
-        pxor        mm3, mm3
-        pcmpeqb     mm5, mm3
-
-        ; start work on filters
-        pxor        mm2, [GLOBAL(t80)]    ; p1 offset to convert to signed values
-        pxor        mm7, [GLOBAL(t80)]    ; q1 offset to convert to signed values
-        psubsb      mm2, mm7              ; p1 - q1
-
-        pxor        mm6, [GLOBAL(t80)]    ; offset to convert to signed values
-        pxor        mm0, [GLOBAL(t80)]    ; offset to convert to signed values
-        movq        mm3, mm0              ; q0
-        psubsb      mm0, mm6              ; q0 - p0
-        paddsb      mm2, mm0              ; p1 - q1 + 1 * (q0 - p0)
-        paddsb      mm2, mm0              ; p1 - q1 + 2 * (q0 - p0)
-        paddsb      mm2, mm0              ; p1 - q1 + 3 * (q0 - p0)
-        pand        mm5, mm2              ; mask filter values we don't care about
-
-        ; do + 4 side
-        paddsb      mm5, [GLOBAL(t4)]     ; 3* (q0 - p0) + (p1 - q1) + 4
-
-        movq        mm0, mm5              ; get a copy of filters
-        psllw       mm0, 8                ; shift left 8
-        psraw       mm0, 3                ; arithmetic shift right 11
-        psrlw       mm0, 8
-        movq        mm1, mm5              ; get a copy of filters
-        psraw       mm1, 11               ; arithmetic shift right 11
-        psllw       mm1, 8                ; shift left 8 to put it back
-
-        por         mm0, mm1              ; put the two together to get result
-
-        psubsb      mm3, mm0              ; q0-= q0 add
-        pxor        mm3, [GLOBAL(t80)]    ; unoffset
-        movq        [rsi], mm3            ; write back
-
-
-        ; now do +3 side
-        psubsb      mm5, [GLOBAL(t1s)]     ; +3 instead of +4
-
-        movq        mm0, mm5              ; get a copy of filters
-        psllw       mm0, 8                ; shift left 8
-        psraw       mm0, 3                ; arithmetic shift right 11
-        psrlw       mm0, 8
-        psraw       mm5, 11               ; arithmetic shift right 11
-        psllw       mm5, 8                ; shift left 8 to put it back
-        por         mm0, mm5              ; put the two together to get result
-
-
-        paddsb      mm6, mm0              ; p0+= p0 add
-        pxor        mm6, [GLOBAL(t80)]    ; unoffset
-        movq        [rsi+rax], mm6        ; write back
-
-        add         rsi,8
-        neg         rax
-        dec         rcx
-        jnz         .nexts8_h
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-;void vp9_loop_filter_simple_vertical_edge_mmx
-;(
-;    unsigned char *src_ptr,
-;    int  src_pixel_step,
-;    const char *blimit
-;)
-global sym(vp9_loop_filter_simple_vertical_edge_mmx) PRIVATE
-sym(vp9_loop_filter_simple_vertical_edge_mmx):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 3
-    GET_GOT     rbx
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    ALIGN_STACK 16, rax
-    sub          rsp, 32      ; reserve 32 bytes
-    %define t0   [rsp + 0]    ;__declspec(align(16)) char t0[8];
-    %define t1   [rsp + 16]   ;__declspec(align(16)) char t1[8];
-
-        mov         rsi, arg(0) ;src_ptr
-        movsxd      rax, dword ptr arg(1) ;src_pixel_step     ; destination pitch?
-
-        lea         rsi, [rsi + rax*4- 2];  ;
-        mov         rcx, 2                                      ; count
-.nexts8_v:
-
-        lea         rdi,        [rsi + rax];
-        movd        mm0,        [rdi + rax * 2]                 ; xx xx xx xx 73 72 71 70
-
-        movd        mm6,        [rsi + rax * 2]                 ; xx xx xx xx 63 62 61 60
-        punpcklbw   mm6,        mm0                             ; 73 63 72 62 71 61 70 60
-
-        movd        mm0,        [rsi + rax]                     ; xx xx xx xx 53 52 51 50
-        movd        mm4,        [rsi]                           ; xx xx xx xx 43 42 41 40
-
-        punpcklbw   mm4,        mm0                             ; 53 43 52 42 51 41 50 40
-        movq        mm5,        mm4                             ; 53 43 52 42 51 41 50 40
-
-        punpcklwd   mm4,        mm6                             ; 71 61 51 41 70 60 50 40
-        punpckhwd   mm5,        mm6                             ; 73 63 53 43 72 62 52 42
-
-        neg         rax
-
-        movd        mm7,        [rsi + rax]                     ; xx xx xx xx 33 32 31 30
-        movd        mm6,        [rsi + rax * 2]                 ; xx xx xx xx 23 22 21 20
-
-        punpcklbw   mm6,        mm7                             ; 33 23 32 22 31 21 30 20
-        movd        mm1,        [rdi + rax * 4]                 ; xx xx xx xx 13 12 11 10
-
-        movd        mm0,        [rsi + rax * 4]                 ; xx xx xx xx 03 02 01 00
-        punpcklbw   mm0,        mm1                             ; 13 03 12 02 11 01 10 00
-
-        movq        mm2,        mm0                             ; 13 03 12 02 11 01 10 00
-        punpcklwd   mm0,        mm6                             ; 31 21 11 01 30 20 10 00
-
-        punpckhwd   mm2,        mm6                             ; 33 23 13 03 32 22 12 02
-        movq        mm1,        mm0                             ; 13 03 12 02 11 01 10 00
-
-        punpckldq   mm0,        mm4                             ; 70 60 50 40 30 20 10 00       = p1
-        movq        mm3,        mm2                             ; 33 23 13 03 32 22 12 02
-
-        punpckhdq   mm1,        mm4                             ; 71 61 51 41 31 21 11 01       = p0
-        punpckldq   mm2,        mm5                             ; 72 62 52 42 32 22 12 02       = q0
-
-        punpckhdq   mm3,        mm5                             ; 73 63 53 43 33 23 13 03       = q1
-
-
-        ; calculate mask
-        movq        mm6,        mm0                             ; p1
-        movq        mm7,        mm3                             ; q1
-        psubusb     mm7,        mm6                             ; q1-=p1
-        psubusb     mm6,        mm3                             ; p1-=q1
-        por         mm6,        mm7                             ; abs(p1-q1)
-        pand        mm6,        [GLOBAL(tfe)]                   ; set lsb of each byte to zero
-        psrlw       mm6,        1                               ; abs(p1-q1)/2
-
-        movq        mm5,        mm1                             ; p0
-        movq        mm4,        mm2                             ; q0
-
-        psubusb     mm5,        mm2                             ; p0-=q0
-        psubusb     mm4,        mm1                             ; q0-=p0
-
-        por         mm5,        mm4                             ; abs(p0 - q0)
-        paddusb     mm5,        mm5                             ; abs(p0-q0)*2
-        paddusb     mm5,        mm6                             ; abs (p0 - q0) *2 + abs(p1-q1)/2
-
-        mov         rdx,        arg(2) ;blimit                          ; get blimit
-        movq        mm7,        [rdx]
-
-        psubusb     mm5,        mm7                             ; abs(p0 - q0) *2 + abs(p1-q1)/2  > blimit
-        pxor        mm7,        mm7
-        pcmpeqb     mm5,        mm7                             ; mm5 = mask
-
-        ; start work on filters
-        movq        t0,         mm0
-        movq        t1,         mm3
-
-        pxor        mm0,        [GLOBAL(t80)]                   ; p1 offset to convert to signed values
-        pxor        mm3,        [GLOBAL(t80)]                   ; q1 offset to convert to signed values
-
-        psubsb      mm0,        mm3                             ; p1 - q1
-        movq        mm6,        mm1                             ; p0
-
-        movq        mm7,        mm2                             ; q0
-        pxor        mm6,        [GLOBAL(t80)]                   ; offset to convert to signed values
-
-        pxor        mm7,        [GLOBAL(t80)]                   ; offset to convert to signed values
-        movq        mm3,        mm7                             ; offseted ; q0
-
-        psubsb      mm7,        mm6                             ; q0 - p0
-        paddsb      mm0,        mm7                             ; p1 - q1 + 1 * (q0 - p0)
-
-        paddsb      mm0,        mm7                             ; p1 - q1 + 2 * (q0 - p0)
-        paddsb      mm0,        mm7                             ; p1 - q1 + 3 * (q0 - p0)
-
-        pand        mm5,        mm0                             ; mask filter values we don't care about
-
-        paddsb      mm5,        [GLOBAL(t4)]                    ;  3* (q0 - p0) + (p1 - q1) + 4
-
-        movq        mm0,        mm5                             ; get a copy of filters
-        psllw       mm0,        8                               ; shift left 8
-        psraw       mm0,        3                               ; arithmetic shift right 11
-        psrlw       mm0,        8
-
-        movq        mm7,        mm5                             ; get a copy of filters
-        psraw       mm7,        11                              ; arithmetic shift right 11
-        psllw       mm7,        8                               ; shift left 8 to put it back
-
-        por         mm0,        mm7                             ; put the two together to get result
-
-        psubsb      mm3,        mm0                             ; q0-= q0sz add
-        pxor        mm3,        [GLOBAL(t80)]                   ; unoffset
-
-        ; now do +3 side
-        psubsb      mm5, [GLOBAL(t1s)]                          ; +3 instead of +4
-
-        movq        mm0, mm5                                    ; get a copy of filters
-        psllw       mm0, 8                                      ; shift left 8
-        psraw       mm0, 3                                      ; arithmetic shift right 11
-        psrlw       mm0, 8
-
-        psraw       mm5, 11                                     ; arithmetic shift right 11
-        psllw       mm5, 8                                      ; shift left 8 to put it back
-        por         mm0, mm5                                    ; put the two together to get result
-
-        paddsb      mm6, mm0                                    ; p0+= p0 add
-        pxor        mm6, [GLOBAL(t80)]                          ; unoffset
-
-
-        movq        mm0,        t0
-        movq        mm4,        t1
-
-        ; mm0 = 70 60 50 40 30 20 10 00
-        ; mm6 = 71 61 51 41 31 21 11 01
-        ; mm3 = 72 62 52 42 32 22 12 02
-        ; mm4 = 73 63 53 43 33 23 13 03
-        ; transpose back to write out
-
-        movq        mm1,        mm0                         ;
-        punpcklbw   mm0,        mm6                         ; 31 30 21 20 11 10 01 00
-
-        punpckhbw   mm1,        mm6                         ; 71 70 61 60 51 50 41 40
-        movq        mm2,        mm3                         ;
-
-        punpcklbw   mm2,        mm4                         ; 33 32 23 22 13 12 03 02
-        movq        mm5,        mm1                         ; 71 70 61 60 51 50 41 40
-
-        punpckhbw   mm3,        mm4                         ; 73 72 63 62 53 52 43 42
-        movq        mm6,        mm0                         ; 31 30 21 20 11 10 01 00
-
-        punpcklwd   mm0,        mm2                         ; 13 12 11 10 03 02 01 00
-        punpckhwd   mm6,        mm2                         ; 33 32 31 30 23 22 21 20
-
-        movd        [rsi+rax*4], mm0                        ; write 03 02 01 00
-        punpcklwd   mm1,        mm3                         ; 53 52 51 50 43 42 41 40
-
-        psrlq       mm0,        32                          ; xx xx xx xx 13 12 11 10
-        punpckhwd   mm5,        mm3                         ; 73 72 71 70 63 62 61 60
-
-        movd        [rdi+rax*4], mm0                        ; write 13 12 11 10
-        movd        [rsi+rax*2], mm6                        ; write 23 22 21 20
-
-        psrlq       mm6,        32                          ; 33 32 31 30
-        movd        [rsi],      mm1                         ; write 43 42 41 40
-
-        movd        [rsi + rax], mm6                        ; write 33 32 31 30
-        neg         rax
-
-        movd        [rsi + rax*2], mm5                      ; write 63 62 61 60
-        psrlq       mm1,        32                          ; 53 52 51 50
-
-        movd        [rdi],      mm1                         ; write out 53 52 51 50
-        psrlq       mm5,        32                          ; 73 72 71 70
-
-        movd        [rdi + rax*2], mm5                      ; write 73 72 71 70
-
-        lea         rsi,        [rsi+rax*8]                 ; next 8
-
-        dec         rcx
-        jnz         .nexts8_v
-
-    add rsp, 32
-    pop rsp
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-
-;void fast_loop_filter_vertical_edges_mmx(unsigned char *y_ptr,
-;                  int y_stride,
-;                  loop_filter_info *lfi)
-;{
-;
-;
-;    vp9_loop_filter_simple_vertical_edge_mmx(y_ptr+4, y_stride, lfi->flim,lfi->lim,lfi->thr,2);
-;    vp9_loop_filter_simple_vertical_edge_mmx(y_ptr+8, y_stride, lfi->flim,lfi->lim,lfi->thr,2);
-;    vp9_loop_filter_simple_vertical_edge_mmx(y_ptr+12, y_stride, lfi->flim,lfi->lim,lfi->thr,2);
-;}
-
 SECTION_RODATA
 align 16
 tfe:
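
The assembly removed here is the VP8-style "simple" deblocker: build a mask
from abs(p0-q0)*2 + abs(p1-q1)/2 > blimit, form 3*(q0-p0) + (p1-q1) in signed
(t80-offset) space, and apply it with +4/+3 rounding and a >>3 shift (the
psllw 8 / psraw 3 / psrlw 8 sequences emulate the per-byte signed shift that
MMX lacks). Per edge position the math is approximately this scalar sketch:

    #include <stdint.h>

    static int clamp_i8(int v) { return v < -128 ? -128 : (v > 127 ? 127 : v); }

    /* Scalar sketch of the removed simple filter at one edge position.
     * p1 and q1 are the raw outer pixels; op0/oq0 point at the two pixels
     * straddling the edge. The ^ 0x80 is the t80 signed-offset trick. */
    static void simple_filter_sketch(unsigned char *op0, unsigned char *oq0,
                                     unsigned char p1, unsigned char q1,
                                     int blimit_ok) {
      const int ps1 = (int8_t)(p1 ^ 0x80), qs1 = (int8_t)(q1 ^ 0x80);
      const int ps0 = (int8_t)(*op0 ^ 0x80), qs0 = (int8_t)(*oq0 ^ 0x80);
      int f = clamp_i8(clamp_i8(ps1 - qs1) + 3 * (qs0 - ps0));
      if (!blimit_ok) f = 0;  /* positions failing the blimit test stay put */
      *oq0 = (unsigned char)(clamp_i8(qs0 - (clamp_i8(f + 4) >> 3)) ^ 0x80);
      *op0 = (unsigned char)(clamp_i8(ps0 + (clamp_i8(f + 3) >> 3)) ^ 0x80);
    }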
diff --git a/vp9/common/x86/vp9_loopfilter_sse2.asm b/vp9/common/x86/vp9_loopfilter_sse2.asm
index ae4c60f..74236cf 100644
--- a/vp9/common/x86/vp9_loopfilter_sse2.asm
+++ b/vp9/common/x86/vp9_loopfilter_sse2.asm
@@ -845,372 +845,6 @@
     pop         rbp
     ret
 
-;void vp9_loop_filter_simple_horizontal_edge_sse2
-;(
-;    unsigned char *src_ptr,
-;    int  src_pixel_step,
-;    const char *blimit,
-;)
-global sym(vp9_loop_filter_simple_horizontal_edge_sse2) PRIVATE
-sym(vp9_loop_filter_simple_horizontal_edge_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 3
-    SAVE_XMM 7
-    GET_GOT     rbx
-    push        rsi
-    push        rdi
-    ; end prolog
-
-        mov         rsi, arg(0)             ;src_ptr
-        movsxd      rax, dword ptr arg(1)   ;src_pixel_step     ; destination pitch?
-        mov         rdx, arg(2)             ;blimit
-        movdqa      xmm3, XMMWORD PTR [rdx]
-
-        mov         rdi, rsi                ; rdi points to row +1 for indirect addressing
-        add         rdi, rax
-        neg         rax
-
-        ; calculate mask
-        movdqa      xmm1, [rsi+2*rax]       ; p1
-        movdqa      xmm0, [rdi]             ; q1
-        movdqa      xmm2, xmm1
-        movdqa      xmm7, xmm0
-        movdqa      xmm4, xmm0
-        psubusb     xmm0, xmm1              ; q1-=p1
-        psubusb     xmm1, xmm4              ; p1-=q1
-        por         xmm1, xmm0              ; abs(p1-q1)
-        pand        xmm1, [GLOBAL(tfe)]     ; set lsb of each byte to zero
-        psrlw       xmm1, 1                 ; abs(p1-q1)/2
-
-        movdqa      xmm5, [rsi+rax]         ; p0
-        movdqa      xmm4, [rsi]             ; q0
-        movdqa      xmm0, xmm4              ; q0
-        movdqa      xmm6, xmm5              ; p0
-        psubusb     xmm5, xmm4              ; p0-=q0
-        psubusb     xmm4, xmm6              ; q0-=p0
-        por         xmm5, xmm4              ; abs(p0 - q0)
-        paddusb     xmm5, xmm5              ; abs(p0-q0)*2
-        paddusb     xmm5, xmm1              ; abs (p0 - q0) *2 + abs(p1-q1)/2
-
-        psubusb     xmm5, xmm3              ; abs(p0 - q0) *2 + abs(p1-q1)/2  > blimit
-        pxor        xmm3, xmm3
-        pcmpeqb     xmm5, xmm3
-
-        ; start work on filters
-        pxor        xmm2, [GLOBAL(t80)]     ; p1 offset to convert to signed values
-        pxor        xmm7, [GLOBAL(t80)]     ; q1 offset to convert to signed values
-        psubsb      xmm2, xmm7              ; p1 - q1
-
-        pxor        xmm6, [GLOBAL(t80)]     ; offset to convert to signed values
-        pxor        xmm0, [GLOBAL(t80)]     ; offset to convert to signed values
-        movdqa      xmm3, xmm0              ; q0
-        psubsb      xmm0, xmm6              ; q0 - p0
-        paddsb      xmm2, xmm0              ; p1 - q1 + 1 * (q0 - p0)
-        paddsb      xmm2, xmm0              ; p1 - q1 + 2 * (q0 - p0)
-        paddsb      xmm2, xmm0              ; p1 - q1 + 3 * (q0 - p0)
-        pand        xmm5, xmm2              ; mask filter values we don't care about
-
-        ; do + 4 side
-        paddsb      xmm5, [GLOBAL(t4)]      ; 3* (q0 - p0) + (p1 - q1) + 4
-
-        movdqa      xmm0, xmm5              ; get a copy of filters
-        psllw       xmm0, 8                 ; shift left 8
-        psraw       xmm0, 3                 ; arithmetic shift right 11
-        psrlw       xmm0, 8
-        movdqa      xmm1, xmm5              ; get a copy of filters
-        psraw       xmm1, 11                ; arithmetic shift right 11
-        psllw       xmm1, 8                 ; shift left 8 to put it back
-
-        por         xmm0, xmm1              ; put the two together to get result
-
-        psubsb      xmm3, xmm0              ; q0-= q0 add
-        pxor        xmm3, [GLOBAL(t80)]     ; unoffset
-        movdqa      [rsi], xmm3             ; write back
-
-        ; now do +3 side
-        psubsb      xmm5, [GLOBAL(t1s)]     ; +3 instead of +4
-
-        movdqa      xmm0, xmm5              ; get a copy of filters
-        psllw       xmm0, 8                 ; shift left 8
-        psraw       xmm0, 3                 ; arithmetic shift right 11
-        psrlw       xmm0, 8
-        psraw       xmm5, 11                ; arithmetic shift right 11
-        psllw       xmm5, 8                 ; shift left 8 to put it back
-        por         xmm0, xmm5              ; put the two together to get result
-
-
-        paddsb      xmm6, xmm0              ; p0+= p0 add
-        pxor        xmm6, [GLOBAL(t80)]     ; unoffset
-        movdqa      [rsi+rax], xmm6         ; write back
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-;void vp9_loop_filter_simple_vertical_edge_sse2
-;(
-;    unsigned char *src_ptr,
-;    int  src_pixel_step,
-;    const char *blimit,
-;)
-global sym(vp9_loop_filter_simple_vertical_edge_sse2) PRIVATE
-sym(vp9_loop_filter_simple_vertical_edge_sse2):
-    push        rbp         ; save old base pointer value.
-    mov         rbp, rsp    ; set new base pointer value.
-    SHADOW_ARGS_TO_STACK 3
-    SAVE_XMM 7
-    GET_GOT     rbx         ; save callee-saved reg
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    ALIGN_STACK 16, rax
-    sub         rsp, 32                         ; reserve 32 bytes
-    %define t0  [rsp + 0]    ;__declspec(align(16)) char t0[16];
-    %define t1  [rsp + 16]   ;__declspec(align(16)) char t1[16];
-
-        mov         rsi, arg(0) ;src_ptr
-        movsxd      rax, dword ptr arg(1) ;src_pixel_step     ; destination pitch?
-
-        lea         rsi,        [rsi - 2 ]
-        lea         rdi,        [rsi + rax]
-        lea         rdx,        [rsi + rax*4]
-        lea         rcx,        [rdx + rax]
-
-        movd        xmm0,       [rsi]                   ; (high 96 bits unused) 03 02 01 00
-        movd        xmm1,       [rdx]                   ; (high 96 bits unused) 43 42 41 40
-        movd        xmm2,       [rdi]                   ; 13 12 11 10
-        movd        xmm3,       [rcx]                   ; 53 52 51 50
-        punpckldq   xmm0,       xmm1                    ; (high 64 bits unused) 43 42 41 40 03 02 01 00
-        punpckldq   xmm2,       xmm3                    ; 53 52 51 50 13 12 11 10
-
-        movd        xmm4,       [rsi + rax*2]           ; 23 22 21 20
-        movd        xmm5,       [rdx + rax*2]           ; 63 62 61 60
-        movd        xmm6,       [rdi + rax*2]           ; 33 32 31 30
-        movd        xmm7,       [rcx + rax*2]           ; 73 72 71 70
-        punpckldq   xmm4,       xmm5                    ; 63 62 61 60 23 22 21 20
-        punpckldq   xmm6,       xmm7                    ; 73 72 71 70 33 32 31 30
-
-        punpcklbw   xmm0,       xmm2                    ; 53 43 52 42 51 41 50 40 13 03 12 02 11 01 10 00
-        punpcklbw   xmm4,       xmm6                    ; 73 63 72 62 71 61 70 60 33 23 32 22 31 21 30 20
-
-        movdqa      xmm1,       xmm0
-        punpcklwd   xmm0,       xmm4                    ; 33 23 13 03 32 22 12 02 31 21 11 01 30 20 10 00
-        punpckhwd   xmm1,       xmm4                    ; 73 63 53 43 72 62 52 42 71 61 51 41 70 60 50 40
-
-        movdqa      xmm2,       xmm0
-        punpckldq   xmm0,       xmm1                    ; 71 61 51 41 31 21 11 01 70 60 50 40 30 20 10 00
-        punpckhdq   xmm2,       xmm1                    ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02
-
-        movdqa      t0,         xmm0                    ; save to t0
-        movdqa      t1,         xmm2                    ; save to t1
-
-        lea         rsi,        [rsi + rax*8]
-        lea         rdi,        [rsi + rax]
-        lea         rdx,        [rsi + rax*4]
-        lea         rcx,        [rdx + rax]
-
-        movd        xmm4,       [rsi]                   ; 83 82 81 80
-        movd        xmm1,       [rdx]                   ; c3 c2 c1 c0
-        movd        xmm6,       [rdi]                   ; 93 92 91 90
-        movd        xmm3,       [rcx]                   ; d3 d2 d1 d0
-        punpckldq   xmm4,       xmm1                    ; c3 c2 c1 c0 83 82 81 80
-        punpckldq   xmm6,       xmm3                    ; d3 d2 d1 d0 93 92 91 90
-
-        movd        xmm0,       [rsi + rax*2]           ; a3 a2 a1 a0
-        movd        xmm5,       [rdx + rax*2]           ; e3 e2 e1 e0
-        movd        xmm2,       [rdi + rax*2]           ; b3 b2 b1 b0
-        movd        xmm7,       [rcx + rax*2]           ; f3 f2 f1 f0
-        punpckldq   xmm0,       xmm5                    ; e3 e2 e1 e0 a3 a2 a1 a0
-        punpckldq   xmm2,       xmm7                    ; f3 f2 f1 f0 b3 b2 b1 b0
-
-        punpcklbw   xmm4,       xmm6                    ; d3 c3 d2 c2 d1 c1 d0 c0 93 83 92 82 91 81 90 80
-        punpcklbw   xmm0,       xmm2                    ; f3 e3 f2 e2 f1 e1 f0 e0 b3 a3 b2 a2 b1 a1 b0 a0
-
-        movdqa      xmm1,       xmm4
-        punpcklwd   xmm4,       xmm0                    ; b3 a3 93 83 b2 a2 92 82 b1 a1 91 81 b0 a0 90 80
-        punpckhwd   xmm1,       xmm0                    ; f3 e3 d3 c3 f2 e2 d2 c2 f1 e1 d1 c1 f0 e0 d0 c0
-
-        movdqa      xmm6,       xmm4
-        punpckldq   xmm4,       xmm1                    ; f1 e1 d1 c1 b1 a1 91 81 f0 e0 d0 c0 b0 a0 90 80
-        punpckhdq   xmm6,       xmm1                    ; f3 e3 d3 c3 b3 a3 93 83 f2 e2 d2 c2 b2 a2 92 82
-
-        movdqa      xmm0,       t0                      ; 71 61 51 41 31 21 11 01 70 60 50 40 30 20 10 00
-        movdqa      xmm2,       t1                      ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02
-        movdqa      xmm1,       xmm0
-        movdqa      xmm3,       xmm2
-
-        punpcklqdq  xmm0,       xmm4                    ; p1  f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00
-        punpckhqdq  xmm1,       xmm4                    ; p0  f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 31 21 11 01
-        punpcklqdq  xmm2,       xmm6                    ; q0  f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
-        punpckhqdq  xmm3,       xmm6                    ; q1  f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03
-
-        ; calculate mask
-        movdqa      xmm6,       xmm0                            ; p1
-        movdqa      xmm7,       xmm3                            ; q1
-        psubusb     xmm7,       xmm0                            ; q1-=p1
-        psubusb     xmm6,       xmm3                            ; p1-=q1
-        por         xmm6,       xmm7                            ; abs(p1-q1)
-        pand        xmm6,       [GLOBAL(tfe)]                   ; set lsb of each byte to zero
-        psrlw       xmm6,       1                               ; abs(p1-q1)/2
-
-        movdqa      xmm5,       xmm1                            ; p0
-        movdqa      xmm4,       xmm2                            ; q0
-        psubusb     xmm5,       xmm2                            ; p0-=q0
-        psubusb     xmm4,       xmm1                            ; q0-=p0
-        por         xmm5,       xmm4                            ; abs(p0 - q0)
-        paddusb     xmm5,       xmm5                            ; abs(p0-q0)*2
-        paddusb     xmm5,       xmm6                            ; abs (p0 - q0) *2 + abs(p1-q1)/2
-
-        mov         rdx,        arg(2)                          ;blimit
-        movdqa      xmm7, XMMWORD PTR [rdx]
-
-        psubusb     xmm5,        xmm7                           ; abs(p0 - q0) *2 + abs(p1-q1)/2  > blimit
-        pxor        xmm7,        xmm7
-        pcmpeqb     xmm5,        xmm7                           ; mm5 = mask
-
-        ; start work on filters
-        movdqa        t0,        xmm0
-        movdqa        t1,        xmm3
-
-        pxor        xmm0,        [GLOBAL(t80)]                  ; p1 offset to convert to signed values
-        pxor        xmm3,        [GLOBAL(t80)]                  ; q1 offset to convert to signed values
-
-        psubsb      xmm0,        xmm3                           ; p1 - q1
-        movdqa      xmm6,        xmm1                           ; p0
-
-        movdqa      xmm7,        xmm2                           ; q0
-        pxor        xmm6,        [GLOBAL(t80)]                  ; offset to convert to signed values
-
-        pxor        xmm7,        [GLOBAL(t80)]                  ; offset to convert to signed values
-        movdqa      xmm3,        xmm7                           ; offseted ; q0
-
-        psubsb      xmm7,        xmm6                           ; q0 - p0
-        paddsb      xmm0,        xmm7                           ; p1 - q1 + 1 * (q0 - p0)
-
-        paddsb      xmm0,        xmm7                           ; p1 - q1 + 2 * (q0 - p0)
-        paddsb      xmm0,        xmm7                           ; p1 - q1 + 3 * (q0 - p0)
-
-        pand        xmm5,        xmm0                           ; mask filter values we don't care about
-
-
-        paddsb      xmm5,        [GLOBAL(t4)]                   ;  3* (q0 - p0) + (p1 - q1) + 4
-
-        movdqa      xmm0,        xmm5                           ; get a copy of filters
-        psllw       xmm0,        8                              ; shift left 8
-
-        psraw       xmm0,        3                              ; arithmetic shift right 11
-        psrlw       xmm0,        8
-
-        movdqa      xmm7,        xmm5                           ; get a copy of filters
-        psraw       xmm7,        11                             ; arithmetic shift right 11
-
-        psllw       xmm7,        8                              ; shift left 8 to put it back
-        por         xmm0,        xmm7                           ; put the two together to get result
-
-        psubsb      xmm3,        xmm0                           ; q0-= q0sz add
-        pxor        xmm3,        [GLOBAL(t80)]                  ; unoffset   q0
-
-        ; now do +3 side
-        psubsb      xmm5,        [GLOBAL(t1s)]                  ; +3 instead of +4
-        movdqa      xmm0,        xmm5                           ; get a copy of filters
-
-        psllw       xmm0,        8                              ; shift left 8
-        psraw       xmm0,        3                              ; arithmetic shift right 11
-
-        psrlw       xmm0,        8
-        psraw       xmm5,        11                             ; arithmetic shift right 11
-
-        psllw       xmm5,        8                              ; shift left 8 to put it back
-        por         xmm0,        xmm5                           ; put the two together to get result
-
-        paddsb      xmm6,        xmm0                           ; p0+= p0 add
-        pxor        xmm6,        [GLOBAL(t80)]                  ; unoffset   p0
-
-        movdqa      xmm0,        t0                             ; p1
-        movdqa      xmm4,        t1                             ; q1
-
-        ; transpose back to write out
-        ; p1  f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00
-        ; p0  f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 31 21 11 01
-        ; q0  f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
-        ; q1  f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03
-        movdqa      xmm1,       xmm0
-        punpcklbw   xmm0,       xmm6                               ; 71 70 61 60 51 50 41 40 31 30 21 20 11 10 01 00
-        punpckhbw   xmm1,       xmm6                               ; f1 f0 e1 e0 d1 d0 c1 c0 b1 b0 a1 a0 91 90 81 80
-
-        movdqa      xmm5,       xmm3
-        punpcklbw   xmm3,       xmm4                               ; 73 72 63 62 53 52 43 42 33 32 23 22 13 12 03 02
-        punpckhbw   xmm5,       xmm4                               ; f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82
-
-        movdqa      xmm2,       xmm0
-        punpcklwd   xmm0,       xmm3                               ; 33 32 31 30 23 22 21 20 13 12 11 10 03 02 01 00
-        punpckhwd   xmm2,       xmm3                               ; 73 72 71 70 63 62 61 60 53 52 51 50 43 42 41 40
-
-        movdqa      xmm3,       xmm1
-        punpcklwd   xmm1,       xmm5                               ; b3 b2 b1 b0 a3 a2 a1 a0 93 92 91 90 83 82 81 80
-        punpckhwd   xmm3,       xmm5                               ; f3 f2 f1 f0 e3 e2 e1 e0 d3 d2 d1 d0 c3 c2 c1 c0
-
-        ; write out order: xmm0 xmm2 xmm1 xmm3
-        lea         rdx,        [rsi + rax*4]
-
-        movd        [rsi],      xmm1                               ; write the second 8-line result
-        psrldq      xmm1,       4
-        movd        [rdi],      xmm1
-        psrldq      xmm1,       4
-        movd        [rsi + rax*2], xmm1
-        psrldq      xmm1,       4
-        movd        [rdi + rax*2], xmm1
-
-        movd        [rdx],      xmm3
-        psrldq      xmm3,       4
-        movd        [rcx],      xmm3
-        psrldq      xmm3,       4
-        movd        [rdx + rax*2], xmm3
-        psrldq      xmm3,       4
-        movd        [rcx + rax*2], xmm3
-
-        neg         rax
-        lea         rsi,        [rsi + rax*8]
-        neg         rax
-        lea         rdi,        [rsi + rax]
-        lea         rdx,        [rsi + rax*4]
-        lea         rcx,        [rdx + rax]
-
-        movd        [rsi],      xmm0                                ; write the first 8-line result
-        psrldq      xmm0,       4
-        movd        [rdi],      xmm0
-        psrldq      xmm0,       4
-        movd        [rsi + rax*2], xmm0
-        psrldq      xmm0,       4
-        movd        [rdi + rax*2], xmm0
-
-        movd        [rdx],      xmm2
-        psrldq      xmm2,       4
-        movd        [rcx],      xmm2
-        psrldq      xmm2,       4
-        movd        [rdx + rax*2], xmm2
-        psrldq      xmm2,       4
-        movd        [rcx + rax*2], xmm2
-
-    add rsp, 32
-    pop rsp
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
 SECTION_RODATA
 align 16
 tfe:
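
The vertical-edge variant removed here uses the same filter arithmetic; the
bulk of the code is the gather/transpose on entry and the transpose/scatter
on exit, so that sixteen rows of p1 p0 | q0 q1 columns can be run through the
horizontal kernel in one pass. In scalar terms (a sketch that reuses
simple_filter_sketch from the note after the MMX file and <stdlib.h> abs):

    /* Scalar view of the removed vertical-edge routine: filter the four
     * pixels straddling the vertical edge in each of 16 rows. The SSE2 code
     * transposes so all rows are filtered simultaneously. */
    static void simple_vertical_edge_sketch(unsigned char *src, int stride,
                                            int blimit) {
      int r;
      for (r = 0; r < 16; ++r, src += stride) {
        unsigned char *px = src - 2;          /* px[0..3] = p1 p0 q0 q1 */
        const int ok =
            abs(px[1] - px[2]) * 2 + abs(px[0] - px[3]) / 2 <= blimit;
        simple_filter_sketch(&px[1], &px[2], px[0], px[3], ok);
      }
    }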
diff --git a/vp9/common/x86/vp9_loopfilter_x86.h b/vp9/common/x86/vp9_loopfilter_x86.h
index 46a6202..fb5af05 100644
--- a/vp9/common/x86/vp9_loopfilter_x86.h
+++ b/vp9/common/x86/vp9_loopfilter_x86.h
@@ -23,10 +23,6 @@
 extern prototype_loopfilter_block(vp9_loop_filter_bv_mmx);
 extern prototype_loopfilter_block(vp9_loop_filter_mbh_mmx);
 extern prototype_loopfilter_block(vp9_loop_filter_bh_mmx);
-extern prototype_simple_loopfilter(vp9_loop_filter_simple_vertical_edge_mmx);
-extern prototype_simple_loopfilter(vp9_loop_filter_bvs_mmx);
-extern prototype_simple_loopfilter(vp9_loop_filter_simple_horizontal_edge_mmx);
-extern prototype_simple_loopfilter(vp9_loop_filter_bhs_mmx);
 #endif
 
 #if HAVE_SSE2
@@ -34,10 +30,6 @@
 extern prototype_loopfilter_block(vp9_loop_filter_bv_sse2);
 extern prototype_loopfilter_block(vp9_loop_filter_mbh_sse2);
 extern prototype_loopfilter_block(vp9_loop_filter_bh_sse2);
-extern prototype_simple_loopfilter(vp9_loop_filter_simple_vertical_edge_sse2);
-extern prototype_simple_loopfilter(vp9_loop_filter_bvs_sse2);
-extern prototype_simple_loopfilter(vp9_loop_filter_simple_horizontal_edge_sse2);
-extern prototype_simple_loopfilter(vp9_loop_filter_bhs_sse2);
 #endif
 
 #endif  // LOOPFILTER_X86_H
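
The pruned declarations go through the prototype_simple_loopfilter macro from
the common loopfilter header; judging from the rtcd prototypes removed above,
its expansion is presumably:

    /* Presumed expansion (check vp9/common/vp9_loopfilter.h; inferred from
     * the "uint8_t *y, int ystride, const uint8_t *blimit" rtcd signature). */
    #define prototype_simple_loopfilter(sym) \
      void sym(unsigned char *y, int ystride, const unsigned char *blimit)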
diff --git a/vp9/decoder/vp9_decodemv.c b/vp9/decoder/vp9_decodemv.c
index a1f780a..46d21b9 100644
--- a/vp9/decoder/vp9_decodemv.c
+++ b/vp9/decoder/vp9_decodemv.c
@@ -119,13 +119,25 @@
     m->mbmi.mb_skip_coeff = vp9_read(r, vp9_get_pred_prob(cm, xd, PRED_MBSKIP));
 
   // luma mode
+#if CONFIG_AB4X4
+  if (m->mbmi.sb_type >= BLOCK_SIZE_SB8X8)
+    m->mbmi.mode = read_kf_sb_ymode(r,
+                     cm->sb_kf_ymode_prob[cm->kf_ymode_probs_index]);
+  else
+    m->mbmi.mode = I4X4_PRED;
+#else
   m->mbmi.mode = m->mbmi.sb_type > BLOCK_SIZE_SB8X8 ?
       read_kf_sb_ymode(r, cm->sb_kf_ymode_prob[cm->kf_ymode_probs_index]):
       read_kf_mb_ymode(r, cm->kf_ymode_prob[cm->kf_ymode_probs_index]);
+#endif
 
   m->mbmi.ref_frame = INTRA_FRAME;
 
+#if CONFIG_AB4X4
+  if (m->mbmi.sb_type < BLOCK_SIZE_SB8X8) {
+#else
   if (m->mbmi.mode == I4X4_PRED) {
+#endif
     int i;
     for (i = 0; i < 4; ++i) {
       const B_PREDICTION_MODE a = above_block_mode(m, i, mis);
@@ -139,7 +151,13 @@
   m->mbmi.uv_mode = read_uv_mode(r, cm->kf_uv_mode_prob[m->mbmi.mode]);
 
   if (cm->txfm_mode == TX_MODE_SELECT &&
-      !m->mbmi.mb_skip_coeff && m->mbmi.mode != I4X4_PRED) {
+      !m->mbmi.mb_skip_coeff &&
+#if CONFIG_AB4X4
+      m->mbmi.sb_type >= BLOCK_SIZE_SB8X8
+#else
+      m->mbmi.mode != I4X4_PRED
+#endif
+      ) {
     const int allow_16x16 = m->mbmi.sb_type >= BLOCK_SIZE_MB16X16;
     const int allow_32x32 = m->mbmi.sb_type >= BLOCK_SIZE_SB32X32;
     m->mbmi.txfm_size = select_txfm_size(cm, r, allow_16x16, allow_32x32);
@@ -150,7 +168,13 @@
              m->mbmi.sb_type >= BLOCK_SIZE_MB16X16 &&
              m->mbmi.mode <= TM_PRED) {
     m->mbmi.txfm_size = TX_16X16;
-  } else if (cm->txfm_mode >= ALLOW_8X8 && m->mbmi.mode != I4X4_PRED) {
+  } else if (cm->txfm_mode >= ALLOW_8X8 &&
+#if CONFIG_AB4X4
+             m->mbmi.sb_type >= BLOCK_SIZE_SB8X8
+#else
+             m->mbmi.mode != I4X4_PRED
+#endif
+             ) {
     m->mbmi.txfm_size = TX_8X8;
   } else {
     m->mbmi.txfm_size = TX_4X4;
@@ -618,9 +642,16 @@
       if (vp9_segfeature_active(xd, mbmi->segment_id, SEG_LVL_SKIP)) {
         mbmi->mode = ZEROMV;
       } else {
+#if CONFIG_AB4X4
+        if (mbmi->sb_type >= BLOCK_SIZE_SB8X8)
+          mbmi->mode = read_sb_mv_ref(r, mv_ref_p);
+        else
+          mbmi->mode = SPLITMV;
+#else
         mbmi->mode = mbmi->sb_type > BLOCK_SIZE_SB8X8 ?
                                      read_sb_mv_ref(r, mv_ref_p)
                                    : read_mv_ref(r, mv_ref_p);
+#endif
         vp9_accum_mv_refs(cm, mbmi->mode, mbmi->mb_mode_context[ref_frame]);
       }
 
@@ -820,6 +851,14 @@
     // required for left and above block mv
     mv0->as_int = 0;
 
+#if CONFIG_AB4X4
+    if (mbmi->sb_type >= BLOCK_SIZE_SB8X8) {
+      mbmi->mode = read_sb_ymode(r, cm->fc.sb_ymode_prob);
+      cm->fc.sb_ymode_counts[mbmi->mode]++;
+    } else {
+      mbmi->mode = I4X4_PRED;
+    }
+#else
     if (mbmi->sb_type > BLOCK_SIZE_SB8X8) {
       mbmi->mode = read_sb_ymode(r, cm->fc.sb_ymode_prob);
       cm->fc.sb_ymode_counts[mbmi->mode]++;
@@ -827,9 +866,14 @@
       mbmi->mode = read_ymode(r, cm->fc.ymode_prob);
       cm->fc.ymode_counts[mbmi->mode]++;
     }
+#endif
 
     // If the block uses per-4x4 modes (I4X4_PRED, or sub-8x8 with AB4X4), read them
+#if CONFIG_AB4X4
+    if (mbmi->sb_type < BLOCK_SIZE_SB8X8) {
+#else
     if (mbmi->mode == I4X4_PRED) {
+#endif
       int j = 0;
       do {
         int m = read_bmode(r, cm->fc.bmode_prob);
@@ -842,9 +886,14 @@
     cm->fc.uv_mode_counts[mbmi->mode][mbmi->uv_mode]++;
   }
 
+#if CONFIG_AB4X4
+    if (cm->txfm_mode == TX_MODE_SELECT && mbmi->mb_skip_coeff == 0 &&
+        mbmi->sb_type >= BLOCK_SIZE_SB8X8) {
+#else
   if (cm->txfm_mode == TX_MODE_SELECT && mbmi->mb_skip_coeff == 0 &&
       ((mbmi->ref_frame == INTRA_FRAME && mbmi->mode != I4X4_PRED) ||
        (mbmi->ref_frame != INTRA_FRAME && mbmi->mode != SPLITMV))) {
+#endif
     const int allow_16x16 = mbmi->sb_type >= BLOCK_SIZE_MB16X16;
     const int allow_32x32 = mbmi->sb_type >= BLOCK_SIZE_SB32X32;
     mbmi->txfm_size = select_txfm_size(cm, r, allow_16x16, allow_32x32);
@@ -852,13 +901,21 @@
              cm->txfm_mode >= ALLOW_32X32) {
     mbmi->txfm_size = TX_32X32;
   } else if (cm->txfm_mode >= ALLOW_16X16 &&
-             mbmi->sb_type >= BLOCK_SIZE_MB16X16 &&
-      ((mbmi->ref_frame == INTRA_FRAME && mbmi->mode <= TM_PRED) ||
-       (mbmi->ref_frame != INTRA_FRAME && mbmi->mode != SPLITMV))) {
+             mbmi->sb_type >= BLOCK_SIZE_MB16X16
+#if !CONFIG_AB4X4
+      && ((mbmi->ref_frame == INTRA_FRAME && mbmi->mode <= TM_PRED) ||
+       (mbmi->ref_frame != INTRA_FRAME && mbmi->mode != SPLITMV))
+#endif
+       ) {
     mbmi->txfm_size = TX_16X16;
   } else if (cm->txfm_mode >= ALLOW_8X8 &&
+#if CONFIG_AB4X4
+      (mbmi->sb_type >= BLOCK_SIZE_SB8X8))
+#else
       (!(mbmi->ref_frame == INTRA_FRAME && mbmi->mode == I4X4_PRED) &&
-       !(mbmi->ref_frame != INTRA_FRAME && mbmi->mode == SPLITMV))) {
+       !(mbmi->ref_frame != INTRA_FRAME && mbmi->mode == SPLITMV)))
+#endif
+  {
     mbmi->txfm_size = TX_8X8;
   } else {
     mbmi->txfm_size = TX_4X4;
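
The recurring pattern in the vp9_decodemv.c hunks above: with CONFIG_AB4X4,
the gate for reading per-4x4 block modes (and for transform-size coding)
changes from "which mode was coded" (I4X4_PRED / SPLITMV) to "is the
partition sub-8x8". A condensed sketch of that predicate, using a helper
name of our own; the patch itself keeps the tests inline, and the types and
enums below come from the codec's own headers:

  /* Sketch only: uses_per4x4_modes() is not a function in this patch. */
  static INLINE int uses_per4x4_modes(BLOCK_SIZE_TYPE sb_type,
                                      MB_PREDICTION_MODE mode) {
  #if CONFIG_AB4X4
    (void)mode;                          /* the block size implies the split */
    return sb_type < BLOCK_SIZE_SB8X8;
  #else
    return mode == I4X4_PRED || mode == SPLITMV;  /* the mode signals it */
  #endif
  }
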
diff --git a/vp9/decoder/vp9_decodframe.c b/vp9/decoder/vp9_decodframe.c
index 4be3677..2e233c3 100644
--- a/vp9/decoder/vp9_decodframe.c
+++ b/vp9/decoder/vp9_decodframe.c
@@ -417,10 +417,14 @@
   vp9_decode_mb_mode_mv(pbi, xd, mi_row, mi_col, r);
   set_refs(pbi, mi_row, mi_col);
 
+#if CONFIG_AB4X4
+  if (bsize < BLOCK_SIZE_SB8X8)
+#else
   if (bsize == BLOCK_SIZE_SB8X8 &&
       (xd->mode_info_context->mbmi.mode == SPLITMV ||
        xd->mode_info_context->mbmi.mode == I4X4_PRED))
-    decode_atom(pbi, xd, mi_row, mi_col, r, bsize);
+#endif
+    decode_atom(pbi, xd, mi_row, mi_col, r, BLOCK_SIZE_SB8X8);
   else
     decode_sb(pbi, xd, mi_row, mi_col, r, bsize);
 
@@ -439,7 +443,17 @@
   if (mi_row >= pc->mi_rows || mi_col >= pc->mi_cols)
     return;
 
+#if CONFIG_AB4X4
+  if (bsize < BLOCK_SIZE_SB8X8)
+    if (xd->ab_index != 0)
+      return;
+#endif
+
+#if CONFIG_AB4X4
+  if (bsize >= BLOCK_SIZE_SB8X8) {
+#else
   if (bsize > BLOCK_SIZE_SB8X8) {
+#endif
     int pl;
     // read the partition information
     xd->left_seg_context = pc->left_seg_context + (mi_row & MI_MASK);
@@ -451,6 +465,7 @@
   }
 
   subsize = get_subsize(bsize, partition);
+
   switch (partition) {
     case PARTITION_NONE:
       decode_modes_b(pbi, mi_row, mi_col, r, subsize);
@@ -468,12 +483,7 @@
     case PARTITION_SPLIT:
       for (n = 0; n < 4; n++) {
         int j = n >> 1, i = n & 0x01;
-        if (subsize == BLOCK_SIZE_SB32X32)
-          xd->sb_index = n;
-        else if (subsize == BLOCK_SIZE_MB16X16)
-          xd->mb_index = n;
-        else
-          xd->b_index = n;
+        *(get_sb_index(xd, subsize)) = n;
         decode_modes_sb(pbi, mi_row + j * bs, mi_col + i * bs, r, subsize);
       }
       break;
@@ -481,12 +491,16 @@
       assert(0);
   }
   // update partition context
-  if ((partition == PARTITION_SPLIT) && (bsize > BLOCK_SIZE_MB16X16))
-    return;
-
-  xd->left_seg_context = pc->left_seg_context + (mi_row & MI_MASK);
-  xd->above_seg_context = pc->above_seg_context + mi_col;
-  update_partition_context(xd, subsize, bsize);
+#if CONFIG_AB4X4
+  if (bsize >= BLOCK_SIZE_SB8X8 &&
+      (bsize == BLOCK_SIZE_SB8X8 || partition != PARTITION_SPLIT)) {
+#else
+  if (bsize > BLOCK_SIZE_SB8X8 &&
+      (bsize == BLOCK_SIZE_MB16X16 || partition != PARTITION_SPLIT)) {
+#endif
+    set_partition_seg_context(pc, xd, mi_row, mi_col);
+    update_partition_context(xd, subsize, bsize);
+  }
 }
 
 static void setup_token_decoder(VP9D_COMP *pbi,
@@ -811,12 +825,12 @@
   int mi_row, mi_col;
 
   for (mi_row = pc->cur_tile_mi_row_start;
-       mi_row < pc->cur_tile_mi_row_end; mi_row += 8) {
+       mi_row < pc->cur_tile_mi_row_end; mi_row += 64 / MI_SIZE) {
     // For a SB there are 2 left contexts, each pertaining to an MB row within the SB
     vpx_memset(&pc->left_context, 0, sizeof(pc->left_context));
     vpx_memset(pc->left_seg_context, 0, sizeof(pc->left_seg_context));
     for (mi_col = pc->cur_tile_mi_col_start;
-         mi_col < pc->cur_tile_mi_col_end; mi_col += 8)
+         mi_col < pc->cur_tile_mi_col_end; mi_col += 64 / MI_SIZE)
       decode_modes_sb(pbi, mi_row, mi_col, r, BLOCK_SIZE_SB64X64);
   }
 }
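
Two notes on the decode_modes_sb()/decode_tiles() hunks above. First, the
tile loops now step by 64 / MI_SIZE, i.e. one 64x64 superblock expressed in
mode-info units, instead of a hard-coded 8. Second, the per-level child
index is written through get_sb_index() rather than a three-way branch on
subsize; with CONFIG_AB4X4 a fourth level, xd->ab_index, also exists for
sub-8x8 partitions. A plausible shape for the helper; its real definition
lives in the common headers, outside this diff:

  /* Assumed shape of get_sb_index(): map the subsize being coded to the
   * MACROBLOCKD field tracking the child index at that recursion level. */
  static INLINE int *get_sb_index(MACROBLOCKD *xd, BLOCK_SIZE_TYPE subsize) {
    switch (subsize) {
      case BLOCK_SIZE_SB32X32:
        return &xd->sb_index;  /* child of a 64x64 superblock */
      case BLOCK_SIZE_MB16X16:
        return &xd->mb_index;  /* child of a 32x32 block */
      default:
        return &xd->b_index;   /* child of a 16x16 block */
    }
  }
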
@@ -1007,23 +1021,13 @@
     // Select active reference frames and calculate scaling factors
     for (i = 0; i < ALLOWED_REFS_PER_FRAME; ++i) {
       const int ref = vp9_read_literal(&header_bc, NUM_REF_FRAMES_LG2);
-      const int mapped_ref = pc->ref_frame_map[ref];
-      YV12_BUFFER_CONFIG *const fb = &pc->yv12_fb[mapped_ref];
-      struct scale_factors *const sf = &pc->active_ref_scale[i];
-
-      pc->active_ref_idx[i] = mapped_ref;
-      if (mapped_ref >= NUM_YV12_BUFFERS)
-        memset(sf, 0, sizeof(*sf));
-      else
-        vp9_setup_scale_factors_for_frame(sf,
-                                          fb->y_crop_width, fb->y_crop_height,
-                                          pc->width, pc->height);
+      pc->active_ref_idx[i] = pc->ref_frame_map[ref];
+      vp9_setup_scale_factors(pc, i);
     }
 
     // Read the sign bias for each reference frame buffer.
-    for (i = 0; i < ALLOWED_REFS_PER_FRAME; ++i) {
+    for (i = 0; i < ALLOWED_REFS_PER_FRAME; ++i)
       pc->ref_frame_sign_bias[i + 1] = vp9_read_bit(&header_bc);
-    }
 
     xd->allow_high_precision_mv = vp9_read_bit(&header_bc);
     pc->mcomp_filter_type = read_mcomp_filter_type(&header_bc);
@@ -1105,8 +1109,8 @@
 
     if (pc->frame_type != KEY_FRAME) {
       vp9_adapt_mode_probs(pc);
-      vp9_adapt_nmv_probs(pc, xd->allow_high_precision_mv);
       vp9_adapt_mode_context(pc);
+      vp9_adapt_nmv_probs(pc, xd->allow_high_precision_mv);
     }
   }
 
diff --git a/vp9/decoder/vp9_decodframe.h b/vp9/decoder/vp9_decodframe.h
index 3aaae65..00b6d67 100644
--- a/vp9/decoder/vp9_decodframe.h
+++ b/vp9/decoder/vp9_decodframe.h
@@ -13,7 +13,9 @@
 #define VP9_DECODER_VP9_DECODFRAME_H_
 
 struct VP9Common;
+struct VP9Decompressor;
 
 void vp9_init_dequantizer(struct VP9Common *pc);
+int vp9_decode_frame(struct VP9Decompressor *cpi, const uint8_t **p_data_end);
 
 #endif  // VP9_DECODER_VP9_DECODFRAME_H_
diff --git a/vp9/decoder/vp9_detokenize.c b/vp9/decoder/vp9_detokenize.c
index ce2a86b..22d3cf8 100644
--- a/vp9/decoder/vp9_detokenize.c
+++ b/vp9/decoder/vp9_detokenize.c
@@ -62,7 +62,7 @@
 
 #define INCREMENT_COUNT(token)               \
   do {                                       \
-    coef_counts[type][ref][get_coef_band(scan, txfm_size, c)] \
+    coef_counts[type][ref][band] \
                [pt][token]++;     \
     token_cache[scan[c]] = token; \
   } while (0)
@@ -76,12 +76,6 @@
     continue;                                            \
   }
 
-#define WRITE_COEF_ONE()                                 \
-{                                                        \
-  qcoeff_ptr[scan[c]] = vp9_read_and_apply_sign(br, 1);  \
-  INCREMENT_COUNT(ONE_TOKEN);                            \
-}
-
 #define ADJUST_COEF(prob, bits_count)  \
   do {                                 \
     if (vp9_read(r, prob))             \
@@ -96,6 +90,7 @@
   ENTROPY_CONTEXT above_ec, left_ec;
   FRAME_CONTEXT *const fc = &dx->common.fc;
   int pt, c = 0, pad, default_eob;
+  int band;
   vp9_coeff_probs *coef_probs;
   vp9_prob *prob;
   vp9_coeff_count *coef_counts;
@@ -103,6 +98,7 @@
   TX_TYPE tx_type = DCT_DCT;
   const int *scan, *nb;
   uint8_t token_cache[1024];
+  const uint8_t *band_translate;
 
   switch (txfm_size) {
     default:
@@ -115,6 +111,7 @@
       coef_probs  = fc->coef_probs_4x4;
       coef_counts = fc->coef_counts_4x4;
       default_eob = 16;
+      band_translate = vp9_coefband_trans_4x4;
       break;
     }
     case TX_8X8: {
@@ -130,6 +127,7 @@
       above_ec = (A[0] + A[1]) != 0;
       left_ec = (L[0] + L[1]) != 0;
       default_eob = 64;
+      band_translate = vp9_coefband_trans_8x8plus;
       break;
     }
     case TX_16X16: {
@@ -145,6 +143,7 @@
       above_ec = (A[0] + A[1] + A[2] + A[3]) != 0;
       left_ec = (L[0] + L[1] + L[2] + L[3]) != 0;
       default_eob = 256;
+      band_translate = vp9_coefband_trans_8x8plus;
       break;
     }
     case TX_32X32:
@@ -154,6 +153,7 @@
       above_ec = (A[0] + A[1] + A[2] + A[3] + A[4] + A[5] + A[6] + A[7]) != 0;
       left_ec = (L[0] + L[1] + L[2] + L[3] + L[4] + L[5] + L[6] + L[7]) != 0;
       default_eob = 1024;
+      band_translate = vp9_coefband_trans_8x8plus;
       break;
   }
 
@@ -162,14 +162,13 @@
 
   while (1) {
     int val;
-    int band;
     const uint8_t *cat6 = cat6_prob;
     if (c >= seg_eob)
       break;
     if (c)
       pt = vp9_get_coef_context(scan, nb, pad, token_cache,
                                 c, default_eob);
-    band = get_coef_band(scan, txfm_size, c);
+    band = get_coef_band(band_translate, c);
     prob = coef_probs[type][ref][band][pt];
     fc->eob_branch_counts[txfm_size][type][ref][band][pt]++;
     if (!vp9_read(r, prob[EOB_CONTEXT_NODE]))
@@ -181,8 +180,9 @@
     if (c)
       pt = vp9_get_coef_context(scan, nb, pad, token_cache,
                                 c, default_eob);
-    band = get_coef_band(scan, txfm_size, c);
+    band = get_coef_band(band_translate, c);
     prob = coef_probs[type][ref][band][pt];
+
     if (!vp9_read(r, prob[ZERO_CONTEXT_NODE])) {
       INCREMENT_COUNT(ZERO_TOKEN);
       ++c;
@@ -249,8 +249,7 @@
   }
 
   if (c < seg_eob)
-    coef_counts[type][ref][get_coef_band(scan, txfm_size, c)]
-        [pt][DCT_EOB_TOKEN]++;
+    coef_counts[type][ref][band][pt][DCT_EOB_TOKEN]++;
 
   for (pt = 0; pt < (1 << txfm_size); pt++) {
     A[pt] = L[pt] = c > 0;
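
The detokenizer change above replaces get_coef_band(scan, txfm_size, c),
recomputed on every token, with a band_translate table selected once per
transform size and indexed by the coefficient count. Assuming the
vp9_coefband_trans_* arrays are plain position-to-band maps, the per-token
work reduces to a single load; the table contents below are illustrative,
not copied from vp9_entropy.c:

  #include <stdint.h>

  /* Illustrative position-to-band map for a 4x4 transform (16 positions). */
  static const uint8_t coefband_trans_4x4_example[16] = {
    0, 1, 1, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 5, 5, 5
  };

  /* Matching the new call sites: get_coef_band(band_translate, c). */
  static int get_coef_band(const uint8_t *band_translate, int coef_index) {
    return band_translate[coef_index];
  }
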
diff --git a/vp9/decoder/vp9_idct_blk.c b/vp9/decoder/vp9_idct_blk.c
index 3480df2..bc943fa 100644
--- a/vp9/decoder/vp9_idct_blk.c
+++ b/vp9/decoder/vp9_idct_blk.c
@@ -105,14 +105,6 @@
   add_residual(diff, dest, stride, 8, 8);
 }
 
-void vp9_add_residual_16x16_c(const int16_t *diff, uint8_t *dest, int stride) {
-  add_residual(diff, dest, stride, 16, 16);
-}
-
-void vp9_add_residual_32x32_c(const int16_t *diff, uint8_t *dest, int stride) {
-  add_residual(diff, dest, stride, 32, 32);
-}
-
 static void add_constant_residual(const int16_t diff, uint8_t *dest, int stride,
                                   int width, int height) {
   int r, c;
@@ -264,19 +256,14 @@
   if (tx_type == DCT_DCT) {
     vp9_idct_add_16x16(input, dest, stride, eob);
   } else {
-    DECLARE_ALIGNED_ARRAY(16, int16_t, output, 256);
-
     if (eob > 0) {
-      vp9_short_iht16x16(input, output, 16, tx_type);
+      vp9_short_iht16x16_add(input, dest, stride, tx_type);
       vpx_memset(input, 0, 512);
-      vp9_add_residual_16x16(output, dest, stride);
     }
   }
 }
 
 void vp9_idct_add_16x16_c(int16_t *input, uint8_t *dest, int stride, int eob) {
-  DECLARE_ALIGNED_ARRAY(16, int16_t, output, 256);
-
   /* The calculation can be simplified if there are not many non-zero dct
    * coefficients. Use eobs to separate different cases. */
   if (eob) {
@@ -292,21 +279,15 @@
       vp9_add_constant_residual_16x16(out, dest, stride);
 #if !CONFIG_SCATTERSCAN
     } else if (eob <= 10) {
-      // the idct halves ( >> 1) the pitch
-      vp9_short_idct10_16x16(input, output, 32);
-
+      vp9_short_idct10_16x16_add(input, dest, stride);
       input[0] = input[1] = input[2] = input[3] = 0;
       input[16] = input[17] = input[18] = 0;
       input[32] = input[33] = 0;
       input[48] = 0;
-
-      vp9_add_residual_16x16(output, dest, stride);
 #endif
     } else {
-      // the idct halves ( >> 1) the pitch
-      vp9_short_idct16x16(input, output, 16 << 1);
+      vp9_short_idct16x16_add(input, dest, stride);
       vpx_memset(input, 0, 512);
-      vp9_add_residual_16x16(output, dest, stride);
     }
   }
 }
@@ -321,20 +302,16 @@
       input[0] = 0;
 #if !CONFIG_SCATTERSCAN
     } else if (eob <= 10) {
-      // the idct halves ( >> 1) the pitch
-      vp9_short_idct10_32x32(input, output, 64);
-
+      vp9_short_idct10_32x32_add_c(input, dest, stride);
       input[0] = input[1] = input[2] = input[3] = 0;
       input[32] = input[33] = input[34] = 0;
       input[64] = input[65] = 0;
       input[96] = 0;
 
-      vp9_add_residual_32x32(output, dest, stride);
 #endif
     } else {
-      vp9_short_idct32x32(input, output, 64);
+      vp9_short_idct32x32_add(input, dest, stride);
       vpx_memset(input, 0, 2048);
-      vp9_add_residual_32x32(output, dest, stride);
     }
   }
 }
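
Net effect of the vp9_idct_blk.c hunks: 16x16 and 32x32 reconstruction now
uses fused *_add inverse transforms that accumulate straight into the
uint8_t destination, so the intermediate int16_t buffer and the separate
vp9_add_residual_{16x16,32x32} pass disappear. Per pixel, every variant
ends in the same saturating add. A minimal scalar sketch, mirroring the
add_residual() kept earlier in this file; clip_pixel() is our stand-in
name for the clamp:

  #include <stdint.h>

  static uint8_t clip_pixel(int val) {  /* clamp to [0, 255] */
    return (uint8_t)(val > 255 ? 255 : (val < 0 ? 0 : val));
  }

  /* What a fused idct*_add ultimately does per output pixel: add the
   * inverse-transformed residual into the prediction already in dest. */
  static void add_residual(const int16_t *diff, uint8_t *dest, int stride,
                           int width, int height) {
    int r, c;
    for (r = 0; r < height; r++) {
      for (c = 0; c < width; c++)
        dest[c] = clip_pixel(dest[c] + diff[c]);
      dest += stride;
      diff += width;
    }
  }
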
diff --git a/vp9/decoder/vp9_onyxd_int.h b/vp9/decoder/vp9_onyxd_int.h
index a7d444e..8698570 100644
--- a/vp9/decoder/vp9_onyxd_int.h
+++ b/vp9/decoder/vp9_onyxd_int.h
@@ -41,8 +41,6 @@
   int initial_height;
 } VP9D_COMP;
 
-int vp9_decode_frame(VP9D_COMP *cpi, const uint8_t **p_data_end);
-
 
 #if CONFIG_DEBUG
 #define CHECK_MEM_ERROR(lval,expr) do {\
diff --git a/vp9/decoder/x86/vp9_dequantize_sse2.c b/vp9/decoder/x86/vp9_dequantize_sse2.c
index 1296b70..796fc12 100644
--- a/vp9/decoder/x86/vp9_dequantize_sse2.c
+++ b/vp9/decoder/x86/vp9_dequantize_sse2.c
@@ -122,124 +122,6 @@
   _mm_storel_epi64((__m128i *)(dest + 7 * stride), p6);
 }
 
-void vp9_add_residual_16x16_sse2(const int16_t *diff, uint8_t *dest,
-                                 int stride) {
-  const int width = 16;
-  int i = 4;
-  const __m128i zero = _mm_setzero_si128();
-
-  // Diff data
-  __m128i d0, d1, d2, d3, d4, d5, d6, d7;
-  __m128i p0, p1, p2, p3, p4, p5, p6, p7;
-
-  do {
-    d0 = _mm_load_si128((const __m128i *)(diff + 0 * width));
-    d1 = _mm_load_si128((const __m128i *)(diff + 0 * width + 8));
-    d2 = _mm_load_si128((const __m128i *)(diff + 1 * width));
-    d3 = _mm_load_si128((const __m128i *)(diff + 1 * width + 8));
-    d4 = _mm_load_si128((const __m128i *)(diff + 2 * width));
-    d5 = _mm_load_si128((const __m128i *)(diff + 2 * width + 8));
-    d6 = _mm_load_si128((const __m128i *)(diff + 3 * width));
-    d7 = _mm_load_si128((const __m128i *)(diff + 3 * width + 8));
-
-    // Prediction data.
-    p1 = _mm_load_si128((const __m128i *)(dest + 0 * stride));
-    p3 = _mm_load_si128((const __m128i *)(dest + 1 * stride));
-    p5 = _mm_load_si128((const __m128i *)(dest + 2 * stride));
-    p7 = _mm_load_si128((const __m128i *)(dest + 3 * stride));
-
-    p0 = _mm_unpacklo_epi8(p1, zero);
-    p1 = _mm_unpackhi_epi8(p1, zero);
-    p2 = _mm_unpacklo_epi8(p3, zero);
-    p3 = _mm_unpackhi_epi8(p3, zero);
-    p4 = _mm_unpacklo_epi8(p5, zero);
-    p5 = _mm_unpackhi_epi8(p5, zero);
-    p6 = _mm_unpacklo_epi8(p7, zero);
-    p7 = _mm_unpackhi_epi8(p7, zero);
-
-    p0 = _mm_add_epi16(p0, d0);
-    p1 = _mm_add_epi16(p1, d1);
-    p2 = _mm_add_epi16(p2, d2);
-    p3 = _mm_add_epi16(p3, d3);
-    p4 = _mm_add_epi16(p4, d4);
-    p5 = _mm_add_epi16(p5, d5);
-    p6 = _mm_add_epi16(p6, d6);
-    p7 = _mm_add_epi16(p7, d7);
-
-    p0 = _mm_packus_epi16(p0, p1);
-    p1 = _mm_packus_epi16(p2, p3);
-    p2 = _mm_packus_epi16(p4, p5);
-    p3 = _mm_packus_epi16(p6, p7);
-
-    _mm_store_si128((__m128i *)(dest + 0 * stride), p0);
-    _mm_store_si128((__m128i *)(dest + 1 * stride), p1);
-    _mm_store_si128((__m128i *)(dest + 2 * stride), p2);
-    _mm_store_si128((__m128i *)(dest + 3 * stride), p3);
-
-    diff += 4 * width;
-    dest += 4 * stride;
-  } while (--i);
-}
-
-void vp9_add_residual_32x32_sse2(const int16_t *diff, uint8_t *dest,
-                                 int stride) {
-  const int width = 32;
-  int i = 16;
-  const __m128i zero = _mm_setzero_si128();
-
-  // Diff data
-  __m128i d0, d1, d2, d3, d4, d5, d6, d7;
-  __m128i p0, p1, p2, p3, p4, p5, p6, p7;
-
-  do {
-    d0 = _mm_load_si128((const __m128i *)(diff + 0 * width));
-    d1 = _mm_load_si128((const __m128i *)(diff + 0 * width + 8));
-    d2 = _mm_load_si128((const __m128i *)(diff + 0 * width + 16));
-    d3 = _mm_load_si128((const __m128i *)(diff + 0 * width + 24));
-    d4 = _mm_load_si128((const __m128i *)(diff + 1 * width));
-    d5 = _mm_load_si128((const __m128i *)(diff + 1 * width + 8));
-    d6 = _mm_load_si128((const __m128i *)(diff + 1 * width + 16));
-    d7 = _mm_load_si128((const __m128i *)(diff + 1 * width + 24));
-
-    // Prediction data.
-    p1 = _mm_load_si128((const __m128i *)(dest + 0 * stride));
-    p3 = _mm_load_si128((const __m128i *)(dest + 0 * stride + 16));
-    p5 = _mm_load_si128((const __m128i *)(dest + 1 * stride));
-    p7 = _mm_load_si128((const __m128i *)(dest + 1 * stride + 16));
-
-    p0 = _mm_unpacklo_epi8(p1, zero);
-    p1 = _mm_unpackhi_epi8(p1, zero);
-    p2 = _mm_unpacklo_epi8(p3, zero);
-    p3 = _mm_unpackhi_epi8(p3, zero);
-    p4 = _mm_unpacklo_epi8(p5, zero);
-    p5 = _mm_unpackhi_epi8(p5, zero);
-    p6 = _mm_unpacklo_epi8(p7, zero);
-    p7 = _mm_unpackhi_epi8(p7, zero);
-
-    p0 = _mm_add_epi16(p0, d0);
-    p1 = _mm_add_epi16(p1, d1);
-    p2 = _mm_add_epi16(p2, d2);
-    p3 = _mm_add_epi16(p3, d3);
-    p4 = _mm_add_epi16(p4, d4);
-    p5 = _mm_add_epi16(p5, d5);
-    p6 = _mm_add_epi16(p6, d6);
-    p7 = _mm_add_epi16(p7, d7);
-
-    p0 = _mm_packus_epi16(p0, p1);
-    p1 = _mm_packus_epi16(p2, p3);
-    p2 = _mm_packus_epi16(p4, p5);
-    p3 = _mm_packus_epi16(p6, p7);
-
-    _mm_store_si128((__m128i *)(dest + 0 * stride), p0);
-    _mm_store_si128((__m128i *)(dest + 0 * stride + 16), p1);
-    _mm_store_si128((__m128i *)(dest + 1 * stride), p2);
-    _mm_store_si128((__m128i *)(dest + 1 * stride + 16), p3);
-
-    diff += 2 * width;
-    dest += 2 * stride;
-  } while (--i);
-}
-
 void vp9_add_constant_residual_8x8_sse2(const int16_t diff, uint8_t *dest,
                                         int stride) {
   uint8_t abs_diff;
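
For reference, the SIMD idiom the two deleted functions used, and which
their fused replacements presumably inherit: widen the prediction bytes to
16 bits by unpacking against zero, add the residual, and let
_mm_packus_epi16 perform the [0, 255] saturation. One 16-pixel row,
assuming 16-byte-aligned pointers exactly as the deleted code did:

  #include <emmintrin.h>
  #include <stdint.h>

  static void add_residual_row_sse2(const int16_t *diff, uint8_t *dest) {
    const __m128i zero = _mm_setzero_si128();
    const __m128i d0 = _mm_load_si128((const __m128i *)(diff + 0));
    const __m128i d1 = _mm_load_si128((const __m128i *)(diff + 8));
    const __m128i p = _mm_load_si128((const __m128i *)dest);
    __m128i lo = _mm_unpacklo_epi8(p, zero);   /* bytes 0..7  -> words */
    __m128i hi = _mm_unpackhi_epi8(p, zero);   /* bytes 8..15 -> words */
    lo = _mm_add_epi16(lo, d0);
    hi = _mm_add_epi16(hi, d1);
    /* packus saturates each word to [0, 255] while repacking to bytes. */
    _mm_store_si128((__m128i *)dest, _mm_packus_epi16(lo, hi));
  }
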
diff --git a/vp9/encoder/vp9_bitstream.c b/vp9/encoder/vp9_bitstream.c
index da0bb21..3985451 100644
--- a/vp9/encoder/vp9_bitstream.c
+++ b/vp9/encoder/vp9_bitstream.c
@@ -629,12 +629,21 @@
     active_section = 6;
 #endif
 
+#if CONFIG_AB4X4
+    if (m->mbmi.sb_type >= BLOCK_SIZE_SB8X8)
+      write_sb_ymode(bc, mode, pc->fc.sb_ymode_prob);
+#else
     if (m->mbmi.sb_type > BLOCK_SIZE_SB8X8)
       write_sb_ymode(bc, mode, pc->fc.sb_ymode_prob);
     else
       write_ymode(bc, mode, pc->fc.ymode_prob);
+#endif
 
+#if CONFIG_AB4X4
+    if (m->mbmi.sb_type < BLOCK_SIZE_SB8X8) {
+#else
     if (mode == I4X4_PRED) {
+#endif
       int j = 0;
       do {
         write_bmode(bc, m->bmi[j].as_mode.first,
@@ -654,11 +663,16 @@
 
     // If segment skip is not enabled code the mode.
     if (!vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP)) {
+#if CONFIG_AB4X4
+      if (mi->sb_type >= BLOCK_SIZE_SB8X8)
+        write_sb_mv_ref(bc, mode, mv_ref_p);
+#else
       if (mi->sb_type > BLOCK_SIZE_SB8X8) {
         write_sb_mv_ref(bc, mode, mv_ref_p);
       } else {
         write_mv_ref(bc, mode, mv_ref_p);
       }
+#endif
       vp9_accum_mv_refs(&cpi->common, mode, mi->mb_mode_context[rf]);
     }
 
@@ -744,11 +758,20 @@
     }
   }
 
+#if CONFIG_AB4X4
+  if (mi->sb_type >= BLOCK_SIZE_SB8X8 &&
+      pc->txfm_mode == TX_MODE_SELECT &&
+      !(skip_coeff || vp9_segfeature_active(xd, segment_id,
+                                            SEG_LVL_SKIP)))
+#else
   if (((rf == INTRA_FRAME && mode != I4X4_PRED) ||
        (rf != INTRA_FRAME && mode != SPLITMV)) &&
       pc->txfm_mode == TX_MODE_SELECT &&
       !(skip_coeff || vp9_segfeature_active(xd, segment_id,
-                                            SEG_LVL_SKIP))) {
+                                            SEG_LVL_SKIP)))
+#endif
+  {
     TX_SIZE sz = mi->txfm_size;
     // FIXME(rbultje) code ternary symbol once all experiments are merged
     vp9_write(bc, sz != TX_4X4, pc->prob_tx[0]);
@@ -780,12 +803,21 @@
     vp9_write(bc, skip_coeff, vp9_get_pred_prob(c, xd, PRED_MBSKIP));
   }
 
+#if CONFIG_AB4X4
+  if (m->mbmi.sb_type >= BLOCK_SIZE_SB8X8)
+    sb_kfwrite_ymode(bc, ym, c->sb_kf_ymode_prob[c->kf_ymode_probs_index]);
+#else
   if (m->mbmi.sb_type > BLOCK_SIZE_SB8X8)
     sb_kfwrite_ymode(bc, ym, c->sb_kf_ymode_prob[c->kf_ymode_probs_index]);
   else
     kfwrite_ymode(bc, ym, c->kf_ymode_prob[c->kf_ymode_probs_index]);
+#endif
 
+#if CONFIG_AB4X4
+  if (m->mbmi.sb_type < BLOCK_SIZE_SB8X8) {
+#else
   if (ym == I4X4_PRED) {
+#endif
     int i = 0;
     do {
       const B_PREDICTION_MODE a = above_block_mode(m, i, mis);
@@ -803,8 +835,13 @@
 
   write_uv_mode(bc, m->mbmi.uv_mode, c->kf_uv_mode_prob[ym]);
 
+#if CONFIG_AB4X4
+  if (m->mbmi.sb_type >= BLOCK_SIZE_SB8X8 && c->txfm_mode == TX_MODE_SELECT &&
+      !(skip_coeff || vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP))) {
+#else
   if (ym != I4X4_PRED && c->txfm_mode == TX_MODE_SELECT &&
       !(skip_coeff || vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP))) {
+#endif
     TX_SIZE sz = m->mbmi.txfm_size;
     // FIXME(rbultje) code ternary symbol once all experiments are merged
     vp9_write(bc, sz != TX_4X4, c->prob_tx[0]);
@@ -876,7 +913,19 @@
   else
     assert(0);
 
+#if CONFIG_AB4X4
+  if (bsize == BLOCK_SIZE_SB8X8 && m->mbmi.sb_type < BLOCK_SIZE_SB8X8)
+    partition = PARTITION_SPLIT;
+  if (bsize < BLOCK_SIZE_SB8X8)
+    if (xd->ab_index != 0)
+      return;
+#endif
+
+#if CONFIG_AB4X4
+  if (bsize >= BLOCK_SIZE_SB8X8) {
+#else
   if (bsize > BLOCK_SIZE_SB8X8) {
+#endif
     int pl;
     xd->left_seg_context = cm->left_seg_context + (mi_row & MI_MASK);
     xd->above_seg_context = cm->above_seg_context + mi_col;
@@ -905,6 +954,7 @@
     case PARTITION_SPLIT:
       for (n = 0; n < 4; n++) {
         int j = n >> 1, i = n & 0x01;
+        *(get_sb_index(xd, subsize)) = n;
         write_modes_sb(cpi, m + j * bs * mis + i * bs, bc, tok, tok_end,
                        mi_row + j * bs, mi_col + i * bs, subsize);
       }
@@ -914,12 +964,16 @@
   }
 
   // update partition context
-  if ((partition == PARTITION_SPLIT) && (bsize > BLOCK_SIZE_MB16X16))
-    return;
-
-  xd->left_seg_context = cm->left_seg_context + (mi_row & MI_MASK);
-  xd->above_seg_context = cm->above_seg_context + mi_col;
-  update_partition_context(xd, subsize, bsize);
+#if CONFIG_AB4X4
+  if (bsize >= BLOCK_SIZE_SB8X8 &&
+      (bsize == BLOCK_SIZE_SB8X8 || partition != PARTITION_SPLIT)) {
+#else
+  if (bsize > BLOCK_SIZE_SB8X8 &&
+      (bsize == BLOCK_SIZE_MB16X16 || partition != PARTITION_SPLIT)) {
+#endif
+    set_partition_seg_context(cm, xd, mi_row, mi_col);
+    update_partition_context(xd, subsize, bsize);
+  }
 }
 
 static void write_modes(VP9_COMP *cpi, vp9_writer* const bc,
@@ -1242,16 +1296,6 @@
 FILE *vpxlogc = 0;
 #endif
 
-static void put_delta_q(vp9_writer *bc, int delta_q) {
-  if (delta_q != 0) {
-    vp9_write_bit(bc, 1);
-    vp9_write_literal(bc, abs(delta_q), 4);
-    vp9_write_bit(bc, delta_q < 0);
-  } else {
-    vp9_write_bit(bc, 0);
-  }
-}
-
 static void decide_kf_ymode_entropy(VP9_COMP *cpi) {
   int mode_cost[MB_MODE_COUNT];
   int bestcost = INT_MAX;
@@ -1298,9 +1342,21 @@
   }
 }
 
-static void encode_loopfilter(MACROBLOCKD *xd, vp9_writer *w) {
+static void encode_loopfilter(VP9_COMMON *pc, MACROBLOCKD *xd, vp9_writer *w) {
   int i;
 
+  // Encode the loop filter level and type
+  vp9_write_literal(w, pc->filter_level, 6);
+  vp9_write_literal(w, pc->sharpness_level, 3);
+#if CONFIG_LOOP_DERING
+  if (pc->dering_enabled) {
+    vp9_write_bit(w, 1);
+    vp9_write_literal(w, pc->dering_enabled - 1, 4);
+  } else {
+    vp9_write_bit(w, 0);
+  }
+#endif
+
   // Write out loop filter deltas applied at the MB level based on mode or
   // ref frame (if they are enabled).
   vp9_write_bit(w, xd->mode_ref_lf_delta_enabled);
@@ -1354,6 +1410,24 @@
   }
 }
 
+static void put_delta_q(vp9_writer *bc, int delta_q) {
+  if (delta_q != 0) {
+    vp9_write_bit(bc, 1);
+    vp9_write_literal(bc, abs(delta_q), 4);
+    vp9_write_bit(bc, delta_q < 0);
+  } else {
+    vp9_write_bit(bc, 0);
+  }
+}
+
+static void encode_quantization(VP9_COMMON *pc, vp9_writer *w) {
+  vp9_write_literal(w, pc->base_qindex, QINDEX_BITS);
+  put_delta_q(w, pc->y_dc_delta_q);
+  put_delta_q(w, pc->uv_dc_delta_q);
+  put_delta_q(w, pc->uv_ac_delta_q);
+}
+
 static void encode_segmentation(VP9_COMP *cpi, vp9_writer *w) {
   int i, j;
   VP9_COMMON *const pc = &cpi->common;
@@ -1495,27 +1569,9 @@
   // lossless mode: note this needs to be before loopfilter
   vp9_write_bit(&header_bc, cpi->mb.e_mbd.lossless);
 
-  // Encode the loop filter level and type
-  vp9_write_literal(&header_bc, pc->filter_level, 6);
-  vp9_write_literal(&header_bc, pc->sharpness_level, 3);
-#if CONFIG_LOOP_DERING
-  if (pc->dering_enabled) {
-    vp9_write_bit(&header_bc, 1);
-    vp9_write_literal(&header_bc, pc->dering_enabled - 1, 4);
-  } else {
-    vp9_write_bit(&header_bc, 0);
-  }
-#endif
+  encode_loopfilter(pc, xd, &header_bc);
 
-  encode_loopfilter(xd, &header_bc);
-
-  // Frame Q baseline quantizer index
-  vp9_write_literal(&header_bc, pc->base_qindex, QINDEX_BITS);
-
-  // Transmit Dc, Second order and Uv quantizer delta information
-  put_delta_q(&header_bc, pc->y_dc_delta_q);
-  put_delta_q(&header_bc, pc->uv_dc_delta_q);
-  put_delta_q(&header_bc, pc->uv_ac_delta_q);
+  encode_quantization(pc, &header_bc);
 
   // When there is a key frame all reference buffers are updated using the new key frame
   if (pc->frame_type != KEY_FRAME) {
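
With encode_loopfilter() and encode_quantization() owning their header
fields end to end, the delta-q bit layout can be read straight off
put_delta_q(): one flag bit, then, only if the flag is set, a 4-bit
magnitude followed by a sign bit. A hypothetical decoder-side mirror; the
function and the reader type name here are assumptions of ours, while
vp9_read_bit()/vp9_read_literal() are the codec's own readers:

  /* Hypothetical mirror of put_delta_q(); not part of this patch. */
  static int read_delta_q(BOOL_DECODER *r) {
    if (vp9_read_bit(r)) {                        /* flag: delta_q != 0 */
      const int abs_val = vp9_read_literal(r, 4); /* 4-bit magnitude */
      return vp9_read_bit(r) ? -abs_val : abs_val;  /* sign bit */
    }
    return 0;
  }
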
diff --git a/vp9/encoder/vp9_block.h b/vp9/encoder/vp9_block.h
index 6bc42c7..d3851b4 100644
--- a/vp9/encoder/vp9_block.h
+++ b/vp9/encoder/vp9_block.h
@@ -138,9 +138,14 @@
 
   int optimize;
 
-  // Structure to hold context for each of the 4 MBs within a SB:
-  // when encoded as 4 independent MBs:
-  PICK_MODE_CONTEXT sb8_context[4][4][4];
+  // TODO(jingning): Need to refactor the structure arrays that buffer the
+  // coding mode decisions of each partition type.
+#if CONFIG_AB4X4
+  PICK_MODE_CONTEXT ab4x4_context[4][4][4];
+  PICK_MODE_CONTEXT sb8x4_context[4][4][4];
+  PICK_MODE_CONTEXT sb4x8_context[4][4][4];
+#endif
+  PICK_MODE_CONTEXT sb8x8_context[4][4][4];
   PICK_MODE_CONTEXT sb8x16_context[4][4][2];
   PICK_MODE_CONTEXT sb16x8_context[4][4][2];
   PICK_MODE_CONTEXT mb_context[4][4];
@@ -153,6 +158,13 @@
   PICK_MODE_CONTEXT sb64_context;
   int partition_cost[NUM_PARTITION_CONTEXTS][PARTITION_TYPES];
 
+#if CONFIG_AB4X4
+  BLOCK_SIZE_TYPE b_partitioning[4][4][4];
+#endif
+  BLOCK_SIZE_TYPE mb_partitioning[4][4];
+  BLOCK_SIZE_TYPE sb_partitioning[4];
+  BLOCK_SIZE_TYPE sb64_partitioning;
+
   void (*fwd_txm4x4)(int16_t *input, int16_t *output, int pitch);
   void (*fwd_txm8x4)(int16_t *input, int16_t *output, int pitch);
   void (*fwd_txm8x8)(int16_t *input, int16_t *output, int pitch);
diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c
index 418f60e..954eefa 100644
--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c
@@ -361,8 +361,8 @@
   assert(mb_mode_index < MAX_MODES);
   assert(mi->mbmi.ref_frame < MAX_REF_FRAMES);
 #endif
-  assert(mi->mbmi.sb_type == bsize);
 
+  assert(mi->mbmi.sb_type == bsize);
   // Restore the coding context of the MB to that that was in place
   // when the mode was picked for it
   for (y = 0; y < bh; y++) {
@@ -539,15 +539,6 @@
                    x->e_mbd.plane[2].subsampling_y);
 }
 
-static INLINE void set_partition_seg_context(VP9_COMP *cpi,
-                                             int mi_row, int mi_col) {
-  VP9_COMMON *const cm = &cpi->common;
-  MACROBLOCKD *const xd = &cpi->mb.e_mbd;
-
-  xd->above_seg_context = cm->above_seg_context + mi_col;
-  xd->left_seg_context  = cm->left_seg_context + (mi_row & MI_MASK);
-}
-
 static void set_offsets(VP9_COMP *cpi,
                         int mi_row, int mi_col, BLOCK_SIZE_TYPE bsize) {
   MACROBLOCK *const x = &cpi->mb;
@@ -571,7 +562,7 @@
   }
 
   // partition contexts
-  set_partition_seg_context(cpi, mi_row, mi_col);
+  set_partition_seg_context(cm, xd, mi_row, mi_col);
 
   // Activity map pointer
   x->mb_activity_ptr = &cpi->mb_activity_map[idx_map];
@@ -649,6 +640,12 @@
   MACROBLOCK *const x = &cpi->mb;
   MACROBLOCKD *const xd = &x->e_mbd;
 
+#if CONFIG_AB4X4
+  if (bsize < BLOCK_SIZE_SB8X8)
+    if (xd->ab_index != 0)
+      return;
+#endif
+
   set_offsets(cpi, mi_row, mi_col, bsize);
   xd->mode_info_context->mbmi.sb_type = bsize;
   if (cpi->oxcf.tuning == VP8_TUNE_SSIM)
@@ -727,10 +724,20 @@
   } else if (bsize >= BLOCK_SIZE_MB16X16) {
     xd->mb_index = idx;
   } else {
+#if CONFIG_AB4X4
+    if (bsize >= BLOCK_SIZE_SB8X8)
+      xd->b_index = idx;
+    else
+      xd->ab_index = idx;
+#else
     xd->b_index = idx;
+#endif
   }
 }
 
+// TODO(jingning): the variables used here are a little complicated. Need
+// further refactoring to organize the temporary buffers, once recursive
+// partitioning down to 4x4 block size is enabled.
 static PICK_MODE_CONTEXT *get_block_context(MACROBLOCK *x,
                                             BLOCK_SIZE_TYPE bsize) {
   MACROBLOCKD *const xd = &x->e_mbd;
@@ -755,13 +762,71 @@
     case BLOCK_SIZE_SB8X16:
       return &x->sb8x16_context[xd->sb_index][xd->mb_index][xd->b_index];
     case BLOCK_SIZE_SB8X8:
-      return &x->sb8_context[xd->sb_index][xd->mb_index][xd->b_index];
+      return &x->sb8x8_context[xd->sb_index][xd->mb_index][xd->b_index];
+#if CONFIG_AB4X4
+    case BLOCK_SIZE_SB8X4:
+      return &x->sb8x4_context[xd->sb_index][xd->mb_index][xd->b_index];
+    case BLOCK_SIZE_SB4X8:
+      return &x->sb4x8_context[xd->sb_index][xd->mb_index][xd->b_index];
+    case BLOCK_SIZE_AB4X4:
+      return &x->ab4x4_context[xd->sb_index][xd->mb_index][xd->b_index];
+#endif
     default:
       assert(0);
       return NULL;
   }
 }
 
+static BLOCK_SIZE_TYPE *get_sb_partitioning(MACROBLOCK *x,
+                                            BLOCK_SIZE_TYPE bsize) {
+  MACROBLOCKD *xd = &x->e_mbd;
+  switch (bsize) {
+    case BLOCK_SIZE_SB64X64:
+      return &x->sb64_partitioning;
+    case BLOCK_SIZE_SB32X32:
+      return &x->sb_partitioning[xd->sb_index];
+    case BLOCK_SIZE_MB16X16:
+      return &x->mb_partitioning[xd->sb_index][xd->mb_index];
+#if CONFIG_AB4X4
+    case BLOCK_SIZE_SB8X8:
+      return &x->b_partitioning[xd->sb_index][xd->mb_index][xd->b_index];
+#endif
+    default:
+      assert(0);
+      return NULL;
+  }
+}
+
+static void restore_context(VP9_COMP *cpi, int mi_row, int mi_col,
+                            ENTROPY_CONTEXT a[16 * MAX_MB_PLANE],
+                            ENTROPY_CONTEXT l[16 * MAX_MB_PLANE],
+                            PARTITION_CONTEXT sa[8],
+                            PARTITION_CONTEXT sl[8],
+                            BLOCK_SIZE_TYPE bsize) {
+  VP9_COMMON *const cm = &cpi->common;
+  MACROBLOCK *const x = &cpi->mb;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  int p;
+  int bwl = b_width_log2(bsize), bw = 1 << bwl;
+  int bhl = b_height_log2(bsize), bh = 1 << bhl;
+  int mwl = mi_width_log2(bsize), mw = 1 << mwl;
+  int mhl = mi_height_log2(bsize), mh = 1 << mhl;
+  for (p = 0; p < MAX_MB_PLANE; p++) {
+    vpx_memcpy(cm->above_context[p] +
+               ((mi_col * 2) >> xd->plane[p].subsampling_x),
+               a + bw * p,
+               sizeof(ENTROPY_CONTEXT) * bw >> xd->plane[p].subsampling_x);
+    vpx_memcpy(cm->left_context[p] +
+               ((mi_row & MI_MASK) * 2 >> xd->plane[p].subsampling_y),
+               l + bh * p,
+               sizeof(ENTROPY_CONTEXT) * bh >> xd->plane[p].subsampling_y);
+  }
+  vpx_memcpy(cm->above_seg_context + mi_col, sa,
+             sizeof(PARTITION_CONTEXT) * mw);
+  vpx_memcpy(cm->left_seg_context + (mi_row & MI_MASK), sl,
+             sizeof(PARTITION_CONTEXT) * mh);
+}
+
 static void encode_b(VP9_COMP *cpi, TOKENEXTRA **tp,
                      int mi_row, int mi_col, int output_enabled,
                      BLOCK_SIZE_TYPE bsize, int sub_index) {
@@ -788,28 +853,45 @@
 
 static void encode_sb(VP9_COMP *cpi, TOKENEXTRA **tp,
                       int mi_row, int mi_col, int output_enabled,
-                      BLOCK_SIZE_TYPE level,
-                      BLOCK_SIZE_TYPE c1, BLOCK_SIZE_TYPE c2[4],
-                      BLOCK_SIZE_TYPE c3[4][4]
-                      ) {
+                      BLOCK_SIZE_TYPE bsize) {
   VP9_COMMON *const cm = &cpi->common;
   MACROBLOCK *const x = &cpi->mb;
   MACROBLOCKD *const xd = &x->e_mbd;
-  const int bsl = mi_width_log2(level), bs = 1 << (bsl - 1);
-  const int bwl = mi_width_log2(c1), bhl = mi_height_log2(c1);
+  BLOCK_SIZE_TYPE c1 = BLOCK_SIZE_SB8X8;
+  const int bsl = mi_width_log2(bsize), bs = (1 << bsl) / 2;
+  int bwl, bhl;
   int UNINITIALIZED_IS_SAFE(pl);
 
   if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols)
     return;
 
-  if (level > BLOCK_SIZE_SB8X8) {
-    set_partition_seg_context(cpi, mi_row, mi_col);
-    pl = partition_plane_context(xd, level);
+#if CONFIG_AB4X4
+  c1 = BLOCK_SIZE_AB4X4;
+  if (bsize >= BLOCK_SIZE_SB8X8)
+#else
+  if (bsize > BLOCK_SIZE_SB8X8)
+#endif
+  {
+    set_partition_seg_context(cm, xd, mi_row, mi_col);
+    pl = partition_plane_context(xd, bsize);
+    c1 = *(get_sb_partitioning(x, bsize));
   }
 
+  bwl = mi_width_log2(c1), bhl = mi_height_log2(c1);
+
   if (bsl == bwl && bsl == bhl) {
-    if (output_enabled && level > BLOCK_SIZE_SB8X8)
+#if CONFIG_AB4X4
+    if (output_enabled && bsize >= BLOCK_SIZE_SB8X8) {
+      if (bsize > BLOCK_SIZE_SB8X8 ||
+          (bsize == BLOCK_SIZE_SB8X8 && c1 == bsize))
+        cpi->partition_count[pl][PARTITION_NONE]++;
+      else
+        cpi->partition_count[pl][PARTITION_SPLIT]++;
+    }
+#else
+    if (output_enabled && bsize > BLOCK_SIZE_SB8X8)
       cpi->partition_count[pl][PARTITION_NONE]++;
+#endif
     encode_b(cpi, tp, mi_row, mi_col, output_enabled, c1, -1);
   } else if (bsl == bhl && bsl > bwl) {
     if (output_enabled)
@@ -826,14 +908,7 @@
     int i;
 
     assert(bwl < bsl && bhl < bsl);
-    if (level == BLOCK_SIZE_SB64X64) {
-      subsize = BLOCK_SIZE_SB32X32;
-    } else if (level == BLOCK_SIZE_SB32X32) {
-      subsize = BLOCK_SIZE_MB16X16;
-    } else {
-      assert(level == BLOCK_SIZE_MB16X16);
-      subsize = BLOCK_SIZE_SB8X8;
-    }
+    subsize = get_subsize(bsize, PARTITION_SPLIT);
 
     if (output_enabled)
       cpi->partition_count[pl][PARTITION_SPLIT]++;
@@ -843,26 +918,226 @@
 
       set_block_index(xd, i, subsize);
       encode_sb(cpi, tp, mi_row + y_idx * bs, mi_col + x_idx * bs,
-                output_enabled, subsize,
-                c2 ? c2[i] : c1, c3 ? c3[i] : NULL, NULL);
+                output_enabled, subsize);
     }
   }
 
-  if (level > BLOCK_SIZE_SB8X8 &&
-      (level == BLOCK_SIZE_MB16X16 || bsl == bwl || bsl == bhl)) {
-    set_partition_seg_context(cpi, mi_row, mi_col);
-    update_partition_context(xd, c1, level);
+#if CONFIG_AB4X4
+  if (bsize >= BLOCK_SIZE_SB8X8 &&
+      (bsize == BLOCK_SIZE_SB8X8 || bsl == bwl || bsl == bhl)) {
+#else
+  if (bsize > BLOCK_SIZE_SB8X8 &&
+      (bsize == BLOCK_SIZE_MB16X16 || bsl == bwl || bsl == bhl)) {
+#endif
+    set_partition_seg_context(cm, xd, mi_row, mi_col);
+    update_partition_context(xd, c1, bsize);
   }
 }
 
-static void encode_sb_row(VP9_COMP *cpi,
-                          int mi_row,
-                          TOKENEXTRA **tp,
-                          int *totalrate) {
+
+// TODO(jingning,jimbankoski,rbultje): properly skip partition types that are
+// unlikely to be selected, based on previous rate-distortion optimization
+// results, for encoding speed-up.
+static void rd_pick_partition(VP9_COMP *cpi, TOKENEXTRA **tp,
+                              int mi_row, int mi_col,
+                              BLOCK_SIZE_TYPE bsize,
+                              int *rate, int *dist) {
   VP9_COMMON *const cm = &cpi->common;
   MACROBLOCK *const x = &cpi->mb;
   MACROBLOCKD *const xd = &x->e_mbd;
-  int mi_col, pl;
+  int bsl = b_width_log2(bsize), bs = 1 << bsl;
+  int ms = bs / 2;
+  ENTROPY_CONTEXT   l[16 * MAX_MB_PLANE], a[16 * MAX_MB_PLANE];
+  PARTITION_CONTEXT sl[8], sa[8];
+  TOKENEXTRA *tp_orig = *tp;
+  int i, p, pl;
+  BLOCK_SIZE_TYPE subsize;
+  int srate = INT_MAX, sdist = INT_MAX;
+
+#if CONFIG_AB4X4
+  if (bsize < BLOCK_SIZE_SB8X8)
+    if (xd->ab_index != 0) {
+      *rate = 0;
+      *dist = 0;
+      return;
+    }
+#endif
+
+  assert(mi_height_log2(bsize) == mi_width_log2(bsize));
+
+  // buffer the above/left context information of the block in search.
+  for (p = 0; p < MAX_MB_PLANE; ++p) {
+    vpx_memcpy(a + bs * p, cm->above_context[p] +
+               (mi_col * 2 >> xd->plane[p].subsampling_x),
+               sizeof(ENTROPY_CONTEXT) * bs >> xd->plane[p].subsampling_x);
+    vpx_memcpy(l + bs * p, cm->left_context[p] +
+               ((mi_row & MI_MASK) * 2 >> xd->plane[p].subsampling_y),
+               sizeof(ENTROPY_CONTEXT) * bs >> xd->plane[p].subsampling_y);
+  }
+  vpx_memcpy(sa, cm->above_seg_context + mi_col,
+             sizeof(PARTITION_CONTEXT) * ms);
+  vpx_memcpy(sl, cm->left_seg_context + (mi_row & MI_MASK),
+             sizeof(PARTITION_CONTEXT) * ms);
+
+  // PARTITION_SPLIT
+#if CONFIG_AB4X4
+  if (bsize >= BLOCK_SIZE_SB8X8) {
+#else
+  if (bsize >= BLOCK_SIZE_MB16X16) {
+#endif
+    int r4 = 0, d4 = 0;
+    subsize = get_subsize(bsize, PARTITION_SPLIT);
+    *(get_sb_partitioning(x, bsize)) = subsize;
+
+    for (i = 0; i < 4; ++i) {
+      int x_idx = (i & 1) * (ms >> 1);
+      int y_idx = (i >> 1) * (ms >> 1);
+      int r, d;
+
+      if ((mi_row + y_idx >= cm->mi_rows) || (mi_col + x_idx >= cm->mi_cols))
+        continue;
+
+      *(get_sb_index(xd, subsize)) = i;
+      rd_pick_partition(cpi, tp, mi_row + y_idx, mi_col + x_idx, subsize,
+                        &r, &d);
+
+      r4 += r;
+      d4 += d;
+    }
+    set_partition_seg_context(cm, xd, mi_row, mi_col);
+    pl = partition_plane_context(xd, bsize);
+#if CONFIG_AB4X4
+    if (r4 < INT_MAX)
+      r4 += x->partition_cost[pl][PARTITION_SPLIT];
+#else
+    r4 += x->partition_cost[pl][PARTITION_SPLIT];
+#endif
+    assert(r4 >= 0);
+    assert(d4 >= 0);
+    srate = r4;
+    sdist = d4;
+    restore_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize);
+  }
+
+  // TODO(jingning): need to enable 4x8 and 8x4 partition coding
+  // PARTITION_HORZ
+  if ((mi_col + ms <= cm->mi_cols) && (mi_row + (ms >> 1) <= cm->mi_rows) &&
+      (bsize >= BLOCK_SIZE_MB16X16)) {
+    int r2, d2;
+    int mb_skip = 0;
+    subsize = get_subsize(bsize, PARTITION_HORZ);
+    *(get_sb_index(xd, subsize)) = 0;
+    pick_sb_modes(cpi, mi_row, mi_col, tp, &r2, &d2, subsize,
+                  get_block_context(x, subsize));
+
+    if (mi_row + ms <= cm->mi_rows) {
+      int r, d;
+      update_state(cpi, get_block_context(x, subsize), subsize, 0);
+      encode_superblock(cpi, tp, 0, mi_row, mi_col, subsize);
+      *(get_sb_index(xd, subsize)) = 1;
+      pick_sb_modes(cpi, mi_row + (ms >> 1), mi_col, tp, &r, &d, subsize,
+                    get_block_context(x, subsize));
+      r2 += r;
+      d2 += d;
+    } else {
+      if (mi_row + (ms >> 1) != cm->mi_rows)
+        mb_skip = 1;
+    }
+    set_partition_seg_context(cm, xd, mi_row, mi_col);
+    pl = partition_plane_context(xd, bsize);
+    r2 += x->partition_cost[pl][PARTITION_HORZ];
+
+    if ((RDCOST(x->rdmult, x->rddiv, r2, d2) <
+         RDCOST(x->rdmult, x->rddiv, srate, sdist)) && !mb_skip) {
+      srate = r2;
+      sdist = d2;
+      *(get_sb_partitioning(x, bsize)) = subsize;
+    }
+    restore_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize);
+  }
+
+  // PARTITION_VERT
+  if ((mi_row + ms <= cm->mi_rows) && (mi_col + (ms >> 1) <= cm->mi_cols) &&
+      (bsize >= BLOCK_SIZE_MB16X16)) {
+    int r2, d2;
+    int mb_skip = 0;
+    subsize = get_subsize(bsize, PARTITION_VERT);
+    *(get_sb_index(xd, subsize)) = 0;
+    pick_sb_modes(cpi, mi_row, mi_col, tp, &r2, &d2, subsize,
+                  get_block_context(x, subsize));
+    if (mi_col + ms <= cm->mi_cols) {
+      int r, d;
+      update_state(cpi, get_block_context(x, subsize), subsize, 0);
+      encode_superblock(cpi, tp, 0, mi_row, mi_col, subsize);
+      *(get_sb_index(xd, subsize)) = 1;
+      pick_sb_modes(cpi, mi_row, mi_col + (ms >> 1), tp, &r, &d, subsize,
+                    get_block_context(x, subsize));
+      r2 += r;
+      d2 += d;
+    } else {
+      if (mi_col + (ms >> 1) != cm->mi_cols)
+        mb_skip = 1;
+    }
+    set_partition_seg_context(cm, xd, mi_row, mi_col);
+    pl = partition_plane_context(xd, bsize);
+    r2 += x->partition_cost[pl][PARTITION_VERT];
+
+    if ((RDCOST(x->rdmult, x->rddiv, r2, d2) <
+         RDCOST(x->rdmult, x->rddiv, srate, sdist)) && !mb_skip) {
+      srate = r2;
+      sdist = d2;
+      *(get_sb_partitioning(x, bsize)) = subsize;
+    }
+    restore_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize);
+  }
+
+  // PARTITION_NONE
+  if (mi_row + ms <= cm->mi_rows && mi_col + ms <= cm->mi_cols) {
+    int r, d;
+    pick_sb_modes(cpi, mi_row, mi_col, tp, &r, &d, bsize,
+                  get_block_context(x, bsize));
+#if CONFIG_AB4X4
+    if (bsize >= BLOCK_SIZE_SB8X8) {
+#else
+    if (bsize >= BLOCK_SIZE_MB16X16) {
+#endif
+      set_partition_seg_context(cm, xd, mi_row, mi_col);
+      pl = partition_plane_context(xd, bsize);
+      r += x->partition_cost[pl][PARTITION_NONE];
+    }
+
+    if (RDCOST(x->rdmult, x->rddiv, r, d) <
+        RDCOST(x->rdmult, x->rddiv, srate, sdist)) {
+      srate = r;
+      sdist = d;
+#if CONFIG_AB4X4
+      if (bsize >= BLOCK_SIZE_SB8X8)
+#else
+      if (bsize >= BLOCK_SIZE_MB16X16)
+#endif
+        *(get_sb_partitioning(x, bsize)) = bsize;
+    }
+  }
+
+  *rate = srate;
+  *dist = sdist;
+
+  if (srate < INT_MAX && sdist < INT_MAX)
+    encode_sb(cpi, tp, mi_row, mi_col, bsize == BLOCK_SIZE_SB64X64, bsize);
+
+  if (bsize == BLOCK_SIZE_SB64X64) {
+    assert(tp_orig < *tp);
+    assert(srate < INT_MAX);
+    assert(sdist < INT_MAX);
+  } else {
+    assert(tp_orig == *tp);
+  }
+}
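+
+// rd_pick_partition() above trial-encodes each candidate partitioning
+// (split, then horizontal, vertical, and none), restores the entropy and
+// partition contexts between trials via restore_context(), and keeps the
+// candidate with the smallest rate-distortion cost. The decision rule in
+// generic form; the tree's actual RDCOST macro uses its own fixed-point
+// scaling, so the names and scale below are illustrative only:
+//
+//   #include <stdint.h>
+//
+//   typedef struct { int rate; int dist; } rd_cand;
+//
+//   /* Lagrangian cost: trade rate against distortion via lambda. */
+//   static int64_t lagrangian(int64_t lambda, rd_cand c) {
+//     return lambda * c.rate + (int64_t)c.dist;
+//   }
+//
+//   /* Return the index of the candidate with minimum RD cost. */
+//   static int pick_best(const rd_cand *cand, int n, int64_t lambda) {
+//     int i, best = 0;
+//     for (i = 1; i < n; i++)
+//       if (lagrangian(lambda, cand[i]) < lagrangian(lambda, cand[best]))
+//         best = i;
+//     return best;
+//   }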
+
+static void encode_sb_row(VP9_COMP *cpi, int mi_row,
+                          TOKENEXTRA **tp, int *totalrate) {
+  VP9_COMMON *const cm = &cpi->common;
+  int mi_col;
 
   // Initialize the left context for the new SB row
   vpx_memset(&cm->left_context, 0, sizeof(cm->left_context));
@@ -871,526 +1146,9 @@
   // Code each SB in the row
   for (mi_col = cm->cur_tile_mi_col_start;
        mi_col < cm->cur_tile_mi_col_end; mi_col += 8) {
-    int i, p;
-    BLOCK_SIZE_TYPE mb_partitioning[4][4];
-    BLOCK_SIZE_TYPE sb_partitioning[4];
-    BLOCK_SIZE_TYPE sb64_partitioning = BLOCK_SIZE_SB32X32;
-    int sb64_rate = 0, sb64_dist = 0;
-    int sb64_skip = 0;
-    ENTROPY_CONTEXT l[16 * MAX_MB_PLANE], a[16 * MAX_MB_PLANE];
-    PARTITION_CONTEXT seg_l[64 / MI_SIZE], seg_a[64 / MI_SIZE];
-    TOKENEXTRA *tp_orig = *tp;
-
-    for (p = 0; p < MAX_MB_PLANE; p++) {
-      memcpy(a + 16 * p, cm->above_context[p] +
-                 (mi_col * 2 >> xd->plane[p].subsampling_x),
-             sizeof(ENTROPY_CONTEXT) * 16 >> xd->plane[p].subsampling_x);
-      memcpy(l + 16 * p, cm->left_context[p],
-             sizeof(ENTROPY_CONTEXT) * 16 >> xd->plane[p].subsampling_y);
-    }
-    vpx_memcpy(&seg_a, cm->above_seg_context + mi_col, sizeof(seg_a));
-    vpx_memcpy(&seg_l, cm->left_seg_context, sizeof(seg_l));
-
-    // FIXME(rbultje): this function should probably be rewritten to be
-    // recursive at some point in the future.
-    for (i = 0; i < 4; i++) {
-      const int x_idx = (i & 1) << 2;
-      const int y_idx = (i & 2) << 1;
-      int sb32_rate = 0, sb32_dist = 0;
-      int splitmodes_used = 0;
-      int sb32_skip = 0;
-      int j;
-      ENTROPY_CONTEXT l2[8 * MAX_MB_PLANE], a2[8 * MAX_MB_PLANE];
-      PARTITION_CONTEXT sl32[32 / MI_SIZE], sa32[32 / MI_SIZE];
-
-      sb_partitioning[i] = BLOCK_SIZE_MB16X16;
-      if (mi_row + y_idx >= cm->mi_rows || mi_col + x_idx >= cm->mi_cols)
-        continue;
-
-      xd->sb_index = i;
-
-      /* Function should not modify L & A contexts; save and restore on exit */
-      for (p = 0; p < MAX_MB_PLANE; p++) {
-        vpx_memcpy(l2 + 8 * p,
-                   cm->left_context[p] +
-                       (y_idx * 2 >> xd->plane[p].subsampling_y),
-                   sizeof(ENTROPY_CONTEXT) * 8 >> xd->plane[p].subsampling_y);
-        vpx_memcpy(a2 + 8 * p,
-                   cm->above_context[p] +
-                       ((mi_col + x_idx) * 2 >> xd->plane[p].subsampling_x),
-                   sizeof(ENTROPY_CONTEXT) * 8 >> xd->plane[p].subsampling_x);
-      }
-      vpx_memcpy(&sa32, cm->above_seg_context + mi_col + x_idx, sizeof(sa32));
-      vpx_memcpy(&sl32, cm->left_seg_context + y_idx, sizeof(sl32));
-
-      /* Encode MBs in raster order within the SB */
-      for (j = 0; j < 4; j++) {
-        const int x_idx_m = x_idx + ((j & 1) << 1);
-        const int y_idx_m = y_idx + ((j >> 1) << 1);
-        int r, d;
-        int r2, d2, mb16_rate = 0, mb16_dist = 0, k;
-        ENTROPY_CONTEXT l3[4 * MAX_MB_PLANE], a3[4 * MAX_MB_PLANE];
-        PARTITION_CONTEXT sl16[16 / MI_SIZE], sa16[16 / MI_SIZE];
-
-        mb_partitioning[i][j] = BLOCK_SIZE_SB8X8;
-
-        if (mi_row + y_idx_m >= cm->mi_rows ||
-            mi_col + x_idx_m >= cm->mi_cols) {
-          // MB lies outside frame, move on
-          continue;
-        }
-
-        // Index of the MB in the SB 0..3
-        xd->mb_index = j;
-
-        for (p = 0; p < MAX_MB_PLANE; p++) {
-          vpx_memcpy(l3 + 4 * p,
-                     cm->left_context[p] +
-                         (y_idx_m * 2 >> xd->plane[p].subsampling_y),
-                     sizeof(ENTROPY_CONTEXT) * 4 >> xd->plane[p].subsampling_y);
-          vpx_memcpy(a3 + 4 * p,
-                     cm->above_context[p] +
-                         ((mi_col + x_idx_m) * 2 >> xd->plane[p].subsampling_x),
-                     sizeof(ENTROPY_CONTEXT) * 4 >> xd->plane[p].subsampling_x);
-        }
-        vpx_memcpy(&sa16, cm->above_seg_context + mi_col + x_idx_m,
-                   sizeof(sa16));
-        vpx_memcpy(&sl16, cm->left_seg_context + y_idx_m, sizeof(sl16));
-
-        for (k = 0; k < 4; k++) {
-          xd->b_index = k;
-
-          // try 8x8 coding
-          pick_sb_modes(cpi, mi_row + y_idx_m + (k >> 1),
-                        mi_col + x_idx_m + (k & 1),
-                        tp, &r, &d, BLOCK_SIZE_SB8X8,
-                        &x->sb8_context[xd->sb_index][xd->mb_index]
-                                       [xd->b_index]);
-          mb16_rate += r;
-          mb16_dist += d;
-          update_state(cpi, &x->sb8_context[xd->sb_index][xd->mb_index]
-                                           [xd->b_index],
-                       BLOCK_SIZE_SB8X8, 0);
-          encode_superblock(cpi, tp,
-                            0, mi_row + y_idx_m + (k >> 1),
-                            mi_col + x_idx_m + (k & 1),
-                            BLOCK_SIZE_SB8X8);
-        }
-        set_partition_seg_context(cpi, mi_row + y_idx_m, mi_col + x_idx_m);
-        pl = partition_plane_context(xd, BLOCK_SIZE_MB16X16);
-        mb16_rate += x->partition_cost[pl][PARTITION_SPLIT];
-        for (p = 0; p < MAX_MB_PLANE; p++) {
-          vpx_memcpy(cm->left_context[p] +
-                         (y_idx_m * 2 >> xd->plane[p].subsampling_y),
-                     l3 + 4 * p,
-                     sizeof(ENTROPY_CONTEXT) * 4 >> xd->plane[p].subsampling_y);
-          vpx_memcpy(cm->above_context[p] +
-                         ((mi_col + x_idx_m) * 2 >> xd->plane[p].subsampling_x),
-                     a3 + 4 * p,
-                     sizeof(ENTROPY_CONTEXT) * 4 >> xd->plane[p].subsampling_x);
-        }
-        vpx_memcpy(cm->above_seg_context + mi_col + x_idx_m,
-                   sa16, sizeof(sa16));
-        vpx_memcpy(cm->left_seg_context + y_idx_m, sl16, sizeof(sl16));
-
-        // try 8x16 coding
-        r2 = 0;
-        d2 = 0;
-        xd->b_index = 0;
-        pick_sb_modes(cpi, mi_row + y_idx_m, mi_col + x_idx_m,
-                      tp, &r, &d, BLOCK_SIZE_SB8X16,
-                      &x->sb8x16_context[xd->sb_index][xd->mb_index]
-                                        [xd->b_index]);
-        r2 += r;
-        d2 += d;
-        update_state(cpi, &x->sb8x16_context[xd->sb_index][xd->mb_index]
-                                            [xd->b_index],
-                     BLOCK_SIZE_SB8X16, 0);
-        encode_superblock(cpi, tp,
-                          0, mi_row + y_idx_m, mi_col + x_idx_m,
-                          BLOCK_SIZE_SB8X16);
-        xd->b_index = 1;
-        pick_sb_modes(cpi, mi_row + y_idx_m, mi_col + x_idx_m + 1,
-                      tp, &r, &d, BLOCK_SIZE_SB8X16,
-                      &x->sb8x16_context[xd->sb_index][xd->mb_index]
-                                        [xd->b_index]);
-        r2 += r;
-        d2 += d;
-        set_partition_seg_context(cpi, mi_row + y_idx_m, mi_col + x_idx_m);
-        pl = partition_plane_context(xd, BLOCK_SIZE_MB16X16);
-        r2 += x->partition_cost[pl][PARTITION_VERT];
-        if (RDCOST(x->rdmult, x->rddiv, r2, d2) <
-                RDCOST(x->rdmult, x->rddiv, mb16_rate, mb16_dist)) {
-          mb16_rate = r2;
-          mb16_dist = d2;
-          mb_partitioning[i][j] = BLOCK_SIZE_SB8X16;
-        }
-        for (p = 0; p < MAX_MB_PLANE; p++) {
-          vpx_memcpy(cm->left_context[p] +
-                         (y_idx_m * 2 >> xd->plane[p].subsampling_y),
-                     l3 + 4 * p,
-                     sizeof(ENTROPY_CONTEXT) * 4 >> xd->plane[p].subsampling_y);
-          vpx_memcpy(cm->above_context[p] +
-                         ((mi_col + x_idx_m) * 2 >> xd->plane[p].subsampling_x),
-                     a3 + 4 * p,
-                     sizeof(ENTROPY_CONTEXT) * 4 >> xd->plane[p].subsampling_x);
-        }
-
-        // try 16x8 coding
-        r2 = 0;
-        d2 = 0;
-        xd->b_index = 0;
-        pick_sb_modes(cpi, mi_row + y_idx_m, mi_col + x_idx_m,
-                      tp, &r, &d, BLOCK_SIZE_SB16X8,
-                      &x->sb16x8_context[xd->sb_index][xd->mb_index]
-                                        [xd->b_index]);
-        r2 += r;
-        d2 += d;
-        update_state(cpi, &x->sb16x8_context[xd->sb_index][xd->mb_index]
-                                            [xd->b_index],
-                     BLOCK_SIZE_SB16X8, 0);
-        encode_superblock(cpi, tp,
-                          0, mi_row + y_idx_m, mi_col + x_idx_m,
-                          BLOCK_SIZE_SB16X8);
-        xd->b_index = 1;
-        pick_sb_modes(cpi, mi_row + y_idx_m + 1, mi_col + x_idx_m,
-                      tp, &r, &d, BLOCK_SIZE_SB16X8,
-                      &x->sb16x8_context[xd->sb_index][xd->mb_index]
-                                        [xd->b_index]);
-        r2 += r;
-        d2 += d;
-        set_partition_seg_context(cpi, mi_row + y_idx_m, mi_col + x_idx_m);
-        pl = partition_plane_context(xd, BLOCK_SIZE_MB16X16);
-        r2 += x->partition_cost[pl][PARTITION_HORZ];
-        if (RDCOST(x->rdmult, x->rddiv, r2, d2) <
-                RDCOST(x->rdmult, x->rddiv, mb16_rate, mb16_dist)) {
-          mb16_rate = r2;
-          mb16_dist = d2;
-          mb_partitioning[i][j] = BLOCK_SIZE_SB16X8;
-        }
-        for (p = 0; p < MAX_MB_PLANE; p++) {
-          vpx_memcpy(cm->left_context[p] +
-                         (y_idx_m * 2 >> xd->plane[p].subsampling_y),
-                     l3 + 4 * p,
-                     sizeof(ENTROPY_CONTEXT) * 4 >> xd->plane[p].subsampling_y);
-          vpx_memcpy(cm->above_context[p] +
-                         ((mi_col + x_idx_m) * 2 >> xd->plane[p].subsampling_x),
-                     a3 + 4 * p,
-                     sizeof(ENTROPY_CONTEXT) * 4 >> xd->plane[p].subsampling_x);
-        }
-
-        // try as 16x16
-        pick_sb_modes(cpi, mi_row + y_idx_m, mi_col + x_idx_m,
-                      tp, &r, &d, BLOCK_SIZE_MB16X16,
-                      &x->mb_context[xd->sb_index][xd->mb_index]);
-        set_partition_seg_context(cpi, mi_row + y_idx_m, mi_col + x_idx_m);
-        pl = partition_plane_context(xd, BLOCK_SIZE_MB16X16);
-        r += x->partition_cost[pl][PARTITION_NONE];
-        if (RDCOST(x->rdmult, x->rddiv, r, d) <
-                RDCOST(x->rdmult, x->rddiv, mb16_rate, mb16_dist)) {
-          mb16_rate = r;
-          mb16_dist = d;
-          mb_partitioning[i][j] = BLOCK_SIZE_MB16X16;
-        }
-        sb32_rate += mb16_rate;
-        sb32_dist += mb16_dist;
-
-        // Dummy encode, do not do the tokenization
-        encode_sb(cpi, tp, mi_row + y_idx_m, mi_col + x_idx_m, 0,
-                  BLOCK_SIZE_MB16X16, mb_partitioning[i][j], NULL, NULL);
-      }
-
-      /* Restore L & A coding context to those in place on entry */
-      for (p = 0; p < MAX_MB_PLANE; p++) {
-        vpx_memcpy(cm->left_context[p] +
-                       (y_idx * 2 >> xd->plane[p].subsampling_y),
-                   l2 + 8 * p,
-                   sizeof(ENTROPY_CONTEXT) * 8 >> xd->plane[p].subsampling_y);
-        vpx_memcpy(cm->above_context[p] +
-                       ((mi_col + x_idx) * 2 >> xd->plane[p].subsampling_x),
-                   a2 + 8 * p,
-                   sizeof(ENTROPY_CONTEXT) * 8 >> xd->plane[p].subsampling_x);
-      }
-      // restore partition information context
-      vpx_memcpy(cm->above_seg_context + mi_col + x_idx, sa32, sizeof(sa32));
-      vpx_memcpy(cm->left_seg_context + y_idx, sl32, sizeof(sl32));
-
-      set_partition_seg_context(cpi, mi_row + y_idx, mi_col + x_idx);
-      pl = partition_plane_context(xd, BLOCK_SIZE_SB32X32);
-      sb32_rate += x->partition_cost[pl][PARTITION_SPLIT];
-
-      if (cpi->sf.splitmode_breakout) {
-        sb32_skip = splitmodes_used;
-        sb64_skip += splitmodes_used;
-      }
-
-      // check 32x16
-      if (mi_col + x_idx + 4 <= cm->mi_cols) {
-        int r, d;
-
-        xd->mb_index = 0;
-        pick_sb_modes(cpi, mi_row + y_idx, mi_col + x_idx,
-                      tp, &r, &d, BLOCK_SIZE_SB32X16,
-                      &x->sb32x16_context[xd->sb_index][xd->mb_index]);
-        if (mi_row + y_idx + 2 < cm->mi_rows) {
-          int r2, d2;
-
-          update_state(cpi, &x->sb32x16_context[xd->sb_index][xd->mb_index],
-                       BLOCK_SIZE_SB32X16, 0);
-          encode_superblock(cpi, tp,
-                            0, mi_row + y_idx, mi_col + x_idx,
-                            BLOCK_SIZE_SB32X16);
-          xd->mb_index = 1;
-          pick_sb_modes(cpi, mi_row + y_idx + 2,
-                        mi_col + x_idx, tp, &r2, &d2, BLOCK_SIZE_SB32X16,
-                        &x->sb32x16_context[xd->sb_index][xd->mb_index]);
-          r += r2;
-          d += d2;
-        }
-
-        set_partition_seg_context(cpi, mi_row + y_idx, mi_col + x_idx);
-        pl = partition_plane_context(xd, BLOCK_SIZE_SB32X32);
-        r += x->partition_cost[pl][PARTITION_HORZ];
-
-        /* is this better than MB coding? */
-        if (RDCOST(x->rdmult, x->rddiv, r, d) <
-                RDCOST(x->rdmult, x->rddiv, sb32_rate, sb32_dist)) {
-          sb32_rate = r;
-          sb32_dist = d;
-          sb_partitioning[i] = BLOCK_SIZE_SB32X16;
-        }
-
-        for (p = 0; p < MAX_MB_PLANE; p++) {
-          vpx_memcpy(cm->left_context[p] +
-                         (y_idx * 2 >> xd->plane[p].subsampling_y),
-                     l2 + 8 * p,
-                     sizeof(ENTROPY_CONTEXT) * 8 >> xd->plane[p].subsampling_y);
-          vpx_memcpy(cm->above_context[p] +
-                         ((mi_col + x_idx) * 2 >> xd->plane[p].subsampling_x),
-                     a2 + 8 * p,
-                     sizeof(ENTROPY_CONTEXT) * 8 >> xd->plane[p].subsampling_x);
-        }
-      }
-
-      // check 16x32
-      if (mi_row + y_idx + 4 <= cm->mi_rows) {
-        int r, d;
-
-        xd->mb_index = 0;
-        pick_sb_modes(cpi, mi_row + y_idx, mi_col + x_idx,
-                      tp, &r, &d, BLOCK_SIZE_SB16X32,
-                      &x->sb16x32_context[xd->sb_index][xd->mb_index]);
-        if (mi_col + x_idx + 2 < cm->mi_cols) {
-          int r2, d2;
-
-          update_state(cpi, &x->sb16x32_context[xd->sb_index][xd->mb_index],
-                       BLOCK_SIZE_SB16X32, 0);
-          encode_superblock(cpi, tp,
-                            0, mi_row + y_idx, mi_col + x_idx,
-                            BLOCK_SIZE_SB16X32);
-          xd->mb_index = 1;
-          pick_sb_modes(cpi, mi_row + y_idx,
-                        mi_col + x_idx + 2,
-                        tp, &r2, &d2, BLOCK_SIZE_SB16X32,
-                        &x->sb16x32_context[xd->sb_index][xd->mb_index]);
-          r += r2;
-          d += d2;
-        }
-
-        set_partition_seg_context(cpi, mi_row + y_idx, mi_col + x_idx);
-        pl = partition_plane_context(xd, BLOCK_SIZE_SB32X32);
-        r += x->partition_cost[pl][PARTITION_VERT];
-
-        /* is this better than MB coding? */
-        if (RDCOST(x->rdmult, x->rddiv, r, d) <
-                RDCOST(x->rdmult, x->rddiv, sb32_rate, sb32_dist)) {
-          sb32_rate = r;
-          sb32_dist = d;
-          sb_partitioning[i] = BLOCK_SIZE_SB16X32;
-        }
-
-        for (p = 0; p < MAX_MB_PLANE; p++) {
-          vpx_memcpy(cm->left_context[p] +
-                         (y_idx * 2 >> xd->plane[p].subsampling_y),
-                     l2 + 8 * p,
-                     sizeof(ENTROPY_CONTEXT) * 8 >> xd->plane[p].subsampling_y);
-          vpx_memcpy(cm->above_context[p] +
-                         ((mi_col + x_idx) * 2 >> xd->plane[p].subsampling_x),
-                     a2 + 8 * p,
-                     sizeof(ENTROPY_CONTEXT) * 8 >> xd->plane[p].subsampling_x);
-        }
-      }
-
-      if (!sb32_skip &&
-          mi_col + x_idx + 4 <= cm->mi_cols &&
-          mi_row + y_idx + 4 <= cm->mi_rows) {
-        int r, d;
-
-        /* Pick a mode assuming that it applies to all 4 of the MBs in the SB */
-        pick_sb_modes(cpi, mi_row + y_idx, mi_col + x_idx,
-                      tp, &r, &d, BLOCK_SIZE_SB32X32,
-                      &x->sb32_context[xd->sb_index]);
-
-        set_partition_seg_context(cpi, mi_row + y_idx, mi_col + x_idx);
-        pl = partition_plane_context(xd, BLOCK_SIZE_SB32X32);
-        r += x->partition_cost[pl][PARTITION_NONE];
-
-        if (RDCOST(x->rdmult, x->rddiv, r, d) <
-                RDCOST(x->rdmult, x->rddiv, sb32_rate, sb32_dist)) {
-          sb32_rate = r;
-          sb32_dist = d;
-          sb_partitioning[i] = BLOCK_SIZE_SB32X32;
-        }
-      }
-
-      // If we used 16x16 instead of 32x32 then skip 64x64 (if enabled).
-      if (cpi->sf.mb16_breakout && sb_partitioning[i] != BLOCK_SIZE_SB32X32) {
-        ++sb64_skip;
-      }
-
-      sb64_rate += sb32_rate;
-      sb64_dist += sb32_dist;
-
-      /* Encode SB using best computed mode(s) */
-      // FIXME(rbultje): there really shouldn't be any need to encode_mb/sb
-      // for each level that we go up, we can just keep tokens and recon
-      // pixels of the lower level; also, inverting SB/MB order (big->small
-      // instead of small->big) means we can use as threshold for small, which
-      // may enable breakouts if RD is not good enough (i.e. faster)
-      encode_sb(cpi, tp, mi_row + y_idx, mi_col + x_idx, 0,
-                BLOCK_SIZE_SB32X32, sb_partitioning[i], mb_partitioning[i],
-                NULL);
-    }
-
-    for (p = 0; p < MAX_MB_PLANE; p++) {
-      memcpy(cm->above_context[p] +
-                 (mi_col * 2 >> xd->plane[p].subsampling_x),
-             a + 16 * p,
-             sizeof(ENTROPY_CONTEXT) * 16 >> xd->plane[p].subsampling_x);
-      memcpy(cm->left_context[p], l + 16 * p,
-             sizeof(ENTROPY_CONTEXT) * 16 >> xd->plane[p].subsampling_y);
-    }
-    memcpy(cm->above_seg_context + mi_col, &seg_a, sizeof(seg_a));
-    memcpy(cm->left_seg_context, &seg_l, sizeof(seg_l));
-
-    set_partition_seg_context(cpi, mi_row, mi_col);
-    pl = partition_plane_context(xd, BLOCK_SIZE_SB64X64);
-    sb64_rate += x->partition_cost[pl][PARTITION_SPLIT];
-
-    // check 64x32
-    if (mi_col + 8 <= cm->mi_cols && !(cm->mb_rows & 1)) {
-      int r, d;
-
-      xd->sb_index = 0;
-      pick_sb_modes(cpi, mi_row, mi_col,
-                    tp, &r, &d, BLOCK_SIZE_SB64X32,
-                    &x->sb64x32_context[xd->sb_index]);
-      if (mi_row + 4 != cm->mi_rows) {
-        int r2, d2;
-
-        update_state(cpi, &x->sb64x32_context[xd->sb_index],
-                     BLOCK_SIZE_SB64X32, 0);
-        encode_superblock(cpi, tp,
-                          0, mi_row, mi_col, BLOCK_SIZE_SB64X32);
-        xd->sb_index = 1;
-        pick_sb_modes(cpi, mi_row + 4, mi_col,
-                      tp, &r2, &d2, BLOCK_SIZE_SB64X32,
-                      &x->sb64x32_context[xd->sb_index]);
-        r += r2;
-        d += d2;
-      }
-
-      set_partition_seg_context(cpi, mi_row, mi_col);
-      pl = partition_plane_context(xd, BLOCK_SIZE_SB64X64);
-      r += x->partition_cost[pl][PARTITION_HORZ];
-
-      /* is this better than MB coding? */
-      if (RDCOST(x->rdmult, x->rddiv, r, d) <
-              RDCOST(x->rdmult, x->rddiv, sb64_rate, sb64_dist)) {
-        sb64_rate = r;
-        sb64_dist = d;
-        sb64_partitioning = BLOCK_SIZE_SB64X32;
-      }
-
-      for (p = 0; p < MAX_MB_PLANE; p++) {
-        memcpy(cm->above_context[p] +
-                   (mi_col * 2 >> xd->plane[p].subsampling_x),
-               a + 16 * p,
-               sizeof(ENTROPY_CONTEXT) * 16 >> xd->plane[p].subsampling_x);
-        memcpy(cm->left_context[p], l + 16 * p,
-               sizeof(ENTROPY_CONTEXT) * 16 >> xd->plane[p].subsampling_y);
-      }
-    }
-
-    // check 32x64
-    if (mi_row + 8 <= cm->mi_rows && !(cm->mb_cols & 1)) {
-      int r, d;
-
-      xd->sb_index = 0;
-      pick_sb_modes(cpi, mi_row, mi_col,
-                    tp, &r, &d, BLOCK_SIZE_SB32X64,
-                    &x->sb32x64_context[xd->sb_index]);
-      if (mi_col + 4 != cm->mi_cols) {
-        int r2, d2;
-
-        update_state(cpi, &x->sb32x64_context[xd->sb_index],
-                     BLOCK_SIZE_SB32X64, 0);
-        encode_superblock(cpi, tp,
-                          0, mi_row, mi_col, BLOCK_SIZE_SB32X64);
-        xd->sb_index = 1;
-        pick_sb_modes(cpi, mi_row, mi_col + 4,
-                      tp, &r2, &d2, BLOCK_SIZE_SB32X64,
-                      &x->sb32x64_context[xd->sb_index]);
-        r += r2;
-        d += d2;
-      }
-
-      set_partition_seg_context(cpi, mi_row, mi_col);
-      pl = partition_plane_context(xd, BLOCK_SIZE_SB64X64);
-      r += x->partition_cost[pl][PARTITION_VERT];
-
-      /* is this better than MB coding? */
-      if (RDCOST(x->rdmult, x->rddiv, r, d) <
-              RDCOST(x->rdmult, x->rddiv, sb64_rate, sb64_dist)) {
-        sb64_rate = r;
-        sb64_dist = d;
-        sb64_partitioning = BLOCK_SIZE_SB32X64;
-      }
-
-      for (p = 0; p < MAX_MB_PLANE; p++) {
-        memcpy(cm->above_context[p] +
-                   (mi_col * 2 >> xd->plane[p].subsampling_x),
-               a + 16 * p,
-               sizeof(ENTROPY_CONTEXT) * 16 >> xd->plane[p].subsampling_x);
-        memcpy(cm->left_context[p], l + 16 * p,
-               sizeof(ENTROPY_CONTEXT) * 16 >> xd->plane[p].subsampling_y);
-      }
-    }
-
-    if (!sb64_skip &&
-        mi_col + 8 <= cm->mi_cols &&
-        mi_row + 8 <= cm->mi_rows) {
-      int r, d;
-
-      pick_sb_modes(cpi, mi_row, mi_col, tp, &r, &d,
-                    BLOCK_SIZE_SB64X64, &x->sb64_context);
-
-      set_partition_seg_context(cpi, mi_row, mi_col);
-      pl = partition_plane_context(xd, BLOCK_SIZE_SB64X64);
-      r += x->partition_cost[pl][PARTITION_NONE];
-
-      if (RDCOST(x->rdmult, x->rddiv, r, d) <
-              RDCOST(x->rdmult, x->rddiv, sb64_rate, sb64_dist)) {
-        sb64_rate = r;
-        sb64_dist = d;
-        sb64_partitioning = BLOCK_SIZE_SB64X64;
-      }
-    }
-
-    assert(tp_orig == *tp);
-    encode_sb(cpi, tp, mi_row, mi_col, 1, BLOCK_SIZE_SB64X64,
-              sb64_partitioning, sb_partitioning, mb_partitioning);
-    assert(tp_orig < *tp);
+    int dummy_rate, dummy_dist;
+    rd_pick_partition(cpi, tp, mi_row, mi_col, BLOCK_SIZE_SB64X64,
+                      &dummy_rate, &dummy_dist);
   }
 }
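
The long deleted block above is the hand-unrolled partition search: every rectangle from 16x16 up to 64x64 was tried explicitly, with entropy and segment context saved and restored at each level. All of that collapses into one recursive rd_pick_partition() call. Below is a minimal sketch of such a recursion under a simplified rate/distortion oracle; rate_dist_for_block(), RDMULT, RDDIV and the cost weighting are illustrative stand-ins, not the encoder's actual API. The real routine also prices the PARTITION_NONE/PARTITION_SPLIT signalling (the x->partition_cost terms above), which the sketch omits.

    /* Hypothetical per-block RD oracle standing in for pick_sb_modes(). */
    typedef struct { long rate, dist; } RdCost;
    extern RdCost rate_dist_for_block(int bsize, int row, int col);

    enum { MIN_BSIZE = 8, RDMULT = 128, RDDIV = 100 };

    static long rdcost(RdCost c) {
      return c.rate * RDMULT + c.dist * RDDIV;  /* stand-in for RDCOST() */
    }

    /* Compare coding the block whole against splitting it into four
     * recursively-coded quadrants, and keep whichever is cheaper. */
    static RdCost pick_partition(int bsize, int row, int col) {
      RdCost none = rate_dist_for_block(bsize, row, col);
      RdCost split = { 0, 0 };
      int half, i;
      if (bsize == MIN_BSIZE)
        return none;
      half = bsize / 2;
      for (i = 0; i < 4; i++) {
        RdCost sub = pick_partition(half, row + (i >> 1) * half,
                                    col + (i & 1) * half);
        split.rate += sub.rate;
        split.dist += sub.dist;
      }
      return rdcost(none) <= rdcost(split) ? none : split;
    }
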
 
@@ -1559,9 +1317,8 @@
           vp9_get_tile_col_offsets(cm, tile_col);
           for (mi_row = cm->cur_tile_mi_row_start;
                mi_row < cm->cur_tile_mi_row_end;
-               mi_row += 8) {
+               mi_row += 8)
             encode_sb_row(cpi, mi_row, &tp, &totalrate);
-          }
           cpi->tok_count[tile_col] = (unsigned int)(tp - tp_old);
           assert(tp - cpi->tok <=
                  get_token_alloc(cm->mb_rows, cm->mb_cols));
@@ -1901,7 +1658,11 @@
   }
 #endif
 
+#if CONFIG_AB4X4
+  if (xd->mode_info_context->mbmi.sb_type >= BLOCK_SIZE_SB8X8) {
+#else
   if (xd->mode_info_context->mbmi.sb_type > BLOCK_SIZE_SB8X8) {
+#endif
     ++cpi->sb_ymode_count[m];
   } else {
     ++cpi->ymode_count[m];
@@ -1986,13 +1747,17 @@
     vp9_update_zbin_extra(cpi, x);
   }
 
+#if CONFIG_AB4X4
+  if (xd->mode_info_context->mbmi.ref_frame == INTRA_FRAME &&
+      bsize < BLOCK_SIZE_SB8X8) {
+#else
   if (xd->mode_info_context->mbmi.mode == I4X4_PRED) {
     assert(bsize == BLOCK_SIZE_SB8X8 &&
            xd->mode_info_context->mbmi.txfm_size == TX_4X4);
-
-    vp9_encode_intra4x4mby(x, bsize);
-    vp9_build_intra_predictors_sbuv_s(&x->e_mbd, bsize);
-    vp9_encode_sbuv(cm, x, bsize);
+#endif
+    vp9_encode_intra4x4mby(x, BLOCK_SIZE_SB8X8);
+    vp9_build_intra_predictors_sbuv_s(&x->e_mbd, BLOCK_SIZE_SB8X8);
+    vp9_encode_sbuv(cm, x, BLOCK_SIZE_SB8X8);
 
     if (output_enabled)
       sum_intra_stats(cpi, x);
@@ -2028,15 +1793,22 @@
             ? &cpi->common.yv12_fb[second_ref_fb_idx] : NULL,
         mi_row, mi_col, xd->scale_factor, xd->scale_factor_uv);
 
-    vp9_build_inter_predictors_sb(xd, mi_row, mi_col, bsize);
+    vp9_build_inter_predictors_sb(xd, mi_row, mi_col,
+                     (bsize < BLOCK_SIZE_SB8X8) ? BLOCK_SIZE_SB8X8 : bsize);
   }
 
+#if CONFIG_AB4X4
+  if (xd->mode_info_context->mbmi.ref_frame == INTRA_FRAME &&
+      bsize < BLOCK_SIZE_SB8X8) {
+#else
   if (xd->mode_info_context->mbmi.mode == I4X4_PRED) {
     assert(bsize == BLOCK_SIZE_SB8X8);
-    vp9_tokenize_sb(cpi, &x->e_mbd, t, !output_enabled, bsize);
+#endif
+    vp9_tokenize_sb(cpi, &x->e_mbd, t, !output_enabled, BLOCK_SIZE_SB8X8);
   } else if (!x->skip) {
-    vp9_encode_sb(cm, x, bsize);
-    vp9_tokenize_sb(cpi, &x->e_mbd, t, !output_enabled, bsize);
+    vp9_encode_sb(cm, x, (bsize < BLOCK_SIZE_SB8X8) ? BLOCK_SIZE_SB8X8 : bsize);
+    vp9_tokenize_sb(cpi, &x->e_mbd, t, !output_enabled,
+                    (bsize < BLOCK_SIZE_SB8X8) ? BLOCK_SIZE_SB8X8 : bsize);
   } else {
     // FIXME(rbultje): not tile-aware (mi - 1)
     int mb_skip_context =
@@ -2045,7 +1817,8 @@
     xd->mode_info_context->mbmi.mb_skip_coeff = 1;
     if (output_enabled)
       cpi->skip_true_count[mb_skip_context]++;
-    vp9_reset_sb_tokens_context(xd, bsize);
+    vp9_reset_sb_tokens_context(xd,
+                 (bsize < BLOCK_SIZE_SB8X8) ? BLOCK_SIZE_SB8X8 : bsize);
   }
 
   // Copy the skip flag to all mb_mode_info contexts in this SB.
@@ -2075,8 +1848,12 @@
         sz = TX_16X16;
       if (sz == TX_16X16 && bsize < BLOCK_SIZE_MB16X16)
         sz = TX_8X8;
+#if CONFIG_AB4X4
+      if (sz == TX_8X8 && bsize < BLOCK_SIZE_SB8X8)
+#else
       if (sz == TX_8X8 && (xd->mode_info_context->mbmi.mode == SPLITMV ||
                            xd->mode_info_context->mbmi.mode == I4X4_PRED))
+#endif
         sz = TX_4X4;
 
       for (y = 0; y < bh; y++) {
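
Taken together, the CONFIG_AB4X4 hunks above change how the sub-8x8 path is detected: instead of keying off mode == I4X4_PRED, the encoder keys off the block size together with an intra reference frame, and the 8x8 helpers are then invoked with BLOCK_SIZE_SB8X8 regardless of the nominal bsize. A minimal sketch of that gating, assuming block-size enum values ordered smallest to largest; the names below are illustrative, not the mbmi fields themselves.

    /* SKETCH_BLOCK_8X8 is an illustrative enum value; in the real enum
     * all sub-8x8 sizes compare less than BLOCK_SIZE_SB8X8. */
    enum { SKETCH_BLOCK_8X8 = 3 };

    static int is_sub8x8_intra(int ref_is_intra, int bsize,
                               int mode_is_i4x4, int ab4x4_enabled) {
      if (ab4x4_enabled)
        return ref_is_intra && bsize < SKETCH_BLOCK_8X8;  /* size-based */
      return mode_is_i4x4;                                /* mode-based */
    }
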
diff --git a/vp9/encoder/vp9_encodemb.c b/vp9/encoder/vp9_encodemb.c
index 4665fcc..221de74 100644
--- a/vp9/encoder/vp9_encodemb.c
+++ b/vp9/encoder/vp9_encodemb.c
@@ -139,6 +139,7 @@
   const int ib = txfrm_block_to_raster_block(xd, bsize, plane,
                                              block, 2 * tx_size);
   const int16_t *dequant_ptr = xd->plane[plane].dequant;
+  const uint8_t * band_translate;
 
   assert((!type && !plane) || (type && plane));
   dqcoeff_ptr = BLOCK_OFFSET(xd->plane[plane].dqcoeff, block, 16);
@@ -149,23 +150,27 @@
       const TX_TYPE tx_type = plane == 0 ? get_tx_type_4x4(xd, ib) : DCT_DCT;
       default_eob = 16;
       scan = get_scan_4x4(tx_type);
+      band_translate = vp9_coefband_trans_4x4;
       break;
     }
     case TX_8X8: {
       const TX_TYPE tx_type = plane == 0 ? get_tx_type_8x8(xd, ib) : DCT_DCT;
       scan = get_scan_8x8(tx_type);
       default_eob = 64;
+      band_translate = vp9_coefband_trans_8x8plus;
       break;
     }
     case TX_16X16: {
       const TX_TYPE tx_type = plane == 0 ? get_tx_type_16x16(xd, ib) : DCT_DCT;
       scan = get_scan_16x16(tx_type);
       default_eob = 256;
+      band_translate = vp9_coefband_trans_8x8plus;
       break;
     }
     case TX_32X32:
       scan = vp9_default_zig_zag1d_32x32;
       default_eob = 1024;
+      band_translate = vp9_coefband_trans_8x8plus;
       break;
   }
   assert(eob <= default_eob);
@@ -204,7 +209,7 @@
       t0 = (vp9_dct_value_tokens_ptr + x)->token;
       /* Consider both possible successor states. */
       if (next < default_eob) {
-        band = get_coef_band(scan, tx_size, i + 1);
+        band = get_coef_band(band_translate, i + 1);
         pt = trellis_get_coeff_context(scan, nb, i, t0, token_cache,
                                        pad, default_eob);
         rate0 +=
@@ -254,7 +259,7 @@
         t0 = t1 = (vp9_dct_value_tokens_ptr + x)->token;
       }
       if (next < default_eob) {
-        band = get_coef_band(scan, tx_size, i + 1);
+        band = get_coef_band(band_translate, i + 1);
         if (t0 != DCT_EOB_TOKEN) {
           pt = trellis_get_coeff_context(scan, nb, i, t0, token_cache,
                                          pad, default_eob);
@@ -291,7 +296,7 @@
      *  add a new trellis node, but we do need to update the costs.
      */
     else {
-      band = get_coef_band(scan, tx_size, i + 1);
+      band = get_coef_band(band_translate, i + 1);
       t0 = tokens[next][0].token;
       t1 = tokens[next][1].token;
       /* Update the cost of each path if we're past the EOB token. */
@@ -310,7 +315,7 @@
   }
 
   /* Now pick the best path through the whole trellis. */
-  band = get_coef_band(scan, tx_size, i + 1);
+  band = get_coef_band(band_translate, i + 1);
   pt = combine_entropy_contexts(*a, *l);
   rate0 = tokens[next][0].rate;
   rate1 = tokens[next][1].rate;
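
Every call site in this hunk swaps the (scan, tx_size) band computation for a lookup through a table chosen once per transform size: vp9_coefband_trans_4x4 for TX_4X4 and a shared 8x8plus table for everything larger. A sketch of the idea; the table values are illustrative, and the real 8x8plus table is sized for the longer scans.

    #include <stdint.h>

    /* Illustrative band table in the spirit of vp9_coefband_trans_4x4:
     * maps a coefficient's scan position to its entropy-coding band. */
    static const uint8_t band_4x4[16] = {
      0, 1, 1, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 5, 5, 5
    };

    /* The trellis now resolves a band with one indexed load instead of
     * recomputing it from the scan order and transform size each time. */
    static int coef_band(const uint8_t *band_translate, int coef_index) {
      return band_translate[coef_index];
    }
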
@@ -420,6 +425,7 @@
   VP9_COMMON *cm;
   MACROBLOCK *x;
   struct optimize_ctx *ctx;
+  int *wip_txfrm_size;  // for "work in progress" only... will remove once done
 };
 
 static void xform_quant(int plane, int block, BLOCK_SIZE_TYPE bsize,
@@ -488,6 +494,7 @@
                          int ss_txfrm_size, void *arg) {
   struct encode_b_args* const args = arg;
   MACROBLOCK* const x = args->x;
+  int *wip_txfrm_size = args->wip_txfrm_size;
   MACROBLOCKD* const xd = &x->e_mbd;
   const int bw = 4 << (b_width_log2(bsize) - xd->plane[plane].subsampling_x);
   const int raster_block = txfrm_block_to_raster_block(xd, bsize, plane,
@@ -495,6 +502,10 @@
   int16_t* const diff = raster_block_offset_int16(xd, bsize, plane,
                                                   raster_block,
                                                   xd->plane[plane].diff);
+  uint8_t* const dst = raster_block_offset_uint8(xd, bsize, plane,
+                                                 raster_block,
+                                                 xd->plane[plane].dst.buf,
+                                                 xd->plane[plane].dst.stride);
   TX_TYPE tx_type = DCT_DCT;
 
   xform_quant(plane, block, bsize, ss_txfrm_size, arg);
@@ -504,18 +515,21 @@
 
   switch (ss_txfrm_size / 2) {
     case TX_32X32:
-      vp9_short_idct32x32(BLOCK_OFFSET(xd->plane[plane].dqcoeff, block, 16),
-                          diff, bw * 2);
+        vp9_short_idct32x32_add(BLOCK_OFFSET(xd->plane[plane].dqcoeff,
+                                block, 16), dst, xd->plane[plane].dst.stride);
+        *wip_txfrm_size = 32;
       break;
     case TX_16X16:
       tx_type = plane == 0 ? get_tx_type_16x16(xd, raster_block) : DCT_DCT;
       if (tx_type == DCT_DCT) {
-        vp9_short_idct16x16(BLOCK_OFFSET(xd->plane[plane].dqcoeff, block, 16),
-                            diff, bw * 2);
+        vp9_short_idct16x16_add(BLOCK_OFFSET(xd->plane[plane].dqcoeff,
+                                block, 16), dst, xd->plane[plane].dst.stride);
       } else {
-        vp9_short_iht16x16(BLOCK_OFFSET(xd->plane[plane].dqcoeff, block, 16),
-                           diff, bw, tx_type);
+        vp9_short_iht16x16_add(BLOCK_OFFSET(xd->plane[plane].dqcoeff,
+                               block, 16), dst, xd->plane[plane].dst.stride,
+                               tx_type);
       }
+      *wip_txfrm_size = 16;
       break;
     case TX_8X8:
       tx_type = plane == 0 ? get_tx_type_8x8(xd, raster_block) : DCT_DCT;
@@ -526,6 +540,7 @@
         vp9_short_iht8x8(BLOCK_OFFSET(xd->plane[plane].dqcoeff, block, 16),
                          diff, bw, tx_type);
       }
+      *wip_txfrm_size = 8;
       break;
     case TX_4X4:
       tx_type = plane == 0 ? get_tx_type_4x4(xd, raster_block) : DCT_DCT;
@@ -539,6 +554,7 @@
         vp9_short_iht4x4(BLOCK_OFFSET(xd->plane[plane].dqcoeff, block, 16),
                          diff, bw, tx_type);
       }
+      *wip_txfrm_size = 4;
       break;
   }
 }
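
The switch above moves each inverse transform from writing the residual into the diff buffer (to be reconstructed later) to adding it directly onto the prediction already sitting in dst, which is why the recon passes below become conditional. A sketch of the "_add" tail for a 4x4 block, assuming residual[] already holds the spatial-domain output of the inverse transform:

    #include <stdint.h>

    static uint8_t clip_pixel(int v) {
      return (uint8_t)(v < 0 ? 0 : v > 255 ? 255 : v);
    }

    /* Add an inverse-transformed 4x4 residual onto the predictor in
     * place, clamping to the 8-bit pixel range. */
    static void add_residual_4x4(const int16_t *residual, uint8_t *dst,
                                 int stride) {
      int r, c;
      for (r = 0; r < 4; r++)
        for (c = 0; c < 4; c++)
          dst[r * stride + c] = clip_pixel(dst[r * stride + c] +
                                           residual[r * 4 + c]);
    }
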
@@ -546,7 +562,7 @@
 void vp9_xform_quant_sby(VP9_COMMON *const cm, MACROBLOCK *x,
                          BLOCK_SIZE_TYPE bsize) {
   MACROBLOCKD* const xd = &x->e_mbd;
-  struct encode_b_args arg = {cm, x, NULL};
+  struct encode_b_args arg = {cm, x, NULL, NULL};
 
   foreach_transformed_block_in_plane(xd, bsize, 0,
                                      xform_quant, &arg);
@@ -555,7 +571,7 @@
 void vp9_xform_quant_sbuv(VP9_COMMON *const cm, MACROBLOCK *x,
                          BLOCK_SIZE_TYPE bsize) {
   MACROBLOCKD* const xd = &x->e_mbd;
-  struct encode_b_args arg = {cm, x, NULL};
+  struct encode_b_args arg = {cm, x, NULL, NULL};
 
   foreach_transformed_block_uv(xd, bsize, xform_quant, &arg);
 }
@@ -564,7 +580,8 @@
                     BLOCK_SIZE_TYPE bsize) {
   MACROBLOCKD* const xd = &x->e_mbd;
   struct optimize_ctx ctx;
-  struct encode_b_args arg = {cm, x, &ctx};
+  int wip_txfrm_size = 0;
+  struct encode_b_args arg = {cm, x, &ctx, &wip_txfrm_size};
 
   vp9_subtract_sby(x, bsize);
   if (x->optimize)
@@ -572,15 +589,16 @@
 
   foreach_transformed_block_in_plane(xd, bsize, 0,
                                      encode_block, &arg);
-
-  vp9_recon_sby(xd, bsize);
+  if (wip_txfrm_size < 32)
+    vp9_recon_sby(xd, bsize);
 }
 
 void vp9_encode_sbuv(VP9_COMMON *const cm, MACROBLOCK *x,
                      BLOCK_SIZE_TYPE bsize) {
   MACROBLOCKD* const xd = &x->e_mbd;
   struct optimize_ctx ctx;
-  struct encode_b_args arg = {cm, x, &ctx};
+  int wip_txfrm_size = 0;
+  struct encode_b_args arg = {cm, x, &ctx, &wip_txfrm_size};
 
   vp9_subtract_sbuv(x, bsize);
   if (x->optimize)
@@ -588,20 +606,35 @@
 
   foreach_transformed_block_uv(xd, bsize, encode_block, &arg);
 
-  vp9_recon_sbuv(xd, bsize);
+  if (wip_txfrm_size < 16)
+    vp9_recon_sbuv(xd, bsize);
 }
 
 void vp9_encode_sb(VP9_COMMON *const cm, MACROBLOCK *x,
                    BLOCK_SIZE_TYPE bsize) {
   MACROBLOCKD* const xd = &x->e_mbd;
   struct optimize_ctx ctx;
-  struct encode_b_args arg = {cm, x, &ctx};
+  int wip_txfrm_size = 0;
+  struct encode_b_args arg = {cm, x, &ctx, &wip_txfrm_size};
 
   vp9_subtract_sb(x, bsize);
   if (x->optimize)
     vp9_optimize_init(xd, bsize, &ctx);
-
+#if 0
   foreach_transformed_block(xd, bsize, encode_block, &arg);
 
   vp9_recon_sb(xd, bsize);
+#else
+  // wip version... will use foreach_transformed_block when done
+  foreach_transformed_block_in_plane(xd, bsize, 0,
+                                     encode_block, &arg);
+  if (wip_txfrm_size < 16)
+    vp9_recon_sby(xd, bsize);
+  wip_txfrm_size = 0;
+
+  foreach_transformed_block_uv(xd, bsize, encode_block, &arg);
+
+  if (wip_txfrm_size < 16)
+    vp9_recon_sbuv(xd, bsize);
+#endif
 }
diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c
index ff0725f..0561efe 100644
--- a/vp9/encoder/vp9_firstpass.c
+++ b/vp9/encoder/vp9_firstpass.c
@@ -47,7 +47,7 @@
 #define KF_MB_INTRA_MIN 150
 #define GF_MB_INTRA_MIN 100
 
-#define DOUBLE_DIVIDE_CHECK(X) ((X)<0?(X)-.000001:(X)+.000001)
+#define DOUBLE_DIVIDE_CHECK(x) ((x) < 0 ? (x) - 0.000001 : (x) + 0.000001)
 
 #define POW1 (double)cpi->oxcf.two_pass_vbrbias/100.0
 #define POW2 (double)cpi->oxcf.two_pass_vbrbias/100.0
@@ -78,8 +78,8 @@
 
 
 // Resets the first pass stats read position to the given location.
-static void reset_fpf_position(VP9_COMP *cpi, FIRSTPASS_STATS *Position) {
-  cpi->twopass.stats_in = Position;
+static void reset_fpf_position(VP9_COMP *cpi, FIRSTPASS_STATS *position) {
+  cpi->twopass.stats_in = position;
 }
 
 static int lookup_next_frame_stats(VP9_COMP *cpi, FIRSTPASS_STATS *next_frame) {
@@ -252,17 +252,11 @@
 
 // Calculate a modified error used in distributing bits between easier and harder frames.
 static double calculate_modified_err(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
-  double av_err = (cpi->twopass.total_stats.ssim_weighted_pred_err /
-                   cpi->twopass.total_stats.count);
-  double this_err = this_frame->ssim_weighted_pred_err;
-  double modified_err;
-
-  if (this_err > av_err)
-    modified_err = av_err * pow((this_err / DOUBLE_DIVIDE_CHECK(av_err)), POW1);
-  else
-    modified_err = av_err * pow((this_err / DOUBLE_DIVIDE_CHECK(av_err)), POW2);
-
-  return modified_err;
+  const FIRSTPASS_STATS *const stats = &cpi->twopass.total_stats;
+  const double av_err = stats->ssim_weighted_pred_err / stats->count;
+  const double this_err = this_frame->ssim_weighted_pred_err;
+  return av_err * pow(this_err / DOUBLE_DIVIDE_CHECK(av_err),
+                      this_err > av_err ? POW1 : POW2);
 }
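
The rewrite folds the two branches into a single pow() call; note that POW1 and POW2, as defined earlier in this file, expand to the same expression, so the exponent selection is currently vestigial. A standalone sketch of the mapping, where pow_hi/pow_lo stand in for POW1/POW2 and the guard mirrors DOUBLE_DIVIDE_CHECK:

    #include <math.h>

    #define DIVIDE_GUARD(x) ((x) < 0 ? (x) - 0.000001 : (x) + 0.000001)

    /* Errors above the clip average are amplified and errors below it
     * are compressed by a power law around that average. */
    static double modified_err(double this_err, double av_err,
                               double pow_hi, double pow_lo) {
      return av_err * pow(this_err / DIVIDE_GUARD(av_err),
                          this_err > av_err ? pow_hi : pow_lo);
    }
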
 
 static const double weight_table[256] = {
@@ -328,20 +322,14 @@
 static int frame_max_bits(VP9_COMP *cpi) {
   // Max allocation for a single frame based on the max section guidelines
   // passed in and how many bits are left.
-  int max_bits;
-
   // For VBR base this on the bits and frames left plus the
   // two_pass_vbrmax_section rate passed in by the user.
-  max_bits = (int) (((double) cpi->twopass.bits_left
-      / (cpi->twopass.total_stats.count - (double) cpi->common
-             .current_video_frame))
-                    * ((double) cpi->oxcf.two_pass_vbrmax_section / 100.0));
+  const double max_bits = (1.0 * cpi->twopass.bits_left /
+      (cpi->twopass.total_stats.count - cpi->common.current_video_frame)) *
+      (cpi->oxcf.two_pass_vbrmax_section / 100.0);
 
   // Trap case where we are out of bits.
-  if (max_bits < 0)
-    max_bits = 0;
-
-  return max_bits;
+  return MAX((int)max_bits, 0);
 }
 
 void vp9_init_first_pass(VP9_COMP *cpi) {
@@ -534,6 +522,8 @@
       xd->plane[2].dst.buf = new_yv12->v_buffer + recon_uvoffset;
       xd->left_available = (mb_col != 0);
 
+      xd->mode_info_context->mbmi.sb_type = BLOCK_SIZE_MB16X16;
+
       // do intra 16x16 prediction
       this_error = vp9_encode_intra(cpi, x, use_dc_pred);
 
@@ -632,7 +622,7 @@
           vp9_build_inter_predictors_sby(xd, mb_row << 1,
                                          mb_col << 1,
                                          BLOCK_SIZE_MB16X16);
-          vp9_encode_sb(cm, x, BLOCK_SIZE_MB16X16);
+          vp9_encode_sby(cm, x, BLOCK_SIZE_MB16X16);
           sum_mvr += mv.as_mv.row;
           sum_mvr_abs += abs(mv.as_mv.row);
           sum_mvc += mv.as_mv.col;
@@ -854,26 +844,18 @@
                                      double err_divisor,
                                      double pt_low,
                                      double pt_high,
-                                     int Q) {
-  double power_term;
-  double error_term = err_per_mb / err_divisor;
-  double correction_factor;
+                                     int q) {
+  const double error_term = err_per_mb / err_divisor;
 
   // Adjust the power term based on the actual quantizer.
-  power_term = (vp9_convert_qindex_to_q(Q) * 0.01) + pt_low;
-  power_term = (power_term > pt_high) ? pt_high : power_term;
+  const double power_term = MIN(vp9_convert_qindex_to_q(q) * 0.01 + pt_low,
+                                pt_high);
 
   // Calculate correction factor
   if (power_term < 1.0)
     assert(error_term >= 0.0);
-  correction_factor = pow(error_term, power_term);
 
-  // Clip range
-  correction_factor =
-    (correction_factor < 0.05)
-    ? 0.05 : (correction_factor > 5.0) ? 5.0 : correction_factor;
-
-  return correction_factor;
+  return fclamp(pow(error_term, power_term), 0.05, 5.0);
 }
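
calc_correction_factor() now leans on fclamp() instead of nested ternaries. A minimal sketch of the helper and of the clipped power-law factor it produces; fclamp_sketch() is a stand-in for the library helper:

    #include <math.h>

    static double fclamp_sketch(double v, double lo, double hi) {
      return v < lo ? lo : v > hi ? hi : v;
    }

    /* error_term ^ power_term, clipped to [0.05, 5.0] as above. */
    static double correction_factor(double err_per_mb, double err_divisor,
                                    double power_term) {
      return fclamp_sketch(pow(err_per_mb / err_divisor, power_term),
                           0.05, 5.0);
    }
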
 
 // Given a current maxQ value sets a range for future values.
@@ -882,10 +864,8 @@
 // (now uses the actual quantizer) but has not been tuned.
 static void adjust_maxq_qrange(VP9_COMP *cpi) {
   int i;
-  double q;
-
   // Set the max corresponding to cpi->avg_q * 2.0
-  q = cpi->avg_q * 2.0;
+  double q = cpi->avg_q * 2.0;
   cpi->twopass.maxq_max_limit = cpi->worst_quality;
   for (i = cpi->best_quality; i <= cpi->worst_quality; i++) {
     cpi->twopass.maxq_max_limit = i;
@@ -906,12 +886,11 @@
 static int estimate_max_q(VP9_COMP *cpi,
                           FIRSTPASS_STATS *fpstats,
                           int section_target_bandwitdh) {
-  int Q;
+  int q;
   int num_mbs = cpi->common.MBs;
   int target_norm_bits_per_mb;
 
-  double section_err = (fpstats->coded_error / fpstats->count);
-  double sr_err_diff;
+  double section_err = fpstats->coded_error / fpstats->count;
   double sr_correction;
   double err_per_mb = section_err / num_mbs;
   double err_correction_factor;
@@ -920,92 +899,74 @@
   if (section_target_bandwitdh <= 0)
     return cpi->twopass.maxq_max_limit;          // Highest value allowed
 
-  target_norm_bits_per_mb =
-    (section_target_bandwitdh < (1 << 20))
-    ? (512 * section_target_bandwitdh) / num_mbs
-    : 512 * (section_target_bandwitdh / num_mbs);
+  target_norm_bits_per_mb = section_target_bandwitdh < (1 << 20)
+                              ? (512 * section_target_bandwitdh) / num_mbs
+                              : 512 * (section_target_bandwitdh / num_mbs);
 
   // Look at the drop in prediction quality between the last frame
   // and the GF buffer (which contained an older frame).
   if (fpstats->sr_coded_error > fpstats->coded_error) {
-    sr_err_diff =
-      (fpstats->sr_coded_error - fpstats->coded_error) /
-      (fpstats->count * cpi->common.MBs);
-    sr_correction = (sr_err_diff / 32.0);
-    sr_correction = pow(sr_correction, 0.25);
-    if (sr_correction < 0.75)
-      sr_correction = 0.75;
-    else if (sr_correction > 1.25)
-      sr_correction = 1.25;
+    double sr_err_diff = (fpstats->sr_coded_error - fpstats->coded_error) /
+                             (fpstats->count * cpi->common.MBs);
+    sr_correction = fclamp(pow(sr_err_diff / 32.0, 0.25), 0.75, 1.25);
   } else {
     sr_correction = 0.75;
   }
 
   // Calculate a corrective factor based on a rolling ratio of bits spent
   // vs target bits
-  if ((cpi->rolling_target_bits > 0) &&
-      (cpi->active_worst_quality < cpi->worst_quality)) {
-    double rolling_ratio;
-
-    rolling_ratio = (double)cpi->rolling_actual_bits /
-                    (double)cpi->rolling_target_bits;
+  if (cpi->rolling_target_bits > 0 &&
+      cpi->active_worst_quality < cpi->worst_quality) {
+    double rolling_ratio = (double)cpi->rolling_actual_bits /
+                               (double)cpi->rolling_target_bits;
 
     if (rolling_ratio < 0.95)
       cpi->twopass.est_max_qcorrection_factor -= 0.005;
     else if (rolling_ratio > 1.05)
       cpi->twopass.est_max_qcorrection_factor += 0.005;
 
-    cpi->twopass.est_max_qcorrection_factor =
-      (cpi->twopass.est_max_qcorrection_factor < 0.1)
-      ? 0.1
-      : (cpi->twopass.est_max_qcorrection_factor > 10.0)
-      ? 10.0 : cpi->twopass.est_max_qcorrection_factor;
+    cpi->twopass.est_max_qcorrection_factor = fclamp(
+        cpi->twopass.est_max_qcorrection_factor, 0.1, 10.0);
   }
 
   // Corrections for higher compression speed settings
   // (reduced compression expected)
-  if (cpi->compressor_speed == 1) {
-    if (cpi->oxcf.cpu_used <= 5)
-      speed_correction = 1.04 + (cpi->oxcf.cpu_used * 0.04);
-    else
-      speed_correction = 1.25;
-  }
+  if (cpi->compressor_speed == 1)
+    speed_correction = cpi->oxcf.cpu_used <= 5 ?
+                          1.04 + (cpi->oxcf.cpu_used * 0.04) :
+                          1.25;
 
   // Try and pick a max Q that will be high enough to encode the
   // content at the given rate.
-  for (Q = cpi->twopass.maxq_min_limit; Q < cpi->twopass.maxq_max_limit; Q++) {
+  for (q = cpi->twopass.maxq_min_limit; q < cpi->twopass.maxq_max_limit; q++) {
     int bits_per_mb_at_this_q;
 
-    err_correction_factor =
-      calc_correction_factor(err_per_mb, ERR_DIVISOR, 0.4, 0.90, Q) *
-      sr_correction * speed_correction *
-      cpi->twopass.est_max_qcorrection_factor;
+    err_correction_factor = calc_correction_factor(err_per_mb,
+                                                   ERR_DIVISOR, 0.4, 0.90, q) *
+                                sr_correction * speed_correction *
+                                cpi->twopass.est_max_qcorrection_factor;
 
-
-    bits_per_mb_at_this_q =
-      vp9_bits_per_mb(INTER_FRAME, Q, err_correction_factor);
+    bits_per_mb_at_this_q = vp9_bits_per_mb(INTER_FRAME, q,
+                                            err_correction_factor);
 
     if (bits_per_mb_at_this_q <= target_norm_bits_per_mb)
       break;
   }
 
   // Restriction on active max q for constrained quality mode.
-  if ((cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY) &&
-      (Q < cpi->cq_target_quality)) {
-    Q = cpi->cq_target_quality;
-  }
+  if (cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY &&
+      q < cpi->cq_target_quality)
+    q = cpi->cq_target_quality;
 
   // Adjust maxq_min_limit and maxq_max_limit based on the average q
   // observed in the clip for non kf/gf/arf frames, but give the average
   // a chance to settle first.
   // PGW TODO: this code is broken for the extended Q range.
-  if ((cpi->ni_frames >
-       ((int)cpi->twopass.total_stats.count >> 8)) &&
-      (cpi->ni_frames > 25)) {
+  if (cpi->ni_frames > ((int)cpi->twopass.total_stats.count >> 8) &&
+      cpi->ni_frames > 25)
     adjust_maxq_qrange(cpi);
-  }
 
-  return Q;
+  return q;
 }
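
The q loop above is a plain rate-model search: walk the quantizer upward until the modeled bits-per-MB fits under the per-MB target. The same pattern in isolation; bits_per_mb() stands in for vp9_bits_per_mb():

    extern int bits_per_mb(int q, double correction);

    static int find_max_q(int q_min, int q_max, int target_bits_per_mb,
                          double correction) {
      int q;
      for (q = q_min; q < q_max; q++)
        if (bits_per_mb(q, correction) <= target_bits_per_mb)
          break;
      return q;  /* first q that is cheap enough, or q_max if none is */
    }
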
 
 // For cq mode estimate a cq level that matches the observed
@@ -1013,7 +974,7 @@
 static int estimate_cq(VP9_COMP *cpi,
                        FIRSTPASS_STATS *fpstats,
                        int section_target_bandwitdh) {
-  int Q;
+  int q;
   int num_mbs = cpi->common.MBs;
   int target_norm_bits_per_mb;
 
@@ -1064,29 +1025,29 @@
     clip_iifactor = 0.80;
 
   // Try and pick a Q that can encode the content at the given rate.
-  for (Q = 0; Q < MAXQ; Q++) {
+  for (q = 0; q < MAXQ; q++) {
     int bits_per_mb_at_this_q;
 
     // Error per MB based correction factor
     err_correction_factor =
-      calc_correction_factor(err_per_mb, 100.0, 0.4, 0.90, Q) *
+      calc_correction_factor(err_per_mb, 100.0, 0.4, 0.90, q) *
       sr_correction * speed_correction * clip_iifactor;
 
     bits_per_mb_at_this_q =
-      vp9_bits_per_mb(INTER_FRAME, Q, err_correction_factor);
+      vp9_bits_per_mb(INTER_FRAME, q, err_correction_factor);
 
     if (bits_per_mb_at_this_q <= target_norm_bits_per_mb)
       break;
   }
 
   // Clip value to range "best allowed to (worst allowed - 1)"
-  Q = select_cq_level(Q);
-  if (Q >= cpi->worst_quality)
-    Q = cpi->worst_quality - 1;
-  if (Q < cpi->best_quality)
-    Q = cpi->best_quality;
+  q = select_cq_level(q);
+  if (q >= cpi->worst_quality)
+    q = cpi->worst_quality - 1;
+  if (q < cpi->best_quality)
+    q = cpi->best_quality;
 
-  return Q;
+  return q;
 }
 
 
@@ -1117,9 +1078,8 @@
   // encoded in the second pass is a guess. However, the sum duration is not.
   // It's calculated based on the actual durations of all frames from the first
   // pass.
-  vp9_new_frame_rate(cpi,
-                     10000000.0 * cpi->twopass.total_stats.count /
-                     cpi->twopass.total_stats.duration);
+  vp9_new_frame_rate(cpi, 10000000.0 * cpi->twopass.total_stats.count /
+                       cpi->twopass.total_stats.duration);
 
   cpi->output_frame_rate = cpi->oxcf.frame_rate;
   cpi->twopass.bits_left = (int64_t)(cpi->twopass.total_stats.duration *
@@ -1191,9 +1151,8 @@
 
   // Look at the observed drop in prediction quality between the last frame
   // and the GF buffer (which contains an older frame).
-  mb_sr_err_diff =
-    (next_frame->sr_coded_error - next_frame->coded_error) /
-    (cpi->common.MBs);
+  mb_sr_err_diff = (next_frame->sr_coded_error - next_frame->coded_error) /
+                   cpi->common.MBs;
   if (mb_sr_err_diff <= 512.0) {
     second_ref_decay = 1.0 - (mb_sr_err_diff / 512.0);
     second_ref_decay = pow(second_ref_decay, 0.5);
@@ -1225,9 +1184,9 @@
   // Break clause to detect very still sections after motion.
   // For example, a static image after a fade or other transition
   // instead of a clean scene cut.
-  if ((frame_interval > MIN_GF_INTERVAL) &&
-      (loop_decay_rate >= 0.999) &&
-      (last_decay_rate < 0.9)) {
+  if (frame_interval > MIN_GF_INTERVAL &&
+      loop_decay_rate >= 0.999 &&
+      last_decay_rate < 0.9) {
     int j;
     FIRSTPASS_STATS *position = cpi->twopass.stats_in;
     FIRSTPASS_STATS tmp_next_frame;
@@ -1271,10 +1230,9 @@
     // are reasonably well predicted by an earlier (pre flash) frame.
     // The recovery after a flash is indicated by a high pcnt_second_ref
     // compared to pcnt_inter.
-    if ((next_frame.pcnt_second_ref > next_frame.pcnt_inter) &&
-        (next_frame.pcnt_second_ref >= 0.5)) {
+    if (next_frame.pcnt_second_ref > next_frame.pcnt_inter &&
+        next_frame.pcnt_second_ref >= 0.5)
       flash_detected = 1;
-    }
   }
 
   return flash_detected;
@@ -1356,13 +1314,9 @@
   return frame_boost;
 }
 
-static int calc_arf_boost(
-  VP9_COMP *cpi,
-  int offset,
-  int f_frames,
-  int b_frames,
-  int *f_boost,
-  int *b_boost) {
+static int calc_arf_boost(VP9_COMP *cpi, int offset,
+                          int f_frames, int b_frames,
+                          int *f_boost, int *b_boost) {
   FIRSTPASS_STATS this_frame;
 
   int i;
@@ -1392,8 +1346,7 @@
 
     // Cumulative effect of prediction quality decay
     if (!flash_detected) {
-      decay_accumulator =
-        decay_accumulator * get_prediction_decay_rate(cpi, &this_frame);
+      decay_accumulator *= get_prediction_decay_rate(cpi, &this_frame);
       decay_accumulator = decay_accumulator < MIN_DECAY_FACTOR
                           ? MIN_DECAY_FACTOR : decay_accumulator;
     }
@@ -1429,10 +1382,9 @@
 
     // Cumulative effect of prediction quality decay
     if (!flash_detected) {
-      decay_accumulator =
-        decay_accumulator * get_prediction_decay_rate(cpi, &this_frame);
+      decay_accumulator *= get_prediction_decay_rate(cpi, &this_frame);
       decay_accumulator = decay_accumulator < MIN_DECAY_FACTOR
-                          ? MIN_DECAY_FACTOR : decay_accumulator;
+                              ? MIN_DECAY_FACTOR : decay_accumulator;
     }
 
     boost_score += (decay_accumulator *
@@ -1871,26 +1823,20 @@
   for (i = 0;
       i <= (cpi->source_alt_ref_pending && cpi->common.frame_type != KEY_FRAME);
       ++i) {
-    int boost;
     int allocation_chunks;
-    int Q =
-        (cpi->oxcf.fixed_q < 0) ? cpi->last_q[INTER_FRAME] : cpi->oxcf.fixed_q;
+    int q = cpi->oxcf.fixed_q < 0 ? cpi->last_q[INTER_FRAME]
+                                  : cpi->oxcf.fixed_q;
     int gf_bits;
 
-    boost = (cpi->gfu_boost * vp9_gfboost_qadjust(Q)) / 100;
+    int boost = (cpi->gfu_boost * vp9_gfboost_qadjust(q)) / 100;
 
     // Set max and minimum boost and hence minimum allocation
-    if (boost > ((cpi->baseline_gf_interval + 1) * 200))
-      boost = ((cpi->baseline_gf_interval + 1) * 200);
-    else if (boost < 125)
-      boost = 125;
+    boost = clamp(boost, 125, (cpi->baseline_gf_interval + 1) * 200);
 
     if (cpi->source_alt_ref_pending && i == 0)
-      allocation_chunks =
-        ((cpi->baseline_gf_interval + 1) * 100) + boost;
+      allocation_chunks = ((cpi->baseline_gf_interval + 1) * 100) + boost;
     else
-      allocation_chunks =
-        (cpi->baseline_gf_interval * 100) + (boost - 100);
+      allocation_chunks = (cpi->baseline_gf_interval * 100) + (boost - 100);
 
     // Prevent overflow
     if (boost > 1023) {
@@ -1901,41 +1847,34 @@
 
     // Calculate the number of bits to be spent on the gf or arf based on
     // the boost number
-    gf_bits = (int)((double)boost *
-                    (cpi->twopass.gf_group_bits /
-                     (double)allocation_chunks));
+    gf_bits = (int)((double)boost * (cpi->twopass.gf_group_bits /
+                                       (double)allocation_chunks));
 
     // If the frame that is to be boosted is simpler than the average for
     // the gf/arf group then use an alternative calculation
     // based on the error score of the frame itself
     if (mod_frame_err < gf_group_err / (double)cpi->baseline_gf_interval) {
-      double  alt_gf_grp_bits;
-      int     alt_gf_bits;
-
-      alt_gf_grp_bits =
+      double alt_gf_grp_bits =
         (double)cpi->twopass.kf_group_bits  *
         (mod_frame_err * (double)cpi->baseline_gf_interval) /
         DOUBLE_DIVIDE_CHECK(cpi->twopass.kf_group_error_left);
 
-      alt_gf_bits = (int)((double)boost * (alt_gf_grp_bits /
+      int alt_gf_bits = (int)((double)boost * (alt_gf_grp_bits /
                                            (double)allocation_chunks));
 
-      if (gf_bits > alt_gf_bits) {
+      if (gf_bits > alt_gf_bits)
         gf_bits = alt_gf_bits;
-      }
     }
     // Else if it is harder than other frames in the group, make sure it
     // at least receives an allocation in keeping with its relative error
     // score; otherwise it may be worse off than an "un-boosted" frame.
     else {
-      int alt_gf_bits =
-        (int)((double)cpi->twopass.kf_group_bits *
-              mod_frame_err /
-              DOUBLE_DIVIDE_CHECK(cpi->twopass.kf_group_error_left));
+      int alt_gf_bits = (int)((double)cpi->twopass.kf_group_bits *
+                        mod_frame_err /
+                        DOUBLE_DIVIDE_CHECK(cpi->twopass.kf_group_error_left));
 
-      if (alt_gf_bits > gf_bits) {
+      if (alt_gf_bits > gf_bits)
         gf_bits = alt_gf_bits;
-      }
     }
 
     // Don't allow a negative value for gf_bits.
@@ -1983,14 +1922,11 @@
     // despite (MIN_GF_INTERVAL) and would cause a divide by 0 in the
     // calculation of alt_extra_bits.
     if (cpi->baseline_gf_interval >= 3) {
-      int boost = (cpi->source_alt_ref_pending)
-                  ? b_boost : cpi->gfu_boost;
+      const int boost = cpi->source_alt_ref_pending ? b_boost : cpi->gfu_boost;
 
       if (boost >= 150) {
-        int pct_extra;
         int alt_extra_bits;
-
-        pct_extra = (boost - 100) / 50;
+        int pct_extra = (boost - 100) / 50;
         pct_extra = (pct_extra > 20) ? 20 : pct_extra;
 
         alt_extra_bits = (int)((cpi->twopass.gf_group_bits * pct_extra) / 100);
@@ -2071,33 +2007,21 @@
 // Make a damped adjustment to the active max q.
 static int adjust_active_maxq(int old_maxqi, int new_maxqi) {
   int i;
-  int ret_val = new_maxqi;
-  double old_q;
-  double new_q;
-  double target_q;
-
-  old_q = vp9_convert_qindex_to_q(old_maxqi);
-  new_q = vp9_convert_qindex_to_q(new_maxqi);
-
-  target_q = ((old_q * 7.0) + new_q) / 8.0;
+  const double old_q = vp9_convert_qindex_to_q(old_maxqi);
+  const double new_q = vp9_convert_qindex_to_q(new_maxqi);
+  const double target_q = ((old_q * 7.0) + new_q) / 8.0;
 
   if (target_q > old_q) {
-    for (i = old_maxqi; i <= new_maxqi; i++) {
-      if (vp9_convert_qindex_to_q(i) >= target_q) {
-        ret_val = i;
-        break;
-      }
-    }
+    for (i = old_maxqi; i <= new_maxqi; i++)
+      if (vp9_convert_qindex_to_q(i) >= target_q)
+        return i;
   } else {
-    for (i = old_maxqi; i >= new_maxqi; i--) {
-      if (vp9_convert_qindex_to_q(i) <= target_q) {
-        ret_val = i;
-        break;
-      }
-    }
+    for (i = old_maxqi; i >= new_maxqi; i--)
+      if (vp9_convert_qindex_to_q(i) <= target_q)
+        return i;
   }
 
-  return ret_val;
+  return new_maxqi;
 }
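
adjust_active_maxq() damps the update by moving only one eighth of the way from the old quantizer toward the new one in real-q space, then snapping to the nearest q index in the direction of travel. A sketch; index_to_q() stands in for vp9_convert_qindex_to_q():

    extern double index_to_q(int qindex);

    static int damped_maxq(int old_qi, int new_qi) {
      const double old_q = index_to_q(old_qi);
      const double target = (old_q * 7.0 + index_to_q(new_qi)) / 8.0;
      int i;
      if (target > old_q) {
        for (i = old_qi; i <= new_qi; i++)
          if (index_to_q(i) >= target)
            return i;
      } else {
        for (i = old_qi; i >= new_qi; i--)
          if (index_to_q(i) <= target)
            return i;
      }
      return new_qi;
    }
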
 
 void vp9_second_pass(VP9_COMP *cpi) {
@@ -2111,9 +2035,8 @@
   double this_frame_intra_error;
   double this_frame_coded_error;
 
-  if (!cpi->twopass.stats_in) {
+  if (!cpi->twopass.stats_in)
     return;
-  }
 
   vp9_clear_system_state();
 
@@ -2123,12 +2046,8 @@
 
     // Set a cq_level in constrained quality mode.
     if (cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY) {
-      int est_cq;
-
-      est_cq =
-        estimate_cq(cpi,
-                    &cpi->twopass.total_left_stats,
-                    (int)(cpi->twopass.bits_left / frames_left));
+      int est_cq = estimate_cq(cpi, &cpi->twopass.total_left_stats,
+                               (int)(cpi->twopass.bits_left / frames_left));
 
       cpi->cq_target_quality = cpi->oxcf.cq_level;
       if (est_cq > cpi->cq_target_quality)
@@ -2139,14 +2058,12 @@
     cpi->twopass.maxq_max_limit = cpi->worst_quality;
     cpi->twopass.maxq_min_limit = cpi->best_quality;
 
-    tmp_q = estimate_max_q(
-              cpi,
-              &cpi->twopass.total_left_stats,
-              (int)(cpi->twopass.bits_left / frames_left));
+    tmp_q = estimate_max_q(cpi, &cpi->twopass.total_left_stats,
+                           (int)(cpi->twopass.bits_left / frames_left));
 
-    cpi->active_worst_quality         = tmp_q;
-    cpi->ni_av_qi                     = tmp_q;
-    cpi->avg_q                        = vp9_convert_qindex_to_q(tmp_q);
+    cpi->active_worst_quality = tmp_q;
+    cpi->ni_av_qi = tmp_q;
+    cpi->avg_q = vp9_convert_qindex_to_q(tmp_q);
 
 #ifndef ONE_SHOT_Q_ESTIMATE
     // Limit the maxq value returned subsequently.
@@ -2404,9 +2321,9 @@
     if (cpi->oxcf.auto_key
         && lookup_next_frame_stats(cpi, &next_frame) != EOF) {
       // Normal scene cut check
-      if (test_candidate_kf(cpi, &last_frame, this_frame, &next_frame)) {
+      if (test_candidate_kf(cpi, &last_frame, this_frame, &next_frame))
         break;
-      }
+
 
       // How fast is the prediction quality decaying?
       loop_decay_rate = get_prediction_decay_rate(cpi, &next_frame);
@@ -2416,19 +2333,14 @@
       // quality since the last GF or KF.
       recent_loop_decay[i % 8] = loop_decay_rate;
       decay_accumulator = 1.0;
-      for (j = 0; j < 8; j++) {
-        decay_accumulator = decay_accumulator * recent_loop_decay[j];
-      }
+      for (j = 0; j < 8; j++)
+        decay_accumulator *= recent_loop_decay[j];
 
       // Special check for a transition or high motion followed by a
       // return to a static scene.
-      if (detect_transition_to_still(cpi, i,
-                                     (cpi->key_frame_frequency - i),
-                                     loop_decay_rate,
-                                     decay_accumulator)) {
+      if (detect_transition_to_still(cpi, i, cpi->key_frame_frequency - i,
+                                     loop_decay_rate, decay_accumulator))
         break;
-      }
-
 
       // Step on to the next frame
       cpi->twopass.frames_to_key++;
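
The keyframe search above keeps a rolling window of the last eight per-frame decay rates and multiplies them into a cumulative decay estimate, which detect_transition_to_still() then tests. The same pattern in isolation:

    /* Fold the newest decay rate into an 8-frame rolling product. */
    static double rolling_decay(double history[8], int frame_index,
                                double this_rate) {
      double acc = 1.0;
      int j;
      history[frame_index % 8] = this_rate;
      for (j = 0; j < 8; j++)
        acc *= history[j];
      return acc;
    }
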
diff --git a/vp9/encoder/vp9_mcomp.c b/vp9/encoder/vp9_mcomp.c
index 74caba5..aff5637 100644
--- a/vp9/encoder/vp9_mcomp.c
+++ b/vp9/encoder/vp9_mcomp.c
@@ -413,6 +413,201 @@
 
   return besterr;
 }
+
+#if CONFIG_COMP_INTER_JOINT_SEARCH
+#undef DIST
+/* subpixel variance of the averaged (compound) prediction */
+#define DIST(r, c) \
+    vfp->svaf(PRE(r, c), y_stride, SP(c), SP(r), \
+              z, src_stride, &sse, second_pred)
+
+int vp9_find_best_sub_pixel_comp(MACROBLOCK *x,
+                                 int_mv *bestmv, int_mv *ref_mv,
+                                 int error_per_bit,
+                                 const vp9_variance_fn_ptr_t *vfp,
+                                 int *mvjcost, int *mvcost[2],
+                                 int *distortion,
+                                 unsigned int *sse1,
+                                 const uint8_t *second_pred, int w, int h) {
+  uint8_t *z = x->plane[0].src.buf;
+  int src_stride = x->plane[0].src.stride;
+  MACROBLOCKD *xd = &x->e_mbd;
+
+  int rr, rc, br, bc, hstep;
+  int tr, tc;
+  unsigned int besterr = INT_MAX;
+  unsigned int left, right, up, down, diag;
+  unsigned int sse;
+  unsigned int whichdir;
+  unsigned int halfiters = 4;
+  unsigned int quarteriters = 4;
+  unsigned int eighthiters = 4;
+  int thismse;
+  int maxc, minc, maxr, minr;
+  int y_stride;
+  int offset;
+  int usehp = xd->allow_high_precision_mv;
+
+  uint8_t *comp_pred = vpx_memalign(16, w * h * sizeof(uint8_t));
+  uint8_t *y = xd->plane[0].pre[0].buf +
+               (bestmv->as_mv.row) * xd->plane[0].pre[0].stride +
+               bestmv->as_mv.col;
+
+  y_stride = xd->plane[0].pre[0].stride;
+
+  rr = ref_mv->as_mv.row;
+  rc = ref_mv->as_mv.col;
+  br = bestmv->as_mv.row << 3;
+  bc = bestmv->as_mv.col << 3;
+  hstep = 4;
+  minc = MAX(x->mv_col_min << 3, (ref_mv->as_mv.col) -
+             ((1 << MV_MAX_BITS) - 1));
+  maxc = MIN(x->mv_col_max << 3, (ref_mv->as_mv.col) +
+             ((1 << MV_MAX_BITS) - 1));
+  minr = MAX(x->mv_row_min << 3, (ref_mv->as_mv.row) -
+             ((1 << MV_MAX_BITS) - 1));
+  maxr = MIN(x->mv_row_max << 3, (ref_mv->as_mv.row) +
+             ((1 << MV_MAX_BITS) - 1));
+
+  tr = br;
+  tc = bc;
+
+
+  offset = (bestmv->as_mv.row) * y_stride + bestmv->as_mv.col;
+
+  // central mv
+  bestmv->as_mv.row <<= 3;
+  bestmv->as_mv.col <<= 3;
+
+  // calculate central point error
+  // TODO(yunqingwang): the central point error was already calculated in the
+  // full-pixel search and could be passed into this function.
+  comp_avg_pred(comp_pred, second_pred, w, h, y, y_stride);
+  besterr = vfp->vf(comp_pred, w, z, src_stride, sse1);
+  *distortion = besterr;
+  besterr += mv_err_cost(bestmv, ref_mv, mvjcost, mvcost,
+                         error_per_bit, xd->allow_high_precision_mv);
+
+  // Each subsequent iteration checks at least one point in common with
+  // the last iteration (two if the diagonal was selected).
+  while (--halfiters) {
+    // 1/2 pel
+    CHECK_BETTER(left, tr, tc - hstep);
+    CHECK_BETTER(right, tr, tc + hstep);
+    CHECK_BETTER(up, tr - hstep, tc);
+    CHECK_BETTER(down, tr + hstep, tc);
+
+    whichdir = (left < right ? 0 : 1) + (up < down ? 0 : 2);
+
+    switch (whichdir) {
+      case 0:
+        CHECK_BETTER(diag, tr - hstep, tc - hstep);
+        break;
+      case 1:
+        CHECK_BETTER(diag, tr - hstep, tc + hstep);
+        break;
+      case 2:
+        CHECK_BETTER(diag, tr + hstep, tc - hstep);
+        break;
+      case 3:
+        CHECK_BETTER(diag, tr + hstep, tc + hstep);
+        break;
+    }
+
+    // no reason to check the same one again.
+    if (tr == br && tc == bc)
+      break;
+
+    tr = br;
+    tc = bc;
+  }
+
+  // 1/4 pel: each subsequent iteration checks at least one point in
+  // common with the last iteration (two if the diagonal was selected).
+  hstep >>= 1;
+  while (--quarteriters) {
+    CHECK_BETTER(left, tr, tc - hstep);
+    CHECK_BETTER(right, tr, tc + hstep);
+    CHECK_BETTER(up, tr - hstep, tc);
+    CHECK_BETTER(down, tr + hstep, tc);
+
+    whichdir = (left < right ? 0 : 1) + (up < down ? 0 : 2);
+
+    switch (whichdir) {
+      case 0:
+        CHECK_BETTER(diag, tr - hstep, tc - hstep);
+        break;
+      case 1:
+        CHECK_BETTER(diag, tr - hstep, tc + hstep);
+        break;
+      case 2:
+        CHECK_BETTER(diag, tr + hstep, tc - hstep);
+        break;
+      case 3:
+        CHECK_BETTER(diag, tr + hstep, tc + hstep);
+        break;
+    }
+
+    // no reason to check the same one again.
+    if (tr == br && tc == bc)
+      break;
+
+    tr = br;
+    tc = bc;
+  }
+
+  if (xd->allow_high_precision_mv) {
+    usehp = vp9_use_nmv_hp(&ref_mv->as_mv);
+  } else {
+    usehp = 0;
+  }
+
+  if (usehp) {
+    hstep >>= 1;
+    while (--eighthiters) {
+      CHECK_BETTER(left, tr, tc - hstep);
+      CHECK_BETTER(right, tr, tc + hstep);
+      CHECK_BETTER(up, tr - hstep, tc);
+      CHECK_BETTER(down, tr + hstep, tc);
+
+      whichdir = (left < right ? 0 : 1) + (up < down ? 0 : 2);
+
+      switch (whichdir) {
+        case 0:
+          CHECK_BETTER(diag, tr - hstep, tc - hstep);
+          break;
+        case 1:
+          CHECK_BETTER(diag, tr - hstep, tc + hstep);
+          break;
+        case 2:
+          CHECK_BETTER(diag, tr + hstep, tc - hstep);
+          break;
+        case 3:
+          CHECK_BETTER(diag, tr + hstep, tc + hstep);
+          break;
+      }
+
+      // no reason to check the same one again.
+      if (tr == br && tc == bc)
+        break;
+
+      tr = br;
+      tc = bc;
+    }
+  }
+  bestmv->as_mv.row = br;
+  bestmv->as_mv.col = bc;
+
+  vpx_free(comp_pred);
+
+  if ((abs(bestmv->as_mv.col - ref_mv->as_mv.col) > (MAX_FULL_PEL_VAL << 3)) ||
+      (abs(bestmv->as_mv.row - ref_mv->as_mv.row) > (MAX_FULL_PEL_VAL << 3)))
+    return INT_MAX;
+
+  return besterr;
+}
+#endif  // CONFIG_COMP_INTER_JOINT_SEARCH
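
Both compound-search routines in this file hinge on comp_avg_pred(), which averages the current reference's prediction with the fixed second-reference prediction before any error is measured. A sketch of that helper; the (a + b + 1) >> 1 rounding is an assumption, not copied from the library:

    #include <stdint.h>

    /* Each compound pixel is the rounded mean of the two single-
     * reference predictions; comp and second use a stride of w. */
    static void comp_avg_pred_sketch(uint8_t *comp, const uint8_t *second,
                                     int w, int h, const uint8_t *pred,
                                     int pred_stride) {
      int r, c;
      for (r = 0; r < h; r++) {
        for (c = 0; c < w; c++)
          comp[c] = (uint8_t)((pred[c] + second[c] + 1) >> 1);
        comp += w;
        second += w;
        pred += pred_stride;
      }
    }
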
+
 #undef MVC
 #undef PRE
 #undef DIST
@@ -2132,7 +2327,109 @@
     return INT_MAX;
 }
 
+#if CONFIG_COMP_INTER_JOINT_SEARCH
+/* This function is called when we do joint motion search in comp_inter_inter
+ * mode.
+ */
+int vp9_refining_search_8p_c(MACROBLOCK *x,
+                             int_mv *ref_mv, int error_per_bit,
+                             int search_range, vp9_variance_fn_ptr_t *fn_ptr,
+                             int *mvjcost, int *mvcost[2], int_mv *center_mv,
+                             const uint8_t *second_pred, int w, int h) {
+  const MACROBLOCKD* const xd = &x->e_mbd;
+  MV neighbors[8] = {{-1, 0}, {0, -1}, {0, 1}, {1, 0},
+      {-1, -1}, {1, -1}, {-1, 1}, {1, 1}};
+  int i, j;
+  int this_row_offset, this_col_offset;
 
+  int what_stride = x->plane[0].src.stride;
+  int in_what_stride = xd->plane[0].pre[0].stride;
+  uint8_t *what = x->plane[0].src.buf;
+  uint8_t *best_address = xd->plane[0].pre[0].buf +
+                          (ref_mv->as_mv.row * xd->plane[0].pre[0].stride) +
+                          ref_mv->as_mv.col;
+  uint8_t *check_here;
+  unsigned int thissad;
+  int_mv this_mv;
+  unsigned int bestsad = INT_MAX;
+  int_mv fcenter_mv;
+
+  int *mvjsadcost = x->nmvjointsadcost;
+  int *mvsadcost[2] = {x->nmvsadcost[0], x->nmvsadcost[1]};
+
+  /* Compound pred buffer */
+  uint8_t *comp_pred = vpx_memalign(16, w * h * sizeof(uint8_t));
+
+  fcenter_mv.as_mv.row = center_mv->as_mv.row >> 3;
+  fcenter_mv.as_mv.col = center_mv->as_mv.col >> 3;
+
+  /* Get compound pred by averaging two pred blocks. */
+  comp_avg_pred(comp_pred, second_pred, w, h, best_address, in_what_stride);
+
+  bestsad = fn_ptr->sdf(what, what_stride, comp_pred, w, 0x7fffffff) +
+      mvsad_err_cost(ref_mv, &fcenter_mv, mvjsadcost, mvsadcost, error_per_bit);
+
+  for (i = 0; i < search_range; i++) {
+    int best_site = -1;
+
+    for (j = 0; j < 8; j++) {
+      this_row_offset = ref_mv->as_mv.row + neighbors[j].row;
+      this_col_offset = ref_mv->as_mv.col + neighbors[j].col;
+
+      if ((this_col_offset > x->mv_col_min) &&
+          (this_col_offset < x->mv_col_max) &&
+          (this_row_offset > x->mv_row_min) &&
+          (this_row_offset < x->mv_row_max)) {
+        check_here = (neighbors[j].row) * in_what_stride + neighbors[j].col +
+            best_address;
+
+        /* Get compound block and use it to calculate SAD. */
+        comp_avg_pred(comp_pred, second_pred, w, h, check_here,
+                      in_what_stride);
+        thissad = fn_ptr->sdf(what, what_stride, comp_pred, w, bestsad);
+
+        if (thissad < bestsad) {
+          this_mv.as_mv.row = this_row_offset;
+          this_mv.as_mv.col = this_col_offset;
+          thissad += mvsad_err_cost(&this_mv, &fcenter_mv, mvjsadcost,
+                                    mvsadcost, error_per_bit);
+
+          if (thissad < bestsad) {
+            bestsad = thissad;
+            best_site = j;
+          }
+        }
+      }
+    }
+
+    if (best_site == -1) {
+      break;
+    } else {
+      ref_mv->as_mv.row += neighbors[best_site].row;
+      ref_mv->as_mv.col += neighbors[best_site].col;
+      best_address += (neighbors[best_site].row) * in_what_stride +
+          neighbors[best_site].col;
+    }
+  }
+
+  this_mv.as_mv.row = ref_mv->as_mv.row << 3;
+  this_mv.as_mv.col = ref_mv->as_mv.col << 3;
+
+  if (bestsad < INT_MAX) {
+    int besterr;
+    comp_avg_pred(comp_pred, second_pred, w, h, best_address, in_what_stride);
+    besterr = fn_ptr->vf(what, what_stride, comp_pred, w,
+        (unsigned int *)(&thissad)) +
+        mv_err_cost(&this_mv, center_mv, mvjcost, mvcost, x->errorperbit,
+                    xd->allow_high_precision_mv);
+    vpx_free(comp_pred);
+    return besterr;
+  } else {
+    vpx_free(comp_pred);
+    return INT_MAX;
+  }
+}
+#endif  // CONFIG_COMP_INTER_JOINT_SEARCH
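
Stripped of the SAD plumbing, vp9_refining_search_8p_c() is a greedy hill climb over the eight integer-pel neighbors of the current best vector: probe all eight, move to the best improving one, stop when none improves. The bare pattern; cost() stands in for the SAD-plus-mv-cost evaluation:

    typedef struct { int row, col; } Mv;
    extern unsigned cost(Mv mv);

    static Mv refine_8p(Mv best, int max_steps) {
      static const Mv nbr[8] = { { -1, 0 }, { 0, -1 }, { 0, 1 }, { 1, 0 },
                                 { -1, -1 }, { 1, -1 }, { -1, 1 }, { 1, 1 } };
      unsigned best_cost = cost(best);
      int step, j;
      for (step = 0; step < max_steps; step++) {
        int best_site = -1;
        for (j = 0; j < 8; j++) {
          Mv cand;
          unsigned c;
          cand.row = best.row + nbr[j].row;
          cand.col = best.col + nbr[j].col;
          c = cost(cand);
          if (c < best_cost) {
            best_cost = c;
            best_site = j;
          }
        }
        if (best_site < 0)
          break;                       /* no neighbor improved; converged */
        best.row += nbr[best_site].row;
        best.col += nbr[best_site].col;
      }
      return best;
    }
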
 
 #ifdef ENTROPY_STATS
 void print_mode_context(VP9_COMMON *pc) {
diff --git a/vp9/encoder/vp9_mcomp.h b/vp9/encoder/vp9_mcomp.h
index e1ba7fd..cdbd29a 100644
--- a/vp9/encoder/vp9_mcomp.h
+++ b/vp9/encoder/vp9_mcomp.h
@@ -79,5 +79,21 @@
                                        int *mvjcost, int *mvcost[2],
                                        int_mv *center_mv);
 
+#if CONFIG_COMP_INTER_JOINT_SEARCH
+int vp9_find_best_sub_pixel_comp(MACROBLOCK *x,
+                                 int_mv *bestmv, int_mv *ref_mv,
+                                 int error_per_bit,
+                                 const vp9_variance_fn_ptr_t *vfp,
+                                 int *mvjcost, int *mvcost[2],
+                                 int *distortion, unsigned int *sse1,
+                                 const uint8_t *second_pred,
+                                 int w, int h);
 
+int vp9_refining_search_8p_c(MACROBLOCK *x,
+                             int_mv *ref_mv, int error_per_bit,
+                             int search_range, vp9_variance_fn_ptr_t *fn_ptr,
+                             int *mvjcost, int *mvcost[2],
+                             int_mv *center_mv, const uint8_t *second_pred,
+                             int w, int h);
+#endif  // CONFIG_COMP_INTER_JOINT_SEARCH
 #endif  // VP9_ENCODER_VP9_MCOMP_H_
diff --git a/vp9/encoder/vp9_onyx_if.c b/vp9/encoder/vp9_onyx_if.c
index e55f555..464b649 100644
--- a/vp9/encoder/vp9_onyx_if.c
+++ b/vp9/encoder/vp9_onyx_if.c
@@ -1527,10 +1527,11 @@
   for (i = 0; i < MAX_MODES; i++)
     cpi->rd_thresh_mult[i] = 128;
 
-#define BFP(BT, SDF, VF, SVF, SVFHH, SVFHV, SVFHHV, SDX3F, SDX8F, SDX4DF) \
+#define BFP(BT, SDF, VF, SVF, SVAF, SVFHH, SVFHV, SVFHHV, SDX3F, SDX8F, SDX4DF)\
     cpi->fn_ptr[BT].sdf            = SDF; \
     cpi->fn_ptr[BT].vf             = VF; \
     cpi->fn_ptr[BT].svf            = SVF; \
+    cpi->fn_ptr[BT].svaf           = SVAF; \
     cpi->fn_ptr[BT].svf_halfpix_h  = SVFHH; \
     cpi->fn_ptr[BT].svf_halfpix_v  = SVFHV; \
     cpi->fn_ptr[BT].svf_halfpix_hv = SVFHHV; \
@@ -1539,57 +1540,64 @@
     cpi->fn_ptr[BT].sdx4df         = SDX4DF;
 
   BFP(BLOCK_32X16, vp9_sad32x16, vp9_variance32x16, vp9_sub_pixel_variance32x16,
-      NULL, NULL,
+      vp9_sub_pixel_avg_variance32x16, NULL, NULL,
       NULL, NULL, NULL,
       vp9_sad32x16x4d)
 
   BFP(BLOCK_16X32, vp9_sad16x32, vp9_variance16x32, vp9_sub_pixel_variance16x32,
-      NULL, NULL,
+      vp9_sub_pixel_avg_variance16x32, NULL, NULL,
       NULL, NULL, NULL,
       vp9_sad16x32x4d)
 
   BFP(BLOCK_64X32, vp9_sad64x32, vp9_variance64x32, vp9_sub_pixel_variance64x32,
-      NULL, NULL,
+      vp9_sub_pixel_avg_variance64x32, NULL, NULL,
       NULL, NULL, NULL,
       vp9_sad64x32x4d)
 
   BFP(BLOCK_32X64, vp9_sad32x64, vp9_variance32x64, vp9_sub_pixel_variance32x64,
-      NULL, NULL,
+      vp9_sub_pixel_avg_variance32x64, NULL, NULL,
       NULL, NULL, NULL,
       vp9_sad32x64x4d)
 
   BFP(BLOCK_32X32, vp9_sad32x32, vp9_variance32x32, vp9_sub_pixel_variance32x32,
-      vp9_variance_halfpixvar32x32_h, vp9_variance_halfpixvar32x32_v,
+      vp9_sub_pixel_avg_variance32x32, vp9_variance_halfpixvar32x32_h,
+      vp9_variance_halfpixvar32x32_v,
       vp9_variance_halfpixvar32x32_hv, vp9_sad32x32x3, vp9_sad32x32x8,
       vp9_sad32x32x4d)
 
   BFP(BLOCK_64X64, vp9_sad64x64, vp9_variance64x64, vp9_sub_pixel_variance64x64,
-      vp9_variance_halfpixvar64x64_h, vp9_variance_halfpixvar64x64_v,
+      vp9_sub_pixel_avg_variance64x64, vp9_variance_halfpixvar64x64_h,
+      vp9_variance_halfpixvar64x64_v,
       vp9_variance_halfpixvar64x64_hv, vp9_sad64x64x3, vp9_sad64x64x8,
       vp9_sad64x64x4d)
 
   BFP(BLOCK_16X16, vp9_sad16x16, vp9_variance16x16, vp9_sub_pixel_variance16x16,
-       vp9_variance_halfpixvar16x16_h, vp9_variance_halfpixvar16x16_v,
-       vp9_variance_halfpixvar16x16_hv, vp9_sad16x16x3, vp9_sad16x16x8,
-       vp9_sad16x16x4d)
+      vp9_sub_pixel_avg_variance16x16, vp9_variance_halfpixvar16x16_h,
+      vp9_variance_halfpixvar16x16_v,
+      vp9_variance_halfpixvar16x16_hv, vp9_sad16x16x3, vp9_sad16x16x8,
+      vp9_sad16x16x4d)
 
   BFP(BLOCK_16X8, vp9_sad16x8, vp9_variance16x8, vp9_sub_pixel_variance16x8,
-      NULL, NULL, NULL, vp9_sad16x8x3, vp9_sad16x8x8, vp9_sad16x8x4d)
+      vp9_sub_pixel_avg_variance16x8, NULL, NULL, NULL,
+      vp9_sad16x8x3, vp9_sad16x8x8, vp9_sad16x8x4d)
 
   BFP(BLOCK_8X16, vp9_sad8x16, vp9_variance8x16, vp9_sub_pixel_variance8x16,
-      NULL, NULL, NULL, vp9_sad8x16x3, vp9_sad8x16x8, vp9_sad8x16x4d)
+      vp9_sub_pixel_avg_variance8x16, NULL, NULL, NULL,
+      vp9_sad8x16x3, vp9_sad8x16x8, vp9_sad8x16x4d)
 
   BFP(BLOCK_8X8, vp9_sad8x8, vp9_variance8x8, vp9_sub_pixel_variance8x8,
-      NULL, NULL, NULL, vp9_sad8x8x3, vp9_sad8x8x8, vp9_sad8x8x4d)
+      vp9_sub_pixel_avg_variance8x8, NULL, NULL, NULL,
+      vp9_sad8x8x3, vp9_sad8x8x8, vp9_sad8x8x4d)
 
   BFP(BLOCK_4X8, NULL, vp9_variance4x8, NULL,
-      NULL, NULL, NULL, NULL, NULL, NULL)
+      NULL, NULL, NULL, NULL, NULL, NULL, NULL)
 
   BFP(BLOCK_8X4, NULL, vp9_variance8x4, NULL,
-      NULL, NULL, NULL, NULL, NULL, NULL)
+      NULL, NULL, NULL, NULL, NULL, NULL, NULL)
 
   BFP(BLOCK_4X4, vp9_sad4x4, vp9_variance4x4, vp9_sub_pixel_variance4x4,
-      NULL, NULL, NULL, vp9_sad4x4x3, vp9_sad4x4x8, vp9_sad4x4x4d)
+      vp9_sub_pixel_avg_variance4x4, NULL, NULL, NULL,
+      vp9_sad4x4x3, vp9_sad4x4x8, vp9_sad4x4x4d)
 
   cpi->full_search_sad = vp9_full_search_sad;
   cpi->diamond_search_sad = vp9_diamond_search_sad;
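
For reference, the BFP invocation for BLOCK_16X16 above expands to the
following statement sequence inside the encoder init path once the new SVAF
slot is threaded through (written out by hand; a macro-expansion excerpt, not
a standalone program):

    cpi->fn_ptr[BLOCK_16X16].sdf            = vp9_sad16x16;
    cpi->fn_ptr[BLOCK_16X16].vf             = vp9_variance16x16;
    cpi->fn_ptr[BLOCK_16X16].svf            = vp9_sub_pixel_variance16x16;
    cpi->fn_ptr[BLOCK_16X16].svaf           = vp9_sub_pixel_avg_variance16x16;
    cpi->fn_ptr[BLOCK_16X16].svf_halfpix_h  = vp9_variance_halfpixvar16x16_h;
    cpi->fn_ptr[BLOCK_16X16].svf_halfpix_v  = vp9_variance_halfpixvar16x16_v;
    cpi->fn_ptr[BLOCK_16X16].svf_halfpix_hv = vp9_variance_halfpixvar16x16_hv;
    cpi->fn_ptr[BLOCK_16X16].sdx3f          = vp9_sad16x16x3;
    cpi->fn_ptr[BLOCK_16X16].sdx8f          = vp9_sad16x16x8;
    cpi->fn_ptr[BLOCK_16X16].sdx4df         = vp9_sad16x16x4d;
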
@@ -2120,49 +2128,31 @@
   const int in_h = src_fb->y_crop_height;
   const int out_w = dst_fb->y_crop_width;
   const int out_h = dst_fb->y_crop_height;
-  int x, y;
+  int x, y, i;
+
+  uint8_t *srcs[3] = {src_fb->y_buffer, src_fb->u_buffer, src_fb->v_buffer};
+  int src_strides[3] = {src_fb->y_stride, src_fb->uv_stride, src_fb->uv_stride};
+
+  uint8_t *dsts[3] = {dst_fb->y_buffer, dst_fb->u_buffer, dst_fb->v_buffer};
+  int dst_strides[3] = {dst_fb->y_stride, dst_fb->uv_stride, dst_fb->uv_stride};
 
   for (y = 0; y < out_h; y += 16) {
     for (x = 0; x < out_w; x += 16) {
-      int x_q4 = x * 16 * in_w / out_w;
-      int y_q4 = y * 16 * in_h / out_h;
-      uint8_t *src = src_fb->y_buffer + y * in_h / out_h * src_fb->y_stride +
-                     x * in_w / out_w;
-      uint8_t *dst = dst_fb->y_buffer + y * dst_fb->y_stride + x;
-      int src_stride = src_fb->y_stride;
-      int dst_stride = dst_fb->y_stride;
+      for (i = 0; i < MAX_MB_PLANE; ++i) {
+        const int factor = i == 0 ? 1 : 2;
+        const int x_q4 = x * (16 / factor) * in_w / out_w;
+        const int y_q4 = y * (16 / factor) * in_h / out_h;
+        const int src_stride = src_strides[i];
+        const int dst_stride = dst_strides[i];
+        uint8_t *src = srcs[i] + y / factor * in_h / out_h * src_stride +
+                                 x / factor * in_w / out_w;
+        uint8_t *dst = dsts[i] + y * dst_stride + x;
 
-      vp9_convolve8(src, src_stride, dst, dst_stride,
-                    vp9_sub_pel_filters_8[x_q4 & 0xf], 16 * in_w / out_w,
-                    vp9_sub_pel_filters_8[y_q4 & 0xf], 16 * in_h / out_h,
-                    16, 16);
-
-      x_q4 >>= 1;
-      y_q4 >>= 1;
-      src_stride = src_fb->uv_stride;
-      dst_stride = dst_fb->uv_stride;
-
-      src = src_fb->u_buffer +
-          y / 2 * in_h / out_h * src_fb->uv_stride +
-          x / 2 * in_w / out_w;
-      dst = dst_fb->u_buffer +
-          y / 2 * dst_fb->uv_stride +
-          x / 2;
-      vp9_convolve8(src, src_stride, dst, dst_stride,
-                    vp9_sub_pel_filters_8[x_q4 & 0xf], 16 * in_w / out_w,
-                    vp9_sub_pel_filters_8[y_q4 & 0xf], 16 * in_h / out_h,
-                    8, 8);
-
-      src = src_fb->v_buffer +
-          y / 2 * in_h / out_h * src_fb->uv_stride +
-          x / 2 * in_w / out_w;
-      dst = dst_fb->v_buffer +
-          y / 2 * dst_fb->uv_stride +
-          x / 2;
-      vp9_convolve8(src, src_stride, dst, dst_stride,
-                    vp9_sub_pel_filters_8[x_q4 & 0xf], 16 * in_w / out_w,
-                    vp9_sub_pel_filters_8[y_q4 & 0xf], 16 * in_h / out_h,
-                    8, 8);
+        vp9_convolve8(src, src_stride, dst, dst_stride,
+                      vp9_sub_pel_filters_8[x_q4 & 0xf], 16 * in_w / out_w,
+                      vp9_sub_pel_filters_8[y_q4 & 0xf], 16 * in_h / out_h,
+                      16 / factor, 16 / factor);
+      }
     }
   }
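
The rewritten scaler above folds Y, U and V into a single loop: factor is 1
for luma and 2 for the 4:2:0 chroma planes, positions are tracked in q4
(1/16-pel) units, and the low four bits of x_q4/y_q4 select the 8-tap filter
phase. A small standalone check of that arithmetic, using an assumed
300-to-640 horizontal upscale:

    #include <stdio.h>

    int main(void) {
      const int in_w = 300, out_w = 640;  // assumed example dimensions
      const int x = 16;                   // output tile x, a multiple of 16
      int i;
      for (i = 0; i < 2; i++) {           // i == 0: luma, i == 1: chroma
        const int factor = i == 0 ? 1 : 2;
        const int x_q4 = x * (16 / factor) * in_w / out_w;  // q4 position
        const int src_x = x / factor * in_w / out_w;  // integer source column
        const int step_q4 = 16 * in_w / out_w;        // per-pixel step, in q4
        // Prints "plane 0: src_x=7 phase=8 step_q4=7" and
        //        "plane 1: src_x=3 phase=12 step_q4=7".
        printf("plane %d: src_x=%d phase=%d step_q4=%d\n",
               i, src_x, x_q4 & 0xf, step_q4);
      }
      return 0;
    }
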
 
@@ -2873,7 +2863,7 @@
         break;
     }
 
-    vp9_denoise(cpi->Source, cpi->Source, l, 1, 0);
+    vp9_denoise(cpi->Source, cpi->Source, l);
   }
 
 #endif
@@ -3870,16 +3860,8 @@
                            VP9BORDERINPIXELS);
 
   // Calculate scaling factors for each of the 3 available references
-  for (i = 0; i < ALLOWED_REFS_PER_FRAME; ++i) {
-    if (cm->active_ref_idx[i] >= NUM_YV12_BUFFERS) {
-      memset(&cm->active_ref_scale[i], 0, sizeof(cm->active_ref_scale[i]));
-    } else {
-      YV12_BUFFER_CONFIG *fb = &cm->yv12_fb[cm->active_ref_idx[i]];
-      vp9_setup_scale_factors_for_frame(&cm->active_ref_scale[i],
-                                        fb->y_crop_width, fb->y_crop_height,
-                                        cm->width, cm->height);
-    }
-  }
+  for (i = 0; i < ALLOWED_REFS_PER_FRAME; ++i)
+    vp9_setup_scale_factors(cm, i);
 
   vp9_setup_interp_filters(&cpi->mb.e_mbd, DEFAULT_INTERP_FILTER, cm);
 
@@ -3958,7 +3940,7 @@
           double weight = 0;
 #if CONFIG_POSTPROC
           vp9_deblock(cm->frame_to_show, &cm->post_proc_buffer,
-                      cm->filter_level * 10 / 6, 1, 0);
+                      cm->filter_level * 10 / 6);
 #endif
           vp9_clear_system_state();
 
diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c
index 1b143f5..271a63f 100644
--- a/vp9/encoder/vp9_rdopt.c
+++ b/vp9/encoder/vp9_rdopt.c
@@ -272,6 +272,7 @@
                         [ENTROPY_NODES];
   int seg_eob, default_eob;
   uint8_t token_cache[1024];
+  const uint8_t *band_translate;
 
   // Check for consistency of tx_size with mode info
   assert((!type && !plane) || (type && plane));
@@ -291,6 +292,7 @@
       coef_probs = cm->fc.coef_probs_4x4;
       seg_eob = 16;
       scan = get_scan_4x4(tx_type);
+      band_translate = vp9_coefband_trans_4x4;
       break;
     }
     case TX_8X8: {
@@ -304,6 +306,7 @@
       scan = get_scan_8x8(tx_type);
       coef_probs = cm->fc.coef_probs_8x8;
       seg_eob = 64;
+      band_translate = vp9_coefband_trans_8x8plus;
       break;
     }
     case TX_16X16: {
@@ -317,6 +320,7 @@
       seg_eob = 256;
       above_ec = (A[0] + A[1] + A[2] + A[3]) != 0;
       left_ec = (L[0] + L[1] + L[2] + L[3]) != 0;
+      band_translate = vp9_coefband_trans_8x8plus;
       break;
     }
     case TX_32X32:
@@ -325,6 +329,7 @@
       seg_eob = 1024;
       above_ec = (A[0] + A[1] + A[2] + A[3] + A[4] + A[5] + A[6] + A[7]) != 0;
       left_ec = (L[0] + L[1] + L[2] + L[3] + L[4] + L[5] + L[6] + L[7]) != 0;
+      band_translate = vp9_coefband_trans_8x8plus;
       break;
     default:
       abort();
@@ -347,7 +352,7 @@
     for (c = 0; c < eob; c++) {
       int v = qcoeff_ptr[scan[c]];
       int t = vp9_dct_value_tokens_ptr[v].token;
-      int band = get_coef_band(scan, tx_size, c);
+      int band = get_coef_band(band_translate, c);
       if (c)
         pt = vp9_get_coef_context(scan, nb, pad, token_cache, c, default_eob);
 
@@ -361,7 +366,7 @@
       if (c)
         pt = vp9_get_coef_context(scan, nb, pad, token_cache, c, default_eob);
       cost += mb->token_costs[tx_size][type][ref]
-          [get_coef_band(scan, tx_size, c)]
+          [get_coef_band(band_translate, c)]
           [pt][DCT_EOB_TOKEN];
     }
   }
@@ -684,7 +689,11 @@
                                          int *Distortion, int64_t best_rd) {
   int i;
   MACROBLOCKD *const xd = &mb->e_mbd;
+#if CONFIG_AB4X4
+  int cost = 0;
+#else
   int cost = mb->mbmode_cost[xd->frame_type][I4X4_PRED];
+#endif
   int distortion = 0;
   int tot_rate_y = 0;
   int64_t total_rd = 0;
@@ -714,7 +723,6 @@
     total_rd += rd_pick_intra4x4block(cpi, mb, i, &best_mode, bmode_costs,
                                       t_above + x_idx, t_left + y_idx,
                                       &r, &ry, &d);
-
     cost += r;
     distortion += d;
     tot_rate_y += ry;
@@ -748,6 +756,13 @@
   TX_SIZE UNINITIALIZED_IS_SAFE(best_tx);
   int i;
 
+#if CONFIG_AB4X4
+  if (bsize < BLOCK_SIZE_SB8X8) {
+    x->e_mbd.mode_info_context->mbmi.txfm_size = TX_4X4;
+    return best_rd;
+  }
+#endif
+
   for (i = 0; i < NB_TXFM_MODES; i++)
     txfm_cache[i] = INT64_MAX;
 
@@ -1069,9 +1084,7 @@
   B_PREDICTION_MODE modes[4];
   int_mv mvs[4], second_mvs[4];
   int eobs[4];
-
   int mvthresh;
-  int *mdcounts;
 } BEST_SEG_INFO;
 
 static INLINE int mv_check_bounds(MACROBLOCK *x, int_mv *mv) {
@@ -1322,7 +1335,6 @@
                                        int_mv *best_ref_mv,
                                        int_mv *second_best_ref_mv,
                                        int64_t best_rd,
-                                       int *mdcounts,
                                        int *returntotrate,
                                        int *returnyrate,
                                        int *returndistortion,
@@ -1339,7 +1351,6 @@
   bsi.second_ref_mv = second_best_ref_mv;
   bsi.mvp.as_int = best_ref_mv->as_int;
   bsi.mvthresh = mvthresh;
-  bsi.mdcounts = mdcounts;
 
   for (i = 0; i < 4; i++)
     bsi.modes[i] = ZERO4X4;
@@ -1612,7 +1623,6 @@
                                int mi_row, int mi_col,
                                int_mv frame_nearest_mv[MAX_REF_FRAMES],
                                int_mv frame_near_mv[MAX_REF_FRAMES],
-                               int frame_mdcounts[4][4],
                                struct buf_2d yv12_mb[4][MAX_MB_PLANE],
                                struct scale_factors scale[MAX_REF_FRAMES]) {
   VP9_COMMON *cm = &cpi->common;
@@ -1797,7 +1807,7 @@
 
 static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
                                  BLOCK_SIZE_TYPE bsize,
-                                 int mdcounts[4], int64_t txfm_cache[],
+                                 int64_t txfm_cache[],
                                  int *rate2, int *distortion, int *skippable,
                                  int *compmode_cost,
                                  int *rate_y, int *distortion_y,
@@ -1807,8 +1817,9 @@
                                  INTERPOLATIONFILTERTYPE *best_filter,
                                  int_mv frame_mv[MB_MODE_COUNT]
                                                 [MAX_REF_FRAMES],
-                                 YV12_BUFFER_CONFIG *scaled_ref_frame,
-                                 int mi_row, int mi_col) {
+                                 YV12_BUFFER_CONFIG **scaled_ref_frame,
+                                 int mi_row, int mi_col,
+                                 int_mv single_newmv[MAX_REF_FRAMES]) {
   const int bw = 1 << mi_width_log2(bsize), bh = 1 << mi_height_log2(bsize);
 
   VP9_COMMON *cm = &cpi->common;
@@ -1838,6 +1849,158 @@
       ref_mv[1] = mbmi->ref_mvs[refs[1]][0];
 
       if (is_comp_pred) {
+#if CONFIG_COMP_INTER_JOINT_SEARCH
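+        // Pixel {width, height} for each BLOCK_SIZE_TYPE entry.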
+        const int b_sz[BLOCK_SIZE_TYPES][2] = {
+            {4, 4},
+            {8, 8},
+            {8, 16},
+            {16, 8},
+            {16, 16},
+            {16, 32},
+            {32, 16},
+            {32, 32},
+            {32, 64},
+            {64, 32},
+            {64, 64}
+        };
+
+        int ite;
+        // Prediction buffer from second frame.
+        uint8_t *second_pred = vpx_memalign(16, b_sz[bsize][0] *
+                                            b_sz[bsize][1] * sizeof(uint8_t));
+
+        // Do joint motion search in compound mode to get more accurate mv.
+        struct buf_2d backup_yv12[MAX_MB_PLANE] = {{0}};
+        struct buf_2d backup_second_yv12[MAX_MB_PLANE] = {{0}};
+        struct buf_2d scaled_first_yv12;
+        int last_besterr[2] = {INT_MAX, INT_MAX};
+
+        if (scaled_ref_frame[0]) {
+          int i;
+
+          // Swap out the reference frame for a version that's been scaled to
+          // match the resolution of the current frame, allowing the existing
+          // motion search code to be used without additional modifications.
+          for (i = 0; i < MAX_MB_PLANE; i++)
+            backup_yv12[i] = xd->plane[i].pre[0];
+
+          setup_pre_planes(xd, scaled_ref_frame[0], NULL, mi_row, mi_col,
+                           NULL, NULL);
+        }
+
+        if (scaled_ref_frame[1]) {
+          int i;
+
+          for (i = 0; i < MAX_MB_PLANE; i++)
+            backup_second_yv12[i] = xd->plane[i].pre[1];
+
+          setup_pre_planes(xd, scaled_ref_frame[1], NULL, mi_row, mi_col,
+                           NULL, NULL);
+        }
+        xd->scale_factor[0].set_scaled_offsets(&xd->scale_factor[0],
+                                                mi_row, mi_col);
+        xd->scale_factor[1].set_scaled_offsets(&xd->scale_factor[1],
+                                                mi_row, mi_col);
+
+        scaled_first_yv12 = xd->plane[0].pre[0];
+
+        // Initialize mv using single prediction mode result.
+        frame_mv[NEWMV][refs[0]].as_int = single_newmv[refs[0]].as_int;
+        frame_mv[NEWMV][refs[1]].as_int = single_newmv[refs[1]].as_int;
+
+        // Iterate the joint search, alternating between the two ref frames,
+        // and break out of the loop once it fails to find a better mv.
+        for (ite = 0; ite < 4; ite++) {
+          struct buf_2d ref_yv12[2] = {xd->plane[0].pre[0],
+                                       xd->plane[0].pre[1]};
+          int bestsme = INT_MAX;
+          int sadpb = x->sadperbit16;
+          int_mv tmp_mv;
+          int search_range = 3;
+
+          int tmp_col_min = x->mv_col_min;
+          int tmp_col_max = x->mv_col_max;
+          int tmp_row_min = x->mv_row_min;
+          int tmp_row_max = x->mv_row_max;
+          int id = ite % 2;
+
+          // Get pred block from second frame.
+          vp9_build_inter_predictor(ref_yv12[!id].buf,
+                                    ref_yv12[!id].stride,
+                                    second_pred, b_sz[bsize][0],
+                                    &frame_mv[NEWMV][refs[!id]],
+                                    &xd->scale_factor[!id],
+                                    b_sz[bsize][0], b_sz[bsize][1], 0,
+                                    &xd->subpix);
+
+          // Compound motion search on the ref frame selected by id.
+          if (id)
+            xd->plane[0].pre[0] = ref_yv12[id];
+          vp9_clamp_mv_min_max(x, &ref_mv[id]);
+
+          // Use mv result from single mode as mvp.
+          tmp_mv.as_int = frame_mv[NEWMV][refs[id]].as_int;
+
+          tmp_mv.as_mv.col >>= 3;
+          tmp_mv.as_mv.row >>= 3;
+
+          // Small-range full-pixel motion search
+          bestsme = vp9_refining_search_8p_c(x, &tmp_mv, sadpb,
+                                             search_range,
+                                             &cpi->fn_ptr[block_size],
+                                             x->nmvjointcost, x->mvcost,
+                                             &ref_mv[id], second_pred,
+                                             b_sz[bsize][0], b_sz[bsize][1]);
+
+          x->mv_col_min = tmp_col_min;
+          x->mv_col_max = tmp_col_max;
+          x->mv_row_min = tmp_row_min;
+          x->mv_row_max = tmp_row_max;
+
+          if (bestsme < INT_MAX) {
+            int dis; /* TODO: use dis in distortion calculation later. */
+            unsigned int sse;
+
+            bestsme = vp9_find_best_sub_pixel_comp(x, &tmp_mv,
+                                                   &ref_mv[id],
+                                                   x->errorperbit,
+                                                   &cpi->fn_ptr[block_size],
+                                                   x->nmvjointcost, x->mvcost,
+                                                   &dis, &sse, second_pred,
+                                                   b_sz[bsize][0],
+                                                   b_sz[bsize][1]);
+          }
+
+          if (id)
+            xd->plane[0].pre[0] = scaled_first_yv12;
+
+          if (bestsme < last_besterr[id]) {
+            frame_mv[NEWMV][refs[id]].as_int =
+                xd->mode_info_context->bmi[0].as_mv[1].as_int = tmp_mv.as_int;
+            last_besterr[id] = bestsme;
+          } else {
+            break;
+          }
+        }
+
+        // restore the predictor
+        if (scaled_ref_frame[0]) {
+          int i;
+
+          for (i = 0; i < MAX_MB_PLANE; i++)
+            xd->plane[i].pre[0] = backup_yv12[i];
+        }
+
+        if (scaled_ref_frame[1]) {
+          int i;
+
+          for (i = 0; i < MAX_MB_PLANE; i++)
+            xd->plane[i].pre[1] = backup_second_yv12[i];
+        }
+
+        vpx_free(second_pred);
+#endif  // CONFIG_COMP_INTER_JOINT_SEARCH
+
         if (frame_mv[NEWMV][refs[0]].as_int == INVALID_MV ||
             frame_mv[NEWMV][refs[1]].as_int == INVALID_MV)
           return INT64_MAX;
@@ -1862,7 +2025,7 @@
         int tmp_row_min = x->mv_row_min;
         int tmp_row_max = x->mv_row_max;
 
-        if (scaled_ref_frame) {
+        if (scaled_ref_frame[0]) {
           int i;
 
           // Swap out the reference frame for a version that's been scaled to
@@ -1871,7 +2034,7 @@
           for (i = 0; i < MAX_MB_PLANE; i++)
             backup_yv12[i] = xd->plane[i].pre[0];
 
-          setup_pre_planes(xd, scaled_ref_frame, NULL, mi_row, mi_col,
+          setup_pre_planes(xd, scaled_ref_frame[0], NULL, mi_row, mi_col,
                            NULL, NULL);
         }
 
@@ -1914,6 +2077,7 @@
         }
         frame_mv[NEWMV][refs[0]].as_int =
           xd->mode_info_context->bmi[0].as_mv[0].as_int = tmp_mv.as_int;
+        single_newmv[refs[0]].as_int = tmp_mv.as_int;
 
         // Add the new motion vector cost to our rolling cost variable
         *rate2 += vp9_mv_bit_cost(&tmp_mv, &ref_mv[0],
@@ -1921,7 +2085,7 @@
                                   96, xd->allow_high_precision_mv);
 
         // restore the predictor, if required
-        if (scaled_ref_frame) {
+        if (scaled_ref_frame[0]) {
           int i;
 
           for (i = 0; i < MAX_MB_PLANE; i++)
@@ -2151,8 +2315,14 @@
   mode = xd->mode_info_context->mbmi.mode;
   txfm_size = xd->mode_info_context->mbmi.txfm_size;
   rd_pick_intra_sbuv_mode(cpi, x, &rate_uv, &rate_uv_tokenonly,
-                          &dist_uv, &uv_skip, bsize);
+                          &dist_uv, &uv_skip,
+                          (bsize < BLOCK_SIZE_SB8X8) ? BLOCK_SIZE_SB8X8 :
+                                                       bsize);
+#if CONFIG_AB4X4
+  if (bsize < BLOCK_SIZE_SB8X8)
+#else
   if (bsize == BLOCK_SIZE_SB8X8)
+#endif
     err4x4 = rd_pick_intra4x4mby_modes(cpi, x, &rate4x4_y,
                                        &rate4x4_y_tokenonly,
                                        &dist4x4_y, err);
@@ -2165,7 +2335,11 @@
            sizeof(x->sb32_context[xd->sb_index].txfm_rd_diff));
     xd->mode_info_context->mbmi.mode = mode;
     xd->mode_info_context->mbmi.txfm_size = txfm_size;
+#if CONFIG_AB4X4
+  } else if (bsize < BLOCK_SIZE_SB8X8 && err4x4 < err) {
+#else
   } else if (bsize == BLOCK_SIZE_SB8X8 && err4x4 < err) {
+#endif
     *returnrate = rate4x4_y + rate_uv +
         vp9_cost_bit(vp9_get_pred_prob(cm, xd, PRED_MBSKIP), 0);
     *returndist = dist4x4_y + (dist_uv >> 2);
@@ -2203,15 +2377,14 @@
   unsigned char segment_id = xd->mode_info_context->mbmi.segment_id;
   int comp_pred, i;
   int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES];
-  int frame_mdcounts[4][4];
   struct buf_2d yv12_mb[4][MAX_MB_PLANE];
+  int_mv single_newmv[MAX_REF_FRAMES];
   static const int flag_list[4] = { 0, VP9_LAST_FLAG, VP9_GOLD_FLAG,
                                     VP9_ALT_FLAG };
   int idx_list[4] = {0,
                      cpi->lst_fb_idx,
                      cpi->gld_fb_idx,
                      cpi->alt_fb_idx};
-  int mdcounts[4];
   int64_t best_rd = INT64_MAX;
   int64_t best_txfm_rd[NB_TXFM_MODES];
   int64_t best_txfm_diff[NB_TXFM_MODES];
@@ -2251,6 +2424,7 @@
   xd->mode_info_context->mbmi.segment_id = segment_id;
   estimate_ref_frame_costs(cpi, segment_id, ref_costs);
   vpx_memset(&best_mbmode, 0, sizeof(best_mbmode));
+  vpx_memset(&single_newmv, 0, sizeof(single_newmv));
 
   for (i = 0; i < NB_PREDICTION_TYPES; ++i)
     best_pred_rd[i] = INT64_MAX;
@@ -2293,7 +2467,7 @@
     if (cpi->ref_frame_flags & flag_list[ref_frame]) {
       setup_buffer_inter(cpi, x, idx_list[ref_frame], ref_frame, block_size,
                          mi_row, mi_col, frame_mv[NEARESTMV], frame_mv[NEARMV],
-                         frame_mdcounts, yv12_mb, scale_factor);
+                         yv12_mb, scale_factor);
     }
     frame_mv[NEWMV][ref_frame].as_int = INVALID_MV;
     frame_mv[ZEROMV][ref_frame].as_int = 0;
@@ -2307,7 +2481,9 @@
          i++) {
       mbmi->txfm_size = i;
       rd_pick_intra_sbuv_mode(cpi, x, &rate_uv_intra[i], &rate_uv_tokenonly[i],
-                              &dist_uv[i], &skip_uv[i], bsize);
+                              &dist_uv[i], &skip_uv[i],
+                              (bsize < BLOCK_SIZE_SB8X8) ? BLOCK_SIZE_SB8X8 :
+                                                           bsize);
       mode_uv[i] = mbmi->uv_mode;
     }
   }
@@ -2337,6 +2513,7 @@
         || (cpi->ref_frame_flags & flag_list[ref_frame]))) {
       continue;
     }
+
     if (cpi->speed > 0) {
       if (!(ref_frame_mask & (1 << ref_frame))) {
         continue;
@@ -2383,10 +2560,18 @@
     mbmi->interp_filter = cm->mcomp_filter_type;
     vp9_setup_interp_filters(xd, mbmi->interp_filter, &cpi->common);
 
+#if CONFIG_AB4X4
+    if (bsize >= BLOCK_SIZE_SB8X8 &&
+        (this_mode == I4X4_PRED || this_mode == SPLITMV))
+      continue;
+    if (bsize < BLOCK_SIZE_SB8X8 &&
+        !(this_mode == I4X4_PRED || this_mode == SPLITMV))
+      continue;
+#else
     if (bsize != BLOCK_SIZE_SB8X8 &&
         (this_mode == I4X4_PRED || this_mode == SPLITMV))
       continue;
-
+#endif
 
     if (comp_pred) {
       if (ref_frame == ALTREF_FRAME) {
@@ -2420,8 +2605,6 @@
         xd->plane[i].pre[1] = yv12_mb[second_ref][i];
     }
 
-    vpx_memcpy(mdcounts, frame_mdcounts[ref_frame], sizeof(mdcounts));
-
     // If the segment reference frame feature is enabled....
     // then do nothing if the current ref frame is not allowed..
     if (vp9_segfeature_active(xd, segment_id, SEG_LVL_REF_FRAME) &&
@@ -2451,7 +2634,6 @@
 
       // Note the rate value returned here includes the cost of coding
       // the I4X4_PRED mode : x->mbmode_cost[xd->frame_type][I4X4_PRED];
-      assert(bsize == BLOCK_SIZE_SB8X8);
       mbmi->txfm_size = TX_4X4;
       rd_pick_intra4x4mby_modes(cpi, x, &rate, &rate_y,
                                 &distortion_y, INT64_MAX);
@@ -2519,7 +2701,7 @@
 
         tmp_rd = rd_pick_best_mbsegmentation(cpi, x,
                                              &mbmi->ref_mvs[mbmi->ref_frame][0],
-                                             second_ref, INT64_MAX, mdcounts,
+                                             second_ref, INT64_MAX,
                                              &rate, &rate_y, &distortion,
                                              &skippable,
                                              (int)this_rd_thresh, seg_mvs);
@@ -2558,7 +2740,7 @@
         // switchable list (bilinear, 6-tap) is indicated at the frame level
         tmp_rd = rd_pick_best_mbsegmentation(cpi, x,
                                              &mbmi->ref_mvs[mbmi->ref_frame][0],
-                                             second_ref, INT64_MAX, mdcounts,
+                                             second_ref, INT64_MAX,
                                              &rate, &rate_y, &distortion,
                                              &skippable,
                                              (int)this_rd_thresh, seg_mvs);
@@ -2589,10 +2771,10 @@
       // If even the 'Y' rd value of split is higher than best so far
       // then dont bother looking at UV
       vp9_build_inter_predictors_sbuv(&x->e_mbd, mi_row, mi_col,
-                                      bsize);
-      vp9_subtract_sbuv(x, bsize);
+                                      BLOCK_SIZE_SB8X8);
+      vp9_subtract_sbuv(x, BLOCK_SIZE_SB8X8);
       super_block_uvrd_for_txfm(cm, x, &rate_uv, &distortion_uv,
-                                &uv_skippable, bsize, TX_4X4);
+                                &uv_skippable, BLOCK_SIZE_SB8X8, TX_4X4);
       rate2 += rate_uv;
       distortion2 += distortion_uv;
       skippable = skippable && uv_skippable;
@@ -2608,7 +2790,7 @@
           vp9_cost_bit(vp9_get_pred_prob(cm, xd, PRED_COMP), is_comp_pred);
       mbmi->mode = this_mode;
     } else {
-      YV12_BUFFER_CONFIG *scaled_ref_frame = NULL;
+      YV12_BUFFER_CONFIG *scaled_ref_frame[2] = {NULL, NULL};
       int fb;
 
       if (mbmi->ref_frame == LAST_FRAME) {
@@ -2620,17 +2802,31 @@
       }
 
       if (cpi->scaled_ref_idx[fb] != cm->ref_frame_map[fb])
-        scaled_ref_frame = &cm->yv12_fb[cpi->scaled_ref_idx[fb]];
+        scaled_ref_frame[0] = &cm->yv12_fb[cpi->scaled_ref_idx[fb]];
+
+      if (comp_pred) {
+        if (mbmi->second_ref_frame == LAST_FRAME) {
+          fb = cpi->lst_fb_idx;
+        } else if (mbmi->second_ref_frame == GOLDEN_FRAME) {
+          fb = cpi->gld_fb_idx;
+        } else {
+          fb = cpi->alt_fb_idx;
+        }
+
+        if (cpi->scaled_ref_idx[fb] != cm->ref_frame_map[fb])
+          scaled_ref_frame[1] = &cm->yv12_fb[cpi->scaled_ref_idx[fb]];
+      }
 
       this_rd = handle_inter_mode(cpi, x, bsize,
-                                  mdcounts, txfm_cache,
+                                  txfm_cache,
                                   &rate2, &distortion2, &skippable,
                                   &compmode_cost,
                                   &rate_y, &distortion_y,
                                   &rate_uv, &distortion_uv,
                                   &mode_excluded, &disable_skip,
                                   mode_index, &tmp_best_filter, frame_mv,
-                                  scaled_ref_frame, mi_row, mi_col);
+                                  scaled_ref_frame, mi_row, mi_col,
+                                  single_newmv);
       if (this_rd == INT64_MAX)
         continue;
     }
@@ -2833,7 +3029,13 @@
     }
   }
 
-
+#if CONFIG_AB4X4
+  if (best_rd == INT64_MAX && bsize < BLOCK_SIZE_SB8X8) {
+    *returnrate = INT_MAX;
+    *returndistortion = INT_MAX;
+    return best_rd;
+  }
+#endif
 
   assert((cm->mcomp_filter_type == SWITCHABLE) ||
          (cm->mcomp_filter_type == best_mbmode.interp_filter) ||
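
The joint search added in handle_inter_mode above alternates between the two
reference frames (id = ite % 2): each iteration rebuilds the other
reference's prediction, refines the current reference's mv against it, and
the loop stops as soon as an iteration fails to beat that reference's best
error so far. The control flow in isolation, as a runnable toy with an
assumed error sequence:

    #include <limits.h>
    #include <stdio.h>

    // Stand-in for the full-pel refinement plus sub-pel step; returns an
    // assumed error sequence that improves on every iteration but the last.
    static int search_one_ref(int id, int iter) {
      static const int err[4] = { 100, 90, 95, 95 };
      (void)id;  // the real search uses id to pick planes and mvs
      return err[iter];
    }

    int main(void) {
      int last_besterr[2] = { INT_MAX, INT_MAX };
      int ite;
      for (ite = 0; ite < 4; ite++) {
        const int id = ite % 2;  // 0,1,0,1: which ref gets refined
        const int bestsme = search_one_ref(id, ite);
        if (bestsme < last_besterr[id]) {
          last_besterr[id] = bestsme;  // keep the improved mv for this ref
        } else {
          break;                       // no gain: stop the joint search
        }
      }
      // With the sequence above: ref0 improves twice (100 then 95), ref1
      // once (90); iteration 3 returns 95 >= 90, so the loop breaks.
      printf("besterr: ref0=%d ref1=%d\n", last_besterr[0], last_besterr[1]);
      return 0;
    }
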
diff --git a/vp9/encoder/vp9_tokenize.c b/vp9/encoder/vp9_tokenize.c
index cb670da..50d849d 100644
--- a/vp9/encoder/vp9_tokenize.c
+++ b/vp9/encoder/vp9_tokenize.c
@@ -119,7 +119,12 @@
   TOKENEXTRA *t = *tp;        /* store tokens starting here */
   const int eob = xd->plane[plane].eobs[block];
   const int16_t *qcoeff_ptr = BLOCK_OFFSET(xd->plane[plane].qcoeff, block, 16);
+#if CONFIG_AB4X4
+  const BLOCK_SIZE_TYPE sb_type = (mbmi->sb_type < BLOCK_SIZE_SB8X8) ?
+                                   BLOCK_SIZE_SB8X8 : mbmi->sb_type;
+#else
   const BLOCK_SIZE_TYPE sb_type = mbmi->sb_type;
+#endif
   const int bwl = b_width_log2(sb_type);
   const int off = block >> (2 * tx_size);
   const int mod = bwl - tx_size - xd->plane[plane].subsampling_x;
@@ -136,6 +141,7 @@
   ENTROPY_CONTEXT above_ec, left_ec;
   uint8_t token_cache[1024];
   TX_TYPE tx_type = DCT_DCT;
+  const uint8_t *band_translate;
   assert((!type && !plane) || (type && plane));
 
   switch (tx_size) {
@@ -149,6 +155,7 @@
       scan = get_scan_4x4(tx_type);
       counts = cpi->coef_counts_4x4;
       coef_probs = cpi->common.fc.coef_probs_4x4;
+      band_translate = vp9_coefband_trans_4x4;
       break;
     }
     case TX_8X8: {
@@ -162,6 +169,7 @@
       scan = get_scan_8x8(tx_type);
       counts = cpi->coef_counts_8x8;
       coef_probs = cpi->common.fc.coef_probs_8x8;
+      band_translate = vp9_coefband_trans_8x8plus;
       break;
     }
     case TX_16X16: {
@@ -175,6 +183,7 @@
       scan = get_scan_16x16(tx_type);
       counts = cpi->coef_counts_16x16;
       coef_probs = cpi->common.fc.coef_probs_16x16;
+      band_translate = vp9_coefband_trans_8x8plus;
       break;
     }
     case TX_32X32:
@@ -184,6 +193,7 @@
       scan = vp9_default_zig_zag1d_32x32;
       counts = cpi->coef_counts_32x32;
       coef_probs = cpi->common.fc.coef_probs_32x32;
+      band_translate = vp9_coefband_trans_8x8plus;
       break;
   }
 
@@ -196,7 +206,7 @@
 
   c = 0;
   do {
-    const int band = get_coef_band(scan, tx_size, c);
+    const int band = get_coef_band(band_translate, c);
     int token;
     int v = 0;
     rc = scan[c];
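
The get_coef_band change above (mirrored in vp9_rdopt.c) replaces per-call
band computation from (scan, tx_size, c) with a read from a per-tx-size
translation table. A sketch of the lookup, assuming get_coef_band reduces to
a plain table read; the real tables, vp9_coefband_trans_4x4 and
vp9_coefband_trans_8x8plus, live in vp9_entropy.c, and the 4x4 contents below
follow the usual VP9 banding but should be treated as illustrative:

    #include <stdint.h>
    #include <stdio.h>

    // Band index for each of the 16 coefficient positions of a 4x4 block.
    static const uint8_t coefband_trans_4x4[16] = {
      0, 1, 1, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 5, 5, 5
    };

    static int get_coef_band(const uint8_t *band_translate, int coef_index) {
      return band_translate[coef_index];  // pure lookup, no per-call math
    }

    int main(void) {
      int c;
      for (c = 0; c < 16; c++)
        printf("coeff %2d -> band %d\n",
               c, get_coef_band(coefband_trans_4x4, c));
      return 0;
    }
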
diff --git a/vp9/encoder/vp9_variance.h b/vp9/encoder/vp9_variance.h
index 13dabbd..306476b 100644
--- a/vp9/encoder/vp9_variance.h
+++ b/vp9/encoder/vp9_variance.h
@@ -12,6 +12,7 @@
 #define VP9_ENCODER_VP9_VARIANCE_H_
 
 #include "vpx/vpx_integer.h"
 
 typedef unsigned int(*vp9_sad_fn_t)(const uint8_t *src_ptr,
                                     int source_stride,
@@ -50,6 +51,15 @@
                                                 int Refstride,
                                                 unsigned int *sse);
 
+typedef unsigned int (*vp9_subp_avg_variance_fn_t)(const uint8_t *src_ptr,
+                                                   int source_stride,
+                                                   int xoffset,
+                                                   int yoffset,
+                                                   const uint8_t *ref_ptr,
+                                                   int Refstride,
+                                                   unsigned int *sse,
+                                                   const uint8_t *second_pred);
+
 typedef void (*vp9_ssimpf_fn_t)(uint8_t *s, int sp, uint8_t *r,
                                 int rp, unsigned long *sum_s,
                                 unsigned long *sum_r, unsigned long *sum_sq_s,
@@ -64,15 +74,33 @@
                                                    int  ref_stride);
 
 typedef struct vp9_variance_vtable {
-    vp9_sad_fn_t            sdf;
-    vp9_variance_fn_t       vf;
-    vp9_subpixvariance_fn_t svf;
-    vp9_variance_fn_t       svf_halfpix_h;
-    vp9_variance_fn_t       svf_halfpix_v;
-    vp9_variance_fn_t       svf_halfpix_hv;
-    vp9_sad_multi_fn_t      sdx3f;
-    vp9_sad_multi1_fn_t     sdx8f;
-    vp9_sad_multi_d_fn_t    sdx4df;
+    vp9_sad_fn_t               sdf;
+    vp9_variance_fn_t          vf;
+    vp9_subpixvariance_fn_t    svf;
+    vp9_subp_avg_variance_fn_t svaf;
+    vp9_variance_fn_t          svf_halfpix_h;
+    vp9_variance_fn_t          svf_halfpix_v;
+    vp9_variance_fn_t          svf_halfpix_hv;
+    vp9_sad_multi_fn_t         sdx3f;
+    vp9_sad_multi1_fn_t        sdx8f;
+    vp9_sad_multi_d_fn_t       sdx4df;
 } vp9_variance_fn_ptr_t;
 
+// Average two predictors with round-half-up; "weight" is the block width in
+// pixels. This helper is used unconditionally by the sub_pixel_avg_variance
+// kernels, so it is not guarded by CONFIG_COMP_INTER_JOINT_SEARCH.
+static void comp_avg_pred(uint8_t *comp_pred, const uint8_t *pred, int weight,
+                          int height, uint8_t *ref, int ref_stride) {
+  int i, j;
+
+  for (i = 0; i < height; i++) {
+    for (j = 0; j < weight; j++) {
+      int tmp;
+      tmp = pred[j] + ref[j];
+      comp_pred[j] = (tmp + 1) >> 1;
+    }
+    comp_pred += weight;
+    pred += weight;
+    ref += ref_stride;
+  }
+}
 #endif  // VP9_ENCODER_VP9_VARIANCE_H_
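
A quick spot check of comp_avg_pred's rounding: the helper averages the two
predictions with round-half-up, (a + b + 1) >> 1, walking pred by the block
width and ref by its stride. The arithmetic is inlined here so the snippet
stands alone:

    #include <stdint.h>
    #include <stdio.h>

    int main(void) {
      const uint8_t pred[4] = { 10, 11, 250, 0 };  // 2x2 block, width 2
      const uint8_t ref[4]  = { 11, 11, 255, 1 };  // ref_stride == 2 here
      uint8_t comp[4];
      int i, j;
      for (i = 0; i < 2; i++)
        for (j = 0; j < 2; j++)
          comp[i * 2 + j] = (pred[i * 2 + j] + ref[i * 2 + j] + 1) >> 1;
      // Prints "11 11 253 1": the tie (10,11) rounds up, (0,1) rounds to 1.
      for (i = 0; i < 4; i++)
        printf("%d ", comp[i]);
      printf("\n");
      return 0;
    }
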
diff --git a/vp9/encoder/vp9_variance_c.c b/vp9/encoder/vp9_variance_c.c
index c2a6004..fa53abd 100644
--- a/vp9/encoder/vp9_variance_c.c
+++ b/vp9/encoder/vp9_variance_c.c
@@ -13,6 +13,7 @@
 #include "vp9/common/vp9_filter.h"
 #include "vp9/common/vp9_subpelvar.h"
 #include "vpx/vpx_integer.h"
+#include "vpx_ports/mem.h"
 
 unsigned int vp9_get_mb_ss_c(const int16_t *src_ptr) {
   unsigned int i, sum = 0;
@@ -58,6 +59,29 @@
   return vp9_variance64x32_c(temp2, 64, dst_ptr, dst_pixels_per_line, sse);
 }
 
+unsigned int vp9_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr,
+                                               int  src_pixels_per_line,
+                                               int  xoffset,
+                                               int  yoffset,
+                                               const uint8_t *dst_ptr,
+                                               int dst_pixels_per_line,
+                                               unsigned int *sse,
+                                               const uint8_t *second_pred) {
+  uint16_t fdata3[65 * 64];  // Temp data buffer used in filtering
+  uint8_t temp2[68 * 64];
+  DECLARE_ALIGNED_ARRAY(16, uint8_t, temp3, 64 * 64);  // compound pred buffer
+  const int16_t *hfilter, *vfilter;
+
+  hfilter = VP9_BILINEAR_FILTERS_2TAP(xoffset);
+  vfilter = VP9_BILINEAR_FILTERS_2TAP(yoffset);
+
+  var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line,
+                                    1, 33, 64, hfilter);
+  var_filter_block2d_bil_second_pass(fdata3, temp2, 64, 64, 32, 64, vfilter);
+  comp_avg_pred(temp3, second_pred, 64, 32, temp2, 64);
+  return vp9_variance64x32_c(temp3, 64, dst_ptr, dst_pixels_per_line, sse);
+}
+
 unsigned int vp9_variance32x64_c(const uint8_t *src_ptr,
                                  int  source_stride,
                                  const uint8_t *ref_ptr,
@@ -92,6 +116,29 @@
   return vp9_variance32x64_c(temp2, 32, dst_ptr, dst_pixels_per_line, sse);
 }
 
+unsigned int vp9_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr,
+                                               int  src_pixels_per_line,
+                                               int  xoffset,
+                                               int  yoffset,
+                                               const uint8_t *dst_ptr,
+                                               int dst_pixels_per_line,
+                                               unsigned int *sse,
+                                               const uint8_t *second_pred) {
+  uint16_t fdata3[65 * 64];  // Temp data buffer used in filtering
+  uint8_t temp2[68 * 64];
+  DECLARE_ALIGNED_ARRAY(16, uint8_t, temp3, 32 * 64);  // compound pred buffer
+  const int16_t *hfilter, *vfilter;
+
+  hfilter = VP9_BILINEAR_FILTERS_2TAP(xoffset);
+  vfilter = VP9_BILINEAR_FILTERS_2TAP(yoffset);
+
+  var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line,
+                                    1, 65, 32, hfilter);
+  var_filter_block2d_bil_second_pass(fdata3, temp2, 32, 32, 64, 32, vfilter);
+  comp_avg_pred(temp3, second_pred, 32, 64, temp2, 32);
+  return vp9_variance32x64_c(temp3, 32, dst_ptr, dst_pixels_per_line, sse);
+}
+
 unsigned int vp9_variance32x16_c(const uint8_t *src_ptr,
                                  int  source_stride,
                                  const uint8_t *ref_ptr,
@@ -126,6 +173,29 @@
   return vp9_variance32x16_c(temp2, 32, dst_ptr, dst_pixels_per_line, sse);
 }
 
+unsigned int vp9_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr,
+                                               int  src_pixels_per_line,
+                                               int  xoffset,
+                                               int  yoffset,
+                                               const uint8_t *dst_ptr,
+                                               int dst_pixels_per_line,
+                                               unsigned int *sse,
+                                               const uint8_t *second_pred) {
+  uint16_t fdata3[33 * 32];  // Temp data buffer used in filtering
+  uint8_t temp2[36 * 32];
+  DECLARE_ALIGNED_ARRAY(16, uint8_t, temp3, 32 * 16);  // compound pred buffer
+  const int16_t *hfilter, *vfilter;
+
+  hfilter = VP9_BILINEAR_FILTERS_2TAP(xoffset);
+  vfilter = VP9_BILINEAR_FILTERS_2TAP(yoffset);
+
+  var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line,
+                                    1, 17, 32, hfilter);
+  var_filter_block2d_bil_second_pass(fdata3, temp2, 32, 32, 16, 32, vfilter);
+  comp_avg_pred(temp3, second_pred, 32, 16, temp2, 32);
+  return vp9_variance32x16_c(temp3, 32, dst_ptr, dst_pixels_per_line, sse);
+}
+
 unsigned int vp9_variance16x32_c(const uint8_t *src_ptr,
                                  int  source_stride,
                                  const uint8_t *ref_ptr,
@@ -160,6 +230,29 @@
   return vp9_variance16x32_c(temp2, 16, dst_ptr, dst_pixels_per_line, sse);
 }
 
+unsigned int vp9_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr,
+                                               int  src_pixels_per_line,
+                                               int  xoffset,
+                                               int  yoffset,
+                                               const uint8_t *dst_ptr,
+                                               int dst_pixels_per_line,
+                                               unsigned int *sse,
+                                               const uint8_t *second_pred) {
+  uint16_t fdata3[33 * 32];  // Temp data buffer used in filtering
+  uint8_t temp2[36 * 32];
+  DECLARE_ALIGNED_ARRAY(16, uint8_t, temp3, 16 * 32);  // compound pred buffer
+  const int16_t *hfilter, *vfilter;
+
+  hfilter = VP9_BILINEAR_FILTERS_2TAP(xoffset);
+  vfilter = VP9_BILINEAR_FILTERS_2TAP(yoffset);
+
+  var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line,
+                                    1, 33, 16, hfilter);
+  var_filter_block2d_bil_second_pass(fdata3, temp2, 16, 16, 32, 16, vfilter);
+  comp_avg_pred(temp3, second_pred, 16, 32, temp2, 16);
+  return vp9_variance16x32_c(temp3, 16, dst_ptr, dst_pixels_per_line, sse);
+}
+
 unsigned int vp9_variance64x64_c(const uint8_t *src_ptr,
                                  int  source_stride,
                                  const uint8_t *ref_ptr,
@@ -317,6 +410,31 @@
   return vp9_variance4x4_c(temp2, 4, dst_ptr, dst_pixels_per_line, sse);
 }
 
+unsigned int vp9_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr,
+                                             int  src_pixels_per_line,
+                                             int  xoffset,
+                                             int  yoffset,
+                                             const uint8_t *dst_ptr,
+                                             int dst_pixels_per_line,
+                                             unsigned int *sse,
+                                             const uint8_t *second_pred) {
+  uint8_t temp2[20 * 16];
+  const int16_t *hfilter, *vfilter;
+  DECLARE_ALIGNED_ARRAY(16, uint8_t, temp3, 4 * 4);  // compound pred buffer
+  uint16_t fdata3[5 * 4];  // Temp data buffer used in filtering
+
+  hfilter = VP9_BILINEAR_FILTERS_2TAP(xoffset);
+  vfilter = VP9_BILINEAR_FILTERS_2TAP(yoffset);
+
+  // First filter 1-D horizontally
+  var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line,
+                                    1, 5, 4, hfilter);
+
+  // Now filter vertically
+  var_filter_block2d_bil_second_pass(fdata3, temp2, 4,  4,  4,  4, vfilter);
+  comp_avg_pred(temp3, second_pred, 4, 4, temp2, 4);
+  return vp9_variance4x4_c(temp3, 4, dst_ptr, dst_pixels_per_line, sse);
+}
 
 unsigned int vp9_sub_pixel_variance8x8_c(const uint8_t *src_ptr,
                                          int  src_pixels_per_line,
@@ -339,6 +457,29 @@
   return vp9_variance8x8_c(temp2, 8, dst_ptr, dst_pixels_per_line, sse);
 }
 
+unsigned int vp9_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr,
+                                             int  src_pixels_per_line,
+                                             int  xoffset,
+                                             int  yoffset,
+                                             const uint8_t *dst_ptr,
+                                             int dst_pixels_per_line,
+                                             unsigned int *sse,
+                                             const uint8_t *second_pred) {
+  uint16_t fdata3[9 * 8];  // Temp data buffer used in filtering
+  uint8_t temp2[20 * 16];
+  DECLARE_ALIGNED_ARRAY(16, uint8_t, temp3, 8 * 8);  // compound pred buffer
+  const int16_t *hfilter, *vfilter;
+
+  hfilter = VP9_BILINEAR_FILTERS_2TAP(xoffset);
+  vfilter = VP9_BILINEAR_FILTERS_2TAP(yoffset);
+
+  var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line,
+                                    1, 9, 8, hfilter);
+  var_filter_block2d_bil_second_pass(fdata3, temp2, 8, 8, 8, 8, vfilter);
+  comp_avg_pred(temp3, second_pred, 8, 8, temp2, 8);
+  return vp9_variance8x8_c(temp3, 8, dst_ptr, dst_pixels_per_line, sse);
+}
+
 unsigned int vp9_sub_pixel_variance16x16_c(const uint8_t *src_ptr,
                                            int  src_pixels_per_line,
                                            int  xoffset,
@@ -360,6 +501,30 @@
   return vp9_variance16x16_c(temp2, 16, dst_ptr, dst_pixels_per_line, sse);
 }
 
+unsigned int vp9_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr,
+                                               int  src_pixels_per_line,
+                                               int  xoffset,
+                                               int  yoffset,
+                                               const uint8_t *dst_ptr,
+                                               int dst_pixels_per_line,
+                                               unsigned int *sse,
+                                               const uint8_t *second_pred) {
+  uint16_t fdata3[17 * 16];
+  uint8_t temp2[20 * 16];
+  DECLARE_ALIGNED_ARRAY(16, uint8_t, temp3, 16 * 16);  // compound pred buffer
+  const int16_t *hfilter, *vfilter;
+
+  hfilter = VP9_BILINEAR_FILTERS_2TAP(xoffset);
+  vfilter = VP9_BILINEAR_FILTERS_2TAP(yoffset);
+
+  var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line,
+                                    1, 17, 16, hfilter);
+  var_filter_block2d_bil_second_pass(fdata3, temp2, 16, 16, 16, 16, vfilter);
+
+  comp_avg_pred(temp3, second_pred, 16, 16, temp2, 16);
+  return vp9_variance16x16_c(temp3, 16, dst_ptr, dst_pixels_per_line, sse);
+}
+
 unsigned int vp9_sub_pixel_variance64x64_c(const uint8_t *src_ptr,
                                            int  src_pixels_per_line,
                                            int  xoffset,
@@ -381,6 +546,29 @@
   return vp9_variance64x64_c(temp2, 64, dst_ptr, dst_pixels_per_line, sse);
 }
 
+unsigned int vp9_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr,
+                                               int  src_pixels_per_line,
+                                               int  xoffset,
+                                               int  yoffset,
+                                               const uint8_t *dst_ptr,
+                                               int dst_pixels_per_line,
+                                               unsigned int *sse,
+                                               const uint8_t *second_pred) {
+  uint16_t fdata3[65 * 64];  // Temp data buffer used in filtering
+  uint8_t temp2[68 * 64];
+  DECLARE_ALIGNED_ARRAY(16, uint8_t, temp3, 64 * 64);  // compound pred buffer
+  const int16_t *hfilter, *vfilter;
+
+  hfilter = VP9_BILINEAR_FILTERS_2TAP(xoffset);
+  vfilter = VP9_BILINEAR_FILTERS_2TAP(yoffset);
+
+  var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line,
+                                    1, 65, 64, hfilter);
+  var_filter_block2d_bil_second_pass(fdata3, temp2, 64, 64, 64, 64, vfilter);
+  comp_avg_pred(temp3, second_pred, 64, 64, temp2, 64);
+  return vp9_variance64x64_c(temp3, 64, dst_ptr, dst_pixels_per_line, sse);
+}
+
 unsigned int vp9_sub_pixel_variance32x32_c(const uint8_t *src_ptr,
                                            int  src_pixels_per_line,
                                            int  xoffset,
@@ -402,6 +590,29 @@
   return vp9_variance32x32_c(temp2, 32, dst_ptr, dst_pixels_per_line, sse);
 }
 
+unsigned int vp9_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr,
+                                               int  src_pixels_per_line,
+                                               int  xoffset,
+                                               int  yoffset,
+                                               const uint8_t *dst_ptr,
+                                               int dst_pixels_per_line,
+                                               unsigned int *sse,
+                                               const uint8_t *second_pred) {
+  uint16_t fdata3[33 * 32];  // Temp data buffer used in filtering
+  uint8_t temp2[36 * 32];
+  DECLARE_ALIGNED_ARRAY(16, uint8_t, temp3, 32 * 32);  // compound pred buffer
+  const int16_t *hfilter, *vfilter;
+
+  hfilter = VP9_BILINEAR_FILTERS_2TAP(xoffset);
+  vfilter = VP9_BILINEAR_FILTERS_2TAP(yoffset);
+
+  var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line,
+                                    1, 33, 32, hfilter);
+  var_filter_block2d_bil_second_pass(fdata3, temp2, 32, 32, 32, 32, vfilter);
+  comp_avg_pred(temp3, second_pred, 32, 32, temp2, 32);
+  return vp9_variance32x32_c(temp3, 32, dst_ptr, dst_pixels_per_line, sse);
+}
+
 unsigned int vp9_variance_halfpixvar16x16_h_c(const uint8_t *src_ptr,
                                               int  source_stride,
                                               const uint8_t *ref_ptr,
@@ -543,6 +754,29 @@
   return vp9_variance16x8_c(temp2, 16, dst_ptr, dst_pixels_per_line, sse);
 }
 
+unsigned int vp9_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr,
+                                              int  src_pixels_per_line,
+                                              int  xoffset,
+                                              int  yoffset,
+                                              const uint8_t *dst_ptr,
+                                              int dst_pixels_per_line,
+                                              unsigned int *sse,
+                                              const uint8_t *second_pred) {
+  uint16_t fdata3[16 * 9];  // Temp data buffer used in filtering
+  uint8_t temp2[20 * 16];
+  DECLARE_ALIGNED_ARRAY(16, uint8_t, temp3, 16 * 8);  // compound pred buffer
+  const int16_t *hfilter, *vfilter;
+
+  hfilter = VP9_BILINEAR_FILTERS_2TAP(xoffset);
+  vfilter = VP9_BILINEAR_FILTERS_2TAP(yoffset);
+
+  var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line,
+                                    1, 9, 16, hfilter);
+  var_filter_block2d_bil_second_pass(fdata3, temp2, 16, 16, 8, 16, vfilter);
+  comp_avg_pred(temp3, second_pred, 16, 8, temp2, 16);
+  return vp9_variance16x8_c(temp3, 16, dst_ptr, dst_pixels_per_line, sse);
+}
+
 unsigned int vp9_sub_pixel_variance8x16_c(const uint8_t *src_ptr,
                                           int  src_pixels_per_line,
                                           int  xoffset,
@@ -564,3 +798,25 @@
   return vp9_variance8x16_c(temp2, 8, dst_ptr, dst_pixels_per_line, sse);
 }
 
+unsigned int vp9_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr,
+                                              int  src_pixels_per_line,
+                                              int  xoffset,
+                                              int  yoffset,
+                                              const uint8_t *dst_ptr,
+                                              int dst_pixels_per_line,
+                                              unsigned int *sse,
+                                              const uint8_t *second_pred) {
+  uint16_t fdata3[9 * 16];  // Temp data buffer used in filtering
+  uint8_t temp2[20 * 16];
+  DECLARE_ALIGNED_ARRAY(16, uint8_t, temp3, 8 * 16);  // compound pred buffer
+  const int16_t *hfilter, *vfilter;
+
+  hfilter = VP9_BILINEAR_FILTERS_2TAP(xoffset);
+  vfilter = VP9_BILINEAR_FILTERS_2TAP(yoffset);
+
+  var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line,
+                                    1, 17, 8, hfilter);
+  var_filter_block2d_bil_second_pass(fdata3, temp2, 8, 8, 16, 8, vfilter);
+  comp_avg_pred(temp3, second_pred, 8, 16, temp2, 8);
+  return vp9_variance8x16_c(temp3, 8, dst_ptr, dst_pixels_per_line, sse);
+}
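
Each vp9_sub_pixel_avg_variance<W>x<H>_c above follows the same pattern:
bilinear first and second pass over the reference data, comp_avg_pred against
the caller-supplied second predictor, then the plain WxH variance kernel. A
hedged sketch of consuming these through the new svaf slot; argument roles
follow the vp9_subp_avg_variance_fn_t typedef (the first pointer is the
reference data that gets filtered and averaged, the fifth is the block it is
compared against), and score_compound_subpel is an illustrative wrapper, not
libvpx API:

    #include <stdint.h>
    #include "vp9/encoder/vp9_variance.h"  // vp9_variance_fn_ptr_t, as patched

    static unsigned int score_compound_subpel(const vp9_variance_fn_ptr_t *vfp,
                                              const uint8_t *pred,
                                              int pred_stride,
                                              int xoffset, int yoffset,
                                              const uint8_t *src,
                                              int src_stride,
                                              const uint8_t *second_pred) {
      unsigned int sse;
      // xoffset/yoffset are the sub-pel phases for the 2-tap bilinear filter.
      return vfp->svaf(pred, pred_stride, xoffset, yoffset,
                       src, src_stride, &sse, second_pred);
    }
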
diff --git a/vp9/vp9_cx_iface.c b/vp9/vp9_cx_iface.c
index 45609da..9326165 100644
--- a/vp9/vp9_cx_iface.c
+++ b/vp9/vp9_cx_iface.c
@@ -543,31 +543,6 @@
   return VPX_CODEC_OK;
 }
 
-static vpx_codec_err_t image2yuvconfig(const vpx_image_t   *img,
-                                       YV12_BUFFER_CONFIG  *yv12) {
-  vpx_codec_err_t        res = VPX_CODEC_OK;
-  yv12->y_buffer = img->planes[VPX_PLANE_Y];
-  yv12->u_buffer = img->planes[VPX_PLANE_U];
-  yv12->v_buffer = img->planes[VPX_PLANE_V];
-
-  yv12->y_crop_width  = img->d_w;
-  yv12->y_crop_height = img->d_h;
-  yv12->y_width  = img->d_w;
-  yv12->y_height = img->d_h;
-
-  yv12->uv_width = img->x_chroma_shift == 1 ? (1 + yv12->y_width) / 2
-                                            : yv12->y_width;
-  yv12->uv_height = img->y_chroma_shift == 1 ? (1 + yv12->y_height) / 2
-                                             : yv12->y_height;
-
-  yv12->y_stride = img->stride[VPX_PLANE_Y];
-  yv12->uv_stride = img->stride[VPX_PLANE_U];
-
-  yv12->border  = (img->stride[VPX_PLANE_Y] - img->w) / 2;
-  yv12->clrtype = REG_YUV;
-  return res;
-}
-
 static void pick_quickcompress_mode(vpx_codec_alg_priv_t  *ctx,
                                     unsigned long          duration,
                                     unsigned long          deadline) {
diff --git a/vp9/vp9_dx_iface.c b/vp9/vp9_dx_iface.c
index 85022c9..811cea7 100644
--- a/vp9/vp9_dx_iface.c
+++ b/vp9/vp9_dx_iface.c
@@ -578,30 +578,6 @@
   return res;
 }
 
-static vpx_codec_err_t image2yuvconfig(const vpx_image_t   *img,
-                                       YV12_BUFFER_CONFIG  *yv12) {
-  vpx_codec_err_t        res = VPX_CODEC_OK;
-  yv12->y_buffer = img->planes[VPX_PLANE_Y];
-  yv12->u_buffer = img->planes[VPX_PLANE_U];
-  yv12->v_buffer = img->planes[VPX_PLANE_V];
-
-  yv12->y_crop_width  = img->d_w;
-  yv12->y_crop_height = img->d_h;
-  yv12->y_width  = img->d_w;
-  yv12->y_height = img->d_h;
-  yv12->uv_width = yv12->y_width / 2;
-  yv12->uv_height = yv12->y_height / 2;
-
-  yv12->y_stride = img->stride[VPX_PLANE_Y];
-  yv12->uv_stride = img->stride[VPX_PLANE_U];
-
-  yv12->border  = (img->stride[VPX_PLANE_Y] - img->d_w) / 2;
-  yv12->clrtype = (img->fmt == VPX_IMG_FMT_VPXI420 ||
-                   img->fmt == VPX_IMG_FMT_VPXYV12);
-
-  return res;
-}
-
 
 static vpx_codec_err_t vp9_set_reference(vpx_codec_alg_priv_t *ctx,
                                          int ctr_id,
diff --git a/vp9/vp9_iface_common.h b/vp9/vp9_iface_common.h
index 96de5f5..84b4d39 100644
--- a/vp9/vp9_iface_common.h
+++ b/vp9/vp9_iface_common.h
@@ -37,11 +37,11 @@
   img->planes[VPX_PLANE_Y] = yv12->y_buffer;
   img->planes[VPX_PLANE_U] = yv12->u_buffer;
   img->planes[VPX_PLANE_V] = yv12->v_buffer;
-  img->planes[VPX_PLANE_ALPHA] = NULL;
+  img->planes[VPX_PLANE_ALPHA] = yv12->alpha_buffer;
   img->stride[VPX_PLANE_Y] = yv12->y_stride;
   img->stride[VPX_PLANE_U] = yv12->uv_stride;
   img->stride[VPX_PLANE_V] = yv12->uv_stride;
-  img->stride[VPX_PLANE_ALPHA] = yv12->y_stride;
+  img->stride[VPX_PLANE_ALPHA] = yv12->alpha_stride;
   img->bps = bps;
   img->user_priv = user_priv;
   img->img_data = yv12->buffer_alloc;
@@ -49,4 +49,34 @@
   img->self_allocd = 0;
 }
 
+static vpx_codec_err_t image2yuvconfig(const vpx_image_t *img,
+                                       YV12_BUFFER_CONFIG *yv12) {
+  yv12->y_buffer = img->planes[VPX_PLANE_Y];
+  yv12->u_buffer = img->planes[VPX_PLANE_U];
+  yv12->v_buffer = img->planes[VPX_PLANE_V];
+  yv12->alpha_buffer = img->planes[VPX_PLANE_ALPHA];
+
+  yv12->y_crop_width  = img->d_w;
+  yv12->y_crop_height = img->d_h;
+  yv12->y_width  = img->d_w;
+  yv12->y_height = img->d_h;
+
+  yv12->uv_width = img->x_chroma_shift == 1 ? (1 + yv12->y_width) / 2
+                                            : yv12->y_width;
+  yv12->uv_height = img->y_chroma_shift == 1 ? (1 + yv12->y_height) / 2
+                                             : yv12->y_height;
+
+  yv12->alpha_width = yv12->alpha_buffer ? img->d_w : 0;
+  yv12->alpha_height = yv12->alpha_buffer ? img->d_h : 0;
+
+  yv12->y_stride = img->stride[VPX_PLANE_Y];
+  yv12->uv_stride = img->stride[VPX_PLANE_U];
+  yv12->alpha_stride = yv12->alpha_buffer ? img->stride[VPX_PLANE_ALPHA] : 0;
+
+  yv12->border  = (img->stride[VPX_PLANE_Y] - img->w) / 2;
+  yv12->clrtype = REG_YUV;
+
+  return VPX_CODEC_OK;
+}
+
 #endif  // VP9_VP9_IFACE_COMMON_H_
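
The consolidated image2yuvconfig above derives the chroma dimensions from the
chroma shift, rounding odd luma sizes up. A one-line check of that rule for
4:2:0:

    #include <stdio.h>

    int main(void) {
      const int y_w = 179, y_h = 145;  // assumed odd-sized frame
      // Matches uv_width/uv_height above when x/y_chroma_shift == 1.
      printf("uv: %dx%d\n", (1 + y_w) / 2, (1 + y_h) / 2);  // "uv: 90x73"
      return 0;
    }
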
diff --git a/vpx_scale/generic/yv12config.c b/vpx_scale/generic/yv12config.c
index cd66f00..99e3543 100644
--- a/vpx_scale/generic/yv12config.c
+++ b/vpx_scale/generic/yv12config.c
@@ -76,12 +76,17 @@
     ybf->uv_height = uv_height;
     ybf->uv_stride = uv_stride;
 
+    ybf->alpha_width = 0;
+    ybf->alpha_height = 0;
+    ybf->alpha_stride = 0;
+
     ybf->border = border;
     ybf->frame_size = frame_size;
 
     ybf->y_buffer = ybf->buffer_alloc + (border * y_stride) + border;
     ybf->u_buffer = ybf->buffer_alloc + yplane_size + (border / 2  * uv_stride) + border / 2;
     ybf->v_buffer = ybf->buffer_alloc + yplane_size + uvplane_size + (border / 2  * uv_stride) + border / 2;
+    ybf->alpha_buffer = NULL;
 
     ybf->corrupted = 0; /* assume not corrupted by errors */
     return 0;
diff --git a/vpx_scale/yv12config.h b/vpx_scale/yv12config.h
index 85396c0..7b8bd85 100644
--- a/vpx_scale/yv12config.h
+++ b/vpx_scale/yv12config.h
@@ -52,9 +52,14 @@
     int   uv_stride;
     /*    int   uvinternal_width; */
 
+    int   alpha_width;
+    int   alpha_height;
+    int   alpha_stride;
+
     uint8_t *y_buffer;
     uint8_t *u_buffer;
     uint8_t *v_buffer;
+    uint8_t *alpha_buffer;
 
     uint8_t *buffer_alloc;
     int buffer_alloc_sz;