Merge "Optimize HBD up-sampled prediction functions" into nextgenv2
diff --git a/configure b/configure
index 97366e4..ed1d048 100755
--- a/configure
+++ b/configure
@@ -284,6 +284,7 @@
     ext_partition
     ext_tile
     obmc
+    entropy
 "
 CONFIG_LIST="
     dependency_tracking
diff --git a/test/borders_test.cc b/test/borders_test.cc
index 6592375..ff3812c 100644
--- a/test/borders_test.cc
+++ b/test/borders_test.cc
@@ -52,7 +52,7 @@
   // extend into the border and test the border condition.
   cfg_.g_lag_in_frames = 25;
   cfg_.rc_2pass_vbr_minsection_pct = 5;
-  cfg_.rc_2pass_vbr_minsection_pct = 2000;
+  cfg_.rc_2pass_vbr_maxsection_pct = 2000;
   cfg_.rc_target_bitrate = 2000;
   cfg_.rc_max_quantizer = 10;
 
diff --git a/test/cpu_speed_test.cc b/test/cpu_speed_test.cc
index 8baa2f9..6a938a0 100644
--- a/test/cpu_speed_test.cc
+++ b/test/cpu_speed_test.cc
@@ -74,7 +74,7 @@
   // the encoder to producing lots of big partitions which will likely
   // extend into the border and test the border condition.
   cfg_.rc_2pass_vbr_minsection_pct = 5;
-  cfg_.rc_2pass_vbr_minsection_pct = 2000;
+  cfg_.rc_2pass_vbr_maxsection_pct = 2000;
   cfg_.rc_target_bitrate = 400;
   cfg_.rc_max_quantizer = 0;
   cfg_.rc_min_quantizer = 0;
@@ -92,7 +92,7 @@
   ::libvpx_test::Y4mVideoSource video("screendata.y4m", 0, 25);
   cfg_.g_timebase = video.timebase();
   cfg_.rc_2pass_vbr_minsection_pct = 5;
-  cfg_.rc_2pass_vbr_minsection_pct = 2000;
+  cfg_.rc_2pass_vbr_maxsection_pct = 2000;
   cfg_.rc_target_bitrate = 400;
   cfg_.rc_max_quantizer = 0;
   cfg_.rc_min_quantizer = 0;
@@ -109,7 +109,7 @@
   // the encoder to producing lots of big partitions which will likely
   // extend into the border and test the border condition.
   cfg_.rc_2pass_vbr_minsection_pct = 5;
-  cfg_.rc_2pass_vbr_minsection_pct = 2000;
+  cfg_.rc_2pass_vbr_maxsection_pct = 2000;
   cfg_.rc_target_bitrate = 12000;
   cfg_.rc_max_quantizer = 10;
   cfg_.rc_min_quantizer = 0;
@@ -125,7 +125,7 @@
   // when passing in a very high min q.  This pushes the encoder to producing
   // lots of small partitions which might will test the other condition.
   cfg_.rc_2pass_vbr_minsection_pct = 5;
-  cfg_.rc_2pass_vbr_minsection_pct = 2000;
+  cfg_.rc_2pass_vbr_maxsection_pct = 2000;
   cfg_.rc_target_bitrate = 200;
   cfg_.rc_min_quantizer = 40;
 
diff --git a/test/vp9_ethread_test.cc b/test/vp9_ethread_test.cc
index 29a653f..3445bf2 100644
--- a/test/vp9_ethread_test.cc
+++ b/test/vp9_ethread_test.cc
@@ -48,7 +48,7 @@
       cfg_.g_lag_in_frames = 3;
       cfg_.rc_end_usage = VPX_VBR;
       cfg_.rc_2pass_vbr_minsection_pct = 5;
-      cfg_.rc_2pass_vbr_minsection_pct = 2000;
+      cfg_.rc_2pass_vbr_maxsection_pct = 2000;
     } else {
       cfg_.g_lag_in_frames = 0;
       cfg_.rc_end_usage = VPX_CBR;
diff --git a/vp10/common/alloccommon.c b/vp10/common/alloccommon.c
index e14aee7..b3c216e 100644
--- a/vp10/common/alloccommon.c
+++ b/vp10/common/alloccommon.c
@@ -97,10 +97,13 @@
 }
 
 void vp10_free_context_buffers(VP10_COMMON *cm) {
+  int i;
   cm->free_mi(cm);
   free_seg_map(cm);
-  vpx_free(cm->above_context);
-  cm->above_context = NULL;
+  for (i = 0 ; i < MAX_MB_PLANE ; i++) {
+    vpx_free(cm->above_context[i]);
+    cm->above_context[i] = NULL;
+  }
   vpx_free(cm->above_seg_context);
   cm->above_seg_context = NULL;
 #if CONFIG_VAR_TX
@@ -128,11 +131,14 @@
   }
 
   if (cm->above_context_alloc_cols < cm->mi_cols) {
-    vpx_free(cm->above_context);
-    cm->above_context = (ENTROPY_CONTEXT *)vpx_calloc(
-        2 * mi_cols_aligned_to_sb(cm->mi_cols) * MAX_MB_PLANE,
-        sizeof(*cm->above_context));
-    if (!cm->above_context) goto fail;
+    int i;
+    for (i = 0 ; i < MAX_MB_PLANE ; i++) {
+    vpx_free(cm->above_context[i]);
+      cm->above_context[i] = (ENTROPY_CONTEXT *)vpx_calloc(
+          2 * mi_cols_aligned_to_sb(cm->mi_cols),
+          sizeof(*cm->above_context[0]));
+      if (!cm->above_context[i]) goto fail;
+    }
 
     vpx_free(cm->above_seg_context);
     cm->above_seg_context = (PARTITION_CONTEXT *)vpx_calloc(
diff --git a/vp10/common/blockd.h b/vp10/common/blockd.h
index d5139f7..016fc75 100644
--- a/vp10/common/blockd.h
+++ b/vp10/common/blockd.h
@@ -380,7 +380,7 @@
 #if CONFIG_EXT_TX
 #define ALLOW_INTRA_EXT_TX       1
 // whether masked transforms are used for 32X32
-#define USE_MSKTX_FOR_32X32      1
+#define USE_MSKTX_FOR_32X32      0
 
 static const int num_ext_tx_set_inter[EXT_TX_SETS_INTER] = {
   1, 19, 12, 2
@@ -447,7 +447,7 @@
   { 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
   { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1},
   { 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1},
-  { 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1},
+  { 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0},
 };
 
 static INLINE int get_ext_tx_types(TX_SIZE tx_size, BLOCK_SIZE bs,
diff --git a/vp10/common/common.h b/vp10/common/common.h
index 4abcbf6..54c7b89 100644
--- a/vp10/common/common.h
+++ b/vp10/common/common.h
@@ -33,12 +33,12 @@
 
 // Use this for variably-sized arrays.
 #define vp10_copy_array(dest, src, n) {       \
-    assert(sizeof(*dest) == sizeof(*src));   \
-    memcpy(dest, src, n * sizeof(*src)); \
+    assert(sizeof(*(dest)) == sizeof(*(src)));   \
+    memcpy(dest, src, (n) * sizeof(*(src)));  /* n: element count */ \
   }
 
 #define vp10_zero(dest) memset(&(dest), 0, sizeof(dest))
-#define vp10_zero_array(dest, n) memset(dest, 0, n * sizeof(*dest))
+#define vp10_zero_array(dest, n) memset(dest, 0, (n) * sizeof(*(dest)))
 
 static INLINE int get_unsigned_bits(unsigned int num_values) {
   return num_values > 0 ? get_msb(num_values) + 1 : 0;
diff --git a/vp10/common/idct.c b/vp10/common/idct.c
index f621ec6..863f0db 100644
--- a/vp10/common/idct.c
+++ b/vp10/common/idct.c
@@ -260,6 +260,30 @@
 }
 
 #if CONFIG_EXT_TX
+static void iidtx4_c(const tran_low_t *input, tran_low_t *output) {
+  int i;  // 4-point 1-D pass-through: output[i] depends only on input[i]
+  for (i = 0; i < 4; ++i)  // multiply by Sqrt2, then round-shift
+    output[i] = (tran_low_t)dct_const_round_shift(input[i] * Sqrt2);
+}
+
+static void iidtx8_c(const tran_low_t *input, tran_low_t *output) {
+  int i;  // 8-point 1-D pass-through: output[i] depends only on input[i]
+  for (i = 0; i < 8; ++i)  // plain integer gain of 2, no rounding step
+    output[i] = input[i] * 2;
+}
+
+static void iidtx16_c(const tran_low_t *input, tran_low_t *output) {
+  int i;  // 16-point 1-D pass-through: output[i] depends only on input[i]
+  for (i = 0; i < 16; ++i)  // multiply by 2 * Sqrt2, then round-shift
+    output[i] = (tran_low_t)dct_const_round_shift(input[i] * 2 * Sqrt2);
+}
+
+static void iidtx32_c(const tran_low_t *input, tran_low_t *output) {
+  int i;  // 32-point 1-D pass-through: output[i] depends only on input[i]
+  for (i = 0; i < 32; ++i)  // plain integer gain of 4, no rounding step
+    output[i] = input[i] * 4;
+}
+
 // For use in lieu of DST
 static void ihalfcenter32_c(const tran_low_t *input, tran_low_t *output) {
   int i;
@@ -291,6 +315,37 @@
 }
 
 #if CONFIG_VP9_HIGHBITDEPTH
+static void highbd_iidtx4_c(const tran_low_t *input, tran_low_t *output,
+                            int bd) {  // bd is forwarded to the round-shift
+  int i;  // 4-point 1-D pass-through: output[i] depends only on input[i]
+  for (i = 0; i < 4; ++i)  // multiply by Sqrt2, then round-shift
+    output[i] = (tran_low_t)highbd_dct_const_round_shift(input[i] * Sqrt2, bd);
+}
+
+static void highbd_iidtx8_c(const tran_low_t *input, tran_low_t *output,
+                            int bd) {
+  int i;  // 8-point 1-D pass-through: output[i] depends only on input[i]
+  (void) bd;  // bd is not read by this function
+  for (i = 0; i < 8; ++i)  // plain integer gain of 2, no rounding step
+    output[i] = input[i] * 2;
+}
+
+static void highbd_iidtx16_c(const tran_low_t *input, tran_low_t *output,
+                            int bd) {  // bd is forwarded to the round-shift
+  int i;  // 16-point 1-D pass-through: output[i] depends only on input[i]
+  for (i = 0; i < 16; ++i)  // multiply by 2 * Sqrt2, then round-shift
+    output[i] = (tran_low_t)highbd_dct_const_round_shift(
+        input[i] * 2 * Sqrt2, bd);
+}
+
+static void highbd_iidtx32_c(const tran_low_t *input, tran_low_t *output,
+                             int bd) {
+  int i;  // 32-point 1-D pass-through: output[i] depends only on input[i]
+  (void) bd;  // bd is not read by this function
+  for (i = 0; i < 32; ++i)  // plain integer gain of 4, no rounding step
+    output[i] = input[i] * 4;
+}
+
 static void highbd_ihalfcenter32_c(const tran_low_t *input, tran_low_t *output,
                                    int bd) {
   int i;
@@ -331,85 +386,19 @@
                            int bs, int tx_type) {
   int r, c;
   const int shift = bs < 32 ? 3 : 2;
-
-  tran_low_t temp_in[32], temp_out[32];
-  transform_2d ht = {idct4_c, idct4_c};
-  int out_scale = 1;
-  int coeff_stride = 0;
-
-  switch (bs) {
-    case 4:
-      ht.cols = idct4_c;
-      ht.rows = idct4_c;
-      out_scale = cospi_16_64 >> 3;
-      coeff_stride = 4;
-      break;
-    case 8:
-      ht.cols = idct8_c;
-      ht.rows = idct8_c;
-      out_scale = (1 << (DCT_CONST_BITS - 4));
-      coeff_stride = 8;
-      break;
-    case 16:
-      ht.cols = idct16_c;
-      ht.rows = idct16_c;
-      out_scale = cospi_16_64 >> 4;
-      coeff_stride = 16;
-      break;
-    case 32:
-      ht.cols = idct32_c;
-      ht.rows = idct32_c;
-      out_scale = (1 << (DCT_CONST_BITS - 4));
-      coeff_stride = 32;
-      break;
-    default:
-      assert(0);
-  }
-
-  // Columns
-  if (tx_type == V_DCT) {
-    for (c = 0; c < bs; ++c) {
-      for (r = 0; r < bs; ++r)
-        temp_in[r] = input[r * coeff_stride + c];
-      ht.cols(temp_in, temp_out);
-
-      for (r = 0; r < bs; ++r) {
-        tran_high_t temp = (tran_high_t)temp_out[r] * out_scale;
-        temp >>= DCT_CONST_BITS;
-        dest[r * stride + c] = clip_pixel_add(dest[r * stride + c],
-                                              (tran_low_t)temp);
-      }
-    }
-    return;
-  }
-
-  if (tx_type == H_DCT) {
+  if (tx_type == IDTX) {
     for (r = 0; r < bs; ++r) {
       for (c = 0; c < bs; ++c)
-        temp_in[c] = input[r * coeff_stride + c];
-      ht.rows(temp_in, temp_out);
-
-      for (c = 0; c < bs; ++c) {
-        tran_high_t temp = (tran_high_t)temp_out[c] * out_scale;
-        temp >>= DCT_CONST_BITS;
-        dest[r * stride + c] = clip_pixel_add(dest[r * stride + c],
-                                              (tran_low_t)temp);
-      }
+        dest[c] = clip_pixel_add(dest[c], input[c] >> shift);
+      dest += stride;
+      input += bs;
     }
-    return;
-  }
-
-  for (r = 0; r < bs; ++r) {
-    for (c = 0; c < bs; ++c)
-      dest[c] = clip_pixel_add(dest[c], input[c] >> shift);
-    dest += stride;
-    input += bs;
   }
 }
 
 #define FLIPUD_PTR(dest, stride, size) do {     \
-    (dest) = (dest) + ((size) - 1) * (stride);  \
-    (stride) = - (stride);                      \
+  (dest) = (dest) + ((size) - 1) * (stride);  \
+  (stride) = - (stride);                      \
 } while (0)
 
 static void maybe_flip_strides(uint8_t **dst, int *dstride,
@@ -428,6 +417,7 @@
     case DST_DCT:
     case DST_ADST:
     case ADST_DST:
+    case IDTX:
     case V_DCT:
     case H_DCT:
       break;
@@ -705,78 +695,13 @@
   const int shift = bs < 32 ? 3 : 2;
   uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
 
-  tran_low_t temp_in[32], temp_out[32];
-  highbd_transform_2d ht = {vpx_highbd_idct4_c, vpx_highbd_idct4_c};
-  int out_scale = 1;
-  int coeff_stride = 0;
-
-  switch (bs) {
-    case 4:
-      ht.cols = vpx_highbd_idct4_c;
-      ht.rows = vpx_highbd_idct4_c;
-      out_scale = cospi_16_64 >> 3;
-      coeff_stride = 4;
-      break;
-    case 8:
-      ht.cols = vpx_highbd_idct8_c;
-      ht.rows = vpx_highbd_idct8_c;
-      out_scale = (1 << (DCT_CONST_BITS - 4));
-      coeff_stride = 8;
-      break;
-    case 16:
-      ht.cols = vpx_highbd_idct16_c;
-      ht.rows = vpx_highbd_idct16_c;
-      out_scale = cospi_16_64 >> 4;
-      coeff_stride = 16;
-      break;
-    case 32:
-      ht.cols = vpx_highbd_idct32_c;
-      ht.rows = vpx_highbd_idct32_c;
-      out_scale = (1 << (DCT_CONST_BITS - 4));
-      coeff_stride = 32;
-      break;
-    default:
-      assert(0);
-  }
-
-  // Columns
-  if (tx_type == V_DCT) {
-    for (c = 0; c < bs; ++c) {
-      for (r = 0; r < bs; ++r)
-        temp_in[r] = input[r * coeff_stride + c];
-      ht.cols(temp_in, temp_out, bd);
-
-      for (r = 0; r < bs; ++r) {
-        tran_high_t temp = (tran_high_t)temp_out[r] * out_scale;
-        temp >>= DCT_CONST_BITS;
-        dest[r * stride + c] = highbd_clip_pixel_add(dest[r * stride + c],
-                                                     (tran_low_t)temp, bd);
-      }
-    }
-    return;
-  }
-
-  if (tx_type == H_DCT) {
+  if (tx_type == IDTX) {
     for (r = 0; r < bs; ++r) {
       for (c = 0; c < bs; ++c)
-        temp_in[c] = input[r * coeff_stride + c];
-      ht.rows(temp_in, temp_out, bd);
-
-      for (c = 0; c < bs; ++c) {
-        tran_high_t temp = (tran_high_t)temp_out[c] * out_scale;
-        temp >>= DCT_CONST_BITS;
-        dest[r * stride + c] = highbd_clip_pixel_add(dest[r * stride + c],
-                                                     (tran_low_t)temp, bd);
-      }
+        dest[c] = highbd_clip_pixel_add(dest[c], input[c] >> shift, bd);
+      dest += stride;
+      input += bs;
     }
-    return;
-  }
-
-  for (r = 0; r < bs; ++r) {
-    for (c = 0; c < bs; ++c)
-      dest[c] = highbd_clip_pixel_add(dest[c], input[c] >> shift, bd);
-    dest += stride;
-    input += bs;
   }
 }
 
@@ -796,6 +721,9 @@
     case DST_DCT:
     case DST_ADST:
     case ADST_DST:
+    case IDTX:
+    case V_DCT:
+    case H_DCT:
       break;
     case FLIPADST_DCT:
     case FLIPADST_ADST:
@@ -843,6 +771,9 @@
     { idst4_c,  iadst4_c },  // DST_FLIPADST      = 13,
     { iadst4_c, idst4_c  },  // FLIPADST_DST      = 14,
     { idst4_c,  idst4_c  },  // DST_DST           = 15
+    { iidtx4_c, iidtx4_c },  // IDTX              = 16
+    { idct4_c,  iidtx4_c },  // V_DCT             = 17
+    { iidtx4_c, idct4_c  },  // H_DCT             = 18
 #endif  // CONFIG_EXT_TX
   };
 
@@ -906,6 +837,9 @@
     { idst8_c,  iadst8_c },  // DST_FLIPADST      = 13,
     { iadst8_c, idst8_c  },  // FLIPADST_DST      = 14,
     { idst8_c,  idst8_c  },  // DST_DST           = 15
+    { iidtx8_c, iidtx8_c },  // IDTX              = 16
+    { idct8_c,  iidtx8_c },  // V_DCT             = 17
+    { iidtx8_c, idct8_c  },  // H_DCT             = 18
 #endif  // CONFIG_EXT_TX
   };
 
@@ -969,6 +903,9 @@
     { idst16_c,  iadst16_c },  // DST_FLIPADST      = 13,
     { iadst16_c, idst16_c  },  // FLIPADST_DST      = 14,
     { idst16_c,  idst16_c  },  // DST_DST           = 15
+    { iidtx16_c, iidtx16_c },  // IDTX              = 16
+    { idct16_c,  iidtx16_c },  // V_DCT             = 17
+    { iidtx16_c, idct16_c  },  // H_DCT             = 18
 #endif  // CONFIG_EXT_TX
   };
 
@@ -1032,6 +969,9 @@
     { ihalfcenter32_c,  ihalfright32_c },    // DST_FLIPADST      = 13,
     { ihalfright32_c, ihalfcenter32_c  },    // FLIPADST_DST      = 14,
     { ihalfcenter32_c,  ihalfcenter32_c  },  // DST_DST           = 15
+    { iidtx32_c, iidtx32_c },                // IDTX              = 16
+    { idct32_c,  iidtx32_c },                // V_DCT             = 17
+    { iidtx32_c, idct32_c  },                // H_DCT             = 18
   };
 
   int i, j;
@@ -1165,11 +1105,11 @@
     case ADST_DST:
     case FLIPADST_DST:
     case DST_FLIPADST:
+    case H_DCT:
+    case V_DCT:
       // Use C version since DST only exists in C code
       vp10_iht4x4_16_add_c(input, dest, stride, tx_type);
       break;
-    case H_DCT:
-    case V_DCT:
     case IDTX:
       inv_idtx_add_c(input, dest, stride, 4, tx_type);
       break;
@@ -1206,11 +1146,11 @@
     case ADST_DST:
     case FLIPADST_DST:
     case DST_FLIPADST:
+    case H_DCT:
+    case V_DCT:
       // Use C version since DST only exists in C code
       vp10_iht8x8_64_add_c(input, dest, stride, tx_type);
       break;
-    case H_DCT:
-    case V_DCT:
     case IDTX:
       inv_idtx_add_c(input, dest, stride, 8, tx_type);
       break;
@@ -1247,11 +1187,11 @@
     case ADST_DST:
     case FLIPADST_DST:
     case DST_FLIPADST:
+    case H_DCT:
+    case V_DCT:
       // Use C version since DST only exists in C code
       vp10_iht16x16_256_add_c(input, dest, stride, tx_type);
       break;
-    case H_DCT:
-    case V_DCT:
     case IDTX:
       inv_idtx_add_c(input, dest, stride, 16, tx_type);
       break;
@@ -1284,10 +1224,10 @@
     case ADST_DST:
     case FLIPADST_DST:
     case DST_FLIPADST:
-      vp10_iht32x32_1024_add_c(input, dest, stride, tx_type);
-      break;
     case H_DCT:
     case V_DCT:
+      vp10_iht32x32_1024_add_c(input, dest, stride, tx_type);
+      break;
     case IDTX:
       inv_idtx_add_c(input, dest, stride, 32, tx_type);
       break;
@@ -1319,6 +1259,9 @@
     {     highbd_idst4_c,  vpx_highbd_iadst4_c },  // DST_FLIPADST      = 13,
     { vpx_highbd_iadst4_c,     highbd_idst4_c  },  // FLIPADST_DST      = 14,
     {     highbd_idst4_c,      highbd_idst4_c  },  // DST_DST           = 15
+    {     highbd_iidtx4_c,     highbd_iidtx4_c },  // IDTX              = 16
+    { vpx_highbd_idct4_c,      highbd_iidtx4_c },  // V_DCT             = 17
+    {     highbd_iidtx4_c, vpx_highbd_idct4_c  },  // H_DCT             = 18
 #endif  // CONFIG_EXT_TX
   };
 
@@ -1385,6 +1328,9 @@
     {     highbd_idst8_c,  vpx_highbd_iadst8_c },  // DST_FLIPADST      = 13,
     { vpx_highbd_iadst8_c,     highbd_idst8_c  },  // FLIPADST_DST      = 14,
     {     highbd_idst8_c,      highbd_idst8_c  },  // DST_DST           = 15
+    {     highbd_iidtx8_c,     highbd_iidtx8_c },  // IDTX              = 16
+    { vpx_highbd_idct8_c,      highbd_iidtx8_c },  // V_DCT             = 17
+    {     highbd_iidtx8_c, vpx_highbd_idct8_c  },  // H_DCT             = 18
 #endif  // CONFIG_EXT_TX
   };
 
@@ -1451,6 +1397,9 @@
     {     highbd_idst16_c,  vpx_highbd_iadst16_c },  // DST_FLIPADST      = 13,
     { vpx_highbd_iadst16_c,     highbd_idst16_c  },  // FLIPADST_DST      = 14,
     {     highbd_idst16_c,      highbd_idst16_c  },  // DST_DST           = 15
+    {     highbd_iidtx16_c,     highbd_iidtx16_c },  // IDTX              = 16
+    { vpx_highbd_idct16_c,      highbd_iidtx16_c },  // V_DCT             = 17
+    {     highbd_iidtx16_c, vpx_highbd_idct16_c  },  // H_DCT             = 18
 #endif  // CONFIG_EXT_TX
   };
 
@@ -1501,22 +1450,25 @@
 void vp10_highbd_iht32x32_1024_add_c(const tran_low_t *input, uint8_t *dest8,
                                      int stride, int tx_type, int bd) {
   static const highbd_transform_2d HIGH_IHT_32[] = {
-    { vpx_highbd_idct32_c, vpx_highbd_idct32_c  },        // DCT_DCT
-    { highbd_ihalfright32_c, vpx_highbd_idct32_c  },      // ADST_DCT
-    { vpx_highbd_idct32_c, highbd_ihalfright32_c },       // DCT_ADST
-    { highbd_ihalfright32_c, highbd_ihalfright32_c },     // ADST_ADST
-    { highbd_ihalfright32_c, vpx_highbd_idct32_c  },      // FLIPADST_DCT
-    { vpx_highbd_idct32_c, highbd_ihalfright32_c },       // DCT_FLIPADST
-    { highbd_ihalfright32_c, highbd_ihalfright32_c },     // FLIPADST_FLIPADST
-    { highbd_ihalfright32_c, highbd_ihalfright32_c },     // ADST_FLIPADST
-    { highbd_ihalfright32_c, highbd_ihalfright32_c },     // FLIPADST_ADST
-    { highbd_ihalfcenter32_c, vpx_highbd_idct32_c  },     // DST_DCT
-    { vpx_highbd_idct32_c, highbd_ihalfcenter32_c  },     // DCT_DST
-    { highbd_ihalfcenter32_c, highbd_ihalfright32_c },    // DST_ADST
-    { highbd_ihalfright32_c, highbd_ihalfcenter32_c  },   // ADST_DST
-    { highbd_ihalfcenter32_c, highbd_ihalfright32_c },    // DST_FLIPADST
-    { highbd_ihalfright32_c, highbd_ihalfcenter32_c  },   // FLIPADST_DST
-    { highbd_ihalfcenter32_c, highbd_ihalfcenter32_c  },  // DST_DST
+    { vpx_highbd_idct32_c,    vpx_highbd_idct32_c    },  // DCT_DCT
+    { highbd_ihalfright32_c,  vpx_highbd_idct32_c    },  // ADST_DCT
+    { vpx_highbd_idct32_c,    highbd_ihalfright32_c  },  // DCT_ADST
+    { highbd_ihalfright32_c,  highbd_ihalfright32_c  },  // ADST_ADST
+    { highbd_ihalfright32_c,  vpx_highbd_idct32_c    },  // FLIPADST_DCT
+    { vpx_highbd_idct32_c,    highbd_ihalfright32_c  },  // DCT_FLIPADST
+    { highbd_ihalfright32_c,  highbd_ihalfright32_c  },  // FLIPADST_FLIPADST
+    { highbd_ihalfright32_c,  highbd_ihalfright32_c  },  // ADST_FLIPADST
+    { highbd_ihalfright32_c,  highbd_ihalfright32_c  },  // FLIPADST_ADST
+    { highbd_ihalfcenter32_c, vpx_highbd_idct32_c    },  // DST_DCT
+    { vpx_highbd_idct32_c,    highbd_ihalfcenter32_c },  // DCT_DST
+    { highbd_ihalfcenter32_c, highbd_ihalfright32_c  },  // DST_ADST
+    { highbd_ihalfright32_c,  highbd_ihalfcenter32_c },  // ADST_DST
+    { highbd_ihalfcenter32_c, highbd_ihalfright32_c  },  // DST_FLIPADST
+    { highbd_ihalfright32_c,  highbd_ihalfcenter32_c },  // FLIPADST_DST
+    { highbd_ihalfcenter32_c, highbd_ihalfcenter32_c },  // DST_DST
+    {     highbd_iidtx32_c,   highbd_iidtx32_c       },  // IDTX
+    { vpx_highbd_idct32_c,    highbd_iidtx32_c       },  // V_DCT
+    {     highbd_iidtx32_c,   vpx_highbd_idct32_c    },  // H_DCT
   };
 
   uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
@@ -1657,11 +1609,11 @@
     case ADST_DST:
     case FLIPADST_DST:
     case DST_FLIPADST:
+    case H_DCT:
+    case V_DCT:
       // Use C version since DST only exists in C code
       vp10_highbd_iht4x4_16_add_c(input, dest, stride, tx_type, bd);
       break;
-    case H_DCT:
-    case V_DCT:
     case IDTX:
       highbd_inv_idtx_add_c(input, dest, stride, 4, tx_type, bd);
       break;
@@ -1699,11 +1651,11 @@
     case ADST_DST:
     case FLIPADST_DST:
     case DST_FLIPADST:
+    case H_DCT:
+    case V_DCT:
       // Use C version since DST only exists in C code
       vp10_highbd_iht8x8_64_add_c(input, dest, stride, tx_type, bd);
       break;
-    case H_DCT:
-    case V_DCT:
     case IDTX:
       highbd_inv_idtx_add_c(input, dest, stride, 8, tx_type, bd);
       break;
@@ -1741,11 +1693,11 @@
     case ADST_DST:
     case FLIPADST_DST:
     case DST_FLIPADST:
+    case H_DCT:
+    case V_DCT:
       // Use C version since DST only exists in C code
       vp10_highbd_iht16x16_256_add_c(input, dest, stride, tx_type, bd);
       break;
-    case H_DCT:
-    case V_DCT:
     case IDTX:
       highbd_inv_idtx_add_c(input, dest, stride, 16, tx_type, bd);
       break;
@@ -1779,10 +1731,10 @@
     case ADST_DST:
     case FLIPADST_DST:
     case DST_FLIPADST:
-      vp10_highbd_iht32x32_1024_add_c(input, dest, stride, tx_type, bd);
-      break;
     case H_DCT:
     case V_DCT:
+      vp10_highbd_iht32x32_1024_add_c(input, dest, stride, tx_type, bd);
+      break;
     case IDTX:
       highbd_inv_idtx_add_c(input, dest, stride, 32, tx_type, bd);
       break;
diff --git a/vp10/common/onyxc_int.h b/vp10/common/onyxc_int.h
index bcc69f3..26ae569 100644
--- a/vp10/common/onyxc_int.h
+++ b/vp10/common/onyxc_int.h
@@ -313,7 +313,7 @@
   BufferPool *buffer_pool;
 
   PARTITION_CONTEXT *above_seg_context;
-  ENTROPY_CONTEXT *above_context;
+  ENTROPY_CONTEXT *above_context[MAX_MB_PLANE];
 #if CONFIG_VAR_TX
   TXFM_CONTEXT *above_txfm_context;
   TXFM_CONTEXT left_txfm_context[8];
@@ -405,9 +405,7 @@
 
   for (i = 0; i < MAX_MB_PLANE; ++i) {
     xd->plane[i].dqcoeff = dqcoeff;
-    xd->above_context[i] = cm->above_context +
-        i * sizeof(*cm->above_context) * 2 * mi_cols_aligned_to_sb(cm->mi_cols);
-
+    xd->above_context[i] = cm->above_context[i];
     if (xd->plane[i].plane_type == PLANE_TYPE_Y) {
       memcpy(xd->plane[i].seg_dequant, cm->y_dequant, sizeof(cm->y_dequant));
     } else {
@@ -525,6 +523,27 @@
   return (left * 2 + above) + bsl * PARTITION_PLOFFSET;
 }
 
+static INLINE void vp10_zero_above_context(VP10_COMMON *const cm,
+                                           int mi_col_start, int mi_col_end) {
+  const int num_cols = mi_col_end - mi_col_start;  // span is [start, end)
+  int plane;
+
+  for (plane = 0; plane < MAX_MB_PLANE; ++plane)  // two entries per mi column
+    vp10_zero_array(cm->above_context[plane] + 2 * mi_col_start, 2 * num_cols);
+  vp10_zero_array(cm->above_seg_context + mi_col_start, num_cols);
+#if CONFIG_VAR_TX
+  vp10_zero_array(cm->above_txfm_context + mi_col_start, num_cols);
+#endif  // CONFIG_VAR_TX
+}
+
+static INLINE void vp10_zero_left_context(MACROBLOCKD *const xd) {
+  vp10_zero(xd->left_context);  // vp10_zero memsets the whole object to 0
+  vp10_zero(xd->left_seg_context);
+#if CONFIG_VAR_TX
+  vp10_zero(xd->left_txfm_context_buffer);  // only built with CONFIG_VAR_TX
+#endif  // CONFIG_VAR_TX
+}
+
 #if CONFIG_VAR_TX
 static INLINE void set_txfm_ctx(TXFM_CONTEXT *txfm_ctx,
                                 TX_SIZE tx_size,
diff --git a/vp10/decoder/decodeframe.c b/vp10/decoder/decodeframe.c
index ce6317c..0441662 100644
--- a/vp10/decoder/decodeframe.c
+++ b/vp10/decoder/decodeframe.c
@@ -2962,18 +2962,7 @@
   assert(tile_rows <= 4);
   assert(tile_cols <= (1 << 6));
 
-  // Note: this memset assumes above_context[0], [1] and [2]
-  // are allocated as part of the same buffer.
-  memset(cm->above_context, 0,
-         sizeof(*cm->above_context) * MAX_MB_PLANE * 2 * aligned_cols);
-
-  memset(cm->above_seg_context, 0,
-         sizeof(*cm->above_seg_context) * aligned_cols);
-
-#if CONFIG_VAR_TX
-  memset(cm->above_txfm_context, 0,
-         sizeof(*cm->above_txfm_context) * aligned_cols);
-#endif
+  vp10_zero_above_context(cm, 0, aligned_cols);
 
   get_tile_buffers(pbi, data, data_end, tile_cols, tile_rows, tile_buffers);
 
@@ -3032,11 +3021,7 @@
                         tile_cols - tile_col - 1 : tile_col;
         tile_data = pbi->tile_data + tile_cols * tile_row + col;
         vp10_tile_set_col(&tile, tile_data->cm, col);
-        vp10_zero(tile_data->xd.left_context);
-        vp10_zero(tile_data->xd.left_seg_context);
-#if CONFIG_VAR_TX
-        vp10_zero(tile_data->xd.left_txfm_context_buffer);
-#endif
+        vp10_zero_left_context(&tile_data->xd);
         for (mi_col = tile.mi_col_start; mi_col < tile.mi_col_end;
              mi_col += MI_BLOCK_SIZE) {
           decode_partition(pbi, &tile_data->xd,
@@ -3126,11 +3111,7 @@
 
   for (mi_row = tile->mi_row_start; mi_row < tile->mi_row_end;
        mi_row += MI_BLOCK_SIZE) {
-    vp10_zero(tile_data->xd.left_context);
-    vp10_zero(tile_data->xd.left_seg_context);
-#if CONFIG_VAR_TX
-    vp10_zero(tile_data->xd.left_txfm_context_buffer);
-#endif
+    vp10_zero_left_context(&tile_data->xd);
     for (mi_col = tile->mi_col_start; mi_col < tile->mi_col_end;
          mi_col += MI_BLOCK_SIZE) {
       decode_partition(tile_data->pbi, &tile_data->xd,
@@ -3211,16 +3192,8 @@
     worker->data2 = &pbi->tile_worker_info[n];
   }
 
-  // Note: this memset assumes above_context[0], [1] and [2]
-  // are allocated as part of the same buffer.
-  memset(cm->above_context, 0,
-         sizeof(*cm->above_context) * MAX_MB_PLANE * 2 * aligned_mi_cols);
-  memset(cm->above_seg_context, 0,
-         sizeof(*cm->above_seg_context) * aligned_mi_cols);
-#if CONFIG_VAR_TX
-  memset(cm->above_txfm_context, 0,
-         sizeof(*cm->above_txfm_context) * aligned_mi_cols);
-#endif
+  vp10_zero_above_context(cm, 0, aligned_mi_cols);
+
   // Load tile data into tile_buffers
   get_tile_buffers(pbi, data, data_end, tile_cols, tile_rows, tile_buffers);
 
diff --git a/vp10/encoder/bitstream.c b/vp10/encoder/bitstream.c
index 1ef2ea5..f20c224 100644
--- a/vp10/encoder/bitstream.c
+++ b/vp10/encoder/bitstream.c
@@ -1710,10 +1710,7 @@
 
   for (mi_row = tile->mi_row_start; mi_row < tile->mi_row_end;
        mi_row += MI_BLOCK_SIZE) {
-    vp10_zero(xd->left_seg_context);
-#if CONFIG_VAR_TX
-    vp10_zero(xd->left_txfm_context_buffer);
-#endif
+    vp10_zero_left_context(xd);
     for (mi_col = tile->mi_col_start; mi_col < tile->mi_col_end;
          mi_col += MI_BLOCK_SIZE)
       write_modes_sb(cpi, tile, w, tok, tok_end,
@@ -2190,12 +2187,7 @@
   const int tile_rows = 1 << cm->log2_tile_rows;
   unsigned int max_tile = 0;
 
-  memset(cm->above_seg_context, 0,
-         sizeof(*cm->above_seg_context) * mi_cols_aligned_to_sb(cm->mi_cols));
-#if CONFIG_VAR_TX
-  memset(cm->above_txfm_context, 0,
-         sizeof(*cm->above_txfm_context) * mi_cols_aligned_to_sb(cm->mi_cols));
-#endif
+  vp10_zero_above_context(cm, 0, mi_cols_aligned_to_sb(cm->mi_cols));
 
   for (tile_row = 0; tile_row < tile_rows; tile_row++) {
     for (tile_col = 0; tile_col < tile_cols; tile_col++) {
diff --git a/vp10/encoder/context_tree.h b/vp10/encoder/context_tree.h
index 4fa5806..53c7142 100644
--- a/vp10/encoder/context_tree.h
+++ b/vp10/encoder/context_tree.h
@@ -54,7 +54,6 @@
   int hybrid_pred_diff;
   int comp_pred_diff;
   int single_pred_diff;
-  int64_t best_filter_diff[SWITCHABLE_FILTER_CONTEXTS];
 
   // TODO(jingning) Use RD_COST struct here instead. This involves a boarder
   // scope of refactoring.
diff --git a/vp10/encoder/dct.c b/vp10/encoder/dct.c
index 31a4c87..8a1ee20 100644
--- a/vp10/encoder/dct.c
+++ b/vp10/encoder/dct.c
@@ -1212,6 +1212,30 @@
 }
 
 #if CONFIG_EXT_TX
+static void fidtx4(const tran_low_t *input, tran_low_t *output) {
+  int i;  // 4-point 1-D pass-through: output[i] depends only on input[i]
+  for (i = 0; i < 4; ++i)  // multiply by Sqrt2, then round-shift
+    output[i] = (tran_low_t)fdct_round_shift(input[i] * Sqrt2);
+}
+
+static void fidtx8(const tran_low_t *input, tran_low_t *output) {
+  int i;  // 8-point 1-D pass-through: output[i] depends only on input[i]
+  for (i = 0; i < 8; ++i)  // plain integer gain of 2, no rounding step
+    output[i] = input[i] * 2;
+}
+
+static void fidtx16(const tran_low_t *input, tran_low_t *output) {
+  int i;  // 16-point 1-D pass-through: output[i] depends only on input[i]
+  for (i = 0; i < 16; ++i)  // multiply by 2 * Sqrt2, then round-shift
+    output[i] = (tran_low_t)fdct_round_shift(input[i] * 2 * Sqrt2);
+}
+
+static void fidtx32(const tran_low_t *input, tran_low_t *output) {
+  int i;  // 32-point 1-D pass-through: output[i] depends only on input[i]
+  for (i = 0; i < 32; ++i)  // plain integer gain of 4, no rounding step
+    output[i] = input[i] * 4;
+}
+
 // For use in lieu of DST
 static void fhalfcenter32(const tran_low_t *input, tran_low_t *output) {
   int i;
@@ -1315,6 +1339,7 @@
     case DST_DCT:
     case DST_ADST:
     case ADST_DST:
+    case IDTX:
     case H_DCT:
     case V_DCT:
       break;
@@ -1362,6 +1387,9 @@
   { fdst4,  fadst4 },  // DST_FLIPADST      = 13,
   { fadst4, fdst4  },  // FLIPADST_DST      = 14,
   { fdst4,  fdst4  },  // DST_DST           = 15
+  { fidtx4, fidtx4 },  // IDTX              = 16
+  { fdct4,  fidtx4 },  // V_DCT             = 17
+  { fidtx4, fdct4  },  // H_DCT             = 18
 #endif  // CONFIG_EXT_TX
 };
 
@@ -1383,6 +1411,9 @@
   { fdst8,  fadst8 },  // DST_FLIPADST      = 13,
   { fadst8, fdst8  },  // FLIPADST_DST      = 14,
   { fdst8,  fdst8  },  // DST_DST           = 15
+  { fidtx8, fidtx8 },  // IDTX              = 16
+  { fdct8,  fidtx8 },  // V_DCT             = 17
+  { fidtx8, fdct8  },  // H_DCT             = 18
 #endif  // CONFIG_EXT_TX
 };
 
@@ -1404,6 +1435,9 @@
   { fdst16,  fadst16 },  // DST_FLIPADST      = 13,
   { fadst16, fdst16  },  // FLIPADST_DST      = 14,
   { fdst16,  fdst16  },  // DST_DST           = 15
+  { fidtx16, fidtx16 },  // IDTX              = 16
+  { fdct16,  fidtx16 },  // V_DCT             = 17
+  { fidtx16, fdct16  },  // H_DCT             = 18
 #endif  // CONFIG_EXT_TX
 };
 
@@ -1425,6 +1459,9 @@
   { fhalfcenter32,  fhalfright32 },    // DST_FLIPADST      = 13,
   { fhalfright32, fhalfcenter32  },    // FLIPADST_DST      = 14,
   { fhalfcenter32,  fhalfcenter32  },  // DST_DST           = 15
+  { fidtx32, fidtx32 },                // IDTX              = 16
+  { fdct32,  fidtx32 },                // V_DCT             = 17
+  { fidtx32, fdct32  },                // H_DCT             = 18
 };
 #endif  // CONFIG_EXT_TX
 
@@ -1766,86 +1803,12 @@
                      int bs, int tx_type) {
   int r, c;
   const int shift = bs < 32 ? 3 : 2;
-
-  const int16_t *input = src_diff;
-  tran_low_t *output = coeff;
-
-  int i, j;
-  tran_low_t temp_in[32], temp_out[32];
-  transform_2d ht = {fdct4, fdct4};
-  int in_scale = 1;
-  int out_scale = 1;
-  int coeff_stride = 0;
-
-  switch (bs) {
-    case 4:
-      ht.cols = fdct4;
-      ht.rows = fdct4;
-      in_scale = 16;
-      out_scale = cospi_16_64 >> 1;
-      coeff_stride = 4;
-      break;
-    case 8:
-      ht.cols = fdct8;
-      ht.rows = fdct8;
-      in_scale = 4;
-      out_scale = (1 << DCT_CONST_BITS);
-      coeff_stride = 8;
-      break;
-    case 16:
-      ht.cols = fdct16;
-      ht.rows = fdct16;
-      in_scale = 4;
-      out_scale = cospi_16_64;
-      coeff_stride = 16;
-      break;
-    case 32:
-      ht.cols = fdct32;
-      ht.rows = fdct32;
-      in_scale = 4;
-      out_scale = (1 << (DCT_CONST_BITS - 2));
-      coeff_stride = 32;
-      break;
-    default:
-      assert(0);
-  }
-
-  // Columns
-  if (tx_type == V_DCT) {
-    for (i = 0; i < bs; ++i) {
-      for (j = 0; j < bs; ++j)
-        temp_in[j] = input[j * stride + i] * in_scale;
-      ht.cols(temp_in, temp_out);
-
-      for (j = 0; j < bs; ++j) {
-        tran_high_t temp = (tran_high_t)temp_out[j] * out_scale;
-        temp >>= DCT_CONST_BITS;
-        output[j * coeff_stride + i] = (tran_low_t)temp;
-      }
+  if (tx_type == IDTX) {
+    for (r = 0; r < bs; ++r) {
+      for (c = 0; c < bs; ++c) coeff[c] = src_diff[c] << shift;
+      src_diff += stride;
+      coeff += bs;
     }
-    return;
-  }
-
-  // Rows
-  if (tx_type == H_DCT) {
-    for (j = 0; j < bs; ++j) {
-      for (i = 0; i < bs; ++i)
-        temp_in[i] = input[j * stride + i] * in_scale;
-      ht.rows(temp_in, temp_out);
-
-      for (i = 0; i < bs; ++i) {
-        tran_high_t temp = (tran_high_t)temp_out[i] * out_scale;
-        temp >>= DCT_CONST_BITS;
-        output[j * coeff_stride + i] = (tran_low_t)temp;
-      }
-    }
-    return;
-  }
-
-  for (r = 0; r < bs; ++r) {
-    for (c = 0; c < bs; ++c) coeff[c] = src_diff[c] << shift;
-    src_diff += stride;
-    coeff += bs;
   }
 }
 
diff --git a/vp10/encoder/encodeframe.c b/vp10/encoder/encodeframe.c
index c5a68a9..ec00b62 100644
--- a/vp10/encoder/encodeframe.c
+++ b/vp10/encoder/encodeframe.c
@@ -1194,9 +1194,6 @@
     rdc->comp_pred_diff[SINGLE_REFERENCE] += ctx->single_pred_diff;
     rdc->comp_pred_diff[COMPOUND_REFERENCE] += ctx->comp_pred_diff;
     rdc->comp_pred_diff[REFERENCE_MODE_SELECT] += ctx->hybrid_pred_diff;
-
-    for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; ++i)
-      rdc->filter_diff[i] += ctx->best_filter_diff[i];
   }
 
   for (h = 0; h < y_mis; ++h) {
@@ -1316,9 +1313,6 @@
     rdc->comp_pred_diff[SINGLE_REFERENCE] += ctx->single_pred_diff;
     rdc->comp_pred_diff[COMPOUND_REFERENCE] += ctx->comp_pred_diff;
     rdc->comp_pred_diff[REFERENCE_MODE_SELECT] += ctx->hybrid_pred_diff;
-
-    for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; ++i)
-      rdc->filter_diff[i] += ctx->best_filter_diff[i];
   }
 
   for (h = 0; h < y_mis; ++h) {
@@ -1654,6 +1648,9 @@
                                    totalrate_nocoef,
 #endif  // CONFIG_SUPERTX
                                    bsize, ctx, best_rd);
+#if CONFIG_SUPERTX
+        assert(*totalrate_nocoef >= 0);
+#endif  // CONFIG_SUPERTX
       }
     } else {
       vp10_rd_pick_inter_mode_sub8x8(cpi, tile_data, x, mi_row, mi_col, rd_cost,
@@ -1661,6 +1658,9 @@
                                      totalrate_nocoef,
 #endif  // CONFIG_SUPERTX
                                      bsize, ctx, best_rd);
+#if CONFIG_SUPERTX
+      assert(*totalrate_nocoef >= 0);
+#endif  // CONFIG_SUPERTX
     }
   }
 
@@ -3687,13 +3687,8 @@
   SPEED_FEATURES *const sf = &cpi->sf;
   int mi_col;
 
-  // Initialize the left context for the new SB row
-  memset(&xd->left_context, 0, sizeof(xd->left_context));
-  memset(xd->left_seg_context, 0, sizeof(xd->left_seg_context));
-#if CONFIG_VAR_TX
-  memset(xd->left_txfm_context_buffer, 0,
-         sizeof(xd->left_txfm_context_buffer));
-#endif
+  vp10_zero_left_context(xd);
+
   // Code each SB in the row
   for (mi_col = tile_info->mi_col_start; mi_col < tile_info->mi_col_end;
        mi_col += MI_BLOCK_SIZE) {
@@ -3791,19 +3786,9 @@
   // Copy data over into macro block data structures.
   vp10_setup_src_planes(x, cpi->Source, 0, 0);
 
-  vp10_setup_block_planes(&x->e_mbd, cm->subsampling_x, cm->subsampling_y);
+  vp10_setup_block_planes(xd, cm->subsampling_x, cm->subsampling_y);
 
-  // Note: this memset assumes above_context[0], [1] and [2]
-  // are allocated as part of the same buffer.
-  memset(xd->above_context[0], 0,
-         sizeof(*xd->above_context[0]) *
-         2 * aligned_mi_cols * MAX_MB_PLANE);
-  memset(xd->above_seg_context, 0,
-         sizeof(*xd->above_seg_context) * aligned_mi_cols);
-#if CONFIG_VAR_TX
-  memset(cm->above_txfm_context, 0,
-         sizeof(*xd->above_txfm_context) * aligned_mi_cols);
-#endif
+  vp10_zero_above_context(cm, 0, aligned_mi_cols);
 }
 
 static int check_dual_ref_flags(VP10_COMP *cpi) {
@@ -3971,7 +3956,6 @@
   vp10_zero(*td->counts);
   vp10_zero(rdc->coef_counts);
   vp10_zero(rdc->comp_pred_diff);
-  vp10_zero(rdc->filter_diff);
   rdc->m_search_count = 0;   // Count of motion search hits.
   rdc->ex_search_count = 0;  // Exhaustive mesh search hits.
 
@@ -4039,31 +4023,9 @@
   cpi->last_frame_distortion = cpi->frame_distortion;
 #endif
 }
-
-static INTERP_FILTER get_interp_filter(
-    const int64_t threshes[SWITCHABLE_FILTER_CONTEXTS], int is_alt_ref) {
-#if CONFIG_EXT_INTERP
-  if (!is_alt_ref &&
-      threshes[EIGHTTAP_SMOOTH2] > threshes[EIGHTTAP_SMOOTH] &&
-      threshes[EIGHTTAP_SMOOTH2] > threshes[EIGHTTAP_REGULAR] &&
-      threshes[EIGHTTAP_SMOOTH2] > threshes[MULTITAP_SHARP] &&
-      threshes[EIGHTTAP_SMOOTH2] > threshes[SWITCHABLE - 1]) {
-    return EIGHTTAP_SMOOTH2;
-  }
-#endif  // CONFIG_EXT_INTERP
-  if (!is_alt_ref &&
-      threshes[EIGHTTAP_SMOOTH] > threshes[EIGHTTAP_REGULAR] &&
-      threshes[EIGHTTAP_SMOOTH] > threshes[MULTITAP_SHARP] &&
-      threshes[EIGHTTAP_SMOOTH] > threshes[SWITCHABLE - 1]) {
-    return EIGHTTAP_SMOOTH;
-  } else if (threshes[MULTITAP_SHARP] > threshes[EIGHTTAP_REGULAR] &&
-             threshes[MULTITAP_SHARP] > threshes[SWITCHABLE - 1]) {
-    return MULTITAP_SHARP;
-  } else if (threshes[EIGHTTAP_REGULAR] > threshes[SWITCHABLE - 1]) {
-    return EIGHTTAP_REGULAR;
-  } else {
-    return SWITCHABLE;
-  }
+static INTERP_FILTER get_cm_interp_filter(VP10_COMP *cpi) {
+  (void)cpi;
+  return SWITCHABLE;
 }
 
 void vp10_encode_frame(VP10_COMP *cpi) {
@@ -4116,7 +4078,6 @@
     // INTRA/ALTREF/GOLDEN/LAST needs to be specified seperately.
     const MV_REFERENCE_FRAME frame_type = get_frame_type(cpi);
     int64_t *const mode_thrs = rd_opt->prediction_type_threshes[frame_type];
-    int64_t *const filter_thrs = rd_opt->filter_threshes[frame_type];
     const int is_alt_ref = frame_type == ALTREF_FRAME;
 
     /* prediction (compound, single or hybrid) mode selection */
@@ -4134,7 +4095,7 @@
       cm->reference_mode = REFERENCE_MODE_SELECT;
 
     if (cm->interp_filter == SWITCHABLE) {
-      cm->interp_filter = get_interp_filter(filter_thrs, is_alt_ref);
+      cm->interp_filter = get_cm_interp_filter(cpi);
     }
 
     encode_frame_internal(cpi);
@@ -4142,9 +4103,6 @@
     for (i = 0; i < REFERENCE_MODES; ++i)
       mode_thrs[i] = (mode_thrs[i] + rdc->comp_pred_diff[i] / cm->MBs) / 2;
 
-    for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; ++i)
-      filter_thrs[i] = (filter_thrs[i] + rdc->filter_diff[i] / cm->MBs) / 2;
-
     if (cm->reference_mode == REFERENCE_MODE_SELECT) {
       int single_count_zero = 0;
       int comp_count_zero = 0;
diff --git a/vp10/encoder/encoder.h b/vp10/encoder/encoder.h
index afe3292..a319901 100644
--- a/vp10/encoder/encoder.h
+++ b/vp10/encoder/encoder.h
@@ -251,7 +251,6 @@
 typedef struct RD_COUNTS {
   vp10_coeff_count coef_counts[TX_SIZES][PLANE_TYPES];
   int64_t comp_pred_diff[REFERENCE_MODES];
-  int64_t filter_diff[SWITCHABLE_FILTER_CONTEXTS];
   int m_search_count;
   int ex_search_count;
 } RD_COUNTS;
diff --git a/vp10/encoder/ethread.c b/vp10/encoder/ethread.c
index 6cb9494..c586b9a 100644
--- a/vp10/encoder/ethread.c
+++ b/vp10/encoder/ethread.c
@@ -19,9 +19,6 @@
   for (i = 0; i < REFERENCE_MODES; i++)
     td->rd_counts.comp_pred_diff[i] += td_t->rd_counts.comp_pred_diff[i];
 
-  for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++)
-    td->rd_counts.filter_diff[i] += td_t->rd_counts.filter_diff[i];
-
   for (i = 0; i < TX_SIZES; i++)
     for (j = 0; j < PLANE_TYPES; j++)
       for (k = 0; k < REF_TYPES; k++)
diff --git a/vp10/encoder/hybrid_fwd_txfm.c b/vp10/encoder/hybrid_fwd_txfm.c
index c3a739b..faedb43 100644
--- a/vp10/encoder/hybrid_fwd_txfm.c
+++ b/vp10/encoder/hybrid_fwd_txfm.c
@@ -65,6 +65,8 @@
       break;
     case H_DCT:
     case V_DCT:
+      vp10_fht4x4_c(src_diff, coeff, diff_stride, tx_type);
+      break;
     case IDTX:
       vp10_fwd_idtx_c(src_diff, coeff, diff_stride, 4, tx_type);
       break;
@@ -105,6 +107,8 @@
       break;
     case H_DCT:
     case V_DCT:
+      vp10_fht8x8_c(src_diff, coeff, diff_stride, tx_type);
+      break;
     case IDTX:
       vp10_fwd_idtx_c(src_diff, coeff, diff_stride, 8, tx_type);
       break;
@@ -145,6 +149,8 @@
       break;
     case H_DCT:
     case V_DCT:
+      vp10_fht16x16_c(src_diff, coeff, diff_stride, tx_type);
+      break;
     case IDTX:
       vp10_fwd_idtx_c(src_diff, coeff, diff_stride, 16, tx_type);
       break;
@@ -185,6 +191,8 @@
       break;
     case H_DCT:
     case V_DCT:
+      vp10_fht32x32_c(src_diff, coeff, diff_stride, tx_type);
+      break;
     case IDTX:
       vp10_fwd_idtx_c(src_diff, coeff, diff_stride, 32, tx_type);
       break;
@@ -226,11 +234,10 @@
     case ADST_DST:
     case DST_FLIPADST:
     case FLIPADST_DST:
-      // Use C version since DST exists only in C
-      vp10_highbd_fht4x4_c(src_diff, coeff, diff_stride, tx_type);
-      break;
     case H_DCT:
     case V_DCT:
+      vp10_highbd_fht4x4_c(src_diff, coeff, diff_stride, tx_type);
+      break;
     case IDTX:
       vp10_fwd_idtx_c(src_diff, coeff, diff_stride, 4, tx_type);
       break;
@@ -270,11 +277,11 @@
     case ADST_DST:
     case DST_FLIPADST:
     case FLIPADST_DST:
+    case H_DCT:
+    case V_DCT:
       // Use C version since DST exists only in C
       vp10_highbd_fht8x8_c(src_diff, coeff, diff_stride, tx_type);
       break;
-    case H_DCT:
-    case V_DCT:
     case IDTX:
       vp10_fwd_idtx_c(src_diff, coeff, diff_stride, 8, tx_type);
       break;
@@ -314,11 +321,11 @@
     case ADST_DST:
     case DST_FLIPADST:
     case FLIPADST_DST:
+    case H_DCT:
+    case V_DCT:
       // Use C version since DST exists only in C
       vp10_highbd_fht16x16_c(src_diff, coeff, diff_stride, tx_type);
       break;
-    case H_DCT:
-    case V_DCT:
     case IDTX:
       vp10_fwd_idtx_c(src_diff, coeff, diff_stride, 16, tx_type);
       break;
@@ -355,10 +362,10 @@
     case ADST_DST:
     case DST_FLIPADST:
     case FLIPADST_DST:
-      vp10_highbd_fht32x32_c(src_diff, coeff, diff_stride, tx_type);
-      break;
     case H_DCT:
     case V_DCT:
+      vp10_highbd_fht32x32_c(src_diff, coeff, diff_stride, tx_type);
+      break;
     case IDTX:
       vp10_fwd_idtx_c(src_diff, coeff, diff_stride, 32, tx_type);
       break;
diff --git a/vp10/encoder/rd.h b/vp10/encoder/rd.h
index 5a6a44a..61feabe 100644
--- a/vp10/encoder/rd.h
+++ b/vp10/encoder/rd.h
@@ -279,8 +279,6 @@
 
   int64_t prediction_type_threshes[MAX_REF_FRAMES][REFERENCE_MODES];
 
-  int64_t filter_threshes[MAX_REF_FRAMES][SWITCHABLE_FILTER_CONTEXTS];
-
   int RDMULT;
   int RDDIV;
 } RD_OPT;
diff --git a/vp10/encoder/rdopt.c b/vp10/encoder/rdopt.c
index c7a2702..16deebf 100644
--- a/vp10/encoder/rdopt.c
+++ b/vp10/encoder/rdopt.c
@@ -5452,7 +5452,6 @@
 static void store_coding_context(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx,
                          int mode_index,
                          int64_t comp_pred_diff[REFERENCE_MODES],
-                         int64_t best_filter_diff[SWITCHABLE_FILTER_CONTEXTS],
                          int skippable) {
   MACROBLOCKD *const xd = &x->e_mbd;
 
@@ -5466,9 +5465,6 @@
   ctx->single_pred_diff = (int)comp_pred_diff[SINGLE_REFERENCE];
   ctx->comp_pred_diff   = (int)comp_pred_diff[COMPOUND_REFERENCE];
   ctx->hybrid_pred_diff = (int)comp_pred_diff[REFERENCE_MODE_SELECT];
-
-  memcpy(ctx->best_filter_diff, best_filter_diff,
-         sizeof(*best_filter_diff) * SWITCHABLE_FILTER_CONTEXTS);
 }
 
 static void setup_buffer_inter(
@@ -6036,9 +6032,7 @@
                                  INTERP_FILTER (*single_filter)[MAX_REF_FRAMES],
                                  int (*single_skippable)[MAX_REF_FRAMES],
                                  int64_t *psse,
-                                 const int64_t ref_best_rd,
-                                 int64_t *mask_filter,
-                                 int64_t filter_cache[]) {
+                                 const int64_t ref_best_rd) {
   VP10_COMMON *cm = &cpi->common;
   MACROBLOCKD *xd = &x->e_mbd;
   MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
@@ -6075,28 +6069,6 @@
       !is_comp_interintra_pred &&
 #endif  // CONFIG_EXT_INTER
       is_obmc_allowed(mbmi);
-  int best_obmc_flag = 0;
-#if CONFIG_VP9_HIGHBITDEPTH
-  DECLARE_ALIGNED(16, uint16_t, tmp_buf1_16[MAX_MB_PLANE * CU_SIZE * CU_SIZE]);
-  DECLARE_ALIGNED(16, uint16_t, tmp_buf2_16[MAX_MB_PLANE * CU_SIZE * CU_SIZE]);
-  uint8_t *tmp_buf1, *tmp_buf2;
-  uint8_t *obmc_tmp_buf1[3];
-  uint8_t *obmc_tmp_buf2[3];
-#else
-  DECLARE_ALIGNED(16, uint8_t, tmp_buf1[MAX_MB_PLANE * CU_SIZE * CU_SIZE]);
-  DECLARE_ALIGNED(16, uint8_t, tmp_buf2[MAX_MB_PLANE * CU_SIZE * CU_SIZE]);
-  uint8_t *obmc_tmp_buf1[3] = {tmp_buf1, tmp_buf1 + CU_SIZE * CU_SIZE,
-    tmp_buf1 + CU_SIZE * CU_SIZE * 2};
-  uint8_t *obmc_tmp_buf2[3] = {tmp_buf2, tmp_buf2 + CU_SIZE * CU_SIZE,
-    tmp_buf2 + CU_SIZE * CU_SIZE * 2};
-#endif  // CONFIG_VP9_HIGHBITDEPTH
-  int obmc_tmp_stride[3] = {CU_SIZE, CU_SIZE, CU_SIZE};
-
-  uint8_t skip_txfm_bestfilter[2][MAX_MB_PLANE << 2] = {{0}, {0}};
-  int64_t bsse_bestfilter[2][MAX_MB_PLANE << 2] = {{0}, {0}};
-  int skip_txfm_sb_bestfilter[2] = {0};
-  int64_t skip_sse_sb_bestfilter[2] = {INT64_MAX};
-
   int rate2_nocoeff, best_rate2 = INT_MAX,
       best_skippable, best_xskip, best_disable_skip = 0;
 #if CONFIG_SUPERTX
@@ -6154,25 +6126,9 @@
 #if CONFIG_VP9_HIGHBITDEPTH
   if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
     tmp_buf = CONVERT_TO_BYTEPTR(tmp_buf16);
-#if CONFIG_OBMC
-    tmp_buf1 = CONVERT_TO_BYTEPTR(tmp_buf1_16);
-    tmp_buf2 = CONVERT_TO_BYTEPTR(tmp_buf2_16);
-#endif  // CONFIG_OBMC
   } else {
     tmp_buf = (uint8_t *)tmp_buf16;
-#if CONFIG_OBMC
-    tmp_buf1 = (uint8_t *)tmp_buf1_16;
-    tmp_buf2 = (uint8_t *)tmp_buf2_16;
-#endif  // CONFIG_OBMC
   }
-#if CONFIG_OBMC
-  obmc_tmp_buf1[0] = tmp_buf1;
-  obmc_tmp_buf1[1] = tmp_buf1 + 4096;
-  obmc_tmp_buf1[2] = tmp_buf1 + 8192;
-  obmc_tmp_buf2[0] = tmp_buf2;
-  obmc_tmp_buf2[1] = tmp_buf2 + 4096;
-  obmc_tmp_buf2[2] = tmp_buf2 + 8192;
-#endif  // CONFIG_OBMC
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
   if (is_comp_pred) {
@@ -6419,11 +6375,6 @@
   if (is_comp_pred)
     intpel_mv &= !mv_has_subpel(&mbmi->mv[1].as_mv);
 
-  // Search for best switchable filter by checking the variance of
-  // pred error irrespective of whether the filter will be used
-  for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; ++i)
-    filter_cache[i] = INT64_MAX;
-
   best_filter = predict_interp_filter(cpi, x, bsize, mi_row, mi_col,
                                       single_filter);
   if (cm->interp_filter != BILINEAR && best_filter == SWITCHABLE) {
@@ -6436,15 +6387,6 @@
       int64_t rs_rd;
       int tmp_skip_sb = 0;
       int64_t tmp_skip_sse = INT64_MAX;
-#if CONFIG_OBMC
-      int obmc_flag = 0;
-      int tmp_skip_sb_obmc = 0;
-      int64_t tmp_skip_sse_obmc = INT64_MAX;
-      int64_t rdobmc = INT64_MAX;
-      uint8_t *obmc_tmp_buf[3];
-      uint8_t tmp_skip_txfm[MAX_MB_PLANE << 2] = {0};
-      int64_t tmp_bsse[MAX_MB_PLANE << 2] = {0};
-#endif  // CONFIG_OBMC
 
       mbmi->interp_filter = i;
       rs = vp10_get_switchable_rate(cpi, xd);
@@ -6452,26 +6394,12 @@
 
       if (i > 0 && intpel_mv && IsInterpolatingFilter(i)) {
         rd = RDCOST(x->rdmult, x->rddiv, tmp_rate_sum, tmp_dist_sum);
-        filter_cache[i] = rd;
-        filter_cache[SWITCHABLE_FILTERS] =
-            VPXMIN(filter_cache[SWITCHABLE_FILTERS], rd + rs_rd);
         if (cm->interp_filter == SWITCHABLE)
           rd += rs_rd;
-#if CONFIG_OBMC
-        if (allow_obmc) {
-          obmc_flag = best_obmc_flag;
-          rd += RDCOST(x->rdmult, x->rddiv,
-                       cpi->obmc_cost[bsize][obmc_flag], 0);
-        }
-#endif  // CONFIG_OBMC
-        *mask_filter = VPXMAX(*mask_filter, rd);
       } else {
         int rate_sum = 0;
         int64_t dist_sum = 0;
-#if CONFIG_OBMC
-        int rate_sum_obmc = 0;
-        int64_t dist_sum_obmc = 0;
-#endif  // CONFIG_OBMC
+
         if (i > 0 && cpi->sf.adaptive_interp_filter_search &&
             (cpi->sf.interp_filter_search_mask & (1 << i))) {
           rate_sum = INT_MAX;
@@ -6488,18 +6416,10 @@
              (cm->interp_filter == mbmi->interp_filter ||
               (i == 0 && intpel_mv && IsInterpolatingFilter(i))))) {
           restore_dst_buf(xd, orig_dst, orig_dst_stride);
-#if CONFIG_OBMC
-          for (j = 0; j < MAX_MB_PLANE; j++) {
-            obmc_tmp_buf[j] = obmc_tmp_buf1[j];
-          }
-#endif  // CONFIG_OBMC
         } else {
           for (j = 0; j < MAX_MB_PLANE; j++) {
             xd->plane[j].dst.buf = tmp_buf + j * 64 * 64;
             xd->plane[j].dst.stride = 64;
-#if CONFIG_OBMC
-            obmc_tmp_buf[j] = obmc_tmp_buf2[j];
-#endif  // CONFIG_OBMC
           }
         }
         vp10_build_inter_predictors_sb(xd, mi_row, mi_col, bsize);
@@ -6507,40 +6427,8 @@
                         &tmp_skip_sb, &tmp_skip_sse);
 
         rd = RDCOST(x->rdmult, x->rddiv, rate_sum, dist_sum);
-#if CONFIG_OBMC
-        if (allow_obmc) {
-          rd += RDCOST(x->rdmult, x->rddiv, cpi->obmc_cost[bsize][0], 0);
-          memcpy(tmp_skip_txfm, x->skip_txfm, sizeof(tmp_skip_txfm));
-          memcpy(tmp_bsse, x->bsse, sizeof(tmp_bsse));
-
-          vp10_build_obmc_inter_prediction(cm, xd, mi_row, mi_col, 1,
-                                           obmc_tmp_buf, obmc_tmp_stride,
-                                           dst_buf1, dst_stride1,
-                                           dst_buf2, dst_stride2);
-          for (j = 0; j < MAX_MB_PLANE; ++j) {
-            xd->plane[j].dst.buf = obmc_tmp_buf[j];
-            xd->plane[j].dst.stride = obmc_tmp_stride[j];
-          }
-          model_rd_for_sb(cpi, bsize, x, xd, &rate_sum_obmc, &dist_sum_obmc,
-                          &tmp_skip_sb_obmc, &tmp_skip_sse_obmc);
-          rdobmc = RDCOST(x->rdmult, x->rddiv,
-                          rate_sum_obmc + cpi->obmc_cost[bsize][1],
-                          dist_sum_obmc);
-
-          if ((double)rdobmc <= 0.99 * (double)rd) {
-            obmc_flag = 1;
-            rd = rdobmc;
-            rate_sum = rate_sum_obmc;
-            dist_sum = dist_sum_obmc;
-          }
-        }
-#endif  // CONFIG_OBMC
-        filter_cache[i] = rd;
-        filter_cache[SWITCHABLE_FILTERS] =
-            VPXMIN(filter_cache[SWITCHABLE_FILTERS], rd + rs_rd);
         if (cm->interp_filter == SWITCHABLE)
           rd += rs_rd;
-        *mask_filter = VPXMAX(*mask_filter, rd);
 
         if (i == 0 && intpel_mv && IsInterpolatingFilter(i)) {
           tmp_rate_sum = rate_sum;
@@ -6557,10 +6445,6 @@
       newbest = i == 0 || rd < best_rd;
 
       if (newbest) {
-#if CONFIG_OBMC
-        if (allow_obmc)
-          best_obmc_flag = obmc_flag;
-#endif  // CONFIG_OBMC
         best_rd = rd;
         best_filter = mbmi->interp_filter;
         if (cm->interp_filter == SWITCHABLE && i &&
@@ -6574,31 +6458,6 @@
         pred_exists = 1;
         tmp_rd = best_rd;
 
-#if CONFIG_OBMC
-        if (allow_obmc) {
-          skip_txfm_sb_bestfilter[0] = tmp_skip_sb;
-          skip_sse_sb_bestfilter[0] = tmp_skip_sse;
-          memcpy(skip_txfm_bestfilter[0], tmp_skip_txfm, sizeof(skip_txfm));
-          memcpy(bsse_bestfilter[0], tmp_bsse, sizeof(bsse));
-
-          skip_txfm_sb_bestfilter[1] = tmp_skip_sb_obmc;
-          skip_sse_sb_bestfilter[1] = tmp_skip_sse_obmc;
-          memcpy(skip_txfm_bestfilter[1], x->skip_txfm, sizeof(skip_txfm));
-          memcpy(bsse_bestfilter[1], x->bsse, sizeof(bsse));
-          if (best_obmc_flag) {
-            tmp_skip_sb = tmp_skip_sb_obmc;
-            tmp_skip_sse = tmp_skip_sse_obmc;
-          } else {
-            memcpy(x->skip_txfm, tmp_skip_txfm, sizeof(tmp_skip_txfm));
-            memcpy(x->bsse, tmp_bsse, sizeof(tmp_bsse));
-          }
-        } else {
-          skip_txfm_sb_bestfilter[0] = tmp_skip_sb;
-          skip_sse_sb_bestfilter[0] = tmp_skip_sse;
-          memcpy(skip_txfm_bestfilter[0], x->skip_txfm, sizeof(skip_txfm));
-          memcpy(bsse_bestfilter[0], x->bsse, sizeof(bsse));
-        }
-#endif  // CONFIG_OBMC
         skip_txfm_sb = tmp_skip_sb;
         skip_sse_sb = tmp_skip_sse;
         memcpy(skip_txfm, x->skip_txfm, sizeof(skip_txfm));
@@ -6923,15 +6782,7 @@
 #endif  // CONFIG_EXT_INTERP
 #endif  // CONFIG_EXT_INTER
 
-#if CONFIG_OBMC
-  if (allow_obmc)
-    mbmi->obmc = best_obmc_flag;
-  else
-    mbmi->obmc = 0;
-#endif  // CONFIG_OBMC
-
   if (pred_exists) {
-#if !CONFIG_OBMC
     if (best_needs_copy) {
       // again temporarily set the buffers to local memory to prevent a memcpy
       for (i = 0; i < MAX_MB_PLANE; i++) {
@@ -6939,20 +6790,11 @@
         xd->plane[i].dst.stride = 64;
       }
     }
-#endif  // !CONFIG_OBMC
-    rd = tmp_rd + RDCOST(x->rdmult, x->rddiv, rs, 0);
-#if CONFIG_OBMC
-    if (allow_obmc)
-      rd += RDCOST(x->rdmult, x->rddiv,
-                   cpi->obmc_cost[bsize][mbmi->obmc], 0);
-#endif  // CONFIG_OBMC
+    rd = tmp_rd;
   } else {
     int tmp_rate;
     int64_t tmp_dist;
-#if CONFIG_OBMC
-    int64_t rdobmc = INT64_MAX;
-    restore_dst_buf(xd, orig_dst, orig_dst_stride);
-#endif  // CONFIG_OBMC
+
     // Handles the special case when a filter that is not in the
     // switchable list (ex. bilinear) is indicated at the frame level, or
     // skip condition holds.
@@ -6960,34 +6802,6 @@
     model_rd_for_sb(cpi, bsize, x, xd, &tmp_rate, &tmp_dist,
                     &skip_txfm_sb, &skip_sse_sb);
     rd = RDCOST(x->rdmult, x->rddiv, rs + tmp_rate, tmp_dist);
-#if CONFIG_OBMC
-    skip_txfm_sb_bestfilter[0] = skip_txfm_sb;
-    skip_sse_sb_bestfilter[0] = skip_sse_sb;
-    memcpy(skip_txfm_bestfilter[0], x->skip_txfm, sizeof(skip_txfm));
-    memcpy(bsse_bestfilter[0], x->bsse, sizeof(bsse));
-    if (allow_obmc) {
-      rd += RDCOST(x->rdmult, x->rddiv, cpi->obmc_cost[bsize][0], 0);
-      vp10_build_obmc_inter_prediction(cm, xd, mi_row, mi_col, 1,
-                                       obmc_tmp_buf1, obmc_tmp_stride,
-                                       dst_buf1, dst_stride1,
-                                       dst_buf2, dst_stride2);
-      for (i = 0; i < MAX_MB_PLANE; ++i) {
-        xd->plane[i].dst.buf = obmc_tmp_buf1[i];
-        xd->plane[i].dst.stride = obmc_tmp_stride[i];
-      }
-      model_rd_for_sb(cpi, bsize, x, xd, &tmp_rate, &tmp_dist,
-                      &skip_txfm_sb, &skip_sse_sb);
-      rdobmc = RDCOST(x->rdmult, x->rddiv,
-                      rs + tmp_rate + cpi->obmc_cost[bsize][1], tmp_dist);
-
-      skip_txfm_sb_bestfilter[1] = skip_txfm_sb;
-      skip_sse_sb_bestfilter[1] = skip_sse_sb;
-      memcpy(skip_txfm_bestfilter[1], x->skip_txfm, sizeof(skip_txfm));
-      memcpy(bsse_bestfilter[1], x->bsse, sizeof(bsse));
-      if ((double)rdobmc <= 0.99 * (double)rd)
-        rd = rdobmc;
-    }
-#endif  // CONFIG_OBMC
     memcpy(skip_txfm, x->skip_txfm, sizeof(skip_txfm));
     memcpy(bsse, x->bsse, sizeof(bsse));
   }
@@ -7078,40 +6892,16 @@
 #if CONFIG_OBMC
   best_rd = INT64_MAX;
   for (mbmi->obmc = 0; mbmi->obmc <= allow_obmc; mbmi->obmc++) {
-    int64_t tmp_rd;
+    int64_t tmp_rd, tmp_dist;
+    int tmp_rate;
 
-    if (pred_exists) {
-      if (best_needs_copy) {
-        if (mbmi->obmc) {
-          for (i = 0; i < MAX_MB_PLANE; i++) {
-            xd->plane[i].dst.buf = obmc_tmp_buf2[i];
-            xd->plane[i].dst.stride = 64;
-          }
-        } else {
-          for (i = 0; i < MAX_MB_PLANE; i++) {
-            xd->plane[i].dst.buf = tmp_buf + i * 64 * 64;
-            xd->plane[i].dst.stride = 64;
-          }
-        }
-      } else {
-        if (mbmi->obmc) {
-          for (i = 0; i < MAX_MB_PLANE; i++) {
-            xd->plane[i].dst.buf = obmc_tmp_buf1[i];
-            xd->plane[i].dst.stride = 64;
-          }
-        } else {
-          restore_dst_buf(xd, orig_dst, orig_dst_stride);
-        }
-      }
-    } else {
-      if (mbmi->obmc) {
-        for (i = 0; i < MAX_MB_PLANE; i++) {
-          xd->plane[i].dst.buf = obmc_tmp_buf1[i];
-          xd->plane[i].dst.stride = 64;
-        }
-      } else {
-        restore_dst_buf(xd, orig_dst, orig_dst_stride);
-      }
+    if (mbmi->obmc) {
+      vp10_build_obmc_inter_prediction(cm, xd, mi_row, mi_col, 0,
+                                       NULL, NULL,
+                                       dst_buf1, dst_stride1,
+                                       dst_buf2, dst_stride2);
+      model_rd_for_sb(cpi, bsize, x, xd, &tmp_rate, &tmp_dist,
+                      &skip_txfm_sb, &skip_sse_sb);
     }
 #if CONFIG_VP9_HIGHBITDEPTH
     if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
@@ -7126,11 +6916,6 @@
         vp10_get_sby_perpixel_variance(cpi, &xd->plane[0].dst, bsize);
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
-    skip_txfm_sb = skip_txfm_sb_bestfilter[mbmi->obmc];
-    skip_sse_sb = skip_sse_sb_bestfilter[mbmi->obmc];
-    memcpy(x->skip_txfm, skip_txfm_bestfilter[mbmi->obmc],
-           sizeof(skip_txfm));
-    memcpy(x->bsse, bsse_bestfilter[mbmi->obmc], sizeof(bsse));
     x->skip = 0;
 
     *rate2 = rate2_nocoeff;
@@ -7231,10 +7016,7 @@
 
     // The cost of skip bit needs to be added.
 #if CONFIG_OBMC
-    mbmi->skip = xd->lossless[mbmi->segment_id] ? 0 : 1;
-    if (xd->lossless[mbmi->segment_id])
-      *rate2 += vp10_cost_bit(vp10_get_skip_prob(cm, xd), 0);
-    else
+    mbmi->skip = 0;
 #endif  // CONFIG_OBMC
     *rate2 += vp10_cost_bit(vp10_get_skip_prob(cm, xd), 1);
 
@@ -7590,8 +7372,6 @@
   int64_t best_rd = best_rd_so_far;
   int64_t best_pred_diff[REFERENCE_MODES];
   int64_t best_pred_rd[REFERENCE_MODES];
-  int64_t best_filter_rd[SWITCHABLE_FILTER_CONTEXTS];
-  int64_t best_filter_diff[SWITCHABLE_FILTER_CONTEXTS];
   MB_MODE_INFO best_mbmode;
   int best_mode_skippable = 0;
   int midx, best_mode_index = -1;
@@ -7629,8 +7409,6 @@
   int64_t mode_threshold[MAX_MODES];
   int *mode_map = tile_data->mode_map[bsize];
   const int mode_search_skip_flags = sf->mode_search_skip_flags;
-  int64_t mask_filter = 0;
-  int64_t filter_cache[SWITCHABLE_FILTER_CONTEXTS];
   const TX_SIZE max_tx_size = max_txsize_lookup[bsize];
   int palette_ctx = 0;
   const int rows = 4 * num_4x4_blocks_high_lookup[bsize];
@@ -7686,16 +7464,11 @@
          sizeof(directional_mode_skip_mask[0]) * INTRA_MODES);
 #endif  // CONFIG_EXT_INTRA
 
-  for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; ++i)
-    filter_cache[i] = INT64_MAX;
-
   estimate_ref_frame_costs(cm, xd, segment_id, ref_costs_single, ref_costs_comp,
                            &comp_mode_p);
 
   for (i = 0; i < REFERENCE_MODES; ++i)
     best_pred_rd[i] = INT64_MAX;
-  for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++)
-    best_filter_rd[i] = INT64_MAX;
   for (i = 0; i < TX_SIZES; i++)
     rate_uv_intra[i] = INT_MAX;
   for (i = 0; i < MAX_REF_FRAMES; ++i)
@@ -8280,8 +8053,7 @@
 #endif  // CONFIG_EXT_INTER
                                   single_inter_filter,
                                   single_skippable,
-                                  &total_sse, best_rd,
-                                  &mask_filter, filter_cache);
+                                  &total_sse, best_rd);
 
 #if CONFIG_REF_MV
       // TODO(jingning): This needs some refactoring to improve code quality
@@ -8329,12 +8101,10 @@
           clamp_mv2(&cur_mv.as_mv, xd);
 
           if (!mv_check_bounds(x, &cur_mv.as_mv)) {
-            int64_t dummy_filter_cache[SWITCHABLE_FILTER_CONTEXTS];
             INTERP_FILTER dummy_single_inter_filter[MB_MODE_COUNT]
                                                    [MAX_REF_FRAMES];
             int dummy_single_skippable[MB_MODE_COUNT][MAX_REF_FRAMES];
             int dummy_disable_skip = 0;
-            int64_t dummy_mask_filter = 0;
 #if CONFIG_EXT_INTER
             int_mv dummy_single_newmvs[2][MAX_REF_FRAMES] =
                                           { { { 0 } },  { { 0 } } };
@@ -8367,9 +8137,7 @@
 #endif
                                            dummy_single_inter_filter,
                                            dummy_single_skippable,
-                                           &tmp_sse, best_rd,
-                                           &dummy_mask_filter,
-                                           dummy_filter_cache);
+                                           &tmp_sse, best_rd);
           }
 
           tmp_rate += cpi->drl_mode_cost0[drl0_ctx][1];
@@ -8511,8 +8279,6 @@
     if (!disable_skip && ref_frame == INTRA_FRAME) {
       for (i = 0; i < REFERENCE_MODES; ++i)
         best_pred_rd[i] = VPXMIN(best_pred_rd[i], this_rd);
-      for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++)
-        best_filter_rd[i] = VPXMIN(best_filter_rd[i], this_rd);
     }
 
     // Did this mode help.. i.e. is it the new best mode
@@ -8532,11 +8298,12 @@
 
         rd_cost->rate = rate2;
 #if CONFIG_SUPERTX
-        *returnrate_nocoef = rate2 - rate_y - rate_uv;
-        if (!disable_skip) {
-          *returnrate_nocoef -= vp10_cost_bit(vp10_get_skip_prob(cm, xd),
-                                              skippable || this_skip2);
-        }
+        if (x->skip && rate_y == INT_MAX)
+          *returnrate_nocoef = rate2;
+        else
+          *returnrate_nocoef = rate2 - rate_y - rate_uv;
+        *returnrate_nocoef -= vp10_cost_bit(vp10_get_skip_prob(cm, xd),
+            disable_skip || skippable || this_skip2);
         *returnrate_nocoef -= vp10_cost_bit(vp10_get_intra_inter_prob(cm, xd),
                                             mbmi->ref_frame[0] != INTRA_FRAME);
 #if CONFIG_OBMC
@@ -8611,29 +8378,6 @@
       }
       if (hybrid_rd < best_pred_rd[REFERENCE_MODE_SELECT])
         best_pred_rd[REFERENCE_MODE_SELECT] = hybrid_rd;
-
-      /* keep record of best filter type */
-      if (!mode_excluded && cm->interp_filter != BILINEAR) {
-        int64_t ref = filter_cache[cm->interp_filter == SWITCHABLE ?
-                              SWITCHABLE_FILTERS : cm->interp_filter];
-
-        for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++) {
-          int64_t adj_rd;
-          if (ref == INT64_MAX)
-            adj_rd = 0;
-          else if (filter_cache[i] == INT64_MAX)
-            // when early termination is triggered, the encoder does not have
-            // access to the rate-distortion cost. it only knows that the cost
-            // should be above the maximum valid value. hence it takes the known
-            // maximum plus an arbitrary constant as the rate-distortion cost.
-            adj_rd = mask_filter - ref + 10;
-          else
-            adj_rd = filter_cache[i] - ref;
-
-          adj_rd += this_rd;
-          best_filter_rd[i] = VPXMIN(best_filter_rd[i], adj_rd);
-        }
-      }
     }
 
     if (early_term)
@@ -8928,21 +8672,6 @@
       best_pred_diff[i] = best_rd - best_pred_rd[i];
   }
 
-  if (!x->skip) {
-    for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++) {
-      if (best_filter_rd[i] == INT64_MAX)
-        best_filter_diff[i] = 0;
-      else
-        best_filter_diff[i] = best_rd - best_filter_rd[i];
-    }
-    if (cm->interp_filter == SWITCHABLE)
-      assert(best_filter_diff[SWITCHABLE_FILTERS] == 0);
-  } else {
-    vp10_zero(best_filter_diff);
-  }
-
-  // TODO(yunqingwang): Moving this line in front of the above best_filter_diff
-  // updating code causes PSNR loss. Need to figure out the confliction.
   x->skip |= best_mode_skippable;
 
   if (!x->skip && !x->select_tx_size) {
@@ -8966,7 +8695,7 @@
   assert(best_mode_index >= 0);
 
   store_coding_context(x, ctx, best_mode_index, best_pred_diff,
-                       best_filter_diff, best_mode_skippable);
+                       best_mode_skippable);
 
   if (cm->allow_screen_content_tools && pmi->palette_size[1] > 0) {
     restore_uv_color_map(cpi, x);
@@ -8987,7 +8716,6 @@
   const int comp_pred = 0;
   int i;
   int64_t best_pred_diff[REFERENCE_MODES];
-  int64_t best_filter_diff[SWITCHABLE_FILTER_CONTEXTS];
   unsigned int ref_costs_single[MAX_REF_FRAMES], ref_costs_comp[MAX_REF_FRAMES];
   vpx_prob comp_mode_p;
   INTERP_FILTER best_filter = SWITCHABLE;
@@ -9072,12 +8800,11 @@
                             cpi->sf.adaptive_rd_thresh, bsize, THR_ZEROMV);
 
   vp10_zero(best_pred_diff);
-  vp10_zero(best_filter_diff);
 
   if (!x->select_tx_size)
     swap_block_ptr(x, ctx, 1, 0, 0, MAX_MB_PLANE);
   store_coding_context(x, ctx, THR_ZEROMV,
-                       best_pred_diff, best_filter_diff, 0);
+                       best_pred_diff, 0);
 }
 
 void vp10_rd_pick_inter_mode_sub8x8(struct VP10_COMP *cpi,
@@ -9117,8 +8844,6 @@
   int64_t best_yrd = best_rd_so_far;  // FIXME(rbultje) more precise
   int64_t best_pred_diff[REFERENCE_MODES];
   int64_t best_pred_rd[REFERENCE_MODES];
-  int64_t best_filter_rd[SWITCHABLE_FILTER_CONTEXTS];
-  int64_t best_filter_diff[SWITCHABLE_FILTER_CONTEXTS];
   MB_MODE_INFO best_mbmode;
   int ref_index, best_ref_index = 0;
   unsigned int ref_costs_single[MAX_REF_FRAMES], ref_costs_comp[MAX_REF_FRAMES];
@@ -9138,8 +8863,6 @@
   b_mode_info best_bmodes[4];
   int best_skip2 = 0;
   int ref_frame_skip_mask[2] = { 0 };
-  int64_t mask_filter = 0;
-  int64_t filter_cache[SWITCHABLE_FILTER_CONTEXTS];
   int internal_active_edge =
     vp10_active_edge_sb(cpi, mi_row, mi_col) && vp10_internal_image_edge(cpi);
 
@@ -9163,9 +8886,6 @@
   mbmi->use_wedge_interintra = 0;
 #endif  // CONFIG_EXT_INTER
 
-  for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; ++i)
-    filter_cache[i] = INT64_MAX;
-
   for (i = 0; i < 4; i++) {
     int j;
 #if CONFIG_EXT_INTER
@@ -9185,8 +8905,6 @@
 
   for (i = 0; i < REFERENCE_MODES; ++i)
     best_pred_rd[i] = INT64_MAX;
-  for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++)
-    best_filter_rd[i] = INT64_MAX;
   rate_uv_intra = INT_MAX;
 
   rd_cost->rate = INT_MAX;
@@ -9444,8 +9162,6 @@
 #endif  // CONFIG_EXT_REFS
       this_rd_thresh = (ref_frame == GOLDEN_FRAME) ?
           rd_opt->threshes[segment_id][bsize][THR_GOLD] : this_rd_thresh;
-      for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; ++i)
-        filter_cache[i] = INT64_MAX;
 
       // TODO(any): Add search of the tx_type to improve rd performance at the
       // expense of speed.
@@ -9489,14 +9205,9 @@
               continue;
             rs = vp10_get_switchable_rate(cpi, xd);
             rs_rd = RDCOST(x->rdmult, x->rddiv, rs, 0);
-            filter_cache[switchable_filter_index] = tmp_rd;
-            filter_cache[SWITCHABLE_FILTERS] =
-                VPXMIN(filter_cache[SWITCHABLE_FILTERS], tmp_rd + rs_rd);
             if (cm->interp_filter == SWITCHABLE)
               tmp_rd += rs_rd;
 
-            mask_filter = VPXMAX(mask_filter, tmp_rd);
-
             newbest = (tmp_rd < tmp_best_rd);
             if (newbest) {
               tmp_best_filter = mbmi->interp_filter;
@@ -9668,8 +9379,6 @@
     if (!disable_skip && ref_frame == INTRA_FRAME) {
       for (i = 0; i < REFERENCE_MODES; ++i)
         best_pred_rd[i] = VPXMIN(best_pred_rd[i], this_rd);
-      for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++)
-        best_filter_rd[i] = VPXMIN(best_filter_rd[i], this_rd);
     }
 
     // Did this mode help.. i.e. is it the new best mode
@@ -9764,29 +9473,6 @@
         best_pred_rd[REFERENCE_MODE_SELECT] = hybrid_rd;
     }
 
-    /* keep record of best filter type */
-    if (!mode_excluded && !disable_skip && ref_frame != INTRA_FRAME &&
-        cm->interp_filter != BILINEAR) {
-      int64_t ref = filter_cache[cm->interp_filter == SWITCHABLE ?
-                              SWITCHABLE_FILTERS : cm->interp_filter];
-      int64_t adj_rd;
-      for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++) {
-        if (ref == INT64_MAX)
-          adj_rd = 0;
-        else if (filter_cache[i] == INT64_MAX)
-          // when early termination is triggered, the encoder does not have
-          // access to the rate-distortion cost. it only knows that the cost
-          // should be above the maximum valid value. hence it takes the known
-          // maximum plus an arbitrary constant as the rate-distortion cost.
-          adj_rd = mask_filter - ref + 10;
-        else
-          adj_rd = filter_cache[i] - ref;
-
-        adj_rd += this_rd;
-        best_filter_rd[i] = VPXMIN(best_filter_rd[i], adj_rd);
-      }
-    }
-
     if (early_term)
       break;
 
@@ -9858,21 +9544,8 @@
       best_pred_diff[i] = best_rd - best_pred_rd[i];
   }
 
-  if (!x->skip) {
-    for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++) {
-      if (best_filter_rd[i] == INT64_MAX)
-        best_filter_diff[i] = 0;
-      else
-        best_filter_diff[i] = best_rd - best_filter_rd[i];
-    }
-    if (cm->interp_filter == SWITCHABLE)
-      assert(best_filter_diff[SWITCHABLE_FILTERS] == 0);
-  } else {
-    vp10_zero(best_filter_diff);
-  }
-
   store_coding_context(x, ctx, best_ref_index,
-                       best_pred_diff, best_filter_diff, 0);
+                       best_pred_diff, 0);
 }
 
 #if CONFIG_OBMC
diff --git a/vp8/common/common.h b/vp8/common/common.h
index e58a9cc..c42e875 100644
--- a/vp8/common/common.h
+++ b/vp8/common/common.h
@@ -32,13 +32,13 @@
 /* Use this for variably-sized arrays. */
 
 #define vp8_copy_array( Dest, Src, N) { \
-        assert( sizeof( *Dest) == sizeof( *Src)); \
-        memcpy( Dest, Src, N * sizeof( *Src)); \
+        assert( sizeof( *(Dest)) == sizeof( *(Src))); \
+        memcpy( Dest, Src, N * sizeof( *(Src))); \
     }
 
-#define vp8_zero( Dest)  memset( &Dest, 0, sizeof( Dest));
+#define vp8_zero( Dest)  memset( &(Dest), 0, sizeof( Dest));
 
-#define vp8_zero_array( Dest, N)  memset( Dest, 0, N * sizeof( *Dest));
+#define vp8_zero_array( Dest, N)  memset( Dest, 0, N * sizeof( *(Dest)));
 
 
 #ifdef __cplusplus
diff --git a/vp9/common/vp9_common.h b/vp9/common/vp9_common.h
index 76e7cd4..9d5dbc6 100644
--- a/vp9/common/vp9_common.h
+++ b/vp9/common/vp9_common.h
@@ -33,12 +33,12 @@
 
 // Use this for variably-sized arrays.
 #define vp9_copy_array(dest, src, n) {       \
-    assert(sizeof(*dest) == sizeof(*src));   \
-    memcpy(dest, src, n * sizeof(*src)); \
+    assert(sizeof(*(dest)) == sizeof(*(src)));   \
+    memcpy(dest, src, n * sizeof(*(src))); \
   }
 
 #define vp9_zero(dest) memset(&(dest), 0, sizeof(dest))
-#define vp9_zero_array(dest, n) memset(dest, 0, n * sizeof(*dest))
+#define vp9_zero_array(dest, n) memset(dest, 0, n * sizeof(*(dest)))
 
 static INLINE int get_unsigned_bits(unsigned int num_values) {
   return num_values > 0 ? get_msb(num_values) + 1 : 0;