Merge "remove filter_cache" into nextgenv2
diff --git a/configure b/configure
index 97366e4..ed1d048 100755
--- a/configure
+++ b/configure
@@ -284,6 +284,7 @@
     ext_partition
     ext_tile
     obmc
+    entropy
 "
 CONFIG_LIST="
     dependency_tracking
diff --git a/vp10/common/alloccommon.c b/vp10/common/alloccommon.c
index e14aee7..b3c216e 100644
--- a/vp10/common/alloccommon.c
+++ b/vp10/common/alloccommon.c
@@ -97,10 +97,13 @@
 }
 
 void vp10_free_context_buffers(VP10_COMMON *cm) {
+  int i;
   cm->free_mi(cm);
   free_seg_map(cm);
-  vpx_free(cm->above_context);
-  cm->above_context = NULL;
+  for (i = 0; i < MAX_MB_PLANE; ++i) {  // one above-context buffer per plane
+    vpx_free(cm->above_context[i]);
+    cm->above_context[i] = NULL;  // keep a repeated free call harmless
+  }
   vpx_free(cm->above_seg_context);
   cm->above_seg_context = NULL;
 #if CONFIG_VAR_TX
@@ -128,11 +131,14 @@
   }
 
   if (cm->above_context_alloc_cols < cm->mi_cols) {
-    vpx_free(cm->above_context);
-    cm->above_context = (ENTROPY_CONTEXT *)vpx_calloc(
-        2 * mi_cols_aligned_to_sb(cm->mi_cols) * MAX_MB_PLANE,
-        sizeof(*cm->above_context));
-    if (!cm->above_context) goto fail;
+    int i;
+    for (i = 0; i < MAX_MB_PLANE; ++i) {
+      vpx_free(cm->above_context[i]);  // drop the old, too-narrow buffer
+      cm->above_context[i] = (ENTROPY_CONTEXT *)vpx_calloc(
+          2 * mi_cols_aligned_to_sb(cm->mi_cols),
+          sizeof(*cm->above_context[0]));
+      if (!cm->above_context[i]) goto fail;
+    }
 
     vpx_free(cm->above_seg_context);
     cm->above_seg_context = (PARTITION_CONTEXT *)vpx_calloc(
diff --git a/vp10/common/idct.c b/vp10/common/idct.c
index f621ec6..863f0db 100644
--- a/vp10/common/idct.c
+++ b/vp10/common/idct.c
@@ -260,6 +260,30 @@
 }
 
 #if CONFIG_EXT_TX
+static void iidtx4_c(const tran_low_t *input, tran_low_t *output) {
+  const tran_low_t *const end = input + 4;  // 4 entries: x * Sqrt2, rounded
+  for (; input != end; ++input, ++output)
+    *output = (tran_low_t)dct_const_round_shift(*input * Sqrt2);
+}
+
+static void iidtx8_c(const tran_low_t *input, tran_low_t *output) {
+  const tran_low_t *const end = input + 8;  // 8 entries, each doubled
+  for (; input != end; ++input, ++output)
+    *output = *input * 2;
+}
+
+static void iidtx16_c(const tran_low_t *input, tran_low_t *output) {
+  const tran_low_t *const end = input + 16;  // 16 entries: x * 2 * Sqrt2
+  for (; input != end; ++input, ++output)
+    *output = (tran_low_t)dct_const_round_shift(*input * 2 * Sqrt2);
+}
+
+static void iidtx32_c(const tran_low_t *input, tran_low_t *output) {
+  const tran_low_t *const end = input + 32;  // 32 entries, each times 4
+  for (; input != end; ++input, ++output)
+    *output = *input * 4;
+}
+
 // For use in lieu of DST
 static void ihalfcenter32_c(const tran_low_t *input, tran_low_t *output) {
   int i;
@@ -291,6 +315,37 @@
 }
 
 #if CONFIG_VP9_HIGHBITDEPTH
+static void highbd_iidtx4_c(const tran_low_t *input, tran_low_t *output,
+                            int bd) {
+  const tran_low_t *const end = input + 4;  // 4 entries: x * Sqrt2, rounded
+  for (; input != end; ++input, ++output)
+    *output = (tran_low_t)highbd_dct_const_round_shift(*input * Sqrt2, bd);
+}
+
+static void highbd_iidtx8_c(const tran_low_t *input, tran_low_t *output,
+                            int bd) {
+  const tran_low_t *const end = input + 8;  // 8 entries, each doubled
+  (void)bd;  // unused: this size needs no rounding helper
+  for (; input != end; ++input, ++output)
+    *output = *input * 2;
+}
+
+static void highbd_iidtx16_c(const tran_low_t *input, tran_low_t *output,
+                             int bd) {
+  int i;  // 16 entries: x * 2 * Sqrt2, rounded by the highbd helper
+  for (i = 0; i < 16; ++i)
+    output[i] = (tran_low_t)highbd_dct_const_round_shift(
+        input[i] * 2 * Sqrt2, bd);
+}
+
+static void highbd_iidtx32_c(const tran_low_t *input, tran_low_t *output,
+                             int bd) {
+  const tran_low_t *const end = input + 32;  // 32 entries, each times 4
+  (void)bd;  // unused: this size needs no rounding helper
+  for (; input != end; ++input, ++output)
+    *output = *input * 4;
+}
+
 static void highbd_ihalfcenter32_c(const tran_low_t *input, tran_low_t *output,
                                    int bd) {
   int i;
@@ -331,85 +386,19 @@
                            int bs, int tx_type) {
   int r, c;
   const int shift = bs < 32 ? 3 : 2;
-
-  tran_low_t temp_in[32], temp_out[32];
-  transform_2d ht = {idct4_c, idct4_c};
-  int out_scale = 1;
-  int coeff_stride = 0;
-
-  switch (bs) {
-    case 4:
-      ht.cols = idct4_c;
-      ht.rows = idct4_c;
-      out_scale = cospi_16_64 >> 3;
-      coeff_stride = 4;
-      break;
-    case 8:
-      ht.cols = idct8_c;
-      ht.rows = idct8_c;
-      out_scale = (1 << (DCT_CONST_BITS - 4));
-      coeff_stride = 8;
-      break;
-    case 16:
-      ht.cols = idct16_c;
-      ht.rows = idct16_c;
-      out_scale = cospi_16_64 >> 4;
-      coeff_stride = 16;
-      break;
-    case 32:
-      ht.cols = idct32_c;
-      ht.rows = idct32_c;
-      out_scale = (1 << (DCT_CONST_BITS - 4));
-      coeff_stride = 32;
-      break;
-    default:
-      assert(0);
-  }
-
-  // Columns
-  if (tx_type == V_DCT) {
-    for (c = 0; c < bs; ++c) {
-      for (r = 0; r < bs; ++r)
-        temp_in[r] = input[r * coeff_stride + c];
-      ht.cols(temp_in, temp_out);
-
-      for (r = 0; r < bs; ++r) {
-        tran_high_t temp = (tran_high_t)temp_out[r] * out_scale;
-        temp >>= DCT_CONST_BITS;
-        dest[r * stride + c] = clip_pixel_add(dest[r * stride + c],
-                                              (tran_low_t)temp);
-      }
-    }
-    return;
-  }
-
-  if (tx_type == H_DCT) {
+  if (tx_type == IDTX) {
     for (r = 0; r < bs; ++r) {
       for (c = 0; c < bs; ++c)
-        temp_in[c] = input[r * coeff_stride + c];
-      ht.rows(temp_in, temp_out);
-
-      for (c = 0; c < bs; ++c) {
-        tran_high_t temp = (tran_high_t)temp_out[c] * out_scale;
-        temp >>= DCT_CONST_BITS;
-        dest[r * stride + c] = clip_pixel_add(dest[r * stride + c],
-                                              (tran_low_t)temp);
-      }
+        dest[c] = clip_pixel_add(dest[c], input[c] >> shift);
+      dest += stride;
+      input += bs;
     }
-    return;
-  }
-
-  for (r = 0; r < bs; ++r) {
-    for (c = 0; c < bs; ++c)
-      dest[c] = clip_pixel_add(dest[c], input[c] >> shift);
-    dest += stride;
-    input += bs;
   }
 }
 
 #define FLIPUD_PTR(dest, stride, size) do {     \
-    (dest) = (dest) + ((size) - 1) * (stride);  \
-    (stride) = - (stride);                      \
+  (dest) = (dest) + ((size) - 1) * (stride);  \
+  (stride) = - (stride);                      \
 } while (0)
 
 static void maybe_flip_strides(uint8_t **dst, int *dstride,
@@ -428,6 +417,7 @@
     case DST_DCT:
     case DST_ADST:
     case ADST_DST:
+    case IDTX:
     case V_DCT:
     case H_DCT:
       break;
@@ -705,78 +695,13 @@
   const int shift = bs < 32 ? 3 : 2;
   uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
 
-  tran_low_t temp_in[32], temp_out[32];
-  highbd_transform_2d ht = {vpx_highbd_idct4_c, vpx_highbd_idct4_c};
-  int out_scale = 1;
-  int coeff_stride = 0;
-
-  switch (bs) {
-    case 4:
-      ht.cols = vpx_highbd_idct4_c;
-      ht.rows = vpx_highbd_idct4_c;
-      out_scale = cospi_16_64 >> 3;
-      coeff_stride = 4;
-      break;
-    case 8:
-      ht.cols = vpx_highbd_idct8_c;
-      ht.rows = vpx_highbd_idct8_c;
-      out_scale = (1 << (DCT_CONST_BITS - 4));
-      coeff_stride = 8;
-      break;
-    case 16:
-      ht.cols = vpx_highbd_idct16_c;
-      ht.rows = vpx_highbd_idct16_c;
-      out_scale = cospi_16_64 >> 4;
-      coeff_stride = 16;
-      break;
-    case 32:
-      ht.cols = vpx_highbd_idct32_c;
-      ht.rows = vpx_highbd_idct32_c;
-      out_scale = (1 << (DCT_CONST_BITS - 4));
-      coeff_stride = 32;
-      break;
-    default:
-      assert(0);
-  }
-
-  // Columns
-  if (tx_type == V_DCT) {
-    for (c = 0; c < bs; ++c) {
-      for (r = 0; r < bs; ++r)
-        temp_in[r] = input[r * coeff_stride + c];
-      ht.cols(temp_in, temp_out, bd);
-
-      for (r = 0; r < bs; ++r) {
-        tran_high_t temp = (tran_high_t)temp_out[r] * out_scale;
-        temp >>= DCT_CONST_BITS;
-        dest[r * stride + c] = highbd_clip_pixel_add(dest[r * stride + c],
-                                                     (tran_low_t)temp, bd);
-      }
-    }
-    return;
-  }
-
-  if (tx_type == H_DCT) {
+  if (tx_type == IDTX) {
     for (r = 0; r < bs; ++r) {
       for (c = 0; c < bs; ++c)
-        temp_in[c] = input[r * coeff_stride + c];
-      ht.rows(temp_in, temp_out, bd);
-
-      for (c = 0; c < bs; ++c) {
-        tran_high_t temp = (tran_high_t)temp_out[c] * out_scale;
-        temp >>= DCT_CONST_BITS;
-        dest[r * stride + c] = highbd_clip_pixel_add(dest[r * stride + c],
-                                                     (tran_low_t)temp, bd);
-      }
+        dest[c] = highbd_clip_pixel_add(dest[c], input[c] >> shift, bd);
+      dest += stride;
+      input += bs;
     }
-    return;
-  }
-
-  for (r = 0; r < bs; ++r) {
-    for (c = 0; c < bs; ++c)
-      dest[c] = highbd_clip_pixel_add(dest[c], input[c] >> shift, bd);
-    dest += stride;
-    input += bs;
   }
 }
 
@@ -796,6 +721,9 @@
     case DST_DCT:
     case DST_ADST:
     case ADST_DST:
+    case IDTX:
+    case V_DCT:
+    case H_DCT:
       break;
     case FLIPADST_DCT:
     case FLIPADST_ADST:
@@ -843,6 +771,9 @@
     { idst4_c,  iadst4_c },  // DST_FLIPADST      = 13,
     { iadst4_c, idst4_c  },  // FLIPADST_DST      = 14,
     { idst4_c,  idst4_c  },  // DST_DST           = 15
+    { iidtx4_c, iidtx4_c },  // IDTX              = 16
+    { idct4_c,  iidtx4_c },  // V_DCT             = 17
+    { iidtx4_c, idct4_c  },  // H_DCT             = 18
 #endif  // CONFIG_EXT_TX
   };
 
@@ -906,6 +837,9 @@
     { idst8_c,  iadst8_c },  // DST_FLIPADST      = 13,
     { iadst8_c, idst8_c  },  // FLIPADST_DST      = 14,
     { idst8_c,  idst8_c  },  // DST_DST           = 15
+    { iidtx8_c, iidtx8_c },  // IDTX              = 16
+    { idct8_c,  iidtx8_c },  // V_DCT             = 17
+    { iidtx8_c, idct8_c  },  // H_DCT             = 18
 #endif  // CONFIG_EXT_TX
   };
 
@@ -969,6 +903,9 @@
     { idst16_c,  iadst16_c },  // DST_FLIPADST      = 13,
     { iadst16_c, idst16_c  },  // FLIPADST_DST      = 14,
     { idst16_c,  idst16_c  },  // DST_DST           = 15
+    { iidtx16_c, iidtx16_c },  // IDTX              = 16
+    { idct16_c,  iidtx16_c },  // V_DCT             = 17
+    { iidtx16_c, idct16_c  },  // H_DCT             = 18
 #endif  // CONFIG_EXT_TX
   };
 
@@ -1032,6 +969,9 @@
     { ihalfcenter32_c,  ihalfright32_c },    // DST_FLIPADST      = 13,
     { ihalfright32_c, ihalfcenter32_c  },    // FLIPADST_DST      = 14,
     { ihalfcenter32_c,  ihalfcenter32_c  },  // DST_DST           = 15
+    { iidtx32_c, iidtx32_c },                // IDTX              = 16
+    { idct32_c,  iidtx32_c },                // V_DCT             = 17
+    { iidtx32_c, idct32_c  },                // H_DCT             = 18
   };
 
   int i, j;
@@ -1165,11 +1105,11 @@
     case ADST_DST:
     case FLIPADST_DST:
     case DST_FLIPADST:
+    case H_DCT:
+    case V_DCT:
       // Use C version since DST only exists in C code
       vp10_iht4x4_16_add_c(input, dest, stride, tx_type);
       break;
-    case H_DCT:
-    case V_DCT:
     case IDTX:
       inv_idtx_add_c(input, dest, stride, 4, tx_type);
       break;
@@ -1206,11 +1146,11 @@
     case ADST_DST:
     case FLIPADST_DST:
     case DST_FLIPADST:
+    case H_DCT:
+    case V_DCT:
       // Use C version since DST only exists in C code
       vp10_iht8x8_64_add_c(input, dest, stride, tx_type);
       break;
-    case H_DCT:
-    case V_DCT:
     case IDTX:
       inv_idtx_add_c(input, dest, stride, 8, tx_type);
       break;
@@ -1247,11 +1187,11 @@
     case ADST_DST:
     case FLIPADST_DST:
     case DST_FLIPADST:
+    case H_DCT:
+    case V_DCT:
       // Use C version since DST only exists in C code
       vp10_iht16x16_256_add_c(input, dest, stride, tx_type);
       break;
-    case H_DCT:
-    case V_DCT:
     case IDTX:
       inv_idtx_add_c(input, dest, stride, 16, tx_type);
       break;
@@ -1284,10 +1224,10 @@
     case ADST_DST:
     case FLIPADST_DST:
     case DST_FLIPADST:
-      vp10_iht32x32_1024_add_c(input, dest, stride, tx_type);
-      break;
     case H_DCT:
     case V_DCT:
+      vp10_iht32x32_1024_add_c(input, dest, stride, tx_type);
+      break;
     case IDTX:
       inv_idtx_add_c(input, dest, stride, 32, tx_type);
       break;
@@ -1319,6 +1259,9 @@
     {     highbd_idst4_c,  vpx_highbd_iadst4_c },  // DST_FLIPADST      = 13,
     { vpx_highbd_iadst4_c,     highbd_idst4_c  },  // FLIPADST_DST      = 14,
     {     highbd_idst4_c,      highbd_idst4_c  },  // DST_DST           = 15
+    {     highbd_iidtx4_c,     highbd_iidtx4_c },  // IDTX              = 16
+    { vpx_highbd_idct4_c,      highbd_iidtx4_c },  // V_DCT             = 17
+    {     highbd_iidtx4_c, vpx_highbd_idct4_c  },  // H_DCT             = 18
 #endif  // CONFIG_EXT_TX
   };
 
@@ -1385,6 +1328,9 @@
     {     highbd_idst8_c,  vpx_highbd_iadst8_c },  // DST_FLIPADST      = 13,
     { vpx_highbd_iadst8_c,     highbd_idst8_c  },  // FLIPADST_DST      = 14,
     {     highbd_idst8_c,      highbd_idst8_c  },  // DST_DST           = 15
+    {     highbd_iidtx8_c,     highbd_iidtx8_c },  // IDTX              = 16
+    { vpx_highbd_idct8_c,      highbd_iidtx8_c },  // V_DCT             = 17
+    {     highbd_iidtx8_c, vpx_highbd_idct8_c  },  // H_DCT             = 18
 #endif  // CONFIG_EXT_TX
   };
 
@@ -1451,6 +1397,9 @@
     {     highbd_idst16_c,  vpx_highbd_iadst16_c },  // DST_FLIPADST      = 13,
     { vpx_highbd_iadst16_c,     highbd_idst16_c  },  // FLIPADST_DST      = 14,
     {     highbd_idst16_c,      highbd_idst16_c  },  // DST_DST           = 15
+    {     highbd_iidtx16_c,     highbd_iidtx16_c },  // IDTX              = 16
+    { vpx_highbd_idct16_c,      highbd_iidtx16_c },  // V_DCT             = 17
+    {     highbd_iidtx16_c, vpx_highbd_idct16_c  },  // H_DCT             = 18
 #endif  // CONFIG_EXT_TX
   };
 
@@ -1501,22 +1450,25 @@
 void vp10_highbd_iht32x32_1024_add_c(const tran_low_t *input, uint8_t *dest8,
                                      int stride, int tx_type, int bd) {
   static const highbd_transform_2d HIGH_IHT_32[] = {
-    { vpx_highbd_idct32_c, vpx_highbd_idct32_c  },        // DCT_DCT
-    { highbd_ihalfright32_c, vpx_highbd_idct32_c  },      // ADST_DCT
-    { vpx_highbd_idct32_c, highbd_ihalfright32_c },       // DCT_ADST
-    { highbd_ihalfright32_c, highbd_ihalfright32_c },     // ADST_ADST
-    { highbd_ihalfright32_c, vpx_highbd_idct32_c  },      // FLIPADST_DCT
-    { vpx_highbd_idct32_c, highbd_ihalfright32_c },       // DCT_FLIPADST
-    { highbd_ihalfright32_c, highbd_ihalfright32_c },     // FLIPADST_FLIPADST
-    { highbd_ihalfright32_c, highbd_ihalfright32_c },     // ADST_FLIPADST
-    { highbd_ihalfright32_c, highbd_ihalfright32_c },     // FLIPADST_ADST
-    { highbd_ihalfcenter32_c, vpx_highbd_idct32_c  },     // DST_DCT
-    { vpx_highbd_idct32_c, highbd_ihalfcenter32_c  },     // DCT_DST
-    { highbd_ihalfcenter32_c, highbd_ihalfright32_c },    // DST_ADST
-    { highbd_ihalfright32_c, highbd_ihalfcenter32_c  },   // ADST_DST
-    { highbd_ihalfcenter32_c, highbd_ihalfright32_c },    // DST_FLIPADST
-    { highbd_ihalfright32_c, highbd_ihalfcenter32_c  },   // FLIPADST_DST
-    { highbd_ihalfcenter32_c, highbd_ihalfcenter32_c  },  // DST_DST
+    { vpx_highbd_idct32_c,    vpx_highbd_idct32_c    },  // DCT_DCT
+    { highbd_ihalfright32_c,  vpx_highbd_idct32_c    },  // ADST_DCT
+    { vpx_highbd_idct32_c,    highbd_ihalfright32_c  },  // DCT_ADST
+    { highbd_ihalfright32_c,  highbd_ihalfright32_c  },  // ADST_ADST
+    { highbd_ihalfright32_c,  vpx_highbd_idct32_c    },  // FLIPADST_DCT
+    { vpx_highbd_idct32_c,    highbd_ihalfright32_c  },  // DCT_FLIPADST
+    { highbd_ihalfright32_c,  highbd_ihalfright32_c  },  // FLIPADST_FLIPADST
+    { highbd_ihalfright32_c,  highbd_ihalfright32_c  },  // ADST_FLIPADST
+    { highbd_ihalfright32_c,  highbd_ihalfright32_c  },  // FLIPADST_ADST
+    { highbd_ihalfcenter32_c, vpx_highbd_idct32_c    },  // DST_DCT
+    { vpx_highbd_idct32_c,    highbd_ihalfcenter32_c },  // DCT_DST
+    { highbd_ihalfcenter32_c, highbd_ihalfright32_c  },  // DST_ADST
+    { highbd_ihalfright32_c,  highbd_ihalfcenter32_c },  // ADST_DST
+    { highbd_ihalfcenter32_c, highbd_ihalfright32_c  },  // DST_FLIPADST
+    { highbd_ihalfright32_c,  highbd_ihalfcenter32_c },  // FLIPADST_DST
+    { highbd_ihalfcenter32_c, highbd_ihalfcenter32_c },  // DST_DST
+    {     highbd_iidtx32_c,   highbd_iidtx32_c       },  // IDTX
+    { vpx_highbd_idct32_c,    highbd_iidtx32_c       },  // V_DCT
+    {     highbd_iidtx32_c,   vpx_highbd_idct32_c    },  // H_DCT
   };
 
   uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
@@ -1657,11 +1609,11 @@
     case ADST_DST:
     case FLIPADST_DST:
     case DST_FLIPADST:
+    case H_DCT:
+    case V_DCT:
       // Use C version since DST only exists in C code
       vp10_highbd_iht4x4_16_add_c(input, dest, stride, tx_type, bd);
       break;
-    case H_DCT:
-    case V_DCT:
     case IDTX:
       highbd_inv_idtx_add_c(input, dest, stride, 4, tx_type, bd);
       break;
@@ -1699,11 +1651,11 @@
     case ADST_DST:
     case FLIPADST_DST:
     case DST_FLIPADST:
+    case H_DCT:
+    case V_DCT:
       // Use C version since DST only exists in C code
       vp10_highbd_iht8x8_64_add_c(input, dest, stride, tx_type, bd);
       break;
-    case H_DCT:
-    case V_DCT:
     case IDTX:
       highbd_inv_idtx_add_c(input, dest, stride, 8, tx_type, bd);
       break;
@@ -1741,11 +1693,11 @@
     case ADST_DST:
     case FLIPADST_DST:
     case DST_FLIPADST:
+    case H_DCT:
+    case V_DCT:
       // Use C version since DST only exists in C code
       vp10_highbd_iht16x16_256_add_c(input, dest, stride, tx_type, bd);
       break;
-    case H_DCT:
-    case V_DCT:
     case IDTX:
       highbd_inv_idtx_add_c(input, dest, stride, 16, tx_type, bd);
       break;
@@ -1779,10 +1731,10 @@
     case ADST_DST:
     case FLIPADST_DST:
     case DST_FLIPADST:
-      vp10_highbd_iht32x32_1024_add_c(input, dest, stride, tx_type, bd);
-      break;
     case H_DCT:
     case V_DCT:
+      vp10_highbd_iht32x32_1024_add_c(input, dest, stride, tx_type, bd);
+      break;
     case IDTX:
       highbd_inv_idtx_add_c(input, dest, stride, 32, tx_type, bd);
       break;
diff --git a/vp10/common/onyxc_int.h b/vp10/common/onyxc_int.h
index bcc69f3..26ae569 100644
--- a/vp10/common/onyxc_int.h
+++ b/vp10/common/onyxc_int.h
@@ -313,7 +313,7 @@
   BufferPool *buffer_pool;
 
   PARTITION_CONTEXT *above_seg_context;
-  ENTROPY_CONTEXT *above_context;
+  ENTROPY_CONTEXT *above_context[MAX_MB_PLANE];
 #if CONFIG_VAR_TX
   TXFM_CONTEXT *above_txfm_context;
   TXFM_CONTEXT left_txfm_context[8];
@@ -405,9 +405,7 @@
 
   for (i = 0; i < MAX_MB_PLANE; ++i) {
     xd->plane[i].dqcoeff = dqcoeff;
-    xd->above_context[i] = cm->above_context +
-        i * sizeof(*cm->above_context) * 2 * mi_cols_aligned_to_sb(cm->mi_cols);
-
+    xd->above_context[i] = cm->above_context[i];
     if (xd->plane[i].plane_type == PLANE_TYPE_Y) {
       memcpy(xd->plane[i].seg_dequant, cm->y_dequant, sizeof(cm->y_dequant));
     } else {
@@ -525,6 +523,27 @@
   return (left * 2 + above) + bsl * PARTITION_PLOFFSET;
 }
 
+static INLINE void vp10_zero_above_context(VP10_COMMON *const cm,
+                                           int mi_col_start, int mi_col_end) {
+  const int width = mi_col_end - mi_col_start;  // columns in [start, end)
+  int i;
+  // Entropy contexts are indexed at 2 entries per mi column, the rest at 1.
+  for (i = 0; i < MAX_MB_PLANE; ++i)
+    vp10_zero_array(cm->above_context[i] + 2 * mi_col_start, 2 * width);
+  vp10_zero_array(cm->above_seg_context + mi_col_start, width);
+#if CONFIG_VAR_TX
+  vp10_zero_array(cm->above_txfm_context + mi_col_start, width);
+#endif  // CONFIG_VAR_TX
+}
+
+static INLINE void vp10_zero_left_context(MACROBLOCKD *const xd) {
+  vp10_zero(xd->left_context);      // reset xd's left-side contexts: entropy,
+  vp10_zero(xd->left_seg_context);  // ... the seg context,
+#if CONFIG_VAR_TX
+  vp10_zero(xd->left_txfm_context_buffer);  // ... and, with VAR_TX, txfm
+#endif
+}
+
 #if CONFIG_VAR_TX
 static INLINE void set_txfm_ctx(TXFM_CONTEXT *txfm_ctx,
                                 TX_SIZE tx_size,
diff --git a/vp10/decoder/decodeframe.c b/vp10/decoder/decodeframe.c
index ce6317c..0441662 100644
--- a/vp10/decoder/decodeframe.c
+++ b/vp10/decoder/decodeframe.c
@@ -2962,18 +2962,7 @@
   assert(tile_rows <= 4);
   assert(tile_cols <= (1 << 6));
 
-  // Note: this memset assumes above_context[0], [1] and [2]
-  // are allocated as part of the same buffer.
-  memset(cm->above_context, 0,
-         sizeof(*cm->above_context) * MAX_MB_PLANE * 2 * aligned_cols);
-
-  memset(cm->above_seg_context, 0,
-         sizeof(*cm->above_seg_context) * aligned_cols);
-
-#if CONFIG_VAR_TX
-  memset(cm->above_txfm_context, 0,
-         sizeof(*cm->above_txfm_context) * aligned_cols);
-#endif
+  vp10_zero_above_context(cm, 0, aligned_cols);
 
   get_tile_buffers(pbi, data, data_end, tile_cols, tile_rows, tile_buffers);
 
@@ -3032,11 +3021,7 @@
                         tile_cols - tile_col - 1 : tile_col;
         tile_data = pbi->tile_data + tile_cols * tile_row + col;
         vp10_tile_set_col(&tile, tile_data->cm, col);
-        vp10_zero(tile_data->xd.left_context);
-        vp10_zero(tile_data->xd.left_seg_context);
-#if CONFIG_VAR_TX
-        vp10_zero(tile_data->xd.left_txfm_context_buffer);
-#endif
+        vp10_zero_left_context(&tile_data->xd);
         for (mi_col = tile.mi_col_start; mi_col < tile.mi_col_end;
              mi_col += MI_BLOCK_SIZE) {
           decode_partition(pbi, &tile_data->xd,
@@ -3126,11 +3111,7 @@
 
   for (mi_row = tile->mi_row_start; mi_row < tile->mi_row_end;
        mi_row += MI_BLOCK_SIZE) {
-    vp10_zero(tile_data->xd.left_context);
-    vp10_zero(tile_data->xd.left_seg_context);
-#if CONFIG_VAR_TX
-    vp10_zero(tile_data->xd.left_txfm_context_buffer);
-#endif
+    vp10_zero_left_context(&tile_data->xd);
     for (mi_col = tile->mi_col_start; mi_col < tile->mi_col_end;
          mi_col += MI_BLOCK_SIZE) {
       decode_partition(tile_data->pbi, &tile_data->xd,
@@ -3211,16 +3192,8 @@
     worker->data2 = &pbi->tile_worker_info[n];
   }
 
-  // Note: this memset assumes above_context[0], [1] and [2]
-  // are allocated as part of the same buffer.
-  memset(cm->above_context, 0,
-         sizeof(*cm->above_context) * MAX_MB_PLANE * 2 * aligned_mi_cols);
-  memset(cm->above_seg_context, 0,
-         sizeof(*cm->above_seg_context) * aligned_mi_cols);
-#if CONFIG_VAR_TX
-  memset(cm->above_txfm_context, 0,
-         sizeof(*cm->above_txfm_context) * aligned_mi_cols);
-#endif
+  vp10_zero_above_context(cm, 0, aligned_mi_cols);
+
   // Load tile data into tile_buffers
   get_tile_buffers(pbi, data, data_end, tile_cols, tile_rows, tile_buffers);
 
diff --git a/vp10/encoder/bitstream.c b/vp10/encoder/bitstream.c
index 1ef2ea5..f20c224 100644
--- a/vp10/encoder/bitstream.c
+++ b/vp10/encoder/bitstream.c
@@ -1710,10 +1710,7 @@
 
   for (mi_row = tile->mi_row_start; mi_row < tile->mi_row_end;
        mi_row += MI_BLOCK_SIZE) {
-    vp10_zero(xd->left_seg_context);
-#if CONFIG_VAR_TX
-    vp10_zero(xd->left_txfm_context_buffer);
-#endif
+    vp10_zero_left_context(xd);
     for (mi_col = tile->mi_col_start; mi_col < tile->mi_col_end;
          mi_col += MI_BLOCK_SIZE)
       write_modes_sb(cpi, tile, w, tok, tok_end,
@@ -2190,12 +2187,7 @@
   const int tile_rows = 1 << cm->log2_tile_rows;
   unsigned int max_tile = 0;
 
-  memset(cm->above_seg_context, 0,
-         sizeof(*cm->above_seg_context) * mi_cols_aligned_to_sb(cm->mi_cols));
-#if CONFIG_VAR_TX
-  memset(cm->above_txfm_context, 0,
-         sizeof(*cm->above_txfm_context) * mi_cols_aligned_to_sb(cm->mi_cols));
-#endif
+  vp10_zero_above_context(cm, 0, mi_cols_aligned_to_sb(cm->mi_cols));
 
   for (tile_row = 0; tile_row < tile_rows; tile_row++) {
     for (tile_col = 0; tile_col < tile_cols; tile_col++) {
diff --git a/vp10/encoder/dct.c b/vp10/encoder/dct.c
index 31a4c87..8a1ee20 100644
--- a/vp10/encoder/dct.c
+++ b/vp10/encoder/dct.c
@@ -1212,6 +1212,30 @@
 }
 
 #if CONFIG_EXT_TX
+static void fidtx4(const tran_low_t *input, tran_low_t *output) {
+  const tran_low_t *const end = input + 4;  // 4 entries: x * Sqrt2, rounded
+  for (; input != end; ++input, ++output)
+    *output = (tran_low_t)fdct_round_shift(*input * Sqrt2);
+}
+
+static void fidtx8(const tran_low_t *input, tran_low_t *output) {
+  const tran_low_t *const end = input + 8;  // 8 entries, each doubled
+  for (; input != end; ++input, ++output)
+    *output = *input * 2;
+}
+
+static void fidtx16(const tran_low_t *input, tran_low_t *output) {
+  const tran_low_t *const end = input + 16;  // 16 entries: x * 2 * Sqrt2
+  for (; input != end; ++input, ++output)
+    *output = (tran_low_t)fdct_round_shift(*input * 2 * Sqrt2);
+}
+
+static void fidtx32(const tran_low_t *input, tran_low_t *output) {
+  const tran_low_t *const end = input + 32;  // 32 entries, each times 4
+  for (; input != end; ++input, ++output)
+    *output = *input * 4;
+}
+
 // For use in lieu of DST
 static void fhalfcenter32(const tran_low_t *input, tran_low_t *output) {
   int i;
@@ -1315,6 +1339,7 @@
     case DST_DCT:
     case DST_ADST:
     case ADST_DST:
+    case IDTX:
     case H_DCT:
     case V_DCT:
       break;
@@ -1362,6 +1387,9 @@
   { fdst4,  fadst4 },  // DST_FLIPADST      = 13,
   { fadst4, fdst4  },  // FLIPADST_DST      = 14,
   { fdst4,  fdst4  },  // DST_DST           = 15
+  { fidtx4, fidtx4 },  // IDTX              = 16
+  { fdct4,  fidtx4 },  // V_DCT             = 17
+  { fidtx4, fdct4  },  // H_DCT             = 18
 #endif  // CONFIG_EXT_TX
 };
 
@@ -1383,6 +1411,9 @@
   { fdst8,  fadst8 },  // DST_FLIPADST      = 13,
   { fadst8, fdst8  },  // FLIPADST_DST      = 14,
   { fdst8,  fdst8  },  // DST_DST           = 15
+  { fidtx8, fidtx8 },  // IDTX              = 16
+  { fdct8,  fidtx8 },  // V_DCT             = 17
+  { fidtx8, fdct8  },  // H_DCT             = 18
 #endif  // CONFIG_EXT_TX
 };
 
@@ -1404,6 +1435,9 @@
   { fdst16,  fadst16 },  // DST_FLIPADST      = 13,
   { fadst16, fdst16  },  // FLIPADST_DST      = 14,
   { fdst16,  fdst16  },  // DST_DST           = 15
+  { fidtx16, fidtx16 },  // IDTX              = 16
+  { fdct16,  fidtx16 },  // V_DCT             = 17
+  { fidtx16, fdct16  },  // H_DCT             = 18
 #endif  // CONFIG_EXT_TX
 };
 
@@ -1425,6 +1459,9 @@
   { fhalfcenter32,  fhalfright32 },    // DST_FLIPADST      = 13,
   { fhalfright32, fhalfcenter32  },    // FLIPADST_DST      = 14,
   { fhalfcenter32,  fhalfcenter32  },  // DST_DST           = 15
+  { fidtx32, fidtx32 },                // IDTX              = 16
+  { fdct32,  fidtx32 },                // V_DCT             = 17
+  { fidtx32, fdct32  },                // H_DCT             = 18
 };
 #endif  // CONFIG_EXT_TX
 
@@ -1766,86 +1803,12 @@
                      int bs, int tx_type) {
   int r, c;
   const int shift = bs < 32 ? 3 : 2;
-
-  const int16_t *input = src_diff;
-  tran_low_t *output = coeff;
-
-  int i, j;
-  tran_low_t temp_in[32], temp_out[32];
-  transform_2d ht = {fdct4, fdct4};
-  int in_scale = 1;
-  int out_scale = 1;
-  int coeff_stride = 0;
-
-  switch (bs) {
-    case 4:
-      ht.cols = fdct4;
-      ht.rows = fdct4;
-      in_scale = 16;
-      out_scale = cospi_16_64 >> 1;
-      coeff_stride = 4;
-      break;
-    case 8:
-      ht.cols = fdct8;
-      ht.rows = fdct8;
-      in_scale = 4;
-      out_scale = (1 << DCT_CONST_BITS);
-      coeff_stride = 8;
-      break;
-    case 16:
-      ht.cols = fdct16;
-      ht.rows = fdct16;
-      in_scale = 4;
-      out_scale = cospi_16_64;
-      coeff_stride = 16;
-      break;
-    case 32:
-      ht.cols = fdct32;
-      ht.rows = fdct32;
-      in_scale = 4;
-      out_scale = (1 << (DCT_CONST_BITS - 2));
-      coeff_stride = 32;
-      break;
-    default:
-      assert(0);
-  }
-
-  // Columns
-  if (tx_type == V_DCT) {
-    for (i = 0; i < bs; ++i) {
-      for (j = 0; j < bs; ++j)
-        temp_in[j] = input[j * stride + i] * in_scale;
-      ht.cols(temp_in, temp_out);
-
-      for (j = 0; j < bs; ++j) {
-        tran_high_t temp = (tran_high_t)temp_out[j] * out_scale;
-        temp >>= DCT_CONST_BITS;
-        output[j * coeff_stride + i] = (tran_low_t)temp;
-      }
+  if (tx_type == IDTX) {
+    for (r = 0; r < bs; ++r) {
+      for (c = 0; c < bs; ++c) coeff[c] = src_diff[c] << shift;
+      src_diff += stride;
+      coeff += bs;
     }
-    return;
-  }
-
-  // Rows
-  if (tx_type == H_DCT) {
-    for (j = 0; j < bs; ++j) {
-      for (i = 0; i < bs; ++i)
-        temp_in[i] = input[j * stride + i] * in_scale;
-      ht.rows(temp_in, temp_out);
-
-      for (i = 0; i < bs; ++i) {
-        tran_high_t temp = (tran_high_t)temp_out[i] * out_scale;
-        temp >>= DCT_CONST_BITS;
-        output[j * coeff_stride + i] = (tran_low_t)temp;
-      }
-    }
-    return;
-  }
-
-  for (r = 0; r < bs; ++r) {
-    for (c = 0; c < bs; ++c) coeff[c] = src_diff[c] << shift;
-    src_diff += stride;
-    coeff += bs;
   }
 }
 
diff --git a/vp10/encoder/encodeframe.c b/vp10/encoder/encodeframe.c
index 6c76523..ec00b62 100644
--- a/vp10/encoder/encodeframe.c
+++ b/vp10/encoder/encodeframe.c
@@ -1648,6 +1648,9 @@
                                    totalrate_nocoef,
 #endif  // CONFIG_SUPERTX
                                    bsize, ctx, best_rd);
+#if CONFIG_SUPERTX
+        assert(*totalrate_nocoef >= 0);
+#endif  // CONFIG_SUPERTX
       }
     } else {
       vp10_rd_pick_inter_mode_sub8x8(cpi, tile_data, x, mi_row, mi_col, rd_cost,
@@ -1655,6 +1658,9 @@
                                      totalrate_nocoef,
 #endif  // CONFIG_SUPERTX
                                      bsize, ctx, best_rd);
+#if CONFIG_SUPERTX
+      assert(*totalrate_nocoef >= 0);
+#endif  // CONFIG_SUPERTX
     }
   }
 
@@ -3681,13 +3687,8 @@
   SPEED_FEATURES *const sf = &cpi->sf;
   int mi_col;
 
-  // Initialize the left context for the new SB row
-  memset(&xd->left_context, 0, sizeof(xd->left_context));
-  memset(xd->left_seg_context, 0, sizeof(xd->left_seg_context));
-#if CONFIG_VAR_TX
-  memset(xd->left_txfm_context_buffer, 0,
-         sizeof(xd->left_txfm_context_buffer));
-#endif
+  vp10_zero_left_context(xd);
+
   // Code each SB in the row
   for (mi_col = tile_info->mi_col_start; mi_col < tile_info->mi_col_end;
        mi_col += MI_BLOCK_SIZE) {
@@ -3785,19 +3786,9 @@
   // Copy data over into macro block data structures.
   vp10_setup_src_planes(x, cpi->Source, 0, 0);
 
-  vp10_setup_block_planes(&x->e_mbd, cm->subsampling_x, cm->subsampling_y);
+  vp10_setup_block_planes(xd, cm->subsampling_x, cm->subsampling_y);
 
-  // Note: this memset assumes above_context[0], [1] and [2]
-  // are allocated as part of the same buffer.
-  memset(xd->above_context[0], 0,
-         sizeof(*xd->above_context[0]) *
-         2 * aligned_mi_cols * MAX_MB_PLANE);
-  memset(xd->above_seg_context, 0,
-         sizeof(*xd->above_seg_context) * aligned_mi_cols);
-#if CONFIG_VAR_TX
-  memset(cm->above_txfm_context, 0,
-         sizeof(*xd->above_txfm_context) * aligned_mi_cols);
-#endif
+  vp10_zero_above_context(cm, 0, aligned_mi_cols);
 }
 
 static int check_dual_ref_flags(VP10_COMP *cpi) {
diff --git a/vp10/encoder/hybrid_fwd_txfm.c b/vp10/encoder/hybrid_fwd_txfm.c
index c3a739b..faedb43 100644
--- a/vp10/encoder/hybrid_fwd_txfm.c
+++ b/vp10/encoder/hybrid_fwd_txfm.c
@@ -65,6 +65,8 @@
       break;
     case H_DCT:
     case V_DCT:
+      vp10_fht4x4_c(src_diff, coeff, diff_stride, tx_type);
+      break;
     case IDTX:
       vp10_fwd_idtx_c(src_diff, coeff, diff_stride, 4, tx_type);
       break;
@@ -105,6 +107,8 @@
       break;
     case H_DCT:
     case V_DCT:
+      vp10_fht8x8_c(src_diff, coeff, diff_stride, tx_type);
+      break;
     case IDTX:
       vp10_fwd_idtx_c(src_diff, coeff, diff_stride, 8, tx_type);
       break;
@@ -145,6 +149,8 @@
       break;
     case H_DCT:
     case V_DCT:
+      vp10_fht16x16_c(src_diff, coeff, diff_stride, tx_type);
+      break;
     case IDTX:
       vp10_fwd_idtx_c(src_diff, coeff, diff_stride, 16, tx_type);
       break;
@@ -185,6 +191,8 @@
       break;
     case H_DCT:
     case V_DCT:
+      vp10_fht32x32_c(src_diff, coeff, diff_stride, tx_type);
+      break;
     case IDTX:
       vp10_fwd_idtx_c(src_diff, coeff, diff_stride, 32, tx_type);
       break;
@@ -226,11 +234,10 @@
     case ADST_DST:
     case DST_FLIPADST:
     case FLIPADST_DST:
-      // Use C version since DST exists only in C
-      vp10_highbd_fht4x4_c(src_diff, coeff, diff_stride, tx_type);
-      break;
     case H_DCT:
     case V_DCT:
+      vp10_highbd_fht4x4_c(src_diff, coeff, diff_stride, tx_type);
+      break;
     case IDTX:
       vp10_fwd_idtx_c(src_diff, coeff, diff_stride, 4, tx_type);
       break;
@@ -270,11 +277,11 @@
     case ADST_DST:
     case DST_FLIPADST:
     case FLIPADST_DST:
+    case H_DCT:
+    case V_DCT:
       // Use C version since DST exists only in C
       vp10_highbd_fht8x8_c(src_diff, coeff, diff_stride, tx_type);
       break;
-    case H_DCT:
-    case V_DCT:
     case IDTX:
       vp10_fwd_idtx_c(src_diff, coeff, diff_stride, 8, tx_type);
       break;
@@ -314,11 +321,11 @@
     case ADST_DST:
     case DST_FLIPADST:
     case FLIPADST_DST:
+    case H_DCT:
+    case V_DCT:
       // Use C version since DST exists only in C
       vp10_highbd_fht16x16_c(src_diff, coeff, diff_stride, tx_type);
       break;
-    case H_DCT:
-    case V_DCT:
     case IDTX:
       vp10_fwd_idtx_c(src_diff, coeff, diff_stride, 16, tx_type);
       break;
@@ -355,10 +362,10 @@
     case ADST_DST:
     case DST_FLIPADST:
     case FLIPADST_DST:
-      vp10_highbd_fht32x32_c(src_diff, coeff, diff_stride, tx_type);
-      break;
     case H_DCT:
     case V_DCT:
+      vp10_highbd_fht32x32_c(src_diff, coeff, diff_stride, tx_type);
+      break;
     case IDTX:
       vp10_fwd_idtx_c(src_diff, coeff, diff_stride, 32, tx_type);
       break;
diff --git a/vp10/encoder/rdopt.c b/vp10/encoder/rdopt.c
index ac6adfe..16deebf 100644
--- a/vp10/encoder/rdopt.c
+++ b/vp10/encoder/rdopt.c
@@ -8298,11 +8298,12 @@
 
         rd_cost->rate = rate2;
 #if CONFIG_SUPERTX
-        *returnrate_nocoef = rate2 - rate_y - rate_uv;
-        if (!disable_skip) {
-          *returnrate_nocoef -= vp10_cost_bit(vp10_get_skip_prob(cm, xd),
-                                              skippable || this_skip2);
-        }
+        if (x->skip && rate_y == INT_MAX)
+          *returnrate_nocoef = rate2;
+        else
+          *returnrate_nocoef = rate2 - rate_y - rate_uv;
+        *returnrate_nocoef -= vp10_cost_bit(vp10_get_skip_prob(cm, xd),
+            disable_skip || skippable || this_skip2);
         *returnrate_nocoef -= vp10_cost_bit(vp10_get_intra_inter_prob(cm, xd),
                                             mbmi->ref_frame[0] != INTRA_FRAME);
 #if CONFIG_OBMC