Merge "Refactor tx_size to pixel number mapping in reconintra.c" into nextgenv2
diff --git a/aom/aomdx.h b/aom/aomdx.h
index f9d1566..19256fb 100644
--- a/aom/aomdx.h
+++ b/aom/aomdx.h
@@ -37,6 +37,10 @@
 extern aom_codec_iface_t *aom_codec_av1_dx(void);
 /*!@} - end algorithm interface member group*/
 
+/** Data structure that stores bit accounting for debug
+ */
+typedef struct Accounting Accounting;
+
 /*!\enum aom_dec_control_id
  * \brief AOM decoder control functions
  *
@@ -103,6 +107,14 @@
    */
   AV1_SET_SKIP_LOOP_FILTER,
 
+  /** control function to retrieve a pointer to the Accounting struct.  When
+   * compiled without --enable-accounting, this returns AOM_CODEC_INCAPABLE.
+   * If called before a frame has been decoded, this returns AOM_CODEC_ERROR.
+   * The caller should ensure that AOM_CODEC_OK is returned before attempting
+   * to dereference the Accounting pointer.
+   */
+  AV1_GET_ACCOUNTING,
+
   AOM_DECODER_CTRL_ID_MAX,
 
   /** control function to set the range of tile decoding. A value that is
@@ -163,12 +175,14 @@
 #define AOM_CTRL_AV1D_GET_FRAME_SIZE
 AOM_CTRL_USE_TYPE(AV1_INVERT_TILE_DECODE_ORDER, int)
 #define AOM_CTRL_AV1_INVERT_TILE_DECODE_ORDER
+AOM_CTRL_USE_TYPE(AV1_GET_ACCOUNTING, Accounting **)
+#define AOM_CTRL_AV1_GET_ACCOUNTING
 AOM_CTRL_USE_TYPE(AV1_SET_DECODE_TILE_ROW, int)
 #define AOM_CTRL_AV1_SET_DECODE_TILE_ROW
 AOM_CTRL_USE_TYPE(AV1_SET_DECODE_TILE_COL, int)
 #define AOM_CTRL_AV1_SET_DECODE_TILE_COL
 /*!\endcond */
-/*! @} - end defgroup vp8_decoder */
+/*! @} - end defgroup aom_decoder */
 
 #ifdef __cplusplus
 }  // extern "C"
diff --git a/aom_dsp/ans.c b/aom_dsp/ans.c
index 30f115c..6d705cd 100644
--- a/aom_dsp/ans.c
+++ b/aom_dsp/ans.c
@@ -39,7 +39,7 @@
   const int out_syms = in_syms + 1;
   assert(src_pdf != out_pdf);
 
-  out_pdf[0] = node_prob << (10 - 8);
+  out_pdf[0] = node_prob << (RANS_PROB_BITS - ANS_P8_SHIFT);
   adjustment -= out_pdf[0];
   for (i = 0; i < in_syms; ++i) {
     int p = (p1 * src_pdf[i] + round_fact) >> ANS_P8_SHIFT;
diff --git a/aom_dsp/aom_dsp.mk b/aom_dsp/aom_dsp.mk
index 28e7f12..4735199 100644
--- a/aom_dsp/aom_dsp.mk
+++ b/aom_dsp/aom_dsp.mk
@@ -205,6 +205,7 @@
 ifeq ($(ARCH_X86_64),yes)
 DSP_SRCS-$(HAVE_SSSE3)  += x86/fwd_txfm_ssse3_x86_64.asm
 endif
+DSP_SRCS-$(HAVE_AVX2)   += x86/fwd_txfm_avx2.h
 DSP_SRCS-$(HAVE_AVX2)   += x86/fwd_txfm_avx2.c
 DSP_SRCS-$(HAVE_AVX2)   += x86/txfm_common_avx2.h
 DSP_SRCS-$(HAVE_AVX2)   += x86/fwd_dct32x32_impl_avx2.h
diff --git a/aom_dsp/x86/fwd_txfm_avx2.c b/aom_dsp/x86/fwd_txfm_avx2.c
index 670f864..d381a6e 100644
--- a/aom_dsp/x86/fwd_txfm_avx2.c
+++ b/aom_dsp/x86/fwd_txfm_avx2.c
@@ -17,6 +17,14 @@
 #undef FDCT32x32_2D_AVX2
 #undef FDCT32x32_HIGH_PRECISION
 
+// TODO(luoyi): The following macro hides an error. The second parameter type of
+// the function
+//   void FDCT32x32_2D_AVX2(const int16_t *, int16_t*, int);
+// is different from the one in
+//   void aom_fdct32x32_avx2(const int16_t *, tran_low_t*, int);
+// In a CONFIG_AOM_HIGHBITDEPTH=1 build, the second parameter type should be
+// int32_t.
+// This function should be removed after the av1_fht32x32 scaling/rounding fix.
 #define FDCT32x32_2D_AVX2 aom_fdct32x32_avx2
 #define FDCT32x32_HIGH_PRECISION 1
 #include "aom_dsp/x86/fwd_dct32x32_impl_avx2.h"  // NOLINT
diff --git a/aom_dsp/x86/fwd_txfm_avx2.h b/aom_dsp/x86/fwd_txfm_avx2.h
new file mode 100644
index 0000000..2c3cfc8
--- /dev/null
+++ b/aom_dsp/x86/fwd_txfm_avx2.h
@@ -0,0 +1,35 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_DSP_X86_FWD_TXFM_AVX2_H
+#define AOM_DSP_X86_FWD_TXFM_AVX2_H
+
+#include "./aom_config.h"
+
+static INLINE void storeu_output_avx2(const __m256i *coeff, tran_low_t *out) {
+#if CONFIG_AOM_HIGHBITDEPTH  // widen the 16 int16 values before storing
+  const __m256i zero = _mm256_setzero_si256();
+  const __m256i sign = _mm256_cmpgt_epi16(zero, *coeff);  // 0xFFFF where < 0
+
+  __m256i x0 = _mm256_unpacklo_epi16(*coeff, sign);  // value:sign pairs
+  __m256i x1 = _mm256_unpackhi_epi16(*coeff, sign);  // (per 128-bit lane)
+
+  __m256i y0 = _mm256_permute2x128_si256(x0, x1, 0x20);  // low lanes of both
+  __m256i y1 = _mm256_permute2x128_si256(x0, x1, 0x31);  // high lanes of both
+
+  _mm256_storeu_si256((__m256i *)out, y0);        // unaligned store
+  _mm256_storeu_si256((__m256i *)(out + 8), y1);  // next 8 outputs
+#else
+  _mm256_storeu_si256((__m256i *)out, *coeff);  // store unchanged
+#endif
+}
+
+#endif  // AOM_DSP_X86_FWD_TXFM_AVX2_H
diff --git a/aom_scale/generic/aom_scale.c b/aom_scale/generic/aom_scale.c
index 28604ac..9007459 100644
--- a/aom_scale/generic/aom_scale.c
+++ b/aom_scale/generic/aom_scale.c
@@ -68,7 +68,6 @@
                           unsigned int source_scale, unsigned int source_length,
                           unsigned char *dest, int dest_step,
                           unsigned int dest_scale, unsigned int dest_length) {
-  const unsigned int source_pitch = source_step;
   const unsigned char *const dest_end = dest + dest_length * dest_step;
   (void)source_length;
   (void)source_scale;
@@ -81,9 +80,9 @@
   dest += dest_step;
 
   while (dest < dest_end) {
-    const unsigned int a = 3 * source[-source_pitch];
+    const unsigned int a = 3 * source[-source_step];
     const unsigned int b = 10 * source[0];
-    const unsigned int c = 3 * source[source_pitch];
+    const unsigned int c = 3 * source[source_step];
     *dest = (unsigned char)((8 + a + b + c) >> 4);
     source += source_step;
     dest += dest_step;
diff --git a/aomenc.c b/aomenc.c
index 497c8d5..742f264 100644
--- a/aomenc.c
+++ b/aomenc.c
@@ -40,12 +40,12 @@
 #include "aom/aomdx.h"
 #endif
 
-#include "aom/aom_integer.h"
-#include "aom_ports/mem_ops.h"
-#include "aom_ports/aom_timer.h"
-#include "./rate_hist.h"
 #include "./aomstats.h"
+#include "./rate_hist.h"
 #include "./warnings.h"
+#include "aom/aom_integer.h"
+#include "aom_ports/aom_timer.h"
+#include "aom_ports/mem_ops.h"
 #if CONFIG_WEBM_IO
 #include "./webmenc.h"
 #endif
@@ -1860,6 +1860,9 @@
   uint64_t cx_time = 0;
   int stream_cnt = 0;
   int res = 0;
+#if CONFIG_AOM_HIGHBITDEPTH
+  int profile_updated = 0;
+#endif
 
   memset(&input, 0, sizeof(input));
   exec_name = argv_[0];
@@ -1963,6 +1966,40 @@
           { stream->config.cfg.g_input_bit_depth = input.bit_depth; });
     }
 
+#if CONFIG_AOM_HIGHBITDEPTH
+    /* Automatically set the codec bit depth to match the input bit depth.
+     * Upgrade the profile if required. */
+    FOREACH_STREAM({
+      profile_updated = 0; /* per stream: warn only if upgraded below */
+      if (stream->config.cfg.g_input_bit_depth >
+          (unsigned int)stream->config.cfg.g_bit_depth) {
+        stream->config.cfg.g_bit_depth = stream->config.cfg.g_input_bit_depth;
+      }
+      if (stream->config.cfg.g_bit_depth > 8) {
+        switch (stream->config.cfg.g_profile) {
+          case 0:
+            stream->config.cfg.g_profile = 2;
+            profile_updated = 1;
+            break;
+          case 1:
+            stream->config.cfg.g_profile = 3;
+            profile_updated = 1;
+            break;
+          default: break;
+        }
+      }
+      if (stream->config.cfg.g_profile > 1) {
+        stream->config.use_16bit_internal = 1;
+      }
+      if (profile_updated) {
+        fprintf(stderr,
+                "Warning: automatically upgrading to profile %d to "
+                "match input format.\n",
+                stream->config.cfg.g_profile);
+      }
+    });
+#endif
+
     FOREACH_STREAM(set_stream_dimensions(stream, input.width, input.height));
     FOREACH_STREAM(validate_stream_config(stream, &global));
 
diff --git a/av1/av1_cx_iface.c b/av1/av1_cx_iface.c
index fae7d04..bacb23c 100644
--- a/av1/av1_cx_iface.c
+++ b/av1/av1_cx_iface.c
@@ -932,6 +932,7 @@
   return flags;
 }
 
+static const size_t kMinCompressedSize = 8192;  // lower bound for data_sz
 static aom_codec_err_t encoder_encode(aom_codec_alg_priv_t *ctx,
                                       const aom_image_t *img,
                                       aom_codec_pts_t pts,
@@ -952,14 +953,16 @@
     // failure condition, encoder setup is done fully in init() currently.
     if (res == AOM_CODEC_OK) {
 #if CONFIG_EXT_REFS
-      data_sz = ctx->cfg.g_w * ctx->cfg.g_h * get_image_bps(img);
+      data_sz = ALIGN_POWER_OF_TWO(ctx->cfg.g_w, 5) *
+                ALIGN_POWER_OF_TWO(ctx->cfg.g_h, 5) * get_image_bps(img);
 #else
       // There's no codec control for multiple alt-refs so check the encoder
       // instance for its status to determine the compressed data size.
-      data_sz = ctx->cfg.g_w * ctx->cfg.g_h * get_image_bps(img) / 8 *
+      data_sz = ALIGN_POWER_OF_TWO(ctx->cfg.g_w, 5) *
+                ALIGN_POWER_OF_TWO(ctx->cfg.g_h, 5) * get_image_bps(img) / 8 *
                 (cpi->multi_arf_allowed ? 8 : 2);
 #endif  // CONFIG_EXT_REFS
-      if (data_sz < 4096) data_sz = 4096;
+      if (data_sz < kMinCompressedSize) data_sz = kMinCompressedSize;
       if (ctx->cx_data == NULL || ctx->cx_data_sz < data_sz) {
         ctx->cx_data_sz = data_sz;
         free(ctx->cx_data);
diff --git a/av1/av1_dx_iface.c b/av1/av1_dx_iface.c
index 43cc3a2..2caed90 100644
--- a/av1/av1_dx_iface.c
+++ b/av1/av1_dx_iface.c
@@ -1083,6 +1083,24 @@
   return AOM_CODEC_OK;
 }
 
+static aom_codec_err_t ctrl_get_accounting(aom_codec_alg_priv_t *ctx,
+                                           va_list args) {
+#if !CONFIG_ACCOUNTING  // handler for the AV1_GET_ACCOUNTING control
+  (void)ctx;
+  (void)args;
+  return AOM_CODEC_INCAPABLE;  // accounting support is compiled out
+#else
+  if (ctx->frame_workers) {
+    AVxWorker *const worker = ctx->frame_workers;
+    FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1;
+    AV1Decoder *pbi = frame_worker_data->pbi;
+    Accounting **acct = va_arg(args, Accounting **);  // caller's out-pointer
+    *acct = &pbi->accounting;  // points into the decoder; nothing is copied
+    return AOM_CODEC_OK;
+  }
+  return AOM_CODEC_ERROR;  // ctx->frame_workers is still NULL
+#endif
+}
 static aom_codec_err_t ctrl_set_decode_tile_row(aom_codec_alg_priv_t *ctx,
                                                 va_list args) {
   ctx->decode_tile_row = va_arg(args, int);
@@ -1119,6 +1137,7 @@
   { AV1D_GET_DISPLAY_SIZE, ctrl_get_render_size },
   { AV1D_GET_BIT_DEPTH, ctrl_get_bit_depth },
   { AV1D_GET_FRAME_SIZE, ctrl_get_frame_size },
+  { AV1_GET_ACCOUNTING, ctrl_get_accounting },
   { AV1_GET_NEW_FRAME_IMAGE, ctrl_get_new_frame_image },
 
   { -1, NULL },
diff --git a/av1/common/accounting.h b/av1/common/accounting.h
index 04be326..1fe1d9a 100644
--- a/av1/common/accounting.h
+++ b/av1/common/accounting.h
@@ -54,14 +54,16 @@
   AccountingDictionary dictionary;
 } AccountingSymbols;
 
-typedef struct {
+typedef struct Accounting Accounting;
+
+struct Accounting {
   AccountingSymbols syms;
   /** Size allocated for symbols (not all may be used). */
   int num_syms_allocated;
   int16_t hash_dictionary[AOM_ACCOUNTING_HASH_SIZE];
   AccountingSymbolContext context;
   uint32_t last_tell_frac;
-} Accounting;
+};
 
 void aom_accounting_init(Accounting *accounting);
 void aom_accounting_reset(Accounting *accounting);
diff --git a/av1/common/blockd.c b/av1/common/blockd.c
index 6332fed..c5eb85d 100644
--- a/av1/common/blockd.c
+++ b/av1/common/blockd.c
@@ -95,21 +95,22 @@
                       TX_SIZE tx_size, int has_eob, int aoff, int loff) {
   ENTROPY_CONTEXT *const a = pd->above_context + aoff;
   ENTROPY_CONTEXT *const l = pd->left_context + loff;
-  const int tx_size_in_blocks = 1 << tx_size;
+  const int txs_wide = tx_size_wide_unit[tx_size];
+  const int txs_high = tx_size_high_unit[tx_size];
 
   // above
   if (has_eob && xd->mb_to_right_edge < 0) {
     int i;
     const int blocks_wide =
         pd->n4_w + (xd->mb_to_right_edge >> (5 + pd->subsampling_x));
-    int above_contexts = tx_size_in_blocks;
+    int above_contexts = txs_wide;
     if (above_contexts + aoff > blocks_wide)
       above_contexts = blocks_wide - aoff;
 
     for (i = 0; i < above_contexts; ++i) a[i] = has_eob;
-    for (i = above_contexts; i < tx_size_in_blocks; ++i) a[i] = 0;
+    for (i = above_contexts; i < txs_wide; ++i) a[i] = 0;
   } else {
-    memset(a, has_eob, sizeof(ENTROPY_CONTEXT) * tx_size_in_blocks);
+    memset(a, has_eob, sizeof(ENTROPY_CONTEXT) * txs_wide);
   }
 
   // left
@@ -117,13 +118,13 @@
     int i;
     const int blocks_high =
         pd->n4_h + (xd->mb_to_bottom_edge >> (5 + pd->subsampling_y));
-    int left_contexts = tx_size_in_blocks;
+    int left_contexts = txs_high;
     if (left_contexts + loff > blocks_high) left_contexts = blocks_high - loff;
 
     for (i = 0; i < left_contexts; ++i) l[i] = has_eob;
-    for (i = left_contexts; i < tx_size_in_blocks; ++i) l[i] = 0;
+    for (i = left_contexts; i < txs_high; ++i) l[i] = 0;
   } else {
-    memset(l, has_eob, sizeof(ENTROPY_CONTEXT) * tx_size_in_blocks);
+    memset(l, has_eob, sizeof(ENTROPY_CONTEXT) * txs_high);
   }
 }
 
diff --git a/av1/common/entropymode.c b/av1/common/entropymode.c
index 78f4ffe..e812f15 100644
--- a/av1/common/entropymode.c
+++ b/av1/common/entropymode.c
@@ -147,6 +147,9 @@
       { 43, 81, 53, 140, 169, 204, 68, 84, 72 }     // left = tm
   }
 };
+#if CONFIG_DAALA_EC
+aom_cdf_prob av1_kf_y_mode_cdf[INTRA_MODES][INTRA_MODES][INTRA_MODES];
+#endif
 
 static const aom_prob default_if_y_probs[BLOCK_SIZE_GROUPS][INTRA_MODES - 1] = {
   { 65, 32, 18, 144, 162, 194, 41, 51, 98 },   // block_size < 8x8
@@ -368,6 +371,10 @@
 static const aom_prob default_delta_q_probs[DELTA_Q_CONTEXTS] = { 220, 220,
                                                                   220 };
 #endif
+int av1_intra_mode_ind[INTRA_MODES];  // NOTE(review): entropymode.h only
+int av1_intra_mode_inv[INTRA_MODES];  // declares these four arrays under
+int av1_inter_mode_ind[INTER_MODES];  // CONFIG_DAALA_EC; no matching #if is
+int av1_inter_mode_inv[INTER_MODES];  // visible here - confirm intended.
 
 /* Array indices are identical to previously-existing INTRAMODECONTEXTNODES. */
 const aom_tree_index av1_intra_mode_tree[TREE_SIZE(INTRA_MODES)] = {
@@ -1406,14 +1413,22 @@
   av1_copy(fc->switchable_restore_prob, default_switchable_restore_prob);
 #endif  // CONFIG_LOOP_RESTORATION
 #if CONFIG_DAALA_EC
+  av1_tree_to_cdf_1D(av1_intra_mode_tree, fc->y_mode_prob, fc->y_mode_cdf,
+                     BLOCK_SIZE_GROUPS);
+  av1_tree_to_cdf_1D(av1_intra_mode_tree, fc->uv_mode_prob, fc->uv_mode_cdf,
+                     INTRA_MODES);
   av1_tree_to_cdf_1D(av1_switchable_interp_tree, fc->switchable_interp_prob,
                      fc->switchable_interp_cdf, SWITCHABLE_FILTER_CONTEXTS);
+  av1_tree_to_cdf_1D(av1_partition_tree, fc->partition_prob, fc->partition_cdf,
+                     PARTITION_CONTEXTS);
+  av1_tree_to_cdf_1D(av1_inter_mode_tree, fc->inter_mode_probs,
+                     fc->inter_mode_cdf, INTER_MODE_CONTEXTS);
   av1_tree_to_cdf_2D(av1_ext_tx_tree, fc->intra_ext_tx_prob,
                      fc->intra_ext_tx_cdf, EXT_TX_SIZES, TX_TYPES);
   av1_tree_to_cdf_1D(av1_ext_tx_tree, fc->inter_ext_tx_prob,
                      fc->inter_ext_tx_cdf, EXT_TX_SIZES);
-  av1_tree_to_cdf_1D(av1_partition_tree, fc->partition_prob, fc->partition_cdf,
-                     PARTITION_CONTEXTS);
+  av1_tree_to_cdf_2D(av1_intra_mode_tree, av1_kf_y_mode_prob, av1_kf_y_mode_cdf,
+                     INTRA_MODES, INTRA_MODES);
   av1_tree_to_cdf(av1_segment_tree, fc->seg.tree_probs, fc->seg.tree_cdf);
 #endif
 #if CONFIG_DELTA_Q
diff --git a/av1/common/entropymode.h b/av1/common/entropymode.h
index 68a6400..3043114 100644
--- a/av1/common/entropymode.h
+++ b/av1/common/entropymode.h
@@ -165,9 +165,12 @@
   aom_prob switchable_restore_prob[RESTORE_SWITCHABLE_TYPES - 1];
 #endif  // CONFIG_LOOP_RESTORATION
 #if CONFIG_DAALA_EC
+  aom_cdf_prob y_mode_cdf[BLOCK_SIZE_GROUPS][INTRA_MODES];
+  aom_cdf_prob uv_mode_cdf[INTRA_MODES][INTRA_MODES];
   aom_cdf_prob partition_cdf[PARTITION_CONTEXTS][PARTITION_TYPES];
   aom_cdf_prob switchable_interp_cdf[SWITCHABLE_FILTER_CONTEXTS]
                                     [SWITCHABLE_FILTERS];
+  aom_cdf_prob inter_mode_cdf[INTER_MODE_CONTEXTS][INTER_MODES];
   aom_cdf_prob intra_ext_tx_cdf[EXT_TX_SIZES][TX_TYPES][TX_TYPES];
   aom_cdf_prob inter_ext_tx_cdf[EXT_TX_SIZES][TX_TYPES];
 #endif
@@ -276,6 +279,9 @@
 
 extern const aom_prob av1_kf_y_mode_prob[INTRA_MODES][INTRA_MODES]
                                         [INTRA_MODES - 1];
+#if CONFIG_DAALA_EC
+extern aom_cdf_prob av1_kf_y_mode_cdf[INTRA_MODES][INTRA_MODES][INTRA_MODES];
+#endif
 #if CONFIG_PALETTE
 extern const aom_prob av1_default_palette_y_mode_prob[PALETTE_BLOCK_SIZES]
                                                      [PALETTE_Y_MODE_CONTEXTS];
@@ -294,6 +300,12 @@
 
 extern const aom_tree_index av1_intra_mode_tree[TREE_SIZE(INTRA_MODES)];
 extern const aom_tree_index av1_inter_mode_tree[TREE_SIZE(INTER_MODES)];
+#if CONFIG_DAALA_EC
+extern int av1_intra_mode_ind[INTRA_MODES];
+extern int av1_intra_mode_inv[INTRA_MODES];
+extern int av1_inter_mode_ind[INTER_MODES];
+extern int av1_inter_mode_inv[INTER_MODES];
+#endif
 #if CONFIG_EXT_INTER
 extern const aom_tree_index
     av1_interintra_mode_tree[TREE_SIZE(INTERINTRA_MODES)];
diff --git a/av1/common/entropymv.c b/av1/common/entropymv.c
index 34918b3..a80165e 100644
--- a/av1/common/entropymv.c
+++ b/av1/common/entropymv.c
@@ -42,28 +42,45 @@
                                                                4,  -2, -3 };
 
 static const nmv_context default_nmv_context = {
-  { 32, 64, 96 },
+  { 32, 64, 96 },  // joints
+#if CONFIG_DAALA_EC
+  { 0, 0, 0, 0 },  // joint_cdf is computed from joints in av1_init_mv_probs()
+#endif
   { {
         // Vertical component
         128,                                                   // sign
         { 224, 144, 192, 168, 192, 176, 192, 198, 198, 245 },  // class
+#if CONFIG_DAALA_EC
+        { 0 },  // class_cdf is computed from class in av1_init_mv_probs()
+#endif
         { 216 },                                               // class0
         { 136, 140, 148, 160, 176, 192, 224, 234, 234, 240 },  // bits
         { { 128, 128, 64 }, { 96, 112, 64 } },                 // class0_fp
         { 64, 96, 64 },                                        // fp
-        160,                                                   // class0_hp bit
-        128,                                                   // hp
+#if CONFIG_DAALA_EC
+        { { 0 }, { 0 } },  // class0_fp_cdf is computed in av1_init_mv_probs()
+        { 0 },             // fp_cdf is computed from fp in av1_init_mv_probs()
+#endif
+        160,  // class0_hp bit
+        128,  // hp
     },
     {
         // Horizontal component
         128,                                                   // sign
         { 216, 128, 176, 160, 176, 176, 192, 198, 198, 208 },  // class
+#if CONFIG_DAALA_EC
+        { 0 },  // class_cdf is computed from class in av1_init_mv_probs()
+#endif
         { 208 },                                               // class0
         { 136, 140, 148, 160, 176, 192, 224, 234, 234, 240 },  // bits
         { { 128, 128, 64 }, { 96, 112, 64 } },                 // class0_fp
         { 64, 96, 64 },                                        // fp
-        160,                                                   // class0_hp bit
-        128,                                                   // hp
+#if CONFIG_DAALA_EC
+        { { 0 }, { 0 } },  // class0_fp_cdf is computed in av1_init_mv_probs()
+        { 0 },             // fp_cdf is computed from fp in av1_init_mv_probs()
+#endif
+        160,  // class0_hp bit
+        128,  // hp
     } },
 };
 
@@ -262,6 +279,23 @@
   for (i = 0; i < NMV_CONTEXTS; ++i) cm->fc->nmvc[i] = default_nmv_context;
 #else
   cm->fc->nmvc = default_nmv_context;
+#if CONFIG_DAALA_EC
+  {
+    int i, j;
+    av1_tree_to_cdf(av1_mv_joint_tree, cm->fc->nmvc.joints,
+                    cm->fc->nmvc.joint_cdf);
+    for (i = 0; i < 2; i++) {
+      av1_tree_to_cdf(av1_mv_class_tree, cm->fc->nmvc.comps[i].classes,
+                      cm->fc->nmvc.comps[i].class_cdf);
+      av1_tree_to_cdf(av1_mv_fp_tree, cm->fc->nmvc.comps[i].fp,
+                      cm->fc->nmvc.comps[i].fp_cdf);
+      for (j = 0; j < CLASS0_SIZE; j++) {
+        av1_tree_to_cdf(av1_mv_fp_tree, cm->fc->nmvc.comps[i].class0_fp[j],
+                        cm->fc->nmvc.comps[i].class0_fp_cdf[j]);
+      }
+    }
+  }
+#endif
 #endif
 #if CONFIG_GLOBAL_MOTION
   av1_copy(cm->fc->global_motion_types_prob, default_global_motion_types_prob);
diff --git a/av1/common/entropymv.h b/av1/common/entropymv.h
index f97dd85..f308ef3 100644
--- a/av1/common/entropymv.h
+++ b/av1/common/entropymv.h
@@ -85,16 +85,26 @@
 typedef struct {
   aom_prob sign;
   aom_prob classes[MV_CLASSES - 1];
+#if CONFIG_DAALA_EC
+  aom_cdf_prob class_cdf[MV_CLASSES];  // from classes[] via av1_tree_to_cdf()
+#endif
   aom_prob class0[CLASS0_SIZE - 1];
   aom_prob bits[MV_OFFSET_BITS];
   aom_prob class0_fp[CLASS0_SIZE][MV_FP_SIZE - 1];
   aom_prob fp[MV_FP_SIZE - 1];
+#if CONFIG_DAALA_EC
+  aom_cdf_prob class0_fp_cdf[CLASS0_SIZE][MV_FP_SIZE];  // from class0_fp[][]
+  aom_cdf_prob fp_cdf[MV_FP_SIZE];                      // from fp[]
+#endif
   aom_prob class0_hp;
   aom_prob hp;
 } nmv_component;
 
 typedef struct {
   aom_prob joints[MV_JOINTS - 1];
+#if CONFIG_DAALA_EC
+  aom_cdf_prob joint_cdf[MV_JOINTS];  // from joints[] via av1_tree_to_cdf()
+#endif
   nmv_component comps[2];
 } nmv_context;
 
diff --git a/av1/common/enums.h b/av1/common/enums.h
index b02c814..0a1f7a3 100644
--- a/av1/common/enums.h
+++ b/av1/common/enums.h
@@ -58,6 +58,10 @@
 #define MAX_TILE_COLS 64
 #endif  // CONFIG_EXT_TILE
 
+#if CONFIG_VAR_TX
+#define MAX_VARTX_DEPTH 2
+#endif
+
 // Bitstream profiles indicated by 2-3 bits in the uncompressed header.
 // 00: Profile 0.  8-bit 4:2:0 only.
 // 10: Profile 1.  8-bit 4:4:4, 4:2:2, and 4:4:0.
diff --git a/av1/common/onyxc_int.h b/av1/common/onyxc_int.h
index 6ec5c67..be1cbc1 100644
--- a/av1/common/onyxc_int.h
+++ b/av1/common/onyxc_int.h
@@ -378,6 +378,9 @@
   // - this is intentionally not placed in FRAME_CONTEXT since it's reset upon
   // each keyframe and not used afterwards
   aom_prob kf_y_prob[INTRA_MODES][INTRA_MODES][INTRA_MODES - 1];
+#if CONFIG_DAALA_EC
+  aom_cdf_prob kf_y_cdf[INTRA_MODES][INTRA_MODES][INTRA_MODES];
+#endif
 #if CONFIG_GLOBAL_MOTION
   Global_Motion_Params global_motion[TOTAL_REFS_PER_FRAME];
 #endif
@@ -584,6 +587,18 @@
   return cm->kf_y_prob[above][left];
 }
 
+#if CONFIG_DAALA_EC  // CDF counterpart of the cm->kf_y_prob lookup
+static INLINE const aom_cdf_prob *get_y_mode_cdf(const AV1_COMMON *cm,
+                                                 const MODE_INFO *mi,
+                                                 const MODE_INFO *above_mi,
+                                                 const MODE_INFO *left_mi,
+                                                 int block) {
+  const PREDICTION_MODE above = av1_above_block_mode(mi, above_mi, block);
+  const PREDICTION_MODE left = av1_left_block_mode(mi, left_mi, block);
+  return cm->kf_y_cdf[above][left];  // row selected by the two modes above
+}
+#endif
+
 static INLINE void update_partition_context(MACROBLOCKD *xd, int mi_row,
                                             int mi_col, BLOCK_SIZE subsize,
                                             BLOCK_SIZE bsize) {
diff --git a/av1/common/reconintra.c b/av1/common/reconintra.c
index ee803b5..cfd283f 100644
--- a/av1/common/reconintra.c
+++ b/av1/common/reconintra.c
@@ -238,41 +238,39 @@
   // blocks in inverted N order, and then update this function appropriately.
   if (bsize == BLOCK_4X8 && y == 1) return 0;
 
-  if (!right_available) {
-    return 0;
-  } else {
-    // Handle block size 4x8 and 4x4
-    if (ss_x == 0 && num_4x4_blocks_wide_lookup[bsize] < 2 && x == 0) return 1;
+  if (!right_available) return 0;
 
-    if (y == 0) {
-      const int hl = mi_height_log2_lookup[bsize];
-      const uint8_t *order;
-      int my_order, tr_order;
+  // Handle block size 4x8 and 4x4
+  if (ss_x == 0 && num_4x4_blocks_wide_lookup[bsize] < 2 && x == 0) return 1;
+
+  if (y == 0) {
+    const int hl = mi_height_log2_lookup[bsize];
+    const uint8_t *order;
+    int my_order, tr_order;
 #if CONFIG_EXT_PARTITION_TYPES
-      if (partition == PARTITION_VERT_A)
-        order = orders_verta[bsize];
-      else
+    if (partition == PARTITION_VERT_A)
+      order = orders_verta[bsize];
+    else
 #endif  // CONFIG_EXT_PARTITION_TYPES
-        order = orders[bsize];
+      order = orders[bsize];
 
-      if (x + step < w) return 1;
+    if (x + step < w) return 1;
 
-      mi_row = (mi_row & MAX_MIB_MASK) >> hl;
-      mi_col = (mi_col & MAX_MIB_MASK) >> wl;
+    mi_row = (mi_row & MAX_MIB_MASK) >> hl;
+    mi_col = (mi_col & MAX_MIB_MASK) >> wl;
 
-      // If top row of coding unit
-      if (mi_row == 0) return 1;
+    // If top row of coding unit
+    if (mi_row == 0) return 1;
 
-      // If rightmost column of coding unit
-      if (((mi_col + 1) << wl) >= MAX_MIB_SIZE) return 0;
+    // If rightmost column of coding unit
+    if (((mi_col + 1) << wl) >= MAX_MIB_SIZE) return 0;
 
-      my_order = order[((mi_row + 0) << (MAX_MIB_SIZE_LOG2 - wl)) + mi_col + 0];
-      tr_order = order[((mi_row - 1) << (MAX_MIB_SIZE_LOG2 - wl)) + mi_col + 1];
+    my_order = order[((mi_row + 0) << (MAX_MIB_SIZE_LOG2 - wl)) + mi_col + 0];
+    tr_order = order[((mi_row - 1) << (MAX_MIB_SIZE_LOG2 - wl)) + mi_col + 1];
 
-      return my_order > tr_order;
-    } else {
-      return x + step < w;
-    }
+    return my_order > tr_order;
+  } else {
+    return x + step < w;
   }
 }
 
diff --git a/av1/decoder/decodeframe.c b/av1/decoder/decodeframe.c
index edbf463..165609a 100644
--- a/av1/decoder/decodeframe.c
+++ b/av1/decoder/decodeframe.c
@@ -134,9 +134,14 @@
 #endif  // CONFIG_EXT_INTER
 #else
   int j;
-  for (i = 0; i < INTER_MODE_CONTEXTS; ++i)
+  for (i = 0; i < INTER_MODE_CONTEXTS; ++i) {
     for (j = 0; j < INTER_MODES - 1; ++j)
       av1_diff_update_prob(r, &fc->inter_mode_probs[i][j], ACCT_STR);
+#if CONFIG_DAALA_EC
+    av1_tree_to_cdf(av1_inter_mode_tree, fc->inter_mode_probs[i],
+                    fc->inter_mode_cdf[i]);
+#endif
+  }
 #endif
 }
 
@@ -204,6 +209,9 @@
   int i, j;
 
   update_mv_probs(ctx->joints, MV_JOINTS - 1, r);
+#if CONFIG_DAALA_EC
+  av1_tree_to_cdf(av1_mv_joint_tree, ctx->joints, ctx->joint_cdf);
+#endif
 
   for (i = 0; i < 2; ++i) {
     nmv_component *const comp_ctx = &ctx->comps[i];
@@ -211,13 +219,24 @@
     update_mv_probs(comp_ctx->classes, MV_CLASSES - 1, r);
     update_mv_probs(comp_ctx->class0, CLASS0_SIZE - 1, r);
     update_mv_probs(comp_ctx->bits, MV_OFFSET_BITS, r);
+#if CONFIG_DAALA_EC
+    av1_tree_to_cdf(av1_mv_class_tree, comp_ctx->classes, comp_ctx->class_cdf);
+#endif
   }
 
   for (i = 0; i < 2; ++i) {
     nmv_component *const comp_ctx = &ctx->comps[i];
-    for (j = 0; j < CLASS0_SIZE; ++j)
+    for (j = 0; j < CLASS0_SIZE; ++j) {
       update_mv_probs(comp_ctx->class0_fp[j], MV_FP_SIZE - 1, r);
+#if CONFIG_DAALA_EC
+      av1_tree_to_cdf(av1_mv_fp_tree, comp_ctx->class0_fp[j],
+                      comp_ctx->class0_fp_cdf[j]);
+#endif
+    }
     update_mv_probs(comp_ctx->fp, MV_FP_SIZE - 1, r);
+#if CONFIG_DAALA_EC
+    av1_tree_to_cdf(av1_mv_fp_tree, comp_ctx->fp, comp_ctx->fp_cdf);
+#endif
   }
 
   if (allow_hp) {
@@ -293,7 +312,7 @@
 #if CONFIG_VAR_TX
 static void decode_reconstruct_tx(AV1_COMMON *cm, MACROBLOCKD *const xd,
                                   aom_reader *r, MB_MODE_INFO *const mbmi,
-                                  int plane, BLOCK_SIZE plane_bsize, int block,
+                                  int plane, BLOCK_SIZE plane_bsize,
                                   int blk_row, int blk_col, TX_SIZE tx_size,
                                   int *eob_total) {
   const struct macroblockd_plane *const pd = &xd->plane[plane];
@@ -315,7 +334,8 @@
 
   if (tx_size == plane_tx_size) {
     PLANE_TYPE plane_type = (plane == 0) ? PLANE_TYPE_Y : PLANE_TYPE_UV;
-    TX_TYPE tx_type = get_tx_type(plane_type, xd, block, plane_tx_size);
+    int block_idx = (blk_row << 1) + blk_col;
+    TX_TYPE tx_type = get_tx_type(plane_type, xd, block_idx, plane_tx_size);
     const SCAN_ORDER *sc = get_scan(cm, plane_tx_size, tx_type, 1);
     int16_t max_scan_line = 0;
     const int eob =
@@ -336,13 +356,11 @@
     for (i = 0; i < 4; ++i) {
       const int offsetr = blk_row + ((i >> 1) << bsl);
       const int offsetc = blk_col + ((i & 0x01) << bsl);
-      int step = num_4x4_blocks_txsize_lookup[tx_size - 1];
 
       if (offsetr >= max_blocks_high || offsetc >= max_blocks_wide) continue;
 
-      decode_reconstruct_tx(cm, xd, r, mbmi, plane, plane_bsize,
-                            block + i * step, offsetr, offsetc, tx_size - 1,
-                            eob_total);
+      decode_reconstruct_tx(cm, xd, r, mbmi, plane, plane_bsize, offsetr,
+                            offsetc, tx_size - 1, eob_total);
     }
   }
 }
@@ -1146,11 +1164,12 @@
   const int bh = 1 << (bhl - 1);
   const int x_mis = AOMMIN(bw, cm->mi_cols - mi_col);
   const int y_mis = AOMMIN(bh, cm->mi_rows - mi_row);
+  MB_MODE_INFO *mbmi;
+
 #if CONFIG_ACCOUNTING
   aom_accounting_set_context(&pbi->accounting, mi_col, mi_row);
 #endif
 #if CONFIG_SUPERTX
-  MB_MODE_INFO *mbmi;
   if (supertx_enabled) {
     mbmi = set_mb_offsets(cm, xd, bsize, mi_row, mi_col, bw, bh, x_mis, y_mis);
   } else {
@@ -1162,8 +1181,8 @@
 #endif
   av1_read_mode_info(pbi, xd, supertx_enabled, mi_row, mi_col, r, x_mis, y_mis);
 #else
-  MB_MODE_INFO *mbmi = set_offsets(cm, xd, bsize, mi_row, mi_col, bw, bh, x_mis,
-                                   y_mis, bwl, bhl);
+  mbmi = set_offsets(cm, xd, bsize, mi_row, mi_col, bw, bh, x_mis, y_mis, bwl,
+                     bhl);
 #if CONFIG_EXT_PARTITION_TYPES
   xd->mi[0]->mbmi.partition = partition;
 #endif
@@ -1308,8 +1327,6 @@
         const TX_SIZE max_tx_size = max_txsize_lookup[plane_bsize];
         const int bw_var_tx = tx_size_high_unit[max_tx_size];
         const int bh_var_tx = tx_size_wide_unit[max_tx_size];
-        const int step = num_4x4_blocks_txsize_lookup[max_tx_size];
-        int block = 0;
 #if CONFIG_EXT_TX && CONFIG_RECT_TX
         if (is_rect_tx(mbmi->tx_size)) {
           const TX_SIZE tx_size =
@@ -1331,13 +1348,10 @@
                                                   plane, row, col, tx_size);
         } else {
 #endif
-          for (row = 0; row < num_4x4_h; row += bh_var_tx) {
-            for (col = 0; col < num_4x4_w; col += bw_var_tx) {
-              decode_reconstruct_tx(cm, xd, r, mbmi, plane, plane_bsize, block,
-                                    row, col, max_tx_size, &eobtotal);
-              block += step;
-            }
-          }
+          for (row = 0; row < num_4x4_h; row += bh_var_tx)
+            for (col = 0; col < num_4x4_w; col += bw_var_tx)
+              decode_reconstruct_tx(cm, xd, r, mbmi, plane, plane_bsize, row,
+                                    col, max_tx_size, &eobtotal);
 #if CONFIG_EXT_TX && CONFIG_RECT_TX
         }
 #endif
@@ -1838,7 +1852,7 @@
   if (bsize == BLOCK_64X64) {
     if (cm->dering_level != 0 && !sb_all_skip(cm, mi_row, mi_col)) {
       cm->mi_grid_visible[mi_row * cm->mi_stride + mi_col]->mbmi.dering_gain =
-          aom_read_literal(r, DERING_REFINEMENT_BITS);
+          aom_read_literal(r, DERING_REFINEMENT_BITS, ACCT_STR);
     } else {
       cm->mi_grid_visible[mi_row * cm->mi_stride + mi_col]->mbmi.dering_gain =
           0;
@@ -2772,7 +2786,9 @@
     pbi->allocated_tiles = n_tiles;
   }
 #if CONFIG_ACCOUNTING
-  aom_accounting_reset(&pbi->accounting);
+  if (pbi->acct_enabled) {
+    aom_accounting_reset(&pbi->accounting);
+  }
 #endif
   // Load all tile information into tile_data.
   for (tile_row = tile_rows_start; tile_row < tile_rows_end; ++tile_row) {
@@ -2797,7 +2813,11 @@
                           &td->bit_reader, pbi->decrypt_cb, pbi->decrypt_state);
 #endif
 #if CONFIG_ACCOUNTING
-      tile_data->bit_reader.accounting = &pbi->accounting;
+      if (pbi->acct_enabled) {
+        tile_data->bit_reader.accounting = &pbi->accounting;
+      } else {
+        tile_data->bit_reader.accounting = NULL;
+      }
 #endif
       av1_init_macroblockd(cm, &td->xd, td->dqcoeff);
 #if CONFIG_PALETTE
@@ -2818,8 +2838,10 @@
       const int col = inv_col_order ? tile_cols - 1 - tile_col : tile_col;
       TileData *const td = pbi->tile_data + tile_cols * row + col;
 #if CONFIG_ACCOUNTING
-      tile_data->bit_reader.accounting->last_tell_frac =
-          aom_reader_tell_frac(&tile_data->bit_reader);
+      if (pbi->acct_enabled) {
+        tile_data->bit_reader.accounting->last_tell_frac =
+            aom_reader_tell_frac(&tile_data->bit_reader);
+      }
 #endif
 
       av1_tile_set_col(&tile_info, cm, col);
@@ -3715,9 +3737,14 @@
 #endif
   }
 
-  for (j = 0; j < INTRA_MODES; j++)
+  for (j = 0; j < INTRA_MODES; j++) {
     for (i = 0; i < INTRA_MODES - 1; ++i)
       av1_diff_update_prob(&r, &fc->uv_mode_prob[j][i], ACCT_STR);
+#if CONFIG_DAALA_EC
+    av1_tree_to_cdf(av1_intra_mode_tree, fc->uv_mode_prob[j],
+                    fc->uv_mode_cdf[j]);
+#endif
+  }
 
 #if CONFIG_EXT_PARTITION_TYPES
   for (i = 0; i < PARTITION_TYPES - 1; ++i)
@@ -3744,10 +3771,18 @@
 
   if (frame_is_intra_only(cm)) {
     av1_copy(cm->kf_y_prob, av1_kf_y_mode_prob);
+#if CONFIG_DAALA_EC
+    av1_copy(cm->kf_y_cdf, av1_kf_y_mode_cdf);
+#endif
     for (k = 0; k < INTRA_MODES; k++)
-      for (j = 0; j < INTRA_MODES; j++)
+      for (j = 0; j < INTRA_MODES; j++) {
         for (i = 0; i < INTRA_MODES - 1; ++i)
           av1_diff_update_prob(&r, &cm->kf_y_prob[k][j][i], ACCT_STR);
+#if CONFIG_DAALA_EC
+        av1_tree_to_cdf(av1_intra_mode_tree, cm->kf_y_prob[k][j],
+                        cm->kf_y_cdf[k][j]);
+#endif
+      }
   } else {
 #if !CONFIG_REF_MV
     nmv_context *const nmvc = &fc->nmvc;
@@ -3799,9 +3834,14 @@
 
     read_frame_reference_mode_probs(cm, &r);
 
-    for (j = 0; j < BLOCK_SIZE_GROUPS; j++)
+    for (j = 0; j < BLOCK_SIZE_GROUPS; j++) {
       for (i = 0; i < INTRA_MODES - 1; ++i)
         av1_diff_update_prob(&r, &fc->y_mode_prob[j][i], ACCT_STR);
+#if CONFIG_DAALA_EC
+      av1_tree_to_cdf(av1_intra_mode_tree, fc->y_mode_prob[j],
+                      fc->y_mode_cdf[j]);
+#endif
+    }
 
 #if CONFIG_REF_MV
     for (i = 0; i < NMV_CONTEXTS; ++i)
diff --git a/av1/decoder/decodemv.c b/av1/decoder/decodemv.c
index 8260f9d..3993e72 100644
--- a/av1/decoder/decodemv.c
+++ b/av1/decoder/decodemv.c
@@ -26,7 +26,6 @@
 #include "aom_dsp/aom_dsp_common.h"
 
 #define ACCT_STR __func__
-
 #if CONFIG_EXT_INTRA || CONFIG_FILTER_INTRA || CONFIG_PALETTE
 static INLINE int read_uniform(aom_reader *r, int n) {
   int l = get_unsigned_bits(n);
@@ -42,9 +41,16 @@
 }
 #endif  // CONFIG_EXT_INTRA || CONFIG_FILTER_INTRA || CONFIG_PALETTE
 
+#if CONFIG_DAALA_EC  // read a CDF-coded symbol, map via av1_intra_mode_inv
+static PREDICTION_MODE read_intra_mode(aom_reader *r, const aom_cdf_prob *cdf) {
+  return (PREDICTION_MODE)
+      av1_intra_mode_inv[aom_read_symbol(r, cdf, INTRA_MODES, ACCT_STR)];
+}
+#else  // !CONFIG_DAALA_EC: read the mode with aom_read_tree
 static PREDICTION_MODE read_intra_mode(aom_reader *r, const aom_prob *p) {
   return (PREDICTION_MODE)aom_read_tree(r, av1_intra_mode_tree, p, ACCT_STR);
 }
+#endif  // CONFIG_DAALA_EC
 
 #if CONFIG_DELTA_Q
 static int read_delta_qindex(AV1_COMMON *cm, MACROBLOCKD *xd, aom_reader *r,
@@ -85,7 +91,11 @@
 static PREDICTION_MODE read_intra_mode_y(AV1_COMMON *cm, MACROBLOCKD *xd,
                                          aom_reader *r, int size_group) {
   const PREDICTION_MODE y_mode =
+#if CONFIG_DAALA_EC
+      read_intra_mode(r, cm->fc->y_mode_cdf[size_group]);
+#else
       read_intra_mode(r, cm->fc->y_mode_prob[size_group]);
+#endif
   FRAME_COUNTS *counts = xd->counts;
   if (counts) ++counts->y_mode[size_group][y_mode];
   return y_mode;
@@ -95,7 +105,11 @@
                                           aom_reader *r,
                                           PREDICTION_MODE y_mode) {
   const PREDICTION_MODE uv_mode =
+#if CONFIG_DAALA_EC
+      read_intra_mode(r, cm->fc->uv_mode_cdf[y_mode]);
+#else
       read_intra_mode(r, cm->fc->uv_mode_prob[y_mode]);
+#endif
   FRAME_COUNTS *counts = xd->counts;
   if (counts) ++counts->uv_mode[y_mode][uv_mode];
   return uv_mode;
@@ -176,8 +190,13 @@
   // Invalid prediction mode.
   assert(0);
 #else
+#if CONFIG_DAALA_EC
+  const int mode = av1_inter_mode_inv[aom_read_symbol(
+      r, cm->fc->inter_mode_cdf[ctx], INTER_MODES, ACCT_STR)];
+#else
   const int mode = aom_read_tree(r, av1_inter_mode_tree,
                                  cm->fc->inter_mode_probs[ctx], ACCT_STR);
+#endif
   FRAME_COUNTS *counts = xd->counts;
   if (counts) ++counts->inter_mode[ctx][mode];
 
@@ -257,8 +276,8 @@
 #if CONFIG_VAR_TX
 static void read_tx_size_vartx(AV1_COMMON *cm, MACROBLOCKD *xd,
                                MB_MODE_INFO *mbmi, FRAME_COUNTS *counts,
-                               TX_SIZE tx_size, int blk_row, int blk_col,
-                               aom_reader *r) {
+                               TX_SIZE tx_size, int depth, int blk_row,
+                               int blk_col, aom_reader *r) {
   int is_split = 0;
   const int tx_row = blk_row >> 1;
   const int tx_col = blk_col >> 1;
@@ -275,6 +294,19 @@
 
   if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide) return;
 
+  if (depth == MAX_VARTX_DEPTH) {
+    int idx, idy;
+    inter_tx_size[0][0] = tx_size;
+    for (idy = 0; idy < num_4x4_blocks_high_txsize_lookup[tx_size] / 2; ++idy)
+      for (idx = 0; idx < num_4x4_blocks_wide_txsize_lookup[tx_size] / 2; ++idx)
+        inter_tx_size[idy][idx] = tx_size;
+    mbmi->tx_size = tx_size;
+    if (counts) ++counts->txfm_partition[ctx][0];
+    txfm_partition_update(xd->above_txfm_context + tx_col,
+                          xd->left_txfm_context + tx_row, tx_size);
+    return;
+  }
+
   is_split = aom_read(r, cm->fc->txfm_partition_prob[ctx], ACCT_STR);
 
   if (is_split) {
@@ -297,8 +329,8 @@
     for (i = 0; i < 4; ++i) {
       int offsetr = blk_row + ((i >> 1) << bsl);
       int offsetc = blk_col + ((i & 0x01) << bsl);
-      read_tx_size_vartx(cm, xd, mbmi, counts, tx_size - 1, offsetr, offsetc,
-                         r);
+      read_tx_size_vartx(cm, xd, mbmi, counts, tx_size - 1, depth + 1, offsetr,
+                         offsetc, r);
     }
   } else {
     int idx, idy;
@@ -657,24 +689,48 @@
     case BLOCK_4X4:
       for (i = 0; i < 4; ++i)
         mi->bmi[i].as_mode =
+#if CONFIG_DAALA_EC
+            read_intra_mode(r, get_y_mode_cdf(cm, mi, above_mi, left_mi, i));
+#else
             read_intra_mode(r, get_y_mode_probs(cm, mi, above_mi, left_mi, i));
+#endif
       mbmi->mode = mi->bmi[3].as_mode;
       break;
     case BLOCK_4X8:
       mi->bmi[0].as_mode = mi->bmi[2].as_mode =
+#if CONFIG_DAALA_EC
+          read_intra_mode(r, get_y_mode_cdf(cm, mi, above_mi, left_mi, 0));
+#else
           read_intra_mode(r, get_y_mode_probs(cm, mi, above_mi, left_mi, 0));
+#endif
       mi->bmi[1].as_mode = mi->bmi[3].as_mode = mbmi->mode =
+#if CONFIG_DAALA_EC
+          read_intra_mode(r, get_y_mode_cdf(cm, mi, above_mi, left_mi, 1));
+#else
           read_intra_mode(r, get_y_mode_probs(cm, mi, above_mi, left_mi, 1));
+#endif
       break;
     case BLOCK_8X4:
       mi->bmi[0].as_mode = mi->bmi[1].as_mode =
+#if CONFIG_DAALA_EC
+          read_intra_mode(r, get_y_mode_cdf(cm, mi, above_mi, left_mi, 0));
+#else
           read_intra_mode(r, get_y_mode_probs(cm, mi, above_mi, left_mi, 0));
+#endif
       mi->bmi[2].as_mode = mi->bmi[3].as_mode = mbmi->mode =
+#if CONFIG_DAALA_EC
+          read_intra_mode(r, get_y_mode_cdf(cm, mi, above_mi, left_mi, 2));
+#else
           read_intra_mode(r, get_y_mode_probs(cm, mi, above_mi, left_mi, 2));
+#endif
       break;
     default:
       mbmi->mode =
+#if CONFIG_DAALA_EC
+          read_intra_mode(r, get_y_mode_cdf(cm, mi, above_mi, left_mi, 0));
+#else
           read_intra_mode(r, get_y_mode_probs(cm, mi, above_mi, left_mi, 0));
+#endif
   }
 
   mbmi->uv_mode = read_intra_mode_uv(cm, xd, r, mbmi->mode);
@@ -741,7 +797,11 @@
   int mag, d, fr, hp;
   const int sign = aom_read(r, mvcomp->sign, ACCT_STR);
   const int mv_class =
+#if CONFIG_DAALA_EC
+      aom_read_symbol(r, mvcomp->class_cdf, MV_CLASSES, ACCT_STR);
+#else
       aom_read_tree(r, av1_mv_class_tree, mvcomp->classes, ACCT_STR);
+#endif
   const int class0 = mv_class == MV_CLASS_0;
 
   // Integer part
@@ -757,9 +817,14 @@
     mag = CLASS0_SIZE << (mv_class + 2);
   }
 
-  // Fractional part
+// Fractional part
+#if CONFIG_DAALA_EC
+  fr = aom_read_symbol(r, class0 ? mvcomp->class0_fp_cdf[d] : mvcomp->fp_cdf,
+                       MV_FP_SIZE, ACCT_STR);
+#else
   fr = aom_read_tree(r, av1_mv_fp_tree,
                      class0 ? mvcomp->class0_fp[d] : mvcomp->fp, ACCT_STR);
+#endif
 
   // High precision part (if hp is not used, the default value of the hp is 1)
   hp = usehp ? aom_read(r, class0 ? mvcomp->class0_hp : mvcomp->hp, ACCT_STR)
@@ -777,7 +842,11 @@
   const int use_hp = allow_hp && av1_use_mv_hp(ref);
   MV diff = { 0, 0 };
   joint_type =
+#if CONFIG_DAALA_EC
+      (MV_JOINT_TYPE)aom_read_symbol(r, ctx->joint_cdf, MV_JOINTS, ACCT_STR);
+#else
       (MV_JOINT_TYPE)aom_read_tree(r, av1_mv_joint_tree, ctx->joints, ACCT_STR);
+#endif
 
   if (mv_joint_vertical(joint_type))
     diff.row = read_mv_component(r, &ctx->comps[0], use_hp);
@@ -1736,12 +1805,11 @@
       const int width = num_4x4_blocks_wide_lookup[bsize];
       const int height = num_4x4_blocks_high_lookup[bsize];
       int idx, idy;
-      int tx_size_cat = inter_tx_size_cat_lookup[bsize];
 #if CONFIG_EXT_TX && CONFIG_RECT_TX
       int is_rect_tx_allowed = inter_block && is_rect_tx_allowed_bsize(bsize) &&
                                !xd->lossless[mbmi->segment_id];
       int use_rect_tx = 0;
-
+      int tx_size_cat = inter_tx_size_cat_lookup[bsize];
       if (is_rect_tx_allowed) {
         use_rect_tx = aom_read(r, cm->fc->rect_tx_prob[tx_size_cat], ACCT_STR);
         if (xd->counts) {
@@ -1756,16 +1824,11 @@
 #endif  // CONFIG_EXT_TX && CONFIG_RECT_TX
         for (idy = 0; idy < height; idy += bs)
           for (idx = 0; idx < width; idx += bs)
-            read_tx_size_vartx(cm, xd, mbmi, xd->counts, max_tx_size, idy, idx,
-                               r);
+            read_tx_size_vartx(cm, xd, mbmi, xd->counts, max_tx_size,
+                               height != width, idy, idx, r);
 #if CONFIG_EXT_TX && CONFIG_RECT_TX
       }
 #endif
-      if (xd->counts) {
-        const int ctx = get_tx_size_context(xd);
-        ++xd->counts->tx_size[tx_size_cat][ctx]
-                             [txsize_sqr_up_map[mbmi->tx_size]];
-      }
     } else {
       if (inter_block)
         mbmi->tx_size = read_tx_size_inter(cm, xd, !mbmi->skip, r);
diff --git a/av1/decoder/decoder.c b/av1/decoder/decoder.c
index 9952650..c3099ba 100644
--- a/av1/decoder/decoder.c
+++ b/av1/decoder/decoder.c
@@ -52,6 +52,10 @@
                           SWITCHABLE_FILTERS, av1_switchable_interp_tree);
     av1_indices_from_tree(av1_ext_tx_ind, av1_ext_tx_inv, TX_TYPES,
                           av1_ext_tx_tree);
+    av1_indices_from_tree(av1_intra_mode_ind, av1_intra_mode_inv, INTRA_MODES,
+                          av1_intra_mode_tree);
+    av1_indices_from_tree(av1_inter_mode_ind, av1_inter_mode_inv, INTER_MODES,
+                          av1_inter_mode_tree);
 #endif
   }
 }
@@ -95,10 +99,13 @@
 
   cm->error.setjmp = 1;
 
-  CHECK_MEM_ERROR(cm, cm->fc, (FRAME_CONTEXT *)aom_calloc(1, sizeof(*cm->fc)));
-  CHECK_MEM_ERROR(
-      cm, cm->frame_contexts,
-      (FRAME_CONTEXT *)aom_calloc(FRAME_CONTEXTS, sizeof(*cm->frame_contexts)));
+  CHECK_MEM_ERROR(cm, cm->fc,
+                  (FRAME_CONTEXT *)aom_memalign(32, sizeof(*cm->fc)));
+  CHECK_MEM_ERROR(cm, cm->frame_contexts,
+                  (FRAME_CONTEXT *)aom_memalign(
+                      32, FRAME_CONTEXTS * sizeof(*cm->frame_contexts)));
+  memset(cm->fc, 0, sizeof(*cm->fc));
+  memset(cm->frame_contexts, 0, FRAME_CONTEXTS * sizeof(*cm->frame_contexts));
 
   pbi->need_resync = 1;
   once(initialize_dec);
@@ -127,6 +134,7 @@
   av1_loop_restoration_precal();
 #endif  // CONFIG_LOOP_RESTORATION
 #if CONFIG_ACCOUNTING
+  pbi->acct_enabled = 1;
   aom_accounting_init(&pbi->accounting);
 #endif
 
diff --git a/av1/decoder/decoder.h b/av1/decoder/decoder.h
index 7575260..fd68d13 100644
--- a/av1/decoder/decoder.h
+++ b/av1/decoder/decoder.h
@@ -104,6 +104,7 @@
   int dec_tile_row, dec_tile_col;
 #endif  // CONFIG_EXT_TILE
 #if CONFIG_ACCOUNTING
+  int acct_enabled;
   Accounting accounting;
 #endif
 
diff --git a/av1/encoder/bitstream.c b/av1/encoder/bitstream.c
index e0fb7ec..032ae73 100644
--- a/av1/encoder/bitstream.c
+++ b/av1/encoder/bitstream.c
@@ -192,13 +192,19 @@
       structure. */
   av1_indices_from_tree(av1_ext_tx_ind, av1_ext_tx_inv, TX_TYPES,
                         av1_ext_tx_tree);
+  av1_indices_from_tree(av1_intra_mode_ind, av1_intra_mode_inv, INTRA_MODES,
+                        av1_intra_mode_tree);
+  av1_indices_from_tree(av1_inter_mode_ind, av1_inter_mode_inv, INTER_MODES,
+                        av1_inter_mode_tree);
 #endif
 }
 
+#if !CONFIG_DAALA_EC  // tree-token writer, compiled only without Daala EC
 static void write_intra_mode(aom_writer *w, PREDICTION_MODE mode,
                              const aom_prob *probs) {
   av1_write_token(w, av1_intra_mode_tree, probs, &intra_mode_encodings[mode]);
 }
+#endif  // !CONFIG_DAALA_EC
 
 #if CONFIG_EXT_INTER
 static void write_interintra_mode(aom_writer *w, INTERINTRA_MODE mode,
@@ -252,10 +258,17 @@
     }
   }
 #else
-  const aom_prob *const inter_probs = cm->fc->inter_mode_probs[mode_ctx];
   assert(is_inter_mode(mode));
-  av1_write_token(w, av1_inter_mode_tree, inter_probs,
-                  &inter_mode_encodings[INTER_OFFSET(mode)]);
+#if CONFIG_DAALA_EC
+  aom_write_symbol(w, av1_inter_mode_ind[INTER_OFFSET(mode)],
+                   cm->fc->inter_mode_cdf[mode_ctx], INTER_MODES);
+#else
+  {
+    const aom_prob *const inter_probs = cm->fc->inter_mode_probs[mode_ctx];
+    av1_write_token(w, av1_inter_mode_tree, inter_probs,
+                    &inter_mode_encodings[INTER_OFFSET(mode)]);
+  }
+#endif
 #endif
 }
 
@@ -352,7 +365,8 @@
 #if CONFIG_VAR_TX
 static void write_tx_size_vartx(const AV1_COMMON *cm, const MACROBLOCKD *xd,
                                 const MB_MODE_INFO *mbmi, TX_SIZE tx_size,
-                                int blk_row, int blk_col, aom_writer *w) {
+                                int depth, int blk_row, int blk_col,
+                                aom_writer *w) {
   const int tx_row = blk_row >> 1;
   const int tx_col = blk_col >> 1;
   int max_blocks_high = num_4x4_blocks_high_lookup[mbmi->sb_type];
@@ -365,6 +379,12 @@
 
   if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide) return;
 
+  if (depth == MAX_VARTX_DEPTH) {
+    txfm_partition_update(xd->above_txfm_context + tx_col,
+                          xd->left_txfm_context + tx_row, tx_size);
+    return;
+  }
+
   if (tx_size == mbmi->inter_tx_size[tx_row][tx_col]) {
     aom_write(w, 0, cm->fc->txfm_partition_prob[ctx]);
     txfm_partition_update(xd->above_txfm_context + tx_col,
@@ -386,7 +406,8 @@
     for (i = 0; i < 4; ++i) {
       int offsetr = blk_row + ((i >> 1) << bsl);
       int offsetc = blk_col + ((i & 0x01) << bsl);
-      write_tx_size_vartx(cm, xd, mbmi, tx_size - 1, offsetr, offsetc, w);
+      write_tx_size_vartx(cm, xd, mbmi, tx_size - 1, depth + 1, offsetr,
+                          offsetc, w);
     }
   }
 }
@@ -1216,7 +1237,8 @@
 #endif  // CONFIG_EXT_TX && CONFIG_RECT_TX
         for (idy = 0; idy < height; idy += bs)
           for (idx = 0; idx < width; idx += bs)
-            write_tx_size_vartx(cm, xd, mbmi, max_tx_size, idy, idx, w);
+            write_tx_size_vartx(cm, xd, mbmi, max_tx_size, height != width, idy,
+                                idx, w);
 #if CONFIG_EXT_TX && CONFIG_RECT_TX
       }
 #endif  // CONFIG_EXT_TX && CONFIG_RECT_TX
@@ -1233,7 +1255,13 @@
 
   if (!is_inter) {
     if (bsize >= BLOCK_8X8) {
+#if CONFIG_DAALA_EC
+      aom_write_symbol(w, av1_intra_mode_ind[mode],
+                       cm->fc->y_mode_cdf[size_group_lookup[bsize]],
+                       INTRA_MODES);
+#else
       write_intra_mode(w, mode, cm->fc->y_mode_prob[size_group_lookup[bsize]]);
+#endif
     } else {
       int idx, idy;
       const int num_4x4_w = num_4x4_blocks_wide_lookup[bsize];
@@ -1241,11 +1269,21 @@
       for (idy = 0; idy < 2; idy += num_4x4_h) {
         for (idx = 0; idx < 2; idx += num_4x4_w) {
           const PREDICTION_MODE b_mode = mi->bmi[idy * 2 + idx].as_mode;
+#if CONFIG_DAALA_EC
+          aom_write_symbol(w, av1_intra_mode_ind[b_mode], cm->fc->y_mode_cdf[0],
+                           INTRA_MODES);
+#else
           write_intra_mode(w, b_mode, cm->fc->y_mode_prob[0]);
+#endif
         }
       }
     }
+#if CONFIG_DAALA_EC
+    aom_write_symbol(w, av1_intra_mode_ind[mbmi->uv_mode],
+                     cm->fc->uv_mode_cdf[mode], INTRA_MODES);
+#else
     write_intra_mode(w, mbmi->uv_mode, cm->fc->uv_mode_prob[mode]);
+#endif
 #if CONFIG_EXT_INTRA
     write_intra_angle_info(cm, xd, w);
 #endif  // CONFIG_EXT_INTRA
@@ -1622,8 +1660,13 @@
     write_selected_tx_size(cm, xd, w);
 
   if (bsize >= BLOCK_8X8) {
+#if CONFIG_DAALA_EC
+    aom_write_symbol(w, av1_intra_mode_ind[mbmi->mode],
+                     get_y_mode_cdf(cm, mi, above_mi, left_mi, 0), INTRA_MODES);
+#else
     write_intra_mode(w, mbmi->mode,
                      get_y_mode_probs(cm, mi, above_mi, left_mi, 0));
+#endif
   } else {
     const int num_4x4_w = num_4x4_blocks_wide_lookup[bsize];
     const int num_4x4_h = num_4x4_blocks_high_lookup[bsize];
@@ -1632,13 +1675,23 @@
     for (idy = 0; idy < 2; idy += num_4x4_h) {
       for (idx = 0; idx < 2; idx += num_4x4_w) {
         const int block = idy * 2 + idx;
+#if CONFIG_DAALA_EC
+        aom_write_symbol(w, av1_intra_mode_ind[mi->bmi[block].as_mode],
+                         get_y_mode_cdf(cm, mi, above_mi, left_mi, block),
+                         INTRA_MODES);
+#else
         write_intra_mode(w, mi->bmi[block].as_mode,
                          get_y_mode_probs(cm, mi, above_mi, left_mi, block));
+#endif
       }
     }
   }
-
+#if CONFIG_DAALA_EC
+  aom_write_symbol(w, av1_intra_mode_ind[mbmi->uv_mode],
+                   cm->fc->uv_mode_cdf[mbmi->mode], INTRA_MODES);
+#else
   write_intra_mode(w, mbmi->uv_mode, cm->fc->uv_mode_prob[mbmi->mode]);
+#endif
 #if CONFIG_EXT_INTRA
   write_intra_angle_info(cm, xd, w);
 #endif  // CONFIG_EXT_INTRA
@@ -3592,9 +3645,14 @@
 #endif
   update_seg_probs(cpi, header_bc);
 
-  for (i = 0; i < INTRA_MODES; ++i)
+  for (i = 0; i < INTRA_MODES; ++i) {
     prob_diff_update(av1_intra_mode_tree, fc->uv_mode_prob[i],
                      counts->uv_mode[i], INTRA_MODES, header_bc);
+#if CONFIG_DAALA_EC
+    av1_tree_to_cdf(av1_intra_mode_tree, fc->uv_mode_prob[i],
+                    fc->uv_mode_cdf[i]);
+#endif
+  }
 
 #if CONFIG_EXT_PARTITION_TYPES
   prob_diff_update(av1_partition_tree, fc->partition_prob[0],
@@ -3621,17 +3679,30 @@
 
   if (frame_is_intra_only(cm)) {
     av1_copy(cm->kf_y_prob, av1_kf_y_mode_prob);
+#if CONFIG_DAALA_EC
+    av1_copy(cm->kf_y_cdf, av1_kf_y_mode_cdf);
+#endif
     for (i = 0; i < INTRA_MODES; ++i)
-      for (j = 0; j < INTRA_MODES; ++j)
+      for (j = 0; j < INTRA_MODES; ++j) {
         prob_diff_update(av1_intra_mode_tree, cm->kf_y_prob[i][j],
                          counts->kf_y_mode[i][j], INTRA_MODES, header_bc);
+#if CONFIG_DAALA_EC
+        av1_tree_to_cdf(av1_intra_mode_tree, cm->kf_y_prob[i][j],
+                        cm->kf_y_cdf[i][j]);
+#endif
+      }
   } else {
 #if CONFIG_REF_MV
     update_inter_mode_probs(cm, header_bc, counts);
 #else
-    for (i = 0; i < INTER_MODE_CONTEXTS; ++i)
+    for (i = 0; i < INTER_MODE_CONTEXTS; ++i) {
       prob_diff_update(av1_inter_mode_tree, cm->fc->inter_mode_probs[i],
                        counts->inter_mode[i], INTER_MODES, header_bc);
+#if CONFIG_DAALA_EC
+      av1_tree_to_cdf(av1_inter_mode_tree, cm->fc->inter_mode_probs[i],
+                      cm->fc->inter_mode_cdf[i]);
+#endif
+    }
 #endif
 
 #if CONFIG_EXT_INTER
@@ -3713,9 +3784,14 @@
       }
     }
 
-    for (i = 0; i < BLOCK_SIZE_GROUPS; ++i)
+    for (i = 0; i < BLOCK_SIZE_GROUPS; ++i) {
       prob_diff_update(av1_intra_mode_tree, cm->fc->y_mode_prob[i],
                        counts->y_mode[i], INTRA_MODES, header_bc);
+#if CONFIG_DAALA_EC
+      av1_tree_to_cdf(av1_intra_mode_tree, cm->fc->y_mode_prob[i],
+                      cm->fc->y_mode_cdf[i]);
+#endif
+    }
 
     av1_write_nmv_probs(cm, cm->allow_high_precision_mv, header_bc,
 #if CONFIG_REF_MV
@@ -3723,6 +3799,10 @@
 #else
                         &counts->mv);
 #endif
+#if CONFIG_DAALA_EC
+    av1_tree_to_cdf(av1_mv_joint_tree, cm->fc->nmvc.joints,
+                    cm->fc->nmvc.joint_cdf);
+#endif
     update_ext_tx_probs(cm, header_bc);
 #if CONFIG_SUPERTX
     if (!xd->lossless[0]) update_supertx_probs(cm, header_bc);
diff --git a/av1/encoder/block.h b/av1/encoder/block.h
index 310325e..dcdf97e 100644
--- a/av1/encoder/block.h
+++ b/av1/encoder/block.h
@@ -81,7 +81,7 @@
   MB_MODE_INFO_EXT *mbmi_ext;
   int skip_block;
   int select_tx_size;
-  int q_index;
+  int qindex;
 
   // The equivalent error at the current rdmult of one whole bit (not one
   // bitcost unit).
@@ -98,6 +98,10 @@
   int *m_search_count_ptr;
   int *ex_search_count_ptr;
 
+#if CONFIG_VAR_TX
+  unsigned int txb_split_count;
+#endif
+
   // These are set to their default values at the beginning, and then adjusted
   // further in the encoding process.
   BLOCK_SIZE min_partition_size;
diff --git a/av1/encoder/encodeframe.c b/av1/encoder/encodeframe.c
index 9dc6a2e..acca6f1 100644
--- a/av1/encoder/encodeframe.c
+++ b/av1/encoder/encodeframe.c
@@ -852,7 +852,6 @@
 
     if (cyclic_refresh_segment_id_boosted(segment_id)) {
       int q = av1_get_qindex(&cm->seg, segment_id, cm->base_qindex);
-      assert(q == xd->qindex[segment_id]);
       set_vbp_thresholds(cpi, thresholds, q);
     }
   }
@@ -1606,7 +1605,6 @@
   av1_init_plane_quantizers(cpi, x, segment_id);
   aom_clear_system_state();
   segment_qindex = av1_get_qindex(&cm->seg, segment_id, cm->base_qindex);
-  assert(segment_qindex == x->e_mbd.qindex[segment_id]);
   return av1_compute_rd_mult(cpi, segment_qindex + cm->y_dc_delta_q);
 }
 
@@ -4675,7 +4673,6 @@
                            : cm->base_qindex;
     xd->lossless[i] = qindex == 0 && cm->y_dc_delta_q == 0 &&
                       cm->uv_dc_delta_q == 0 && cm->uv_ac_delta_q == 0;
-    xd->qindex[i] = qindex;
   }
 
   if (!cm->seg.enabled && xd->lossless[0]) x->optimize = 0;
@@ -4720,6 +4717,7 @@
       cm->use_prev_frame_mvs ? cm->prev_mip + cm->mi_stride + 1 : NULL;
 
 #if CONFIG_VAR_TX
+  x->txb_split_count = 0;
 #if CONFIG_REF_MV
   av1_zero(x->blk_skip_drl);
 #endif
@@ -4853,7 +4851,10 @@
       }
     }
 
-#if !CONFIG_VAR_TX
+#if CONFIG_VAR_TX
+    if (cm->tx_mode == TX_MODE_SELECT && cpi->td.mb.txb_split_count == 0)
+      cm->tx_mode = ALLOW_32X32;
+#else
     if (cm->tx_mode == TX_MODE_SELECT) {
       int count4x4 = 0;
       int count8x8_lp = 0, count8x8_8x8p = 0;
@@ -4956,8 +4957,9 @@
 }
 
 #if CONFIG_VAR_TX
-static void update_txfm_count(MACROBLOCKD *xd, FRAME_COUNTS *counts,
-                              TX_SIZE tx_size, int blk_row, int blk_col) {
+static void update_txfm_count(MACROBLOCK *x, MACROBLOCKD *xd,
+                              FRAME_COUNTS *counts, TX_SIZE tx_size,
+                              int blk_row, int blk_col) {
   MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
   const int tx_row = blk_row >> 1;
   const int tx_col = blk_col >> 1;
@@ -4982,6 +4984,7 @@
     int bh = num_4x4_blocks_high_lookup[bsize];
     int i;
     ++counts->txfm_partition[ctx][1];
+    ++x->txb_split_count;
 
     if (tx_size == TX_8X8) {
       mbmi->inter_tx_size[tx_row][tx_col] = TX_4X4;
@@ -4994,16 +4997,16 @@
     for (i = 0; i < 4; ++i) {
       int offsetr = (i >> 1) * bh / 2;
       int offsetc = (i & 0x01) * bh / 2;
-      update_txfm_count(xd, counts, tx_size - 1, blk_row + offsetr,
+      update_txfm_count(x, xd, counts, tx_size - 1, blk_row + offsetr,
                         blk_col + offsetc);
     }
   }
 }
 
-static void tx_partition_count_update(const AV1_COMMON *const cm,
-                                      MACROBLOCKD *xd, BLOCK_SIZE plane_bsize,
-                                      int mi_row, int mi_col,
-                                      FRAME_COUNTS *td_counts) {
+static void tx_partition_count_update(const AV1_COMMON *const cm, MACROBLOCK *x,
+                                      BLOCK_SIZE plane_bsize, int mi_row,
+                                      int mi_col, FRAME_COUNTS *td_counts) {
+  MACROBLOCKD *xd = &x->e_mbd;
   const int mi_width = num_4x4_blocks_wide_lookup[plane_bsize];
   const int mi_height = num_4x4_blocks_high_lookup[plane_bsize];
   TX_SIZE max_tx_size = max_txsize_lookup[plane_bsize];
@@ -5017,7 +5020,7 @@
 
   for (idy = 0; idy < mi_height; idy += bh)
     for (idx = 0; idx < mi_width; idx += bh)
-      update_txfm_count(xd, td_counts, max_tx_size, idy, idx);
+      update_txfm_count(x, xd, td_counts, max_tx_size, idy, idx);
 }
 
 static void set_txfm_context(MACROBLOCKD *xd, TX_SIZE tx_size, int blk_row,
@@ -5259,13 +5262,19 @@
       }
       if (!is_rect_tx_allowed(xd, mbmi) || !is_rect_tx(mbmi->tx_size)) {
 #endif  // CONFIG_EXT_TX && CONFIG_RECT_TX
-        if (is_inter)
-          tx_partition_count_update(cm, xd, bsize, mi_row, mi_col, td->counts);
+        if (is_inter) {
+          tx_partition_count_update(cm, x, bsize, mi_row, mi_col, td->counts);
+        } else {
+          ++td->counts->tx_size[tx_size_cat][tx_size_ctx][coded_tx_size];
+          if (mbmi->tx_size != max_txsize_lookup[bsize]) ++x->txb_split_count;
+        }
 #if CONFIG_EXT_TX && CONFIG_RECT_TX
       }
 #endif
 #endif
+#if !CONFIG_VAR_TX
       ++td->counts->tx_size[tx_size_cat][tx_size_ctx][coded_tx_size];
+#endif
     } else {
       int i, j;
       TX_SIZE tx_size;
@@ -5288,7 +5297,12 @@
         for (i = 0; i < mi_width; i++)
           if (mi_col + i < cm->mi_cols && mi_row + j < cm->mi_rows)
             mi_8x8[mis * j + i]->mbmi.tx_size = tx_size;
+
+#if CONFIG_VAR_TX
+      if (mbmi->tx_size != max_txsize_lookup[bsize]) ++x->txb_split_count;
+#endif
     }
+
     ++td->counts->tx_size_totals[txsize_sqr_map[mbmi->tx_size]];
     ++td->counts
           ->tx_size_totals[txsize_sqr_map[get_uv_tx_size(mbmi, &xd->plane[1])]];
diff --git a/av1/encoder/encodemb.c b/av1/encoder/encodemb.c
index fe975f0..6b7e72c 100644
--- a/av1/encoder/encodemb.c
+++ b/av1/encoder/encodemb.c
@@ -95,8 +95,7 @@
 #endif
   const int shift = get_tx_scale(xd, tx_type, tx_size);
 #if CONFIG_NEW_QUANT
-  int dq = get_dq_profile_from_ctx(xd->qindex[xd->mi[0]->mbmi.segment_id], ctx,
-                                   ref, plane_type);
+  int dq = get_dq_profile_from_ctx(mb->qindex, ctx, ref, plane_type);
   const dequant_val_type_nuq *dequant_val = pd->dequant_val_nuq[dq];
 #else
   const int dq_step[2] = { dequant_ptr[0] >> shift, dequant_ptr[1] >> shift };
@@ -123,8 +122,7 @@
   int shortcut = 0;
   int next_shortcut = 0;
 
-  assert((xd->qindex[xd->mi[0]->mbmi.segment_id] == 0) ^
-         (xd->lossless[xd->mi[0]->mbmi.segment_id] == 0));
+  assert((mb->qindex == 0) ^ (xd->lossless[xd->mi[0]->mbmi.segment_id] == 0));
 
   token_costs += band;
 
@@ -518,8 +516,7 @@
   tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block);
   tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block);
   tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
-  int dq = get_dq_profile_from_ctx(xd->qindex[xd->mi[0]->mbmi.segment_id], ctx,
-                                   is_inter, plane_type);
+  int dq = get_dq_profile_from_ctx(x->qindex, ctx, is_inter, plane_type);
   uint16_t *const eob = &p->eobs[block];
   const int diff_stride = 4 * num_4x4_blocks_wide_lookup[plane_bsize];
   const int16_t *src_diff;
@@ -527,8 +524,7 @@
 
   FWD_TXFM_PARAM fwd_txfm_param;
 
-  assert((xd->qindex[xd->mi[0]->mbmi.segment_id] == 0) ^
-         (xd->lossless[xd->mi[0]->mbmi.segment_id] == 0));
+  assert((x->qindex == 0) ^ (xd->lossless[xd->mi[0]->mbmi.segment_id] == 0));
 
   fwd_txfm_param.tx_type = tx_type;
   fwd_txfm_param.tx_size = tx_size;
@@ -588,8 +584,7 @@
   PLANE_TYPE plane_type = (plane == 0) ? PLANE_TYPE_Y : PLANE_TYPE_UV;
   TX_TYPE tx_type = get_tx_type(plane_type, xd, block, tx_size);
   const SCAN_ORDER *const scan_order = get_scan(cm, tx_size, tx_type, is_inter);
-  int dq = get_dq_profile_from_ctx(xd->qindex[xd->mi[0]->mbmi.segment_id], ctx,
-                                   is_inter, plane_type);
+  int dq = get_dq_profile_from_ctx(x->qindex, ctx, is_inter, plane_type);
   tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block);
   tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block);
   tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
@@ -600,8 +595,7 @@
 
   FWD_TXFM_PARAM fwd_txfm_param;
 
-  assert((xd->qindex[xd->mi[0]->mbmi.segment_id] == 0) ^
-         (xd->lossless[xd->mi[0]->mbmi.segment_id] == 0));
+  assert((x->qindex == 0) ^ (xd->lossless[xd->mi[0]->mbmi.segment_id] == 0));
 
   fwd_txfm_param.tx_type = tx_type;
   fwd_txfm_param.tx_size = tx_size;
@@ -665,13 +659,11 @@
   const int diff_stride = 4 * num_4x4_blocks_wide_lookup[plane_bsize];
   const int16_t *src_diff;
   const int is_inter = is_inter_block(&xd->mi[0]->mbmi);
-  int dq = get_dq_profile_from_ctx(xd->qindex[xd->mi[0]->mbmi.segment_id], ctx,
-                                   is_inter, plane_type);
+  int dq = get_dq_profile_from_ctx(x->qindex, ctx, is_inter, plane_type);
 
   FWD_TXFM_PARAM fwd_txfm_param;
 
-  assert((xd->qindex[xd->mi[0]->mbmi.segment_id] == 0) ^
-         (xd->lossless[xd->mi[0]->mbmi.segment_id] == 0));
+  assert((x->qindex == 0) ^ (xd->lossless[xd->mi[0]->mbmi.segment_id] == 0));
 
   fwd_txfm_param.tx_type = tx_type;
   fwd_txfm_param.tx_size = tx_size;
@@ -730,13 +722,11 @@
   const int diff_stride = 4 * num_4x4_blocks_wide_lookup[plane_bsize];
   const int16_t *src_diff;
   const int is_inter = is_inter_block(&xd->mi[0]->mbmi);
-  int dq = get_dq_profile_from_ctx(xd->qindex[xd->mi[0]->mbmi.segment_id], ctx,
-                                   is_inter, plane_type);
+  int dq = get_dq_profile_from_ctx(x->qindex, ctx, is_inter, plane_type);
 
   FWD_TXFM_PARAM fwd_txfm_param;
 
-  assert((xd->qindex[xd->mi[0]->mbmi.segment_id] == 0) ^
-         (xd->lossless[xd->mi[0]->mbmi.segment_id] == 0));
+  assert((x->qindex == 0) ^ (xd->lossless[xd->mi[0]->mbmi.segment_id] == 0));
 
   fwd_txfm_param.tx_type = tx_type;
   fwd_txfm_param.tx_size = tx_size;
diff --git a/av1/encoder/encodemv.c b/av1/encoder/encodemv.c
index 7276fee..53dac12 100644
--- a/av1/encoder/encodemv.c
+++ b/av1/encoder/encodemv.c
@@ -45,9 +45,13 @@
   // Sign
   aom_write(w, sign, mvcomp->sign);
 
-  // Class
+// Class
+#if CONFIG_DAALA_EC
+  aom_write_symbol(w, mv_class, mvcomp->class_cdf, MV_CLASSES);
+#else
   av1_write_token(w, av1_mv_class_tree, mvcomp->classes,
                   &mv_class_encodings[mv_class]);
+#endif
 
   // Integer bits
   if (mv_class == MV_CLASS_0) {
@@ -58,10 +62,16 @@
     for (i = 0; i < n; ++i) aom_write(w, (d >> i) & 1, mvcomp->bits[i]);
   }
 
-  // Fractional bits
+// Fractional bits
+#if CONFIG_DAALA_EC
+  aom_write_symbol(
+      w, fr, mv_class == MV_CLASS_0 ? mvcomp->class0_fp_cdf[d] : mvcomp->fp_cdf,
+      MV_FP_SIZE);
+#else
   av1_write_token(w, av1_mv_fp_tree,
                   mv_class == MV_CLASS_0 ? mvcomp->class0_fp[d] : mvcomp->fp,
                   &mv_fp_encodings[fr]);
+#endif
 
   // High precision bit
   if (usehp)
@@ -203,6 +213,9 @@
     update_mv(w, comp_counts->sign, &comp->sign, MV_UPDATE_PROB);
     write_mv_update(av1_mv_class_tree, comp->classes, comp_counts->classes,
                     MV_CLASSES, w);
+#if CONFIG_DAALA_EC
+    av1_tree_to_cdf(av1_mv_class_tree, comp->classes, comp->class_cdf);
+#endif
     write_mv_update(av1_mv_class0_tree, comp->class0, comp_counts->class0,
                     CLASS0_SIZE, w);
     for (j = 0; j < MV_OFFSET_BITS; ++j)
@@ -210,12 +223,19 @@
   }
 
   for (i = 0; i < 2; ++i) {
-    for (j = 0; j < CLASS0_SIZE; ++j)
+    for (j = 0; j < CLASS0_SIZE; ++j) {
       write_mv_update(av1_mv_fp_tree, mvc->comps[i].class0_fp[j],
                       counts->comps[i].class0_fp[j], MV_FP_SIZE, w);
-
+#if CONFIG_DAALA_EC
+      av1_tree_to_cdf(av1_mv_fp_tree, mvc->comps[i].class0_fp[j],
+                      mvc->comps[i].class0_fp_cdf[j]);
+#endif
+    }
     write_mv_update(av1_mv_fp_tree, mvc->comps[i].fp, counts->comps[i].fp,
                     MV_FP_SIZE, w);
+#if CONFIG_DAALA_EC
+    av1_tree_to_cdf(av1_mv_fp_tree, mvc->comps[i].fp, mvc->comps[i].fp_cdf);
+#endif
   }
 
   if (usehp) {
@@ -239,7 +259,11 @@
 #if CONFIG_REF_MV
   (void)is_compound;
 #endif
+#if CONFIG_DAALA_EC
+  aom_write_symbol(w, j, mvctx->joint_cdf, MV_JOINTS);
+#else
   av1_write_token(w, av1_mv_joint_tree, mvctx->joints, &mv_joint_encodings[j]);
+#endif
   if (mv_joint_vertical(j))
     encode_mv_component(w, diff.row, &mvctx->comps[0], usehp);
 
diff --git a/av1/encoder/encoder.c b/av1/encoder/encoder.c
index f1a6f72..33c536d 100644
--- a/av1/encoder/encoder.c
+++ b/av1/encoder/encoder.c
@@ -2040,10 +2040,13 @@
   cm->free_mi = av1_enc_free_mi;
   cm->setup_mi = av1_enc_setup_mi;
 
-  CHECK_MEM_ERROR(cm, cm->fc, (FRAME_CONTEXT *)aom_calloc(1, sizeof(*cm->fc)));
-  CHECK_MEM_ERROR(
-      cm, cm->frame_contexts,
-      (FRAME_CONTEXT *)aom_calloc(FRAME_CONTEXTS, sizeof(*cm->frame_contexts)));
+  CHECK_MEM_ERROR(cm, cm->fc,
+                  (FRAME_CONTEXT *)aom_memalign(32, sizeof(*cm->fc)));
+  CHECK_MEM_ERROR(cm, cm->frame_contexts,
+                  (FRAME_CONTEXT *)aom_memalign(
+                      32, FRAME_CONTEXTS * sizeof(*cm->frame_contexts)));
+  memset(cm->fc, 0, sizeof(*cm->fc));
+  memset(cm->frame_contexts, 0, FRAME_CONTEXTS * sizeof(*cm->frame_contexts));
 
   cpi->resize_state = 0;
   cpi->resize_avg_qp = 0;
diff --git a/av1/encoder/encoder.h b/av1/encoder/encoder.h
index 0c66905..1bf7ff4 100644
--- a/av1/encoder/encoder.h
+++ b/av1/encoder/encoder.h
@@ -751,12 +751,10 @@
 #endif  // CONFIG_EXT_REFS
 
 static INLINE unsigned int get_token_alloc(int mb_rows, int mb_cols) {
-  // TODO(JBB): double check we can't exceed this token count if we have a
-  // 32x32 transform crossing a boundary at a multiple of 16.
-  // mb_rows, cols are in units of 16 pixels. We assume 3 planes all at full
-  // resolution. We assume up to 1 token per pixel, and then allow
-  // a head room of 1 EOSB token per 8x8 block per plane.
-  return mb_rows * mb_cols * (16 * 16 + 4) * 3;
+  // mb_rows and mb_cols are in units of 16 pixels. We assume 3 planes, all at
+  // full resolution, and up to 1 token per pixel, then allow headroom of 1
+  // EOSB token per 4x4 block per plane, plus one EOSB_TOKEN per plane.
+  return mb_rows * mb_cols * (16 * 16 + 17) * 3;
 }
 
 // Get the allocated token size for a tile. It does the same calculation as in
diff --git a/av1/encoder/firstpass.c b/av1/encoder/firstpass.c
index 4d74246..dc97ddf 100644
--- a/av1/encoder/firstpass.c
+++ b/av1/encoder/firstpass.c
@@ -588,7 +588,6 @@
 #if CONFIG_SUPERTX
       xd->mi[0]->mbmi.segment_id_supertx = 0;
 #endif  // CONFIG_SUPERTX
-      xd->qindex[xd->mi[0]->mbmi.segment_id] = qindex;
       xd->lossless[xd->mi[0]->mbmi.segment_id] = (qindex == 0);
       xd->mi[0]->mbmi.mode = DC_PRED;
       xd->mi[0]->mbmi.tx_size =
diff --git a/av1/encoder/quantize.c b/av1/encoder/quantize.c
index db2fdb8..771f94b 100644
--- a/av1/encoder/quantize.c
+++ b/av1/encoder/quantize.c
@@ -1293,11 +1293,11 @@
   }
 
   x->skip_block = segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP);
-  x->q_index = qindex;
+  x->qindex = qindex;
 
   set_error_per_bit(x, rdmult);
 
-  av1_initialize_me_consts(cpi, x, x->q_index);
+  av1_initialize_me_consts(cpi, x, qindex);
 }
 
 void av1_frame_init_quantizer(AV1_COMP *cpi) {
diff --git a/av1/encoder/rdopt.c b/av1/encoder/rdopt.c
index 82716fe..94d189a 100644
--- a/av1/encoder/rdopt.c
+++ b/av1/encoder/rdopt.c
@@ -3022,11 +3022,11 @@
 
 static void select_tx_block(const AV1_COMP *cpi, MACROBLOCK *x, int blk_row,
                             int blk_col, int plane, int block, TX_SIZE tx_size,
-                            BLOCK_SIZE plane_bsize, ENTROPY_CONTEXT *ta,
-                            ENTROPY_CONTEXT *tl, TXFM_CONTEXT *tx_above,
-                            TXFM_CONTEXT *tx_left, int *rate, int64_t *dist,
-                            int64_t *bsse, int *skip, int64_t ref_best_rd,
-                            int *is_cost_valid) {
+                            int depth, BLOCK_SIZE plane_bsize,
+                            ENTROPY_CONTEXT *ta, ENTROPY_CONTEXT *tl,
+                            TXFM_CONTEXT *tx_above, TXFM_CONTEXT *tx_left,
+                            int *rate, int64_t *dist, int64_t *bsse, int *skip,
+                            int64_t ref_best_rd, int *is_cost_valid) {
   MACROBLOCKD *const xd = &x->e_mbd;
   MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
   struct macroblock_plane *const p = &x->plane[plane];
@@ -3118,13 +3118,13 @@
       *skip = 0;
     }
 
-    if (tx_size > TX_4X4)
+    if (tx_size > TX_4X4 && depth < MAX_VARTX_DEPTH)
       *rate += av1_cost_bit(cpi->common.fc->txfm_partition_prob[ctx], 0);
     this_rd = RDCOST(x->rdmult, x->rddiv, *rate, *dist);
     tmp_eob = p->eobs[block];
   }
 
-  if (tx_size > TX_4X4) {
+  if (tx_size > TX_4X4 && depth < MAX_VARTX_DEPTH) {
     BLOCK_SIZE bsize = txsize_to_bsize[tx_size];
     int bsl = b_height_log2_lookup[bsize];
     int sub_step = num_4x4_blocks_txsize_lookup[tx_size - 1];
@@ -3143,9 +3143,10 @@
       int offsetr = (i >> 1) << bsl;
       int offsetc = (i & 0x01) << bsl;
       select_tx_block(cpi, x, blk_row + offsetr, blk_col + offsetc, plane,
-                      block + i * sub_step, tx_size - 1, plane_bsize, ta, tl,
-                      tx_above, tx_left, &this_rate, &this_dist, &this_bsse,
-                      &this_skip, ref_best_rd - tmp_rd, &this_cost_valid);
+                      block + i * sub_step, tx_size - 1, depth + 1, plane_bsize,
+                      ta, tl, tx_above, tx_left, &this_rate, &this_dist,
+                      &this_bsse, &this_skip, ref_best_rd - tmp_rd,
+                      &this_cost_valid);
       sum_rate += this_rate;
       sum_dist += this_dist;
       sum_bsse += this_bsse;
@@ -3221,9 +3222,10 @@
     for (idy = 0; idy < mi_height; idy += bh) {
       for (idx = 0; idx < mi_width; idx += bh) {
         select_tx_block(cpi, x, idy, idx, 0, block,
-                        max_txsize_lookup[plane_bsize], plane_bsize, ctxa, ctxl,
-                        tx_above, tx_left, &pnrate, &pndist, &pnsse, &pnskip,
-                        ref_best_rd - this_rd, &is_cost_valid);
+                        max_txsize_lookup[plane_bsize], mi_height != mi_width,
+                        plane_bsize, ctxa, ctxl, tx_above, tx_left, &pnrate,
+                        &pndist, &pnsse, &pnskip, ref_best_rd - this_rd,
+                        &is_cost_valid);
         *rate += pnrate;
         *distortion += pndist;
         *sse += pnsse;
@@ -7588,7 +7590,7 @@
       // Y cost and distortion
       av1_subtract_plane(x, bsize, 0);
 #if CONFIG_VAR_TX
-      if (cm->tx_mode == TX_MODE_SELECT || xd->lossless[mbmi->segment_id]) {
+      if (cm->tx_mode == TX_MODE_SELECT && !xd->lossless[mbmi->segment_id]) {
         select_tx_type_yrd(cpi, x, rate_y, &distortion_y, &skippable_y, psse,
                            bsize, ref_best_rd);
       } else {
@@ -8991,10 +8993,6 @@
           clamp_mv2(&cur_mv.as_mv, xd);
 
           if (!mv_check_bounds(x, &cur_mv.as_mv)) {
-            InterpFilter dummy_single_inter_filter[MB_MODE_COUNT]
-                                                  [TOTAL_REFS_PER_FRAME] = {
-                                                    { 0 }
-                                                  };
             int dummy_single_skippable[MB_MODE_COUNT]
                                       [TOTAL_REFS_PER_FRAME] = { { 0 } };
 #if CONFIG_EXT_INTER
@@ -9022,8 +9020,7 @@
 #else
                 dummy_single_newmv,
 #endif
-                dummy_single_inter_filter, dummy_single_skippable, &tmp_sse,
-                best_rd);
+                single_inter_filter, dummy_single_skippable, &tmp_sse, best_rd);
           }
 
           for (i = 0; i < mbmi->ref_mv_idx; ++i) {
@@ -9787,6 +9784,7 @@
 #else   // CONFIG_GLOBAL_MOTION
   mbmi->mv[0].as_int = 0;
 #endif  // CONFIG_GLOBAL_MOTION
+  mbmi->tx_size = max_txsize_lookup[bsize];
   x->skip = 1;
 
 #if CONFIG_REF_MV
diff --git a/av1/encoder/x86/hybrid_fwd_txfm_avx2.c b/av1/encoder/x86/hybrid_fwd_txfm_avx2.c
index 928af13..f4bd142 100644
--- a/av1/encoder/x86/hybrid_fwd_txfm_avx2.c
+++ b/av1/encoder/x86/hybrid_fwd_txfm_avx2.c
@@ -14,6 +14,7 @@
 #include "./av1_rtcd.h"
 #include "./aom_dsp_rtcd.h"
 
+#include "aom_dsp/x86/fwd_txfm_avx2.h"
 #include "aom_dsp/txfm_common.h"
 #include "aom_dsp/x86/txfm_common_avx2.h"
 
@@ -273,24 +274,11 @@
   in[15] = _mm256_slli_epi16(in[15], 2);
 }
 
-static INLINE void write_buffer_16x16(const __m256i *in, int stride,
-                                      tran_low_t *output) {
-  _mm256_storeu_si256((__m256i *)output, in[0]);
-  _mm256_storeu_si256((__m256i *)(output + stride), in[1]);
-  _mm256_storeu_si256((__m256i *)(output + 2 * stride), in[2]);
-  _mm256_storeu_si256((__m256i *)(output + 3 * stride), in[3]);
-  _mm256_storeu_si256((__m256i *)(output + 4 * stride), in[4]);
-  _mm256_storeu_si256((__m256i *)(output + 5 * stride), in[5]);
-  _mm256_storeu_si256((__m256i *)(output + 6 * stride), in[6]);
-  _mm256_storeu_si256((__m256i *)(output + 7 * stride), in[7]);
-  _mm256_storeu_si256((__m256i *)(output + 8 * stride), in[8]);
-  _mm256_storeu_si256((__m256i *)(output + 9 * stride), in[9]);
-  _mm256_storeu_si256((__m256i *)(output + 10 * stride), in[10]);
-  _mm256_storeu_si256((__m256i *)(output + 11 * stride), in[11]);
-  _mm256_storeu_si256((__m256i *)(output + 12 * stride), in[12]);
-  _mm256_storeu_si256((__m256i *)(output + 13 * stride), in[13]);
-  _mm256_storeu_si256((__m256i *)(output + 14 * stride), in[14]);
-  _mm256_storeu_si256((__m256i *)(output + 15 * stride), in[15]);
+static INLINE void write_buffer_16x16(const __m256i *in, tran_low_t *output) {
+  int i;
+  for (i = 0; i < 16; ++i) {
+    storeu_output_avx2(&in[i], output + (i << 4));
+  }
 }
 
 static void right_shift_16x16(__m256i *in) {
@@ -1253,7 +1241,7 @@
     default: assert(0); break;
   }
   mm256_transpose_16x16(in);
-  write_buffer_16x16(in, 16, output);
+  write_buffer_16x16(in, output);
   _mm256_zeroupper();
 }
 
@@ -1623,12 +1611,13 @@
 }
 
 static INLINE void write_buffer_32x32(const __m256i *in0, const __m256i *in1,
-                                      int stride, tran_low_t *output) {
+                                      tran_low_t *output) {
   int i = 0;
+  const int stride = 32;
   tran_low_t *coeff = output;
   while (i < 32) {
-    _mm256_storeu_si256((__m256i *)coeff, in0[i]);
-    _mm256_storeu_si256((__m256i *)(coeff + 16), in1[i]);
+    storeu_output_avx2(&in0[i], coeff);
+    storeu_output_avx2(&in1[i], coeff + 16);
     coeff += stride;
     i += 1;
   }
@@ -1885,6 +1874,6 @@
     default: assert(0); break;
   }
   nr_right_shift_32x32(in0, in1);
-  write_buffer_32x32(in0, in1, 32, output);
+  write_buffer_32x32(in0, in1, output);
   _mm256_zeroupper();
 }
diff --git a/test/dct16x16_test.cc b/test/dct16x16_test.cc
index 9811955..b4bb14f 100644
--- a/test/dct16x16_test.cc
+++ b/test/dct16x16_test.cc
@@ -533,10 +533,10 @@
 
       for (int j = 0; j < kNumCoeffs; ++j) {
 #if CONFIG_AOM_HIGHBITDEPTH
-        const uint32_t diff =
+        const int diff =
             bit_depth_ == AOM_BITS_8 ? dst[j] - src[j] : dst16[j] - src16[j];
 #else
-        const uint32_t diff = dst[j] - src[j];
+        const int diff = dst[j] - src[j];
 #endif  // CONFIG_AOM_HIGHBITDEPTH
         const uint32_t error = diff * diff;
         EXPECT_GE(1u, error) << "Error: 16x16 IDCT has error " << error
@@ -589,10 +589,10 @@
 
       for (int j = 0; j < kNumCoeffs; ++j) {
 #if CONFIG_AOM_HIGHBITDEPTH
-        const uint32_t diff =
+        const int diff =
             bit_depth_ == AOM_BITS_8 ? dst[j] - ref[j] : dst16[j] - ref16[j];
 #else
-        const uint32_t diff = dst[j] - ref[j];
+        const int diff = dst[j] - ref[j];
 #endif  // CONFIG_AOM_HIGHBITDEPTH
         const uint32_t error = diff * diff;
         EXPECT_EQ(0u, error) << "Error: 16x16 IDCT Comparison has error "
diff --git a/test/fht32x32_test.cc b/test/fht32x32_test.cc
index 3d07b44..1f85761 100644
--- a/test/fht32x32_test.cc
+++ b/test/fht32x32_test.cc
@@ -90,8 +90,14 @@
   IhtFunc inv_txfm_;
 };
 
+// TODO(luoyi): The DCT_DCT path of av1_fht32x32_avx2 has a range check; when
+// the input is out of range, aom_fdct32x32_avx2 is used instead, and that
+// function does not support CONFIG_AOM_HIGHBITDEPTH. Fix the scaling/rounding
+// of av1_fht32x32_avx2, then enable this test under CONFIG_AOM_HIGHBITDEPTH.
+#if !CONFIG_AOM_HIGHBITDEPTH
 TEST_P(AV1Trans32x32HT, CoeffCheck) { RunCoeffCheck(); }
 TEST_P(AV1Trans32x32HT, MemCheck) { RunMemCheck(); }
+#endif
 
 #if CONFIG_AOM_HIGHBITDEPTH
 class AV1HighbdTrans32x32HT
diff --git a/test/transform_test_base.h b/test/transform_test_base.h
index 195058e..540136c 100644
--- a/test/transform_test_base.h
+++ b/test/transform_test_base.h
@@ -90,11 +90,11 @@
 
       for (int j = 0; j < num_coeffs_; ++j) {
 #if CONFIG_AOM_HIGHBITDEPTH
-        const uint32_t diff =
+        const int diff =
             bit_depth_ == AOM_BITS_8 ? dst[j] - src[j] : dst16[j] - src16[j];
 #else
         ASSERT_EQ(AOM_BITS_8, bit_depth_);
-        const uint32_t diff = dst[j] - src[j];
+        const int diff = dst[j] - src[j];
 #endif
         const uint32_t error = diff * diff;
         if (max_error < error) max_error = error;
@@ -309,10 +309,10 @@
 
       for (int j = 0; j < num_coeffs_; ++j) {
 #if CONFIG_AOM_HIGHBITDEPTH
-        const uint32_t diff =
+        const int diff =
             bit_depth_ == AOM_BITS_8 ? dst[j] - src[j] : dst16[j] - src16[j];
 #else
-        const uint32_t diff = dst[j] - src[j];
+        const int diff = dst[j] - src[j];
 #endif
         const uint32_t error = diff * diff;
         EXPECT_GE(static_cast<uint32_t>(limit), error)