Merge changes I3922dea2,I3bab2848,I21f7478a,Ida5de713,Ib9f0eefe, ... into nextgenv2

* changes:
  Fix warnings reported by -Wshadow: Part4: main directory
  Fix warnings reported by -Wshadow: Part3: test/ directory
  Fix warnings reported by -Wshadow: Part2b: more from av1 directory
  Fix warnings reported by -Wshadow: Part2: av1 directory
  Fix warnings reported by -Wshadow: Part1b: scan_order struct and variable
  Fix warnings reported by -Wshadow: Part1: aom_dsp directory
  Move STAT_TYPE enum to source file.
  Code cleanup: mainly rd_pick_partition and methods called from there.
diff --git a/aom_dsp/bitreader.h b/aom_dsp/bitreader.h
index d2fd5f2..68e1339 100644
--- a/aom_dsp/bitreader.h
+++ b/aom_dsp/bitreader.h
@@ -28,6 +28,29 @@
 #include "aom_dsp/prob.h"
 #include "av1/common/odintrin.h"
 
+#if CONFIG_ACCOUNTING
+#include "av1/common/accounting.h"
+#define ACCT_STR_NAME acct_str
+#define ACCT_STR_PARAM , const char *ACCT_STR_NAME
+#define ACCT_STR_ARG(s) , s
+#else
+#define ACCT_STR_PARAM
+#define ACCT_STR_ARG(s)
+#endif
+
+#define aom_read(r, prob, ACCT_STR_NAME) \
+  aom_read_(r, prob ACCT_STR_ARG(ACCT_STR_NAME))
+#define aom_read_bit(r, ACCT_STR_NAME) \
+  aom_read_bit_(r ACCT_STR_ARG(ACCT_STR_NAME))
+#define aom_read_tree(r, tree, probs, ACCT_STR_NAME) \
+  aom_read_tree_(r, tree, probs ACCT_STR_ARG(ACCT_STR_NAME))
+#define aom_read_literal(r, bits, ACCT_STR_NAME) \
+  aom_read_literal_(r, bits ACCT_STR_ARG(ACCT_STR_NAME))
+#define aom_read_tree_bits(r, tree, probs, ACCT_STR_NAME) \
+  aom_read_tree_bits_(r, tree, probs ACCT_STR_ARG(ACCT_STR_NAME))
+#define aom_read_symbol(r, cdf, nsymbs, ACCT_STR_NAME) \
+  aom_read_symbol_(r, cdf, nsymbs ACCT_STR_ARG(ACCT_STR_NAME))
+
 #ifdef __cplusplus
 extern "C" {
 #endif
@@ -79,6 +102,7 @@
 #endif
 }
 
+// Returns the position in the bit reader in bits.
 static INLINE ptrdiff_t aom_reader_tell(const aom_reader *r) {
 #if CONFIG_ANS
   (void)r;
@@ -91,75 +115,113 @@
 #endif
 }
 
-static INLINE int aom_read(aom_reader *r, int prob) {
+// Returns the position in the bit reader in 1/8th bits.
+static INLINE ptrdiff_t aom_reader_tell_frac(const aom_reader *r) {
 #if CONFIG_ANS
-  return uabs_read(r, prob);
+  (void)r;
+  assert(0 && "aom_reader_tell_frac() is unimplemented for ANS");
+  return 0;
 #elif CONFIG_DAALA_EC
-  return aom_daala_read(r, prob);
+  return aom_daala_reader_tell_frac(r);
 #else
-  return aom_dk_read(r, prob);
+  return aom_dk_reader_tell_frac(r);
 #endif
 }
 
-static INLINE int aom_read_bit(aom_reader *r) {
+#if CONFIG_ACCOUNTING
+static INLINE void aom_process_accounting(const aom_reader *r ACCT_STR_PARAM) {
+  if (r->accounting != NULL) {
+    uint32_t tell_frac;
+    tell_frac = aom_reader_tell_frac(r);
+    aom_accounting_record(r->accounting, ACCT_STR_NAME,
+                          tell_frac - r->accounting->last_tell_frac);
+    r->accounting->last_tell_frac = tell_frac;
+  }
+}
+#endif
+
+static INLINE int aom_read_(aom_reader *r, int prob ACCT_STR_PARAM) {
+  int ret;
 #if CONFIG_ANS
-  return uabs_read_bit(r);  // Non trivial optimization at half probability
+  ret = uabs_read(r, prob);
+#elif CONFIG_DAALA_EC
+  ret = aom_daala_read(r, prob);
 #else
-  return aom_read(r, 128);  // aom_prob_half
+  ret = aom_dk_read(r, prob);
 #endif
+#if CONFIG_ACCOUNTING
+  if (ACCT_STR_NAME) aom_process_accounting(r, ACCT_STR_NAME);
+#endif
+  return ret;
 }
 
-static INLINE int aom_read_literal(aom_reader *r, int bits) {
+static INLINE int aom_read_bit_(aom_reader *r ACCT_STR_PARAM) {
+  int ret;
+#if CONFIG_ANS
+  ret = uabs_read_bit(r);  // Non trivial optimization at half probability
+#else
+  ret = aom_read(r, 128, NULL);  // aom_prob_half
+#endif
+#if CONFIG_ACCOUNTING
+  if (ACCT_STR_NAME) aom_process_accounting(r, ACCT_STR_NAME);
+#endif
+  return ret;
+}
+
+static INLINE int aom_read_literal_(aom_reader *r, int bits ACCT_STR_PARAM) {
   int literal = 0, bit;
 
-  for (bit = bits - 1; bit >= 0; bit--) literal |= aom_read_bit(r) << bit;
-
+  for (bit = bits - 1; bit >= 0; bit--) literal |= aom_read_bit(r, NULL) << bit;
+#if CONFIG_ACCOUNTING
+  if (ACCT_STR_NAME) aom_process_accounting(r, ACCT_STR_NAME);
+#endif
   return literal;
 }
 
-static INLINE int aom_read_tree_bits(aom_reader *r, const aom_tree_index *tree,
-                                     const aom_prob *probs) {
+static INLINE int aom_read_tree_bits_(aom_reader *r, const aom_tree_index *tree,
+                                      const aom_prob *probs ACCT_STR_PARAM) {
   aom_tree_index i = 0;
 
-  while ((i = tree[i + aom_read(r, probs[i >> 1])]) > 0) continue;
-
+  while ((i = tree[i + aom_read(r, probs[i >> 1], NULL)]) > 0) continue;
+#if CONFIG_ACCOUNTING
+  if (ACCT_STR_NAME) aom_process_accounting(r, ACCT_STR_NAME);
+#endif
   return -i;
 }
 
-static INLINE int aom_read_tree(aom_reader *r, const aom_tree_index *tree,
-                                const aom_prob *probs) {
+static INLINE int aom_read_tree_(aom_reader *r, const aom_tree_index *tree,
+                                 const aom_prob *probs ACCT_STR_PARAM) {
+  int ret;
 #if CONFIG_DAALA_EC
-  return daala_read_tree_bits(r, tree, probs);
+  ret = daala_read_tree_bits(r, tree, probs);
 #else
-  return aom_read_tree_bits(r, tree, probs);
+  ret = aom_read_tree_bits(r, tree, probs, NULL);
 #endif
+#if CONFIG_ACCOUNTING
+  if (ACCT_STR_NAME) aom_process_accounting(r, ACCT_STR_NAME);
+#endif
+  return ret;
 }
 
-static INLINE int aom_read_symbol(aom_reader *r, const aom_cdf_prob *cdf,
-                                  int nsymbs) {
+static INLINE int aom_read_symbol_(aom_reader *r, const aom_cdf_prob *cdf,
+                                   int nsymbs ACCT_STR_PARAM) {
+  int ret;
 #if CONFIG_ANS
   (void)nsymbs;
-  return rans_read(r, cdf);
+  ret = rans_read(r, cdf);
+#elif CONFIG_DAALA_EC
+  ret = daala_read_symbol(r, cdf, nsymbs);
 #else
   (void)r;
   (void)cdf;
   (void)nsymbs;
   assert(0 && "Unsupported bitreader operation");
-  return -1;
+  ret = -1;
 #endif
-}
-
-static INLINE int aom_read_tree_cdf(aom_reader *r, const uint16_t *cdf,
-                                    int nsymbs) {
-#if CONFIG_DAALA_EC
-  return daala_read_symbol(r, cdf, nsymbs);
-#else
-  (void)r;
-  (void)cdf;
-  (void)nsymbs;
-  assert(0 && "Unsupported bitreader operation");
-  return -1;
+#if CONFIG_ACCOUNTING
+  if (ACCT_STR_NAME) aom_process_accounting(r, ACCT_STR_NAME);
 #endif
+  return ret;
 }
 
 #ifdef __cplusplus
diff --git a/aom_dsp/daalaboolreader.c b/aom_dsp/daalaboolreader.c
index 8e1c782..f0da8eb 100644
--- a/aom_dsp/daalaboolreader.c
+++ b/aom_dsp/daalaboolreader.c
@@ -18,6 +18,9 @@
   r->buffer_end = buffer + size;
   r->buffer = buffer;
   od_ec_dec_init(&r->ec, buffer, size - 1);
+#if CONFIG_ACCOUNTING
+  r->accounting = NULL;
+#endif
   return 0;
 }
 
@@ -28,3 +31,7 @@
 ptrdiff_t aom_daala_reader_tell(const daala_reader *r) {
   return od_ec_dec_tell(&r->ec);
 }
+
+ptrdiff_t aom_daala_reader_tell_frac(const daala_reader *r) {
+  return od_ec_dec_tell_frac(&r->ec);
+}
diff --git a/aom_dsp/daalaboolreader.h b/aom_dsp/daalaboolreader.h
index 4fc7ff4..10dc391 100644
--- a/aom_dsp/daalaboolreader.h
+++ b/aom_dsp/daalaboolreader.h
@@ -12,8 +12,12 @@
 #ifndef AOM_DSP_DAALABOOLREADER_H_
 #define AOM_DSP_DAALABOOLREADER_H_
 
+#include "aom/aom_integer.h"
 #include "aom_dsp/entdec.h"
 #include "aom_dsp/prob.h"
+#if CONFIG_ACCOUNTING
+#include "av1/common/accounting.h"
+#endif
 
 #ifdef __cplusplus
 extern "C" {
@@ -23,6 +27,9 @@
   const uint8_t *buffer;
   const uint8_t *buffer_end;
   od_ec_dec ec;
+#if CONFIG_ACCOUNTING
+  Accounting *accounting;
+#endif
 };
 
 typedef struct daala_reader daala_reader;
@@ -30,6 +37,7 @@
 int aom_daala_reader_init(daala_reader *r, const uint8_t *buffer, int size);
 const uint8_t *aom_daala_reader_find_end(daala_reader *r);
 ptrdiff_t aom_daala_reader_tell(const daala_reader *r);
+ptrdiff_t aom_daala_reader_tell_frac(const daala_reader *r);
 
 static INLINE int aom_daala_read(daala_reader *r, int prob) {
   if (prob == 128) {
diff --git a/aom_dsp/dkboolreader.c b/aom_dsp/dkboolreader.c
index c3b7782..4079d70 100644
--- a/aom_dsp/dkboolreader.c
+++ b/aom_dsp/dkboolreader.c
@@ -36,6 +36,9 @@
     r->decrypt_cb = decrypt_cb;
     r->decrypt_state = decrypt_state;
     aom_dk_reader_fill(r);
+#if CONFIG_ACCOUNTING
+    r->accounting = NULL;
+#endif
     return aom_dk_read_bit(r) != 0;  // marker bit
   }
 }
diff --git a/aom_dsp/dkboolreader.h b/aom_dsp/dkboolreader.h
index 2fd2b37..bc4b02f 100644
--- a/aom_dsp/dkboolreader.h
+++ b/aom_dsp/dkboolreader.h
@@ -26,6 +26,9 @@
 #include "aom/aomdx.h"
 #include "aom/aom_integer.h"
 #include "aom_dsp/prob.h"
+#if CONFIG_ACCOUNTING
+#include "av1/common/accounting.h"
+#endif
 
 #ifdef __cplusplus
 extern "C" {
@@ -51,6 +54,9 @@
   aom_decrypt_cb decrypt_cb;
   void *decrypt_state;
   uint8_t clear_buffer[sizeof(BD_VALUE) + 1];
+#if CONFIG_ACCOUNTING
+  Accounting *accounting;
+#endif
 };
 
 int aom_dk_reader_init(struct aom_dk_reader *r, const uint8_t *buffer,
@@ -65,7 +71,29 @@
   const size_t bits_read = (r->buffer - r->buffer_start) * CHAR_BIT;
   const int count =
       (r->count < LOTS_OF_BITS) ? r->count : r->count - LOTS_OF_BITS;
-  return bits_read + BD_VALUE_SIZE - (count + CHAR_BIT);
+  return bits_read - (count + CHAR_BIT);
+}
+
+/*The resolution of fractional-precision bit usage measurements, i.e.,
+   3 => 1/8th bits.*/
+#define DK_BITRES (3)
+
+static INLINE ptrdiff_t aom_dk_reader_tell_frac(const struct aom_dk_reader *r) {
+  uint32_t num_bits;
+  uint32_t range;
+  int l;
+  int i;
+  num_bits = aom_dk_reader_tell(r) << DK_BITRES;
+  range = r->range;
+  l = 0;
+  for (i = DK_BITRES; i-- > 0;) {
+    int b;
+    range = range * range >> 7;
+    b = (int)(range >> 8);
+    l = l << 1 | b;
+    range >>= b;
+  }
+  return num_bits - l;
 }
 
 static INLINE int aom_dk_reader_has_error(struct aom_dk_reader *r) {
diff --git a/av1/av1_common.mk b/av1/av1_common.mk
index 9730bee..29b71f3 100644
--- a/av1/av1_common.mk
+++ b/av1/av1_common.mk
@@ -103,6 +103,10 @@
 AV1_COMMON_SRCS-yes += common/dering.c
 AV1_COMMON_SRCS-yes += common/dering.h
 endif
+ifeq ($(CONFIG_ACCOUNTING),yes)
+AV1_COMMON_SRCS-yes += common/accounting.h
+AV1_COMMON_SRCS-yes += common/accounting.c
+endif
 AV1_COMMON_SRCS-yes += common/odintrin.c
 AV1_COMMON_SRCS-yes += common/odintrin.h
 
diff --git a/av1/common/accounting.c b/av1/common/accounting.c
new file mode 100644
index 0000000..41280af
--- /dev/null
+++ b/av1/common/accounting.c
@@ -0,0 +1,132 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "aom/aom_integer.h"
+#include "./accounting.h"
+
+static int aom_accounting_hash(const char *str) {
+  uint32_t val;
+  const unsigned char *ustr;
+  val = 0;
+  ustr = (const unsigned char *)str;
+  /* This is about the worst hash one can design, but it should be good enough
+     here. */
+  while (*ustr) val += *ustr++;
+  return val % AOM_ACCOUNTING_HASH_SIZE;
+}
+
+/* Dictionary lookup based on an open-addressing hash table. */
+int aom_accounting_dictionary_lookup(Accounting *accounting, const char *str) {
+  int hash;
+  int len;
+  AccountingDictionary *dictionary;
+  dictionary = &accounting->syms.dictionary;
+  hash = aom_accounting_hash(str);
+  while (accounting->hash_dictionary[hash] != -1) {
+    if (strcmp(dictionary->strs[accounting->hash_dictionary[hash]], str) == 0) {
+      return accounting->hash_dictionary[hash];
+    }
+    hash++;
+    if (hash == AOM_ACCOUNTING_HASH_SIZE) hash = 0;
+  }
+  /* No match found. */
+  assert(dictionary->num_strs + 1 < MAX_SYMBOL_TYPES);
+  accounting->hash_dictionary[hash] = dictionary->num_strs;
+  len = strlen(str);
+  dictionary->strs[dictionary->num_strs] = malloc(len + 1);
+  snprintf(dictionary->strs[dictionary->num_strs], len + 1, "%s", str);
+  dictionary->num_strs++;
+  return dictionary->num_strs - 1;
+}
+
+void aom_accounting_init(Accounting *accounting) {
+  int i;
+  accounting->num_syms_allocated = 1000;
+  accounting->syms.syms =
+      malloc(sizeof(AccountingSymbol) * accounting->num_syms_allocated);
+  accounting->syms.dictionary.num_strs = 0;
+  assert(AOM_ACCOUNTING_HASH_SIZE > 2 * MAX_SYMBOL_TYPES);
+  for (i = 0; i < AOM_ACCOUNTING_HASH_SIZE; i++)
+    accounting->hash_dictionary[i] = -1;
+  aom_accounting_reset(accounting);
+}
+
+void aom_accounting_reset(Accounting *accounting) {
+  accounting->syms.num_syms = 0;
+  accounting->context.x = -1;
+  accounting->context.y = -1;
+  accounting->last_tell_frac = 0;
+}
+
+void aom_accounting_clear(Accounting *accounting) {
+  int i;
+  AccountingDictionary *dictionary;
+  free(accounting->syms.syms);
+  dictionary = &accounting->syms.dictionary;
+  for (i = 0; i < dictionary->num_strs; i++) {
+    free(dictionary->strs[i]);
+  }
+}
+
+void aom_accounting_set_context(Accounting *accounting, int16_t x, int16_t y) {
+  accounting->context.x = x;
+  accounting->context.y = y;
+}
+
+void aom_accounting_record(Accounting *accounting, const char *str,
+                           uint32_t bits) {
+  AccountingSymbol sym;
+  // Reuse previous symbol if it has the same context and symbol id.
+  if (accounting->syms.num_syms) {
+    AccountingSymbol *last_sym;
+    last_sym = &accounting->syms.syms[accounting->syms.num_syms - 1];
+    if (memcmp(&last_sym->context, &accounting->context,
+               sizeof(AccountingSymbolContext)) == 0) {
+      uint32_t id;
+      id = aom_accounting_dictionary_lookup(accounting, str);
+      if (id == last_sym->id) {
+        last_sym->bits += bits;
+        last_sym->samples++;
+        return;
+      }
+    }
+  }
+  sym.context = accounting->context;
+  sym.samples = 1;
+  sym.bits = bits;
+  sym.id = aom_accounting_dictionary_lookup(accounting, str);
+  assert(sym.id <= 255);
+  if (accounting->syms.num_syms == accounting->num_syms_allocated) {
+    accounting->num_syms_allocated *= 2;
+    accounting->syms.syms =
+        realloc(accounting->syms.syms,
+                sizeof(AccountingSymbol) * accounting->num_syms_allocated);
+    assert(accounting->syms.syms != NULL);
+  }
+  accounting->syms.syms[accounting->syms.num_syms++] = sym;
+}
+
+void aom_accounting_dump(Accounting *accounting) {
+  int i;
+  AccountingSymbol *sym;
+  printf("----- %d -----\n", accounting->syms.num_syms);
+  for (i = 0; i < accounting->syms.num_syms; i++) {
+    sym = &accounting->syms.syms[i];
+    printf("%s x: %d, y: %d bits: %f samples: %d\n",
+           accounting->syms.dictionary.strs[sym->id], sym->context.x,
+           sym->context.y, (float)sym->bits / 8.0, sym->samples);
+  }
+}
diff --git a/av1/common/accounting.h b/av1/common/accounting.h
new file mode 100644
index 0000000..04be326
--- /dev/null
+++ b/av1/common/accounting.h
@@ -0,0 +1,77 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#ifndef AOM_ACCOUNTING_H_
+#define AOM_ACCOUNTING_H_
+#include <stdlib.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif  // __cplusplus
+
+#define AOM_ACCOUNTING_HASH_SIZE (1021)
+
+/* Max number of entries for symbol types in the dictionary (increase as
+   necessary). */
+#define MAX_SYMBOL_TYPES (256)
+
+/*The resolution of fractional-precision bit usage measurements, i.e.,
+   3 => 1/8th bits.*/
+#define AOM_ACCT_BITRES (3)
+
+typedef struct {
+  int16_t x;
+  int16_t y;
+} AccountingSymbolContext;
+
+typedef struct {
+  AccountingSymbolContext context;
+  uint32_t id;
+  /** Number of bits in units of 1/8 bit. */
+  uint32_t bits;
+  uint32_t samples;
+} AccountingSymbol;
+
+/** Dictionary for translating strings into id. */
+typedef struct {
+  char *(strs[MAX_SYMBOL_TYPES]);
+  int num_strs;
+} AccountingDictionary;
+
+typedef struct {
+  /** All recorded symbols decoded. */
+  AccountingSymbol *syms;
+  /** Number of symbols actually recorded. */
+  int num_syms;
+  /** Dictionary for translating strings into id. */
+  AccountingDictionary dictionary;
+} AccountingSymbols;
+
+typedef struct {
+  AccountingSymbols syms;
+  /** Size allocated for symbols (not all may be used). */
+  int num_syms_allocated;
+  int16_t hash_dictionary[AOM_ACCOUNTING_HASH_SIZE];
+  AccountingSymbolContext context;
+  uint32_t last_tell_frac;
+} Accounting;
+
+void aom_accounting_init(Accounting *accounting);
+void aom_accounting_reset(Accounting *accounting);
+void aom_accounting_clear(Accounting *accounting);
+void aom_accounting_set_context(Accounting *accounting, int16_t x, int16_t y);
+int aom_accounting_dictionary_lookup(Accounting *accounting, const char *str);
+void aom_accounting_record(Accounting *accounting, const char *str,
+                           uint32_t bits);
+void aom_accounting_dump(Accounting *accounting);
+#ifdef __cplusplus
+}  // extern "C"
+#endif  // __cplusplus
+#endif  // AOM_ACCOUNTING_H_
diff --git a/av1/common/av1_fwd_txfm1d.c b/av1/common/av1_fwd_txfm1d.c
index 4c695ae..3e9d5ec 100644
--- a/av1/common/av1_fwd_txfm1d.c
+++ b/av1/common/av1_fwd_txfm1d.c
@@ -40,6 +40,7 @@
   }
 #endif
 
+// TODO(angiebird): Make 1-d txfm functions static
 void av1_fdct4_new(const int32_t *input, int32_t *output, const int8_t *cos_bit,
                    const int8_t *stage_range) {
   const int32_t size = 4;
@@ -1528,3 +1529,798 @@
   bf1[31] = -bf0[1];
   range_check(stage, input, bf1, size, stage_range[stage]);
 }
+
+#if CONFIG_TX64X64
+void av1_fdct64_new(const int32_t *input, int32_t *output,
+                    const int8_t *cos_bit, const int8_t *stage_range) {
+  const int32_t size = 64;
+  const int32_t *cospi;
+
+  int32_t stage = 0;
+  int32_t *bf0, *bf1;
+  int32_t step[64];
+
+  // stage 0;
+  range_check(stage, input, input, size, stage_range[stage]);
+
+  // stage 1;
+  stage++;
+  cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+  bf1 = output;
+  bf1[0] = input[0] + input[63];
+  bf1[1] = input[1] + input[62];
+  bf1[2] = input[2] + input[61];
+  bf1[3] = input[3] + input[60];
+  bf1[4] = input[4] + input[59];
+  bf1[5] = input[5] + input[58];
+  bf1[6] = input[6] + input[57];
+  bf1[7] = input[7] + input[56];
+  bf1[8] = input[8] + input[55];
+  bf1[9] = input[9] + input[54];
+  bf1[10] = input[10] + input[53];
+  bf1[11] = input[11] + input[52];
+  bf1[12] = input[12] + input[51];
+  bf1[13] = input[13] + input[50];
+  bf1[14] = input[14] + input[49];
+  bf1[15] = input[15] + input[48];
+  bf1[16] = input[16] + input[47];
+  bf1[17] = input[17] + input[46];
+  bf1[18] = input[18] + input[45];
+  bf1[19] = input[19] + input[44];
+  bf1[20] = input[20] + input[43];
+  bf1[21] = input[21] + input[42];
+  bf1[22] = input[22] + input[41];
+  bf1[23] = input[23] + input[40];
+  bf1[24] = input[24] + input[39];
+  bf1[25] = input[25] + input[38];
+  bf1[26] = input[26] + input[37];
+  bf1[27] = input[27] + input[36];
+  bf1[28] = input[28] + input[35];
+  bf1[29] = input[29] + input[34];
+  bf1[30] = input[30] + input[33];
+  bf1[31] = input[31] + input[32];
+  bf1[32] = -input[32] + input[31];
+  bf1[33] = -input[33] + input[30];
+  bf1[34] = -input[34] + input[29];
+  bf1[35] = -input[35] + input[28];
+  bf1[36] = -input[36] + input[27];
+  bf1[37] = -input[37] + input[26];
+  bf1[38] = -input[38] + input[25];
+  bf1[39] = -input[39] + input[24];
+  bf1[40] = -input[40] + input[23];
+  bf1[41] = -input[41] + input[22];
+  bf1[42] = -input[42] + input[21];
+  bf1[43] = -input[43] + input[20];
+  bf1[44] = -input[44] + input[19];
+  bf1[45] = -input[45] + input[18];
+  bf1[46] = -input[46] + input[17];
+  bf1[47] = -input[47] + input[16];
+  bf1[48] = -input[48] + input[15];
+  bf1[49] = -input[49] + input[14];
+  bf1[50] = -input[50] + input[13];
+  bf1[51] = -input[51] + input[12];
+  bf1[52] = -input[52] + input[11];
+  bf1[53] = -input[53] + input[10];
+  bf1[54] = -input[54] + input[9];
+  bf1[55] = -input[55] + input[8];
+  bf1[56] = -input[56] + input[7];
+  bf1[57] = -input[57] + input[6];
+  bf1[58] = -input[58] + input[5];
+  bf1[59] = -input[59] + input[4];
+  bf1[60] = -input[60] + input[3];
+  bf1[61] = -input[61] + input[2];
+  bf1[62] = -input[62] + input[1];
+  bf1[63] = -input[63] + input[0];
+  range_check(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 2
+  stage++;
+  cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+  bf0 = output;
+  bf1 = step;
+  bf1[0] = bf0[0] + bf0[31];
+  bf1[1] = bf0[1] + bf0[30];
+  bf1[2] = bf0[2] + bf0[29];
+  bf1[3] = bf0[3] + bf0[28];
+  bf1[4] = bf0[4] + bf0[27];
+  bf1[5] = bf0[5] + bf0[26];
+  bf1[6] = bf0[6] + bf0[25];
+  bf1[7] = bf0[7] + bf0[24];
+  bf1[8] = bf0[8] + bf0[23];
+  bf1[9] = bf0[9] + bf0[22];
+  bf1[10] = bf0[10] + bf0[21];
+  bf1[11] = bf0[11] + bf0[20];
+  bf1[12] = bf0[12] + bf0[19];
+  bf1[13] = bf0[13] + bf0[18];
+  bf1[14] = bf0[14] + bf0[17];
+  bf1[15] = bf0[15] + bf0[16];
+  bf1[16] = -bf0[16] + bf0[15];
+  bf1[17] = -bf0[17] + bf0[14];
+  bf1[18] = -bf0[18] + bf0[13];
+  bf1[19] = -bf0[19] + bf0[12];
+  bf1[20] = -bf0[20] + bf0[11];
+  bf1[21] = -bf0[21] + bf0[10];
+  bf1[22] = -bf0[22] + bf0[9];
+  bf1[23] = -bf0[23] + bf0[8];
+  bf1[24] = -bf0[24] + bf0[7];
+  bf1[25] = -bf0[25] + bf0[6];
+  bf1[26] = -bf0[26] + bf0[5];
+  bf1[27] = -bf0[27] + bf0[4];
+  bf1[28] = -bf0[28] + bf0[3];
+  bf1[29] = -bf0[29] + bf0[2];
+  bf1[30] = -bf0[30] + bf0[1];
+  bf1[31] = -bf0[31] + bf0[0];
+  bf1[32] = bf0[32];
+  bf1[33] = bf0[33];
+  bf1[34] = bf0[34];
+  bf1[35] = bf0[35];
+  bf1[36] = bf0[36];
+  bf1[37] = bf0[37];
+  bf1[38] = bf0[38];
+  bf1[39] = bf0[39];
+  bf1[40] = half_btf(-cospi[32], bf0[40], cospi[32], bf0[55], cos_bit[stage]);
+  bf1[41] = half_btf(-cospi[32], bf0[41], cospi[32], bf0[54], cos_bit[stage]);
+  bf1[42] = half_btf(-cospi[32], bf0[42], cospi[32], bf0[53], cos_bit[stage]);
+  bf1[43] = half_btf(-cospi[32], bf0[43], cospi[32], bf0[52], cos_bit[stage]);
+  bf1[44] = half_btf(-cospi[32], bf0[44], cospi[32], bf0[51], cos_bit[stage]);
+  bf1[45] = half_btf(-cospi[32], bf0[45], cospi[32], bf0[50], cos_bit[stage]);
+  bf1[46] = half_btf(-cospi[32], bf0[46], cospi[32], bf0[49], cos_bit[stage]);
+  bf1[47] = half_btf(-cospi[32], bf0[47], cospi[32], bf0[48], cos_bit[stage]);
+  bf1[48] = half_btf(cospi[32], bf0[48], cospi[32], bf0[47], cos_bit[stage]);
+  bf1[49] = half_btf(cospi[32], bf0[49], cospi[32], bf0[46], cos_bit[stage]);
+  bf1[50] = half_btf(cospi[32], bf0[50], cospi[32], bf0[45], cos_bit[stage]);
+  bf1[51] = half_btf(cospi[32], bf0[51], cospi[32], bf0[44], cos_bit[stage]);
+  bf1[52] = half_btf(cospi[32], bf0[52], cospi[32], bf0[43], cos_bit[stage]);
+  bf1[53] = half_btf(cospi[32], bf0[53], cospi[32], bf0[42], cos_bit[stage]);
+  bf1[54] = half_btf(cospi[32], bf0[54], cospi[32], bf0[41], cos_bit[stage]);
+  bf1[55] = half_btf(cospi[32], bf0[55], cospi[32], bf0[40], cos_bit[stage]);
+  bf1[56] = bf0[56];
+  bf1[57] = bf0[57];
+  bf1[58] = bf0[58];
+  bf1[59] = bf0[59];
+  bf1[60] = bf0[60];
+  bf1[61] = bf0[61];
+  bf1[62] = bf0[62];
+  bf1[63] = bf0[63];
+  range_check(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 3
+  stage++;
+  cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+  bf0 = step;
+  bf1 = output;
+  bf1[0] = bf0[0] + bf0[15];
+  bf1[1] = bf0[1] + bf0[14];
+  bf1[2] = bf0[2] + bf0[13];
+  bf1[3] = bf0[3] + bf0[12];
+  bf1[4] = bf0[4] + bf0[11];
+  bf1[5] = bf0[5] + bf0[10];
+  bf1[6] = bf0[6] + bf0[9];
+  bf1[7] = bf0[7] + bf0[8];
+  bf1[8] = -bf0[8] + bf0[7];
+  bf1[9] = -bf0[9] + bf0[6];
+  bf1[10] = -bf0[10] + bf0[5];
+  bf1[11] = -bf0[11] + bf0[4];
+  bf1[12] = -bf0[12] + bf0[3];
+  bf1[13] = -bf0[13] + bf0[2];
+  bf1[14] = -bf0[14] + bf0[1];
+  bf1[15] = -bf0[15] + bf0[0];
+  bf1[16] = bf0[16];
+  bf1[17] = bf0[17];
+  bf1[18] = bf0[18];
+  bf1[19] = bf0[19];
+  bf1[20] = half_btf(-cospi[32], bf0[20], cospi[32], bf0[27], cos_bit[stage]);
+  bf1[21] = half_btf(-cospi[32], bf0[21], cospi[32], bf0[26], cos_bit[stage]);
+  bf1[22] = half_btf(-cospi[32], bf0[22], cospi[32], bf0[25], cos_bit[stage]);
+  bf1[23] = half_btf(-cospi[32], bf0[23], cospi[32], bf0[24], cos_bit[stage]);
+  bf1[24] = half_btf(cospi[32], bf0[24], cospi[32], bf0[23], cos_bit[stage]);
+  bf1[25] = half_btf(cospi[32], bf0[25], cospi[32], bf0[22], cos_bit[stage]);
+  bf1[26] = half_btf(cospi[32], bf0[26], cospi[32], bf0[21], cos_bit[stage]);
+  bf1[27] = half_btf(cospi[32], bf0[27], cospi[32], bf0[20], cos_bit[stage]);
+  bf1[28] = bf0[28];
+  bf1[29] = bf0[29];
+  bf1[30] = bf0[30];
+  bf1[31] = bf0[31];
+  bf1[32] = bf0[32] + bf0[47];
+  bf1[33] = bf0[33] + bf0[46];
+  bf1[34] = bf0[34] + bf0[45];
+  bf1[35] = bf0[35] + bf0[44];
+  bf1[36] = bf0[36] + bf0[43];
+  bf1[37] = bf0[37] + bf0[42];
+  bf1[38] = bf0[38] + bf0[41];
+  bf1[39] = bf0[39] + bf0[40];
+  bf1[40] = -bf0[40] + bf0[39];
+  bf1[41] = -bf0[41] + bf0[38];
+  bf1[42] = -bf0[42] + bf0[37];
+  bf1[43] = -bf0[43] + bf0[36];
+  bf1[44] = -bf0[44] + bf0[35];
+  bf1[45] = -bf0[45] + bf0[34];
+  bf1[46] = -bf0[46] + bf0[33];
+  bf1[47] = -bf0[47] + bf0[32];
+  bf1[48] = -bf0[48] + bf0[63];
+  bf1[49] = -bf0[49] + bf0[62];
+  bf1[50] = -bf0[50] + bf0[61];
+  bf1[51] = -bf0[51] + bf0[60];
+  bf1[52] = -bf0[52] + bf0[59];
+  bf1[53] = -bf0[53] + bf0[58];
+  bf1[54] = -bf0[54] + bf0[57];
+  bf1[55] = -bf0[55] + bf0[56];
+  bf1[56] = bf0[56] + bf0[55];
+  bf1[57] = bf0[57] + bf0[54];
+  bf1[58] = bf0[58] + bf0[53];
+  bf1[59] = bf0[59] + bf0[52];
+  bf1[60] = bf0[60] + bf0[51];
+  bf1[61] = bf0[61] + bf0[50];
+  bf1[62] = bf0[62] + bf0[49];
+  bf1[63] = bf0[63] + bf0[48];
+  range_check(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 4
+  stage++;
+  cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+  bf0 = output;
+  bf1 = step;
+  bf1[0] = bf0[0] + bf0[7];
+  bf1[1] = bf0[1] + bf0[6];
+  bf1[2] = bf0[2] + bf0[5];
+  bf1[3] = bf0[3] + bf0[4];
+  bf1[4] = -bf0[4] + bf0[3];
+  bf1[5] = -bf0[5] + bf0[2];
+  bf1[6] = -bf0[6] + bf0[1];
+  bf1[7] = -bf0[7] + bf0[0];
+  bf1[8] = bf0[8];
+  bf1[9] = bf0[9];
+  bf1[10] = half_btf(-cospi[32], bf0[10], cospi[32], bf0[13], cos_bit[stage]);
+  bf1[11] = half_btf(-cospi[32], bf0[11], cospi[32], bf0[12], cos_bit[stage]);
+  bf1[12] = half_btf(cospi[32], bf0[12], cospi[32], bf0[11], cos_bit[stage]);
+  bf1[13] = half_btf(cospi[32], bf0[13], cospi[32], bf0[10], cos_bit[stage]);
+  bf1[14] = bf0[14];
+  bf1[15] = bf0[15];
+  bf1[16] = bf0[16] + bf0[23];
+  bf1[17] = bf0[17] + bf0[22];
+  bf1[18] = bf0[18] + bf0[21];
+  bf1[19] = bf0[19] + bf0[20];
+  bf1[20] = -bf0[20] + bf0[19];
+  bf1[21] = -bf0[21] + bf0[18];
+  bf1[22] = -bf0[22] + bf0[17];
+  bf1[23] = -bf0[23] + bf0[16];
+  bf1[24] = -bf0[24] + bf0[31];
+  bf1[25] = -bf0[25] + bf0[30];
+  bf1[26] = -bf0[26] + bf0[29];
+  bf1[27] = -bf0[27] + bf0[28];
+  bf1[28] = bf0[28] + bf0[27];
+  bf1[29] = bf0[29] + bf0[26];
+  bf1[30] = bf0[30] + bf0[25];
+  bf1[31] = bf0[31] + bf0[24];
+  bf1[32] = bf0[32];
+  bf1[33] = bf0[33];
+  bf1[34] = bf0[34];
+  bf1[35] = bf0[35];
+  bf1[36] = half_btf(-cospi[16], bf0[36], cospi[48], bf0[59], cos_bit[stage]);
+  bf1[37] = half_btf(-cospi[16], bf0[37], cospi[48], bf0[58], cos_bit[stage]);
+  bf1[38] = half_btf(-cospi[16], bf0[38], cospi[48], bf0[57], cos_bit[stage]);
+  bf1[39] = half_btf(-cospi[16], bf0[39], cospi[48], bf0[56], cos_bit[stage]);
+  bf1[40] = half_btf(-cospi[48], bf0[40], -cospi[16], bf0[55], cos_bit[stage]);
+  bf1[41] = half_btf(-cospi[48], bf0[41], -cospi[16], bf0[54], cos_bit[stage]);
+  bf1[42] = half_btf(-cospi[48], bf0[42], -cospi[16], bf0[53], cos_bit[stage]);
+  bf1[43] = half_btf(-cospi[48], bf0[43], -cospi[16], bf0[52], cos_bit[stage]);
+  bf1[44] = bf0[44];
+  bf1[45] = bf0[45];
+  bf1[46] = bf0[46];
+  bf1[47] = bf0[47];
+  bf1[48] = bf0[48];
+  bf1[49] = bf0[49];
+  bf1[50] = bf0[50];
+  bf1[51] = bf0[51];
+  bf1[52] = half_btf(cospi[48], bf0[52], -cospi[16], bf0[43], cos_bit[stage]);
+  bf1[53] = half_btf(cospi[48], bf0[53], -cospi[16], bf0[42], cos_bit[stage]);
+  bf1[54] = half_btf(cospi[48], bf0[54], -cospi[16], bf0[41], cos_bit[stage]);
+  bf1[55] = half_btf(cospi[48], bf0[55], -cospi[16], bf0[40], cos_bit[stage]);
+  bf1[56] = half_btf(cospi[16], bf0[56], cospi[48], bf0[39], cos_bit[stage]);
+  bf1[57] = half_btf(cospi[16], bf0[57], cospi[48], bf0[38], cos_bit[stage]);
+  bf1[58] = half_btf(cospi[16], bf0[58], cospi[48], bf0[37], cos_bit[stage]);
+  bf1[59] = half_btf(cospi[16], bf0[59], cospi[48], bf0[36], cos_bit[stage]);
+  bf1[60] = bf0[60];
+  bf1[61] = bf0[61];
+  bf1[62] = bf0[62];
+  bf1[63] = bf0[63];
+  range_check(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 5
+  stage++;
+  cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+  bf0 = step;
+  bf1 = output;
+  bf1[0] = bf0[0] + bf0[3];
+  bf1[1] = bf0[1] + bf0[2];
+  bf1[2] = -bf0[2] + bf0[1];
+  bf1[3] = -bf0[3] + bf0[0];
+  bf1[4] = bf0[4];
+  bf1[5] = half_btf(-cospi[32], bf0[5], cospi[32], bf0[6], cos_bit[stage]);
+  bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[5], cos_bit[stage]);
+  bf1[7] = bf0[7];
+  bf1[8] = bf0[8] + bf0[11];
+  bf1[9] = bf0[9] + bf0[10];
+  bf1[10] = -bf0[10] + bf0[9];
+  bf1[11] = -bf0[11] + bf0[8];
+  bf1[12] = -bf0[12] + bf0[15];
+  bf1[13] = -bf0[13] + bf0[14];
+  bf1[14] = bf0[14] + bf0[13];
+  bf1[15] = bf0[15] + bf0[12];
+  bf1[16] = bf0[16];
+  bf1[17] = bf0[17];
+  bf1[18] = half_btf(-cospi[16], bf0[18], cospi[48], bf0[29], cos_bit[stage]);
+  bf1[19] = half_btf(-cospi[16], bf0[19], cospi[48], bf0[28], cos_bit[stage]);
+  bf1[20] = half_btf(-cospi[48], bf0[20], -cospi[16], bf0[27], cos_bit[stage]);
+  bf1[21] = half_btf(-cospi[48], bf0[21], -cospi[16], bf0[26], cos_bit[stage]);
+  bf1[22] = bf0[22];
+  bf1[23] = bf0[23];
+  bf1[24] = bf0[24];
+  bf1[25] = bf0[25];
+  bf1[26] = half_btf(cospi[48], bf0[26], -cospi[16], bf0[21], cos_bit[stage]);
+  bf1[27] = half_btf(cospi[48], bf0[27], -cospi[16], bf0[20], cos_bit[stage]);
+  bf1[28] = half_btf(cospi[16], bf0[28], cospi[48], bf0[19], cos_bit[stage]);
+  bf1[29] = half_btf(cospi[16], bf0[29], cospi[48], bf0[18], cos_bit[stage]);
+  bf1[30] = bf0[30];
+  bf1[31] = bf0[31];
+  bf1[32] = bf0[32] + bf0[39];
+  bf1[33] = bf0[33] + bf0[38];
+  bf1[34] = bf0[34] + bf0[37];
+  bf1[35] = bf0[35] + bf0[36];
+  bf1[36] = -bf0[36] + bf0[35];
+  bf1[37] = -bf0[37] + bf0[34];
+  bf1[38] = -bf0[38] + bf0[33];
+  bf1[39] = -bf0[39] + bf0[32];
+  bf1[40] = -bf0[40] + bf0[47];
+  bf1[41] = -bf0[41] + bf0[46];
+  bf1[42] = -bf0[42] + bf0[45];
+  bf1[43] = -bf0[43] + bf0[44];
+  bf1[44] = bf0[44] + bf0[43];
+  bf1[45] = bf0[45] + bf0[42];
+  bf1[46] = bf0[46] + bf0[41];
+  bf1[47] = bf0[47] + bf0[40];
+  bf1[48] = bf0[48] + bf0[55];
+  bf1[49] = bf0[49] + bf0[54];
+  bf1[50] = bf0[50] + bf0[53];
+  bf1[51] = bf0[51] + bf0[52];
+  bf1[52] = -bf0[52] + bf0[51];
+  bf1[53] = -bf0[53] + bf0[50];
+  bf1[54] = -bf0[54] + bf0[49];
+  bf1[55] = -bf0[55] + bf0[48];
+  bf1[56] = -bf0[56] + bf0[63];
+  bf1[57] = -bf0[57] + bf0[62];
+  bf1[58] = -bf0[58] + bf0[61];
+  bf1[59] = -bf0[59] + bf0[60];
+  bf1[60] = bf0[60] + bf0[59];
+  bf1[61] = bf0[61] + bf0[58];
+  bf1[62] = bf0[62] + bf0[57];
+  bf1[63] = bf0[63] + bf0[56];
+  range_check(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 6
+  stage++;
+  cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+  bf0 = output;
+  bf1 = step;
+  bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit[stage]);
+  bf1[1] = half_btf(-cospi[32], bf0[1], cospi[32], bf0[0], cos_bit[stage]);
+  bf1[2] = half_btf(cospi[48], bf0[2], cospi[16], bf0[3], cos_bit[stage]);
+  bf1[3] = half_btf(cospi[48], bf0[3], -cospi[16], bf0[2], cos_bit[stage]);
+  bf1[4] = bf0[4] + bf0[5];
+  bf1[5] = -bf0[5] + bf0[4];
+  bf1[6] = -bf0[6] + bf0[7];
+  bf1[7] = bf0[7] + bf0[6];
+  bf1[8] = bf0[8];
+  bf1[9] = half_btf(-cospi[16], bf0[9], cospi[48], bf0[14], cos_bit[stage]);
+  bf1[10] = half_btf(-cospi[48], bf0[10], -cospi[16], bf0[13], cos_bit[stage]);
+  bf1[11] = bf0[11];
+  bf1[12] = bf0[12];
+  bf1[13] = half_btf(cospi[48], bf0[13], -cospi[16], bf0[10], cos_bit[stage]);
+  bf1[14] = half_btf(cospi[16], bf0[14], cospi[48], bf0[9], cos_bit[stage]);
+  bf1[15] = bf0[15];
+  bf1[16] = bf0[16] + bf0[19];
+  bf1[17] = bf0[17] + bf0[18];
+  bf1[18] = -bf0[18] + bf0[17];
+  bf1[19] = -bf0[19] + bf0[16];
+  bf1[20] = -bf0[20] + bf0[23];
+  bf1[21] = -bf0[21] + bf0[22];
+  bf1[22] = bf0[22] + bf0[21];
+  bf1[23] = bf0[23] + bf0[20];
+  bf1[24] = bf0[24] + bf0[27];
+  bf1[25] = bf0[25] + bf0[26];
+  bf1[26] = -bf0[26] + bf0[25];
+  bf1[27] = -bf0[27] + bf0[24];
+  bf1[28] = -bf0[28] + bf0[31];
+  bf1[29] = -bf0[29] + bf0[30];
+  bf1[30] = bf0[30] + bf0[29];
+  bf1[31] = bf0[31] + bf0[28];
+  bf1[32] = bf0[32];
+  bf1[33] = bf0[33];
+  bf1[34] = half_btf(-cospi[8], bf0[34], cospi[56], bf0[61], cos_bit[stage]);
+  bf1[35] = half_btf(-cospi[8], bf0[35], cospi[56], bf0[60], cos_bit[stage]);
+  bf1[36] = half_btf(-cospi[56], bf0[36], -cospi[8], bf0[59], cos_bit[stage]);
+  bf1[37] = half_btf(-cospi[56], bf0[37], -cospi[8], bf0[58], cos_bit[stage]);
+  bf1[38] = bf0[38];
+  bf1[39] = bf0[39];
+  bf1[40] = bf0[40];
+  bf1[41] = bf0[41];
+  bf1[42] = half_btf(-cospi[40], bf0[42], cospi[24], bf0[53], cos_bit[stage]);
+  bf1[43] = half_btf(-cospi[40], bf0[43], cospi[24], bf0[52], cos_bit[stage]);
+  bf1[44] = half_btf(-cospi[24], bf0[44], -cospi[40], bf0[51], cos_bit[stage]);
+  bf1[45] = half_btf(-cospi[24], bf0[45], -cospi[40], bf0[50], cos_bit[stage]);
+  bf1[46] = bf0[46];
+  bf1[47] = bf0[47];
+  bf1[48] = bf0[48];
+  bf1[49] = bf0[49];
+  bf1[50] = half_btf(cospi[24], bf0[50], -cospi[40], bf0[45], cos_bit[stage]);
+  bf1[51] = half_btf(cospi[24], bf0[51], -cospi[40], bf0[44], cos_bit[stage]);
+  bf1[52] = half_btf(cospi[40], bf0[52], cospi[24], bf0[43], cos_bit[stage]);
+  bf1[53] = half_btf(cospi[40], bf0[53], cospi[24], bf0[42], cos_bit[stage]);
+  bf1[54] = bf0[54];
+  bf1[55] = bf0[55];
+  bf1[56] = bf0[56];
+  bf1[57] = bf0[57];
+  bf1[58] = half_btf(cospi[56], bf0[58], -cospi[8], bf0[37], cos_bit[stage]);
+  bf1[59] = half_btf(cospi[56], bf0[59], -cospi[8], bf0[36], cos_bit[stage]);
+  bf1[60] = half_btf(cospi[8], bf0[60], cospi[56], bf0[35], cos_bit[stage]);
+  bf1[61] = half_btf(cospi[8], bf0[61], cospi[56], bf0[34], cos_bit[stage]);
+  bf1[62] = bf0[62];
+  bf1[63] = bf0[63];
+  range_check(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 7
+  stage++;
+  cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+  bf0 = step;
+  bf1 = output;
+  bf1[0] = bf0[0];
+  bf1[1] = bf0[1];
+  bf1[2] = bf0[2];
+  bf1[3] = bf0[3];
+  bf1[4] = half_btf(cospi[56], bf0[4], cospi[8], bf0[7], cos_bit[stage]);
+  bf1[5] = half_btf(cospi[24], bf0[5], cospi[40], bf0[6], cos_bit[stage]);
+  bf1[6] = half_btf(cospi[24], bf0[6], -cospi[40], bf0[5], cos_bit[stage]);
+  bf1[7] = half_btf(cospi[56], bf0[7], -cospi[8], bf0[4], cos_bit[stage]);
+  bf1[8] = bf0[8] + bf0[9];
+  bf1[9] = -bf0[9] + bf0[8];
+  bf1[10] = -bf0[10] + bf0[11];
+  bf1[11] = bf0[11] + bf0[10];
+  bf1[12] = bf0[12] + bf0[13];
+  bf1[13] = -bf0[13] + bf0[12];
+  bf1[14] = -bf0[14] + bf0[15];
+  bf1[15] = bf0[15] + bf0[14];
+  bf1[16] = bf0[16];
+  bf1[17] = half_btf(-cospi[8], bf0[17], cospi[56], bf0[30], cos_bit[stage]);
+  bf1[18] = half_btf(-cospi[56], bf0[18], -cospi[8], bf0[29], cos_bit[stage]);
+  bf1[19] = bf0[19];
+  bf1[20] = bf0[20];
+  bf1[21] = half_btf(-cospi[40], bf0[21], cospi[24], bf0[26], cos_bit[stage]);
+  bf1[22] = half_btf(-cospi[24], bf0[22], -cospi[40], bf0[25], cos_bit[stage]);
+  bf1[23] = bf0[23];
+  bf1[24] = bf0[24];
+  bf1[25] = half_btf(cospi[24], bf0[25], -cospi[40], bf0[22], cos_bit[stage]);
+  bf1[26] = half_btf(cospi[40], bf0[26], cospi[24], bf0[21], cos_bit[stage]);
+  bf1[27] = bf0[27];
+  bf1[28] = bf0[28];
+  bf1[29] = half_btf(cospi[56], bf0[29], -cospi[8], bf0[18], cos_bit[stage]);
+  bf1[30] = half_btf(cospi[8], bf0[30], cospi[56], bf0[17], cos_bit[stage]);
+  bf1[31] = bf0[31];
+  bf1[32] = bf0[32] + bf0[35];
+  bf1[33] = bf0[33] + bf0[34];
+  bf1[34] = -bf0[34] + bf0[33];
+  bf1[35] = -bf0[35] + bf0[32];
+  bf1[36] = -bf0[36] + bf0[39];
+  bf1[37] = -bf0[37] + bf0[38];
+  bf1[38] = bf0[38] + bf0[37];
+  bf1[39] = bf0[39] + bf0[36];
+  bf1[40] = bf0[40] + bf0[43];
+  bf1[41] = bf0[41] + bf0[42];
+  bf1[42] = -bf0[42] + bf0[41];
+  bf1[43] = -bf0[43] + bf0[40];
+  bf1[44] = -bf0[44] + bf0[47];
+  bf1[45] = -bf0[45] + bf0[46];
+  bf1[46] = bf0[46] + bf0[45];
+  bf1[47] = bf0[47] + bf0[44];
+  bf1[48] = bf0[48] + bf0[51];
+  bf1[49] = bf0[49] + bf0[50];
+  bf1[50] = -bf0[50] + bf0[49];
+  bf1[51] = -bf0[51] + bf0[48];
+  bf1[52] = -bf0[52] + bf0[55];
+  bf1[53] = -bf0[53] + bf0[54];
+  bf1[54] = bf0[54] + bf0[53];
+  bf1[55] = bf0[55] + bf0[52];
+  bf1[56] = bf0[56] + bf0[59];
+  bf1[57] = bf0[57] + bf0[58];
+  bf1[58] = -bf0[58] + bf0[57];
+  bf1[59] = -bf0[59] + bf0[56];
+  bf1[60] = -bf0[60] + bf0[63];
+  bf1[61] = -bf0[61] + bf0[62];
+  bf1[62] = bf0[62] + bf0[61];
+  bf1[63] = bf0[63] + bf0[60];
+  range_check(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 8
+  stage++;
+  cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+  bf0 = output;
+  bf1 = step;
+  bf1[0] = bf0[0];
+  bf1[1] = bf0[1];
+  bf1[2] = bf0[2];
+  bf1[3] = bf0[3];
+  bf1[4] = bf0[4];
+  bf1[5] = bf0[5];
+  bf1[6] = bf0[6];
+  bf1[7] = bf0[7];
+  bf1[8] = half_btf(cospi[60], bf0[8], cospi[4], bf0[15], cos_bit[stage]);
+  bf1[9] = half_btf(cospi[28], bf0[9], cospi[36], bf0[14], cos_bit[stage]);
+  bf1[10] = half_btf(cospi[44], bf0[10], cospi[20], bf0[13], cos_bit[stage]);
+  bf1[11] = half_btf(cospi[12], bf0[11], cospi[52], bf0[12], cos_bit[stage]);
+  bf1[12] = half_btf(cospi[12], bf0[12], -cospi[52], bf0[11], cos_bit[stage]);
+  bf1[13] = half_btf(cospi[44], bf0[13], -cospi[20], bf0[10], cos_bit[stage]);
+  bf1[14] = half_btf(cospi[28], bf0[14], -cospi[36], bf0[9], cos_bit[stage]);
+  bf1[15] = half_btf(cospi[60], bf0[15], -cospi[4], bf0[8], cos_bit[stage]);
+  bf1[16] = bf0[16] + bf0[17];
+  bf1[17] = -bf0[17] + bf0[16];
+  bf1[18] = -bf0[18] + bf0[19];
+  bf1[19] = bf0[19] + bf0[18];
+  bf1[20] = bf0[20] + bf0[21];
+  bf1[21] = -bf0[21] + bf0[20];
+  bf1[22] = -bf0[22] + bf0[23];
+  bf1[23] = bf0[23] + bf0[22];
+  bf1[24] = bf0[24] + bf0[25];
+  bf1[25] = -bf0[25] + bf0[24];
+  bf1[26] = -bf0[26] + bf0[27];
+  bf1[27] = bf0[27] + bf0[26];
+  bf1[28] = bf0[28] + bf0[29];
+  bf1[29] = -bf0[29] + bf0[28];
+  bf1[30] = -bf0[30] + bf0[31];
+  bf1[31] = bf0[31] + bf0[30];
+  bf1[32] = bf0[32];
+  bf1[33] = half_btf(-cospi[4], bf0[33], cospi[60], bf0[62], cos_bit[stage]);
+  bf1[34] = half_btf(-cospi[60], bf0[34], -cospi[4], bf0[61], cos_bit[stage]);
+  bf1[35] = bf0[35];
+  bf1[36] = bf0[36];
+  bf1[37] = half_btf(-cospi[36], bf0[37], cospi[28], bf0[58], cos_bit[stage]);
+  bf1[38] = half_btf(-cospi[28], bf0[38], -cospi[36], bf0[57], cos_bit[stage]);
+  bf1[39] = bf0[39];
+  bf1[40] = bf0[40];
+  bf1[41] = half_btf(-cospi[20], bf0[41], cospi[44], bf0[54], cos_bit[stage]);
+  bf1[42] = half_btf(-cospi[44], bf0[42], -cospi[20], bf0[53], cos_bit[stage]);
+  bf1[43] = bf0[43];
+  bf1[44] = bf0[44];
+  bf1[45] = half_btf(-cospi[52], bf0[45], cospi[12], bf0[50], cos_bit[stage]);
+  bf1[46] = half_btf(-cospi[12], bf0[46], -cospi[52], bf0[49], cos_bit[stage]);
+  bf1[47] = bf0[47];
+  bf1[48] = bf0[48];
+  bf1[49] = half_btf(cospi[12], bf0[49], -cospi[52], bf0[46], cos_bit[stage]);
+  bf1[50] = half_btf(cospi[52], bf0[50], cospi[12], bf0[45], cos_bit[stage]);
+  bf1[51] = bf0[51];
+  bf1[52] = bf0[52];
+  bf1[53] = half_btf(cospi[44], bf0[53], -cospi[20], bf0[42], cos_bit[stage]);
+  bf1[54] = half_btf(cospi[20], bf0[54], cospi[44], bf0[41], cos_bit[stage]);
+  bf1[55] = bf0[55];
+  bf1[56] = bf0[56];
+  bf1[57] = half_btf(cospi[28], bf0[57], -cospi[36], bf0[38], cos_bit[stage]);
+  bf1[58] = half_btf(cospi[36], bf0[58], cospi[28], bf0[37], cos_bit[stage]);
+  bf1[59] = bf0[59];
+  bf1[60] = bf0[60];
+  bf1[61] = half_btf(cospi[60], bf0[61], -cospi[4], bf0[34], cos_bit[stage]);
+  bf1[62] = half_btf(cospi[4], bf0[62], cospi[60], bf0[33], cos_bit[stage]);
+  bf1[63] = bf0[63];
+  range_check(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 9
+  stage++;
+  cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+  bf0 = step;
+  bf1 = output;
+  bf1[0] = bf0[0];
+  bf1[1] = bf0[1];
+  bf1[2] = bf0[2];
+  bf1[3] = bf0[3];
+  bf1[4] = bf0[4];
+  bf1[5] = bf0[5];
+  bf1[6] = bf0[6];
+  bf1[7] = bf0[7];
+  bf1[8] = bf0[8];
+  bf1[9] = bf0[9];
+  bf1[10] = bf0[10];
+  bf1[11] = bf0[11];
+  bf1[12] = bf0[12];
+  bf1[13] = bf0[13];
+  bf1[14] = bf0[14];
+  bf1[15] = bf0[15];
+  bf1[16] = half_btf(cospi[62], bf0[16], cospi[2], bf0[31], cos_bit[stage]);
+  bf1[17] = half_btf(cospi[30], bf0[17], cospi[34], bf0[30], cos_bit[stage]);
+  bf1[18] = half_btf(cospi[46], bf0[18], cospi[18], bf0[29], cos_bit[stage]);
+  bf1[19] = half_btf(cospi[14], bf0[19], cospi[50], bf0[28], cos_bit[stage]);
+  bf1[20] = half_btf(cospi[54], bf0[20], cospi[10], bf0[27], cos_bit[stage]);
+  bf1[21] = half_btf(cospi[22], bf0[21], cospi[42], bf0[26], cos_bit[stage]);
+  bf1[22] = half_btf(cospi[38], bf0[22], cospi[26], bf0[25], cos_bit[stage]);
+  bf1[23] = half_btf(cospi[6], bf0[23], cospi[58], bf0[24], cos_bit[stage]);
+  bf1[24] = half_btf(cospi[6], bf0[24], -cospi[58], bf0[23], cos_bit[stage]);
+  bf1[25] = half_btf(cospi[38], bf0[25], -cospi[26], bf0[22], cos_bit[stage]);
+  bf1[26] = half_btf(cospi[22], bf0[26], -cospi[42], bf0[21], cos_bit[stage]);
+  bf1[27] = half_btf(cospi[54], bf0[27], -cospi[10], bf0[20], cos_bit[stage]);
+  bf1[28] = half_btf(cospi[14], bf0[28], -cospi[50], bf0[19], cos_bit[stage]);
+  bf1[29] = half_btf(cospi[46], bf0[29], -cospi[18], bf0[18], cos_bit[stage]);
+  bf1[30] = half_btf(cospi[30], bf0[30], -cospi[34], bf0[17], cos_bit[stage]);
+  bf1[31] = half_btf(cospi[62], bf0[31], -cospi[2], bf0[16], cos_bit[stage]);
+  bf1[32] = bf0[32] + bf0[33];
+  bf1[33] = -bf0[33] + bf0[32];
+  bf1[34] = -bf0[34] + bf0[35];
+  bf1[35] = bf0[35] + bf0[34];
+  bf1[36] = bf0[36] + bf0[37];
+  bf1[37] = -bf0[37] + bf0[36];
+  bf1[38] = -bf0[38] + bf0[39];
+  bf1[39] = bf0[39] + bf0[38];
+  bf1[40] = bf0[40] + bf0[41];
+  bf1[41] = -bf0[41] + bf0[40];
+  bf1[42] = -bf0[42] + bf0[43];
+  bf1[43] = bf0[43] + bf0[42];
+  bf1[44] = bf0[44] + bf0[45];
+  bf1[45] = -bf0[45] + bf0[44];
+  bf1[46] = -bf0[46] + bf0[47];
+  bf1[47] = bf0[47] + bf0[46];
+  bf1[48] = bf0[48] + bf0[49];
+  bf1[49] = -bf0[49] + bf0[48];
+  bf1[50] = -bf0[50] + bf0[51];
+  bf1[51] = bf0[51] + bf0[50];
+  bf1[52] = bf0[52] + bf0[53];
+  bf1[53] = -bf0[53] + bf0[52];
+  bf1[54] = -bf0[54] + bf0[55];
+  bf1[55] = bf0[55] + bf0[54];
+  bf1[56] = bf0[56] + bf0[57];
+  bf1[57] = -bf0[57] + bf0[56];
+  bf1[58] = -bf0[58] + bf0[59];
+  bf1[59] = bf0[59] + bf0[58];
+  bf1[60] = bf0[60] + bf0[61];
+  bf1[61] = -bf0[61] + bf0[60];
+  bf1[62] = -bf0[62] + bf0[63];
+  bf1[63] = bf0[63] + bf0[62];
+  range_check(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 10
+  stage++;
+  cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+  bf0 = output;
+  bf1 = step;
+  bf1[0] = bf0[0];
+  bf1[1] = bf0[1];
+  bf1[2] = bf0[2];
+  bf1[3] = bf0[3];
+  bf1[4] = bf0[4];
+  bf1[5] = bf0[5];
+  bf1[6] = bf0[6];
+  bf1[7] = bf0[7];
+  bf1[8] = bf0[8];
+  bf1[9] = bf0[9];
+  bf1[10] = bf0[10];
+  bf1[11] = bf0[11];
+  bf1[12] = bf0[12];
+  bf1[13] = bf0[13];
+  bf1[14] = bf0[14];
+  bf1[15] = bf0[15];
+  bf1[16] = bf0[16];
+  bf1[17] = bf0[17];
+  bf1[18] = bf0[18];
+  bf1[19] = bf0[19];
+  bf1[20] = bf0[20];
+  bf1[21] = bf0[21];
+  bf1[22] = bf0[22];
+  bf1[23] = bf0[23];
+  bf1[24] = bf0[24];
+  bf1[25] = bf0[25];
+  bf1[26] = bf0[26];
+  bf1[27] = bf0[27];
+  bf1[28] = bf0[28];
+  bf1[29] = bf0[29];
+  bf1[30] = bf0[30];
+  bf1[31] = bf0[31];
+  bf1[32] = half_btf(cospi[63], bf0[32], cospi[1], bf0[63], cos_bit[stage]);
+  bf1[33] = half_btf(cospi[31], bf0[33], cospi[33], bf0[62], cos_bit[stage]);
+  bf1[34] = half_btf(cospi[47], bf0[34], cospi[17], bf0[61], cos_bit[stage]);
+  bf1[35] = half_btf(cospi[15], bf0[35], cospi[49], bf0[60], cos_bit[stage]);
+  bf1[36] = half_btf(cospi[55], bf0[36], cospi[9], bf0[59], cos_bit[stage]);
+  bf1[37] = half_btf(cospi[23], bf0[37], cospi[41], bf0[58], cos_bit[stage]);
+  bf1[38] = half_btf(cospi[39], bf0[38], cospi[25], bf0[57], cos_bit[stage]);
+  bf1[39] = half_btf(cospi[7], bf0[39], cospi[57], bf0[56], cos_bit[stage]);
+  bf1[40] = half_btf(cospi[59], bf0[40], cospi[5], bf0[55], cos_bit[stage]);
+  bf1[41] = half_btf(cospi[27], bf0[41], cospi[37], bf0[54], cos_bit[stage]);
+  bf1[42] = half_btf(cospi[43], bf0[42], cospi[21], bf0[53], cos_bit[stage]);
+  bf1[43] = half_btf(cospi[11], bf0[43], cospi[53], bf0[52], cos_bit[stage]);
+  bf1[44] = half_btf(cospi[51], bf0[44], cospi[13], bf0[51], cos_bit[stage]);
+  bf1[45] = half_btf(cospi[19], bf0[45], cospi[45], bf0[50], cos_bit[stage]);
+  bf1[46] = half_btf(cospi[35], bf0[46], cospi[29], bf0[49], cos_bit[stage]);
+  bf1[47] = half_btf(cospi[3], bf0[47], cospi[61], bf0[48], cos_bit[stage]);
+  bf1[48] = half_btf(cospi[3], bf0[48], -cospi[61], bf0[47], cos_bit[stage]);
+  bf1[49] = half_btf(cospi[35], bf0[49], -cospi[29], bf0[46], cos_bit[stage]);
+  bf1[50] = half_btf(cospi[19], bf0[50], -cospi[45], bf0[45], cos_bit[stage]);
+  bf1[51] = half_btf(cospi[51], bf0[51], -cospi[13], bf0[44], cos_bit[stage]);
+  bf1[52] = half_btf(cospi[11], bf0[52], -cospi[53], bf0[43], cos_bit[stage]);
+  bf1[53] = half_btf(cospi[43], bf0[53], -cospi[21], bf0[42], cos_bit[stage]);
+  bf1[54] = half_btf(cospi[27], bf0[54], -cospi[37], bf0[41], cos_bit[stage]);
+  bf1[55] = half_btf(cospi[59], bf0[55], -cospi[5], bf0[40], cos_bit[stage]);
+  bf1[56] = half_btf(cospi[7], bf0[56], -cospi[57], bf0[39], cos_bit[stage]);
+  bf1[57] = half_btf(cospi[39], bf0[57], -cospi[25], bf0[38], cos_bit[stage]);
+  bf1[58] = half_btf(cospi[23], bf0[58], -cospi[41], bf0[37], cos_bit[stage]);
+  bf1[59] = half_btf(cospi[55], bf0[59], -cospi[9], bf0[36], cos_bit[stage]);
+  bf1[60] = half_btf(cospi[15], bf0[60], -cospi[49], bf0[35], cos_bit[stage]);
+  bf1[61] = half_btf(cospi[47], bf0[61], -cospi[17], bf0[34], cos_bit[stage]);
+  bf1[62] = half_btf(cospi[31], bf0[62], -cospi[33], bf0[33], cos_bit[stage]);
+  bf1[63] = half_btf(cospi[63], bf0[63], -cospi[1], bf0[32], cos_bit[stage]);
+  range_check(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 11
+  stage++;
+  cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+  bf0 = step;
+  bf1 = output;
+  bf1[0] = bf0[0];
+  bf1[1] = bf0[32];
+  bf1[2] = bf0[16];
+  bf1[3] = bf0[48];
+  bf1[4] = bf0[8];
+  bf1[5] = bf0[40];
+  bf1[6] = bf0[24];
+  bf1[7] = bf0[56];
+  bf1[8] = bf0[4];
+  bf1[9] = bf0[36];
+  bf1[10] = bf0[20];
+  bf1[11] = bf0[52];
+  bf1[12] = bf0[12];
+  bf1[13] = bf0[44];
+  bf1[14] = bf0[28];
+  bf1[15] = bf0[60];
+  bf1[16] = bf0[2];
+  bf1[17] = bf0[34];
+  bf1[18] = bf0[18];
+  bf1[19] = bf0[50];
+  bf1[20] = bf0[10];
+  bf1[21] = bf0[42];
+  bf1[22] = bf0[26];
+  bf1[23] = bf0[58];
+  bf1[24] = bf0[6];
+  bf1[25] = bf0[38];
+  bf1[26] = bf0[22];
+  bf1[27] = bf0[54];
+  bf1[28] = bf0[14];
+  bf1[29] = bf0[46];
+  bf1[30] = bf0[30];
+  bf1[31] = bf0[62];
+  bf1[32] = bf0[1];
+  bf1[33] = bf0[33];
+  bf1[34] = bf0[17];
+  bf1[35] = bf0[49];
+  bf1[36] = bf0[9];
+  bf1[37] = bf0[41];
+  bf1[38] = bf0[25];
+  bf1[39] = bf0[57];
+  bf1[40] = bf0[5];
+  bf1[41] = bf0[37];
+  bf1[42] = bf0[21];
+  bf1[43] = bf0[53];
+  bf1[44] = bf0[13];
+  bf1[45] = bf0[45];
+  bf1[46] = bf0[29];
+  bf1[47] = bf0[61];
+  bf1[48] = bf0[3];
+  bf1[49] = bf0[35];
+  bf1[50] = bf0[19];
+  bf1[51] = bf0[51];
+  bf1[52] = bf0[11];
+  bf1[53] = bf0[43];
+  bf1[54] = bf0[27];
+  bf1[55] = bf0[59];
+  bf1[56] = bf0[7];
+  bf1[57] = bf0[39];
+  bf1[58] = bf0[23];
+  bf1[59] = bf0[55];
+  bf1[60] = bf0[15];
+  bf1[61] = bf0[47];
+  bf1[62] = bf0[31];
+  bf1[63] = bf0[63];
+  range_check(stage, input, bf1, size, stage_range[stage]);
+}
+#endif  // CONFIG_TX64X64
diff --git a/av1/common/av1_inv_txfm1d.c b/av1/common/av1_inv_txfm1d.c
index 2e9bbcb..40d8403 100644
--- a/av1/common/av1_inv_txfm1d.c
+++ b/av1/common/av1_inv_txfm1d.c
@@ -40,6 +40,7 @@
   }
 #endif
 
+// TODO(angiebird): Make 1-d txfm functions static
 void av1_idct4_new(const int32_t *input, int32_t *output, const int8_t *cos_bit,
                    const int8_t *stage_range) {
   const int32_t size = 4;
@@ -1535,3 +1536,798 @@
   bf1[31] = bf0[0];
   range_check(stage, input, bf1, size, stage_range[stage]);
 }
+
+#if CONFIG_TX64X64
+void av1_idct64_new(const int32_t *input, int32_t *output,
+                    const int8_t *cos_bit, const int8_t *stage_range) {
+  const int32_t size = 64;
+  const int32_t *cospi;
+
+  int32_t stage = 0;
+  int32_t *bf0, *bf1;
+  int32_t step[64];
+
+  // stage 0;
+  range_check(stage, input, input, size, stage_range[stage]);
+
+  // stage 1;
+  stage++;
+  cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+  bf1 = output;
+  bf1[0] = input[0];
+  bf1[1] = input[32];
+  bf1[2] = input[16];
+  bf1[3] = input[48];
+  bf1[4] = input[8];
+  bf1[5] = input[40];
+  bf1[6] = input[24];
+  bf1[7] = input[56];
+  bf1[8] = input[4];
+  bf1[9] = input[36];
+  bf1[10] = input[20];
+  bf1[11] = input[52];
+  bf1[12] = input[12];
+  bf1[13] = input[44];
+  bf1[14] = input[28];
+  bf1[15] = input[60];
+  bf1[16] = input[2];
+  bf1[17] = input[34];
+  bf1[18] = input[18];
+  bf1[19] = input[50];
+  bf1[20] = input[10];
+  bf1[21] = input[42];
+  bf1[22] = input[26];
+  bf1[23] = input[58];
+  bf1[24] = input[6];
+  bf1[25] = input[38];
+  bf1[26] = input[22];
+  bf1[27] = input[54];
+  bf1[28] = input[14];
+  bf1[29] = input[46];
+  bf1[30] = input[30];
+  bf1[31] = input[62];
+  bf1[32] = input[1];
+  bf1[33] = input[33];
+  bf1[34] = input[17];
+  bf1[35] = input[49];
+  bf1[36] = input[9];
+  bf1[37] = input[41];
+  bf1[38] = input[25];
+  bf1[39] = input[57];
+  bf1[40] = input[5];
+  bf1[41] = input[37];
+  bf1[42] = input[21];
+  bf1[43] = input[53];
+  bf1[44] = input[13];
+  bf1[45] = input[45];
+  bf1[46] = input[29];
+  bf1[47] = input[61];
+  bf1[48] = input[3];
+  bf1[49] = input[35];
+  bf1[50] = input[19];
+  bf1[51] = input[51];
+  bf1[52] = input[11];
+  bf1[53] = input[43];
+  bf1[54] = input[27];
+  bf1[55] = input[59];
+  bf1[56] = input[7];
+  bf1[57] = input[39];
+  bf1[58] = input[23];
+  bf1[59] = input[55];
+  bf1[60] = input[15];
+  bf1[61] = input[47];
+  bf1[62] = input[31];
+  bf1[63] = input[63];
+  range_check(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 2
+  stage++;
+  cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+  bf0 = output;
+  bf1 = step;
+  bf1[0] = bf0[0];
+  bf1[1] = bf0[1];
+  bf1[2] = bf0[2];
+  bf1[3] = bf0[3];
+  bf1[4] = bf0[4];
+  bf1[5] = bf0[5];
+  bf1[6] = bf0[6];
+  bf1[7] = bf0[7];
+  bf1[8] = bf0[8];
+  bf1[9] = bf0[9];
+  bf1[10] = bf0[10];
+  bf1[11] = bf0[11];
+  bf1[12] = bf0[12];
+  bf1[13] = bf0[13];
+  bf1[14] = bf0[14];
+  bf1[15] = bf0[15];
+  bf1[16] = bf0[16];
+  bf1[17] = bf0[17];
+  bf1[18] = bf0[18];
+  bf1[19] = bf0[19];
+  bf1[20] = bf0[20];
+  bf1[21] = bf0[21];
+  bf1[22] = bf0[22];
+  bf1[23] = bf0[23];
+  bf1[24] = bf0[24];
+  bf1[25] = bf0[25];
+  bf1[26] = bf0[26];
+  bf1[27] = bf0[27];
+  bf1[28] = bf0[28];
+  bf1[29] = bf0[29];
+  bf1[30] = bf0[30];
+  bf1[31] = bf0[31];
+  bf1[32] = half_btf(cospi[63], bf0[32], -cospi[1], bf0[63], cos_bit[stage]);
+  bf1[33] = half_btf(cospi[31], bf0[33], -cospi[33], bf0[62], cos_bit[stage]);
+  bf1[34] = half_btf(cospi[47], bf0[34], -cospi[17], bf0[61], cos_bit[stage]);
+  bf1[35] = half_btf(cospi[15], bf0[35], -cospi[49], bf0[60], cos_bit[stage]);
+  bf1[36] = half_btf(cospi[55], bf0[36], -cospi[9], bf0[59], cos_bit[stage]);
+  bf1[37] = half_btf(cospi[23], bf0[37], -cospi[41], bf0[58], cos_bit[stage]);
+  bf1[38] = half_btf(cospi[39], bf0[38], -cospi[25], bf0[57], cos_bit[stage]);
+  bf1[39] = half_btf(cospi[7], bf0[39], -cospi[57], bf0[56], cos_bit[stage]);
+  bf1[40] = half_btf(cospi[59], bf0[40], -cospi[5], bf0[55], cos_bit[stage]);
+  bf1[41] = half_btf(cospi[27], bf0[41], -cospi[37], bf0[54], cos_bit[stage]);
+  bf1[42] = half_btf(cospi[43], bf0[42], -cospi[21], bf0[53], cos_bit[stage]);
+  bf1[43] = half_btf(cospi[11], bf0[43], -cospi[53], bf0[52], cos_bit[stage]);
+  bf1[44] = half_btf(cospi[51], bf0[44], -cospi[13], bf0[51], cos_bit[stage]);
+  bf1[45] = half_btf(cospi[19], bf0[45], -cospi[45], bf0[50], cos_bit[stage]);
+  bf1[46] = half_btf(cospi[35], bf0[46], -cospi[29], bf0[49], cos_bit[stage]);
+  bf1[47] = half_btf(cospi[3], bf0[47], -cospi[61], bf0[48], cos_bit[stage]);
+  bf1[48] = half_btf(cospi[61], bf0[47], cospi[3], bf0[48], cos_bit[stage]);
+  bf1[49] = half_btf(cospi[29], bf0[46], cospi[35], bf0[49], cos_bit[stage]);
+  bf1[50] = half_btf(cospi[45], bf0[45], cospi[19], bf0[50], cos_bit[stage]);
+  bf1[51] = half_btf(cospi[13], bf0[44], cospi[51], bf0[51], cos_bit[stage]);
+  bf1[52] = half_btf(cospi[53], bf0[43], cospi[11], bf0[52], cos_bit[stage]);
+  bf1[53] = half_btf(cospi[21], bf0[42], cospi[43], bf0[53], cos_bit[stage]);
+  bf1[54] = half_btf(cospi[37], bf0[41], cospi[27], bf0[54], cos_bit[stage]);
+  bf1[55] = half_btf(cospi[5], bf0[40], cospi[59], bf0[55], cos_bit[stage]);
+  bf1[56] = half_btf(cospi[57], bf0[39], cospi[7], bf0[56], cos_bit[stage]);
+  bf1[57] = half_btf(cospi[25], bf0[38], cospi[39], bf0[57], cos_bit[stage]);
+  bf1[58] = half_btf(cospi[41], bf0[37], cospi[23], bf0[58], cos_bit[stage]);
+  bf1[59] = half_btf(cospi[9], bf0[36], cospi[55], bf0[59], cos_bit[stage]);
+  bf1[60] = half_btf(cospi[49], bf0[35], cospi[15], bf0[60], cos_bit[stage]);
+  bf1[61] = half_btf(cospi[17], bf0[34], cospi[47], bf0[61], cos_bit[stage]);
+  bf1[62] = half_btf(cospi[33], bf0[33], cospi[31], bf0[62], cos_bit[stage]);
+  bf1[63] = half_btf(cospi[1], bf0[32], cospi[63], bf0[63], cos_bit[stage]);
+  range_check(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 3
+  stage++;
+  cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+  bf0 = step;
+  bf1 = output;
+  bf1[0] = bf0[0];
+  bf1[1] = bf0[1];
+  bf1[2] = bf0[2];
+  bf1[3] = bf0[3];
+  bf1[4] = bf0[4];
+  bf1[5] = bf0[5];
+  bf1[6] = bf0[6];
+  bf1[7] = bf0[7];
+  bf1[8] = bf0[8];
+  bf1[9] = bf0[9];
+  bf1[10] = bf0[10];
+  bf1[11] = bf0[11];
+  bf1[12] = bf0[12];
+  bf1[13] = bf0[13];
+  bf1[14] = bf0[14];
+  bf1[15] = bf0[15];
+  bf1[16] = half_btf(cospi[62], bf0[16], -cospi[2], bf0[31], cos_bit[stage]);
+  bf1[17] = half_btf(cospi[30], bf0[17], -cospi[34], bf0[30], cos_bit[stage]);
+  bf1[18] = half_btf(cospi[46], bf0[18], -cospi[18], bf0[29], cos_bit[stage]);
+  bf1[19] = half_btf(cospi[14], bf0[19], -cospi[50], bf0[28], cos_bit[stage]);
+  bf1[20] = half_btf(cospi[54], bf0[20], -cospi[10], bf0[27], cos_bit[stage]);
+  bf1[21] = half_btf(cospi[22], bf0[21], -cospi[42], bf0[26], cos_bit[stage]);
+  bf1[22] = half_btf(cospi[38], bf0[22], -cospi[26], bf0[25], cos_bit[stage]);
+  bf1[23] = half_btf(cospi[6], bf0[23], -cospi[58], bf0[24], cos_bit[stage]);
+  bf1[24] = half_btf(cospi[58], bf0[23], cospi[6], bf0[24], cos_bit[stage]);
+  bf1[25] = half_btf(cospi[26], bf0[22], cospi[38], bf0[25], cos_bit[stage]);
+  bf1[26] = half_btf(cospi[42], bf0[21], cospi[22], bf0[26], cos_bit[stage]);
+  bf1[27] = half_btf(cospi[10], bf0[20], cospi[54], bf0[27], cos_bit[stage]);
+  bf1[28] = half_btf(cospi[50], bf0[19], cospi[14], bf0[28], cos_bit[stage]);
+  bf1[29] = half_btf(cospi[18], bf0[18], cospi[46], bf0[29], cos_bit[stage]);
+  bf1[30] = half_btf(cospi[34], bf0[17], cospi[30], bf0[30], cos_bit[stage]);
+  bf1[31] = half_btf(cospi[2], bf0[16], cospi[62], bf0[31], cos_bit[stage]);
+  bf1[32] = bf0[32] + bf0[33];
+  bf1[33] = bf0[32] - bf0[33];
+  bf1[34] = -bf0[34] + bf0[35];
+  bf1[35] = bf0[34] + bf0[35];
+  bf1[36] = bf0[36] + bf0[37];
+  bf1[37] = bf0[36] - bf0[37];
+  bf1[38] = -bf0[38] + bf0[39];
+  bf1[39] = bf0[38] + bf0[39];
+  bf1[40] = bf0[40] + bf0[41];
+  bf1[41] = bf0[40] - bf0[41];
+  bf1[42] = -bf0[42] + bf0[43];
+  bf1[43] = bf0[42] + bf0[43];
+  bf1[44] = bf0[44] + bf0[45];
+  bf1[45] = bf0[44] - bf0[45];
+  bf1[46] = -bf0[46] + bf0[47];
+  bf1[47] = bf0[46] + bf0[47];
+  bf1[48] = bf0[48] + bf0[49];
+  bf1[49] = bf0[48] - bf0[49];
+  bf1[50] = -bf0[50] + bf0[51];
+  bf1[51] = bf0[50] + bf0[51];
+  bf1[52] = bf0[52] + bf0[53];
+  bf1[53] = bf0[52] - bf0[53];
+  bf1[54] = -bf0[54] + bf0[55];
+  bf1[55] = bf0[54] + bf0[55];
+  bf1[56] = bf0[56] + bf0[57];
+  bf1[57] = bf0[56] - bf0[57];
+  bf1[58] = -bf0[58] + bf0[59];
+  bf1[59] = bf0[58] + bf0[59];
+  bf1[60] = bf0[60] + bf0[61];
+  bf1[61] = bf0[60] - bf0[61];
+  bf1[62] = -bf0[62] + bf0[63];
+  bf1[63] = bf0[62] + bf0[63];
+  range_check(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 4
+  stage++;
+  cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+  bf0 = output;
+  bf1 = step;
+  bf1[0] = bf0[0];
+  bf1[1] = bf0[1];
+  bf1[2] = bf0[2];
+  bf1[3] = bf0[3];
+  bf1[4] = bf0[4];
+  bf1[5] = bf0[5];
+  bf1[6] = bf0[6];
+  bf1[7] = bf0[7];
+  bf1[8] = half_btf(cospi[60], bf0[8], -cospi[4], bf0[15], cos_bit[stage]);
+  bf1[9] = half_btf(cospi[28], bf0[9], -cospi[36], bf0[14], cos_bit[stage]);
+  bf1[10] = half_btf(cospi[44], bf0[10], -cospi[20], bf0[13], cos_bit[stage]);
+  bf1[11] = half_btf(cospi[12], bf0[11], -cospi[52], bf0[12], cos_bit[stage]);
+  bf1[12] = half_btf(cospi[52], bf0[11], cospi[12], bf0[12], cos_bit[stage]);
+  bf1[13] = half_btf(cospi[20], bf0[10], cospi[44], bf0[13], cos_bit[stage]);
+  bf1[14] = half_btf(cospi[36], bf0[9], cospi[28], bf0[14], cos_bit[stage]);
+  bf1[15] = half_btf(cospi[4], bf0[8], cospi[60], bf0[15], cos_bit[stage]);
+  bf1[16] = bf0[16] + bf0[17];
+  bf1[17] = bf0[16] - bf0[17];
+  bf1[18] = -bf0[18] + bf0[19];
+  bf1[19] = bf0[18] + bf0[19];
+  bf1[20] = bf0[20] + bf0[21];
+  bf1[21] = bf0[20] - bf0[21];
+  bf1[22] = -bf0[22] + bf0[23];
+  bf1[23] = bf0[22] + bf0[23];
+  bf1[24] = bf0[24] + bf0[25];
+  bf1[25] = bf0[24] - bf0[25];
+  bf1[26] = -bf0[26] + bf0[27];
+  bf1[27] = bf0[26] + bf0[27];
+  bf1[28] = bf0[28] + bf0[29];
+  bf1[29] = bf0[28] - bf0[29];
+  bf1[30] = -bf0[30] + bf0[31];
+  bf1[31] = bf0[30] + bf0[31];
+  bf1[32] = bf0[32];
+  bf1[33] = half_btf(-cospi[4], bf0[33], cospi[60], bf0[62], cos_bit[stage]);
+  bf1[34] = half_btf(-cospi[60], bf0[34], -cospi[4], bf0[61], cos_bit[stage]);
+  bf1[35] = bf0[35];
+  bf1[36] = bf0[36];
+  bf1[37] = half_btf(-cospi[36], bf0[37], cospi[28], bf0[58], cos_bit[stage]);
+  bf1[38] = half_btf(-cospi[28], bf0[38], -cospi[36], bf0[57], cos_bit[stage]);
+  bf1[39] = bf0[39];
+  bf1[40] = bf0[40];
+  bf1[41] = half_btf(-cospi[20], bf0[41], cospi[44], bf0[54], cos_bit[stage]);
+  bf1[42] = half_btf(-cospi[44], bf0[42], -cospi[20], bf0[53], cos_bit[stage]);
+  bf1[43] = bf0[43];
+  bf1[44] = bf0[44];
+  bf1[45] = half_btf(-cospi[52], bf0[45], cospi[12], bf0[50], cos_bit[stage]);
+  bf1[46] = half_btf(-cospi[12], bf0[46], -cospi[52], bf0[49], cos_bit[stage]);
+  bf1[47] = bf0[47];
+  bf1[48] = bf0[48];
+  bf1[49] = half_btf(-cospi[52], bf0[46], cospi[12], bf0[49], cos_bit[stage]);
+  bf1[50] = half_btf(cospi[12], bf0[45], cospi[52], bf0[50], cos_bit[stage]);
+  bf1[51] = bf0[51];
+  bf1[52] = bf0[52];
+  bf1[53] = half_btf(-cospi[20], bf0[42], cospi[44], bf0[53], cos_bit[stage]);
+  bf1[54] = half_btf(cospi[44], bf0[41], cospi[20], bf0[54], cos_bit[stage]);
+  bf1[55] = bf0[55];
+  bf1[56] = bf0[56];
+  bf1[57] = half_btf(-cospi[36], bf0[38], cospi[28], bf0[57], cos_bit[stage]);
+  bf1[58] = half_btf(cospi[28], bf0[37], cospi[36], bf0[58], cos_bit[stage]);
+  bf1[59] = bf0[59];
+  bf1[60] = bf0[60];
+  bf1[61] = half_btf(-cospi[4], bf0[34], cospi[60], bf0[61], cos_bit[stage]);
+  bf1[62] = half_btf(cospi[60], bf0[33], cospi[4], bf0[62], cos_bit[stage]);
+  bf1[63] = bf0[63];
+  range_check(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 5
+  stage++;
+  cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+  bf0 = step;
+  bf1 = output;
+  bf1[0] = bf0[0];
+  bf1[1] = bf0[1];
+  bf1[2] = bf0[2];
+  bf1[3] = bf0[3];
+  bf1[4] = half_btf(cospi[56], bf0[4], -cospi[8], bf0[7], cos_bit[stage]);
+  bf1[5] = half_btf(cospi[24], bf0[5], -cospi[40], bf0[6], cos_bit[stage]);
+  bf1[6] = half_btf(cospi[40], bf0[5], cospi[24], bf0[6], cos_bit[stage]);
+  bf1[7] = half_btf(cospi[8], bf0[4], cospi[56], bf0[7], cos_bit[stage]);
+  bf1[8] = bf0[8] + bf0[9];
+  bf1[9] = bf0[8] - bf0[9];
+  bf1[10] = -bf0[10] + bf0[11];
+  bf1[11] = bf0[10] + bf0[11];
+  bf1[12] = bf0[12] + bf0[13];
+  bf1[13] = bf0[12] - bf0[13];
+  bf1[14] = -bf0[14] + bf0[15];
+  bf1[15] = bf0[14] + bf0[15];
+  bf1[16] = bf0[16];
+  bf1[17] = half_btf(-cospi[8], bf0[17], cospi[56], bf0[30], cos_bit[stage]);
+  bf1[18] = half_btf(-cospi[56], bf0[18], -cospi[8], bf0[29], cos_bit[stage]);
+  bf1[19] = bf0[19];
+  bf1[20] = bf0[20];
+  bf1[21] = half_btf(-cospi[40], bf0[21], cospi[24], bf0[26], cos_bit[stage]);
+  bf1[22] = half_btf(-cospi[24], bf0[22], -cospi[40], bf0[25], cos_bit[stage]);
+  bf1[23] = bf0[23];
+  bf1[24] = bf0[24];
+  bf1[25] = half_btf(-cospi[40], bf0[22], cospi[24], bf0[25], cos_bit[stage]);
+  bf1[26] = half_btf(cospi[24], bf0[21], cospi[40], bf0[26], cos_bit[stage]);
+  bf1[27] = bf0[27];
+  bf1[28] = bf0[28];
+  bf1[29] = half_btf(-cospi[8], bf0[18], cospi[56], bf0[29], cos_bit[stage]);
+  bf1[30] = half_btf(cospi[56], bf0[17], cospi[8], bf0[30], cos_bit[stage]);
+  bf1[31] = bf0[31];
+  bf1[32] = bf0[32] + bf0[35];
+  bf1[33] = bf0[33] + bf0[34];
+  bf1[34] = bf0[33] - bf0[34];
+  bf1[35] = bf0[32] - bf0[35];
+  bf1[36] = -bf0[36] + bf0[39];
+  bf1[37] = -bf0[37] + bf0[38];
+  bf1[38] = bf0[37] + bf0[38];
+  bf1[39] = bf0[36] + bf0[39];
+  bf1[40] = bf0[40] + bf0[43];
+  bf1[41] = bf0[41] + bf0[42];
+  bf1[42] = bf0[41] - bf0[42];
+  bf1[43] = bf0[40] - bf0[43];
+  bf1[44] = -bf0[44] + bf0[47];
+  bf1[45] = -bf0[45] + bf0[46];
+  bf1[46] = bf0[45] + bf0[46];
+  bf1[47] = bf0[44] + bf0[47];
+  bf1[48] = bf0[48] + bf0[51];
+  bf1[49] = bf0[49] + bf0[50];
+  bf1[50] = bf0[49] - bf0[50];
+  bf1[51] = bf0[48] - bf0[51];
+  bf1[52] = -bf0[52] + bf0[55];
+  bf1[53] = -bf0[53] + bf0[54];
+  bf1[54] = bf0[53] + bf0[54];
+  bf1[55] = bf0[52] + bf0[55];
+  bf1[56] = bf0[56] + bf0[59];
+  bf1[57] = bf0[57] + bf0[58];
+  bf1[58] = bf0[57] - bf0[58];
+  bf1[59] = bf0[56] - bf0[59];
+  bf1[60] = -bf0[60] + bf0[63];
+  bf1[61] = -bf0[61] + bf0[62];
+  bf1[62] = bf0[61] + bf0[62];
+  bf1[63] = bf0[60] + bf0[63];
+  range_check(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 6
+  stage++;
+  cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+  bf0 = output;
+  bf1 = step;
+  bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit[stage]);
+  bf1[1] = half_btf(cospi[32], bf0[0], -cospi[32], bf0[1], cos_bit[stage]);
+  bf1[2] = half_btf(cospi[48], bf0[2], -cospi[16], bf0[3], cos_bit[stage]);
+  bf1[3] = half_btf(cospi[16], bf0[2], cospi[48], bf0[3], cos_bit[stage]);
+  bf1[4] = bf0[4] + bf0[5];
+  bf1[5] = bf0[4] - bf0[5];
+  bf1[6] = -bf0[6] + bf0[7];
+  bf1[7] = bf0[6] + bf0[7];
+  bf1[8] = bf0[8];
+  bf1[9] = half_btf(-cospi[16], bf0[9], cospi[48], bf0[14], cos_bit[stage]);
+  bf1[10] = half_btf(-cospi[48], bf0[10], -cospi[16], bf0[13], cos_bit[stage]);
+  bf1[11] = bf0[11];
+  bf1[12] = bf0[12];
+  bf1[13] = half_btf(-cospi[16], bf0[10], cospi[48], bf0[13], cos_bit[stage]);
+  bf1[14] = half_btf(cospi[48], bf0[9], cospi[16], bf0[14], cos_bit[stage]);
+  bf1[15] = bf0[15];
+  bf1[16] = bf0[16] + bf0[19];
+  bf1[17] = bf0[17] + bf0[18];
+  bf1[18] = bf0[17] - bf0[18];
+  bf1[19] = bf0[16] - bf0[19];
+  bf1[20] = -bf0[20] + bf0[23];
+  bf1[21] = -bf0[21] + bf0[22];
+  bf1[22] = bf0[21] + bf0[22];
+  bf1[23] = bf0[20] + bf0[23];
+  bf1[24] = bf0[24] + bf0[27];
+  bf1[25] = bf0[25] + bf0[26];
+  bf1[26] = bf0[25] - bf0[26];
+  bf1[27] = bf0[24] - bf0[27];
+  bf1[28] = -bf0[28] + bf0[31];
+  bf1[29] = -bf0[29] + bf0[30];
+  bf1[30] = bf0[29] + bf0[30];
+  bf1[31] = bf0[28] + bf0[31];
+  bf1[32] = bf0[32];
+  bf1[33] = bf0[33];
+  bf1[34] = half_btf(-cospi[8], bf0[34], cospi[56], bf0[61], cos_bit[stage]);
+  bf1[35] = half_btf(-cospi[8], bf0[35], cospi[56], bf0[60], cos_bit[stage]);
+  bf1[36] = half_btf(-cospi[56], bf0[36], -cospi[8], bf0[59], cos_bit[stage]);
+  bf1[37] = half_btf(-cospi[56], bf0[37], -cospi[8], bf0[58], cos_bit[stage]);
+  bf1[38] = bf0[38];
+  bf1[39] = bf0[39];
+  bf1[40] = bf0[40];
+  bf1[41] = bf0[41];
+  bf1[42] = half_btf(-cospi[40], bf0[42], cospi[24], bf0[53], cos_bit[stage]);
+  bf1[43] = half_btf(-cospi[40], bf0[43], cospi[24], bf0[52], cos_bit[stage]);
+  bf1[44] = half_btf(-cospi[24], bf0[44], -cospi[40], bf0[51], cos_bit[stage]);
+  bf1[45] = half_btf(-cospi[24], bf0[45], -cospi[40], bf0[50], cos_bit[stage]);
+  bf1[46] = bf0[46];
+  bf1[47] = bf0[47];
+  bf1[48] = bf0[48];
+  bf1[49] = bf0[49];
+  bf1[50] = half_btf(-cospi[40], bf0[45], cospi[24], bf0[50], cos_bit[stage]);
+  bf1[51] = half_btf(-cospi[40], bf0[44], cospi[24], bf0[51], cos_bit[stage]);
+  bf1[52] = half_btf(cospi[24], bf0[43], cospi[40], bf0[52], cos_bit[stage]);
+  bf1[53] = half_btf(cospi[24], bf0[42], cospi[40], bf0[53], cos_bit[stage]);
+  bf1[54] = bf0[54];
+  bf1[55] = bf0[55];
+  bf1[56] = bf0[56];
+  bf1[57] = bf0[57];
+  bf1[58] = half_btf(-cospi[8], bf0[37], cospi[56], bf0[58], cos_bit[stage]);
+  bf1[59] = half_btf(-cospi[8], bf0[36], cospi[56], bf0[59], cos_bit[stage]);
+  bf1[60] = half_btf(cospi[56], bf0[35], cospi[8], bf0[60], cos_bit[stage]);
+  bf1[61] = half_btf(cospi[56], bf0[34], cospi[8], bf0[61], cos_bit[stage]);
+  bf1[62] = bf0[62];
+  bf1[63] = bf0[63];
+  range_check(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 7
+  stage++;
+  cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+  bf0 = step;
+  bf1 = output;
+  bf1[0] = bf0[0] + bf0[3];
+  bf1[1] = bf0[1] + bf0[2];
+  bf1[2] = bf0[1] - bf0[2];
+  bf1[3] = bf0[0] - bf0[3];
+  bf1[4] = bf0[4];
+  bf1[5] = half_btf(-cospi[32], bf0[5], cospi[32], bf0[6], cos_bit[stage]);
+  bf1[6] = half_btf(cospi[32], bf0[5], cospi[32], bf0[6], cos_bit[stage]);
+  bf1[7] = bf0[7];
+  bf1[8] = bf0[8] + bf0[11];
+  bf1[9] = bf0[9] + bf0[10];
+  bf1[10] = bf0[9] - bf0[10];
+  bf1[11] = bf0[8] - bf0[11];
+  bf1[12] = -bf0[12] + bf0[15];
+  bf1[13] = -bf0[13] + bf0[14];
+  bf1[14] = bf0[13] + bf0[14];
+  bf1[15] = bf0[12] + bf0[15];
+  bf1[16] = bf0[16];
+  bf1[17] = bf0[17];
+  bf1[18] = half_btf(-cospi[16], bf0[18], cospi[48], bf0[29], cos_bit[stage]);
+  bf1[19] = half_btf(-cospi[16], bf0[19], cospi[48], bf0[28], cos_bit[stage]);
+  bf1[20] = half_btf(-cospi[48], bf0[20], -cospi[16], bf0[27], cos_bit[stage]);
+  bf1[21] = half_btf(-cospi[48], bf0[21], -cospi[16], bf0[26], cos_bit[stage]);
+  bf1[22] = bf0[22];
+  bf1[23] = bf0[23];
+  bf1[24] = bf0[24];
+  bf1[25] = bf0[25];
+  bf1[26] = half_btf(-cospi[16], bf0[21], cospi[48], bf0[26], cos_bit[stage]);
+  bf1[27] = half_btf(-cospi[16], bf0[20], cospi[48], bf0[27], cos_bit[stage]);
+  bf1[28] = half_btf(cospi[48], bf0[19], cospi[16], bf0[28], cos_bit[stage]);
+  bf1[29] = half_btf(cospi[48], bf0[18], cospi[16], bf0[29], cos_bit[stage]);
+  bf1[30] = bf0[30];
+  bf1[31] = bf0[31];
+  bf1[32] = bf0[32] + bf0[39];
+  bf1[33] = bf0[33] + bf0[38];
+  bf1[34] = bf0[34] + bf0[37];
+  bf1[35] = bf0[35] + bf0[36];
+  bf1[36] = bf0[35] - bf0[36];
+  bf1[37] = bf0[34] - bf0[37];
+  bf1[38] = bf0[33] - bf0[38];
+  bf1[39] = bf0[32] - bf0[39];
+  bf1[40] = -bf0[40] + bf0[47];
+  bf1[41] = -bf0[41] + bf0[46];
+  bf1[42] = -bf0[42] + bf0[45];
+  bf1[43] = -bf0[43] + bf0[44];
+  bf1[44] = bf0[43] + bf0[44];
+  bf1[45] = bf0[42] + bf0[45];
+  bf1[46] = bf0[41] + bf0[46];
+  bf1[47] = bf0[40] + bf0[47];
+  bf1[48] = bf0[48] + bf0[55];
+  bf1[49] = bf0[49] + bf0[54];
+  bf1[50] = bf0[50] + bf0[53];
+  bf1[51] = bf0[51] + bf0[52];
+  bf1[52] = bf0[51] - bf0[52];
+  bf1[53] = bf0[50] - bf0[53];
+  bf1[54] = bf0[49] - bf0[54];
+  bf1[55] = bf0[48] - bf0[55];
+  bf1[56] = -bf0[56] + bf0[63];
+  bf1[57] = -bf0[57] + bf0[62];
+  bf1[58] = -bf0[58] + bf0[61];
+  bf1[59] = -bf0[59] + bf0[60];
+  bf1[60] = bf0[59] + bf0[60];
+  bf1[61] = bf0[58] + bf0[61];
+  bf1[62] = bf0[57] + bf0[62];
+  bf1[63] = bf0[56] + bf0[63];
+  range_check(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 8
+  stage++;
+  cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+  bf0 = output;
+  bf1 = step;
+  bf1[0] = bf0[0] + bf0[7];
+  bf1[1] = bf0[1] + bf0[6];
+  bf1[2] = bf0[2] + bf0[5];
+  bf1[3] = bf0[3] + bf0[4];
+  bf1[4] = bf0[3] - bf0[4];
+  bf1[5] = bf0[2] - bf0[5];
+  bf1[6] = bf0[1] - bf0[6];
+  bf1[7] = bf0[0] - bf0[7];
+  bf1[8] = bf0[8];
+  bf1[9] = bf0[9];
+  bf1[10] = half_btf(-cospi[32], bf0[10], cospi[32], bf0[13], cos_bit[stage]);
+  bf1[11] = half_btf(-cospi[32], bf0[11], cospi[32], bf0[12], cos_bit[stage]);
+  bf1[12] = half_btf(cospi[32], bf0[11], cospi[32], bf0[12], cos_bit[stage]);
+  bf1[13] = half_btf(cospi[32], bf0[10], cospi[32], bf0[13], cos_bit[stage]);
+  bf1[14] = bf0[14];
+  bf1[15] = bf0[15];
+  bf1[16] = bf0[16] + bf0[23];
+  bf1[17] = bf0[17] + bf0[22];
+  bf1[18] = bf0[18] + bf0[21];
+  bf1[19] = bf0[19] + bf0[20];
+  bf1[20] = bf0[19] - bf0[20];
+  bf1[21] = bf0[18] - bf0[21];
+  bf1[22] = bf0[17] - bf0[22];
+  bf1[23] = bf0[16] - bf0[23];
+  bf1[24] = -bf0[24] + bf0[31];
+  bf1[25] = -bf0[25] + bf0[30];
+  bf1[26] = -bf0[26] + bf0[29];
+  bf1[27] = -bf0[27] + bf0[28];
+  bf1[28] = bf0[27] + bf0[28];
+  bf1[29] = bf0[26] + bf0[29];
+  bf1[30] = bf0[25] + bf0[30];
+  bf1[31] = bf0[24] + bf0[31];
+  bf1[32] = bf0[32];
+  bf1[33] = bf0[33];
+  bf1[34] = bf0[34];
+  bf1[35] = bf0[35];
+  bf1[36] = half_btf(-cospi[16], bf0[36], cospi[48], bf0[59], cos_bit[stage]);
+  bf1[37] = half_btf(-cospi[16], bf0[37], cospi[48], bf0[58], cos_bit[stage]);
+  bf1[38] = half_btf(-cospi[16], bf0[38], cospi[48], bf0[57], cos_bit[stage]);
+  bf1[39] = half_btf(-cospi[16], bf0[39], cospi[48], bf0[56], cos_bit[stage]);
+  bf1[40] = half_btf(-cospi[48], bf0[40], -cospi[16], bf0[55], cos_bit[stage]);
+  bf1[41] = half_btf(-cospi[48], bf0[41], -cospi[16], bf0[54], cos_bit[stage]);
+  bf1[42] = half_btf(-cospi[48], bf0[42], -cospi[16], bf0[53], cos_bit[stage]);
+  bf1[43] = half_btf(-cospi[48], bf0[43], -cospi[16], bf0[52], cos_bit[stage]);
+  bf1[44] = bf0[44];
+  bf1[45] = bf0[45];
+  bf1[46] = bf0[46];
+  bf1[47] = bf0[47];
+  bf1[48] = bf0[48];
+  bf1[49] = bf0[49];
+  bf1[50] = bf0[50];
+  bf1[51] = bf0[51];
+  bf1[52] = half_btf(-cospi[16], bf0[43], cospi[48], bf0[52], cos_bit[stage]);
+  bf1[53] = half_btf(-cospi[16], bf0[42], cospi[48], bf0[53], cos_bit[stage]);
+  bf1[54] = half_btf(-cospi[16], bf0[41], cospi[48], bf0[54], cos_bit[stage]);
+  bf1[55] = half_btf(-cospi[16], bf0[40], cospi[48], bf0[55], cos_bit[stage]);
+  bf1[56] = half_btf(cospi[48], bf0[39], cospi[16], bf0[56], cos_bit[stage]);
+  bf1[57] = half_btf(cospi[48], bf0[38], cospi[16], bf0[57], cos_bit[stage]);
+  bf1[58] = half_btf(cospi[48], bf0[37], cospi[16], bf0[58], cos_bit[stage]);
+  bf1[59] = half_btf(cospi[48], bf0[36], cospi[16], bf0[59], cos_bit[stage]);
+  bf1[60] = bf0[60];
+  bf1[61] = bf0[61];
+  bf1[62] = bf0[62];
+  bf1[63] = bf0[63];
+  range_check(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 9
+  stage++;
+  cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+  bf0 = step;
+  bf1 = output;
+  bf1[0] = bf0[0] + bf0[15];
+  bf1[1] = bf0[1] + bf0[14];
+  bf1[2] = bf0[2] + bf0[13];
+  bf1[3] = bf0[3] + bf0[12];
+  bf1[4] = bf0[4] + bf0[11];
+  bf1[5] = bf0[5] + bf0[10];
+  bf1[6] = bf0[6] + bf0[9];
+  bf1[7] = bf0[7] + bf0[8];
+  bf1[8] = bf0[7] - bf0[8];
+  bf1[9] = bf0[6] - bf0[9];
+  bf1[10] = bf0[5] - bf0[10];
+  bf1[11] = bf0[4] - bf0[11];
+  bf1[12] = bf0[3] - bf0[12];
+  bf1[13] = bf0[2] - bf0[13];
+  bf1[14] = bf0[1] - bf0[14];
+  bf1[15] = bf0[0] - bf0[15];
+  bf1[16] = bf0[16];
+  bf1[17] = bf0[17];
+  bf1[18] = bf0[18];
+  bf1[19] = bf0[19];
+  bf1[20] = half_btf(-cospi[32], bf0[20], cospi[32], bf0[27], cos_bit[stage]);
+  bf1[21] = half_btf(-cospi[32], bf0[21], cospi[32], bf0[26], cos_bit[stage]);
+  bf1[22] = half_btf(-cospi[32], bf0[22], cospi[32], bf0[25], cos_bit[stage]);
+  bf1[23] = half_btf(-cospi[32], bf0[23], cospi[32], bf0[24], cos_bit[stage]);
+  bf1[24] = half_btf(cospi[32], bf0[23], cospi[32], bf0[24], cos_bit[stage]);
+  bf1[25] = half_btf(cospi[32], bf0[22], cospi[32], bf0[25], cos_bit[stage]);
+  bf1[26] = half_btf(cospi[32], bf0[21], cospi[32], bf0[26], cos_bit[stage]);
+  bf1[27] = half_btf(cospi[32], bf0[20], cospi[32], bf0[27], cos_bit[stage]);
+  bf1[28] = bf0[28];
+  bf1[29] = bf0[29];
+  bf1[30] = bf0[30];
+  bf1[31] = bf0[31];
+  bf1[32] = bf0[32] + bf0[47];
+  bf1[33] = bf0[33] + bf0[46];
+  bf1[34] = bf0[34] + bf0[45];
+  bf1[35] = bf0[35] + bf0[44];
+  bf1[36] = bf0[36] + bf0[43];
+  bf1[37] = bf0[37] + bf0[42];
+  bf1[38] = bf0[38] + bf0[41];
+  bf1[39] = bf0[39] + bf0[40];
+  bf1[40] = bf0[39] - bf0[40];
+  bf1[41] = bf0[38] - bf0[41];
+  bf1[42] = bf0[37] - bf0[42];
+  bf1[43] = bf0[36] - bf0[43];
+  bf1[44] = bf0[35] - bf0[44];
+  bf1[45] = bf0[34] - bf0[45];
+  bf1[46] = bf0[33] - bf0[46];
+  bf1[47] = bf0[32] - bf0[47];
+  bf1[48] = -bf0[48] + bf0[63];
+  bf1[49] = -bf0[49] + bf0[62];
+  bf1[50] = -bf0[50] + bf0[61];
+  bf1[51] = -bf0[51] + bf0[60];
+  bf1[52] = -bf0[52] + bf0[59];
+  bf1[53] = -bf0[53] + bf0[58];
+  bf1[54] = -bf0[54] + bf0[57];
+  bf1[55] = -bf0[55] + bf0[56];
+  bf1[56] = bf0[55] + bf0[56];
+  bf1[57] = bf0[54] + bf0[57];
+  bf1[58] = bf0[53] + bf0[58];
+  bf1[59] = bf0[52] + bf0[59];
+  bf1[60] = bf0[51] + bf0[60];
+  bf1[61] = bf0[50] + bf0[61];
+  bf1[62] = bf0[49] + bf0[62];
+  bf1[63] = bf0[48] + bf0[63];
+  range_check(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 10
+  stage++;
+  cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+  bf0 = output;
+  bf1 = step;
+  bf1[0] = bf0[0] + bf0[31];
+  bf1[1] = bf0[1] + bf0[30];
+  bf1[2] = bf0[2] + bf0[29];
+  bf1[3] = bf0[3] + bf0[28];
+  bf1[4] = bf0[4] + bf0[27];
+  bf1[5] = bf0[5] + bf0[26];
+  bf1[6] = bf0[6] + bf0[25];
+  bf1[7] = bf0[7] + bf0[24];
+  bf1[8] = bf0[8] + bf0[23];
+  bf1[9] = bf0[9] + bf0[22];
+  bf1[10] = bf0[10] + bf0[21];
+  bf1[11] = bf0[11] + bf0[20];
+  bf1[12] = bf0[12] + bf0[19];
+  bf1[13] = bf0[13] + bf0[18];
+  bf1[14] = bf0[14] + bf0[17];
+  bf1[15] = bf0[15] + bf0[16];
+  bf1[16] = bf0[15] - bf0[16];
+  bf1[17] = bf0[14] - bf0[17];
+  bf1[18] = bf0[13] - bf0[18];
+  bf1[19] = bf0[12] - bf0[19];
+  bf1[20] = bf0[11] - bf0[20];
+  bf1[21] = bf0[10] - bf0[21];
+  bf1[22] = bf0[9] - bf0[22];
+  bf1[23] = bf0[8] - bf0[23];
+  bf1[24] = bf0[7] - bf0[24];
+  bf1[25] = bf0[6] - bf0[25];
+  bf1[26] = bf0[5] - bf0[26];
+  bf1[27] = bf0[4] - bf0[27];
+  bf1[28] = bf0[3] - bf0[28];
+  bf1[29] = bf0[2] - bf0[29];
+  bf1[30] = bf0[1] - bf0[30];
+  bf1[31] = bf0[0] - bf0[31];
+  bf1[32] = bf0[32];
+  bf1[33] = bf0[33];
+  bf1[34] = bf0[34];
+  bf1[35] = bf0[35];
+  bf1[36] = bf0[36];
+  bf1[37] = bf0[37];
+  bf1[38] = bf0[38];
+  bf1[39] = bf0[39];
+  bf1[40] = half_btf(-cospi[32], bf0[40], cospi[32], bf0[55], cos_bit[stage]);
+  bf1[41] = half_btf(-cospi[32], bf0[41], cospi[32], bf0[54], cos_bit[stage]);
+  bf1[42] = half_btf(-cospi[32], bf0[42], cospi[32], bf0[53], cos_bit[stage]);
+  bf1[43] = half_btf(-cospi[32], bf0[43], cospi[32], bf0[52], cos_bit[stage]);
+  bf1[44] = half_btf(-cospi[32], bf0[44], cospi[32], bf0[51], cos_bit[stage]);
+  bf1[45] = half_btf(-cospi[32], bf0[45], cospi[32], bf0[50], cos_bit[stage]);
+  bf1[46] = half_btf(-cospi[32], bf0[46], cospi[32], bf0[49], cos_bit[stage]);
+  bf1[47] = half_btf(-cospi[32], bf0[47], cospi[32], bf0[48], cos_bit[stage]);
+  bf1[48] = half_btf(cospi[32], bf0[47], cospi[32], bf0[48], cos_bit[stage]);
+  bf1[49] = half_btf(cospi[32], bf0[46], cospi[32], bf0[49], cos_bit[stage]);
+  bf1[50] = half_btf(cospi[32], bf0[45], cospi[32], bf0[50], cos_bit[stage]);
+  bf1[51] = half_btf(cospi[32], bf0[44], cospi[32], bf0[51], cos_bit[stage]);
+  bf1[52] = half_btf(cospi[32], bf0[43], cospi[32], bf0[52], cos_bit[stage]);
+  bf1[53] = half_btf(cospi[32], bf0[42], cospi[32], bf0[53], cos_bit[stage]);
+  bf1[54] = half_btf(cospi[32], bf0[41], cospi[32], bf0[54], cos_bit[stage]);
+  bf1[55] = half_btf(cospi[32], bf0[40], cospi[32], bf0[55], cos_bit[stage]);
+  bf1[56] = bf0[56];
+  bf1[57] = bf0[57];
+  bf1[58] = bf0[58];
+  bf1[59] = bf0[59];
+  bf1[60] = bf0[60];
+  bf1[61] = bf0[61];
+  bf1[62] = bf0[62];
+  bf1[63] = bf0[63];
+  range_check(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 11
+  stage++;
+  cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+  bf0 = step;
+  bf1 = output;
+  bf1[0] = bf0[0] + bf0[63];
+  bf1[1] = bf0[1] + bf0[62];
+  bf1[2] = bf0[2] + bf0[61];
+  bf1[3] = bf0[3] + bf0[60];
+  bf1[4] = bf0[4] + bf0[59];
+  bf1[5] = bf0[5] + bf0[58];
+  bf1[6] = bf0[6] + bf0[57];
+  bf1[7] = bf0[7] + bf0[56];
+  bf1[8] = bf0[8] + bf0[55];
+  bf1[9] = bf0[9] + bf0[54];
+  bf1[10] = bf0[10] + bf0[53];
+  bf1[11] = bf0[11] + bf0[52];
+  bf1[12] = bf0[12] + bf0[51];
+  bf1[13] = bf0[13] + bf0[50];
+  bf1[14] = bf0[14] + bf0[49];
+  bf1[15] = bf0[15] + bf0[48];
+  bf1[16] = bf0[16] + bf0[47];
+  bf1[17] = bf0[17] + bf0[46];
+  bf1[18] = bf0[18] + bf0[45];
+  bf1[19] = bf0[19] + bf0[44];
+  bf1[20] = bf0[20] + bf0[43];
+  bf1[21] = bf0[21] + bf0[42];
+  bf1[22] = bf0[22] + bf0[41];
+  bf1[23] = bf0[23] + bf0[40];
+  bf1[24] = bf0[24] + bf0[39];
+  bf1[25] = bf0[25] + bf0[38];
+  bf1[26] = bf0[26] + bf0[37];
+  bf1[27] = bf0[27] + bf0[36];
+  bf1[28] = bf0[28] + bf0[35];
+  bf1[29] = bf0[29] + bf0[34];
+  bf1[30] = bf0[30] + bf0[33];
+  bf1[31] = bf0[31] + bf0[32];
+  bf1[32] = bf0[31] - bf0[32];
+  bf1[33] = bf0[30] - bf0[33];
+  bf1[34] = bf0[29] - bf0[34];
+  bf1[35] = bf0[28] - bf0[35];
+  bf1[36] = bf0[27] - bf0[36];
+  bf1[37] = bf0[26] - bf0[37];
+  bf1[38] = bf0[25] - bf0[38];
+  bf1[39] = bf0[24] - bf0[39];
+  bf1[40] = bf0[23] - bf0[40];
+  bf1[41] = bf0[22] - bf0[41];
+  bf1[42] = bf0[21] - bf0[42];
+  bf1[43] = bf0[20] - bf0[43];
+  bf1[44] = bf0[19] - bf0[44];
+  bf1[45] = bf0[18] - bf0[45];
+  bf1[46] = bf0[17] - bf0[46];
+  bf1[47] = bf0[16] - bf0[47];
+  bf1[48] = bf0[15] - bf0[48];
+  bf1[49] = bf0[14] - bf0[49];
+  bf1[50] = bf0[13] - bf0[50];
+  bf1[51] = bf0[12] - bf0[51];
+  bf1[52] = bf0[11] - bf0[52];
+  bf1[53] = bf0[10] - bf0[53];
+  bf1[54] = bf0[9] - bf0[54];
+  bf1[55] = bf0[8] - bf0[55];
+  bf1[56] = bf0[7] - bf0[56];
+  bf1[57] = bf0[6] - bf0[57];
+  bf1[58] = bf0[5] - bf0[58];
+  bf1[59] = bf0[4] - bf0[59];
+  bf1[60] = bf0[3] - bf0[60];
+  bf1[61] = bf0[2] - bf0[61];
+  bf1[62] = bf0[1] - bf0[62];
+  bf1[63] = bf0[0] - bf0[63];
+  range_check(stage, input, bf1, size, stage_range[stage]);
+}
+#endif  // CONFIG_TX64X64
diff --git a/av1/common/entropymode.c b/av1/common/entropymode.c
index 01bcde9..23e0409 100644
--- a/av1/common/entropymode.c
+++ b/av1/common/entropymode.c
@@ -1325,9 +1325,14 @@
 #endif  // CONFIG_SUPERTX
 
 // FIXME(someone) need real defaults here
-static const struct segmentation_probs default_seg_probs = {
-  { 128, 128, 128, 128, 128, 128, 128 }, { 128, 128, 128 },
+static const aom_prob default_segment_tree_probs[SEG_TREE_PROBS] = {
+  128, 128, 128, 128, 128, 128, 128
 };
+// clang-format off
+static const aom_prob default_segment_pred_probs[PREDICTION_PROBS] = {
+  128, 128, 128
+};
+// clang-format on
 
 static void init_mode_probs(FRAME_CONTEXT *fc) {
   av1_copy(fc->uv_mode_prob, default_uv_probs);
@@ -1372,8 +1377,8 @@
 #if CONFIG_SUPERTX
   av1_copy(fc->supertx_prob, default_supertx_prob);
 #endif  // CONFIG_SUPERTX
-  av1_copy(fc->seg.tree_probs, default_seg_probs.tree_probs);
-  av1_copy(fc->seg.pred_probs, default_seg_probs.pred_probs);
+  av1_copy(fc->seg.tree_probs, default_segment_tree_probs);
+  av1_copy(fc->seg.pred_probs, default_segment_pred_probs);
 #if CONFIG_EXT_INTRA
   av1_copy(fc->ext_intra_probs, default_ext_intra_probs);
   av1_copy(fc->intra_filter_probs, default_intra_filter_probs);
@@ -1392,6 +1397,7 @@
                      fc->inter_ext_tx_cdf, EXT_TX_SIZES);
   av1_tree_to_cdf_1D(av1_partition_tree, fc->partition_prob, fc->partition_cdf,
                      PARTITION_CONTEXTS);
+  av1_tree_to_cdf(av1_segment_tree, fc->seg.tree_probs, fc->seg.tree_cdf);
 #endif
 }
 
diff --git a/av1/common/odintrin.h b/av1/common/odintrin.h
index 73106df..5b83f8c 100644
--- a/av1/common/odintrin.h
+++ b/av1/common/odintrin.h
@@ -11,10 +11,10 @@
 #ifndef AV1_COMMON_ODINTRIN_H_
 #define AV1_COMMON_ODINTRIN_H_
 
-#include "av1/common/enums.h"
 #include "aom/aom_integer.h"
 #include "aom_dsp/aom_dsp_common.h"
 #include "aom_ports/bitops.h"
+#include "av1/common/enums.h"
 
 #ifdef __cplusplus
 extern "C" {
diff --git a/av1/common/onyxc_int.h b/av1/common/onyxc_int.h
index ee8cb47..cf8d7b4 100644
--- a/av1/common/onyxc_int.h
+++ b/av1/common/onyxc_int.h
@@ -13,21 +13,21 @@
 #define AV1_COMMON_ONYXC_INT_H_
 
 #include "./aom_config.h"
+#include "./av1_rtcd.h"
 #include "aom/internal/aom_codec_internal.h"
 #include "aom_util/aom_thread.h"
-#include "./av1_rtcd.h"
 #include "av1/common/alloccommon.h"
-#include "av1/common/loopfilter.h"
-#include "av1/common/entropymv.h"
 #include "av1/common/entropy.h"
 #include "av1/common/entropymode.h"
-#include "av1/common/mv.h"
+#include "av1/common/entropymv.h"
 #include "av1/common/frame_buffers.h"
+#include "av1/common/loopfilter.h"
+#include "av1/common/mv.h"
 #include "av1/common/quant_common.h"
-#include "av1/common/tile_common.h"
 #if CONFIG_LOOP_RESTORATION
 #include "av1/common/restoration.h"
 #endif  // CONFIG_LOOP_RESTORATION
+#include "av1/common/tile_common.h"
 
 #ifdef __cplusplus
 extern "C" {
diff --git a/av1/common/reconintra.c b/av1/common/reconintra.c
index 22479b6..483b0b2 100644
--- a/av1/common/reconintra.c
+++ b/av1/common/reconintra.c
@@ -233,6 +233,14 @@
   const int w = AOMMAX(num_4x4_blocks_wide_lookup[bsize] >> ss_x, 1);
   const int step = 1 << txsz;
 
+  // TODO(bshacklett, huisu): Currently the RD loop traverses 4X8 blocks in
+  // inverted N order while in the bitstream the subblocks are stored in Z
+  // order. This discrepancy makes this function incorrect when considering 4X8
+  // blocks in the RD loop, so we disable the extended right edge for these
+  // blocks. The correct solution is to change the bitstream to store these
+  // blocks in inverted N order, and then update this function appropriately.
+  if (bsize == BLOCK_4X8 && y == 1) return 0;
+
   if (!right_available) {
     return 0;
   } else {
diff --git a/av1/common/seg_common.h b/av1/common/seg_common.h
index 8c85d9a..d833a86 100644
--- a/av1/common/seg_common.h
+++ b/av1/common/seg_common.h
@@ -48,6 +48,9 @@
 
 struct segmentation_probs {
   aom_prob tree_probs[SEG_TREE_PROBS];
+#if CONFIG_DAALA_EC
+  aom_cdf_prob tree_cdf[MAX_SEGMENTS];
+#endif
   aom_prob pred_probs[PREDICTION_PROBS];
 };
 
diff --git a/av1/decoder/decodeframe.c b/av1/decoder/decodeframe.c
index f853866..6744572 100644
--- a/av1/decoder/decodeframe.c
+++ b/av1/decoder/decodeframe.c
@@ -54,6 +54,7 @@
 #include "av1/decoder/dsubexp.h"
 
 #define MAX_AV1_HEADER_SIZE 80
+#define ACCT_STR __func__
 
 static int is_compound_reference_allowed(const AV1_COMMON *cm) {
   int i;
@@ -109,7 +110,7 @@
   int i, j;
   for (j = 0; j < SWITCHABLE_FILTER_CONTEXTS; ++j) {
     for (i = 0; i < SWITCHABLE_FILTERS - 1; ++i)
-      av1_diff_update_prob(r, &fc->switchable_interp_prob[j][i]);
+      av1_diff_update_prob(r, &fc->switchable_interp_prob[j][i], ACCT_STR);
 #if CONFIG_DAALA_EC
     av1_tree_to_cdf(av1_switchable_interp_tree, fc->switchable_interp_prob[j],
                     fc->switchable_interp_cdf[j]);
@@ -121,31 +122,31 @@
   int i;
 #if CONFIG_REF_MV
   for (i = 0; i < NEWMV_MODE_CONTEXTS; ++i)
-    av1_diff_update_prob(r, &fc->newmv_prob[i]);
+    av1_diff_update_prob(r, &fc->newmv_prob[i], ACCT_STR);
   for (i = 0; i < ZEROMV_MODE_CONTEXTS; ++i)
-    av1_diff_update_prob(r, &fc->zeromv_prob[i]);
+    av1_diff_update_prob(r, &fc->zeromv_prob[i], ACCT_STR);
   for (i = 0; i < REFMV_MODE_CONTEXTS; ++i)
-    av1_diff_update_prob(r, &fc->refmv_prob[i]);
+    av1_diff_update_prob(r, &fc->refmv_prob[i], ACCT_STR);
   for (i = 0; i < DRL_MODE_CONTEXTS; ++i)
-    av1_diff_update_prob(r, &fc->drl_prob[i]);
+    av1_diff_update_prob(r, &fc->drl_prob[i], ACCT_STR);
 #if CONFIG_EXT_INTER
-  av1_diff_update_prob(r, &fc->new2mv_prob);
+  av1_diff_update_prob(r, &fc->new2mv_prob, ACCT_STR);
 #endif  // CONFIG_EXT_INTER
 #else
   int j;
   for (i = 0; i < INTER_MODE_CONTEXTS; ++i)
     for (j = 0; j < INTER_MODES - 1; ++j)
-      av1_diff_update_prob(r, &fc->inter_mode_probs[i][j]);
+      av1_diff_update_prob(r, &fc->inter_mode_probs[i][j], ACCT_STR);
 #endif
 }
 
 #if CONFIG_EXT_INTER
 static void read_inter_compound_mode_probs(FRAME_CONTEXT *fc, aom_reader *r) {
   int i, j;
-  if (aom_read(r, GROUP_DIFF_UPDATE_PROB)) {
+  if (aom_read(r, GROUP_DIFF_UPDATE_PROB, ACCT_STR)) {
     for (j = 0; j < INTER_MODE_CONTEXTS; ++j) {
       for (i = 0; i < INTER_COMPOUND_MODES - 1; ++i) {
-        av1_diff_update_prob(r, &fc->inter_compound_mode_probs[j][i]);
+        av1_diff_update_prob(r, &fc->inter_compound_mode_probs[j][i], ACCT_STR);
       }
     }
   }
@@ -169,12 +170,12 @@
 
   if (cm->reference_mode == REFERENCE_MODE_SELECT)
     for (i = 0; i < COMP_INTER_CONTEXTS; ++i)
-      av1_diff_update_prob(r, &fc->comp_inter_prob[i]);
+      av1_diff_update_prob(r, &fc->comp_inter_prob[i], ACCT_STR);
 
   if (cm->reference_mode != COMPOUND_REFERENCE) {
     for (i = 0; i < REF_CONTEXTS; ++i) {
       for (j = 0; j < (SINGLE_REFS - 1); ++j) {
-        av1_diff_update_prob(r, &fc->single_ref_prob[i][j]);
+        av1_diff_update_prob(r, &fc->single_ref_prob[i][j], ACCT_STR);
       }
     }
   }
@@ -183,12 +184,12 @@
     for (i = 0; i < REF_CONTEXTS; ++i) {
 #if CONFIG_EXT_REFS
       for (j = 0; j < (FWD_REFS - 1); ++j)
-        av1_diff_update_prob(r, &fc->comp_ref_prob[i][j]);
+        av1_diff_update_prob(r, &fc->comp_ref_prob[i][j], ACCT_STR);
       for (j = 0; j < (BWD_REFS - 1); ++j)
-        av1_diff_update_prob(r, &fc->comp_bwdref_prob[i][j]);
+        av1_diff_update_prob(r, &fc->comp_bwdref_prob[i][j], ACCT_STR);
 #else
       for (j = 0; j < (COMP_REFS - 1); ++j)
-        av1_diff_update_prob(r, &fc->comp_ref_prob[i][j]);
+        av1_diff_update_prob(r, &fc->comp_ref_prob[i][j], ACCT_STR);
 #endif  // CONFIG_EXT_REFS
     }
   }
@@ -196,7 +197,7 @@
 
 static void update_mv_probs(aom_prob *p, int n, aom_reader *r) {
   int i;
-  for (i = 0; i < n; ++i) av1_diff_update_prob(r, &p[i]);
+  for (i = 0; i < n; ++i) av1_diff_update_prob(r, &p[i], ACCT_STR);
 }
 
 static void read_mv_probs(nmv_context *ctx, int allow_hp, aom_reader *r) {
@@ -417,12 +418,9 @@
   // passing bsize from decode_partition().
   xd->mi[0]->mbmi.sb_type = bsize;
   for (y = 0; y < y_mis; ++y)
-    for (x = !y; x < x_mis; ++x) {
-      xd->mi[y * cm->mi_stride + x] = xd->mi[0];
-    }
+    for (x = !y; x < x_mis; ++x) xd->mi[y * cm->mi_stride + x] = xd->mi[0];
 
   set_plane_n4(xd, bw, bh, bwl, bhl);
-
   set_skip_context(xd, mi_row, mi_col);
 
 #if CONFIG_VAR_TX
@@ -1170,7 +1168,9 @@
   const int bh = 1 << (bhl - 1);
   const int x_mis = AOMMIN(bw, cm->mi_cols - mi_col);
   const int y_mis = AOMMIN(bh, cm->mi_rows - mi_row);
-
+#if CONFIG_ACCOUNTING
+  aom_accounting_set_context(&pbi->accounting, mi_col, mi_row);
+#endif
 #if CONFIG_SUPERTX
   MB_MODE_INFO *mbmi;
   if (supertx_enabled) {
@@ -1415,21 +1415,22 @@
   if (has_rows && has_cols)
 #if CONFIG_EXT_PARTITION_TYPES
     if (bsize <= BLOCK_8X8)
-      p = (PARTITION_TYPE)aom_read_tree(r, av1_partition_tree, probs);
+      p = (PARTITION_TYPE)aom_read_tree(r, av1_partition_tree, probs, ACCT_STR);
     else
-      p = (PARTITION_TYPE)aom_read_tree(r, av1_ext_partition_tree, probs);
+      p = (PARTITION_TYPE)aom_read_tree(r, av1_ext_partition_tree, probs,
+                                        ACCT_STR);
 #else
 #if CONFIG_DAALA_EC
     p = (PARTITION_TYPE)aom_read_tree_cdf(r, cm->fc->partition_cdf[ctx],
-                                          PARTITION_TYPES);
+                                          PARTITION_TYPES, ACCT_STR);
 #else
-    p = (PARTITION_TYPE)aom_read_tree(r, av1_partition_tree, probs);
+    p = (PARTITION_TYPE)aom_read_tree(r, av1_partition_tree, probs, ACCT_STR);
 #endif
 #endif  // CONFIG_EXT_PARTITION_TYPES
   else if (!has_rows && has_cols)
-    p = aom_read(r, probs[1]) ? PARTITION_SPLIT : PARTITION_HORZ;
+    p = aom_read(r, probs[1], ACCT_STR) ? PARTITION_SPLIT : PARTITION_HORZ;
   else if (has_rows && !has_cols)
-    p = aom_read(r, probs[2]) ? PARTITION_SPLIT : PARTITION_VERT;
+    p = aom_read(r, probs[2], ACCT_STR) ? PARTITION_SPLIT : PARTITION_VERT;
   else
     p = PARTITION_SPLIT;
 
@@ -1445,7 +1446,7 @@
     return 1;
   } else {
     const int ctx = av1_get_skip_context(xd);
-    const int skip = aom_read(r, cm->fc->skip_probs[ctx]);
+    const int skip = aom_read(r, cm->fc->skip_probs[ctx], ACCT_STR);
     FRAME_COUNTS *counts = xd->counts;
     if (counts) ++counts->skip[ctx][skip];
     return skip;
@@ -1507,8 +1508,8 @@
   if (!frame_is_intra_only(cm) && partition != PARTITION_NONE &&
       bsize <= MAX_SUPERTX_BLOCK_SIZE && !supertx_enabled && !xd->lossless[0]) {
     const int supertx_context = partition_supertx_context_lookup[partition];
-    supertx_enabled =
-        aom_read(r, cm->fc->supertx_prob[supertx_context][supertx_size]);
+    supertx_enabled = aom_read(
+        r, cm->fc->supertx_prob[supertx_context][supertx_size], ACCT_STR);
     if (xd->counts)
       xd->counts->supertx[supertx_context][supertx_size][supertx_enabled]++;
 #if CONFIG_VAR_TX
@@ -1710,14 +1711,16 @@
         int eset = get_ext_tx_set(supertx_size, bsize, 1);
         if (eset > 0) {
           txfm = aom_read_tree(r, av1_ext_tx_inter_tree[eset],
-                               cm->fc->inter_ext_tx_prob[eset][supertx_size]);
+                               cm->fc->inter_ext_tx_prob[eset][supertx_size],
+                               ACCT_STR);
           if (xd->counts) ++xd->counts->inter_ext_tx[eset][supertx_size][txfm];
         }
       }
 #else
       if (supertx_size < TX_32X32) {
         txfm = aom_read_tree(r, av1_ext_tx_tree,
-                             cm->fc->inter_ext_tx_prob[supertx_size]);
+                             cm->fc->inter_ext_tx_prob[supertx_size],
+                             ACCT_STR);
         if (xd->counts) ++xd->counts->inter_ext_tx[supertx_size][txfm];
       }
 #endif  // CONFIG_EXT_TX
@@ -1809,10 +1812,10 @@
 
     if (!((mi_row * MI_SIZE) & 127) && !((mi_col * MI_SIZE) & 127) &&
         cm->clpf_size == CLPF_128X128) {
-      cm->clpf_blocks[tl] = aom_read_literal(r, 1);
+      cm->clpf_blocks[tl] = aom_read_literal(r, 1, ACCT_STR);
     } else if (cm->clpf_size == CLPF_64X64 &&
                !clpf_all_skip(cm, mi_col, mi_row, 64 / MI_SIZE)) {
-      cm->clpf_blocks[tl] = aom_read_literal(r, 1);
+      cm->clpf_blocks[tl] = aom_read_literal(r, 1, ACCT_STR);
     } else if (cm->clpf_size == CLPF_32X32) {
       const int tr = tl + 1;
       const int bl = tl + cm->clpf_stride;
@@ -1821,19 +1824,19 @@
 
       // Up to four bits per SB
       if (!clpf_all_skip(cm, mi_col, mi_row, size))
-        cm->clpf_blocks[tl] = aom_read_literal(r, 1);
+        cm->clpf_blocks[tl] = aom_read_literal(r, 1, ACCT_STR);
 
       if (mi_col + size < cm->mi_cols &&
           !clpf_all_skip(cm, mi_col + size, mi_row, size))
-        cm->clpf_blocks[tr] = aom_read_literal(r, 1);
+        cm->clpf_blocks[tr] = aom_read_literal(r, 1, ACCT_STR);
 
       if (mi_row + size < cm->mi_rows &&
           !clpf_all_skip(cm, mi_col, mi_row + size, size))
-        cm->clpf_blocks[bl] = aom_read_literal(r, 1);
+        cm->clpf_blocks[bl] = aom_read_literal(r, 1, ACCT_STR);
 
       if (mi_col + size < cm->mi_cols && mi_row + size < cm->mi_rows &&
           !clpf_all_skip(cm, mi_col + size, mi_row + size, size))
-        cm->clpf_blocks[br] = aom_read_literal(r, 1);
+        cm->clpf_blocks[br] = aom_read_literal(r, 1, ACCT_STR);
     }
   }
 #endif
@@ -1894,13 +1897,13 @@
                                    aom_reader *r) {
   int i, j, k, l, m;
 
-  if (aom_read_bit(r))
+  if (aom_read_bit(r, ACCT_STR))
     for (i = 0; i < PLANE_TYPES; ++i)
       for (j = 0; j < REF_TYPES; ++j)
         for (k = 0; k < COEF_BANDS; ++k)
           for (l = 0; l < BAND_COEFF_CONTEXTS(k); ++l)
             for (m = 0; m < UNCONSTRAINED_NODES; ++m)
-              av1_diff_update_prob(r, &coef_probs[i][j][k][l][m]);
+              av1_diff_update_prob(r, &coef_probs[i][j][k][l][m], ACCT_STR);
 }
 
 static void read_coef_probs(FRAME_CONTEXT *fc, TX_MODE tx_mode, aom_reader *r) {
@@ -1990,38 +1993,39 @@
           rsi->wiener_info, sizeof(*rsi->wiener_info) * ntiles);
       assert(rsi->wiener_info != NULL);
       for (i = 0; i < ntiles; ++i) {
-        rsi->restoration_type[i] = aom_read_tree(
-            rb, av1_switchable_restore_tree, cm->fc->switchable_restore_prob);
+        rsi->restoration_type[i] =
+            aom_read_tree(rb, av1_switchable_restore_tree,
+                          cm->fc->switchable_restore_prob, ACCT_STR);
         if (rsi->restoration_type[i] == RESTORE_WIENER) {
           rsi->wiener_info[i].level = 1;
           rsi->wiener_info[i].vfilter[0] =
-              aom_read_literal(rb, WIENER_FILT_TAP0_BITS) +
+              aom_read_literal(rb, WIENER_FILT_TAP0_BITS, ACCT_STR) +
               WIENER_FILT_TAP0_MINV;
           rsi->wiener_info[i].vfilter[1] =
-              aom_read_literal(rb, WIENER_FILT_TAP1_BITS) +
+              aom_read_literal(rb, WIENER_FILT_TAP1_BITS, ACCT_STR) +
               WIENER_FILT_TAP1_MINV;
           rsi->wiener_info[i].vfilter[2] =
-              aom_read_literal(rb, WIENER_FILT_TAP2_BITS) +
+              aom_read_literal(rb, WIENER_FILT_TAP2_BITS, ACCT_STR) +
               WIENER_FILT_TAP2_MINV;
           rsi->wiener_info[i].hfilter[0] =
-              aom_read_literal(rb, WIENER_FILT_TAP0_BITS) +
+              aom_read_literal(rb, WIENER_FILT_TAP0_BITS, ACCT_STR) +
               WIENER_FILT_TAP0_MINV;
           rsi->wiener_info[i].hfilter[1] =
-              aom_read_literal(rb, WIENER_FILT_TAP1_BITS) +
+              aom_read_literal(rb, WIENER_FILT_TAP1_BITS, ACCT_STR) +
               WIENER_FILT_TAP1_MINV;
           rsi->wiener_info[i].hfilter[2] =
-              aom_read_literal(rb, WIENER_FILT_TAP2_BITS) +
+              aom_read_literal(rb, WIENER_FILT_TAP2_BITS, ACCT_STR) +
               WIENER_FILT_TAP2_MINV;
         } else if (rsi->restoration_type[i] == RESTORE_BILATERAL) {
           int s;
           for (s = 0; s < BILATERAL_SUBTILES; ++s) {
 #if BILATERAL_SUBTILES == 0
             rsi->bilateral_info[i].level[s] =
-                aom_read_literal(rb, av1_bilateral_level_bits(cm));
+                aom_read_literal(rb, av1_bilateral_level_bits(cm), ACCT_STR);
 #else
-            if (aom_read(rb, RESTORE_NONE_BILATERAL_PROB)) {
+            if (aom_read(rb, RESTORE_NONE_BILATERAL_PROB, ACCT_STR)) {
               rsi->bilateral_info[i].level[s] =
-                  aom_read_literal(rb, av1_bilateral_level_bits(cm));
+                  aom_read_literal(rb, av1_bilateral_level_bits(cm), ACCT_STR);
             } else {
               rsi->bilateral_info[i].level[s] = -1;
             }
@@ -2034,26 +2038,26 @@
           rsi->wiener_info, sizeof(*rsi->wiener_info) * ntiles);
       assert(rsi->wiener_info != NULL);
       for (i = 0; i < ntiles; ++i) {
-        if (aom_read(rb, RESTORE_NONE_WIENER_PROB)) {
+        if (aom_read(rb, RESTORE_NONE_WIENER_PROB, ACCT_STR)) {
           rsi->wiener_info[i].level = 1;
           rsi->restoration_type[i] = RESTORE_WIENER;
           rsi->wiener_info[i].vfilter[0] =
-              aom_read_literal(rb, WIENER_FILT_TAP0_BITS) +
+              aom_read_literal(rb, WIENER_FILT_TAP0_BITS, ACCT_STR) +
               WIENER_FILT_TAP0_MINV;
           rsi->wiener_info[i].vfilter[1] =
-              aom_read_literal(rb, WIENER_FILT_TAP1_BITS) +
+              aom_read_literal(rb, WIENER_FILT_TAP1_BITS, ACCT_STR) +
               WIENER_FILT_TAP1_MINV;
           rsi->wiener_info[i].vfilter[2] =
-              aom_read_literal(rb, WIENER_FILT_TAP2_BITS) +
+              aom_read_literal(rb, WIENER_FILT_TAP2_BITS, ACCT_STR) +
               WIENER_FILT_TAP2_MINV;
           rsi->wiener_info[i].hfilter[0] =
-              aom_read_literal(rb, WIENER_FILT_TAP0_BITS) +
+              aom_read_literal(rb, WIENER_FILT_TAP0_BITS, ACCT_STR) +
               WIENER_FILT_TAP0_MINV;
           rsi->wiener_info[i].hfilter[1] =
-              aom_read_literal(rb, WIENER_FILT_TAP1_BITS) +
+              aom_read_literal(rb, WIENER_FILT_TAP1_BITS, ACCT_STR) +
               WIENER_FILT_TAP1_MINV;
           rsi->wiener_info[i].hfilter[2] =
-              aom_read_literal(rb, WIENER_FILT_TAP2_BITS) +
+              aom_read_literal(rb, WIENER_FILT_TAP2_BITS, ACCT_STR) +
               WIENER_FILT_TAP2_MINV;
         } else {
           rsi->wiener_info[i].level = 0;
@@ -2068,9 +2072,9 @@
         int s;
         rsi->restoration_type[i] = RESTORE_BILATERAL;
         for (s = 0; s < BILATERAL_SUBTILES; ++s) {
-          if (aom_read(rb, RESTORE_NONE_BILATERAL_PROB)) {
+          if (aom_read(rb, RESTORE_NONE_BILATERAL_PROB, ACCT_STR)) {
             rsi->bilateral_info[i].level[s] =
-                aom_read_literal(rb, av1_bilateral_level_bits(cm));
+                aom_read_literal(rb, av1_bilateral_level_bits(cm), ACCT_STR);
           } else {
             rsi->bilateral_info[i].level[s] = -1;
           }
@@ -2773,7 +2777,9 @@
                     aom_memalign(32, n_tiles * (sizeof(*pbi->tile_data))));
     pbi->allocated_tiles = n_tiles;
   }
-
+#if CONFIG_ACCOUNTING
+  aom_accounting_reset(&pbi->accounting);
+#endif
   // Load all tile information into tile_data.
   for (tile_row = tile_rows_start; tile_row < tile_rows_end; ++tile_row) {
     for (tile_col = tile_cols_start; tile_col < tile_cols_end; ++tile_col) {
@@ -2796,6 +2802,9 @@
       setup_token_decoder(buf->data, data_end, buf->size, &cm->error,
                           &td->bit_reader, pbi->decrypt_cb, pbi->decrypt_state);
 #endif
+#if CONFIG_ACCOUNTING
+      tile_data->bit_reader.accounting = &pbi->accounting;
+#endif
       av1_init_macroblockd(cm, &td->xd, td->dqcoeff);
 #if CONFIG_PALETTE
       td->xd.plane[0].color_index_map = td->color_index_map[0];
@@ -2814,6 +2823,10 @@
     for (tile_col = tile_cols_start; tile_col < tile_cols_end; ++tile_col) {
       const int col = inv_col_order ? tile_cols - 1 - tile_col : tile_col;
       TileData *const td = pbi->tile_data + tile_cols * row + col;
+#if CONFIG_ACCOUNTING
+      tile_data->bit_reader.accounting->last_tell_frac =
+          aom_reader_tell_frac(&tile_data->bit_reader);
+#endif
 
       av1_tile_set_col(&tile_info, cm, col);
 
@@ -3494,22 +3507,23 @@
   int i, j, k;
   int s;
   for (s = 1; s < EXT_TX_SETS_INTER; ++s) {
-    if (aom_read(r, GROUP_DIFF_UPDATE_PROB)) {
+    if (aom_read(r, GROUP_DIFF_UPDATE_PROB, ACCT_STR)) {
       for (i = TX_4X4; i < EXT_TX_SIZES; ++i) {
         if (!use_inter_ext_tx_for_txsize[s][i]) continue;
         for (j = 0; j < num_ext_tx_set_inter[s] - 1; ++j)
-          av1_diff_update_prob(r, &fc->inter_ext_tx_prob[s][i][j]);
+          av1_diff_update_prob(r, &fc->inter_ext_tx_prob[s][i][j], ACCT_STR);
       }
     }
   }
 
   for (s = 1; s < EXT_TX_SETS_INTRA; ++s) {
-    if (aom_read(r, GROUP_DIFF_UPDATE_PROB)) {
+    if (aom_read(r, GROUP_DIFF_UPDATE_PROB, ACCT_STR)) {
       for (i = TX_4X4; i < EXT_TX_SIZES; ++i) {
         if (!use_intra_ext_tx_for_txsize[s][i]) continue;
         for (j = 0; j < INTRA_MODES; ++j)
           for (k = 0; k < num_ext_tx_set_intra[s] - 1; ++k)
-            av1_diff_update_prob(r, &fc->intra_ext_tx_prob[s][i][j][k]);
+            av1_diff_update_prob(r, &fc->intra_ext_tx_prob[s][i][j][k],
+                                 ACCT_STR);
       }
     }
   }
@@ -3519,11 +3533,11 @@
 
 static void read_ext_tx_probs(FRAME_CONTEXT *fc, aom_reader *r) {
   int i, j, k;
-  if (aom_read(r, GROUP_DIFF_UPDATE_PROB)) {
+  if (aom_read(r, GROUP_DIFF_UPDATE_PROB, ACCT_STR)) {
     for (i = TX_4X4; i < EXT_TX_SIZES; ++i) {
       for (j = 0; j < TX_TYPES; ++j) {
         for (k = 0; k < TX_TYPES - 1; ++k)
-          av1_diff_update_prob(r, &fc->intra_ext_tx_prob[i][j][k]);
+          av1_diff_update_prob(r, &fc->intra_ext_tx_prob[i][j][k], ACCT_STR);
 #if CONFIG_DAALA_EC
         av1_tree_to_cdf(av1_ext_tx_tree, fc->intra_ext_tx_prob[i][j],
                         fc->intra_ext_tx_cdf[i][j]);
@@ -3531,10 +3545,10 @@
       }
     }
   }
-  if (aom_read(r, GROUP_DIFF_UPDATE_PROB)) {
+  if (aom_read(r, GROUP_DIFF_UPDATE_PROB, ACCT_STR)) {
     for (i = TX_4X4; i < EXT_TX_SIZES; ++i) {
       for (k = 0; k < TX_TYPES - 1; ++k)
-        av1_diff_update_prob(r, &fc->inter_ext_tx_prob[i][k]);
+        av1_diff_update_prob(r, &fc->inter_ext_tx_prob[i][k], ACCT_STR);
 #if CONFIG_DAALA_EC
       av1_tree_to_cdf(av1_ext_tx_tree, fc->inter_ext_tx_prob[i],
                       fc->inter_ext_tx_cdf[i]);
@@ -3547,10 +3561,10 @@
 #if CONFIG_SUPERTX
 static void read_supertx_probs(FRAME_CONTEXT *fc, aom_reader *r) {
   int i, j;
-  if (aom_read(r, GROUP_DIFF_UPDATE_PROB)) {
+  if (aom_read(r, GROUP_DIFF_UPDATE_PROB, ACCT_STR)) {
     for (i = 0; i < PARTITION_SUPERTX_CONTEXTS; ++i) {
       for (j = 1; j < TX_SIZES; ++j) {
-        av1_diff_update_prob(r, &fc->supertx_prob[i][j]);
+        av1_diff_update_prob(r, &fc->supertx_prob[i][j], ACCT_STR);
       }
     }
   }
@@ -3561,7 +3575,7 @@
 static void read_global_motion_params(Global_Motion_Params *params,
                                       aom_prob *probs, aom_reader *r) {
   GLOBAL_MOTION_TYPE gmtype =
-      aom_read_tree(r, av1_global_motion_types_tree, probs);
+      aom_read_tree(r, av1_global_motion_types_tree, probs, ACCT_STR);
   params->gmtype = gmtype;
   params->motion_params.wmtype = gm_to_trans_type(gmtype);
   switch (gmtype) {
@@ -3643,48 +3657,52 @@
     for (i = 0; i < TX_SIZES - 1; ++i)
       for (j = 0; j < TX_SIZE_CONTEXTS; ++j)
         for (k = 0; k < i + 1; ++k)
-          av1_diff_update_prob(&r, &fc->tx_size_probs[i][j][k]);
+          av1_diff_update_prob(&r, &fc->tx_size_probs[i][j][k], ACCT_STR);
   }
 
   read_coef_probs(fc, cm->tx_mode, &r);
 
 #if CONFIG_VAR_TX
   for (k = 0; k < TXFM_PARTITION_CONTEXTS; ++k)
-    av1_diff_update_prob(&r, &fc->txfm_partition_prob[k]);
+    av1_diff_update_prob(&r, &fc->txfm_partition_prob[k], ACCT_STR);
 #if CONFIG_EXT_TX && CONFIG_RECT_TX
   if (cm->tx_mode == TX_MODE_SELECT) {
     for (i = 1; i < TX_SIZES - 1; ++i)
-      av1_diff_update_prob(&r, &fc->rect_tx_prob[i]);
+      av1_diff_update_prob(&r, &fc->rect_tx_prob[i], ACCT_STR);
   }
 #endif  // CONFIG_EXT_TX && CONFIG_RECT_TX
 #endif
 
   for (k = 0; k < SKIP_CONTEXTS; ++k)
-    av1_diff_update_prob(&r, &fc->skip_probs[k]);
+    av1_diff_update_prob(&r, &fc->skip_probs[k], ACCT_STR);
 
   if (cm->seg.enabled && cm->seg.update_map) {
     if (cm->seg.temporal_update) {
       for (k = 0; k < PREDICTION_PROBS; k++)
-        av1_diff_update_prob(&r, &cm->fc->seg.pred_probs[k]);
+        av1_diff_update_prob(&r, &cm->fc->seg.pred_probs[k], ACCT_STR);
     }
     for (k = 0; k < MAX_SEGMENTS - 1; k++)
-      av1_diff_update_prob(&r, &cm->fc->seg.tree_probs[k]);
+      av1_diff_update_prob(&r, &cm->fc->seg.tree_probs[k], ACCT_STR);
+#if CONFIG_DAALA_EC
+    av1_tree_to_cdf(av1_segment_tree, cm->fc->seg.tree_probs,
+                    cm->fc->seg.tree_cdf);
+#endif
   }
 
   for (j = 0; j < INTRA_MODES; j++)
     for (i = 0; i < INTRA_MODES - 1; ++i)
-      av1_diff_update_prob(&r, &fc->uv_mode_prob[j][i]);
+      av1_diff_update_prob(&r, &fc->uv_mode_prob[j][i], ACCT_STR);
 
 #if CONFIG_EXT_PARTITION_TYPES
   for (i = 0; i < PARTITION_TYPES - 1; ++i)
-    av1_diff_update_prob(&r, &fc->partition_prob[0][i]);
+    av1_diff_update_prob(&r, &fc->partition_prob[0][i], ACCT_STR);
   for (j = 1; j < PARTITION_CONTEXTS; ++j)
     for (i = 0; i < EXT_PARTITION_TYPES - 1; ++i)
-      av1_diff_update_prob(&r, &fc->partition_prob[j][i]);
+      av1_diff_update_prob(&r, &fc->partition_prob[j][i], ACCT_STR);
 #else
   for (j = 0; j < PARTITION_CONTEXTS; ++j) {
     for (i = 0; i < PARTITION_TYPES - 1; ++i)
-      av1_diff_update_prob(&r, &fc->partition_prob[j][i]);
+      av1_diff_update_prob(&r, &fc->partition_prob[j][i], ACCT_STR);
 #if CONFIG_DAALA_EC
     av1_tree_to_cdf(av1_partition_tree, fc->partition_prob[j],
                     fc->partition_cdf[j]);
@@ -3695,7 +3713,7 @@
 #if CONFIG_EXT_INTRA
   for (i = 0; i < INTRA_FILTERS + 1; ++i)
     for (j = 0; j < INTRA_FILTERS - 1; ++j)
-      av1_diff_update_prob(&r, &fc->intra_filter_probs[i][j]);
+      av1_diff_update_prob(&r, &fc->intra_filter_probs[i][j], ACCT_STR);
 #endif  // CONFIG_EXT_INTRA
 
   if (frame_is_intra_only(cm)) {
@@ -3703,7 +3721,7 @@
     for (k = 0; k < INTRA_MODES; k++)
       for (j = 0; j < INTRA_MODES; j++)
         for (i = 0; i < INTRA_MODES - 1; ++i)
-          av1_diff_update_prob(&r, &cm->kf_y_prob[k][j][i]);
+          av1_diff_update_prob(&r, &cm->kf_y_prob[k][j][i], ACCT_STR);
   } else {
 #if !CONFIG_REF_MV
     nmv_context *const nmvc = &fc->nmvc;
@@ -3716,23 +3734,23 @@
     if (cm->reference_mode != COMPOUND_REFERENCE) {
       for (i = 0; i < BLOCK_SIZE_GROUPS; i++) {
         if (is_interintra_allowed_bsize_group(i)) {
-          av1_diff_update_prob(&r, &fc->interintra_prob[i]);
+          av1_diff_update_prob(&r, &fc->interintra_prob[i], ACCT_STR);
         }
       }
       for (i = 0; i < BLOCK_SIZE_GROUPS; i++) {
         for (j = 0; j < INTERINTRA_MODES - 1; j++)
-          av1_diff_update_prob(&r, &fc->interintra_mode_prob[i][j]);
+          av1_diff_update_prob(&r, &fc->interintra_mode_prob[i][j], ACCT_STR);
       }
       for (i = 0; i < BLOCK_SIZES; i++) {
         if (is_interintra_allowed_bsize(i) && is_interintra_wedge_used(i)) {
-          av1_diff_update_prob(&r, &fc->wedge_interintra_prob[i]);
+          av1_diff_update_prob(&r, &fc->wedge_interintra_prob[i], ACCT_STR);
         }
       }
     }
     if (cm->reference_mode != SINGLE_REFERENCE) {
       for (i = 0; i < BLOCK_SIZES; i++) {
         if (is_interinter_wedge_used(i)) {
-          av1_diff_update_prob(&r, &fc->wedge_interinter_prob[i]);
+          av1_diff_update_prob(&r, &fc->wedge_interinter_prob[i], ACCT_STR);
         }
       }
     }
@@ -3741,14 +3759,14 @@
 #if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
     for (i = BLOCK_8X8; i < BLOCK_SIZES; ++i) {
       for (j = 0; j < MOTION_MODES - 1; ++j)
-        av1_diff_update_prob(&r, &fc->motion_mode_prob[i][j]);
+        av1_diff_update_prob(&r, &fc->motion_mode_prob[i][j], ACCT_STR);
     }
 #endif  // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
 
     if (cm->interp_filter == SWITCHABLE) read_switchable_interp_probs(fc, &r);
 
     for (i = 0; i < INTRA_INTER_CONTEXTS; i++)
-      av1_diff_update_prob(&r, &fc->intra_inter_prob[i]);
+      av1_diff_update_prob(&r, &fc->intra_inter_prob[i], ACCT_STR);
 
     if (cm->reference_mode != SINGLE_REFERENCE)
       setup_compound_reference_mode(cm);
@@ -3757,7 +3775,7 @@
 
     for (j = 0; j < BLOCK_SIZE_GROUPS; j++)
       for (i = 0; i < INTRA_MODES - 1; ++i)
-        av1_diff_update_prob(&r, &fc->y_mode_prob[j][i]);
+        av1_diff_update_prob(&r, &fc->y_mode_prob[j][i], ACCT_STR);
 
 #if CONFIG_REF_MV
     for (i = 0; i < NMV_CONTEXTS; ++i)
diff --git a/av1/decoder/decodemv.c b/av1/decoder/decodemv.c
index a05820a..66056c0 100644
--- a/av1/decoder/decodemv.c
+++ b/av1/decoder/decodemv.c
@@ -25,23 +25,25 @@
 
 #include "aom_dsp/aom_dsp_common.h"
 
+#define ACCT_STR __func__
+
 #if CONFIG_EXT_INTRA || CONFIG_PALETTE
 static INLINE int read_uniform(aom_reader *r, int n) {
   int l = get_unsigned_bits(n);
   int m = (1 << l) - n;
-  int v = aom_read_literal(r, l - 1);
+  int v = aom_read_literal(r, l - 1, ACCT_STR);
 
   assert(l != 0);
 
   if (v < m)
     return v;
   else
-    return (v << 1) - m + aom_read_literal(r, 1);
+    return (v << 1) - m + aom_read_literal(r, 1, ACCT_STR);
 }
 #endif  // CONFIG_EXT_INTRA || CONFIG_PALETTE
 
 static PREDICTION_MODE read_intra_mode(aom_reader *r, const aom_prob *p) {
-  return (PREDICTION_MODE)aom_read_tree(r, av1_intra_mode_tree, p);
+  return (PREDICTION_MODE)aom_read_tree(r, av1_intra_mode_tree, p, ACCT_STR);
 }
 
 static PREDICTION_MODE read_intra_mode_y(AV1_COMMON *cm, MACROBLOCKD *xd,
@@ -67,7 +69,8 @@
 static INTERINTRA_MODE read_interintra_mode(AV1_COMMON *cm, MACROBLOCKD *xd,
                                             aom_reader *r, int size_group) {
   const INTERINTRA_MODE ii_mode = (INTERINTRA_MODE)aom_read_tree(
-      r, av1_interintra_mode_tree, cm->fc->interintra_mode_prob[size_group]);
+      r, av1_interintra_mode_tree, cm->fc->interintra_mode_prob[size_group],
+      ACCT_STR);
   FRAME_COUNTS *counts = xd->counts;
   if (counts) ++counts->interintra_mode[size_group][ii_mode];
   return ii_mode;
@@ -84,7 +87,7 @@
   int16_t mode_ctx = ctx & NEWMV_CTX_MASK;
   aom_prob mode_prob = cm->fc->newmv_prob[mode_ctx];
 
-  if (aom_read(r, mode_prob) == 0) {
+  if (aom_read(r, mode_prob, ACCT_STR) == 0) {
     if (counts) ++counts->newmv_mode[mode_ctx][0];
 
 #if CONFIG_EXT_INTER
@@ -94,7 +97,7 @@
 #if CONFIG_EXT_INTER
     } else {
       mode_prob = cm->fc->new2mv_prob;
-      if (aom_read(r, mode_prob) == 0) {
+      if (aom_read(r, mode_prob, ACCT_STR) == 0) {
         if (counts) ++counts->new2mv_mode[0];
         return NEWMV;
       } else {
@@ -111,7 +114,7 @@
   mode_ctx = (ctx >> ZEROMV_OFFSET) & ZEROMV_CTX_MASK;
 
   mode_prob = cm->fc->zeromv_prob[mode_ctx];
-  if (aom_read(r, mode_prob) == 0) {
+  if (aom_read(r, mode_prob, ACCT_STR) == 0) {
     if (counts) ++counts->zeromv_mode[mode_ctx][0];
     return ZEROMV;
   }
@@ -125,7 +128,7 @@
 
   mode_prob = cm->fc->refmv_prob[mode_ctx];
 
-  if (aom_read(r, mode_prob) == 0) {
+  if (aom_read(r, mode_prob, ACCT_STR) == 0) {
     if (counts) ++counts->refmv_mode[mode_ctx][0];
 
     return NEARESTMV;
@@ -137,8 +140,8 @@
   // Invalid prediction mode.
   assert(0);
 #else
-  const int mode =
-      aom_read_tree(r, av1_inter_mode_tree, cm->fc->inter_mode_probs[ctx]);
+  const int mode = aom_read_tree(r, av1_inter_mode_tree,
+                                 cm->fc->inter_mode_probs[ctx], ACCT_STR);
   FRAME_COUNTS *counts = xd->counts;
   if (counts) ++counts->inter_mode[ctx][mode];
 
@@ -158,7 +161,7 @@
       if (xd->ref_mv_count[ref_frame_type] > idx + 1) {
         uint8_t drl_ctx = av1_drl_ctx(xd->ref_mv_stack[ref_frame_type], idx);
         aom_prob drl_prob = cm->fc->drl_prob[drl_ctx];
-        if (!aom_read(r, drl_prob)) {
+        if (!aom_read(r, drl_prob, ACCT_STR)) {
           mbmi->ref_mv_idx = idx;
           if (xd->counts) ++xd->counts->drl_mode[drl_ctx][0];
           return;
@@ -178,7 +181,7 @@
       if (xd->ref_mv_count[ref_frame_type] > idx + 1) {
         uint8_t drl_ctx = av1_drl_ctx(xd->ref_mv_stack[ref_frame_type], idx);
         aom_prob drl_prob = cm->fc->drl_prob[drl_ctx];
-        if (!aom_read(r, drl_prob)) {
+        if (!aom_read(r, drl_prob, ACCT_STR)) {
           mbmi->ref_mv_idx = idx - 1;
           if (xd->counts) ++xd->counts->drl_mode[drl_ctx][0];
           return;
@@ -194,8 +197,9 @@
 #if CONFIG_EXT_INTER
 static PREDICTION_MODE read_inter_compound_mode(AV1_COMMON *cm, MACROBLOCKD *xd,
                                                 aom_reader *r, int16_t ctx) {
-  const int mode = aom_read_tree(r, av1_inter_compound_mode_tree,
-                                 cm->fc->inter_compound_mode_probs[ctx]);
+  const int mode =
+      aom_read_tree(r, av1_inter_compound_mode_tree,
+                    cm->fc->inter_compound_mode_probs[ctx], ACCT_STR);
   FRAME_COUNTS *counts = xd->counts;
 
   if (counts) ++counts->inter_compound_mode[ctx][mode];
@@ -207,7 +211,11 @@
 
 static int read_segment_id(aom_reader *r,
                            const struct segmentation_probs *segp) {
-  return aom_read_tree(r, av1_segment_tree, segp->tree_probs);
+#if CONFIG_DAALA_EC
+  return aom_read_symbol(r, segp->tree_cdf, MAX_SEGMENTS, ACCT_STR);
+#else
+  return aom_read_tree(r, av1_segment_tree, segp->tree_probs, ACCT_STR);
+#endif
 }
 
 #if CONFIG_VAR_TX
@@ -231,7 +239,7 @@
 
   if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide) return;
 
-  is_split = aom_read(r, cm->fc->txfm_partition_prob[ctx]);
+  is_split = aom_read(r, cm->fc->txfm_partition_prob[ctx], ACCT_STR);
 
   if (is_split) {
     BLOCK_SIZE bsize = txsize_to_bsize[tx_size];
@@ -274,8 +282,9 @@
                                      int tx_size_cat, aom_reader *r) {
   FRAME_COUNTS *counts = xd->counts;
   const int ctx = get_tx_size_context(xd);
-  int tx_size = aom_read_tree(r, av1_tx_size_tree[tx_size_cat],
-                              cm->fc->tx_size_probs[tx_size_cat][ctx]);
+  int tx_size =
+      aom_read_tree(r, av1_tx_size_tree[tx_size_cat],
+                    cm->fc->tx_size_probs[tx_size_cat][ctx], ACCT_STR);
   if (counts) ++counts->tx_size[tx_size_cat][ctx][tx_size];
   return (TX_SIZE)tx_size;
 }
@@ -416,7 +425,7 @@
   if (seg->temporal_update) {
     const int ctx = av1_get_pred_context_seg_id(xd);
     const aom_prob pred_prob = segp->pred_probs[ctx];
-    mbmi->seg_id_predicted = aom_read(r, pred_prob);
+    mbmi->seg_id_predicted = aom_read(r, pred_prob, ACCT_STR);
     if (counts) ++counts->seg.pred[ctx][mbmi->seg_id_predicted];
     if (mbmi->seg_id_predicted) {
       segment_id = predicted_segment_id;
@@ -438,7 +447,7 @@
     return 1;
   } else {
     const int ctx = av1_get_skip_context(xd);
-    const int skip = aom_read(r, cm->fc->skip_probs[ctx]);
+    const int skip = aom_read(r, cm->fc->skip_probs[ctx], ACCT_STR);
     FRAME_COUNTS *counts = xd->counts;
     if (counts) ++counts->skip[ctx][skip];
     return skip;
@@ -461,15 +470,17 @@
       palette_ctx += (above_mi->mbmi.palette_mode_info.palette_size[0] > 0);
     if (left_mi)
       palette_ctx += (left_mi->mbmi.palette_mode_info.palette_size[0] > 0);
-    if (aom_read(r, av1_default_palette_y_mode_prob[bsize - BLOCK_8X8]
-                                                   [palette_ctx])) {
+    if (aom_read(
+            r, av1_default_palette_y_mode_prob[bsize - BLOCK_8X8][palette_ctx],
+            ACCT_STR)) {
       pmi->palette_size[0] =
           aom_read_tree(r, av1_palette_size_tree,
-                        av1_default_palette_y_size_prob[bsize - BLOCK_8X8]) +
+                        av1_default_palette_y_size_prob[bsize - BLOCK_8X8],
+                        ACCT_STR) +
           2;
       n = pmi->palette_size[0];
       for (i = 0; i < n; ++i)
-        pmi->palette_colors[i] = aom_read_literal(r, cm->bit_depth);
+        pmi->palette_colors[i] = aom_read_literal(r, cm->bit_depth, ACCT_STR);
 
       xd->plane[0].color_index_map[0] = read_uniform(r, n);
       assert(xd->plane[0].color_index_map[0] < n);
@@ -477,18 +488,19 @@
   }
 
   if (mbmi->uv_mode == DC_PRED) {
-    if (aom_read(r,
-                 av1_default_palette_uv_mode_prob[pmi->palette_size[0] > 0])) {
+    if (aom_read(r, av1_default_palette_uv_mode_prob[pmi->palette_size[0] > 0],
+                 ACCT_STR)) {
       pmi->palette_size[1] =
           aom_read_tree(r, av1_palette_size_tree,
-                        av1_default_palette_uv_size_prob[bsize - BLOCK_8X8]) +
+                        av1_default_palette_uv_size_prob[bsize - BLOCK_8X8],
+                        ACCT_STR) +
           2;
       n = pmi->palette_size[1];
       for (i = 0; i < n; ++i) {
         pmi->palette_colors[PALETTE_MAX_SIZE + i] =
-            aom_read_literal(r, cm->bit_depth);
+            aom_read_literal(r, cm->bit_depth, ACCT_STR);
         pmi->palette_colors[2 * PALETTE_MAX_SIZE + i] =
-            aom_read_literal(r, cm->bit_depth);
+            aom_read_literal(r, cm->bit_depth, ACCT_STR);
       }
       xd->plane[1].color_index_map[0] = read_uniform(r, n);
       assert(xd->plane[1].color_index_map[0] < n);
@@ -513,7 +525,7 @@
 #endif  // CONFIG_PALETTE
       ) {
     mbmi->ext_intra_mode_info.use_ext_intra_mode[0] =
-        aom_read(r, cm->fc->ext_intra_probs[0]);
+        aom_read(r, cm->fc->ext_intra_probs[0], ACCT_STR);
     if (mbmi->ext_intra_mode_info.use_ext_intra_mode[0]) {
       mbmi->ext_intra_mode_info.ext_intra_mode[0] =
           read_uniform(r, FILTER_INTRA_MODES);
@@ -527,7 +539,7 @@
 #endif  // CONFIG_PALETTE
       ) {
     mbmi->ext_intra_mode_info.use_ext_intra_mode[1] =
-        aom_read(r, cm->fc->ext_intra_probs[1]);
+        aom_read(r, cm->fc->ext_intra_probs[1], ACCT_STR);
     if (mbmi->ext_intra_mode_info.use_ext_intra_mode[1]) {
       mbmi->ext_intra_mode_info.ext_intra_mode[1] =
           read_uniform(r, FILTER_INTRA_MODES);
@@ -552,8 +564,8 @@
     p_angle = mode_to_angle_map[mbmi->mode] + mbmi->angle_delta[0] * ANGLE_STEP;
     if (av1_is_intra_filter_switchable(p_angle)) {
       FRAME_COUNTS *counts = xd->counts;
-      mbmi->intra_filter = aom_read_tree(r, av1_intra_filter_tree,
-                                         cm->fc->intra_filter_probs[ctx]);
+      mbmi->intra_filter = aom_read_tree(
+          r, av1_intra_filter_tree, cm->fc->intra_filter_probs[ctx], ACCT_STR);
       if (counts) ++counts->intra_filter[ctx][mbmi->intra_filter];
     } else {
       mbmi->intra_filter = INTRA_FILTER_LINEAR;
@@ -641,7 +653,8 @@
       if (eset > 0) {
         mbmi->tx_type = aom_read_tree(
             r, av1_ext_tx_intra_tree[eset],
-            cm->fc->intra_ext_tx_prob[eset][mbmi->tx_size][mbmi->mode]);
+            cm->fc->intra_ext_tx_prob[eset][mbmi->tx_size][mbmi->mode],
+            ACCT_STR);
         if (counts)
           ++counts->intra_ext_tx[eset][mbmi->tx_size][mbmi->mode]
                                 [mbmi->tx_type];
@@ -654,9 +667,9 @@
         !segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) {
       FRAME_COUNTS *counts = xd->counts;
       TX_TYPE tx_type_nom = intra_mode_to_tx_type_context[mbmi->mode];
-      mbmi->tx_type =
-          aom_read_tree(r, av1_ext_tx_tree,
-                        cm->fc->intra_ext_tx_prob[mbmi->tx_size][tx_type_nom]);
+      mbmi->tx_type = aom_read_tree(
+          r, av1_ext_tx_tree,
+          cm->fc->intra_ext_tx_prob[mbmi->tx_size][tx_type_nom], ACCT_STR);
       if (counts)
         ++counts->intra_ext_tx[mbmi->tx_size][tx_type_nom][mbmi->tx_type];
     } else {
@@ -669,29 +682,31 @@
 static int read_mv_component(aom_reader *r, const nmv_component *mvcomp,
                              int usehp) {
   int mag, d, fr, hp;
-  const int sign = aom_read(r, mvcomp->sign);
-  const int mv_class = aom_read_tree(r, av1_mv_class_tree, mvcomp->classes);
+  const int sign = aom_read(r, mvcomp->sign, ACCT_STR);
+  const int mv_class =
+      aom_read_tree(r, av1_mv_class_tree, mvcomp->classes, ACCT_STR);
   const int class0 = mv_class == MV_CLASS_0;
 
   // Integer part
   if (class0) {
-    d = aom_read_tree(r, av1_mv_class0_tree, mvcomp->class0);
+    d = aom_read_tree(r, av1_mv_class0_tree, mvcomp->class0, ACCT_STR);
     mag = 0;
   } else {
     int i;
     const int n = mv_class + CLASS0_BITS - 1;  // number of bits
 
     d = 0;
-    for (i = 0; i < n; ++i) d |= aom_read(r, mvcomp->bits[i]) << i;
+    for (i = 0; i < n; ++i) d |= aom_read(r, mvcomp->bits[i], ACCT_STR) << i;
     mag = CLASS0_SIZE << (mv_class + 2);
   }
 
   // Fractional part
   fr = aom_read_tree(r, av1_mv_fp_tree,
-                     class0 ? mvcomp->class0_fp[d] : mvcomp->fp);
+                     class0 ? mvcomp->class0_fp[d] : mvcomp->fp, ACCT_STR);
 
   // High precision part (if hp is not used, the default value of the hp is 1)
-  hp = usehp ? aom_read(r, class0 ? mvcomp->class0_hp : mvcomp->hp) : 1;
+  hp = usehp ? aom_read(r, class0 ? mvcomp->class0_hp : mvcomp->hp, ACCT_STR)
+             : 1;
 
   // Result
   mag += ((d << 3) | (fr << 1) | hp) + 1;
@@ -704,7 +719,8 @@
   MV_JOINT_TYPE joint_type;
   const int use_hp = allow_hp && av1_use_mv_hp(ref);
   MV diff = { 0, 0 };
-  joint_type = (MV_JOINT_TYPE)aom_read_tree(r, av1_mv_joint_tree, ctx->joints);
+  joint_type =
+      (MV_JOINT_TYPE)aom_read_tree(r, av1_mv_joint_tree, ctx->joints, ACCT_STR);
 
   if (mv_joint_vertical(joint_type))
     diff.row = read_mv_component(r, &ctx->comps[0], use_hp);
@@ -724,7 +740,7 @@
   if (cm->reference_mode == REFERENCE_MODE_SELECT) {
     const int ctx = av1_get_reference_mode_context(cm, xd);
     const REFERENCE_MODE mode =
-        (REFERENCE_MODE)aom_read(r, cm->fc->comp_inter_prob[ctx]);
+        (REFERENCE_MODE)aom_read(r, cm->fc->comp_inter_prob[ctx], ACCT_STR);
     FRAME_COUNTS *counts = xd->counts;
     if (counts) ++counts->comp_inter[ctx][mode];
     return mode;  // SINGLE_REFERENCE or COMPOUND_REFERENCE
@@ -754,7 +770,7 @@
       const int idx = cm->ref_frame_sign_bias[cm->comp_fixed_ref];
 #endif  // CONFIG_EXT_REFS
       const int ctx = av1_get_pred_context_comp_ref_p(cm, xd);
-      const int bit = aom_read(r, fc->comp_ref_prob[ctx][0]);
+      const int bit = aom_read(r, fc->comp_ref_prob[ctx][0], ACCT_STR);
 
       if (counts) ++counts->comp_ref[ctx][0][bit];
 
@@ -762,12 +778,12 @@
       // Decode forward references.
       if (!bit) {
         const int ctx1 = av1_get_pred_context_comp_ref_p1(cm, xd);
-        const int bit1 = aom_read(r, fc->comp_ref_prob[ctx1][1]);
+        const int bit1 = aom_read(r, fc->comp_ref_prob[ctx1][1], ACCT_STR);
         if (counts) ++counts->comp_ref[ctx1][1][bit1];
         ref_frame[!idx] = cm->comp_fwd_ref[bit1 ? 0 : 1];
       } else {
         const int ctx2 = av1_get_pred_context_comp_ref_p2(cm, xd);
-        const int bit2 = aom_read(r, fc->comp_ref_prob[ctx2][2]);
+        const int bit2 = aom_read(r, fc->comp_ref_prob[ctx2][2], ACCT_STR);
         if (counts) ++counts->comp_ref[ctx2][2][bit2];
         ref_frame[!idx] = cm->comp_fwd_ref[bit2 ? 3 : 2];
       }
@@ -775,7 +791,8 @@
       // Decode backward references.
       {
         const int ctx_bwd = av1_get_pred_context_comp_bwdref_p(cm, xd);
-        const int bit_bwd = aom_read(r, fc->comp_bwdref_prob[ctx_bwd][0]);
+        const int bit_bwd =
+            aom_read(r, fc->comp_bwdref_prob[ctx_bwd][0], ACCT_STR);
         if (counts) ++counts->comp_bwdref[ctx_bwd][0][bit_bwd];
         ref_frame[idx] = cm->comp_bwd_ref[bit_bwd];
       }
@@ -786,38 +803,38 @@
     } else if (mode == SINGLE_REFERENCE) {
 #if CONFIG_EXT_REFS
       const int ctx0 = av1_get_pred_context_single_ref_p1(xd);
-      const int bit0 = aom_read(r, fc->single_ref_prob[ctx0][0]);
+      const int bit0 = aom_read(r, fc->single_ref_prob[ctx0][0], ACCT_STR);
       if (counts) ++counts->single_ref[ctx0][0][bit0];
 
       if (bit0) {
         const int ctx1 = av1_get_pred_context_single_ref_p2(xd);
-        const int bit1 = aom_read(r, fc->single_ref_prob[ctx1][1]);
+        const int bit1 = aom_read(r, fc->single_ref_prob[ctx1][1], ACCT_STR);
         if (counts) ++counts->single_ref[ctx1][1][bit1];
         ref_frame[0] = bit1 ? ALTREF_FRAME : BWDREF_FRAME;
       } else {
         const int ctx2 = av1_get_pred_context_single_ref_p3(xd);
-        const int bit2 = aom_read(r, fc->single_ref_prob[ctx2][2]);
+        const int bit2 = aom_read(r, fc->single_ref_prob[ctx2][2], ACCT_STR);
         if (counts) ++counts->single_ref[ctx2][2][bit2];
         if (bit2) {
           const int ctx4 = av1_get_pred_context_single_ref_p5(xd);
-          const int bit4 = aom_read(r, fc->single_ref_prob[ctx4][4]);
+          const int bit4 = aom_read(r, fc->single_ref_prob[ctx4][4], ACCT_STR);
           if (counts) ++counts->single_ref[ctx4][4][bit4];
           ref_frame[0] = bit4 ? GOLDEN_FRAME : LAST3_FRAME;
         } else {
           const int ctx3 = av1_get_pred_context_single_ref_p4(xd);
-          const int bit3 = aom_read(r, fc->single_ref_prob[ctx3][3]);
+          const int bit3 = aom_read(r, fc->single_ref_prob[ctx3][3], ACCT_STR);
           if (counts) ++counts->single_ref[ctx3][3][bit3];
           ref_frame[0] = bit3 ? LAST2_FRAME : LAST_FRAME;
         }
       }
 #else
       const int ctx0 = av1_get_pred_context_single_ref_p1(xd);
-      const int bit0 = aom_read(r, fc->single_ref_prob[ctx0][0]);
+      const int bit0 = aom_read(r, fc->single_ref_prob[ctx0][0], ACCT_STR);
       if (counts) ++counts->single_ref[ctx0][0][bit0];
 
       if (bit0) {
         const int ctx1 = av1_get_pred_context_single_ref_p2(xd);
-        const int bit1 = aom_read(r, fc->single_ref_prob[ctx1][1]);
+        const int bit1 = aom_read(r, fc->single_ref_prob[ctx1][1], ACCT_STR);
         if (counts) ++counts->single_ref[ctx1][1][bit1];
         ref_frame[0] = bit1 ? ALTREF_FRAME : GOLDEN_FRAME;
       } else {
@@ -839,8 +856,9 @@
     int motion_mode;
     FRAME_COUNTS *counts = xd->counts;
 
-    motion_mode = aom_read_tree(r, av1_motion_mode_tree,
-                                cm->fc->motion_mode_prob[mbmi->sb_type]);
+    motion_mode =
+        aom_read_tree(r, av1_motion_mode_tree,
+                      cm->fc->motion_mode_prob[mbmi->sb_type], ACCT_STR);
     if (counts) ++counts->motion_mode[mbmi->sb_type][motion_mode];
     return (MOTION_MODE)(SIMPLE_TRANSLATION + motion_mode);
   } else {
@@ -869,11 +887,13 @@
     FRAME_COUNTS *counts = xd->counts;
 #if CONFIG_DAALA_EC
     const InterpFilter type =
-        (InterpFilter)av1_switchable_interp_inv[aom_read_tree_cdf(
-            r, cm->fc->switchable_interp_cdf[ctx], SWITCHABLE_FILTERS)];
+        (InterpFilter)av1_switchable_interp_inv[aom_read_symbol(
+            r, cm->fc->switchable_interp_cdf[ctx], SWITCHABLE_FILTERS,
+            ACCT_STR)];
 #else
     const InterpFilter type = (InterpFilter)aom_read_tree(
-        r, av1_switchable_interp_tree, cm->fc->switchable_interp_prob[ctx]);
+        r, av1_switchable_interp_tree, cm->fc->switchable_interp_prob[ctx],
+        ACCT_STR);
 #endif
     if (counts) ++counts->switchable_interp[ctx][type];
     return type;
@@ -1169,7 +1189,7 @@
     return get_segdata(&cm->seg, segment_id, SEG_LVL_REF_FRAME) != INTRA_FRAME;
   } else {
     const int ctx = av1_get_intra_inter_context(xd);
-    const int is_inter = aom_read(r, cm->fc->intra_inter_prob[ctx]);
+    const int is_inter = aom_read(r, cm->fc->intra_inter_prob[ctx], ACCT_STR);
     FRAME_COUNTS *counts = xd->counts;
     if (counts) ++counts->intra_inter[ctx][is_inter];
     return is_inter;
@@ -1528,7 +1548,8 @@
 #endif
       is_interintra_allowed(mbmi)) {
     const int bsize_group = size_group_lookup[bsize];
-    const int interintra = aom_read(r, cm->fc->interintra_prob[bsize_group]);
+    const int interintra =
+        aom_read(r, cm->fc->interintra_prob[bsize_group], ACCT_STR);
     if (xd->counts) xd->counts->interintra[bsize_group][interintra]++;
     assert(mbmi->ref_frame[1] == NONE);
     if (interintra) {
@@ -1545,12 +1566,12 @@
 #endif  // CONFIG_EXT_INTRA
       if (is_interintra_wedge_used(bsize)) {
         mbmi->use_wedge_interintra =
-            aom_read(r, cm->fc->wedge_interintra_prob[bsize]);
+            aom_read(r, cm->fc->wedge_interintra_prob[bsize], ACCT_STR);
         if (xd->counts)
           xd->counts->wedge_interintra[bsize][mbmi->use_wedge_interintra]++;
         if (mbmi->use_wedge_interintra) {
           mbmi->interintra_wedge_index =
-              aom_read_literal(r, get_wedge_bits_lookup(bsize));
+              aom_read_literal(r, get_wedge_bits_lookup(bsize), ACCT_STR);
           mbmi->interintra_wedge_sign = 0;
         }
       }
@@ -1579,13 +1600,13 @@
 #endif  // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
       is_interinter_wedge_used(bsize)) {
     mbmi->use_wedge_interinter =
-        aom_read(r, cm->fc->wedge_interinter_prob[bsize]);
+        aom_read(r, cm->fc->wedge_interinter_prob[bsize], ACCT_STR);
     if (xd->counts)
       xd->counts->wedge_interinter[bsize][mbmi->use_wedge_interinter]++;
     if (mbmi->use_wedge_interinter) {
       mbmi->interinter_wedge_index =
-          aom_read_literal(r, get_wedge_bits_lookup(bsize));
-      mbmi->interinter_wedge_sign = aom_read_bit(r);
+          aom_read_literal(r, get_wedge_bits_lookup(bsize), ACCT_STR);
+      mbmi->interinter_wedge_sign = aom_read_bit(r, ACCT_STR);
     }
   }
 #endif  // CONFIG_EXT_INTER
@@ -1655,7 +1676,7 @@
       int use_rect_tx = 0;
 
       if (is_rect_tx_allowed) {
-        use_rect_tx = aom_read(r, cm->fc->rect_tx_prob[tx_size_cat]);
+        use_rect_tx = aom_read(r, cm->fc->rect_tx_prob[tx_size_cat], ACCT_STR);
         if (xd->counts) {
           ++xd->counts->rect_tx[tx_size_cat][use_rect_tx];
         }
@@ -1742,7 +1763,8 @@
         if (eset > 0) {
           mbmi->tx_type = aom_read_tree(
               r, av1_ext_tx_inter_tree[eset],
-              cm->fc->inter_ext_tx_prob[eset][txsize_sqr_map[mbmi->tx_size]]);
+              cm->fc->inter_ext_tx_prob[eset][txsize_sqr_map[mbmi->tx_size]],
+              ACCT_STR);
           if (counts)
             ++counts->inter_ext_tx[eset][txsize_sqr_map[mbmi->tx_size]]
                                   [mbmi->tx_type];
@@ -1751,7 +1773,8 @@
         if (eset > 0) {
           mbmi->tx_type = aom_read_tree(
               r, av1_ext_tx_intra_tree[eset],
-              cm->fc->intra_ext_tx_prob[eset][mbmi->tx_size][mbmi->mode]);
+              cm->fc->intra_ext_tx_prob[eset][mbmi->tx_size][mbmi->mode],
+              ACCT_STR);
           if (counts)
             ++counts->intra_ext_tx[eset][mbmi->tx_size][mbmi->mode]
                                   [mbmi->tx_type];
@@ -1769,22 +1792,24 @@
       FRAME_COUNTS *counts = xd->counts;
       if (inter_block) {
 #if CONFIG_DAALA_EC
-        mbmi->tx_type = av1_ext_tx_inv[aom_read_tree_cdf(
-            r, cm->fc->inter_ext_tx_cdf[mbmi->tx_size], TX_TYPES)];
+        mbmi->tx_type = av1_ext_tx_inv[aom_read_symbol(
+            r, cm->fc->inter_ext_tx_cdf[mbmi->tx_size], TX_TYPES, ACCT_STR)];
 #else
-        mbmi->tx_type = aom_read_tree(r, av1_ext_tx_tree,
-                                      cm->fc->inter_ext_tx_prob[mbmi->tx_size]);
+        mbmi->tx_type =
+            aom_read_tree(r, av1_ext_tx_tree,
+                          cm->fc->inter_ext_tx_prob[mbmi->tx_size], ACCT_STR);
 #endif
         if (counts) ++counts->inter_ext_tx[mbmi->tx_size][mbmi->tx_type];
       } else {
         const TX_TYPE tx_type_nom = intra_mode_to_tx_type_context[mbmi->mode];
 #if CONFIG_DAALA_EC
-        mbmi->tx_type = av1_ext_tx_inv[aom_read_tree_cdf(
-            r, cm->fc->intra_ext_tx_cdf[mbmi->tx_size][tx_type_nom], TX_TYPES)];
+        mbmi->tx_type = av1_ext_tx_inv[aom_read_symbol(
+            r, cm->fc->intra_ext_tx_cdf[mbmi->tx_size][tx_type_nom], TX_TYPES,
+            ACCT_STR)];
 #else
         mbmi->tx_type = aom_read_tree(
             r, av1_ext_tx_tree,
-            cm->fc->intra_ext_tx_prob[mbmi->tx_size][tx_type_nom]);
+            cm->fc->intra_ext_tx_prob[mbmi->tx_size][tx_type_nom], ACCT_STR);
 #endif
         if (counts)
           ++counts->intra_ext_tx[mbmi->tx_size][tx_type_nom][mbmi->tx_type];
diff --git a/av1/decoder/decoder.c b/av1/decoder/decoder.c
index 61b9fc0..9952650 100644
--- a/av1/decoder/decoder.c
+++ b/av1/decoder/decoder.c
@@ -126,6 +126,9 @@
 #if CONFIG_LOOP_RESTORATION
   av1_loop_restoration_precal();
 #endif  // CONFIG_LOOP_RESTORATION
+#if CONFIG_ACCOUNTING
+  aom_accounting_init(&pbi->accounting);
+#endif
 
   cm->error.setjmp = 0;
 
@@ -154,6 +157,10 @@
     av1_loop_filter_dealloc(&pbi->lf_row_sync);
   }
 
+#if CONFIG_ACCOUNTING
+  aom_accounting_clear(&pbi->accounting);
+#endif
+
   aom_free(pbi);
 }
 
diff --git a/av1/decoder/decoder.h b/av1/decoder/decoder.h
index 919e7b8..7575260 100644
--- a/av1/decoder/decoder.h
+++ b/av1/decoder/decoder.h
@@ -22,6 +22,9 @@
 #include "av1/common/thread_common.h"
 #include "av1/common/onyxc_int.h"
 #include "av1/decoder/dthread.h"
+#if CONFIG_ACCOUNTING
+#include "av1/common/accounting.h"
+#endif
 
 #ifdef __cplusplus
 extern "C" {
@@ -100,6 +103,10 @@
   int tile_col_size_bytes;
   int dec_tile_row, dec_tile_col;
 #endif  // CONFIG_EXT_TILE
+#if CONFIG_ACCOUNTING
+  Accounting accounting;
+#endif
+
 } AV1Decoder;
 
 int av1_receive_compressed_data(struct AV1Decoder *pbi, size_t size,
diff --git a/av1/decoder/detokenize.c b/av1/decoder/detokenize.c
index 24bcd21..1bd4c0d 100644
--- a/av1/decoder/detokenize.c
+++ b/av1/decoder/detokenize.c
@@ -22,6 +22,8 @@
 
 #include "av1/decoder/detokenize.h"
 
+#define ACCT_STR __func__
+
 #define EOB_CONTEXT_NODE 0
 #define ZERO_CONTEXT_NODE 1
 #define ONE_CONTEXT_NODE 2
@@ -41,7 +43,7 @@
 
 static INLINE int read_coeff(const aom_prob *probs, int n, aom_reader *r) {
   int i, val = 0;
-  for (i = 0; i < n; ++i) val = (val << 1) | aom_read(r, probs[i]);
+  for (i = 0; i < n; ++i) val = (val << 1) | aom_read(r, probs[i], ACCT_STR);
   return val;
 }
 
@@ -142,7 +144,7 @@
     band = *band_translate++;
     prob = coef_probs[band][ctx];
     if (counts) ++eob_branch_count[band][ctx];
-    if (!aom_read(r, prob[EOB_CONTEXT_NODE])) {
+    if (!aom_read(r, prob[EOB_CONTEXT_NODE], ACCT_STR)) {
       INCREMENT_COUNT(EOB_MODEL_TOKEN);
       break;
     }
@@ -151,7 +153,7 @@
     dqv_val = &dq_val[band][0];
 #endif  // CONFIG_NEW_QUANT
 
-    while (!aom_read(r, prob[ZERO_CONTEXT_NODE])) {
+    while (!aom_read(r, prob[ZERO_CONTEXT_NODE], ACCT_STR)) {
       INCREMENT_COUNT(ZERO_TOKEN);
       dqv = dq[1];
       token_cache[scan[c]] = 0;
@@ -166,8 +168,8 @@
     }
 #if CONFIG_ANS
     cdf = &coef_cdfs[band][ctx];
-    token =
-        ONE_TOKEN + aom_read_symbol(r, *cdf, CATEGORY6_TOKEN - ONE_TOKEN + 1);
+    token = ONE_TOKEN +
+            aom_read_symbol(r, *cdf, CATEGORY6_TOKEN - ONE_TOKEN + 1, ACCT_STR);
     INCREMENT_COUNT(ONE_TOKEN + (token > ONE_TOKEN));
     switch (token) {
       case ONE_TOKEN:
@@ -211,14 +213,14 @@
       } break;
     }
 #else
-    if (!aom_read(r, prob[ONE_CONTEXT_NODE])) {
+    if (!aom_read(r, prob[ONE_CONTEXT_NODE], ACCT_STR)) {
       INCREMENT_COUNT(ONE_TOKEN);
       token = ONE_TOKEN;
       val = 1;
     } else {
       INCREMENT_COUNT(TWO_TOKEN);
       token = aom_read_tree(r, av1_coef_con_tree,
-                            av1_pareto8_full[prob[PIVOT_NODE] - 1]);
+                            av1_pareto8_full[prob[PIVOT_NODE] - 1], ACCT_STR);
       switch (token) {
         case TWO_TOKEN:
         case THREE_TOKEN:
@@ -275,12 +277,13 @@
 
 #if CONFIG_COEFFICIENT_RANGE_CHECKING
 #if CONFIG_AOM_HIGHBITDEPTH
-    dqcoeff[scan[c]] = highbd_check_range((aom_read_bit(r) ? -v : v), xd->bd);
+    dqcoeff[scan[c]] =
+        highbd_check_range((aom_read_bit(r, ACCT_STR) ? -v : v), xd->bd);
 #else
-    dqcoeff[scan[c]] = check_range(aom_read_bit(r) ? -v : v);
+    dqcoeff[scan[c]] = check_range(aom_read_bit(r, ACCT_STR) ? -v : v);
 #endif  // CONFIG_AOM_HIGHBITDEPTH
 #else
-    dqcoeff[scan[c]] = aom_read_bit(r) ? -v : v;
+    dqcoeff[scan[c]] = aom_read_bit(r, ACCT_STR) ? -v : v;
 #endif  // CONFIG_COEFFICIENT_RANGE_CHECKING
     token_cache[scan[c]] = av1_pt_energy_class[token];
     ++c;
@@ -355,7 +358,7 @@
       color_ctx =
           av1_get_palette_color_context(color_map, cols, i, j, n, color_order);
       color_idx = aom_read_tree(r, av1_palette_color_tree[n - 2],
-                                prob[n - 2][color_ctx]);
+                                prob[n - 2][color_ctx], ACCT_STR);
       assert(color_idx >= 0 && color_idx < n);
       color_map[i * cols + j] = color_order[color_idx];
     }
diff --git a/av1/decoder/dsubexp.c b/av1/decoder/dsubexp.c
index ebcd784..ee6a295 100644
--- a/av1/decoder/dsubexp.c
+++ b/av1/decoder/dsubexp.c
@@ -21,23 +21,29 @@
   return (v & 1) ? m - ((v + 1) >> 1) : m + (v >> 1);
 }
 
-static int decode_uniform(aom_reader *r) {
+#define decode_uniform(r, ACCT_STR_NAME) \
+  decode_uniform_(r ACCT_STR_ARG(ACCT_STR_NAME))
+#define decode_term_subexp(r, ACCT_STR_NAME) \
+  decode_term_subexp_(r ACCT_STR_ARG(ACCT_STR_NAME))
+
+static int decode_uniform_(aom_reader *r ACCT_STR_PARAM) {
   const int l = 8;
   const int m = (1 << l) - 190;
-  const int v = aom_read_literal(r, l - 1);
-  return v < m ? v : (v << 1) - m + aom_read_bit(r);
+  const int v = aom_read_literal(r, l - 1, ACCT_STR_NAME);
+  return v < m ? v : (v << 1) - m + aom_read_bit(r, ACCT_STR_NAME);
 }
 
 static int inv_remap_prob(int v, int m) {
+  /* clang-format off */
   static uint8_t inv_map_table[MAX_PROB - 1] = {
-    7,   20,  33,  46,  59,  72,  85,  98,  111, 124, 137, 150, 163, 176, 189,
-    202, 215, 228, 241, 254, 1,   2,   3,   4,   5,   6,   8,   9,   10,  11,
-    12,  13,  14,  15,  16,  17,  18,  19,  21,  22,  23,  24,  25,  26,  27,
-    28,  29,  30,  31,  32,  34,  35,  36,  37,  38,  39,  40,  41,  42,  43,
-    44,  45,  47,  48,  49,  50,  51,  52,  53,  54,  55,  56,  57,  58,  60,
-    61,  62,  63,  64,  65,  66,  67,  68,  69,  70,  71,  73,  74,  75,  76,
-    77,  78,  79,  80,  81,  82,  83,  84,  86,  87,  88,  89,  90,  91,  92,
-    93,  94,  95,  96,  97,  99,  100, 101, 102, 103, 104, 105, 106, 107, 108,
+      7,  20,  33,  46,  59,  72,  85,  98, 111, 124, 137, 150, 163, 176, 189,
+    202, 215, 228, 241, 254,   1,   2,   3,   4,   5,   6,   8,   9,  10,  11,
+     12,  13,  14,  15,  16,  17,  18,  19,  21,  22,  23,  24,  25,  26,  27,
+     28,  29,  30,  31,  32,  34,  35,  36,  37,  38,  39,  40,  41,  42,  43,
+     44,  45,  47,  48,  49,  50,  51,  52,  53,  54,  55,  56,  57,  58,  60,
+     61,  62,  63,  64,  65,  66,  67,  68,  69,  70,  71,  73,  74,  75,  76,
+     77,  78,  79,  80,  81,  82,  83,  84,  86,  87,  88,  89,  90,  91,  92,
+     93,  94,  95,  96,  97,  99, 100, 101, 102, 103, 104, 105, 106, 107, 108,
     109, 110, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 125,
     126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 138, 139, 140, 141,
     142, 143, 144, 145, 146, 147, 148, 149, 151, 152, 153, 154, 155, 156, 157,
@@ -46,8 +52,8 @@
     191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 203, 204, 205, 206,
     207, 208, 209, 210, 211, 212, 213, 214, 216, 217, 218, 219, 220, 221, 222,
     223, 224, 225, 226, 227, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238,
-    239, 240, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253,
-  };
+    239, 240, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253
+  }; /* clang-format on */
   assert(v < (int)(sizeof(inv_map_table) / sizeof(inv_map_table[0])));
   v = inv_map_table[v];
   m--;
@@ -58,26 +64,31 @@
   }
 }
 
-static int decode_term_subexp(aom_reader *r) {
-  if (!aom_read_bit(r)) return aom_read_literal(r, 4);
-  if (!aom_read_bit(r)) return aom_read_literal(r, 4) + 16;
-  if (!aom_read_bit(r)) return aom_read_literal(r, 5) + 32;
-  return decode_uniform(r) + 64;
+static int decode_term_subexp_(aom_reader *r ACCT_STR_PARAM) {
+  if (!aom_read_bit(r, ACCT_STR_NAME))
+    return aom_read_literal(r, 4, ACCT_STR_NAME);
+  if (!aom_read_bit(r, ACCT_STR_NAME))
+    return aom_read_literal(r, 4, ACCT_STR_NAME) + 16;
+  if (!aom_read_bit(r, ACCT_STR_NAME))
+    return aom_read_literal(r, 5, ACCT_STR_NAME) + 32;
+  return decode_uniform(r, ACCT_STR_NAME) + 64;
 }
 
-void av1_diff_update_prob(aom_reader *r, aom_prob *p) {
-  if (aom_read(r, DIFF_UPDATE_PROB)) {
-    const int delp = decode_term_subexp(r);
+void av1_diff_update_prob_(aom_reader *r, aom_prob *p ACCT_STR_PARAM) {
+  if (aom_read(r, DIFF_UPDATE_PROB, ACCT_STR_NAME)) {
+    const int delp = decode_term_subexp(r, ACCT_STR_NAME);
     *p = (aom_prob)inv_remap_prob(delp, *p);
   }
 }
 
+#if CONFIG_GLOBAL_MOTION
 int aom_read_primitive_symmetric(aom_reader *r, unsigned int mag_bits) {
-  if (aom_read_bit(r)) {
-    int s = aom_read_bit(r);
-    int x = aom_read_literal(r, mag_bits) + 1;
+  if (aom_read_bit(r, ACCT_STR_NAME)) {
+    int s = aom_read_bit(r, ACCT_STR_NAME);
+    int x = aom_read_literal(r, mag_bits, ACCT_STR_NAME) + 1;
     return (s > 0 ? -x : x);
   } else {
     return 0;
   }
 }
+#endif  // CONFIG_GLOBAL_MOTION
\ No newline at end of file
diff --git a/av1/decoder/dsubexp.h b/av1/decoder/dsubexp.h
index c0d372a..60aa7df 100644
--- a/av1/decoder/dsubexp.h
+++ b/av1/decoder/dsubexp.h
@@ -18,15 +18,22 @@
 extern "C" {
 #endif
 
-void av1_diff_update_prob(aom_reader *r, aom_prob *p);
+#if CONFIG_ACCOUNTING
+#define av1_diff_update_prob(r, p, str) av1_diff_update_prob_(r, p, str)
+#else
+#define av1_diff_update_prob(r, p, str) av1_diff_update_prob_(r, p)
+#endif
+
+void av1_diff_update_prob_(aom_reader *r, aom_prob *p ACCT_STR_PARAM);
 
 #ifdef __cplusplus
 }  // extern "C"
 #endif
-
+#if CONFIG_GLOBAL_MOTION
 // mag_bits is number of bits for magnitude. The alphabet is of size
 // 2 * 2^mag_bits + 1, symmetric around 0, where one bit is used to
 // indicate 0 or non-zero, mag_bits bits are used to indicate magnitide
 // and 1 more bit for the sign if non-zero.
 int aom_read_primitive_symmetric(aom_reader *r, unsigned int mag_bits);
+#endif  // CONFIG_GLOBAL_MOTION
 #endif  // AV1_DECODER_DSUBEXP_H_
diff --git a/av1/encoder/bitstream.c b/av1/encoder/bitstream.c
index c361dbf..948c4f5 100644
--- a/av1/encoder/bitstream.c
+++ b/av1/encoder/bitstream.c
@@ -808,8 +808,13 @@
 static void write_segment_id(aom_writer *w, const struct segmentation *seg,
                              const struct segmentation_probs *segp,
                              int segment_id) {
-  if (seg->enabled && seg->update_map)
+  if (seg->enabled && seg->update_map) {
+#if CONFIG_DAALA_EC
+    aom_write_symbol(w, segment_id, segp->tree_cdf, MAX_SEGMENTS);
+#else
     aom_write_tree(w, av1_segment_tree, segp->tree_probs, segment_id, 3, 0);
+#endif
+  }
 }
 
 // This function encodes the reference frame
@@ -2736,6 +2741,10 @@
     prob_diff_update(av1_segment_tree, cm->fc->seg.tree_probs,
                      cm->counts.seg.tree_total, MAX_SEGMENTS, w);
   }
+#if CONFIG_DAALA_EC
+  av1_tree_to_cdf(av1_segment_tree, cm->fc->seg.tree_probs,
+                  cm->fc->seg.tree_cdf);
+#endif
 }
 
 static void write_txfm_mode(TX_MODE mode, struct aom_write_bit_buffer *wb) {
diff --git a/av1/encoder/encodeframe.c b/av1/encoder/encodeframe.c
index 297c354..004ad68 100644
--- a/av1/encoder/encodeframe.c
+++ b/av1/encoder/encodeframe.c
@@ -4459,7 +4459,9 @@
     best_param = curr_param;
     for (i = 0; i < n_refinements; i++) {
       // look to the left
-      *param = curr_param - step;
+      *param =
+          (int16_t)clamp(curr_param - step, p < 2 ? GM_TRANS_MIN : GM_ALPHA_MIN,
+                         p < 2 ? GM_TRANS_MAX : GM_ALPHA_MAX);
       step_error =
           av1_warp_erroradv(wm,
 #if CONFIG_AOM_HIGHBITDEPTH
@@ -4476,7 +4478,9 @@
       }
 
       // look to the right
-      *param = curr_param + step;
+      *param =
+          (int16_t)clamp(curr_param + step, p < 2 ? GM_TRANS_MIN : GM_ALPHA_MIN,
+                         p < 2 ? GM_TRANS_MAX : GM_ALPHA_MAX);
       step_error =
           av1_warp_erroradv(wm,
 #if CONFIG_AOM_HIGHBITDEPTH
@@ -5245,11 +5249,14 @@
 #if CONFIG_VAR_TX
   if (cm->tx_mode == TX_MODE_SELECT && mbmi->sb_type >= BLOCK_8X8 &&
       is_inter_block(mbmi) && !(mbmi->skip || seg_skip)) {
-    if (dry_run) tx_partition_set_contexts(cm, xd, bsize, mi_row, mi_col);
 #if CONFIG_EXT_TX && CONFIG_RECT_TX
     if (is_rect_tx(mbmi->tx_size)) {
       set_txfm_ctxs(mbmi->tx_size, xd->n8_w, xd->n8_h, xd);
+    } else {
+      if (dry_run) tx_partition_set_contexts(cm, xd, bsize, mi_row, mi_col);
     }
+#else
+    if (dry_run) tx_partition_set_contexts(cm, xd, bsize, mi_row, mi_col);
 #endif  // CONFIG_EXT_TX && CONFIG_RECT_TX
   } else {
     TX_SIZE tx_size;
diff --git a/av1/encoder/encoder.c b/av1/encoder/encoder.c
index b9feca9..c39b78a 100644
--- a/av1/encoder/encoder.c
+++ b/av1/encoder/encoder.c
@@ -1,11 +1,12 @@
 /*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
  *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
 #include <limits.h>
@@ -420,8 +421,6 @@
   // Delete sementation map
   aom_free(cpi->segmentation_map);
   cpi->segmentation_map = NULL;
-  aom_free(cpi->coding_context.last_frame_seg_map_copy);
-  cpi->coding_context.last_frame_seg_map_copy = NULL;
 
 #if CONFIG_REF_MV
   for (i = 0; i < NMV_CONTEXTS; ++i) {
@@ -535,9 +534,6 @@
   memcpy(cc->nmvcosts_hp[1], cpi->nmvcosts_hp[1],
          MV_VALS * sizeof(*cpi->nmvcosts_hp[1]));
 
-  memcpy(cpi->coding_context.last_frame_seg_map_copy, cm->last_frame_seg_map,
-         (cm->mi_rows * cm->mi_cols));
-
   av1_copy(cc->last_ref_lf_deltas, cm->lf.last_ref_deltas);
   av1_copy(cc->last_mode_lf_deltas, cm->lf.last_mode_deltas);
 
@@ -576,9 +572,6 @@
   memcpy(cpi->nmvcosts_hp[1], cc->nmvcosts_hp[1],
          MV_VALS * sizeof(*cc->nmvcosts_hp[1]));
 
-  memcpy(cm->last_frame_seg_map, cpi->coding_context.last_frame_seg_map_copy,
-         (cm->mi_rows * cm->mi_cols));
-
   av1_copy(cm->lf.last_ref_deltas, cc->last_ref_lf_deltas);
   av1_copy(cm->lf.last_mode_deltas, cc->last_mode_lf_deltas);
 
@@ -1899,12 +1892,6 @@
   aom_free(cpi->active_map.map);
   CHECK_MEM_ERROR(cm, cpi->active_map.map,
                   aom_calloc(cm->mi_rows * cm->mi_cols, 1));
-
-  // And a place holder structure is the coding context
-  // for use if we want to save and restore it
-  aom_free(cpi->coding_context.last_frame_seg_map_copy);
-  CHECK_MEM_ERROR(cm, cpi->coding_context.last_frame_seg_map_copy,
-                  aom_calloc(cm->mi_rows * cm->mi_cols, 1));
 }
 
 void av1_change_config(struct AV1_COMP *cpi, const AV1EncoderConfig *oxcf) {
diff --git a/av1/encoder/encoder.h b/av1/encoder/encoder.h
index 00b30fd..e2046ec 100644
--- a/av1/encoder/encoder.h
+++ b/av1/encoder/encoder.h
@@ -1,11 +1,12 @@
 /*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
  *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
 #ifndef AV1_ENCODER_ENCODER_H_
@@ -59,8 +60,6 @@
   int nmv_costs_hp[NMV_CONTEXTS][2][MV_VALS];
 #endif
 
-  unsigned char *last_frame_seg_map_copy;
-
   // 0 = Intra, Last, GF, ARF
   signed char last_ref_lf_deltas[TOTAL_REFS_PER_FRAME];
   // 0 = ZERO_MV, MV
diff --git a/av1/encoder/ransac.c b/av1/encoder/ransac.c
index 0c8ad67..714d567 100644
--- a/av1/encoder/ransac.c
+++ b/av1/encoder/ransac.c
@@ -92,16 +92,16 @@
   }
 }
 
-static int get_rand_indices(int npoints, int minpts, int *indices) {
+static int get_rand_indices(int npoints, int minpts, int *indices,
+                            unsigned int *seed) {
   int i, j;
-  unsigned int seed = (unsigned int)npoints;
-  int ptr = rand_r(&seed) % npoints;
+  int ptr = rand_r(seed) % npoints;
   if (minpts > npoints) return 0;
   indices[0] = ptr;
   ptr = (ptr == npoints - 1 ? 0 : ptr + 1);
   i = 1;
   while (i < minpts) {
-    int index = rand_r(&seed) % npoints;
+    int index = rand_r(seed) % npoints;
     while (index) {
       ptr = (ptr == npoints - 1 ? 0 : ptr + 1);
       for (j = 0; j < i; ++j) {
@@ -132,6 +132,7 @@
   int N = 10000, trial_count = 0;
   int i;
   int ret_val = 0;
+  unsigned int seed = (unsigned int)npoints;
 
   int max_inliers = 0;
   double best_variance = 0.0;
@@ -139,7 +140,7 @@
   WarpedMotionParams wm;
   double points1[2 * MAX_MINPTS];
   double points2[2 * MAX_MINPTS];
-  int indices[MAX_MINPTS];
+  int indices[MAX_MINPTS] = { 0 };
 
   double *best_inlier_set1;
   double *best_inlier_set2;
@@ -153,10 +154,6 @@
   double *cnp1, *cnp2;
   double T1[9], T2[9];
 
-  // srand((unsigned)time(NULL)) ;
-  // better to make this deterministic for a given sequence for ease of testing
-  srand(npoints);
-
   *number_of_inliers = 0;
   if (npoints < minpts * MINPTS_MULTIPLIER) {
     printf("Cannot find motion with %d matches\n", npoints);
@@ -203,7 +200,7 @@
     int num_degenerate_iter = 0;
     while (degenerate) {
       num_degenerate_iter++;
-      if (!get_rand_indices(npoints, minpts, indices)) {
+      if (!get_rand_indices(npoints, minpts, indices, &seed)) {
         ret_val = 1;
         goto finish_ransac;
       }
diff --git a/av1/encoder/rdopt.c b/av1/encoder/rdopt.c
index 8399a85..463570a 100644
--- a/av1/encoder/rdopt.c
+++ b/av1/encoder/rdopt.c
@@ -4138,7 +4138,7 @@
     bits =
         (gm->motion_params.wmmat[0].as_int ? ((GM_ABS_TRANS_BITS + 1) * 2) : 0);
   }
-  return (bits << AV1_PROB_COST_SHIFT) + gmtype_cost[gm->gmtype];
+  return bits ? (bits << AV1_PROB_COST_SHIFT) + gmtype_cost[gm->gmtype] : 0;
 }
 
 #define GLOBAL_MOTION_RATE(ref)                            \
@@ -7810,47 +7810,6 @@
   rd_cost->rdcost = RDCOST(x->rdmult, x->rddiv, rd_cost->rate, rd_cost->dist);
 }
 
-// This function is designed to apply a bias or adjustment to an rd value based
-// on the relative variance of the source and reconstruction.
-#define LOW_VAR_THRESH 16
-#define VLOW_ADJ_MAX 25
-#define VHIGH_ADJ_MAX 8
-static void rd_variance_adjustment(MACROBLOCK *x, int64_t *this_rd,
-                                   MV_REFERENCE_FRAME ref_frame,
-                                   unsigned int source_variance) {
-  unsigned int recon_variance = x->recon_variance;
-  unsigned int absvar_diff = 0;
-  int64_t var_error = 0;
-  int64_t var_factor = 0;
-
-  if (*this_rd == INT64_MAX) return;
-
-  if ((source_variance + recon_variance) > LOW_VAR_THRESH) {
-    absvar_diff = (source_variance > recon_variance)
-                      ? (source_variance - recon_variance)
-                      : (recon_variance - source_variance);
-
-    var_error = ((int64_t)200 * source_variance * recon_variance) /
-                (((int64_t)source_variance * source_variance) +
-                 ((int64_t)recon_variance * recon_variance));
-    var_error = 100 - var_error;
-  }
-
-  // Source variance above a threshold and ref frame is intra.
-  // This case is targeted mainly at discouraging intra modes that give rise
-  // to a predictor with a low spatial complexity compared to the source.
-  if ((source_variance > LOW_VAR_THRESH) && (ref_frame == INTRA_FRAME) &&
-      (source_variance > recon_variance)) {
-    var_factor = AOMMIN(absvar_diff, AOMMIN(VLOW_ADJ_MAX, var_error));
-    // A second possible case of interest is where the source variance
-    // is very low and we wish to discourage false texture or motion trails.
-  } else if ((source_variance < (LOW_VAR_THRESH >> 1)) &&
-             (recon_variance > source_variance)) {
-    var_factor = AOMMIN(absvar_diff, AOMMIN(VHIGH_ADJ_MAX, var_error));
-  }
-  *this_rd += (*this_rd * var_factor) / 100;
-}
-
 // Do we have an internal image edge (e.g. formatting bars).
 int av1_internal_image_edge(const AV1_COMP *cpi) {
   return (cpi->oxcf.pass == 2) &&
@@ -8109,7 +8068,6 @@
     rate2 += av1_cost_bit(av1_get_skip_prob(cm, xd), 0);
   }
   this_rd = RDCOST(x->rdmult, x->rddiv, rate2, distortion2);
-  rd_variance_adjustment(x, &this_rd, INTRA_FRAME, x->source_variance);
 
   if (this_rd < *best_intra_rd) {
     *best_intra_rd = this_rd;
@@ -9175,10 +9133,6 @@
 #endif  // CONFIG_MOTION_VAR
     }
 
-    // Apply an adjustment to the rd value based on the similarity of the
-    // source variance and reconstructed variance.
-    rd_variance_adjustment(x, &this_rd, ref_frame, x->source_variance);
-
     if (ref_frame == INTRA_FRAME) {
       // Keep record of best intra rd
       if (this_rd < best_intra_rd) {
diff --git a/av1/encoder/segmentation.c b/av1/encoder/segmentation.c
index dfcb404..3292da4 100644
--- a/av1/encoder/segmentation.c
+++ b/av1/encoder/segmentation.c
@@ -366,6 +366,9 @@
   } else {
     seg->temporal_update = 0;
   }
+#if CONFIG_DAALA_EC
+  av1_tree_to_cdf(av1_segment_tree, segp->tree_probs, segp->tree_cdf);
+#endif
 }
 
 void av1_reset_segment_features(AV1_COMMON *cm) {
diff --git a/configure b/configure
index 53ba10e..611756f 100755
--- a/configure
+++ b/configure
@@ -61,6 +61,7 @@
                                   enable av1 temporal denoising
   ${toggle_webm_io}               enable input from and output to WebM container
   ${toggle_libyuv}                enable libyuv
+  ${toggle_accounting}            enable bit accounting
 
 Codecs:
   Codecs can be selectively enabled or disabled individually, or by family:
@@ -283,6 +284,7 @@
     alt_intra
     palette
     daala_ec
+    cb4x4
 "
 CONFIG_LIST="
     dependency_tracking
@@ -327,6 +329,7 @@
     unit_tests
     webm_io
     libyuv
+    accounting
     decode_perf_tests
     encode_perf_tests
     multi_res_encoding
@@ -387,6 +390,7 @@
     unit_tests
     webm_io
     libyuv
+    accounting
     decode_perf_tests
     encode_perf_tests
     multi_res_encoding
diff --git a/test/accounting_test.cc b/test/accounting_test.cc
new file mode 100644
index 0000000..122f9b8
--- /dev/null
+++ b/test/accounting_test.cc
@@ -0,0 +1,75 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+*/
+
+#include <math.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "third_party/googletest/src/include/gtest/gtest.h"
+
+#include "test/acm_random.h"
+#include "aom/aom_integer.h"
+#include "aom_dsp/bitreader.h"
+#include "aom_dsp/bitwriter.h"
+
+using libaom_test::ACMRandom;
+
+TEST(AV1, TestAccounting) {
+  const int kBufferSize = 10000;
+  const int kSymbols = 1024;
+  aom_writer bw;
+  uint8_t bw_buffer[kBufferSize];
+  aom_start_encode(&bw, bw_buffer);
+  for (int i = 0; i < kSymbols; i++) {
+    aom_write(&bw, 0, 32);
+    aom_write(&bw, 0, 32);
+    aom_write(&bw, 0, 32);
+  }
+  aom_stop_encode(&bw);
+  aom_reader br;
+  aom_reader_init(&br, bw_buffer, kBufferSize, NULL, NULL);
+
+  Accounting accounting;
+  aom_accounting_init(&accounting);
+  br.accounting = &accounting;
+  for (int i = 0; i < kSymbols; i++) {
+    aom_read(&br, 32, "A");
+  }
+  // Consecutive symbols that are the same are coalesced.
+  GTEST_ASSERT_EQ(accounting.syms.num_syms, 1);
+  GTEST_ASSERT_EQ(accounting.syms.syms[0].samples, (unsigned int)kSymbols);
+
+  aom_accounting_reset(&accounting);
+  GTEST_ASSERT_EQ(accounting.syms.num_syms, 0);
+
+  // Should record 2 * kSymbols accounting symbols.
+  aom_reader_init(&br, bw_buffer, kBufferSize, NULL, NULL);
+  br.accounting = &accounting;
+  for (int i = 0; i < kSymbols; i++) {
+    aom_read(&br, 32, "A");
+    aom_read(&br, 32, "B");
+    aom_read(&br, 32, "B");
+  }
+  GTEST_ASSERT_EQ(accounting.syms.num_syms, kSymbols * 2);
+  uint32_t tell_frac = aom_reader_tell_frac(&br);
+  for (int i = 0; i < accounting.syms.num_syms; i++) {
+    tell_frac -= accounting.syms.syms[i].bits;
+  }
+  GTEST_ASSERT_EQ(tell_frac, 0U);
+
+  GTEST_ASSERT_EQ(aom_accounting_dictionary_lookup(&accounting, "A"),
+                  aom_accounting_dictionary_lookup(&accounting, "A"));
+
+  // Check for collisions. The current aom_accounting_hash function returns
+  // the same hash code for AB and BA.
+  GTEST_ASSERT_NE(aom_accounting_dictionary_lookup(&accounting, "AB"),
+                  aom_accounting_dictionary_lookup(&accounting, "BA"));
+}
diff --git a/test/av1_fwd_txfm1d_test.cc b/test/av1_fwd_txfm1d_test.cc
index 03bed19..f671097 100644
--- a/test/av1_fwd_txfm1d_test.cc
+++ b/test/av1_fwd_txfm1d_test.cc
@@ -23,10 +23,15 @@
 const TYPE_TXFM txfm_type_ls[2] = { TYPE_DCT, TYPE_ADST };
 
 const int txfm_size_num = 5;
-const int txfm_size_ls[5] = { 4, 8, 16, 32 };
+const int txfm_size_ls[5] = { 4, 8, 16, 32, 64 };
 
 const TxfmFunc fwd_txfm_func_ls[2][5] = {
+#if CONFIG_TX64X64
+  { av1_fdct4_new, av1_fdct8_new, av1_fdct16_new, av1_fdct32_new,
+    av1_fdct64_new },
+#else
   { av1_fdct4_new, av1_fdct8_new, av1_fdct16_new, av1_fdct32_new, NULL },
+#endif
   { av1_fadst4_new, av1_fadst8_new, av1_fadst16_new, av1_fadst32_new, NULL }
 };
 
diff --git a/test/av1_inv_txfm1d_test.cc b/test/av1_inv_txfm1d_test.cc
index 110d4c3..8470fc0 100644
--- a/test/av1_inv_txfm1d_test.cc
+++ b/test/av1_inv_txfm1d_test.cc
@@ -18,15 +18,25 @@
 namespace {
 const int txfm_type_num = 2;
 const int txfm_size_num = 5;
-const int txfm_size_ls[5] = { 4, 8, 16, 32 };
+const int txfm_size_ls[5] = { 4, 8, 16, 32, 64 };
 
 const TxfmFunc fwd_txfm_func_ls[2][5] = {
+#if CONFIG_TX64X64
+  { av1_fdct4_new, av1_fdct8_new, av1_fdct16_new, av1_fdct32_new,
+    av1_fdct64_new },
+#else
   { av1_fdct4_new, av1_fdct8_new, av1_fdct16_new, av1_fdct32_new, NULL },
+#endif
   { av1_fadst4_new, av1_fadst8_new, av1_fadst16_new, av1_fadst32_new, NULL }
 };
 
 const TxfmFunc inv_txfm_func_ls[2][5] = {
+#if CONFIG_TX64X64
+  { av1_idct4_new, av1_idct8_new, av1_idct16_new, av1_idct32_new,
+    av1_idct64_new },
+#else
   { av1_idct4_new, av1_idct8_new, av1_idct16_new, av1_idct32_new, NULL },
+#endif
   { av1_iadst4_new, av1_iadst8_new, av1_iadst16_new, av1_iadst32_new, NULL }
 };
 
diff --git a/test/boolcoder_test.cc b/test/boolcoder_test.cc
index de9b6e4..1000f58 100644
--- a/test/boolcoder_test.cc
+++ b/test/boolcoder_test.cc
@@ -80,7 +80,7 @@
           } else if (bit_method == 3) {
             bit = bit_rnd(2);
           }
-          GTEST_ASSERT_EQ(aom_read(&br, probas[i]), bit)
+          GTEST_ASSERT_EQ(aom_read(&br, probas[i], NULL), bit)
               << "pos: " << i << " / " << kBitsToTest
               << " bit_method: " << bit_method << " method: " << method;
         }
@@ -88,3 +88,54 @@
     }
   }
 }
+
+#if CONFIG_DAALA_EC
+#define FRAC_DIFF_TOTAL_ERROR 0.07
+#else
+#define FRAC_DIFF_TOTAL_ERROR 0.2
+#endif
+
+TEST(AV1, TestTell) {
+  const int kBufferSize = 10000;
+  aom_writer bw;
+  uint8_t bw_buffer[kBufferSize];
+  const int kSymbols = 1024;
+  // Coders are noisier at low probabilities, so we start at p = 4.
+  for (int p = 4; p <= 256; p++) {
+    double probability = p / 256.;
+    aom_start_encode(&bw, bw_buffer);
+    for (int i = 0; i < kSymbols; i++) {
+      aom_write(&bw, 0, p);
+    }
+    aom_stop_encode(&bw);
+    aom_reader br;
+    aom_reader_init(&br, bw_buffer, kBufferSize, NULL, NULL);
+    ptrdiff_t last_tell = aom_reader_tell(&br);
+    ptrdiff_t last_tell_frac = aom_reader_tell_frac(&br);
+    double frac_diff_total = 0;
+    GTEST_ASSERT_GE(aom_reader_tell(&br), 0);
+    GTEST_ASSERT_LE(aom_reader_tell(&br), 1);
+    for (int i = 0; i < kSymbols; i++) {
+      aom_read(&br, p, NULL);
+      ptrdiff_t tell = aom_reader_tell(&br);
+      ptrdiff_t tell_frac = aom_reader_tell_frac(&br);
+      GTEST_ASSERT_GE(tell, last_tell) << "tell: " << tell
+                                       << ", last_tell: " << last_tell;
+      GTEST_ASSERT_GE(tell_frac, last_tell_frac)
+          << "tell_frac: " << tell_frac
+          << ", last_tell_frac: " << last_tell_frac;
+      // Frac tell should round up to tell.
+      GTEST_ASSERT_EQ(tell, (tell_frac + 7) >> 3);
+      last_tell = tell;
+      frac_diff_total +=
+          fabs(((tell_frac - last_tell_frac) / 8.0) + log2(probability));
+      last_tell_frac = tell_frac;
+    }
+    const int expected = (int)(-kSymbols * log2(probability));
+    // Last tell should be close to the expected value.
+    GTEST_ASSERT_LE(last_tell - expected, 20) << " last_tell: " << last_tell;
+    // The average frac_diff error should be pretty small.
+    GTEST_ASSERT_LE(frac_diff_total / kSymbols, FRAC_DIFF_TOTAL_ERROR)
+        << " frac_diff_total: " << frac_diff_total;
+  }
+}
diff --git a/test/test.mk b/test/test.mk
index 42ff2ba..1146428 100644
--- a/test/test.mk
+++ b/test/test.mk
@@ -105,6 +105,9 @@
 LIBAOM_TEST_SRCS-yes                   += ans_test.cc
 else
 LIBAOM_TEST_SRCS-yes                   += boolcoder_test.cc
+ifeq ($(CONFIG_ACCOUNTING),yes)
+LIBAOM_TEST_SRCS-yes                   += accounting_test.cc
+endif
 endif
 LIBAOM_TEST_SRCS-yes                   += divu_small_test.cc
 #LIBAOM_TEST_SRCS-yes                   += encoder_parms_get_to_decoder.cc