Merge "Add SSE4.1 code for deringing functions." into nextgenv2
diff --git a/aom_dsp/ans.c b/aom_dsp/ans.c
index 18f6d48..30f115c 100644
--- a/aom_dsp/ans.c
+++ b/aom_dsp/ans.c
@@ -15,16 +15,7 @@
 #include "aom_dsp/ans.h"
 #include "aom_dsp/prob.h"
 
-void aom_rans_build_cdf_from_pdf(const AnsP10 token_probs[], rans_lut cdf_tab) {
-  int i;
-  cdf_tab[0] = 0;
-  for (i = 1; cdf_tab[i - 1] < RANS_PRECISION; ++i) {
-    cdf_tab[i] = cdf_tab[i - 1] + token_probs[i - 1];
-  }
-  assert(cdf_tab[i - 1] == RANS_PRECISION);
-}
-
-static int find_largest(const AnsP10 *const pdf_tab, int num_syms) {
+static int find_largest(const aom_cdf_prob *const pdf_tab, int num_syms) {
   int largest_idx = -1;
   int largest_p = -1;
   int i;
@@ -38,8 +29,9 @@
   return largest_idx;
 }
 
-void aom_rans_merge_prob8_pdf(AnsP10 *const out_pdf, const AnsP8 node_prob,
-                              const AnsP10 *const src_pdf, int in_syms) {
+void aom_rans_merge_prob8_pdf(aom_cdf_prob *const out_pdf,
+                              const AnsP8 node_prob,
+                              const aom_cdf_prob *const src_pdf, int in_syms) {
   int i;
   int adjustment = RANS_PRECISION;
   const int round_fact = ANS_P8_PRECISION >> 1;
diff --git a/aom_dsp/ans.h b/aom_dsp/ans.h
index ea99f8b..5927e58 100644
--- a/aom_dsp/ans.h
+++ b/aom_dsp/ans.h
@@ -26,24 +26,16 @@
 typedef uint8_t AnsP8;
 #define ANS_P8_PRECISION 256u
 #define ANS_P8_SHIFT 8
-typedef uint16_t AnsP10;
-#define ANS_P10_PRECISION 1024u
+#define RANS_PRECISION 1024u
 #define RANS_PROB_BITS 10
 
-#define RANS_PRECISION ANS_P10_PRECISION
-
-#define L_BASE (ANS_P10_PRECISION * 4)  // L_BASE % precision must be 0
+#define L_BASE (RANS_PRECISION * 4)  // L_BASE % precision must be 0
 #define IO_BASE 256
 // Range I = { L_BASE, L_BASE + 1, ..., L_BASE * IO_BASE - 1 }
 
-// This is now just a boring cdf. It starts with an explicit zero.
-// TODO(aconverse): Remove starting zero.
-typedef uint16_t rans_lut[16];
-
-void aom_rans_build_cdf_from_pdf(const AnsP10 token_probs[], rans_lut cdf_tab);
-
-void aom_rans_merge_prob8_pdf(AnsP10 *const out_pdf, const AnsP8 node_prob,
-                              const AnsP10 *const src_pdf, int in_syms);
+void aom_rans_merge_prob8_pdf(aom_cdf_prob *const out_pdf,
+                              const AnsP8 node_prob,
+                              const aom_cdf_prob *const src_pdf, int in_syms);
 #ifdef __cplusplus
 }  // extern "C"
 #endif  // __cplusplus
diff --git a/aom_dsp/ansreader.h b/aom_dsp/ansreader.h
index 11619b0..1f66531 100644
--- a/aom_dsp/ansreader.h
+++ b/aom_dsp/ansreader.h
@@ -62,24 +62,25 @@
 
 struct rans_dec_sym {
   uint8_t val;
-  AnsP10 prob;
-  AnsP10 cum_prob;  // not-inclusive
+  aom_cdf_prob prob;  // width of the symbol's interval in the cdf
+  aom_cdf_prob cum_prob;  // not-inclusive
 };
 
-static INLINE void fetch_sym(struct rans_dec_sym *out, const rans_lut cdf,
-                             AnsP10 rem) {
-  int i = 0;
+static INLINE void fetch_sym(struct rans_dec_sym *out, const aom_cdf_prob *cdf,
+                             aom_cdf_prob rem) {
+  int i;
+  aom_cdf_prob cum_prob = 0, top_prob;  // rem ends up in [cum_prob, top_prob)
   // TODO(skal): if critical, could be a binary search.
   // Or, better, an O(1) alias-table.
-  while (rem >= cdf[i]) {
-    ++i;
+  for (i = 0; rem >= (top_prob = cdf[i]); ++i) {  // stop at first cdf[i] > rem
+    cum_prob = top_prob;  // lower bound for symbol i + 1
   }
-  out->val = i - 1;
-  out->prob = (AnsP10)(cdf[i] - cdf[i - 1]);
-  out->cum_prob = (AnsP10)cdf[i - 1];
+  out->val = i;  // index of the first cdf entry greater than rem
+  out->prob = top_prob - cum_prob;  // width of the selected interval
+  out->cum_prob = cum_prob;
 }
 
-static INLINE int rans_read(struct AnsDecoder *ans, const rans_lut tab) {
+static INLINE int rans_read(struct AnsDecoder *ans, const aom_cdf_prob *tab) {
   unsigned rem;
   unsigned quo;
   struct rans_dec_sym sym;
diff --git a/aom_dsp/answriter.h b/aom_dsp/answriter.h
index 5a82d35..0ac1bda 100644
--- a/aom_dsp/answriter.h
+++ b/aom_dsp/answriter.h
@@ -75,8 +75,8 @@
 }
 
 struct rans_sym {
-  AnsP10 prob;
-  AnsP10 cum_prob;  // not-inclusive
+  aom_cdf_prob prob;  // width of the symbol's interval in the cdf
+  aom_cdf_prob cum_prob;  // not-inclusive
 };
 
 // rANS with normalization
@@ -84,7 +84,7 @@
 // ANS_P10_PRECISION is m
 static INLINE void rans_write(struct AnsCoder *ans,
                               const struct rans_sym *const sym) {
-  const AnsP10 p = sym->prob;
+  const aom_cdf_prob p = sym->prob;
   while (ans->state >= L_BASE / RANS_PRECISION * IO_BASE * p) {
     ans->buf[ans->buf_offset++] = ans->state % IO_BASE;
     ans->state /= IO_BASE;
diff --git a/aom_dsp/aom_dsp_rtcd_defs.pl b/aom_dsp/aom_dsp_rtcd_defs.pl
index 6397d01..779ed00 100644
--- a/aom_dsp/aom_dsp_rtcd_defs.pl
+++ b/aom_dsp/aom_dsp_rtcd_defs.pl
@@ -44,6 +44,27 @@
 # Intra prediction
 #
 
+add_proto qw/void aom_dc_predictor_2x2/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+specialize qw/aom_dc_predictor_2x2/;
+
+add_proto qw/void aom_dc_top_predictor_2x2/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+specialize qw/aom_dc_top_predictor_2x2/;
+
+add_proto qw/void aom_dc_left_predictor_2x2/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+specialize qw/aom_dc_left_predictor_2x2/;
+
+add_proto qw/void aom_dc_128_predictor_2x2/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+specialize qw/aom_dc_128_predictor_2x2/;
+
+add_proto qw/void aom_v_predictor_2x2/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+specialize qw/aom_v_predictor_2x2/;
+
+add_proto qw/void aom_h_predictor_2x2/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+specialize qw/aom_h_predictor_2x2/;
+
+add_proto qw/void aom_tm_predictor_2x2/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+specialize qw/aom_tm_predictor_2x2/;
+
 add_proto qw/void aom_d207_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
 specialize qw/aom_d207_predictor_4x4 sse2/;
 
@@ -649,58 +670,31 @@
 # Forward transform
 #
 if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
-if (aom_config("CONFIG_AOM_HIGHBITDEPTH") eq "yes") {
-  add_proto qw/void aom_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride";
-  specialize qw/aom_fdct4x4 sse2/;
+  if (aom_config("CONFIG_AOM_HIGHBITDEPTH") eq "yes") {
+    add_proto qw/void aom_highbd_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride";
+    specialize qw/aom_highbd_fdct4x4 sse2/;
 
-  add_proto qw/void aom_fdct4x4_1/, "const int16_t *input, tran_low_t *output, int stride";
-  specialize qw/aom_fdct4x4_1 sse2/;
+    add_proto qw/void aom_highbd_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride";
+    specialize qw/aom_highbd_fdct8x8 sse2/;
 
-  add_proto qw/void aom_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride";
-  specialize qw/aom_fdct8x8 sse2/;
+    add_proto qw/void aom_highbd_fdct8x8_1/, "const int16_t *input, tran_low_t *output, int stride";
+    specialize qw/aom_highbd_fdct8x8_1/;
 
-  add_proto qw/void aom_fdct8x8_1/, "const int16_t *input, tran_low_t *output, int stride";
-  specialize qw/aom_fdct8x8_1 sse2/;
+    add_proto qw/void aom_highbd_fdct16x16/, "const int16_t *input, tran_low_t *output, int stride";
+    specialize qw/aom_highbd_fdct16x16 sse2/;
 
-  add_proto qw/void aom_fdct16x16/, "const int16_t *input, tran_low_t *output, int stride";
-  specialize qw/aom_fdct16x16 sse2/;
+    add_proto qw/void aom_highbd_fdct16x16_1/, "const int16_t *input, tran_low_t *output, int stride";
+    specialize qw/aom_highbd_fdct16x16_1/;
 
-  add_proto qw/void aom_fdct16x16_1/, "const int16_t *input, tran_low_t *output, int stride";
-  specialize qw/aom_fdct16x16_1 sse2/;
+    add_proto qw/void aom_highbd_fdct32x32/, "const int16_t *input, tran_low_t *output, int stride";
+    specialize qw/aom_highbd_fdct32x32 sse2/;
 
-  add_proto qw/void aom_fdct32x32/, "const int16_t *input, tran_low_t *output, int stride";
-  specialize qw/aom_fdct32x32 sse2/;
+    add_proto qw/void aom_highbd_fdct32x32_rd/, "const int16_t *input, tran_low_t *output, int stride";
+    specialize qw/aom_highbd_fdct32x32_rd sse2/;
 
-  add_proto qw/void aom_fdct32x32_rd/, "const int16_t *input, tran_low_t *output, int stride";
-  specialize qw/aom_fdct32x32_rd sse2/;
-
-  add_proto qw/void aom_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride";
-  specialize qw/aom_fdct32x32_1 sse2/;
-
-  add_proto qw/void aom_highbd_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride";
-  specialize qw/aom_highbd_fdct4x4 sse2/;
-
-  add_proto qw/void aom_highbd_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride";
-  specialize qw/aom_highbd_fdct8x8 sse2/;
-
-  add_proto qw/void aom_highbd_fdct8x8_1/, "const int16_t *input, tran_low_t *output, int stride";
-  specialize qw/aom_highbd_fdct8x8_1/;
-
-  add_proto qw/void aom_highbd_fdct16x16/, "const int16_t *input, tran_low_t *output, int stride";
-  specialize qw/aom_highbd_fdct16x16 sse2/;
-
-  add_proto qw/void aom_highbd_fdct16x16_1/, "const int16_t *input, tran_low_t *output, int stride";
-  specialize qw/aom_highbd_fdct16x16_1/;
-
-  add_proto qw/void aom_highbd_fdct32x32/, "const int16_t *input, tran_low_t *output, int stride";
-  specialize qw/aom_highbd_fdct32x32 sse2/;
-
-  add_proto qw/void aom_highbd_fdct32x32_rd/, "const int16_t *input, tran_low_t *output, int stride";
-  specialize qw/aom_highbd_fdct32x32_rd sse2/;
-
-  add_proto qw/void aom_highbd_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride";
-  specialize qw/aom_highbd_fdct32x32_1/;
-} else {
+    add_proto qw/void aom_highbd_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride";
+    specialize qw/aom_highbd_fdct32x32_1/;
+  }   # CONFIG_AOM_HIGHBITDEPTH
   add_proto qw/void aom_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride";
   specialize qw/aom_fdct4x4 sse2 msa/;
 
@@ -726,8 +720,7 @@
   specialize qw/aom_fdct32x32_rd sse2 avx2 msa/;
 
   add_proto qw/void aom_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride";
-  specialize qw/aom_fdct32x32_1 sse2 msa/;
-}  # CONFIG_AOM_HIGHBITDEPTH
+  specialize qw/aom_fdct32x32_1 sse2 avx2 msa/;
 }  # CONFIG_AV1_ENCODER
 
 #
diff --git a/aom_dsp/bitreader.h b/aom_dsp/bitreader.h
index d062e07..52e4dc8 100644
--- a/aom_dsp/bitreader.h
+++ b/aom_dsp/bitreader.h
@@ -104,6 +104,20 @@
   return aom_read_tree_bits(r, tree, probs);
 }
 
+static INLINE int aom_read_symbol(aom_reader *r, const aom_cdf_prob *cdf,
+                                  int nsymbs) {
+#if CONFIG_ANS  // rANS build: decoding is delegated to rans_read()
+  (void)nsymbs;  // rans_read() is given only the reader and the cdf
+  return rans_read(r, cdf);
+#else
+  (void)r;
+  (void)cdf;
+  (void)nsymbs;
+  assert(0 && "Unsupported bitreader operation");  // needs CONFIG_ANS
+  return -1;  // reached only if assert() is compiled out
+#endif
+}
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/aom_dsp/bitwriter.h b/aom_dsp/bitwriter.h
index 5e34fd6..d6937aa 100644
--- a/aom_dsp/bitwriter.h
+++ b/aom_dsp/bitwriter.h
@@ -86,6 +86,24 @@
   aom_write_tree_bits(w, tree, probs, bits, len, i);
 }
 
+static INLINE void aom_write_symbol(aom_writer *w, int symb,
+                                    const aom_cdf_prob *cdf, int nsymbs) {
+#if CONFIG_ANS  // rANS build: code symb by its interval in cdf
+  struct rans_sym s;
+  (void)nsymbs;  // NOTE(review): symb is not range-checked against nsymbs
+  assert(cdf);
+  s.cum_prob = symb > 0 ? cdf[symb - 1] : 0;  // symbol 0 starts at 0
+  s.prob = cdf[symb] - s.cum_prob;  // width of symb's interval
+  buf_rans_write(w, &s);
+#else
+  (void)w;
+  (void)symb;
+  (void)cdf;
+  (void)nsymbs;
+  assert(0 && "Unsupported bitwriter operation");  // needs CONFIG_ANS
+#endif
+}
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/aom_dsp/intrapred.c b/aom_dsp/intrapred.c
index 1e40e68..c3af1f4 100644
--- a/aom_dsp/intrapred.c
+++ b/aom_dsp/intrapred.c
@@ -837,6 +837,7 @@
 
 /* clang-format off */
 #define intra_pred_allsizes(type) \
+  intra_pred_sized(type, 2) \
   intra_pred_sized(type, 4) \
   intra_pred_sized(type, 8) \
   intra_pred_sized(type, 16) \
@@ -846,7 +847,7 @@
   intra_pred_highbd_sized(type, 16) \
   intra_pred_highbd_sized(type, 32)
 
-#define intra_pred_no_4x4(type) \
+#define intra_pred_above_4x4(type) \
   intra_pred_sized(type, 8) \
   intra_pred_sized(type, 16) \
   intra_pred_sized(type, 32) \
@@ -857,26 +858,27 @@
 
 #else
 #define intra_pred_allsizes(type) \
+  intra_pred_sized(type, 2) \
   intra_pred_sized(type, 4) \
   intra_pred_sized(type, 8) \
   intra_pred_sized(type, 16) \
   intra_pred_sized(type, 32)
 
-#define intra_pred_no_4x4(type) \
+#define intra_pred_above_4x4(type) \
   intra_pred_sized(type, 8) \
   intra_pred_sized(type, 16) \
   intra_pred_sized(type, 32)
 #endif  // CONFIG_AOM_HIGHBITDEPTH
 
-intra_pred_no_4x4(d207)
-intra_pred_no_4x4(d63)
-intra_pred_no_4x4(d45)
+intra_pred_above_4x4(d207)
+intra_pred_above_4x4(d63)
+intra_pred_above_4x4(d45)
 intra_pred_allsizes(d207e)
 intra_pred_allsizes(d63e)
-intra_pred_no_4x4(d45e)
-intra_pred_no_4x4(d117)
-intra_pred_no_4x4(d135)
-intra_pred_no_4x4(d153)
+intra_pred_above_4x4(d45e)
+intra_pred_above_4x4(d117)
+intra_pred_above_4x4(d135)
+intra_pred_above_4x4(d153)
 intra_pred_allsizes(v)
 intra_pred_allsizes(h)
 #if CONFIG_ALT_INTRA
diff --git a/aom_dsp/prob.h b/aom_dsp/prob.h
index 4f25b30..cd133e2 100644
--- a/aom_dsp/prob.h
+++ b/aom_dsp/prob.h
@@ -23,6 +23,9 @@
 
 typedef uint8_t aom_prob;
 
+// TODO(negge): Rename this to aom_prob once we remove vpxbool.
+typedef uint16_t aom_cdf_prob;
+
 #define MAX_PROB 255
 
 #define aom_prob_half ((aom_prob)128)
diff --git a/av1/common/av1_rtcd_defs.pl b/av1/common/av1_rtcd_defs.pl
index 8d6fabb..55aee8c 100644
--- a/av1/common/av1_rtcd_defs.pl
+++ b/av1/common/av1_rtcd_defs.pl
@@ -391,9 +391,6 @@
 add_proto qw/void av1_fht16x16/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
 specialize qw/av1_fht16x16 sse2 avx2/;
 
-add_proto qw/void av1_fht32x32/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
-specialize qw/av1_fht32x32/;
-
 if (aom_config("CONFIG_EXT_TX") eq "yes") {
   add_proto qw/void av1_fht4x8/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
   specialize qw/av1_fht4x8 sse2/;
@@ -412,6 +409,9 @@
 
   add_proto qw/void av1_fht32x16/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
   specialize qw/av1_fht32x16/;
+
+  add_proto qw/void av1_fht32x32/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
+  specialize qw/av1_fht32x32 avx2/;
 }
 
 if (aom_config("CONFIG_EMULATE_HARDWARE") eq "yes") {
diff --git a/av1/common/entropy.c b/av1/common/entropy.c
index d44cc99..1defc53 100644
--- a/av1/common/entropy.c
+++ b/av1/common/entropy.c
@@ -418,263 +418,264 @@
 // beta = 8
 // Values for tokens ONE_TOKEN through CATEGORY6_TOKEN included here.
 // ZERO_TOKEN and EOB_TOKEN are coded as flags outside this coder.
-const AnsP10 av1_pareto8_token_probs[COEFF_PROB_MODELS][ENTROPY_TOKENS - 2] = {
-  { 4, 4, 4, 4, 8, 15, 30, 57, 103, 795 },
-  { 8, 8, 8, 8, 15, 30, 57, 103, 168, 619 },
-  { 12, 12, 12, 12, 23, 43, 80, 138, 205, 487 },
-  { 16, 16, 15, 15, 30, 56, 101, 165, 225, 385 },
-  { 20, 20, 19, 19, 36, 68, 119, 186, 231, 306 },
-  { 24, 23, 23, 22, 43, 79, 135, 201, 230, 244 },
-  { 28, 27, 26, 26, 49, 89, 149, 211, 223, 196 },
-  { 32, 31, 30, 29, 55, 98, 160, 218, 212, 159 },
-  { 36, 35, 33, 32, 60, 107, 171, 221, 200, 129 },
-  { 40, 38, 37, 35, 66, 115, 179, 222, 187, 105 },
-  { 44, 42, 40, 38, 71, 122, 186, 221, 174, 86 },
-  { 48, 45, 43, 41, 76, 129, 192, 219, 160, 71 },
-  { 52, 49, 46, 44, 80, 136, 196, 215, 148, 58 },
-  { 56, 53, 49, 46, 85, 142, 200, 210, 135, 48 },
-  { 60, 56, 52, 49, 89, 147, 203, 204, 124, 40 },
-  { 64, 60, 55, 52, 93, 151, 205, 198, 113, 33 },
-  { 68, 63, 58, 54, 97, 156, 205, 192, 103, 28 },
-  { 72, 66, 61, 57, 100, 160, 206, 185, 94, 23 },
-  { 76, 70, 64, 59, 104, 163, 205, 178, 85, 20 },
-  { 80, 73, 67, 61, 107, 166, 205, 171, 77, 17 },
-  { 84, 76, 69, 63, 110, 169, 204, 164, 71, 14 },
-  { 88, 80, 72, 65, 113, 171, 202, 157, 64, 12 },
-  { 92, 83, 75, 67, 116, 173, 200, 150, 58, 10 },
-  { 96, 86, 77, 69, 118, 175, 198, 143, 53, 9 },
-  { 100, 89, 80, 71, 121, 176, 195, 137, 48, 7 },
-  { 104, 92, 82, 73, 123, 178, 192, 130, 44, 6 },
-  { 108, 96, 84, 75, 125, 178, 189, 124, 40, 5 },
-  { 112, 98, 87, 76, 127, 179, 186, 118, 36, 5 },
-  { 116, 101, 89, 78, 129, 179, 183, 112, 33, 4 },
-  { 120, 104, 91, 80, 131, 180, 179, 106, 30, 3 },
-  { 124, 107, 93, 81, 132, 180, 176, 101, 27, 3 },
-  { 128, 110, 95, 82, 134, 179, 172, 96, 25, 3 },
-  { 132, 113, 97, 84, 135, 179, 168, 91, 23, 2 },
-  { 136, 116, 99, 85, 136, 179, 164, 86, 21, 2 },
-  { 140, 119, 101, 86, 137, 178, 160, 82, 19, 2 },
-  { 144, 122, 103, 88, 138, 177, 157, 77, 17, 1 },
-  { 148, 124, 105, 89, 139, 176, 153, 73, 16, 1 },
-  { 152, 127, 107, 90, 140, 175, 149, 69, 14, 1 },
-  { 156, 130, 108, 91, 141, 173, 145, 66, 13, 1 },
-  { 160, 133, 110, 92, 141, 172, 141, 62, 12, 1 },
-  { 164, 135, 111, 93, 142, 171, 137, 59, 11, 1 },
-  { 168, 138, 113, 94, 142, 169, 133, 56, 10, 1 },
-  { 172, 140, 115, 94, 142, 168, 130, 53, 9, 1 },
-  { 176, 143, 116, 95, 143, 166, 126, 50, 8, 1 },
-  { 180, 145, 118, 96, 143, 164, 122, 47, 8, 1 },
-  { 184, 147, 119, 96, 143, 163, 119, 45, 7, 1 },
-  { 188, 150, 120, 97, 143, 161, 116, 42, 6, 1 },
-  { 192, 152, 121, 98, 143, 159, 112, 40, 6, 1 },
-  { 196, 155, 123, 98, 142, 157, 109, 38, 5, 1 },
-  { 200, 157, 124, 99, 142, 155, 105, 36, 5, 1 },
-  { 204, 159, 125, 99, 142, 153, 102, 34, 5, 1 },
-  { 208, 161, 126, 100, 142, 151, 99, 32, 4, 1 },
-  { 212, 164, 127, 100, 141, 149, 96, 30, 4, 1 },
-  { 216, 166, 129, 100, 141, 147, 93, 28, 3, 1 },
-  { 220, 168, 130, 101, 140, 144, 90, 27, 3, 1 },
-  { 224, 170, 131, 101, 140, 142, 87, 25, 3, 1 },
-  { 228, 172, 132, 101, 139, 140, 84, 24, 3, 1 },
-  { 232, 174, 132, 101, 139, 138, 81, 23, 3, 1 },
-  { 236, 176, 133, 101, 138, 136, 79, 22, 2, 1 },
-  { 240, 178, 134, 102, 137, 134, 76, 20, 2, 1 },
-  { 244, 180, 135, 102, 136, 131, 74, 19, 2, 1 },
-  { 248, 182, 135, 102, 136, 129, 71, 18, 2, 1 },
-  { 252, 184, 136, 101, 135, 127, 69, 17, 2, 1 },
-  { 256, 186, 137, 102, 134, 124, 66, 16, 2, 1 },
-  { 260, 188, 138, 102, 133, 122, 64, 15, 1, 1 },
-  { 264, 190, 138, 101, 132, 120, 62, 15, 1, 1 },
-  { 268, 191, 139, 101, 131, 118, 60, 14, 1, 1 },
-  { 272, 193, 139, 101, 130, 116, 58, 13, 1, 1 },
-  { 276, 195, 139, 101, 129, 114, 56, 12, 1, 1 },
-  { 280, 196, 140, 101, 128, 111, 54, 12, 1, 1 },
-  { 284, 198, 140, 101, 127, 109, 52, 11, 1, 1 },
-  { 288, 200, 141, 100, 126, 107, 50, 10, 1, 1 },
-  { 292, 201, 141, 100, 125, 105, 48, 10, 1, 1 },
-  { 296, 203, 141, 100, 123, 103, 47, 9, 1, 1 },
-  { 300, 204, 142, 99, 122, 101, 45, 9, 1, 1 },
-  { 304, 206, 142, 99, 121, 99, 43, 8, 1, 1 },
-  { 308, 207, 142, 99, 119, 97, 42, 8, 1, 1 },
-  { 312, 209, 142, 99, 118, 95, 40, 7, 1, 1 },
-  { 316, 210, 142, 98, 117, 93, 39, 7, 1, 1 },
-  { 320, 211, 142, 98, 116, 91, 37, 7, 1, 1 },
-  { 324, 213, 142, 97, 115, 89, 36, 6, 1, 1 },
-  { 328, 214, 142, 97, 113, 87, 35, 6, 1, 1 },
-  { 332, 215, 143, 96, 112, 85, 33, 6, 1, 1 },
-  { 336, 216, 143, 96, 111, 83, 32, 5, 1, 1 },
-  { 340, 218, 143, 95, 109, 81, 31, 5, 1, 1 },
-  { 344, 219, 142, 95, 108, 79, 30, 5, 1, 1 },
-  { 348, 220, 142, 94, 107, 78, 29, 4, 1, 1 },
-  { 352, 221, 142, 94, 105, 76, 28, 4, 1, 1 },
-  { 356, 222, 142, 93, 104, 74, 27, 4, 1, 1 },
-  { 360, 223, 142, 92, 103, 72, 26, 4, 1, 1 },
-  { 364, 224, 142, 92, 101, 70, 25, 4, 1, 1 },
-  { 368, 225, 142, 91, 100, 69, 24, 3, 1, 1 },
-  { 372, 226, 141, 91, 99, 67, 23, 3, 1, 1 },
-  { 376, 227, 141, 90, 97, 66, 22, 3, 1, 1 },
-  { 380, 228, 141, 89, 96, 64, 21, 3, 1, 1 },
-  { 384, 229, 140, 89, 95, 62, 20, 3, 1, 1 },
-  { 388, 229, 140, 88, 93, 61, 20, 3, 1, 1 },
-  { 392, 230, 140, 87, 92, 60, 19, 2, 1, 1 },
-  { 396, 231, 140, 86, 91, 58, 18, 2, 1, 1 },
-  { 400, 232, 139, 86, 89, 57, 17, 2, 1, 1 },
-  { 404, 232, 139, 85, 88, 55, 17, 2, 1, 1 },
-  { 408, 233, 138, 84, 87, 54, 16, 2, 1, 1 },
-  { 412, 234, 138, 84, 85, 52, 15, 2, 1, 1 },
-  { 416, 234, 137, 83, 84, 51, 15, 2, 1, 1 },
-  { 420, 235, 137, 82, 82, 50, 14, 2, 1, 1 },
-  { 424, 236, 136, 81, 81, 48, 14, 2, 1, 1 },
-  { 428, 236, 136, 81, 80, 47, 13, 1, 1, 1 },
-  { 432, 236, 135, 80, 79, 46, 13, 1, 1, 1 },
-  { 436, 237, 135, 79, 77, 45, 12, 1, 1, 1 },
-  { 440, 238, 134, 78, 76, 43, 12, 1, 1, 1 },
-  { 444, 238, 134, 77, 75, 42, 11, 1, 1, 1 },
-  { 448, 238, 133, 77, 73, 41, 11, 1, 1, 1 },
-  { 452, 239, 132, 76, 72, 40, 10, 1, 1, 1 },
-  { 456, 239, 131, 75, 71, 39, 10, 1, 1, 1 },
-  { 460, 239, 131, 74, 70, 38, 9, 1, 1, 1 },
-  { 464, 240, 130, 73, 68, 37, 9, 1, 1, 1 },
-  { 468, 240, 129, 72, 67, 36, 9, 1, 1, 1 },
-  { 472, 240, 128, 72, 66, 35, 8, 1, 1, 1 },
-  { 476, 240, 127, 71, 65, 34, 8, 1, 1, 1 },
-  { 480, 240, 127, 70, 63, 33, 8, 1, 1, 1 },
-  { 484, 241, 126, 69, 62, 32, 7, 1, 1, 1 },
-  { 488, 241, 125, 68, 61, 31, 7, 1, 1, 1 },
-  { 492, 241, 124, 67, 60, 30, 7, 1, 1, 1 },
-  { 496, 241, 124, 66, 59, 29, 6, 1, 1, 1 },
-  { 500, 240, 123, 66, 58, 28, 6, 1, 1, 1 },
-  { 504, 240, 122, 65, 57, 27, 6, 1, 1, 1 },
-  { 508, 240, 121, 64, 55, 27, 6, 1, 1, 1 },
-  { 512, 241, 120, 63, 54, 26, 5, 1, 1, 1 },
-  { 516, 241, 119, 62, 53, 25, 5, 1, 1, 1 },
-  { 520, 240, 118, 62, 52, 24, 5, 1, 1, 1 },
-  { 524, 240, 117, 60, 51, 24, 5, 1, 1, 1 },
-  { 528, 239, 116, 60, 50, 23, 5, 1, 1, 1 },
-  { 532, 239, 116, 59, 49, 22, 4, 1, 1, 1 },
-  { 536, 239, 115, 58, 48, 21, 4, 1, 1, 1 },
-  { 540, 239, 113, 57, 47, 21, 4, 1, 1, 1 },
-  { 544, 238, 113, 56, 46, 20, 4, 1, 1, 1 },
-  { 548, 238, 112, 55, 45, 19, 4, 1, 1, 1 },
-  { 552, 238, 110, 55, 44, 19, 3, 1, 1, 1 },
-  { 556, 237, 110, 54, 43, 18, 3, 1, 1, 1 },
-  { 560, 237, 108, 53, 42, 18, 3, 1, 1, 1 },
-  { 564, 236, 108, 52, 41, 17, 3, 1, 1, 1 },
-  { 568, 236, 106, 51, 40, 17, 3, 1, 1, 1 },
-  { 572, 235, 105, 51, 39, 16, 3, 1, 1, 1 },
-  { 576, 235, 104, 50, 38, 15, 3, 1, 1, 1 },
-  { 580, 234, 103, 49, 37, 15, 3, 1, 1, 1 },
-  { 584, 234, 102, 48, 37, 14, 2, 1, 1, 1 },
-  { 588, 233, 101, 47, 36, 14, 2, 1, 1, 1 },
-  { 592, 233, 100, 46, 35, 13, 2, 1, 1, 1 },
-  { 596, 231, 99, 46, 34, 13, 2, 1, 1, 1 },
-  { 600, 230, 98, 45, 33, 13, 2, 1, 1, 1 },
-  { 604, 230, 97, 44, 32, 12, 2, 1, 1, 1 },
-  { 608, 229, 96, 43, 31, 12, 2, 1, 1, 1 },
-  { 612, 228, 95, 42, 31, 11, 2, 1, 1, 1 },
-  { 616, 227, 93, 42, 30, 11, 2, 1, 1, 1 },
-  { 620, 227, 92, 41, 29, 10, 2, 1, 1, 1 },
-  { 624, 226, 92, 40, 28, 10, 1, 1, 1, 1 },
-  { 628, 225, 90, 39, 28, 10, 1, 1, 1, 1 },
-  { 632, 224, 89, 39, 27, 9, 1, 1, 1, 1 },
-  { 636, 223, 88, 38, 26, 9, 1, 1, 1, 1 },
-  { 640, 222, 87, 37, 25, 9, 1, 1, 1, 1 },
-  { 644, 221, 86, 36, 25, 8, 1, 1, 1, 1 },
-  { 648, 220, 84, 36, 24, 8, 1, 1, 1, 1 },
-  { 652, 219, 83, 35, 23, 8, 1, 1, 1, 1 },
-  { 656, 218, 82, 34, 23, 7, 1, 1, 1, 1 },
-  { 660, 217, 81, 33, 22, 7, 1, 1, 1, 1 },
-  { 664, 215, 80, 33, 21, 7, 1, 1, 1, 1 },
-  { 668, 214, 78, 32, 21, 7, 1, 1, 1, 1 },
-  { 672, 213, 78, 31, 20, 6, 1, 1, 1, 1 },
-  { 676, 211, 76, 31, 20, 6, 1, 1, 1, 1 },
-  { 680, 210, 75, 30, 19, 6, 1, 1, 1, 1 },
-  { 684, 209, 74, 29, 18, 6, 1, 1, 1, 1 },
-  { 688, 208, 73, 28, 18, 5, 1, 1, 1, 1 },
-  { 692, 206, 72, 28, 17, 5, 1, 1, 1, 1 },
-  { 696, 205, 70, 27, 17, 5, 1, 1, 1, 1 },
-  { 700, 203, 69, 27, 16, 5, 1, 1, 1, 1 },
-  { 704, 201, 68, 26, 16, 5, 1, 1, 1, 1 },
-  { 708, 201, 67, 25, 15, 4, 1, 1, 1, 1 },
-  { 712, 198, 66, 25, 15, 4, 1, 1, 1, 1 },
-  { 716, 197, 65, 24, 14, 4, 1, 1, 1, 1 },
-  { 720, 196, 63, 23, 14, 4, 1, 1, 1, 1 },
-  { 724, 194, 62, 23, 13, 4, 1, 1, 1, 1 },
-  { 728, 193, 61, 22, 13, 3, 1, 1, 1, 1 },
-  { 732, 191, 60, 22, 12, 3, 1, 1, 1, 1 },
-  { 736, 189, 59, 21, 12, 3, 1, 1, 1, 1 },
-  { 740, 188, 58, 20, 11, 3, 1, 1, 1, 1 },
-  { 744, 186, 56, 20, 11, 3, 1, 1, 1, 1 },
-  { 748, 184, 55, 19, 11, 3, 1, 1, 1, 1 },
-  { 752, 182, 54, 19, 10, 3, 1, 1, 1, 1 },
-  { 756, 181, 53, 18, 10, 2, 1, 1, 1, 1 },
-  { 760, 179, 52, 18, 9, 2, 1, 1, 1, 1 },
-  { 764, 177, 51, 17, 9, 2, 1, 1, 1, 1 },
-  { 768, 174, 50, 17, 9, 2, 1, 1, 1, 1 },
-  { 772, 173, 49, 16, 8, 2, 1, 1, 1, 1 },
-  { 776, 171, 47, 16, 8, 2, 1, 1, 1, 1 },
-  { 780, 169, 46, 15, 8, 2, 1, 1, 1, 1 },
-  { 784, 167, 45, 15, 7, 2, 1, 1, 1, 1 },
-  { 788, 165, 44, 14, 7, 2, 1, 1, 1, 1 },
-  { 792, 162, 43, 14, 7, 2, 1, 1, 1, 1 },
-  { 796, 161, 42, 13, 7, 1, 1, 1, 1, 1 },
-  { 800, 159, 41, 13, 6, 1, 1, 1, 1, 1 },
-  { 804, 157, 40, 12, 6, 1, 1, 1, 1, 1 },
-  { 808, 154, 39, 12, 6, 1, 1, 1, 1, 1 },
-  { 812, 153, 38, 11, 5, 1, 1, 1, 1, 1 },
-  { 816, 150, 37, 11, 5, 1, 1, 1, 1, 1 },
-  { 820, 148, 36, 10, 5, 1, 1, 1, 1, 1 },
-  { 824, 145, 35, 10, 5, 1, 1, 1, 1, 1 },
-  { 828, 143, 34, 10, 4, 1, 1, 1, 1, 1 },
-  { 832, 141, 33, 9, 4, 1, 1, 1, 1, 1 },
-  { 836, 138, 32, 9, 4, 1, 1, 1, 1, 1 },
-  { 840, 136, 30, 9, 4, 1, 1, 1, 1, 1 },
-  { 844, 133, 30, 8, 4, 1, 1, 1, 1, 1 },
-  { 848, 131, 29, 8, 3, 1, 1, 1, 1, 1 },
-  { 852, 129, 28, 7, 3, 1, 1, 1, 1, 1 },
-  { 856, 126, 27, 7, 3, 1, 1, 1, 1, 1 },
-  { 860, 123, 26, 7, 3, 1, 1, 1, 1, 1 },
-  { 864, 121, 25, 6, 3, 1, 1, 1, 1, 1 },
-  { 868, 118, 24, 6, 3, 1, 1, 1, 1, 1 },
-  { 872, 116, 23, 6, 2, 1, 1, 1, 1, 1 },
-  { 876, 113, 22, 6, 2, 1, 1, 1, 1, 1 },
-  { 880, 111, 21, 5, 2, 1, 1, 1, 1, 1 },
-  { 884, 108, 20, 5, 2, 1, 1, 1, 1, 1 },
-  { 888, 105, 19, 5, 2, 1, 1, 1, 1, 1 },
-  { 892, 102, 19, 4, 2, 1, 1, 1, 1, 1 },
-  { 896, 99, 18, 4, 2, 1, 1, 1, 1, 1 },
-  { 900, 97, 17, 4, 1, 1, 1, 1, 1, 1 },
-  { 904, 94, 16, 4, 1, 1, 1, 1, 1, 1 },
-  { 908, 92, 15, 3, 1, 1, 1, 1, 1, 1 },
-  { 912, 89, 14, 3, 1, 1, 1, 1, 1, 1 },
-  { 916, 85, 14, 3, 1, 1, 1, 1, 1, 1 },
-  { 920, 82, 13, 3, 1, 1, 1, 1, 1, 1 },
-  { 924, 79, 12, 3, 1, 1, 1, 1, 1, 1 },
-  { 928, 77, 11, 2, 1, 1, 1, 1, 1, 1 },
-  { 932, 73, 11, 2, 1, 1, 1, 1, 1, 1 },
-  { 936, 70, 10, 2, 1, 1, 1, 1, 1, 1 },
-  { 940, 67, 9, 2, 1, 1, 1, 1, 1, 1 },
-  { 944, 64, 8, 2, 1, 1, 1, 1, 1, 1 },
-  { 948, 60, 8, 2, 1, 1, 1, 1, 1, 1 },
-  { 952, 58, 7, 1, 1, 1, 1, 1, 1, 1 },
-  { 956, 54, 7, 1, 1, 1, 1, 1, 1, 1 },
-  { 960, 51, 6, 1, 1, 1, 1, 1, 1, 1 },
-  { 964, 48, 5, 1, 1, 1, 1, 1, 1, 1 },
-  { 968, 44, 5, 1, 1, 1, 1, 1, 1, 1 },
-  { 972, 41, 4, 1, 1, 1, 1, 1, 1, 1 },
-  { 976, 37, 4, 1, 1, 1, 1, 1, 1, 1 },
-  { 980, 34, 3, 1, 1, 1, 1, 1, 1, 1 },
-  { 984, 30, 3, 1, 1, 1, 1, 1, 1, 1 },
-  { 988, 27, 2, 1, 1, 1, 1, 1, 1, 1 },
-  { 992, 23, 2, 1, 1, 1, 1, 1, 1, 1 },
-  { 996, 19, 2, 1, 1, 1, 1, 1, 1, 1 },
-  { 1000, 16, 1, 1, 1, 1, 1, 1, 1, 1 },
-  { 1004, 12, 1, 1, 1, 1, 1, 1, 1, 1 },
-  { 1008, 8, 1, 1, 1, 1, 1, 1, 1, 1 },
-  { 1012, 4, 1, 1, 1, 1, 1, 1, 1, 1 },
-  { 1015, 1, 1, 1, 1, 1, 1, 1, 1, 1 },
-  { 1015, 1, 1, 1, 1, 1, 1, 1, 1, 1 },
-};
+const aom_cdf_prob
+    av1_pareto8_token_probs[COEFF_PROB_MODELS][ENTROPY_TOKENS - 2] = {
+      { 4, 4, 4, 4, 8, 15, 30, 57, 103, 795 },
+      { 8, 8, 8, 8, 15, 30, 57, 103, 168, 619 },
+      { 12, 12, 12, 12, 23, 43, 80, 138, 205, 487 },
+      { 16, 16, 15, 15, 30, 56, 101, 165, 225, 385 },
+      { 20, 20, 19, 19, 36, 68, 119, 186, 231, 306 },
+      { 24, 23, 23, 22, 43, 79, 135, 201, 230, 244 },
+      { 28, 27, 26, 26, 49, 89, 149, 211, 223, 196 },
+      { 32, 31, 30, 29, 55, 98, 160, 218, 212, 159 },
+      { 36, 35, 33, 32, 60, 107, 171, 221, 200, 129 },
+      { 40, 38, 37, 35, 66, 115, 179, 222, 187, 105 },
+      { 44, 42, 40, 38, 71, 122, 186, 221, 174, 86 },
+      { 48, 45, 43, 41, 76, 129, 192, 219, 160, 71 },
+      { 52, 49, 46, 44, 80, 136, 196, 215, 148, 58 },
+      { 56, 53, 49, 46, 85, 142, 200, 210, 135, 48 },
+      { 60, 56, 52, 49, 89, 147, 203, 204, 124, 40 },
+      { 64, 60, 55, 52, 93, 151, 205, 198, 113, 33 },
+      { 68, 63, 58, 54, 97, 156, 205, 192, 103, 28 },
+      { 72, 66, 61, 57, 100, 160, 206, 185, 94, 23 },
+      { 76, 70, 64, 59, 104, 163, 205, 178, 85, 20 },
+      { 80, 73, 67, 61, 107, 166, 205, 171, 77, 17 },
+      { 84, 76, 69, 63, 110, 169, 204, 164, 71, 14 },
+      { 88, 80, 72, 65, 113, 171, 202, 157, 64, 12 },
+      { 92, 83, 75, 67, 116, 173, 200, 150, 58, 10 },
+      { 96, 86, 77, 69, 118, 175, 198, 143, 53, 9 },
+      { 100, 89, 80, 71, 121, 176, 195, 137, 48, 7 },
+      { 104, 92, 82, 73, 123, 178, 192, 130, 44, 6 },
+      { 108, 96, 84, 75, 125, 178, 189, 124, 40, 5 },
+      { 112, 98, 87, 76, 127, 179, 186, 118, 36, 5 },
+      { 116, 101, 89, 78, 129, 179, 183, 112, 33, 4 },
+      { 120, 104, 91, 80, 131, 180, 179, 106, 30, 3 },
+      { 124, 107, 93, 81, 132, 180, 176, 101, 27, 3 },
+      { 128, 110, 95, 82, 134, 179, 172, 96, 25, 3 },
+      { 132, 113, 97, 84, 135, 179, 168, 91, 23, 2 },
+      { 136, 116, 99, 85, 136, 179, 164, 86, 21, 2 },
+      { 140, 119, 101, 86, 137, 178, 160, 82, 19, 2 },
+      { 144, 122, 103, 88, 138, 177, 157, 77, 17, 1 },
+      { 148, 124, 105, 89, 139, 176, 153, 73, 16, 1 },
+      { 152, 127, 107, 90, 140, 175, 149, 69, 14, 1 },
+      { 156, 130, 108, 91, 141, 173, 145, 66, 13, 1 },
+      { 160, 133, 110, 92, 141, 172, 141, 62, 12, 1 },
+      { 164, 135, 111, 93, 142, 171, 137, 59, 11, 1 },
+      { 168, 138, 113, 94, 142, 169, 133, 56, 10, 1 },
+      { 172, 140, 115, 94, 142, 168, 130, 53, 9, 1 },
+      { 176, 143, 116, 95, 143, 166, 126, 50, 8, 1 },
+      { 180, 145, 118, 96, 143, 164, 122, 47, 8, 1 },
+      { 184, 147, 119, 96, 143, 163, 119, 45, 7, 1 },
+      { 188, 150, 120, 97, 143, 161, 116, 42, 6, 1 },
+      { 192, 152, 121, 98, 143, 159, 112, 40, 6, 1 },
+      { 196, 155, 123, 98, 142, 157, 109, 38, 5, 1 },
+      { 200, 157, 124, 99, 142, 155, 105, 36, 5, 1 },
+      { 204, 159, 125, 99, 142, 153, 102, 34, 5, 1 },
+      { 208, 161, 126, 100, 142, 151, 99, 32, 4, 1 },
+      { 212, 164, 127, 100, 141, 149, 96, 30, 4, 1 },
+      { 216, 166, 129, 100, 141, 147, 93, 28, 3, 1 },
+      { 220, 168, 130, 101, 140, 144, 90, 27, 3, 1 },
+      { 224, 170, 131, 101, 140, 142, 87, 25, 3, 1 },
+      { 228, 172, 132, 101, 139, 140, 84, 24, 3, 1 },
+      { 232, 174, 132, 101, 139, 138, 81, 23, 3, 1 },
+      { 236, 176, 133, 101, 138, 136, 79, 22, 2, 1 },
+      { 240, 178, 134, 102, 137, 134, 76, 20, 2, 1 },
+      { 244, 180, 135, 102, 136, 131, 74, 19, 2, 1 },
+      { 248, 182, 135, 102, 136, 129, 71, 18, 2, 1 },
+      { 252, 184, 136, 101, 135, 127, 69, 17, 2, 1 },
+      { 256, 186, 137, 102, 134, 124, 66, 16, 2, 1 },
+      { 260, 188, 138, 102, 133, 122, 64, 15, 1, 1 },
+      { 264, 190, 138, 101, 132, 120, 62, 15, 1, 1 },
+      { 268, 191, 139, 101, 131, 118, 60, 14, 1, 1 },
+      { 272, 193, 139, 101, 130, 116, 58, 13, 1, 1 },
+      { 276, 195, 139, 101, 129, 114, 56, 12, 1, 1 },
+      { 280, 196, 140, 101, 128, 111, 54, 12, 1, 1 },
+      { 284, 198, 140, 101, 127, 109, 52, 11, 1, 1 },
+      { 288, 200, 141, 100, 126, 107, 50, 10, 1, 1 },
+      { 292, 201, 141, 100, 125, 105, 48, 10, 1, 1 },
+      { 296, 203, 141, 100, 123, 103, 47, 9, 1, 1 },
+      { 300, 204, 142, 99, 122, 101, 45, 9, 1, 1 },
+      { 304, 206, 142, 99, 121, 99, 43, 8, 1, 1 },
+      { 308, 207, 142, 99, 119, 97, 42, 8, 1, 1 },
+      { 312, 209, 142, 99, 118, 95, 40, 7, 1, 1 },
+      { 316, 210, 142, 98, 117, 93, 39, 7, 1, 1 },
+      { 320, 211, 142, 98, 116, 91, 37, 7, 1, 1 },
+      { 324, 213, 142, 97, 115, 89, 36, 6, 1, 1 },
+      { 328, 214, 142, 97, 113, 87, 35, 6, 1, 1 },
+      { 332, 215, 143, 96, 112, 85, 33, 6, 1, 1 },
+      { 336, 216, 143, 96, 111, 83, 32, 5, 1, 1 },
+      { 340, 218, 143, 95, 109, 81, 31, 5, 1, 1 },
+      { 344, 219, 142, 95, 108, 79, 30, 5, 1, 1 },
+      { 348, 220, 142, 94, 107, 78, 29, 4, 1, 1 },
+      { 352, 221, 142, 94, 105, 76, 28, 4, 1, 1 },
+      { 356, 222, 142, 93, 104, 74, 27, 4, 1, 1 },
+      { 360, 223, 142, 92, 103, 72, 26, 4, 1, 1 },
+      { 364, 224, 142, 92, 101, 70, 25, 4, 1, 1 },
+      { 368, 225, 142, 91, 100, 69, 24, 3, 1, 1 },
+      { 372, 226, 141, 91, 99, 67, 23, 3, 1, 1 },
+      { 376, 227, 141, 90, 97, 66, 22, 3, 1, 1 },
+      { 380, 228, 141, 89, 96, 64, 21, 3, 1, 1 },
+      { 384, 229, 140, 89, 95, 62, 20, 3, 1, 1 },
+      { 388, 229, 140, 88, 93, 61, 20, 3, 1, 1 },
+      { 392, 230, 140, 87, 92, 60, 19, 2, 1, 1 },
+      { 396, 231, 140, 86, 91, 58, 18, 2, 1, 1 },
+      { 400, 232, 139, 86, 89, 57, 17, 2, 1, 1 },
+      { 404, 232, 139, 85, 88, 55, 17, 2, 1, 1 },
+      { 408, 233, 138, 84, 87, 54, 16, 2, 1, 1 },
+      { 412, 234, 138, 84, 85, 52, 15, 2, 1, 1 },
+      { 416, 234, 137, 83, 84, 51, 15, 2, 1, 1 },
+      { 420, 235, 137, 82, 82, 50, 14, 2, 1, 1 },
+      { 424, 236, 136, 81, 81, 48, 14, 2, 1, 1 },
+      { 428, 236, 136, 81, 80, 47, 13, 1, 1, 1 },
+      { 432, 236, 135, 80, 79, 46, 13, 1, 1, 1 },
+      { 436, 237, 135, 79, 77, 45, 12, 1, 1, 1 },
+      { 440, 238, 134, 78, 76, 43, 12, 1, 1, 1 },
+      { 444, 238, 134, 77, 75, 42, 11, 1, 1, 1 },
+      { 448, 238, 133, 77, 73, 41, 11, 1, 1, 1 },
+      { 452, 239, 132, 76, 72, 40, 10, 1, 1, 1 },
+      { 456, 239, 131, 75, 71, 39, 10, 1, 1, 1 },
+      { 460, 239, 131, 74, 70, 38, 9, 1, 1, 1 },
+      { 464, 240, 130, 73, 68, 37, 9, 1, 1, 1 },
+      { 468, 240, 129, 72, 67, 36, 9, 1, 1, 1 },
+      { 472, 240, 128, 72, 66, 35, 8, 1, 1, 1 },
+      { 476, 240, 127, 71, 65, 34, 8, 1, 1, 1 },
+      { 480, 240, 127, 70, 63, 33, 8, 1, 1, 1 },
+      { 484, 241, 126, 69, 62, 32, 7, 1, 1, 1 },
+      { 488, 241, 125, 68, 61, 31, 7, 1, 1, 1 },
+      { 492, 241, 124, 67, 60, 30, 7, 1, 1, 1 },
+      { 496, 241, 124, 66, 59, 29, 6, 1, 1, 1 },
+      { 500, 240, 123, 66, 58, 28, 6, 1, 1, 1 },
+      { 504, 240, 122, 65, 57, 27, 6, 1, 1, 1 },
+      { 508, 240, 121, 64, 55, 27, 6, 1, 1, 1 },
+      { 512, 241, 120, 63, 54, 26, 5, 1, 1, 1 },
+      { 516, 241, 119, 62, 53, 25, 5, 1, 1, 1 },
+      { 520, 240, 118, 62, 52, 24, 5, 1, 1, 1 },
+      { 524, 240, 117, 60, 51, 24, 5, 1, 1, 1 },
+      { 528, 239, 116, 60, 50, 23, 5, 1, 1, 1 },
+      { 532, 239, 116, 59, 49, 22, 4, 1, 1, 1 },
+      { 536, 239, 115, 58, 48, 21, 4, 1, 1, 1 },
+      { 540, 239, 113, 57, 47, 21, 4, 1, 1, 1 },
+      { 544, 238, 113, 56, 46, 20, 4, 1, 1, 1 },
+      { 548, 238, 112, 55, 45, 19, 4, 1, 1, 1 },
+      { 552, 238, 110, 55, 44, 19, 3, 1, 1, 1 },
+      { 556, 237, 110, 54, 43, 18, 3, 1, 1, 1 },
+      { 560, 237, 108, 53, 42, 18, 3, 1, 1, 1 },
+      { 564, 236, 108, 52, 41, 17, 3, 1, 1, 1 },
+      { 568, 236, 106, 51, 40, 17, 3, 1, 1, 1 },
+      { 572, 235, 105, 51, 39, 16, 3, 1, 1, 1 },
+      { 576, 235, 104, 50, 38, 15, 3, 1, 1, 1 },
+      { 580, 234, 103, 49, 37, 15, 3, 1, 1, 1 },
+      { 584, 234, 102, 48, 37, 14, 2, 1, 1, 1 },
+      { 588, 233, 101, 47, 36, 14, 2, 1, 1, 1 },
+      { 592, 233, 100, 46, 35, 13, 2, 1, 1, 1 },
+      { 596, 231, 99, 46, 34, 13, 2, 1, 1, 1 },
+      { 600, 230, 98, 45, 33, 13, 2, 1, 1, 1 },
+      { 604, 230, 97, 44, 32, 12, 2, 1, 1, 1 },
+      { 608, 229, 96, 43, 31, 12, 2, 1, 1, 1 },
+      { 612, 228, 95, 42, 31, 11, 2, 1, 1, 1 },
+      { 616, 227, 93, 42, 30, 11, 2, 1, 1, 1 },
+      { 620, 227, 92, 41, 29, 10, 2, 1, 1, 1 },
+      { 624, 226, 92, 40, 28, 10, 1, 1, 1, 1 },
+      { 628, 225, 90, 39, 28, 10, 1, 1, 1, 1 },
+      { 632, 224, 89, 39, 27, 9, 1, 1, 1, 1 },
+      { 636, 223, 88, 38, 26, 9, 1, 1, 1, 1 },
+      { 640, 222, 87, 37, 25, 9, 1, 1, 1, 1 },
+      { 644, 221, 86, 36, 25, 8, 1, 1, 1, 1 },
+      { 648, 220, 84, 36, 24, 8, 1, 1, 1, 1 },
+      { 652, 219, 83, 35, 23, 8, 1, 1, 1, 1 },
+      { 656, 218, 82, 34, 23, 7, 1, 1, 1, 1 },
+      { 660, 217, 81, 33, 22, 7, 1, 1, 1, 1 },
+      { 664, 215, 80, 33, 21, 7, 1, 1, 1, 1 },
+      { 668, 214, 78, 32, 21, 7, 1, 1, 1, 1 },
+      { 672, 213, 78, 31, 20, 6, 1, 1, 1, 1 },
+      { 676, 211, 76, 31, 20, 6, 1, 1, 1, 1 },
+      { 680, 210, 75, 30, 19, 6, 1, 1, 1, 1 },
+      { 684, 209, 74, 29, 18, 6, 1, 1, 1, 1 },
+      { 688, 208, 73, 28, 18, 5, 1, 1, 1, 1 },
+      { 692, 206, 72, 28, 17, 5, 1, 1, 1, 1 },
+      { 696, 205, 70, 27, 17, 5, 1, 1, 1, 1 },
+      { 700, 203, 69, 27, 16, 5, 1, 1, 1, 1 },
+      { 704, 201, 68, 26, 16, 5, 1, 1, 1, 1 },
+      { 708, 201, 67, 25, 15, 4, 1, 1, 1, 1 },
+      { 712, 198, 66, 25, 15, 4, 1, 1, 1, 1 },
+      { 716, 197, 65, 24, 14, 4, 1, 1, 1, 1 },
+      { 720, 196, 63, 23, 14, 4, 1, 1, 1, 1 },
+      { 724, 194, 62, 23, 13, 4, 1, 1, 1, 1 },
+      { 728, 193, 61, 22, 13, 3, 1, 1, 1, 1 },
+      { 732, 191, 60, 22, 12, 3, 1, 1, 1, 1 },
+      { 736, 189, 59, 21, 12, 3, 1, 1, 1, 1 },
+      { 740, 188, 58, 20, 11, 3, 1, 1, 1, 1 },
+      { 744, 186, 56, 20, 11, 3, 1, 1, 1, 1 },
+      { 748, 184, 55, 19, 11, 3, 1, 1, 1, 1 },
+      { 752, 182, 54, 19, 10, 3, 1, 1, 1, 1 },
+      { 756, 181, 53, 18, 10, 2, 1, 1, 1, 1 },
+      { 760, 179, 52, 18, 9, 2, 1, 1, 1, 1 },
+      { 764, 177, 51, 17, 9, 2, 1, 1, 1, 1 },
+      { 768, 174, 50, 17, 9, 2, 1, 1, 1, 1 },
+      { 772, 173, 49, 16, 8, 2, 1, 1, 1, 1 },
+      { 776, 171, 47, 16, 8, 2, 1, 1, 1, 1 },
+      { 780, 169, 46, 15, 8, 2, 1, 1, 1, 1 },
+      { 784, 167, 45, 15, 7, 2, 1, 1, 1, 1 },
+      { 788, 165, 44, 14, 7, 2, 1, 1, 1, 1 },
+      { 792, 162, 43, 14, 7, 2, 1, 1, 1, 1 },
+      { 796, 161, 42, 13, 7, 1, 1, 1, 1, 1 },
+      { 800, 159, 41, 13, 6, 1, 1, 1, 1, 1 },
+      { 804, 157, 40, 12, 6, 1, 1, 1, 1, 1 },
+      { 808, 154, 39, 12, 6, 1, 1, 1, 1, 1 },
+      { 812, 153, 38, 11, 5, 1, 1, 1, 1, 1 },
+      { 816, 150, 37, 11, 5, 1, 1, 1, 1, 1 },
+      { 820, 148, 36, 10, 5, 1, 1, 1, 1, 1 },
+      { 824, 145, 35, 10, 5, 1, 1, 1, 1, 1 },
+      { 828, 143, 34, 10, 4, 1, 1, 1, 1, 1 },
+      { 832, 141, 33, 9, 4, 1, 1, 1, 1, 1 },
+      { 836, 138, 32, 9, 4, 1, 1, 1, 1, 1 },
+      { 840, 136, 30, 9, 4, 1, 1, 1, 1, 1 },
+      { 844, 133, 30, 8, 4, 1, 1, 1, 1, 1 },
+      { 848, 131, 29, 8, 3, 1, 1, 1, 1, 1 },
+      { 852, 129, 28, 7, 3, 1, 1, 1, 1, 1 },
+      { 856, 126, 27, 7, 3, 1, 1, 1, 1, 1 },
+      { 860, 123, 26, 7, 3, 1, 1, 1, 1, 1 },
+      { 864, 121, 25, 6, 3, 1, 1, 1, 1, 1 },
+      { 868, 118, 24, 6, 3, 1, 1, 1, 1, 1 },
+      { 872, 116, 23, 6, 2, 1, 1, 1, 1, 1 },
+      { 876, 113, 22, 6, 2, 1, 1, 1, 1, 1 },
+      { 880, 111, 21, 5, 2, 1, 1, 1, 1, 1 },
+      { 884, 108, 20, 5, 2, 1, 1, 1, 1, 1 },
+      { 888, 105, 19, 5, 2, 1, 1, 1, 1, 1 },
+      { 892, 102, 19, 4, 2, 1, 1, 1, 1, 1 },
+      { 896, 99, 18, 4, 2, 1, 1, 1, 1, 1 },
+      { 900, 97, 17, 4, 1, 1, 1, 1, 1, 1 },
+      { 904, 94, 16, 4, 1, 1, 1, 1, 1, 1 },
+      { 908, 92, 15, 3, 1, 1, 1, 1, 1, 1 },
+      { 912, 89, 14, 3, 1, 1, 1, 1, 1, 1 },
+      { 916, 85, 14, 3, 1, 1, 1, 1, 1, 1 },
+      { 920, 82, 13, 3, 1, 1, 1, 1, 1, 1 },
+      { 924, 79, 12, 3, 1, 1, 1, 1, 1, 1 },
+      { 928, 77, 11, 2, 1, 1, 1, 1, 1, 1 },
+      { 932, 73, 11, 2, 1, 1, 1, 1, 1, 1 },
+      { 936, 70, 10, 2, 1, 1, 1, 1, 1, 1 },
+      { 940, 67, 9, 2, 1, 1, 1, 1, 1, 1 },
+      { 944, 64, 8, 2, 1, 1, 1, 1, 1, 1 },
+      { 948, 60, 8, 2, 1, 1, 1, 1, 1, 1 },
+      { 952, 58, 7, 1, 1, 1, 1, 1, 1, 1 },
+      { 956, 54, 7, 1, 1, 1, 1, 1, 1, 1 },
+      { 960, 51, 6, 1, 1, 1, 1, 1, 1, 1 },
+      { 964, 48, 5, 1, 1, 1, 1, 1, 1, 1 },
+      { 968, 44, 5, 1, 1, 1, 1, 1, 1, 1 },
+      { 972, 41, 4, 1, 1, 1, 1, 1, 1, 1 },
+      { 976, 37, 4, 1, 1, 1, 1, 1, 1, 1 },
+      { 980, 34, 3, 1, 1, 1, 1, 1, 1, 1 },
+      { 984, 30, 3, 1, 1, 1, 1, 1, 1, 1 },
+      { 988, 27, 2, 1, 1, 1, 1, 1, 1, 1 },
+      { 992, 23, 2, 1, 1, 1, 1, 1, 1, 1 },
+      { 996, 19, 2, 1, 1, 1, 1, 1, 1, 1 },
+      { 1000, 16, 1, 1, 1, 1, 1, 1, 1, 1 },
+      { 1004, 12, 1, 1, 1, 1, 1, 1, 1, 1 },
+      { 1008, 8, 1, 1, 1, 1, 1, 1, 1, 1 },
+      { 1012, 4, 1, 1, 1, 1, 1, 1, 1, 1 },
+      { 1015, 1, 1, 1, 1, 1, 1, 1, 1, 1 },
+      { 1015, 1, 1, 1, 1, 1, 1, 1, 1, 1 },
+    };
 #endif  // CONFIG_ANS
 
 /* clang-format off */
@@ -2801,6 +2802,15 @@
 }
 
 #if CONFIG_ANS
+// Builds the token cdf as the running sum of the pivot-selected Pareto pdf.
+static void build_token_cdfs(const aom_prob *pdf_model,
+                             aom_cdf_prob cdf[ENTROPY_TOKENS]) {
+  int i, sum = 0;
+  assert(pdf_model[PIVOT_NODE] != 0);  // row index below is pivot - 1
+  for (i = 0; i < ENTROPY_TOKENS - 2; ++i)  // last two entries untouched
+    cdf[i] = sum += av1_pareto8_token_probs[pdf_model[PIVOT_NODE] - 1][i];
+}
+
 void av1_coef_pareto_cdfs(FRAME_CONTEXT *fc) {
   TX_SIZE t;
   int i, j, k, l;
@@ -2809,11 +2819,8 @@
       for (j = 0; j < REF_TYPES; ++j)
         for (k = 0; k < COEF_BANDS; ++k)
           for (l = 0; l < BAND_COEFF_CONTEXTS(k); ++l) {
-            const aom_prob *const tree_probs = fc->coef_probs[t][i][j][k][l];
-            aom_prob pivot = tree_probs[PIVOT_NODE];
-            assert(pivot != 0);
-            aom_rans_build_cdf_from_pdf(av1_pareto8_token_probs[pivot - 1],
-                                        fc->coef_cdfs[t][i][j][k][l]);
+            build_token_cdfs(fc->coef_probs[t][i][j][k][l],
+                             fc->coef_cdfs[t][i][j][k][l]);
           }
 }
 #endif  // CONFIG_ANS
diff --git a/av1/common/entropy.h b/av1/common/entropy.h
index f0727c0..fd68e82 100644
--- a/av1/common/entropy.h
+++ b/av1/common/entropy.h
@@ -191,10 +191,10 @@
 extern const aom_tree_index av1_coef_con_tree[TREE_SIZE(ENTROPY_TOKENS)];
 extern const aom_prob av1_pareto8_full[COEFF_PROB_MODELS][MODEL_NODES];
 #if CONFIG_ANS
-extern const AnsP10 av1_pareto8_token_probs[COEFF_PROB_MODELS]
-                                           [ENTROPY_TOKENS - 2];
-
-typedef rans_lut coeff_cdf_model[REF_TYPES][COEF_BANDS][COEFF_CONTEXTS];
+typedef aom_cdf_prob coeff_cdf_model[REF_TYPES][COEF_BANDS][COEFF_CONTEXTS]
+                                    [ENTROPY_TOKENS];
+extern const aom_cdf_prob av1_pareto8_token_probs[COEFF_PROB_MODELS]
+                                                 [ENTROPY_TOKENS - 2];
 #endif  // CONFIG_ANS
 
 typedef aom_prob av1_coeff_probs_model[REF_TYPES][COEF_BANDS][COEFF_CONTEXTS]
diff --git a/av1/common/idct.c b/av1/common/idct.c
index 328f360..eedbc79 100644
--- a/av1/common/idct.c
+++ b/av1/common/idct.c
@@ -33,6 +33,9 @@
   return txsize_sqr_up_map[tx_size] == TX_32X32;
 }
 
+// NOTE: The implementations of all inverses need to be aware of the fact
+// that input and output could be the same buffer.
+
 #if CONFIG_EXT_TX
 static void iidtx4_c(const tran_low_t *input, tran_low_t *output) {
   int i;
@@ -56,17 +59,17 @@
   for (i = 0; i < 32; ++i) output[i] = input[i] * 4;
 }
 
-// For use in lieu of DST
+// For use in lieu of ADST
 static void ihalfright32_c(const tran_low_t *input, tran_low_t *output) {
   int i;
   tran_low_t inputhalf[16];
-  for (i = 0; i < 16; ++i) {
-    output[i] = input[16 + i] * 4;
-  }
   // Multiply input by sqrt(2)
   for (i = 0; i < 16; ++i) {
     inputhalf[i] = (tran_low_t)dct_const_round_shift(input[i] * Sqrt2);
   }
+  for (i = 0; i < 16; ++i) {
+    output[i] = input[16 + i] * 4;
+  }
   idct16_c(inputhalf, output + 16);
   // Note overall scaling factor is 4 times orthogonal
 }
@@ -106,14 +109,14 @@
                                   int bd) {
   int i;
   tran_low_t inputhalf[16];
-  for (i = 0; i < 16; ++i) {
-    output[i] = input[16 + i] * 4;
-  }
   // Multiply input by sqrt(2)
   for (i = 0; i < 16; ++i) {
     inputhalf[i] =
         HIGHBD_WRAPLOW(highbd_dct_const_round_shift(input[i] * Sqrt2), bd);
   }
+  for (i = 0; i < 16; ++i) {
+    output[i] = input[16 + i] * 4;
+  }
   aom_highbd_idct16_c(inputhalf, output + 16, bd);
   // Note overall scaling factor is 4 times orthogonal
 }
diff --git a/av1/decoder/detokenize.c b/av1/decoder/detokenize.c
index 6a0769c..9a40f69 100644
--- a/av1/decoder/detokenize.c
+++ b/av1/decoder/detokenize.c
@@ -75,9 +75,9 @@
       fc->coef_probs[tx_size_ctx][type][ref];
   const aom_prob *prob;
 #if CONFIG_ANS
-  const rans_lut(*coef_cdfs)[COEFF_CONTEXTS] =
+  const aom_cdf_prob(*const coef_cdfs)[COEFF_CONTEXTS][ENTROPY_TOKENS] =
       fc->coef_cdfs[tx_size_ctx][type][ref];
-  const rans_lut *cdf;
+  const aom_cdf_prob(*cdf)[ENTROPY_TOKENS];
 #endif  // CONFIG_ANS
   unsigned int(*coef_counts)[COEFF_CONTEXTS][UNCONSTRAINED_NODES + 1];
   unsigned int(*eob_branch_count)[COEFF_CONTEXTS];
@@ -166,7 +166,8 @@
     }
 #if CONFIG_ANS
     cdf = &coef_cdfs[band][ctx];
-    token = ONE_TOKEN + rans_read(r, *cdf);
+    token =
+        ONE_TOKEN + aom_read_symbol(r, *cdf, CATEGORY6_TOKEN - ONE_TOKEN + 1);
     INCREMENT_COUNT(ONE_TOKEN + (token > ONE_TOKEN));
     switch (token) {
       case ONE_TOKEN:
diff --git a/av1/encoder/bitstream.c b/av1/encoder/bitstream.c
index 91eeeaa..aaffebb 100644
--- a/av1/encoder/bitstream.c
+++ b/av1/encoder/bitstream.c
@@ -657,11 +657,8 @@
       aom_write(w, t != ZERO_TOKEN, p->context_tree[1]);
 
       if (t != ZERO_TOKEN) {
-        struct rans_sym s;
-        const rans_lut *token_cdf = p->token_cdf;
-        s.cum_prob = (*token_cdf)[t - ONE_TOKEN];
-        s.prob = (*token_cdf)[t - ONE_TOKEN + 1] - s.cum_prob;
-        buf_rans_write(w, &s);
+        aom_write_symbol(w, t - ONE_TOKEN, *p->token_cdf,
+                         CATEGORY6_TOKEN - ONE_TOKEN + 1);
       }
     }
 #else
diff --git a/av1/encoder/encoder.c b/av1/encoder/encoder.c
index 983f8cc..90b0416 100644
--- a/av1/encoder/encoder.c
+++ b/av1/encoder/encoder.c
@@ -3086,15 +3086,15 @@
     cpi->lst_fb_idxes[ref_frame] = cpi->lst_fb_idxes[ref_frame - 1];
 
     // [0] is allocated to the current coded frame. The statistics for the
-    // reference frames start at [1].
+    // reference frames start at [LAST_FRAME], i.e. [1].
     if (!cpi->rc.is_src_frame_alt_ref) {
-      memcpy(cpi->interp_filter_selected[ref_frame + 1],
-             cpi->interp_filter_selected[ref_frame],
-             sizeof(cpi->interp_filter_selected[ref_frame]));
+      memcpy(cpi->interp_filter_selected[ref_frame + LAST_FRAME],
+             cpi->interp_filter_selected[ref_frame - 1 + LAST_FRAME],
+             sizeof(cpi->interp_filter_selected[ref_frame - 1 + LAST_FRAME]));
     }
   }
 }
-#endif
+#endif  // CONFIG_EXT_REFS
 
 void av1_update_reference_frames(AV1_COMP *cpi) {
   AV1_COMMON *const cm = &cpi->common;
@@ -3181,14 +3181,12 @@
     int tmp = cpi->lst_fb_idxes[LAST_REF_FRAMES - 1];
 
     shift_last_ref_frames(cpi);
-
     cpi->lst_fb_idxes[0] = cpi->bwd_fb_idx;
-    if (!cpi->rc.is_src_frame_alt_ref) {
-      memcpy(cpi->interp_filter_selected[0],
-             cpi->interp_filter_selected[BWDREF_FRAME],
-             sizeof(cpi->interp_filter_selected[BWDREF_FRAME]));
-    }
     cpi->bwd_fb_idx = tmp;
+
+    memcpy(cpi->interp_filter_selected[LAST_FRAME],
+           cpi->interp_filter_selected[BWDREF_FRAME],
+           sizeof(cpi->interp_filter_selected[BWDREF_FRAME]));
   } else if (cpi->rc.is_src_frame_ext_arf && cm->show_existing_frame) {
     // Deal with the special case for showing existing internal ALTREF_FRAME
     // Refresh the LAST_FRAME with the ALTREF_FRAME and retire the LAST3_FRAME
@@ -3198,15 +3196,15 @@
     int tmp = cpi->lst_fb_idxes[LAST_REF_FRAMES - 1];
 
     shift_last_ref_frames(cpi);
-
     cpi->lst_fb_idxes[0] = cpi->alt_fb_idx;
+    cpi->alt_fb_idx = tmp;
+
+    // We need to modify the mapping accordingly
+    cpi->arf_map[which_arf] = cpi->alt_fb_idx;
+
     memcpy(cpi->interp_filter_selected[LAST_FRAME],
            cpi->interp_filter_selected[ALTREF_FRAME + which_arf],
            sizeof(cpi->interp_filter_selected[ALTREF_FRAME + which_arf]));
-
-    cpi->alt_fb_idx = tmp;
-    // We need to modify the mapping accordingly
-    cpi->arf_map[which_arf] = cpi->alt_fb_idx;
 #endif     // CONFIG_EXT_REFS
   } else { /* For non key/golden frames */
     if (cpi->refresh_alt_ref_frame) {
@@ -3241,22 +3239,12 @@
         uref_cnt_fb(cpi->upsampled_ref_bufs,
                     &cpi->upsampled_ref_idx[cpi->gld_fb_idx], new_uidx);
 
-      if (!cpi->rc.is_src_frame_alt_ref) {
+#if !CONFIG_EXT_REFS
+      if (!cpi->rc.is_src_frame_alt_ref)
+#endif  // !CONFIG_EXT_REFS
         memcpy(cpi->interp_filter_selected[GOLDEN_FRAME],
                cpi->interp_filter_selected[0],
                sizeof(cpi->interp_filter_selected[0]));
-      } else {
-        int which_arf = 0;
-#if CONFIG_EXT_REFS
-        if (cpi->oxcf.pass == 2) {
-          const GF_GROUP *const gf_group = &cpi->twopass.gf_group;
-          which_arf = gf_group->arf_update_idx[gf_group->index];
-        }
-#endif  // CONFIG_EXT_REFS
-        memcpy(cpi->interp_filter_selected[GOLDEN_FRAME],
-               cpi->interp_filter_selected[ALTREF_FRAME + which_arf],
-               sizeof(cpi->interp_filter_selected[ALTREF_FRAME + which_arf]));
-      }
     }
 
 #if CONFIG_EXT_REFS
@@ -3271,6 +3259,7 @@
         cpi->alt_fb_idx = cpi->bwd_fb_idx;
         cpi->bwd_fb_idx = tmp;
       }
+
       ref_cnt_fb(pool->frame_bufs, &cm->ref_frame_map[cpi->bwd_fb_idx],
                  cm->new_fb_idx);
       if (use_upsampled_ref)
@@ -3354,20 +3343,14 @@
       tmp = cpi->lst_fb_idxes[LAST_REF_FRAMES - 1];
 
       shift_last_ref_frames(cpi);
-
       cpi->lst_fb_idxes[0] = tmp;
 
-      if (!cpi->rc.is_src_frame_alt_ref) {
-        if (cm->show_existing_frame) {
-          memcpy(cpi->interp_filter_selected[LAST_FRAME],
-                 cpi->interp_filter_selected[BWDREF_FRAME],
-                 sizeof(cpi->interp_filter_selected[BWDREF_FRAME]));
-        } else {
-          memcpy(cpi->interp_filter_selected[LAST_FRAME],
-                 cpi->interp_filter_selected[0],
-                 sizeof(cpi->interp_filter_selected[0]));
-        }
-      }
+      assert(cm->show_existing_frame == 0);
+      // NOTE: Currently only LF_UPDATE and INTNL_OVERLAY_UPDATE frames are to
+      //       refresh the LAST_FRAME.
+      memcpy(cpi->interp_filter_selected[LAST_FRAME],
+             cpi->interp_filter_selected[0],
+             sizeof(cpi->interp_filter_selected[0]));
     }
 #else
     ref_cnt_fb(pool->frame_bufs, &cm->ref_frame_map[cpi->lst_fb_idx],
diff --git a/av1/encoder/firstpass.c b/av1/encoder/firstpass.c
index 5821d3f..9fdf540 100644
--- a/av1/encoder/firstpass.c
+++ b/av1/encoder/firstpass.c
@@ -2585,10 +2585,11 @@
       if (cpi->num_extra_arfs) {
         int tmp = cpi->bwd_fb_idx;
 
-        cpi->rc.is_bwd_ref_frame = 1;
         cpi->bwd_fb_idx = cpi->alt_fb_idx;
         cpi->alt_fb_idx = cpi->arf_map[0];
         cpi->arf_map[0] = tmp;
+
+        cpi->rc.is_bwd_ref_frame = 1;
       } else {
         cpi->rc.is_bwd_ref_frame = 0;
       }
@@ -2639,11 +2640,13 @@
         // NOTE: The indices will be swapped back after this frame is encoded
         //       (in av1_update_reference_frames()).
         int tmp = cpi->bwd_fb_idx;
+
         cpi->bwd_fb_idx = cpi->alt_fb_idx;
         cpi->alt_fb_idx = cpi->arf_map[0];
         cpi->arf_map[0] = tmp;
       }
       break;
+
     case LAST_BIPRED_UPDATE:
       cpi->refresh_last_frame = 0;
       cpi->refresh_golden_frame = 0;
diff --git a/av1/encoder/hybrid_fwd_txfm.c b/av1/encoder/hybrid_fwd_txfm.c
index 9589a48..1103c4b 100644
--- a/av1/encoder/hybrid_fwd_txfm.c
+++ b/av1/encoder/hybrid_fwd_txfm.c
@@ -180,16 +180,14 @@
     case FLIPADST_FLIPADST:
     case ADST_FLIPADST:
     case FLIPADST_ADST:
-      av1_fht32x32_c(src_diff, coeff, diff_stride, tx_type);
+      av1_fht32x32(src_diff, coeff, diff_stride, tx_type);
       break;
     case V_DCT:
     case H_DCT:
     case V_ADST:
     case H_ADST:
     case V_FLIPADST:
-    case H_FLIPADST:
-      av1_fht32x32_c(src_diff, coeff, diff_stride, tx_type);
-      break;
+    case H_FLIPADST: av1_fht32x32(src_diff, coeff, diff_stride, tx_type); break;
     case IDTX: av1_fwd_idtx_c(src_diff, coeff, diff_stride, 32, tx_type); break;
 #endif  // CONFIG_EXT_TX
     default: assert(0); break;
diff --git a/av1/encoder/tokenize.c b/av1/encoder/tokenize.c
index d659607..8095681 100644
--- a/av1/encoder/tokenize.c
+++ b/av1/encoder/tokenize.c
@@ -387,7 +387,7 @@
 
 static INLINE void add_token(TOKENEXTRA **t, const aom_prob *context_tree,
 #if CONFIG_ANS
-                             const rans_lut *token_cdf,
+                             const aom_cdf_prob (*token_cdf)[ENTROPY_TOKENS],
 #endif  // CONFIG_ANS
                              int32_t extra, uint8_t token,
                              uint8_t skip_eob_node, unsigned int *counts) {
@@ -402,17 +402,6 @@
   ++counts[token];
 }
 
-static INLINE void add_token_no_extra(TOKENEXTRA **t,
-                                      const aom_prob *context_tree,
-                                      uint8_t token, uint8_t skip_eob_node,
-                                      unsigned int *counts) {
-  (*t)->token = token;
-  (*t)->context_tree = context_tree;
-  (*t)->skip_eob_node = skip_eob_node;
-  (*t)++;
-  ++counts[token];
-}
-
 static INLINE int get_tx_eob(const struct segmentation *seg, int segment_id,
                              TX_SIZE tx_size) {
   const int eob_max = num_4x4_blocks_txsize_lookup[tx_size] << 4;
@@ -498,8 +487,8 @@
       cpi->common.fc->coef_probs[txsize_sqr_map[tx_size]][type][ref];
 #endif  // CONFIG_ENTROPY
 #if CONFIG_ANS
-  rans_lut(*const coef_cdfs)[COEFF_CONTEXTS] =
-      cpi->common.fc->coef_cdfs[txsize_sqr_map[tx_size]][type][ref];
+  aom_cdf_prob(*const coef_cdfs)[COEFF_CONTEXTS][ENTROPY_TOKENS] =
+      cpi->common.fc->coef_cdfs[txsize_sqr_map[tx_size]][type][ref];
 #endif  // CONFIG_ANS
   unsigned int(*const eob_branch)[COEFF_CONTEXTS] =
       td->counts->eob_branch[txsize_sqr_map[tx_size]][type][ref];
@@ -522,7 +511,7 @@
 
     add_token(&t, coef_probs[band[c]][pt],
 #if CONFIG_ANS
-              (const rans_lut *)&coef_cdfs[band[c]][pt],
+              (const aom_cdf_prob(*)[ENTROPY_TOKENS]) & coef_cdfs[band[c]][pt],
 #endif  // CONFIG_ANS
               extra, (uint8_t)token, (uint8_t)skip_eob, counts[band[c]][pt]);
 
@@ -532,8 +521,11 @@
     skip_eob = (token == ZERO_TOKEN);
   }
   if (c < seg_eob) {
-    add_token_no_extra(&t, coef_probs[band[c]][pt], EOB_TOKEN, 0,
-                       counts[band[c]][pt]);
+    add_token(&t, coef_probs[band[c]][pt],
+#if CONFIG_ANS
+              NULL,  // the EOB token is emitted without a token cdf
+#endif  // CONFIG_ANS
+              0, EOB_TOKEN, 0, counts[band[c]][pt]);
     ++eob_branch[band[c]][pt];
   }
 
diff --git a/av1/encoder/tokenize.h b/av1/encoder/tokenize.h
index 520e1b6..f20848a 100644
--- a/av1/encoder/tokenize.h
+++ b/av1/encoder/tokenize.h
@@ -37,7 +37,7 @@
 typedef struct {
   const aom_prob *context_tree;
 #if CONFIG_ANS
-  const rans_lut *token_cdf;
+  const aom_cdf_prob (*token_cdf)[ENTROPY_TOKENS];
 #endif  // CONFIG_ANS
   EXTRABIT extra;
   uint8_t token;
diff --git a/av1/encoder/x86/hybrid_fwd_txfm_avx2.c b/av1/encoder/x86/hybrid_fwd_txfm_avx2.c
index b23d39d..69bf89a 100644
--- a/av1/encoder/x86/hybrid_fwd_txfm_avx2.c
+++ b/av1/encoder/x86/hybrid_fwd_txfm_avx2.c
@@ -25,8 +25,7 @@
   *u = _mm256_permute2x128_si256(v, v, 1);
 }
 
-void aom_fdct16x16_1_avx2(const int16_t *input, tran_low_t *output,
-                          int stride) {
+static int32_t get_16x16_sum(const int16_t *input, int stride) {
   __m256i r0, r1, r2, r3, u0, u1;
   __m256i zero = _mm256_setzero_si256();
   __m256i sum = _mm256_setzero_si256();
@@ -61,8 +60,14 @@
                      _mm256_castsi256_si128(u1));
   v1 = _mm_srli_si128(v0, 4);
   v0 = _mm_add_epi32(v0, v1);
-  v0 = _mm_srai_epi32(v0, 1);
-  output[0] = (tran_low_t)_mm_extract_epi32(v0, 0);
+  return (int32_t)_mm_extract_epi32(v0, 0);
+}
+
+// Writes only output[0]: get_16x16_sum(input, stride) shifted right by 1.
+void aom_fdct16x16_1_avx2(const int16_t *input, tran_low_t *output,
+                          int stride) {
+  output[0] = (tran_low_t)(get_16x16_sum(input, stride) >> 1);
+  _mm256_zeroupper();
 }
 
 static void mm256_transpose_16x16(__m256i *in) {
@@ -559,8 +564,6 @@
   x1 = _mm256_unpackhi_epi16(u3, u4);
   in[13] = butter_fly(x0, x1, cospi_p06_p26);
   in[3] = butter_fly(x0, x1, cospi_m26_p06);
-
-  mm256_transpose_16x16(in);
 }
 
 void fadst16_avx2(__m256i *in) {
@@ -1105,8 +1108,6 @@
   in[3] = _mm256_sub_epi16(zero, x4);
   in[13] = _mm256_sub_epi16(zero, x13);
   in[15] = _mm256_sub_epi16(zero, x1);
-
-  mm256_transpose_16x16(in);
 }
 
 #if CONFIG_EXT_TX
@@ -1134,7 +1135,6 @@
     in[i] = _mm256_packs_epi32(u0, u1);
     i++;
   }
-  mm256_transpose_16x16(in);
 }
 #endif
 
@@ -1146,24 +1146,28 @@
     case DCT_DCT:
       load_buffer_16x16(input, stride, 0, 0, in);
       fdct16_avx2(in);
+      mm256_transpose_16x16(in);
       right_shift_16x16(in);
       fdct16_avx2(in);
       break;
     case ADST_DCT:
       load_buffer_16x16(input, stride, 0, 0, in);
       fadst16_avx2(in);
+      mm256_transpose_16x16(in);
       right_shift_16x16(in);
       fdct16_avx2(in);
       break;
     case DCT_ADST:
       load_buffer_16x16(input, stride, 0, 0, in);
       fdct16_avx2(in);
+      mm256_transpose_16x16(in);
       right_shift_16x16(in);
       fadst16_avx2(in);
       break;
     case ADST_ADST:
       load_buffer_16x16(input, stride, 0, 0, in);
       fadst16_avx2(in);
+      mm256_transpose_16x16(in);
       right_shift_16x16(in);
       fadst16_avx2(in);
       break;
@@ -1171,71 +1175,698 @@
     case FLIPADST_DCT:
       load_buffer_16x16(input, stride, 1, 0, in);
       fadst16_avx2(in);
+      mm256_transpose_16x16(in);
       right_shift_16x16(in);
       fdct16_avx2(in);
       break;
     case DCT_FLIPADST:
       load_buffer_16x16(input, stride, 0, 1, in);
       fdct16_avx2(in);
+      mm256_transpose_16x16(in);
       right_shift_16x16(in);
       fadst16_avx2(in);
       break;
     case FLIPADST_FLIPADST:
       load_buffer_16x16(input, stride, 1, 1, in);
       fadst16_avx2(in);
+      mm256_transpose_16x16(in);
       right_shift_16x16(in);
       fadst16_avx2(in);
       break;
     case ADST_FLIPADST:
       load_buffer_16x16(input, stride, 0, 1, in);
       fadst16_avx2(in);
+      mm256_transpose_16x16(in);
       right_shift_16x16(in);
       fadst16_avx2(in);
       break;
     case FLIPADST_ADST:
       load_buffer_16x16(input, stride, 1, 0, in);
       fadst16_avx2(in);
+      mm256_transpose_16x16(in);
       right_shift_16x16(in);
       fadst16_avx2(in);
       break;
     case V_DCT:
       load_buffer_16x16(input, stride, 0, 0, in);
       fdct16_avx2(in);
+      mm256_transpose_16x16(in);
       right_shift_16x16(in);
       fidtx16_avx2(in);
       break;
     case H_DCT:
       load_buffer_16x16(input, stride, 0, 0, in);
       fidtx16_avx2(in);
+      mm256_transpose_16x16(in);
       right_shift_16x16(in);
       fdct16_avx2(in);
       break;
     case V_ADST:
       load_buffer_16x16(input, stride, 0, 0, in);
       fadst16_avx2(in);
+      mm256_transpose_16x16(in);
       right_shift_16x16(in);
       fidtx16_avx2(in);
       break;
     case H_ADST:
       load_buffer_16x16(input, stride, 0, 0, in);
       fidtx16_avx2(in);
+      mm256_transpose_16x16(in);
       right_shift_16x16(in);
       fadst16_avx2(in);
       break;
     case V_FLIPADST:
       load_buffer_16x16(input, stride, 1, 0, in);
       fadst16_avx2(in);
+      mm256_transpose_16x16(in);
       right_shift_16x16(in);
       fidtx16_avx2(in);
       break;
     case H_FLIPADST:
       load_buffer_16x16(input, stride, 0, 1, in);
       fidtx16_avx2(in);
+      mm256_transpose_16x16(in);
       right_shift_16x16(in);
       fadst16_avx2(in);
       break;
 #endif  // CONFIG_EXT_TX
     default: assert(0); break;
   }
+  mm256_transpose_16x16(in);
   write_buffer_16x16(in, 16, output);
+  _mm256_zeroupper();
+}
+
+// Writes only output[0]: the get_16x16_sum() results of the four 16x16
+// quadrants of the 32x32 block, added together and shifted right by 3.
+void aom_fdct32x32_1_avx2(const int16_t *input, tran_low_t *output,
+                          int stride) {
+  const int16_t *const top_left = input;
+  const int16_t *const bottom_left = input + (stride << 4);
+  int32_t total = 0;
+
+  total += get_16x16_sum(top_left, stride);
+  total += get_16x16_sum(top_left + 16, stride);
+  total += get_16x16_sum(bottom_left, stride);
+  total += get_16x16_sum(bottom_left + 16, stride);
+  output[0] = (tran_low_t)(total >> 3);
+  _mm256_zeroupper();
+}
+
+#if CONFIG_EXT_TX
+// Exchanges the first |size| vectors of a0 with the first |size| vectors of
+// a1, one element at a time in ascending index order.
+static void mm256_vectors_swap(__m256i *a0, __m256i *a1, const int size) {
+  int idx;
+  for (idx = 0; idx < size; ++idx) {
+    const __m256i saved = a0[idx];
+    a0[idx] = a1[idx];
+    a1[idx] = saved;
+  }
+}
+
+// Transposes each 16x16 tile in place, then swaps in0[16..31] with in1[0..15].
+static void mm256_transpose_32x32(__m256i *in0, __m256i *in1) {
+  int half;
+  for (half = 0; half < 32; half += 16) mm256_transpose_16x16(in0 + half);
+  for (half = 0; half < 32; half += 16) mm256_transpose_16x16(in1 + half);
+  mm256_vectors_swap(in0 + 16, in1, 16);
+}
+
+// Folds 32 vectors into the 16 lane-wise sums of mirrored pairs.
+//
+// Pairs are taken from the ends inwards:
+//   even[k] = in[k] + in[31 - k],  k = 0 .. 15
+// so even[0] uses the two outermost vectors and even[15] the two middle.
+// The 16-bit addition wraps (no saturation).
+//
+// in:   32 vectors, only read.
+// even: 16 vectors, all written, in ascending index order.
+static void prepare_16x16_even(const __m256i *in, __m256i *even) {
+  int k;
+
+  for (k = 0; k < 16; ++k) {
+    const __m256i head = in[k];
+    const __m256i tail = in[31 - k];
+    even[k] = _mm256_add_epi16(head, tail);
+  }
+}
+
+// Folds 32 vectors into the 16 lane-wise differences of mirrored pairs.
+//
+// Pairs are taken from the centre outwards:
+//   odd[k] = in[15 - k] - in[16 + k],  k = 0 .. 15
+// so odd[0] uses the two middle vectors and odd[15] the two outermost.
+// The 16-bit subtraction wraps (no saturation).
+//
+// in:  32 vectors, only read.
+// odd: 16 vectors, all written, in ascending index order.
+static void prepare_16x16_odd(const __m256i *in, __m256i *odd) {
+  int k;
+
+  for (k = 0; k < 16; ++k) {
+    const __m256i lower = in[15 - k];
+    const __m256i upper = in[16 + k];
+    odd[k] = _mm256_sub_epi16(lower, upper);
+  }
+}
+
+static void collect_16col(const __m256i *even, const __m256i *odd,
+                          __m256i *out) {
+  // Interleave: even[k] -> out[2 * k] (fdct16_avx2() already maps the output)
+  out[0] = even[0];
+  out[2] = even[1];
+  out[4] = even[2];
+  out[6] = even[3];
+  out[8] = even[4];
+  out[10] = even[5];
+  out[12] = even[6];
+  out[14] = even[7];
+  out[16] = even[8];
+  out[18] = even[9];
+  out[20] = even[10];
+  out[22] = even[11];
+  out[24] = even[12];
+  out[26] = even[13];
+  out[28] = even[14];
+  out[30] = even[15];
+
+  out[1] = odd[0];  // odd[k] -> out[2 * rev4(k) + 1]; rev4: 4-bit reversal
+  out[17] = odd[1];
+  out[9] = odd[2];
+  out[25] = odd[3];
+  out[5] = odd[4];
+  out[21] = odd[5];
+  out[13] = odd[6];
+  out[29] = odd[7];
+  out[3] = odd[8];
+  out[19] = odd[9];
+  out[11] = odd[10];
+  out[27] = odd[11];
+  out[7] = odd[12];
+  out[23] = odd[13];
+  out[15] = odd[14];
+  out[31] = odd[15];
+}
+
+static void collect_coeffs(const __m256i *first_16col_even,
+                           const __m256i *first_16col_odd,
+                           const __m256i *second_16col_even,
+                           const __m256i *second_16col_odd, __m256i *in0,
+                           __m256i *in1) {
+  collect_16col(first_16col_even, first_16col_odd, in0);  // first set -> in0
+  collect_16col(second_16col_even, second_16col_odd, in1);  // into in1
+}
+
+static void fdct16_odd_avx2(__m256i *in) {  // in-place on in[0..15]
+  // Naming: cospi_<L>_<H> = pair(L, H), L first; p/m prefix = +/- sign.
+  const __m256i cospi_p16_p16 = pair256_set_epi16(cospi_16_64, cospi_16_64);
+  const __m256i cospi_m16_p16 = pair256_set_epi16(-cospi_16_64, cospi_16_64);
+  const __m256i cospi_m08_p24 = pair256_set_epi16(-cospi_8_64, cospi_24_64);
+  const __m256i cospi_p24_p08 = pair256_set_epi16(cospi_24_64, cospi_8_64);
+  const __m256i cospi_m24_m08 = pair256_set_epi16(-cospi_24_64, -cospi_8_64);
+  const __m256i cospi_m04_p28 = pair256_set_epi16(-cospi_4_64, cospi_28_64);
+  const __m256i cospi_p28_p04 = pair256_set_epi16(cospi_28_64, cospi_4_64);
+  const __m256i cospi_m28_m04 = pair256_set_epi16(-cospi_28_64, -cospi_4_64);
+  const __m256i cospi_m20_p12 = pair256_set_epi16(-cospi_20_64, cospi_12_64);
+  const __m256i cospi_p12_p20 = pair256_set_epi16(cospi_12_64, cospi_20_64);
+  const __m256i cospi_m12_m20 = pair256_set_epi16(-cospi_12_64, -cospi_20_64);
+
+  const __m256i cospi_p31_p01 = pair256_set_epi16(cospi_31_64, cospi_1_64);
+  const __m256i cospi_m01_p31 = pair256_set_epi16(-cospi_1_64, cospi_31_64);
+  const __m256i cospi_p15_p17 = pair256_set_epi16(cospi_15_64, cospi_17_64);
+  const __m256i cospi_m17_p15 = pair256_set_epi16(-cospi_17_64, cospi_15_64);
+  const __m256i cospi_p23_p09 = pair256_set_epi16(cospi_23_64, cospi_9_64);
+  const __m256i cospi_m09_p23 = pair256_set_epi16(-cospi_9_64, cospi_23_64);
+  const __m256i cospi_p07_p25 = pair256_set_epi16(cospi_7_64, cospi_25_64);
+  const __m256i cospi_m25_p07 = pair256_set_epi16(-cospi_25_64, cospi_7_64);
+  const __m256i cospi_p27_p05 = pair256_set_epi16(cospi_27_64, cospi_5_64);
+  const __m256i cospi_m05_p27 = pair256_set_epi16(-cospi_5_64, cospi_27_64);
+  const __m256i cospi_p11_p21 = pair256_set_epi16(cospi_11_64, cospi_21_64);
+  const __m256i cospi_m21_p11 = pair256_set_epi16(-cospi_21_64, cospi_11_64);
+  const __m256i cospi_p19_p13 = pair256_set_epi16(cospi_19_64, cospi_13_64);
+  const __m256i cospi_m13_p19 = pair256_set_epi16(-cospi_13_64, cospi_19_64);
+  const __m256i cospi_p03_p29 = pair256_set_epi16(cospi_3_64, cospi_29_64);
+  const __m256i cospi_m29_p03 = pair256_set_epi16(-cospi_29_64, cospi_3_64);
+
+  __m256i x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15;
+  __m256i y0, y1, y2, y3, y4, y5, y6, y7, y8, y9, y10, y11, y12, y13, y14, y15;
+  __m256i u0, u1;
+
+  // stage 1 is in prepare_16x16_odd()
+
+  // stage 2: butterflies on (4,11) (5,10) (6,9) (7,8); 0-3, 12-15 pass through
+  y0 = in[0];
+  y1 = in[1];
+  y2 = in[2];
+  y3 = in[3];
+
+  u0 = _mm256_unpacklo_epi16(in[4], in[11]);
+  u1 = _mm256_unpackhi_epi16(in[4], in[11]);
+  y4 = butter_fly(u0, u1, cospi_m16_p16);
+  y11 = butter_fly(u0, u1, cospi_p16_p16);
+
+  u0 = _mm256_unpacklo_epi16(in[5], in[10]);
+  u1 = _mm256_unpackhi_epi16(in[5], in[10]);
+  y5 = butter_fly(u0, u1, cospi_m16_p16);
+  y10 = butter_fly(u0, u1, cospi_p16_p16);
+
+  u0 = _mm256_unpacklo_epi16(in[6], in[9]);
+  u1 = _mm256_unpackhi_epi16(in[6], in[9]);
+  y6 = butter_fly(u0, u1, cospi_m16_p16);
+  y9 = butter_fly(u0, u1, cospi_p16_p16);
+
+  u0 = _mm256_unpacklo_epi16(in[7], in[8]);
+  u1 = _mm256_unpackhi_epi16(in[7], in[8]);
+  y7 = butter_fly(u0, u1, cospi_m16_p16);
+  y8 = butter_fly(u0, u1, cospi_p16_p16);
+
+  y12 = in[12];
+  y13 = in[13];
+  y14 = in[14];
+  y15 = in[15];
+
+  // stage 3: 16-bit add/sub of pairs (k, 7-k) and (8+k, 15-k)
+  x0 = _mm256_add_epi16(y0, y7);
+  x1 = _mm256_add_epi16(y1, y6);
+  x2 = _mm256_add_epi16(y2, y5);
+  x3 = _mm256_add_epi16(y3, y4);
+  x4 = _mm256_sub_epi16(y3, y4);
+  x5 = _mm256_sub_epi16(y2, y5);
+  x6 = _mm256_sub_epi16(y1, y6);
+  x7 = _mm256_sub_epi16(y0, y7);
+  x8 = _mm256_sub_epi16(y15, y8);
+  x9 = _mm256_sub_epi16(y14, y9);
+  x10 = _mm256_sub_epi16(y13, y10);
+  x11 = _mm256_sub_epi16(y12, y11);
+  x12 = _mm256_add_epi16(y12, y11);
+  x13 = _mm256_add_epi16(y13, y10);
+  x14 = _mm256_add_epi16(y14, y9);
+  x15 = _mm256_add_epi16(y15, y8);
+
+  // stage 4: butterflies on (2,13) (3,12) (4,11) (5,10); the rest pass through
+  y0 = x0;
+  y1 = x1;
+  y6 = x6;
+  y7 = x7;
+  y8 = x8;
+  y9 = x9;
+  y14 = x14;
+  y15 = x15;
+
+  u0 = _mm256_unpacklo_epi16(x2, x13);
+  u1 = _mm256_unpackhi_epi16(x2, x13);
+  y2 = butter_fly(u0, u1, cospi_m08_p24);
+  y13 = butter_fly(u0, u1, cospi_p24_p08);
+
+  u0 = _mm256_unpacklo_epi16(x3, x12);
+  u1 = _mm256_unpackhi_epi16(x3, x12);
+  y3 = butter_fly(u0, u1, cospi_m08_p24);
+  y12 = butter_fly(u0, u1, cospi_p24_p08);
+
+  u0 = _mm256_unpacklo_epi16(x4, x11);
+  u1 = _mm256_unpackhi_epi16(x4, x11);
+  y4 = butter_fly(u0, u1, cospi_m24_m08);
+  y11 = butter_fly(u0, u1, cospi_m08_p24);
+
+  u0 = _mm256_unpacklo_epi16(x5, x10);
+  u1 = _mm256_unpackhi_epi16(x5, x10);
+  y5 = butter_fly(u0, u1, cospi_m24_m08);
+  y10 = butter_fly(u0, u1, cospi_m08_p24);
+
+  // stage 5: 16-bit add/sub within each group of four (0-3, 4-7, 8-11, 12-15)
+  x0 = _mm256_add_epi16(y0, y3);
+  x1 = _mm256_add_epi16(y1, y2);
+  x2 = _mm256_sub_epi16(y1, y2);
+  x3 = _mm256_sub_epi16(y0, y3);
+  x4 = _mm256_sub_epi16(y7, y4);
+  x5 = _mm256_sub_epi16(y6, y5);
+  x6 = _mm256_add_epi16(y6, y5);
+  x7 = _mm256_add_epi16(y7, y4);
+
+  x8 = _mm256_add_epi16(y8, y11);
+  x9 = _mm256_add_epi16(y9, y10);
+  x10 = _mm256_sub_epi16(y9, y10);
+  x11 = _mm256_sub_epi16(y8, y11);
+  x12 = _mm256_sub_epi16(y15, y12);
+  x13 = _mm256_sub_epi16(y14, y13);
+  x14 = _mm256_add_epi16(y14, y13);
+  x15 = _mm256_add_epi16(y15, y12);
+
+  // stage 6: butterflies on (1,14) (2,13) (5,10) (6,9); the rest pass through
+  y0 = x0;
+  y3 = x3;
+  y4 = x4;
+  y7 = x7;
+  y8 = x8;
+  y11 = x11;
+  y12 = x12;
+  y15 = x15;
+
+  u0 = _mm256_unpacklo_epi16(x1, x14);
+  u1 = _mm256_unpackhi_epi16(x1, x14);
+  y1 = butter_fly(u0, u1, cospi_m04_p28);
+  y14 = butter_fly(u0, u1, cospi_p28_p04);
+
+  u0 = _mm256_unpacklo_epi16(x2, x13);
+  u1 = _mm256_unpackhi_epi16(x2, x13);
+  y2 = butter_fly(u0, u1, cospi_m28_m04);
+  y13 = butter_fly(u0, u1, cospi_m04_p28);
+
+  u0 = _mm256_unpacklo_epi16(x5, x10);
+  u1 = _mm256_unpackhi_epi16(x5, x10);
+  y5 = butter_fly(u0, u1, cospi_m20_p12);
+  y10 = butter_fly(u0, u1, cospi_p12_p20);
+
+  u0 = _mm256_unpacklo_epi16(x6, x9);
+  u1 = _mm256_unpackhi_epi16(x6, x9);
+  y6 = butter_fly(u0, u1, cospi_m12_m20);
+  y9 = butter_fly(u0, u1, cospi_m20_p12);
+
+  // stage 7: 16-bit add/sub of adjacent pairs (2k, 2k+1)
+  x0 = _mm256_add_epi16(y0, y1);
+  x1 = _mm256_sub_epi16(y0, y1);
+  x2 = _mm256_sub_epi16(y3, y2);
+  x3 = _mm256_add_epi16(y3, y2);
+  x4 = _mm256_add_epi16(y4, y5);
+  x5 = _mm256_sub_epi16(y4, y5);
+  x6 = _mm256_sub_epi16(y7, y6);
+  x7 = _mm256_add_epi16(y7, y6);
+
+  x8 = _mm256_add_epi16(y8, y9);
+  x9 = _mm256_sub_epi16(y8, y9);
+  x10 = _mm256_sub_epi16(y11, y10);
+  x11 = _mm256_add_epi16(y11, y10);
+  x12 = _mm256_add_epi16(y12, y13);
+  x13 = _mm256_sub_epi16(y12, y13);
+  x14 = _mm256_sub_epi16(y15, y14);
+  x15 = _mm256_add_epi16(y15, y14);
+
+  // stage 8: butterflies on (k, 15-k), written back to in[k] and in[15-k]
+  u0 = _mm256_unpacklo_epi16(x0, x15);
+  u1 = _mm256_unpackhi_epi16(x0, x15);
+  in[0] = butter_fly(u0, u1, cospi_p31_p01);
+  in[15] = butter_fly(u0, u1, cospi_m01_p31);
+
+  u0 = _mm256_unpacklo_epi16(x1, x14);
+  u1 = _mm256_unpackhi_epi16(x1, x14);
+  in[1] = butter_fly(u0, u1, cospi_p15_p17);
+  in[14] = butter_fly(u0, u1, cospi_m17_p15);
+
+  u0 = _mm256_unpacklo_epi16(x2, x13);
+  u1 = _mm256_unpackhi_epi16(x2, x13);
+  in[2] = butter_fly(u0, u1, cospi_p23_p09);
+  in[13] = butter_fly(u0, u1, cospi_m09_p23);
+
+  u0 = _mm256_unpacklo_epi16(x3, x12);
+  u1 = _mm256_unpackhi_epi16(x3, x12);
+  in[3] = butter_fly(u0, u1, cospi_p07_p25);
+  in[12] = butter_fly(u0, u1, cospi_m25_p07);
+
+  u0 = _mm256_unpacklo_epi16(x4, x11);
+  u1 = _mm256_unpackhi_epi16(x4, x11);
+  in[4] = butter_fly(u0, u1, cospi_p27_p05);
+  in[11] = butter_fly(u0, u1, cospi_m05_p27);
+
+  u0 = _mm256_unpacklo_epi16(x5, x10);
+  u1 = _mm256_unpackhi_epi16(x5, x10);
+  in[5] = butter_fly(u0, u1, cospi_p11_p21);
+  in[10] = butter_fly(u0, u1, cospi_m21_p11);
+
+  u0 = _mm256_unpacklo_epi16(x6, x9);
+  u1 = _mm256_unpackhi_epi16(x6, x9);
+  in[6] = butter_fly(u0, u1, cospi_p19_p13);
+  in[9] = butter_fly(u0, u1, cospi_m13_p19);
+
+  u0 = _mm256_unpacklo_epi16(x7, x8);
+  u1 = _mm256_unpackhi_epi16(x7, x8);
+  in[7] = butter_fly(u0, u1, cospi_p03_p29);
+  in[8] = butter_fly(u0, u1, cospi_m29_p03);
+}
+
+static void fdct32_avx2(__m256i *in0, __m256i *in1) {  // in-place on in0/in1
+  __m256i even0[16], even1[16], odd0[16], odd1[16];
+  prepare_16x16_even(in0, even0);  // even half of the first 16 columns
+  fdct16_avx2(even0);
+
+  prepare_16x16_odd(in0, odd0);  // odd half of the first 16 columns
+  fdct16_odd_avx2(odd0);
+
+  prepare_16x16_even(in1, even1);  // even half of the second 16 columns
+  fdct16_avx2(even1);
+
+  prepare_16x16_odd(in1, odd1);  // odd half of the second 16 columns
+  fdct16_odd_avx2(odd1);
+
+  collect_coeffs(even0, odd0, even1, odd1, in0, in1);  // results back to in0/in1
+
+  mm256_transpose_32x32(in0, in1);  // the block is left transposed for the caller
+}
+#endif  // CONFIG_EXT_TX
+
+static INLINE void write_buffer_32x32(const __m256i *in0, const __m256i *in1,
+                                      int stride, tran_low_t *output) {
+  int i = 0;  // Stores 32 rows: in0[i] at the row start, in1[i] 16 elements on.
+  tran_low_t *coeff = output;
+  while (i < 32) {
+    _mm256_storeu_si256((__m256i *)coeff, in0[i]);  // unaligned 256-bit store
+    _mm256_storeu_si256((__m256i *)(coeff + 16), in1[i]);  // NOTE(review): offset assumes 16-bit tran_low_t - confirm
+    coeff += stride;  // stride is counted in tran_low_t elements
+    i += 1;
+  }
+}
+
+#if CONFIG_EXT_TX
+static void fhalfright32_16col_avx2(__m256i *in) {  // in-place on in[0..31]
+  int i = 0;
+  const __m256i zero = _mm256_setzero_si256();
+  const __m256i sqrt2 = _mm256_set1_epi16(Sqrt2);
+  const __m256i dct_rounding = _mm256_set1_epi32(DCT_CONST_ROUNDING);
+  __m256i x0, x1;
+
+  while (i < 16) {
+    in[i] = _mm256_slli_epi16(in[i], 2);  // vectors 0..15: multiply by 4
+    x0 = _mm256_unpacklo_epi16(in[i + 16], zero);  // pair each lane with 0 ...
+    x1 = _mm256_unpackhi_epi16(in[i + 16], zero);
+    x0 = _mm256_madd_epi16(x0, sqrt2);  // ... so madd gives lane * Sqrt2 in 32 bits
+    x1 = _mm256_madd_epi16(x1, sqrt2);
+    x0 = _mm256_add_epi32(x0, dct_rounding);  // round before the shift
+    x1 = _mm256_add_epi32(x1, dct_rounding);
+    x0 = _mm256_srai_epi32(x0, DCT_CONST_BITS);
+    x1 = _mm256_srai_epi32(x1, DCT_CONST_BITS);
+    in[i + 16] = _mm256_packs_epi32(x0, x1);  // back to 16 bits, signed saturation
+    i += 1;
+  }
+  fdct16_avx2(&in[16]);  // 16-point transform on the scaled vectors 16..31
+}
+
+static void fhalfright32_avx2(__m256i *in0, __m256i *in1) {  // in-place
+  fhalfright32_16col_avx2(in0);  // first 16 columns
+  fhalfright32_16col_avx2(in1);  // second 16 columns
+  mm256_vectors_swap(in0, &in0[16], 16);  // presumably swaps vectors 0..15 with 16..31
+  mm256_vectors_swap(in1, &in1[16], 16);
+  mm256_transpose_32x32(in0, in1);  // the block is left transposed for the caller
+}
+
+static void load_buffer_32x32(const int16_t *input, int stride, int flipud,
+                              int fliplr, __m256i *in0, __m256i *in1) {
+  // Pointers to the four 16x16 quadrants of the 32x32 input block
+  const int16_t *topL = input;
+  const int16_t *topR = input + 16;
+  const int16_t *botL = input + 16 * stride;
+  const int16_t *botR = input + 16 * stride + 16;
+
+  const int16_t *tmp;
+
+  if (flipud) {
+    // Left 16 columns: exchange the top and bottom quadrants
+    tmp = topL;
+    topL = botL;
+    botL = tmp;
+    // Right 16 columns: exchange the top and bottom quadrants
+    tmp = topR;
+    topR = botR;
+    botR = tmp;
+  }
+
+  if (fliplr) {
+    // Top 16 rows: exchange the left and right quadrants
+    tmp = topL;
+    topL = topR;
+    topR = tmp;
+    // Bottom 16 rows: exchange the left and right quadrants
+    tmp = botL;
+    botL = botR;
+    botR = tmp;
+  }
+
+  // First 16 columns go to in0; the flip flags are forwarded to each quadrant load
+  load_buffer_16x16(topL, stride, flipud, fliplr, in0);
+  load_buffer_16x16(botL, stride, flipud, fliplr, in0 + 16);
+
+  // Second 16 columns go to in1
+  load_buffer_16x16(topR, stride, flipud, fliplr, in1);
+  load_buffer_16x16(botR, stride, flipud, fliplr, in1 + 16);
+}
+#endif  // CONFIG_EXT_TX
+
+static void nr_right_shift_32x32_16col(__m256i *in) {  // in-place on in[0..31]
+  int i = 0;
+  const __m256i one = _mm256_set1_epi16(1);
+  __m256i sign;
+  while (i < 32) {
+    sign = _mm256_srai_epi16(in[i], 15);  // -1 in negative lanes, 0 elsewhere
+    in[i] = _mm256_add_epi16(in[i], one);
+    in[i] = _mm256_sub_epi16(in[i], sign);  // adds one more to negative lanes
+    in[i] = _mm256_srai_epi16(in[i], 2);  // net effect: (x + 1 + (x < 0)) >> 2
+    i += 1;
+  }
+}
+
+// Negative rounding: (x + 1 + (x < 0)) >> 2 on every 16-bit lane of in0 and in1
+static void nr_right_shift_32x32(__m256i *in0, __m256i *in1) {
+  nr_right_shift_32x32_16col(in0);
+  nr_right_shift_32x32_16col(in1);
+}
+
+#if CONFIG_EXT_TX
+static void pr_right_shift_32x32_16col(__m256i *in) {  // in-place on in[0..31]
+  int i = 0;
+  const __m256i zero = _mm256_setzero_si256();
+  const __m256i one = _mm256_set1_epi16(1);
+  __m256i sign;
+  while (i < 32) {
+    sign = _mm256_cmpgt_epi16(in[i], zero);  // -1 in lanes > 0, 0 elsewhere
+    in[i] = _mm256_add_epi16(in[i], one);
+    in[i] = _mm256_sub_epi16(in[i], sign);  // adds one more to positive lanes
+    in[i] = _mm256_srai_epi16(in[i], 2);  // net effect: (x + 1 + (x > 0)) >> 2
+    i += 1;
+  }
+}
+
+// Positive rounding: (x + 1 + (x > 0)) >> 2 on every 16-bit lane of in0 and in1
+static void pr_right_shift_32x32(__m256i *in0, __m256i *in1) {
+  pr_right_shift_32x32_16col(in0);
+  pr_right_shift_32x32_16col(in1);
+}
+
+static void fidtx32_avx2(__m256i *in0, __m256i *in1) {  // in-place on in0/in1
+  int i = 0;
+  while (i < 32) {
+    in0[i] = _mm256_slli_epi16(in0[i], 2);  // multiply every lane by 4
+    in1[i] = _mm256_slli_epi16(in1[i], 2);
+    i += 1;
+  }
+  mm256_transpose_32x32(in0, in1);  // same trailing transpose as fdct32/fhalfright32
+}
+#endif
+
+void av1_fht32x32_avx2(const int16_t *input, tran_low_t *output, int stride,
+                       int tx_type) {
+  __m256i in0[32];  // left 16 columns (32 vectors of 16 x 16-bit lanes)
+  __m256i in1[32];  // right 16 columns (32 vectors of 16 x 16-bit lanes)
+  (void)input;  // only referenced inside the CONFIG_EXT_TX cases below
+  (void)stride;  // only referenced inside the CONFIG_EXT_TX cases below
+
+  switch (tx_type) {
+// TODO(luoyi): For DCT_DCT, fwd_txfm_32x32() uses aom set. But this
+// function has better speed. The replacement must work with the
+// corresponding inverse transform.
+// case DCT_DCT:
+//   load_buffer_32x32(input, stride, 0, 0, in0, in1);
+//   fdct32_avx2(in0, in1);
+//   pr_right_shift_32x32(in0, in1);
+//   fdct32_avx2(in0, in1);
+//   break;
+#if CONFIG_EXT_TX
+    case ADST_DCT:
+      load_buffer_32x32(input, stride, 0, 0, in0, in1);
+      fhalfright32_avx2(in0, in1);
+      pr_right_shift_32x32(in0, in1);
+      fdct32_avx2(in0, in1);
+      break;
+    case DCT_ADST:
+      load_buffer_32x32(input, stride, 0, 0, in0, in1);
+      fdct32_avx2(in0, in1);
+      pr_right_shift_32x32(in0, in1);
+      fhalfright32_avx2(in0, in1);
+      break;
+    case ADST_ADST:
+      load_buffer_32x32(input, stride, 0, 0, in0, in1);
+      fhalfright32_avx2(in0, in1);
+      pr_right_shift_32x32(in0, in1);
+      fhalfright32_avx2(in0, in1);
+      break;
+    case FLIPADST_DCT:
+      load_buffer_32x32(input, stride, 1, 0, in0, in1);
+      fhalfright32_avx2(in0, in1);
+      pr_right_shift_32x32(in0, in1);
+      fdct32_avx2(in0, in1);
+      break;
+    case DCT_FLIPADST:
+      load_buffer_32x32(input, stride, 0, 1, in0, in1);
+      fdct32_avx2(in0, in1);
+      pr_right_shift_32x32(in0, in1);
+      fhalfright32_avx2(in0, in1);
+      break;
+    case FLIPADST_FLIPADST:
+      load_buffer_32x32(input, stride, 1, 1, in0, in1);
+      fhalfright32_avx2(in0, in1);
+      pr_right_shift_32x32(in0, in1);
+      fhalfright32_avx2(in0, in1);
+      break;
+    case ADST_FLIPADST:
+      load_buffer_32x32(input, stride, 0, 1, in0, in1);
+      fhalfright32_avx2(in0, in1);
+      pr_right_shift_32x32(in0, in1);
+      fhalfright32_avx2(in0, in1);
+      break;
+    case FLIPADST_ADST:
+      load_buffer_32x32(input, stride, 1, 0, in0, in1);
+      fhalfright32_avx2(in0, in1);
+      pr_right_shift_32x32(in0, in1);
+      fhalfright32_avx2(in0, in1);
+      break;
+    case V_DCT:
+      load_buffer_32x32(input, stride, 0, 0, in0, in1);
+      fdct32_avx2(in0, in1);
+      pr_right_shift_32x32(in0, in1);
+      fidtx32_avx2(in0, in1);
+      break;
+    case H_DCT:
+      load_buffer_32x32(input, stride, 0, 0, in0, in1);
+      fidtx32_avx2(in0, in1);
+      pr_right_shift_32x32(in0, in1);
+      fdct32_avx2(in0, in1);
+      break;
+    case V_ADST:
+      load_buffer_32x32(input, stride, 0, 0, in0, in1);
+      fhalfright32_avx2(in0, in1);
+      pr_right_shift_32x32(in0, in1);
+      fidtx32_avx2(in0, in1);
+      break;
+    case H_ADST:
+      load_buffer_32x32(input, stride, 0, 0, in0, in1);
+      fidtx32_avx2(in0, in1);
+      pr_right_shift_32x32(in0, in1);
+      fhalfright32_avx2(in0, in1);
+      break;
+    case V_FLIPADST:
+      load_buffer_32x32(input, stride, 1, 0, in0, in1);
+      fhalfright32_avx2(in0, in1);
+      pr_right_shift_32x32(in0, in1);
+      fidtx32_avx2(in0, in1);
+      break;
+    case H_FLIPADST:
+      load_buffer_32x32(input, stride, 0, 1, in0, in1);
+      fidtx32_avx2(in0, in1);
+      pr_right_shift_32x32(in0, in1);
+      fhalfright32_avx2(in0, in1);
+      break;
+#endif  // CONFIG_EXT_TX
+    default: assert(0); break;
+  }
+  nr_right_shift_32x32(in0, in1);  // final rounding shift shared by all tx types
+  write_buffer_32x32(in0, in1, 32, output);  // output row stride is fixed at 32
+  _mm256_zeroupper();  // zero the upper halves of the YMM registers before returning
 }
diff --git a/test/ans_test.cc b/test/ans_test.cc
index ca38de2..ba8e3c7 100644
--- a/test/ans_test.cc
+++ b/test/ans_test.cc
@@ -74,18 +74,21 @@
   return ans_read_end(&d);
 }
 
-// TODO(aconverse@google.com): replace this with a more representative
-// distribution from the codec.
-const rans_sym rans_sym_tab[] = {
-  { 67, 0 }, { 99, 67 }, { 575, 166 }, { 283, 741 },
-};
+const aom_cdf_prob spareto65[] = { 260, 188, 138, 102, 133, 122, 64, 15, 1, 1 };
 
-std::vector<int> ans_encode_build_vals(const rans_sym *tab, int iters) {
+const int kRansSymbols =
+    static_cast<int>(sizeof(spareto65) / sizeof(spareto65[0]));
+
+std::vector<int> ans_encode_build_vals(rans_sym *const tab, int iters) {
+  aom_cdf_prob sum = 0;
+  for (int i = 0; i < kRansSymbols; ++i) {
+    tab[i].cum_prob = sum;
+    tab[i].prob = spareto65[i];
+    sum += spareto65[i];
+  }
   std::vector<int> p_to_sym;
-  int i = 0;
-  while (p_to_sym.size() < RANS_PRECISION) {
+  for (int i = 0; i < kRansSymbols; ++i) {
     p_to_sym.insert(p_to_sym.end(), tab[i].prob, i);
-    ++i;
   }
   assert(p_to_sym.size() == RANS_PRECISION);
   std::vector<int> ret;
@@ -97,10 +100,11 @@
   return ret;
 }
 
-void rans_build_dec_tab(const struct rans_sym sym_tab[], rans_lut dec_tab) {
-  dec_tab[0] = 0;
-  for (int i = 1; dec_tab[i - 1] < RANS_PRECISION; ++i) {
-    dec_tab[i] = dec_tab[i - 1] + sym_tab[i - 1].prob;
+void rans_build_dec_tab(const struct rans_sym sym_tab[],
+                        aom_cdf_prob *dec_tab) {
+  unsigned int sum = 0;
+  for (int i = 0; sum < RANS_PRECISION; ++i) {
+    dec_tab[i] = sum += sym_tab[i].prob;
   }
 }
 
@@ -108,7 +112,7 @@
                 uint8_t *buf) {
   AnsCoder a;
   ans_write_init(&a, buf);
-  rans_lut dec_tab;
+  aom_cdf_prob dec_tab[kRansSymbols];
   rans_build_dec_tab(tab, dec_tab);
 
   std::clock_t start = std::clock();
@@ -149,16 +153,20 @@
 class AnsTest : public ::testing::Test {
  protected:
   static void SetUpTestCase() {
-    sym_vec_ = ans_encode_build_vals(rans_sym_tab, kNumSyms);
+    sym_vec_ = ans_encode_build_vals(rans_sym_tab_, kNumSyms);
   }
   virtual void SetUp() { buf_ = new uint8_t[kNumSyms / 2]; }
   virtual void TearDown() { delete[] buf_; }
   static const int kNumSyms = 25000000;
   static std::vector<int> sym_vec_;
+  static rans_sym rans_sym_tab_[kRansSymbols];
   uint8_t *buf_;
 };
 std::vector<int> AnsTest::sym_vec_;
+rans_sym AnsTest::rans_sym_tab_[kRansSymbols];
 
 TEST_F(AbsTest, Uabs) { EXPECT_TRUE(check_uabs(pv_vec_, buf_)); }
-TEST_F(AnsTest, Rans) { EXPECT_TRUE(check_rans(sym_vec_, rans_sym_tab, buf_)); }
+TEST_F(AnsTest, Rans) {
+  EXPECT_TRUE(check_rans(sym_vec_, rans_sym_tab_, buf_));
+}
 }  // namespace
diff --git a/test/dct32x32_test.cc b/test/dct32x32_test.cc
index 9a661f9..e4179ef 100644
--- a/test/dct32x32_test.cc
+++ b/test/dct32x32_test.cc
@@ -402,6 +402,12 @@
                                                      AOM_BITS_8)));
 #endif  // HAVE_SSE2 && !CONFIG_AOM_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
 
+#if HAVE_AVX2 && !CONFIG_AOM_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
+INSTANTIATE_TEST_CASE_P(AVX2, PartialTrans32x32Test,
+                        ::testing::Values(make_tuple(&aom_fdct32x32_1_avx2,
+                                                     AOM_BITS_8)));
+#endif  // HAVE_AVX2 && !CONFIG_AOM_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
+
 #if HAVE_SSE2 && CONFIG_AOM_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
 INSTANTIATE_TEST_CASE_P(
     SSE2, Trans32x32Test,
diff --git a/test/fht32x32_test.cc b/test/fht32x32_test.cc
new file mode 100644
index 0000000..a949ebf
--- /dev/null
+++ b/test/fht32x32_test.cc
@@ -0,0 +1,189 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "third_party/googletest/src/include/gtest/gtest.h"
+
+#include "./av1_rtcd.h"
+#include "./aom_dsp_rtcd.h"
+
+#include "test/acm_random.h"
+#include "test/clear_system_state.h"
+#include "test/register_state_check.h"
+#include "test/transform_test_base.h"
+#include "test/util.h"
+#include "aom_ports/mem.h"
+
+using libaom_test::ACMRandom;
+
+namespace {
+typedef void (*IhtFunc)(const tran_low_t *in, uint8_t *out, int stride,
+                        int tx_type);
+using std::tr1::tuple;
+using libaom_test::FhtFunc;
+typedef tuple<FhtFunc, IhtFunc, int, aom_bit_depth_t, int> Ht32x32Param;
+
+void fht32x32_ref(const int16_t *in, tran_low_t *out, int stride, int tx_type) {
+  av1_fht32x32_c(in, out, stride, tx_type);  // C version used as the reference
+}
+
+#if CONFIG_AOM_HIGHBITDEPTH
+typedef void (*IHbdHtFunc)(const tran_low_t *in, uint8_t *out, int stride,
+                           int tx_type, int bd);
+typedef void (*HbdHtFunc)(const int16_t *input, int32_t *output, int stride,
+                          int tx_type, int bd);
+
+// Target optimized function, tx_type, bit depth
+typedef tuple<HbdHtFunc, int, int> HighbdHt32x32Param;
+
+void highbd_fht32x32_ref(const int16_t *in, int32_t *out, int stride,
+                         int tx_type, int bd) {
+  av1_fwd_txfm2d_32x32_c(in, out, stride, tx_type, bd);  // C version as reference
+}
+#endif  // CONFIG_AOM_HIGHBITDEPTH
+
+#if HAVE_AVX2
+void dummy_inv_txfm(const tran_low_t *in, uint8_t *out, int stride,
+                    int tx_type) {  // Placeholder inverse transform: does nothing.
+  (void)in;
+  (void)out;
+  (void)stride;
+  (void)tx_type;
+}
+#endif
+
+class AV1Trans32x32HT : public libaom_test::TransformTestBase,
+                        public ::testing::TestWithParam<Ht32x32Param> {
+ public:
+  virtual ~AV1Trans32x32HT() {}
+
+  virtual void SetUp() {  // params: (fwd, inv, tx_type, bit depth, num coeffs)
+    fwd_txfm_ = GET_PARAM(0);
+    inv_txfm_ = GET_PARAM(1);
+    tx_type_ = GET_PARAM(2);
+    pitch_ = 32;
+    fwd_txfm_ref = fht32x32_ref;  // wraps av1_fht32x32_c
+    bit_depth_ = GET_PARAM(3);
+    mask_ = (1 << bit_depth_) - 1;  // low bit_depth_ bits set
+    num_coeffs_ = GET_PARAM(4);
+  }
+  virtual void TearDown() { libaom_test::ClearSystemState(); }
+
+ protected:
+  void RunFwdTxfm(const int16_t *in, tran_low_t *out, int stride) {
+    fwd_txfm_(in, out, stride, tx_type_);  // function under test
+  }
+
+  void RunInvTxfm(const tran_low_t *out, uint8_t *dst, int stride) {
+    inv_txfm_(out, dst, stride, tx_type_);
+  }
+
+  FhtFunc fwd_txfm_;
+  IhtFunc inv_txfm_;
+};
+
+TEST_P(AV1Trans32x32HT, CoeffCheck) { RunCoeffCheck(); }
+
+#if CONFIG_AOM_HIGHBITDEPTH
+class AV1HighbdTrans32x32HT
+    : public ::testing::TestWithParam<HighbdHt32x32Param> {
+ public:
+  virtual ~AV1HighbdTrans32x32HT() {}
+
+  virtual void SetUp() {  // params: (fwd function, tx_type, bit depth)
+    fwd_txfm_ = GET_PARAM(0);
+    fwd_txfm_ref_ = highbd_fht32x32_ref;  // wraps av1_fwd_txfm2d_32x32_c
+    tx_type_ = GET_PARAM(1);
+    bit_depth_ = GET_PARAM(2);
+    mask_ = (1 << bit_depth_) - 1;  // low bit_depth_ bits set
+    num_coeffs_ = 1024;  // 32 * 32
+
+    input_ = reinterpret_cast<int16_t *>(
+        aom_memalign(32, sizeof(int16_t) * num_coeffs_));  // freed in TearDown
+    output_ = reinterpret_cast<int32_t *>(
+        aom_memalign(32, sizeof(int32_t) * num_coeffs_));  // freed in TearDown
+    output_ref_ = reinterpret_cast<int32_t *>(
+        aom_memalign(32, sizeof(int32_t) * num_coeffs_));  // freed in TearDown
+  }
+
+  virtual void TearDown() {
+    aom_free(input_);
+    aom_free(output_);
+    aom_free(output_ref_);
+    libaom_test::ClearSystemState();
+  }
+
+ protected:
+  void RunBitexactCheck();
+
+ private:
+  HbdHtFunc fwd_txfm_;  // function under test
+  HbdHtFunc fwd_txfm_ref_;  // reference it is compared against
+  int tx_type_;
+  int bit_depth_;
+  int mask_;
+  int num_coeffs_;
+  int16_t *input_;
+  int32_t *output_;
+  int32_t *output_ref_;
+};
+
+void AV1HighbdTrans32x32HT::RunBitexactCheck() {
+  ACMRandom rnd(ACMRandom::DeterministicSeed());
+  int i, j;
+  const int stride = 32;
+  const int num_tests = 1000;
+
+  for (i = 0; i < num_tests; ++i) {
+    for (j = 0; j < num_coeffs_; ++j) {
+      input_[j] = (rnd.Rand16() & mask_) - (rnd.Rand16() & mask_);  // in [-mask_, mask_]
+    }
+
+    fwd_txfm_ref_(input_, output_ref_, stride, tx_type_, bit_depth_);
+    ASM_REGISTER_STATE_CHECK(
+        fwd_txfm_(input_, output_, stride, tx_type_, bit_depth_));
+
+    for (j = 0; j < num_coeffs_; ++j) {  // every coefficient must match exactly
+      EXPECT_EQ(output_ref_[j], output_[j])
+          << "Not bit-exact result at index: " << j << " at test block: " << i;
+    }
+  }
+}
+
+TEST_P(AV1HighbdTrans32x32HT, HighbdCoeffCheck) { RunBitexactCheck(); }
+#endif  // CONFIG_AOM_HIGHBITDEPTH
+
+using std::tr1::make_tuple;
+
+#if HAVE_AVX2
+const Ht32x32Param kArrayHt32x32Param_avx2[] = {  // fwd, inv, tx_type, bd, coeffs
+  // TODO(luoyi): DCT_DCT tx_type is not enabled in av1_fht32x32_avx2 yet.
+  // make_tuple(&av1_fht32x32_avx2, dummy_inv_txfm, 0, AOM_BITS_8, 1024),
+  make_tuple(&av1_fht32x32_avx2, dummy_inv_txfm, 1, AOM_BITS_8, 1024),
+  make_tuple(&av1_fht32x32_avx2, dummy_inv_txfm, 2, AOM_BITS_8, 1024),
+  make_tuple(&av1_fht32x32_avx2, dummy_inv_txfm, 3, AOM_BITS_8, 1024),
+#if CONFIG_EXT_TX  // NOTE(review): av1_fht32x32_avx2 has no cases at all without EXT_TX
+  make_tuple(&av1_fht32x32_avx2, dummy_inv_txfm, 4, AOM_BITS_8, 1024),
+  make_tuple(&av1_fht32x32_avx2, dummy_inv_txfm, 5, AOM_BITS_8, 1024),
+  make_tuple(&av1_fht32x32_avx2, dummy_inv_txfm, 6, AOM_BITS_8, 1024),
+  make_tuple(&av1_fht32x32_avx2, dummy_inv_txfm, 7, AOM_BITS_8, 1024),
+  make_tuple(&av1_fht32x32_avx2, dummy_inv_txfm, 8, AOM_BITS_8, 1024),  // tx_type 9 is not listed
+  make_tuple(&av1_fht32x32_avx2, dummy_inv_txfm, 10, AOM_BITS_8, 1024),
+  make_tuple(&av1_fht32x32_avx2, dummy_inv_txfm, 11, AOM_BITS_8, 1024),
+  make_tuple(&av1_fht32x32_avx2, dummy_inv_txfm, 12, AOM_BITS_8, 1024),
+  make_tuple(&av1_fht32x32_avx2, dummy_inv_txfm, 13, AOM_BITS_8, 1024),
+  make_tuple(&av1_fht32x32_avx2, dummy_inv_txfm, 14, AOM_BITS_8, 1024),
+  make_tuple(&av1_fht32x32_avx2, dummy_inv_txfm, 15, AOM_BITS_8, 1024)
+#endif  // CONFIG_EXT_TX
+};
+INSTANTIATE_TEST_CASE_P(AVX2, AV1Trans32x32HT,
+                        ::testing::ValuesIn(kArrayHt32x32Param_avx2));
+#endif  // HAVE_AVX2
+}  // namespace
diff --git a/test/test.mk b/test/test.mk
index c071cea..162d7c9 100644
--- a/test/test.mk
+++ b/test/test.mk
@@ -144,6 +144,7 @@
 LIBAOM_TEST_SRCS-$(CONFIG_AV1_ENCODER) += av1_fht16x8_test.cc
 LIBAOM_TEST_SRCS-$(CONFIG_AV1_ENCODER) += av1_iht8x16_test.cc
 LIBAOM_TEST_SRCS-$(CONFIG_AV1_ENCODER) += av1_iht16x8_test.cc
+LIBAOM_TEST_SRCS-$(CONFIG_AV1_ENCODER) += fht32x32_test.cc
 endif
 LIBAOM_TEST_SRCS-$(CONFIG_EXT_TILE)     += av1_ext_tile_test.cc
 
diff --git a/tools/gen_constrained_tokenset.py b/tools/gen_constrained_tokenset.py
new file mode 100755
index 0000000..a0f8280
--- /dev/null
+++ b/tools/gen_constrained_tokenset.py
@@ -0,0 +1,115 @@
+#!/usr/bin/python
+##
+## Copyright (c) 2016, Alliance for Open Media. All rights reserved
+##
+## This source code is subject to the terms of the BSD 2 Clause License and
+## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+## was not distributed with this source code in the LICENSE file, you can
+## obtain it at www.aomedia.org/license/software. If the Alliance for Open
+## Media Patent License 1.0 was not distributed with this source code in the
+## PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+##
+"""Generate the probability model for the constrained token set.
+
+Model obtained from a 2-sided zero-centered distribution derived
+from a Pareto distribution. The cdf of the distribution is:
+cdf(x) = 0.5 + 0.5 * sgn(x) * [1 - {alpha/(alpha + |x|)} ^ beta]
+
+For a given beta and a given probability of the 1-node, the alpha
+is first solved, and then the {alpha, beta} pair is used to generate
+the probabilities for the rest of the nodes.
+"""
+
+import heapq
+import sys
+import numpy as np
+import scipy.optimize
+import scipy.stats
+
+
+def cdf_spareto(x, xm, beta):  # two-sided cdf from the module docstring; xm is its alpha
+  p = 1 - (xm / (np.abs(x) + xm))**beta  # 1 - (xm / (|x| + xm))^beta
+  p = 0.5 + 0.5 * np.sign(x) * p  # mirror around zero: 0.5 at x == 0
+  return p
+
+
+def get_spareto(p, beta):  # returns 11 bin masses for the fitted {alpha, beta}
+  cdf = cdf_spareto
+
+  def func(x):  # squared error between p and the mass of (0.5, 1.5) given x > 0.5
+    return ((cdf(1.5, x, beta) - cdf(0.5, x, beta)) /
+            (1 - cdf(0.5, x, beta)) - p)**2
+
+  alpha = scipy.optimize.fminbound(func, 1e-12, 10000, xtol=1e-12)  # minimize func over alpha
+  parray = np.zeros(11)
+  parray[0] = 2 * (cdf(0.5, alpha, beta) - 0.5)  # mass of |x| < 0.5
+  parray[1] = (2 * (cdf(1.5, alpha, beta) - cdf(0.5, alpha, beta)))  # factor 2 counts both signs
+  parray[2] = (2 * (cdf(2.5, alpha, beta) - cdf(1.5, alpha, beta)))
+  parray[3] = (2 * (cdf(3.5, alpha, beta) - cdf(2.5, alpha, beta)))
+  parray[4] = (2 * (cdf(4.5, alpha, beta) - cdf(3.5, alpha, beta)))
+  parray[5] = (2 * (cdf(6.5, alpha, beta) - cdf(4.5, alpha, beta)))  # bins widen from here on
+  parray[6] = (2 * (cdf(10.5, alpha, beta) - cdf(6.5, alpha, beta)))
+  parray[7] = (2 * (cdf(18.5, alpha, beta) - cdf(10.5, alpha, beta)))
+  parray[8] = (2 * (cdf(34.5, alpha, beta) - cdf(18.5, alpha, beta)))
+  parray[9] = (2 * (cdf(66.5, alpha, beta) - cdf(34.5, alpha, beta)))
+  parray[10] = 2 * (1. - cdf(66.5, alpha, beta))  # tail: |x| > 66.5
+  return parray
+
+
+def quantize_probs(p, save_first_bin, bits):
+  """Quantize probability precisely.
+
+  Quantize probabilities minimizing dH (Kullback-Leibler divergence)
+  approximated by: sum (p_i-q_i)^2/p_i.
+  References:
+  https://en.wikipedia.org/wiki/Kullback%E2%80%93Leibler_divergence
+  https://github.com/JarekDuda/AsymmetricNumeralSystemsToolkit
+  """
+  num_sym = p.size
+  p = np.clip(p, 1e-16, 1)  # keeps 1 / p finite below
+  L = 2**bits  # total the quantized counts must sum to
+  pL = p * L
+  ip = 1. / p  # inverse probability
+  q = np.clip(np.round(pL), 1, L + 1 - num_sym)  # every symbol gets at least 1
+  quant_err = (pL - q)**2 * ip
+  sgn = np.sign(L - q.sum())  # direction of correction
+  if sgn != 0:  # correction is needed
+    v = []  # heap of adjustment results (adjustment err, index) of each symbol
+    for i in range(1 if save_first_bin else 0, num_sym):  # index 0 is skipped if save_first_bin
+      q_adj = q[i] + sgn
+      if q_adj > 0 and q_adj < L:
+        adj_err = (pL[i] - q_adj)**2 * ip[i] - quant_err[i]  # change in error for one step
+        heapq.heappush(v, (adj_err, i))
+    while q.sum() != L:  # one unit step per iteration
+      # apply lowest error adjustment
+      (adj_err, i) = heapq.heappop(v)
+      quant_err[i] += adj_err
+      q[i] += sgn
+      # calculate the cost of adjusting this symbol again
+      q_adj = q[i] + sgn
+      if q_adj > 0 and q_adj < L:
+        adj_err = (pL[i] - q_adj)**2 * ip[i] - quant_err[i]
+        heapq.heappush(v, (adj_err, i))
+  return q
+
+
+def get_quantized_spareto(p, beta, bits):
+  parray = get_spareto(p, beta)
+  parray = parray[1:] / (1 - parray[0])
+  qarray = quantize_probs(parray, True, bits)
+  return qarray.astype(np.int)
+
+
+def main(bits=8):  # prints one brace-wrapped row of quantized counts per q
+  beta = 8
+  for q in range(1, 256):  # q / 256 is the probability passed as p
+    parray = get_quantized_spareto(q / 256., beta, bits)
+    assert parray.sum() == 2**bits
+    print '{', ', '.join('%d' % i for i in parray), '},'  # Python 2 print statement
+
+
+if __name__ == '__main__':
+  if len(sys.argv) > 1:
+    main(int(sys.argv[1]))
+  else:
+    main()