Merge "Renamings for OBMC experiment" into nextgenv2
diff --git a/aom_dsp/ans.c b/aom_dsp/ans.c
index 18f6d48..30f115c 100644
--- a/aom_dsp/ans.c
+++ b/aom_dsp/ans.c
@@ -15,16 +15,7 @@
 #include "aom_dsp/ans.h"
 #include "aom_dsp/prob.h"
 
-void aom_rans_build_cdf_from_pdf(const AnsP10 token_probs[], rans_lut cdf_tab) {
-  int i;
-  cdf_tab[0] = 0;
-  for (i = 1; cdf_tab[i - 1] < RANS_PRECISION; ++i) {
-    cdf_tab[i] = cdf_tab[i - 1] + token_probs[i - 1];
-  }
-  assert(cdf_tab[i - 1] == RANS_PRECISION);
-}
-
-static int find_largest(const AnsP10 *const pdf_tab, int num_syms) {
+static int find_largest(const aom_cdf_prob *const pdf_tab, int num_syms) {
   int largest_idx = -1;
   int largest_p = -1;
   int i;
@@ -38,8 +29,9 @@
   return largest_idx;
 }
 
-void aom_rans_merge_prob8_pdf(AnsP10 *const out_pdf, const AnsP8 node_prob,
-                              const AnsP10 *const src_pdf, int in_syms) {
+void aom_rans_merge_prob8_pdf(aom_cdf_prob *const out_pdf,
+                              const AnsP8 node_prob,
+                              const aom_cdf_prob *const src_pdf, int in_syms) {
   int i;
   int adjustment = RANS_PRECISION;
   const int round_fact = ANS_P8_PRECISION >> 1;
diff --git a/aom_dsp/ans.h b/aom_dsp/ans.h
index ea99f8b..5927e58 100644
--- a/aom_dsp/ans.h
+++ b/aom_dsp/ans.h
@@ -26,24 +26,16 @@
 typedef uint8_t AnsP8;
 #define ANS_P8_PRECISION 256u
 #define ANS_P8_SHIFT 8
-typedef uint16_t AnsP10;
-#define ANS_P10_PRECISION 1024u
+#define RANS_PRECISION 1024u
 #define RANS_PROB_BITS 10
 
-#define RANS_PRECISION ANS_P10_PRECISION
-
-#define L_BASE (ANS_P10_PRECISION * 4)  // L_BASE % precision must be 0
+#define L_BASE (RANS_PRECISION * 4)  // L_BASE % precision must be 0
 #define IO_BASE 256
 // Range I = { L_BASE, L_BASE + 1, ..., L_BASE * IO_BASE - 1 }
 
-// This is now just a boring cdf. It starts with an explicit zero.
-// TODO(aconverse): Remove starting zero.
-typedef uint16_t rans_lut[16];
-
-void aom_rans_build_cdf_from_pdf(const AnsP10 token_probs[], rans_lut cdf_tab);
-
-void aom_rans_merge_prob8_pdf(AnsP10 *const out_pdf, const AnsP8 node_prob,
-                              const AnsP10 *const src_pdf, int in_syms);
+void aom_rans_merge_prob8_pdf(aom_cdf_prob *const out_pdf,
+                              const AnsP8 node_prob,
+                              const aom_cdf_prob *const src_pdf, int in_syms);
 #ifdef __cplusplus
 }  // extern "C"
 #endif  // __cplusplus
diff --git a/aom_dsp/ansreader.h b/aom_dsp/ansreader.h
index 11619b0..1f66531 100644
--- a/aom_dsp/ansreader.h
+++ b/aom_dsp/ansreader.h
@@ -62,24 +62,25 @@
 
 struct rans_dec_sym {
   uint8_t val;
-  AnsP10 prob;
-  AnsP10 cum_prob;  // not-inclusive
+  aom_cdf_prob prob;
+  aom_cdf_prob cum_prob;  // not-inclusive
 };
 
-static INLINE void fetch_sym(struct rans_dec_sym *out, const rans_lut cdf,
-                             AnsP10 rem) {
-  int i = 0;
+static INLINE void fetch_sym(struct rans_dec_sym *out, const aom_cdf_prob *cdf,
+                             aom_cdf_prob rem) {
+  int i;
+  aom_cdf_prob cum_prob = 0, top_prob;
   // TODO(skal): if critical, could be a binary search.
   // Or, better, an O(1) alias-table.
-  while (rem >= cdf[i]) {
-    ++i;
+  for (i = 0; rem >= (top_prob = cdf[i]); ++i) {
+    cum_prob = top_prob;
   }
-  out->val = i - 1;
-  out->prob = (AnsP10)(cdf[i] - cdf[i - 1]);
-  out->cum_prob = (AnsP10)cdf[i - 1];
+  out->val = i;
+  out->prob = top_prob - cum_prob;
+  out->cum_prob = cum_prob;
 }
 
-static INLINE int rans_read(struct AnsDecoder *ans, const rans_lut tab) {
+static INLINE int rans_read(struct AnsDecoder *ans, const aom_cdf_prob *tab) {
   unsigned rem;
   unsigned quo;
   struct rans_dec_sym sym;
diff --git a/aom_dsp/answriter.h b/aom_dsp/answriter.h
index 5a82d35..0ac1bda 100644
--- a/aom_dsp/answriter.h
+++ b/aom_dsp/answriter.h
@@ -75,8 +75,8 @@
 }
 
 struct rans_sym {
-  AnsP10 prob;
-  AnsP10 cum_prob;  // not-inclusive
+  aom_cdf_prob prob;
+  aom_cdf_prob cum_prob;  // not-inclusive
 };
 
 // rANS with normalization
@@ -84,7 +84,7 @@
-# ANS_P10_PRECISION is m
+# RANS_PRECISION is m
 static INLINE void rans_write(struct AnsCoder *ans,
                               const struct rans_sym *const sym) {
-  const AnsP10 p = sym->prob;
+  const aom_cdf_prob p = sym->prob;
   while (ans->state >= L_BASE / RANS_PRECISION * IO_BASE * p) {
     ans->buf[ans->buf_offset++] = ans->state % IO_BASE;
     ans->state /= IO_BASE;
diff --git a/aom_dsp/aom_dsp.mk b/aom_dsp/aom_dsp.mk
index 036aef0..25f98a8 100644
--- a/aom_dsp/aom_dsp.mk
+++ b/aom_dsp/aom_dsp.mk
@@ -9,6 +9,7 @@
 ## PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 ##
 
+
 DSP_SRCS-yes += aom_dsp.mk
 DSP_SRCS-yes += aom_dsp_common.h
 
@@ -72,8 +73,6 @@
 DSP_SRCS-$(HAVE_DSPR2)  += mips/common_dspr2.c
 
 # inter predictions
-
-ifeq ($(CONFIG_AV1),yes)
 DSP_SRCS-yes            += blend.h
 DSP_SRCS-yes            += blend_a64_mask.c
 DSP_SRCS-yes            += blend_a64_hmask.c
@@ -82,7 +81,6 @@
 DSP_SRCS-$(HAVE_SSE4_1) += x86/blend_a64_mask_sse4.c
 DSP_SRCS-$(HAVE_SSE4_1) += x86/blend_a64_hmask_sse4.c
 DSP_SRCS-$(HAVE_SSE4_1) += x86/blend_a64_vmask_sse4.c
-endif  #CONFIG_AV1
 
 # interpolation filters
 DSP_SRCS-yes += aom_convolve.c
@@ -101,7 +99,6 @@
 DSP_SRCS-$(HAVE_SSE2)  += x86/aom_high_subpixel_8t_sse2.asm
 DSP_SRCS-$(HAVE_SSE2)  += x86/aom_high_subpixel_bilinear_sse2.asm
 endif
-
 DSP_SRCS-$(HAVE_SSE2)  += x86/aom_convolve_copy_sse2.asm
 
 ifeq ($(HAVE_NEON_ASM),yes)
diff --git a/aom_dsp/aom_dsp_rtcd_defs.pl b/aom_dsp/aom_dsp_rtcd_defs.pl
index 8d17d03..6af5588 100644
--- a/aom_dsp/aom_dsp_rtcd_defs.pl
+++ b/aom_dsp/aom_dsp_rtcd_defs.pl
@@ -44,6 +44,27 @@
 # Intra prediction
 #
 
+add_proto qw/void aom_dc_predictor_2x2/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+specialize qw/aom_dc_predictor_2x2/;
+
+add_proto qw/void aom_dc_top_predictor_2x2/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+specialize qw/aom_dc_top_predictor_2x2/;
+
+add_proto qw/void aom_dc_left_predictor_2x2/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+specialize qw/aom_dc_left_predictor_2x2/;
+
+add_proto qw/void aom_dc_128_predictor_2x2/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+specialize qw/aom_dc_128_predictor_2x2/;
+
+add_proto qw/void aom_v_predictor_2x2/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+specialize qw/aom_v_predictor_2x2/;
+
+add_proto qw/void aom_h_predictor_2x2/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+specialize qw/aom_h_predictor_2x2/;
+
+add_proto qw/void aom_tm_predictor_2x2/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+specialize qw/aom_tm_predictor_2x2/;
+
 add_proto qw/void aom_d207_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
 specialize qw/aom_d207_predictor_4x4 sse2/;
 
@@ -648,59 +669,32 @@
 #
 # Forward transform
 #
-if ((aom_config("CONFIG_AV1_ENCODER") eq "yes")) {
-if (aom_config("CONFIG_AOM_HIGHBITDEPTH") eq "yes") {
-  add_proto qw/void aom_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride";
-  specialize qw/aom_fdct4x4 sse2/;
+if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
+  if (aom_config("CONFIG_AOM_HIGHBITDEPTH") eq "yes") {
+    add_proto qw/void aom_highbd_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride";
+    specialize qw/aom_highbd_fdct4x4 sse2/;
 
-  add_proto qw/void aom_fdct4x4_1/, "const int16_t *input, tran_low_t *output, int stride";
-  specialize qw/aom_fdct4x4_1 sse2/;
+    add_proto qw/void aom_highbd_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride";
+    specialize qw/aom_highbd_fdct8x8 sse2/;
 
-  add_proto qw/void aom_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride";
-  specialize qw/aom_fdct8x8 sse2/;
+    add_proto qw/void aom_highbd_fdct8x8_1/, "const int16_t *input, tran_low_t *output, int stride";
+    specialize qw/aom_highbd_fdct8x8_1/;
 
-  add_proto qw/void aom_fdct8x8_1/, "const int16_t *input, tran_low_t *output, int stride";
-  specialize qw/aom_fdct8x8_1 sse2/;
+    add_proto qw/void aom_highbd_fdct16x16/, "const int16_t *input, tran_low_t *output, int stride";
+    specialize qw/aom_highbd_fdct16x16 sse2/;
 
-  add_proto qw/void aom_fdct16x16/, "const int16_t *input, tran_low_t *output, int stride";
-  specialize qw/aom_fdct16x16 sse2/;
+    add_proto qw/void aom_highbd_fdct16x16_1/, "const int16_t *input, tran_low_t *output, int stride";
+    specialize qw/aom_highbd_fdct16x16_1/;
 
-  add_proto qw/void aom_fdct16x16_1/, "const int16_t *input, tran_low_t *output, int stride";
-  specialize qw/aom_fdct16x16_1 sse2/;
+    add_proto qw/void aom_highbd_fdct32x32/, "const int16_t *input, tran_low_t *output, int stride";
+    specialize qw/aom_highbd_fdct32x32 sse2/;
 
-  add_proto qw/void aom_fdct32x32/, "const int16_t *input, tran_low_t *output, int stride";
-  specialize qw/aom_fdct32x32 sse2/;
+    add_proto qw/void aom_highbd_fdct32x32_rd/, "const int16_t *input, tran_low_t *output, int stride";
+    specialize qw/aom_highbd_fdct32x32_rd sse2/;
 
-  add_proto qw/void aom_fdct32x32_rd/, "const int16_t *input, tran_low_t *output, int stride";
-  specialize qw/aom_fdct32x32_rd sse2/;
-
-  add_proto qw/void aom_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride";
-  specialize qw/aom_fdct32x32_1 sse2/;
-
-  add_proto qw/void aom_highbd_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride";
-  specialize qw/aom_highbd_fdct4x4 sse2/;
-
-  add_proto qw/void aom_highbd_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride";
-  specialize qw/aom_highbd_fdct8x8 sse2/;
-
-  add_proto qw/void aom_highbd_fdct8x8_1/, "const int16_t *input, tran_low_t *output, int stride";
-  specialize qw/aom_highbd_fdct8x8_1/;
-
-  add_proto qw/void aom_highbd_fdct16x16/, "const int16_t *input, tran_low_t *output, int stride";
-  specialize qw/aom_highbd_fdct16x16 sse2/;
-
-  add_proto qw/void aom_highbd_fdct16x16_1/, "const int16_t *input, tran_low_t *output, int stride";
-  specialize qw/aom_highbd_fdct16x16_1/;
-
-  add_proto qw/void aom_highbd_fdct32x32/, "const int16_t *input, tran_low_t *output, int stride";
-  specialize qw/aom_highbd_fdct32x32 sse2/;
-
-  add_proto qw/void aom_highbd_fdct32x32_rd/, "const int16_t *input, tran_low_t *output, int stride";
-  specialize qw/aom_highbd_fdct32x32_rd sse2/;
-
-  add_proto qw/void aom_highbd_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride";
-  specialize qw/aom_highbd_fdct32x32_1/;
-} else {
+    add_proto qw/void aom_highbd_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride";
+    specialize qw/aom_highbd_fdct32x32_1/;
+  }  # CONFIG_AOM_HIGHBITDEPTH
   add_proto qw/void aom_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride";
   specialize qw/aom_fdct4x4 sse2 msa/;
 
@@ -726,8 +720,7 @@
   specialize qw/aom_fdct32x32_rd sse2 avx2 msa/;
 
   add_proto qw/void aom_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride";
-  specialize qw/aom_fdct32x32_1 sse2 msa/;
-}  # CONFIG_AOM_HIGHBITDEPTH
+  specialize qw/aom_fdct32x32_1 sse2 avx2 msa/;
 }  # CONFIG_AV1_ENCODER
 
 #
diff --git a/aom_dsp/arm/loopfilter_4_neon.asm b/aom_dsp/arm/loopfilter_4_neon.asm
index e82dea5..8b54984 100644
--- a/aom_dsp/arm/loopfilter_4_neon.asm
+++ b/aom_dsp/arm/loopfilter_4_neon.asm
@@ -17,7 +17,7 @@
 
     AREA ||.text||, CODE, READONLY, ALIGN=2
 
-; Currently aom only works on iterations 8 at a time. The vp8 loop filter
+; Currently aom only works on 8 iterations at a time. The aom loop filter
 ; works on 16 iterations at a time.
 ;
 ; void aom_lpf_horizontal_4_neon(uint8_t *s,
@@ -66,7 +66,7 @@
     pop         {pc}
     ENDP        ; |aom_lpf_horizontal_4_neon|
 
-; Currently aom only works on iterations 8 at a time. The vp8 loop filter
+; Currently aom only works on 8 iterations at a time. The aom loop filter
 ; works on 16 iterations at a time.
 ;
 ; void aom_lpf_vertical_4_neon(uint8_t *s,
diff --git a/aom_dsp/arm/loopfilter_8_neon.asm b/aom_dsp/arm/loopfilter_8_neon.asm
index 23b819b..9f3db66 100644
--- a/aom_dsp/arm/loopfilter_8_neon.asm
+++ b/aom_dsp/arm/loopfilter_8_neon.asm
@@ -17,7 +17,7 @@
 
     AREA ||.text||, CODE, READONLY, ALIGN=2
 
-; Currently aom only works on iterations 8 at a time. The vp8 loop filter
+; Currently aom only works on 8 iterations at a time. The aom loop filter
 ; works on 16 iterations at a time.
 ;
 ; void aom_lpf_horizontal_8_neon(uint8_t *s, int p,
diff --git a/aom_dsp/bitreader.h b/aom_dsp/bitreader.h
index d062e07..52e4dc8 100644
--- a/aom_dsp/bitreader.h
+++ b/aom_dsp/bitreader.h
@@ -104,6 +104,20 @@
   return aom_read_tree_bits(r, tree, probs);
 }
 
+static INLINE int aom_read_symbol(aom_reader *r, const aom_cdf_prob *cdf,
+                                  int nsymbs) {
+#if CONFIG_ANS
+  (void)nsymbs;
+  return rans_read(r, cdf);
+#else
+  (void)r;
+  (void)cdf;
+  (void)nsymbs;
+  assert(0 && "Unsupported bitreader operation");
+  return -1;
+#endif
+}
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/aom_dsp/bitwriter.h b/aom_dsp/bitwriter.h
index 5e34fd6..d6937aa 100644
--- a/aom_dsp/bitwriter.h
+++ b/aom_dsp/bitwriter.h
@@ -86,6 +86,24 @@
   aom_write_tree_bits(w, tree, probs, bits, len, i);
 }
 
+static INLINE void aom_write_symbol(aom_writer *w, int symb,
+                                    const aom_cdf_prob *cdf, int nsymbs) {
+#if CONFIG_ANS
+  struct rans_sym s;
+  (void)nsymbs;
+  assert(cdf);
+  s.cum_prob = symb > 0 ? cdf[symb - 1] : 0;
+  s.prob = cdf[symb] - s.cum_prob;
+  buf_rans_write(w, &s);
+#else
+  (void)w;
+  (void)symb;
+  (void)cdf;
+  (void)nsymbs;
+  assert(0 && "Unsupported bitwriter operation");
+#endif
+}
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/aom_dsp/intrapred.c b/aom_dsp/intrapred.c
index 1e40e68..c3af1f4 100644
--- a/aom_dsp/intrapred.c
+++ b/aom_dsp/intrapred.c
@@ -837,6 +837,7 @@
 
 /* clang-format off */
 #define intra_pred_allsizes(type) \
+  intra_pred_sized(type, 2) \
   intra_pred_sized(type, 4) \
   intra_pred_sized(type, 8) \
   intra_pred_sized(type, 16) \
@@ -846,7 +847,7 @@
   intra_pred_highbd_sized(type, 16) \
   intra_pred_highbd_sized(type, 32)
 
-#define intra_pred_no_4x4(type) \
+#define intra_pred_above_4x4(type) \
   intra_pred_sized(type, 8) \
   intra_pred_sized(type, 16) \
   intra_pred_sized(type, 32) \
@@ -857,26 +858,27 @@
 
 #else
 #define intra_pred_allsizes(type) \
+  intra_pred_sized(type, 2) \
   intra_pred_sized(type, 4) \
   intra_pred_sized(type, 8) \
   intra_pred_sized(type, 16) \
   intra_pred_sized(type, 32)
 
-#define intra_pred_no_4x4(type) \
+#define intra_pred_above_4x4(type) \
   intra_pred_sized(type, 8) \
   intra_pred_sized(type, 16) \
   intra_pred_sized(type, 32)
 #endif  // CONFIG_AOM_HIGHBITDEPTH
 
-intra_pred_no_4x4(d207)
-intra_pred_no_4x4(d63)
-intra_pred_no_4x4(d45)
+intra_pred_above_4x4(d207)
+intra_pred_above_4x4(d63)
+intra_pred_above_4x4(d45)
 intra_pred_allsizes(d207e)
 intra_pred_allsizes(d63e)
-intra_pred_no_4x4(d45e)
-intra_pred_no_4x4(d117)
-intra_pred_no_4x4(d135)
-intra_pred_no_4x4(d153)
+intra_pred_above_4x4(d45e)
+intra_pred_above_4x4(d117)
+intra_pred_above_4x4(d135)
+intra_pred_above_4x4(d153)
 intra_pred_allsizes(v)
 intra_pred_allsizes(h)
 #if CONFIG_ALT_INTRA
diff --git a/aom_dsp/mips/sad_msa.c b/aom_dsp/mips/sad_msa.c
index 7f7364d..258eb5c 100644
--- a/aom_dsp/mips/sad_msa.c
+++ b/aom_dsp/mips/sad_msa.c
@@ -1435,93 +1435,95 @@
                               second_pred);                             \
   }
 
+/* clang-format off */
 // 64x64
-AOM_SAD_64xHEIGHT_MSA(64);
-AOM_SAD_64xHEIGHTx3_MSA(64);
-AOM_SAD_64xHEIGHTx8_MSA(64);
-AOM_SAD_64xHEIGHTx4D_MSA(64);
-AOM_AVGSAD_64xHEIGHT_MSA(64);
+AOM_SAD_64xHEIGHT_MSA(64)
+AOM_SAD_64xHEIGHTx3_MSA(64)
+AOM_SAD_64xHEIGHTx8_MSA(64)
+AOM_SAD_64xHEIGHTx4D_MSA(64)
+AOM_AVGSAD_64xHEIGHT_MSA(64)
 
 // 64x32
-AOM_SAD_64xHEIGHT_MSA(32);
-AOM_SAD_64xHEIGHTx3_MSA(32);
-AOM_SAD_64xHEIGHTx8_MSA(32);
-AOM_SAD_64xHEIGHTx4D_MSA(32);
-AOM_AVGSAD_64xHEIGHT_MSA(32);
+AOM_SAD_64xHEIGHT_MSA(32)
+AOM_SAD_64xHEIGHTx3_MSA(32)
+AOM_SAD_64xHEIGHTx8_MSA(32)
+AOM_SAD_64xHEIGHTx4D_MSA(32)
+AOM_AVGSAD_64xHEIGHT_MSA(32)
 
 // 32x64
-AOM_SAD_32xHEIGHT_MSA(64);
-AOM_SAD_32xHEIGHTx3_MSA(64);
-AOM_SAD_32xHEIGHTx8_MSA(64);
-AOM_SAD_32xHEIGHTx4D_MSA(64);
-AOM_AVGSAD_32xHEIGHT_MSA(64);
+AOM_SAD_32xHEIGHT_MSA(64)
+AOM_SAD_32xHEIGHTx3_MSA(64)
+AOM_SAD_32xHEIGHTx8_MSA(64)
+AOM_SAD_32xHEIGHTx4D_MSA(64)
+AOM_AVGSAD_32xHEIGHT_MSA(64)
 
 // 32x32
-AOM_SAD_32xHEIGHT_MSA(32);
-AOM_SAD_32xHEIGHTx3_MSA(32);
-AOM_SAD_32xHEIGHTx8_MSA(32);
-AOM_SAD_32xHEIGHTx4D_MSA(32);
-AOM_AVGSAD_32xHEIGHT_MSA(32);
+AOM_SAD_32xHEIGHT_MSA(32)
+AOM_SAD_32xHEIGHTx3_MSA(32)
+AOM_SAD_32xHEIGHTx8_MSA(32)
+AOM_SAD_32xHEIGHTx4D_MSA(32)
+AOM_AVGSAD_32xHEIGHT_MSA(32)
 
 // 32x16
-AOM_SAD_32xHEIGHT_MSA(16);
-AOM_SAD_32xHEIGHTx3_MSA(16);
-AOM_SAD_32xHEIGHTx8_MSA(16);
-AOM_SAD_32xHEIGHTx4D_MSA(16);
-AOM_AVGSAD_32xHEIGHT_MSA(16);
+AOM_SAD_32xHEIGHT_MSA(16)
+AOM_SAD_32xHEIGHTx3_MSA(16)
+AOM_SAD_32xHEIGHTx8_MSA(16)
+AOM_SAD_32xHEIGHTx4D_MSA(16)
+AOM_AVGSAD_32xHEIGHT_MSA(16)
 
 // 16x32
-AOM_SAD_16xHEIGHT_MSA(32);
-AOM_SAD_16xHEIGHTx3_MSA(32);
-AOM_SAD_16xHEIGHTx8_MSA(32);
-AOM_SAD_16xHEIGHTx4D_MSA(32);
-AOM_AVGSAD_16xHEIGHT_MSA(32);
+AOM_SAD_16xHEIGHT_MSA(32)
+AOM_SAD_16xHEIGHTx3_MSA(32)
+AOM_SAD_16xHEIGHTx8_MSA(32)
+AOM_SAD_16xHEIGHTx4D_MSA(32)
+AOM_AVGSAD_16xHEIGHT_MSA(32)
 
 // 16x16
-AOM_SAD_16xHEIGHT_MSA(16);
-AOM_SAD_16xHEIGHTx3_MSA(16);
-AOM_SAD_16xHEIGHTx8_MSA(16);
-AOM_SAD_16xHEIGHTx4D_MSA(16);
-AOM_AVGSAD_16xHEIGHT_MSA(16);
+AOM_SAD_16xHEIGHT_MSA(16)
+AOM_SAD_16xHEIGHTx3_MSA(16)
+AOM_SAD_16xHEIGHTx8_MSA(16)
+AOM_SAD_16xHEIGHTx4D_MSA(16)
+AOM_AVGSAD_16xHEIGHT_MSA(16)
 
 // 16x8
-AOM_SAD_16xHEIGHT_MSA(8);
-AOM_SAD_16xHEIGHTx3_MSA(8);
-AOM_SAD_16xHEIGHTx8_MSA(8);
-AOM_SAD_16xHEIGHTx4D_MSA(8);
-AOM_AVGSAD_16xHEIGHT_MSA(8);
+AOM_SAD_16xHEIGHT_MSA(8)
+AOM_SAD_16xHEIGHTx3_MSA(8)
+AOM_SAD_16xHEIGHTx8_MSA(8)
+AOM_SAD_16xHEIGHTx4D_MSA(8)
+AOM_AVGSAD_16xHEIGHT_MSA(8)
 
 // 8x16
-AOM_SAD_8xHEIGHT_MSA(16);
-AOM_SAD_8xHEIGHTx3_MSA(16);
-AOM_SAD_8xHEIGHTx8_MSA(16);
-AOM_SAD_8xHEIGHTx4D_MSA(16);
-AOM_AVGSAD_8xHEIGHT_MSA(16);
+AOM_SAD_8xHEIGHT_MSA(16)
+AOM_SAD_8xHEIGHTx3_MSA(16)
+AOM_SAD_8xHEIGHTx8_MSA(16)
+AOM_SAD_8xHEIGHTx4D_MSA(16)
+AOM_AVGSAD_8xHEIGHT_MSA(16)
 
 // 8x8
-AOM_SAD_8xHEIGHT_MSA(8);
-AOM_SAD_8xHEIGHTx3_MSA(8);
-AOM_SAD_8xHEIGHTx8_MSA(8);
-AOM_SAD_8xHEIGHTx4D_MSA(8);
-AOM_AVGSAD_8xHEIGHT_MSA(8);
+AOM_SAD_8xHEIGHT_MSA(8)
+AOM_SAD_8xHEIGHTx3_MSA(8)
+AOM_SAD_8xHEIGHTx8_MSA(8)
+AOM_SAD_8xHEIGHTx4D_MSA(8)
+AOM_AVGSAD_8xHEIGHT_MSA(8)
 
 // 8x4
-AOM_SAD_8xHEIGHT_MSA(4);
-AOM_SAD_8xHEIGHTx3_MSA(4);
-AOM_SAD_8xHEIGHTx8_MSA(4);
-AOM_SAD_8xHEIGHTx4D_MSA(4);
-AOM_AVGSAD_8xHEIGHT_MSA(4);
+AOM_SAD_8xHEIGHT_MSA(4)
+AOM_SAD_8xHEIGHTx3_MSA(4)
+AOM_SAD_8xHEIGHTx8_MSA(4)
+AOM_SAD_8xHEIGHTx4D_MSA(4)
+AOM_AVGSAD_8xHEIGHT_MSA(4)
 
 // 4x8
-AOM_SAD_4xHEIGHT_MSA(8);
-AOM_SAD_4xHEIGHTx3_MSA(8);
-AOM_SAD_4xHEIGHTx8_MSA(8);
-AOM_SAD_4xHEIGHTx4D_MSA(8);
-AOM_AVGSAD_4xHEIGHT_MSA(8);
+AOM_SAD_4xHEIGHT_MSA(8)
+AOM_SAD_4xHEIGHTx3_MSA(8)
+AOM_SAD_4xHEIGHTx8_MSA(8)
+AOM_SAD_4xHEIGHTx4D_MSA(8)
+AOM_AVGSAD_4xHEIGHT_MSA(8)
 
 // 4x4
-AOM_SAD_4xHEIGHT_MSA(4);
-AOM_SAD_4xHEIGHTx3_MSA(4);
-AOM_SAD_4xHEIGHTx8_MSA(4);
-AOM_SAD_4xHEIGHTx4D_MSA(4);
-AOM_AVGSAD_4xHEIGHT_MSA(4);
+AOM_SAD_4xHEIGHT_MSA(4)
+AOM_SAD_4xHEIGHTx3_MSA(4)
+AOM_SAD_4xHEIGHTx8_MSA(4)
+AOM_SAD_4xHEIGHTx4D_MSA(4)
+AOM_AVGSAD_4xHEIGHT_MSA(4)
+/* clang-format on */
diff --git a/aom_dsp/mips/sub_pixel_variance_msa.c b/aom_dsp/mips/sub_pixel_variance_msa.c
index cfbdb15..3eb8510 100644
--- a/aom_dsp/mips/sub_pixel_variance_msa.c
+++ b/aom_dsp/mips/sub_pixel_variance_msa.c
@@ -1,11 +1,12 @@
 /*
- *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
  *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
 #include "./aom_dsp_rtcd.h"
@@ -1652,23 +1653,25 @@
     return var;                                                               \
   }
 
-AOM_SUB_PIXEL_VARIANCE_WDXHT_MSA(4, 4);
-AOM_SUB_PIXEL_VARIANCE_WDXHT_MSA(4, 8);
+/* clang-format off */
+AOM_SUB_PIXEL_VARIANCE_WDXHT_MSA(4, 4)
+AOM_SUB_PIXEL_VARIANCE_WDXHT_MSA(4, 8)
 
-AOM_SUB_PIXEL_VARIANCE_WDXHT_MSA(8, 4);
-AOM_SUB_PIXEL_VARIANCE_WDXHT_MSA(8, 8);
-AOM_SUB_PIXEL_VARIANCE_WDXHT_MSA(8, 16);
+AOM_SUB_PIXEL_VARIANCE_WDXHT_MSA(8, 4)
+AOM_SUB_PIXEL_VARIANCE_WDXHT_MSA(8, 8)
+AOM_SUB_PIXEL_VARIANCE_WDXHT_MSA(8, 16)
 
-AOM_SUB_PIXEL_VARIANCE_WDXHT_MSA(16, 8);
-AOM_SUB_PIXEL_VARIANCE_WDXHT_MSA(16, 16);
-AOM_SUB_PIXEL_VARIANCE_WDXHT_MSA(16, 32);
+AOM_SUB_PIXEL_VARIANCE_WDXHT_MSA(16, 8)
+AOM_SUB_PIXEL_VARIANCE_WDXHT_MSA(16, 16)
+AOM_SUB_PIXEL_VARIANCE_WDXHT_MSA(16, 32)
 
-AOM_SUB_PIXEL_VARIANCE_WDXHT_MSA(32, 16);
-AOM_SUB_PIXEL_VARIANCE_WDXHT_MSA(32, 32);
-AOM_SUB_PIXEL_VARIANCE_WDXHT_MSA(32, 64);
+AOM_SUB_PIXEL_VARIANCE_WDXHT_MSA(32, 16)
+AOM_SUB_PIXEL_VARIANCE_WDXHT_MSA(32, 32)
+AOM_SUB_PIXEL_VARIANCE_WDXHT_MSA(32, 64)
 
-AOM_SUB_PIXEL_VARIANCE_WDXHT_MSA(64, 32);
-AOM_SUB_PIXEL_VARIANCE_WDXHT_MSA(64, 64);
+AOM_SUB_PIXEL_VARIANCE_WDXHT_MSA(64, 32)
+AOM_SUB_PIXEL_VARIANCE_WDXHT_MSA(64, 64)
+/* clang-format on */
 
 #define AOM_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(wd, ht)                          \
   uint32_t aom_sub_pixel_avg_variance##wd##x##ht##_msa(                       \
@@ -1703,19 +1706,21 @@
     return VARIANCE_##wd##Wx##ht##H(*sse, diff);                              \
   }
 
-AOM_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(4, 4);
-AOM_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(4, 8);
+/* clang-format off */
+AOM_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(4, 4)
+AOM_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(4, 8)
 
-AOM_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(8, 4);
-AOM_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(8, 8);
-AOM_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(8, 16);
+AOM_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(8, 4)
+AOM_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(8, 8)
+AOM_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(8, 16)
 
-AOM_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(16, 8);
-AOM_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(16, 16);
-AOM_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(16, 32);
+AOM_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(16, 8)
+AOM_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(16, 16)
+AOM_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(16, 32)
 
-AOM_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(32, 16);
-AOM_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(32, 32);
+AOM_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(32, 16)
+AOM_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(32, 32)
+/* clang-format on */
 
 uint32_t aom_sub_pixel_avg_variance32x64_msa(const uint8_t *src_ptr,
                                              int32_t src_stride,
@@ -1784,5 +1789,7 @@
     return VARIANCE_64Wx##ht##H(*sse, diff);                                  \
   }
 
-AOM_SUB_PIXEL_AVG_VARIANCE64XHEIGHT_MSA(32);
-AOM_SUB_PIXEL_AVG_VARIANCE64XHEIGHT_MSA(64);
+/* clang-format off */
+AOM_SUB_PIXEL_AVG_VARIANCE64XHEIGHT_MSA(32)
+AOM_SUB_PIXEL_AVG_VARIANCE64XHEIGHT_MSA(64)
+/* clang-format on */
diff --git a/aom_dsp/mips/variance_msa.c b/aom_dsp/mips/variance_msa.c
index 1479363..745fdfc 100644
--- a/aom_dsp/mips/variance_msa.c
+++ b/aom_dsp/mips/variance_msa.c
@@ -540,8 +540,9 @@
     return VARIANCE_##wd##Wx##ht##H(*sse, diff);                               \
   }
 
-AOM_VARIANCE_WDXHT_MSA(4, 4);
-AOM_VARIANCE_WDXHT_MSA(4, 8);
+/* clang-format off */
+AOM_VARIANCE_WDXHT_MSA(4, 4)
+AOM_VARIANCE_WDXHT_MSA(4, 8)
 
 AOM_VARIANCE_WDXHT_MSA(8, 4)
 AOM_VARIANCE_WDXHT_MSA(8, 8)
@@ -553,6 +554,7 @@
 
 AOM_VARIANCE_WDXHT_MSA(32, 16)
 AOM_VARIANCE_WDXHT_MSA(32, 32)
+/* clang-format on */
 
 uint32_t aom_variance32x64_msa(const uint8_t *src, int32_t src_stride,
                                const uint8_t *ref, int32_t ref_stride,
diff --git a/aom_dsp/prob.h b/aom_dsp/prob.h
index 4f25b30..cd133e2 100644
--- a/aom_dsp/prob.h
+++ b/aom_dsp/prob.h
@@ -23,6 +23,9 @@
 
 typedef uint8_t aom_prob;
 
+// TODO(negge): Rename this aom_prob once we remove vpxbool.
+typedef uint16_t aom_cdf_prob;
+
 #define MAX_PROB 255
 
 #define aom_prob_half ((aom_prob)128)
diff --git a/aom_dsp/simd/v128_intrinsics_x86.h b/aom_dsp/simd/v128_intrinsics_x86.h
index 4504996..8319f03 100644
--- a/aom_dsp/simd/v128_intrinsics_x86.h
+++ b/aom_dsp/simd/v128_intrinsics_x86.h
@@ -162,7 +162,11 @@
 
 SIMD_INLINE v128 v128_unziplo_8(v128 a, v128 b) {
 #if defined(__SSSE3__)
+#ifdef __x86_64__
   v128 order = _mm_cvtsi64_si128(0x0e0c0a0806040200LL);
+#else
+  v128 order = _mm_set_epi32(0, 0, 0x0e0c0a08, 0x06040200);
+#endif
   return _mm_unpacklo_epi64(_mm_shuffle_epi8(b, order),
                             _mm_shuffle_epi8(a, order));
 #else
@@ -176,7 +180,11 @@
 
 SIMD_INLINE v128 v128_unziplo_16(v128 a, v128 b) {
 #if defined(__SSSE3__)
+#ifdef __x86_64__
   v128 order = _mm_cvtsi64_si128(0x0d0c090805040100LL);
+#else
+  v128 order = _mm_set_epi32(0, 0, 0x0d0c0908, 0x05040100);
+#endif
   return _mm_unpacklo_epi64(_mm_shuffle_epi8(b, order),
                             _mm_shuffle_epi8(a, order));
 #else
@@ -262,7 +270,7 @@
 
 SIMD_INLINE int64_t v128_dotp_s16(v128 a, v128 b) {
   v128 r = _mm_madd_epi16(a, b);
-#if defined(__SSE4_1__)
+#if defined(__SSE4_1__) && defined(__x86_64__)
   v128 c = _mm_add_epi64(_mm_cvtepi32_epi64(r),
                          _mm_cvtepi32_epi64(_mm_srli_si128(r, 8)));
   return _mm_cvtsi128_si64(_mm_add_epi64(c, _mm_srli_si128(c, 8)));
diff --git a/aom_dsp/simd/v64_intrinsics_x86.h b/aom_dsp/simd/v64_intrinsics_x86.h
index b951492..bef43c4 100644
--- a/aom_dsp/simd/v64_intrinsics_x86.h
+++ b/aom_dsp/simd/v64_intrinsics_x86.h
@@ -47,7 +47,11 @@
 }
 
 SIMD_INLINE v64 v64_from_64(uint64_t x) {
+#ifdef __x86_64__
+  return _mm_cvtsi64_si128(x);
+#else
   return _mm_set_epi32(0, 0, x >> 32, (uint32_t)x);
+#endif
 }
 
 SIMD_INLINE uint64_t v64_u64(v64 x) {
@@ -168,7 +172,7 @@
 SIMD_INLINE v64 v64_unziphi_8(v64 a, v64 b) {
 #if defined(__SSSE3__)
   return _mm_shuffle_epi8(_mm_unpacklo_epi64(b, a),
-                          _mm_cvtsi64_si128(0x0f0d0b0907050301LL));
+                          v64_from_64(0x0f0d0b0907050301LL));
 #else
   return _mm_packus_epi16(
       _mm_unpacklo_epi64(_mm_srli_epi16(b, 8), _mm_srli_epi16(a, 8)),
@@ -179,7 +183,7 @@
 SIMD_INLINE v64 v64_unziplo_8(v64 a, v64 b) {
 #if defined(__SSSE3__)
   return _mm_shuffle_epi8(_mm_unpacklo_epi64(b, a),
-                          _mm_cvtsi64_si128(0x0e0c0a0806040200LL));
+                          v64_from_64(0x0e0c0a0806040200LL));
 #else
   return v64_unziphi_8(_mm_slli_si128(a, 1), _mm_slli_si128(b, 1));
 #endif
@@ -188,7 +192,7 @@
 SIMD_INLINE v64 v64_unziphi_16(v64 a, v64 b) {
 #if defined(__SSSE3__)
   return _mm_shuffle_epi8(_mm_unpacklo_epi64(b, a),
-                          _mm_cvtsi64_si128(0x0f0e0b0a07060302LL));
+                          v64_from_64(0x0f0e0b0a07060302LL));
 #else
   return _mm_packs_epi32(
       _mm_unpacklo_epi64(_mm_srai_epi32(b, 16), _mm_srai_epi32(a, 16)),
@@ -199,7 +203,7 @@
 SIMD_INLINE v64 v64_unziplo_16(v64 a, v64 b) {
 #if defined(__SSSE3__)
   return _mm_shuffle_epi8(_mm_unpacklo_epi64(b, a),
-                          _mm_cvtsi64_si128(0x0d0c090805040100LL));
+                          v64_from_64(0x0d0c090805040100LL));
 #else
   return v64_unziphi_16(_mm_slli_si128(a, 2), _mm_slli_si128(b, 2));
 #endif
@@ -261,7 +265,7 @@
 
 SIMD_INLINE int64_t v64_dotp_s16(v64 a, v64 b) {
   __m128i r = _mm_madd_epi16(a, b);
-#if defined(__SSE4_1__)
+#if defined(__SSE4_1__) && defined(__x86_64__)
   __m128i x = _mm_cvtepi32_epi64(r);
   return _mm_cvtsi128_si64(_mm_add_epi64(x, _mm_srli_si128(x, 8)));
 #else
diff --git a/args.c b/args.c
index e12f16b..5829857 100644
--- a/args.c
+++ b/args.c
@@ -14,6 +14,7 @@
 #include <limits.h>
 #include "args.h"
 
+#include "aom/aom_integer.h"
 #include "aom_ports/msvc.h"
 
 #if defined(__GNUC__) && __GNUC__
@@ -119,13 +120,13 @@
 }
 
 unsigned int arg_parse_uint(const struct arg *arg) {
-  long int rawval;
+  unsigned long rawval;
   char *endptr;
 
-  rawval = strtol(arg->val, &endptr, 10);
+  rawval = strtoul(arg->val, &endptr, 10);
 
   if (arg->val[0] != '\0' && endptr[0] == '\0') {
-    if (rawval >= 0 && rawval <= UINT_MAX) return rawval;
+    if (rawval <= UINT_MAX) return (unsigned int)rawval;
 
-    die("Option %s: Value %ld out of range for unsigned int\n", arg->name,
+    die("Option %s: Value %lu out of range for unsigned int\n", arg->name,
         rawval);
@@ -136,7 +137,7 @@
 }
 
 int arg_parse_int(const struct arg *arg) {
-  long int rawval;
+  long rawval;
   char *endptr;
 
   rawval = strtol(arg->val, &endptr, 10);
diff --git a/av1/av1_common.mk b/av1/av1_common.mk
index 5a283a9..9730bee 100644
--- a/av1/av1_common.mk
+++ b/av1/av1_common.mk
@@ -98,6 +98,8 @@
 ifeq ($(CONFIG_DERING),yes)
 AV1_COMMON_SRCS-yes += common/od_dering.c
 AV1_COMMON_SRCS-yes += common/od_dering.h
+AV1_COMMON_SRCS-$(HAVE_SSE4_1) += common/x86/od_dering_sse4.c
+AV1_COMMON_SRCS-$(HAVE_SSE4_1) += common/x86/od_dering_sse4.h
 AV1_COMMON_SRCS-yes += common/dering.c
 AV1_COMMON_SRCS-yes += common/dering.h
 endif
diff --git a/av1/common/av1_rtcd_defs.pl b/av1/common/av1_rtcd_defs.pl
index be23948..55aee8c 100644
--- a/av1/common/av1_rtcd_defs.pl
+++ b/av1/common/av1_rtcd_defs.pl
@@ -20,6 +20,7 @@
 struct mv;
 union int_mv;
 struct yv12_buffer_config;
+typedef int16_t od_dering_in;
 EOF
 }
 forward_decls qw/av1_common_forward_decls/;
@@ -390,9 +391,6 @@
 add_proto qw/void av1_fht16x16/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
 specialize qw/av1_fht16x16 sse2 avx2/;
 
-add_proto qw/void av1_fht32x32/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
-specialize qw/av1_fht32x32/;
-
 if (aom_config("CONFIG_EXT_TX") eq "yes") {
   add_proto qw/void av1_fht4x8/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
   specialize qw/av1_fht4x8 sse2/;
@@ -411,6 +409,9 @@
 
   add_proto qw/void av1_fht32x16/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
   specialize qw/av1_fht32x16/;
+
+  add_proto qw/void av1_fht32x32/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
+  specialize qw/av1_fht32x32 avx2/;
 }
 
 if (aom_config("CONFIG_EMULATE_HARDWARE") eq "yes") {
@@ -840,4 +841,24 @@
 
 }
 # end encoder functions
+
+# Deringing Functions
+
+if (aom_config("CONFIG_DERING") eq "yes") {
+  add_proto qw/int od_dir_find8/, "const od_dering_in *img, int stride, int32_t *var, int coeff_shift";
+  specialize qw/od_dir_find8 sse4_1/;
+
+  add_proto qw/int od_filter_dering_direction_4x4/, "int16_t *y, int ystride, const int16_t *in, int threshold, int dir";
+  specialize qw/od_filter_dering_direction_4x4 sse4_1/;
+
+  add_proto qw/int od_filter_dering_direction_8x8/, "int16_t *y, int ystride, const int16_t *in, int threshold, int dir";
+  specialize qw/od_filter_dering_direction_8x8 sse4_1/;
+
+  add_proto qw/void od_filter_dering_orthogonal_4x4/, "int16_t *y, int ystride, const int16_t *in, int threshold, int dir";
+  specialize qw/od_filter_dering_orthogonal_4x4 sse4_1/;
+
+  add_proto qw/void od_filter_dering_orthogonal_8x8/, "int16_t *y, int ystride, const int16_t *in, int threshold, int dir";
+  specialize qw/od_filter_dering_orthogonal_8x8 sse4_1/;
+}
+
 1;
diff --git a/av1/common/clpf.c b/av1/common/clpf.c
index 1cf5272..a01e6b4 100644
--- a/av1/common/clpf.c
+++ b/av1/common/clpf.c
@@ -14,14 +14,6 @@
 #include "aom/aom_image.h"
 #include "aom_dsp/aom_dsp_common.h"
 
-int av1_clpf_maxbits(const AV1_COMMON *cm) {
-  return get_msb(
-             ALIGN_POWER_OF_TWO(cm->mi_cols * MI_SIZE, cm->clpf_size + 4) *
-                 ALIGN_POWER_OF_TWO(cm->mi_rows * MI_SIZE, cm->clpf_size + 4) >>
-             (cm->clpf_size * 2 + 8)) +
-         1;
-}
-
 int av1_clpf_sample(int X, int A, int B, int C, int D, int E, int F, int b) {
   int delta = 4 * clamp(A - X, -b, b) + clamp(B - X, -b, b) +
               3 * clamp(C - X, -b, b) + 3 * clamp(D - X, -b, b) +
@@ -73,14 +65,14 @@
 #endif
 
 // Return number of filtered blocks
-int av1_clpf_frame(const YV12_BUFFER_CONFIG *frame,
-                   const YV12_BUFFER_CONFIG *org, AV1_COMMON *cm,
-                   int enable_fb_flag, unsigned int strength,
-                   unsigned int fb_size_log2, uint8_t *blocks, int plane,
-                   int (*decision)(int, int, const YV12_BUFFER_CONFIG *,
-                                   const YV12_BUFFER_CONFIG *,
-                                   const AV1_COMMON *cm, int, int, int,
-                                   unsigned int, unsigned int, uint8_t *)) {
+void av1_clpf_frame(const YV12_BUFFER_CONFIG *frame,
+                    const YV12_BUFFER_CONFIG *org, AV1_COMMON *cm,
+                    int enable_fb_flag, unsigned int strength,
+                    unsigned int fb_size_log2, int plane,
+                    int (*decision)(int, int, const YV12_BUFFER_CONFIG *,
+                                    const YV12_BUFFER_CONFIG *,
+                                    const AV1_COMMON *cm, int, int, int,
+                                    unsigned int, unsigned int, int8_t *)) {
   /* Constrained low-pass filter (CLPF) */
   int c, k, l, m, n;
   const int subx = plane != AOM_PLANE_Y && frame->subsampling_x;
@@ -95,7 +87,6 @@
   int dstride = bs;
   const int num_fb_hor = (width + (1 << fb_size_log2) - 1) >> fb_size_log2;
   const int num_fb_ver = (height + (1 << fb_size_log2) - 1) >> fb_size_log2;
-  int block_index = 0;
   uint8_t *cache = NULL;
   uint8_t **cache_ptr = NULL;
   uint8_t **cache_dst = NULL;
@@ -125,7 +116,7 @@
   for (k = 0; k < num_fb_ver; k++) {
     for (l = 0; l < num_fb_hor; l++) {
       int h, w;
-      int allskip = 1;
+      int allskip = !(enable_fb_flag && fb_size_log2 == MAX_FB_SIZE_LOG2);
       const int xoff = l << fb_size_log2;
       const int yoff = k << fb_size_log2;
       for (m = 0; allskip && m < (1 << fb_size_log2) / bs; m++) {
@@ -148,8 +139,11 @@
       w += !w << fb_size_log2;
       if (!allskip &&  // Do not filter the block if all is skip encoded
           (!enable_fb_flag ||
+           // Only called if fb_flag enabled (luma only)
            decision(k, l, frame, org, cm, bs, w / bs, h / bs, strength,
-                    fb_size_log2, blocks + block_index))) {
+                    fb_size_log2,
+                    cm->clpf_blocks + yoff / MIN_FB_SIZE * cm->clpf_stride +
+                        xoff / MIN_FB_SIZE))) {
         // Iterate over all smaller blocks inside the filter block
         for (m = 0; m < ((h + bs - 1) >> bslog); m++) {
           for (n = 0; n < ((w + bs - 1) >> bslog); n++) {
@@ -160,8 +154,9 @@
             sizey = AOMMIN(height - ypos, bs);
             if (!cm->mi_grid_visible[(ypos << suby) / MI_SIZE * cm->mi_stride +
                                      (xpos << subx) / MI_SIZE]
-                     ->mbmi.skip) {  // Not skip block
-              // Temporary buffering needed if filtering in-place
+                     ->mbmi.skip ||
+                (enable_fb_flag && fb_size_log2 == MAX_FB_SIZE_LOG2)) {
+              // Temporary buffering needed for in-place filtering
               if (cache_ptr[cache_idx]) {
 // Copy filtered block back into the frame
 #if CONFIG_AOM_HIGHBITDEPTH
@@ -247,7 +242,6 @@
           }
         }
       }
-      block_index += !allskip;  // Count number of blocks filtered
     }
   }
 
@@ -287,6 +281,4 @@
   aom_free(cache);
   aom_free(cache_ptr);
   aom_free(cache_dst);
-
-  return block_index;
 }
diff --git a/av1/common/clpf.h b/av1/common/clpf.h
index 8e4213b..fc74f2c 100644
--- a/av1/common/clpf.h
+++ b/av1/common/clpf.h
@@ -13,17 +13,19 @@
 
 #include "av1/common/reconinter.h"
 
-#define MAX_FB_SIZE 128
+#define MAX_FB_SIZE_LOG2 7
+#define MIN_FB_SIZE_LOG2 5
+#define MAX_FB_SIZE (1 << MAX_FB_SIZE_LOG2)
+#define MIN_FB_SIZE (1 << MIN_FB_SIZE_LOG2)
 
-int av1_clpf_maxbits(const AV1_COMMON *cm);
 int av1_clpf_sample(int X, int A, int B, int C, int D, int E, int F, int b);
-int av1_clpf_frame(const YV12_BUFFER_CONFIG *frame,
-                   const YV12_BUFFER_CONFIG *org, AV1_COMMON *cm,
-                   int enable_fb_flag, unsigned int strength,
-                   unsigned int fb_size_log2, uint8_t *blocks, int plane,
-                   int (*decision)(int, int, const YV12_BUFFER_CONFIG *,
-                                   const YV12_BUFFER_CONFIG *,
-                                   const AV1_COMMON *cm, int, int, int,
-                                   unsigned int, unsigned int, uint8_t *));
+void av1_clpf_frame(const YV12_BUFFER_CONFIG *frame,
+                    const YV12_BUFFER_CONFIG *org, AV1_COMMON *cm,
+                    int enable_fb_flag, unsigned int strength,
+                    unsigned int fb_size_log2, int plane,
+                    int (*decision)(int, int, const YV12_BUFFER_CONFIG *,
+                                    const YV12_BUFFER_CONFIG *,
+                                    const AV1_COMMON *cm, int, int, int,
+                                    unsigned int, unsigned int, int8_t *));
 
 #endif
diff --git a/av1/common/dering.c b/av1/common/dering.c
index 7405fb5..c21d4e5 100644
--- a/av1/common/dering.c
+++ b/av1/common/dering.c
@@ -98,30 +98,28 @@
       int nhb, nvb;
       nhb = AOMMIN(MAX_MIB_SIZE, cm->mi_cols - MAX_MIB_SIZE * sbc);
       nvb = AOMMIN(MAX_MIB_SIZE, cm->mi_rows - MAX_MIB_SIZE * sbr);
+      level = compute_level_from_index(
+          global_level, cm->mi_grid_visible[MAX_MIB_SIZE * sbr * cm->mi_stride +
+                                            MAX_MIB_SIZE * sbc]
+                            ->mbmi.dering_gain);
+      if (level == 0 || sb_all_skip(cm, sbr * MAX_MIB_SIZE, sbc * MAX_MIB_SIZE))
+        continue;
       for (pli = 0; pli < 3; pli++) {
         int16_t dst[MAX_MIB_SIZE * MAX_MIB_SIZE * 8 * 8];
         int threshold;
-#if DERING_REFINEMENT
-        level = compute_level_from_index(
-            global_level,
-            cm->mi_grid_visible[MAX_MIB_SIZE * sbr * cm->mi_stride +
-                                MAX_MIB_SIZE * sbc]
-                ->mbmi.dering_gain);
-#else
-          level = global_level;
-#endif
         /* FIXME: This is a temporary hack that uses more conservative
            deringing for chroma. */
-        if (pli) level = (level * 5 + 4) >> 3;
-        if (sb_all_skip(cm, sbr * MAX_MIB_SIZE, sbc * MAX_MIB_SIZE)) level = 0;
-        threshold = level << coeff_shift;
-        od_dering(&OD_DERING_VTBL_C, dst, MAX_MIB_SIZE * bsize[pli],
+        if (pli)
+          threshold = (level * 5 + 4) >> 3 << coeff_shift;
+        else
+          threshold = level << coeff_shift;
+        if (threshold == 0) continue;
+        od_dering(dst, MAX_MIB_SIZE * bsize[pli],
                   &src[pli][sbr * stride * bsize[pli] * MAX_MIB_SIZE +
                             sbc * bsize[pli] * MAX_MIB_SIZE],
                   stride, nhb, nvb, sbc, sbr, nhsb, nvsb, dec[pli], dir, pli,
                   &bskip[MAX_MIB_SIZE * sbr * cm->mi_cols + MAX_MIB_SIZE * sbc],
-                  cm->mi_cols, threshold, OD_DERING_NO_CHECK_OVERLAP,
-                  coeff_shift);
+                  cm->mi_cols, threshold, coeff_shift);
         for (r = 0; r < bsize[pli] * nvb; ++r) {
           for (c = 0; c < bsize[pli] * nhb; ++c) {
 #if CONFIG_AOM_HIGHBITDEPTH
diff --git a/av1/common/dering.h b/av1/common/dering.h
index 2c1efd7..7c93f8b 100644
--- a/av1/common/dering.h
+++ b/av1/common/dering.h
@@ -24,7 +24,6 @@
 #define DERING_LEVEL_BITS 6
 #define MAX_DERING_LEVEL (1 << DERING_LEVEL_BITS)
 
-#define DERING_REFINEMENT 1
 #define DERING_REFINEMENT_BITS 2
 #define DERING_REFINEMENT_LEVELS 4
 
diff --git a/av1/common/entropy.c b/av1/common/entropy.c
index c9166db..1defc53 100644
--- a/av1/common/entropy.c
+++ b/av1/common/entropy.c
@@ -418,263 +418,264 @@
 // beta = 8
 // Values for tokens ONE_TOKEN through CATEGORY6_TOKEN included here.
 // ZERO_TOKEN and EOB_TOKEN are coded as flags outside this coder.
-const AnsP10 av1_pareto8_token_probs[COEFF_PROB_MODELS][ENTROPY_TOKENS - 2] = {
-  { 4, 4, 4, 4, 8, 15, 30, 57, 103, 795 },
-  { 8, 8, 8, 8, 15, 30, 57, 103, 168, 619 },
-  { 12, 12, 12, 12, 23, 43, 80, 138, 205, 487 },
-  { 16, 16, 15, 15, 30, 56, 101, 165, 225, 385 },
-  { 20, 20, 19, 19, 36, 68, 119, 186, 231, 306 },
-  { 24, 23, 23, 22, 43, 79, 135, 201, 230, 244 },
-  { 28, 27, 26, 26, 49, 89, 149, 211, 223, 196 },
-  { 32, 31, 30, 29, 55, 98, 160, 218, 212, 159 },
-  { 36, 35, 33, 32, 60, 107, 171, 221, 200, 129 },
-  { 40, 38, 37, 35, 66, 115, 179, 222, 187, 105 },
-  { 44, 42, 40, 38, 71, 122, 186, 221, 174, 86 },
-  { 48, 45, 43, 41, 76, 129, 192, 219, 160, 71 },
-  { 52, 49, 46, 44, 80, 136, 196, 215, 148, 58 },
-  { 56, 53, 49, 46, 85, 142, 200, 210, 135, 48 },
-  { 60, 56, 52, 49, 89, 147, 203, 204, 124, 40 },
-  { 64, 60, 55, 52, 93, 151, 205, 198, 113, 33 },
-  { 68, 63, 58, 54, 97, 156, 205, 192, 103, 28 },
-  { 72, 66, 61, 57, 100, 160, 206, 185, 94, 23 },
-  { 76, 70, 64, 59, 104, 163, 205, 178, 85, 20 },
-  { 80, 73, 67, 61, 107, 166, 205, 171, 77, 17 },
-  { 84, 76, 69, 63, 110, 169, 204, 164, 71, 14 },
-  { 88, 80, 72, 65, 113, 171, 202, 157, 64, 12 },
-  { 92, 83, 75, 67, 116, 173, 200, 150, 58, 10 },
-  { 96, 86, 77, 69, 118, 175, 198, 143, 53, 9 },
-  { 100, 89, 80, 71, 121, 176, 195, 137, 48, 7 },
-  { 104, 92, 82, 73, 123, 178, 192, 130, 44, 6 },
-  { 108, 96, 84, 75, 125, 178, 189, 124, 40, 5 },
-  { 112, 98, 87, 76, 127, 179, 186, 118, 36, 5 },
-  { 116, 101, 89, 78, 129, 179, 183, 112, 33, 4 },
-  { 120, 104, 91, 80, 131, 180, 179, 106, 30, 3 },
-  { 124, 107, 93, 81, 132, 180, 176, 101, 27, 3 },
-  { 128, 110, 95, 82, 134, 179, 172, 96, 25, 3 },
-  { 132, 113, 97, 84, 135, 179, 168, 91, 23, 2 },
-  { 136, 116, 99, 85, 136, 179, 164, 86, 21, 2 },
-  { 140, 119, 101, 86, 137, 178, 160, 82, 19, 2 },
-  { 144, 122, 103, 88, 138, 177, 157, 77, 17, 1 },
-  { 148, 124, 105, 89, 139, 176, 153, 73, 16, 1 },
-  { 152, 127, 107, 90, 140, 175, 149, 69, 14, 1 },
-  { 156, 130, 108, 91, 141, 173, 145, 66, 13, 1 },
-  { 160, 133, 110, 92, 141, 172, 141, 62, 12, 1 },
-  { 164, 135, 111, 93, 142, 171, 137, 59, 11, 1 },
-  { 168, 138, 113, 94, 142, 169, 133, 56, 10, 1 },
-  { 172, 140, 115, 94, 142, 168, 130, 53, 9, 1 },
-  { 176, 143, 116, 95, 143, 166, 126, 50, 8, 1 },
-  { 180, 145, 118, 96, 143, 164, 122, 47, 8, 1 },
-  { 184, 147, 119, 96, 143, 163, 119, 45, 7, 1 },
-  { 188, 150, 120, 97, 143, 161, 116, 42, 6, 1 },
-  { 192, 152, 121, 98, 143, 159, 112, 40, 6, 1 },
-  { 196, 155, 123, 98, 142, 157, 109, 38, 5, 1 },
-  { 200, 157, 124, 99, 142, 155, 105, 36, 5, 1 },
-  { 204, 159, 125, 99, 142, 153, 102, 34, 5, 1 },
-  { 208, 161, 126, 100, 142, 151, 99, 32, 4, 1 },
-  { 212, 164, 127, 100, 141, 149, 96, 30, 4, 1 },
-  { 216, 166, 129, 100, 141, 147, 93, 28, 3, 1 },
-  { 220, 168, 130, 101, 140, 144, 90, 27, 3, 1 },
-  { 224, 170, 131, 101, 140, 142, 87, 25, 3, 1 },
-  { 228, 172, 132, 101, 139, 140, 84, 24, 3, 1 },
-  { 232, 174, 132, 101, 139, 138, 81, 23, 3, 1 },
-  { 236, 176, 133, 101, 138, 136, 79, 22, 2, 1 },
-  { 240, 178, 134, 102, 137, 134, 76, 20, 2, 1 },
-  { 244, 180, 135, 102, 136, 131, 74, 19, 2, 1 },
-  { 248, 182, 135, 102, 136, 129, 71, 18, 2, 1 },
-  { 252, 184, 136, 101, 135, 127, 69, 17, 2, 1 },
-  { 256, 186, 137, 102, 134, 124, 66, 16, 2, 1 },
-  { 260, 188, 138, 102, 133, 122, 64, 15, 1, 1 },
-  { 264, 190, 138, 101, 132, 120, 62, 15, 1, 1 },
-  { 268, 191, 139, 101, 131, 118, 60, 14, 1, 1 },
-  { 272, 193, 139, 101, 130, 116, 58, 13, 1, 1 },
-  { 276, 195, 139, 101, 129, 114, 56, 12, 1, 1 },
-  { 280, 196, 140, 101, 128, 111, 54, 12, 1, 1 },
-  { 284, 198, 140, 101, 127, 109, 52, 11, 1, 1 },
-  { 288, 200, 141, 100, 126, 107, 50, 10, 1, 1 },
-  { 292, 201, 141, 100, 125, 105, 48, 10, 1, 1 },
-  { 296, 203, 141, 100, 123, 103, 47, 9, 1, 1 },
-  { 300, 204, 142, 99, 122, 101, 45, 9, 1, 1 },
-  { 304, 206, 142, 99, 121, 99, 43, 8, 1, 1 },
-  { 308, 207, 142, 99, 119, 97, 42, 8, 1, 1 },
-  { 312, 209, 142, 99, 118, 95, 40, 7, 1, 1 },
-  { 316, 210, 142, 98, 117, 93, 39, 7, 1, 1 },
-  { 320, 211, 142, 98, 116, 91, 37, 7, 1, 1 },
-  { 324, 213, 142, 97, 115, 89, 36, 6, 1, 1 },
-  { 328, 214, 142, 97, 113, 87, 35, 6, 1, 1 },
-  { 332, 215, 143, 96, 112, 85, 33, 6, 1, 1 },
-  { 336, 216, 143, 96, 111, 83, 32, 5, 1, 1 },
-  { 340, 218, 143, 95, 109, 81, 31, 5, 1, 1 },
-  { 344, 219, 142, 95, 108, 79, 30, 5, 1, 1 },
-  { 348, 220, 142, 94, 107, 78, 29, 4, 1, 1 },
-  { 352, 221, 142, 94, 105, 76, 28, 4, 1, 1 },
-  { 356, 222, 142, 93, 104, 74, 27, 4, 1, 1 },
-  { 360, 223, 142, 92, 103, 72, 26, 4, 1, 1 },
-  { 364, 224, 142, 92, 101, 70, 25, 4, 1, 1 },
-  { 368, 225, 142, 91, 100, 69, 24, 3, 1, 1 },
-  { 372, 226, 141, 91, 99, 67, 23, 3, 1, 1 },
-  { 376, 227, 141, 90, 97, 66, 22, 3, 1, 1 },
-  { 380, 228, 141, 89, 96, 64, 21, 3, 1, 1 },
-  { 384, 229, 140, 89, 95, 62, 20, 3, 1, 1 },
-  { 388, 229, 140, 88, 93, 61, 20, 3, 1, 1 },
-  { 392, 230, 140, 87, 92, 60, 19, 2, 1, 1 },
-  { 396, 231, 140, 86, 91, 58, 18, 2, 1, 1 },
-  { 400, 232, 139, 86, 89, 57, 17, 2, 1, 1 },
-  { 404, 232, 139, 85, 88, 55, 17, 2, 1, 1 },
-  { 408, 233, 138, 84, 87, 54, 16, 2, 1, 1 },
-  { 412, 234, 138, 84, 85, 52, 15, 2, 1, 1 },
-  { 416, 234, 137, 83, 84, 51, 15, 2, 1, 1 },
-  { 420, 235, 137, 82, 82, 50, 14, 2, 1, 1 },
-  { 424, 236, 136, 81, 81, 48, 14, 2, 1, 1 },
-  { 428, 236, 136, 81, 80, 47, 13, 1, 1, 1 },
-  { 432, 236, 135, 80, 79, 46, 13, 1, 1, 1 },
-  { 436, 237, 135, 79, 77, 45, 12, 1, 1, 1 },
-  { 440, 238, 134, 78, 76, 43, 12, 1, 1, 1 },
-  { 444, 238, 134, 77, 75, 42, 11, 1, 1, 1 },
-  { 448, 238, 133, 77, 73, 41, 11, 1, 1, 1 },
-  { 452, 239, 132, 76, 72, 40, 10, 1, 1, 1 },
-  { 456, 239, 131, 75, 71, 39, 10, 1, 1, 1 },
-  { 460, 239, 131, 74, 70, 38, 9, 1, 1, 1 },
-  { 464, 240, 130, 73, 68, 37, 9, 1, 1, 1 },
-  { 468, 240, 129, 72, 67, 36, 9, 1, 1, 1 },
-  { 472, 240, 128, 72, 66, 35, 8, 1, 1, 1 },
-  { 476, 240, 127, 71, 65, 34, 8, 1, 1, 1 },
-  { 480, 240, 127, 70, 63, 33, 8, 1, 1, 1 },
-  { 484, 241, 126, 69, 62, 32, 7, 1, 1, 1 },
-  { 488, 241, 125, 68, 61, 31, 7, 1, 1, 1 },
-  { 492, 241, 124, 67, 60, 30, 7, 1, 1, 1 },
-  { 496, 241, 124, 66, 59, 29, 6, 1, 1, 1 },
-  { 500, 240, 123, 66, 58, 28, 6, 1, 1, 1 },
-  { 504, 240, 122, 65, 57, 27, 6, 1, 1, 1 },
-  { 508, 240, 121, 64, 55, 27, 6, 1, 1, 1 },
-  { 512, 241, 120, 63, 54, 26, 5, 1, 1, 1 },
-  { 516, 241, 119, 62, 53, 25, 5, 1, 1, 1 },
-  { 520, 240, 118, 62, 52, 24, 5, 1, 1, 1 },
-  { 524, 240, 117, 60, 51, 24, 5, 1, 1, 1 },
-  { 528, 239, 116, 60, 50, 23, 5, 1, 1, 1 },
-  { 532, 239, 116, 59, 49, 22, 4, 1, 1, 1 },
-  { 536, 239, 115, 58, 48, 21, 4, 1, 1, 1 },
-  { 540, 239, 113, 57, 47, 21, 4, 1, 1, 1 },
-  { 544, 238, 113, 56, 46, 20, 4, 1, 1, 1 },
-  { 548, 238, 112, 55, 45, 19, 4, 1, 1, 1 },
-  { 552, 238, 110, 55, 44, 19, 3, 1, 1, 1 },
-  { 556, 237, 110, 54, 43, 18, 3, 1, 1, 1 },
-  { 560, 237, 108, 53, 42, 18, 3, 1, 1, 1 },
-  { 564, 236, 108, 52, 41, 17, 3, 1, 1, 1 },
-  { 568, 236, 106, 51, 40, 17, 3, 1, 1, 1 },
-  { 572, 235, 105, 51, 39, 16, 3, 1, 1, 1 },
-  { 576, 235, 104, 50, 38, 15, 3, 1, 1, 1 },
-  { 580, 234, 103, 49, 37, 15, 3, 1, 1, 1 },
-  { 584, 234, 102, 48, 37, 14, 2, 1, 1, 1 },
-  { 588, 233, 101, 47, 36, 14, 2, 1, 1, 1 },
-  { 592, 233, 100, 46, 35, 13, 2, 1, 1, 1 },
-  { 596, 231, 99, 46, 34, 13, 2, 1, 1, 1 },
-  { 600, 230, 98, 45, 33, 13, 2, 1, 1, 1 },
-  { 604, 230, 97, 44, 32, 12, 2, 1, 1, 1 },
-  { 608, 229, 96, 43, 31, 12, 2, 1, 1, 1 },
-  { 612, 228, 95, 42, 31, 11, 2, 1, 1, 1 },
-  { 616, 227, 93, 42, 30, 11, 2, 1, 1, 1 },
-  { 620, 227, 92, 41, 29, 10, 2, 1, 1, 1 },
-  { 624, 226, 92, 40, 28, 10, 1, 1, 1, 1 },
-  { 628, 225, 90, 39, 28, 10, 1, 1, 1, 1 },
-  { 632, 224, 89, 39, 27, 9, 1, 1, 1, 1 },
-  { 636, 223, 88, 38, 26, 9, 1, 1, 1, 1 },
-  { 640, 222, 87, 37, 25, 9, 1, 1, 1, 1 },
-  { 644, 221, 86, 36, 25, 8, 1, 1, 1, 1 },
-  { 648, 220, 84, 36, 24, 8, 1, 1, 1, 1 },
-  { 652, 219, 83, 35, 23, 8, 1, 1, 1, 1 },
-  { 656, 218, 82, 34, 23, 7, 1, 1, 1, 1 },
-  { 660, 217, 81, 33, 22, 7, 1, 1, 1, 1 },
-  { 664, 215, 80, 33, 21, 7, 1, 1, 1, 1 },
-  { 668, 214, 78, 32, 21, 7, 1, 1, 1, 1 },
-  { 672, 213, 78, 31, 20, 6, 1, 1, 1, 1 },
-  { 676, 211, 76, 31, 20, 6, 1, 1, 1, 1 },
-  { 680, 210, 75, 30, 19, 6, 1, 1, 1, 1 },
-  { 684, 209, 74, 29, 18, 6, 1, 1, 1, 1 },
-  { 688, 208, 73, 28, 18, 5, 1, 1, 1, 1 },
-  { 692, 206, 72, 28, 17, 5, 1, 1, 1, 1 },
-  { 696, 205, 70, 27, 17, 5, 1, 1, 1, 1 },
-  { 700, 203, 69, 27, 16, 5, 1, 1, 1, 1 },
-  { 704, 201, 68, 26, 16, 5, 1, 1, 1, 1 },
-  { 708, 201, 67, 25, 15, 4, 1, 1, 1, 1 },
-  { 712, 198, 66, 25, 15, 4, 1, 1, 1, 1 },
-  { 716, 197, 65, 24, 14, 4, 1, 1, 1, 1 },
-  { 720, 196, 63, 23, 14, 4, 1, 1, 1, 1 },
-  { 724, 194, 62, 23, 13, 4, 1, 1, 1, 1 },
-  { 728, 193, 61, 22, 13, 3, 1, 1, 1, 1 },
-  { 732, 191, 60, 22, 12, 3, 1, 1, 1, 1 },
-  { 736, 189, 59, 21, 12, 3, 1, 1, 1, 1 },
-  { 740, 188, 58, 20, 11, 3, 1, 1, 1, 1 },
-  { 744, 186, 56, 20, 11, 3, 1, 1, 1, 1 },
-  { 748, 184, 55, 19, 11, 3, 1, 1, 1, 1 },
-  { 752, 182, 54, 19, 10, 3, 1, 1, 1, 1 },
-  { 756, 181, 53, 18, 10, 2, 1, 1, 1, 1 },
-  { 760, 179, 52, 18, 9, 2, 1, 1, 1, 1 },
-  { 764, 177, 51, 17, 9, 2, 1, 1, 1, 1 },
-  { 768, 174, 50, 17, 9, 2, 1, 1, 1, 1 },
-  { 772, 173, 49, 16, 8, 2, 1, 1, 1, 1 },
-  { 776, 171, 47, 16, 8, 2, 1, 1, 1, 1 },
-  { 780, 169, 46, 15, 8, 2, 1, 1, 1, 1 },
-  { 784, 167, 45, 15, 7, 2, 1, 1, 1, 1 },
-  { 788, 165, 44, 14, 7, 2, 1, 1, 1, 1 },
-  { 792, 162, 43, 14, 7, 2, 1, 1, 1, 1 },
-  { 796, 161, 42, 13, 7, 1, 1, 1, 1, 1 },
-  { 800, 159, 41, 13, 6, 1, 1, 1, 1, 1 },
-  { 804, 157, 40, 12, 6, 1, 1, 1, 1, 1 },
-  { 808, 154, 39, 12, 6, 1, 1, 1, 1, 1 },
-  { 812, 153, 38, 11, 5, 1, 1, 1, 1, 1 },
-  { 816, 150, 37, 11, 5, 1, 1, 1, 1, 1 },
-  { 820, 148, 36, 10, 5, 1, 1, 1, 1, 1 },
-  { 824, 145, 35, 10, 5, 1, 1, 1, 1, 1 },
-  { 828, 143, 34, 10, 4, 1, 1, 1, 1, 1 },
-  { 832, 141, 33, 9, 4, 1, 1, 1, 1, 1 },
-  { 836, 138, 32, 9, 4, 1, 1, 1, 1, 1 },
-  { 840, 136, 30, 9, 4, 1, 1, 1, 1, 1 },
-  { 844, 133, 30, 8, 4, 1, 1, 1, 1, 1 },
-  { 848, 131, 29, 8, 3, 1, 1, 1, 1, 1 },
-  { 852, 129, 28, 7, 3, 1, 1, 1, 1, 1 },
-  { 856, 126, 27, 7, 3, 1, 1, 1, 1, 1 },
-  { 860, 123, 26, 7, 3, 1, 1, 1, 1, 1 },
-  { 864, 121, 25, 6, 3, 1, 1, 1, 1, 1 },
-  { 868, 118, 24, 6, 3, 1, 1, 1, 1, 1 },
-  { 872, 116, 23, 6, 2, 1, 1, 1, 1, 1 },
-  { 876, 113, 22, 6, 2, 1, 1, 1, 1, 1 },
-  { 880, 111, 21, 5, 2, 1, 1, 1, 1, 1 },
-  { 884, 108, 20, 5, 2, 1, 1, 1, 1, 1 },
-  { 888, 105, 19, 5, 2, 1, 1, 1, 1, 1 },
-  { 892, 102, 19, 4, 2, 1, 1, 1, 1, 1 },
-  { 896, 99, 18, 4, 2, 1, 1, 1, 1, 1 },
-  { 900, 97, 17, 4, 1, 1, 1, 1, 1, 1 },
-  { 904, 94, 16, 4, 1, 1, 1, 1, 1, 1 },
-  { 908, 92, 15, 3, 1, 1, 1, 1, 1, 1 },
-  { 912, 89, 14, 3, 1, 1, 1, 1, 1, 1 },
-  { 916, 85, 14, 3, 1, 1, 1, 1, 1, 1 },
-  { 920, 82, 13, 3, 1, 1, 1, 1, 1, 1 },
-  { 924, 79, 12, 3, 1, 1, 1, 1, 1, 1 },
-  { 928, 77, 11, 2, 1, 1, 1, 1, 1, 1 },
-  { 932, 73, 11, 2, 1, 1, 1, 1, 1, 1 },
-  { 936, 70, 10, 2, 1, 1, 1, 1, 1, 1 },
-  { 940, 67, 9, 2, 1, 1, 1, 1, 1, 1 },
-  { 944, 64, 8, 2, 1, 1, 1, 1, 1, 1 },
-  { 948, 60, 8, 2, 1, 1, 1, 1, 1, 1 },
-  { 952, 58, 7, 1, 1, 1, 1, 1, 1, 1 },
-  { 956, 54, 7, 1, 1, 1, 1, 1, 1, 1 },
-  { 960, 51, 6, 1, 1, 1, 1, 1, 1, 1 },
-  { 964, 48, 5, 1, 1, 1, 1, 1, 1, 1 },
-  { 968, 44, 5, 1, 1, 1, 1, 1, 1, 1 },
-  { 972, 41, 4, 1, 1, 1, 1, 1, 1, 1 },
-  { 976, 37, 4, 1, 1, 1, 1, 1, 1, 1 },
-  { 980, 34, 3, 1, 1, 1, 1, 1, 1, 1 },
-  { 984, 30, 3, 1, 1, 1, 1, 1, 1, 1 },
-  { 988, 27, 2, 1, 1, 1, 1, 1, 1, 1 },
-  { 992, 23, 2, 1, 1, 1, 1, 1, 1, 1 },
-  { 996, 19, 2, 1, 1, 1, 1, 1, 1, 1 },
-  { 1000, 16, 1, 1, 1, 1, 1, 1, 1, 1 },
-  { 1004, 12, 1, 1, 1, 1, 1, 1, 1, 1 },
-  { 1008, 8, 1, 1, 1, 1, 1, 1, 1, 1 },
-  { 1012, 4, 1, 1, 1, 1, 1, 1, 1, 1 },
-  { 1015, 1, 1, 1, 1, 1, 1, 1, 1, 1 },
-  { 1015, 1, 1, 1, 1, 1, 1, 1, 1, 1 },
-};
+const aom_cdf_prob
+    av1_pareto8_token_probs[COEFF_PROB_MODELS][ENTROPY_TOKENS - 2] = {
+      { 4, 4, 4, 4, 8, 15, 30, 57, 103, 795 },
+      { 8, 8, 8, 8, 15, 30, 57, 103, 168, 619 },
+      { 12, 12, 12, 12, 23, 43, 80, 138, 205, 487 },
+      { 16, 16, 15, 15, 30, 56, 101, 165, 225, 385 },
+      { 20, 20, 19, 19, 36, 68, 119, 186, 231, 306 },
+      { 24, 23, 23, 22, 43, 79, 135, 201, 230, 244 },
+      { 28, 27, 26, 26, 49, 89, 149, 211, 223, 196 },
+      { 32, 31, 30, 29, 55, 98, 160, 218, 212, 159 },
+      { 36, 35, 33, 32, 60, 107, 171, 221, 200, 129 },
+      { 40, 38, 37, 35, 66, 115, 179, 222, 187, 105 },
+      { 44, 42, 40, 38, 71, 122, 186, 221, 174, 86 },
+      { 48, 45, 43, 41, 76, 129, 192, 219, 160, 71 },
+      { 52, 49, 46, 44, 80, 136, 196, 215, 148, 58 },
+      { 56, 53, 49, 46, 85, 142, 200, 210, 135, 48 },
+      { 60, 56, 52, 49, 89, 147, 203, 204, 124, 40 },
+      { 64, 60, 55, 52, 93, 151, 205, 198, 113, 33 },
+      { 68, 63, 58, 54, 97, 156, 205, 192, 103, 28 },
+      { 72, 66, 61, 57, 100, 160, 206, 185, 94, 23 },
+      { 76, 70, 64, 59, 104, 163, 205, 178, 85, 20 },
+      { 80, 73, 67, 61, 107, 166, 205, 171, 77, 17 },
+      { 84, 76, 69, 63, 110, 169, 204, 164, 71, 14 },
+      { 88, 80, 72, 65, 113, 171, 202, 157, 64, 12 },
+      { 92, 83, 75, 67, 116, 173, 200, 150, 58, 10 },
+      { 96, 86, 77, 69, 118, 175, 198, 143, 53, 9 },
+      { 100, 89, 80, 71, 121, 176, 195, 137, 48, 7 },
+      { 104, 92, 82, 73, 123, 178, 192, 130, 44, 6 },
+      { 108, 96, 84, 75, 125, 178, 189, 124, 40, 5 },
+      { 112, 98, 87, 76, 127, 179, 186, 118, 36, 5 },
+      { 116, 101, 89, 78, 129, 179, 183, 112, 33, 4 },
+      { 120, 104, 91, 80, 131, 180, 179, 106, 30, 3 },
+      { 124, 107, 93, 81, 132, 180, 176, 101, 27, 3 },
+      { 128, 110, 95, 82, 134, 179, 172, 96, 25, 3 },
+      { 132, 113, 97, 84, 135, 179, 168, 91, 23, 2 },
+      { 136, 116, 99, 85, 136, 179, 164, 86, 21, 2 },
+      { 140, 119, 101, 86, 137, 178, 160, 82, 19, 2 },
+      { 144, 122, 103, 88, 138, 177, 157, 77, 17, 1 },
+      { 148, 124, 105, 89, 139, 176, 153, 73, 16, 1 },
+      { 152, 127, 107, 90, 140, 175, 149, 69, 14, 1 },
+      { 156, 130, 108, 91, 141, 173, 145, 66, 13, 1 },
+      { 160, 133, 110, 92, 141, 172, 141, 62, 12, 1 },
+      { 164, 135, 111, 93, 142, 171, 137, 59, 11, 1 },
+      { 168, 138, 113, 94, 142, 169, 133, 56, 10, 1 },
+      { 172, 140, 115, 94, 142, 168, 130, 53, 9, 1 },
+      { 176, 143, 116, 95, 143, 166, 126, 50, 8, 1 },
+      { 180, 145, 118, 96, 143, 164, 122, 47, 8, 1 },
+      { 184, 147, 119, 96, 143, 163, 119, 45, 7, 1 },
+      { 188, 150, 120, 97, 143, 161, 116, 42, 6, 1 },
+      { 192, 152, 121, 98, 143, 159, 112, 40, 6, 1 },
+      { 196, 155, 123, 98, 142, 157, 109, 38, 5, 1 },
+      { 200, 157, 124, 99, 142, 155, 105, 36, 5, 1 },
+      { 204, 159, 125, 99, 142, 153, 102, 34, 5, 1 },
+      { 208, 161, 126, 100, 142, 151, 99, 32, 4, 1 },
+      { 212, 164, 127, 100, 141, 149, 96, 30, 4, 1 },
+      { 216, 166, 129, 100, 141, 147, 93, 28, 3, 1 },
+      { 220, 168, 130, 101, 140, 144, 90, 27, 3, 1 },
+      { 224, 170, 131, 101, 140, 142, 87, 25, 3, 1 },
+      { 228, 172, 132, 101, 139, 140, 84, 24, 3, 1 },
+      { 232, 174, 132, 101, 139, 138, 81, 23, 3, 1 },
+      { 236, 176, 133, 101, 138, 136, 79, 22, 2, 1 },
+      { 240, 178, 134, 102, 137, 134, 76, 20, 2, 1 },
+      { 244, 180, 135, 102, 136, 131, 74, 19, 2, 1 },
+      { 248, 182, 135, 102, 136, 129, 71, 18, 2, 1 },
+      { 252, 184, 136, 101, 135, 127, 69, 17, 2, 1 },
+      { 256, 186, 137, 102, 134, 124, 66, 16, 2, 1 },
+      { 260, 188, 138, 102, 133, 122, 64, 15, 1, 1 },
+      { 264, 190, 138, 101, 132, 120, 62, 15, 1, 1 },
+      { 268, 191, 139, 101, 131, 118, 60, 14, 1, 1 },
+      { 272, 193, 139, 101, 130, 116, 58, 13, 1, 1 },
+      { 276, 195, 139, 101, 129, 114, 56, 12, 1, 1 },
+      { 280, 196, 140, 101, 128, 111, 54, 12, 1, 1 },
+      { 284, 198, 140, 101, 127, 109, 52, 11, 1, 1 },
+      { 288, 200, 141, 100, 126, 107, 50, 10, 1, 1 },
+      { 292, 201, 141, 100, 125, 105, 48, 10, 1, 1 },
+      { 296, 203, 141, 100, 123, 103, 47, 9, 1, 1 },
+      { 300, 204, 142, 99, 122, 101, 45, 9, 1, 1 },
+      { 304, 206, 142, 99, 121, 99, 43, 8, 1, 1 },
+      { 308, 207, 142, 99, 119, 97, 42, 8, 1, 1 },
+      { 312, 209, 142, 99, 118, 95, 40, 7, 1, 1 },
+      { 316, 210, 142, 98, 117, 93, 39, 7, 1, 1 },
+      { 320, 211, 142, 98, 116, 91, 37, 7, 1, 1 },
+      { 324, 213, 142, 97, 115, 89, 36, 6, 1, 1 },
+      { 328, 214, 142, 97, 113, 87, 35, 6, 1, 1 },
+      { 332, 215, 143, 96, 112, 85, 33, 6, 1, 1 },
+      { 336, 216, 143, 96, 111, 83, 32, 5, 1, 1 },
+      { 340, 218, 143, 95, 109, 81, 31, 5, 1, 1 },
+      { 344, 219, 142, 95, 108, 79, 30, 5, 1, 1 },
+      { 348, 220, 142, 94, 107, 78, 29, 4, 1, 1 },
+      { 352, 221, 142, 94, 105, 76, 28, 4, 1, 1 },
+      { 356, 222, 142, 93, 104, 74, 27, 4, 1, 1 },
+      { 360, 223, 142, 92, 103, 72, 26, 4, 1, 1 },
+      { 364, 224, 142, 92, 101, 70, 25, 4, 1, 1 },
+      { 368, 225, 142, 91, 100, 69, 24, 3, 1, 1 },
+      { 372, 226, 141, 91, 99, 67, 23, 3, 1, 1 },
+      { 376, 227, 141, 90, 97, 66, 22, 3, 1, 1 },
+      { 380, 228, 141, 89, 96, 64, 21, 3, 1, 1 },
+      { 384, 229, 140, 89, 95, 62, 20, 3, 1, 1 },
+      { 388, 229, 140, 88, 93, 61, 20, 3, 1, 1 },
+      { 392, 230, 140, 87, 92, 60, 19, 2, 1, 1 },
+      { 396, 231, 140, 86, 91, 58, 18, 2, 1, 1 },
+      { 400, 232, 139, 86, 89, 57, 17, 2, 1, 1 },
+      { 404, 232, 139, 85, 88, 55, 17, 2, 1, 1 },
+      { 408, 233, 138, 84, 87, 54, 16, 2, 1, 1 },
+      { 412, 234, 138, 84, 85, 52, 15, 2, 1, 1 },
+      { 416, 234, 137, 83, 84, 51, 15, 2, 1, 1 },
+      { 420, 235, 137, 82, 82, 50, 14, 2, 1, 1 },
+      { 424, 236, 136, 81, 81, 48, 14, 2, 1, 1 },
+      { 428, 236, 136, 81, 80, 47, 13, 1, 1, 1 },
+      { 432, 236, 135, 80, 79, 46, 13, 1, 1, 1 },
+      { 436, 237, 135, 79, 77, 45, 12, 1, 1, 1 },
+      { 440, 238, 134, 78, 76, 43, 12, 1, 1, 1 },
+      { 444, 238, 134, 77, 75, 42, 11, 1, 1, 1 },
+      { 448, 238, 133, 77, 73, 41, 11, 1, 1, 1 },
+      { 452, 239, 132, 76, 72, 40, 10, 1, 1, 1 },
+      { 456, 239, 131, 75, 71, 39, 10, 1, 1, 1 },
+      { 460, 239, 131, 74, 70, 38, 9, 1, 1, 1 },
+      { 464, 240, 130, 73, 68, 37, 9, 1, 1, 1 },
+      { 468, 240, 129, 72, 67, 36, 9, 1, 1, 1 },
+      { 472, 240, 128, 72, 66, 35, 8, 1, 1, 1 },
+      { 476, 240, 127, 71, 65, 34, 8, 1, 1, 1 },
+      { 480, 240, 127, 70, 63, 33, 8, 1, 1, 1 },
+      { 484, 241, 126, 69, 62, 32, 7, 1, 1, 1 },
+      { 488, 241, 125, 68, 61, 31, 7, 1, 1, 1 },
+      { 492, 241, 124, 67, 60, 30, 7, 1, 1, 1 },
+      { 496, 241, 124, 66, 59, 29, 6, 1, 1, 1 },
+      { 500, 240, 123, 66, 58, 28, 6, 1, 1, 1 },
+      { 504, 240, 122, 65, 57, 27, 6, 1, 1, 1 },
+      { 508, 240, 121, 64, 55, 27, 6, 1, 1, 1 },
+      { 512, 241, 120, 63, 54, 26, 5, 1, 1, 1 },
+      { 516, 241, 119, 62, 53, 25, 5, 1, 1, 1 },
+      { 520, 240, 118, 62, 52, 24, 5, 1, 1, 1 },
+      { 524, 240, 117, 60, 51, 24, 5, 1, 1, 1 },
+      { 528, 239, 116, 60, 50, 23, 5, 1, 1, 1 },
+      { 532, 239, 116, 59, 49, 22, 4, 1, 1, 1 },
+      { 536, 239, 115, 58, 48, 21, 4, 1, 1, 1 },
+      { 540, 239, 113, 57, 47, 21, 4, 1, 1, 1 },
+      { 544, 238, 113, 56, 46, 20, 4, 1, 1, 1 },
+      { 548, 238, 112, 55, 45, 19, 4, 1, 1, 1 },
+      { 552, 238, 110, 55, 44, 19, 3, 1, 1, 1 },
+      { 556, 237, 110, 54, 43, 18, 3, 1, 1, 1 },
+      { 560, 237, 108, 53, 42, 18, 3, 1, 1, 1 },
+      { 564, 236, 108, 52, 41, 17, 3, 1, 1, 1 },
+      { 568, 236, 106, 51, 40, 17, 3, 1, 1, 1 },
+      { 572, 235, 105, 51, 39, 16, 3, 1, 1, 1 },
+      { 576, 235, 104, 50, 38, 15, 3, 1, 1, 1 },
+      { 580, 234, 103, 49, 37, 15, 3, 1, 1, 1 },
+      { 584, 234, 102, 48, 37, 14, 2, 1, 1, 1 },
+      { 588, 233, 101, 47, 36, 14, 2, 1, 1, 1 },
+      { 592, 233, 100, 46, 35, 13, 2, 1, 1, 1 },
+      { 596, 231, 99, 46, 34, 13, 2, 1, 1, 1 },
+      { 600, 230, 98, 45, 33, 13, 2, 1, 1, 1 },
+      { 604, 230, 97, 44, 32, 12, 2, 1, 1, 1 },
+      { 608, 229, 96, 43, 31, 12, 2, 1, 1, 1 },
+      { 612, 228, 95, 42, 31, 11, 2, 1, 1, 1 },
+      { 616, 227, 93, 42, 30, 11, 2, 1, 1, 1 },
+      { 620, 227, 92, 41, 29, 10, 2, 1, 1, 1 },
+      { 624, 226, 92, 40, 28, 10, 1, 1, 1, 1 },
+      { 628, 225, 90, 39, 28, 10, 1, 1, 1, 1 },
+      { 632, 224, 89, 39, 27, 9, 1, 1, 1, 1 },
+      { 636, 223, 88, 38, 26, 9, 1, 1, 1, 1 },
+      { 640, 222, 87, 37, 25, 9, 1, 1, 1, 1 },
+      { 644, 221, 86, 36, 25, 8, 1, 1, 1, 1 },
+      { 648, 220, 84, 36, 24, 8, 1, 1, 1, 1 },
+      { 652, 219, 83, 35, 23, 8, 1, 1, 1, 1 },
+      { 656, 218, 82, 34, 23, 7, 1, 1, 1, 1 },
+      { 660, 217, 81, 33, 22, 7, 1, 1, 1, 1 },
+      { 664, 215, 80, 33, 21, 7, 1, 1, 1, 1 },
+      { 668, 214, 78, 32, 21, 7, 1, 1, 1, 1 },
+      { 672, 213, 78, 31, 20, 6, 1, 1, 1, 1 },
+      { 676, 211, 76, 31, 20, 6, 1, 1, 1, 1 },
+      { 680, 210, 75, 30, 19, 6, 1, 1, 1, 1 },
+      { 684, 209, 74, 29, 18, 6, 1, 1, 1, 1 },
+      { 688, 208, 73, 28, 18, 5, 1, 1, 1, 1 },
+      { 692, 206, 72, 28, 17, 5, 1, 1, 1, 1 },
+      { 696, 205, 70, 27, 17, 5, 1, 1, 1, 1 },
+      { 700, 203, 69, 27, 16, 5, 1, 1, 1, 1 },
+      { 704, 201, 68, 26, 16, 5, 1, 1, 1, 1 },
+      { 708, 201, 67, 25, 15, 4, 1, 1, 1, 1 },
+      { 712, 198, 66, 25, 15, 4, 1, 1, 1, 1 },
+      { 716, 197, 65, 24, 14, 4, 1, 1, 1, 1 },
+      { 720, 196, 63, 23, 14, 4, 1, 1, 1, 1 },
+      { 724, 194, 62, 23, 13, 4, 1, 1, 1, 1 },
+      { 728, 193, 61, 22, 13, 3, 1, 1, 1, 1 },
+      { 732, 191, 60, 22, 12, 3, 1, 1, 1, 1 },
+      { 736, 189, 59, 21, 12, 3, 1, 1, 1, 1 },
+      { 740, 188, 58, 20, 11, 3, 1, 1, 1, 1 },
+      { 744, 186, 56, 20, 11, 3, 1, 1, 1, 1 },
+      { 748, 184, 55, 19, 11, 3, 1, 1, 1, 1 },
+      { 752, 182, 54, 19, 10, 3, 1, 1, 1, 1 },
+      { 756, 181, 53, 18, 10, 2, 1, 1, 1, 1 },
+      { 760, 179, 52, 18, 9, 2, 1, 1, 1, 1 },
+      { 764, 177, 51, 17, 9, 2, 1, 1, 1, 1 },
+      { 768, 174, 50, 17, 9, 2, 1, 1, 1, 1 },
+      { 772, 173, 49, 16, 8, 2, 1, 1, 1, 1 },
+      { 776, 171, 47, 16, 8, 2, 1, 1, 1, 1 },
+      { 780, 169, 46, 15, 8, 2, 1, 1, 1, 1 },
+      { 784, 167, 45, 15, 7, 2, 1, 1, 1, 1 },
+      { 788, 165, 44, 14, 7, 2, 1, 1, 1, 1 },
+      { 792, 162, 43, 14, 7, 2, 1, 1, 1, 1 },
+      { 796, 161, 42, 13, 7, 1, 1, 1, 1, 1 },
+      { 800, 159, 41, 13, 6, 1, 1, 1, 1, 1 },
+      { 804, 157, 40, 12, 6, 1, 1, 1, 1, 1 },
+      { 808, 154, 39, 12, 6, 1, 1, 1, 1, 1 },
+      { 812, 153, 38, 11, 5, 1, 1, 1, 1, 1 },
+      { 816, 150, 37, 11, 5, 1, 1, 1, 1, 1 },
+      { 820, 148, 36, 10, 5, 1, 1, 1, 1, 1 },
+      { 824, 145, 35, 10, 5, 1, 1, 1, 1, 1 },
+      { 828, 143, 34, 10, 4, 1, 1, 1, 1, 1 },
+      { 832, 141, 33, 9, 4, 1, 1, 1, 1, 1 },
+      { 836, 138, 32, 9, 4, 1, 1, 1, 1, 1 },
+      { 840, 136, 30, 9, 4, 1, 1, 1, 1, 1 },
+      { 844, 133, 30, 8, 4, 1, 1, 1, 1, 1 },
+      { 848, 131, 29, 8, 3, 1, 1, 1, 1, 1 },
+      { 852, 129, 28, 7, 3, 1, 1, 1, 1, 1 },
+      { 856, 126, 27, 7, 3, 1, 1, 1, 1, 1 },
+      { 860, 123, 26, 7, 3, 1, 1, 1, 1, 1 },
+      { 864, 121, 25, 6, 3, 1, 1, 1, 1, 1 },
+      { 868, 118, 24, 6, 3, 1, 1, 1, 1, 1 },
+      { 872, 116, 23, 6, 2, 1, 1, 1, 1, 1 },
+      { 876, 113, 22, 6, 2, 1, 1, 1, 1, 1 },
+      { 880, 111, 21, 5, 2, 1, 1, 1, 1, 1 },
+      { 884, 108, 20, 5, 2, 1, 1, 1, 1, 1 },
+      { 888, 105, 19, 5, 2, 1, 1, 1, 1, 1 },
+      { 892, 102, 19, 4, 2, 1, 1, 1, 1, 1 },
+      { 896, 99, 18, 4, 2, 1, 1, 1, 1, 1 },
+      { 900, 97, 17, 4, 1, 1, 1, 1, 1, 1 },
+      { 904, 94, 16, 4, 1, 1, 1, 1, 1, 1 },
+      { 908, 92, 15, 3, 1, 1, 1, 1, 1, 1 },
+      { 912, 89, 14, 3, 1, 1, 1, 1, 1, 1 },
+      { 916, 85, 14, 3, 1, 1, 1, 1, 1, 1 },
+      { 920, 82, 13, 3, 1, 1, 1, 1, 1, 1 },
+      { 924, 79, 12, 3, 1, 1, 1, 1, 1, 1 },
+      { 928, 77, 11, 2, 1, 1, 1, 1, 1, 1 },
+      { 932, 73, 11, 2, 1, 1, 1, 1, 1, 1 },
+      { 936, 70, 10, 2, 1, 1, 1, 1, 1, 1 },
+      { 940, 67, 9, 2, 1, 1, 1, 1, 1, 1 },
+      { 944, 64, 8, 2, 1, 1, 1, 1, 1, 1 },
+      { 948, 60, 8, 2, 1, 1, 1, 1, 1, 1 },
+      { 952, 58, 7, 1, 1, 1, 1, 1, 1, 1 },
+      { 956, 54, 7, 1, 1, 1, 1, 1, 1, 1 },
+      { 960, 51, 6, 1, 1, 1, 1, 1, 1, 1 },
+      { 964, 48, 5, 1, 1, 1, 1, 1, 1, 1 },
+      { 968, 44, 5, 1, 1, 1, 1, 1, 1, 1 },
+      { 972, 41, 4, 1, 1, 1, 1, 1, 1, 1 },
+      { 976, 37, 4, 1, 1, 1, 1, 1, 1, 1 },
+      { 980, 34, 3, 1, 1, 1, 1, 1, 1, 1 },
+      { 984, 30, 3, 1, 1, 1, 1, 1, 1, 1 },
+      { 988, 27, 2, 1, 1, 1, 1, 1, 1, 1 },
+      { 992, 23, 2, 1, 1, 1, 1, 1, 1, 1 },
+      { 996, 19, 2, 1, 1, 1, 1, 1, 1, 1 },
+      { 1000, 16, 1, 1, 1, 1, 1, 1, 1, 1 },
+      { 1004, 12, 1, 1, 1, 1, 1, 1, 1, 1 },
+      { 1008, 8, 1, 1, 1, 1, 1, 1, 1, 1 },
+      { 1012, 4, 1, 1, 1, 1, 1, 1, 1, 1 },
+      { 1015, 1, 1, 1, 1, 1, 1, 1, 1, 1 },
+      { 1015, 1, 1, 1, 1, 1, 1, 1, 1, 1 },
+    };
 #endif  // CONFIG_ANS
 
 /* clang-format off */
@@ -2801,15 +2802,13 @@
 }
 
 #if CONFIG_ANS
-void av1_build_token_cdfs(const aom_prob *pdf_model, rans_lut cdf) {
-  AnsP10 pdf_tab[ENTROPY_TOKENS - 1];
+static void build_token_cdfs(const aom_prob *pdf_model,
+                             aom_cdf_prob cdf[ENTROPY_TOKENS]) {
+  int i, sum = 0;
   assert(pdf_model[2] != 0);
-  // TODO(aconverse): Investigate making the precision of the zero and EOB tree
-  // nodes 10-bits.
-  aom_rans_merge_prob8_pdf(pdf_tab, pdf_model[1],
-                           av1_pareto8_token_probs[pdf_model[2] - 1],
-                           ENTROPY_TOKENS - 2);
-  aom_rans_build_cdf_from_pdf(pdf_tab, cdf);
+  for (i = 0; i < ENTROPY_TOKENS - 2; ++i) {
+    cdf[i] = sum += av1_pareto8_token_probs[pdf_model[2] - 1][i];
+  }
 }
 
 void av1_coef_pareto_cdfs(FRAME_CONTEXT *fc) {
@@ -2819,9 +2818,10 @@
     for (i = 0; i < PLANE_TYPES; ++i)
       for (j = 0; j < REF_TYPES; ++j)
         for (k = 0; k < COEF_BANDS; ++k)
-          for (l = 0; l < BAND_COEFF_CONTEXTS(k); ++l)
-            av1_build_token_cdfs(fc->coef_probs[t][i][j][k][l],
-                                 fc->coef_cdfs[t][i][j][k][l]);
+          for (l = 0; l < BAND_COEFF_CONTEXTS(k); ++l) {
+            build_token_cdfs(fc->coef_probs[t][i][j][k][l],
+                             fc->coef_cdfs[t][i][j][k][l]);
+          }
 }
 #endif  // CONFIG_ANS
 
diff --git a/av1/common/entropy.h b/av1/common/entropy.h
index f0727c0..fd68e82 100644
--- a/av1/common/entropy.h
+++ b/av1/common/entropy.h
@@ -191,10 +191,10 @@
 extern const aom_tree_index av1_coef_con_tree[TREE_SIZE(ENTROPY_TOKENS)];
 extern const aom_prob av1_pareto8_full[COEFF_PROB_MODELS][MODEL_NODES];
 #if CONFIG_ANS
-extern const AnsP10 av1_pareto8_token_probs[COEFF_PROB_MODELS]
-                                           [ENTROPY_TOKENS - 2];
-
-typedef rans_lut coeff_cdf_model[REF_TYPES][COEF_BANDS][COEFF_CONTEXTS];
+typedef aom_cdf_prob coeff_cdf_model[REF_TYPES][COEF_BANDS][COEFF_CONTEXTS]
+                                    [ENTROPY_TOKENS];
+extern const aom_cdf_prob av1_pareto8_token_probs[COEFF_PROB_MODELS]
+                                                 [ENTROPY_TOKENS - 2];
 #endif  // CONFIG_ANS
 
 typedef aom_prob av1_coeff_probs_model[REF_TYPES][COEF_BANDS][COEFF_CONTEXTS]
diff --git a/av1/common/enums.h b/av1/common/enums.h
index 89a219d..cefed6c 100644
--- a/av1/common/enums.h
+++ b/av1/common/enums.h
@@ -246,7 +246,17 @@
   PALETTE_COLORS
 } PALETTE_COLOR;
 
+#ifdef CONFIG_CLPF
+#define CLPF_NOFLAG -1
+typedef enum {
+  CLPF_NOSIZE = 0,
+  CLPF_32X32 = 1,
+  CLPF_64X64 = 2,
+  CLPF_128X128 = 3
+} CLPF_BLOCK_SIZE;
+#endif
 typedef enum ATTRIBUTE_PACKED {
+
   DC_PRED,    // Average of above and left pixels
   V_PRED,     // Vertical
   H_PRED,     // Horizontal
diff --git a/av1/common/idct.c b/av1/common/idct.c
index 328f360..eedbc79 100644
--- a/av1/common/idct.c
+++ b/av1/common/idct.c
@@ -33,6 +33,9 @@
   return txsize_sqr_up_map[tx_size] == TX_32X32;
 }
 
+// NOTE: The implementation of all inverses need to be aware of the fact
+// that input and output could be the same buffer.
+
 #if CONFIG_EXT_TX
 static void iidtx4_c(const tran_low_t *input, tran_low_t *output) {
   int i;
@@ -56,17 +59,17 @@
   for (i = 0; i < 32; ++i) output[i] = input[i] * 4;
 }
 
-// For use in lieu of DST
+// For use in lieu of ADST
 static void ihalfright32_c(const tran_low_t *input, tran_low_t *output) {
   int i;
   tran_low_t inputhalf[16];
-  for (i = 0; i < 16; ++i) {
-    output[i] = input[16 + i] * 4;
-  }
   // Multiply input by sqrt(2)
   for (i = 0; i < 16; ++i) {
     inputhalf[i] = (tran_low_t)dct_const_round_shift(input[i] * Sqrt2);
   }
+  for (i = 0; i < 16; ++i) {
+    output[i] = input[16 + i] * 4;
+  }
   idct16_c(inputhalf, output + 16);
   // Note overall scaling factor is 4 times orthogonal
 }
@@ -106,14 +109,14 @@
                                   int bd) {
   int i;
   tran_low_t inputhalf[16];
-  for (i = 0; i < 16; ++i) {
-    output[i] = input[16 + i] * 4;
-  }
   // Multiply input by sqrt(2)
   for (i = 0; i < 16; ++i) {
     inputhalf[i] =
         HIGHBD_WRAPLOW(highbd_dct_const_round_shift(input[i] * Sqrt2), bd);
   }
+  for (i = 0; i < 16; ++i) {
+    output[i] = input[16 + i] * 4;
+  }
   aom_highbd_idct16_c(inputhalf, output + 16, bd);
   // Note overall scaling factor is 4 times orthogonal
 }
diff --git a/av1/common/od_dering.c b/av1/common/od_dering.c
index fa59266..7aa704f 100644
--- a/av1/common/od_dering.c
+++ b/av1/common/od_dering.c
@@ -12,14 +12,12 @@
 #include "config.h"
 #endif
 
+// clang-format off
+
 #include <stdlib.h>
 #include <math.h>
 #include "dering.h"
-
-const od_dering_opt_vtbl OD_DERING_VTBL_C = {
-  { od_filter_dering_direction_4x4_c, od_filter_dering_direction_8x8_c },
-  { od_filter_dering_orthogonal_4x4_c, od_filter_dering_orthogonal_8x8_c }
-};
+#include "./av1_rtcd.h"
 
 /* Generated from gen_filter_tables.c. */
 const int OD_DIRECTION_OFFSETS_TABLE[8][3] = {
@@ -35,9 +33,6 @@
   { 1 * OD_FILT_BSTRIDE + 0, 2 * OD_FILT_BSTRIDE - 1, 3 * OD_FILT_BSTRIDE - 1 },
 };
 
-const double OD_DERING_GAIN_TABLE[OD_DERING_LEVELS] = { 0, 0.5,  0.707,
-                                                        1, 1.41, 2 };
-
 /* Detect direction. 0 means 45-degree up-right, 2 is horizontal, and so on.
    The search minimizes the weighted variance along all the lines in a
    particular direction, i.e. the squared error between the input and a
@@ -45,8 +40,8 @@
    in a particular direction. Since each direction have the same sum(x^2) term,
    that term is never computed. See Section 2, step 2, of:
    http://jmvalin.ca/notes/intra_paint.pdf */
-static int od_dir_find8(const od_dering_in *img, int stride, int32_t *var,
-                        int coeff_shift) {
+int od_dir_find8_c(const od_dering_in *img, int stride, int32_t *var,
+                   int coeff_shift) {
   int i;
   int32_t cost[8] = { 0 };
   int partial[8][15] = { { 0 } };
@@ -121,14 +116,15 @@
   ((OD_BSIZE_MAX + 2 * OD_FILT_BORDER) * (OD_BSIZE_MAX + 2 * OD_FILT_BORDER))
 
 /* Smooth in the direction detected. */
-void od_filter_dering_direction_c(int16_t *y, int ystride, const int16_t *in,
-                                  int ln, int threshold, int dir) {
+int od_filter_dering_direction_8x8_c(int16_t *y, int ystride, const int16_t *in,
+                                     int threshold, int dir) {
   int i;
   int j;
   int k;
   static const int taps[3] = { 3, 2, 1 };
-  for (i = 0; i < 1 << ln; i++) {
-    for (j = 0; j < 1 << ln; j++) {
+  int total_abs = 0;
+  for (i = 0; i < 8; i++) {
+    for (j = 0; j < 8; j++) {
       int16_t sum;
       int16_t xx;
       int16_t yy;
@@ -144,28 +140,53 @@
         if (abs(p0) < threshold) sum += taps[k] * p0;
         if (abs(p1) < threshold) sum += taps[k] * p1;
       }
-      yy = xx + ((sum + 8) >> 4);
+      sum = (sum + 8) >> 4;
+      total_abs += abs(sum);
+      yy = xx + sum;
       y[i * ystride + j] = yy;
     }
   }
+  return (total_abs + 8) >> 4;
 }
 
-void od_filter_dering_direction_4x4_c(int16_t *y, int ystride,
-                                      const int16_t *in, int threshold,
-                                      int dir) {
-  od_filter_dering_direction_c(y, ystride, in, 2, threshold, dir);
-}
-
-void od_filter_dering_direction_8x8_c(int16_t *y, int ystride,
-                                      const int16_t *in, int threshold,
-                                      int dir) {
-  od_filter_dering_direction_c(y, ystride, in, 3, threshold, dir);
+/* Smooth in the direction detected. */
+int od_filter_dering_direction_4x4_c(int16_t *y, int ystride, const int16_t *in,
+                                     int threshold, int dir) {
+  int i;
+  int j;
+  int k;
+  static const int taps[2] = { 4, 1 };
+  int total_abs = 0;
+  for (i = 0; i < 4; i++) {
+    for (j = 0; j < 4; j++) {
+      int16_t sum;
+      int16_t xx;
+      int16_t yy;
+      xx = in[i * OD_FILT_BSTRIDE + j];
+      sum = 0;
+      for (k = 0; k < 2; k++) {
+        int16_t p0;
+        int16_t p1;
+        p0 = in[i * OD_FILT_BSTRIDE + j + OD_DIRECTION_OFFSETS_TABLE[dir][k]] -
+             xx;
+        p1 = in[i * OD_FILT_BSTRIDE + j - OD_DIRECTION_OFFSETS_TABLE[dir][k]] -
+             xx;
+        if (abs(p0) < threshold) sum += taps[k] * p0;
+        if (abs(p1) < threshold) sum += taps[k] * p1;
+      }
+      sum = (sum + 8) >> 4;
+      total_abs += abs(sum);
+      yy = xx + sum;
+      y[i * ystride + j] = yy;
+    }
+  }
+  return (total_abs + 2) >> 2;
 }
 
 /* Smooth in the direction orthogonal to what was detected. */
-void od_filter_dering_orthogonal_c(int16_t *y, int ystride, const int16_t *in,
-                                   const od_dering_in *x, int xstride, int ln,
-                                   int threshold, int dir) {
+void od_filter_dering_orthogonal_8x8_c(int16_t *y, int ystride,
+                                       const int16_t *in, int threshold,
+                                       int dir) {
   int i;
   int j;
   int offset;
@@ -173,48 +194,51 @@
     offset = OD_FILT_BSTRIDE;
   else
     offset = 1;
-  for (i = 0; i < 1 << ln; i++) {
-    for (j = 0; j < 1 << ln; j++) {
-      int16_t athresh;
+  for (i = 0; i < 8; i++) {
+    for (j = 0; j < 8; j++) {
       int16_t yy;
       int16_t sum;
       int16_t p;
-      /* Deringing orthogonal to the direction uses a tighter threshold
-         because we want to be conservative. We've presumably already
-         achieved some deringing, so the amount of change is expected
-         to be low. Also, since we might be filtering across an edge, we
-         want to make sure not to blur it. That being said, we might want
-         to be a little bit more aggressive on pure horizontal/vertical
-         since the ringing there tends to be directional, so it doesn't
-         get removed by the directional filtering. */
-      athresh = OD_MINI(
-          threshold, threshold / 3 +
-                         abs(in[i * OD_FILT_BSTRIDE + j] - x[i * xstride + j]));
       yy = in[i * OD_FILT_BSTRIDE + j];
       sum = 0;
       p = in[i * OD_FILT_BSTRIDE + j + offset] - yy;
-      if (abs(p) < athresh) sum += p;
+      if (abs(p) < threshold) sum += p;
       p = in[i * OD_FILT_BSTRIDE + j - offset] - yy;
-      if (abs(p) < athresh) sum += p;
+      if (abs(p) < threshold) sum += p;
       p = in[i * OD_FILT_BSTRIDE + j + 2 * offset] - yy;
-      if (abs(p) < athresh) sum += p;
+      if (abs(p) < threshold) sum += p;
       p = in[i * OD_FILT_BSTRIDE + j - 2 * offset] - yy;
-      if (abs(p) < athresh) sum += p;
+      if (abs(p) < threshold) sum += p;
       y[i * ystride + j] = yy + ((3 * sum + 8) >> 4);
     }
   }
 }
 
+/* Smooth in the direction orthogonal to what was detected. */
 void od_filter_dering_orthogonal_4x4_c(int16_t *y, int ystride,
-                                       const int16_t *in, const od_dering_in *x,
-                                       int xstride, int threshold, int dir) {
-  od_filter_dering_orthogonal_c(y, ystride, in, x, xstride, 2, threshold, dir);
-}
-
-void od_filter_dering_orthogonal_8x8_c(int16_t *y, int ystride,
-                                       const int16_t *in, const od_dering_in *x,
-                                       int xstride, int threshold, int dir) {
-  od_filter_dering_orthogonal_c(y, ystride, in, x, xstride, 3, threshold, dir);
+                                       const int16_t *in, int threshold,
+                                       int dir) {
+  int i;
+  int j;
+  int offset;
+  if (dir > 0 && dir < 4)
+    offset = OD_FILT_BSTRIDE;
+  else
+    offset = 1;
+  for (i = 0; i < 4; i++) {
+    for (j = 0; j < 4; j++) {
+      int16_t yy;
+      int16_t sum;
+      int16_t p;
+      yy = in[i * OD_FILT_BSTRIDE + j];
+      sum = 0;
+      p = in[i * OD_FILT_BSTRIDE + j + offset] - yy;
+      if (abs(p) < threshold) sum += p;
+      p = in[i * OD_FILT_BSTRIDE + j - offset] - yy;
+      if (abs(p) < threshold) sum += p;
+      y[i * ystride + j] = yy + ((5 * sum + 8) >> 4);
+    }
+  }
 }
 
 /* This table approximates x^0.16 with the index being log2(x). It is clamped
@@ -225,34 +249,24 @@
   327, 365, 408, 455, 509, 569, 635, 710, 768,
 };
 
-/* Compute deringing filter threshold for each 8x8 block based on the
+/* Compute deringing filter threshold for an 8x8 block based on the
    directional variance difference. A high variance difference means that we
    have a highly directional pattern (e.g. a high contrast edge), so we can
    apply more deringing. A low variance means that we either have a low
    contrast edge, or a non-directional texture, so we want to be careful not
    to blur. */
-static void od_compute_thresh(int thresh[OD_DERING_NBLOCKS][OD_DERING_NBLOCKS],
-                              int threshold,
-                              int32_t var[OD_DERING_NBLOCKS][OD_DERING_NBLOCKS],
-                              int nhb, int nvb) {
-  int bx;
-  int by;
-  for (by = 0; by < nvb; by++) {
-    for (bx = 0; bx < nhb; bx++) {
-      int v1;
-      /* We use the variance of 8x8 blocks to adjust the threshold. */
-      v1 = OD_MINI(32767, var[by][bx] >> 6);
-      thresh[by][bx] = (threshold * OD_THRESH_TABLE_Q8[OD_ILOG(v1)] + 128) >> 8;
-    }
-  }
+static INLINE int od_adjust_thresh(int threshold, int32_t var) {
+  int v1;
+  /* We use the variance of 8x8 blocks to adjust the threshold. */
+  v1 = OD_MINI(32767, var >> 6);
+  return (threshold * OD_THRESH_TABLE_Q8[OD_ILOG(v1)] + 128) >> 8;
 }
 
-void od_dering(const od_dering_opt_vtbl *vtbl, int16_t *y, int ystride,
-               const od_dering_in *x, int xstride, int nhb, int nvb, int sbx,
-               int sby, int nhsb, int nvsb, int xdec,
+void od_dering(int16_t *y, int ystride, const od_dering_in *x, int xstride,
+               int nhb, int nvb, int sbx, int sby, int nhsb, int nvsb, int xdec,
                int dir[OD_DERING_NBLOCKS][OD_DERING_NBLOCKS], int pli,
                unsigned char *bskip, int skip_stride, int threshold,
-               int overlap, int coeff_shift) {
+               int coeff_shift) {
   int i;
   int j;
   int bx;
@@ -261,7 +275,13 @@
   int16_t *in;
   int bsize;
   int32_t var[OD_DERING_NBLOCKS][OD_DERING_NBLOCKS];
-  int thresh[OD_DERING_NBLOCKS][OD_DERING_NBLOCKS];
+  int filter2_thresh[OD_DERING_NBLOCKS][OD_DERING_NBLOCKS];
+  od_filter_dering_direction_func filter_dering_direction[OD_DERINGSIZES] = {
+    od_filter_dering_direction_4x4, od_filter_dering_direction_8x8
+  };
+  od_filter_dering_orthogonal_func filter_dering_orthogonal[OD_DERINGSIZES] = {
+    od_filter_dering_orthogonal_4x4, od_filter_dering_orthogonal_8x8
+  };
   bsize = 3 - xdec;
   in = inbuf + OD_FILT_BORDER * OD_FILT_BSTRIDE + OD_FILT_BORDER;
   /* We avoid filtering the pixels for which some of the pixels to average
@@ -275,62 +295,44 @@
       in[i * OD_FILT_BSTRIDE + j] = x[i * xstride + j];
     }
   }
+  /* Assume deringing filter is sparsely applied, so do one large copy rather
+     than small copies later if deringing is skipped. */
+  for (i = 0; i < nvb << bsize; i++) {
+    for (j = 0; j < nhb << bsize; j++) {
+      y[i * ystride + j] = in[i * OD_FILT_BSTRIDE + j];
+    }
+  }
   if (pli == 0) {
     for (by = 0; by < nvb; by++) {
       for (bx = 0; bx < nhb; bx++) {
+        if (bskip[by * skip_stride + bx]) continue;
         dir[by][bx] = od_dir_find8(&x[8 * by * xstride + 8 * bx], xstride,
                                    &var[by][bx], coeff_shift);
+        /* Deringing orthogonal to the direction uses a tighter threshold
+           because we want to be conservative. We've presumably already
+           achieved some deringing, so the amount of change is expected
+           to be low. Also, since we might be filtering across an edge, we
+           want to make sure not to blur it. That being said, we might want
+           to be a little bit more aggressive on pure horizontal/vertical
+           since the ringing there tends to be directional, so it doesn't
+           get removed by the directional filtering. */
+        filter2_thresh[by][bx] = (filter_dering_direction[bsize - OD_LOG_BSIZE0])(
+            &y[(by * ystride << bsize) + (bx << bsize)], ystride,
+            &in[(by * OD_FILT_BSTRIDE << bsize) + (bx << bsize)],
+            od_adjust_thresh(threshold, var[by][bx]), dir[by][bx]);
       }
     }
-    od_compute_thresh(thresh, threshold, var, nhb, nvb);
   } else {
     for (by = 0; by < nvb; by++) {
       for (bx = 0; bx < nhb; bx++) {
-        thresh[by][bx] = threshold;
+        if (bskip[by * skip_stride + bx]) continue;
+        filter2_thresh[by][bx] = (filter_dering_direction[bsize - OD_LOG_BSIZE0])(
+            &y[(by * ystride << bsize) + (bx << bsize)], ystride,
+            &in[(by * OD_FILT_BSTRIDE << bsize) + (bx << bsize)], threshold,
+            dir[by][bx]);
       }
     }
   }
-  for (by = 0; by < nvb; by++) {
-    for (bx = 0; bx < nhb; bx++) {
-      int skip;
-#if defined(DAALA_ODINTRIN)
-      int xstart;
-      int ystart;
-      int xend;
-      int yend;
-      xstart = ystart = 0;
-      xend = yend = (2 >> xdec);
-      if (overlap) {
-        xstart -= (sbx != 0);
-        ystart -= (sby != 0);
-        xend += (sbx != nhsb - 1);
-        yend += (sby != nvsb - 1);
-      }
-      skip = 1;
-      /* We look at whether the current block and its 4x4 surrounding (due to
-         lapping) are skipped to avoid filtering the same content multiple
-         times. */
-      for (i = ystart; i < yend; i++) {
-        for (j = xstart; j < xend; j++) {
-          skip = skip && bskip[((by << 1 >> xdec) + i) * skip_stride +
-                               (bx << 1 >> xdec) + j];
-        }
-      }
-#else
-      (void)overlap;
-      skip = bskip[by * skip_stride + bx];
-#endif
-      if (skip) thresh[by][bx] = 0;
-    }
-  }
-  for (by = 0; by < nvb; by++) {
-    for (bx = 0; bx < nhb; bx++) {
-      (vtbl->filter_dering_direction[bsize - OD_LOG_BSIZE0])(
-          &y[(by * ystride << bsize) + (bx << bsize)], ystride,
-          &in[(by * OD_FILT_BSTRIDE << bsize) + (bx << bsize)], thresh[by][bx],
-          dir[by][bx]);
-    }
-  }
   for (i = 0; i < nvb << bsize; i++) {
     for (j = 0; j < nhb << bsize; j++) {
       in[i * OD_FILT_BSTRIDE + j] = y[i * ystride + j];
@@ -338,10 +340,10 @@
   }
   for (by = 0; by < nvb; by++) {
     for (bx = 0; bx < nhb; bx++) {
-      (vtbl->filter_dering_orthogonal[bsize - OD_LOG_BSIZE0])(
+      if (bskip[by * skip_stride + bx] || filter2_thresh[by][bx] == 0) continue;
+      (filter_dering_orthogonal[bsize - OD_LOG_BSIZE0])(
           &y[(by * ystride << bsize) + (bx << bsize)], ystride,
-          &in[(by * OD_FILT_BSTRIDE << bsize) + (bx << bsize)],
-          &x[(by * xstride << bsize) + (bx << bsize)], xstride, thresh[by][bx],
+          &in[(by * OD_FILT_BSTRIDE << bsize) + (bx << bsize)], filter2_thresh[by][bx],
           dir[by][bx]);
     }
   }
diff --git a/av1/common/od_dering.h b/av1/common/od_dering.h
index 17fee7d..c64439f 100644
--- a/av1/common/od_dering.h
+++ b/av1/common/od_dering.h
@@ -21,12 +21,6 @@
 
 #define OD_DERINGSIZES (2)
 
-#define OD_DERING_NO_CHECK_OVERLAP (0)
-#define OD_DERING_CHECK_OVERLAP (1)
-
-#define OD_DERING_LEVELS (6)
-extern const double OD_DERING_GAIN_TABLE[OD_DERING_LEVELS];
-
 #define OD_DERING_NBLOCKS (OD_BSIZE_MAX / 8)
 
 #define OD_FILT_BORDER (3)
@@ -34,46 +28,25 @@
 
 extern const int OD_DIRECTION_OFFSETS_TABLE[8][3];
 
-typedef void (*od_filter_dering_direction_func)(int16_t *y, int ystride,
-                                                const int16_t *in,
-                                                int threshold, int dir);
+typedef int (*od_filter_dering_direction_func)(int16_t *y, int ystride,
+                                               const int16_t *in, int threshold,
+                                               int dir);
 typedef void (*od_filter_dering_orthogonal_func)(int16_t *y, int ystride,
                                                  const int16_t *in,
-                                                 const od_dering_in *x,
-                                                 int xstride, int threshold,
-                                                 int dir);
-
-struct od_dering_opt_vtbl {
-  od_filter_dering_direction_func filter_dering_direction[OD_DERINGSIZES];
-  od_filter_dering_orthogonal_func filter_dering_orthogonal[OD_DERINGSIZES];
-};
-typedef struct od_dering_opt_vtbl od_dering_opt_vtbl;
-
-void od_dering(const od_dering_opt_vtbl *vtbl, int16_t *y, int ystride,
-               const od_dering_in *x, int xstride, int nvb, int nhb, int sbx,
-               int sby, int nhsb, int nvsb, int xdec,
+                                                 int threshold, int dir);
+void od_dering(int16_t *y, int ystride, const od_dering_in *x, int xstride,
+               int nvb, int nhb, int sbx, int sby, int nhsb, int nvsb, int xdec,
                int dir[OD_DERING_NBLOCKS][OD_DERING_NBLOCKS], int pli,
                unsigned char *bskip, int skip_stride, int threshold,
-               int overlap, int coeff_shift);
-void od_filter_dering_direction_c(int16_t *y, int ystride, const int16_t *in,
-                                  int ln, int threshold, int dir);
-void od_filter_dering_orthogonal_c(int16_t *y, int ystride, const int16_t *in,
-                                   const od_dering_in *x, int xstride, int ln,
-                                   int threshold, int dir);
-
-extern const od_dering_opt_vtbl OD_DERING_VTBL_C;
-
-void od_filter_dering_direction_4x4_c(int16_t *y, int ystride,
-                                      const int16_t *in, int threshold,
-                                      int dir);
-void od_filter_dering_direction_8x8_c(int16_t *y, int ystride,
-                                      const int16_t *in, int threshold,
-                                      int dir);
+               int coeff_shift);
+int od_filter_dering_direction_4x4_c(int16_t *y, int ystride, const int16_t *in,
+                                     int threshold, int dir);
+int od_filter_dering_direction_8x8_c(int16_t *y, int ystride, const int16_t *in,
+                                     int threshold, int dir);
 void od_filter_dering_orthogonal_4x4_c(int16_t *y, int ystride,
-                                       const int16_t *in, const od_dering_in *x,
-                                       int xstride, int threshold, int dir);
+                                       const int16_t *in, int threshold,
+                                       int dir);
 void od_filter_dering_orthogonal_8x8_c(int16_t *y, int ystride,
-                                       const int16_t *in, const od_dering_in *x,
-                                       int xstride, int threshold, int dir);
-
+                                       const int16_t *in, int threshold,
+                                       int dir);
 #endif
diff --git a/av1/common/onyxc_int.h b/av1/common/onyxc_int.h
index 98f4f51..3a2203a 100644
--- a/av1/common/onyxc_int.h
+++ b/av1/common/onyxc_int.h
@@ -151,12 +151,27 @@
   int use_highbitdepth;
 #endif
 #if CONFIG_CLPF
-  int clpf_numblocks;
-  int clpf_size;
+  // Two bits are used to signal the strength for all blocks and the
+  // valid values are:
+  // 0: no filtering
+  // 1: strength = 1
+  // 2: strength = 2
+  // 3: strength = 4
   int clpf_strength_y;
   int clpf_strength_u;
   int clpf_strength_v;
-  uint8_t *clpf_blocks;
+
+  // If clpf_strength_y is not 0, another two bits are used to signal
+  // the filter block size.  The valid values for clpf_size are:
+  // 0: no block signalling
+  // 1: 32x32
+  // 2: 64x64
+  // 3: 128x128
+  CLPF_BLOCK_SIZE clpf_size;
+
+  // Buffer for storing whether to filter individual blocks.
+  int8_t *clpf_blocks;
+  int clpf_stride;
 #endif
 
   YV12_BUFFER_CONFIG *frame_to_show;
diff --git a/av1/common/x86/od_dering_sse4.c b/av1/common/x86/od_dering_sse4.c
new file mode 100644
index 0000000..80bdba7
--- /dev/null
+++ b/av1/common/x86/od_dering_sse4.c
@@ -0,0 +1,499 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <smmintrin.h>
+#include <emmintrin.h>
+#include <tmmintrin.h>
+
+#include "./av1_rtcd.h"
+#include "av1/common/x86/od_dering_sse4.h"
+
+/* partial A is a 16-bit vector of the form:
+   [x8 x7 x6 x5 x4 x3 x2 x1] and partial B has the form:
+   [0  y1 y2 y3 y4 y5 y6 y7].
+   This function computes (x1^2+y1^2)*C1 + (x2^2+y2^2)*C2 + ...
+   (x7^2+y7^2)*C7 + (x8^2+0^2)*C8 where the C1..C8 constants are in const1
+   and const2. */
+static INLINE __m128i fold_mul_and_sum(__m128i partiala, __m128i partialb,
+                                       __m128i const1, __m128i const2) {
+  __m128i tmp;
+  /* Reverse partial B. */
+  partialb = _mm_shuffle_epi8(
+      partialb,
+      _mm_set_epi8(15, 14, 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12));
+  /* Interleave the x and y values of identical indices and pair x8 with 0. */
+  tmp = partiala;
+  partiala = _mm_unpacklo_epi16(partiala, partialb);
+  partialb = _mm_unpackhi_epi16(tmp, partialb);
+  /* Square and add the corresponding x and y values. */
+  partiala = _mm_madd_epi16(partiala, partiala);
+  partialb = _mm_madd_epi16(partialb, partialb);
+  /* Multiply by constant. */
+  partiala = _mm_mullo_epi32(partiala, const1);
+  partialb = _mm_mullo_epi32(partialb, const2);
+  /* Sum all results. */
+  partiala = _mm_add_epi32(partiala, partialb);
+  return partiala;
+}
+
+static INLINE __m128i hsum4(__m128i x0, __m128i x1, __m128i x2, __m128i x3) {
+  __m128i t0, t1, t2, t3;
+  t0 = _mm_unpacklo_epi32(x0, x1);
+  t1 = _mm_unpacklo_epi32(x2, x3);
+  t2 = _mm_unpackhi_epi32(x0, x1);
+  t3 = _mm_unpackhi_epi32(x2, x3);
+  x0 = _mm_unpacklo_epi64(t0, t1);
+  x1 = _mm_unpackhi_epi64(t0, t1);
+  x2 = _mm_unpacklo_epi64(t2, t3);
+  x3 = _mm_unpackhi_epi64(t2, t3);
+  return _mm_add_epi32(_mm_add_epi32(x0, x1), _mm_add_epi32(x2, x3));
+}
+
+/* Horizontal sum of 8x16-bit unsigned values. */
+static INLINE int32_t hsum_epi16(__m128i a) {
+  a = _mm_madd_epi16(a, _mm_set1_epi16(1));
+  a = _mm_hadd_epi32(a, a);
+  a = _mm_hadd_epi32(a, a);
+  return _mm_cvtsi128_si32(a);
+}
+
+/* Computes cost for directions 0, 5, 6 and 7. We can call this function again
+   to compute the remaining directions. */
+static INLINE __m128i compute_directions(__m128i lines[8],
+                                         int32_t tmp_cost1[4]) {
+  __m128i partial4a, partial4b, partial5a, partial5b, partial7a, partial7b;
+  __m128i partial6;
+  __m128i tmp;
+  /* Partial sums for lines 0 and 1. */
+  partial4a = _mm_slli_si128(lines[0], 14);
+  partial4b = _mm_srli_si128(lines[0], 2);
+  partial4a = _mm_add_epi16(partial4a, _mm_slli_si128(lines[1], 12));
+  partial4b = _mm_add_epi16(partial4b, _mm_srli_si128(lines[1], 4));
+  tmp = _mm_add_epi16(lines[0], lines[1]);
+  partial5a = _mm_slli_si128(tmp, 10);
+  partial5b = _mm_srli_si128(tmp, 6);
+  partial7a = _mm_slli_si128(tmp, 4);
+  partial7b = _mm_srli_si128(tmp, 12);
+  partial6 = tmp;
+
+  /* Partial sums for lines 2 and 3. */
+  partial4a = _mm_add_epi16(partial4a, _mm_slli_si128(lines[2], 10));
+  partial4b = _mm_add_epi16(partial4b, _mm_srli_si128(lines[2], 6));
+  partial4a = _mm_add_epi16(partial4a, _mm_slli_si128(lines[3], 8));
+  partial4b = _mm_add_epi16(partial4b, _mm_srli_si128(lines[3], 8));
+  tmp = _mm_add_epi16(lines[2], lines[3]);
+  partial5a = _mm_add_epi16(partial5a, _mm_slli_si128(tmp, 8));
+  partial5b = _mm_add_epi16(partial5b, _mm_srli_si128(tmp, 8));
+  partial7a = _mm_add_epi16(partial7a, _mm_slli_si128(tmp, 6));
+  partial7b = _mm_add_epi16(partial7b, _mm_srli_si128(tmp, 10));
+  partial6 = _mm_add_epi16(partial6, tmp);
+
+  /* Partial sums for lines 4 and 5. */
+  partial4a = _mm_add_epi16(partial4a, _mm_slli_si128(lines[4], 6));
+  partial4b = _mm_add_epi16(partial4b, _mm_srli_si128(lines[4], 10));
+  partial4a = _mm_add_epi16(partial4a, _mm_slli_si128(lines[5], 4));
+  partial4b = _mm_add_epi16(partial4b, _mm_srli_si128(lines[5], 12));
+  tmp = _mm_add_epi16(lines[4], lines[5]);
+  partial5a = _mm_add_epi16(partial5a, _mm_slli_si128(tmp, 6));
+  partial5b = _mm_add_epi16(partial5b, _mm_srli_si128(tmp, 10));
+  partial7a = _mm_add_epi16(partial7a, _mm_slli_si128(tmp, 8));
+  partial7b = _mm_add_epi16(partial7b, _mm_srli_si128(tmp, 8));
+  partial6 = _mm_add_epi16(partial6, tmp);
+
+  /* Partial sums for lines 6 and 7. */
+  partial4a = _mm_add_epi16(partial4a, _mm_slli_si128(lines[6], 2));
+  partial4b = _mm_add_epi16(partial4b, _mm_srli_si128(lines[6], 14));
+  partial4a = _mm_add_epi16(partial4a, lines[7]);
+  tmp = _mm_add_epi16(lines[6], lines[7]);
+  partial5a = _mm_add_epi16(partial5a, _mm_slli_si128(tmp, 4));
+  partial5b = _mm_add_epi16(partial5b, _mm_srli_si128(tmp, 12));
+  partial7a = _mm_add_epi16(partial7a, _mm_slli_si128(tmp, 10));
+  partial7b = _mm_add_epi16(partial7b, _mm_srli_si128(tmp, 6));
+  partial6 = _mm_add_epi16(partial6, tmp);
+
+  /* Compute costs in terms of partial sums. */
+  partial4a =
+      fold_mul_and_sum(partial4a, partial4b, _mm_set_epi32(210, 280, 420, 840),
+                       _mm_set_epi32(105, 120, 140, 168));
+  partial7a =
+      fold_mul_and_sum(partial7a, partial7b, _mm_set_epi32(210, 420, 0, 0),
+                       _mm_set_epi32(105, 105, 105, 140));
+  partial5a =
+      fold_mul_and_sum(partial5a, partial5b, _mm_set_epi32(210, 420, 0, 0),
+                       _mm_set_epi32(105, 105, 105, 140));
+  partial6 = _mm_madd_epi16(partial6, partial6);
+  partial6 = _mm_mullo_epi32(partial6, _mm_set1_epi32(105));
+
+  partial4a = hsum4(partial4a, partial5a, partial6, partial7a);
+  _mm_storeu_si128((__m128i *)tmp_cost1, partial4a);
+  return partial4a;
+}
+
+/* transpose and reverse the order of the lines -- equivalent to a 90-degree
+   counter-clockwise rotation of the pixels. */
+static INLINE void array_reverse_transpose_8x8(__m128i *in, __m128i *res) {
+  const __m128i tr0_0 = _mm_unpacklo_epi16(in[0], in[1]);
+  const __m128i tr0_1 = _mm_unpacklo_epi16(in[2], in[3]);
+  const __m128i tr0_2 = _mm_unpackhi_epi16(in[0], in[1]);
+  const __m128i tr0_3 = _mm_unpackhi_epi16(in[2], in[3]);
+  const __m128i tr0_4 = _mm_unpacklo_epi16(in[4], in[5]);
+  const __m128i tr0_5 = _mm_unpacklo_epi16(in[6], in[7]);
+  const __m128i tr0_6 = _mm_unpackhi_epi16(in[4], in[5]);
+  const __m128i tr0_7 = _mm_unpackhi_epi16(in[6], in[7]);
+
+  const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
+  const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_4, tr0_5);
+  const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
+  const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_4, tr0_5);
+  const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_2, tr0_3);
+  const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);
+  const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_2, tr0_3);
+  const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);
+
+  res[7] = _mm_unpacklo_epi64(tr1_0, tr1_1);
+  res[6] = _mm_unpackhi_epi64(tr1_0, tr1_1);
+  res[5] = _mm_unpacklo_epi64(tr1_2, tr1_3);
+  res[4] = _mm_unpackhi_epi64(tr1_2, tr1_3);
+  res[3] = _mm_unpacklo_epi64(tr1_4, tr1_5);
+  res[2] = _mm_unpackhi_epi64(tr1_4, tr1_5);
+  res[1] = _mm_unpacklo_epi64(tr1_6, tr1_7);
+  res[0] = _mm_unpackhi_epi64(tr1_6, tr1_7);
+}
+
+int od_dir_find8_sse4_1(const od_dering_in *img, int stride, int32_t *var,
+                        int coeff_shift) {
+  int i;
+  int32_t cost[8];
+  int32_t best_cost = 0;
+  int best_dir = 0;
+  __m128i lines[8];
+  __m128i dir03, dir47;
+  __m128i max;
+  for (i = 0; i < 8; i++) {
+    lines[i] = _mm_loadu_si128((__m128i *)&img[i * stride]);
+    lines[i] = _mm_sub_epi16(_mm_srai_epi16(lines[i], coeff_shift),
+                             _mm_set1_epi16(128));
+  }
+
+  /* Compute "mostly vertical" directions. */
+  dir47 = compute_directions(lines, cost + 4);
+
+  array_reverse_transpose_8x8(lines, lines);
+
+  /* Compute "mostly horizontal" directions. */
+  dir03 = compute_directions(lines, cost);
+
+#if 1
+  max = _mm_max_epi32(dir03, dir47);
+  max = _mm_max_epi32(max, _mm_shuffle_epi32(max, _MM_SHUFFLE(1, 0, 3, 2)));
+  max = _mm_max_epi32(max, _mm_shuffle_epi32(max, _MM_SHUFFLE(2, 3, 0, 1)));
+  dir03 = _mm_and_si128(_mm_cmpeq_epi32(max, dir03),
+                        _mm_setr_epi32(-1, -2, -3, -4));
+  dir47 = _mm_and_si128(_mm_cmpeq_epi32(max, dir47),
+                        _mm_setr_epi32(-5, -6, -7, -8));
+  dir03 = _mm_max_epu32(dir03, dir47);
+  dir03 = _mm_max_epu32(dir03, _mm_unpackhi_epi64(dir03, dir03));
+  dir03 =
+      _mm_max_epu32(dir03, _mm_shufflelo_epi16(dir03, _MM_SHUFFLE(1, 0, 3, 2)));
+  dir03 = _mm_xor_si128(dir03, _mm_set1_epi32(0xFFFFFFFF));
+
+  best_dir = _mm_cvtsi128_si32(dir03);
+  best_cost = _mm_cvtsi128_si32(max);
+#else
+  for (i = 0; i < 8; i++) {
+    if (cost[i] > best_cost) {
+      best_cost = cost[i];
+      best_dir = i;
+    }
+  }
+#endif
+  /* Difference between the optimal variance and the variance along the
+     orthogonal direction. Again, the sum(x^2) terms cancel out. */
+  *var = best_cost - cost[(best_dir + 4) & 7];
+  /* We'd normally divide by 840, but dividing by 1024 is close enough
+     for what we're going to do with this. */
+  *var >>= 10;
+  return best_dir;
+}
+
+static INLINE __m128i od_cmplt_abs_epi16(__m128i in, __m128i threshold) {
+  return _mm_cmplt_epi16(_mm_abs_epi16(in), threshold);
+}
+
+int od_filter_dering_direction_4x4_sse4_1(int16_t *y, int ystride,
+                                          const int16_t *in, int threshold,
+                                          int dir) {
+  int i;
+  __m128i sum;
+  __m128i p;
+  __m128i cmp;
+  __m128i row;
+  __m128i res;
+  __m128i tmp;
+  __m128i thresh;
+  __m128i total_abs;
+  int off1, off2;
+  off1 = OD_DIRECTION_OFFSETS_TABLE[dir][0];
+  off2 = OD_DIRECTION_OFFSETS_TABLE[dir][1];
+  total_abs = _mm_setzero_si128();
+  thresh = _mm_set1_epi16(threshold);
+  for (i = 0; i < 4; i += 2) {
+    sum = _mm_set1_epi16(0);
+    row = _mm_unpacklo_epi64(
+        _mm_loadl_epi64((__m128i *)&in[i * OD_FILT_BSTRIDE]),
+        _mm_loadl_epi64((__m128i *)&in[(i + 1) * OD_FILT_BSTRIDE]));
+
+    /*p = in[i*OD_FILT_BSTRIDE + offset] - row*/
+    tmp = _mm_unpacklo_epi64(
+        _mm_loadl_epi64((__m128i *)&in[i * OD_FILT_BSTRIDE + off1]),
+        _mm_loadl_epi64((__m128i *)&in[(i + 1) * OD_FILT_BSTRIDE + off1]));
+    p = _mm_sub_epi16(tmp, row);
+    /*if (abs(p) < thresh) sum += taps[k]*p*/
+    cmp = od_cmplt_abs_epi16(p, thresh);
+    p = _mm_slli_epi16(p, 2);
+    p = _mm_and_si128(p, cmp);
+    sum = _mm_add_epi16(sum, p);
+    /*p = in[i*OD_FILT_BSTRIDE - offset] - row*/
+    tmp = _mm_unpacklo_epi64(
+        _mm_loadl_epi64((__m128i *)&in[i * OD_FILT_BSTRIDE - off1]),
+        _mm_loadl_epi64((__m128i *)&in[(i + 1) * OD_FILT_BSTRIDE - off1]));
+    p = _mm_sub_epi16(tmp, row);
+    /*if (abs(p) < thresh) sum += taps[k]*p*/
+    cmp = od_cmplt_abs_epi16(p, thresh);
+    p = _mm_slli_epi16(p, 2);
+    p = _mm_and_si128(p, cmp);
+    sum = _mm_add_epi16(sum, p);
+
+    /*p = in[i*OD_FILT_BSTRIDE + offset] - row*/
+    tmp = _mm_unpacklo_epi64(
+        _mm_loadl_epi64((__m128i *)&in[i * OD_FILT_BSTRIDE + off2]),
+        _mm_loadl_epi64((__m128i *)&in[(i + 1) * OD_FILT_BSTRIDE + off2]));
+    p = _mm_sub_epi16(tmp, row);
+    /*if (abs(p) < thresh) sum += taps[k]*p*/
+    cmp = od_cmplt_abs_epi16(p, thresh);
+    p = _mm_and_si128(p, cmp);
+    sum = _mm_add_epi16(sum, p);
+    /*p = in[i*OD_FILT_BSTRIDE - offset] - row*/
+    tmp = _mm_unpacklo_epi64(
+        _mm_loadl_epi64((__m128i *)&in[i * OD_FILT_BSTRIDE - off2]),
+        _mm_loadl_epi64((__m128i *)&in[(i + 1) * OD_FILT_BSTRIDE - off2]));
+    p = _mm_sub_epi16(tmp, row);
+    /*if (abs(p) < thresh) sum += taps[k]*p*/
+    cmp = od_cmplt_abs_epi16(p, thresh);
+    p = _mm_and_si128(p, cmp);
+    sum = _mm_add_epi16(sum, p);
+
+    /*res = row + ((sum + 8) >> 4)*/
+    res = _mm_add_epi16(sum, _mm_set1_epi16(8));
+    res = _mm_srai_epi16(res, 4);
+    total_abs = _mm_add_epi16(total_abs, _mm_abs_epi16(res));
+    res = _mm_add_epi16(row, res);
+    _mm_storel_epi64((__m128i *)&y[i * ystride], res);
+    _mm_storel_epi64((__m128i *)&y[(i + 1) * ystride],
+                     _mm_unpackhi_epi64(res, res));
+  }
+  return (hsum_epi16(total_abs) + 2) >> 2;
+}
+
+int od_filter_dering_direction_8x8_sse4_1(int16_t *y, int ystride,
+                                          const int16_t *in, int threshold,
+                                          int dir) {
+  int i;
+  __m128i sum;
+  __m128i p;
+  __m128i cmp;
+  __m128i row;
+  __m128i res;
+  __m128i thresh;
+  __m128i total_abs;
+  int off1, off2, off3;
+  off1 = OD_DIRECTION_OFFSETS_TABLE[dir][0];
+  off2 = OD_DIRECTION_OFFSETS_TABLE[dir][1];
+  off3 = OD_DIRECTION_OFFSETS_TABLE[dir][2];
+  total_abs = _mm_setzero_si128();
+  thresh = _mm_set1_epi16(threshold);
+  for (i = 0; i < 8; i++) {
+    sum = _mm_set1_epi16(0);
+    row = _mm_loadu_si128((__m128i *)&in[i * OD_FILT_BSTRIDE]);
+
+    /*p = in[i*OD_FILT_BSTRIDE + offset] - row*/
+    p = _mm_sub_epi16(
+        _mm_loadu_si128((__m128i *)&in[i * OD_FILT_BSTRIDE + off1]), row);
+    /*if (abs(p) < thresh) sum += taps[k]*p*/
+    cmp = od_cmplt_abs_epi16(p, thresh);
+    p = _mm_add_epi16(p, _mm_slli_epi16(p, 1));
+    p = _mm_and_si128(p, cmp);
+    sum = _mm_add_epi16(sum, p);
+
+    /*p = in[i*OD_FILT_BSTRIDE - offset] - row*/
+    p = _mm_sub_epi16(
+        _mm_loadu_si128((__m128i *)&in[i * OD_FILT_BSTRIDE - off1]), row);
+    /*if (abs(p) < thresh) sum += taps[k]*p*/
+    cmp = od_cmplt_abs_epi16(p, thresh);
+    p = _mm_add_epi16(p, _mm_slli_epi16(p, 1));
+    p = _mm_and_si128(p, cmp);
+    sum = _mm_add_epi16(sum, p);
+
+    /*p = in[i*OD_FILT_BSTRIDE + offset] - row*/
+    p = _mm_sub_epi16(
+        _mm_loadu_si128((__m128i *)&in[i * OD_FILT_BSTRIDE + off2]), row);
+    /*if (abs(p) < thresh) sum += taps[k]*p*/
+    cmp = od_cmplt_abs_epi16(p, thresh);
+    p = _mm_slli_epi16(p, 1);
+    p = _mm_and_si128(p, cmp);
+    sum = _mm_add_epi16(sum, p);
+
+    /*p = in[i*OD_FILT_BSTRIDE - offset] - row*/
+    p = _mm_sub_epi16(
+        _mm_loadu_si128((__m128i *)&in[i * OD_FILT_BSTRIDE - off2]), row);
+    /*if (abs(p) < thresh) sum += taps[k]*p*/
+    cmp = od_cmplt_abs_epi16(p, thresh);
+    p = _mm_slli_epi16(p, 1);
+    p = _mm_and_si128(p, cmp);
+    sum = _mm_add_epi16(sum, p);
+
+    /*p = in[i*OD_FILT_BSTRIDE + offset] - row*/
+    p = _mm_sub_epi16(
+        _mm_loadu_si128((__m128i *)&in[i * OD_FILT_BSTRIDE + off3]), row);
+    /*if (abs(p) < thresh) sum += taps[k]*p*/
+    cmp = od_cmplt_abs_epi16(p, thresh);
+    p = _mm_and_si128(p, cmp);
+    sum = _mm_add_epi16(sum, p);
+
+    /*p = in[i*OD_FILT_BSTRIDE - offset] - row*/
+    p = _mm_sub_epi16(
+        _mm_loadu_si128((__m128i *)&in[i * OD_FILT_BSTRIDE - off3]), row);
+    /*if (abs(p) < thresh) sum += taps[k]*p*/
+    cmp = od_cmplt_abs_epi16(p, thresh);
+    p = _mm_and_si128(p, cmp);
+    sum = _mm_add_epi16(sum, p);
+
+    /*res = row + ((sum + 8) >> 4)*/
+    res = _mm_add_epi16(sum, _mm_set1_epi16(8));
+    res = _mm_srai_epi16(res, 4);
+    total_abs = _mm_add_epi16(total_abs, _mm_abs_epi16(res));
+    res = _mm_add_epi16(row, res);
+    _mm_storeu_si128((__m128i *)&y[i * ystride], res);
+  }
+  return (hsum_epi16(total_abs) + 8) >> 4;
+}
+
+void od_filter_dering_orthogonal_4x4_sse4_1(int16_t *y, int ystride,
+                                            const int16_t *in, int threshold,
+                                            int dir) {
+  int i;
+  int offset;
+  __m128i res;
+  __m128i p;
+  __m128i cmp;
+  __m128i row;
+  __m128i sum;
+  __m128i tmp;
+  __m128i thresh;
+  thresh = _mm_set1_epi16(threshold);
+  if (dir > 0 && dir < 4)
+    offset = OD_FILT_BSTRIDE;
+  else
+    offset = 1;
+  for (i = 0; i < 4; i += 2) {
+    sum = _mm_set1_epi16(0);
+    row = _mm_unpacklo_epi64(
+        _mm_loadl_epi64((__m128i *)&in[i * OD_FILT_BSTRIDE]),
+        _mm_loadl_epi64((__m128i *)&in[(i + 1) * OD_FILT_BSTRIDE]));
+
+    /*p = in[i*OD_FILT_BSTRIDE + k*offset] - row*/
+    tmp = _mm_unpacklo_epi64(
+        _mm_loadl_epi64((__m128i *)&in[i * OD_FILT_BSTRIDE + offset]),
+        _mm_loadl_epi64((__m128i *)&in[(i + 1) * OD_FILT_BSTRIDE + offset]));
+    p = _mm_sub_epi16(tmp, row);
+    /*if (abs(p) < threshold) sum += p*/
+    cmp = od_cmplt_abs_epi16(p, thresh);
+    p = _mm_and_si128(p, cmp);
+    sum = _mm_add_epi16(sum, p);
+    /*p = in[i*OD_FILT_BSTRIDE - k*offset] - row*/
+    tmp = _mm_unpacklo_epi64(
+        _mm_loadl_epi64((__m128i *)&in[i * OD_FILT_BSTRIDE - offset]),
+        _mm_loadl_epi64((__m128i *)&in[(i + 1) * OD_FILT_BSTRIDE - offset]));
+    p = _mm_sub_epi16(tmp, row);
+    /*if (abs(p) < threshold) sum += p*/
+    cmp = od_cmplt_abs_epi16(p, thresh);
+    p = _mm_and_si128(p, cmp);
+    sum = _mm_add_epi16(sum, p);
+
+    /*row + ((5*sum + 8) >> 4)*/
+    res = _mm_mullo_epi16(sum, _mm_set1_epi16(5));
+    res = _mm_add_epi16(res, _mm_set1_epi16(8));
+    res = _mm_srai_epi16(res, 4);
+    res = _mm_add_epi16(res, row);
+    _mm_storel_epi64((__m128i *)&y[i * ystride], res);
+    _mm_storel_epi64((__m128i *)&y[(i + 1) * ystride],
+                     _mm_unpackhi_epi64(res, res));
+  }
+}
+
+void od_filter_dering_orthogonal_8x8_sse4_1(int16_t *y, int ystride,
+                                            const int16_t *in, int threshold,
+                                            int dir) {
+  int i;
+  int offset;
+  __m128i res;
+  __m128i p;
+  __m128i cmp;
+  __m128i row;
+  __m128i sum;
+  __m128i thresh;
+  thresh = _mm_set1_epi16(threshold);
+  if (dir > 0 && dir < 4)
+    offset = OD_FILT_BSTRIDE;
+  else
+    offset = 1;
+  for (i = 0; i < 8; i++) {
+    sum = _mm_set1_epi16(0);
+    row = _mm_loadu_si128((__m128i *)&in[i * OD_FILT_BSTRIDE]);
+
+    /*p = in[i*OD_FILT_BSTRIDE + k*offset] - row*/
+    p = _mm_sub_epi16(
+        _mm_loadu_si128((__m128i *)&in[i * OD_FILT_BSTRIDE + 1 * offset]), row);
+    /*if (abs(p) < thresh) sum += p*/
+    cmp = od_cmplt_abs_epi16(p, thresh);
+    p = _mm_and_si128(p, cmp);
+    sum = _mm_add_epi16(sum, p);
+    /*p = in[i*OD_FILT_BSTRIDE - k*offset] - row*/
+    p = _mm_sub_epi16(
+        _mm_loadu_si128((__m128i *)&in[i * OD_FILT_BSTRIDE - 1 * offset]), row);
+    /*if (abs(p) < thresh) sum += p*/
+    cmp = od_cmplt_abs_epi16(p, thresh);
+    p = _mm_and_si128(p, cmp);
+    sum = _mm_add_epi16(sum, p);
+
+    /*p = in[i*OD_FILT_BSTRIDE + k*offset] - row*/
+    p = _mm_sub_epi16(
+        _mm_loadu_si128((__m128i *)&in[i * OD_FILT_BSTRIDE + 2 * offset]), row);
+    /*if (abs(p) < threshold) sum += p*/
+    cmp = od_cmplt_abs_epi16(p, thresh);
+    p = _mm_and_si128(p, cmp);
+    sum = _mm_add_epi16(sum, p);
+    /*p = in[i*OD_FILT_BSTRIDE - k*offset] - row*/
+    p = _mm_sub_epi16(
+        _mm_loadu_si128((__m128i *)&in[i * OD_FILT_BSTRIDE - 2 * offset]), row);
+    /*if (abs(p) < threshold) sum += p*/
+    cmp = od_cmplt_abs_epi16(p, thresh);
+    p = _mm_and_si128(p, cmp);
+    sum = _mm_add_epi16(sum, p);
+
+    /*row + ((3*sum + 8) >> 4)*/
+    res = _mm_mullo_epi16(sum, _mm_set1_epi16(3));
+    res = _mm_add_epi16(res, _mm_set1_epi16(8));
+    res = _mm_srai_epi16(res, 4);
+    res = _mm_add_epi16(res, row);
+    _mm_storeu_si128((__m128i *)&y[i * ystride], res);
+  }
+}
diff --git a/av1/common/x86/od_dering_sse4.h b/av1/common/x86/od_dering_sse4.h
new file mode 100644
index 0000000..950ec5f
--- /dev/null
+++ b/av1/common/x86/od_dering_sse4.h
@@ -0,0 +1,14 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include "av1/common/od_dering.h"
+#ifndef AOM_COMMON_OD_DERING_X86_SSE4_H_
+#define AOM_COMMON_OD_DERING_X86_SSE4_H_
+#endif  // AOM_COMMON_OD_DERING_X86_SSE4_H_
diff --git a/av1/decoder/decodeframe.c b/av1/decoder/decodeframe.c
index e7a0578..acca4cb 100644
--- a/av1/decoder/decodeframe.c
+++ b/av1/decoder/decodeframe.c
@@ -1440,6 +1440,22 @@
   }
 }
 #endif  // CONFIG_SUPERTX
+#if CONFIG_CLPF
+static int clpf_all_skip(const AV1_COMMON *cm, int mi_col, int mi_row,
+                         int size) {
+  int r, c;
+  int skip = 1;
+  const int maxc = AOMMIN(size, cm->mi_cols - mi_col);
+  const int maxr = AOMMIN(size, cm->mi_rows - mi_row);
+  for (r = 0; r < maxr && skip; r++) {
+    for (c = 0; c < maxc && skip; c++) {
+      skip &= !!cm->mi_grid_visible[(mi_row + r) * cm->mi_stride + mi_col + c]
+                    ->mbmi.skip;
+    }
+  }
+  return skip;
+}
+#endif
 
 // TODO(slavarnway): eliminate bsize and subsize in future commits
 static void decode_partition(AV1Decoder *const pbi, MACROBLOCKD *const xd,
@@ -1772,7 +1788,44 @@
   if (bsize >= BLOCK_8X8 &&
       (bsize == BLOCK_8X8 || partition != PARTITION_SPLIT))
     dec_update_partition_context(xd, mi_row, mi_col, subsize, num_8x8_wh);
-#if DERING_REFINEMENT
+
+#if CONFIG_CLPF
+  if (bsize == BLOCK_64X64 && cm->clpf_strength_y &&
+      cm->clpf_size != CLPF_NOSIZE) {
+    const int tl = mi_row * MI_SIZE / MIN_FB_SIZE * cm->clpf_stride +
+                   mi_col * MI_SIZE / MIN_FB_SIZE;
+
+    if (!((mi_row * MI_SIZE) & 127) && !((mi_col * MI_SIZE) & 127) &&
+        cm->clpf_size == CLPF_128X128) {
+      cm->clpf_blocks[tl] = aom_read_literal(r, 1, ACCT_STR);
+    } else if (cm->clpf_size == CLPF_64X64 &&
+               !clpf_all_skip(cm, mi_col, mi_row, 64 / MI_SIZE)) {
+      cm->clpf_blocks[tl] = aom_read_literal(r, 1, ACCT_STR);
+    } else if (cm->clpf_size == CLPF_32X32) {
+      const int tr = tl + 1;
+      const int bl = tl + cm->clpf_stride;
+      const int br = tr + cm->clpf_stride;
+      const int size = 32 / MI_SIZE;
+
+      // Up to four bits per SB
+      if (!clpf_all_skip(cm, mi_col, mi_row, size))
+        cm->clpf_blocks[tl] = aom_read_literal(r, 1, ACCT_STR);
+
+      if (mi_col + size < cm->mi_cols &&
+          !clpf_all_skip(cm, mi_col + size, mi_row, size))
+        cm->clpf_blocks[tr] = aom_read_literal(r, 1, ACCT_STR);
+
+      if (mi_row + size < cm->mi_rows &&
+          !clpf_all_skip(cm, mi_col, mi_row + size, size))
+        cm->clpf_blocks[bl] = aom_read_literal(r, 1, ACCT_STR);
+
+      if (mi_col + size < cm->mi_cols && mi_row + size < cm->mi_rows &&
+          !clpf_all_skip(cm, mi_col + size, mi_row + size, size))
+        cm->clpf_blocks[br] = aom_read_literal(r, 1, ACCT_STR);
+    }
+  }
+#endif
+#if CONFIG_DERING
   if (bsize == BLOCK_64X64) {
     if (cm->dering_level != 0 && !sb_all_skip(cm, mi_row, mi_col)) {
       cm->mi_grid_visible[mi_row * cm->mi_stride + mi_col]->mbmi.dering_gain =
@@ -1782,7 +1835,7 @@
           0;
     }
   }
-#endif  // DERGING_REFINEMENT
+#endif
 #endif  // CONFIG_EXT_PARTITION_TYPES
 }
 
@@ -2045,20 +2098,26 @@
 }
 
 #if CONFIG_CLPF
-static void setup_clpf(AV1_COMMON *cm, struct aom_read_bit_buffer *rb) {
+static void setup_clpf(AV1Decoder *pbi, struct aom_read_bit_buffer *rb) {
+  AV1_COMMON *const cm = &pbi->common;
+  const int width = pbi->cur_buf->buf.y_crop_width;
+  const int height = pbi->cur_buf->buf.y_crop_height;
+
   cm->clpf_blocks = 0;
   cm->clpf_strength_y = aom_rb_read_literal(rb, 2);
   cm->clpf_strength_u = aom_rb_read_literal(rb, 2);
   cm->clpf_strength_v = aom_rb_read_literal(rb, 2);
   if (cm->clpf_strength_y) {
     cm->clpf_size = aom_rb_read_literal(rb, 2);
-    if (cm->clpf_size) {
-      int i;
-      cm->clpf_numblocks = aom_rb_read_literal(rb, av1_clpf_maxbits(cm));
-      CHECK_MEM_ERROR(cm, cm->clpf_blocks, aom_malloc(cm->clpf_numblocks));
-      for (i = 0; i < cm->clpf_numblocks; i++) {
-        cm->clpf_blocks[i] = aom_rb_read_literal(rb, 1);
-      }
+    if (cm->clpf_size != CLPF_NOSIZE) {
+      int size;
+      cm->clpf_stride =
+          ((width + MIN_FB_SIZE - 1) & ~(MIN_FB_SIZE - 1)) >> MIN_FB_SIZE_LOG2;
+      size =
+          cm->clpf_stride * ((height + MIN_FB_SIZE - 1) & ~(MIN_FB_SIZE - 1)) >>
+          MIN_FB_SIZE_LOG2;
+      CHECK_MEM_ERROR(cm, cm->clpf_blocks, aom_malloc(size));
+      memset(cm->clpf_blocks, -1, size);
     }
   }
 }
@@ -2068,7 +2127,7 @@
                     UNUSED const YV12_BUFFER_CONFIG *org,
                     UNUSED const AV1_COMMON *cm, UNUSED int block_size,
                     UNUSED int w, UNUSED int h, UNUSED unsigned int strength,
-                    UNUSED unsigned int fb_size_log2, uint8_t *bit) {
+                    UNUSED unsigned int fb_size_log2, int8_t *bit) {
   return *bit;
 }
 #endif
@@ -3224,11 +3283,10 @@
       memset(&cm->ref_frame_map, -1, sizeof(cm->ref_frame_map));
       pbi->need_resync = 0;
     }
-    if (frame_is_intra_only(cm))
-      cm->allow_screen_content_tools = aom_rb_read_bit(rb);
+    cm->allow_screen_content_tools = aom_rb_read_bit(rb);
   } else {
     cm->intra_only = cm->show_frame ? 0 : aom_rb_read_bit(rb);
-
+    if (cm->intra_only) cm->allow_screen_content_tools = aom_rb_read_bit(rb);
     if (cm->error_resilient_mode) {
       cm->reset_frame_context = RESET_FRAME_CONTEXT_ALL;
     } else {
@@ -3362,7 +3420,7 @@
 
   setup_loopfilter(cm, rb);
 #if CONFIG_CLPF
-  setup_clpf(cm, rb);
+  setup_clpf(pbi, rb);
 #endif
 #if CONFIG_DERING
   setup_dering(cm, rb);
@@ -3934,18 +3992,18 @@
   if (!cm->skip_loop_filter) {
     const YV12_BUFFER_CONFIG *const frame = &pbi->cur_buf->buf;
     if (cm->clpf_strength_y) {
-      av1_clpf_frame(frame, NULL, cm, !!cm->clpf_size,
+      av1_clpf_frame(frame, NULL, cm, cm->clpf_size != CLPF_NOSIZE,
                      cm->clpf_strength_y + (cm->clpf_strength_y == 3),
-                     4 + cm->clpf_size, cm->clpf_blocks, AOM_PLANE_Y, clpf_bit);
+                     4 + cm->clpf_size, AOM_PLANE_Y, clpf_bit);
     }
     if (cm->clpf_strength_u) {
-      av1_clpf_frame(frame, NULL, cm, 0,
-                     cm->clpf_strength_u + (cm->clpf_strength_u == 3), 4, NULL,
+      av1_clpf_frame(frame, NULL, cm, 0,  // No block signals for chroma
+                     cm->clpf_strength_u + (cm->clpf_strength_u == 3), 4,
                      AOM_PLANE_U, NULL);
     }
     if (cm->clpf_strength_v) {
-      av1_clpf_frame(frame, NULL, cm, 0,
-                     cm->clpf_strength_v + (cm->clpf_strength_v == 3), 4, NULL,
+      av1_clpf_frame(frame, NULL, cm, 0,  // No block signals for chroma
+                     cm->clpf_strength_v + (cm->clpf_strength_v == 3), 4,
                      AOM_PLANE_V, NULL);
     }
   }
diff --git a/av1/decoder/detokenize.c b/av1/decoder/detokenize.c
index b83ab3d..9a40f69 100644
--- a/av1/decoder/detokenize.c
+++ b/av1/decoder/detokenize.c
@@ -39,7 +39,6 @@
     if (counts) ++coef_counts[band][ctx][token]; \
   } while (0)
 
-#if !CONFIG_ANS
 static INLINE int read_coeff(const aom_prob *probs, int n, aom_reader *r) {
   int i, val = 0;
   for (i = 0; i < n; ++i) val = (val << 1) | aom_read(r, probs[i]);
@@ -75,6 +74,11 @@
   const aom_prob(*coef_probs)[COEFF_CONTEXTS][UNCONSTRAINED_NODES] =
       fc->coef_probs[tx_size_ctx][type][ref];
   const aom_prob *prob;
+#if CONFIG_ANS
+  const aom_cdf_prob(*const coef_cdfs)[COEFF_CONTEXTS][ENTROPY_TOKENS] =
+      fc->coef_cdfs[tx_size_ctx][type][ref];
+  const aom_cdf_prob(*cdf)[ENTROPY_TOKENS];
+#endif  // CONFIG_ANS
   unsigned int(*coef_counts)[COEFF_CONTEXTS][UNCONSTRAINED_NODES + 1];
   unsigned int(*eob_branch_count)[COEFF_CONTEXTS];
   uint8_t token_cache[MAX_TX_SQUARE];
@@ -160,7 +164,53 @@
       dqv_val = &dq_val[band][0];
 #endif  // CONFIG_NEW_QUANT
     }
-
+#if CONFIG_ANS
+    cdf = &coef_cdfs[band][ctx];
+    token =
+        ONE_TOKEN + aom_read_symbol(r, *cdf, CATEGORY6_TOKEN - ONE_TOKEN + 1);
+    INCREMENT_COUNT(ONE_TOKEN + (token > ONE_TOKEN));
+    switch (token) {
+      case ONE_TOKEN:
+      case TWO_TOKEN:
+      case THREE_TOKEN:
+      case FOUR_TOKEN: val = token; break;
+      case CATEGORY1_TOKEN:
+        val = CAT1_MIN_VAL + read_coeff(cat1_prob, 1, r);
+        break;
+      case CATEGORY2_TOKEN:
+        val = CAT2_MIN_VAL + read_coeff(cat2_prob, 2, r);
+        break;
+      case CATEGORY3_TOKEN:
+        val = CAT3_MIN_VAL + read_coeff(cat3_prob, 3, r);
+        break;
+      case CATEGORY4_TOKEN:
+        val = CAT4_MIN_VAL + read_coeff(cat4_prob, 4, r);
+        break;
+      case CATEGORY5_TOKEN:
+        val = CAT5_MIN_VAL + read_coeff(cat5_prob, 5, r);
+        break;
+      case CATEGORY6_TOKEN: {
+        const int skip_bits = TX_SIZES - 1 - txsize_sqr_up_map[tx_size];
+        const uint8_t *cat6p = cat6_prob + skip_bits;
+#if CONFIG_AOM_HIGHBITDEPTH
+        switch (xd->bd) {
+          case AOM_BITS_8:
+            val = CAT6_MIN_VAL + read_coeff(cat6p, 14 - skip_bits, r);
+            break;
+          case AOM_BITS_10:
+            val = CAT6_MIN_VAL + read_coeff(cat6p, 16 - skip_bits, r);
+            break;
+          case AOM_BITS_12:
+            val = CAT6_MIN_VAL + read_coeff(cat6p, 18 - skip_bits, r);
+            break;
+          default: assert(0); return -1;
+        }
+#else
+        val = CAT6_MIN_VAL + read_coeff(cat6p, 14 - skip_bits, r);
+#endif
+      } break;
+    }
+#else
     if (!aom_read(r, prob[ONE_CONTEXT_NODE])) {
       INCREMENT_COUNT(ONE_TOKEN);
       token = ONE_TOKEN;
@@ -211,8 +261,8 @@
         }
       }
     }
+#endif  // CONFIG_ANS
 #if CONFIG_NEW_QUANT
-
     v = av1_dequant_abscoeff_nuq(val, dqv, dqv_val);
     v = dq_shift ? ROUND_POWER_OF_TWO(v, dq_shift) : v;
 #else
@@ -240,186 +290,6 @@
 
   return c;
 }
-#else  // !CONFIG_ANS
-static INLINE int read_coeff(const aom_prob *const probs, int n,
-                             struct AnsDecoder *const ans) {
-  int i, val = 0;
-  for (i = 0; i < n; ++i) val = (val << 1) | uabs_read(ans, probs[i]);
-  return val;
-}
-
-static int decode_coefs_ans(const MACROBLOCKD *const xd, PLANE_TYPE type,
-                            tran_low_t *dqcoeff, TX_SIZE tx_size,
-                            TX_TYPE tx_type, const int16_t *dq,
-#if CONFIG_NEW_QUANT
-                            dequant_val_type_nuq *dq_val,
-#endif  // CONFIG_NEW_QUANT
-                            int ctx, const int16_t *scan, const int16_t *nb,
-                            struct AnsDecoder *const ans) {
-  FRAME_COUNTS *counts = xd->counts;
-  const int max_eob = get_tx2d_size(tx_size);
-  const FRAME_CONTEXT *const fc = xd->fc;
-  const int ref = is_inter_block(&xd->mi[0]->mbmi);
-  int band, c = 0;
-  int skip_eob = 0;
-  const int tx_size_ctx = txsize_sqr_map[tx_size];
-  const aom_prob(*coef_probs)[COEFF_CONTEXTS][UNCONSTRAINED_NODES] =
-      fc->coef_probs[tx_size_ctx][type][ref];
-  const rans_lut(*coef_cdfs)[COEFF_CONTEXTS] =
-      fc->coef_cdfs[tx_size_ctx][type][ref];
-  const aom_prob *prob;
-  const rans_lut *cdf;
-  unsigned int(*coef_counts)[COEFF_CONTEXTS][UNCONSTRAINED_NODES + 1];
-  unsigned int(*eob_branch_count)[COEFF_CONTEXTS];
-  uint8_t token_cache[MAX_TX_SQUARE];
-  const uint8_t *band_translate = get_band_translate(tx_size);
-  int dq_shift;
-  int v, token;
-  int16_t dqv = dq[0];
-#if CONFIG_NEW_QUANT
-  const tran_low_t *dqv_val = &dq_val[0][0];
-#endif  // CONFIG_NEW_QUANT
-  const uint8_t *cat1_prob;
-  const uint8_t *cat2_prob;
-  const uint8_t *cat3_prob;
-  const uint8_t *cat4_prob;
-  const uint8_t *cat5_prob;
-  const uint8_t *cat6_prob;
-
-  dq_shift = get_tx_scale(xd, tx_type, tx_size);
-
-  if (counts) {
-    coef_counts = counts->coef[tx_size_ctx][type][ref];
-    eob_branch_count = counts->eob_branch[tx_size_ctx][type][ref];
-  }
-
-#if CONFIG_AOM_HIGHBITDEPTH
-  if (xd->bd > AOM_BITS_8) {
-    if (xd->bd == AOM_BITS_10) {
-      cat1_prob = av1_cat1_prob_high10;
-      cat2_prob = av1_cat2_prob_high10;
-      cat3_prob = av1_cat3_prob_high10;
-      cat4_prob = av1_cat4_prob_high10;
-      cat5_prob = av1_cat5_prob_high10;
-      cat6_prob = av1_cat6_prob_high10;
-    } else {
-      cat1_prob = av1_cat1_prob_high12;
-      cat2_prob = av1_cat2_prob_high12;
-      cat3_prob = av1_cat3_prob_high12;
-      cat4_prob = av1_cat4_prob_high12;
-      cat5_prob = av1_cat5_prob_high12;
-      cat6_prob = av1_cat6_prob_high12;
-    }
-  } else {
-    cat1_prob = av1_cat1_prob;
-    cat2_prob = av1_cat2_prob;
-    cat3_prob = av1_cat3_prob;
-    cat4_prob = av1_cat4_prob;
-    cat5_prob = av1_cat5_prob;
-    cat6_prob = av1_cat6_prob;
-  }
-#else
-  cat1_prob = av1_cat1_prob;
-  cat2_prob = av1_cat2_prob;
-  cat3_prob = av1_cat3_prob;
-  cat4_prob = av1_cat4_prob;
-  cat5_prob = av1_cat5_prob;
-  cat6_prob = av1_cat6_prob;
-#endif
-
-  while (c < max_eob) {
-    int val = -1;
-    band = *band_translate++;
-    prob = coef_probs[band][ctx];
-    if (!skip_eob) {
-      if (counts) ++eob_branch_count[band][ctx];
-      if (!uabs_read(ans, prob[EOB_CONTEXT_NODE])) {
-        INCREMENT_COUNT(EOB_MODEL_TOKEN);
-        break;
-      }
-    }
-
-#if CONFIG_NEW_QUANT
-    dqv_val = &dq_val[band][0];
-#endif  // CONFIG_NEW_QUANT
-
-    cdf = &coef_cdfs[band][ctx];
-    token = ZERO_TOKEN + rans_read(ans, *cdf);
-    if (token == ZERO_TOKEN) {
-      INCREMENT_COUNT(ZERO_TOKEN);
-      token_cache[scan[c]] = 0;
-      skip_eob = 1;
-    } else {
-      INCREMENT_COUNT(ONE_TOKEN + (token > ONE_TOKEN));
-      switch (token) {
-        case ONE_TOKEN:
-        case TWO_TOKEN:
-        case THREE_TOKEN:
-        case FOUR_TOKEN: val = token; break;
-        case CATEGORY1_TOKEN:
-          val = CAT1_MIN_VAL + read_coeff(cat1_prob, 1, ans);
-          break;
-        case CATEGORY2_TOKEN:
-          val = CAT2_MIN_VAL + read_coeff(cat2_prob, 2, ans);
-          break;
-        case CATEGORY3_TOKEN:
-          val = CAT3_MIN_VAL + read_coeff(cat3_prob, 3, ans);
-          break;
-        case CATEGORY4_TOKEN:
-          val = CAT4_MIN_VAL + read_coeff(cat4_prob, 4, ans);
-          break;
-        case CATEGORY5_TOKEN:
-          val = CAT5_MIN_VAL + read_coeff(cat5_prob, 5, ans);
-          break;
-        case CATEGORY6_TOKEN: {
-          const int skip_bits = TX_SIZES - 1 - txsize_sqr_up_map[tx_size];
-          const uint8_t *cat6p = cat6_prob + skip_bits;
-#if CONFIG_AOM_HIGHBITDEPTH
-          switch (xd->bd) {
-            case AOM_BITS_8:
-              val = CAT6_MIN_VAL + read_coeff(cat6p, 14 - skip_bits, ans);
-              break;
-            case AOM_BITS_10:
-              val = CAT6_MIN_VAL + read_coeff(cat6p, 16 - skip_bits, ans);
-              break;
-            case AOM_BITS_12:
-              val = CAT6_MIN_VAL + read_coeff(cat6p, 18 - skip_bits, ans);
-              break;
-            default: assert(0); return -1;
-          }
-#else
-          val = CAT6_MIN_VAL + read_coeff(cat6p, 14 - skip_bits, ans);
-#endif
-        } break;
-      }
-#if CONFIG_NEW_QUANT
-      v = av1_dequant_abscoeff_nuq(val, dqv, dqv_val);
-      v = dq_shift ? ROUND_POWER_OF_TWO(v, dq_shift) : v;
-#else
-      v = (val * dqv) >> dq_shift;
-#endif  // CONFIG_NEW_QUANT
-
-#if CONFIG_COEFFICIENT_RANGE_CHECKING
-#if CONFIG_AOM_HIGHBITDEPTH
-      dqcoeff[scan[c]] =
-          highbd_check_range((uabs_read_bit(ans) ? -v : v), xd->bd);
-#else
-      dqcoeff[scan[c]] = check_range(uabs_read_bit(ans) ? -v : v);
-#endif  // CONFIG_AOM_HIGHBITDEPTH
-#else
-      dqcoeff[scan[c]] = uabs_read_bit(ans) ? -v : v;
-#endif  // CONFIG_COEFFICIENT_RANGE_CHECKING
-      token_cache[scan[c]] = av1_pt_energy_class[token];
-      skip_eob = 0;
-    }
-    ++c;
-    ctx = get_coef_context(nb, token_cache, c);
-    dqv = dq[1];
-  }
-
-  return c;
-}
-#endif  // !CONFIG_ANS
 
 // TODO(slavarnway): Decode version of av1_set_context.  Modify
 // av1_set_context
@@ -510,7 +380,6 @@
       get_dq_profile_from_ctx(xd->qindex[seg_id], ctx, ref, pd->plane_type);
 #endif  //  CONFIG_NEW_QUANT
 
-#if !CONFIG_ANS
 #if CONFIG_AOM_QM
   const int eob =
       decode_coefs(xd, pd->plane_type, pd->dqcoeff, tx_size, tx_type, dequant,
@@ -523,14 +392,6 @@
 #endif  // CONFIG_NEW_QUANT
                    ctx, sc->scan, sc->neighbors, r);
 #endif  // CONFIG_AOM_QM
-#else
-  const int eob = decode_coefs_ans(xd, pd->plane_type, pd->dqcoeff, tx_size,
-                                   tx_type, dequant,
-#if CONFIG_NEW_QUANT
-                                   pd->seg_dequant_nuq[seg_id][dq],
-#endif  // CONFIG_NEW_QUANT
-                                   ctx, sc->scan, sc->neighbors, r);
-#endif  // !CONFIG_ANS
   dec_set_contexts(xd, pd, tx_size, eob > 0, x, y);
   /*
   av1_set_contexts(xd, pd,
diff --git a/av1/encoder/bitstream.c b/av1/encoder/bitstream.c
index f4b5da6..80da661 100644
--- a/av1/encoder/bitstream.c
+++ b/av1/encoder/bitstream.c
@@ -620,7 +620,6 @@
 }
 #endif  // CONFIG_SUPERTX
 
-#if !CONFIG_ANS
 static void pack_mb_tokens(aom_writer *w, const TOKENEXTRA **tp,
                            const TOKENEXTRA *const stop,
                            aom_bit_depth_t bit_depth, const TX_SIZE tx) {
@@ -632,9 +631,11 @@
 
   while (p < stop && p->token != EOSB_TOKEN) {
     const int t = p->token;
+#if !CONFIG_ANS
     const struct av1_token *const a = &av1_coef_encodings[t];
     int v = a->value;
     int n = a->len;
+#endif  // !CONFIG_ANS
 #if CONFIG_AOM_HIGHBITDEPTH
     const av1_extra_bit *b;
     if (bit_depth == AOM_BITS_12)
@@ -648,6 +649,19 @@
     (void)bit_depth;
 #endif  // CONFIG_AOM_HIGHBITDEPTH
 
+#if CONFIG_ANS
+    /* skip one or two nodes */
+    if (!p->skip_eob_node) aom_write(w, t != EOB_TOKEN, p->context_tree[0]);
+
+    if (t != EOB_TOKEN) {
+      aom_write(w, t != ZERO_TOKEN, p->context_tree[1]);
+
+      if (t != ZERO_TOKEN) {
+        aom_write_symbol(w, t - ONE_TOKEN, *p->token_cdf,
+                         CATEGORY6_TOKEN - ONE_TOKEN + 1);
+      }
+    }
+#else
     /* skip one or two nodes */
     if (p->skip_eob_node)
       n -= p->skip_eob_node;
@@ -668,6 +682,7 @@
         }
       }
     }
+#endif  // CONFIG_ANS
 
     if (b->base_val) {
       const int e = p->extra, l = b->len;
@@ -705,83 +720,6 @@
 
   *tp = p;
 }
-#else
-// This function serializes the tokens in forward order using a buffered ans
-// coder.
-static void pack_mb_tokens(struct BufAnsCoder *ans, const TOKENEXTRA **tp,
-                           const TOKENEXTRA *const stop,
-                           aom_bit_depth_t bit_depth, const TX_SIZE tx) {
-  const TOKENEXTRA *p = *tp;
-#if CONFIG_VAR_TX
-  int count = 0;
-  const int seg_eob = 16 << (tx << 1);
-#endif  // CONFIG_VAR_TX
-
-  while (p < stop && p->token != EOSB_TOKEN) {
-    const int t = p->token;
-#if CONFIG_AOM_HIGHBITDEPTH
-    const av1_extra_bit *b;
-    if (bit_depth == AOM_BITS_12)
-      b = &av1_extra_bits_high12[t];
-    else if (bit_depth == AOM_BITS_10)
-      b = &av1_extra_bits_high10[t];
-    else
-      b = &av1_extra_bits[t];
-#else
-    const av1_extra_bit *const b = &av1_extra_bits[t];
-    (void)bit_depth;
-#endif  // CONFIG_AOM_HIGHBITDEPTH
-
-    /* skip one or two nodes */
-    if (!p->skip_eob_node)
-      buf_uabs_write(ans, t != EOB_TOKEN, p->context_tree[0]);
-
-    if (t != EOB_TOKEN) {
-      struct rans_sym s;
-      const rans_lut *token_cdf = p->token_cdf;
-      assert(token_cdf);
-      s.cum_prob = (*token_cdf)[t - ZERO_TOKEN];
-      s.prob = (*token_cdf)[t - ZERO_TOKEN + 1] - s.cum_prob;
-      buf_rans_write(ans, &s);
-
-      if (b->base_val) {
-        const int e = p->extra, l = b->len;
-        int skip_bits = (b->base_val == CAT6_MIN_VAL)
-                            ? TX_SIZES - 1 - txsize_sqr_up_map[tx]
-                            : 0;
-
-        if (l) {
-          const unsigned char *pb = b->prob;
-          int v = e >> 1;
-          int n = l; /* number of bits in v, assumed nonzero */
-          int i = 0;
-
-          do {
-            const int bb = (v >> --n) & 1;
-            if (skip_bits) {
-              skip_bits--;
-              assert(!bb);
-            } else {
-              buf_uabs_write(ans, bb, pb[i >> 1]);
-            }
-            i = b->tree[i + bb];
-          } while (n);
-        }
-
-        buf_uabs_write(ans, e & 1, 128);
-      }
-    }
-    ++p;
-
-#if CONFIG_VAR_TX
-    ++count;
-    if (t == EOB_TOKEN || count == seg_eob) break;
-#endif  // CONFIG_VAR_TX
-  }
-
-  *tp = p;
-}
-#endif  // !CONFIG_ANS
 
 #if CONFIG_VAR_TX
 static void pack_txb_tokens(aom_writer *w, const TOKENEXTRA **tp,
@@ -1932,7 +1870,38 @@
       (bsize == BLOCK_8X8 || partition != PARTITION_SPLIT))
     update_partition_context(xd, mi_row, mi_col, subsize, bsize);
 
-#if DERING_REFINEMENT
+#if CONFIG_CLPF
+  if (bsize == BLOCK_64X64 && cm->clpf_blocks && cm->clpf_strength_y &&
+      cm->clpf_size != CLPF_NOSIZE) {
+    const int tl = mi_row * MI_SIZE / MIN_FB_SIZE * cm->clpf_stride +
+                   mi_col * MI_SIZE / MIN_FB_SIZE;
+    const int tr = tl + 1;
+    const int bl = tl + cm->clpf_stride;
+    const int br = tr + cm->clpf_stride;
+
+    // Up to four bits per SB.
+    // When clpf_size indicates a size larger than the SB size
+    // (CLPF_128X128), one bit for every fourth SB will be transmitted
+    // regardless of skip blocks.
+    if (cm->clpf_blocks[tl] != CLPF_NOFLAG)
+      aom_write_literal(w, cm->clpf_blocks[tl], 1);
+
+    if (mi_col + MI_SIZE / 2 < cm->mi_cols &&
+        cm->clpf_blocks[tr] != CLPF_NOFLAG)
+      aom_write_literal(w, cm->clpf_blocks[tr], 1);
+
+    if (mi_row + MI_SIZE / 2 < cm->mi_rows &&
+        cm->clpf_blocks[bl] != CLPF_NOFLAG)
+      aom_write_literal(w, cm->clpf_blocks[bl], 1);
+
+    if (mi_row + MI_SIZE / 2 < cm->mi_rows &&
+        mi_col + MI_SIZE / 2 < cm->mi_cols &&
+        cm->clpf_blocks[br] != CLPF_NOFLAG)
+      aom_write_literal(w, cm->clpf_blocks[br], 1);
+  }
+#endif
+
+#if CONFIG_DERING
   if (bsize == BLOCK_64X64 && cm->dering_level != 0 &&
       !sb_all_skip(cm, mi_row, mi_col)) {
     aom_write_literal(
@@ -2596,18 +2565,6 @@
   aom_wb_write_literal(wb, cm->clpf_strength_v, 2);
   if (cm->clpf_strength_y) {
     aom_wb_write_literal(wb, cm->clpf_size, 2);
-    if (cm->clpf_size) {
-      int i;
-      // TODO(stemidts): The number of bits to transmit could be
-      // implicitly deduced if transmitted after the filter block or
-      // after the frame (when it's known whether the block is all
-      // skip and implicitly unfiltered).  And the bits do not have
-      // 50% probability, so a more efficient coding is possible.
-      aom_wb_write_literal(wb, cm->clpf_numblocks, av1_clpf_maxbits(cm));
-      for (i = 0; i < cm->clpf_numblocks; i++) {
-        aom_wb_write_literal(wb, cm->clpf_blocks ? cm->clpf_blocks[i] : 0, 1);
-      }
-    }
   }
 }
 #endif
@@ -3199,11 +3156,10 @@
     write_sync_code(wb);
     write_bitdepth_colorspace_sampling(cm, wb);
     write_frame_size(cm, wb);
-    if (frame_is_intra_only(cm))
-      aom_wb_write_bit(wb, cm->allow_screen_content_tools);
+    aom_wb_write_bit(wb, cm->allow_screen_content_tools);
   } else {
     if (!cm->show_frame) aom_wb_write_bit(wb, cm->intra_only);
-
+    if (cm->intra_only) aom_wb_write_bit(wb, cm->allow_screen_content_tools);
     if (!cm->error_resilient_mode) {
       if (cm->intra_only) {
         aom_wb_write_bit(wb,
diff --git a/av1/encoder/clpf_rdo.c b/av1/encoder/clpf_rdo.c
index 1d498f1..4e652b6 100644
--- a/av1/encoder/clpf_rdo.c
+++ b/av1/encoder/clpf_rdo.c
@@ -127,14 +127,15 @@
 int av1_clpf_decision(int k, int l, const YV12_BUFFER_CONFIG *rec,
                       const YV12_BUFFER_CONFIG *org, const AV1_COMMON *cm,
                       int block_size, int w, int h, unsigned int strength,
-                      unsigned int fb_size_log2, uint8_t *res) {
+                      unsigned int fb_size_log2, int8_t *res) {
   int m, n, sum0 = 0, sum1 = 0;
 
   for (m = 0; m < h; m++) {
     for (n = 0; n < w; n++) {
       int xpos = (l << fb_size_log2) + n * block_size;
       int ypos = (k << fb_size_log2) + m * block_size;
-      if (!cm->mi_grid_visible[ypos / MI_SIZE * cm->mi_stride + xpos / MI_SIZE]
+      if (fb_size_log2 == MAX_FB_SIZE_LOG2 ||
+          !cm->mi_grid_visible[ypos / MI_SIZE * cm->mi_stride + xpos / MI_SIZE]
                ->mbmi.skip) {
 #if CONFIG_AOM_HIGHBITDEPTH
         if (cm->use_highbitdepth) {
@@ -167,6 +168,8 @@
 // (Only for luma:)
 // res[1][0]   : (bit count, fb size = 128)
 // res[1][1-3] : strength=1,2,4, fb size = 128
+// res[1][4]   : unfiltered, including skip
+// res[1][5-7] : strength=1,2,4, including skip, fb_size = 128
 // res[2][0]   : (bit count, fb size = 64)
 // res[2][1-3] : strength=1,2,4, fb size = 64
 // res[3][0]   : (bit count, fb size = 32)
@@ -174,9 +177,9 @@
 static int clpf_rdo(int y, int x, const YV12_BUFFER_CONFIG *rec,
                     const YV12_BUFFER_CONFIG *org, const AV1_COMMON *cm,
                     unsigned int block_size, unsigned int fb_size_log2, int w,
-                    int h, int64_t res[4][4], int plane) {
+                    int h, int64_t res[4][8], int plane) {
   int c, m, n, filtered = 0;
-  int sum[4];
+  int sum[8];
   const int subx = plane != AOM_PLANE_Y && rec->subsampling_x;
   const int suby = plane != AOM_PLANE_Y && rec->subsampling_y;
   int bslog = get_msb(block_size);
@@ -193,12 +196,12 @@
       plane != AOM_PLANE_Y ? rec->uv_crop_height : rec->y_crop_height;
   int rec_stride = plane != AOM_PLANE_Y ? rec->uv_stride : rec->y_stride;
   int org_stride = plane != AOM_PLANE_Y ? org->uv_stride : org->y_stride;
-  sum[0] = sum[1] = sum[2] = sum[3] = 0;
+  sum[0] = sum[1] = sum[2] = sum[3] = sum[4] = sum[5] = sum[6] = sum[7] = 0;
   if (plane == AOM_PLANE_Y &&
       fb_size_log2 > (unsigned int)get_msb(MAX_FB_SIZE) - 3) {
     int w1, h1, w2, h2, i, sum1, sum2, sum3, oldfiltered;
 
-    fb_size_log2--;
+    filtered = fb_size_log2-- == MAX_FB_SIZE_LOG2;
     w1 = AOMMIN(1 << (fb_size_log2 - bslog), w);
     h1 = AOMMIN(1 << (fb_size_log2 - bslog), h);
     w2 = AOMMIN(w - (1 << (fb_size_log2 - bslog)), w >> 1);
@@ -210,8 +213,8 @@
     oldfiltered = res[i][0];
     res[i][0] = 0;
 
-    filtered = clpf_rdo(y, x, rec, org, cm, block_size, fb_size_log2, w1, h1,
-                        res, plane);
+    filtered |= clpf_rdo(y, x, rec, org, cm, block_size, fb_size_log2, w1, h1,
+                         res, plane);
     if (1 << (fb_size_log2 - bslog) < w)
       filtered |= clpf_rdo(y, x + (1 << fb_size_log2), rec, org, cm, block_size,
                            fb_size_log2, w2, h1, res, plane);
@@ -223,10 +226,18 @@
                    cm, block_size, fb_size_log2, w2, h2, res, plane);
     }
 
+    // Correct sums for unfiltered blocks
     res[i][1] = AOMMIN(sum1 + res[i][0], res[i][1]);
     res[i][2] = AOMMIN(sum2 + res[i][0], res[i][2]);
     res[i][3] = AOMMIN(sum3 + res[i][0], res[i][3]);
+    if (i == 1) {
+      res[i][5] = AOMMIN(sum1 + res[i][4], res[i][5]);
+      res[i][6] = AOMMIN(sum2 + res[i][4], res[i][6]);
+      res[i][7] = AOMMIN(sum3 + res[i][4], res[i][7]);
+    }
+
     res[i][0] = oldfiltered + filtered;  // Number of signal bits
+
     return filtered;
   }
 
@@ -234,27 +245,28 @@
     for (n = 0; n < w; n++) {
       int xpos = x + n * block_size;
       int ypos = y + m * block_size;
-      if (!cm->mi_grid_visible[(ypos << suby) / MI_SIZE * cm->mi_stride +
-                               (xpos << subx) / MI_SIZE]
-               ->mbmi.skip) {
+      int skip =  // Filtered skip blocks stored only for fb_size == 128
+          4 *
+          !!cm->mi_grid_visible[(ypos << suby) / MI_SIZE * cm->mi_stride +
+                                (xpos << subx) / MI_SIZE]
+                ->mbmi.skip;
 #if CONFIG_AOM_HIGHBITDEPTH
-        if (cm->use_highbitdepth) {
-          aom_clpf_detect_multi_hbd(
-              CONVERT_TO_SHORTPTR(rec_buffer), CONVERT_TO_SHORTPTR(org_buffer),
-              rec_stride, org_stride, xpos, ypos, rec_width, rec_height, sum,
-              cm->bit_depth - 8, block_size);
-        } else {
-          aom_clpf_detect_multi(rec_buffer, org_buffer, rec_stride, org_stride,
-                                xpos, ypos, rec_width, rec_height, sum,
-                                block_size);
-        }
-#else
+      if (cm->use_highbitdepth) {
+        aom_clpf_detect_multi_hbd(CONVERT_TO_SHORTPTR(rec_buffer),
+                                  CONVERT_TO_SHORTPTR(org_buffer), rec_stride,
+                                  org_stride, xpos, ypos, rec_width, rec_height,
+                                  sum + skip, cm->bit_depth - 8, block_size);
+      } else {
         aom_clpf_detect_multi(rec_buffer, org_buffer, rec_stride, org_stride,
-                              xpos, ypos, rec_width, rec_height, sum,
+                              xpos, ypos, rec_width, rec_height, sum + skip,
                               block_size);
-#endif
-        filtered = 1;
       }
+#else
+      aom_clpf_detect_multi(rec_buffer, org_buffer, rec_stride, org_stride,
+                            xpos, ypos, rec_width, rec_height, sum + skip,
+                            block_size);
+#endif
+      filtered |= !skip;
     }
   }
 
@@ -263,6 +275,12 @@
     res[c][1] += sum[1];
     res[c][2] += sum[2];
     res[c][3] += sum[3];
+    if (c != 1) continue;
+    // Only needed when fb_size == 128
+    res[c][4] += sum[4];
+    res[c][5] += sum[5];
+    res[c][6] += sum[6];
+    res[c][7] += sum[7];
   }
   return filtered;
 }
@@ -271,7 +289,7 @@
                          const YV12_BUFFER_CONFIG *org, const AV1_COMMON *cm,
                          int *best_strength, int *best_bs, int plane) {
   int c, j, k, l;
-  int64_t best, sums[4][4];
+  int64_t best, sums[4][8];
   int width = plane != AOM_PLANE_Y ? rec->uv_crop_width : rec->y_crop_width;
   int height = plane != AOM_PLANE_Y ? rec->uv_crop_height : rec->y_crop_height;
   const int bs = MI_SIZE;
@@ -303,8 +321,14 @@
       }
     }
 
-  if (plane != AOM_PLANE_Y)  // Slightly favour unfiltered chroma
+  // For fb_size == 128 skip blocks are included in the result.
+  if (plane == AOM_PLANE_Y) {
+    sums[1][1] += sums[1][5] - sums[1][4];
+    sums[1][2] += sums[1][6] - sums[1][4];
+    sums[1][3] += sums[1][7] - sums[1][4];
+  } else {  // Slightly favour unfiltered chroma
     sums[0][0] -= sums[0][0] >> 7;
+  }
 
   for (j = 0; j < 4; j++) {
     static const double lambda_square[] = {
diff --git a/av1/encoder/clpf_rdo.h b/av1/encoder/clpf_rdo.h
index bb85fbc..586eed0 100644
--- a/av1/encoder/clpf_rdo.h
+++ b/av1/encoder/clpf_rdo.h
@@ -17,7 +17,7 @@
 int av1_clpf_decision(int k, int l, const YV12_BUFFER_CONFIG *rec,
                       const YV12_BUFFER_CONFIG *org, const AV1_COMMON *cm,
                       int block_size, int w, int h, unsigned int strength,
-                      unsigned int fb_size_log2, uint8_t *res);
+                      unsigned int fb_size_log2, int8_t *res);
 
 void av1_clpf_test_frame(const YV12_BUFFER_CONFIG *rec,
                          const YV12_BUFFER_CONFIG *org, const AV1_COMMON *cm,
diff --git a/av1/encoder/cost.c b/av1/encoder/cost.c
index 9a2ac8e..e3151a5 100644
--- a/av1/encoder/cost.c
+++ b/av1/encoder/cost.c
@@ -11,9 +11,6 @@
 #include <assert.h>
 
 #include "av1/encoder/cost.h"
-#if CONFIG_ANS
-#include "aom_dsp/ans.h"
-#endif  // CONFIG_ANS
 #include "av1/common/entropy.h"
 
 /* round(-log2(i/256.) * (1 << AV1_PROB_COST_SHIFT))
@@ -41,91 +38,6 @@
   26,   23,   20,   18,   15,   12,   9,    6,    3
 };
 
-#if CONFIG_ANS
-// round(-log2(i/1024.) * (1 << AV1_PROB_COST_SHIFT))
-static const uint16_t av1_prob_cost10[1024] = {
-  5120, 5120, 4608, 4308, 4096, 3931, 3796, 3683, 3584, 3497, 3419, 3349, 3284,
-  3225, 3171, 3120, 3072, 3027, 2985, 2945, 2907, 2871, 2837, 2804, 2772, 2742,
-  2713, 2685, 2659, 2633, 2608, 2583, 2560, 2537, 2515, 2494, 2473, 2453, 2433,
-  2414, 2395, 2377, 2359, 2342, 2325, 2308, 2292, 2276, 2260, 2245, 2230, 2216,
-  2201, 2187, 2173, 2160, 2147, 2134, 2121, 2108, 2096, 2083, 2071, 2060, 2048,
-  2037, 2025, 2014, 2003, 1992, 1982, 1971, 1961, 1951, 1941, 1931, 1921, 1911,
-  1902, 1892, 1883, 1874, 1865, 1856, 1847, 1838, 1830, 1821, 1813, 1804, 1796,
-  1788, 1780, 1772, 1764, 1756, 1748, 1741, 1733, 1726, 1718, 1711, 1704, 1697,
-  1689, 1682, 1675, 1668, 1661, 1655, 1648, 1641, 1635, 1628, 1622, 1615, 1609,
-  1602, 1596, 1590, 1584, 1578, 1571, 1565, 1559, 1554, 1548, 1542, 1536, 1530,
-  1525, 1519, 1513, 1508, 1502, 1497, 1491, 1486, 1480, 1475, 1470, 1465, 1459,
-  1454, 1449, 1444, 1439, 1434, 1429, 1424, 1419, 1414, 1409, 1404, 1399, 1395,
-  1390, 1385, 1380, 1376, 1371, 1367, 1362, 1357, 1353, 1348, 1344, 1340, 1335,
-  1331, 1326, 1322, 1318, 1313, 1309, 1305, 1301, 1297, 1292, 1288, 1284, 1280,
-  1276, 1272, 1268, 1264, 1260, 1256, 1252, 1248, 1244, 1240, 1236, 1233, 1229,
-  1225, 1221, 1218, 1214, 1210, 1206, 1203, 1199, 1195, 1192, 1188, 1185, 1181,
-  1177, 1174, 1170, 1167, 1163, 1160, 1156, 1153, 1149, 1146, 1143, 1139, 1136,
-  1133, 1129, 1126, 1123, 1119, 1116, 1113, 1110, 1106, 1103, 1100, 1097, 1094,
-  1090, 1087, 1084, 1081, 1078, 1075, 1072, 1069, 1066, 1062, 1059, 1056, 1053,
-  1050, 1047, 1044, 1042, 1039, 1036, 1033, 1030, 1027, 1024, 1021, 1018, 1015,
-  1013, 1010, 1007, 1004, 1001, 998,  996,  993,  990,  987,  985,  982,  979,
-  977,  974,  971,  968,  966,  963,  960,  958,  955,  953,  950,  947,  945,
-  942,  940,  937,  934,  932,  929,  927,  924,  922,  919,  917,  914,  912,
-  909,  907,  904,  902,  899,  897,  895,  892,  890,  887,  885,  883,  880,
-  878,  876,  873,  871,  868,  866,  864,  861,  859,  857,  855,  852,  850,
-  848,  845,  843,  841,  839,  836,  834,  832,  830,  828,  825,  823,  821,
-  819,  817,  814,  812,  810,  808,  806,  804,  801,  799,  797,  795,  793,
-  791,  789,  787,  785,  783,  780,  778,  776,  774,  772,  770,  768,  766,
-  764,  762,  760,  758,  756,  754,  752,  750,  748,  746,  744,  742,  740,
-  738,  736,  734,  732,  730,  728,  726,  724,  723,  721,  719,  717,  715,
-  713,  711,  709,  707,  706,  704,  702,  700,  698,  696,  694,  693,  691,
-  689,  687,  685,  683,  682,  680,  678,  676,  674,  673,  671,  669,  667,
-  665,  664,  662,  660,  658,  657,  655,  653,  651,  650,  648,  646,  644,
-  643,  641,  639,  637,  636,  634,  632,  631,  629,  627,  626,  624,  622,
-  621,  619,  617,  616,  614,  612,  611,  609,  607,  606,  604,  602,  601,
-  599,  598,  596,  594,  593,  591,  590,  588,  586,  585,  583,  582,  580,
-  578,  577,  575,  574,  572,  571,  569,  567,  566,  564,  563,  561,  560,
-  558,  557,  555,  554,  552,  550,  549,  547,  546,  544,  543,  541,  540,
-  538,  537,  535,  534,  532,  531,  530,  528,  527,  525,  524,  522,  521,
-  519,  518,  516,  515,  513,  512,  511,  509,  508,  506,  505,  503,  502,
-  501,  499,  498,  496,  495,  493,  492,  491,  489,  488,  486,  485,  484,
-  482,  481,  480,  478,  477,  475,  474,  473,  471,  470,  469,  467,  466,
-  465,  463,  462,  460,  459,  458,  456,  455,  454,  452,  451,  450,  448,
-  447,  446,  444,  443,  442,  441,  439,  438,  437,  435,  434,  433,  431,
-  430,  429,  428,  426,  425,  424,  422,  421,  420,  419,  417,  416,  415,
-  414,  412,  411,  410,  409,  407,  406,  405,  404,  402,  401,  400,  399,
-  397,  396,  395,  394,  392,  391,  390,  389,  387,  386,  385,  384,  383,
-  381,  380,  379,  378,  377,  375,  374,  373,  372,  371,  369,  368,  367,
-  366,  365,  364,  362,  361,  360,  359,  358,  356,  355,  354,  353,  352,
-  351,  349,  348,  347,  346,  345,  344,  343,  341,  340,  339,  338,  337,
-  336,  335,  333,  332,  331,  330,  329,  328,  327,  326,  324,  323,  322,
-  321,  320,  319,  318,  317,  316,  314,  313,  312,  311,  310,  309,  308,
-  307,  306,  305,  303,  302,  301,  300,  299,  298,  297,  296,  295,  294,
-  293,  292,  291,  289,  288,  287,  286,  285,  284,  283,  282,  281,  280,
-  279,  278,  277,  276,  275,  274,  273,  272,  271,  269,  268,  267,  266,
-  265,  264,  263,  262,  261,  260,  259,  258,  257,  256,  255,  254,  253,
-  252,  251,  250,  249,  248,  247,  246,  245,  244,  243,  242,  241,  240,
-  239,  238,  237,  236,  235,  234,  233,  232,  231,  230,  229,  228,  227,
-  226,  225,  224,  223,  222,  221,  220,  219,  218,  217,  216,  215,  214,
-  213,  212,  212,  211,  210,  209,  208,  207,  206,  205,  204,  203,  202,
-  201,  200,  199,  198,  197,  196,  195,  194,  194,  193,  192,  191,  190,
-  189,  188,  187,  186,  185,  184,  183,  182,  181,  181,  180,  179,  178,
-  177,  176,  175,  174,  173,  172,  171,  170,  170,  169,  168,  167,  166,
-  165,  164,  163,  162,  161,  161,  160,  159,  158,  157,  156,  155,  154,
-  153,  152,  152,  151,  150,  149,  148,  147,  146,  145,  145,  144,  143,
-  142,  141,  140,  139,  138,  138,  137,  136,  135,  134,  133,  132,  132,
-  131,  130,  129,  128,  127,  126,  125,  125,  124,  123,  122,  121,  120,
-  120,  119,  118,  117,  116,  115,  114,  114,  113,  112,  111,  110,  109,
-  109,  108,  107,  106,  105,  104,  104,  103,  102,  101,  100,  99,   99,
-  98,   97,   96,   95,   95,   94,   93,   92,   91,   90,   90,   89,   88,
-  87,   86,   86,   85,   84,   83,   82,   82,   81,   80,   79,   78,   78,
-  77,   76,   75,   74,   74,   73,   72,   71,   70,   70,   69,   68,   67,
-  66,   66,   65,   64,   63,   62,   62,   61,   60,   59,   59,   58,   57,
-  56,   55,   55,   54,   53,   52,   52,   51,   50,   49,   48,   48,   47,
-  46,   45,   45,   44,   43,   42,   42,   41,   40,   39,   38,   38,   37,
-  36,   35,   35,   34,   33,   32,   32,   31,   30,   29,   29,   28,   27,
-  26,   26,   25,   24,   23,   23,   22,   21,   20,   20,   19,   18,   18,
-  17,   16,   15,   15,   14,   13,   12,   12,   11,   10,   9,    9,    8,
-  7,    7,    6,    5,    4,    4,    3,    2,    1,    1
-};
-#endif  // CONFIG_ANS
-
 static void cost(int *costs, aom_tree tree, const aom_prob *probs, int i,
                  int c) {
   const aom_prob prob = probs[i / 2];
@@ -143,20 +55,6 @@
   }
 }
 
-#if CONFIG_ANS
-void av1_cost_tokens_ans(int *costs, const aom_prob *tree_probs,
-                         const rans_lut token_cdf, int skip_eob) {
-  int c_tree = 0;  // Cost of the "tree" nodes EOB and ZERO.
-  int i;
-  costs[EOB_TOKEN] = av1_cost_bit(tree_probs[0], 0);
-  if (!skip_eob) c_tree = av1_cost_bit(tree_probs[0], 1);
-  for (i = ZERO_TOKEN; i <= CATEGORY6_TOKEN; ++i) {
-    const int p = token_cdf[i + 1] - token_cdf[i];
-    costs[i] = c_tree + av1_prob_cost10[p];
-  }
-}
-#endif  // CONFIG_ANS
-
 void av1_cost_tokens(int *costs, const aom_prob *probs, aom_tree tree) {
   cost(costs, tree, probs, 0, 0);
 }
diff --git a/av1/encoder/cost.h b/av1/encoder/cost.h
index 448b905..379200e 100644
--- a/av1/encoder/cost.h
+++ b/av1/encoder/cost.h
@@ -13,9 +13,6 @@
 
 #include "aom_dsp/prob.h"
 #include "aom/aom_integer.h"
-#if CONFIG_ANS
-#include "aom_dsp/ans.h"
-#endif  // CONFIG_ANS
 
 #ifdef __cplusplus
 extern "C" {
@@ -58,11 +55,6 @@
 void av1_cost_tokens(int *costs, const aom_prob *probs, aom_tree tree);
 void av1_cost_tokens_skip(int *costs, const aom_prob *probs, aom_tree tree);
 
-#if CONFIG_ANS
-void av1_cost_tokens_ans(int *costs, const aom_prob *tree_probs,
-                         const rans_lut token_cdf, int skip_eob);
-#endif
-
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/av1/encoder/encodeframe.c b/av1/encoder/encodeframe.c
index 163f4c0..2d9a892 100644
--- a/av1/encoder/encodeframe.c
+++ b/av1/encoder/encodeframe.c
@@ -61,8 +61,9 @@
 #endif  // CONFIG_AOM_HIGHBITDEPTH
 
 static void encode_superblock(AV1_COMP *cpi, ThreadData *td, TOKENEXTRA **t,
-                              int output_enabled, int mi_row, int mi_col,
-                              BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx);
+                              RUN_TYPE dry_run, int mi_row, int mi_col,
+                              BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx,
+                              int *rate);
 
 #if CONFIG_SUPERTX
 static int check_intra_b(PICK_MODE_CONTEXT *ctx);
@@ -80,13 +81,13 @@
 static void predict_sb_complex(AV1_COMP *cpi, ThreadData *td,
                                const TileInfo *const tile, int mi_row,
                                int mi_col, int mi_row_ori, int mi_col_ori,
-                               int output_enabled, BLOCK_SIZE bsize,
+                               RUN_TYPE dry_run, BLOCK_SIZE bsize,
                                BLOCK_SIZE top_bsize, uint8_t *dst_buf[3],
                                int dst_stride[3], PC_TREE *pc_tree);
 static void update_state_sb_supertx(AV1_COMP *cpi, ThreadData *td,
                                     const TileInfo *const tile, int mi_row,
                                     int mi_col, BLOCK_SIZE bsize,
-                                    int output_enabled, PC_TREE *pc_tree);
+                                    RUN_TYPE dry_run, PC_TREE *pc_tree);
 static void rd_supertx_sb(AV1_COMP *cpi, ThreadData *td,
                           const TileInfo *const tile, int mi_row, int mi_col,
                           BLOCK_SIZE bsize, int *tmp_rate, int64_t *tmp_dist,
@@ -1012,7 +1013,7 @@
 
 static void update_state(AV1_COMP *cpi, ThreadData *td, PICK_MODE_CONTEXT *ctx,
                          int mi_row, int mi_col, BLOCK_SIZE bsize,
-                         int output_enabled) {
+                         RUN_TYPE dry_run) {
   int i, x_idx, y;
   AV1_COMMON *const cm = &cpi->common;
   RD_COUNTS *const rdc = &td->rd_counts;
@@ -1126,7 +1127,7 @@
            sizeof(uint8_t) * ctx->num_4x4_blk);
 #endif
 
-  if (!output_enabled) return;
+  if (dry_run) return;
 
 #if CONFIG_INTERNAL_STATS
   if (frame_is_intra_only(cm)) {
@@ -1195,7 +1196,7 @@
 #if CONFIG_SUPERTX
 static void update_state_supertx(AV1_COMP *cpi, ThreadData *td,
                                  PICK_MODE_CONTEXT *ctx, int mi_row, int mi_col,
-                                 BLOCK_SIZE bsize, int output_enabled) {
+                                 BLOCK_SIZE bsize, RUN_TYPE dry_run) {
   int y, x_idx;
 #if CONFIG_VAR_TX || CONFIG_REF_MV
   int i;
@@ -1303,7 +1304,7 @@
   // Turn motion variation off for supertx
   mbmi->motion_mode = SIMPLE_TRANSLATION;
 
-  if (!output_enabled) return;
+  if (dry_run) return;
 
   if (!frame_is_intra_only(cm)) {
     av1_update_mv_count(td);
@@ -1341,7 +1342,7 @@
 static void update_state_sb_supertx(AV1_COMP *cpi, ThreadData *td,
                                     const TileInfo *const tile, int mi_row,
                                     int mi_col, BLOCK_SIZE bsize,
-                                    int output_enabled, PC_TREE *pc_tree) {
+                                    RUN_TYPE dry_run, PC_TREE *pc_tree) {
   AV1_COMMON *const cm = &cpi->common;
   MACROBLOCK *const x = &td->mb;
   MACROBLOCKD *const xd = &x->e_mbd;
@@ -1365,27 +1366,27 @@
     case PARTITION_NONE:
       set_offsets_supertx(cpi, td, tile, mi_row, mi_col, subsize);
       update_state_supertx(cpi, td, &pc_tree->none, mi_row, mi_col, subsize,
-                           output_enabled);
+                           dry_run);
       break;
     case PARTITION_VERT:
       set_offsets_supertx(cpi, td, tile, mi_row, mi_col, subsize);
       update_state_supertx(cpi, td, &pc_tree->vertical[0], mi_row, mi_col,
-                           subsize, output_enabled);
+                           subsize, dry_run);
       if (mi_col + hbs < cm->mi_cols && bsize > BLOCK_8X8) {
         set_offsets_supertx(cpi, td, tile, mi_row, mi_col + hbs, subsize);
         update_state_supertx(cpi, td, &pc_tree->vertical[1], mi_row,
-                             mi_col + hbs, subsize, output_enabled);
+                             mi_col + hbs, subsize, dry_run);
       }
       pmc = &pc_tree->vertical_supertx;
       break;
     case PARTITION_HORZ:
       set_offsets_supertx(cpi, td, tile, mi_row, mi_col, subsize);
       update_state_supertx(cpi, td, &pc_tree->horizontal[0], mi_row, mi_col,
-                           subsize, output_enabled);
+                           subsize, dry_run);
       if (mi_row + hbs < cm->mi_rows && bsize > BLOCK_8X8) {
         set_offsets_supertx(cpi, td, tile, mi_row + hbs, mi_col, subsize);
         update_state_supertx(cpi, td, &pc_tree->horizontal[1], mi_row + hbs,
-                             mi_col, subsize, output_enabled);
+                             mi_col, subsize, dry_run);
       }
       pmc = &pc_tree->horizontal_supertx;
       break;
@@ -1393,20 +1394,20 @@
       if (bsize == BLOCK_8X8) {
         set_offsets_supertx(cpi, td, tile, mi_row, mi_col, subsize);
         update_state_supertx(cpi, td, pc_tree->leaf_split[0], mi_row, mi_col,
-                             subsize, output_enabled);
+                             subsize, dry_run);
       } else {
         set_offsets_supertx(cpi, td, tile, mi_row, mi_col, subsize);
-        update_state_sb_supertx(cpi, td, tile, mi_row, mi_col, subsize,
-                                output_enabled, pc_tree->split[0]);
+        update_state_sb_supertx(cpi, td, tile, mi_row, mi_col, subsize, dry_run,
+                                pc_tree->split[0]);
         set_offsets_supertx(cpi, td, tile, mi_row, mi_col + hbs, subsize);
         update_state_sb_supertx(cpi, td, tile, mi_row, mi_col + hbs, subsize,
-                                output_enabled, pc_tree->split[1]);
+                                dry_run, pc_tree->split[1]);
         set_offsets_supertx(cpi, td, tile, mi_row + hbs, mi_col, subsize);
         update_state_sb_supertx(cpi, td, tile, mi_row + hbs, mi_col, subsize,
-                                output_enabled, pc_tree->split[2]);
+                                dry_run, pc_tree->split[2]);
         set_offsets_supertx(cpi, td, tile, mi_row + hbs, mi_col + hbs, subsize);
         update_state_sb_supertx(cpi, td, tile, mi_row + hbs, mi_col + hbs,
-                                subsize, output_enabled, pc_tree->split[3]);
+                                subsize, dry_run, pc_tree->split[3]);
       }
       pmc = &pc_tree->split_supertx;
       break;
@@ -1414,49 +1415,49 @@
     case PARTITION_HORZ_A:
       set_offsets_supertx(cpi, td, tile, mi_row, mi_col, bsize2);
       update_state_supertx(cpi, td, &pc_tree->horizontala[0], mi_row, mi_col,
-                           bsize2, output_enabled);
+                           bsize2, dry_run);
       set_offsets_supertx(cpi, td, tile, mi_row, mi_col + hbs, bsize2);
       update_state_supertx(cpi, td, &pc_tree->horizontala[1], mi_row,
-                           mi_col + hbs, bsize2, output_enabled);
+                           mi_col + hbs, bsize2, dry_run);
       set_offsets_supertx(cpi, td, tile, mi_row + hbs, mi_col, subsize);
       update_state_supertx(cpi, td, &pc_tree->horizontala[2], mi_row + hbs,
-                           mi_col, subsize, output_enabled);
+                           mi_col, subsize, dry_run);
       pmc = &pc_tree->horizontala_supertx;
       break;
     case PARTITION_HORZ_B:
       set_offsets_supertx(cpi, td, tile, mi_row, mi_col, subsize);
       update_state_supertx(cpi, td, &pc_tree->horizontalb[0], mi_row, mi_col,
-                           subsize, output_enabled);
+                           subsize, dry_run);
       set_offsets_supertx(cpi, td, tile, mi_row + hbs, mi_col, bsize2);
       update_state_supertx(cpi, td, &pc_tree->horizontalb[1], mi_row + hbs,
-                           mi_col, bsize2, output_enabled);
+                           mi_col, bsize2, dry_run);
       set_offsets_supertx(cpi, td, tile, mi_row + hbs, mi_col + hbs, bsize2);
       update_state_supertx(cpi, td, &pc_tree->horizontalb[2], mi_row + hbs,
-                           mi_col + hbs, bsize2, output_enabled);
+                           mi_col + hbs, bsize2, dry_run);
       pmc = &pc_tree->horizontalb_supertx;
       break;
     case PARTITION_VERT_A:
       set_offsets_supertx(cpi, td, tile, mi_row, mi_col, bsize2);
       update_state_supertx(cpi, td, &pc_tree->verticala[0], mi_row, mi_col,
-                           bsize2, output_enabled);
+                           bsize2, dry_run);
       set_offsets_supertx(cpi, td, tile, mi_row + hbs, mi_col, bsize2);
       update_state_supertx(cpi, td, &pc_tree->verticala[1], mi_row + hbs,
-                           mi_col, bsize2, output_enabled);
+                           mi_col, bsize2, dry_run);
       set_offsets_supertx(cpi, td, tile, mi_row, mi_col + hbs, subsize);
       update_state_supertx(cpi, td, &pc_tree->verticala[2], mi_row,
-                           mi_col + hbs, subsize, output_enabled);
+                           mi_col + hbs, subsize, dry_run);
       pmc = &pc_tree->verticala_supertx;
       break;
     case PARTITION_VERT_B:
       set_offsets_supertx(cpi, td, tile, mi_row, mi_col, subsize);
       update_state_supertx(cpi, td, &pc_tree->verticalb[0], mi_row, mi_col,
-                           subsize, output_enabled);
+                           subsize, dry_run);
       set_offsets_supertx(cpi, td, tile, mi_row, mi_col + hbs, bsize2);
       update_state_supertx(cpi, td, &pc_tree->verticalb[1], mi_row,
-                           mi_col + hbs, bsize2, output_enabled);
+                           mi_col + hbs, bsize2, dry_run);
       set_offsets_supertx(cpi, td, tile, mi_row + hbs, mi_col + hbs, bsize2);
       update_state_supertx(cpi, td, &pc_tree->verticalb[2], mi_row + hbs,
-                           mi_col + hbs, bsize2, output_enabled);
+                           mi_col + hbs, bsize2, dry_run);
       pmc = &pc_tree->verticalb_supertx;
       break;
 #endif  // CONFIG_EXT_PARTITION_TYPES
@@ -2096,21 +2097,21 @@
 }
 
 static void encode_b(AV1_COMP *cpi, const TileInfo *const tile, ThreadData *td,
-                     TOKENEXTRA **tp, int mi_row, int mi_col,
-                     int output_enabled, BLOCK_SIZE bsize,
+                     TOKENEXTRA **tp, int mi_row, int mi_col, RUN_TYPE dry_run,
+                     BLOCK_SIZE bsize,
 #if CONFIG_EXT_PARTITION_TYPES
                      PARTITION_TYPE partition,
 #endif
-                     PICK_MODE_CONTEXT *ctx) {
+                     PICK_MODE_CONTEXT *ctx, int *rate) {
   MACROBLOCK *const x = &td->mb;
   set_offsets(cpi, tile, x, mi_row, mi_col, bsize);
 #if CONFIG_EXT_PARTITION_TYPES
   x->e_mbd.mi[0]->mbmi.partition = partition;
 #endif
-  update_state(cpi, td, ctx, mi_row, mi_col, bsize, output_enabled);
-  encode_superblock(cpi, td, tp, output_enabled, mi_row, mi_col, bsize, ctx);
+  update_state(cpi, td, ctx, mi_row, mi_col, bsize, dry_run);
+  encode_superblock(cpi, td, tp, dry_run, mi_row, mi_col, bsize, ctx, rate);
 
-  if (output_enabled) {
+  if (!dry_run) {
 #if CONFIG_SUPERTX
     update_stats(&cpi->common, td, 0);
 #else
@@ -2120,8 +2121,8 @@
 }
 
 static void encode_sb(AV1_COMP *cpi, ThreadData *td, const TileInfo *const tile,
-                      TOKENEXTRA **tp, int mi_row, int mi_col,
-                      int output_enabled, BLOCK_SIZE bsize, PC_TREE *pc_tree) {
+                      TOKENEXTRA **tp, int mi_row, int mi_col, RUN_TYPE dry_run,
+                      BLOCK_SIZE bsize, PC_TREE *pc_tree, int *rate) {
   const AV1_COMMON *const cm = &cpi->common;
   MACROBLOCK *const x = &td->mb;
   MACROBLOCKD *const xd = &x->e_mbd;
@@ -2138,7 +2139,7 @@
 
   if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return;
 
-  if (output_enabled) td->counts->partition[ctx][partition]++;
+  if (!dry_run) td->counts->partition[ctx][partition]++;
 
 #if CONFIG_SUPERTX
   if (!frame_is_intra_only(cm) && bsize <= MAX_SUPERTX_BLOCK_SIZE &&
@@ -2154,33 +2155,34 @@
       int dst_stride[3];
       set_skip_context(xd, mi_row, mi_col);
       set_mode_info_offsets(cpi, x, xd, mi_row, mi_col);
-      update_state_sb_supertx(cpi, td, tile, mi_row, mi_col, bsize,
-                              output_enabled, pc_tree);
+      update_state_sb_supertx(cpi, td, tile, mi_row, mi_col, bsize, dry_run,
+                              pc_tree);
 
       av1_setup_dst_planes(xd->plane, get_frame_new_buffer(cm), mi_row, mi_col);
       for (i = 0; i < MAX_MB_PLANE; i++) {
         dst_buf[i] = xd->plane[i].dst.buf;
         dst_stride[i] = xd->plane[i].dst.stride;
       }
-      predict_sb_complex(cpi, td, tile, mi_row, mi_col, mi_row, mi_col,
-                         output_enabled, bsize, bsize, dst_buf, dst_stride,
-                         pc_tree);
+      predict_sb_complex(cpi, td, tile, mi_row, mi_col, mi_row, mi_col, dry_run,
+                         bsize, bsize, dst_buf, dst_stride, pc_tree);
 
       set_offsets_without_segment_id(cpi, tile, x, mi_row, mi_col, bsize);
       set_segment_id_supertx(cpi, x, mi_row, mi_col, bsize);
 
       if (!x->skip) {
+        int this_rate = 0;
         x->skip_optimize = 0;
         x->use_lp32x32fdct = cpi->sf.use_lp32x32fdct;
 
         av1_encode_sb_supertx(x, bsize);
-        av1_tokenize_sb_supertx(cpi, td, tp, !output_enabled, bsize);
+        av1_tokenize_sb_supertx(cpi, td, tp, dry_run, bsize, rate);
+        if (rate) *rate += this_rate;
       } else {
         xd->mi[0]->mbmi.skip = 1;
-        if (output_enabled) td->counts->skip[av1_get_skip_context(xd)][1]++;
+        if (!dry_run) td->counts->skip[av1_get_skip_context(xd)][1]++;
         reset_skip_context(xd, bsize);
       }
-      if (output_enabled) {
+      if (!dry_run) {
         for (y_idx = 0; y_idx < mi_height; y_idx++)
           for (x_idx = 0; x_idx < mi_width; x_idx++) {
             if ((xd->mb_to_right_edge >> (3 + MI_SIZE_LOG2)) + mi_width >
@@ -2221,7 +2223,7 @@
 #endif  // CONFIG_VAR_TX
       return;
     } else {
-      if (output_enabled) {
+      if (!dry_run) {
         td->counts->supertx[partition_supertx_context_lookup[partition]]
                            [supertx_size][0]++;
       }
@@ -2231,93 +2233,91 @@
 
   switch (partition) {
     case PARTITION_NONE:
-      encode_b(cpi, tile, td, tp, mi_row, mi_col, output_enabled, subsize,
+      encode_b(cpi, tile, td, tp, mi_row, mi_col, dry_run, subsize,
 #if CONFIG_EXT_PARTITION_TYPES
                partition,
 #endif
-               &pc_tree->none);
+               &pc_tree->none, rate);
       break;
     case PARTITION_VERT:
-      encode_b(cpi, tile, td, tp, mi_row, mi_col, output_enabled, subsize,
+      encode_b(cpi, tile, td, tp, mi_row, mi_col, dry_run, subsize,
 #if CONFIG_EXT_PARTITION_TYPES
                partition,
 #endif
-               &pc_tree->vertical[0]);
+               &pc_tree->vertical[0], rate);
       if (mi_col + hbs < cm->mi_cols && bsize > BLOCK_8X8) {
-        encode_b(cpi, tile, td, tp, mi_row, mi_col + hbs, output_enabled,
-                 subsize,
+        encode_b(cpi, tile, td, tp, mi_row, mi_col + hbs, dry_run, subsize,
 #if CONFIG_EXT_PARTITION_TYPES
                  partition,
 #endif
-                 &pc_tree->vertical[1]);
+                 &pc_tree->vertical[1], rate);
       }
       break;
     case PARTITION_HORZ:
-      encode_b(cpi, tile, td, tp, mi_row, mi_col, output_enabled, subsize,
+      encode_b(cpi, tile, td, tp, mi_row, mi_col, dry_run, subsize,
 #if CONFIG_EXT_PARTITION_TYPES
                partition,
 #endif
-               &pc_tree->horizontal[0]);
+               &pc_tree->horizontal[0], rate);
       if (mi_row + hbs < cm->mi_rows && bsize > BLOCK_8X8) {
-        encode_b(cpi, tile, td, tp, mi_row + hbs, mi_col, output_enabled,
-                 subsize,
+        encode_b(cpi, tile, td, tp, mi_row + hbs, mi_col, dry_run, subsize,
 #if CONFIG_EXT_PARTITION_TYPES
                  partition,
 #endif
-                 &pc_tree->horizontal[1]);
+                 &pc_tree->horizontal[1], rate);
       }
       break;
     case PARTITION_SPLIT:
       if (bsize == BLOCK_8X8) {
-        encode_b(cpi, tile, td, tp, mi_row, mi_col, output_enabled, subsize,
+        encode_b(cpi, tile, td, tp, mi_row, mi_col, dry_run, subsize,
 #if CONFIG_EXT_PARTITION_TYPES
                  partition,
 #endif
-                 pc_tree->leaf_split[0]);
+                 pc_tree->leaf_split[0], rate);
       } else {
-        encode_sb(cpi, td, tile, tp, mi_row, mi_col, output_enabled, subsize,
-                  pc_tree->split[0]);
-        encode_sb(cpi, td, tile, tp, mi_row, mi_col + hbs, output_enabled,
-                  subsize, pc_tree->split[1]);
-        encode_sb(cpi, td, tile, tp, mi_row + hbs, mi_col, output_enabled,
-                  subsize, pc_tree->split[2]);
-        encode_sb(cpi, td, tile, tp, mi_row + hbs, mi_col + hbs, output_enabled,
-                  subsize, pc_tree->split[3]);
+        encode_sb(cpi, td, tile, tp, mi_row, mi_col, dry_run, subsize,
+                  pc_tree->split[0], rate);
+        encode_sb(cpi, td, tile, tp, mi_row, mi_col + hbs, dry_run, subsize,
+                  pc_tree->split[1], rate);
+        encode_sb(cpi, td, tile, tp, mi_row + hbs, mi_col, dry_run, subsize,
+                  pc_tree->split[2], rate);
+        encode_sb(cpi, td, tile, tp, mi_row + hbs, mi_col + hbs, dry_run,
+                  subsize, pc_tree->split[3], rate);
       }
       break;
 #if CONFIG_EXT_PARTITION_TYPES
     case PARTITION_HORZ_A:
-      encode_b(cpi, tile, td, tp, mi_row, mi_col, output_enabled, bsize2,
-               partition, &pc_tree->horizontala[0]);
-      encode_b(cpi, tile, td, tp, mi_row, mi_col + hbs, output_enabled, bsize2,
-               partition, &pc_tree->horizontala[1]);
-      encode_b(cpi, tile, td, tp, mi_row + hbs, mi_col, output_enabled, subsize,
-               partition, &pc_tree->horizontala[2]);
+      encode_b(cpi, tile, td, tp, mi_row, mi_col, dry_run, bsize2, partition,
+               &pc_tree->horizontala[0], rate);
+      encode_b(cpi, tile, td, tp, mi_row, mi_col + hbs, dry_run, bsize2,
+               partition, &pc_tree->horizontala[1], rate);
+      encode_b(cpi, tile, td, tp, mi_row + hbs, mi_col, dry_run, subsize,
+               partition, &pc_tree->horizontala[2], rate);
       break;
     case PARTITION_HORZ_B:
-      encode_b(cpi, tile, td, tp, mi_row, mi_col, output_enabled, subsize,
-               partition, &pc_tree->horizontalb[0]);
-      encode_b(cpi, tile, td, tp, mi_row + hbs, mi_col, output_enabled, bsize2,
-               partition, &pc_tree->horizontalb[1]);
-      encode_b(cpi, tile, td, tp, mi_row + hbs, mi_col + hbs, output_enabled,
-               bsize2, partition, &pc_tree->horizontalb[2]);
+      encode_b(cpi, tile, td, tp, mi_row, mi_col, dry_run, subsize, partition,
+               &pc_tree->horizontalb[0], rate);
+      encode_b(cpi, tile, td, tp, mi_row + hbs, mi_col, dry_run, bsize2,
+               partition, &pc_tree->horizontalb[1], rate);
+      encode_b(cpi, tile, td, tp, mi_row + hbs, mi_col + hbs, dry_run, bsize2,
+               partition, &pc_tree->horizontalb[2], rate);
       break;
     case PARTITION_VERT_A:
-      encode_b(cpi, tile, td, tp, mi_row, mi_col, output_enabled, bsize2,
-               partition, &pc_tree->verticala[0]);
-      encode_b(cpi, tile, td, tp, mi_row + hbs, mi_col, output_enabled, bsize2,
-               partition, &pc_tree->verticala[1]);
-      encode_b(cpi, tile, td, tp, mi_row, mi_col + hbs, output_enabled, subsize,
-               partition, &pc_tree->verticala[2]);
+      encode_b(cpi, tile, td, tp, mi_row, mi_col, dry_run, bsize2, partition,
+               &pc_tree->verticala[0], rate);
+      encode_b(cpi, tile, td, tp, mi_row + hbs, mi_col, dry_run, bsize2,
+               partition, &pc_tree->verticala[1], rate);
+      encode_b(cpi, tile, td, tp, mi_row, mi_col + hbs, dry_run, subsize,
+               partition, &pc_tree->verticala[2], rate);
 
       break;
     case PARTITION_VERT_B:
-      encode_b(cpi, tile, td, tp, mi_row, mi_col, output_enabled, subsize,
-               partition, &pc_tree->verticalb[0]);
-      encode_b(cpi, tile, td, tp, mi_row, mi_col + hbs, output_enabled, bsize2,
-               partition, &pc_tree->verticalb[1]);
-      encode_b(cpi, tile, td, tp, mi_row + hbs, mi_col + hbs, output_enabled,
-               bsize2, partition, &pc_tree->verticalb[2]);
+      encode_b(cpi, tile, td, tp, mi_row, mi_col, dry_run, subsize, partition,
+               &pc_tree->verticalb[0], rate);
+      encode_b(cpi, tile, td, tp, mi_row, mi_col + hbs, dry_run, bsize2,
+               partition, &pc_tree->verticalb[1], rate);
+      encode_b(cpi, tile, td, tp, mi_row + hbs, mi_col + hbs, dry_run, bsize2,
+               partition, &pc_tree->verticalb[2], rate);
       break;
 #endif  // CONFIG_EXT_PARTITION_TYPES
     default: assert(0 && "Invalid partition type."); break;
@@ -2532,8 +2532,9 @@
 #endif
         PICK_MODE_CONTEXT *ctx = &pc_tree->horizontal[0];
         av1_rd_cost_init(&tmp_rdc);
-        update_state(cpi, td, ctx, mi_row, mi_col, subsize, 0);
-        encode_superblock(cpi, td, tp, 0, mi_row, mi_col, subsize, ctx);
+        update_state(cpi, td, ctx, mi_row, mi_col, subsize, 1);
+        encode_superblock(cpi, td, tp, DRY_RUN_NORMAL, mi_row, mi_col, subsize,
+                          ctx, NULL);
         rd_pick_sb_modes(cpi, tile_data, x, mi_row + hbs, mi_col, &tmp_rdc,
 #if CONFIG_SUPERTX
                          &rt_nocoef,
@@ -2574,8 +2575,9 @@
 #endif
         PICK_MODE_CONTEXT *ctx = &pc_tree->vertical[0];
         av1_rd_cost_init(&tmp_rdc);
-        update_state(cpi, td, ctx, mi_row, mi_col, subsize, 0);
-        encode_superblock(cpi, td, tp, 0, mi_row, mi_col, subsize, ctx);
+        update_state(cpi, td, ctx, mi_row, mi_col, subsize, 1);
+        encode_superblock(cpi, td, tp, DRY_RUN_NORMAL, mi_row, mi_col, subsize,
+                          ctx, NULL);
         rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col + hbs, &tmp_rdc,
 #if CONFIG_SUPERTX
                          &rt_nocoef,
@@ -2728,8 +2730,8 @@
 #endif
 
       if (i != 3)
-        encode_sb(cpi, td, tile_info, tp, mi_row + y_idx, mi_col + x_idx, 0,
-                  split_subsize, pc_tree->split[i]);
+        encode_sb(cpi, td, tile_info, tp, mi_row + y_idx, mi_col + x_idx,
+                  OUTPUT_ENABLED, split_subsize, pc_tree->split[i], NULL);
 
       chosen_rdc.rate += cpi->partition_cost[pl][PARTITION_NONE];
 #if CONFIG_SUPERTX
@@ -2772,9 +2774,17 @@
     assert(chosen_rdc.rate < INT_MAX && chosen_rdc.dist < INT64_MAX);
 
   if (do_recon) {
-    int output_enabled = (bsize == cm->sb_size);
-    encode_sb(cpi, td, tile_info, tp, mi_row, mi_col, output_enabled, bsize,
-              pc_tree);
+    if (bsize == cm->sb_size) {
+      // NOTE: To get estimate for rate due to the tokens, use:
+      // int rate_coeffs = 0;
+      // encode_sb(cpi, td, tile_info, tp, mi_row, mi_col, DRY_RUN_COSTCOEFFS,
+      //           bsize, pc_tree, &rate_coeffs);
+      encode_sb(cpi, td, tile_info, tp, mi_row, mi_col, OUTPUT_ENABLED, bsize,
+                pc_tree, NULL);
+    } else {
+      encode_sb(cpi, td, tile_info, tp, mi_row, mi_col, DRY_RUN_NORMAL, bsize,
+                pc_tree, NULL);
+    }
   }
 
   *rate = chosen_rdc.rate;
@@ -3123,8 +3133,9 @@
   if (sum_rdc.rdcost < best_rdc->rdcost) {
 #endif
     PICK_MODE_CONTEXT *ctx = &ctxs[0];
-    update_state(cpi, td, ctx, mi_row0, mi_col0, subsize0, 0);
-    encode_superblock(cpi, td, tp, 0, mi_row0, mi_col0, subsize0, ctx);
+    update_state(cpi, td, ctx, mi_row0, mi_col0, subsize0, 1);
+    encode_superblock(cpi, td, tp, DRY_RUN_NORMAL, mi_row0, mi_col0, subsize0,
+                      ctx, NULL);
 
     if (cpi->sf.adaptive_motion_search) load_pred_mv(x, ctx);
 
@@ -3163,8 +3174,9 @@
     if (sum_rdc.rdcost < best_rdc->rdcost) {
 #endif
       PICK_MODE_CONTEXT *ctx = &ctxs[1];
-      update_state(cpi, td, ctx, mi_row1, mi_col1, subsize1, 0);
-      encode_superblock(cpi, td, tp, 0, mi_row1, mi_col1, subsize1, ctx);
+      update_state(cpi, td, ctx, mi_row1, mi_col1, subsize1, 1);
+      encode_superblock(cpi, td, tp, DRY_RUN_NORMAL, mi_row1, mi_col1, subsize1,
+                        ctx, NULL);
 
       if (cpi->sf.adaptive_motion_search) load_pred_mv(x, ctx);
 
@@ -3775,8 +3787,9 @@
 #endif  // CONFIG_SUPERTX
         mi_row + mi_step < cm->mi_rows && bsize > BLOCK_8X8) {
       PICK_MODE_CONTEXT *ctx = &pc_tree->horizontal[0];
-      update_state(cpi, td, ctx, mi_row, mi_col, subsize, 0);
-      encode_superblock(cpi, td, tp, 0, mi_row, mi_col, subsize, ctx);
+      update_state(cpi, td, ctx, mi_row, mi_col, subsize, 1);
+      encode_superblock(cpi, td, tp, DRY_RUN_NORMAL, mi_row, mi_col, subsize,
+                        ctx, NULL);
 
       if (cpi->sf.adaptive_motion_search) load_pred_mv(x, ctx);
 
@@ -3911,9 +3924,9 @@
     if (sum_rdc.rdcost < best_rdc.rdcost &&
 #endif  // CONFIG_SUPERTX
         mi_col + mi_step < cm->mi_cols && bsize > BLOCK_8X8) {
-      update_state(cpi, td, &pc_tree->vertical[0], mi_row, mi_col, subsize, 0);
-      encode_superblock(cpi, td, tp, 0, mi_row, mi_col, subsize,
-                        &pc_tree->vertical[0]);
+      update_state(cpi, td, &pc_tree->vertical[0], mi_row, mi_col, subsize, 1);
+      encode_superblock(cpi, td, tp, DRY_RUN_NORMAL, mi_row, mi_col, subsize,
+                        &pc_tree->vertical[0], NULL);
 
       if (cpi->sf.adaptive_motion_search) load_pred_mv(x, ctx);
 
@@ -4086,9 +4099,13 @@
 
   if (best_rdc.rate < INT_MAX && best_rdc.dist < INT64_MAX &&
       pc_tree->index != 3) {
-    int output_enabled = (bsize == cm->sb_size);
-    encode_sb(cpi, td, tile_info, tp, mi_row, mi_col, output_enabled, bsize,
-              pc_tree);
+    if (bsize == cm->sb_size) {
+      encode_sb(cpi, td, tile_info, tp, mi_row, mi_col, OUTPUT_ENABLED, bsize,
+                pc_tree, NULL);
+    } else {
+      encode_sb(cpi, td, tile_info, tp, mi_row, mi_col, DRY_RUN_NORMAL, bsize,
+                pc_tree, NULL);
+    }
   }
 
   if (bsize == cm->sb_size) {
@@ -4226,11 +4243,7 @@
                cpi->td.rd_counts.coef_counts);
       av1_copy(subframe_stats->eob_counts_buf[cm->coef_probs_update_idx],
                cm->counts.eob_branch);
-      av1_fill_token_costs(x->token_costs,
-#if CONFIG_ANS
-                           cm->fc->coef_cdfs,
-#endif  // CONFIG_ANS
-                           cm->fc->coef_probs);
+      av1_fill_token_costs(x->token_costs, cm->fc->coef_probs);
     }
   }
 #endif  // CONFIG_ENTROPY
@@ -4987,8 +5000,9 @@
 #endif
 
 static void encode_superblock(AV1_COMP *cpi, ThreadData *td, TOKENEXTRA **t,
-                              int output_enabled, int mi_row, int mi_col,
-                              BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx) {
+                              RUN_TYPE dry_run, int mi_row, int mi_col,
+                              BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx,
+                              int *rate) {
   AV1_COMMON *const cm = &cpi->common;
   MACROBLOCK *const x = &td->mb;
   MACROBLOCKD *const xd = &x->e_mbd;
@@ -5010,12 +5024,12 @@
     mbmi->skip = 1;
     for (plane = 0; plane < MAX_MB_PLANE; ++plane)
       av1_encode_intra_block_plane(x, AOMMAX(bsize, BLOCK_8X8), plane, 1);
-    if (output_enabled)
+    if (!dry_run)
       sum_intra_stats(td->counts, mi, xd->above_mi, xd->left_mi,
                       frame_is_intra_only(cm));
 
 #if CONFIG_EXT_INTRA
-    if (output_enabled && bsize >= BLOCK_8X8) {
+    if (!dry_run && bsize >= BLOCK_8X8) {
       FRAME_COUNTS *counts = td->counts;
       if (mbmi->mode == DC_PRED && mbmi->palette_mode_info.palette_size[0] == 0)
         ++counts->ext_intra[0][mbmi->ext_intra_mode_info.use_ext_intra_mode[0]];
@@ -5033,18 +5047,18 @@
     }
 #endif  // CONFIG_EXT_INTRA
 
-    if (bsize >= BLOCK_8X8 && output_enabled) {
+    if (bsize >= BLOCK_8X8 && !dry_run) {
       for (plane = 0; plane <= 1; ++plane) {
         if (mbmi->palette_mode_info.palette_size[plane] > 0) {
           mbmi->palette_mode_info.palette_first_color_idx[plane] =
               xd->plane[plane].color_index_map[0];
           // TODO(huisu): this increases the use of token buffer. Needs stretch
           // test to verify.
-          av1_tokenize_palette_sb(td, bsize, plane, t);
+          av1_tokenize_palette_sb(cpi, td, plane, t, dry_run, bsize, rate);
         }
       }
     }
-    av1_tokenize_sb(cpi, td, t, !output_enabled, AOMMAX(bsize, BLOCK_8X8));
+    av1_tokenize_sb(cpi, td, t, dry_run, AOMMAX(bsize, BLOCK_8X8), rate);
   } else {
     int ref;
     const int is_compound = has_second_ref(mbmi);
@@ -5116,17 +5130,17 @@
 #if CONFIG_VAR_TX
 #if CONFIG_EXT_TX && CONFIG_RECT_TX
     if (is_rect_tx(mbmi->tx_size))
-      av1_tokenize_sb(cpi, td, t, !output_enabled, AOMMAX(bsize, BLOCK_8X8));
+      av1_tokenize_sb(cpi, td, t, dry_run, AOMMAX(bsize, BLOCK_8X8), rate);
     else
 #endif
-      av1_tokenize_sb_vartx(cpi, td, t, !output_enabled, mi_row, mi_col,
-                            AOMMAX(bsize, BLOCK_8X8));
+      av1_tokenize_sb_vartx(cpi, td, t, dry_run, mi_row, mi_col,
+                            AOMMAX(bsize, BLOCK_8X8), rate);
 #else
-    av1_tokenize_sb(cpi, td, t, !output_enabled, AOMMAX(bsize, BLOCK_8X8));
+    av1_tokenize_sb(cpi, td, t, dry_run, AOMMAX(bsize, BLOCK_8X8), rate);
 #endif
   }
 
-  if (output_enabled) {
+  if (!dry_run) {
     if (cm->tx_mode == TX_MODE_SELECT && mbmi->sb_type >= BLOCK_8X8 &&
         !(is_inter_block(mbmi) && (mbmi->skip || seg_skip))) {
       const int is_inter = is_inter_block(mbmi);
@@ -5209,8 +5223,7 @@
 #if CONFIG_VAR_TX
   if (cm->tx_mode == TX_MODE_SELECT && mbmi->sb_type >= BLOCK_8X8 &&
       is_inter_block(mbmi) && !(mbmi->skip || seg_skip)) {
-    if (!output_enabled)
-      tx_partition_set_contexts(cm, xd, bsize, mi_row, mi_col);
+    if (dry_run) tx_partition_set_contexts(cm, xd, bsize, mi_row, mi_col);
 #if CONFIG_EXT_TX && CONFIG_RECT_TX
     if (is_rect_tx(mbmi->tx_size)) {
       set_txfm_ctxs(mbmi->tx_size, xd->n8_w, xd->n8_h, xd);
@@ -5406,7 +5419,7 @@
                              int mi_col_pred, int mi_row_top, int mi_col_top,
                              uint8_t *dst_buf[3], int dst_stride[3],
                              BLOCK_SIZE bsize_top, BLOCK_SIZE bsize_pred,
-                             int output_enabled, int b_sub8x8, int bextend) {
+                             RUN_TYPE dry_run, int b_sub8x8, int bextend) {
   // Used in supertx
   // (mi_row_ori, mi_col_ori): location for mv
   // (mi_row_pred, mi_col_pred, bsize_pred): region to predict
@@ -5450,13 +5463,13 @@
 #endif  // CONFIG_EXT_INTER
                      mi_row_pred, mi_col_pred, bsize_pred, b_sub8x8, block);
 
-  if (output_enabled && !bextend) update_stats(&cpi->common, td, 1);
+  if (!dry_run && !bextend) update_stats(&cpi->common, td, 1);
 }
 
 static void extend_dir(AV1_COMP *cpi, ThreadData *td,
                        const TileInfo *const tile, int block, BLOCK_SIZE bsize,
                        BLOCK_SIZE top_bsize, int mi_row, int mi_col,
-                       int mi_row_top, int mi_col_top, int output_enabled,
+                       int mi_row_top, int mi_col_top, RUN_TYPE dry_run,
                        uint8_t *dst_buf[3], int dst_stride[3], int dir) {
   // dir: 0-lower, 1-upper, 2-left, 3-right
   //      4-lowerleft, 5-upperleft, 6-lowerright, 7-upperright
@@ -5480,7 +5493,7 @@
 
     predict_b_extend(cpi, td, tile, block, mi_row, mi_col, mi_row_pred,
                      mi_col_pred, mi_row_top, mi_col_top, dst_buf, dst_stride,
-                     top_bsize, extend_bsize, output_enabled, b_sub8x8, 1);
+                     top_bsize, extend_bsize, dry_run, b_sub8x8, 1);
 
     if (mi_width > unit) {
       int i;
@@ -5488,8 +5501,8 @@
         mi_col_pred += unit;
         predict_b_extend(cpi, td, tile, block, mi_row, mi_col, mi_row_pred,
                          mi_col_pred, mi_row_top, mi_col_top, dst_buf,
-                         dst_stride, top_bsize, extend_bsize, output_enabled,
-                         b_sub8x8, 1);
+                         dst_stride, top_bsize, extend_bsize, dry_run, b_sub8x8,
+                         1);
       }
     }
   } else if (dir == 2 || dir == 3) {  // left and right
@@ -5502,7 +5515,7 @@
 
     predict_b_extend(cpi, td, tile, block, mi_row, mi_col, mi_row_pred,
                      mi_col_pred, mi_row_top, mi_col_top, dst_buf, dst_stride,
-                     top_bsize, extend_bsize, output_enabled, b_sub8x8, 1);
+                     top_bsize, extend_bsize, dry_run, b_sub8x8, 1);
 
     if (mi_height > unit) {
       int i;
@@ -5510,8 +5523,8 @@
         mi_row_pred += unit;
         predict_b_extend(cpi, td, tile, block, mi_row, mi_col, mi_row_pred,
                          mi_col_pred, mi_row_top, mi_col_top, dst_buf,
-                         dst_stride, top_bsize, extend_bsize, output_enabled,
-                         b_sub8x8, 1);
+                         dst_stride, top_bsize, extend_bsize, dry_run, b_sub8x8,
+                         1);
       }
     }
   } else {
@@ -5521,32 +5534,32 @@
 
     predict_b_extend(cpi, td, tile, block, mi_row, mi_col, mi_row_pred,
                      mi_col_pred, mi_row_top, mi_col_top, dst_buf, dst_stride,
-                     top_bsize, extend_bsize, output_enabled, b_sub8x8, 1);
+                     top_bsize, extend_bsize, dry_run, b_sub8x8, 1);
   }
 }
 
 static void extend_all(AV1_COMP *cpi, ThreadData *td,
                        const TileInfo *const tile, int block, BLOCK_SIZE bsize,
                        BLOCK_SIZE top_bsize, int mi_row, int mi_col,
-                       int mi_row_top, int mi_col_top, int output_enabled,
+                       int mi_row_top, int mi_col_top, RUN_TYPE dry_run,
                        uint8_t *dst_buf[3], int dst_stride[3]) {
   assert(block >= 0 && block < 4);
   extend_dir(cpi, td, tile, block, bsize, top_bsize, mi_row, mi_col, mi_row_top,
-             mi_col_top, output_enabled, dst_buf, dst_stride, 0);
+             mi_col_top, dry_run, dst_buf, dst_stride, 0);
   extend_dir(cpi, td, tile, block, bsize, top_bsize, mi_row, mi_col, mi_row_top,
-             mi_col_top, output_enabled, dst_buf, dst_stride, 1);
+             mi_col_top, dry_run, dst_buf, dst_stride, 1);
   extend_dir(cpi, td, tile, block, bsize, top_bsize, mi_row, mi_col, mi_row_top,
-             mi_col_top, output_enabled, dst_buf, dst_stride, 2);
+             mi_col_top, dry_run, dst_buf, dst_stride, 2);
   extend_dir(cpi, td, tile, block, bsize, top_bsize, mi_row, mi_col, mi_row_top,
-             mi_col_top, output_enabled, dst_buf, dst_stride, 3);
+             mi_col_top, dry_run, dst_buf, dst_stride, 3);
   extend_dir(cpi, td, tile, block, bsize, top_bsize, mi_row, mi_col, mi_row_top,
-             mi_col_top, output_enabled, dst_buf, dst_stride, 4);
+             mi_col_top, dry_run, dst_buf, dst_stride, 4);
   extend_dir(cpi, td, tile, block, bsize, top_bsize, mi_row, mi_col, mi_row_top,
-             mi_col_top, output_enabled, dst_buf, dst_stride, 5);
+             mi_col_top, dry_run, dst_buf, dst_stride, 5);
   extend_dir(cpi, td, tile, block, bsize, top_bsize, mi_row, mi_col, mi_row_top,
-             mi_col_top, output_enabled, dst_buf, dst_stride, 6);
+             mi_col_top, dry_run, dst_buf, dst_stride, 6);
   extend_dir(cpi, td, tile, block, bsize, top_bsize, mi_row, mi_col, mi_row_top,
-             mi_col_top, output_enabled, dst_buf, dst_stride, 7);
+             mi_col_top, dry_run, dst_buf, dst_stride, 7);
 }
 
 // This function generates prediction for multiple blocks, between which
@@ -5560,7 +5573,7 @@
 static void predict_sb_complex(AV1_COMP *cpi, ThreadData *td,
                                const TileInfo *const tile, int mi_row,
                                int mi_col, int mi_row_top, int mi_col_top,
-                               int output_enabled, BLOCK_SIZE bsize,
+                               RUN_TYPE dry_run, BLOCK_SIZE bsize,
                                BLOCK_SIZE top_bsize, uint8_t *dst_buf[3],
                                int dst_stride[3], PC_TREE *pc_tree) {
   AV1_COMMON *const cm = &cpi->common;
@@ -5615,8 +5628,7 @@
   }
 #endif  // CONFIG_AOM_HIGHBITDEPTH
 
-  if (output_enabled && bsize < top_bsize)
-    cm->counts.partition[ctx][partition]++;
+  if (!dry_run && bsize < top_bsize) cm->counts.partition[ctx][partition]++;
 
   for (i = 0; i < MAX_MB_PLANE; i++) {
     xd->plane[i].dst.buf = dst_buf[i];
@@ -5628,29 +5640,27 @@
       assert(bsize < top_bsize);
       predict_b_extend(cpi, td, tile, 0, mi_row, mi_col, mi_row, mi_col,
                        mi_row_top, mi_col_top, dst_buf, dst_stride, top_bsize,
-                       bsize, output_enabled, 0, 0);
+                       bsize, dry_run, 0, 0);
       extend_all(cpi, td, tile, 0, bsize, top_bsize, mi_row, mi_col, mi_row_top,
-                 mi_col_top, output_enabled, dst_buf, dst_stride);
+                 mi_col_top, dry_run, dst_buf, dst_stride);
       break;
     case PARTITION_HORZ:
       if (bsize == BLOCK_8X8) {
         // Fisrt half
         predict_b_extend(cpi, td, tile, 0, mi_row, mi_col, mi_row, mi_col,
                          mi_row_top, mi_col_top, dst_buf, dst_stride, top_bsize,
-                         BLOCK_8X8, output_enabled, 1, 0);
+                         BLOCK_8X8, dry_run, 1, 0);
         if (bsize < top_bsize)
           extend_all(cpi, td, tile, 0, subsize, top_bsize, mi_row, mi_col,
-                     mi_row_top, mi_col_top, output_enabled, dst_buf,
-                     dst_stride);
+                     mi_row_top, mi_col_top, dry_run, dst_buf, dst_stride);
 
         // Second half
         predict_b_extend(cpi, td, tile, 2, mi_row, mi_col, mi_row, mi_col,
                          mi_row_top, mi_col_top, dst_buf1, dst_stride1,
-                         top_bsize, BLOCK_8X8, output_enabled, 1, 1);
+                         top_bsize, BLOCK_8X8, dry_run, 1, 1);
         if (bsize < top_bsize)
           extend_all(cpi, td, tile, 2, subsize, top_bsize, mi_row, mi_col,
-                     mi_row_top, mi_col_top, output_enabled, dst_buf1,
-                     dst_stride1);
+                     mi_row_top, mi_col_top, dry_run, dst_buf1, dst_stride1);
 
         // Smooth
         xd->plane[0].dst.buf = dst_buf[0];
@@ -5663,29 +5673,26 @@
         // First half
         predict_b_extend(cpi, td, tile, 0, mi_row, mi_col, mi_row, mi_col,
                          mi_row_top, mi_col_top, dst_buf, dst_stride, top_bsize,
-                         subsize, output_enabled, 0, 0);
+                         subsize, dry_run, 0, 0);
         if (bsize < top_bsize)
           extend_all(cpi, td, tile, 0, subsize, top_bsize, mi_row, mi_col,
-                     mi_row_top, mi_col_top, output_enabled, dst_buf,
-                     dst_stride);
+                     mi_row_top, mi_col_top, dry_run, dst_buf, dst_stride);
         else
           extend_dir(cpi, td, tile, 0, subsize, top_bsize, mi_row, mi_col,
-                     mi_row_top, mi_col_top, output_enabled, dst_buf,
-                     dst_stride, 0);
+                     mi_row_top, mi_col_top, dry_run, dst_buf, dst_stride, 0);
 
         if (mi_row + hbs < cm->mi_rows) {
           // Second half
           predict_b_extend(cpi, td, tile, 0, mi_row + hbs, mi_col, mi_row + hbs,
                            mi_col, mi_row_top, mi_col_top, dst_buf1,
-                           dst_stride1, top_bsize, subsize, output_enabled, 0,
-                           0);
+                           dst_stride1, top_bsize, subsize, dry_run, 0, 0);
           if (bsize < top_bsize)
             extend_all(cpi, td, tile, 0, subsize, top_bsize, mi_row + hbs,
-                       mi_col, mi_row_top, mi_col_top, output_enabled, dst_buf1,
+                       mi_col, mi_row_top, mi_col_top, dry_run, dst_buf1,
                        dst_stride1);
           else
             extend_dir(cpi, td, tile, 0, subsize, top_bsize, mi_row + hbs,
-                       mi_col, mi_row_top, mi_col_top, output_enabled, dst_buf1,
+                       mi_col, mi_row_top, mi_col_top, dry_run, dst_buf1,
                        dst_stride1, 1);
 
           // Smooth
@@ -5705,20 +5712,18 @@
         // First half
         predict_b_extend(cpi, td, tile, 0, mi_row, mi_col, mi_row, mi_col,
                          mi_row_top, mi_col_top, dst_buf, dst_stride, top_bsize,
-                         BLOCK_8X8, output_enabled, 1, 0);
+                         BLOCK_8X8, dry_run, 1, 0);
         if (bsize < top_bsize)
           extend_all(cpi, td, tile, 0, subsize, top_bsize, mi_row, mi_col,
-                     mi_row_top, mi_col_top, output_enabled, dst_buf,
-                     dst_stride);
+                     mi_row_top, mi_col_top, dry_run, dst_buf, dst_stride);
 
         // Second half
         predict_b_extend(cpi, td, tile, 1, mi_row, mi_col, mi_row, mi_col,
                          mi_row_top, mi_col_top, dst_buf1, dst_stride1,
-                         top_bsize, BLOCK_8X8, output_enabled, 1, 1);
+                         top_bsize, BLOCK_8X8, dry_run, 1, 1);
         if (bsize < top_bsize)
           extend_all(cpi, td, tile, 1, subsize, top_bsize, mi_row, mi_col,
-                     mi_row_top, mi_col_top, output_enabled, dst_buf1,
-                     dst_stride1);
+                     mi_row_top, mi_col_top, dry_run, dst_buf1, dst_stride1);
 
         // Smooth
         xd->plane[0].dst.buf = dst_buf[0];
@@ -5731,29 +5736,26 @@
         // bsize: not important, not useful
         predict_b_extend(cpi, td, tile, 0, mi_row, mi_col, mi_row, mi_col,
                          mi_row_top, mi_col_top, dst_buf, dst_stride, top_bsize,
-                         subsize, output_enabled, 0, 0);
+                         subsize, dry_run, 0, 0);
         if (bsize < top_bsize)
           extend_all(cpi, td, tile, 0, subsize, top_bsize, mi_row, mi_col,
-                     mi_row_top, mi_col_top, output_enabled, dst_buf,
-                     dst_stride);
+                     mi_row_top, mi_col_top, dry_run, dst_buf, dst_stride);
         else
           extend_dir(cpi, td, tile, 0, subsize, top_bsize, mi_row, mi_col,
-                     mi_row_top, mi_col_top, output_enabled, dst_buf,
-                     dst_stride, 3);
+                     mi_row_top, mi_col_top, dry_run, dst_buf, dst_stride, 3);
 
         if (mi_col + hbs < cm->mi_cols) {
           predict_b_extend(cpi, td, tile, 0, mi_row, mi_col + hbs, mi_row,
                            mi_col + hbs, mi_row_top, mi_col_top, dst_buf1,
-                           dst_stride1, top_bsize, subsize, output_enabled, 0,
-                           0);
+                           dst_stride1, top_bsize, subsize, dry_run, 0, 0);
           if (bsize < top_bsize)
             extend_all(cpi, td, tile, 0, subsize, top_bsize, mi_row,
-                       mi_col + hbs, mi_row_top, mi_col_top, output_enabled,
-                       dst_buf1, dst_stride1);
+                       mi_col + hbs, mi_row_top, mi_col_top, dry_run, dst_buf1,
+                       dst_stride1);
           else
             extend_dir(cpi, td, tile, 0, subsize, top_bsize, mi_row,
-                       mi_col + hbs, mi_row_top, mi_col_top, output_enabled,
-                       dst_buf1, dst_stride1, 2);
+                       mi_col + hbs, mi_row_top, mi_col_top, dry_run, dst_buf1,
+                       dst_stride1, 2);
 
           for (i = 0; i < MAX_MB_PLANE; i++) {
             xd->plane[i].dst.buf = dst_buf[i];
@@ -5770,46 +5772,42 @@
       if (bsize == BLOCK_8X8) {
         predict_b_extend(cpi, td, tile, 0, mi_row, mi_col, mi_row, mi_col,
                          mi_row_top, mi_col_top, dst_buf, dst_stride, top_bsize,
-                         BLOCK_8X8, output_enabled, 1, 0);
+                         BLOCK_8X8, dry_run, 1, 0);
         predict_b_extend(cpi, td, tile, 1, mi_row, mi_col, mi_row, mi_col,
                          mi_row_top, mi_col_top, dst_buf1, dst_stride1,
-                         top_bsize, BLOCK_8X8, output_enabled, 1, 1);
+                         top_bsize, BLOCK_8X8, dry_run, 1, 1);
         predict_b_extend(cpi, td, tile, 2, mi_row, mi_col, mi_row, mi_col,
                          mi_row_top, mi_col_top, dst_buf2, dst_stride2,
-                         top_bsize, BLOCK_8X8, output_enabled, 1, 1);
+                         top_bsize, BLOCK_8X8, dry_run, 1, 1);
         predict_b_extend(cpi, td, tile, 3, mi_row, mi_col, mi_row, mi_col,
                          mi_row_top, mi_col_top, dst_buf3, dst_stride3,
-                         top_bsize, BLOCK_8X8, output_enabled, 1, 1);
+                         top_bsize, BLOCK_8X8, dry_run, 1, 1);
 
         if (bsize < top_bsize) {
           extend_all(cpi, td, tile, 0, subsize, top_bsize, mi_row, mi_col,
-                     mi_row_top, mi_col_top, output_enabled, dst_buf,
-                     dst_stride);
+                     mi_row_top, mi_col_top, dry_run, dst_buf, dst_stride);
           extend_all(cpi, td, tile, 1, subsize, top_bsize, mi_row, mi_col,
-                     mi_row_top, mi_col_top, output_enabled, dst_buf1,
-                     dst_stride1);
+                     mi_row_top, mi_col_top, dry_run, dst_buf1, dst_stride1);
           extend_all(cpi, td, tile, 2, subsize, top_bsize, mi_row, mi_col,
-                     mi_row_top, mi_col_top, output_enabled, dst_buf2,
-                     dst_stride2);
+                     mi_row_top, mi_col_top, dry_run, dst_buf2, dst_stride2);
           extend_all(cpi, td, tile, 3, subsize, top_bsize, mi_row, mi_col,
-                     mi_row_top, mi_col_top, output_enabled, dst_buf3,
-                     dst_stride3);
+                     mi_row_top, mi_col_top, dry_run, dst_buf3, dst_stride3);
         }
       } else {
         predict_sb_complex(cpi, td, tile, mi_row, mi_col, mi_row_top,
-                           mi_col_top, output_enabled, subsize, top_bsize,
-                           dst_buf, dst_stride, pc_tree->split[0]);
+                           mi_col_top, dry_run, subsize, top_bsize, dst_buf,
+                           dst_stride, pc_tree->split[0]);
         if (mi_row < cm->mi_rows && mi_col + hbs < cm->mi_cols)
           predict_sb_complex(cpi, td, tile, mi_row, mi_col + hbs, mi_row_top,
-                             mi_col_top, output_enabled, subsize, top_bsize,
-                             dst_buf1, dst_stride1, pc_tree->split[1]);
+                             mi_col_top, dry_run, subsize, top_bsize, dst_buf1,
+                             dst_stride1, pc_tree->split[1]);
         if (mi_row + hbs < cm->mi_rows && mi_col < cm->mi_cols)
           predict_sb_complex(cpi, td, tile, mi_row + hbs, mi_col, mi_row_top,
-                             mi_col_top, output_enabled, subsize, top_bsize,
-                             dst_buf2, dst_stride2, pc_tree->split[2]);
+                             mi_col_top, dry_run, subsize, top_bsize, dst_buf2,
+                             dst_stride2, pc_tree->split[2]);
         if (mi_row + hbs < cm->mi_rows && mi_col + hbs < cm->mi_cols)
           predict_sb_complex(cpi, td, tile, mi_row + hbs, mi_col + hbs,
-                             mi_row_top, mi_col_top, output_enabled, subsize,
+                             mi_row_top, mi_col_top, dry_run, subsize,
                              top_bsize, dst_buf3, dst_stride3,
                              pc_tree->split[3]);
       }
@@ -5843,27 +5841,25 @@
     case PARTITION_HORZ_A:
       predict_b_extend(cpi, td, tile, 0, mi_row, mi_col, mi_row, mi_col,
                        mi_row_top, mi_col_top, dst_buf, dst_stride, top_bsize,
-                       bsize2, output_enabled, 0, 0);
+                       bsize2, dry_run, 0, 0);
       extend_all(cpi, td, tile, 0, bsize2, top_bsize, mi_row, mi_col,
-                 mi_row_top, mi_col_top, output_enabled, dst_buf, dst_stride);
+                 mi_row_top, mi_col_top, dry_run, dst_buf, dst_stride);
 
       predict_b_extend(cpi, td, tile, 0, mi_row, mi_col + hbs, mi_row,
                        mi_col + hbs, mi_row_top, mi_col_top, dst_buf1,
-                       dst_stride1, top_bsize, bsize2, output_enabled, 0, 0);
+                       dst_stride1, top_bsize, bsize2, dry_run, 0, 0);
       extend_all(cpi, td, tile, 0, bsize2, top_bsize, mi_row, mi_col + hbs,
-                 mi_row_top, mi_col_top, output_enabled, dst_buf1, dst_stride1);
+                 mi_row_top, mi_col_top, dry_run, dst_buf1, dst_stride1);
 
       predict_b_extend(cpi, td, tile, 0, mi_row + hbs, mi_col, mi_row + hbs,
                        mi_col, mi_row_top, mi_col_top, dst_buf2, dst_stride2,
-                       top_bsize, subsize, output_enabled, 0, 0);
+                       top_bsize, subsize, dry_run, 0, 0);
       if (bsize < top_bsize)
         extend_all(cpi, td, tile, 0, subsize, top_bsize, mi_row + hbs, mi_col,
-                   mi_row_top, mi_col_top, output_enabled, dst_buf2,
-                   dst_stride2);
+                   mi_row_top, mi_col_top, dry_run, dst_buf2, dst_stride2);
       else
         extend_dir(cpi, td, tile, 0, subsize, top_bsize, mi_row + hbs, mi_col,
-                   mi_row_top, mi_col_top, output_enabled, dst_buf2,
-                   dst_stride2, 1);
+                   mi_row_top, mi_col_top, dry_run, dst_buf2, dst_stride2, 1);
 
       for (i = 0; i < MAX_MB_PLANE; i++) {
         xd->plane[i].dst.buf = dst_buf[i];
@@ -5885,27 +5881,25 @@
 
       predict_b_extend(cpi, td, tile, 0, mi_row, mi_col, mi_row, mi_col,
                        mi_row_top, mi_col_top, dst_buf, dst_stride, top_bsize,
-                       bsize2, output_enabled, 0, 0);
+                       bsize2, dry_run, 0, 0);
       extend_all(cpi, td, tile, 0, bsize2, top_bsize, mi_row, mi_col,
-                 mi_row_top, mi_col_top, output_enabled, dst_buf, dst_stride);
+                 mi_row_top, mi_col_top, dry_run, dst_buf, dst_stride);
 
       predict_b_extend(cpi, td, tile, 0, mi_row + hbs, mi_col, mi_row + hbs,
                        mi_col, mi_row_top, mi_col_top, dst_buf1, dst_stride1,
-                       top_bsize, bsize2, output_enabled, 0, 0);
+                       top_bsize, bsize2, dry_run, 0, 0);
       extend_all(cpi, td, tile, 0, bsize2, top_bsize, mi_row + hbs, mi_col,
-                 mi_row_top, mi_col_top, output_enabled, dst_buf1, dst_stride1);
+                 mi_row_top, mi_col_top, dry_run, dst_buf1, dst_stride1);
 
       predict_b_extend(cpi, td, tile, 0, mi_row, mi_col + hbs, mi_row,
                        mi_col + hbs, mi_row_top, mi_col_top, dst_buf2,
-                       dst_stride2, top_bsize, subsize, output_enabled, 0, 0);
+                       dst_stride2, top_bsize, subsize, dry_run, 0, 0);
       if (bsize < top_bsize)
         extend_all(cpi, td, tile, 0, subsize, top_bsize, mi_row, mi_col + hbs,
-                   mi_row_top, mi_col_top, output_enabled, dst_buf2,
-                   dst_stride2);
+                   mi_row_top, mi_col_top, dry_run, dst_buf2, dst_stride2);
       else
         extend_dir(cpi, td, tile, 0, subsize, top_bsize, mi_row, mi_col + hbs,
-                   mi_row_top, mi_col_top, output_enabled, dst_buf2,
-                   dst_stride2, 2);
+                   mi_row_top, mi_col_top, dry_run, dst_buf2, dst_stride2, 2);
 
       for (i = 0; i < MAX_MB_PLANE; i++) {
         xd->plane[i].dst.buf = dst_buf[i];
@@ -5926,27 +5920,25 @@
 
       predict_b_extend(cpi, td, tile, 0, mi_row, mi_col, mi_row, mi_col,
                        mi_row_top, mi_col_top, dst_buf, dst_stride, top_bsize,
-                       subsize, output_enabled, 0, 0);
+                       subsize, dry_run, 0, 0);
       if (bsize < top_bsize)
         extend_all(cpi, td, tile, 0, subsize, top_bsize, mi_row, mi_col,
-                   mi_row_top, mi_col_top, output_enabled, dst_buf, dst_stride);
+                   mi_row_top, mi_col_top, dry_run, dst_buf, dst_stride);
       else
         extend_dir(cpi, td, tile, 0, subsize, top_bsize, mi_row, mi_col,
-                   mi_row_top, mi_col_top, output_enabled, dst_buf, dst_stride,
-                   0);
+                   mi_row_top, mi_col_top, dry_run, dst_buf, dst_stride, 0);
 
       predict_b_extend(cpi, td, tile, 0, mi_row + hbs, mi_col, mi_row + hbs,
                        mi_col, mi_row_top, mi_col_top, dst_buf1, dst_stride1,
-                       top_bsize, bsize2, output_enabled, 0, 0);
+                       top_bsize, bsize2, dry_run, 0, 0);
       extend_all(cpi, td, tile, 0, bsize2, top_bsize, mi_row + hbs, mi_col,
-                 mi_row_top, mi_col_top, output_enabled, dst_buf1, dst_stride1);
+                 mi_row_top, mi_col_top, dry_run, dst_buf1, dst_stride1);
 
       predict_b_extend(cpi, td, tile, 0, mi_row + hbs, mi_col + hbs,
                        mi_row + hbs, mi_col + hbs, mi_row_top, mi_col_top,
-                       dst_buf2, dst_stride2, top_bsize, bsize2, output_enabled,
-                       0, 0);
+                       dst_buf2, dst_stride2, top_bsize, bsize2, dry_run, 0, 0);
       extend_all(cpi, td, tile, 0, bsize2, top_bsize, mi_row + hbs,
-                 mi_col + hbs, mi_row_top, mi_col_top, output_enabled, dst_buf2,
+                 mi_col + hbs, mi_row_top, mi_col_top, dry_run, dst_buf2,
                  dst_stride2);
 
       for (i = 0; i < MAX_MB_PLANE; i++) {
@@ -5970,27 +5962,25 @@
 
       predict_b_extend(cpi, td, tile, 0, mi_row, mi_col, mi_row, mi_col,
                        mi_row_top, mi_col_top, dst_buf, dst_stride, top_bsize,
-                       subsize, output_enabled, 0, 0);
+                       subsize, dry_run, 0, 0);
       if (bsize < top_bsize)
         extend_all(cpi, td, tile, 0, subsize, top_bsize, mi_row, mi_col,
-                   mi_row_top, mi_col_top, output_enabled, dst_buf, dst_stride);
+                   mi_row_top, mi_col_top, dry_run, dst_buf, dst_stride);
       else
         extend_dir(cpi, td, tile, 0, subsize, top_bsize, mi_row, mi_col,
-                   mi_row_top, mi_col_top, output_enabled, dst_buf, dst_stride,
-                   3);
+                   mi_row_top, mi_col_top, dry_run, dst_buf, dst_stride, 3);
 
       predict_b_extend(cpi, td, tile, 0, mi_row, mi_col + hbs, mi_row,
                        mi_col + hbs, mi_row_top, mi_col_top, dst_buf1,
-                       dst_stride1, top_bsize, bsize2, output_enabled, 0, 0);
+                       dst_stride1, top_bsize, bsize2, dry_run, 0, 0);
       extend_all(cpi, td, tile, 0, bsize2, top_bsize, mi_row, mi_col + hbs,
-                 mi_row_top, mi_col_top, output_enabled, dst_buf1, dst_stride1);
+                 mi_row_top, mi_col_top, dry_run, dst_buf1, dst_stride1);
 
       predict_b_extend(cpi, td, tile, 0, mi_row + hbs, mi_col + hbs,
                        mi_row + hbs, mi_col + hbs, mi_row_top, mi_col_top,
-                       dst_buf2, dst_stride2, top_bsize, bsize2, output_enabled,
-                       0, 0);
+                       dst_buf2, dst_stride2, top_bsize, bsize2, dry_run, 0, 0);
       extend_all(cpi, td, tile, 0, bsize2, top_bsize, mi_row + hbs,
-                 mi_col + hbs, mi_row_top, mi_col_top, output_enabled, dst_buf2,
+                 mi_col + hbs, mi_row_top, mi_col_top, dry_run, dst_buf2,
                  dst_stride2);
 
       for (i = 0; i < MAX_MB_PLANE; i++) {
@@ -6046,13 +6036,13 @@
 
   set_skip_context(xd, mi_row, mi_col);
   set_mode_info_offsets(cpi, x, xd, mi_row, mi_col);
-  update_state_sb_supertx(cpi, td, tile, mi_row, mi_col, bsize, 0, pc_tree);
+  update_state_sb_supertx(cpi, td, tile, mi_row, mi_col, bsize, 1, pc_tree);
   av1_setup_dst_planes(xd->plane, get_frame_new_buffer(cm), mi_row, mi_col);
   for (plane = 0; plane < MAX_MB_PLANE; plane++) {
     dst_buf[plane] = xd->plane[plane].dst.buf;
     dst_stride[plane] = xd->plane[plane].dst.stride;
   }
-  predict_sb_complex(cpi, td, tile, mi_row, mi_col, mi_row, mi_col, 0, bsize,
+  predict_sb_complex(cpi, td, tile, mi_row, mi_col, mi_row, mi_col, 1, bsize,
                      bsize, dst_buf, dst_stride, pc_tree);
 
   set_offsets_without_segment_id(cpi, tile, x, mi_row, mi_col, bsize);
diff --git a/av1/encoder/encoder.c b/av1/encoder/encoder.c
index ddd3cc5..acb5498 100644
--- a/av1/encoder/encoder.c
+++ b/av1/encoder/encoder.c
@@ -3086,15 +3086,15 @@
     cpi->lst_fb_idxes[ref_frame] = cpi->lst_fb_idxes[ref_frame - 1];
 
     // [0] is allocated to the current coded frame. The statistics for the
-    // reference frames start at [1].
+    // reference frames start at [LAST_FRAME], i.e. [1].
     if (!cpi->rc.is_src_frame_alt_ref) {
-      memcpy(cpi->interp_filter_selected[ref_frame + 1],
-             cpi->interp_filter_selected[ref_frame],
-             sizeof(cpi->interp_filter_selected[ref_frame]));
+      memcpy(cpi->interp_filter_selected[ref_frame + LAST_FRAME],
+             cpi->interp_filter_selected[ref_frame - 1 + LAST_FRAME],
+             sizeof(cpi->interp_filter_selected[ref_frame - 1 + LAST_FRAME]));
     }
   }
 }
-#endif
+#endif  // CONFIG_EXT_REFS
 
 void av1_update_reference_frames(AV1_COMP *cpi) {
   AV1_COMMON *const cm = &cpi->common;
@@ -3181,14 +3181,12 @@
     int tmp = cpi->lst_fb_idxes[LAST_REF_FRAMES - 1];
 
     shift_last_ref_frames(cpi);
-
     cpi->lst_fb_idxes[0] = cpi->bwd_fb_idx;
-    if (!cpi->rc.is_src_frame_alt_ref) {
-      memcpy(cpi->interp_filter_selected[0],
-             cpi->interp_filter_selected[BWDREF_FRAME],
-             sizeof(cpi->interp_filter_selected[BWDREF_FRAME]));
-    }
     cpi->bwd_fb_idx = tmp;
+
+    memcpy(cpi->interp_filter_selected[LAST_FRAME],
+           cpi->interp_filter_selected[BWDREF_FRAME],
+           sizeof(cpi->interp_filter_selected[BWDREF_FRAME]));
   } else if (cpi->rc.is_src_frame_ext_arf && cm->show_existing_frame) {
     // Deal with the special case for showing existing internal ALTREF_FRAME
     // Refresh the LAST_FRAME with the ALTREF_FRAME and retire the LAST3_FRAME
@@ -3198,15 +3196,15 @@
     int tmp = cpi->lst_fb_idxes[LAST_REF_FRAMES - 1];
 
     shift_last_ref_frames(cpi);
-
     cpi->lst_fb_idxes[0] = cpi->alt_fb_idx;
+    cpi->alt_fb_idx = tmp;
+
+    // We need to modify the mapping accordingly
+    cpi->arf_map[which_arf] = cpi->alt_fb_idx;
+
     memcpy(cpi->interp_filter_selected[LAST_FRAME],
            cpi->interp_filter_selected[ALTREF_FRAME + which_arf],
            sizeof(cpi->interp_filter_selected[ALTREF_FRAME + which_arf]));
-
-    cpi->alt_fb_idx = tmp;
-    // We need to modify the mapping accordingly
-    cpi->arf_map[which_arf] = cpi->alt_fb_idx;
 #endif     // CONFIG_EXT_REFS
   } else { /* For non key/golden frames */
     if (cpi->refresh_alt_ref_frame) {
@@ -3241,22 +3239,12 @@
         uref_cnt_fb(cpi->upsampled_ref_bufs,
                     &cpi->upsampled_ref_idx[cpi->gld_fb_idx], new_uidx);
 
-      if (!cpi->rc.is_src_frame_alt_ref) {
+#if !CONFIG_EXT_REFS
+      if (!cpi->rc.is_src_frame_alt_ref)
+#endif  // !CONFIG_EXT_REFS
         memcpy(cpi->interp_filter_selected[GOLDEN_FRAME],
                cpi->interp_filter_selected[0],
                sizeof(cpi->interp_filter_selected[0]));
-      } else {
-        int which_arf = 0;
-#if CONFIG_EXT_REFS
-        if (cpi->oxcf.pass == 2) {
-          const GF_GROUP *const gf_group = &cpi->twopass.gf_group;
-          which_arf = gf_group->arf_update_idx[gf_group->index];
-        }
-#endif  // CONFIG_EXT_REFS
-        memcpy(cpi->interp_filter_selected[GOLDEN_FRAME],
-               cpi->interp_filter_selected[ALTREF_FRAME + which_arf],
-               sizeof(cpi->interp_filter_selected[ALTREF_FRAME + which_arf]));
-      }
     }
 
 #if CONFIG_EXT_REFS
@@ -3271,6 +3259,7 @@
         cpi->alt_fb_idx = cpi->bwd_fb_idx;
         cpi->bwd_fb_idx = tmp;
       }
+
       ref_cnt_fb(pool->frame_bufs, &cm->ref_frame_map[cpi->bwd_fb_idx],
                  cm->new_fb_idx);
       if (use_upsampled_ref)
@@ -3354,20 +3343,14 @@
       tmp = cpi->lst_fb_idxes[LAST_REF_FRAMES - 1];
 
       shift_last_ref_frames(cpi);
-
       cpi->lst_fb_idxes[0] = tmp;
 
-      if (!cpi->rc.is_src_frame_alt_ref) {
-        if (cm->show_existing_frame) {
-          memcpy(cpi->interp_filter_selected[LAST_FRAME],
-                 cpi->interp_filter_selected[BWDREF_FRAME],
-                 sizeof(cpi->interp_filter_selected[BWDREF_FRAME]));
-        } else {
-          memcpy(cpi->interp_filter_selected[LAST_FRAME],
-                 cpi->interp_filter_selected[0],
-                 sizeof(cpi->interp_filter_selected[0]));
-        }
-      }
+      assert(cm->show_existing_frame == 0);
+      // NOTE: Currently only LF_UPDATE and INTNL_OVERLAY_UPDATE frames are to
+      //       refresh the LAST_FRAME.
+      memcpy(cpi->interp_filter_selected[LAST_FRAME],
+             cpi->interp_filter_selected[0],
+             sizeof(cpi->interp_filter_selected[0]));
     }
 #else
     ref_cnt_fb(pool->frame_bufs, &cm->ref_frame_map[cpi->lst_fb_idx],
@@ -3425,12 +3408,23 @@
   }
 #if CONFIG_CLPF
   cm->clpf_strength_y = cm->clpf_strength_u = cm->clpf_strength_v = 0;
-  cm->clpf_size = 2;
-  CHECK_MEM_ERROR(
-      cm, cm->clpf_blocks,
-      aom_malloc(((cm->frame_to_show->y_crop_width + 31) & ~31) *
-                     ((cm->frame_to_show->y_crop_height + 31) & ~31) >>
-                 10));
+  cm->clpf_size = CLPF_64X64;
+
+  // Allocate buffer to hold the status of all filter blocks:
+  // 1 = On, 0 = off, -1 = implicitly off
+  {
+    int size;
+    cm->clpf_stride = ((cm->frame_to_show->y_crop_width + MIN_FB_SIZE - 1) &
+                       ~(MIN_FB_SIZE - 1)) >>
+                      MIN_FB_SIZE_LOG2;
+    size = cm->clpf_stride *
+               ((cm->frame_to_show->y_crop_height + MIN_FB_SIZE - 1) &
+                ~(MIN_FB_SIZE - 1)) >>
+           MIN_FB_SIZE_LOG2;
+    CHECK_MEM_ERROR(cm, cm->clpf_blocks, aom_malloc(size));
+    memset(cm->clpf_blocks, CLPF_NOFLAG, size);
+  }
+
   if (!is_lossless_requested(&cpi->oxcf)) {
     const YV12_BUFFER_CONFIG *const frame = cm->frame_to_show;
 
@@ -3445,20 +3439,18 @@
       // Apply the filter using the chosen strength
       cm->clpf_strength_y = strength_y - (strength_y == 4);
       cm->clpf_size =
-          fb_size_log2 ? fb_size_log2 - get_msb(MAX_FB_SIZE) + 3 : 0;
-      cm->clpf_numblocks = av1_clpf_frame(
-          frame, cpi->Source, cm, !!cm->clpf_size, strength_y,
-          4 + cm->clpf_size, cm->clpf_blocks, AOM_PLANE_Y, av1_clpf_decision);
+          fb_size_log2 ? fb_size_log2 - MAX_FB_SIZE_LOG2 + 3 : CLPF_NOSIZE;
+      av1_clpf_frame(frame, cpi->Source, cm, cm->clpf_size != CLPF_NOSIZE,
+                     strength_y, 4 + cm->clpf_size, AOM_PLANE_Y,
+                     av1_clpf_decision);
     }
     if (strength_u) {
       cm->clpf_strength_u = strength_u - (strength_u == 4);
-      av1_clpf_frame(frame, NULL, cm, 0, strength_u, 4, NULL, AOM_PLANE_U,
-                     NULL);
+      av1_clpf_frame(frame, NULL, cm, 0, strength_u, 4, AOM_PLANE_U, NULL);
     }
     if (strength_v) {
       cm->clpf_strength_v = strength_v - (strength_v == 4);
-      av1_clpf_frame(frame, NULL, cm, 0, strength_v, 4, NULL, AOM_PLANE_V,
-                     NULL);
+      av1_clpf_frame(frame, NULL, cm, 0, strength_v, 4, AOM_PLANE_V, NULL);
     }
   }
 #endif
diff --git a/av1/encoder/firstpass.c b/av1/encoder/firstpass.c
index 5821d3f..9fdf540 100644
--- a/av1/encoder/firstpass.c
+++ b/av1/encoder/firstpass.c
@@ -2585,10 +2585,11 @@
       if (cpi->num_extra_arfs) {
         int tmp = cpi->bwd_fb_idx;
 
-        cpi->rc.is_bwd_ref_frame = 1;
         cpi->bwd_fb_idx = cpi->alt_fb_idx;
         cpi->alt_fb_idx = cpi->arf_map[0];
         cpi->arf_map[0] = tmp;
+
+        cpi->rc.is_bwd_ref_frame = 1;
       } else {
         cpi->rc.is_bwd_ref_frame = 0;
       }
@@ -2639,11 +2640,13 @@
         // NOTE: The indices will be swapped back after this frame is encoded
         //       (in av1_update_reference_frames()).
         int tmp = cpi->bwd_fb_idx;
+
         cpi->bwd_fb_idx = cpi->alt_fb_idx;
         cpi->alt_fb_idx = cpi->arf_map[0];
         cpi->arf_map[0] = tmp;
       }
       break;
+
     case LAST_BIPRED_UPDATE:
       cpi->refresh_last_frame = 0;
       cpi->refresh_golden_frame = 0;
diff --git a/av1/encoder/hybrid_fwd_txfm.c b/av1/encoder/hybrid_fwd_txfm.c
index 9589a48..1103c4b 100644
--- a/av1/encoder/hybrid_fwd_txfm.c
+++ b/av1/encoder/hybrid_fwd_txfm.c
@@ -180,16 +180,14 @@
     case FLIPADST_FLIPADST:
     case ADST_FLIPADST:
     case FLIPADST_ADST:
-      av1_fht32x32_c(src_diff, coeff, diff_stride, tx_type);
+      av1_fht32x32(src_diff, coeff, diff_stride, tx_type);
       break;
     case V_DCT:
     case H_DCT:
     case V_ADST:
     case H_ADST:
     case V_FLIPADST:
-    case H_FLIPADST:
-      av1_fht32x32_c(src_diff, coeff, diff_stride, tx_type);
-      break;
+    case H_FLIPADST: av1_fht32x32(src_diff, coeff, diff_stride, tx_type); break;
     case IDTX: av1_fwd_idtx_c(src_diff, coeff, diff_stride, 32, tx_type); break;
 #endif  // CONFIG_EXT_TX
     default: assert(0); break;
diff --git a/av1/encoder/pickdering.c b/av1/encoder/pickdering.c
index 726a4c1..4ef83cd 100644
--- a/av1/encoder/pickdering.c
+++ b/av1/encoder/pickdering.c
@@ -10,6 +10,7 @@
  */
 
 #include <string.h>
+#include <math.h>
 
 #include "./aom_scale_rtcd.h"
 #include "av1/common/dering.h"
@@ -46,12 +47,8 @@
   int bsize[3];
   int dec[3];
   int pli;
-  int(*mse)[MAX_DERING_LEVEL];
-  double tot_mse[MAX_DERING_LEVEL] = { 0 };
   int level;
   int best_level;
-  int global_level;
-  double best_tot_mse = 1e15;
   int coeff_shift = AOMMAX(cm->bit_depth - 8, 0);
   src = aom_malloc(sizeof(*src) * cm->mi_rows * cm->mi_cols * 64);
   ref_coeff = aom_malloc(sizeof(*ref_coeff) * cm->mi_rows * cm->mi_cols * 64);
@@ -89,68 +86,47 @@
   }
   nvsb = (cm->mi_rows + MAX_MIB_SIZE - 1) / MAX_MIB_SIZE;
   nhsb = (cm->mi_cols + MAX_MIB_SIZE - 1) / MAX_MIB_SIZE;
-  mse = aom_malloc(nvsb * nhsb * sizeof(*mse));
+  /* Pick a base threshold based on the quantizer. The threshold will then be
+     adjusted on a 64x64 basis. We use a threshold of the form T = a*Q^b,
+     where a and b are derived empirically trying to optimize rate-distortion
+     at different quantizer settings. */
+  best_level = AOMMIN(
+      MAX_DERING_LEVEL - 1,
+      (int)floor(.5 +
+                 .45 * pow(av1_ac_quant(cm->base_qindex, 0, cm->bit_depth) >>
+                               (cm->bit_depth - 8),
+                           0.6)));
   for (sbr = 0; sbr < nvsb; sbr++) {
     for (sbc = 0; sbc < nhsb; sbc++) {
       int nvb, nhb;
+      int gi;
+      int best_gi;
+      int32_t best_mse = INT32_MAX;
       int16_t dst[MAX_MIB_SIZE * MAX_MIB_SIZE * 8 * 8];
       nhb = AOMMIN(MAX_MIB_SIZE, cm->mi_cols - MAX_MIB_SIZE * sbc);
       nvb = AOMMIN(MAX_MIB_SIZE, cm->mi_rows - MAX_MIB_SIZE * sbr);
-      for (level = 0; level < 64; level++) {
+      if (sb_all_skip(cm, sbr * MAX_MIB_SIZE, sbc * MAX_MIB_SIZE)) continue;
+      best_gi = 0;
+      for (gi = 0; gi < DERING_REFINEMENT_LEVELS; gi++) {
         int cur_mse;
         int threshold;
+        level = compute_level_from_index(best_level, gi);
         threshold = level << coeff_shift;
-        od_dering(
-            &OD_DERING_VTBL_C, dst, MAX_MIB_SIZE * bsize[0],
-            &src[sbr * stride * bsize[0] * MAX_MIB_SIZE +
-                 sbc * bsize[0] * MAX_MIB_SIZE],
-            cm->mi_cols * bsize[0], nhb, nvb, sbc, sbr, nhsb, nvsb, 0, dir, 0,
-            &bskip[MAX_MIB_SIZE * sbr * cm->mi_cols + MAX_MIB_SIZE * sbc],
-            cm->mi_cols, threshold, OD_DERING_NO_CHECK_OVERLAP, coeff_shift);
+        od_dering(dst, MAX_MIB_SIZE * bsize[0],
+                  &src[sbr * stride * bsize[0] * MAX_MIB_SIZE +
+                       sbc * bsize[0] * MAX_MIB_SIZE],
+                  cm->mi_cols * bsize[0], nhb, nvb, sbc, sbr, nhsb, nvsb, 0,
+                  dir, 0,
+                  &bskip[MAX_MIB_SIZE * sbr * cm->mi_cols + MAX_MIB_SIZE * sbc],
+                  cm->mi_cols, threshold, coeff_shift);
         cur_mse = (int)compute_dist(
             dst, MAX_MIB_SIZE * bsize[0],
             &ref_coeff[sbr * stride * bsize[0] * MAX_MIB_SIZE +
                        sbc * bsize[0] * MAX_MIB_SIZE],
             stride, nhb, nvb, coeff_shift);
-        mse[nhsb * sbr + sbc][level] = cur_mse;
-        tot_mse[level] += cur_mse;
-      }
-    }
-  }
-#if DERING_REFINEMENT
-  best_level = 0;
-  /* Search for the best global level one value at a time. */
-  for (global_level = 2; global_level < MAX_DERING_LEVEL; global_level++) {
-    double tot_mse = 0;
-    for (sbr = 0; sbr < nvsb; sbr++) {
-      for (sbc = 0; sbc < nhsb; sbc++) {
-        int gi;
-        int best_mse = mse[nhsb * sbr + sbc][0];
-        for (gi = 1; gi < 4; gi++) {
-          level = compute_level_from_index(global_level, gi);
-          if (mse[nhsb * sbr + sbc][level] < best_mse) {
-            best_mse = mse[nhsb * sbr + sbc][level];
-          }
-        }
-        tot_mse += best_mse;
-      }
-    }
-    if (tot_mse < best_tot_mse) {
-      best_level = global_level;
-      best_tot_mse = tot_mse;
-    }
-  }
-  for (sbr = 0; sbr < nvsb; sbr++) {
-    for (sbc = 0; sbc < nhsb; sbc++) {
-      int gi;
-      int best_gi;
-      int best_mse = mse[nhsb * sbr + sbc][0];
-      best_gi = 0;
-      for (gi = 1; gi < DERING_REFINEMENT_LEVELS; gi++) {
-        level = compute_level_from_index(best_level, gi);
-        if (mse[nhsb * sbr + sbc][level] < best_mse) {
+        if (cur_mse < best_mse) {
           best_gi = gi;
-          best_mse = mse[nhsb * sbr + sbc][level];
+          best_mse = cur_mse;
         }
       }
       cm->mi_grid_visible[MAX_MIB_SIZE * sbr * cm->mi_stride +
@@ -158,15 +134,8 @@
           ->mbmi.dering_gain = best_gi;
     }
   }
-#else
-    best_level = 0;
-    for (level = 0; level < MAX_DERING_LEVEL; level++) {
-      if (tot_mse[level] < tot_mse[best_level]) best_level = level;
-    }
-#endif
   aom_free(src);
   aom_free(ref_coeff);
   aom_free(bskip);
-  aom_free(mse);
   return best_level;
 }
diff --git a/av1/encoder/rd.c b/av1/encoder/rd.c
index ee65107..ff96714 100644
--- a/av1/encoder/rd.c
+++ b/av1/encoder/rd.c
@@ -153,9 +153,6 @@
 }
 
 void av1_fill_token_costs(av1_coeff_cost *c,
-#if CONFIG_ANS
-                          coeff_cdf_model (*cdf)[PLANE_TYPES],
-#endif  // CONFIG_ANS
                           av1_coeff_probs_model (*p)[PLANE_TYPES]) {
   int i, j, k, l;
   TX_SIZE t;
@@ -164,19 +161,11 @@
       for (j = 0; j < REF_TYPES; ++j)
         for (k = 0; k < COEF_BANDS; ++k)
           for (l = 0; l < BAND_COEFF_CONTEXTS(k); ++l) {
-#if CONFIG_ANS
-            const aom_prob *const tree_probs = p[t][i][j][k][l];
-            av1_cost_tokens_ans((int *)c[t][i][j][k][0][l], tree_probs,
-                                cdf[t][i][j][k][l], 0);
-            av1_cost_tokens_ans((int *)c[t][i][j][k][1][l], tree_probs,
-                                cdf[t][i][j][k][l], 1);
-#else
             aom_prob probs[ENTROPY_NODES];
             av1_model_to_full_probs(p[t][i][j][k][l], probs);
             av1_cost_tokens((int *)c[t][i][j][k][0][l], probs, av1_coef_tree);
             av1_cost_tokens_skip((int *)c[t][i][j][k][1][l], probs,
                                  av1_coef_tree);
-#endif  // CONFIG_ANS
             assert(c[t][i][j][k][0][l][EOB_TOKEN] ==
                    c[t][i][j][k][1][l][EOB_TOKEN]);
           }
@@ -387,11 +376,7 @@
 #endif
   }
   if (cpi->oxcf.pass != 1) {
-    av1_fill_token_costs(x->token_costs,
-#if CONFIG_ANS
-                         cm->fc->coef_cdfs,
-#endif  // CONFIG_ANS
-                         cm->fc->coef_probs);
+    av1_fill_token_costs(x->token_costs, cm->fc->coef_probs);
 
     if (cpi->sf.partition_search_type != VAR_BASED_PARTITION ||
         cm->frame_type == KEY_FRAME) {
diff --git a/av1/encoder/rd.h b/av1/encoder/rd.h
index 933733b..3ca4768 100644
--- a/av1/encoder/rd.h
+++ b/av1/encoder/rd.h
@@ -431,9 +431,6 @@
                                int best_mode_index);
 
 void av1_fill_token_costs(av1_coeff_cost *c,
-#if CONFIG_ANS
-                          coeff_cdf_model (*cdf)[PLANE_TYPES],
-#endif  // CONFIG_ANS
                           av1_coeff_probs_model (*p)[PLANE_TYPES]);
 
 static INLINE int rd_less_than_thresh(int64_t best_rd, int thresh,
diff --git a/av1/encoder/rdopt.c b/av1/encoder/rdopt.c
index 78d61e2..8707061 100644
--- a/av1/encoder/rdopt.c
+++ b/av1/encoder/rdopt.c
@@ -46,6 +46,7 @@
 #include "av1/encoder/ratectrl.h"
 #include "av1/encoder/rd.h"
 #include "av1/encoder/rdopt.h"
+#include "av1/encoder/tokenize.h"
 
 #if CONFIG_DUAL_FILTER
 #if CONFIG_EXT_INTERP
@@ -865,14 +866,14 @@
 }
 #endif  // CONFIG_AOM_HIGHBITDEPTH
 
-/* The trailing '0' is a terminator which is used inside cost_coeffs() to
+/* The trailing '0' is a terminator which is used inside av1_cost_coeffs() to
  * decide whether to include cost of a trailing EOB node or not (i.e. we
  * can skip this if the last coefficient in this transform block, e.g. the
  * 16th coefficient in a 4x4 block or the 64th coefficient in a 8x8 block,
  * were non-zero). */
-static int cost_coeffs(MACROBLOCK *x, int plane, int block, int coeff_ctx,
-                       TX_SIZE tx_size, const int16_t *scan, const int16_t *nb,
-                       int use_fast_coef_costing) {
+int av1_cost_coeffs(MACROBLOCK *x, int plane, int block, int coeff_ctx,
+                    TX_SIZE tx_size, const int16_t *scan, const int16_t *nb,
+                    int use_fast_coef_costing) {
   MACROBLOCKD *const xd = &x->e_mbd;
   MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
   const struct macroblock_plane *p = &x->plane[plane];
@@ -1064,8 +1065,9 @@
 
 static int rate_block(int plane, int block, int coeff_ctx, TX_SIZE tx_size,
                       struct rdcost_block_args *args) {
-  return cost_coeffs(args->x, plane, block, coeff_ctx, tx_size, args->so->scan,
-                     args->so->neighbors, args->use_fast_coef_costing);
+  return av1_cost_coeffs(args->x, plane, block, coeff_ctx, tx_size,
+                         args->so->scan, args->so->neighbors,
+                         args->use_fast_coef_costing);
 }
 
 static uint64_t sum_squares_2d(const int16_t *diff, int diff_stride,
@@ -1946,8 +1948,9 @@
             av1_xform_quant(x, 0, block, row + idy, col + idx, BLOCK_8X8,
                             TX_4X4, AV1_XFORM_QUANT_FP);
 #endif  // CONFIG_NEW_QUANT
-            ratey += cost_coeffs(x, 0, block, coeff_ctx, TX_4X4, so->scan,
-                                 so->neighbors, cpi->sf.use_fast_coef_costing);
+            ratey +=
+                av1_cost_coeffs(x, 0, block, coeff_ctx, TX_4X4, so->scan,
+                                so->neighbors, cpi->sf.use_fast_coef_costing);
             *(tempa + idx) = !(p->eobs[block] == 0);
             *(templ + idy) = !(p->eobs[block] == 0);
             can_skip &= (p->eobs[block] == 0);
@@ -1971,8 +1974,9 @@
                             TX_4X4, AV1_XFORM_QUANT_FP);
 #endif  // CONFIG_NEW_QUANT
             av1_optimize_b(x, 0, block, TX_4X4, coeff_ctx);
-            ratey += cost_coeffs(x, 0, block, coeff_ctx, TX_4X4, so->scan,
-                                 so->neighbors, cpi->sf.use_fast_coef_costing);
+            ratey +=
+                av1_cost_coeffs(x, 0, block, coeff_ctx, TX_4X4, so->scan,
+                                so->neighbors, cpi->sf.use_fast_coef_costing);
             *(tempa + idx) = !(p->eobs[block] == 0);
             *(templ + idy) = !(p->eobs[block] == 0);
             can_skip &= (p->eobs[block] == 0);
@@ -2064,8 +2068,9 @@
           av1_xform_quant(x, 0, block, row + idy, col + idx, BLOCK_8X8, TX_4X4,
                           AV1_XFORM_QUANT_B);
 #endif  // CONFIG_NEW_QUANT
-          ratey += cost_coeffs(x, 0, block, coeff_ctx, TX_4X4, so->scan,
-                               so->neighbors, cpi->sf.use_fast_coef_costing);
+          ratey +=
+              av1_cost_coeffs(x, 0, block, coeff_ctx, TX_4X4, so->scan,
+                              so->neighbors, cpi->sf.use_fast_coef_costing);
           *(tempa + idx) = !(p->eobs[block] == 0);
           *(templ + idy) = !(p->eobs[block] == 0);
           can_skip &= (p->eobs[block] == 0);
@@ -2088,8 +2093,9 @@
                           AV1_XFORM_QUANT_FP);
 #endif  // CONFIG_NEW_QUANT
           av1_optimize_b(x, 0, block, TX_4X4, coeff_ctx);
-          ratey += cost_coeffs(x, 0, block, coeff_ctx, TX_4X4, so->scan,
-                               so->neighbors, cpi->sf.use_fast_coef_costing);
+          ratey +=
+              av1_cost_coeffs(x, 0, block, coeff_ctx, TX_4X4, so->scan,
+                              so->neighbors, cpi->sf.use_fast_coef_costing);
           *(tempa + idx) = !(p->eobs[block] == 0);
           *(templ + idy) = !(p->eobs[block] == 0);
           can_skip &= (p->eobs[block] == 0);
@@ -2964,8 +2970,8 @@
     }
   }
   *dist += tmp * 16;
-  *rate += cost_coeffs(x, plane, block, coeff_ctx, tx_size, scan_order->scan,
-                       scan_order->neighbors, 0);
+  *rate += av1_cost_coeffs(x, plane, block, coeff_ctx, tx_size,
+                           scan_order->scan, scan_order->neighbors, 0);
   *skip &= (p->eobs[block] == 0);
 }
 
@@ -4374,8 +4380,8 @@
                  &dist, &ssz);
       thisdistortion += dist;
       thissse += ssz;
-      thisrate += cost_coeffs(x, 0, block, coeff_ctx, tx_size, so->scan,
-                              so->neighbors, cpi->sf.use_fast_coef_costing);
+      thisrate += av1_cost_coeffs(x, 0, block, coeff_ctx, tx_size, so->scan,
+                                  so->neighbors, cpi->sf.use_fast_coef_costing);
       *(ta + (k & 1)) = !(p->eobs[block] == 0);
       *(tl + (k >> 1)) = !(p->eobs[block] == 0);
 #if CONFIG_EXT_TX
diff --git a/av1/encoder/rdopt.h b/av1/encoder/rdopt.h
index eb0ff9f..584c439 100644
--- a/av1/encoder/rdopt.h
+++ b/av1/encoder/rdopt.h
@@ -26,6 +26,9 @@
 struct macroblock;
 struct RD_COST;
 
+int av1_cost_coeffs(MACROBLOCK *x, int plane, int block, int coeff_ctx,
+                    TX_SIZE tx_size, const int16_t *scan, const int16_t *nb,
+                    int use_fast_coef_costing);
 void av1_rd_pick_intra_mode_sb(struct AV1_COMP *cpi, struct macroblock *x,
                                struct RD_COST *rd_cost, BLOCK_SIZE bsize,
                                PICK_MODE_CONTEXT *ctx, int64_t best_rd);
diff --git a/av1/encoder/tokenize.c b/av1/encoder/tokenize.c
index 3bf2410..8095681 100644
--- a/av1/encoder/tokenize.c
+++ b/av1/encoder/tokenize.c
@@ -23,6 +23,7 @@
 
 #include "av1/encoder/cost.h"
 #include "av1/encoder/encoder.h"
+#include "av1/encoder/rdopt.h"
 #include "av1/encoder/tokenize.h"
 
 static const TOKENVALUE dct_cat_lt_10_value_tokens[] = {
@@ -346,8 +347,31 @@
   AV1_COMP *cpi;
   ThreadData *td;
   TOKENEXTRA **tp;
+  int this_rate;
 };
 
+static void cost_coeffs_b(int plane, int block, int blk_row, int blk_col,
+                          BLOCK_SIZE plane_bsize, TX_SIZE tx_size, void *arg) {
+  struct tokenize_b_args *const args = arg;
+  ThreadData *const td = args->td;
+  MACROBLOCK *const x = &td->mb;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
+  struct macroblock_plane *p = &x->plane[plane];
+  struct macroblockd_plane *pd = &xd->plane[plane];
+  const PLANE_TYPE type = pd->plane_type;
+  const int ref = is_inter_block(mbmi);
+  const TX_TYPE tx_type = get_tx_type(type, xd, block, tx_size);
+  const scan_order *const so = get_scan(tx_size, tx_type, ref);
+  int pt = get_entropy_context(tx_size, pd->above_context + blk_col,
+                               pd->left_context + blk_row);
+  int rate =
+      av1_cost_coeffs(x, plane, block, pt, tx_size, so->scan, so->neighbors, 0);
+  args->this_rate += rate;
+  av1_set_contexts(xd, pd, plane_bsize, tx_size, p->eobs[block] > 0, blk_col,
+                   blk_row);
+}
+
 static void set_entropy_context_b(int plane, int block, int blk_row,
                                   int blk_col, BLOCK_SIZE plane_bsize,
                                   TX_SIZE tx_size, void *arg) {
@@ -363,7 +387,7 @@
 
 static INLINE void add_token(TOKENEXTRA **t, const aom_prob *context_tree,
 #if CONFIG_ANS
-                             const rans_lut *token_cdf,
+                             const aom_cdf_prob (*token_cdf)[ENTROPY_TOKENS],
 #endif  // CONFIG_ANS
                              int32_t extra, uint8_t token,
                              uint8_t skip_eob_node, unsigned int *counts) {
@@ -378,25 +402,15 @@
   ++counts[token];
 }
 
-static INLINE void add_token_no_extra(TOKENEXTRA **t,
-                                      const aom_prob *context_tree,
-                                      uint8_t token, uint8_t skip_eob_node,
-                                      unsigned int *counts) {
-  (*t)->token = token;
-  (*t)->context_tree = context_tree;
-  (*t)->skip_eob_node = skip_eob_node;
-  (*t)++;
-  ++counts[token];
-}
-
 static INLINE int get_tx_eob(const struct segmentation *seg, int segment_id,
                              TX_SIZE tx_size) {
   const int eob_max = num_4x4_blocks_txsize_lookup[tx_size] << 4;
   return segfeature_active(seg, segment_id, SEG_LVL_SKIP) ? 0 : eob_max;
 }
 
-void av1_tokenize_palette_sb(struct ThreadData *const td, BLOCK_SIZE bsize,
-                             int plane, TOKENEXTRA **t) {
+void av1_tokenize_palette_sb(AV1_COMP *cpi, struct ThreadData *const td,
+                             int plane, TOKENEXTRA **t, RUN_TYPE dry_run,
+                             BLOCK_SIZE bsize, int *rate) {
   MACROBLOCK *const x = &td->mb;
   MACROBLOCKD *const xd = &x->e_mbd;
   MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
@@ -404,7 +418,8 @@
   PALETTE_MODE_INFO *pmi = &mbmi->palette_mode_info;
   int n = pmi->palette_size[plane != 0];
   int i, j, k;
-  int color_new_idx = -1, color_ctx, color_order[PALETTE_MAX_SIZE];
+  int this_rate = 0;
+  int color_idx = -1, color_ctx, color_order[PALETTE_MAX_SIZE];
   const int rows = (4 * num_4x4_blocks_high_lookup[bsize]) >>
                    (xd->plane[plane != 0].subsampling_y);
   const int cols = (4 * num_4x4_blocks_wide_lookup[bsize]) >>
@@ -419,16 +434,19 @@
           av1_get_palette_color_context(color_map, cols, i, j, n, color_order);
       for (k = 0; k < n; ++k)
         if (color_map[i * cols + j] == color_order[k]) {
-          color_new_idx = k;
+          color_idx = k;
           break;
         }
-      assert(color_new_idx >= 0 && color_new_idx < n);
-      (*t)->token = color_new_idx;
+      assert(color_idx >= 0 && color_idx < n);
+      if (dry_run == DRY_RUN_COSTCOEFFS)
+        this_rate += cpi->palette_y_color_cost[n - 2][color_ctx][color_idx];
+      (*t)->token = color_idx;
       (*t)->context_tree = probs[n - 2][color_ctx];
       (*t)->skip_eob_node = 0;
       ++(*t);
     }
   }
+  if (rate) *rate += this_rate;
 }
 
 static void tokenize_b(int plane, int block, int blk_row, int blk_col,
@@ -469,8 +487,8 @@
       cpi->common.fc->coef_probs[txsize_sqr_map[tx_size]][type][ref];
 #endif  // CONFIG_ENTROPY
 #if CONFIG_ANS
-  rans_lut(*const coef_cdfs)[COEFF_CONTEXTS] =
-      cpi->common.fc->coef_cdfs[txsize_sqr_map[tx_size]][type][ref];
+  aom_cdf_prob(*const coef_cdfs)[COEFF_CONTEXTS][ENTROPY_TOKENS] =
+      cpi->common.fc->coef_cdfs[tx_size][type][ref];
 #endif  // CONFIG_ANS
   unsigned int(*const eob_branch)[COEFF_CONTEXTS] =
       td->counts->eob_branch[txsize_sqr_map[tx_size]][type][ref];
@@ -493,7 +511,7 @@
 
     add_token(&t, coef_probs[band[c]][pt],
 #if CONFIG_ANS
-              (const rans_lut *)&coef_cdfs[band[c]][pt],
+              (const aom_cdf_prob(*)[ENTROPY_TOKENS]) & coef_cdfs[band[c]][pt],
 #endif  // CONFIG_ANS
               extra, (uint8_t)token, (uint8_t)skip_eob, counts[band[c]][pt]);
 
@@ -503,8 +521,11 @@
     skip_eob = (token == ZERO_TOKEN);
   }
   if (c < seg_eob) {
-    add_token_no_extra(&t, coef_probs[band[c]][pt], EOB_TOKEN, 0,
-                       counts[band[c]][pt]);
+    add_token(&t, coef_probs[band[c]][pt],
+#if CONFIG_ANS || CONFIG_DAALA_EC
+              NULL,
+#endif
+              0, EOB_TOKEN, 0, counts[band[c]][pt]);
     ++eob_branch[band[c]][pt];
   }
 
@@ -560,7 +581,7 @@
 }
 
 #if CONFIG_VAR_TX
-void tokenize_vartx(ThreadData *td, TOKENEXTRA **t, int dry_run,
+void tokenize_vartx(ThreadData *td, TOKENEXTRA **t, RUN_TYPE dry_run,
                     TX_SIZE tx_size, BLOCK_SIZE plane_bsize, int blk_row,
                     int blk_col, int block, int plane, void *arg) {
   MACROBLOCK *const x = &td->mb;
@@ -593,9 +614,11 @@
     BLOCK_SIZE plane_bsize = get_plane_block_size(mbmi->sb_type, pd);
     if (!dry_run)
       tokenize_b(plane, block, blk_row, blk_col, plane_bsize, tx_size, arg);
-    else
+    else if (dry_run == DRY_RUN_NORMAL)
       set_entropy_context_b(plane, block, blk_row, blk_col, plane_bsize,
                             tx_size, arg);
+    else if (dry_run == DRY_RUN_COSTCOEFFS)
+      cost_coeffs_b(plane, block, blk_row, blk_col, plane_bsize, tx_size, arg);
   } else {
     int bsl = b_width_log2_lookup[bsize];
     int i;
@@ -617,8 +640,8 @@
 }
 
 void av1_tokenize_sb_vartx(AV1_COMP *cpi, ThreadData *td, TOKENEXTRA **t,
-                           int dry_run, int mi_row, int mi_col,
-                           BLOCK_SIZE bsize) {
+                           RUN_TYPE dry_run, int mi_row, int mi_col,
+                           BLOCK_SIZE bsize, int *rate) {
   AV1_COMMON *const cm = &cpi->common;
   MACROBLOCK *const x = &td->mb;
   MACROBLOCKD *const xd = &x->e_mbd;
@@ -627,7 +650,7 @@
   const int ctx = av1_get_skip_context(xd);
   const int skip_inc =
       !segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP);
-  struct tokenize_b_args arg = { cpi, td, t };
+  struct tokenize_b_args arg = { cpi, td, t, 0 };
   int plane;
   if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return;
 
@@ -667,11 +690,12 @@
       (*t)++;
     }
   }
+  if (rate) *rate += arg.this_rate;
 }
 #endif  // CONFIG_VAR_TX
 
-void av1_tokenize_sb(AV1_COMP *cpi, ThreadData *td, TOKENEXTRA **t, int dry_run,
-                     BLOCK_SIZE bsize) {
+void av1_tokenize_sb(AV1_COMP *cpi, ThreadData *td, TOKENEXTRA **t,
+                     RUN_TYPE dry_run, BLOCK_SIZE bsize, int *rate) {
   AV1_COMMON *const cm = &cpi->common;
   MACROBLOCK *const x = &td->mb;
   MACROBLOCKD *const xd = &x->e_mbd;
@@ -679,7 +703,7 @@
   const int ctx = av1_get_skip_context(xd);
   const int skip_inc =
       !segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP);
-  struct tokenize_b_args arg = { cpi, td, t };
+  struct tokenize_b_args arg = { cpi, td, t, 0 };
   if (mbmi->skip) {
     if (!dry_run) td->counts->skip[ctx][1] += skip_inc;
     reset_skip_context(xd, bsize);
@@ -697,14 +721,17 @@
       (*t)->token = EOSB_TOKEN;
       (*t)++;
     }
-  } else {
+  } else if (dry_run == DRY_RUN_NORMAL) {
     av1_foreach_transformed_block(xd, bsize, set_entropy_context_b, &arg);
+  } else if (dry_run == DRY_RUN_COSTCOEFFS) {
+    av1_foreach_transformed_block(xd, bsize, cost_coeffs_b, &arg);
   }
+  if (rate) *rate += arg.this_rate;
 }
 
 #if CONFIG_SUPERTX
 void av1_tokenize_sb_supertx(AV1_COMP *cpi, ThreadData *td, TOKENEXTRA **t,
-                             int dry_run, BLOCK_SIZE bsize) {
+                             RUN_TYPE dry_run, BLOCK_SIZE bsize, int *rate) {
   AV1_COMMON *const cm = &cpi->common;
   MACROBLOCKD *const xd = &td->mb.e_mbd;
   MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
@@ -712,7 +739,7 @@
   const int ctx = av1_get_skip_context(xd);
   const int skip_inc =
       !segfeature_active(&cm->seg, mbmi->segment_id_supertx, SEG_LVL_SKIP);
-  struct tokenize_b_args arg = { cpi, td, t };
+  struct tokenize_b_args arg = { cpi, td, t, 0 };
   if (mbmi->skip) {
     if (!dry_run) td->counts->skip[ctx][1] += skip_inc;
     reset_skip_context(xd, bsize);
@@ -730,9 +757,12 @@
       (*t)->token = EOSB_TOKEN;
       (*t)++;
     }
-  } else {
+  } else if (dry_run == DRY_RUN_NORMAL) {
     av1_foreach_transformed_block(xd, bsize, set_entropy_context_b, &arg);
     *t = t_backup;
+  } else if (dry_run == DRY_RUN_COSTCOEFFS) {
+    av1_foreach_transformed_block(xd, bsize, cost_coeffs_b, &arg);
   }
+  if (rate) *rate += arg.this_rate;
 }
 #endif  // CONFIG_SUPERTX
diff --git a/av1/encoder/tokenize.h b/av1/encoder/tokenize.h
index a7e30d5..f20848a 100644
--- a/av1/encoder/tokenize.h
+++ b/av1/encoder/tokenize.h
@@ -37,7 +37,7 @@
 typedef struct {
   const aom_prob *context_tree;
 #if CONFIG_ANS
-  const rans_lut *token_cdf;
+  const aom_cdf_prob (*token_cdf)[ENTROPY_TOKENS];
 #endif  // CONFIG_ANS
   EXTRABIT extra;
   uint8_t token;
@@ -56,19 +56,31 @@
 struct AV1_COMP;
 struct ThreadData;
 
+typedef enum {
+  OUTPUT_ENABLED = 0,
+  DRY_RUN_NORMAL,
+  DRY_RUN_COSTCOEFFS,
+} RUN_TYPE;
+
+// Note: in all the tokenize functions, rate (if non-NULL) is incremented
+// with the coefficient token cost only when dry_run == DRY_RUN_COSTCOEFFS;
+// otherwise rate is not incremented.
 #if CONFIG_VAR_TX
 void av1_tokenize_sb_vartx(struct AV1_COMP *cpi, struct ThreadData *td,
-                           TOKENEXTRA **t, int dry_run, int mi_row, int mi_col,
-                           BLOCK_SIZE bsize);
+                           TOKENEXTRA **t, RUN_TYPE dry_run, int mi_row,
+                           int mi_col, BLOCK_SIZE bsize, int *rate);
 #endif
 
-void av1_tokenize_palette_sb(struct ThreadData *const td, BLOCK_SIZE bsize,
-                             int plane, TOKENEXTRA **t);
+void av1_tokenize_palette_sb(struct AV1_COMP *cpi, struct ThreadData *const td,
+                             int plane, TOKENEXTRA **t, RUN_TYPE dry_run,
+                             BLOCK_SIZE bsize, int *rate);
 void av1_tokenize_sb(struct AV1_COMP *cpi, struct ThreadData *td,
-                     TOKENEXTRA **t, int dry_run, BLOCK_SIZE bsize);
+                     TOKENEXTRA **t, RUN_TYPE dry_run, BLOCK_SIZE bsize,
+                     int *rate);
 #if CONFIG_SUPERTX
 void av1_tokenize_sb_supertx(struct AV1_COMP *cpi, struct ThreadData *td,
-                             TOKENEXTRA **t, int dry_run, BLOCK_SIZE bsize);
+                             TOKENEXTRA **t, RUN_TYPE dry_run, BLOCK_SIZE bsize,
+                             int *rate);
 #endif
 
 extern const int16_t *av1_dct_value_cost_ptr;
diff --git a/av1/encoder/x86/hybrid_fwd_txfm_avx2.c b/av1/encoder/x86/hybrid_fwd_txfm_avx2.c
index b23d39d..69bf89a 100644
--- a/av1/encoder/x86/hybrid_fwd_txfm_avx2.c
+++ b/av1/encoder/x86/hybrid_fwd_txfm_avx2.c
@@ -25,8 +25,7 @@
   *u = _mm256_permute2x128_si256(v, v, 1);
 }
 
-void aom_fdct16x16_1_avx2(const int16_t *input, tran_low_t *output,
-                          int stride) {
+static int32_t get_16x16_sum(const int16_t *input, int stride) {
   __m256i r0, r1, r2, r3, u0, u1;
   __m256i zero = _mm256_setzero_si256();
   __m256i sum = _mm256_setzero_si256();
@@ -61,8 +60,14 @@
                      _mm256_castsi256_si128(u1));
   v1 = _mm_srli_si128(v0, 4);
   v0 = _mm_add_epi32(v0, v1);
-  v0 = _mm_srai_epi32(v0, 1);
-  output[0] = (tran_low_t)_mm_extract_epi32(v0, 0);
+  return (int32_t)_mm_extract_epi32(v0, 0);
+}
+
+void aom_fdct16x16_1_avx2(const int16_t *input, tran_low_t *output,
+                          int stride) {
+  int32_t dc = get_16x16_sum(input, stride);
+  output[0] = (tran_low_t)(dc >> 1);
+  _mm256_zeroupper();
 }
 
 static void mm256_transpose_16x16(__m256i *in) {
@@ -559,8 +564,6 @@
   x1 = _mm256_unpackhi_epi16(u3, u4);
   in[13] = butter_fly(x0, x1, cospi_p06_p26);
   in[3] = butter_fly(x0, x1, cospi_m26_p06);
-
-  mm256_transpose_16x16(in);
 }
 
 void fadst16_avx2(__m256i *in) {
@@ -1105,8 +1108,6 @@
   in[3] = _mm256_sub_epi16(zero, x4);
   in[13] = _mm256_sub_epi16(zero, x13);
   in[15] = _mm256_sub_epi16(zero, x1);
-
-  mm256_transpose_16x16(in);
 }
 
 #if CONFIG_EXT_TX
@@ -1134,7 +1135,6 @@
     in[i] = _mm256_packs_epi32(u0, u1);
     i++;
   }
-  mm256_transpose_16x16(in);
 }
 #endif
 
@@ -1146,24 +1146,28 @@
     case DCT_DCT:
       load_buffer_16x16(input, stride, 0, 0, in);
       fdct16_avx2(in);
+      mm256_transpose_16x16(in);
       right_shift_16x16(in);
       fdct16_avx2(in);
       break;
     case ADST_DCT:
       load_buffer_16x16(input, stride, 0, 0, in);
       fadst16_avx2(in);
+      mm256_transpose_16x16(in);
       right_shift_16x16(in);
       fdct16_avx2(in);
       break;
     case DCT_ADST:
       load_buffer_16x16(input, stride, 0, 0, in);
       fdct16_avx2(in);
+      mm256_transpose_16x16(in);
       right_shift_16x16(in);
       fadst16_avx2(in);
       break;
     case ADST_ADST:
       load_buffer_16x16(input, stride, 0, 0, in);
       fadst16_avx2(in);
+      mm256_transpose_16x16(in);
       right_shift_16x16(in);
       fadst16_avx2(in);
       break;
@@ -1171,71 +1175,698 @@
     case FLIPADST_DCT:
       load_buffer_16x16(input, stride, 1, 0, in);
       fadst16_avx2(in);
+      mm256_transpose_16x16(in);
       right_shift_16x16(in);
       fdct16_avx2(in);
       break;
     case DCT_FLIPADST:
       load_buffer_16x16(input, stride, 0, 1, in);
       fdct16_avx2(in);
+      mm256_transpose_16x16(in);
       right_shift_16x16(in);
       fadst16_avx2(in);
       break;
     case FLIPADST_FLIPADST:
       load_buffer_16x16(input, stride, 1, 1, in);
       fadst16_avx2(in);
+      mm256_transpose_16x16(in);
       right_shift_16x16(in);
       fadst16_avx2(in);
       break;
     case ADST_FLIPADST:
       load_buffer_16x16(input, stride, 0, 1, in);
       fadst16_avx2(in);
+      mm256_transpose_16x16(in);
       right_shift_16x16(in);
       fadst16_avx2(in);
       break;
     case FLIPADST_ADST:
       load_buffer_16x16(input, stride, 1, 0, in);
       fadst16_avx2(in);
+      mm256_transpose_16x16(in);
       right_shift_16x16(in);
       fadst16_avx2(in);
       break;
     case V_DCT:
       load_buffer_16x16(input, stride, 0, 0, in);
       fdct16_avx2(in);
+      mm256_transpose_16x16(in);
       right_shift_16x16(in);
       fidtx16_avx2(in);
       break;
     case H_DCT:
       load_buffer_16x16(input, stride, 0, 0, in);
       fidtx16_avx2(in);
+      mm256_transpose_16x16(in);
       right_shift_16x16(in);
       fdct16_avx2(in);
       break;
     case V_ADST:
       load_buffer_16x16(input, stride, 0, 0, in);
       fadst16_avx2(in);
+      mm256_transpose_16x16(in);
       right_shift_16x16(in);
       fidtx16_avx2(in);
       break;
     case H_ADST:
       load_buffer_16x16(input, stride, 0, 0, in);
       fidtx16_avx2(in);
+      mm256_transpose_16x16(in);
       right_shift_16x16(in);
       fadst16_avx2(in);
       break;
     case V_FLIPADST:
       load_buffer_16x16(input, stride, 1, 0, in);
       fadst16_avx2(in);
+      mm256_transpose_16x16(in);
       right_shift_16x16(in);
       fidtx16_avx2(in);
       break;
     case H_FLIPADST:
       load_buffer_16x16(input, stride, 0, 1, in);
       fidtx16_avx2(in);
+      mm256_transpose_16x16(in);
       right_shift_16x16(in);
       fadst16_avx2(in);
       break;
 #endif  // CONFIG_EXT_TX
     default: assert(0); break;
   }
+  mm256_transpose_16x16(in);
   write_buffer_16x16(in, 16, output);
+  _mm256_zeroupper();
+}
+
+void aom_fdct32x32_1_avx2(const int16_t *input, tran_low_t *output,
+                          int stride) {
+  // top-left 16x16 quadrant
+  int32_t sum = get_16x16_sum(input, stride);
+  // top-right 16x16 quadrant
+  sum += get_16x16_sum(input + 16, stride);
+  // bottom-left 16x16 quadrant
+  sum += get_16x16_sum(input + (stride << 4), stride);
+  // bottom-right 16x16 quadrant
+  sum += get_16x16_sum(input + (stride << 4) + 16, stride);
+
+  sum >>= 3;
+  output[0] = (tran_low_t)sum;
+  _mm256_zeroupper();
+}
+
+#if CONFIG_EXT_TX
+static void mm256_vectors_swap(__m256i *a0, __m256i *a1, const int size) {
+  int i = 0;
+  __m256i temp;
+  while (i < size) {
+    temp = a0[i];
+    a0[i] = a1[i];
+    a1[i] = temp;
+    i++;
+  }
+}
+
+static void mm256_transpose_32x32(__m256i *in0, __m256i *in1) {
+  mm256_transpose_16x16(in0);
+  mm256_transpose_16x16(&in0[16]);
+  mm256_transpose_16x16(in1);
+  mm256_transpose_16x16(&in1[16]);
+  mm256_vectors_swap(&in0[16], in1, 16);
+}
+
+static void prepare_16x16_even(const __m256i *in, __m256i *even) {
+  even[0] = _mm256_add_epi16(in[0], in[31]);
+  even[1] = _mm256_add_epi16(in[1], in[30]);
+  even[2] = _mm256_add_epi16(in[2], in[29]);
+  even[3] = _mm256_add_epi16(in[3], in[28]);
+  even[4] = _mm256_add_epi16(in[4], in[27]);
+  even[5] = _mm256_add_epi16(in[5], in[26]);
+  even[6] = _mm256_add_epi16(in[6], in[25]);
+  even[7] = _mm256_add_epi16(in[7], in[24]);
+  even[8] = _mm256_add_epi16(in[8], in[23]);
+  even[9] = _mm256_add_epi16(in[9], in[22]);
+  even[10] = _mm256_add_epi16(in[10], in[21]);
+  even[11] = _mm256_add_epi16(in[11], in[20]);
+  even[12] = _mm256_add_epi16(in[12], in[19]);
+  even[13] = _mm256_add_epi16(in[13], in[18]);
+  even[14] = _mm256_add_epi16(in[14], in[17]);
+  even[15] = _mm256_add_epi16(in[15], in[16]);
+}
+
+static void prepare_16x16_odd(const __m256i *in, __m256i *odd) {
+  odd[0] = _mm256_sub_epi16(in[15], in[16]);
+  odd[1] = _mm256_sub_epi16(in[14], in[17]);
+  odd[2] = _mm256_sub_epi16(in[13], in[18]);
+  odd[3] = _mm256_sub_epi16(in[12], in[19]);
+  odd[4] = _mm256_sub_epi16(in[11], in[20]);
+  odd[5] = _mm256_sub_epi16(in[10], in[21]);
+  odd[6] = _mm256_sub_epi16(in[9], in[22]);
+  odd[7] = _mm256_sub_epi16(in[8], in[23]);
+  odd[8] = _mm256_sub_epi16(in[7], in[24]);
+  odd[9] = _mm256_sub_epi16(in[6], in[25]);
+  odd[10] = _mm256_sub_epi16(in[5], in[26]);
+  odd[11] = _mm256_sub_epi16(in[4], in[27]);
+  odd[12] = _mm256_sub_epi16(in[3], in[28]);
+  odd[13] = _mm256_sub_epi16(in[2], in[29]);
+  odd[14] = _mm256_sub_epi16(in[1], in[30]);
+  odd[15] = _mm256_sub_epi16(in[0], in[31]);
+}
+
+static void collect_16col(const __m256i *even, const __m256i *odd,
+                          __m256i *out) {
+  // fdct16_avx2() already maps the output
+  out[0] = even[0];
+  out[2] = even[1];
+  out[4] = even[2];
+  out[6] = even[3];
+  out[8] = even[4];
+  out[10] = even[5];
+  out[12] = even[6];
+  out[14] = even[7];
+  out[16] = even[8];
+  out[18] = even[9];
+  out[20] = even[10];
+  out[22] = even[11];
+  out[24] = even[12];
+  out[26] = even[13];
+  out[28] = even[14];
+  out[30] = even[15];
+
+  out[1] = odd[0];
+  out[17] = odd[1];
+  out[9] = odd[2];
+  out[25] = odd[3];
+  out[5] = odd[4];
+  out[21] = odd[5];
+  out[13] = odd[6];
+  out[29] = odd[7];
+  out[3] = odd[8];
+  out[19] = odd[9];
+  out[11] = odd[10];
+  out[27] = odd[11];
+  out[7] = odd[12];
+  out[23] = odd[13];
+  out[15] = odd[14];
+  out[31] = odd[15];
+}
+
+static void collect_coeffs(const __m256i *first_16col_even,
+                           const __m256i *first_16col_odd,
+                           const __m256i *second_16col_even,
+                           const __m256i *second_16col_odd, __m256i *in0,
+                           __m256i *in1) {
+  collect_16col(first_16col_even, first_16col_odd, in0);
+  collect_16col(second_16col_even, second_16col_odd, in1);
+}
+
+static void fdct16_odd_avx2(__m256i *in) {
+  // sequence: cospi_L_H = pairs(L, H) and L first
+  const __m256i cospi_p16_p16 = pair256_set_epi16(cospi_16_64, cospi_16_64);
+  const __m256i cospi_m16_p16 = pair256_set_epi16(-cospi_16_64, cospi_16_64);
+  const __m256i cospi_m08_p24 = pair256_set_epi16(-cospi_8_64, cospi_24_64);
+  const __m256i cospi_p24_p08 = pair256_set_epi16(cospi_24_64, cospi_8_64);
+  const __m256i cospi_m24_m08 = pair256_set_epi16(-cospi_24_64, -cospi_8_64);
+  const __m256i cospi_m04_p28 = pair256_set_epi16(-cospi_4_64, cospi_28_64);
+  const __m256i cospi_p28_p04 = pair256_set_epi16(cospi_28_64, cospi_4_64);
+  const __m256i cospi_m28_m04 = pair256_set_epi16(-cospi_28_64, -cospi_4_64);
+  const __m256i cospi_m20_p12 = pair256_set_epi16(-cospi_20_64, cospi_12_64);
+  const __m256i cospi_p12_p20 = pair256_set_epi16(cospi_12_64, cospi_20_64);
+  const __m256i cospi_m12_m20 = pair256_set_epi16(-cospi_12_64, -cospi_20_64);
+
+  const __m256i cospi_p31_p01 = pair256_set_epi16(cospi_31_64, cospi_1_64);
+  const __m256i cospi_m01_p31 = pair256_set_epi16(-cospi_1_64, cospi_31_64);
+  const __m256i cospi_p15_p17 = pair256_set_epi16(cospi_15_64, cospi_17_64);
+  const __m256i cospi_m17_p15 = pair256_set_epi16(-cospi_17_64, cospi_15_64);
+  const __m256i cospi_p23_p09 = pair256_set_epi16(cospi_23_64, cospi_9_64);
+  const __m256i cospi_m09_p23 = pair256_set_epi16(-cospi_9_64, cospi_23_64);
+  const __m256i cospi_p07_p25 = pair256_set_epi16(cospi_7_64, cospi_25_64);
+  const __m256i cospi_m25_p07 = pair256_set_epi16(-cospi_25_64, cospi_7_64);
+  const __m256i cospi_p27_p05 = pair256_set_epi16(cospi_27_64, cospi_5_64);
+  const __m256i cospi_m05_p27 = pair256_set_epi16(-cospi_5_64, cospi_27_64);
+  const __m256i cospi_p11_p21 = pair256_set_epi16(cospi_11_64, cospi_21_64);
+  const __m256i cospi_m21_p11 = pair256_set_epi16(-cospi_21_64, cospi_11_64);
+  const __m256i cospi_p19_p13 = pair256_set_epi16(cospi_19_64, cospi_13_64);
+  const __m256i cospi_m13_p19 = pair256_set_epi16(-cospi_13_64, cospi_19_64);
+  const __m256i cospi_p03_p29 = pair256_set_epi16(cospi_3_64, cospi_29_64);
+  const __m256i cospi_m29_p03 = pair256_set_epi16(-cospi_29_64, cospi_3_64);
+
+  __m256i x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15;
+  __m256i y0, y1, y2, y3, y4, y5, y6, y7, y8, y9, y10, y11, y12, y13, y14, y15;
+  __m256i u0, u1;
+
+  // stage 1 is in prepare_16x16_odd()
+
+  // stage 2
+  y0 = in[0];
+  y1 = in[1];
+  y2 = in[2];
+  y3 = in[3];
+
+  u0 = _mm256_unpacklo_epi16(in[4], in[11]);
+  u1 = _mm256_unpackhi_epi16(in[4], in[11]);
+  y4 = butter_fly(u0, u1, cospi_m16_p16);
+  y11 = butter_fly(u0, u1, cospi_p16_p16);
+
+  u0 = _mm256_unpacklo_epi16(in[5], in[10]);
+  u1 = _mm256_unpackhi_epi16(in[5], in[10]);
+  y5 = butter_fly(u0, u1, cospi_m16_p16);
+  y10 = butter_fly(u0, u1, cospi_p16_p16);
+
+  u0 = _mm256_unpacklo_epi16(in[6], in[9]);
+  u1 = _mm256_unpackhi_epi16(in[6], in[9]);
+  y6 = butter_fly(u0, u1, cospi_m16_p16);
+  y9 = butter_fly(u0, u1, cospi_p16_p16);
+
+  u0 = _mm256_unpacklo_epi16(in[7], in[8]);
+  u1 = _mm256_unpackhi_epi16(in[7], in[8]);
+  y7 = butter_fly(u0, u1, cospi_m16_p16);
+  y8 = butter_fly(u0, u1, cospi_p16_p16);
+
+  y12 = in[12];
+  y13 = in[13];
+  y14 = in[14];
+  y15 = in[15];
+
+  // stage 3
+  x0 = _mm256_add_epi16(y0, y7);
+  x1 = _mm256_add_epi16(y1, y6);
+  x2 = _mm256_add_epi16(y2, y5);
+  x3 = _mm256_add_epi16(y3, y4);
+  x4 = _mm256_sub_epi16(y3, y4);
+  x5 = _mm256_sub_epi16(y2, y5);
+  x6 = _mm256_sub_epi16(y1, y6);
+  x7 = _mm256_sub_epi16(y0, y7);
+  x8 = _mm256_sub_epi16(y15, y8);
+  x9 = _mm256_sub_epi16(y14, y9);
+  x10 = _mm256_sub_epi16(y13, y10);
+  x11 = _mm256_sub_epi16(y12, y11);
+  x12 = _mm256_add_epi16(y12, y11);
+  x13 = _mm256_add_epi16(y13, y10);
+  x14 = _mm256_add_epi16(y14, y9);
+  x15 = _mm256_add_epi16(y15, y8);
+
+  // stage 4
+  y0 = x0;
+  y1 = x1;
+  y6 = x6;
+  y7 = x7;
+  y8 = x8;
+  y9 = x9;
+  y14 = x14;
+  y15 = x15;
+
+  u0 = _mm256_unpacklo_epi16(x2, x13);
+  u1 = _mm256_unpackhi_epi16(x2, x13);
+  y2 = butter_fly(u0, u1, cospi_m08_p24);
+  y13 = butter_fly(u0, u1, cospi_p24_p08);
+
+  u0 = _mm256_unpacklo_epi16(x3, x12);
+  u1 = _mm256_unpackhi_epi16(x3, x12);
+  y3 = butter_fly(u0, u1, cospi_m08_p24);
+  y12 = butter_fly(u0, u1, cospi_p24_p08);
+
+  u0 = _mm256_unpacklo_epi16(x4, x11);
+  u1 = _mm256_unpackhi_epi16(x4, x11);
+  y4 = butter_fly(u0, u1, cospi_m24_m08);
+  y11 = butter_fly(u0, u1, cospi_m08_p24);
+
+  u0 = _mm256_unpacklo_epi16(x5, x10);
+  u1 = _mm256_unpackhi_epi16(x5, x10);
+  y5 = butter_fly(u0, u1, cospi_m24_m08);
+  y10 = butter_fly(u0, u1, cospi_m08_p24);
+
+  // stage 5
+  x0 = _mm256_add_epi16(y0, y3);
+  x1 = _mm256_add_epi16(y1, y2);
+  x2 = _mm256_sub_epi16(y1, y2);
+  x3 = _mm256_sub_epi16(y0, y3);
+  x4 = _mm256_sub_epi16(y7, y4);
+  x5 = _mm256_sub_epi16(y6, y5);
+  x6 = _mm256_add_epi16(y6, y5);
+  x7 = _mm256_add_epi16(y7, y4);
+
+  x8 = _mm256_add_epi16(y8, y11);
+  x9 = _mm256_add_epi16(y9, y10);
+  x10 = _mm256_sub_epi16(y9, y10);
+  x11 = _mm256_sub_epi16(y8, y11);
+  x12 = _mm256_sub_epi16(y15, y12);
+  x13 = _mm256_sub_epi16(y14, y13);
+  x14 = _mm256_add_epi16(y14, y13);
+  x15 = _mm256_add_epi16(y15, y12);
+
+  // stage 6
+  y0 = x0;
+  y3 = x3;
+  y4 = x4;
+  y7 = x7;
+  y8 = x8;
+  y11 = x11;
+  y12 = x12;
+  y15 = x15;
+
+  u0 = _mm256_unpacklo_epi16(x1, x14);
+  u1 = _mm256_unpackhi_epi16(x1, x14);
+  y1 = butter_fly(u0, u1, cospi_m04_p28);
+  y14 = butter_fly(u0, u1, cospi_p28_p04);
+
+  u0 = _mm256_unpacklo_epi16(x2, x13);
+  u1 = _mm256_unpackhi_epi16(x2, x13);
+  y2 = butter_fly(u0, u1, cospi_m28_m04);
+  y13 = butter_fly(u0, u1, cospi_m04_p28);
+
+  u0 = _mm256_unpacklo_epi16(x5, x10);
+  u1 = _mm256_unpackhi_epi16(x5, x10);
+  y5 = butter_fly(u0, u1, cospi_m20_p12);
+  y10 = butter_fly(u0, u1, cospi_p12_p20);
+
+  u0 = _mm256_unpacklo_epi16(x6, x9);
+  u1 = _mm256_unpackhi_epi16(x6, x9);
+  y6 = butter_fly(u0, u1, cospi_m12_m20);
+  y9 = butter_fly(u0, u1, cospi_m20_p12);
+
+  // stage 7
+  x0 = _mm256_add_epi16(y0, y1);
+  x1 = _mm256_sub_epi16(y0, y1);
+  x2 = _mm256_sub_epi16(y3, y2);
+  x3 = _mm256_add_epi16(y3, y2);
+  x4 = _mm256_add_epi16(y4, y5);
+  x5 = _mm256_sub_epi16(y4, y5);
+  x6 = _mm256_sub_epi16(y7, y6);
+  x7 = _mm256_add_epi16(y7, y6);
+
+  x8 = _mm256_add_epi16(y8, y9);
+  x9 = _mm256_sub_epi16(y8, y9);
+  x10 = _mm256_sub_epi16(y11, y10);
+  x11 = _mm256_add_epi16(y11, y10);
+  x12 = _mm256_add_epi16(y12, y13);
+  x13 = _mm256_sub_epi16(y12, y13);
+  x14 = _mm256_sub_epi16(y15, y14);
+  x15 = _mm256_add_epi16(y15, y14);
+
+  // stage 8
+  u0 = _mm256_unpacklo_epi16(x0, x15);
+  u1 = _mm256_unpackhi_epi16(x0, x15);
+  in[0] = butter_fly(u0, u1, cospi_p31_p01);
+  in[15] = butter_fly(u0, u1, cospi_m01_p31);
+
+  u0 = _mm256_unpacklo_epi16(x1, x14);
+  u1 = _mm256_unpackhi_epi16(x1, x14);
+  in[1] = butter_fly(u0, u1, cospi_p15_p17);
+  in[14] = butter_fly(u0, u1, cospi_m17_p15);
+
+  u0 = _mm256_unpacklo_epi16(x2, x13);
+  u1 = _mm256_unpackhi_epi16(x2, x13);
+  in[2] = butter_fly(u0, u1, cospi_p23_p09);
+  in[13] = butter_fly(u0, u1, cospi_m09_p23);
+
+  u0 = _mm256_unpacklo_epi16(x3, x12);
+  u1 = _mm256_unpackhi_epi16(x3, x12);
+  in[3] = butter_fly(u0, u1, cospi_p07_p25);
+  in[12] = butter_fly(u0, u1, cospi_m25_p07);
+
+  u0 = _mm256_unpacklo_epi16(x4, x11);
+  u1 = _mm256_unpackhi_epi16(x4, x11);
+  in[4] = butter_fly(u0, u1, cospi_p27_p05);
+  in[11] = butter_fly(u0, u1, cospi_m05_p27);
+
+  u0 = _mm256_unpacklo_epi16(x5, x10);
+  u1 = _mm256_unpackhi_epi16(x5, x10);
+  in[5] = butter_fly(u0, u1, cospi_p11_p21);
+  in[10] = butter_fly(u0, u1, cospi_m21_p11);
+
+  u0 = _mm256_unpacklo_epi16(x6, x9);
+  u1 = _mm256_unpackhi_epi16(x6, x9);
+  in[6] = butter_fly(u0, u1, cospi_p19_p13);
+  in[9] = butter_fly(u0, u1, cospi_m13_p19);
+
+  u0 = _mm256_unpacklo_epi16(x7, x8);
+  u1 = _mm256_unpackhi_epi16(x7, x8);
+  in[7] = butter_fly(u0, u1, cospi_p03_p29);
+  in[8] = butter_fly(u0, u1, cospi_m29_p03);
+}
+
+static void fdct32_avx2(__m256i *in0, __m256i *in1) {
+  __m256i even0[16], even1[16], odd0[16], odd1[16];
+  prepare_16x16_even(in0, even0);
+  fdct16_avx2(even0);
+
+  prepare_16x16_odd(in0, odd0);
+  fdct16_odd_avx2(odd0);
+
+  prepare_16x16_even(in1, even1);
+  fdct16_avx2(even1);
+
+  prepare_16x16_odd(in1, odd1);
+  fdct16_odd_avx2(odd1);
+
+  collect_coeffs(even0, odd0, even1, odd1, in0, in1);
+
+  mm256_transpose_32x32(in0, in1);
+}
+#endif  // CONFIG_EXT_TX
+
+static INLINE void write_buffer_32x32(const __m256i *in0, const __m256i *in1,
+                                      int stride, tran_low_t *output) {
+  int i = 0;
+  tran_low_t *coeff = output;
+  while (i < 32) {
+    _mm256_storeu_si256((__m256i *)coeff, in0[i]);
+    _mm256_storeu_si256((__m256i *)(coeff + 16), in1[i]);
+    coeff += stride;
+    i += 1;
+  }
+}
+
+#if CONFIG_EXT_TX
+static void fhalfright32_16col_avx2(__m256i *in) {
+  int i = 0;
+  const __m256i zero = _mm256_setzero_si256();
+  const __m256i sqrt2 = _mm256_set1_epi16(Sqrt2);
+  const __m256i dct_rounding = _mm256_set1_epi32(DCT_CONST_ROUNDING);
+  __m256i x0, x1;
+
+  while (i < 16) {
+    in[i] = _mm256_slli_epi16(in[i], 2);
+    x0 = _mm256_unpacklo_epi16(in[i + 16], zero);
+    x1 = _mm256_unpackhi_epi16(in[i + 16], zero);
+    x0 = _mm256_madd_epi16(x0, sqrt2);
+    x1 = _mm256_madd_epi16(x1, sqrt2);
+    x0 = _mm256_add_epi32(x0, dct_rounding);
+    x1 = _mm256_add_epi32(x1, dct_rounding);
+    x0 = _mm256_srai_epi32(x0, DCT_CONST_BITS);
+    x1 = _mm256_srai_epi32(x1, DCT_CONST_BITS);
+    in[i + 16] = _mm256_packs_epi32(x0, x1);
+    i += 1;
+  }
+  fdct16_avx2(&in[16]);
+}
+
+static void fhalfright32_avx2(__m256i *in0, __m256i *in1) {
+  fhalfright32_16col_avx2(in0);
+  fhalfright32_16col_avx2(in1);
+  mm256_vectors_swap(in0, &in0[16], 16);
+  mm256_vectors_swap(in1, &in1[16], 16);
+  mm256_transpose_32x32(in0, in1);
+}
+
+static void load_buffer_32x32(const int16_t *input, int stride, int flipud,
+                              int fliplr, __m256i *in0, __m256i *in1) {
+  // Load 4 16x16 blocks
+  const int16_t *topL = input;
+  const int16_t *topR = input + 16;
+  const int16_t *botL = input + 16 * stride;
+  const int16_t *botR = input + 16 * stride + 16;
+
+  const int16_t *tmp;
+
+  if (flipud) {
+    // flipud: swap the top and bottom 16x16 blocks of the left half
+    tmp = topL;
+    topL = botL;
+    botL = tmp;
+    // flipud: swap the top and bottom 16x16 blocks of the right half
+    tmp = topR;
+    topR = botR;
+    botR = tmp;
+  }
+
+  if (fliplr) {
+    // fliplr: swap the left and right 16x16 blocks of the top half
+    tmp = topL;
+    topL = topR;
+    topR = tmp;
+    // fliplr: swap the left and right 16x16 blocks of the bottom half
+    tmp = botL;
+    botL = botR;
+    botR = tmp;
+  }
+
+  // load first 16 columns
+  load_buffer_16x16(topL, stride, flipud, fliplr, in0);
+  load_buffer_16x16(botL, stride, flipud, fliplr, in0 + 16);
+
+  // load second 16 columns
+  load_buffer_16x16(topR, stride, flipud, fliplr, in1);
+  load_buffer_16x16(botR, stride, flipud, fliplr, in1 + 16);
+}
+#endif  // CONFIG_EXT_TX
+
+static void nr_right_shift_32x32_16col(__m256i *in) {
+  int i = 0;
+  const __m256i one = _mm256_set1_epi16(1);
+  __m256i sign;
+  while (i < 32) {
+    sign = _mm256_srai_epi16(in[i], 15);
+    in[i] = _mm256_add_epi16(in[i], one);
+    in[i] = _mm256_sub_epi16(in[i], sign);
+    in[i] = _mm256_srai_epi16(in[i], 2);
+    i += 1;
+  }
+}
+
+// Negative rounding: computes (x + 1 + (x < 0)) >> 2 for each element
+static void nr_right_shift_32x32(__m256i *in0, __m256i *in1) {
+  nr_right_shift_32x32_16col(in0);
+  nr_right_shift_32x32_16col(in1);
+}
+
+#if CONFIG_EXT_TX
+static void pr_right_shift_32x32_16col(__m256i *in) {
+  int i = 0;
+  const __m256i zero = _mm256_setzero_si256();
+  const __m256i one = _mm256_set1_epi16(1);
+  __m256i sign;
+  while (i < 32) {
+    sign = _mm256_cmpgt_epi16(in[i], zero);
+    in[i] = _mm256_add_epi16(in[i], one);
+    in[i] = _mm256_sub_epi16(in[i], sign);
+    in[i] = _mm256_srai_epi16(in[i], 2);
+    i += 1;
+  }
+}
+
+// Positive rounding: computes (x + 1 + (x > 0)) >> 2 for each element
+static void pr_right_shift_32x32(__m256i *in0, __m256i *in1) {
+  pr_right_shift_32x32_16col(in0);
+  pr_right_shift_32x32_16col(in1);
+}
+
+static void fidtx32_avx2(__m256i *in0, __m256i *in1) {
+  int i = 0;
+  while (i < 32) {
+    in0[i] = _mm256_slli_epi16(in0[i], 2);
+    in1[i] = _mm256_slli_epi16(in1[i], 2);
+    i += 1;
+  }
+  mm256_transpose_32x32(in0, in1);
+}
+#endif
+
+void av1_fht32x32_avx2(const int16_t *input, tran_low_t *output, int stride,
+                       int tx_type) {
+  __m256i in0[32];  // left 32 columns
+  __m256i in1[32];  // right 32 columns
+  (void)input;
+  (void)stride;
+
+  switch (tx_type) {
+// TODO(luoyi): For DCT_DCT, fwd_txfm_32x32() currently uses the aom_dsp
+// implementation, but this function is faster. Any replacement must remain
+// bit-compatible with the corresponding inverse transform.
+// case DCT_DCT:
+//   load_buffer_32x32(input, stride, 0, 0, in0, in1);
+//   fdct32_avx2(in0, in1);
+//   pr_right_shift_32x32(in0, in1);
+//   fdct32_avx2(in0, in1);
+//   break;
+#if CONFIG_EXT_TX
+    case ADST_DCT:
+      load_buffer_32x32(input, stride, 0, 0, in0, in1);
+      fhalfright32_avx2(in0, in1);
+      pr_right_shift_32x32(in0, in1);
+      fdct32_avx2(in0, in1);
+      break;
+    case DCT_ADST:
+      load_buffer_32x32(input, stride, 0, 0, in0, in1);
+      fdct32_avx2(in0, in1);
+      pr_right_shift_32x32(in0, in1);
+      fhalfright32_avx2(in0, in1);
+      break;
+    case ADST_ADST:
+      load_buffer_32x32(input, stride, 0, 0, in0, in1);
+      fhalfright32_avx2(in0, in1);
+      pr_right_shift_32x32(in0, in1);
+      fhalfright32_avx2(in0, in1);
+      break;
+    case FLIPADST_DCT:
+      load_buffer_32x32(input, stride, 1, 0, in0, in1);
+      fhalfright32_avx2(in0, in1);
+      pr_right_shift_32x32(in0, in1);
+      fdct32_avx2(in0, in1);
+      break;
+    case DCT_FLIPADST:
+      load_buffer_32x32(input, stride, 0, 1, in0, in1);
+      fdct32_avx2(in0, in1);
+      pr_right_shift_32x32(in0, in1);
+      fhalfright32_avx2(in0, in1);
+      break;
+    case FLIPADST_FLIPADST:
+      load_buffer_32x32(input, stride, 1, 1, in0, in1);
+      fhalfright32_avx2(in0, in1);
+      pr_right_shift_32x32(in0, in1);
+      fhalfright32_avx2(in0, in1);
+      break;
+    case ADST_FLIPADST:
+      load_buffer_32x32(input, stride, 0, 1, in0, in1);
+      fhalfright32_avx2(in0, in1);
+      pr_right_shift_32x32(in0, in1);
+      fhalfright32_avx2(in0, in1);
+      break;
+    case FLIPADST_ADST:
+      load_buffer_32x32(input, stride, 1, 0, in0, in1);
+      fhalfright32_avx2(in0, in1);
+      pr_right_shift_32x32(in0, in1);
+      fhalfright32_avx2(in0, in1);
+      break;
+    case V_DCT:
+      load_buffer_32x32(input, stride, 0, 0, in0, in1);
+      fdct32_avx2(in0, in1);
+      pr_right_shift_32x32(in0, in1);
+      fidtx32_avx2(in0, in1);
+      break;
+    case H_DCT:
+      load_buffer_32x32(input, stride, 0, 0, in0, in1);
+      fidtx32_avx2(in0, in1);
+      pr_right_shift_32x32(in0, in1);
+      fdct32_avx2(in0, in1);
+      break;
+    case V_ADST:
+      load_buffer_32x32(input, stride, 0, 0, in0, in1);
+      fhalfright32_avx2(in0, in1);
+      pr_right_shift_32x32(in0, in1);
+      fidtx32_avx2(in0, in1);
+      break;
+    case H_ADST:
+      load_buffer_32x32(input, stride, 0, 0, in0, in1);
+      fidtx32_avx2(in0, in1);
+      pr_right_shift_32x32(in0, in1);
+      fhalfright32_avx2(in0, in1);
+      break;
+    case V_FLIPADST:
+      load_buffer_32x32(input, stride, 1, 0, in0, in1);
+      fhalfright32_avx2(in0, in1);
+      pr_right_shift_32x32(in0, in1);
+      fidtx32_avx2(in0, in1);
+      break;
+    case H_FLIPADST:
+      load_buffer_32x32(input, stride, 0, 1, in0, in1);
+      fidtx32_avx2(in0, in1);
+      pr_right_shift_32x32(in0, in1);
+      fhalfright32_avx2(in0, in1);
+      break;
+#endif  // CONFIG_EXT_TX
+    default: assert(0); break;
+  }
+  nr_right_shift_32x32(in0, in1);
+  write_buffer_32x32(in0, in1, 32, output);
+  _mm256_zeroupper();
 }
diff --git a/configure b/configure
index 231909b..0e33876 100755
--- a/configure
+++ b/configure
@@ -606,6 +606,7 @@
         check_add_cflags -Wimplicit-function-declaration
         check_add_cflags -Wuninitialized
         check_add_cflags -Wunused-variable
+        check_add_cflags -Wsign-compare
         case ${CC} in
           *clang*) ;;
           *) check_add_cflags -Wunused-but-set-variable ;;
diff --git a/examples/aom_cx_set_ref.c b/examples/aom_cx_set_ref.c
index 43e8fe0..fdb9739 100644
--- a/examples/aom_cx_set_ref.c
+++ b/examples/aom_cx_set_ref.c
@@ -307,6 +307,7 @@
   const char *height_arg = NULL;
   const char *infile_arg = NULL;
   const char *outfile_arg = NULL;
+  const char *update_frame_num_arg = NULL;
   unsigned int limit = 0;
   exec_name = argv[0];
 
@@ -317,18 +318,21 @@
   height_arg = argv[3];
   infile_arg = argv[4];
   outfile_arg = argv[5];
+  update_frame_num_arg = argv[6];
 
   encoder = get_aom_encoder_by_name(codec_arg);
   if (!encoder) die("Unsupported codec.");
 
-  update_frame_num = atoi(argv[6]);
+  update_frame_num = (unsigned int)strtoul(update_frame_num_arg, NULL, 0);
   // In AV1, the reference buffers (cm->buffer_pool->frame_bufs[i].buf) are
   // allocated while calling aom_codec_encode(), thus, setting reference for
   // 1st frame isn't supported.
-  if (update_frame_num <= 1) die("Couldn't parse frame number '%s'\n", argv[6]);
+  if (update_frame_num <= 1) {
+    die("Couldn't parse frame number '%s'\n", update_frame_num_arg);
+  }
 
   if (argc > 7) {
-    limit = atoi(argv[7]);
+    limit = (unsigned int)strtoul(argv[7], NULL, 0);
     if (update_frame_num > limit)
       die("Update frame number couldn't larger than limit\n");
   }
diff --git a/test/ans_test.cc b/test/ans_test.cc
index ca38de2..ba8e3c7 100644
--- a/test/ans_test.cc
+++ b/test/ans_test.cc
@@ -74,18 +74,21 @@
   return ans_read_end(&d);
 }
 
-// TODO(aconverse@google.com): replace this with a more representative
-// distribution from the codec.
-const rans_sym rans_sym_tab[] = {
-  { 67, 0 }, { 99, 67 }, { 575, 166 }, { 283, 741 },
-};
+const aom_cdf_prob spareto65[] = { 260, 188, 138, 102, 133, 122, 64, 15, 1, 1 };
 
-std::vector<int> ans_encode_build_vals(const rans_sym *tab, int iters) {
+const int kRansSymbols =
+    static_cast<int>(sizeof(spareto65) / sizeof(spareto65[0]));
+
+std::vector<int> ans_encode_build_vals(rans_sym *const tab, int iters) {
+  aom_cdf_prob sum = 0;
+  for (int i = 0; i < kRansSymbols; ++i) {
+    tab[i].cum_prob = sum;
+    tab[i].prob = spareto65[i];
+    sum += spareto65[i];
+  }
   std::vector<int> p_to_sym;
-  int i = 0;
-  while (p_to_sym.size() < RANS_PRECISION) {
+  for (int i = 0; i < kRansSymbols; ++i) {
     p_to_sym.insert(p_to_sym.end(), tab[i].prob, i);
-    ++i;
   }
   assert(p_to_sym.size() == RANS_PRECISION);
   std::vector<int> ret;
@@ -97,10 +100,11 @@
   return ret;
 }
 
-void rans_build_dec_tab(const struct rans_sym sym_tab[], rans_lut dec_tab) {
-  dec_tab[0] = 0;
-  for (int i = 1; dec_tab[i - 1] < RANS_PRECISION; ++i) {
-    dec_tab[i] = dec_tab[i - 1] + sym_tab[i - 1].prob;
+void rans_build_dec_tab(const struct rans_sym sym_tab[],
+                        aom_cdf_prob *dec_tab) {
+  unsigned int sum = 0;
+  for (int i = 0; sum < RANS_PRECISION; ++i) {
+    dec_tab[i] = sum += sym_tab[i].prob;
   }
 }
 
@@ -108,7 +112,7 @@
                 uint8_t *buf) {
   AnsCoder a;
   ans_write_init(&a, buf);
-  rans_lut dec_tab;
+  aom_cdf_prob dec_tab[kRansSymbols];
   rans_build_dec_tab(tab, dec_tab);
 
   std::clock_t start = std::clock();
@@ -149,16 +153,20 @@
 class AnsTest : public ::testing::Test {
  protected:
   static void SetUpTestCase() {
-    sym_vec_ = ans_encode_build_vals(rans_sym_tab, kNumSyms);
+    sym_vec_ = ans_encode_build_vals(rans_sym_tab_, kNumSyms);
   }
   virtual void SetUp() { buf_ = new uint8_t[kNumSyms / 2]; }
   virtual void TearDown() { delete[] buf_; }
   static const int kNumSyms = 25000000;
   static std::vector<int> sym_vec_;
+  static rans_sym rans_sym_tab_[kRansSymbols];
   uint8_t *buf_;
 };
 std::vector<int> AnsTest::sym_vec_;
+rans_sym AnsTest::rans_sym_tab_[kRansSymbols];
 
 TEST_F(AbsTest, Uabs) { EXPECT_TRUE(check_uabs(pv_vec_, buf_)); }
-TEST_F(AnsTest, Rans) { EXPECT_TRUE(check_rans(sym_vec_, rans_sym_tab, buf_)); }
+TEST_F(AnsTest, Rans) {
+  EXPECT_TRUE(check_rans(sym_vec_, rans_sym_tab_, buf_));
+}
 }  // namespace
diff --git a/test/dct32x32_test.cc b/test/dct32x32_test.cc
index 9a661f9..e4179ef 100644
--- a/test/dct32x32_test.cc
+++ b/test/dct32x32_test.cc
@@ -402,6 +402,12 @@
                                                      AOM_BITS_8)));
 #endif  // HAVE_SSE2 && !CONFIG_AOM_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
 
+#if HAVE_AVX2 && !CONFIG_AOM_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
+INSTANTIATE_TEST_CASE_P(AVX2, PartialTrans32x32Test,
+                        ::testing::Values(make_tuple(&aom_fdct32x32_1_avx2,
+                                                     AOM_BITS_8)));
+#endif  // HAVE_AVX2 && !CONFIG_AOM_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
+
 #if HAVE_SSE2 && CONFIG_AOM_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
 INSTANTIATE_TEST_CASE_P(
     SSE2, Trans32x32Test,
diff --git a/test/fht32x32_test.cc b/test/fht32x32_test.cc
new file mode 100644
index 0000000..a949ebf
--- /dev/null
+++ b/test/fht32x32_test.cc
@@ -0,0 +1,189 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "third_party/googletest/src/include/gtest/gtest.h"
+
+#include "./av1_rtcd.h"
+#include "./aom_dsp_rtcd.h"
+
+#include "test/acm_random.h"
+#include "test/clear_system_state.h"
+#include "test/register_state_check.h"
+#include "test/transform_test_base.h"
+#include "test/util.h"
+#include "aom_ports/mem.h"
+
+using libaom_test::ACMRandom;
+
+namespace {
+typedef void (*IhtFunc)(const tran_low_t *in, uint8_t *out, int stride,
+                        int tx_type);
+using std::tr1::tuple;
+using libaom_test::FhtFunc;
+typedef tuple<FhtFunc, IhtFunc, int, aom_bit_depth_t, int> Ht32x32Param;
+
+void fht32x32_ref(const int16_t *in, tran_low_t *out, int stride, int tx_type) {
+  av1_fht32x32_c(in, out, stride, tx_type);
+}
+
+#if CONFIG_AOM_HIGHBITDEPTH
+typedef void (*IHbdHtFunc)(const tran_low_t *in, uint8_t *out, int stride,
+                           int tx_type, int bd);
+typedef void (*HbdHtFunc)(const int16_t *input, int32_t *output, int stride,
+                          int tx_type, int bd);
+
+// Target optimized function, tx_type, bit depth
+typedef tuple<HbdHtFunc, int, int> HighbdHt32x32Param;
+
+void highbd_fht32x32_ref(const int16_t *in, int32_t *out, int stride,
+                         int tx_type, int bd) {
+  av1_fwd_txfm2d_32x32_c(in, out, stride, tx_type, bd);
+}
+#endif  // CONFIG_AOM_HIGHBITDEPTH
+
+#if HAVE_AVX2
+void dummy_inv_txfm(const tran_low_t *in, uint8_t *out, int stride,
+                    int tx_type) {
+  (void)in;
+  (void)out;
+  (void)stride;
+  (void)tx_type;
+}
+#endif
+
+class AV1Trans32x32HT : public libaom_test::TransformTestBase,
+                        public ::testing::TestWithParam<Ht32x32Param> {
+ public:
+  virtual ~AV1Trans32x32HT() {}
+
+  virtual void SetUp() {
+    fwd_txfm_ = GET_PARAM(0);
+    inv_txfm_ = GET_PARAM(1);
+    tx_type_ = GET_PARAM(2);
+    pitch_ = 32;
+    fwd_txfm_ref = fht32x32_ref;
+    bit_depth_ = GET_PARAM(3);
+    mask_ = (1 << bit_depth_) - 1;
+    num_coeffs_ = GET_PARAM(4);
+  }
+  virtual void TearDown() { libaom_test::ClearSystemState(); }
+
+ protected:
+  void RunFwdTxfm(const int16_t *in, tran_low_t *out, int stride) {
+    fwd_txfm_(in, out, stride, tx_type_);
+  }
+
+  void RunInvTxfm(const tran_low_t *out, uint8_t *dst, int stride) {
+    inv_txfm_(out, dst, stride, tx_type_);
+  }
+
+  FhtFunc fwd_txfm_;
+  IhtFunc inv_txfm_;
+};
+
+TEST_P(AV1Trans32x32HT, CoeffCheck) { RunCoeffCheck(); }
+
+#if CONFIG_AOM_HIGHBITDEPTH
+class AV1HighbdTrans32x32HT
+    : public ::testing::TestWithParam<HighbdHt32x32Param> {
+ public:
+  virtual ~AV1HighbdTrans32x32HT() {}
+
+  virtual void SetUp() {
+    fwd_txfm_ = GET_PARAM(0);
+    fwd_txfm_ref_ = highbd_fht32x32_ref;
+    tx_type_ = GET_PARAM(1);
+    bit_depth_ = GET_PARAM(2);
+    mask_ = (1 << bit_depth_) - 1;
+    num_coeffs_ = 1024;
+
+    input_ = reinterpret_cast<int16_t *>(
+        aom_memalign(32, sizeof(int16_t) * num_coeffs_));
+    output_ = reinterpret_cast<int32_t *>(
+        aom_memalign(32, sizeof(int32_t) * num_coeffs_));
+    output_ref_ = reinterpret_cast<int32_t *>(
+        aom_memalign(32, sizeof(int32_t) * num_coeffs_));
+  }
+
+  virtual void TearDown() {
+    aom_free(input_);
+    aom_free(output_);
+    aom_free(output_ref_);
+    libaom_test::ClearSystemState();
+  }
+
+ protected:
+  void RunBitexactCheck();
+
+ private:
+  HbdHtFunc fwd_txfm_;
+  HbdHtFunc fwd_txfm_ref_;
+  int tx_type_;
+  int bit_depth_;
+  int mask_;
+  int num_coeffs_;
+  int16_t *input_;
+  int32_t *output_;
+  int32_t *output_ref_;
+};
+
+void AV1HighbdTrans32x32HT::RunBitexactCheck() {
+  ACMRandom rnd(ACMRandom::DeterministicSeed());
+  int i, j;
+  const int stride = 32;
+  const int num_tests = 1000;
+
+  for (i = 0; i < num_tests; ++i) {
+    for (j = 0; j < num_coeffs_; ++j) {
+      input_[j] = (rnd.Rand16() & mask_) - (rnd.Rand16() & mask_);
+    }
+
+    fwd_txfm_ref_(input_, output_ref_, stride, tx_type_, bit_depth_);
+    ASM_REGISTER_STATE_CHECK(
+        fwd_txfm_(input_, output_, stride, tx_type_, bit_depth_));
+
+    for (j = 0; j < num_coeffs_; ++j) {
+      EXPECT_EQ(output_ref_[j], output_[j])
+          << "Not bit-exact result at index: " << j << " at test block: " << i;
+    }
+  }
+}
+
+TEST_P(AV1HighbdTrans32x32HT, HighbdCoeffCheck) { RunBitexactCheck(); }
+#endif  // CONFIG_AOM_HIGHBITDEPTH
+
+using std::tr1::make_tuple;
+
+#if HAVE_AVX2
+const Ht32x32Param kArrayHt32x32Param_avx2[] = {
+  // TODO(luoyi): the DCT_DCT tx_type is not yet enabled in av1_fht32x32_avx2.
+  // make_tuple(&av1_fht32x32_avx2, dummy_inv_txfm, 0, AOM_BITS_8, 1024),
+  make_tuple(&av1_fht32x32_avx2, dummy_inv_txfm, 1, AOM_BITS_8, 1024),
+  make_tuple(&av1_fht32x32_avx2, dummy_inv_txfm, 2, AOM_BITS_8, 1024),
+  make_tuple(&av1_fht32x32_avx2, dummy_inv_txfm, 3, AOM_BITS_8, 1024),
+#if CONFIG_EXT_TX
+  make_tuple(&av1_fht32x32_avx2, dummy_inv_txfm, 4, AOM_BITS_8, 1024),
+  make_tuple(&av1_fht32x32_avx2, dummy_inv_txfm, 5, AOM_BITS_8, 1024),
+  make_tuple(&av1_fht32x32_avx2, dummy_inv_txfm, 6, AOM_BITS_8, 1024),
+  make_tuple(&av1_fht32x32_avx2, dummy_inv_txfm, 7, AOM_BITS_8, 1024),
+  make_tuple(&av1_fht32x32_avx2, dummy_inv_txfm, 8, AOM_BITS_8, 1024),
+  make_tuple(&av1_fht32x32_avx2, dummy_inv_txfm, 10, AOM_BITS_8, 1024),
+  make_tuple(&av1_fht32x32_avx2, dummy_inv_txfm, 11, AOM_BITS_8, 1024),
+  make_tuple(&av1_fht32x32_avx2, dummy_inv_txfm, 12, AOM_BITS_8, 1024),
+  make_tuple(&av1_fht32x32_avx2, dummy_inv_txfm, 13, AOM_BITS_8, 1024),
+  make_tuple(&av1_fht32x32_avx2, dummy_inv_txfm, 14, AOM_BITS_8, 1024),
+  make_tuple(&av1_fht32x32_avx2, dummy_inv_txfm, 15, AOM_BITS_8, 1024)
+#endif  // CONFIG_EXT_TX
+};
+INSTANTIATE_TEST_CASE_P(AVX2, AV1Trans32x32HT,
+                        ::testing::ValuesIn(kArrayHt32x32Param_avx2));
+#endif  // HAVE_AVX2
+}  // namespace
diff --git a/test/test.mk b/test/test.mk
index c0573e7..f149fc5 100644
--- a/test/test.mk
+++ b/test/test.mk
@@ -144,6 +144,7 @@
 LIBAOM_TEST_SRCS-$(CONFIG_AV1_ENCODER) += av1_fht16x8_test.cc
 LIBAOM_TEST_SRCS-$(CONFIG_AV1_ENCODER) += av1_iht8x16_test.cc
 LIBAOM_TEST_SRCS-$(CONFIG_AV1_ENCODER) += av1_iht16x8_test.cc
+LIBAOM_TEST_SRCS-$(CONFIG_AV1_ENCODER) += fht32x32_test.cc
 endif
 LIBAOM_TEST_SRCS-$(CONFIG_EXT_TILE)     += av1_ext_tile_test.cc
 
diff --git a/tools/gen_authors.sh b/tools/gen_authors.sh
index 4cfd81e..5def8bc 100755
--- a/tools/gen_authors.sh
+++ b/tools/gen_authors.sh
@@ -6,8 +6,5 @@
 # This file is automatically generated from the git commit history
 # by tools/gen_authors.sh.
 
-$(git log --pretty=format:"%aN <%aE>" | sort | uniq | grep -v corp.google)
-Google Inc.
-The Mozilla Foundation
-The Xiph.Org Foundation
+$(git log --pretty=format:"%aN <%aE>" | sort | uniq | grep -v "corp.google\|clang-format")
 EOF
diff --git a/tools/gen_constrained_tokenset.py b/tools/gen_constrained_tokenset.py
new file mode 100755
index 0000000..a0f8280
--- /dev/null
+++ b/tools/gen_constrained_tokenset.py
@@ -0,0 +1,115 @@
+#!/usr/bin/python
+##
+## Copyright (c) 2016, Alliance for Open Media. All rights reserved
+##
+## This source code is subject to the terms of the BSD 2 Clause License and
+## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+## was not distributed with this source code in the LICENSE file, you can
+## obtain it at www.aomedia.org/license/software. If the Alliance for Open
+## Media Patent License 1.0 was not distributed with this source code in the
+## PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+##
+"""Generate the probability model for the constrained token set.
+
+Model obtained from a 2-sided zero-centered distribution derived
+from a Pareto distribution. The cdf of the distribution is:
+cdf(x) = 0.5 + 0.5 * sgn(x) * [1 - {alpha/(alpha + |x|)} ^ beta]
+
+For a given beta and a given probability of the 1-node, the alpha
+is first solved, and then the {alpha, beta} pair is used to generate
+the probabilities for the rest of the nodes.
+"""
+
+import heapq
+import sys
+import numpy as np
+import scipy.optimize
+import scipy.stats
+
+
+def cdf_spareto(x, xm, beta):
+  p = 1 - (xm / (np.abs(x) + xm))**beta
+  p = 0.5 + 0.5 * np.sign(x) * p
+  return p
+
+
+def get_spareto(p, beta):
+  cdf = cdf_spareto
+
+  def func(x):
+    return ((cdf(1.5, x, beta) - cdf(0.5, x, beta)) /
+            (1 - cdf(0.5, x, beta)) - p)**2
+
+  alpha = scipy.optimize.fminbound(func, 1e-12, 10000, xtol=1e-12)
+  parray = np.zeros(11)
+  parray[0] = 2 * (cdf(0.5, alpha, beta) - 0.5)
+  parray[1] = (2 * (cdf(1.5, alpha, beta) - cdf(0.5, alpha, beta)))
+  parray[2] = (2 * (cdf(2.5, alpha, beta) - cdf(1.5, alpha, beta)))
+  parray[3] = (2 * (cdf(3.5, alpha, beta) - cdf(2.5, alpha, beta)))
+  parray[4] = (2 * (cdf(4.5, alpha, beta) - cdf(3.5, alpha, beta)))
+  parray[5] = (2 * (cdf(6.5, alpha, beta) - cdf(4.5, alpha, beta)))
+  parray[6] = (2 * (cdf(10.5, alpha, beta) - cdf(6.5, alpha, beta)))
+  parray[7] = (2 * (cdf(18.5, alpha, beta) - cdf(10.5, alpha, beta)))
+  parray[8] = (2 * (cdf(34.5, alpha, beta) - cdf(18.5, alpha, beta)))
+  parray[9] = (2 * (cdf(66.5, alpha, beta) - cdf(34.5, alpha, beta)))
+  parray[10] = 2 * (1. - cdf(66.5, alpha, beta))
+  return parray
+
+
+def quantize_probs(p, save_first_bin, bits):
+  """Quantize probability precisely.
+
+  Quantize probabilities minimizing dH (Kullback-Leibler divergence)
+  approximated by: sum (p_i-q_i)^2/p_i.
+  References:
+  https://en.wikipedia.org/wiki/Kullback%E2%80%93Leibler_divergence
+  https://github.com/JarekDuda/AsymmetricNumeralSystemsToolkit
+  """
+  num_sym = p.size
+  p = np.clip(p, 1e-16, 1)
+  L = 2**bits
+  pL = p * L
+  ip = 1. / p  # inverse probability
+  q = np.clip(np.round(pL), 1, L + 1 - num_sym)
+  quant_err = (pL - q)**2 * ip
+  sgn = np.sign(L - q.sum())  # direction of correction
+  if sgn != 0:  # correction is needed
+    v = []  # heap of adjustment results (adjustment err, index) of each symbol
+    for i in range(1 if save_first_bin else 0, num_sym):
+      q_adj = q[i] + sgn
+      if q_adj > 0 and q_adj < L:
+        adj_err = (pL[i] - q_adj)**2 * ip[i] - quant_err[i]
+        heapq.heappush(v, (adj_err, i))
+    while q.sum() != L:
+      # apply lowest error adjustment
+      (adj_err, i) = heapq.heappop(v)
+      quant_err[i] += adj_err
+      q[i] += sgn
+      # calculate the cost of adjusting this symbol again
+      q_adj = q[i] + sgn
+      if q_adj > 0 and q_adj < L:
+        adj_err = (pL[i] - q_adj)**2 * ip[i] - quant_err[i]
+        heapq.heappush(v, (adj_err, i))
+  return q
+
+
+def get_quantized_spareto(p, beta, bits):
+  parray = get_spareto(p, beta)
+  parray = parray[1:] / (1 - parray[0])
+  qarray = quantize_probs(parray, True, bits)
+  return qarray.astype(np.int)
+
+
+def main(bits=8):
+  beta = 8
+  for q in range(1, 256):
+    parray = get_quantized_spareto(q / 256., beta, bits)
+    assert parray.sum() == 2**bits
+    print '{', ', '.join('%d' % i for i in parray), '},'
+
+
+if __name__ == '__main__':
+  if len(sys.argv) > 1:
+    main(int(sys.argv[1]))
+  else:
+    main()