Merge "Fix update_delta_q_probs compile warning" into nextgenv2
diff --git a/aom_dsp/ansreader.h b/aom_dsp/ansreader.h
index c46778b..0e9a671 100644
--- a/aom_dsp/ansreader.h
+++ b/aom_dsp/ansreader.h
@@ -20,6 +20,9 @@
 #include "aom_dsp/prob.h"
 #include "aom_dsp/ans.h"
 #include "aom_ports/mem_ops.h"
+#if CONFIG_ACCOUNTING
+#include "av1/common/accounting.h"
+#endif
 
 #ifdef __cplusplus
 extern "C" {
@@ -29,6 +32,9 @@
   const uint8_t *buf;
   int buf_offset;
   uint32_t state;
+#if CONFIG_ACCOUNTING
+  Accounting *accounting;
+#endif
 };
 
 static INLINE int uabs_read(struct AnsDecoder *ans, AnsP8 p0) {
@@ -119,6 +125,9 @@
     // 110xxxxx implies this byte is a superframe marker
     return 1;
   }
+#if CONFIG_ACCOUNTING
+  ans->accounting = NULL;
+#endif
   ans->state += L_BASE;
   if (ans->state >= L_BASE * IO_BASE) return 1;
   return 0;
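
Note: with CONFIG_ACCOUNTING enabled, the rANS decoder gains the same optional
Accounting hook the other entropy readers carry; ans_read_init() clears the
pointer, so accounting stays inert unless a caller attaches one. A minimal
sketch of the intended opt-in pattern (the pbi->accounting attach point and
the exact init arguments are illustrative assumptions, not the decoder API):

    #if CONFIG_ACCOUNTING
      struct AnsDecoder ans;
      ans_read_init(&ans, buf, buf_size);  /* leaves ans.accounting == NULL */
      ans.accounting = &pbi->accounting;   /* hypothetical per-frame opt-in */
      if (ans.accounting != NULL) {
        /* symbol readers may now charge bits to the current syntax element */
      }
    #endif
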
diff --git a/aom_dsp/aom_dsp_rtcd_defs.pl b/aom_dsp/aom_dsp_rtcd_defs.pl
index d6fa90b..b073b1b 100644
--- a/aom_dsp/aom_dsp_rtcd_defs.pl
+++ b/aom_dsp/aom_dsp_rtcd_defs.pl
@@ -708,7 +708,7 @@
     specialize qw/aom_fdct4x4_1 sse2/;
 
     add_proto qw/void aom_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride";
-    specialize qw/aom_fdct8x8 sse2/;
+    specialize qw/aom_fdct8x8 sse2/, "$ssse3_x86_64";
 
     add_proto qw/void aom_fdct8x8_1/, "const int16_t *input, tran_low_t *output, int stride";
     specialize qw/aom_fdct8x8_1 sse2/;
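
Note: "$ssse3_x86_64" expands to "ssse3" only on 64-bit x86 targets, so this
line lets RTCD pick the SSSE3 aom_fdct8x8 there (the STORE_OUTPUT-capable
kernel updated below). A simplified sketch of the dispatch the generated
aom_dsp_rtcd.h boils down to, not the literal generated code (flags come from
aom_ports/x86.h):

    void aom_fdct8x8_c(const int16_t *input, tran_low_t *output, int stride);
    void aom_fdct8x8_sse2(const int16_t *input, tran_low_t *output, int stride);
    void aom_fdct8x8_ssse3(const int16_t *input, tran_low_t *output, int stride);

    void (*aom_fdct8x8)(const int16_t *, tran_low_t *, int) = aom_fdct8x8_c;

    static void setup_rtcd_internal(int flags) {
      aom_fdct8x8 = aom_fdct8x8_c;
      if (flags & HAS_SSE2) aom_fdct8x8 = aom_fdct8x8_sse2;
    #if ARCH_X86_64
      if (flags & HAS_SSSE3) aom_fdct8x8 = aom_fdct8x8_ssse3;
    #endif
    }
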
diff --git a/aom_dsp/inv_txfm.c b/aom_dsp/inv_txfm.c
index e0dda12..4bb656b 100644
--- a/aom_dsp/inv_txfm.c
+++ b/aom_dsp/inv_txfm.c
@@ -93,7 +93,7 @@
   }
 }
 
-void idct4_c(const tran_low_t *input, tran_low_t *output) {
+void aom_idct4_c(const tran_low_t *input, tran_low_t *output) {
   tran_low_t step[4];
   tran_high_t temp1, temp2;
   // stage 1
@@ -121,7 +121,7 @@
 
   // Rows
   for (i = 0; i < 4; ++i) {
-    idct4_c(input, outptr);
+    aom_idct4_c(input, outptr);
     input += 4;
     outptr += 4;
   }
@@ -129,7 +129,7 @@
   // Columns
   for (i = 0; i < 4; ++i) {
     for (j = 0; j < 4; ++j) temp_in[j] = out[j * 4 + i];
-    idct4_c(temp_in, temp_out);
+    aom_idct4_c(temp_in, temp_out);
     for (j = 0; j < 4; ++j) {
       dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
                                             ROUND_POWER_OF_TWO(temp_out[j], 4));
@@ -154,7 +154,7 @@
   }
 }
 
-void idct8_c(const tran_low_t *input, tran_low_t *output) {
+void aom_idct8_c(const tran_low_t *input, tran_low_t *output) {
   tran_low_t step1[8], step2[8];
   tran_high_t temp1, temp2;
   // stage 1
@@ -216,7 +216,7 @@
 
   // First transform rows
   for (i = 0; i < 8; ++i) {
-    idct8_c(input, outptr);
+    aom_idct8_c(input, outptr);
     input += 8;
     outptr += 8;
   }
@@ -224,7 +224,7 @@
   // Then transform columns
   for (i = 0; i < 8; ++i) {
     for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i];
-    idct8_c(temp_in, temp_out);
+    aom_idct8_c(temp_in, temp_out);
     for (j = 0; j < 8; ++j) {
       dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
                                             ROUND_POWER_OF_TWO(temp_out[j], 5));
@@ -244,7 +244,7 @@
   }
 }
 
-void iadst4_c(const tran_low_t *input, tran_low_t *output) {
+void aom_iadst4_c(const tran_low_t *input, tran_low_t *output) {
   tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;
 
   tran_low_t x0 = input[0];
@@ -281,7 +281,7 @@
   output[3] = WRAPLOW(dct_const_round_shift(s0 + s1 - s3));
 }
 
-void iadst8_c(const tran_low_t *input, tran_low_t *output) {
+void aom_iadst8_c(const tran_low_t *input, tran_low_t *output) {
   int s0, s1, s2, s3, s4, s5, s6, s7;
 
   tran_high_t x0 = input[7];
@@ -367,7 +367,7 @@
   // First transform rows
   // only the first 4 rows have non-zero coefs
   for (i = 0; i < 4; ++i) {
-    idct8_c(input, outptr);
+    aom_idct8_c(input, outptr);
     input += 8;
     outptr += 8;
   }
@@ -375,7 +375,7 @@
   // Then transform columns
   for (i = 0; i < 8; ++i) {
     for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i];
-    idct8_c(temp_in, temp_out);
+    aom_idct8_c(temp_in, temp_out);
     for (j = 0; j < 8; ++j) {
       dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
                                             ROUND_POWER_OF_TWO(temp_out[j], 5));
@@ -383,7 +383,7 @@
   }
 }
 
-void idct16_c(const tran_low_t *input, tran_low_t *output) {
+void aom_idct16_c(const tran_low_t *input, tran_low_t *output) {
   tran_low_t step1[16], step2[16];
   tran_high_t temp1, temp2;
 
@@ -557,7 +557,7 @@
 
   // First transform rows
   for (i = 0; i < 16; ++i) {
-    idct16_c(input, outptr);
+    aom_idct16_c(input, outptr);
     input += 16;
     outptr += 16;
   }
@@ -565,7 +565,7 @@
   // Then transform columns
   for (i = 0; i < 16; ++i) {
     for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i];
-    idct16_c(temp_in, temp_out);
+    aom_idct16_c(temp_in, temp_out);
     for (j = 0; j < 16; ++j) {
       dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
                                             ROUND_POWER_OF_TWO(temp_out[j], 6));
@@ -573,7 +573,7 @@
   }
 }
 
-void iadst16_c(const tran_low_t *input, tran_low_t *output) {
+void aom_iadst16_c(const tran_low_t *input, tran_low_t *output) {
   tran_high_t s0, s1, s2, s3, s4, s5, s6, s7, s8;
   tran_high_t s9, s10, s11, s12, s13, s14, s15;
 
@@ -754,7 +754,7 @@
   // First transform rows. Since all non-zero dct coefficients are in
   // upper-left 4x4 area, we only need to calculate first 4 rows here.
   for (i = 0; i < 4; ++i) {
-    idct16_c(input, outptr);
+    aom_idct16_c(input, outptr);
     input += 16;
     outptr += 16;
   }
@@ -762,7 +762,7 @@
   // Then transform columns
   for (i = 0; i < 16; ++i) {
     for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i];
-    idct16_c(temp_in, temp_out);
+    aom_idct16_c(temp_in, temp_out);
     for (j = 0; j < 16; ++j) {
       dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
                                             ROUND_POWER_OF_TWO(temp_out[j], 6));
@@ -782,7 +782,7 @@
   }
 }
 
-void idct32_c(const tran_low_t *input, tran_low_t *output) {
+void aom_idct32_c(const tran_low_t *input, tran_low_t *output) {
   tran_low_t step1[32], step2[32];
   tran_high_t temp1, temp2;
 
@@ -1168,7 +1168,7 @@
       zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
 
     if (zero_coeff[0] | zero_coeff[1])
-      idct32_c(input, outptr);
+      aom_idct32_c(input, outptr);
     else
       memset(outptr, 0, sizeof(tran_low_t) * 32);
     input += 32;
@@ -1178,7 +1178,7 @@
   // Columns
   for (i = 0; i < 32; ++i) {
     for (j = 0; j < 32; ++j) temp_in[j] = out[j * 32 + i];
-    idct32_c(temp_in, temp_out);
+    aom_idct32_c(temp_in, temp_out);
     for (j = 0; j < 32; ++j) {
       dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
                                             ROUND_POWER_OF_TWO(temp_out[j], 6));
@@ -1196,7 +1196,7 @@
   // Rows
   // only upper-left 16x16 has non-zero coeff
   for (i = 0; i < 16; ++i) {
-    idct32_c(input, outptr);
+    aom_idct32_c(input, outptr);
     input += 32;
     outptr += 32;
   }
@@ -1204,7 +1204,7 @@
   // Columns
   for (i = 0; i < 32; ++i) {
     for (j = 0; j < 32; ++j) temp_in[j] = out[j * 32 + i];
-    idct32_c(temp_in, temp_out);
+    aom_idct32_c(temp_in, temp_out);
     for (j = 0; j < 32; ++j) {
       dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
                                             ROUND_POWER_OF_TWO(temp_out[j], 6));
@@ -1222,7 +1222,7 @@
   // Rows
   // only upper-left 8x8 has non-zero coeff
   for (i = 0; i < 8; ++i) {
-    idct32_c(input, outptr);
+    aom_idct32_c(input, outptr);
     input += 32;
     outptr += 32;
   }
@@ -1230,7 +1230,7 @@
   // Columns
   for (i = 0; i < 32; ++i) {
     for (j = 0; j < 32; ++j) temp_in[j] = out[j * 32 + i];
-    idct32_c(temp_in, temp_out);
+    aom_idct32_c(temp_in, temp_out);
     for (j = 0; j < 32; ++j) {
       dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
                                             ROUND_POWER_OF_TWO(temp_out[j], 6));
diff --git a/aom_dsp/inv_txfm.h b/aom_dsp/inv_txfm.h
index 0f84e38..c3d794e 100644
--- a/aom_dsp/inv_txfm.h
+++ b/aom_dsp/inv_txfm.h
@@ -97,13 +97,13 @@
 #endif  // CONFIG_AOM_HIGHBITDEPTH
 #endif  // CONFIG_EMULATE_HARDWARE
 
-void idct4_c(const tran_low_t *input, tran_low_t *output);
-void idct8_c(const tran_low_t *input, tran_low_t *output);
-void idct16_c(const tran_low_t *input, tran_low_t *output);
-void idct32_c(const tran_low_t *input, tran_low_t *output);
-void iadst4_c(const tran_low_t *input, tran_low_t *output);
-void iadst8_c(const tran_low_t *input, tran_low_t *output);
-void iadst16_c(const tran_low_t *input, tran_low_t *output);
+void aom_idct4_c(const tran_low_t *input, tran_low_t *output);
+void aom_idct8_c(const tran_low_t *input, tran_low_t *output);
+void aom_idct16_c(const tran_low_t *input, tran_low_t *output);
+void aom_idct32_c(const tran_low_t *input, tran_low_t *output);
+void aom_iadst4_c(const tran_low_t *input, tran_low_t *output);
+void aom_iadst8_c(const tran_low_t *input, tran_low_t *output);
+void aom_iadst16_c(const tran_low_t *input, tran_low_t *output);
 
 #if CONFIG_AOM_HIGHBITDEPTH
 void aom_highbd_idct4_c(const tran_low_t *input, tran_low_t *output, int bd);
diff --git a/aom_dsp/prob.c b/aom_dsp/prob.c
index d3556cb..b85fa7e 100644
--- a/aom_dsp/prob.c
+++ b/aom_dsp/prob.c
@@ -11,7 +11,7 @@
 
 #include "./aom_config.h"
 
-#if CONFIG_DAALA_EC
+#if CONFIG_DAALA_EC || CONFIG_RANS
 #include <string.h>
 #endif
 
@@ -57,7 +57,7 @@
   tree_merge_probs_impl(0, tree, pre_probs, counts, probs);
 }
 
-#if CONFIG_DAALA_EC
+#if CONFIG_DAALA_EC || CONFIG_RANS
 typedef struct tree_node tree_node;
 
 struct tree_node {
@@ -86,7 +86,7 @@
   int i;
   uint32_t pa;
   uint32_t pb;
-  for (i = 0; i < OD_MINI(a.len, b.len) && a.probs[i] == b.probs[i]; i++) {
+  for (i = 0; i < AOMMIN(a.len, b.len) && a.probs[i] == b.probs[i]; i++) {
   }
   pa = tree_node_prob(a, i);
   pb = tree_node_prob(b, i);
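
Note: OD_MINI and AOMMIN are the same textbook min() macro; the swap simply
keeps shared aom_dsp code on the aom-native helper now that rANS builds also
compile this file. For reference, from aom_dsp/aom_dsp_common.h:

    #define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
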
diff --git a/aom_dsp/prob.h b/aom_dsp/prob.h
index bf9abbf..fcd1a74 100644
--- a/aom_dsp/prob.h
+++ b/aom_dsp/prob.h
@@ -96,7 +96,7 @@
 void aom_tree_merge_probs(const aom_tree_index *tree, const aom_prob *pre_probs,
                           const unsigned int *counts, aom_prob *probs);
 
-#if CONFIG_DAALA_EC
+#if CONFIG_DAALA_EC || CONFIG_RANS
 int tree_to_cdf(const aom_tree_index *tree, const aom_prob *probs,
                 aom_tree_index root, aom_cdf_prob *cdf, aom_tree_index *ind,
                 int *pth, int *len);
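
Note: rANS is a multisymbol coder like the Daala EC and consumes the same CDF
tables, which is why every "#if CONFIG_DAALA_EC" guarding CDF state or the
tree-to-CDF helpers becomes "#if CONFIG_DAALA_EC || CONFIG_RANS" in this
patch. The conversion runs once at init time, e.g. (from the entropymv.c
context further down):

    av1_tree_to_cdf(av1_mv_joint_tree, cm->fc->nmvc.joints,
                    cm->fc->nmvc.joint_cdf);
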
diff --git a/aom_dsp/x86/fwd_txfm_ssse3_x86_64.asm b/aom_dsp/x86/fwd_txfm_ssse3_x86_64.asm
index 6f3c470..5b2aab2 100644
--- a/aom_dsp/x86/fwd_txfm_ssse3_x86_64.asm
+++ b/aom_dsp/x86/fwd_txfm_ssse3_x86_64.asm
@@ -130,12 +130,30 @@
   psraw              m%2, 1
 %endmacro
 
+%macro STORE_OUTPUT 2 ; index, result
+%if CONFIG_AOM_HIGHBITDEPTH
+  ; const __m128i sign_bits = _mm_cmplt_epi16(*poutput, zero);
+  ; __m128i out0 = _mm_unpacklo_epi16(*poutput, sign_bits);
+  ; __m128i out1 = _mm_unpackhi_epi16(*poutput, sign_bits);
+  ; _mm_store_si128((__m128i *)(dst_ptr), out0);
+  ; _mm_store_si128((__m128i *)(dst_ptr + 4), out1);
+  pxor               m11, m11
+  pcmpgtw            m11, m%2
+  movdqa             m12, m%2
+  punpcklwd          m%2, m11
+  punpckhwd          m12, m11
+  mova               [outputq + 4*%1 +  0], m%2
+  mova               [outputq + 4*%1 + 16], m12
+%else
+  mova               [outputq + 2*%1], m%2
+%endif
+%endmacro
+
 INIT_XMM ssse3
 cglobal fdct8x8, 3, 5, 13, input, output, stride
 
   mova               m8, [pd_8192]
   mova              m12, [pw_11585x2]
-  pxor              m11, m11
 
   lea                r3, [2 * strideq]
   lea                r4, [4 * strideq]
@@ -173,14 +191,14 @@
   DIVIDE_ROUND_2X   4, 5, 9, 10
   DIVIDE_ROUND_2X   6, 7, 9, 10
 
-  mova              [outputq +   0], m0
-  mova              [outputq +  16], m1
-  mova              [outputq +  32], m2
-  mova              [outputq +  48], m3
-  mova              [outputq +  64], m4
-  mova              [outputq +  80], m5
-  mova              [outputq +  96], m6
-  mova              [outputq + 112], m7
+  STORE_OUTPUT       0, 0
+  STORE_OUTPUT       8, 1
+  STORE_OUTPUT      16, 2
+  STORE_OUTPUT      24, 3
+  STORE_OUTPUT      32, 4
+  STORE_OUTPUT      40, 5
+  STORE_OUTPUT      48, 6
+  STORE_OUTPUT      56, 7
 
   RET
 %endif
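
Note: STORE_OUTPUT takes element indices (0, 8, ..., 56 -- one row of eight
coefficients apart) and scales them by the element size itself, so the same
call sites work whether tran_low_t is 16 or 32 bits wide. Under
CONFIG_AOM_HIGHBITDEPTH it widens each 16-bit result with its sign bit before
storing, per the intrinsics quoted in the macro comment; the standalone
"pxor m11, m11" before the transform is dropped because the macro now zeroes
m11 itself. Roughly, in C (assuming tran_low_t is int32_t in these builds):

    #include <emmintrin.h>
    #include <stdint.h>
    typedef int32_t tran_low_t; /* CONFIG_AOM_HIGHBITDEPTH builds */

    static void store_output(const __m128i *poutput, tran_low_t *dst_ptr) {
      const __m128i zero = _mm_setzero_si128();
      /* all-ones in lanes whose value is negative */
      const __m128i sign_bits = _mm_cmplt_epi16(*poutput, zero);
      const __m128i out0 = _mm_unpacklo_epi16(*poutput, sign_bits);
      const __m128i out1 = _mm_unpackhi_epi16(*poutput, sign_bits);
      _mm_store_si128((__m128i *)(dst_ptr), out0);
      _mm_store_si128((__m128i *)(dst_ptr + 4), out1);
    }
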
diff --git a/aom_dsp/x86/inv_txfm_sse2.c b/aom_dsp/x86/inv_txfm_sse2.c
index 4735d97..2217a46 100644
--- a/aom_dsp/x86/inv_txfm_sse2.c
+++ b/aom_dsp/x86/inv_txfm_sse2.c
@@ -171,7 +171,7 @@
   RECON_AND_STORE4X4(dest + 3 * stride, dc_value);
 }
 
-void idct4_sse2(__m128i *in) {
+void aom_idct4_sse2(__m128i *in) {
   const __m128i k__cospi_p16_p16 = pair_set_epi16(cospi_16_64, cospi_16_64);
   const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
   const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
@@ -207,7 +207,7 @@
   in[1] = _mm_shuffle_epi32(in[1], 0x4E);
 }
 
-void iadst4_sse2(__m128i *in) {
+void aom_iadst4_sse2(__m128i *in) {
   const __m128i k__sinpi_p01_p04 = pair_set_epi16(sinpi_1_9, sinpi_4_9);
   const __m128i k__sinpi_p03_p02 = pair_set_epi16(sinpi_3_9, sinpi_2_9);
   const __m128i k__sinpi_p02_m01 = pair_set_epi16(sinpi_2_9, -sinpi_1_9);
@@ -533,7 +533,7 @@
   RECON_AND_STORE(dest + 7 * stride, dc_value);
 }
 
-void idct8_sse2(__m128i *in) {
+void aom_idct8_sse2(__m128i *in) {
   const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
   const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
   const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
@@ -558,7 +558,7 @@
         in[4], in[5], in[6], in[7]);
 }
 
-void iadst8_sse2(__m128i *in) {
+void aom_iadst8_sse2(__m128i *in) {
   const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64);
   const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64);
   const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64);
@@ -2114,13 +2114,13 @@
   in[15] = _mm_sub_epi16(s[0], s[15]);
 }
 
-void idct16_sse2(__m128i *in0, __m128i *in1) {
+void aom_idct16_sse2(__m128i *in0, __m128i *in1) {
   array_transpose_16x16(in0, in1);
   idct16_8col(in0);
   idct16_8col(in1);
 }
 
-void iadst16_sse2(__m128i *in0, __m128i *in1) {
+void aom_iadst16_sse2(__m128i *in0, __m128i *in1) {
   array_transpose_16x16(in0, in1);
   iadst16_8col(in0);
   iadst16_8col(in1);
@@ -3596,7 +3596,7 @@
 
   if (!test) {
     // Do the row transform
-    idct4_sse2(inptr);
+    aom_idct4_sse2(inptr);
 
     // Check the min & max values
     max_input = _mm_max_epi16(inptr[0], inptr[1]);
@@ -3632,7 +3632,7 @@
   }
 
   if (optimised_cols) {
-    idct4_sse2(inptr);
+    aom_idct4_sse2(inptr);
 
     // Final round and shift
     inptr[0] = _mm_add_epi16(inptr[0], eight);
@@ -3712,7 +3712,7 @@
 
   if (!test) {
     // Do the row transform
-    idct8_sse2(inptr);
+    aom_idct8_sse2(inptr);
 
     // Find the min & max for the column transform
     max_input = _mm_max_epi16(inptr[0], inptr[1]);
@@ -3749,7 +3749,7 @@
   }
 
   if (optimised_cols) {
-    idct8_sse2(inptr);
+    aom_idct8_sse2(inptr);
 
     // Final round & shift and Reconstruction and Store
     {
@@ -3813,7 +3813,7 @@
 
   if (!test) {
     // Do the row transform
-    idct8_sse2(inptr);
+    aom_idct8_sse2(inptr);
 
     // Find the min & max for the column transform
     // N.B. Only first 4 cols contain non-zero coeffs
@@ -3852,7 +3852,7 @@
   }
 
   if (optimised_cols) {
-    idct8_sse2(inptr);
+    aom_idct8_sse2(inptr);
 
     // Final round & shift and Reconstruction and Store
     {
@@ -3918,7 +3918,7 @@
 
   if (!test) {
     // Do the row transform
-    idct16_sse2(inptr, inptr + 16);
+    aom_idct16_sse2(inptr, inptr + 16);
 
     // Find the min & max for the column transform
     max_input = _mm_max_epi16(inptr[0], inptr[1]);
@@ -3960,7 +3960,7 @@
   }
 
   if (optimised_cols) {
-    idct16_sse2(inptr, inptr + 16);
+    aom_idct16_sse2(inptr, inptr + 16);
 
     // Final round & shift and Reconstruction and Store
     {
@@ -4033,7 +4033,7 @@
 
   if (!test) {
     // Do the row transform (N.B. This transposes inptr)
-    idct16_sse2(inptr, inptr + 16);
+    aom_idct16_sse2(inptr, inptr + 16);
 
     // Find the min & max for the column transform
     // N.B. Only first 4 cols contain non-zero coeffs
@@ -4078,7 +4078,7 @@
   }
 
   if (optimised_cols) {
-    idct16_sse2(inptr, inptr + 16);
+    aom_idct16_sse2(inptr, inptr + 16);
 
     // Final round & shift and Reconstruction and Store
     {
diff --git a/aom_dsp/x86/inv_txfm_sse2.h b/aom_dsp/x86/inv_txfm_sse2.h
index c271b28..4ebb34d 100644
--- a/aom_dsp/x86/inv_txfm_sse2.h
+++ b/aom_dsp/x86/inv_txfm_sse2.h
@@ -197,12 +197,12 @@
 
 void iadst16_8col(__m128i *in);
 void idct16_8col(__m128i *in);
-void idct4_sse2(__m128i *in);
-void idct8_sse2(__m128i *in);
-void idct16_sse2(__m128i *in0, __m128i *in1);
-void iadst4_sse2(__m128i *in);
-void iadst8_sse2(__m128i *in);
-void iadst16_sse2(__m128i *in0, __m128i *in1);
+void aom_idct4_sse2(__m128i *in);
+void aom_idct8_sse2(__m128i *in);
+void aom_idct16_sse2(__m128i *in0, __m128i *in1);
+void aom_iadst4_sse2(__m128i *in);
+void aom_iadst8_sse2(__m128i *in);
+void aom_iadst16_sse2(__m128i *in0, __m128i *in1);
 void idct32_8col(__m128i *in0, __m128i *in1);
 
 #endif  // AOM_DSP_X86_INV_TXFM_SSE2_H_
diff --git a/av1/common/blockd.h b/av1/common/blockd.h
index 65f7440..43843cd 100644
--- a/av1/common/blockd.h
+++ b/av1/common/blockd.h
@@ -443,14 +443,6 @@
 }
 #endif  // CONFIG_SUPERTX
 
-static INLINE int get_tx1d_width(TX_SIZE tx_size) {
-  return num_4x4_blocks_wide_txsize_lookup[tx_size] << 2;
-}
-
-static INLINE int get_tx1d_height(TX_SIZE tx_size) {
-  return num_4x4_blocks_high_txsize_lookup[tx_size] << 2;
-}
-
 static INLINE int get_tx2d_size(TX_SIZE tx_size) {
   return num_4x4_blocks_txsize_lookup[tx_size] << 4;
 }
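
Note: the two 1-D size helpers are removed here; get_tx2d_size() stays and
returns the transform block's area in pixels (16 pixels per 4x4 block):

    /* e.g. get_tx2d_size(TX_8X8)
         == num_4x4_blocks_txsize_lookup[TX_8X8] << 4 == 4 << 4 == 64 */
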
diff --git a/av1/common/dering.c b/av1/common/dering.c
index c21d4e5..4519031 100644
--- a/av1/common/dering.c
+++ b/av1/common/dering.c
@@ -54,23 +54,33 @@
   unsigned char *bskip;
   int dir[OD_DERING_NBLOCKS][OD_DERING_NBLOCKS] = { { 0 } };
   int stride;
-  int bsize[3];
-  int dec[3];
+  int bsize_x[3];
+  int bsize_y[3];
+  int dec_x[3];
+  int dec_y[3];
   int pli;
   int coeff_shift = AOMMAX(cm->bit_depth - 8, 0);
+  int nplanes;
+  if (xd->plane[1].subsampling_x == xd->plane[1].subsampling_y &&
+      xd->plane[2].subsampling_x == xd->plane[2].subsampling_y)
+    nplanes = 3;
+  else
+    nplanes = 1;
   nvsb = (cm->mi_rows + MAX_MIB_SIZE - 1) / MAX_MIB_SIZE;
   nhsb = (cm->mi_cols + MAX_MIB_SIZE - 1) / MAX_MIB_SIZE;
   bskip = aom_malloc(sizeof(*bskip) * cm->mi_rows * cm->mi_cols);
   av1_setup_dst_planes(xd->plane, frame, 0, 0);
-  for (pli = 0; pli < 3; pli++) {
-    dec[pli] = xd->plane[pli].subsampling_x;
-    bsize[pli] = 8 >> dec[pli];
+  for (pli = 0; pli < nplanes; pli++) {
+    dec_x[pli] = xd->plane[pli].subsampling_x;
+    dec_y[pli] = xd->plane[pli].subsampling_y;
+    bsize_x[pli] = 8 >> dec_x[pli];
+    bsize_y[pli] = 8 >> dec_y[pli];
   }
-  stride = bsize[0] * cm->mi_cols;
-  for (pli = 0; pli < 3; pli++) {
+  stride = bsize_x[0] * cm->mi_cols;
+  for (pli = 0; pli < nplanes; pli++) {
     src[pli] = aom_malloc(sizeof(*src) * cm->mi_rows * cm->mi_cols * 64);
-    for (r = 0; r < bsize[pli] * cm->mi_rows; ++r) {
-      for (c = 0; c < bsize[pli] * cm->mi_cols; ++c) {
+    for (r = 0; r < bsize_y[pli] * cm->mi_rows; ++r) {
+      for (c = 0; c < bsize_x[pli] * cm->mi_cols; ++c) {
 #if CONFIG_AOM_HIGHBITDEPTH
         if (cm->use_highbitdepth) {
           src[pli][r * stride + c] = CONVERT_TO_SHORTPTR(
@@ -104,7 +114,7 @@
                             ->mbmi.dering_gain);
       if (level == 0 || sb_all_skip(cm, sbr * MAX_MIB_SIZE, sbc * MAX_MIB_SIZE))
         continue;
-      for (pli = 0; pli < 3; pli++) {
+      for (pli = 0; pli < nplanes; pli++) {
         int16_t dst[MAX_MIB_SIZE * MAX_MIB_SIZE * 8 * 8];
         int threshold;
         /* FIXME: This is a temporary hack that uses more conservative
@@ -114,27 +124,29 @@
         else
           threshold = level << coeff_shift;
         if (threshold == 0) continue;
-        od_dering(dst, MAX_MIB_SIZE * bsize[pli],
-                  &src[pli][sbr * stride * bsize[pli] * MAX_MIB_SIZE +
-                            sbc * bsize[pli] * MAX_MIB_SIZE],
-                  stride, nhb, nvb, sbc, sbr, nhsb, nvsb, dec[pli], dir, pli,
+        od_dering(dst, MAX_MIB_SIZE * bsize_x[pli],
+                  &src[pli][sbr * stride * bsize_x[pli] * MAX_MIB_SIZE +
+                            sbc * bsize_x[pli] * MAX_MIB_SIZE],
+                  stride, nhb, nvb, sbc, sbr, nhsb, nvsb, dec_x[pli],
+                  dec_y[pli], dir, pli,
                   &bskip[MAX_MIB_SIZE * sbr * cm->mi_cols + MAX_MIB_SIZE * sbc],
                   cm->mi_cols, threshold, coeff_shift);
-        for (r = 0; r < bsize[pli] * nvb; ++r) {
-          for (c = 0; c < bsize[pli] * nhb; ++c) {
+        for (r = 0; r < bsize_y[pli] * nvb; ++r) {
+          for (c = 0; c < bsize_x[pli] * nhb; ++c) {
 #if CONFIG_AOM_HIGHBITDEPTH
             if (cm->use_highbitdepth) {
               CONVERT_TO_SHORTPTR(xd->plane[pli].dst.buf)
               [xd->plane[pli].dst.stride *
-                   (bsize[pli] * MAX_MIB_SIZE * sbr + r) +
-               sbc * bsize[pli] * MAX_MIB_SIZE + c] =
-                  dst[r * MAX_MIB_SIZE * bsize[pli] + c];
+                   (bsize_x[pli] * MAX_MIB_SIZE * sbr + r) +
+               sbc * bsize_x[pli] * MAX_MIB_SIZE + c] =
+                  dst[r * MAX_MIB_SIZE * bsize_x[pli] + c];
             } else {
 #endif
-              xd->plane[pli].dst.buf[xd->plane[pli].dst.stride *
-                                         (bsize[pli] * MAX_MIB_SIZE * sbr + r) +
-                                     sbc * bsize[pli] * MAX_MIB_SIZE + c] =
-                  dst[r * MAX_MIB_SIZE * bsize[pli] + c];
+              xd->plane[pli]
+                  .dst.buf[xd->plane[pli].dst.stride *
+                               (bsize_x[pli] * MAX_MIB_SIZE * sbr + r) +
+                           sbc * bsize_x[pli] * MAX_MIB_SIZE + c] =
+                  dst[r * MAX_MIB_SIZE * bsize_x[pli] + c];
 #if CONFIG_AOM_HIGHBITDEPTH
             }
 #endif
@@ -143,7 +155,7 @@
       }
     }
   }
-  for (pli = 0; pli < 3; pli++) {
+  for (pli = 0; pli < nplanes; pli++) {
     aom_free(src[pli]);
   }
   aom_free(bskip);
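
Note: deringing now tracks horizontal and vertical subsampling independently
(dec_x/dec_y, bsize_x/bsize_y) and only touches chroma when both chroma planes
are subsampled symmetrically; anything else falls back to luma-only for now.
Concretely:

    /* 4:2:0  subsampling_x == subsampling_y == 1    -> nplanes = 3
       4:4:4  subsampling_x == subsampling_y == 0    -> nplanes = 3
       4:2:2  subsampling_x == 1, subsampling_y == 0 -> nplanes = 1 (luma only) */
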
diff --git a/av1/common/entropymode.c b/av1/common/entropymode.c
index 52dc8f1..f23ac96 100644
--- a/av1/common/entropymode.c
+++ b/av1/common/entropymode.c
@@ -911,11 +911,13 @@
   // this (or similar) bug: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=59124
   int scores[PALETTE_MAX_SIZE + 10];
   const int weights[4] = { 3, 2, 3, 2 };
-  int color_ctx = 0;
+  int color_ctx_hash;
+  int color_ctx;
   int color_neighbors[4];
   int inverse_color_order[PALETTE_MAX_SIZE];
   assert(n <= PALETTE_MAX_SIZE);
 
+  // Get color indices of neighbors.
   color_neighbors[0] = (c - 1 >= 0) ? color_map[r * cols + c - 1] : -1;
   color_neighbors[1] =
       (c - 1 >= 0 && r - 1 >= 0) ? color_map[(r - 1) * cols + c - 1] : -1;
@@ -960,15 +962,19 @@
     }
   }
 
-  for (i = 0; i < 4; ++i) color_ctx = color_ctx * 11 + scores[i];
+  // Get hash value of context.
+  color_ctx_hash = 0;
+  for (i = 0; i < 4; ++i) color_ctx_hash = color_ctx_hash * 11 + scores[i];
 
-  for (i = 0; i < PALETTE_COLOR_CONTEXTS; ++i)
-    if (color_ctx == palette_color_context_lookup[i]) {
+  // Lookup context from hash.
+  color_ctx = 0;  // Default.
+  for (i = 0; i < PALETTE_COLOR_CONTEXTS; ++i) {
+    if (color_ctx_hash == palette_color_context_lookup[i]) {
       color_ctx = i;
       break;
     }
+  }
 
-  if (color_ctx >= PALETTE_COLOR_CONTEXTS) color_ctx = 0;
   if (color_idx != NULL) {
     *color_idx = inverse_color_order[color_map[r * cols + c]];
   }
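
Note: the context derivation is now split into an explicit hash step and a
lookup step. Previously a single color_ctx variable held the base-11 hash and
was overwritten by the loop index on a match; on a miss, the trailing range
check only reset it when the stale hash happened to be >=
PALETTE_COLOR_CONTEXTS, so a small unmatched hash could, in principle, slip
through as a bogus context index. A worked example of the hash:

    /* scores = {2, 0, 1, 9}:
       color_ctx_hash = ((2 * 11 + 0) * 11 + 1) * 11 + 9
                      = 2 * 11^3 + 0 * 11^2 + 1 * 11 + 9 = 2682
       color_ctx = index of 2682 in palette_color_context_lookup[], else 0 */
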
diff --git a/av1/common/entropymv.c b/av1/common/entropymv.c
index a80165e..1ed4dbb 100644
--- a/av1/common/entropymv.c
+++ b/av1/common/entropymv.c
@@ -43,21 +43,21 @@
 
 static const nmv_context default_nmv_context = {
   { 32, 64, 96 },  // joints
-#if CONFIG_DAALA_EC
+#if CONFIG_DAALA_EC || CONFIG_RANS
   { 0, 0, 0, 0 },  // joint_cdf is computed from joints in av1_init_mv_probs()
 #endif
   { {
         // Vertical component
         128,                                                   // sign
         { 224, 144, 192, 168, 192, 176, 192, 198, 198, 245 },  // class
-#if CONFIG_DAALA_EC
+#if CONFIG_DAALA_EC || CONFIG_RANS
         { 0 },  // class_cdf is computed from class in av1_init_mv_probs()
 #endif
         { 216 },                                               // class0
         { 136, 140, 148, 160, 176, 192, 224, 234, 234, 240 },  // bits
         { { 128, 128, 64 }, { 96, 112, 64 } },                 // class0_fp
         { 64, 96, 64 },                                        // fp
-#if CONFIG_DAALA_EC
+#if CONFIG_DAALA_EC || CONFIG_RANS
         { { 0 }, { 0 } },  // class0_fp_cdf is computed in av1_init_mv_probs()
         { 0 },             // fp_cdf is computed from fp in av1_init_mv_probs()
 #endif
@@ -68,14 +68,14 @@
         // Horizontal component
         128,                                                   // sign
         { 216, 128, 176, 160, 176, 176, 192, 198, 198, 208 },  // class
-#if CONFIG_DAALA_EC
+#if CONFIG_DAALA_EC || CONFIG_RANS
         { 0 },  // class_cdf is computed from class in av1_init_mv_probs()
 #endif
         { 208 },                                               // class0
         { 136, 140, 148, 160, 176, 192, 224, 234, 234, 240 },  // bits
         { { 128, 128, 64 }, { 96, 112, 64 } },                 // class0_fp
         { 64, 96, 64 },                                        // fp
-#if CONFIG_DAALA_EC
+#if CONFIG_DAALA_EC || CONFIG_RANS
         { { 0 }, { 0 } },  // class0_fp_cdf is computed in av1_init_mv_probs()
         { 0 },             // fp_cdf is computed from fp in av1_init_mv_probs()
 #endif
@@ -149,13 +149,6 @@
   return c;
 }
 
-// TODO(jingning): This idle function is intentionally left as is for
-// experimental purpose.
-int av1_use_mv_hp(const MV *ref) {
-  (void)ref;
-  return 1;
-}
-
 static void inc_mv_component(int v, nmv_component_counts *comp_counts, int incr,
                              int usehp) {
   int s, z, c, o, d, e, f;
@@ -279,7 +272,7 @@
   for (i = 0; i < NMV_CONTEXTS; ++i) cm->fc->nmvc[i] = default_nmv_context;
 #else
   cm->fc->nmvc = default_nmv_context;
-#if CONFIG_DAALA_EC
+#if CONFIG_DAALA_EC || CONFIG_RANS
   {
     int i, j;
     av1_tree_to_cdf(av1_mv_joint_tree, cm->fc->nmvc.joints,
diff --git a/av1/common/entropymv.h b/av1/common/entropymv.h
index f308ef3..c215d23 100644
--- a/av1/common/entropymv.h
+++ b/av1/common/entropymv.h
@@ -27,7 +27,6 @@
 void av1_init_mv_probs(struct AV1Common *cm);
 
 void av1_adapt_mv_probs(struct AV1Common *cm, int usehp);
-int av1_use_mv_hp(const MV *ref);
 
 #define MV_UPDATE_PROB 252
 
@@ -85,14 +84,14 @@
 typedef struct {
   aom_prob sign;
   aom_prob classes[MV_CLASSES - 1];
-#if CONFIG_DAALA_EC
+#if CONFIG_DAALA_EC || CONFIG_RANS
   aom_cdf_prob class_cdf[MV_CLASSES];
 #endif
   aom_prob class0[CLASS0_SIZE - 1];
   aom_prob bits[MV_OFFSET_BITS];
   aom_prob class0_fp[CLASS0_SIZE][MV_FP_SIZE - 1];
   aom_prob fp[MV_FP_SIZE - 1];
-#if CONFIG_DAALA_EC
+#if CONFIG_DAALA_EC || CONFIG_RANS
   aom_cdf_prob class0_fp_cdf[CLASS0_SIZE][MV_FP_SIZE];
   aom_cdf_prob fp_cdf[MV_FP_SIZE];
 #endif
@@ -102,7 +101,7 @@
 
 typedef struct {
   aom_prob joints[MV_JOINTS - 1];
-#if CONFIG_DAALA_EC
+#if CONFIG_DAALA_EC || CONFIG_RANS
   aom_cdf_prob joint_cdf[MV_JOINTS];
 #endif
   nmv_component comps[2];
diff --git a/av1/common/idct.c b/av1/common/idct.c
index 4f33f9b..81581a4 100644
--- a/av1/common/idct.c
+++ b/av1/common/idct.c
@@ -70,7 +70,7 @@
   for (i = 0; i < 16; ++i) {
     output[i] = input[16 + i] * 4;
   }
-  idct16_c(inputhalf, output + 16);
+  aom_idct16_c(inputhalf, output + 16);
   // Note overall scaling factor is 4 times orthogonal
 }
 
@@ -241,24 +241,24 @@
 void av1_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride,
                          int tx_type) {
   static const transform_2d IHT_4[] = {
-    { idct4_c, idct4_c },    // DCT_DCT
-    { iadst4_c, idct4_c },   // ADST_DCT
-    { idct4_c, iadst4_c },   // DCT_ADST
-    { iadst4_c, iadst4_c },  // ADST_ADST
+    { aom_idct4_c, aom_idct4_c },    // DCT_DCT  = 0
+    { aom_iadst4_c, aom_idct4_c },   // ADST_DCT = 1
+    { aom_idct4_c, aom_iadst4_c },   // DCT_ADST = 2
+    { aom_iadst4_c, aom_iadst4_c },  // ADST_ADST = 3
 #if CONFIG_EXT_TX
-    { iadst4_c, idct4_c },   // FLIPADST_DCT
-    { idct4_c, iadst4_c },   // DCT_FLIPADST
-    { iadst4_c, iadst4_c },  // FLIPADST_FLIPADST
-    { iadst4_c, iadst4_c },  // ADST_FLIPADST
-    { iadst4_c, iadst4_c },  // FLIPADST_ADST
-    { iidtx4_c, iidtx4_c },  // IDTX
-    { idct4_c, iidtx4_c },   // V_DCT
-    { iidtx4_c, idct4_c },   // H_DCT
-    { iadst4_c, iidtx4_c },  // V_ADST
-    { iidtx4_c, iadst4_c },  // H_ADST
-    { iadst4_c, iidtx4_c },  // V_FLIPADST
-    { iidtx4_c, iadst4_c },  // H_FLIPADST
-#endif                       // CONFIG_EXT_TX
+    { aom_iadst4_c, aom_idct4_c },   // FLIPADST_DCT
+    { aom_idct4_c, aom_iadst4_c },   // DCT_FLIPADST
+    { aom_iadst4_c, aom_iadst4_c },  // FLIPADST_FLIPADST
+    { aom_iadst4_c, aom_iadst4_c },  // ADST_FLIPADST
+    { aom_iadst4_c, aom_iadst4_c },  // FLIPADST_ADST
+    { iidtx4_c, iidtx4_c },          // IDTX
+    { aom_idct4_c, iidtx4_c },       // V_DCT
+    { iidtx4_c, aom_idct4_c },       // H_DCT
+    { aom_iadst4_c, iidtx4_c },      // V_ADST
+    { iidtx4_c, aom_iadst4_c },      // H_ADST
+    { aom_iadst4_c, iidtx4_c },      // V_FLIPADST
+    { iidtx4_c, aom_iadst4_c },      // H_FLIPADST
+#endif                               // CONFIG_EXT_TX
   };
 
   int i, j;
@@ -305,22 +305,22 @@
 void av1_iht4x8_32_add_c(const tran_low_t *input, uint8_t *dest, int stride,
                          int tx_type) {
   static const transform_2d IHT_4x8[] = {
-    { idct8_c, idct4_c },    // DCT_DCT
-    { iadst8_c, idct4_c },   // ADST_DCT
-    { idct8_c, iadst4_c },   // DCT_ADST
-    { iadst8_c, iadst4_c },  // ADST_ADST
-    { iadst8_c, idct4_c },   // FLIPADST_DCT
-    { idct8_c, iadst4_c },   // DCT_FLIPADST
-    { iadst8_c, iadst4_c },  // FLIPADST_FLIPADST
-    { iadst8_c, iadst4_c },  // ADST_FLIPADST
-    { iadst8_c, iadst4_c },  // FLIPADST_ADST
-    { iidtx8_c, iidtx4_c },  // IDTX
-    { idct8_c, iidtx4_c },   // V_DCT
-    { iidtx8_c, idct4_c },   // H_DCT
-    { iadst8_c, iidtx4_c },  // V_ADST
-    { iidtx8_c, iadst4_c },  // H_ADST
-    { iadst8_c, iidtx4_c },  // V_FLIPADST
-    { iidtx8_c, iadst4_c },  // H_FLIPADST
+    { aom_idct8_c, aom_idct4_c },    // DCT_DCT
+    { aom_iadst8_c, aom_idct4_c },   // ADST_DCT
+    { aom_idct8_c, aom_iadst4_c },   // DCT_ADST
+    { aom_iadst8_c, aom_iadst4_c },  // ADST_ADST
+    { aom_iadst8_c, aom_idct4_c },   // FLIPADST_DCT
+    { aom_idct8_c, aom_iadst4_c },   // DCT_FLIPADST
+    { aom_iadst8_c, aom_iadst4_c },  // FLIPADST_FLIPADST
+    { aom_iadst8_c, aom_iadst4_c },  // ADST_FLIPADST
+    { aom_iadst8_c, aom_iadst4_c },  // FLIPADST_ADST
+    { iidtx8_c, iidtx4_c },          // IDTX
+    { aom_idct8_c, iidtx4_c },       // V_DCT
+    { iidtx8_c, aom_idct4_c },       // H_DCT
+    { aom_iadst8_c, iidtx4_c },      // V_ADST
+    { iidtx8_c, aom_iadst4_c },      // H_ADST
+    { aom_iadst8_c, iidtx4_c },      // V_FLIPADST
+    { iidtx8_c, aom_iadst4_c },      // H_FLIPADST
   };
 
   const int n = 4;
@@ -358,22 +358,22 @@
 void av1_iht8x4_32_add_c(const tran_low_t *input, uint8_t *dest, int stride,
                          int tx_type) {
   static const transform_2d IHT_8x4[] = {
-    { idct4_c, idct8_c },    // DCT_DCT
-    { iadst4_c, idct8_c },   // ADST_DCT
-    { idct4_c, iadst8_c },   // DCT_ADST
-    { iadst4_c, iadst8_c },  // ADST_ADST
-    { iadst4_c, idct8_c },   // FLIPADST_DCT
-    { idct4_c, iadst8_c },   // DCT_FLIPADST
-    { iadst4_c, iadst8_c },  // FLIPADST_FLIPADST
-    { iadst4_c, iadst8_c },  // ADST_FLIPADST
-    { iadst4_c, iadst8_c },  // FLIPADST_ADST
-    { iidtx4_c, iidtx8_c },  // IDTX
-    { idct4_c, iidtx8_c },   // V_DCT
-    { iidtx4_c, idct8_c },   // H_DCT
-    { iadst4_c, iidtx8_c },  // V_ADST
-    { iidtx4_c, iadst8_c },  // H_ADST
-    { iadst4_c, iidtx8_c },  // V_FLIPADST
-    { iidtx4_c, iadst8_c },  // H_FLIPADST
+    { aom_idct4_c, aom_idct8_c },    // DCT_DCT
+    { aom_iadst4_c, aom_idct8_c },   // ADST_DCT
+    { aom_idct4_c, aom_iadst8_c },   // DCT_ADST
+    { aom_iadst4_c, aom_iadst8_c },  // ADST_ADST
+    { aom_iadst4_c, aom_idct8_c },   // FLIPADST_DCT
+    { aom_idct4_c, aom_iadst8_c },   // DCT_FLIPADST
+    { aom_iadst4_c, aom_iadst8_c },  // FLIPADST_FLIPADST
+    { aom_iadst4_c, aom_iadst8_c },  // ADST_FLIPADST
+    { aom_iadst4_c, aom_iadst8_c },  // FLIPADST_ADST
+    { iidtx4_c, iidtx8_c },          // IDTX
+    { aom_idct4_c, iidtx8_c },       // V_DCT
+    { iidtx4_c, aom_idct8_c },       // H_DCT
+    { aom_iadst4_c, iidtx8_c },      // V_ADST
+    { iidtx4_c, aom_iadst8_c },      // H_ADST
+    { aom_iadst4_c, iidtx8_c },      // V_FLIPADST
+    { iidtx4_c, aom_iadst8_c },      // H_FLIPADST
   };
   const int n = 4;
   const int n2 = 8;
@@ -411,22 +411,22 @@
 void av1_iht8x16_128_add_c(const tran_low_t *input, uint8_t *dest, int stride,
                            int tx_type) {
   static const transform_2d IHT_8x16[] = {
-    { idct16_c, idct8_c },    // DCT_DCT
-    { iadst16_c, idct8_c },   // ADST_DCT
-    { idct16_c, iadst8_c },   // DCT_ADST
-    { iadst16_c, iadst8_c },  // ADST_ADST
-    { iadst16_c, idct8_c },   // FLIPADST_DCT
-    { idct16_c, iadst8_c },   // DCT_FLIPADST
-    { iadst16_c, iadst8_c },  // FLIPADST_FLIPADST
-    { iadst16_c, iadst8_c },  // ADST_FLIPADST
-    { iadst16_c, iadst8_c },  // FLIPADST_ADST
-    { iidtx16_c, iidtx8_c },  // IDTX
-    { idct16_c, iidtx8_c },   // V_DCT
-    { iidtx16_c, idct8_c },   // H_DCT
-    { iadst16_c, iidtx8_c },  // V_ADST
-    { iidtx16_c, iadst8_c },  // H_ADST
-    { iadst16_c, iidtx8_c },  // V_FLIPADST
-    { iidtx16_c, iadst8_c },  // H_FLIPADST
+    { aom_idct16_c, aom_idct8_c },    // DCT_DCT
+    { aom_iadst16_c, aom_idct8_c },   // ADST_DCT
+    { aom_idct16_c, aom_iadst8_c },   // DCT_ADST
+    { aom_iadst16_c, aom_iadst8_c },  // ADST_ADST
+    { aom_iadst16_c, aom_idct8_c },   // FLIPADST_DCT
+    { aom_idct16_c, aom_iadst8_c },   // DCT_FLIPADST
+    { aom_iadst16_c, aom_iadst8_c },  // FLIPADST_FLIPADST
+    { aom_iadst16_c, aom_iadst8_c },  // ADST_FLIPADST
+    { aom_iadst16_c, aom_iadst8_c },  // FLIPADST_ADST
+    { iidtx16_c, iidtx8_c },          // IDTX
+    { aom_idct16_c, iidtx8_c },       // V_DCT
+    { iidtx16_c, aom_idct8_c },       // H_DCT
+    { aom_iadst16_c, iidtx8_c },      // V_ADST
+    { iidtx16_c, aom_iadst8_c },      // H_ADST
+    { aom_iadst16_c, iidtx8_c },      // V_FLIPADST
+    { iidtx16_c, aom_iadst8_c },      // H_FLIPADST
   };
 
   const int n = 8;
@@ -464,22 +464,22 @@
 void av1_iht16x8_128_add_c(const tran_low_t *input, uint8_t *dest, int stride,
                            int tx_type) {
   static const transform_2d IHT_16x8[] = {
-    { idct8_c, idct16_c },    // DCT_DCT
-    { iadst8_c, idct16_c },   // ADST_DCT
-    { idct8_c, iadst16_c },   // DCT_ADST
-    { iadst8_c, iadst16_c },  // ADST_ADST
-    { iadst8_c, idct16_c },   // FLIPADST_DCT
-    { idct8_c, iadst16_c },   // DCT_FLIPADST
-    { iadst8_c, iadst16_c },  // FLIPADST_FLIPADST
-    { iadst8_c, iadst16_c },  // ADST_FLIPADST
-    { iadst8_c, iadst16_c },  // FLIPADST_ADST
-    { iidtx8_c, iidtx16_c },  // IDTX
-    { idct8_c, iidtx16_c },   // V_DCT
-    { iidtx8_c, idct16_c },   // H_DCT
-    { iadst8_c, iidtx16_c },  // V_ADST
-    { iidtx8_c, iadst16_c },  // H_ADST
-    { iadst8_c, iidtx16_c },  // V_FLIPADST
-    { iidtx8_c, iadst16_c },  // H_FLIPADST
+    { aom_idct8_c, aom_idct16_c },    // DCT_DCT
+    { aom_iadst8_c, aom_idct16_c },   // ADST_DCT
+    { aom_idct8_c, aom_iadst16_c },   // DCT_ADST
+    { aom_iadst8_c, aom_iadst16_c },  // ADST_ADST
+    { aom_iadst8_c, aom_idct16_c },   // FLIPADST_DCT
+    { aom_idct8_c, aom_iadst16_c },   // DCT_FLIPADST
+    { aom_iadst8_c, aom_iadst16_c },  // FLIPADST_FLIPADST
+    { aom_iadst8_c, aom_iadst16_c },  // ADST_FLIPADST
+    { aom_iadst8_c, aom_iadst16_c },  // FLIPADST_ADST
+    { iidtx8_c, iidtx16_c },          // IDTX
+    { aom_idct8_c, iidtx16_c },       // V_DCT
+    { iidtx8_c, aom_idct16_c },       // H_DCT
+    { aom_iadst8_c, iidtx16_c },      // V_ADST
+    { iidtx8_c, aom_iadst16_c },      // H_ADST
+    { aom_iadst8_c, iidtx16_c },      // V_FLIPADST
+    { iidtx8_c, aom_iadst16_c },      // H_FLIPADST
   };
   const int n = 8;
   const int n2 = 16;
@@ -517,22 +517,22 @@
 void av1_iht16x32_512_add_c(const tran_low_t *input, uint8_t *dest, int stride,
                             int tx_type) {
   static const transform_2d IHT_16x32[] = {
-    { idct32_c, idct16_c },         // DCT_DCT
-    { ihalfright32_c, idct16_c },   // ADST_DCT
-    { idct32_c, iadst16_c },        // DCT_ADST
-    { ihalfright32_c, iadst16_c },  // ADST_ADST
-    { ihalfright32_c, idct16_c },   // FLIPADST_DCT
-    { idct32_c, iadst16_c },        // DCT_FLIPADST
-    { ihalfright32_c, iadst16_c },  // FLIPADST_FLIPADST
-    { ihalfright32_c, iadst16_c },  // ADST_FLIPADST
-    { ihalfright32_c, iadst16_c },  // FLIPADST_ADST
-    { iidtx32_c, iidtx16_c },       // IDTX
-    { idct32_c, iidtx16_c },        // V_DCT
-    { iidtx32_c, idct16_c },        // H_DCT
-    { ihalfright32_c, iidtx16_c },  // V_ADST
-    { iidtx32_c, iadst16_c },       // H_ADST
-    { ihalfright32_c, iidtx16_c },  // V_FLIPADST
-    { iidtx32_c, iadst16_c },       // H_FLIPADST
+    { aom_idct32_c, aom_idct16_c },     // DCT_DCT
+    { ihalfright32_c, aom_idct16_c },   // ADST_DCT
+    { aom_idct32_c, aom_iadst16_c },    // DCT_ADST
+    { ihalfright32_c, aom_iadst16_c },  // ADST_ADST
+    { ihalfright32_c, aom_idct16_c },   // FLIPADST_DCT
+    { aom_idct32_c, aom_iadst16_c },    // DCT_FLIPADST
+    { ihalfright32_c, aom_iadst16_c },  // FLIPADST_FLIPADST
+    { ihalfright32_c, aom_iadst16_c },  // ADST_FLIPADST
+    { ihalfright32_c, aom_iadst16_c },  // FLIPADST_ADST
+    { iidtx32_c, iidtx16_c },           // IDTX
+    { aom_idct32_c, iidtx16_c },        // V_DCT
+    { iidtx32_c, aom_idct16_c },        // H_DCT
+    { ihalfright32_c, iidtx16_c },      // V_ADST
+    { iidtx32_c, aom_iadst16_c },       // H_ADST
+    { ihalfright32_c, iidtx16_c },      // V_FLIPADST
+    { iidtx32_c, aom_iadst16_c },       // H_FLIPADST
   };
 
   const int n = 16;
@@ -570,22 +570,22 @@
 void av1_iht32x16_512_add_c(const tran_low_t *input, uint8_t *dest, int stride,
                             int tx_type) {
   static const transform_2d IHT_32x16[] = {
-    { idct16_c, idct32_c },         // DCT_DCT
-    { iadst16_c, idct32_c },        // ADST_DCT
-    { idct16_c, ihalfright32_c },   // DCT_ADST
-    { iadst16_c, ihalfright32_c },  // ADST_ADST
-    { iadst16_c, idct32_c },        // FLIPADST_DCT
-    { idct16_c, ihalfright32_c },   // DCT_FLIPADST
-    { iadst16_c, ihalfright32_c },  // FLIPADST_FLIPADST
-    { iadst16_c, ihalfright32_c },  // ADST_FLIPADST
-    { iadst16_c, ihalfright32_c },  // FLIPADST_ADST
-    { iidtx16_c, iidtx32_c },       // IDTX
-    { idct16_c, iidtx32_c },        // V_DCT
-    { iidtx16_c, idct32_c },        // H_DCT
-    { iadst16_c, iidtx32_c },       // V_ADST
-    { iidtx16_c, ihalfright32_c },  // H_ADST
-    { iadst16_c, iidtx32_c },       // V_FLIPADST
-    { iidtx16_c, ihalfright32_c },  // H_FLIPADST
+    { aom_idct16_c, aom_idct32_c },     // DCT_DCT
+    { aom_iadst16_c, aom_idct32_c },    // ADST_DCT
+    { aom_idct16_c, ihalfright32_c },   // DCT_ADST
+    { aom_iadst16_c, ihalfright32_c },  // ADST_ADST
+    { aom_iadst16_c, aom_idct32_c },    // FLIPADST_DCT
+    { aom_idct16_c, ihalfright32_c },   // DCT_FLIPADST
+    { aom_iadst16_c, ihalfright32_c },  // FLIPADST_FLIPADST
+    { aom_iadst16_c, ihalfright32_c },  // ADST_FLIPADST
+    { aom_iadst16_c, ihalfright32_c },  // FLIPADST_ADST
+    { iidtx16_c, iidtx32_c },           // IDTX
+    { aom_idct16_c, iidtx32_c },        // V_DCT
+    { iidtx16_c, aom_idct32_c },        // H_DCT
+    { aom_iadst16_c, iidtx32_c },       // V_ADST
+    { iidtx16_c, ihalfright32_c },      // H_ADST
+    { aom_iadst16_c, iidtx32_c },       // V_FLIPADST
+    { iidtx16_c, ihalfright32_c },      // H_FLIPADST
   };
   const int n = 16;
   const int n2 = 32;
@@ -624,24 +624,24 @@
 void av1_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride,
                          int tx_type) {
   static const transform_2d IHT_8[] = {
-    { idct8_c, idct8_c },    // DCT_DCT
-    { iadst8_c, idct8_c },   // ADST_DCT
-    { idct8_c, iadst8_c },   // DCT_ADST
-    { iadst8_c, iadst8_c },  // ADST_ADST
+    { aom_idct8_c, aom_idct8_c },    // DCT_DCT  = 0
+    { aom_iadst8_c, aom_idct8_c },   // ADST_DCT = 1
+    { aom_idct8_c, aom_iadst8_c },   // DCT_ADST = 2
+    { aom_iadst8_c, aom_iadst8_c },  // ADST_ADST = 3
 #if CONFIG_EXT_TX
-    { iadst8_c, idct8_c },   // FLIPADST_DCT
-    { idct8_c, iadst8_c },   // DCT_FLIPADST
-    { iadst8_c, iadst8_c },  // FLIPADST_FLIPADST
-    { iadst8_c, iadst8_c },  // ADST_FLIPADST
-    { iadst8_c, iadst8_c },  // FLIPADST_ADST
-    { iidtx8_c, iidtx8_c },  // IDTX
-    { idct8_c, iidtx8_c },   // V_DCT
-    { iidtx8_c, idct8_c },   // H_DCT
-    { iadst8_c, iidtx8_c },  // V_ADST
-    { iidtx8_c, iadst8_c },  // H_ADST
-    { iadst8_c, iidtx8_c },  // V_FLIPADST
-    { iidtx8_c, iadst8_c },  // H_FLIPADST
-#endif                       // CONFIG_EXT_TX
+    { aom_iadst8_c, aom_idct8_c },   // FLIPADST_DCT
+    { aom_idct8_c, aom_iadst8_c },   // DCT_FLIPADST
+    { aom_iadst8_c, aom_iadst8_c },  // FLIPADST_FLIPADST
+    { aom_iadst8_c, aom_iadst8_c },  // ADST_FLIPADST
+    { aom_iadst8_c, aom_iadst8_c },  // FLIPADST_ADST
+    { iidtx8_c, iidtx8_c },          // IDTX
+    { aom_idct8_c, iidtx8_c },       // V_DCT
+    { iidtx8_c, aom_idct8_c },       // H_DCT
+    { aom_iadst8_c, iidtx8_c },      // V_ADST
+    { iidtx8_c, aom_iadst8_c },      // H_ADST
+    { aom_iadst8_c, iidtx8_c },      // V_FLIPADST
+    { iidtx8_c, aom_iadst8_c },      // H_FLIPADST
+#endif                               // CONFIG_EXT_TX
   };
 
   int i, j;
@@ -687,24 +687,24 @@
 void av1_iht16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int stride,
                             int tx_type) {
   static const transform_2d IHT_16[] = {
-    { idct16_c, idct16_c },    // DCT_DCT
-    { iadst16_c, idct16_c },   // ADST_DCT
-    { idct16_c, iadst16_c },   // DCT_ADST
-    { iadst16_c, iadst16_c },  // ADST_ADST
+    { aom_idct16_c, aom_idct16_c },    // DCT_DCT  = 0
+    { aom_iadst16_c, aom_idct16_c },   // ADST_DCT = 1
+    { aom_idct16_c, aom_iadst16_c },   // DCT_ADST = 2
+    { aom_iadst16_c, aom_iadst16_c },  // ADST_ADST = 3
 #if CONFIG_EXT_TX
-    { iadst16_c, idct16_c },   // FLIPADST_DCT
-    { idct16_c, iadst16_c },   // DCT_FLIPADST
-    { iadst16_c, iadst16_c },  // FLIPADST_FLIPADST
-    { iadst16_c, iadst16_c },  // ADST_FLIPADST
-    { iadst16_c, iadst16_c },  // FLIPADST_ADST
-    { iidtx16_c, iidtx16_c },  // IDTX
-    { idct16_c, iidtx16_c },   // V_DCT
-    { iidtx16_c, idct16_c },   // H_DCT
-    { iadst16_c, iidtx16_c },  // V_ADST
-    { iidtx16_c, iadst16_c },  // H_ADST
-    { iadst16_c, iidtx16_c },  // V_FLIPADST
-    { iidtx16_c, iadst16_c },  // H_FLIPADST
-#endif                         // CONFIG_EXT_TX
+    { aom_iadst16_c, aom_idct16_c },   // FLIPADST_DCT
+    { aom_idct16_c, aom_iadst16_c },   // DCT_FLIPADST
+    { aom_iadst16_c, aom_iadst16_c },  // FLIPADST_FLIPADST
+    { aom_iadst16_c, aom_iadst16_c },  // ADST_FLIPADST
+    { aom_iadst16_c, aom_iadst16_c },  // FLIPADST_ADST
+    { iidtx16_c, iidtx16_c },          // IDTX
+    { aom_idct16_c, iidtx16_c },       // V_DCT
+    { iidtx16_c, aom_idct16_c },       // H_DCT
+    { aom_iadst16_c, iidtx16_c },      // V_ADST
+    { iidtx16_c, aom_iadst16_c },      // H_ADST
+    { aom_iadst16_c, iidtx16_c },      // V_FLIPADST
+    { iidtx16_c, aom_iadst16_c },      // H_FLIPADST
+#endif                                 // CONFIG_EXT_TX
   };
 
   int i, j;
@@ -751,18 +751,18 @@
 void av1_iht32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, int stride,
                              int tx_type) {
   static const transform_2d IHT_32[] = {
-    { idct32_c, idct32_c },              // DCT_DCT
-    { ihalfright32_c, idct32_c },        // ADST_DCT
-    { idct32_c, ihalfright32_c },        // DCT_ADST
+    { aom_idct32_c, aom_idct32_c },      // DCT_DCT
+    { ihalfright32_c, aom_idct32_c },    // ADST_DCT
+    { aom_idct32_c, ihalfright32_c },    // DCT_ADST
     { ihalfright32_c, ihalfright32_c },  // ADST_ADST
-    { ihalfright32_c, idct32_c },        // FLIPADST_DCT
-    { idct32_c, ihalfright32_c },        // DCT_FLIPADST
+    { ihalfright32_c, aom_idct32_c },    // FLIPADST_DCT
+    { aom_idct32_c, ihalfright32_c },    // DCT_FLIPADST
     { ihalfright32_c, ihalfright32_c },  // FLIPADST_FLIPADST
     { ihalfright32_c, ihalfright32_c },  // ADST_FLIPADST
     { ihalfright32_c, ihalfright32_c },  // FLIPADST_ADST
     { iidtx32_c, iidtx32_c },            // IDTX
-    { idct32_c, iidtx32_c },             // V_DCT
-    { iidtx32_c, idct32_c },             // H_DCT
+    { aom_idct32_c, iidtx32_c },         // V_DCT
+    { iidtx32_c, aom_idct32_c },         // H_DCT
     { ihalfright32_c, iidtx16_c },       // V_ADST
     { iidtx16_c, ihalfright32_c },       // H_ADST
     { ihalfright32_c, iidtx16_c },       // V_FLIPADST
diff --git a/av1/common/mvref_common.h b/av1/common/mvref_common.h
index 25ebbfd..55688a9 100644
--- a/av1/common/mvref_common.h
+++ b/av1/common/mvref_common.h
@@ -341,8 +341,7 @@
 }
 
 static INLINE void lower_mv_precision(MV *mv, int allow_hp) {
-  const int use_hp = allow_hp && av1_use_mv_hp(mv);
-  if (!use_hp) {
+  if (!allow_hp) {
     if (mv->row & 1) mv->row += (mv->row > 0 ? -1 : 1);
     if (mv->col & 1) mv->col += (mv->col > 0 ? -1 : 1);
   }
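
Note: av1_use_mv_hp() unconditionally returned 1 (see its removal in
entropymv.c above), so "allow_hp && av1_use_mv_hp(mv)" reduces to plain
allow_hp. The rounding itself is unchanged: odd (eighth-pel) components are
nudged one unit toward zero, even ones pass through:

    /* allow_hp == 0: (row, col) = (5, -3) -> (4, -2)
       allow_hp == 1: values are left untouched */
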
diff --git a/av1/common/od_dering.c b/av1/common/od_dering.c
index 7aa704f..7ed49a4 100644
--- a/av1/common/od_dering.c
+++ b/av1/common/od_dering.c
@@ -183,6 +183,19 @@
   return (total_abs + 2) >> 2;
 }
 
+int od_filter_dering_direction_4x8(int16_t *y, int ystride, const int16_t *in,
+                                   int threshold, int dir) {
+  return od_filter_dering_direction_4x4(y, ystride, in, threshold, dir) +
+         od_filter_dering_direction_4x4(y + 4 * ystride, ystride,
+                                        in + 4 * OD_FILT_BSTRIDE, threshold, dir);
+}
+
+int od_filter_dering_direction_8x4(int16_t *y, int ystride, const int16_t *in,
+                                   int threshold, int dir) {
+  return od_filter_dering_direction_4x4(y, ystride, in, threshold, dir) +
+         od_filter_dering_direction_4x4(y + 4, ystride, in + 4, threshold, dir);
+}
+
 /* Smooth in the direction orthogonal to what was detected. */
 void od_filter_dering_orthogonal_8x8_c(int16_t *y, int ystride,
                                        const int16_t *in, int threshold,
@@ -241,6 +254,21 @@
   }
 }
 
+void od_filter_dering_orthogonal_4x8(int16_t *y, int ystride,
+                                     const int16_t *in, int threshold,
+                                     int dir) {
+  od_filter_dering_orthogonal_4x4(y, ystride, in, threshold, dir);
+  od_filter_dering_orthogonal_4x4(y + 4 * ystride, ystride,
+                                  in + 4 * OD_FILT_BSTRIDE, threshold, dir);
+}
+
+void od_filter_dering_orthogonal_8x4(int16_t *y, int ystride,
+                                     const int16_t *in, int threshold,
+                                     int dir) {
+  od_filter_dering_orthogonal_4x4(y, ystride, in, threshold, dir);
+  od_filter_dering_orthogonal_4x4(y + 4, ystride, in + 4, threshold, dir);
+}
+
 /* This table approximates x^0.16 with the index being log2(x). It is clamped
    to [-.5, 3]. The table is computed as:
    round(256*min(3, max(.5, 1.08*(sqrt(2)*2.^([0:17]+8)/256/256).^.16))) */
@@ -264,7 +292,7 @@
 
 void od_dering(int16_t *y, int ystride, const od_dering_in *x, int xstride,
                int nhb, int nvb, int sbx, int sby, int nhsb, int nvsb, int xdec,
-               int dir[OD_DERING_NBLOCKS][OD_DERING_NBLOCKS], int pli,
+               int ydec, int dir[OD_DERING_NBLOCKS][OD_DERING_NBLOCKS], int pli,
                unsigned char *bskip, int skip_stride, int threshold,
                int coeff_shift) {
   int i;
@@ -273,32 +301,35 @@
   int by;
   int16_t inbuf[OD_DERING_INBUF_SIZE];
   int16_t *in;
-  int bsize;
+  int bsize_x = 3 - xdec;
+  int bsize_y = 3 - ydec;
   int32_t var[OD_DERING_NBLOCKS][OD_DERING_NBLOCKS];
   int filter2_thresh[OD_DERING_NBLOCKS][OD_DERING_NBLOCKS];
   od_filter_dering_direction_func filter_dering_direction[OD_DERINGSIZES] = {
-    od_filter_dering_direction_4x4, od_filter_dering_direction_8x8
+    od_filter_dering_direction_8x8, od_filter_dering_direction_8x4,
+    od_filter_dering_direction_4x8, od_filter_dering_direction_4x4
   };
   od_filter_dering_orthogonal_func filter_dering_orthogonal[OD_DERINGSIZES] = {
-    od_filter_dering_orthogonal_4x4, od_filter_dering_orthogonal_8x8
+    od_filter_dering_orthogonal_8x8, od_filter_dering_orthogonal_8x4,
+    od_filter_dering_orthogonal_4x8, od_filter_dering_orthogonal_4x4
   };
-  bsize = 3 - xdec;
+  int filter_idx = xdec * 2 + ydec;
   in = inbuf + OD_FILT_BORDER * OD_FILT_BSTRIDE + OD_FILT_BORDER;
   /* We avoid filtering the pixels for which some of the pixels to average
      are outside the frame. We could change the filter instead, but it would
      add special cases for any future vectorization. */
   for (i = 0; i < OD_DERING_INBUF_SIZE; i++) inbuf[i] = OD_DERING_VERY_LARGE;
   for (i = -OD_FILT_BORDER * (sby != 0);
-       i < (nvb << bsize) + OD_FILT_BORDER * (sby != nvsb - 1); i++) {
+       i < (nvb << bsize_y) + OD_FILT_BORDER * (sby != nvsb - 1); i++) {
     for (j = -OD_FILT_BORDER * (sbx != 0);
-         j < (nhb << bsize) + OD_FILT_BORDER * (sbx != nhsb - 1); j++) {
+         j < (nhb << bsize_x) + OD_FILT_BORDER * (sbx != nhsb - 1); j++) {
       in[i * OD_FILT_BSTRIDE + j] = x[i * xstride + j];
     }
   }
   /* Assume deringing filter is sparsely applied, so do one large copy rather
      than small copies later if deringing is skipped. */
-  for (i = 0; i < nvb << bsize; i++) {
-    for (j = 0; j < nhb << bsize; j++) {
+  for (i = 0; i < nvb << bsize_y; i++) {
+    for (j = 0; j < nhb << bsize_x; j++) {
       y[i * ystride + j] = in[i * OD_FILT_BSTRIDE + j];
     }
   }
@@ -316,9 +347,9 @@
            to be a little bit more aggressive on pure horizontal/vertical
            since the ringing there tends to be directional, so it doesn't
            get removed by the directional filtering. */
-        filter2_thresh[by][bx] = (filter_dering_direction[bsize - OD_LOG_BSIZE0])(
-            &y[(by * ystride << bsize) + (bx << bsize)], ystride,
-            &in[(by * OD_FILT_BSTRIDE << bsize) + (bx << bsize)],
+        filter2_thresh[by][bx] = (filter_dering_direction[filter_idx])(
+            &y[(by * ystride << bsize_y) + (bx << bsize_x)], ystride,
+            &in[(by * OD_FILT_BSTRIDE << bsize_y) + (bx << bsize_x)],
             od_adjust_thresh(threshold, var[by][bx]), dir[by][bx]);
       }
     }
@@ -326,25 +357,25 @@
     for (by = 0; by < nvb; by++) {
       for (bx = 0; bx < nhb; bx++) {
         if (bskip[by * skip_stride + bx]) continue;
-        filter2_thresh[by][bx] = (filter_dering_direction[bsize - OD_LOG_BSIZE0])(
-            &y[(by * ystride << bsize) + (bx << bsize)], ystride,
-            &in[(by * OD_FILT_BSTRIDE << bsize) + (bx << bsize)], threshold,
+        filter2_thresh[by][bx] = (filter_dering_direction[filter_idx])(
+            &y[(by * ystride << bsize_y) + (bx << bsize_x)], ystride,
+            &in[(by * OD_FILT_BSTRIDE << bsize_y) + (bx << bsize_x)], threshold,
             dir[by][bx]);
       }
     }
   }
-  for (i = 0; i < nvb << bsize; i++) {
-    for (j = 0; j < nhb << bsize; j++) {
+  for (i = 0; i < nvb << bsize_y; i++) {
+    for (j = 0; j < nhb << bsize_x; j++) {
       in[i * OD_FILT_BSTRIDE + j] = y[i * ystride + j];
     }
   }
   for (by = 0; by < nvb; by++) {
     for (bx = 0; bx < nhb; bx++) {
       if (bskip[by * skip_stride + bx] || filter2_thresh[by][bx] == 0) continue;
-      (filter_dering_orthogonal[bsize - OD_LOG_BSIZE0])(
-          &y[(by * ystride << bsize) + (bx << bsize)], ystride,
-          &in[(by * OD_FILT_BSTRIDE << bsize) + (bx << bsize)], filter2_thresh[by][bx],
-          dir[by][bx]);
+      (filter_dering_orthogonal[filter_idx])(
+          &y[(by * ystride << bsize_y) + (bx << bsize_x)], ystride,
+          &in[(by * OD_FILT_BSTRIDE << bsize_y) + (bx << bsize_x)],
+          filter2_thresh[by][bx], dir[by][bx]);
     }
   }
 }
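
Note: with vertical decimation now independent of horizontal, the dering
kernels come in four shapes (hence OD_DERINGSIZES going from 2 to 4 in
od_dering.h below), and the 4x8/8x4 variants are simply two 4x4 applications
stacked vertically or placed side by side. The tables are indexed by
filter_idx = xdec * 2 + ydec:

    /* (xdec, ydec) -> filter_idx -> block shape
       (0, 0)       -> 0          -> 8x8 (luma, 4:4:4 chroma)
       (0, 1)       -> 1          -> 8x4
       (1, 0)       -> 2          -> 4x8 (e.g. 4:2:2 chroma)
       (1, 1)       -> 3          -> 4x4 (4:2:0 chroma) */
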
diff --git a/av1/common/od_dering.h b/av1/common/od_dering.h
index c64439f..fc3a3ef 100644
--- a/av1/common/od_dering.h
+++ b/av1/common/od_dering.h
@@ -19,7 +19,7 @@
 typedef int16_t od_dering_in;
 #endif
 
-#define OD_DERINGSIZES (2)
+#define OD_DERINGSIZES (4)
 
 #define OD_DERING_NBLOCKS (OD_BSIZE_MAX / 8)
 
@@ -36,16 +36,24 @@
                                                  int threshold, int dir);
 void od_dering(int16_t *y, int ystride, const od_dering_in *x, int xstride,
                int nvb, int nhb, int sbx, int sby, int nhsb, int nvsb, int xdec,
-               int dir[OD_DERING_NBLOCKS][OD_DERING_NBLOCKS], int pli,
+               int ydec, int dir[OD_DERING_NBLOCKS][OD_DERING_NBLOCKS], int pli,
                unsigned char *bskip, int skip_stride, int threshold,
                int coeff_shift);
 int od_filter_dering_direction_4x4_c(int16_t *y, int ystride, const int16_t *in,
                                      int threshold, int dir);
+int od_filter_dering_direction_4x8(int16_t *y, int ystride, const int16_t *in,
+                                   int threshold, int dir);
+int od_filter_dering_direction_8x4(int16_t *y, int ystride, const int16_t *in,
+                                   int threshold, int dir);
 int od_filter_dering_direction_8x8_c(int16_t *y, int ystride, const int16_t *in,
                                      int threshold, int dir);
 void od_filter_dering_orthogonal_4x4_c(int16_t *y, int ystride,
                                        const int16_t *in, int threshold,
                                        int dir);
+void od_filter_dering_orthogonal_4x8(int16_t *y, int ystride, const int16_t *in,
+                                     int threshold, int dir);
+void od_filter_dering_orthogonal_8x4(int16_t *y, int ystride, const int16_t *in,
+                                     int threshold, int dir);
 void od_filter_dering_orthogonal_8x8_c(int16_t *y, int ystride,
                                        const int16_t *in, int threshold,
                                        int dir);
diff --git a/av1/common/reconinter.c b/av1/common/reconinter.c
index 6c4ae2a..66b6bfd 100644
--- a/av1/common/reconinter.c
+++ b/av1/common/reconinter.c
@@ -1889,6 +1889,7 @@
                                                   int dst_stride,
                                                   PREDICTION_MODE mode,
                                                   BLOCK_SIZE bsize, int plane) {
+  struct macroblockd_plane *const pd = &xd->plane[plane];
   BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, &xd->plane[plane]);
   const int bwl = b_width_log2_lookup[plane_bsize];
   const int bhl = b_height_log2_lookup[plane_bsize];
@@ -1897,14 +1898,14 @@
   TX_SIZE max_tx_size = max_txsize_lookup[plane_bsize];
 
   if (bwl == bhl) {
-    av1_predict_intra_block(xd, bwl, bhl, max_tx_size, mode, ref, ref_stride,
-                            dst, dst_stride, 0, 0, plane);
+    av1_predict_intra_block(xd, pd->width, pd->height, max_tx_size, mode, ref,
+                            ref_stride, dst, dst_stride, 0, 0, plane);
 
   } else if (bwl < bhl) {
     uint8_t *src_2 = ref + pxbw * ref_stride;
     uint8_t *dst_2 = dst + pxbw * dst_stride;
-    av1_predict_intra_block(xd, bwl, bhl, max_tx_size, mode, ref, ref_stride,
-                            dst, dst_stride, 0, 0, plane);
+    av1_predict_intra_block(xd, pd->width, pd->height, max_tx_size, mode, ref,
+                            ref_stride, dst, dst_stride, 0, 0, plane);
 #if CONFIG_AOM_HIGHBITDEPTH
     if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
       uint16_t *src_216 = CONVERT_TO_SHORTPTR(src_2);
@@ -1916,14 +1917,14 @@
     {
       memcpy(src_2 - ref_stride, dst_2 - dst_stride, sizeof(*src_2) * pxbw);
     }
-    av1_predict_intra_block(xd, bwl, bhl, max_tx_size, mode, src_2, ref_stride,
-                            dst_2, dst_stride, 0, 1 << bwl, plane);
+    av1_predict_intra_block(xd, pd->width, pd->height, max_tx_size, mode, src_2,
+                            ref_stride, dst_2, dst_stride, 0, 1 << bwl, plane);
   } else {  // bwl > bhl
     int i;
     uint8_t *src_2 = ref + pxbh;
     uint8_t *dst_2 = dst + pxbh;
-    av1_predict_intra_block(xd, bwl, bhl, max_tx_size, mode, ref, ref_stride,
-                            dst, dst_stride, 0, 0, plane);
+    av1_predict_intra_block(xd, pd->width, pd->height, max_tx_size, mode, ref,
+                            ref_stride, dst, dst_stride, 0, 0, plane);
 #if CONFIG_AOM_HIGHBITDEPTH
     if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
       uint16_t *src_216 = CONVERT_TO_SHORTPTR(src_2);
@@ -1936,8 +1937,8 @@
       for (i = 0; i < pxbh; ++i)
         src_2[i * ref_stride - 1] = dst_2[i * dst_stride - 1];
     }
-    av1_predict_intra_block(xd, bwl, bhl, max_tx_size, mode, src_2, ref_stride,
-                            dst_2, dst_stride, 1 << bhl, 0, plane);
+    av1_predict_intra_block(xd, pd->width, pd->height, max_tx_size, mode, src_2,
+                            ref_stride, dst_2, dst_stride, 1 << bhl, 0, plane);
   }
 }
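
These hunks hand av1_predict_intra_block() the plane block's pixel dimensions from macroblockd_plane instead of the 4x4-unit log2 values bwl/bhl. The two encodings differ by a factor of four; a small sanity-check sketch, assuming b_width_log2_lookup[] counts 4x4 columns (so BLOCK_16X16 maps to 2):

    #include <assert.h>

    /* pd->width is the block width in pixels; bwl is log2 of the width in
     * 4x4 units, so pd->width == 4 << bwl for square blocks. The constants
     * below are the documented lookup results, not live table reads. */
    static void check_width_encodings(void) {
      const int bwl = 2;             /* b_width_log2_lookup[BLOCK_16X16] */
      const int width_px = 4 << bwl; /* what pd->width now supplies */
      assert(width_px == 16);
    }
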
 
diff --git a/av1/common/x86/av1_inv_txfm_sse2.h b/av1/common/x86/av1_inv_txfm_sse2.h
index 3aab34c..a8bb6c1 100644
--- a/av1/common/x86/av1_inv_txfm_sse2.h
+++ b/av1/common/x86/av1_inv_txfm_sse2.h
@@ -175,11 +175,4 @@
   RECON_AND_STORE(dest + 15 * stride, in[15]);
 }
 
-void idct4_sse2(__m128i *in);
-void idct8_sse2(__m128i *in);
-void idct16_sse2(__m128i *in0, __m128i *in1);
-void iadst4_sse2(__m128i *in);
-void iadst8_sse2(__m128i *in);
-void iadst16_sse2(__m128i *in0, __m128i *in1);
-
 #endif  // AOM_DSP_X86_INV_TXFM_SSE2_H_
diff --git a/av1/common/x86/idct_intrin_sse2.c b/av1/common/x86/idct_intrin_sse2.c
index 27cd756..10102e7 100644
--- a/av1/common/x86/idct_intrin_sse2.c
+++ b/av1/common/x86/idct_intrin_sse2.c
@@ -69,46 +69,46 @@
 
   switch (tx_type) {
     case DCT_DCT:
-      idct4_sse2(in);
-      idct4_sse2(in);
+      aom_idct4_sse2(in);
+      aom_idct4_sse2(in);
       break;
     case ADST_DCT:
-      idct4_sse2(in);
-      iadst4_sse2(in);
+      aom_idct4_sse2(in);
+      aom_iadst4_sse2(in);
       break;
     case DCT_ADST:
-      iadst4_sse2(in);
-      idct4_sse2(in);
+      aom_iadst4_sse2(in);
+      aom_idct4_sse2(in);
       break;
     case ADST_ADST:
-      iadst4_sse2(in);
-      iadst4_sse2(in);
+      aom_iadst4_sse2(in);
+      aom_iadst4_sse2(in);
       break;
 #if CONFIG_EXT_TX
     case FLIPADST_DCT:
-      idct4_sse2(in);
-      iadst4_sse2(in);
+      aom_idct4_sse2(in);
+      aom_iadst4_sse2(in);
       FLIPUD_PTR(dest, stride, 4);
       break;
     case DCT_FLIPADST:
-      iadst4_sse2(in);
-      idct4_sse2(in);
+      aom_iadst4_sse2(in);
+      aom_idct4_sse2(in);
       fliplr_4x4(in);
       break;
     case FLIPADST_FLIPADST:
-      iadst4_sse2(in);
-      iadst4_sse2(in);
+      aom_iadst4_sse2(in);
+      aom_iadst4_sse2(in);
       FLIPUD_PTR(dest, stride, 4);
       fliplr_4x4(in);
       break;
     case ADST_FLIPADST:
-      iadst4_sse2(in);
-      iadst4_sse2(in);
+      aom_iadst4_sse2(in);
+      aom_iadst4_sse2(in);
       fliplr_4x4(in);
       break;
     case FLIPADST_ADST:
-      iadst4_sse2(in);
-      iadst4_sse2(in);
+      aom_iadst4_sse2(in);
+      aom_iadst4_sse2(in);
       FLIPUD_PTR(dest, stride, 4);
       break;
 #endif  // CONFIG_EXT_TX
@@ -167,46 +167,46 @@
 
   switch (tx_type) {
     case DCT_DCT:
-      idct8_sse2(in);
-      idct8_sse2(in);
+      aom_idct8_sse2(in);
+      aom_idct8_sse2(in);
       break;
     case ADST_DCT:
-      idct8_sse2(in);
-      iadst8_sse2(in);
+      aom_idct8_sse2(in);
+      aom_iadst8_sse2(in);
       break;
     case DCT_ADST:
-      iadst8_sse2(in);
-      idct8_sse2(in);
+      aom_iadst8_sse2(in);
+      aom_idct8_sse2(in);
       break;
     case ADST_ADST:
-      iadst8_sse2(in);
-      iadst8_sse2(in);
+      aom_iadst8_sse2(in);
+      aom_iadst8_sse2(in);
       break;
 #if CONFIG_EXT_TX
     case FLIPADST_DCT:
-      idct8_sse2(in);
-      iadst8_sse2(in);
+      aom_idct8_sse2(in);
+      aom_iadst8_sse2(in);
       FLIPUD_PTR(dest, stride, 8);
       break;
     case DCT_FLIPADST:
-      iadst8_sse2(in);
-      idct8_sse2(in);
+      aom_iadst8_sse2(in);
+      aom_idct8_sse2(in);
       fliplr_8x8(in);
       break;
     case FLIPADST_FLIPADST:
-      iadst8_sse2(in);
-      iadst8_sse2(in);
+      aom_iadst8_sse2(in);
+      aom_iadst8_sse2(in);
       FLIPUD_PTR(dest, stride, 8);
       fliplr_8x8(in);
       break;
     case ADST_FLIPADST:
-      iadst8_sse2(in);
-      iadst8_sse2(in);
+      aom_iadst8_sse2(in);
+      aom_iadst8_sse2(in);
       fliplr_8x8(in);
       break;
     case FLIPADST_ADST:
-      iadst8_sse2(in);
-      iadst8_sse2(in);
+      aom_iadst8_sse2(in);
+      aom_iadst8_sse2(in);
       FLIPUD_PTR(dest, stride, 8);
       break;
 #endif  // CONFIG_EXT_TX
@@ -254,46 +254,46 @@
 
   switch (tx_type) {
     case DCT_DCT:
-      idct16_sse2(in0, in1);
-      idct16_sse2(in0, in1);
+      aom_idct16_sse2(in0, in1);
+      aom_idct16_sse2(in0, in1);
       break;
     case ADST_DCT:
-      idct16_sse2(in0, in1);
-      iadst16_sse2(in0, in1);
+      aom_idct16_sse2(in0, in1);
+      aom_iadst16_sse2(in0, in1);
       break;
     case DCT_ADST:
-      iadst16_sse2(in0, in1);
-      idct16_sse2(in0, in1);
+      aom_iadst16_sse2(in0, in1);
+      aom_idct16_sse2(in0, in1);
       break;
     case ADST_ADST:
-      iadst16_sse2(in0, in1);
-      iadst16_sse2(in0, in1);
+      aom_iadst16_sse2(in0, in1);
+      aom_iadst16_sse2(in0, in1);
       break;
 #if CONFIG_EXT_TX
     case FLIPADST_DCT:
-      idct16_sse2(in0, in1);
-      iadst16_sse2(in0, in1);
+      aom_idct16_sse2(in0, in1);
+      aom_iadst16_sse2(in0, in1);
       FLIPUD_PTR(dest, stride, 16);
       break;
     case DCT_FLIPADST:
-      iadst16_sse2(in0, in1);
-      idct16_sse2(in0, in1);
+      aom_iadst16_sse2(in0, in1);
+      aom_idct16_sse2(in0, in1);
       FLIPLR_16x16(in0, in1);
       break;
     case FLIPADST_FLIPADST:
-      iadst16_sse2(in0, in1);
-      iadst16_sse2(in0, in1);
+      aom_iadst16_sse2(in0, in1);
+      aom_iadst16_sse2(in0, in1);
       FLIPUD_PTR(dest, stride, 16);
       FLIPLR_16x16(in0, in1);
       break;
     case ADST_FLIPADST:
-      iadst16_sse2(in0, in1);
-      iadst16_sse2(in0, in1);
+      aom_iadst16_sse2(in0, in1);
+      aom_iadst16_sse2(in0, in1);
       FLIPLR_16x16(in0, in1);
       break;
     case FLIPADST_ADST:
-      iadst16_sse2(in0, in1);
-      iadst16_sse2(in0, in1);
+      aom_iadst16_sse2(in0, in1);
+      aom_iadst16_sse2(in0, in1);
       FLIPUD_PTR(dest, stride, 16);
       break;
 #endif  // CONFIG_EXT_TX
@@ -667,9 +667,9 @@
     case ADST_DCT:
     case FLIPADST_DCT:
     case H_DCT:
-      idct8_sse2(in);
+      aom_idct8_sse2(in);
       array_transpose_8x8(in, in);
-      idct8_sse2(in + 8);
+      aom_idct8_sse2(in + 8);
       array_transpose_8x8(in + 8, in + 8);
       break;
     case DCT_ADST:
@@ -680,9 +680,9 @@
     case FLIPADST_ADST:
     case H_ADST:
     case H_FLIPADST:
-      iadst8_sse2(in);
+      aom_iadst8_sse2(in);
       array_transpose_8x8(in, in);
-      iadst8_sse2(in + 8);
+      aom_iadst8_sse2(in + 8);
       array_transpose_8x8(in + 8, in + 8);
       break;
     case V_FLIPADST:
@@ -836,8 +836,8 @@
     case DCT_ADST:
     case DCT_FLIPADST:
     case V_DCT:
-      idct8_sse2(in);
-      idct8_sse2(in + 8);
+      aom_idct8_sse2(in);
+      aom_idct8_sse2(in + 8);
       break;
     case ADST_DCT:
     case ADST_ADST:
@@ -847,8 +847,8 @@
     case FLIPADST_DCT:
     case V_ADST:
     case V_FLIPADST:
-      iadst8_sse2(in);
-      iadst8_sse2(in + 8);
+      aom_iadst8_sse2(in);
+      aom_iadst8_sse2(in + 8);
       break;
     case H_DCT:
     case H_ADST:
@@ -934,7 +934,7 @@
     case DCT_DCT:
     case ADST_DCT:
     case FLIPADST_DCT:
-    case H_DCT: idct8_sse2(in); break;
+    case H_DCT: aom_idct8_sse2(in); break;
     case DCT_ADST:
     case ADST_ADST:
     case DCT_FLIPADST:
@@ -942,7 +942,7 @@
     case ADST_FLIPADST:
     case FLIPADST_ADST:
     case H_ADST:
-    case H_FLIPADST: iadst8_sse2(in); break;
+    case H_FLIPADST: aom_iadst8_sse2(in); break;
     case V_FLIPADST:
     case V_ADST:
     case V_DCT:
@@ -969,8 +969,8 @@
     case DCT_ADST:
     case DCT_FLIPADST:
     case V_DCT:
-      idct4_sse2(in + 4);
-      idct4_sse2(in + 6);
+      aom_idct4_sse2(in + 4);
+      aom_idct4_sse2(in + 6);
       break;
     case ADST_DCT:
     case ADST_ADST:
@@ -980,8 +980,8 @@
     case FLIPADST_DCT:
     case V_ADST:
     case V_FLIPADST:
-      iadst4_sse2(in + 4);
-      iadst4_sse2(in + 6);
+      aom_iadst4_sse2(in + 4);
+      aom_iadst4_sse2(in + 6);
       break;
     case H_DCT:
     case H_ADST:
@@ -1113,8 +1113,8 @@
     case ADST_DCT:
     case FLIPADST_DCT:
     case H_DCT:
-      idct4_sse2(in + 4);
-      idct4_sse2(in + 6);
+      aom_idct4_sse2(in + 4);
+      aom_idct4_sse2(in + 6);
       break;
     case DCT_ADST:
     case ADST_ADST:
@@ -1124,8 +1124,8 @@
     case FLIPADST_ADST:
     case H_ADST:
     case H_FLIPADST:
-      iadst4_sse2(in + 4);
-      iadst4_sse2(in + 6);
+      aom_iadst4_sse2(in + 4);
+      aom_iadst4_sse2(in + 6);
       break;
     case V_FLIPADST:
     case V_ADST:
@@ -1150,7 +1150,7 @@
     case DCT_DCT:
     case DCT_ADST:
     case DCT_FLIPADST:
-    case V_DCT: idct8_sse2(in); break;
+    case V_DCT: aom_idct8_sse2(in); break;
     case ADST_DCT:
     case ADST_ADST:
     case FLIPADST_ADST:
@@ -1158,7 +1158,7 @@
     case FLIPADST_FLIPADST:
     case FLIPADST_DCT:
     case V_ADST:
-    case V_FLIPADST: iadst8_sse2(in); break;
+    case V_FLIPADST: aom_iadst8_sse2(in); break;
     case H_DCT:
     case H_ADST:
     case H_FLIPADST:
@@ -1252,7 +1252,7 @@
   // Generate the bottom half of the output
   scale_sqrt2_8x16(bl);
   scale_sqrt2_8x16(br);
-  idct16_sse2(bl, br);  // Includes a transposition
+  aom_idct16_sse2(bl, br);  // Includes a transposition
 }
 
 static INLINE void iidtx32_16col(__m128i *tl, __m128i *tr, __m128i *bl,
@@ -1309,8 +1309,8 @@
     case ADST_DCT:
     case FLIPADST_DCT:
     case H_DCT:
-      idct16_sse2(intl, intr);
-      idct16_sse2(inbl, inbr);
+      aom_idct16_sse2(intl, intr);
+      aom_idct16_sse2(inbl, inbr);
       break;
     case DCT_ADST:
     case ADST_ADST:
@@ -1320,8 +1320,8 @@
     case FLIPADST_ADST:
     case H_ADST:
     case H_FLIPADST:
-      iadst16_sse2(intl, intr);
-      iadst16_sse2(inbl, inbr);
+      aom_iadst16_sse2(intl, intr);
+      aom_iadst16_sse2(inbl, inbr);
       break;
     case V_FLIPADST:
     case V_ADST:
@@ -1467,8 +1467,8 @@
     case DCT_ADST:
     case DCT_FLIPADST:
     case V_DCT:
-      idct16_sse2(in0, in1);
-      idct16_sse2(in2, in3);
+      aom_idct16_sse2(in0, in1);
+      aom_idct16_sse2(in2, in3);
       break;
     case ADST_DCT:
     case ADST_ADST:
@@ -1478,8 +1478,8 @@
     case FLIPADST_DCT:
     case V_ADST:
     case V_FLIPADST:
-      iadst16_sse2(in0, in1);
-      iadst16_sse2(in2, in3);
+      aom_iadst16_sse2(in0, in1);
+      aom_iadst16_sse2(in2, in3);
       break;
     case H_DCT:
     case H_ADST:
diff --git a/av1/decoder/decodeframe.c b/av1/decoder/decodeframe.c
index 4007eaa..b33e1ae 100644
--- a/av1/decoder/decodeframe.c
+++ b/av1/decoder/decodeframe.c
@@ -217,7 +217,7 @@
   int i, j;
 
   update_mv_probs(ctx->joints, MV_JOINTS - 1, r);
-#if CONFIG_DAALA_EC
+#if CONFIG_DAALA_EC || CONFIG_RANS
   av1_tree_to_cdf(av1_mv_joint_tree, ctx->joints, ctx->joint_cdf);
 #endif
 
@@ -227,7 +227,7 @@
     update_mv_probs(comp_ctx->classes, MV_CLASSES - 1, r);
     update_mv_probs(comp_ctx->class0, CLASS0_SIZE - 1, r);
     update_mv_probs(comp_ctx->bits, MV_OFFSET_BITS, r);
-#if CONFIG_DAALA_EC
+#if CONFIG_DAALA_EC || CONFIG_RANS
     av1_tree_to_cdf(av1_mv_class_tree, comp_ctx->classes, comp_ctx->class_cdf);
 #endif
   }
@@ -236,13 +236,13 @@
     nmv_component *const comp_ctx = &ctx->comps[i];
     for (j = 0; j < CLASS0_SIZE; ++j) {
       update_mv_probs(comp_ctx->class0_fp[j], MV_FP_SIZE - 1, r);
-#if CONFIG_DAALA_EC
+#if CONFIG_DAALA_EC || CONFIG_RANS
       av1_tree_to_cdf(av1_mv_fp_tree, comp_ctx->class0_fp[j],
                       comp_ctx->class0_fp_cdf[j]);
 #endif
     }
     update_mv_probs(comp_ctx->fp, MV_FP_SIZE - 1, r);
-#if CONFIG_DAALA_EC
+#if CONFIG_DAALA_EC || CONFIG_RANS
     av1_tree_to_cdf(av1_mv_fp_tree, comp_ctx->fp, comp_ctx->fp_cdf);
 #endif
   }
@@ -333,13 +333,17 @@
   const TX_SIZE plane_tx_size =
       plane ? uv_txsize_lookup[bsize][mbmi->inter_tx_size[tx_row][tx_col]][0][0]
             : mbmi->inter_tx_size[tx_row][tx_col];
-  int max_blocks_high = num_4x4_blocks_high_lookup[plane_bsize];
-  int max_blocks_wide = num_4x4_blocks_wide_lookup[plane_bsize];
+  // Block size in pixels; scaled down to transform block units below.
+  int max_blocks_high = block_size_high[plane_bsize];
+  int max_blocks_wide = block_size_wide[plane_bsize];
 
   if (xd->mb_to_bottom_edge < 0)
-    max_blocks_high += xd->mb_to_bottom_edge >> (5 + pd->subsampling_y);
+    max_blocks_high += xd->mb_to_bottom_edge >> (3 + pd->subsampling_y);
   if (xd->mb_to_right_edge < 0)
-    max_blocks_wide += xd->mb_to_right_edge >> (5 + pd->subsampling_x);
+    max_blocks_wide += xd->mb_to_right_edge >> (3 + pd->subsampling_x);
+
+  max_blocks_high >>= tx_size_wide_log2[0];
+  max_blocks_wide >>= tx_size_wide_log2[0];
 
   if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide) return;
 
@@ -358,15 +362,14 @@
         pd->dst.stride, max_scan_line, eob);
     *eob_total += eob;
   } else {
-    int bsl = b_width_log2_lookup[bsize];
+    int bsl = block_size_wide[bsize] >> (tx_size_wide_log2[0] + 1);
     int i;
 
     assert(bsl > 0);
-    --bsl;
 
     for (i = 0; i < 4; ++i) {
-      const int offsetr = blk_row + ((i >> 1) << bsl);
-      const int offsetc = blk_col + ((i & 0x01) << bsl);
+      const int offsetr = blk_row + (i >> 1) * bsl;
+      const int offsetc = blk_col + (i & 0x01) * bsl;
 
       if (offsetr >= max_blocks_high || offsetc >= max_blocks_wide) continue;
 
@@ -1289,8 +1292,8 @@
 
       for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
         const struct macroblockd_plane *const pd = &xd->plane[plane];
-        const int num_4x4_w = pd->n4_w;
-        const int num_4x4_h = pd->n4_h;
+        int block_width = pd->width;
+        int block_height = pd->height;
         int row, col;
 #if CONFIG_VAR_TX
         // TODO(jingning): This can be simplified for decoder performance.
@@ -1305,23 +1308,26 @@
               plane ? get_uv_tx_size(mbmi, pd) : mbmi->tx_size;
           const int stepr = tx_size_high_unit[tx_size];
           const int stepc = tx_size_wide_unit[tx_size];
-          const int max_blocks_wide =
-              num_4x4_w +
+          int max_blocks_wide =
+              block_width +
               (xd->mb_to_right_edge >= 0 ? 0 : xd->mb_to_right_edge >>
-                                                   (5 + pd->subsampling_x));
-          const int max_blocks_high =
-              num_4x4_h +
+                                                   (3 + pd->subsampling_x));
+          int max_blocks_high =
+              block_height +
               (xd->mb_to_bottom_edge >= 0 ? 0 : xd->mb_to_bottom_edge >>
-                                                    (5 + pd->subsampling_y));
-
+                                                    (3 + pd->subsampling_y));
+          max_blocks_wide >>= tx_size_wide_log2[0];
+          max_blocks_high >>= tx_size_wide_log2[0];
           for (row = 0; row < max_blocks_high; row += stepr)
             for (col = 0; col < max_blocks_wide; col += stepc)
               eobtotal += reconstruct_inter_block(cm, xd, r, mbmi->segment_id,
                                                   plane, row, col, tx_size);
         } else {
 #endif
-          for (row = 0; row < num_4x4_h; row += bh_var_tx)
-            for (col = 0; col < num_4x4_w; col += bw_var_tx)
+          block_width >>= tx_size_wide_log2[0];
+          block_height >>= tx_size_wide_log2[0];
+          for (row = 0; row < block_height; row += bh_var_tx)
+            for (col = 0; col < block_width; col += bw_var_tx)
               decode_reconstruct_tx(cm, xd, r, mbmi, plane, plane_bsize, row,
                                     col, max_tx_size, &eobtotal);
 #if CONFIG_EXT_TX && CONFIG_RECT_TX
@@ -1332,15 +1338,16 @@
             plane ? get_uv_tx_size(mbmi, pd) : mbmi->tx_size;
         const int stepr = tx_size_high_unit[tx_size];
         const int stepc = tx_size_wide_unit[tx_size];
-        const int max_blocks_wide =
-            num_4x4_w + (xd->mb_to_right_edge >= 0
-                             ? 0
-                             : xd->mb_to_right_edge >> (5 + pd->subsampling_x));
-        const int max_blocks_high =
-            num_4x4_h +
+        int max_blocks_wide =
+            block_width +
+            (xd->mb_to_right_edge >= 0 ? 0 : xd->mb_to_right_edge >>
+                                                 (3 + pd->subsampling_x));
+        int max_blocks_high =
+            block_height +
             (xd->mb_to_bottom_edge >= 0 ? 0 : xd->mb_to_bottom_edge >>
-                                                  (5 + pd->subsampling_y));
-
+                                                  (3 + pd->subsampling_y));
+        max_blocks_wide >>= tx_size_wide_log2[0];
+        max_blocks_high >>= tx_size_wide_log2[0];
         for (row = 0; row < max_blocks_high; row += stepr)
           for (col = 0; col < max_blocks_wide; col += stepc)
             eobtotal += reconstruct_inter_block(cm, xd, r, mbmi->segment_id,
@@ -1783,6 +1790,7 @@
   if (bsize >= BLOCK_8X8 &&
       (bsize == BLOCK_8X8 || partition != PARTITION_SPLIT))
     dec_update_partition_context(xd, mi_row, mi_col, subsize, num_8x8_wh);
+#endif  // CONFIG_EXT_PARTITION_TYPES
 
 #if CONFIG_CLPF
   if (bsize == BLOCK_64X64 && cm->clpf_strength_y &&
@@ -1820,6 +1828,7 @@
     }
   }
 #endif
+
 #if CONFIG_DERING
   if (bsize == BLOCK_64X64) {
     if (cm->dering_level != 0 && !sb_all_skip(cm, mi_row, mi_col)) {
@@ -1831,7 +1840,6 @@
     }
   }
 #endif
-#endif  // CONFIG_EXT_PARTITION_TYPES
 }
 
 #if !CONFIG_ANS
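
The pattern repeated through decodeframe.c swaps 4x4-unit lookups plus a >> 5 edge adjustment for pixel-unit tables, a >> 3 adjustment, and a final >> tx_size_wide_log2[0] rescale. A worked sketch of why the two agree, assuming mb_to_bottom_edge is kept in 1/8-pel units (the libvpx convention) and tx_size_wide_log2[0] == 2:

    /* Old path: heights in 4x4 units; 1/8 pel -> 4x4 units is >> 5
     * (8 subpel steps * 4 pixels = 32). New path: pixels first (>> 3),
     * then pixels -> 4x4 units (>> 2). Because block_size_high[] is a
     * multiple of 4 and arithmetic right shifts compose, both agree. */
    static int max_blocks_old(int n4_high, int mb_to_bottom_edge, int ss_y) {
      int max = n4_high; /* height in 4x4 units */
      if (mb_to_bottom_edge < 0) max += mb_to_bottom_edge >> (5 + ss_y);
      return max;
    }

    static int max_blocks_new(int height_px, int mb_to_bottom_edge, int ss_y) {
      int max = height_px; /* height in pixels */
      if (mb_to_bottom_edge < 0) max += mb_to_bottom_edge >> (3 + ss_y);
      return max >> 2; /* >> tx_size_wide_log2[0] */
    }
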
diff --git a/av1/decoder/decodemv.c b/av1/decoder/decodemv.c
index 31183c0..9aa182d 100644
--- a/av1/decoder/decodemv.c
+++ b/av1/decoder/decodemv.c
@@ -281,24 +281,28 @@
   int is_split = 0;
   const int tx_row = blk_row >> 1;
   const int tx_col = blk_col >> 1;
-  int max_blocks_high = num_4x4_blocks_high_lookup[mbmi->sb_type];
-  int max_blocks_wide = num_4x4_blocks_wide_lookup[mbmi->sb_type];
+  int max_blocks_high = block_size_high[mbmi->sb_type];
+  int max_blocks_wide = block_size_wide[mbmi->sb_type];
   int ctx = txfm_partition_context(xd->above_txfm_context + tx_col,
                                    xd->left_txfm_context + tx_row, tx_size);
   TX_SIZE(*const inter_tx_size)
   [MAX_MIB_SIZE] =
       (TX_SIZE(*)[MAX_MIB_SIZE]) & mbmi->inter_tx_size[tx_row][tx_col];
 
-  if (xd->mb_to_bottom_edge < 0) max_blocks_high += xd->mb_to_bottom_edge >> 5;
-  if (xd->mb_to_right_edge < 0) max_blocks_wide += xd->mb_to_right_edge >> 5;
+  if (xd->mb_to_bottom_edge < 0) max_blocks_high += xd->mb_to_bottom_edge >> 3;
+  if (xd->mb_to_right_edge < 0) max_blocks_wide += xd->mb_to_right_edge >> 3;
+
+  // Scale to transform block units.
+  max_blocks_high >>= tx_size_wide_log2[0];
+  max_blocks_wide >>= tx_size_wide_log2[0];
 
   if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide) return;
 
   if (depth == MAX_VARTX_DEPTH) {
     int idx, idy;
     inter_tx_size[0][0] = tx_size;
-    for (idy = 0; idy < num_4x4_blocks_high_txsize_lookup[tx_size] / 2; ++idy)
-      for (idx = 0; idx < num_4x4_blocks_wide_txsize_lookup[tx_size] / 2; ++idx)
+    for (idy = 0; idy < tx_size_high_unit[tx_size] / 2; ++idy)
+      for (idx = 0; idx < tx_size_wide_unit[tx_size] / 2; ++idx)
         inter_tx_size[idy][idx] = tx_size;
     mbmi->tx_size = tx_size;
     if (counts) ++counts->txfm_partition[ctx][0];
@@ -311,7 +315,8 @@
 
   if (is_split) {
     BLOCK_SIZE bsize = txsize_to_bsize[tx_size];
-    int bsl = b_width_log2_lookup[bsize];
+    // Half the block size in transform block units.
+    int bsl = block_size_wide[bsize] >> (tx_size_wide_log2[0] + 1);
     int i;
 
     if (counts) ++counts->txfm_partition[ctx][1];
@@ -325,18 +330,17 @@
     }
 
     assert(bsl > 0);
-    --bsl;
     for (i = 0; i < 4; ++i) {
-      int offsetr = blk_row + ((i >> 1) << bsl);
-      int offsetc = blk_col + ((i & 0x01) << bsl);
+      int offsetr = blk_row + ((i >> 1) * bsl);
+      int offsetc = blk_col + ((i & 0x01) * bsl);
       read_tx_size_vartx(cm, xd, mbmi, counts, tx_size - 1, depth + 1, offsetr,
                          offsetc, r);
     }
   } else {
     int idx, idy;
     inter_tx_size[0][0] = tx_size;
-    for (idy = 0; idy < num_4x4_blocks_high_txsize_lookup[tx_size] / 2; ++idy)
-      for (idx = 0; idx < num_4x4_blocks_wide_txsize_lookup[tx_size] / 2; ++idx)
+    for (idy = 0; idy < tx_size_high_unit[tx_size] / 2; ++idy)
+      for (idx = 0; idx < tx_size_wide_unit[tx_size] / 2; ++idx)
         inter_tx_size[idy][idx] = tx_size;
     mbmi->tx_size = tx_size;
     if (counts) ++counts->txfm_partition[ctx][0];
@@ -797,7 +801,7 @@
   int mag, d, fr, hp;
   const int sign = aom_read(r, mvcomp->sign, ACCT_STR);
   const int mv_class =
-#if CONFIG_DAALA_EC
+#if CONFIG_DAALA_EC || CONFIG_RANS
       aom_read_symbol(r, mvcomp->class_cdf, MV_CLASSES, ACCT_STR);
 #else
       aom_read_tree(r, av1_mv_class_tree, mvcomp->classes, ACCT_STR);
@@ -818,7 +822,7 @@
   }
 
 // Fractional part
-#if CONFIG_DAALA_EC
+#if CONFIG_DAALA_EC || CONFIG_RANS
   fr = aom_read_symbol(r, class0 ? mvcomp->class0_fp_cdf[d] : mvcomp->fp_cdf,
                        MV_FP_SIZE, ACCT_STR);
 #else
@@ -839,22 +843,21 @@
                            const nmv_context *ctx, nmv_context_counts *counts,
                            int allow_hp) {
   MV_JOINT_TYPE joint_type;
-  const int use_hp = allow_hp && av1_use_mv_hp(ref);
   MV diff = { 0, 0 };
   joint_type =
-#if CONFIG_DAALA_EC
+#if CONFIG_DAALA_EC || CONFIG_RANS
       (MV_JOINT_TYPE)aom_read_symbol(r, ctx->joint_cdf, MV_JOINTS, ACCT_STR);
 #else
       (MV_JOINT_TYPE)aom_read_tree(r, av1_mv_joint_tree, ctx->joints, ACCT_STR);
 #endif
 
   if (mv_joint_vertical(joint_type))
-    diff.row = read_mv_component(r, &ctx->comps[0], use_hp);
+    diff.row = read_mv_component(r, &ctx->comps[0], allow_hp);
 
   if (mv_joint_horizontal(joint_type))
-    diff.col = read_mv_component(r, &ctx->comps[1], use_hp);
+    diff.col = read_mv_component(r, &ctx->comps[1], allow_hp);
 
-  av1_inc_mv(&diff, counts, use_hp);
+  av1_inc_mv(&diff, counts, allow_hp);
 
   mv->row = ref->row + diff.row;
   mv->col = ref->col + diff.col;
@@ -1801,9 +1804,9 @@
         inter_block) {
       const TX_SIZE max_tx_size = max_txsize_lookup[bsize];
       const BLOCK_SIZE txb_size = txsize_to_bsize[max_tx_size];
-      const int bs = num_4x4_blocks_wide_lookup[txb_size];
-      const int width = num_4x4_blocks_wide_lookup[bsize];
-      const int height = num_4x4_blocks_high_lookup[bsize];
+      const int bs = block_size_wide[txb_size] >> tx_size_wide_log2[0];
+      const int width = block_size_wide[bsize] >> tx_size_wide_log2[0];
+      const int height = block_size_high[bsize] >> tx_size_wide_log2[0];
       int idx, idy;
 #if CONFIG_EXT_TX && CONFIG_RECT_TX
       int is_rect_tx_allowed = inter_block && is_rect_tx_allowed_bsize(bsize) &&
diff --git a/av1/encoder/bitstream.c b/av1/encoder/bitstream.c
index ba11970..e62910d 100644
--- a/av1/encoder/bitstream.c
+++ b/av1/encoder/bitstream.c
@@ -2099,6 +2099,7 @@
   if (bsize >= BLOCK_8X8 &&
       (bsize == BLOCK_8X8 || partition != PARTITION_SPLIT))
     update_partition_context(xd, mi_row, mi_col, subsize, bsize);
+#endif  // CONFIG_EXT_PARTITION_TYPES
 
 #if CONFIG_CLPF
   if (bsize == BLOCK_64X64 && cm->clpf_blocks && cm->clpf_strength_y &&
@@ -2140,7 +2141,6 @@
         DERING_REFINEMENT_BITS);
   }
 #endif
-#endif  // CONFIG_EXT_PARTITION_TYPES
 }
 
 static void write_modes(AV1_COMP *const cpi, const TileInfo *const tile,
diff --git a/av1/encoder/encodeframe.c b/av1/encoder/encodeframe.c
index c94c1d8..21725d7 100644
--- a/av1/encoder/encodeframe.c
+++ b/av1/encoder/encodeframe.c
@@ -4492,10 +4492,10 @@
 // zero-centering.
 static int16_t add_param_offset(int param_index, int16_t param_value,
                                 int16_t offset) {
-  int scale_vals[2] = { GM_ALPHA_PREC_DIFF, GM_TRANS_PREC_DIFF };
-  int clamp_vals[2] = { GM_ALPHA_MAX, GM_TRANS_MAX };
-  int is_trans_param = param_index < 2;
-  int is_one_centered = (!is_trans_param) && (param_index & 1);
+  const int scale_vals[2] = { GM_ALPHA_PREC_DIFF, GM_TRANS_PREC_DIFF };
+  const int clamp_vals[2] = { GM_ALPHA_MAX, GM_TRANS_MAX };
+  const int is_trans_param = param_index < 2;
+  const int is_one_centered = (!is_trans_param) && (param_index & 1);
 
   // Make parameter zero-centered and offset the shift that was done to make
   // it compatible with the warped model
@@ -4507,11 +4507,11 @@
   // to it in the bitstream
   param_value = (int16_t)clamp(param_value, -clamp_vals[is_trans_param],
                                clamp_vals[is_trans_param]);
-  // Rescale the parameter to WARPEDMODEL_PRECIION_BITS so it is compatible
+  // Rescale the parameter to WARPEDMODEL_PRECISION_BITS so it is compatible
   // with the warped motion library
   param_value *= (1 << scale_vals[is_trans_param]);
 
-  // Undo the zero-centring step if necessary
+  // Undo the zero-centering step if necessary
   return param_value + (is_one_centered << WARPEDMODEL_PREC_BITS);
 }
 
@@ -4886,17 +4886,18 @@
       int count16x16_16x16p = 0, count16x16_lp = 0;
       int count32x32 = 0;
       for (i = 0; i < TX_SIZE_CONTEXTS; ++i) {
-        count4x4 += counts->tx_size[0][i][TX_4X4];
-        count4x4 += counts->tx_size[1][i][TX_4X4];
-        count4x4 += counts->tx_size[2][i][TX_4X4];
+        // counts->tx_size[max_depth][context_idx][this_depth_level]
+        count4x4 += counts->tx_size[0][i][0];
+        count4x4 += counts->tx_size[1][i][0];
+        count4x4 += counts->tx_size[2][i][0];
 
-        count8x8_lp += counts->tx_size[1][i][TX_8X8];
-        count8x8_lp += counts->tx_size[2][i][TX_8X8];
-        count8x8_8x8p += counts->tx_size[0][i][TX_8X8];
+        count8x8_lp += counts->tx_size[1][i][1];
+        count8x8_lp += counts->tx_size[2][i][1];
+        count8x8_8x8p += counts->tx_size[0][i][1];
 
-        count16x16_16x16p += counts->tx_size[1][i][TX_16X16];
-        count16x16_lp += counts->tx_size[2][i][TX_16X16];
-        count32x32 += counts->tx_size[2][i][TX_32X32];
+        count16x16_16x16p += counts->tx_size[1][i][2];
+        count16x16_lp += counts->tx_size[2][i][2];
+        count32x32 += counts->tx_size[2][i][3];
       }
 #if CONFIG_EXT_TX && CONFIG_RECT_TX
       count4x4 += counts->tx_size_implied[0][TX_4X4];
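
The tx_size count update above switches the last index from TX_* enum values to literal depths, matching the counts->tx_size[max_depth][context_idx][depth] layout spelled out in the new comment. A tiny sketch of why this is behavior-neutral today:

    /* The depth index coincides with the TX_SIZE enum while the smallest
     * transform is 4x4 (TX_4X4 == 0, TX_8X8 == 1, ...), so the literals
     * change no behavior; they only make clear the dimension is a depth. */
    typedef enum { SK_TX_4X4, SK_TX_8X8, SK_TX_16X16, SK_TX_32X32 } TX_SK;

    static int depth_of(TX_SK tx) { return (int)tx; /* identity mapping */ }
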
diff --git a/av1/encoder/encodemv.c b/av1/encoder/encodemv.c
index a34ad0b..2f0daae 100644
--- a/av1/encoder/encodemv.c
+++ b/av1/encoder/encodemv.c
@@ -46,7 +46,7 @@
   aom_write(w, sign, mvcomp->sign);
 
 // Class
-#if CONFIG_DAALA_EC
+#if CONFIG_DAALA_EC || CONFIG_RANS
   aom_write_symbol(w, mv_class, mvcomp->class_cdf, MV_CLASSES);
 #else
   av1_write_token(w, av1_mv_class_tree, mvcomp->classes,
@@ -63,7 +63,7 @@
   }
 
 // Fractional bits
-#if CONFIG_DAALA_EC
+#if CONFIG_DAALA_EC || CONFIG_RANS
   aom_write_symbol(
       w, fr, mv_class == MV_CLASS_0 ? mvcomp->class0_fp_cdf[d] : mvcomp->fp_cdf,
       MV_FP_SIZE);
@@ -210,7 +210,12 @@
   nmv_context *const mvc = &cm->fc->nmvc;
   nmv_context_counts *const counts = nmv_counts;
 
+#if !(CONFIG_DAALA_EC || CONFIG_RANS)
   write_mv_update(av1_mv_joint_tree, mvc->joints, counts->joints, MV_JOINTS, w);
+#if CONFIG_DAALA_EC || CONFIG_RANS
+  av1_tree_to_cdf(av1_mv_joint_tree, cm->fc->nmvc.joints,
+                  cm->fc->nmvc.joint_cdf);
+#endif
 
   for (i = 0; i < 2; ++i) {
     nmv_component *comp = &mvc->comps[i];
@@ -219,7 +224,7 @@
     update_mv(w, comp_counts->sign, &comp->sign, MV_UPDATE_PROB);
     write_mv_update(av1_mv_class_tree, comp->classes, comp_counts->classes,
                     MV_CLASSES, w);
-#if CONFIG_DAALA_EC
+#if CONFIG_DAALA_EC || CONFIG_RANS
     av1_tree_to_cdf(av1_mv_class_tree, comp->classes, comp->class_cdf);
 #endif
     write_mv_update(av1_mv_class0_tree, comp->class0, comp_counts->class0,
@@ -232,17 +237,18 @@
     for (j = 0; j < CLASS0_SIZE; ++j) {
       write_mv_update(av1_mv_fp_tree, mvc->comps[i].class0_fp[j],
                       counts->comps[i].class0_fp[j], MV_FP_SIZE, w);
-#if CONFIG_DAALA_EC
+#if CONFIG_DAALA_EC || CONFIG_RANS
       av1_tree_to_cdf(av1_mv_fp_tree, mvc->comps[i].class0_fp[j],
                       mvc->comps[i].class0_fp_cdf[j]);
 #endif
     }
     write_mv_update(av1_mv_fp_tree, mvc->comps[i].fp, counts->comps[i].fp,
                     MV_FP_SIZE, w);
-#if CONFIG_DAALA_EC
+#if CONFIG_DAALA_EC || CONFIG_RANS
     av1_tree_to_cdf(av1_mv_fp_tree, mvc->comps[i].fp, mvc->comps[i].fp_cdf);
 #endif
   }
+#endif  // !(CONFIG_DAALA_EC || CONFIG_RANS)
 
   if (usehp) {
     for (i = 0; i < 2; ++i) {
@@ -261,11 +267,10 @@
                    const nmv_context *mvctx, int usehp) {
   const MV diff = { mv->row - ref->row, mv->col - ref->col };
   const MV_JOINT_TYPE j = av1_get_mv_joint(&diff);
-  usehp = usehp && av1_use_mv_hp(ref);
 #if CONFIG_REF_MV
   (void)is_compound;
 #endif
-#if CONFIG_DAALA_EC
+#if CONFIG_DAALA_EC || CONFIG_RANS
   aom_write_symbol(w, j, mvctx->joint_cdf, MV_JOINTS);
 #else
   av1_write_token(w, av1_mv_joint_tree, mvctx->joints, &mv_joint_encodings[j]);
@@ -318,7 +323,7 @@
       nmv_context_counts *counts = &nmv_counts[nmv_ctx];
       (void)pred_mvs;
 #endif
-      av1_inc_mv(&diff, counts, av1_use_mv_hp(ref));
+      av1_inc_mv(&diff, counts, 1);
     }
   } else if (mode == NEAREST_NEWMV || mode == NEAR_NEWMV) {
     const MV *ref = &mbmi_ext->ref_mvs[mbmi->ref_frame[1]][0].as_mv;
@@ -331,7 +336,7 @@
                     mbmi_ext->ref_mv_stack[rf_type], 1, mbmi->ref_mv_idx);
     nmv_context_counts *counts = &nmv_counts[nmv_ctx];
 #endif
-    av1_inc_mv(&diff, counts, av1_use_mv_hp(ref));
+    av1_inc_mv(&diff, counts, 1);
   } else if (mode == NEW_NEARESTMV || mode == NEW_NEARMV) {
     const MV *ref = &mbmi_ext->ref_mvs[mbmi->ref_frame[0]][0].as_mv;
     const MV diff = { mvs[0].as_mv.row - ref->row,
@@ -343,7 +348,7 @@
                     mbmi_ext->ref_mv_stack[rf_type], 0, mbmi->ref_mv_idx);
     nmv_context_counts *counts = &nmv_counts[nmv_ctx];
 #endif
-    av1_inc_mv(&diff, counts, av1_use_mv_hp(ref));
+    av1_inc_mv(&diff, counts, 1);
   }
 }
 
@@ -372,7 +377,7 @@
                       mbmi_ext->ref_mv_stack[rf_type], i, mbmi->ref_mv_idx);
       nmv_context_counts *counts = &nmv_counts[nmv_ctx];
 #endif
-      av1_inc_mv(&diff, counts, av1_use_mv_hp(ref));
+      av1_inc_mv(&diff, counts, 1);
     }
   } else if (mode == NEAREST_NEWMV || mode == NEAR_NEWMV) {
     const MV *ref = &mi->bmi[block].ref_mv[1].as_mv;
@@ -385,7 +390,7 @@
                     mbmi_ext->ref_mv_stack[rf_type], 1, mbmi->ref_mv_idx);
     nmv_context_counts *counts = &nmv_counts[nmv_ctx];
 #endif
-    av1_inc_mv(&diff, counts, av1_use_mv_hp(ref));
+    av1_inc_mv(&diff, counts, 1);
   } else if (mode == NEW_NEARESTMV || mode == NEW_NEARMV) {
     const MV *ref = &mi->bmi[block].ref_mv[0].as_mv;
     const MV diff = { mvs[0].as_mv.row - ref->row,
@@ -397,7 +402,7 @@
                     mbmi_ext->ref_mv_stack[rf_type], 0, mbmi->ref_mv_idx);
     nmv_context_counts *counts = &nmv_counts[nmv_ctx];
 #endif
-    av1_inc_mv(&diff, counts, av1_use_mv_hp(ref));
+    av1_inc_mv(&diff, counts, 1);
   }
 }
 #else
@@ -425,7 +430,7 @@
 #endif
     const MV diff = { mvs[i].as_mv.row - ref->row,
                       mvs[i].as_mv.col - ref->col };
-    av1_inc_mv(&diff, counts, av1_use_mv_hp(ref));
+    av1_inc_mv(&diff, counts, 1);
   }
 }
 #endif  // CONFIG_EXT_INTER
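
With the av1_use_mv_hp(ref) gates gone, motion-vector statistics are always gathered at full precision and the precision choice rests on allow_hp alone. A sketch of the rounding that now backstops this in lower_mv_precision(), assuming its usual libaom behavior (MV_SKETCH stands in for the MV struct from av1/common/mv.h):

    #include <stdint.h>

    typedef struct { int16_t row, col; } MV_SKETCH;

    /* With high precision off, snap each odd 1/8-pel component one step
     * toward zero, leaving a 1/4-pel motion vector. */
    static void lower_mv_precision_sketch(MV_SKETCH *mv) {
      if (mv->row & 1) mv->row += (mv->row > 0 ? -1 : 1);
      if (mv->col & 1) mv->col += (mv->col > 0 ? -1 : 1);
    }
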
diff --git a/av1/encoder/mcomp.c b/av1/encoder/mcomp.c
index 3fbceab..4b54a2c 100644
--- a/av1/encoder/mcomp.c
+++ b/av1/encoder/mcomp.c
@@ -424,7 +424,7 @@
   tr = br;
   tc = bc;
 
-  if (allow_hp && av1_use_mv_hp(ref_mv) && forced_stop == 0) {
+  if (allow_hp && forced_stop == 0) {
     hstep >>= 1;
     FIRST_LEVEL_CHECKS;
     if (eighthiters > 1) {
@@ -484,7 +484,7 @@
     }
   }
 
-  if (allow_hp && av1_use_mv_hp(ref_mv) && forced_stop == 0) {
+  if (allow_hp && forced_stop == 0) {
     tr = br;
     tc = bc;
     hstep >>= 1;
@@ -572,7 +572,7 @@
     tc = bc;
   }
 
-  if (allow_hp && av1_use_mv_hp(ref_mv) && forced_stop == 0) {
+  if (allow_hp && forced_stop == 0) {
     hstep >>= 1;
     FIRST_LEVEL_CHECKS;
     if (eighthiters > 1) {
@@ -687,7 +687,7 @@
   unsigned int cost_array[5];
   int kr, kc;
 
-  if (!(allow_hp && av1_use_mv_hp(ref_mv)))
+  if (!allow_hp)
     if (round == 3) round = 2;
 
   bestmv->row *= 8;
@@ -2446,7 +2446,7 @@
     tc = bc;
   }
 
-  if (allow_hp && av1_use_mv_hp(ref_mv) && forced_stop == 0) {
+  if (allow_hp && forced_stop == 0) {
     hstep >>= 1;
     FIRST_LEVEL_CHECKS;
     if (eighthiters > 1) {
@@ -2581,7 +2581,7 @@
   y_stride = pd->pre[is_second].stride;
   offset = bestmv->row * y_stride + bestmv->col;
 
-  if (!(allow_hp && av1_use_mv_hp(ref_mv)))
+  if (!allow_hp)
     if (round == 3) round = 2;
 
   bestmv->row *= 8;
@@ -3083,7 +3083,7 @@
   y_stride = pd->pre[is_second].stride;
   offset = bestmv->row * y_stride + bestmv->col;
 
-  if (!(allow_hp && av1_use_mv_hp(ref_mv)))
+  if (!allow_hp)
     if (round == 3) round = 2;
 
   bestmv->row *= 8;
diff --git a/av1/encoder/pickdering.c b/av1/encoder/pickdering.c
index 4ef83cd..6f7767a 100644
--- a/av1/encoder/pickdering.c
+++ b/av1/encoder/pickdering.c
@@ -115,7 +115,7 @@
         od_dering(dst, MAX_MIB_SIZE * bsize[0],
                   &src[sbr * stride * bsize[0] * MAX_MIB_SIZE +
                        sbc * bsize[0] * MAX_MIB_SIZE],
-                  cm->mi_cols * bsize[0], nhb, nvb, sbc, sbr, nhsb, nvsb, 0,
+                  cm->mi_cols * bsize[0], nhb, nvb, sbc, sbr, nhsb, nvsb, 0, 0,
                   dir, 0,
                   &bskip[MAX_MIB_SIZE * sbr * cm->mi_cols + MAX_MIB_SIZE * sbc],
                   cm->mi_cols, threshold, coeff_shift);
diff --git a/av1/encoder/rd.c b/av1/encoder/rd.c
index cfc8cb3..f5af485 100644
--- a/av1/encoder/rd.c
+++ b/av1/encoder/rd.c
@@ -719,7 +719,7 @@
 }
 
 #if CONFIG_DUAL_FILTER
-int av1_get_switchable_rate(const AV1_COMP *cpi, const MACROBLOCKD *const xd) {
+int av1_get_switchable_rate(const AV1_COMP *cpi, const MACROBLOCKD *xd) {
   const AV1_COMMON *const cm = &cpi->common;
   if (cm->interp_filter == SWITCHABLE) {
     const MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
@@ -741,7 +741,7 @@
   }
 }
 #else
-int av1_get_switchable_rate(const AV1_COMP *cpi, const MACROBLOCKD *const xd) {
+int av1_get_switchable_rate(const AV1_COMP *cpi, const MACROBLOCKD *xd) {
   const AV1_COMMON *const cm = &cpi->common;
   if (cm->interp_filter == SWITCHABLE) {
 #if CONFIG_EXT_INTERP
diff --git a/av1/encoder/rdopt.c b/av1/encoder/rdopt.c
index c6b9979..d4a089a 100644
--- a/av1/encoder/rdopt.c
+++ b/av1/encoder/rdopt.c
@@ -3054,9 +3054,9 @@
   TX_SIZE(*const inter_tx_size)
   [MAX_MIB_SIZE] =
       (TX_SIZE(*)[MAX_MIB_SIZE]) & mbmi->inter_tx_size[tx_row][tx_col];
-  const int bw = num_4x4_blocks_wide_lookup[plane_bsize];
-  int max_blocks_high = num_4x4_blocks_high_lookup[plane_bsize];
-  int max_blocks_wide = bw;
+  int max_blocks_high = block_size_high[plane_bsize];
+  int max_blocks_wide = block_size_wide[plane_bsize];
+  const int bw = block_size_wide[plane_bsize] >> tx_size_wide_log2[0];
   int64_t this_rd = INT64_MAX;
   ENTROPY_CONTEXT *pta = ta + blk_col;
   ENTROPY_CONTEXT *ptl = tl + blk_row;
@@ -3103,9 +3103,12 @@
   coeff_ctx = combine_entropy_contexts(stxa, stxl);
 
   if (xd->mb_to_bottom_edge < 0)
-    max_blocks_high += xd->mb_to_bottom_edge >> (5 + pd->subsampling_y);
+    max_blocks_high += xd->mb_to_bottom_edge >> (3 + pd->subsampling_y);
   if (xd->mb_to_right_edge < 0)
-    max_blocks_wide += xd->mb_to_right_edge >> (5 + pd->subsampling_x);
+    max_blocks_wide += xd->mb_to_right_edge >> (3 + pd->subsampling_x);
+
+  max_blocks_high >>= tx_size_wide_log2[0];
+  max_blocks_wide >>= tx_size_wide_log2[0];
 
   *rate = 0;
   *dist = 0;
@@ -3144,8 +3147,10 @@
 
   if (tx_size > TX_4X4 && depth < MAX_VARTX_DEPTH) {
     BLOCK_SIZE bsize = txsize_to_bsize[tx_size];
-    int bsl = b_height_log2_lookup[bsize];
-    int sub_step = num_4x4_blocks_txsize_lookup[tx_size - 1];
+    int bsl = block_size_wide[bsize] >> (tx_size_wide_log2[0] + 1);
+    // TODO(jingning): Refactor this transform block size transition.
+    TX_SIZE sub_txs = tx_size - 1;
+    int sub_step = tx_size_wide_unit[sub_txs] * tx_size_high_unit[sub_txs];
     int this_rate;
     int64_t this_dist;
     int64_t this_bsse;
@@ -3156,15 +3161,13 @@
 #if CONFIG_EXT_TX
     assert(tx_size < TX_SIZES);
 #endif  // CONFIG_EXT_TX
-    --bsl;
     for (i = 0; i < 4 && this_cost_valid; ++i) {
-      int offsetr = (i >> 1) << bsl;
-      int offsetc = (i & 0x01) << bsl;
+      int offsetr = (i >> 1) * bsl;
+      int offsetc = (i & 0x01) * bsl;
       select_tx_block(cpi, x, blk_row + offsetr, blk_col + offsetc, plane,
-                      block + i * sub_step, tx_size - 1, depth + 1, plane_bsize,
-                      ta, tl, tx_above, tx_left, &this_rate, &this_dist,
-                      &this_bsse, &this_skip, ref_best_rd - tmp_rd,
-                      &this_cost_valid);
+                      block + i * sub_step, sub_txs, depth + 1, plane_bsize, ta,
+                      tl, tx_above, tx_left, &this_rate, &this_dist, &this_bsse,
+                      &this_skip, ref_best_rd - tmp_rd, &this_cost_valid);
       sum_rate += this_rate;
       sum_dist += this_dist;
       sum_bsse += this_bsse;
@@ -3177,15 +3180,13 @@
 
   if (this_rd < sum_rd) {
     int idx, idy;
-    for (i = 0; i < num_4x4_blocks_wide_txsize_lookup[tx_size]; ++i)
-      pta[i] = !(tmp_eob == 0);
-    for (i = 0; i < num_4x4_blocks_high_txsize_lookup[tx_size]; ++i)
-      ptl[i] = !(tmp_eob == 0);
+    for (i = 0; i < tx_size_wide_unit[tx_size]; ++i) pta[i] = !(tmp_eob == 0);
+    for (i = 0; i < tx_size_high_unit[tx_size]; ++i) ptl[i] = !(tmp_eob == 0);
     txfm_partition_update(tx_above + (blk_col >> 1), tx_left + (blk_row >> 1),
                           tx_size);
     inter_tx_size[0][0] = tx_size;
-    for (idy = 0; idy < num_4x4_blocks_high_txsize_lookup[tx_size] / 2; ++idy)
-      for (idx = 0; idx < num_4x4_blocks_wide_txsize_lookup[tx_size] / 2; ++idx)
+    for (idy = 0; idy < tx_size_high_unit[tx_size] / 2; ++idy)
+      for (idx = 0; idx < tx_size_wide_unit[tx_size] / 2; ++idx)
         inter_tx_size[idy][idx] = tx_size;
     mbmi->tx_size = tx_size;
     if (this_rd == INT64_MAX) *is_cost_valid = 0;
@@ -3479,17 +3480,20 @@
   const int tx_row = blk_row >> (1 - pd->subsampling_y);
   const int tx_col = blk_col >> (1 - pd->subsampling_x);
   TX_SIZE plane_tx_size;
-  int max_blocks_high = num_4x4_blocks_high_lookup[plane_bsize];
-  int max_blocks_wide = num_4x4_blocks_wide_lookup[plane_bsize];
+  int max_blocks_high = block_size_high[plane_bsize];
+  int max_blocks_wide = block_size_wide[plane_bsize];
 
 #if CONFIG_EXT_TX
   assert(tx_size < TX_SIZES);
 #endif  // CONFIG_EXT_TX
 
   if (xd->mb_to_bottom_edge < 0)
-    max_blocks_high += xd->mb_to_bottom_edge >> (5 + pd->subsampling_y);
+    max_blocks_high += xd->mb_to_bottom_edge >> (3 + pd->subsampling_y);
   if (xd->mb_to_right_edge < 0)
-    max_blocks_wide += xd->mb_to_right_edge >> (5 + pd->subsampling_x);
+    max_blocks_wide += xd->mb_to_right_edge >> (3 + pd->subsampling_x);
+
+  max_blocks_high >>= tx_size_wide_log2[0];
+  max_blocks_wide >>= tx_size_wide_log2[0];
 
   if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide) return;
 
@@ -3521,24 +3525,24 @@
     av1_tx_block_rd_b(cpi, x, tx_size, blk_row, blk_col, plane, block,
                       plane_bsize, coeff_ctx, rate, dist, bsse, skip);
 
-    for (i = 0; i < num_4x4_blocks_wide_txsize_lookup[tx_size]; ++i)
+    for (i = 0; i < tx_size_wide_unit[tx_size]; ++i)
       ta[i] = !(p->eobs[block] == 0);
-    for (i = 0; i < num_4x4_blocks_high_txsize_lookup[tx_size]; ++i)
+    for (i = 0; i < tx_size_high_unit[tx_size]; ++i)
       tl[i] = !(p->eobs[block] == 0);
   } else {
-    int bsl = b_width_log2_lookup[bsize];
-    int step = num_4x4_blocks_txsize_lookup[tx_size - 1];
+    const int bsl = block_size_wide[bsize] >> (1 + tx_size_wide_log2[0]);
+    const TX_SIZE sub_txs = tx_size - 1;
+    int step = tx_size_wide_unit[sub_txs] * tx_size_high_unit[sub_txs];
     int i;
 
     assert(bsl > 0);
-    --bsl;
 
     for (i = 0; i < 4; ++i) {
-      int offsetr = (i >> 1) << bsl;
-      int offsetc = (i & 0x01) << bsl;
+      int offsetr = (i >> 1) * bsl;
+      int offsetc = (i & 0x01) * bsl;
       tx_block_rd(cpi, x, blk_row + offsetr, blk_col + offsetc, plane,
-                  block + i * step, tx_size - 1, plane_bsize, above_ctx,
-                  left_ctx, rate, dist, bsse, skip);
+                  block + i * step, sub_txs, plane_bsize, above_ctx, left_ctx,
+                  rate, dist, bsse, skip);
     }
   }
 }
@@ -4220,8 +4224,7 @@
 #endif  // CONFIG_EXT_INTER
       this_mv[0].as_int = seg_mvs[mbmi->ref_frame[0]].as_int;
 #if CONFIG_EXT_INTER
-      if (!cpi->common.allow_high_precision_mv ||
-          !av1_use_mv_hp(&best_ref_mv[0]->as_mv))
+      if (!cpi->common.allow_high_precision_mv)
         lower_mv_precision(&this_mv[0].as_mv, 0);
 #endif  // CONFIG_EXT_INTER
 
@@ -4280,11 +4283,9 @@
         this_mv[0].as_int = compound_seg_newmvs[0].as_int;
         this_mv[1].as_int = compound_seg_newmvs[1].as_int;
       }
-      if (!cpi->common.allow_high_precision_mv ||
-          !av1_use_mv_hp(&best_ref_mv[0]->as_mv))
+      if (!cpi->common.allow_high_precision_mv)
         lower_mv_precision(&this_mv[0].as_mv, 0);
-      if (!cpi->common.allow_high_precision_mv ||
-          !av1_use_mv_hp(&best_ref_mv[1]->as_mv))
+      if (!cpi->common.allow_high_precision_mv)
         lower_mv_precision(&this_mv[1].as_mv, 0);
       thismvcost += av1_mv_bit_cost(&this_mv[0].as_mv, &best_ref_mv[0]->as_mv,
                                     mvjcost, mvcost, MV_COST_WEIGHT_SUB);
@@ -4294,8 +4295,7 @@
     case NEW_NEARMV:
     case NEW_NEARESTMV:
       this_mv[0].as_int = seg_mvs[mbmi->ref_frame[0]].as_int;
-      if (!cpi->common.allow_high_precision_mv ||
-          !av1_use_mv_hp(&best_ref_mv[0]->as_mv))
+      if (!cpi->common.allow_high_precision_mv)
         lower_mv_precision(&this_mv[0].as_mv, 0);
       thismvcost += av1_mv_bit_cost(&this_mv[0].as_mv, &best_ref_mv[0]->as_mv,
                                     mvjcost, mvcost, MV_COST_WEIGHT_SUB);
@@ -4305,8 +4305,7 @@
     case NEAREST_NEWMV:
       this_mv[0].as_int = frame_mv[mode][mbmi->ref_frame[0]].as_int;
       this_mv[1].as_int = seg_mvs[mbmi->ref_frame[1]].as_int;
-      if (!cpi->common.allow_high_precision_mv ||
-          !av1_use_mv_hp(&best_ref_mv[1]->as_mv))
+      if (!cpi->common.allow_high_precision_mv)
         lower_mv_precision(&this_mv[1].as_mv, 0);
       thismvcost += av1_mv_bit_cost(&this_mv[1].as_mv, &best_ref_mv[1]->as_mv,
                                     mvjcost, mvcost, MV_COST_WEIGHT_SUB);
@@ -4376,8 +4375,8 @@
   struct macroblock_plane *const p = &x->plane[0];
   MODE_INFO *const mi = xd->mi[0];
   const BLOCK_SIZE plane_bsize = get_plane_block_size(mi->mbmi.sb_type, pd);
-  const int width = 4 * num_4x4_blocks_wide_lookup[plane_bsize];
-  const int height = 4 * num_4x4_blocks_high_lookup[plane_bsize];
+  const int width = block_size_wide[plane_bsize];
+  const int height = block_size_high[plane_bsize];
   int idx, idy;
   const uint8_t *const src =
       &p->src.buf[av1_raster_block_offset(BLOCK_8X8, i, p->src.stride)];
@@ -4389,8 +4388,8 @@
 
   TX_TYPE tx_type = get_tx_type(PLANE_TYPE_Y, xd, i, tx_size);
   const SCAN_ORDER *scan_order = get_scan(cm, tx_size, tx_type, 1);
-  const int num_4x4_w = num_4x4_blocks_wide_txsize_lookup[tx_size];
-  const int num_4x4_h = num_4x4_blocks_high_txsize_lookup[tx_size];
+  const int num_4x4_w = tx_size_wide_unit[tx_size];
+  const int num_4x4_h = tx_size_high_unit[tx_size];
 
 #if CONFIG_EXT_TX && CONFIG_RECT_TX
   assert(IMPLIES(xd->lossless[mi->mbmi.segment_id], tx_size == TX_4X4));
@@ -4430,11 +4429,8 @@
         block = k;
       else
         block = (i ? 2 : 0);
-#if CONFIG_VAR_TX
-      coeff_ctx = get_entropy_context(tx_size, ta + (k & 1), tl + (k >> 1));
-#else
+
       coeff_ctx = combine_entropy_contexts(*(ta + (k & 1)), *(tl + (k >> 1)));
-#endif
 #if CONFIG_NEW_QUANT
       av1_xform_quant_fp_nuq(cm, x, 0, block, idy + (i >> 1), idx + (i & 0x01),
                              BLOCK_8X8, tx_size, coeff_ctx);
@@ -5147,8 +5143,7 @@
         if (!has_second_rf &&
 #if CONFIG_EXT_INTER
             have_newmv_in_inter_mode(this_mode) &&
-            (seg_mvs[index][mv_idx][mbmi->ref_frame[0]].as_int == INVALID_MV ||
-             av1_use_mv_hp(&bsi->ref_mv[0]->as_mv) == 0)
+            (seg_mvs[index][mv_idx][mbmi->ref_frame[0]].as_int == INVALID_MV)
 #else
             this_mode == NEWMV &&
             (seg_mvs[index][mbmi->ref_frame[0]].as_int == INVALID_MV ||
@@ -6706,7 +6701,6 @@
 #endif  // CONFIG_EXT_INTER
 #endif  // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
   int64_t rd = INT64_MAX;
-  int64_t tmp_rd = INT64_MAX;
   uint8_t *orig_dst[MAX_MB_PLANE];
   int orig_dst_stride[MAX_MB_PLANE];
   uint8_t *tmp_dst[MAX_MB_PLANE];
@@ -7045,6 +7039,7 @@
           int tmp_skip_sb = 0;
           int64_t tmp_skip_sse = INT64_MAX;
           int tmp_rs;
+          int64_t tmp_rd;
 #if CONFIG_DUAL_FILTER
           mbmi->interp_filter[0] = filter_sets[i][0];
           mbmi->interp_filter[1] = filter_sets[i][1];
@@ -7240,7 +7235,6 @@
       return INT64_MAX;
 
     pred_exists = 0;
-    tmp_rd = AOMMIN(best_rd_wedge, best_rd_nowedge);
 
     if (mbmi->use_wedge_interinter)
       *compmode_wedge_cost =
@@ -7381,7 +7375,6 @@
     }
 
     pred_exists = 0;
-    tmp_rd = best_interintra_rd;
     *compmode_interintra_cost =
         av1_cost_bit(cm->fc->interintra_prob[size_group_lookup[bsize]], 1);
     *compmode_interintra_cost += interintra_mode_cost[mbmi->interintra_mode];
@@ -7460,6 +7453,7 @@
   for (mbmi->motion_mode = SIMPLE_TRANSLATION;
        mbmi->motion_mode < (allow_motvar ? MOTION_MODES : 1);
        mbmi->motion_mode++) {
+    int64_t tmp_rd = INT64_MAX;
 #if CONFIG_EXT_INTER
     int tmp_rate2 = mbmi->motion_mode != SIMPLE_TRANSLATION ? rate2_bmc_nocoeff
                                                             : rate2_nocoeff;
@@ -10190,14 +10184,13 @@
       this_rd_thresh = (ref_frame == LAST3_FRAME)
                            ? rd_opt->threshes[segment_id][bsize][THR_LAST3]
                            : this_rd_thresh;
+      this_rd_thresh = (ref_frame == BWDREF_FRAME)
+                           ? rd_opt->threshes[segment_id][bsize][THR_BWDR]
+                           : this_rd_thresh;
 #endif  // CONFIG_EXT_REFS
       this_rd_thresh = (ref_frame == GOLDEN_FRAME)
                            ? rd_opt->threshes[segment_id][bsize][THR_GOLD]
                            : this_rd_thresh;
-#if CONFIG_EXT_REFS
-// TODO(zoeliu): To explore whether this_rd_thresh should consider
-//               BWDREF_FRAME and ALTREF_FRAME
-#endif  // CONFIG_EXT_REFS
 
       // TODO(any): Add search of the tx_type to improve rd performance at the
       // expense of speed.
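
select_tx_block() and tx_block_rd() now compute the recursive-split step as block_size_wide[bsize] >> (tx_size_wide_log2[0] + 1), which is half the block width in 4x4 units, and multiply by it rather than decrementing a log2 and shifting. Worked numbers for one case, assuming tx_size_wide_log2[0] == 2:

    /* Splitting a 32x32 transform area into four 16x16 children:
     * block_size_wide[BLOCK_32X32] == 32, so bsl = 32 >> (2 + 1) = 4,
     * a 16-pixel step expressed in 4x4 units. Child i lands at
     * (blk_row + (i >> 1) * bsl, blk_col + (i & 1) * bsl), the same
     * offsets the old --bsl; (i >> 1) << bsl form produced. */
    static void child_offsets_32x32(int blk_row, int blk_col, int out[4][2]) {
      const int bsl = 32 >> (2 + 1);
      int i;
      for (i = 0; i < 4; ++i) {
        out[i][0] = blk_row + (i >> 1) * bsl; /* row, 4x4 units */
        out[i][1] = blk_col + (i & 1) * bsl;  /* col, 4x4 units */
      }
    }
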
diff --git a/test/fdct8x8_test.cc b/test/fdct8x8_test.cc
index 9f62ffe..bbfb7f1 100644
--- a/test/fdct8x8_test.cc
+++ b/test/fdct8x8_test.cc
@@ -728,8 +728,7 @@
         make_tuple(&idct8x8_12, &idct8x8_64_add_12_sse2, 6225, AOM_BITS_12)));
 #endif  // HAVE_SSE2 && CONFIG_AOM_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
 
-#if HAVE_SSSE3 && ARCH_X86_64 && !CONFIG_AOM_HIGHBITDEPTH && \
-    !CONFIG_EMULATE_HARDWARE
+#if HAVE_SSSE3 && ARCH_X86_64 && !CONFIG_EMULATE_HARDWARE
 INSTANTIATE_TEST_CASE_P(SSSE3, FwdTrans8x8DCT,
                         ::testing::Values(make_tuple(&aom_fdct8x8_ssse3,
                                                      &aom_idct8x8_64_add_ssse3,
diff --git a/test/scan_test.cc b/test/scan_test.cc
new file mode 100644
index 0000000..43df09f
--- /dev/null
+++ b/test/scan_test.cc
@@ -0,0 +1,89 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "av1/common/scan.h"
+#include "test/acm_random.h"
+#include "third_party/googletest/src/include/gtest/gtest.h"
+
+using libaom_test::ACMRandom;
+
+namespace {
+
+TEST(scan_test, av1_augment_prob) {
+  int tx1d_size = 4;
+  uint32_t prob[16] = { 8, 8, 7, 7, 8, 8, 4, 2, 3, 3, 2, 2, 2, 2, 2, 2 };
+  uint32_t ref_prob[16] = { 8, 8, 7, 7, 8, 8, 4, 2, 3, 3, 2, 2, 2, 2, 2, 2 };
+  av1_augment_prob(prob, tx1d_size, tx1d_size);
+  for (int r = 0; r < tx1d_size; ++r) {
+    for (int c = 0; c < tx1d_size; ++c) {
+      int idx = r * tx1d_size + c;
+      EXPECT_EQ(ref_prob[idx], prob[idx] >> 16);
+    }
+  }
+
+  int mask = (1 << 10) - 1;
+  for (int r = 0; r < tx1d_size; ++r) {
+    for (int c = 0; c < tx1d_size; ++c) {
+      int idx = r * tx1d_size + c;
+      EXPECT_EQ(idx, mask ^ (prob[r * tx1d_size + c] & mask));
+    }
+  }
+}
+
+TEST(scan_test, av1_update_sort_order) {
+  TX_SIZE tx_size = TX_4X4;
+  uint32_t prob[16] = { 8, 8, 7, 7, 8, 8, 4, 2, 3, 3, 2, 2, 2, 2, 2, 2 };
+  int16_t ref_sort_order[16] = { 0, 1,  4, 5,  2,  3,  6,  8,
+                                 9, 12, 7, 10, 13, 11, 14, 15 };
+  int16_t sort_order[16];
+  av1_update_sort_order(tx_size, prob, sort_order);
+  for (int i = 0; i < 16; ++i) EXPECT_EQ(ref_sort_order[i], sort_order[i]);
+}
+
+TEST(scan_test, av1_update_scan_order) {
+  TX_SIZE tx_size = TX_4X4;
+  uint32_t prob[16] = { 4, 5, 7, 4, 5, 6, 8, 2, 3, 3, 2, 2, 2, 2, 2, 2 };
+  int16_t sort_order[16];
+  int16_t scan[16];
+  int16_t iscan[16];
+  int16_t ref_iscan[16] = {
+    0, 1, 2, 6, 3, 4, 5, 10, 7, 8, 11, 13, 9, 12, 14, 15
+  };
+
+  av1_update_sort_order(tx_size, prob, sort_order);
+  av1_update_scan_order(tx_size, sort_order, scan, iscan);
+
+  for (int i = 0; i < 16; ++i) EXPECT_EQ(ref_iscan[i], iscan[i]);
+
+  for (int i = 0; i < 16; ++i) EXPECT_EQ(i, scan[ref_iscan[i]]);
+}
+
+TEST(scan_test, av1_update_neighbors) {
+  TX_SIZE tx_size = TX_4X4;
+  // raster order
+  int16_t scan[16] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 };
+  int16_t nb[(16 + 1) * 2];
+  int16_t ref_nb[(16 + 1) * 2] = { 0, 0, 0,  0, 1,  1,  2,  2,  0, 0, 4,  1,
+                                   5, 2, 6,  3, 4,  4,  8,  5,  9, 6, 10, 7,
+                                   8, 8, 12, 9, 13, 10, 14, 11, 0, 0 };
+
+  // raster order's scan and iscan are the same
+  av1_update_neighbors(tx_size, scan, scan, nb);
+  for (int i = 0; i < (16 + 1) * 2; ++i) {
+    EXPECT_EQ(ref_nb[i], nb[i]);
+  }
+}
+
+}  // namespace
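
The av1_augment_prob assertions pin down part of a packing scheme: the 16-bit probability survives in the high bits and the low 10 bits hold the complemented coefficient index. A sketch satisfying exactly those checks; the real implementation lives in av1/common/scan.c, and bits 10..15, which the test leaves unchecked, evidently carry more, since av1_update_sort_order expects index 12 ahead of 7, 10 and 13 at equal probability, an anti-diagonal (r + c) tie-break this sketch does not reproduce:

    #include <stdint.h>

    /* Pack (probability << 16) | (mask ^ index), assuming probabilities
     * fit in 16 bits. A descending sort of the packed words then orders
     * by probability first and breaks exact ties by smaller index; the
     * full tie-break in scan.c also weighs scan position (see above). */
    static void augment_prob_sketch(uint32_t *prob, int rows, int cols) {
      const uint32_t mask = (1 << 10) - 1;
      int i;
      for (i = 0; i < rows * cols; ++i)
        prob[i] = (prob[i] << 16) | (mask ^ (uint32_t)i);
    }
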