Merge "Unify set_contexts() function for encoder and decoder" into nextgenv2

diff --git a/aom_dsp/arm/aom_convolve8_avg_neon.c b/aom_dsp/arm/aom_convolve8_avg_neon.c
index 7dc936d..09429d6 100644
--- a/aom_dsp/arm/aom_convolve8_avg_neon.c
+++ b/aom_dsp/arm/aom_convolve8_avg_neon.c

@@ -65,6 +65,10 @@
 
   assert(x_step_q4 == 16);
 
+  (void)x_step_q4;
+  (void)y_step_q4;
+  (void)filter_y;
+
   q0s16 = vld1q_s16(filter_x);
 
   src -= 3;                // adjust for taps
@@ -241,6 +245,10 @@
 
   assert(y_step_q4 == 16);
 
+  (void)x_step_q4;
+  (void)y_step_q4;
+  (void)filter_x;
+
   src -= src_stride * 3;
   q0s16 = vld1q_s16(filter_y);
   for (; w > 0; w -= 4, src += 4, dst += 4) {  // loop_vert_h

diff --git a/aom_dsp/arm/aom_convolve8_neon.c b/aom_dsp/arm/aom_convolve8_neon.c
index ed0df6d..8ebffb5 100644
--- a/aom_dsp/arm/aom_convolve8_neon.c
+++ b/aom_dsp/arm/aom_convolve8_neon.c

@@ -65,6 +65,10 @@
 
   assert(x_step_q4 == 16);
 
+  (void)x_step_q4;
+  (void)y_step_q4;
+  (void)filter_y;
+
   q0s16 = vld1q_s16(filter_x);
 
   src -= 3;  // adjust for taps
@@ -225,6 +229,10 @@
 
   assert(y_step_q4 == 16);
 
+  (void)x_step_q4;
+  (void)y_step_q4;
+  (void)filter_x;
+
   src -= src_stride * 3;
   q0s16 = vld1q_s16(filter_y);
   for (; w > 0; w -= 4, src += 4, dst += 4) {  // loop_vert_h

diff --git a/aom_dsp/x86/convolve.h b/aom_dsp/x86/convolve.h
index ffaed02..87ff34b 100644
--- a/aom_dsp/x86/convolve.h
+++ b/aom_dsp/x86/convolve.h

@@ -27,6 +27,10 @@
       const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,                \
       ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4,          \
       const int16_t *filter_y, int y_step_q4, int w, int h) {                \
+    (void)filter_x;                                                          \
+    (void)x_step_q4;                                                         \
+    (void)filter_y;                                                          \
+    (void)y_step_q4;                                                         \
     assert(filter[3] != 128);                                                \
     assert(step_q4 == 16);                                                   \
     if (filter[0] | filter[1] | filter[2]) {                                 \

diff --git a/aom_scale/generic/aom_scale.c b/aom_scale/generic/aom_scale.c
index 49084db..28604ac 100644
--- a/aom_scale/generic/aom_scale.c
+++ b/aom_scale/generic/aom_scale.c

@@ -68,24 +68,25 @@
                           unsigned int source_scale, unsigned int source_length,
                           unsigned char *dest, int dest_step,
                           unsigned int dest_scale, unsigned int dest_length) {
-  unsigned int i, j;
-  unsigned int temp;
-  int source_pitch = source_step;
+  const int source_pitch = source_step;  // Signed: negated as an index below.
+  const unsigned char *const dest_end = dest + dest_length * dest_step;
   (void)source_length;
   (void)source_scale;
   (void)dest_scale;
 
-  source_step *= 2;
-  dest[0] = source[0];
+  source_step *= 2;  // Every other row.
 
-  for (i = dest_step, j = source_step; i < dest_length * dest_step;
-       i += dest_step, j += source_step) {
-    temp = 8;
-    temp += 3 * source[j - source_pitch];
-    temp += 10 * source[j];
-    temp += 3 * source[j + source_pitch];
-    temp >>= 4;
-    dest[i] = (char)(temp);
+  dest[0] = source[0];  // Special case: 1st pixel.
+  source += source_step;
+  dest += dest_step;
+
+  while (dest < dest_end) {
+    const unsigned int a = 3 * source[-source_pitch];
+    const unsigned int b = 10 * source[0];
+    const unsigned int c = 3 * source[source_pitch];
+    *dest = (unsigned char)((8 + a + b + c) >> 4);
+    source += source_step;
+    dest += dest_step;
   }
 }
 
@@ -119,17 +120,18 @@
                            unsigned int source_length, unsigned char *dest,
                            int dest_step, unsigned int dest_scale,
                            unsigned int dest_length) {
-  unsigned int i, j;
-
+  const unsigned char *const dest_end = dest + dest_length * dest_step;
   (void)source_length;
   (void)source_scale;
   (void)dest_scale;
 
-  source_step *= 2;
-  j = 0;
+  source_step *= 2;  // Every other row.
 
-  for (i = 0; i < dest_length * dest_step; i += dest_step, j += source_step)
-    dest[i] = source[j];
+  while (dest < dest_end) {
+    *dest = *source;
+    source += source_step;
+    dest += dest_step;
+  }
 }
 /****************************************************************************
  *
@@ -159,12 +161,12 @@
                       unsigned int source_scale, unsigned int source_length,
                       unsigned char *dest, int dest_step,
                       unsigned int dest_scale, unsigned int dest_length) {
-  unsigned int i;
-  unsigned int round_value = dest_scale / 2;
+  const unsigned char *const dest_end = dest + dest_length * dest_step;
+  const unsigned int round_value = dest_scale / 2;
   unsigned int left_modifier = dest_scale;
   unsigned int right_modifier = 0;
-  unsigned char left_pixel = *source;
-  unsigned char right_pixel = *(source + source_step);
+  unsigned char left_pixel = source[0];
+  unsigned char right_pixel = source[source_step];
 
   (void)source_length;
 
@@ -173,18 +175,18 @@
   /* assert ( (source_length - 1) * dest_scale >= (dest_length - 1) *
    * source_scale);*/
 
-  for (i = 0; i < dest_length * dest_step; i += dest_step) {
-    dest[i] = (char)((left_modifier * left_pixel +
-                      right_modifier * right_pixel + round_value) /
-                     dest_scale);
+  while (dest < dest_end) {
+    *dest = (unsigned char)((left_modifier * left_pixel +
+                             right_modifier * right_pixel + round_value) /
+                            dest_scale);
 
     right_modifier += source_scale;
 
     while (right_modifier > dest_scale) {
       right_modifier -= dest_scale;
       source += source_step;
-      left_pixel = *source;
-      right_pixel = *(source + source_step);
+      left_pixel = source[0];
+      right_pixel = source[source_step];
     }
 
     left_modifier = dest_scale - right_modifier;
@@ -236,11 +238,10 @@
     unsigned int dest_width, unsigned int dest_height, unsigned char *temp_area,
     unsigned char temp_area_height, unsigned int hscale, unsigned int hratio,
     unsigned int vscale, unsigned int vratio, unsigned int interlaced) {
-  /*unsigned*/
-  int i, j, k;
-  int bands;
-  int dest_band_height;
-  int source_band_height;
+  unsigned int i, j, k;
+  unsigned int bands;
+  unsigned int dest_band_height;
+  unsigned int source_band_height;
 
   typedef void (*Scale1D)(const unsigned char *source, int source_step,
                           unsigned int source_scale, unsigned int source_length,
@@ -331,7 +332,7 @@
   if (ratio_scalable) {
     if (source_height == dest_height) {
       /* for each band of the image */
-      for (k = 0; k < (int)dest_height; k++) {
+      for (k = 0; k < dest_height; ++k) {
         horiz_line_scale(source, source_width, dest, dest_width);
         source += source_pitch;
         dest += dest_pitch;
@@ -346,14 +347,13 @@
       horiz_line_scale(source, source_width, temp_area, dest_width);
     }
 
-    for (k = 0;
-         k < (int)(dest_height + dest_band_height - 1) / dest_band_height;
-         k++) {
+    for (k = 0; k < (dest_height + dest_band_height - 1) / dest_band_height;
+         ++k) {
       /* scale one band horizontally */
-      for (i = 0; i < source_band_height; i++) {
+      for (i = 0; i < source_band_height; ++i) {
         /* Trap case where we could read off the base of the source buffer */
 
-        line_src = (unsigned char *)source + i * source_pitch;
+        line_src = source + i * source_pitch;
 
         if (line_src < source_base) line_src = source_base;
 
@@ -388,7 +388,7 @@
 
   if (source_height == dest_height) {
     /* for each band of the image */
-    for (k = 0; k < (int)dest_height; k++) {
+    for (k = 0; k < dest_height; ++k) {
       Scale1Dh(source, 1, hscale, source_width + 1, dest, 1, hratio,
                dest_width);
       source += source_pitch;
@@ -414,10 +414,10 @@
   /* for each band of the image */
   bands = (dest_height + dest_band_height - 1) / dest_band_height;
 
-  for (k = 0; k < bands; k++) {
+  for (k = 0; k < bands; ++k) {
     /* scale one band horizontally */
-    for (i = 1; i < source_band_height + 1; i++) {
-      if (k * source_band_height + i < (int)source_height) {
+    for (i = 1; i < source_band_height + 1; ++i) {
+      if (k * source_band_height + i < source_height) {
         Scale1Dh(source + i * source_pitch, 1, hscale, source_width + 1,
                  temp_area + i * dest_pitch, 1, hratio, dest_width);
       } else { /*  Duplicate the last row */
@@ -428,7 +428,7 @@
     }
 
     /* scale one band vertically */
-    for (j = 0; j < (int)dest_width; j++) {
+    for (j = 0; j < dest_width; ++j) {
       Scale1Dv(&temp_area[j], dest_pitch, vscale, source_band_height + 1,
                &dest[j], dest_pitch, vratio, dest_band_height);
     }
@@ -487,12 +487,12 @@
           temp_area, temp_height, hscale, hratio, vscale, vratio, interlaced);
 
   if (dw < (int)dst->y_width)
-    for (i = 0; i < dh; i++)
+    for (i = 0; i < dh; ++i)
       memset(dst->y_buffer + i * dst->y_stride + dw - 1,
              dst->y_buffer[i * dst->y_stride + dw - 2], dst->y_width - dw + 1);
 
   if (dh < (int)dst->y_height)
-    for (i = dh - 1; i < (int)dst->y_height; i++)
+    for (i = dh - 1; i < (int)dst->y_height; ++i)
       memcpy(dst->y_buffer + i * dst->y_stride,
              dst->y_buffer + (dh - 2) * dst->y_stride, dst->y_width + 1);
 
@@ -502,13 +502,13 @@
           vratio, interlaced);
 
   if (dw / 2 < (int)dst->uv_width)
-    for (i = 0; i < dst->uv_height; i++)
+    for (i = 0; i < dst->uv_height; ++i)
       memset(dst->u_buffer + i * dst->uv_stride + dw / 2 - 1,
              dst->u_buffer[i * dst->uv_stride + dw / 2 - 2],
              dst->uv_width - dw / 2 + 1);
 
   if (dh / 2 < (int)dst->uv_height)
-    for (i = dh / 2 - 1; i < (int)dst->y_height / 2; i++)
+    for (i = dh / 2 - 1; i < (int)dst->y_height / 2; ++i)
       memcpy(dst->u_buffer + i * dst->uv_stride,
              dst->u_buffer + (dh / 2 - 2) * dst->uv_stride, dst->uv_width);
 
@@ -518,13 +518,13 @@
           vratio, interlaced);
 
   if (dw / 2 < (int)dst->uv_width)
-    for (i = 0; i < dst->uv_height; i++)
+    for (i = 0; i < dst->uv_height; ++i)
       memset(dst->v_buffer + i * dst->uv_stride + dw / 2 - 1,
              dst->v_buffer[i * dst->uv_stride + dw / 2 - 2],
              dst->uv_width - dw / 2 + 1);
 
   if (dh / 2 < (int)dst->uv_height)
-    for (i = dh / 2 - 1; i < (int)dst->y_height / 2; i++)
+    for (i = dh / 2 - 1; i < (int)dst->y_height / 2; ++i)
       memcpy(dst->v_buffer + i * dst->uv_stride,
              dst->v_buffer + (dh / 2 - 2) * dst->uv_stride, dst->uv_width);
 }

diff --git a/aom_scale/generic/gen_scalers.c b/aom_scale/generic/gen_scalers.c
index 57c464d..fd638bd 100644
--- a/aom_scale/generic/gen_scalers.c
+++ b/aom_scale/generic/gen_scalers.c

@@ -39,27 +39,23 @@
                                      unsigned int source_width,
                                      unsigned char *dest,
                                      unsigned int dest_width) {
-  unsigned i;
-  unsigned int a, b, c, d, e;
-  unsigned char *des = dest;
-  const unsigned char *src = source;
-
+  const unsigned char *const source_end = source + source_width;
   (void)dest_width;
 
-  for (i = 0; i < source_width; i += 5) {
-    a = src[0];
-    b = src[1];
-    c = src[2];
-    d = src[3];
-    e = src[4];
+  while (source < source_end) {
+    const unsigned int a = source[0];
+    const unsigned int b = source[1];
+    const unsigned int c = source[2];
+    const unsigned int d = source[3];
+    const unsigned int e = source[4];
 
-    des[0] = (unsigned char)a;
-    des[1] = (unsigned char)((b * 192 + c * 64 + 128) >> 8);
-    des[2] = (unsigned char)((c * 128 + d * 128 + 128) >> 8);
-    des[3] = (unsigned char)((d * 64 + e * 192 + 128) >> 8);
+    dest[0] = (unsigned char)a;
+    dest[1] = (unsigned char)((b * 192 + c * 64 + 128) >> 8);
+    dest[2] = (unsigned char)((c * 128 + d * 128 + 128) >> 8);
+    dest[3] = (unsigned char)((d * 64 + e * 192 + 128) >> 8);
 
-    src += 5;
-    des += 4;
+    source += 5;
+    dest += 4;
   }
 }
 
@@ -67,25 +63,21 @@
                                    unsigned int src_pitch, unsigned char *dest,
                                    unsigned int dest_pitch,
                                    unsigned int dest_width) {
-  unsigned int i;
-  unsigned int a, b, c, d, e;
-  unsigned char *des = dest;
-  unsigned char *src = source;
+  const unsigned char *const dest_end = dest + dest_width;
+  while (dest < dest_end) {
+    const unsigned int a = source[0 * src_pitch];
+    const unsigned int b = source[1 * src_pitch];
+    const unsigned int c = source[2 * src_pitch];
+    const unsigned int d = source[3 * src_pitch];
+    const unsigned int e = source[4 * src_pitch];
 
-  for (i = 0; i < dest_width; i++) {
-    a = src[0 * src_pitch];
-    b = src[1 * src_pitch];
-    c = src[2 * src_pitch];
-    d = src[3 * src_pitch];
-    e = src[4 * src_pitch];
+    dest[0 * dest_pitch] = (unsigned char)a;
+    dest[1 * dest_pitch] = (unsigned char)((b * 192 + c * 64 + 128) >> 8);
+    dest[2 * dest_pitch] = (unsigned char)((c * 128 + d * 128 + 128) >> 8);
+    dest[3 * dest_pitch] = (unsigned char)((d * 64 + e * 192 + 128) >> 8);
 
-    des[0 * dest_pitch] = (unsigned char)a;
-    des[1 * dest_pitch] = (unsigned char)((b * 192 + c * 64 + 128) >> 8);
-    des[2 * dest_pitch] = (unsigned char)((c * 128 + d * 128 + 128) >> 8);
-    des[3 * dest_pitch] = (unsigned char)((d * 64 + e * 192 + 128) >> 8);
-
-    src++;
-    des++;
+    ++source;
+    ++dest;
   }
 }
 
@@ -114,26 +106,21 @@
                                      unsigned int source_width,
                                      unsigned char *dest,
                                      unsigned int dest_width) {
-  unsigned int i;
-  unsigned int a, b, c, d, e;
-  unsigned char *des = dest;
-  const unsigned char *src = source;
-
+  const unsigned char *const source_end = source + source_width;
   (void)dest_width;
+  while (source < source_end) {
+    const unsigned int a = source[0];
+    const unsigned int b = source[1];
+    const unsigned int c = source[2];
+    const unsigned int d = source[3];
+    const unsigned int e = source[4];
 
-  for (i = 0; i < source_width; i += 5) {
-    a = src[0];
-    b = src[1];
-    c = src[2];
-    d = src[3];
-    e = src[4];
+    dest[0] = (unsigned char)a;
+    dest[1] = (unsigned char)((b * 85 + c * 171 + 128) >> 8);
+    dest[2] = (unsigned char)((d * 171 + e * 85 + 128) >> 8);
 
-    des[0] = (unsigned char)a;
-    des[1] = (unsigned char)((b * 85 + c * 171 + 128) >> 8);
-    des[2] = (unsigned char)((d * 171 + e * 85 + 128) >> 8);
-
-    src += 5;
-    des += 3;
+    source += 5;
+    dest += 3;
   }
 }
 
@@ -141,24 +128,20 @@
                                    unsigned int src_pitch, unsigned char *dest,
                                    unsigned int dest_pitch,
                                    unsigned int dest_width) {
-  unsigned int i;
-  unsigned int a, b, c, d, e;
-  unsigned char *des = dest;
-  unsigned char *src = source;
+  const unsigned char *const dest_end = dest + dest_width;
+  while (dest < dest_end) {
+    const unsigned int a = source[0 * src_pitch];
+    const unsigned int b = source[1 * src_pitch];
+    const unsigned int c = source[2 * src_pitch];
+    const unsigned int d = source[3 * src_pitch];
+    const unsigned int e = source[4 * src_pitch];
 
-  for (i = 0; i < dest_width; i++) {
-    a = src[0 * src_pitch];
-    b = src[1 * src_pitch];
-    c = src[2 * src_pitch];
-    d = src[3 * src_pitch];
-    e = src[4 * src_pitch];
+    dest[0 * dest_pitch] = (unsigned char)a;
+    dest[1 * dest_pitch] = (unsigned char)((b * 85 + c * 171 + 128) >> 8);
+    dest[2 * dest_pitch] = (unsigned char)((d * 171 + e * 85 + 128) >> 8);
 
-    des[0 * dest_pitch] = (unsigned char)a;
-    des[1 * dest_pitch] = (unsigned char)((b * 85 + c * 171 + 128) >> 8);
-    des[2 * dest_pitch] = (unsigned char)((d * 171 + e * 85 + 128) >> 8);
-
-    src++;
-    des++;
+    ++source;
+    ++dest;
   }
 }
 
@@ -186,18 +169,12 @@
                                      unsigned int source_width,
                                      unsigned char *dest,
                                      unsigned int dest_width) {
-  unsigned int i;
-  unsigned int a;
-  unsigned char *des = dest;
-  const unsigned char *src = source;
-
+  const unsigned char *const source_end = source + source_width;
   (void)dest_width;
-
-  for (i = 0; i < source_width; i += 2) {
-    a = src[0];
-    des[0] = (unsigned char)(a);
-    src += 2;
-    des += 1;
+  while (source < source_end) {
+    dest[0] = source[0];
+    source += 2;
+    ++dest;
   }
 }
 
@@ -215,18 +192,14 @@
                                      unsigned char *dest,
                                      unsigned int dest_pitch,
                                      unsigned int dest_width) {
-  int i;
-  int temp;
-  int width = dest_width;
-
+  const unsigned char *const dest_end = dest + dest_width;
   (void)dest_pitch;
-
-  for (i = 0; i < width; i++) {
-    temp = 8;
-    temp += source[i - (int)src_pitch] * 3;
-    temp += source[i] * 10;
-    temp += source[i + src_pitch] * 3;
-    temp >>= 4;
-    dest[i] = (unsigned char)(temp);
+  while (dest < dest_end) {
+    const unsigned int a = source[-(int)src_pitch] * 3;  // signed offset
+    const unsigned int b = source[0] * 10;
+    const unsigned int c = source[src_pitch] * 3;
+    dest[0] = (unsigned char)((8 + a + b + c) >> 4);
+    ++source;
+    ++dest;
   }
 }

diff --git a/aomenc.c b/aomenc.c
index 8eb30ed..497c8d5 100644
--- a/aomenc.c
+++ b/aomenc.c

@@ -1415,9 +1415,8 @@
 #if CONFIG_WEBM_IO
   if (stream->config.write_webm) {
     stream->webm_ctx.stream = stream->file;
-    write_webm_file_header(&stream->webm_ctx, cfg, &global->framerate,
-                           stream->config.stereo_fmt, global->codec->fourcc,
-                           pixel_aspect_ratio);
+    write_webm_file_header(&stream->webm_ctx, cfg, stream->config.stereo_fmt,
+                           global->codec->fourcc, pixel_aspect_ratio);
   }
 #else
   (void)pixel_aspect_ratio;

diff --git a/av1/av1_dx_iface.c b/av1/av1_dx_iface.c
index 7da80f0..43cc3a2 100644
--- a/av1/av1_dx_iface.c
+++ b/av1/av1_dx_iface.c

@@ -828,7 +828,7 @@
 
 static aom_codec_err_t ctrl_copy_reference(aom_codec_alg_priv_t *ctx,
                                            va_list args) {
-  aom_ref_frame_t *data = va_arg(args, aom_ref_frame_t *);
+  const aom_ref_frame_t *const frame = va_arg(args, aom_ref_frame_t *);
 
   // Only support this function in serial decode.
   if (ctx->frame_parallel_decode) {
@@ -836,8 +836,7 @@
     return AOM_CODEC_INCAPABLE;
   }
 
-  if (data) {
-    aom_ref_frame_t *frame = (aom_ref_frame_t *)data;
+  if (frame) {
     YV12_BUFFER_CONFIG sd;
     AVxWorker *const worker = ctx->frame_workers;
     FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1;

diff --git a/av1/common/av1_rtcd_defs.pl b/av1/common/av1_rtcd_defs.pl
index 53c9dc5..e66826f 100644
--- a/av1/common/av1_rtcd_defs.pl
+++ b/av1/common/av1_rtcd_defs.pl

@@ -391,6 +391,9 @@
 add_proto qw/void av1_fht16x16/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
 specialize qw/av1_fht16x16 sse2 avx2/;
 
+add_proto qw/void av1_fht32x32/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
+specialize qw/av1_fht32x32 avx2/;
+
 if (aom_config("CONFIG_EXT_TX") eq "yes") {
   add_proto qw/void av1_fht4x8/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
   specialize qw/av1_fht4x8 sse2/;
@@ -409,9 +412,6 @@
 
   add_proto qw/void av1_fht32x16/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
   specialize qw/av1_fht32x16 sse2/;
-
-  add_proto qw/void av1_fht32x32/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
-  specialize qw/av1_fht32x32 avx2/;
 }
 
 if (aom_config("CONFIG_EMULATE_HARDWARE") eq "yes") {

diff --git a/av1/common/entropymode.c b/av1/common/entropymode.c
index 20e8904..78f4ffe 100644
--- a/av1/common/entropymode.c
+++ b/av1/common/entropymode.c

@@ -892,13 +892,14 @@
 
 #if CONFIG_PALETTE
 int av1_get_palette_color_context(const uint8_t *color_map, int cols, int r,
-                                  int c, int n, int *color_order) {
+                                  int c, int n, uint8_t *color_order,
+                                  int *color_idx) {
   int i, j, max, max_idx, temp;
   int scores[PALETTE_MAX_SIZE + 10];
   int weights[4] = { 3, 2, 3, 2 };
   int color_ctx = 0;
   int color_neighbors[4];
-
+  int inverse_color_order[PALETTE_MAX_SIZE];
   assert(n <= PALETTE_MAX_SIZE);
 
   if (c - 1 >= 0)
@@ -918,7 +919,10 @@
   else
     color_neighbors[3] = -1;
 
-  for (i = 0; i < PALETTE_MAX_SIZE; ++i) color_order[i] = i;
+  for (i = 0; i < PALETTE_MAX_SIZE; ++i) {
+    color_order[i] = i;
+    inverse_color_order[i] = i;
+  }
   memset(scores, 0, PALETTE_MAX_SIZE * sizeof(scores[0]));
   for (i = 0; i < 4; ++i) {
     if (color_neighbors[i] >= 0) scores[color_neighbors[i]] += weights[i];
@@ -944,6 +948,8 @@
       temp = color_order[i];
       color_order[i] = color_order[max_idx];
       color_order[max_idx] = temp;
+      inverse_color_order[color_order[i]] = i;
+      inverse_color_order[color_order[max_idx]] = max_idx;
     }
   }
 
@@ -956,7 +962,9 @@
     }
 
   if (color_ctx >= PALETTE_COLOR_CONTEXTS) color_ctx = 0;
-
+  if (color_idx != NULL) {
+    *color_idx = inverse_color_order[color_map[r * cols + c]];
+  }
   return color_ctx;
 }
 #endif  // CONFIG_PALETTE

diff --git a/av1/common/entropymode.h b/av1/common/entropymode.h
index 85c68e1..68a6400 100644
--- a/av1/common/entropymode.h
+++ b/av1/common/entropymode.h

@@ -359,7 +359,8 @@
 
 #if CONFIG_PALETTE
 int av1_get_palette_color_context(const uint8_t *color_map, int cols, int r,
-                                  int c, int n, int *color_order);
+                                  int c, int n, uint8_t *color_order,
+                                  int *color_idx);
 #endif  // CONFIG_PALETTE
 
 #ifdef __cplusplus

diff --git a/av1/decoder/detokenize.c b/av1/decoder/detokenize.c
index 42679da..f2f74f5 100644
--- a/av1/decoder/detokenize.c
+++ b/av1/decoder/detokenize.c

@@ -297,15 +297,15 @@
 #if CONFIG_PALETTE
 void av1_decode_palette_tokens(MACROBLOCKD *const xd, int plane,
                                aom_reader *r) {
-  MODE_INFO *const mi = xd->mi[0];
-  MB_MODE_INFO *const mbmi = &mi->mbmi;
+  const MODE_INFO *const mi = xd->mi[0];
+  const MB_MODE_INFO *const mbmi = &mi->mbmi;
   const BLOCK_SIZE bsize = mbmi->sb_type;
   const int rows = (4 * num_4x4_blocks_high_lookup[bsize]) >>
                    (xd->plane[plane != 0].subsampling_y);
   const int cols = (4 * num_4x4_blocks_wide_lookup[bsize]) >>
                    (xd->plane[plane != 0].subsampling_x);
-  int color_idx, color_ctx, color_order[PALETTE_MAX_SIZE];
-  int n = mbmi->palette_mode_info.palette_size[plane != 0];
+  uint8_t color_order[PALETTE_MAX_SIZE];
+  const int n = mbmi->palette_mode_info.palette_size[plane != 0];
   int i, j;
   uint8_t *color_map = xd->plane[plane != 0].color_index_map;
   const aom_prob(*const prob)[PALETTE_COLOR_CONTEXTS][PALETTE_COLORS - 1] =
@@ -314,10 +314,10 @@
 
   for (i = 0; i < rows; ++i) {
     for (j = (i == 0 ? 1 : 0); j < cols; ++j) {
-      color_ctx =
-          av1_get_palette_color_context(color_map, cols, i, j, n, color_order);
-      color_idx = aom_read_tree(r, av1_palette_color_tree[n - 2],
-                                prob[n - 2][color_ctx], ACCT_STR);
+      const int color_ctx = av1_get_palette_color_context(color_map, cols, i, j,
+                                                          n, color_order, NULL);
+      const int color_idx = aom_read_tree(r, av1_palette_color_tree[n - 2],
+                                          prob[n - 2][color_ctx], ACCT_STR);
       assert(color_idx >= 0 && color_idx < n);
       color_map[i * cols + j] = color_order[color_idx];
     }

diff --git a/av1/encoder/dct.c b/av1/encoder/dct.c
index 63b71a5..221e3cd 100644
--- a/av1/encoder/dct.c
+++ b/av1/encoder/dct.c

@@ -325,7 +325,6 @@
   range_check(output, 16, 16);
 }
 
-#if CONFIG_EXT_TX
 static void fdct32(const tran_low_t *input, tran_low_t *output) {
   tran_high_t temp;
   tran_low_t step[32];
@@ -723,7 +722,6 @@
 
   range_check(output, 32, 18);
 }
-#endif  // CONFIG_EXT_TX
 
 static void fadst4(const tran_low_t *input, tran_low_t *output) {
   tran_high_t x0, x1, x2, x3;
@@ -1809,57 +1807,74 @@
 }
 #endif  // CONFIG_AOM_HIGHBITDEPTH
 
-#if CONFIG_EXT_TX
+// TODO(luoyi): Adding this function to avoid DCT_DCT overflow.
+// Remove this function after we scale the column txfm output correctly.
+static INLINE int range_check_dct32x32(const int16_t *input, int16_t bound,
+                                       int size) {
+  int i;
+  for (i = 0; i < size; ++i) {
+    if (abs(input[i]) > bound) return 1;
+  }
+  return 0;
+}
+
 void av1_fht32x32_c(const int16_t *input, tran_low_t *output, int stride,
                     int tx_type) {
-  if (tx_type == DCT_DCT) {
-    aom_fdct32x32_c(input, output, stride);
-  } else {
-    static const transform_2d FHT[] = {
-      { fdct32, fdct32 },              // DCT_DCT
-      { fhalfright32, fdct32 },        // ADST_DCT
-      { fdct32, fhalfright32 },        // DCT_ADST
-      { fhalfright32, fhalfright32 },  // ADST_ADST
-      { fhalfright32, fdct32 },        // FLIPADST_DCT
-      { fdct32, fhalfright32 },        // DCT_FLIPADST
-      { fhalfright32, fhalfright32 },  // FLIPADST_FLIPADST
-      { fhalfright32, fhalfright32 },  // ADST_FLIPADST
-      { fhalfright32, fhalfright32 },  // FLIPADST_ADST
-      { fidtx32, fidtx32 },            // IDTX
-      { fdct32, fidtx32 },             // V_DCT
-      { fidtx32, fdct32 },             // H_DCT
-      { fhalfright32, fidtx32 },       // V_ADST
-      { fidtx32, fhalfright32 },       // H_ADST
-      { fhalfright32, fidtx32 },       // V_FLIPADST
-      { fidtx32, fhalfright32 },       // H_FLIPADST
-    };
-    const transform_2d ht = FHT[tx_type];
-    tran_low_t out[1024];
-    int i, j;
-    tran_low_t temp_in[32], temp_out[32];
+  static const transform_2d FHT[] = {
+    { fdct32, fdct32 },  // DCT_DCT
+#if CONFIG_EXT_TX
+    { fhalfright32, fdct32 },        // ADST_DCT
+    { fdct32, fhalfright32 },        // DCT_ADST
+    { fhalfright32, fhalfright32 },  // ADST_ADST
+    { fhalfright32, fdct32 },        // FLIPADST_DCT
+    { fdct32, fhalfright32 },        // DCT_FLIPADST
+    { fhalfright32, fhalfright32 },  // FLIPADST_FLIPADST
+    { fhalfright32, fhalfright32 },  // ADST_FLIPADST
+    { fhalfright32, fhalfright32 },  // FLIPADST_ADST
+    { fidtx32, fidtx32 },            // IDTX
+    { fdct32, fidtx32 },             // V_DCT
+    { fidtx32, fdct32 },             // H_DCT
+    { fhalfright32, fidtx32 },       // V_ADST
+    { fidtx32, fhalfright32 },       // H_ADST
+    { fhalfright32, fidtx32 },       // V_FLIPADST
+    { fidtx32, fhalfright32 },       // H_FLIPADST
+#endif
+  };
+  const transform_2d ht = FHT[tx_type];
+  tran_low_t out[1024];
+  int i, j;
+  tran_low_t temp_in[32], temp_out[32];
 
-    int16_t flipped_input[32 * 32];
-    maybe_flip_input(&input, &stride, 32, 32, flipped_input, tx_type);
+#if CONFIG_EXT_TX
+  int16_t flipped_input[32 * 32];
+  maybe_flip_input(&input, &stride, 32, 32, flipped_input, tx_type);
+#endif
 
-    // Columns
-    for (i = 0; i < 32; ++i) {
-      for (j = 0; j < 32; ++j) temp_in[j] = input[j * stride + i] * 4;
-      ht.cols(temp_in, temp_out);
-      for (j = 0; j < 32; ++j)
-        out[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2;
+  for (i = 0; DCT_DCT == tx_type && i < 32; ++i) {  // Check each row.
+    if (range_check_dct32x32(input + i * stride, (1 << 6) - 1, 32)) {
+      aom_fdct32x32_c(input, output, stride);
+      return;
     }
+  }
+  // Columns
+  for (i = 0; i < 32; ++i) {
+    for (j = 0; j < 32; ++j) temp_in[j] = input[j * stride + i] * 4;
+    ht.cols(temp_in, temp_out);
+    for (j = 0; j < 32; ++j)
+      out[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2;
+  }
 
-    // Rows
-    for (i = 0; i < 32; ++i) {
-      for (j = 0; j < 32; ++j) temp_in[j] = out[j + i * 32];
-      ht.rows(temp_in, temp_out);
-      for (j = 0; j < 32; ++j)
-        output[j + i * 32] =
-            (tran_low_t)((temp_out[j] + 1 + (temp_out[j] < 0)) >> 2);
-    }
+  // Rows
+  for (i = 0; i < 32; ++i) {
+    for (j = 0; j < 32; ++j) temp_in[j] = out[j + i * 32];
+    ht.rows(temp_in, temp_out);
+    for (j = 0; j < 32; ++j)
+      output[j + i * 32] =
+          (tran_low_t)((temp_out[j] + 1 + (temp_out[j] < 0)) >> 2);
   }
 }
 
+#if CONFIG_EXT_TX
 // Forward identity transform.
 void av1_fwd_idtx_c(const int16_t *src_diff, tran_low_t *coeff, int stride,
                     int bs, int tx_type) {

diff --git a/av1/encoder/encoder.c b/av1/encoder/encoder.c
index c22c5a8..f1a6f72 100644
--- a/av1/encoder/encoder.c
+++ b/av1/encoder/encoder.c

@@ -422,39 +422,6 @@
   aom_free(cpi->segmentation_map);
   cpi->segmentation_map = NULL;
 
-#if CONFIG_REF_MV
-  for (i = 0; i < NMV_CONTEXTS; ++i) {
-    aom_free(cpi->nmv_costs[i][0]);
-    aom_free(cpi->nmv_costs[i][1]);
-    aom_free(cpi->nmv_costs_hp[i][0]);
-    aom_free(cpi->nmv_costs_hp[i][1]);
-    cpi->nmv_costs[i][0] = NULL;
-    cpi->nmv_costs[i][1] = NULL;
-    cpi->nmv_costs_hp[i][0] = NULL;
-    cpi->nmv_costs_hp[i][1] = NULL;
-  }
-#endif
-
-  aom_free(cpi->nmvcosts[0]);
-  aom_free(cpi->nmvcosts[1]);
-  cpi->nmvcosts[0] = NULL;
-  cpi->nmvcosts[1] = NULL;
-
-  aom_free(cpi->nmvcosts_hp[0]);
-  aom_free(cpi->nmvcosts_hp[1]);
-  cpi->nmvcosts_hp[0] = NULL;
-  cpi->nmvcosts_hp[1] = NULL;
-
-  aom_free(cpi->nmvsadcosts[0]);
-  aom_free(cpi->nmvsadcosts[1]);
-  cpi->nmvsadcosts[0] = NULL;
-  cpi->nmvsadcosts[1] = NULL;
-
-  aom_free(cpi->nmvsadcosts_hp[0]);
-  aom_free(cpi->nmvsadcosts_hp[1]);
-  cpi->nmvsadcosts_hp[0] = NULL;
-  cpi->nmvsadcosts_hp[1] = NULL;
-
   av1_cyclic_refresh_free(cpi->cyclic_refresh);
   cpi->cyclic_refresh = NULL;
 
@@ -512,27 +479,15 @@
 #if CONFIG_REF_MV
   for (i = 0; i < NMV_CONTEXTS; ++i) {
     av1_copy(cc->nmv_vec_cost[i], cpi->td.mb.nmv_vec_cost[i]);
-    memcpy(cc->nmv_costs[i][0], cpi->nmv_costs[i][0],
-           MV_VALS * sizeof(*cpi->nmv_costs[i][0]));
-    memcpy(cc->nmv_costs[i][1], cpi->nmv_costs[i][1],
-           MV_VALS * sizeof(*cpi->nmv_costs[i][1]));
-    memcpy(cc->nmv_costs_hp[i][0], cpi->nmv_costs_hp[i][0],
-           MV_VALS * sizeof(*cpi->nmv_costs_hp[i][0]));
-    memcpy(cc->nmv_costs_hp[i][1], cpi->nmv_costs_hp[i][1],
-           MV_VALS * sizeof(*cpi->nmv_costs_hp[i][1]));
+    av1_copy(cc->nmv_costs[i], cpi->nmv_costs[i]);  // Context i only.
+    av1_copy(cc->nmv_costs_hp[i], cpi->nmv_costs_hp[i]);
   }
 #else
   av1_copy(cc->nmvjointcost, cpi->td.mb.nmvjointcost);
 #endif
 
-  memcpy(cc->nmvcosts[0], cpi->nmvcosts[0],
-         MV_VALS * sizeof(*cpi->nmvcosts[0]));
-  memcpy(cc->nmvcosts[1], cpi->nmvcosts[1],
-         MV_VALS * sizeof(*cpi->nmvcosts[1]));
-  memcpy(cc->nmvcosts_hp[0], cpi->nmvcosts_hp[0],
-         MV_VALS * sizeof(*cpi->nmvcosts_hp[0]));
-  memcpy(cc->nmvcosts_hp[1], cpi->nmvcosts_hp[1],
-         MV_VALS * sizeof(*cpi->nmvcosts_hp[1]));
+  av1_copy(cc->nmvcosts, cpi->nmvcosts);
+  av1_copy(cc->nmvcosts_hp, cpi->nmvcosts_hp);
 
   av1_copy(cc->last_ref_lf_deltas, cm->lf.last_ref_deltas);
   av1_copy(cc->last_mode_lf_deltas, cm->lf.last_mode_deltas);
@@ -552,25 +507,15 @@
 #if CONFIG_REF_MV
   for (i = 0; i < NMV_CONTEXTS; ++i) {
     av1_copy(cpi->td.mb.nmv_vec_cost[i], cc->nmv_vec_cost[i]);
-    memcpy(cpi->nmv_costs[i][0], cc->nmv_costs[i][0],
-           MV_VALS * sizeof(*cc->nmv_costs[i][0]));
-    memcpy(cpi->nmv_costs[i][1], cc->nmv_costs[i][1],
-           MV_VALS * sizeof(*cc->nmv_costs[i][1]));
-    memcpy(cpi->nmv_costs_hp[i][0], cc->nmv_costs_hp[i][0],
-           MV_VALS * sizeof(*cc->nmv_costs_hp[i][0]));
-    memcpy(cpi->nmv_costs_hp[i][1], cc->nmv_costs_hp[i][1],
-           MV_VALS * sizeof(*cc->nmv_costs_hp[i][1]));
+    av1_copy(cpi->nmv_costs, cc->nmv_costs);
+    av1_copy(cpi->nmv_costs_hp, cc->nmv_costs_hp);
   }
 #else
   av1_copy(cpi->td.mb.nmvjointcost, cc->nmvjointcost);
 #endif
 
-  memcpy(cpi->nmvcosts[0], cc->nmvcosts[0], MV_VALS * sizeof(*cc->nmvcosts[0]));
-  memcpy(cpi->nmvcosts[1], cc->nmvcosts[1], MV_VALS * sizeof(*cc->nmvcosts[1]));
-  memcpy(cpi->nmvcosts_hp[0], cc->nmvcosts_hp[0],
-         MV_VALS * sizeof(*cc->nmvcosts_hp[0]));
-  memcpy(cpi->nmvcosts_hp[1], cc->nmvcosts_hp[1],
-         MV_VALS * sizeof(*cc->nmvcosts_hp[1]));
+  av1_copy(cpi->nmvcosts, cc->nmvcosts);
+  av1_copy(cpi->nmvcosts_hp, cc->nmvcosts_hp);
 
   av1_copy(cm->lf.last_ref_deltas, cc->last_ref_lf_deltas);
   av1_copy(cm->lf.last_mode_deltas, cc->last_mode_lf_deltas);
@@ -2117,33 +2062,15 @@
 
 #if CONFIG_REF_MV
   for (i = 0; i < NMV_CONTEXTS; ++i) {
-    CHECK_MEM_ERROR(cm, cpi->nmv_costs[i][0],
-                    aom_calloc(MV_VALS, sizeof(*cpi->nmv_costs[i][0])));
-    CHECK_MEM_ERROR(cm, cpi->nmv_costs[i][1],
-                    aom_calloc(MV_VALS, sizeof(*cpi->nmv_costs[i][1])));
-    CHECK_MEM_ERROR(cm, cpi->nmv_costs_hp[i][0],
-                    aom_calloc(MV_VALS, sizeof(*cpi->nmv_costs_hp[i][0])));
-    CHECK_MEM_ERROR(cm, cpi->nmv_costs_hp[i][1],
-                    aom_calloc(MV_VALS, sizeof(*cpi->nmv_costs_hp[i][1])));
+    memset(cpi->nmv_costs, 0, sizeof(cpi->nmv_costs));
+    memset(cpi->nmv_costs_hp, 0, sizeof(cpi->nmv_costs_hp));
   }
 #endif
 
-  CHECK_MEM_ERROR(cm, cpi->nmvcosts[0],
-                  aom_calloc(MV_VALS, sizeof(*cpi->nmvcosts[0])));
-  CHECK_MEM_ERROR(cm, cpi->nmvcosts[1],
-                  aom_calloc(MV_VALS, sizeof(*cpi->nmvcosts[1])));
-  CHECK_MEM_ERROR(cm, cpi->nmvcosts_hp[0],
-                  aom_calloc(MV_VALS, sizeof(*cpi->nmvcosts_hp[0])));
-  CHECK_MEM_ERROR(cm, cpi->nmvcosts_hp[1],
-                  aom_calloc(MV_VALS, sizeof(*cpi->nmvcosts_hp[1])));
-  CHECK_MEM_ERROR(cm, cpi->nmvsadcosts[0],
-                  aom_calloc(MV_VALS, sizeof(*cpi->nmvsadcosts[0])));
-  CHECK_MEM_ERROR(cm, cpi->nmvsadcosts[1],
-                  aom_calloc(MV_VALS, sizeof(*cpi->nmvsadcosts[1])));
-  CHECK_MEM_ERROR(cm, cpi->nmvsadcosts_hp[0],
-                  aom_calloc(MV_VALS, sizeof(*cpi->nmvsadcosts_hp[0])));
-  CHECK_MEM_ERROR(cm, cpi->nmvsadcosts_hp[1],
-                  aom_calloc(MV_VALS, sizeof(*cpi->nmvsadcosts_hp[1])));
+  memset(cpi->nmvcosts, 0, sizeof(cpi->nmvcosts));
+  memset(cpi->nmvcosts_hp, 0, sizeof(cpi->nmvcosts_hp));
+  memset(cpi->nmvsadcosts, 0, sizeof(cpi->nmvsadcosts));
+  memset(cpi->nmvsadcosts_hp, 0, sizeof(cpi->nmvsadcosts_hp));
 
   for (i = 0; i < (sizeof(cpi->mbgraph_stats) / sizeof(cpi->mbgraph_stats[0]));
        i++) {

diff --git a/av1/encoder/encoder.h b/av1/encoder/encoder.h
index b55481b..0c66905 100644
--- a/av1/encoder/encoder.h
+++ b/av1/encoder/encoder.h

@@ -414,14 +414,14 @@
   CODING_CONTEXT coding_context;
 
 #if CONFIG_REF_MV
-  int *nmv_costs[NMV_CONTEXTS][2];
-  int *nmv_costs_hp[NMV_CONTEXTS][2];
+  int nmv_costs[NMV_CONTEXTS][2][MV_VALS];
+  int nmv_costs_hp[NMV_CONTEXTS][2][MV_VALS];
 #endif
 
-  int *nmvcosts[2];
-  int *nmvcosts_hp[2];
-  int *nmvsadcosts[2];
-  int *nmvsadcosts_hp[2];
+  int nmvcosts[2][MV_VALS];
+  int nmvcosts_hp[2][MV_VALS];
+  int nmvsadcosts[2][MV_VALS];
+  int nmvsadcosts_hp[2][MV_VALS];
 
   int64_t last_time_stamp_seen;
   int64_t last_end_time_stamp_seen;

diff --git a/av1/encoder/hybrid_fwd_txfm.c b/av1/encoder/hybrid_fwd_txfm.c
index 1103c4b..6d5eccd 100644
--- a/av1/encoder/hybrid_fwd_txfm.c
+++ b/av1/encoder/hybrid_fwd_txfm.c

@@ -21,7 +21,7 @@
   if (rd_transform)
     aom_fdct32x32_rd(src, dst, src_stride);
   else
-    aom_fdct32x32(src, dst, src_stride);
+    av1_fht32x32(src, dst, src_stride, DCT_DCT);
 }
 
 static void fwd_txfm_4x4(const int16_t *src_diff, tran_low_t *coeff,

diff --git a/av1/encoder/rdopt.c b/av1/encoder/rdopt.c
index 8d151a7..8ba6b7b 100644
--- a/av1/encoder/rdopt.c
+++ b/av1/encoder/rdopt.c

@@ -1771,8 +1771,7 @@
   if (colors > 1 && colors <= 64) {
     int r, c, i, j, k;
     const int max_itr = 50;
-    int color_ctx, color_idx = 0;
-    int color_order[PALETTE_MAX_SIZE];
+    uint8_t color_order[PALETTE_MAX_SIZE];
     float *const data = x->palette_buffer->kmeans_data_buf;
     float centroids[PALETTE_MAX_SIZE];
     uint8_t *const color_map = xd->plane[0].color_index_map;
@@ -1856,13 +1855,9 @@
               1);
       for (i = 0; i < rows; ++i) {
         for (j = (i == 0 ? 1 : 0); j < cols; ++j) {
-          color_ctx = av1_get_palette_color_context(color_map, cols, i, j, k,
-                                                    color_order);
-          for (r = 0; r < k; ++r)
-            if (color_map[i * cols + j] == color_order[r]) {
-              color_idx = r;
-              break;
-            }
+          int color_idx;
+          const int color_ctx = av1_get_palette_color_context(
+              color_map, cols, i, j, k, color_order, &color_idx);
           assert(color_idx >= 0 && color_idx < k);
           this_rate += cpi->palette_y_color_cost[k - 2][color_ctx][color_idx];
         }
@@ -2507,7 +2502,7 @@
 
 static void angle_estimation(const uint8_t *src, int src_stride, int rows,
                              int cols, uint8_t *directional_mode_skip_mask) {
-  int i, r, c, dx, dy, temp, sn, remd, quot;
+  int i, r, c, index, dx, dy, temp, sn, remd, quot;
   uint64_t hist[DIRECTIONAL_MODES];
   uint64_t hist_sum = 0;
 
@@ -2515,7 +2510,6 @@
   src += src_stride;
   for (r = 1; r < rows; ++r) {
     for (c = 1; c < cols; ++c) {
-      uint8_t index;
       dx = src[c] - src[c - 1];
       dy = src[c] - src[c - src_stride];
       temp = dx * dx + dy * dy;
@@ -2538,16 +2532,16 @@
   for (i = 0; i < DIRECTIONAL_MODES; ++i) hist_sum += hist[i];
   for (i = 0; i < INTRA_MODES; ++i) {
     if (i != DC_PRED && i != TM_PRED) {
-      const uint8_t index = mode_to_angle_bin[i];
-      uint64_t score = 2 * hist[index];
+      const uint8_t angle_bin = mode_to_angle_bin[i];
+      uint64_t score = 2 * hist[angle_bin];
       int weight = 2;
-      if (index > 0) {
-        score += hist[index - 1];
-        weight += 1;
+      if (angle_bin > 0) {
+        score += hist[angle_bin - 1];
+        ++weight;
       }
-      if (index < DIRECTIONAL_MODES - 1) {
-        score += hist[index + 1];
-        weight += 1;
+      if (angle_bin < DIRECTIONAL_MODES - 1) {
+        score += hist[angle_bin + 1];
+        ++weight;
       }
       if (score * ANGLE_SKIP_THRESH < hist_sum * weight)
         directional_mode_skip_mask[i] = 1;
@@ -2559,7 +2553,7 @@
 static void highbd_angle_estimation(const uint8_t *src8, int src_stride,
                                     int rows, int cols,
                                     uint8_t *directional_mode_skip_mask) {
-  int i, r, c, dx, dy, temp, sn, remd, quot;
+  int i, r, c, index, dx, dy, temp, sn, remd, quot;
   uint64_t hist[DIRECTIONAL_MODES];
   uint64_t hist_sum = 0;
   uint16_t *src = CONVERT_TO_SHORTPTR(src8);
@@ -2568,7 +2562,6 @@
   src += src_stride;
   for (r = 1; r < rows; ++r) {
     for (c = 1; c < cols; ++c) {
-      uint8_t index;
       dx = src[c] - src[c - 1];
       dy = src[c] - src[c - src_stride];
       temp = dx * dx + dy * dy;
@@ -2591,16 +2584,16 @@
   for (i = 0; i < DIRECTIONAL_MODES; ++i) hist_sum += hist[i];
   for (i = 0; i < INTRA_MODES; ++i) {
     if (i != DC_PRED && i != TM_PRED) {
-      const uint8_t index = mode_to_angle_bin[i];
-      uint64_t score = 2 * hist[index];
+      const uint8_t angle_bin = mode_to_angle_bin[i];
+      uint64_t score = 2 * hist[angle_bin];
       int weight = 2;
-      if (index > 0) {
-        score += hist[index - 1];
-        weight += 1;
+      if (angle_bin > 0) {
+        score += hist[angle_bin - 1];
+        ++weight;
       }
-      if (index < DIRECTIONAL_MODES - 1) {
-        score += hist[index + 1];
-        weight += 1;
+      if (angle_bin < DIRECTIONAL_MODES - 1) {
+        score += hist[angle_bin + 1];
+        ++weight;
       }
       if (score * ANGLE_SKIP_THRESH < hist_sum * weight)
         directional_mode_skip_mask[i] = 1;
@@ -3654,8 +3647,7 @@
   if (colors > 1 && colors <= 64) {
     int r, c, n, i, j;
     const int max_itr = 50;
-    int color_ctx, color_idx = 0;
-    int color_order[PALETTE_MAX_SIZE];
+    uint8_t color_order[PALETTE_MAX_SIZE];
     int64_t this_sse;
     float lb_u, ub_u, val_u;
     float lb_v, ub_v, val_v;
@@ -3748,13 +3740,9 @@
 
       for (i = 0; i < rows; ++i) {
         for (j = (i == 0 ? 1 : 0); j < cols; ++j) {
-          color_ctx = av1_get_palette_color_context(color_map, cols, i, j, n,
-                                                    color_order);
-          for (r = 0; r < n; ++r)
-            if (color_map[i * cols + j] == color_order[r]) {
-              color_idx = r;
-              break;
-            }
+          int color_idx;
+          const int color_ctx = av1_get_palette_color_context(
+              color_map, cols, i, j, n, color_order, &color_idx);
           assert(color_idx >= 0 && color_idx < n);
           this_rate += cpi->palette_uv_color_cost[n - 2][color_ctx][color_idx];
         }
@@ -9385,7 +9373,7 @@
     int best_rate_nocoef;
 #endif
     int64_t distortion2 = 0, distortion_y = 0, dummy_rd = best_rd, this_rd;
-    int skippable = 0;
+    int skippable = 0, rate_overhead = 0;
     TX_SIZE best_tx_size, uv_tx;
     TX_TYPE best_tx_type;
     PALETTE_MODE_INFO palette_mode_info;

diff --git a/av1/encoder/tokenize.c b/av1/encoder/tokenize.c
index 13abe6b..67f4b5d 100644
--- a/av1/encoder/tokenize.c
+++ b/av1/encoder/tokenize.c

@@ -410,18 +410,19 @@
 }
 
 #if CONFIG_PALETTE
-void av1_tokenize_palette_sb(const AV1_COMP *cpi, struct ThreadData *const td,
-                             int plane, TOKENEXTRA **t, RUN_TYPE dry_run,
-                             BLOCK_SIZE bsize, int *rate) {
-  MACROBLOCK *const x = &td->mb;
-  MACROBLOCKD *const xd = &x->e_mbd;
-  MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
-  uint8_t *color_map = xd->plane[plane != 0].color_index_map;
-  PALETTE_MODE_INFO *pmi = &mbmi->palette_mode_info;
-  int n = pmi->palette_size[plane != 0];
-  int i, j, k;
+void av1_tokenize_palette_sb(const AV1_COMP *cpi,
+                             const struct ThreadData *const td, int plane,
+                             TOKENEXTRA **t, RUN_TYPE dry_run, BLOCK_SIZE bsize,
+                             int *rate) {
+  const MACROBLOCK *const x = &td->mb;
+  const MACROBLOCKD *const xd = &x->e_mbd;
+  const MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+  const uint8_t *const color_map = xd->plane[plane != 0].color_index_map;
+  const PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
+  const int n = pmi->palette_size[plane != 0];
+  int i, j;
   int this_rate = 0;
-  int color_idx = -1, color_ctx, color_order[PALETTE_MAX_SIZE];
+  uint8_t color_order[PALETTE_MAX_SIZE];
   const int rows = (4 * num_4x4_blocks_high_lookup[bsize]) >>
                    (xd->plane[plane != 0].subsampling_y);
   const int cols = (4 * num_4x4_blocks_wide_lookup[bsize]) >>
@@ -432,17 +433,13 @@
 
   for (i = 0; i < rows; ++i) {
     for (j = (i == 0 ? 1 : 0); j < cols; ++j) {
-      color_ctx =
-          av1_get_palette_color_context(color_map, cols, i, j, n, color_order);
-      for (k = 0; k < n; ++k)
-        if (color_map[i * cols + j] == color_order[k]) {
-          color_idx = k;
-          break;
-        }
-      assert(color_idx >= 0 && color_idx < n);
+      int color_new_idx;
+      const int color_ctx = av1_get_palette_color_context(
+          color_map, cols, i, j, n, color_order, &color_new_idx);
+      assert(color_new_idx >= 0 && color_new_idx < n);
       if (dry_run == DRY_RUN_COSTCOEFFS)
-        this_rate += cpi->palette_y_color_cost[n - 2][color_ctx][color_idx];
-      (*t)->token = color_idx;
+        this_rate += cpi->palette_y_color_cost[n - 2][color_ctx][color_new_idx];
+      (*t)->token = color_new_idx;
       (*t)->context_tree = probs[n - 2][color_ctx];
       (*t)->skip_eob_node = 0;
       ++(*t);

diff --git a/av1/encoder/tokenize.h b/av1/encoder/tokenize.h
index ae896a6..89610df 100644
--- a/av1/encoder/tokenize.h
+++ b/av1/encoder/tokenize.h

@@ -72,7 +72,7 @@
 #endif
 #if CONFIG_PALETTE
 void av1_tokenize_palette_sb(const struct AV1_COMP *cpi,
-                             struct ThreadData *const td, int plane,
+                             const struct ThreadData *const td, int plane,
                              TOKENEXTRA **t, RUN_TYPE dry_run, BLOCK_SIZE bsize,
                              int *rate);
 #endif  // CONFIG_PALETTE

diff --git a/av1/encoder/x86/hybrid_fwd_txfm_avx2.c b/av1/encoder/x86/hybrid_fwd_txfm_avx2.c
index 69bf89a..928af13 100644
--- a/av1/encoder/x86/hybrid_fwd_txfm_avx2.c
+++ b/av1/encoder/x86/hybrid_fwd_txfm_avx2.c

@@ -198,8 +198,8 @@
   in[15] = _mm256_permute2x128_si256(tr0_7, tr0_f, 0x31);
 }
 
-static void load_buffer_16x16(const int16_t *input, int stride, int flipud,
-                              int fliplr, __m256i *in) {
+static INLINE void load_buffer_16x16(const int16_t *input, int stride,
+                                     int flipud, int fliplr, __m256i *in) {
   if (!flipud) {
     in[0] = _mm256_loadu_si256((const __m256i *)(input + 0 * stride));
     in[1] = _mm256_loadu_si256((const __m256i *)(input + 1 * stride));
@@ -1273,7 +1273,6 @@
   _mm256_zeroupper();
 }
 
-#if CONFIG_EXT_TX
 static void mm256_vectors_swap(__m256i *a0, __m256i *a1, const int size) {
   int i = 0;
   __m256i temp;
@@ -1622,7 +1621,6 @@
 
   mm256_transpose_32x32(in0, in1);
 }
-#endif  // CONFIG_EXT_TX
 
 static INLINE void write_buffer_32x32(const __m256i *in0, const __m256i *in1,
                                       int stride, tran_low_t *output) {
@@ -1667,9 +1665,11 @@
   mm256_vectors_swap(in1, &in1[16], 16);
   mm256_transpose_32x32(in0, in1);
 }
+#endif  // CONFIG_EXT_TX
 
-static void load_buffer_32x32(const int16_t *input, int stride, int flipud,
-                              int fliplr, __m256i *in0, __m256i *in1) {
+static INLINE void load_buffer_32x32(const int16_t *input, int stride,
+                                     int flipud, int fliplr, __m256i *in0,
+                                     __m256i *in1) {
   // Load 4 16x16 blocks
   const int16_t *topL = input;
   const int16_t *topR = input + 16;
@@ -1708,7 +1708,6 @@
   load_buffer_16x16(topR, stride, flipud, fliplr, in1);
   load_buffer_16x16(botR, stride, flipud, fliplr, in1 + 16);
 }
-#endif  // CONFIG_EXT_TX
 
 static void nr_right_shift_32x32_16col(__m256i *in) {
   int i = 0;
@@ -1729,8 +1728,7 @@
   nr_right_shift_32x32_16col(in1);
 }
 
-#if CONFIG_EXT_TX
-static void pr_right_shift_32x32_16col(__m256i *in) {
+static INLINE void pr_right_shift_32x32_16col(__m256i *in) {
   int i = 0;
   const __m256i zero = _mm256_setzero_si256();
   const __m256i one = _mm256_set1_epi16(1);
@@ -1745,11 +1743,12 @@
 }
 
 // Positive rounding
-static void pr_right_shift_32x32(__m256i *in0, __m256i *in1) {
+static INLINE void pr_right_shift_32x32(__m256i *in0, __m256i *in1) {
   pr_right_shift_32x32_16col(in0);
   pr_right_shift_32x32_16col(in1);
 }
 
+#if CONFIG_EXT_TX
 static void fidtx32_avx2(__m256i *in0, __m256i *in1) {
   int i = 0;
   while (i < 32) {
@@ -1761,23 +1760,42 @@
 }
 #endif
 
+static INLINE int range_check_dct32x32(const __m256i *in0, const __m256i *in1,
+                                       int row) {  // Returns 1 if any 16-bit lane in in0[0..row-1] or in1[0..row-1] has magnitude above 63, otherwise 0.
+  __m256i value, bits0, bits1;
+  const __m256i bound = _mm256_set1_epi16((1 << 6) - 1);  // 63 broadcast to every 16-bit lane
+  int flag;
+  int i = 0;
+
+  while (i < row) {  // one vector from each of in0 and in1 per iteration
+    value = _mm256_abs_epi16(in0[i]);  // per-lane absolute value
+    bits0 = _mm256_cmpgt_epi16(value, bound);  // lane mask set where |in0[i]| > 63
+    value = _mm256_abs_epi16(in1[i]);
+    bits1 = _mm256_cmpgt_epi16(value, bound);  // lane mask set where |in1[i]| > 63
+    bits0 = _mm256_or_si256(bits0, bits1);  // merge the two masks
+    flag = _mm256_movemask_epi8(bits0);  // nonzero iff at least one lane exceeded the bound
+    if (flag) return 1;  // stop at the first out-of-range vector
+    i++;
+  }
+  return 0;  // every inspected lane is within [-63, 63]
+}
+
 void av1_fht32x32_avx2(const int16_t *input, tran_low_t *output, int stride,
                        int tx_type) {
   __m256i in0[32];  // left 32 columns
   __m256i in1[32];  // right 32 columns
-  (void)input;
-  (void)stride;
 
   switch (tx_type) {
-// TODO(luoyi): For DCT_DCT, fwd_txfm_32x32() uses aom set. But this
-// function has better speed. The replacement must work with the
-// corresponding inverse transform.
-// case DCT_DCT:
-//   load_buffer_32x32(input, stride, 0, 0, in0, in1);
-//   fdct32_avx2(in0, in1);
-//   pr_right_shift_32x32(in0, in1);
-//   fdct32_avx2(in0, in1);
-//   break;
+    case DCT_DCT:
+      load_buffer_32x32(input, stride, 0, 0, in0, in1);
+      if (range_check_dct32x32(in0, in1, 32)) {
+        aom_fdct32x32_avx2(input, output, stride);
+        return;
+      }
+      fdct32_avx2(in0, in1);
+      pr_right_shift_32x32(in0, in1);
+      fdct32_avx2(in0, in1);
+      break;
 #if CONFIG_EXT_TX
     case ADST_DCT:
       load_buffer_32x32(input, stride, 0, 0, in0, in1);

diff --git a/configure b/configure
index 99d2bb8..2659d37 100755
--- a/configure
+++ b/configure

@@ -616,20 +616,18 @@
         check_add_cflags -Wvla
         check_add_cflags -Wimplicit-function-declaration
         check_add_cflags -Wuninitialized
-        check_add_cflags -Wunused-variable
+        check_add_cflags -Wunused
         check_add_cflags -Wsign-compare
+        # Enabling the following warning (in combination with -Wunused above)
+        # for C++ generates errors in third_party code including googletest and
+        # libyuv. So enable it only for C code.
+        check_cflags "-Wextra" && add_cflags_only "-Wextra"
         # Enabling the following warning for C++ generates some useless warnings
         # about some function parameters shadowing class member function names.
         # So, only enable this warning for C code.
         check_cflags "-Wshadow" && add_cflags_only "-Wshadow"
-        case ${CC} in
-          *clang*) ;;
-          *) check_add_cflags -Wunused-but-set-variable ;;
-        esac
         if enabled mips || [ -z "${INLINE}" ]; then
           enabled extra_warnings || check_add_cflags -Wno-unused-function
-        else
-          check_add_cflags -Wunused-function
         fi
     fi
 

diff --git a/examples/aom_cx_set_ref.c b/examples/aom_cx_set_ref.c
index fdb9739..6beb4fb 100644
--- a/examples/aom_cx_set_ref.c
+++ b/examples/aom_cx_set_ref.c

@@ -191,8 +191,7 @@
 }
 
 static void testing_decode(aom_codec_ctx_t *encoder, aom_codec_ctx_t *decoder,
-                           aom_codec_enc_cfg_t *cfg, unsigned int frame_out,
-                           int *mismatch_seen) {
+                           unsigned int frame_out, int *mismatch_seen) {
   aom_image_t enc_img, dec_img;
   struct av1_ref_frame ref_enc, ref_dec;
 
@@ -226,11 +225,10 @@
   aom_img_free(&dec_img);
 }
 
-static int encode_frame(aom_codec_ctx_t *ecodec, aom_codec_enc_cfg_t *cfg,
-                        aom_image_t *img, unsigned int frame_in,
-                        AvxVideoWriter *writer, int test_decode,
-                        aom_codec_ctx_t *dcodec, unsigned int *frame_out,
-                        int *mismatch_seen) {
+static int encode_frame(aom_codec_ctx_t *ecodec, aom_image_t *img,
+                        unsigned int frame_in, AvxVideoWriter *writer,
+                        int test_decode, aom_codec_ctx_t *dcodec,
+                        unsigned int *frame_out, int *mismatch_seen) {
   int got_pkts = 0;
   aom_codec_iter_t iter = NULL;
   const aom_codec_cx_pkt_t *pkt = NULL;
@@ -271,7 +269,7 @@
 
   // Mismatch checking
   if (got_data && test_decode) {
-    testing_decode(ecodec, dcodec, cfg, *frame_out, mismatch_seen);
+    testing_decode(ecodec, dcodec, *frame_out, mismatch_seen);
   }
 
   return got_pkts;
@@ -280,12 +278,12 @@
 int main(int argc, char **argv) {
   FILE *infile = NULL;
   // Encoder
-  aom_codec_ctx_t ecodec = { 0 };
-  aom_codec_enc_cfg_t cfg = { 0 };
+  aom_codec_ctx_t ecodec;
+  aom_codec_enc_cfg_t cfg;
   unsigned int frame_in = 0;
   aom_image_t raw;
   aom_codec_err_t res;
-  AvxVideoInfo info = { 0 };
+  AvxVideoInfo info;
   AvxVideoWriter *writer = NULL;
   const AvxInterface *encoder = NULL;
 
@@ -311,6 +309,12 @@
   unsigned int limit = 0;
   exec_name = argv[0];
 
+  // Clear explicitly, as simply assigning "{ 0 }" generates
+  // "missing-field-initializers" warning in some compilers.
+  memset(&ecodec, 0, sizeof(ecodec));
+  memset(&cfg, 0, sizeof(cfg));
+  memset(&info, 0, sizeof(info));
+
   if (argc < 7) die("Invalid number of arguments");
 
   codec_arg = argv[1];
@@ -404,7 +408,7 @@
       }
     }
 
-    encode_frame(&ecodec, &cfg, &raw, frame_in, writer, test_decode, &dcodec,
+    encode_frame(&ecodec, &raw, frame_in, writer, test_decode, &dcodec,
                  &frame_out, &mismatch_seen);
     frame_in++;
     if (mismatch_seen) break;
@@ -412,8 +416,8 @@
 
   // Flush encoder.
   if (!mismatch_seen)
-    while (encode_frame(&ecodec, &cfg, NULL, frame_in, writer, test_decode,
-                        &dcodec, &frame_out, &mismatch_seen)) {
+    while (encode_frame(&ecodec, NULL, frame_in, writer, test_decode, &dcodec,
+                        &frame_out, &mismatch_seen)) {
     }
 
   printf("\n");

diff --git a/examples/lossless_encoder.c b/examples/lossless_encoder.c
index 069e35e..1abeb27 100644
--- a/examples/lossless_encoder.c
+++ b/examples/lossless_encoder.c

@@ -63,13 +63,17 @@
   int frame_count = 0;
   aom_image_t raw;
   aom_codec_err_t res;
-  AvxVideoInfo info = { 0 };
+  AvxVideoInfo info;
   AvxVideoWriter *writer = NULL;
   const AvxInterface *encoder = NULL;
   const int fps = 30;
 
   exec_name = argv[0];
 
+  // Clear explicitly, as simply assigning "{ 0 }" generates
+  // "missing-field-initializers" warning in some compilers.
+  memset(&info, 0, sizeof(info));
+
   if (argc < 5) die("Invalid number of arguments");
 
   encoder = get_aom_encoder_by_name("av1");

diff --git a/examples/simple_encoder.c b/examples/simple_encoder.c
index 418757d..1d2b51e 100644
--- a/examples/simple_encoder.c
+++ b/examples/simple_encoder.c

@@ -151,7 +151,7 @@
   int frame_count = 0;
   aom_image_t raw;
   aom_codec_err_t res;
-  AvxVideoInfo info = { 0 };
+  AvxVideoInfo info;
   AvxVideoWriter *writer = NULL;
   const AvxInterface *encoder = NULL;
   const int fps = 30;
@@ -168,6 +168,10 @@
 
   exec_name = argv[0];
 
+  // Clear explicitly, as simply assigning "{ 0 }" generates
+  // "missing-field-initializers" warning in some compilers.
+  memset(&info, 0, sizeof(info));
+
   if (argc != 9) die("Invalid number of arguments");
 
   codec_arg = argv[1];

diff --git a/test/av1_convolve_optimz_test.cc b/test/av1_convolve_optimz_test.cc
index b83ae94..b891e99 100644
--- a/test/av1_convolve_optimz_test.cc
+++ b/test/av1_convolve_optimz_test.cc

@@ -54,7 +54,6 @@
 const size_t maxBlockSize = maxWidth * maxHeight;
 const int horizOffset = 32;
 const int vertiOffset = 32;
-const size_t testMaxBlk = 128;
 const int stride = 128;
 const int x_step_q4 = 16;
 
@@ -90,7 +89,7 @@
   void RunVertFilterBitExactCheck();
 
  private:
-  void PrepFilterBuffer(int w, int h);
+  void PrepFilterBuffer();
   void DiffFilterBuffer();
   conv_filter_t conv_horiz_;
   conv_filter_t conv_vert_;
@@ -106,7 +105,7 @@
   int avg_;
 };
 
-void AV1ConvolveOptimzTest::PrepFilterBuffer(int w, int h) {
+void AV1ConvolveOptimzTest::PrepFilterBuffer() {
   int r, c;
   ACMRandom rnd(ACMRandom::DeterministicSeed());
 
@@ -150,7 +149,7 @@
 }
 
 void AV1ConvolveOptimzTest::RunHorizFilterBitExactCheck() {
-  PrepFilterBuffer(testMaxBlk, testMaxBlk);
+  PrepFilterBuffer();
 
   InterpFilterParams filter_params = av1_get_interp_filter_params(filter_);
 
@@ -167,7 +166,7 @@
   // and test again.
   int intermediate_height =
       (((height_ - 1) * 16 + subpel_) >> SUBPEL_BITS) + filter_params.taps;
-  PrepFilterBuffer(testMaxBlk, testMaxBlk);
+  PrepFilterBuffer();
 
   av1_convolve_horiz_c(src_ref_, stride, dst_ref_, stride, width_,
                        intermediate_height, filter_params, subpel_, x_step_q4,
@@ -180,7 +179,7 @@
 }
 
 void AV1ConvolveOptimzTest::RunVertFilterBitExactCheck() {
-  PrepFilterBuffer(testMaxBlk, testMaxBlk);
+  PrepFilterBuffer();
 
   InterpFilterParams filter_params = av1_get_interp_filter_params(filter_);
 
@@ -266,7 +265,7 @@
   void RunVertFilterBitExactCheck();
 
  private:
-  void PrepFilterBuffer(int w, int h);
+  void PrepFilterBuffer();
   void DiffFilterBuffer();
   hbd_conv_filter_t conv_horiz_;
   hbd_conv_filter_t conv_vert_;
@@ -283,7 +282,7 @@
   int bit_depth_;
 };
 
-void AV1HbdConvolveOptimzTest::PrepFilterBuffer(int w, int h) {
+void AV1HbdConvolveOptimzTest::PrepFilterBuffer() {
   int r, c;
   ACMRandom rnd(ACMRandom::DeterministicSeed());
 
@@ -326,7 +325,7 @@
 }
 
 void AV1HbdConvolveOptimzTest::RunHorizFilterBitExactCheck() {
-  PrepFilterBuffer(testMaxBlk, testMaxBlk);
+  PrepFilterBuffer();
 
   InterpFilterParams filter_params = av1_get_interp_filter_params(filter_);
 
@@ -344,7 +343,7 @@
   // and test again.
   int intermediate_height =
       (((height_ - 1) * 16 + subpel_) >> SUBPEL_BITS) + filter_params.taps;
-  PrepFilterBuffer(testMaxBlk, testMaxBlk);
+  PrepFilterBuffer();
 
   av1_highbd_convolve_horiz_c(src_, stride, dst_ref_, stride, width_,
                               intermediate_height, filter_params, subpel_,
@@ -357,7 +356,7 @@
 }
 
 void AV1HbdConvolveOptimzTest::RunVertFilterBitExactCheck() {
-  PrepFilterBuffer(testMaxBlk, testMaxBlk);
+  PrepFilterBuffer();
 
   InterpFilterParams filter_params = av1_get_interp_filter_params(filter_);
 

diff --git a/test/av1_dct_test.cc b/test/av1_dct_test.cc
index ac1a551..d5c23f6 100644
--- a/test/av1_dct_test.cc
+++ b/test/av1_dct_test.cc

@@ -102,5 +102,6 @@
     C, AV1FwdTxfm,
     ::testing::Values(FdctParam(&fdct4, &reference_dct_1d, 4, 1),
                       FdctParam(&fdct8, &reference_dct_1d, 8, 1),
-                      FdctParam(&fdct16, &reference_dct_1d, 16, 2)));
+                      FdctParam(&fdct16, &reference_dct_1d, 16, 2),
+                      FdctParam(&fdct32, &reference_dct_1d, 32, 3)));
 }  // namespace

diff --git a/test/codec_factory.h b/test/codec_factory.h
index c92d5c1..b645102 100644
--- a/test/codec_factory.h
+++ b/test/codec_factory.h

@@ -123,6 +123,9 @@
 #if CONFIG_AV1_DECODER
     return new AV1Decoder(cfg, flags, deadline);
 #else
+    (void)cfg;
+    (void)flags;
+    (void)deadline;
     return NULL;
 #endif
   }
@@ -134,6 +137,10 @@
 #if CONFIG_AV1_ENCODER
     return new AV1Encoder(cfg, deadline, init_flags, stats);
 #else
+    (void)cfg;
+    (void)deadline;
+    (void)init_flags;
+    (void)stats;
     return NULL;
 #endif
   }
@@ -143,6 +150,8 @@
 #if CONFIG_AV1_ENCODER
     return aom_codec_enc_config_default(&aom_codec_av1_cx_algo, cfg, usage);
 #else
+    (void)cfg;
+    (void)usage;
     return AOM_CODEC_INCAPABLE;
 #endif
   }

diff --git a/test/dct16x16_test.cc b/test/dct16x16_test.cc
index e73daa5..9811955 100644
--- a/test/dct16x16_test.cc
+++ b/test/dct16x16_test.cc

@@ -264,12 +264,12 @@
 }
 
 void idct16x16_10_ref(const tran_low_t *in, uint8_t *out, int stride,
-                      int tx_type) {
+                      int /*tx_type*/) {
   idct16x16_10(in, out, stride);
 }
 
 void idct16x16_12_ref(const tran_low_t *in, uint8_t *out, int stride,
-                      int tx_type) {
+                      int /*tx_type*/) {
   idct16x16_12(in, out, stride);
 }
 
@@ -727,7 +727,7 @@
   virtual void TearDown() { libaom_test::ClearSystemState(); }
 
  protected:
-  void RunFwdTxfm(int16_t *in, tran_low_t *out, int stride) {}
+  void RunFwdTxfm(int16_t * /*in*/, tran_low_t * /*out*/, int /*stride*/) {}
   void RunInvTxfm(tran_low_t *out, uint8_t *dst, int stride) {
     inv_txfm_(out, dst, stride);
   }

diff --git a/test/decode_test_driver.cc b/test/decode_test_driver.cc
index 95a0eb5..7adb9d6 100644
--- a/test/decode_test_driver.cc
+++ b/test/decode_test_driver.cc

@@ -92,7 +92,7 @@
 
       aom_codec_err_t res_dec =
           decoder->DecodeFrame(video->cxdata(), video->frame_size());
-      if (!HandleDecodeResult(res_dec, *video, decoder)) break;
+      if (!HandleDecodeResult(res_dec, decoder)) break;
     } else {
       // Signal end of the file to the decoder.
       const aom_codec_err_t res_dec = decoder->DecodeFrame(NULL, 0);

diff --git a/test/decode_test_driver.h b/test/decode_test_driver.h
index aabca40..b8f8d1a 100644
--- a/test/decode_test_driver.h
+++ b/test/decode_test_driver.h

@@ -141,7 +141,6 @@
 
   // Hook to be called to handle decode result. Return true to continue.
   virtual bool HandleDecodeResult(const aom_codec_err_t res_dec,
-                                  const CompressedVideoSource & /*video*/,
                                   Decoder *decoder) {
     EXPECT_EQ(AOM_CODEC_OK, res_dec) << decoder->DecodeError();
     return AOM_CODEC_OK == res_dec;

diff --git a/test/encode_test_driver.cc b/test/encode_test_driver.cc
index c1a0cb7..092e669 100644
--- a/test/encode_test_driver.cc
+++ b/test/encode_test_driver.cc

@@ -275,7 +275,7 @@
               aom_codec_err_t res_dec = decoder->DecodeFrame(
                   (const uint8_t *)pkt->data.frame.buf, pkt->data.frame.sz);
 
-              if (!HandleDecodeResult(res_dec, *video, decoder.get())) break;
+              if (!HandleDecodeResult(res_dec, decoder.get())) break;
 
               has_dxdata = true;
             }
@@ -293,7 +293,7 @@
       // Flush the decoder when there are no more fragments.
       if ((init_flags_ & AOM_CODEC_USE_OUTPUT_PARTITION) && has_dxdata) {
         const aom_codec_err_t res_dec = decoder->DecodeFrame(NULL, 0);
-        if (!HandleDecodeResult(res_dec, *video, decoder.get())) break;
+        if (!HandleDecodeResult(res_dec, decoder.get())) break;
       }
 
       if (has_dxdata && has_cxdata) {

diff --git a/test/encode_test_driver.h b/test/encode_test_driver.h
index 11c387a..45a080e 100644
--- a/test/encode_test_driver.h
+++ b/test/encode_test_driver.h

@@ -228,7 +228,6 @@
 
   // Hook to be called to handle decode result. Return true to continue.
   virtual bool HandleDecodeResult(const aom_codec_err_t res_dec,
-                                  const VideoSource & /*video*/,
                                   Decoder *decoder) {
     EXPECT_EQ(AOM_CODEC_OK, res_dec) << decoder->DecodeError();
     return AOM_CODEC_OK == res_dec;

diff --git a/test/encoder_parms_get_to_decoder.cc b/test/encoder_parms_get_to_decoder.cc
index 640e12f..52d68b1 100644
--- a/test/encoder_parms_get_to_decoder.cc
+++ b/test/encoder_parms_get_to_decoder.cc

@@ -94,7 +94,6 @@
   }
 
   virtual bool HandleDecodeResult(const aom_codec_err_t res_dec,
-                                  const libaom_test::VideoSource & /*video*/,
                                   libaom_test::Decoder *decoder) {
     aom_codec_ctx_t *const av1_decoder = decoder->GetDecoder();
     aom_codec_alg_priv_t *const priv =

diff --git a/test/error_resilience_test.cc b/test/error_resilience_test.cc
index 951c47f..07b6039 100644
--- a/test/error_resilience_test.cc
+++ b/test/error_resilience_test.cc

@@ -55,8 +55,7 @@
     nframes_++;
   }
 
-  virtual void PreEncodeFrameHook(libaom_test::VideoSource *video,
-                                  ::libaom_test::Encoder * /*encoder*/) {
+  virtual void PreEncodeFrameHook(libaom_test::VideoSource *video) {
     frame_flags_ &=
         ~(AOM_EFLAG_NO_UPD_LAST | AOM_EFLAG_NO_UPD_GF | AOM_EFLAG_NO_UPD_ARF);
     if (droppable_nframes_ > 0 &&

diff --git a/test/fht32x32_test.cc b/test/fht32x32_test.cc
index a949ebf..3d07b44 100644
--- a/test/fht32x32_test.cc
+++ b/test/fht32x32_test.cc

@@ -69,6 +69,7 @@
     inv_txfm_ = GET_PARAM(1);
     tx_type_ = GET_PARAM(2);
     pitch_ = 32;
+    height_ = 32;
     fwd_txfm_ref = fht32x32_ref;
     bit_depth_ = GET_PARAM(3);
     mask_ = (1 << bit_depth_) - 1;
@@ -90,6 +91,7 @@
 };
 
 TEST_P(AV1Trans32x32HT, CoeffCheck) { RunCoeffCheck(); }
+TEST_P(AV1Trans32x32HT, MemCheck) { RunMemCheck(); }
 
 #if CONFIG_AOM_HIGHBITDEPTH
 class AV1HighbdTrans32x32HT
@@ -164,8 +166,7 @@
 
 #if HAVE_AVX2
 const Ht32x32Param kArrayHt32x32Param_avx2[] = {
-  // TODO(luoyi): DCT_DCT tx_type is not enabled in av1_fht32x32_c(avx2) yet.
-  // make_tuple(&av1_fht32x32_avx2, dummy_inv_txfm, 0, AOM_BITS_8, 1024),
+  make_tuple(&av1_fht32x32_avx2, dummy_inv_txfm, 0, AOM_BITS_8, 1024),
   make_tuple(&av1_fht32x32_avx2, dummy_inv_txfm, 1, AOM_BITS_8, 1024),
   make_tuple(&av1_fht32x32_avx2, dummy_inv_txfm, 2, AOM_BITS_8, 1024),
   make_tuple(&av1_fht32x32_avx2, dummy_inv_txfm, 3, AOM_BITS_8, 1024),

diff --git a/test/frame_size_tests.cc b/test/frame_size_tests.cc
index f1fad70..25b8718 100644
--- a/test/frame_size_tests.cc
+++ b/test/frame_size_tests.cc

@@ -28,7 +28,6 @@
   }
 
   virtual bool HandleDecodeResult(const aom_codec_err_t res_dec,
-                                  const libaom_test::VideoSource & /*video*/,
                                   libaom_test::Decoder *decoder) {
     EXPECT_EQ(expected_res_, res_dec) << decoder->DecodeError();
     return !::testing::Test::HasFailure();

diff --git a/test/variance_test.cc b/test/variance_test.cc
index 5ff5090..7848e20 100644
--- a/test/variance_test.cc
+++ b/test/variance_test.cc

@@ -101,8 +101,7 @@
   }
   RoundHighBitDepth(bit_depth, &se, &sse);
   *sse_ptr = static_cast<uint32_t>(sse);
-  return static_cast<uint32_t>(
-      sse - ((static_cast<int64_t>(se) * se) >> (l2w + l2h)));
+  return static_cast<uint32_t>(sse - ((se * se) >> (l2w + l2h)));
 }
 
 /* The subpel reference functions differ from the codec version in one aspect:
@@ -157,8 +156,7 @@
   }
   RoundHighBitDepth(bit_depth, &se, &sse);
   *sse_ptr = static_cast<uint32_t>(sse);
-  return static_cast<uint32_t>(
-      sse - ((static_cast<int64_t>(se) * se) >> (l2w + l2h)));
+  return static_cast<uint32_t>(sse - ((se * se) >> (l2w + l2h)));
 }
 
 static uint32_t subpel_avg_variance_ref(const uint8_t *ref, const uint8_t *src,
@@ -211,8 +209,7 @@
   }
   RoundHighBitDepth(bit_depth, &se, &sse);
   *sse_ptr = static_cast<uint32_t>(sse);
-  return static_cast<uint32_t>(
-      sse - ((static_cast<int64_t>(se) * se) >> (l2w + l2h)));
+  return static_cast<uint32_t>(sse - ((se * se) >> (l2w + l2h)));
 }
 
 ////////////////////////////////////////////////////////////////////////////////

diff --git a/webmenc.cc b/webmenc.cc
index f78f027..e3d209a 100644
--- a/webmenc.cc
+++ b/webmenc.cc

@@ -24,7 +24,6 @@
 
 void write_webm_file_header(struct WebmOutputContext *webm_ctx,
                             const aom_codec_enc_cfg_t *cfg,
-                            const struct aom_rational *fps,
                             stereo_format_t stereo_fmt, unsigned int fourcc,
                             const struct AvxRational *par) {
   mkvmuxer::MkvWriter *const writer = new mkvmuxer::MkvWriter(webm_ctx->stream);

diff --git a/webmenc.h b/webmenc.h
index 90211ff..74387fb 100644
--- a/webmenc.h
+++ b/webmenc.h

@@ -40,7 +40,6 @@
 
 void write_webm_file_header(struct WebmOutputContext *webm_ctx,
                             const aom_codec_enc_cfg_t *cfg,
-                            const struct aom_rational *fps,
                             stereo_format_t stereo_fmt, unsigned int fourcc,
                             const struct AvxRational *par);