Merge "Refactor tx_size to pixel number in decodeframe.c" into nextgenv2
diff --git a/aom_dsp/answriter.h b/aom_dsp/answriter.h
index 298b255..370472a 100644
--- a/aom_dsp/answriter.h
+++ b/aom_dsp/answriter.h
@@ -20,8 +20,23 @@
 #include "aom_dsp/ans.h"
 #include "aom_dsp/prob.h"
 #include "aom_ports/mem_ops.h"
+#include "av1/common/odintrin.h"
 
-#define ANS_DIV(dividend, divisor) ((dividend) / (divisor))
+#if RANS_PRECISION <= OD_DIVU_DMAX
+#define ANS_DIVREM(quotient, remainder, dividend, divisor) \
+  do {                                                     \
+    quotient = OD_DIVU_SMALL((dividend), (divisor));       \
+    remainder = (dividend) - (quotient) * (divisor);       \
+  } while (0)
+#else
+#define ANS_DIVREM(quotient, remainder, dividend, divisor) \
+  do {                                                     \
+    quotient = (dividend) / (divisor);                     \
+    remainder = (dividend) % (divisor);                    \
+  } while (0)
+#endif
+
+#define ANS_DIV8(dividend, divisor) OD_DIVU_SMALL((dividend), (divisor))
 
 #ifdef __cplusplus
 extern "C" {
@@ -72,9 +87,9 @@
     ans->state /= IO_BASE;
   }
   if (!val)
-    ans->state = ANS_DIV(ans->state * ANS_P8_PRECISION, p0);
+    ans->state = ANS_DIV8(ans->state * ANS_P8_PRECISION, p0);
   else
-    ans->state = ANS_DIV((ans->state + 1) * ANS_P8_PRECISION + p - 1, p) - 1;
+    ans->state = ANS_DIV8((ans->state + 1) * ANS_P8_PRECISION + p - 1, p) - 1;
 }
 
 struct rans_sym {
@@ -88,15 +103,17 @@
 static INLINE void rans_write(struct AnsCoder *ans,
                               const struct rans_sym *const sym) {
   const aom_cdf_prob p = sym->prob;
+  unsigned quot, rem;
   while (ans->state >= L_BASE / RANS_PRECISION * IO_BASE * p) {
     ans->buf[ans->buf_offset++] = ans->state % IO_BASE;
     ans->state /= IO_BASE;
   }
-  ans->state =
-      (ans->state / p) * RANS_PRECISION + ans->state % p + sym->cum_prob;
+  ANS_DIVREM(quot, rem, ans->state, p);
+  ans->state = quot * RANS_PRECISION + rem + sym->cum_prob;
 }
 
-#undef ANS_DIV
+#undef ANS_DIV8
+#undef ANS_DIVREM
 #ifdef __cplusplus
 }  // extern "C"
 #endif  // __cplusplus
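
Note on the division change: the divisors fed to ANS_DIV8 and ANS_DIVREM are
bounded by the coder's probability precision, so the #if above can route them
through OD_DIVU_SMALL from av1/common/odintrin.h whenever the divisor range
fits the table (RANS_PRECISION <= OD_DIVU_DMAX), replacing a hardware divide
with a multiply by a precomputed per-divisor constant. A minimal self-checking
sketch of the reciprocal idea (the real OD_DIVU_SMALL table differs in detail):

    #include <assert.h>
    #include <stdint.h>

    /* Divide by a small constant d via r = ceil(2^32 / d).  Exact at
       least while n < 2^32 / d for table-sized divisors, which covers
       the divisor-bounded coder state used here. */
    static uint32_t divu_by_recip(uint32_t n, uint64_t r) {
      return (uint32_t)(((uint64_t)n * r) >> 32);
    }

    int main(void) {
      uint32_t d = 17;
      uint64_t r = ((1ULL << 32) + d - 1) / d; /* ceil(2^32 / d) */
      for (uint32_t n = 0; n < 100000; n++)
        assert(divu_by_recip(n, r) == n / d);
      return 0;
    }

ANS_DIVREM then recovers the remainder with one multiply-subtract, so
rans_write needs a single division where it previously used both "/" and "%".
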
diff --git a/aom_dsp/aom_dsp.mk b/aom_dsp/aom_dsp.mk
index c74bfe3..28e7f12 100644
--- a/aom_dsp/aom_dsp.mk
+++ b/aom_dsp/aom_dsp.mk
@@ -394,9 +394,13 @@
 DSP_SRCS-yes += simd/v64_intrinsics_c.h
 DSP_SRCS-yes += simd/v128_intrinsics.h
 DSP_SRCS-yes += simd/v128_intrinsics_c.h
+DSP_SRCS-yes += simd/v256_intrinsics.h
+DSP_SRCS-yes += simd/v256_intrinsics_c.h
 DSP_SRCS-$(HAVE_SSE2) += simd/v64_intrinsics_x86.h
 DSP_SRCS-$(HAVE_SSE2) += simd/v128_intrinsics_x86.h
+DSP_SRCS-$(HAVE_SSE2) += simd/v256_intrinsics_x86.h
 DSP_SRCS-$(HAVE_NEON) += simd/v64_intrinsics_arm.h
 DSP_SRCS-$(HAVE_NEON) += simd/v128_intrinsics_arm.h
+DSP_SRCS-$(HAVE_NEON) += simd/v256_intrinsics_arm.h
 
 $(eval $(call rtcd_h_template,aom_dsp_rtcd,aom_dsp/aom_dsp_rtcd_defs.pl))
diff --git a/aom_dsp/aom_simd.h b/aom_dsp/aom_simd.h
index 3879d95..ae4ff23 100644
--- a/aom_dsp/aom_simd.h
+++ b/aom_dsp/aom_simd.h
@@ -22,11 +22,11 @@
 #include "./aom_simd_inline.h"
 
 #if HAVE_NEON
-#include "simd/v128_intrinsics_arm.h"
+#include "simd/v256_intrinsics_arm.h"
 #elif HAVE_SSE2
-#include "simd/v128_intrinsics_x86.h"
+#include "simd/v256_intrinsics_x86.h"
 #else
-#include "simd/v128_intrinsics.h"
+#include "simd/v256_intrinsics.h"
 #endif
 
 #endif  // AOM_DSP_AOM_AOM_SIMD_H_
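
With aom_simd.h now exposing the 256-bit layer, generic DSP code picks up
v256 on every target and falls back to paired v128 operations or plain C
where no native implementation exists. A hypothetical caller, using only
functions added in this patch (buffers assumed 32-byte aligned):

    #include "aom_dsp/aom_simd.h"

    /* 32-byte SAD through the generic v256 API. */
    static uint32_t sad_32bytes(const uint8_t *a, const uint8_t *b) {
      sad256_internal s = v256_sad_u8_init();
      s = v256_sad_u8(s, v256_load_aligned(a), v256_load_aligned(b));
      return v256_sad_u8_sum(s);
    }
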
diff --git a/aom_dsp/arm/aom_convolve8_avg_neon.c b/aom_dsp/arm/aom_convolve8_avg_neon.c
index 7dc936d..09429d6 100644
--- a/aom_dsp/arm/aom_convolve8_avg_neon.c
+++ b/aom_dsp/arm/aom_convolve8_avg_neon.c
@@ -65,6 +65,10 @@
 
   assert(x_step_q4 == 16);
 
+  (void)x_step_q4;
+  (void)y_step_q4;
+  (void)filter_y;
+
   q0s16 = vld1q_s16(filter_x);
 
   src -= 3;                // adjust for taps
@@ -241,6 +245,10 @@
 
   assert(y_step_q4 == 16);
 
+  (void)x_step_q4;
+  (void)y_step_q4;
+  (void)filter_x;
+
   src -= src_stride * 3;
   q0s16 = vld1q_s16(filter_y);
   for (; w > 0; w -= 4, src += 4, dst += 4) {  // loop_vert_h
diff --git a/aom_dsp/arm/aom_convolve8_neon.c b/aom_dsp/arm/aom_convolve8_neon.c
index ed0df6d..8ebffb5 100644
--- a/aom_dsp/arm/aom_convolve8_neon.c
+++ b/aom_dsp/arm/aom_convolve8_neon.c
@@ -65,6 +65,10 @@
 
   assert(x_step_q4 == 16);
 
+  (void)x_step_q4;
+  (void)y_step_q4;
+  (void)filter_y;
+
   q0s16 = vld1q_s16(filter_x);
 
   src -= 3;  // adjust for taps
@@ -225,6 +229,10 @@
 
   assert(y_step_q4 == 16);
 
+  (void)x_step_q4;
+  (void)y_step_q4;
+  (void)filter_x;
+
   src -= src_stride * 3;
   q0s16 = vld1q_s16(filter_y);
   for (; w > 0; w -= 4, src += 4, dst += 4) {  // loop_vert_h
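
The (void) casts added here and in aom_convolve8_avg_neon.c above exist
because the asserts are these parameters' only use: with -DNDEBUG the assert
expands to nothing, and -Wunused-parameter (fatal under -Werror) would break
release builds. A reduced illustration (hypothetical function name):

    #include <assert.h>

    static void check_step(int step) {
      assert(step == 16); /* compiles away under NDEBUG */
      (void)step;         /* keeps -Wunused-parameter quiet either way */
    }
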
diff --git a/aom_dsp/simd/v256_intrinsics.h b/aom_dsp/simd/v256_intrinsics.h
new file mode 100644
index 0000000..73bcd94
--- /dev/null
+++ b/aom_dsp/simd/v256_intrinsics.h
@@ -0,0 +1,274 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef _V256_INTRINSICS_H
+#define _V256_INTRINSICS_H
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "./v256_intrinsics_c.h"
+#include "./v128_intrinsics.h"
+#include "./v64_intrinsics.h"
+
+/* Fallback to plain, unoptimised C. */
+
+typedef c_v256 v256;
+
+SIMD_INLINE uint32_t v256_low_u32(v256 a) { return c_v256_low_u32(a); }
+SIMD_INLINE v64 v256_low_v64(v256 a) { return c_v256_low_v64(a); }
+SIMD_INLINE v128 v256_low_v128(v256 a) { return c_v256_low_v128(a); }
+SIMD_INLINE v128 v256_high_v128(v256 a) { return c_v256_high_v128(a); }
+SIMD_INLINE v256 v256_from_v128(v128 hi, v128 lo) {
+  return c_v256_from_v128(hi, lo);
+}
+SIMD_INLINE v256 v256_from_64(uint64_t a, uint64_t b, uint64_t c, uint64_t d) {
+  return c_v256_from_64(a, b, c, d);
+}
+SIMD_INLINE v256 v256_from_v64(v64 a, v64 b, v64 c, v64 d) {
+  return c_v256_from_v64(a, b, c, d);
+}
+
+SIMD_INLINE v256 v256_load_unaligned(const void *p) {
+  return c_v256_load_unaligned(p);
+}
+SIMD_INLINE v256 v256_load_aligned(const void *p) {
+  return c_v256_load_aligned(p);
+}
+
+SIMD_INLINE void v256_store_unaligned(void *p, v256 a) {
+  c_v256_store_unaligned(p, a);
+}
+SIMD_INLINE void v256_store_aligned(void *p, v256 a) {
+  c_v256_store_aligned(p, a);
+}
+
+SIMD_INLINE v256 v256_align(v256 a, v256 b, const unsigned int c) {
+  return c_v256_align(a, b, c);
+}
+
+SIMD_INLINE v256 v256_zero() { return c_v256_zero(); }
+SIMD_INLINE v256 v256_dup_8(uint8_t x) { return c_v256_dup_8(x); }
+SIMD_INLINE v256 v256_dup_16(uint16_t x) { return c_v256_dup_16(x); }
+SIMD_INLINE v256 v256_dup_32(uint32_t x) { return c_v256_dup_32(x); }
+
+typedef uint32_t sad256_internal;
+SIMD_INLINE sad256_internal v256_sad_u8_init() { return c_v256_sad_u8_init(); }
+SIMD_INLINE sad256_internal v256_sad_u8(sad256_internal s, v256 a, v256 b) {
+  return c_v256_sad_u8(s, a, b);
+}
+SIMD_INLINE uint32_t v256_sad_u8_sum(sad256_internal s) {
+  return c_v256_sad_u8_sum(s);
+}
+typedef uint32_t ssd256_internal;
+SIMD_INLINE ssd256_internal v256_ssd_u8_init() { return c_v256_ssd_u8_init(); }
+SIMD_INLINE ssd256_internal v256_ssd_u8(ssd256_internal s, v256 a, v256 b) {
+  return c_v256_ssd_u8(s, a, b);
+}
+SIMD_INLINE uint32_t v256_ssd_u8_sum(ssd256_internal s) {
+  return c_v256_ssd_u8_sum(s);
+}
+SIMD_INLINE int64_t v256_dotp_s16(v256 a, v256 b) {
+  return c_v256_dotp_s16(a, b);
+}
+SIMD_INLINE uint64_t v256_hadd_u8(v256 a) { return c_v256_hadd_u8(a); }
+
+SIMD_INLINE v256 v256_or(v256 a, v256 b) { return c_v256_or(a, b); }
+SIMD_INLINE v256 v256_xor(v256 a, v256 b) { return c_v256_xor(a, b); }
+SIMD_INLINE v256 v256_and(v256 a, v256 b) { return c_v256_and(a, b); }
+SIMD_INLINE v256 v256_andn(v256 a, v256 b) { return c_v256_andn(a, b); }
+
+SIMD_INLINE v256 v256_add_8(v256 a, v256 b) { return c_v256_add_8(a, b); }
+SIMD_INLINE v256 v256_add_16(v256 a, v256 b) { return c_v256_add_16(a, b); }
+SIMD_INLINE v256 v256_sadd_s16(v256 a, v256 b) { return c_v256_sadd_s16(a, b); }
+SIMD_INLINE v256 v256_add_32(v256 a, v256 b) { return c_v256_add_32(a, b); }
+SIMD_INLINE v256 v256_padd_s16(v256 a) { return c_v256_padd_s16(a); }
+SIMD_INLINE v256 v256_sub_8(v256 a, v256 b) { return c_v256_sub_8(a, b); }
+SIMD_INLINE v256 v256_ssub_u8(v256 a, v256 b) { return c_v256_ssub_u8(a, b); }
+SIMD_INLINE v256 v256_ssub_s8(v256 a, v256 b) { return c_v256_ssub_s8(a, b); }
+SIMD_INLINE v256 v256_sub_16(v256 a, v256 b) { return c_v256_sub_16(a, b); }
+SIMD_INLINE v256 v256_ssub_s16(v256 a, v256 b) { return c_v256_ssub_s16(a, b); }
+SIMD_INLINE v256 v256_sub_32(v256 a, v256 b) { return c_v256_sub_32(a, b); }
+SIMD_INLINE v256 v256_abs_s16(v256 a) { return c_v256_abs_s16(a); }
+
+SIMD_INLINE v256 v256_mul_s16(v128 a, v128 b) { return c_v256_mul_s16(a, b); }
+SIMD_INLINE v256 v256_mullo_s16(v256 a, v256 b) {
+  return c_v256_mullo_s16(a, b);
+}
+SIMD_INLINE v256 v256_mulhi_s16(v256 a, v256 b) {
+  return c_v256_mulhi_s16(a, b);
+}
+SIMD_INLINE v256 v256_mullo_s32(v256 a, v256 b) {
+  return c_v256_mullo_s32(a, b);
+}
+SIMD_INLINE v256 v256_madd_s16(v256 a, v256 b) { return c_v256_madd_s16(a, b); }
+SIMD_INLINE v256 v256_madd_us8(v256 a, v256 b) { return c_v256_madd_us8(a, b); }
+
+SIMD_INLINE v256 v256_avg_u8(v256 a, v256 b) { return c_v256_avg_u8(a, b); }
+SIMD_INLINE v256 v256_rdavg_u8(v256 a, v256 b) { return c_v256_rdavg_u8(a, b); }
+SIMD_INLINE v256 v256_avg_u16(v256 a, v256 b) { return c_v256_avg_u16(a, b); }
+SIMD_INLINE v256 v256_min_u8(v256 a, v256 b) { return c_v256_min_u8(a, b); }
+SIMD_INLINE v256 v256_max_u8(v256 a, v256 b) { return c_v256_max_u8(a, b); }
+SIMD_INLINE v256 v256_min_s8(v256 a, v256 b) { return c_v256_min_s8(a, b); }
+SIMD_INLINE v256 v256_max_s8(v256 a, v256 b) { return c_v256_max_s8(a, b); }
+SIMD_INLINE v256 v256_min_s16(v256 a, v256 b) { return c_v256_min_s16(a, b); }
+SIMD_INLINE v256 v256_max_s16(v256 a, v256 b) { return c_v256_max_s16(a, b); }
+
+SIMD_INLINE v256 v256_ziplo_8(v256 a, v256 b) { return c_v256_ziplo_8(a, b); }
+SIMD_INLINE v256 v256_ziphi_8(v256 a, v256 b) { return c_v256_ziphi_8(a, b); }
+SIMD_INLINE v256 v256_ziplo_16(v256 a, v256 b) { return c_v256_ziplo_16(a, b); }
+SIMD_INLINE v256 v256_ziphi_16(v256 a, v256 b) { return c_v256_ziphi_16(a, b); }
+SIMD_INLINE v256 v256_ziplo_32(v256 a, v256 b) { return c_v256_ziplo_32(a, b); }
+SIMD_INLINE v256 v256_ziphi_32(v256 a, v256 b) { return c_v256_ziphi_32(a, b); }
+SIMD_INLINE v256 v256_ziplo_64(v256 a, v256 b) { return c_v256_ziplo_64(a, b); }
+SIMD_INLINE v256 v256_ziphi_64(v256 a, v256 b) { return c_v256_ziphi_64(a, b); }
+SIMD_INLINE v256 v256_ziplo_128(v256 a, v256 b) {
+  return c_v256_ziplo_128(a, b);
+}
+SIMD_INLINE v256 v256_ziphi_128(v256 a, v256 b) {
+  return c_v256_ziphi_128(a, b);
+}
+SIMD_INLINE v256 v256_zip_8(v128 a, v128 b) { return c_v256_zip_8(a, b); }
+SIMD_INLINE v256 v256_zip_16(v128 a, v128 b) { return c_v256_zip_16(a, b); }
+SIMD_INLINE v256 v256_zip_32(v128 a, v128 b) { return c_v256_zip_32(a, b); }
+SIMD_INLINE v256 v256_unziplo_8(v256 a, v256 b) {
+  return c_v256_unziplo_8(a, b);
+}
+SIMD_INLINE v256 v256_unziphi_8(v256 a, v256 b) {
+  return c_v256_unziphi_8(a, b);
+}
+SIMD_INLINE v256 v256_unziplo_16(v256 a, v256 b) {
+  return c_v256_unziplo_16(a, b);
+}
+SIMD_INLINE v256 v256_unziphi_16(v256 a, v256 b) {
+  return c_v256_unziphi_16(a, b);
+}
+SIMD_INLINE v256 v256_unziplo_32(v256 a, v256 b) {
+  return c_v256_unziplo_32(a, b);
+}
+SIMD_INLINE v256 v256_unziphi_32(v256 a, v256 b) {
+  return c_v256_unziphi_32(a, b);
+}
+SIMD_INLINE v256 v256_unpack_u8_s16(v128 a) { return c_v256_unpack_u8_s16(a); }
+SIMD_INLINE v256 v256_unpacklo_u8_s16(v256 a) {
+  return c_v256_unpacklo_u8_s16(a);
+}
+SIMD_INLINE v256 v256_unpackhi_u8_s16(v256 a) {
+  return c_v256_unpackhi_u8_s16(a);
+}
+SIMD_INLINE v256 v256_pack_s32_s16(v256 a, v256 b) {
+  return c_v256_pack_s32_s16(a, b);
+}
+SIMD_INLINE v256 v256_pack_s16_u8(v256 a, v256 b) {
+  return c_v256_pack_s16_u8(a, b);
+}
+SIMD_INLINE v256 v256_pack_s16_s8(v256 a, v256 b) {
+  return c_v256_pack_s16_s8(a, b);
+}
+SIMD_INLINE v256 v256_unpack_u16_s32(v128 a) {
+  return c_v256_unpack_u16_s32(a);
+}
+SIMD_INLINE v256 v256_unpack_s16_s32(v128 a) {
+  return c_v256_unpack_s16_s32(a);
+}
+SIMD_INLINE v256 v256_unpacklo_u16_s32(v256 a) {
+  return c_v256_unpacklo_u16_s32(a);
+}
+SIMD_INLINE v256 v256_unpacklo_s16_s32(v256 a) {
+  return c_v256_unpacklo_s16_s32(a);
+}
+SIMD_INLINE v256 v256_unpackhi_u16_s32(v256 a) {
+  return c_v256_unpackhi_u16_s32(a);
+}
+SIMD_INLINE v256 v256_unpackhi_s16_s32(v256 a) {
+  return c_v256_unpackhi_s16_s32(a);
+}
+SIMD_INLINE v256 v256_shuffle_8(v256 a, v256 pattern) {
+  return c_v256_shuffle_8(a, pattern);
+}
+SIMD_INLINE v256 v256_pshuffle_8(v256 a, v256 pattern) {
+  return c_v256_pshuffle_8(a, pattern);
+}
+
+SIMD_INLINE v256 v256_cmpgt_s8(v256 a, v256 b) { return c_v256_cmpgt_s8(a, b); }
+SIMD_INLINE v256 v256_cmplt_s8(v256 a, v256 b) { return c_v256_cmplt_s8(a, b); }
+SIMD_INLINE v256 v256_cmpeq_8(v256 a, v256 b) { return c_v256_cmpeq_8(a, b); }
+SIMD_INLINE v256 v256_cmpgt_s16(v256 a, v256 b) {
+  return c_v256_cmpgt_s16(a, b);
+}
+SIMD_INLINE v256 v256_cmplt_s16(v256 a, v256 b) {
+  return c_v256_cmplt_s16(a, b);
+}
+SIMD_INLINE v256 v256_cmpeq_16(v256 a, v256 b) { return c_v256_cmpeq_16(a, b); }
+
+SIMD_INLINE v256 v256_shl_8(v256 a, unsigned int c) {
+  return c_v256_shl_8(a, c);
+}
+SIMD_INLINE v256 v256_shr_u8(v256 a, unsigned int c) {
+  return c_v256_shr_u8(a, c);
+}
+SIMD_INLINE v256 v256_shr_s8(v256 a, unsigned int c) {
+  return c_v256_shr_s8(a, c);
+}
+SIMD_INLINE v256 v256_shl_16(v256 a, unsigned int c) {
+  return c_v256_shl_16(a, c);
+}
+SIMD_INLINE v256 v256_shr_u16(v256 a, unsigned int c) {
+  return c_v256_shr_u16(a, c);
+}
+SIMD_INLINE v256 v256_shr_s16(v256 a, unsigned int c) {
+  return c_v256_shr_s16(a, c);
+}
+SIMD_INLINE v256 v256_shl_32(v256 a, unsigned int c) {
+  return c_v256_shl_32(a, c);
+}
+SIMD_INLINE v256 v256_shr_u32(v256 a, unsigned int c) {
+  return c_v256_shr_u32(a, c);
+}
+SIMD_INLINE v256 v256_shr_s32(v256 a, unsigned int c) {
+  return c_v256_shr_s32(a, c);
+}
+
+SIMD_INLINE v256 v256_shr_n_byte(v256 a, const unsigned int n) {
+  return c_v256_shr_n_byte(a, n);
+}
+SIMD_INLINE v256 v256_shl_n_byte(v256 a, const unsigned int n) {
+  return c_v256_shl_n_byte(a, n);
+}
+SIMD_INLINE v256 v256_shl_n_8(v256 a, const unsigned int n) {
+  return c_v256_shl_n_8(a, n);
+}
+SIMD_INLINE v256 v256_shl_n_16(v256 a, const unsigned int n) {
+  return c_v256_shl_n_16(a, n);
+}
+SIMD_INLINE v256 v256_shl_n_32(v256 a, const unsigned int n) {
+  return c_v256_shl_n_32(a, n);
+}
+SIMD_INLINE v256 v256_shr_n_u8(v256 a, const unsigned int n) {
+  return c_v256_shr_n_u8(a, n);
+}
+SIMD_INLINE v256 v256_shr_n_u16(v256 a, const unsigned int n) {
+  return c_v256_shr_n_u16(a, n);
+}
+SIMD_INLINE v256 v256_shr_n_u32(v256 a, const unsigned int n) {
+  return c_v256_shr_n_u32(a, n);
+}
+SIMD_INLINE v256 v256_shr_n_s8(v256 a, const unsigned int n) {
+  return c_v256_shr_n_s8(a, n);
+}
+SIMD_INLINE v256 v256_shr_n_s16(v256 a, const unsigned int n) {
+  return c_v256_shr_n_s16(a, n);
+}
+SIMD_INLINE v256 v256_shr_n_s32(v256 a, const unsigned int n) {
+  return c_v256_shr_n_s32(a, n);
+}
+
+#endif /* _V256_INTRINSICS_H */
diff --git a/aom_dsp/simd/v256_intrinsics_arm.h b/aom_dsp/simd/v256_intrinsics_arm.h
new file mode 100644
index 0000000..ba4ed71
--- /dev/null
+++ b/aom_dsp/simd/v256_intrinsics_arm.h
@@ -0,0 +1,17 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef _V256_INTRINSICS_H
+#define _V256_INTRINSICS_H
+
+#include "./v256_intrinsics_v128.h"
+
+#endif /* _V256_INTRINSICS_H */
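
There is no NEON-specific 256-bit implementation yet; this stub simply defers
to v256_intrinsics_v128.h, which carries a v256 as a pair of v128s. For
contrast, a hypothetical native variant would pair q-registers directly
(illustration only, not part of this patch):

    #include <arm_neon.h>

    /* Hypothetical: a v256 held as two NEON q-registers. */
    typedef struct { uint8x16_t lo, hi; } v256_pair;

    static v256_pair v256_pair_add_8(v256_pair a, v256_pair b) {
      v256_pair r;
      r.lo = vaddq_u8(a.lo, b.lo); /* per-byte add, low 128 bits */
      r.hi = vaddq_u8(a.hi, b.hi); /* per-byte add, high 128 bits */
      return r;
    }
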
diff --git a/aom_dsp/simd/v256_intrinsics_c.h b/aom_dsp/simd/v256_intrinsics_c.h
new file mode 100644
index 0000000..8a67f9e
--- /dev/null
+++ b/aom_dsp/simd/v256_intrinsics_c.h
@@ -0,0 +1,701 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef _V256_INTRINSICS_C_H
+#define _V256_INTRINSICS_C_H
+
+#include <stdio.h>
+#include <stdlib.h>
+#include "./v128_intrinsics_c.h"
+#include "./aom_config.h"
+
+typedef union {
+  uint8_t u8[32];
+  uint16_t u16[16];
+  uint32_t u32[8];
+  uint64_t u64[4];
+  int8_t s8[32];
+  int16_t s16[16];
+  int32_t s32[8];
+  int64_t s64[4];
+  c_v64 v64[4];
+  c_v128 v128[2];
+} c_v256;
+
+SIMD_INLINE uint32_t c_v256_low_u32(c_v256 a) { return a.u32[0]; }
+
+SIMD_INLINE c_v64 c_v256_low_v64(c_v256 a) { return a.v64[0]; }
+
+SIMD_INLINE c_v128 c_v256_low_v128(c_v256 a) { return a.v128[0]; }
+
+SIMD_INLINE c_v128 c_v256_high_v128(c_v256 a) { return a.v128[1]; }
+
+SIMD_INLINE c_v256 c_v256_from_v128(c_v128 hi, c_v128 lo) {
+  c_v256 t;
+  t.v128[1] = hi;
+  t.v128[0] = lo;
+  return t;
+}
+
+SIMD_INLINE c_v256 c_v256_from_64(uint64_t a, uint64_t b, uint64_t c,
+                                  uint64_t d) {
+  c_v256 t;
+  t.u64[3] = a;
+  t.u64[2] = b;
+  t.u64[1] = c;
+  t.u64[0] = d;
+  return t;
+}
+
+SIMD_INLINE c_v256 c_v256_from_v64(c_v64 a, c_v64 b, c_v64 c, c_v64 d) {
+  c_v256 t;
+  t.u64[3] = a.u64;
+  t.u64[2] = b.u64;
+  t.u64[1] = c.u64;
+  t.u64[0] = d.u64;
+  return t;
+}
+
+SIMD_INLINE c_v256 c_v256_load_unaligned(const void *p) {
+  c_v256 t;
+  uint8_t *pp = (uint8_t *)p;
+  uint8_t *q = (uint8_t *)&t;
+  int c;
+  for (c = 0; c < 32; c++) q[c] = pp[c];
+  return t;
+}
+
+SIMD_INLINE c_v256 c_v256_load_aligned(const void *p) {
+  if (simd_check && (uintptr_t)p & 31) {
+    fprintf(stderr, "Error: unaligned v256 load at %p\n", p);
+    abort();
+  }
+  return c_v256_load_unaligned(p);
+}
+
+SIMD_INLINE void c_v256_store_unaligned(void *p, c_v256 a) {
+  uint8_t *pp = (uint8_t *)p;
+  uint8_t *q = (uint8_t *)&a;
+  int c;
+  for (c = 0; c < 32; c++) pp[c] = q[c];
+}
+
+SIMD_INLINE void c_v256_store_aligned(void *p, c_v256 a) {
+  if (simd_check && (uintptr_t)p & 31) {
+    fprintf(stderr, "Error: unaligned v256 store at %p\n", p);
+    abort();
+  }
+  c_v256_store_unaligned(p, a);
+}
+
+SIMD_INLINE c_v256 c_v256_zero() {
+  c_v256 t;
+  t.u64[3] = t.u64[2] = t.u64[1] = t.u64[0] = 0;
+  return t;
+}
+
+SIMD_INLINE c_v256 c_v256_dup_8(uint8_t x) {
+  c_v256 t;
+  t.v64[3] = t.v64[2] = t.v64[1] = t.v64[0] = c_v64_dup_8(x);
+  return t;
+}
+
+SIMD_INLINE c_v256 c_v256_dup_16(uint16_t x) {
+  c_v256 t;
+  t.v64[3] = t.v64[2] = t.v64[1] = t.v64[0] = c_v64_dup_16(x);
+  return t;
+}
+
+SIMD_INLINE c_v256 c_v256_dup_32(uint32_t x) {
+  c_v256 t;
+  t.v64[3] = t.v64[2] = t.v64[1] = t.v64[0] = c_v64_dup_32(x);
+  return t;
+}
+
+SIMD_INLINE int64_t c_v256_dotp_s16(c_v256 a, c_v256 b) {
+  return c_v128_dotp_s16(a.v128[1], b.v128[1]) +
+         c_v128_dotp_s16(a.v128[0], b.v128[0]);
+}
+
+SIMD_INLINE uint64_t c_v256_hadd_u8(c_v256 a) {
+  return c_v128_hadd_u8(a.v128[1]) + c_v128_hadd_u8(a.v128[0]);
+}
+
+typedef uint32_t c_sad256_internal;
+
+SIMD_INLINE c_sad256_internal c_v256_sad_u8_init() { return 0; }
+
+/* Implementation dependent return value.  Result must be finalised with
+   v256_sad_u8_sum().
+   The result for more than 16 v256_sad_u8() calls is undefined. */
+SIMD_INLINE c_sad256_internal c_v256_sad_u8(c_sad256_internal s, c_v256 a,
+                                            c_v256 b) {
+  int c;
+  for (c = 0; c < 32; c++)
+    s += a.u8[c] > b.u8[c] ? a.u8[c] - b.u8[c] : b.u8[c] - a.u8[c];
+  return s;
+}
+
+SIMD_INLINE uint32_t c_v256_sad_u8_sum(c_sad256_internal s) { return s; }
+
+typedef uint32_t c_ssd256_internal;
+
+SIMD_INLINE c_ssd256_internal c_v256_ssd_u8_init() { return 0; }
+
+/* Implementation dependent return value.  Result must be finalised with
+ * v256_ssd_u8_sum(). */
+SIMD_INLINE c_ssd256_internal c_v256_ssd_u8(c_ssd256_internal s, c_v256 a,
+                                            c_v256 b) {
+  int c;
+  for (c = 0; c < 32; c++) s += (a.u8[c] - b.u8[c]) * (a.u8[c] - b.u8[c]);
+  return s;
+}
+
+SIMD_INLINE uint32_t c_v256_ssd_u8_sum(c_ssd256_internal s) { return s; }
+
+SIMD_INLINE c_v256 c_v256_or(c_v256 a, c_v256 b) {
+  return c_v256_from_v128(c_v128_or(a.v128[1], b.v128[1]),
+                          c_v128_or(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_xor(c_v256 a, c_v256 b) {
+  return c_v256_from_v128(c_v128_xor(a.v128[1], b.v128[1]),
+                          c_v128_xor(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_and(c_v256 a, c_v256 b) {
+  return c_v256_from_v128(c_v128_and(a.v128[1], b.v128[1]),
+                          c_v128_and(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_andn(c_v256 a, c_v256 b) {
+  return c_v256_from_v128(c_v128_andn(a.v128[1], b.v128[1]),
+                          c_v128_andn(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_add_8(c_v256 a, c_v256 b) {
+  return c_v256_from_v128(c_v128_add_8(a.v128[1], b.v128[1]),
+                          c_v128_add_8(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_add_16(c_v256 a, c_v256 b) {
+  return c_v256_from_v128(c_v128_add_16(a.v128[1], b.v128[1]),
+                          c_v128_add_16(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_sadd_s16(c_v256 a, c_v256 b) {
+  return c_v256_from_v128(c_v128_sadd_s16(a.v128[1], b.v128[1]),
+                          c_v128_sadd_s16(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_add_32(c_v256 a, c_v256 b) {
+  return c_v256_from_v128(c_v128_add_32(a.v128[1], b.v128[1]),
+                          c_v128_add_32(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_padd_s16(c_v256 a) {
+  c_v256 t;
+  t.s32[0] = (int32_t)a.s16[0] + (int32_t)a.s16[1];
+  t.s32[1] = (int32_t)a.s16[2] + (int32_t)a.s16[3];
+  t.s32[2] = (int32_t)a.s16[4] + (int32_t)a.s16[5];
+  t.s32[3] = (int32_t)a.s16[6] + (int32_t)a.s16[7];
+  t.s32[4] = (int32_t)a.s16[8] + (int32_t)a.s16[9];
+  t.s32[5] = (int32_t)a.s16[10] + (int32_t)a.s16[11];
+  t.s32[6] = (int32_t)a.s16[12] + (int32_t)a.s16[13];
+  t.s32[7] = (int32_t)a.s16[14] + (int32_t)a.s16[15];
+  return t;
+}
+
+SIMD_INLINE c_v256 c_v256_sub_8(c_v256 a, c_v256 b) {
+  return c_v256_from_v128(c_v128_sub_8(a.v128[1], b.v128[1]),
+                          c_v128_sub_8(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_ssub_u8(c_v256 a, c_v256 b) {
+  return c_v256_from_v128(c_v128_ssub_u8(a.v128[1], b.v128[1]),
+                          c_v128_ssub_u8(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_ssub_s8(c_v256 a, c_v256 b) {
+  return c_v256_from_v128(c_v128_ssub_s8(a.v128[1], b.v128[1]),
+                          c_v128_ssub_s8(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_sub_16(c_v256 a, c_v256 b) {
+  return c_v256_from_v128(c_v128_sub_16(a.v128[1], b.v128[1]),
+                          c_v128_sub_16(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_ssub_s16(c_v256 a, c_v256 b) {
+  return c_v256_from_v128(c_v128_ssub_s16(a.v128[1], b.v128[1]),
+                          c_v128_ssub_s16(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_sub_32(c_v256 a, c_v256 b) {
+  return c_v256_from_v128(c_v128_sub_32(a.v128[1], b.v128[1]),
+                          c_v128_sub_32(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_abs_s16(c_v256 a) {
+  return c_v256_from_v128(c_v128_abs_s16(a.v128[1]), c_v128_abs_s16(a.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_mul_s16(c_v128 a, c_v128 b) {
+  c_v128 lo_bits = c_v128_mullo_s16(a, b);
+  c_v128 hi_bits = c_v128_mulhi_s16(a, b);
+  return c_v256_from_v128(c_v128_ziphi_16(hi_bits, lo_bits),
+                          c_v128_ziplo_16(hi_bits, lo_bits));
+}
+
+SIMD_INLINE c_v256 c_v256_mullo_s16(c_v256 a, c_v256 b) {
+  return c_v256_from_v128(c_v128_mullo_s16(a.v128[1], b.v128[1]),
+                          c_v128_mullo_s16(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_mulhi_s16(c_v256 a, c_v256 b) {
+  return c_v256_from_v128(c_v128_mulhi_s16(a.v128[1], b.v128[1]),
+                          c_v128_mulhi_s16(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_mullo_s32(c_v256 a, c_v256 b) {
+  return c_v256_from_v128(c_v128_mullo_s32(a.v128[1], b.v128[1]),
+                          c_v128_mullo_s32(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_madd_s16(c_v256 a, c_v256 b) {
+  return c_v256_from_v128(c_v128_madd_s16(a.v128[1], b.v128[1]),
+                          c_v128_madd_s16(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_madd_us8(c_v256 a, c_v256 b) {
+  return c_v256_from_v128(c_v128_madd_us8(a.v128[1], b.v128[1]),
+                          c_v128_madd_us8(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_avg_u8(c_v256 a, c_v256 b) {
+  return c_v256_from_v128(c_v128_avg_u8(a.v128[1], b.v128[1]),
+                          c_v128_avg_u8(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_rdavg_u8(c_v256 a, c_v256 b) {
+  return c_v256_from_v128(c_v128_rdavg_u8(a.v128[1], b.v128[1]),
+                          c_v128_rdavg_u8(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_avg_u16(c_v256 a, c_v256 b) {
+  return c_v256_from_v128(c_v128_avg_u16(a.v128[1], b.v128[1]),
+                          c_v128_avg_u16(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_min_u8(c_v256 a, c_v256 b) {
+  return c_v256_from_v128(c_v128_min_u8(a.v128[1], b.v128[1]),
+                          c_v128_min_u8(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_max_u8(c_v256 a, c_v256 b) {
+  return c_v256_from_v128(c_v128_max_u8(a.v128[1], b.v128[1]),
+                          c_v128_max_u8(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_min_s8(c_v256 a, c_v256 b) {
+  return c_v256_from_v128(c_v128_min_s8(a.v128[1], b.v128[1]),
+                          c_v128_min_s8(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_max_s8(c_v256 a, c_v256 b) {
+  return c_v256_from_v128(c_v128_max_s8(a.v128[1], b.v128[1]),
+                          c_v128_max_s8(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_min_s16(c_v256 a, c_v256 b) {
+  return c_v256_from_v128(c_v128_min_s16(a.v128[1], b.v128[1]),
+                          c_v128_min_s16(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_max_s16(c_v256 a, c_v256 b) {
+  return c_v256_from_v128(c_v128_max_s16(a.v128[1], b.v128[1]),
+                          c_v128_max_s16(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_ziplo_8(c_v256 a, c_v256 b) {
+  return c_v256_from_v128(c_v128_ziphi_8(a.v128[0], b.v128[0]),
+                          c_v128_ziplo_8(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_ziphi_8(c_v256 a, c_v256 b) {
+  return c_v256_from_v128(c_v128_ziphi_8(a.v128[1], b.v128[1]),
+                          c_v128_ziplo_8(a.v128[1], b.v128[1]));
+}
+
+SIMD_INLINE c_v256 c_v256_ziplo_16(c_v256 a, c_v256 b) {
+  return c_v256_from_v128(c_v128_ziphi_16(a.v128[0], b.v128[0]),
+                          c_v128_ziplo_16(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_ziphi_16(c_v256 a, c_v256 b) {
+  return c_v256_from_v128(c_v128_ziphi_16(a.v128[1], b.v128[1]),
+                          c_v128_ziplo_16(a.v128[1], b.v128[1]));
+}
+
+SIMD_INLINE c_v256 c_v256_ziplo_32(c_v256 a, c_v256 b) {
+  return c_v256_from_v128(c_v128_ziphi_32(a.v128[0], b.v128[0]),
+                          c_v128_ziplo_32(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_ziphi_32(c_v256 a, c_v256 b) {
+  return c_v256_from_v128(c_v128_ziphi_32(a.v128[1], b.v128[1]),
+                          c_v128_ziplo_32(a.v128[1], b.v128[1]));
+}
+
+SIMD_INLINE c_v256 c_v256_ziplo_64(c_v256 a, c_v256 b) {
+  return c_v256_from_v128(c_v128_ziphi_64(a.v128[0], b.v128[0]),
+                          c_v128_ziplo_64(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_ziphi_64(c_v256 a, c_v256 b) {
+  return c_v256_from_v128(c_v128_ziphi_64(a.v128[1], b.v128[1]),
+                          c_v128_ziplo_64(a.v128[1], b.v128[1]));
+}
+
+SIMD_INLINE c_v256 c_v256_ziplo_128(c_v256 a, c_v256 b) {
+  return c_v256_from_v128(a.v128[0], b.v128[0]);
+}
+
+SIMD_INLINE c_v256 c_v256_ziphi_128(c_v256 a, c_v256 b) {
+  return c_v256_from_v128(a.v128[1], b.v128[1]);
+}
+
+SIMD_INLINE c_v256 c_v256_zip_8(c_v128 a, c_v128 b) {
+  return c_v256_from_v128(c_v128_ziphi_8(a, b), c_v128_ziplo_8(a, b));
+}
+
+SIMD_INLINE c_v256 c_v256_zip_16(c_v128 a, c_v128 b) {
+  return c_v256_from_v128(c_v128_ziphi_16(a, b), c_v128_ziplo_16(a, b));
+}
+
+SIMD_INLINE c_v256 c_v256_zip_32(c_v128 a, c_v128 b) {
+  return c_v256_from_v128(c_v128_ziphi_32(a, b), c_v128_ziplo_32(a, b));
+}
+
+SIMD_INLINE c_v256 _c_v256_unzip_8(c_v256 a, c_v256 b, int mode) {
+  c_v256 t;
+  int i;
+  if (mode) {
+    for (i = 0; i < 16; i++) {
+      t.u8[i] = a.u8[i * 2 + 1];
+      t.u8[i + 16] = b.u8[i * 2 + 1];
+    }
+  } else {
+    for (i = 0; i < 16; i++) {
+      t.u8[i] = b.u8[i * 2];
+      t.u8[i + 16] = a.u8[i * 2];
+    }
+  }
+  return t;
+}
+
+SIMD_INLINE c_v256 c_v256_unziplo_8(c_v256 a, c_v256 b) {
+  return CONFIG_BIG_ENDIAN ? _c_v256_unzip_8(a, b, 1)
+                           : _c_v256_unzip_8(a, b, 0);
+}
+
+SIMD_INLINE c_v256 c_v256_unziphi_8(c_v256 a, c_v256 b) {
+  return CONFIG_BIG_ENDIAN ? _c_v256_unzip_8(b, a, 0)
+                           : _c_v256_unzip_8(b, a, 1);
+}
+
+SIMD_INLINE c_v256 _c_v256_unzip_16(c_v256 a, c_v256 b, int mode) {
+  c_v256 t;
+  int i;
+  if (mode) {
+    for (i = 0; i < 8; i++) {
+      t.u16[i] = a.u16[i * 2 + 1];
+      t.u16[i + 8] = b.u16[i * 2 + 1];
+    }
+  } else {
+    for (i = 0; i < 8; i++) {
+      t.u16[i] = b.u16[i * 2];
+      t.u16[i + 8] = a.u16[i * 2];
+    }
+  }
+  return t;
+}
+
+SIMD_INLINE c_v256 c_v256_unziplo_16(c_v256 a, c_v256 b) {
+  return CONFIG_BIG_ENDIAN ? _c_v256_unzip_16(a, b, 1)
+                           : _c_v256_unzip_16(a, b, 0);
+}
+
+SIMD_INLINE c_v256 c_v256_unziphi_16(c_v256 a, c_v256 b) {
+  return CONFIG_BIG_ENDIAN ? _c_v256_unzip_16(b, a, 0)
+                           : _c_v256_unzip_16(b, a, 1);
+}
+
+SIMD_INLINE c_v256 _c_v256_unzip_32(c_v256 a, c_v256 b, int mode) {
+  c_v256 t;
+  if (mode) {
+    t.u32[7] = b.u32[7];
+    t.u32[6] = b.u32[5];
+    t.u32[5] = b.u32[3];
+    t.u32[4] = b.u32[1];
+    t.u32[3] = a.u32[7];
+    t.u32[2] = a.u32[5];
+    t.u32[1] = a.u32[3];
+    t.u32[0] = a.u32[1];
+  } else {
+    t.u32[7] = a.u32[6];
+    t.u32[6] = a.u32[4];
+    t.u32[5] = a.u32[2];
+    t.u32[4] = a.u32[0];
+    t.u32[3] = b.u32[6];
+    t.u32[2] = b.u32[4];
+    t.u32[1] = b.u32[2];
+    t.u32[0] = b.u32[0];
+  }
+  return t;
+}
+
+SIMD_INLINE c_v256 c_v256_unziplo_32(c_v256 a, c_v256 b) {
+  return CONFIG_BIG_ENDIAN ? _c_v256_unzip_32(a, b, 1)
+                           : _c_v256_unzip_32(a, b, 0);
+}
+
+SIMD_INLINE c_v256 c_v256_unziphi_32(c_v256 a, c_v256 b) {
+  return CONFIG_BIG_ENDIAN ? _c_v256_unzip_32(b, a, 0)
+                           : _c_v256_unzip_32(b, a, 1);
+}
+
+SIMD_INLINE c_v256 c_v256_unpack_u8_s16(c_v128 a) {
+  return c_v256_from_v128(c_v128_unpackhi_u8_s16(a), c_v128_unpacklo_u8_s16(a));
+}
+
+SIMD_INLINE c_v256 c_v256_unpacklo_u8_s16(c_v256 a) {
+  return c_v256_from_v128(c_v128_unpackhi_u8_s16(a.v128[0]),
+                          c_v128_unpacklo_u8_s16(a.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_unpackhi_u8_s16(c_v256 a) {
+  return c_v256_from_v128(c_v128_unpackhi_u8_s16(a.v128[1]),
+                          c_v128_unpacklo_u8_s16(a.v128[1]));
+}
+
+SIMD_INLINE c_v256 c_v256_pack_s32_s16(c_v256 a, c_v256 b) {
+  return c_v256_from_v128(c_v128_pack_s32_s16(a.v128[1], a.v128[0]),
+                          c_v128_pack_s32_s16(b.v128[1], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_pack_s16_u8(c_v256 a, c_v256 b) {
+  return c_v256_from_v128(c_v128_pack_s16_u8(a.v128[1], a.v128[0]),
+                          c_v128_pack_s16_u8(b.v128[1], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_pack_s16_s8(c_v256 a, c_v256 b) {
+  return c_v256_from_v128(c_v128_pack_s16_s8(a.v128[1], a.v128[0]),
+                          c_v128_pack_s16_s8(b.v128[1], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_unpack_u16_s32(c_v128 a) {
+  return c_v256_from_v128(c_v128_unpackhi_u16_s32(a),
+                          c_v128_unpacklo_u16_s32(a));
+}
+
+SIMD_INLINE c_v256 c_v256_unpack_s16_s32(c_v128 a) {
+  return c_v256_from_v128(c_v128_unpackhi_s16_s32(a),
+                          c_v128_unpacklo_s16_s32(a));
+}
+
+SIMD_INLINE c_v256 c_v256_unpacklo_u16_s32(c_v256 a) {
+  return c_v256_from_v128(c_v128_unpackhi_u16_s32(a.v128[0]),
+                          c_v128_unpacklo_u16_s32(a.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_unpacklo_s16_s32(c_v256 a) {
+  return c_v256_from_v128(c_v128_unpackhi_s16_s32(a.v128[0]),
+                          c_v128_unpacklo_s16_s32(a.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_unpackhi_u16_s32(c_v256 a) {
+  return c_v256_from_v128(c_v128_unpackhi_u16_s32(a.v128[1]),
+                          c_v128_unpacklo_u16_s32(a.v128[1]));
+}
+
+SIMD_INLINE c_v256 c_v256_unpackhi_s16_s32(c_v256 a) {
+  return c_v256_from_v128(c_v128_unpackhi_s16_s32(a.v128[1]),
+                          c_v128_unpacklo_s16_s32(a.v128[1]));
+}
+
+SIMD_INLINE c_v256 c_v256_shuffle_8(c_v256 a, c_v256 pattern) {
+  c_v256 t;
+  int c;
+  for (c = 0; c < 32; c++) {
+    if (pattern.u8[c] & ~31) {
+      fprintf(stderr, "Undefined v256_shuffle_8 index %d/%d\n", pattern.u8[c],
+              c);
+      abort();
+    }
+    t.u8[c] = a.u8[CONFIG_BIG_ENDIAN ? 31 - (pattern.u8[c] & 31)
+                                     : pattern.u8[c] & 31];
+  }
+  return t;
+}
+
+// Pairwise / dual-lane shuffle: shuffle two 128-bit lanes.
+SIMD_INLINE c_v256 c_v256_pshuffle_8(c_v256 a, c_v256 pattern) {
+  return c_v256_from_v128(
+      c_v128_shuffle_8(c_v256_high_v128(a), c_v256_high_v128(pattern)),
+      c_v128_shuffle_8(c_v256_low_v128(a), c_v256_low_v128(pattern)));
+}
+
+SIMD_INLINE c_v256 c_v256_cmpgt_s8(c_v256 a, c_v256 b) {
+  return c_v256_from_v128(c_v128_cmpgt_s8(a.v128[1], b.v128[1]),
+                          c_v128_cmpgt_s8(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_cmplt_s8(c_v256 a, c_v256 b) {
+  return c_v256_from_v128(c_v128_cmplt_s8(a.v128[1], b.v128[1]),
+                          c_v128_cmplt_s8(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_cmpeq_8(c_v256 a, c_v256 b) {
+  return c_v256_from_v128(c_v128_cmpeq_8(a.v128[1], b.v128[1]),
+                          c_v128_cmpeq_8(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_cmpgt_s16(c_v256 a, c_v256 b) {
+  return c_v256_from_v128(c_v128_cmpgt_s16(a.v128[1], b.v128[1]),
+                          c_v128_cmpgt_s16(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_cmplt_s16(c_v256 a, c_v256 b) {
+  return c_v256_from_v128(c_v128_cmplt_s16(a.v128[1], b.v128[1]),
+                          c_v128_cmplt_s16(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_cmpeq_16(c_v256 a, c_v256 b) {
+  return c_v256_from_v128(c_v128_cmpeq_16(a.v128[1], b.v128[1]),
+                          c_v128_cmpeq_16(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_shl_n_byte(c_v256 a, const unsigned int n) {
+  if (n < 16)
+    return c_v256_from_v128(c_v128_or(c_v128_shl_n_byte(a.v128[1], n),
+                                      c_v128_shr_n_byte(a.v128[0], 16 - n)),
+                            c_v128_shl_n_byte(a.v128[0], n));
+  else if (n > 16)
+    return c_v256_from_v128(c_v128_shl_n_byte(a.v128[0], n - 16),
+                            c_v128_zero());
+  else
+    return c_v256_from_v128(c_v256_low_v128(a), c_v128_zero());
+}
+
+SIMD_INLINE c_v256 c_v256_shr_n_byte(c_v256 a, const unsigned int n) {
+  if (n < 16)
+    return c_v256_from_v128(c_v128_shr_n_byte(a.v128[1], n),
+                            c_v128_or(c_v128_shr_n_byte(a.v128[0], n),
+                                      c_v128_shl_n_byte(a.v128[1], 16 - n)));
+  else if (n > 16)
+    return c_v256_from_v128(c_v128_zero(),
+                            c_v128_shr_n_byte(a.v128[1], n - 16));
+  else
+    return c_v256_from_v128(c_v128_zero(), c_v256_high_v128(a));
+}
+
+SIMD_INLINE c_v256 c_v256_align(c_v256 a, c_v256 b, const unsigned int c) {
+  if (simd_check && c > 31) {
+    fprintf(stderr, "Error: undefined alignment %d\n", c);
+    abort();
+  }
+  return c ? c_v256_or(c_v256_shr_n_byte(b, c), c_v256_shl_n_byte(a, 32 - c))
+           : b;
+}
+
+SIMD_INLINE c_v256 c_v256_shl_8(c_v256 a, const unsigned int c) {
+  return c_v256_from_v128(c_v128_shl_8(a.v128[1], c),
+                          c_v128_shl_8(a.v128[0], c));
+}
+
+SIMD_INLINE c_v256 c_v256_shr_u8(c_v256 a, const unsigned int c) {
+  return c_v256_from_v128(c_v128_shr_u8(a.v128[1], c),
+                          c_v128_shr_u8(a.v128[0], c));
+}
+
+SIMD_INLINE c_v256 c_v256_shr_s8(c_v256 a, const unsigned int c) {
+  return c_v256_from_v128(c_v128_shr_s8(a.v128[1], c),
+                          c_v128_shr_s8(a.v128[0], c));
+}
+
+SIMD_INLINE c_v256 c_v256_shl_16(c_v256 a, const unsigned int c) {
+  return c_v256_from_v128(c_v128_shl_16(a.v128[1], c),
+                          c_v128_shl_16(a.v128[0], c));
+}
+
+SIMD_INLINE c_v256 c_v256_shr_u16(c_v256 a, const unsigned int c) {
+  return c_v256_from_v128(c_v128_shr_u16(a.v128[1], c),
+                          c_v128_shr_u16(a.v128[0], c));
+}
+
+SIMD_INLINE c_v256 c_v256_shr_s16(c_v256 a, const unsigned int c) {
+  return c_v256_from_v128(c_v128_shr_s16(a.v128[1], c),
+                          c_v128_shr_s16(a.v128[0], c));
+}
+
+SIMD_INLINE c_v256 c_v256_shl_32(c_v256 a, const unsigned int c) {
+  return c_v256_from_v128(c_v128_shl_32(a.v128[1], c),
+                          c_v128_shl_32(a.v128[0], c));
+}
+
+SIMD_INLINE c_v256 c_v256_shr_u32(c_v256 a, const unsigned int c) {
+  return c_v256_from_v128(c_v128_shr_u32(a.v128[1], c),
+                          c_v128_shr_u32(a.v128[0], c));
+}
+
+SIMD_INLINE c_v256 c_v256_shr_s32(c_v256 a, const unsigned int c) {
+  return c_v256_from_v128(c_v128_shr_s32(a.v128[1], c),
+                          c_v128_shr_s32(a.v128[0], c));
+}
+
+SIMD_INLINE c_v256 c_v256_shl_n_8(c_v256 a, const unsigned int n) {
+  return c_v256_shl_8(a, n);
+}
+
+SIMD_INLINE c_v256 c_v256_shl_n_16(c_v256 a, const unsigned int n) {
+  return c_v256_shl_16(a, n);
+}
+
+SIMD_INLINE c_v256 c_v256_shl_n_32(c_v256 a, const unsigned int n) {
+  return c_v256_shl_32(a, n);
+}
+
+SIMD_INLINE c_v256 c_v256_shr_n_u8(c_v256 a, const unsigned int n) {
+  return c_v256_shr_u8(a, n);
+}
+
+SIMD_INLINE c_v256 c_v256_shr_n_u16(c_v256 a, const unsigned int n) {
+  return c_v256_shr_u16(a, n);
+}
+
+SIMD_INLINE c_v256 c_v256_shr_n_u32(c_v256 a, const unsigned int n) {
+  return c_v256_shr_u32(a, n);
+}
+
+SIMD_INLINE c_v256 c_v256_shr_n_s8(c_v256 a, const unsigned int n) {
+  return c_v256_shr_s8(a, n);
+}
+
+SIMD_INLINE c_v256 c_v256_shr_n_s16(c_v256 a, const unsigned int n) {
+  return c_v256_shr_s16(a, n);
+}
+
+SIMD_INLINE c_v256 c_v256_shr_n_s32(c_v256 a, const unsigned int n) {
+  return c_v256_shr_s32(a, n);
+}
+
+#endif /* _V256_INTRINSICS_C_H */
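
Most helpers in this file just apply the c_v128 operation to each half; the
byte shifts are the main place the halves interact, ORing the bytes shifted
out of one half into the other. Their reference semantics in flat form
(sketch, little-endian view, n in 0..31):

    #include <stdint.h>
    #include <string.h>

    /* Flat model of c_v256_shl_n_byte: result byte i takes input byte
       i - n.  The three-way branch in the header reproduces this with
       two 128-bit halves plus an OR for the carried bytes. */
    static void shl_n_byte_ref(uint8_t r[32], const uint8_t a[32],
                               unsigned n) {
      memset(r, 0, 32);
      for (unsigned i = n; i < 32; i++) r[i] = a[i - n];
    }
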
diff --git a/aom_dsp/simd/v256_intrinsics_v128.h b/aom_dsp/simd/v256_intrinsics_v128.h
new file mode 100644
index 0000000..93cccce
--- /dev/null
+++ b/aom_dsp/simd/v256_intrinsics_v128.h
@@ -0,0 +1,525 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef _V256_INTRINSICS_V128_H
+#define _V256_INTRINSICS_V128_H
+
+#if HAVE_NEON
+#include "./v128_intrinsics_arm.h"
+#elif HAVE_SSE2
+#include "./v128_intrinsics_x86.h"
+#else
+#include "./v128_intrinsics.h"
+#endif
+
+typedef struct { v128 lo, hi; } v256;
+
+SIMD_INLINE uint32_t v256_low_u32(v256 a) { return v128_low_u32(a.lo); }
+
+SIMD_INLINE v64 v256_low_v64(v256 a) { return v128_low_v64(a.lo); }
+
+SIMD_INLINE v128 v256_low_v128(v256 a) { return a.lo; }
+
+SIMD_INLINE v128 v256_high_v128(v256 a) { return a.hi; }
+
+SIMD_INLINE v256 v256_from_v128(v128 hi, v128 lo) {
+  v256 t;
+  t.hi = hi;
+  t.lo = lo;
+  return t;
+}
+
+SIMD_INLINE v256 v256_from_64(uint64_t a, uint64_t b, uint64_t c, uint64_t d) {
+  return v256_from_v128(v128_from_64(a, b), v128_from_64(c, d));
+}
+
+SIMD_INLINE v256 v256_from_v64(v64 a, v64 b, v64 c, v64 d) {
+  return v256_from_v128(v128_from_v64(a, b), v128_from_v64(c, d));
+}
+
+SIMD_INLINE v256 v256_load_unaligned(const void *p) {
+  return v256_from_v128(v128_load_unaligned((uint8_t *)p + 16),
+                        v128_load_unaligned(p));
+}
+
+SIMD_INLINE v256 v256_load_aligned(const void *p) {
+  return v256_from_v128(v128_load_aligned((uint8_t *)p + 16),
+                        v128_load_aligned(p));
+}
+
+SIMD_INLINE void v256_store_unaligned(void *p, v256 a) {
+  v128_store_unaligned(p, a.lo);
+  v128_store_unaligned((uint8_t *)p + 16, a.hi);
+}
+
+SIMD_INLINE void v256_store_aligned(void *p, v256 a) {
+  v128_store_aligned(p, a.lo);
+  v128_store_aligned((uint8_t *)p + 16, a.hi);
+}
+
+SIMD_INLINE v256 v256_zero() {
+  return v256_from_v128(v128_zero(), v128_zero());
+}
+
+SIMD_INLINE v256 v256_dup_8(uint8_t x) {
+  v128 t = v128_dup_8(x);
+  return v256_from_v128(t, t);
+}
+
+SIMD_INLINE v256 v256_dup_16(uint16_t x) {
+  v128 t = v128_dup_16(x);
+  return v256_from_v128(t, t);
+}
+
+SIMD_INLINE v256 v256_dup_32(uint32_t x) {
+  v128 t = v128_dup_32(x);
+  return v256_from_v128(t, t);
+}
+
+SIMD_INLINE int64_t v256_dotp_s16(v256 a, v256 b) {
+  return v128_dotp_s16(a.hi, b.hi) + v128_dotp_s16(a.lo, b.lo);
+}
+
+SIMD_INLINE uint64_t v256_hadd_u8(v256 a) {
+  return v128_hadd_u8(a.hi) + v128_hadd_u8(a.lo);
+}
+
+typedef struct {
+  sad128_internal hi;
+  sad128_internal lo;
+} sad256_internal;
+
+SIMD_INLINE sad256_internal v256_sad_u8_init() {
+  sad256_internal t;
+  t.hi = v128_sad_u8_init();
+  t.lo = v128_sad_u8_init();
+  return t;
+}
+
+/* Implementation dependent return value.  Result must be finalised with
+   v256_sad_u8_sum().
+   The result for more than 16 v256_sad_u8() calls is undefined. */
+SIMD_INLINE sad256_internal v256_sad_u8(sad256_internal s, v256 a, v256 b) {
+  sad256_internal t;
+  t.hi = v128_sad_u8(s.hi, a.hi, b.hi);
+  t.lo = v128_sad_u8(s.lo, a.lo, b.lo);
+  return t;
+}
+
+SIMD_INLINE uint32_t v256_sad_u8_sum(sad256_internal s) {
+  return v128_sad_u8_sum(s.hi) + v128_sad_u8_sum(s.lo);
+}
+
+typedef struct {
+  ssd128_internal hi;
+  ssd128_internal lo;
+} ssd256_internal;
+
+SIMD_INLINE ssd256_internal v256_ssd_u8_init() {
+  ssd256_internal t;
+  t.hi = v128_ssd_u8_init();
+  t.lo = v128_ssd_u8_init();
+  return t;
+}
+
+/* Implementation dependent return value.  Result must be finalised with
+ * v256_ssd_u8_sum(). */
+SIMD_INLINE ssd256_internal v256_ssd_u8(ssd256_internal s, v256 a, v256 b) {
+  ssd256_internal t;
+  t.hi = v128_ssd_u8(s.hi, a.hi, b.hi);
+  t.lo = v128_ssd_u8(s.lo, a.lo, b.lo);
+  return t;
+}
+
+SIMD_INLINE uint32_t v256_ssd_u8_sum(ssd256_internal s) {
+  return v128_ssd_u8_sum(s.hi) + v128_ssd_u8_sum(s.lo);
+}
+
+SIMD_INLINE v256 v256_or(v256 a, v256 b) {
+  return v256_from_v128(v128_or(a.hi, b.hi), v128_or(a.lo, b.lo));
+}
+
+SIMD_INLINE v256 v256_xor(v256 a, v256 b) {
+  return v256_from_v128(v128_xor(a.hi, b.hi), v128_xor(a.lo, b.lo));
+}
+
+SIMD_INLINE v256 v256_and(v256 a, v256 b) {
+  return v256_from_v128(v128_and(a.hi, b.hi), v128_and(a.lo, b.lo));
+}
+
+SIMD_INLINE v256 v256_andn(v256 a, v256 b) {
+  return v256_from_v128(v128_andn(a.hi, b.hi), v128_andn(a.lo, b.lo));
+}
+
+SIMD_INLINE v256 v256_add_8(v256 a, v256 b) {
+  return v256_from_v128(v128_add_8(a.hi, b.hi), v128_add_8(a.lo, b.lo));
+}
+
+SIMD_INLINE v256 v256_add_16(v256 a, v256 b) {
+  return v256_from_v128(v128_add_16(a.hi, b.hi), v128_add_16(a.lo, b.lo));
+}
+
+SIMD_INLINE v256 v256_sadd_s16(v256 a, v256 b) {
+  return v256_from_v128(v128_sadd_s16(a.hi, b.hi), v128_sadd_s16(a.lo, b.lo));
+}
+
+SIMD_INLINE v256 v256_add_32(v256 a, v256 b) {
+  return v256_from_v128(v128_add_32(a.hi, b.hi), v128_add_32(a.lo, b.lo));
+}
+
+SIMD_INLINE v256 v256_padd_s16(v256 a) {
+  return v256_from_v128(v128_padd_s16(a.hi), v128_padd_s16(a.lo));
+}
+
+SIMD_INLINE v256 v256_sub_8(v256 a, v256 b) {
+  return v256_from_v128(v128_sub_8(a.hi, b.hi), v128_sub_8(a.lo, b.lo));
+}
+
+SIMD_INLINE v256 v256_ssub_u8(v256 a, v256 b) {
+  return v256_from_v128(v128_ssub_u8(a.hi, b.hi), v128_ssub_u8(a.lo, b.lo));
+}
+
+SIMD_INLINE v256 v256_ssub_s8(v256 a, v256 b) {
+  return v256_from_v128(v128_ssub_s8(a.hi, b.hi), v128_ssub_s8(a.lo, b.lo));
+}
+
+SIMD_INLINE v256 v256_sub_16(v256 a, v256 b) {
+  return v256_from_v128(v128_sub_16(a.hi, b.hi), v128_sub_16(a.lo, b.lo));
+}
+
+SIMD_INLINE v256 v256_ssub_s16(v256 a, v256 b) {
+  return v256_from_v128(v128_ssub_s16(a.hi, b.hi), v128_ssub_s16(a.lo, b.lo));
+}
+
+SIMD_INLINE v256 v256_sub_32(v256 a, v256 b) {
+  return v256_from_v128(v128_sub_32(a.hi, b.hi), v128_sub_32(a.lo, b.lo));
+}
+
+SIMD_INLINE v256 v256_abs_s16(v256 a) {
+  return v256_from_v128(v128_abs_s16(a.hi), v128_abs_s16(a.lo));
+}
+
+SIMD_INLINE v256 v256_mul_s16(v128 a, v128 b) {
+  v128 lo_bits = v128_mullo_s16(a, b);
+  v128 hi_bits = v128_mulhi_s16(a, b);
+  return v256_from_v128(v128_ziphi_16(hi_bits, lo_bits),
+                        v128_ziplo_16(hi_bits, lo_bits));
+}
+
+SIMD_INLINE v256 v256_mullo_s16(v256 a, v256 b) {
+  return v256_from_v128(v128_mullo_s16(a.hi, b.hi), v128_mullo_s16(a.lo, b.lo));
+}
+
+SIMD_INLINE v256 v256_mulhi_s16(v256 a, v256 b) {
+  return v256_from_v128(v128_mulhi_s16(a.hi, b.hi), v128_mulhi_s16(a.lo, b.lo));
+}
+
+SIMD_INLINE v256 v256_mullo_s32(v256 a, v256 b) {
+  return v256_from_v128(v128_mullo_s32(a.hi, b.hi), v128_mullo_s32(a.lo, b.lo));
+}
+
+SIMD_INLINE v256 v256_madd_s16(v256 a, v256 b) {
+  return v256_from_v128(v128_madd_s16(a.hi, b.hi), v128_madd_s16(a.lo, b.lo));
+}
+
+SIMD_INLINE v256 v256_madd_us8(v256 a, v256 b) {
+  return v256_from_v128(v128_madd_us8(a.hi, b.hi), v128_madd_us8(a.lo, b.lo));
+}
+
+SIMD_INLINE v256 v256_avg_u8(v256 a, v256 b) {
+  return v256_from_v128(v128_avg_u8(a.hi, b.hi), v128_avg_u8(a.lo, b.lo));
+}
+
+SIMD_INLINE v256 v256_rdavg_u8(v256 a, v256 b) {
+  return v256_from_v128(v128_rdavg_u8(a.hi, b.hi), v128_rdavg_u8(a.lo, b.lo));
+}
+
+SIMD_INLINE v256 v256_avg_u16(v256 a, v256 b) {
+  return v256_from_v128(v128_avg_u16(a.hi, b.hi), v128_avg_u16(a.lo, b.lo));
+}
+
+SIMD_INLINE v256 v256_min_u8(v256 a, v256 b) {
+  return v256_from_v128(v128_min_u8(a.hi, b.hi), v128_min_u8(a.lo, b.lo));
+}
+
+SIMD_INLINE v256 v256_max_u8(v256 a, v256 b) {
+  return v256_from_v128(v128_max_u8(a.hi, b.hi), v128_max_u8(a.lo, b.lo));
+}
+
+SIMD_INLINE v256 v256_min_s8(v256 a, v256 b) {
+  return v256_from_v128(v128_min_s8(a.hi, b.hi), v128_min_s8(a.lo, b.lo));
+}
+
+SIMD_INLINE v256 v256_max_s8(v256 a, v256 b) {
+  return v256_from_v128(v128_max_s8(a.hi, b.hi), v128_max_s8(a.lo, b.lo));
+}
+
+SIMD_INLINE v256 v256_min_s16(v256 a, v256 b) {
+  return v256_from_v128(v128_min_s16(a.hi, b.hi), v128_min_s16(a.lo, b.lo));
+}
+
+SIMD_INLINE v256 v256_max_s16(v256 a, v256 b) {
+  return v256_from_v128(v128_max_s16(a.hi, b.hi), v128_max_s16(a.lo, b.lo));
+}
+
+SIMD_INLINE v256 v256_ziplo_8(v256 a, v256 b) {
+  return v256_from_v128(v128_ziphi_8(a.lo, b.lo), v128_ziplo_8(a.lo, b.lo));
+}
+
+SIMD_INLINE v256 v256_ziphi_8(v256 a, v256 b) {
+  return v256_from_v128(v128_ziphi_8(a.hi, b.hi), v128_ziplo_8(a.hi, b.hi));
+}
+
+SIMD_INLINE v256 v256_ziplo_16(v256 a, v256 b) {
+  return v256_from_v128(v128_ziphi_16(a.lo, b.lo), v128_ziplo_16(a.lo, b.lo));
+}
+
+SIMD_INLINE v256 v256_ziphi_16(v256 a, v256 b) {
+  return v256_from_v128(v128_ziphi_16(a.hi, b.hi), v128_ziplo_16(a.hi, b.hi));
+}
+
+SIMD_INLINE v256 v256_ziplo_32(v256 a, v256 b) {
+  return v256_from_v128(v128_ziphi_32(a.lo, b.lo), v128_ziplo_32(a.lo, b.lo));
+}
+
+SIMD_INLINE v256 v256_ziphi_32(v256 a, v256 b) {
+  return v256_from_v128(v128_ziphi_32(a.hi, b.hi), v128_ziplo_32(a.hi, b.hi));
+}
+
+SIMD_INLINE v256 v256_ziplo_64(v256 a, v256 b) {
+  return v256_from_v128(v128_ziphi_64(a.lo, b.lo), v128_ziplo_64(a.lo, b.lo));
+}
+
+SIMD_INLINE v256 v256_ziphi_64(v256 a, v256 b) {
+  return v256_from_v128(v128_ziphi_64(a.hi, b.hi), v128_ziplo_64(a.hi, b.hi));
+}
+
+SIMD_INLINE v256 v256_ziplo_128(v256 a, v256 b) {
+  return v256_from_v128(a.lo, b.lo);
+}
+
+SIMD_INLINE v256 v256_ziphi_128(v256 a, v256 b) {
+  return v256_from_v128(a.hi, b.hi);
+}
+
+SIMD_INLINE v256 v256_zip_8(v128 a, v128 b) {
+  return v256_from_v128(v128_ziphi_8(a, b), v128_ziplo_8(a, b));
+}
+
+SIMD_INLINE v256 v256_zip_16(v128 a, v128 b) {
+  return v256_from_v128(v128_ziphi_16(a, b), v128_ziplo_16(a, b));
+}
+
+SIMD_INLINE v256 v256_zip_32(v128 a, v128 b) {
+  return v256_from_v128(v128_ziphi_32(a, b), v128_ziplo_32(a, b));
+}
+
+SIMD_INLINE v256 v256_unziplo_8(v256 a, v256 b) {
+  return v256_from_v128(v128_unziplo_8(a.hi, a.lo), v128_unziplo_8(b.hi, b.lo));
+}
+
+SIMD_INLINE v256 v256_unziphi_8(v256 a, v256 b) {
+  return v256_from_v128(v128_unziphi_8(a.hi, a.lo), v128_unziphi_8(b.hi, b.lo));
+}
+
+SIMD_INLINE v256 v256_unziplo_16(v256 a, v256 b) {
+  return v256_from_v128(v128_unziplo_16(a.hi, a.lo),
+                        v128_unziplo_16(b.hi, b.lo));
+}
+
+SIMD_INLINE v256 v256_unziphi_16(v256 a, v256 b) {
+  return v256_from_v128(v128_unziphi_16(a.hi, a.lo),
+                        v128_unziphi_16(b.hi, b.lo));
+}
+
+SIMD_INLINE v256 v256_unziplo_32(v256 a, v256 b) {
+  return v256_from_v128(v128_unziplo_32(a.hi, a.lo),
+                        v128_unziplo_32(b.hi, b.lo));
+}
+
+SIMD_INLINE v256 v256_unziphi_32(v256 a, v256 b) {
+  return v256_from_v128(v128_unziphi_32(a.hi, a.lo),
+                        v128_unziphi_32(b.hi, b.lo));
+}
+
+SIMD_INLINE v256 v256_unpack_u8_s16(v128 a) {
+  return v256_from_v128(v128_unpackhi_u8_s16(a), v128_unpacklo_u8_s16(a));
+}
+
+SIMD_INLINE v256 v256_unpacklo_u8_s16(v256 a) {
+  return v256_from_v128(v128_unpackhi_u8_s16(a.lo), v128_unpacklo_u8_s16(a.lo));
+}
+
+SIMD_INLINE v256 v256_unpackhi_u8_s16(v256 a) {
+  return v256_from_v128(v128_unpackhi_u8_s16(a.hi), v128_unpacklo_u8_s16(a.hi));
+}
+
+SIMD_INLINE v256 v256_pack_s32_s16(v256 a, v256 b) {
+  return v256_from_v128(v128_pack_s32_s16(a.hi, a.lo),
+                        v128_pack_s32_s16(b.hi, b.lo));
+}
+
+SIMD_INLINE v256 v256_pack_s16_u8(v256 a, v256 b) {
+  return v256_from_v128(v128_pack_s16_u8(a.hi, a.lo),
+                        v128_pack_s16_u8(b.hi, b.lo));
+}
+
+SIMD_INLINE v256 v256_pack_s16_s8(v256 a, v256 b) {
+  return v256_from_v128(v128_pack_s16_s8(a.hi, a.lo),
+                        v128_pack_s16_s8(b.hi, b.lo));
+}
+
+SIMD_INLINE v256 v256_unpack_u16_s32(v128 a) {
+  return v256_from_v128(v128_unpackhi_u16_s32(a), v128_unpacklo_u16_s32(a));
+}
+
+SIMD_INLINE v256 v256_unpack_s16_s32(v128 a) {
+  return v256_from_v128(v128_unpackhi_s16_s32(a), v128_unpacklo_s16_s32(a));
+}
+
+SIMD_INLINE v256 v256_unpacklo_u16_s32(v256 a) {
+  return v256_from_v128(v128_unpackhi_u16_s32(a.lo),
+                        v128_unpacklo_u16_s32(a.lo));
+}
+
+SIMD_INLINE v256 v256_unpacklo_s16_s32(v256 a) {
+  return v256_from_v128(v128_unpackhi_s16_s32(a.lo),
+                        v128_unpacklo_s16_s32(a.lo));
+}
+
+SIMD_INLINE v256 v256_unpackhi_u16_s32(v256 a) {
+  return v256_from_v128(v128_unpackhi_u16_s32(a.hi),
+                        v128_unpacklo_u16_s32(a.hi));
+}
+
+SIMD_INLINE v256 v256_unpackhi_s16_s32(v256 a) {
+  return v256_from_v128(v128_unpackhi_s16_s32(a.hi),
+                        v128_unpacklo_s16_s32(a.hi));
+}
+
+SIMD_INLINE v256 v256_shuffle_8(v256 a, v256 pattern) {
+  v128 c16 = v128_dup_8(16);
+  v128 maskhi = v128_cmplt_s8(pattern.hi, c16);
+  v128 masklo = v128_cmplt_s8(pattern.lo, c16);
+  return v256_from_v128(
+      v128_or(
+          v128_and(v128_shuffle_8(a.lo, pattern.hi), maskhi),
+          v128_andn(v128_shuffle_8(a.hi, v128_sub_8(pattern.hi, c16)), maskhi)),
+      v128_or(v128_and(v128_shuffle_8(a.lo, pattern.lo), masklo),
+              v128_andn(v128_shuffle_8(a.hi, v128_sub_8(pattern.lo, c16)),
+                        masklo)));
+}
+
+SIMD_INLINE v256 v256_pshuffle_8(v256 a, v256 pattern) {
+  return v256_from_v128(
+      v128_shuffle_8(v256_high_v128(a), v256_high_v128(pattern)),
+      v128_shuffle_8(v256_low_v128(a), v256_low_v128(pattern)));
+}
+
+SIMD_INLINE v256 v256_cmpgt_s8(v256 a, v256 b) {
+  return v256_from_v128(v128_cmpgt_s8(a.hi, b.hi), v128_cmpgt_s8(a.lo, b.lo));
+}
+
+SIMD_INLINE v256 v256_cmplt_s8(v256 a, v256 b) {
+  return v256_from_v128(v128_cmplt_s8(a.hi, b.hi), v128_cmplt_s8(a.lo, b.lo));
+}
+
+SIMD_INLINE v256 v256_cmpeq_8(v256 a, v256 b) {
+  return v256_from_v128(v128_cmpeq_8(a.hi, b.hi), v128_cmpeq_8(a.lo, b.lo));
+}
+
+SIMD_INLINE v256 v256_cmpgt_s16(v256 a, v256 b) {
+  return v256_from_v128(v128_cmpgt_s16(a.hi, b.hi), v128_cmpgt_s16(a.lo, b.lo));
+}
+
+SIMD_INLINE v256 v256_cmplt_s16(v256 a, v256 b) {
+  return v256_from_v128(v128_cmplt_s16(a.hi, b.hi), v128_cmplt_s16(a.lo, b.lo));
+}
+
+SIMD_INLINE v256 v256_cmpeq_16(v256 a, v256 b) {
+  return v256_from_v128(v128_cmpeq_16(a.hi, b.hi), v128_cmpeq_16(a.lo, b.lo));
+}
+
+SIMD_INLINE v256 v256_shl_8(v256 a, const unsigned int c) {
+  return v256_from_v128(v128_shl_8(a.hi, c), v128_shl_8(a.lo, c));
+}
+
+SIMD_INLINE v256 v256_shr_u8(v256 a, const unsigned int c) {
+  return v256_from_v128(v128_shr_u8(a.hi, c), v128_shr_u8(a.lo, c));
+}
+
+SIMD_INLINE v256 v256_shr_s8(v256 a, const unsigned int c) {
+  return v256_from_v128(v128_shr_s8(a.hi, c), v128_shr_s8(a.lo, c));
+}
+
+SIMD_INLINE v256 v256_shl_16(v256 a, const unsigned int c) {
+  return v256_from_v128(v128_shl_16(a.hi, c), v128_shl_16(a.lo, c));
+}
+
+SIMD_INLINE v256 v256_shr_u16(v256 a, const unsigned int c) {
+  return v256_from_v128(v128_shr_u16(a.hi, c), v128_shr_u16(a.lo, c));
+}
+
+SIMD_INLINE v256 v256_shr_s16(v256 a, const unsigned int c) {
+  return v256_from_v128(v128_shr_s16(a.hi, c), v128_shr_s16(a.lo, c));
+}
+
+SIMD_INLINE v256 v256_shl_32(v256 a, const unsigned int c) {
+  return v256_from_v128(v128_shl_32(a.hi, c), v128_shl_32(a.lo, c));
+}
+
+SIMD_INLINE v256 v256_shr_u32(v256 a, const unsigned int c) {
+  return v256_from_v128(v128_shr_u32(a.hi, c), v128_shr_u32(a.lo, c));
+}
+
+SIMD_INLINE v256 v256_shr_s32(v256 a, const unsigned int c) {
+  return v256_from_v128(v128_shr_s32(a.hi, c), v128_shr_s32(a.lo, c));
+}
+
+/* These intrinsics require immediate values, so we must use #defines
+   to enforce that. */
+#define v256_shl_n_byte(a, n)                                                 \
+  ((n) < 16 ? v256_from_v128(v128_or(v128_shl_n_byte(a.hi, n),                \
+                                     v128_shr_n_byte(a.lo, 16 - (n))),        \
+                             v128_shl_n_byte(a.lo, (n)))                      \
+            : v256_from_v128((n) > 16 ? v128_shl_n_byte(a.lo, (n)-16) : a.lo, \
+                             v128_zero()))
+
+#define v256_shr_n_byte(a, n)                                          \
+  ((n) < 16 ? v256_from_v128(v128_shr_n_byte(a.hi, n),                 \
+                             v128_or(v128_shr_n_byte(a.lo, n),         \
+                                     v128_shl_n_byte(a.hi, 16 - (n)))) \
+            : v256_from_v128(v128_zero(),                              \
+                             (n) > 16 ? v128_shr_n_byte(a.hi, (n)-16) : a.hi))
+
+#define v256_align(a, b, c) \
+  ((c) ? v256_or(v256_shr_n_byte(b, c), v256_shl_n_byte(a, 32 - (c))) : b)
+
+#define v256_shl_n_8(a, n) \
+  v256_from_v128(v128_shl_n_8(a.hi, n), v128_shl_n_8(a.lo, n))
+#define v256_shl_n_16(a, n) \
+  v256_from_v128(v128_shl_n_16(a.hi, n), v128_shl_n_16(a.lo, n))
+#define v256_shl_n_32(a, n) \
+  v256_from_v128(v128_shl_n_32(a.hi, n), v128_shl_n_32(a.lo, n))
+#define v256_shr_n_u8(a, n) \
+  v256_from_v128(v128_shr_n_u8(a.hi, n), v128_shr_n_u8(a.lo, n))
+#define v256_shr_n_u16(a, n) \
+  v256_from_v128(v128_shr_n_u16(a.hi, n), v128_shr_n_u16(a.lo, n))
+#define v256_shr_n_u32(a, n) \
+  v256_from_v128(v128_shr_n_u32(a.hi, n), v128_shr_n_u32(a.lo, n))
+#define v256_shr_n_s8(a, n) \
+  v256_from_v128(v128_shr_n_s8(a.hi, n), v128_shr_n_s8(a.lo, n))
+#define v256_shr_n_s16(a, n) \
+  v256_from_v128(v128_shr_n_s16(a.hi, n), v128_shr_n_s16(a.lo, n))
+#define v256_shr_n_s32(a, n) \
+  v256_from_v128(v128_shr_n_s32(a.hi, n), v128_shr_n_s32(a.lo, n))
+
+#endif /* _V256_INTRINSICS_V128_H */
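Editor's note: the subtle part of the v128-pair emulation above is the cross-half carry in v256_shl_n_byte/v256_shr_n_byte: for 0 < n < 16, the top n bytes of the low half must reappear at the bottom of the high half. A minimal scalar model of the left-shift case, with plain byte arrays standing in for the SIMD types (the names here are illustrative, not part of the API):

#include <stdint.h>
#include <string.h>

typedef struct { uint8_t lo[16], hi[16]; } model_v256;  /* little-endian */

/* Models v256_shl_n_byte(a, n) for 0 < n < 16. */
static model_v256 model_shl_n_byte(model_v256 a, int n) {
  model_v256 r;
  memset(&r, 0, sizeof(r));
  memcpy(r.lo + n, a.lo, 16 - n);  /* v128_shl_n_byte(a.lo, n) */
  memcpy(r.hi + n, a.hi, 16 - n);  /* v128_shl_n_byte(a.hi, n) */
  memcpy(r.hi, a.lo + 16 - n, n);  /* v128_shr_n_byte(a.lo, 16 - n), OR'd in */
  return r;
}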
diff --git a/aom_dsp/simd/v256_intrinsics_x86.h b/aom_dsp/simd/v256_intrinsics_x86.h
new file mode 100644
index 0000000..b5bdb53
--- /dev/null
+++ b/aom_dsp/simd/v256_intrinsics_x86.h
@@ -0,0 +1,528 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef _V256_INTRINSICS_H
+#define _V256_INTRINSICS_H
+
+#if !defined(__AVX2__)
+
+#include "./v256_intrinsics_v128.h"
+
+#else
+
+// The __m256i type seems to cause problems for g++'s name mangling prior
+// to version 5, but adding -fabi-version=0 fixes this.
+#if defined(__GNUC__) && !defined(__clang__) && __GNUC__ < 5 && \
+    defined(__cplusplus)
+#pragma GCC optimize "-fabi-version=0"
+#endif
+
+#include <immintrin.h>
+#include "./v128_intrinsics_x86.h"
+
+typedef __m256i v256;
+
+SIMD_INLINE uint32_t v256_low_u32(v256 a) {
+  return (uint32_t)_mm_cvtsi128_si32(_mm256_extracti128_si256(a, 0));
+}
+
+SIMD_INLINE v64 v256_low_v64(v256 a) {
+  return _mm_unpacklo_epi64(_mm256_extracti128_si256(a, 0), v64_zero());
+}
+
+SIMD_INLINE v128 v256_low_v128(v256 a) {
+  return _mm256_extracti128_si256(a, 0);
+}
+
+SIMD_INLINE v128 v256_high_v128(v256 a) {
+  return _mm256_extracti128_si256(a, 1);
+}
+
+SIMD_INLINE v256 v256_from_v128(v128 a, v128 b) {
+  // gcc seems to be missing _mm256_set_m128i()
+  return _mm256_insertf128_si256(
+      _mm256_insertf128_si256(_mm256_setzero_si256(), b, 0), a, 1);
+}
+
+SIMD_INLINE v256 v256_from_v64(v64 a, v64 b, v64 c, v64 d) {
+  return v256_from_v128(v128_from_v64(a, b), v128_from_v64(c, d));
+}
+
+SIMD_INLINE v256 v256_from_64(uint64_t a, uint64_t b, uint64_t c, uint64_t d) {
+  return v256_from_v128(v128_from_64(a, b), v128_from_64(c, d));
+}
+
+SIMD_INLINE v256 v256_load_aligned(const void *p) {
+  return _mm256_load_si256((const __m256i *)p);
+}
+
+SIMD_INLINE v256 v256_load_unaligned(const void *p) {
+  return _mm256_loadu_si256((const __m256i *)p);
+}
+
+SIMD_INLINE void v256_store_aligned(void *p, v256 a) {
+  _mm256_store_si256((__m256i *)p, a);
+}
+
+SIMD_INLINE void v256_store_unaligned(void *p, v256 a) {
+  _mm256_storeu_si256((__m256i *)p, a);
+}
+
+SIMD_INLINE v256 v256_zero() { return _mm256_setzero_si256(); }
+
+SIMD_INLINE v256 v256_dup_8(uint8_t x) { return _mm256_set1_epi8(x); }
+
+SIMD_INLINE v256 v256_dup_16(uint16_t x) { return _mm256_set1_epi16(x); }
+
+SIMD_INLINE v256 v256_dup_32(uint32_t x) { return _mm256_set1_epi32(x); }
+
+SIMD_INLINE v256 v256_add_8(v256 a, v256 b) { return _mm256_add_epi8(a, b); }
+
+SIMD_INLINE v256 v256_add_16(v256 a, v256 b) { return _mm256_add_epi16(a, b); }
+
+SIMD_INLINE v256 v256_sadd_s16(v256 a, v256 b) {
+  return _mm256_adds_epi16(a, b);
+}
+
+SIMD_INLINE v256 v256_add_32(v256 a, v256 b) { return _mm256_add_epi32(a, b); }
+
+SIMD_INLINE v256 v256_padd_s16(v256 a) {
+  return _mm256_madd_epi16(a, _mm256_set1_epi16(1));
+}
+
+SIMD_INLINE v256 v256_sub_8(v256 a, v256 b) { return _mm256_sub_epi8(a, b); }
+
+SIMD_INLINE v256 v256_ssub_u8(v256 a, v256 b) { return _mm256_subs_epu8(a, b); }
+
+SIMD_INLINE v256 v256_ssub_s8(v256 a, v256 b) { return _mm256_subs_epi8(a, b); }
+
+SIMD_INLINE v256 v256_sub_16(v256 a, v256 b) { return _mm256_sub_epi16(a, b); }
+
+SIMD_INLINE v256 v256_ssub_s16(v256 a, v256 b) {
+  return _mm256_subs_epi16(a, b);
+}
+
+SIMD_INLINE v256 v256_sub_32(v256 a, v256 b) { return _mm256_sub_epi32(a, b); }
+
+SIMD_INLINE v256 v256_abs_s16(v256 a) { return _mm256_abs_epi16(a); }
+
+// AVX2 lacks direct intrinsics to zip/unzip the 8-, 16- and 32-bit lanes
+// of the lower or upper half of a 256-bit vector: its unpack/pack
+// intrinsics treat the 256-bit input as two independent 128-bit vectors,
+// so the halves are extracted, combined with v128 operations and rejoined.
+SIMD_INLINE v256 v256_ziplo_8(v256 a, v256 b) {
+  return v256_from_v128(v128_ziphi_8(v256_low_v128(a), v256_low_v128(b)),
+                        v128_ziplo_8(v256_low_v128(a), v256_low_v128(b)));
+}
+
+SIMD_INLINE v256 v256_ziphi_8(v256 a, v256 b) {
+  return v256_from_v128(v128_ziphi_8(v256_high_v128(a), v256_high_v128(b)),
+                        v128_ziplo_8(v256_high_v128(a), v256_high_v128(b)));
+}
+
+SIMD_INLINE v256 v256_ziplo_16(v256 a, v256 b) {
+  return v256_from_v128(v128_ziphi_16(v256_low_v128(a), v256_low_v128(b)),
+                        v128_ziplo_16(v256_low_v128(a), v256_low_v128(b)));
+}
+
+SIMD_INLINE v256 v256_ziphi_16(v256 a, v256 b) {
+  return v256_from_v128(v128_ziphi_16(v256_high_v128(a), v256_high_v128(b)),
+                        v128_ziplo_16(v256_high_v128(a), v256_high_v128(b)));
+}
+
+SIMD_INLINE v256 v256_ziplo_32(v256 a, v256 b) {
+  return v256_from_v128(v128_ziphi_32(v256_low_v128(a), v256_low_v128(b)),
+                        v128_ziplo_32(v256_low_v128(a), v256_low_v128(b)));
+}
+
+SIMD_INLINE v256 v256_ziphi_32(v256 a, v256 b) {
+  return v256_from_v128(v128_ziphi_32(v256_high_v128(a), v256_high_v128(b)),
+                        v128_ziplo_32(v256_high_v128(a), v256_high_v128(b)));
+}
+
+SIMD_INLINE v256 v256_ziplo_64(v256 a, v256 b) {
+  return v256_from_v128(v128_ziphi_64(v256_low_v128(a), v256_low_v128(b)),
+                        v128_ziplo_64(v256_low_v128(a), v256_low_v128(b)));
+}
+
+SIMD_INLINE v256 v256_ziphi_64(v256 a, v256 b) {
+  return v256_from_v128(v128_ziphi_64(v256_high_v128(a), v256_high_v128(b)),
+                        v128_ziplo_64(v256_high_v128(a), v256_high_v128(b)));
+}
+
+SIMD_INLINE v256 v256_ziplo_128(v256 a, v256 b) {
+  return v256_from_v128(v256_low_v128(a), v256_low_v128(b));
+}
+
+SIMD_INLINE v256 v256_ziphi_128(v256 a, v256 b) {
+  return v256_from_v128(v256_high_v128(a), v256_high_v128(b));
+}
+
+SIMD_INLINE v256 v256_zip_8(v128 a, v128 b) {
+  return v256_from_v128(v128_ziphi_8(a, b), v128_ziplo_8(a, b));
+}
+
+SIMD_INLINE v256 v256_zip_16(v128 a, v128 b) {
+  return v256_from_v128(v128_ziphi_16(a, b), v128_ziplo_16(a, b));
+}
+
+SIMD_INLINE v256 v256_zip_32(v128 a, v128 b) {
+  return v256_from_v128(v128_ziphi_32(a, b), v128_ziplo_32(a, b));
+}
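Editor's note: the workaround is easier to see with concrete byte positions. A true 256-bit ziplo_8 must produce a0,b0,...,a15,b15, but _mm256_unpacklo_epi8 interleaves the low eight bytes of each 128-bit lane independently, yielding a0,b0,...,a7,b7 in lane 0 and a16,b16,...,a23,b23 in lane 1. A hypothetical single-step alternative (not used by this patch) pre-swizzles the 64-bit quarters so each lane's low half holds the right bytes:

#include <immintrin.h>

/* 0xD8 == _MM_SHUFFLE(3, 1, 2, 0): reorder the qwords as q0,q2,q1,q3, so
   lane 0 holds bytes 0-7 and lane 1 holds bytes 8-15 in its low half. */
static __m256i ziplo_8_via_permute(__m256i a, __m256i b) {
  const __m256i pa = _mm256_permute4x64_epi64(a, 0xD8);
  const __m256i pb = _mm256_permute4x64_epi64(b, 0xD8);
  return _mm256_unpacklo_epi8(pa, pb);  /* a0,b0,a1,b1,...,a15,b15 */
}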
+
+SIMD_INLINE v256 v256_unziplo_8(v256 a, v256 b) {
+  return v256_from_v128(v128_unziplo_8(v256_high_v128(a), v256_low_v128(a)),
+                        v128_unziplo_8(v256_high_v128(b), v256_low_v128(b)));
+}
+
+SIMD_INLINE v256 v256_unziphi_8(v256 a, v256 b) {
+  return v256_from_v128(v128_unziphi_8(v256_high_v128(a), v256_low_v128(a)),
+                        v128_unziphi_8(v256_high_v128(b), v256_low_v128(b)));
+}
+
+SIMD_INLINE v256 v256_unziplo_16(v256 a, v256 b) {
+  return v256_from_v128(v128_unziplo_16(v256_high_v128(a), v256_low_v128(a)),
+                        v128_unziplo_16(v256_high_v128(b), v256_low_v128(b)));
+}
+
+SIMD_INLINE v256 v256_unziphi_16(v256 a, v256 b) {
+  return v256_from_v128(v128_unziphi_16(v256_high_v128(a), v256_low_v128(a)),
+                        v128_unziphi_16(v256_high_v128(b), v256_low_v128(b)));
+}
+
+SIMD_INLINE v256 v256_unziplo_32(v256 a, v256 b) {
+  return v256_from_v128(v128_unziplo_32(v256_high_v128(a), v256_low_v128(a)),
+                        v128_unziplo_32(v256_high_v128(b), v256_low_v128(b)));
+}
+
+SIMD_INLINE v256 v256_unziphi_32(v256 a, v256 b) {
+  return v256_from_v128(v128_unziphi_32(v256_high_v128(a), v256_low_v128(a)),
+                        v128_unziphi_32(v256_high_v128(b), v256_low_v128(b)));
+}
+
+SIMD_INLINE v256 v256_unpack_u8_s16(v128 a) {
+  return v256_from_v128(v128_unpackhi_u8_s16(a), v128_unpacklo_u8_s16(a));
+}
+
+SIMD_INLINE v256 v256_unpacklo_u8_s16(v256 a) {
+  return v256_from_v128(v128_unpackhi_u8_s16(v256_low_v128(a)),
+                        v128_unpacklo_u8_s16(v256_low_v128(a)));
+}
+
+SIMD_INLINE v256 v256_unpackhi_u8_s16(v256 a) {
+  return v256_from_v128(v128_unpackhi_u8_s16(v256_high_v128(a)),
+                        v128_unpacklo_u8_s16(v256_high_v128(a)));
+}
+
+SIMD_INLINE v256 v256_pack_s32_s16(v256 a, v256 b) {
+  return v256_from_v128(v128_pack_s32_s16(v256_high_v128(a), v256_low_v128(a)),
+                        v128_pack_s32_s16(v256_high_v128(b), v256_low_v128(b)));
+}
+
+SIMD_INLINE v256 v256_pack_s16_u8(v256 a, v256 b) {
+  return v256_from_v128(v128_pack_s16_u8(v256_high_v128(a), v256_low_v128(a)),
+                        v128_pack_s16_u8(v256_high_v128(b), v256_low_v128(b)));
+}
+
+SIMD_INLINE v256 v256_pack_s16_s8(v256 a, v256 b) {
+  return v256_from_v128(v128_pack_s16_s8(v256_high_v128(a), v256_low_v128(a)),
+                        v128_pack_s16_s8(v256_high_v128(b), v256_low_v128(b)));
+}
+
+SIMD_INLINE v256 v256_unpack_u16_s32(v128 a) {
+  return v256_from_v128(v128_unpackhi_u16_s32(a), v128_unpacklo_u16_s32(a));
+}
+
+SIMD_INLINE v256 v256_unpack_s16_s32(v128 a) {
+  return v256_from_v128(v128_unpackhi_s16_s32(a), v128_unpacklo_s16_s32(a));
+}
+
+SIMD_INLINE v256 v256_unpacklo_u16_s32(v256 a) {
+  return v256_from_v128(v128_unpackhi_u16_s32(v256_low_v128(a)),
+                        v128_unpacklo_u16_s32(v256_low_v128(a)));
+}
+
+SIMD_INLINE v256 v256_unpacklo_s16_s32(v256 a) {
+  return v256_from_v128(v128_unpackhi_s16_s32(v256_low_v128(a)),
+                        v128_unpacklo_s16_s32(v256_low_v128(a)));
+}
+
+SIMD_INLINE v256 v256_unpackhi_u16_s32(v256 a) {
+  return v256_from_v128(v128_unpackhi_u16_s32(v256_high_v128(a)),
+                        v128_unpacklo_u16_s32(v256_high_v128(a)));
+}
+
+SIMD_INLINE v256 v256_unpackhi_s16_s32(v256 a) {
+  return v256_from_v128(v128_unpackhi_s16_s32(v256_high_v128(a)),
+                        v128_unpacklo_s16_s32(v256_high_v128(a)));
+}
+SIMD_INLINE v256 v256_shuffle_8(v256 a, v256 pattern) {
+  v128 c16 = v128_dup_8(16);
+  v128 hi = v256_high_v128(pattern);
+  v128 lo = v256_low_v128(pattern);
+  v128 maskhi = v128_cmplt_s8(hi, c16);
+  v128 masklo = v128_cmplt_s8(lo, c16);
+  return v256_from_v128(
+      v128_or(v128_and(v128_shuffle_8(v256_low_v128(a), hi), maskhi),
+              v128_andn(v128_shuffle_8(v256_high_v128(a), v128_sub_8(hi, c16)),
+                        maskhi)),
+      v128_or(v128_and(v128_shuffle_8(v256_low_v128(a), lo), masklo),
+              v128_andn(v128_shuffle_8(v256_high_v128(a), v128_sub_8(lo, c16)),
+                        masklo)));
+}
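Editor's note: v256_shuffle_8 builds a full 32-byte table lookup out of two 16-byte shuffles: indices below 16 read the low half of a, indices 16..31 read the high half, and the cmplt_s8 mask selects between the two shuffled results per byte. The selection rule, as a scalar model (illustrative only):

#include <stdint.h>

/* Scalar model of the two-table select in v256_shuffle_8. */
static void model_shuffle_8(const uint8_t a_lo[16], const uint8_t a_hi[16],
                            const uint8_t pattern[32], uint8_t out[32]) {
  for (int i = 0; i < 32; i++) {
    const uint8_t p = pattern[i];
    out[i] = p < 16 ? a_lo[p] : a_hi[p - 16];  /* mask = cmplt_s8(p, 16) */
  }
}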
+
+SIMD_INLINE v256 v256_pshuffle_8(v256 a, v256 pattern) {
+  return _mm256_shuffle_epi8(a, pattern);
+}
+
+SIMD_INLINE int64_t v256_dotp_s16(v256 a, v256 b) {
+  v256 r = _mm256_madd_epi16(a, b);
+#if defined(__x86_64__)
+  v128 t;
+  r = _mm256_add_epi64(_mm256_cvtepi32_epi64(v256_high_v128(r)),
+                       _mm256_cvtepi32_epi64(v256_low_v128(r)));
+  t = v256_low_v128(_mm256_add_epi64(
+      r, _mm256_permute2x128_si256(r, r, _MM_SHUFFLE(2, 0, 0, 1))));
+  return _mm_cvtsi128_si64(_mm_add_epi64(t, _mm_srli_si128(t, 8)));
+#else
+  v128 l = v256_low_v128(r);
+  v128 h = v256_high_v128(r);
+  return (int64_t)_mm_cvtsi128_si32(l) +
+         (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(l, 4)) +
+         (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(l, 8)) +
+         (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(l, 12)) +
+         (int64_t)_mm_cvtsi128_si32(h) +
+         (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(h, 4)) +
+         (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(h, 8)) +
+         (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(h, 12));
+#endif
+}
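Editor's note: the 64-bit detour is needed because sixteen 16-bit products can reach 16 * 2^30, which overflows a 32-bit accumulator; _mm256_madd_epi16 leaves eight 32-bit pair sums that are widened (on x86-64 via _mm256_cvtepi32_epi64) before the final fold. A scalar reference for what the function computes (a sketch, not the SIMD path):

#include <stdint.h>

/* Dot product of the sixteen signed 16-bit lanes of two 256-bit vectors. */
static int64_t dotp_s16_ref(const int16_t a[16], const int16_t b[16]) {
  int64_t sum = 0;
  for (int i = 0; i < 16; i++) sum += (int32_t)a[i] * b[i];
  return sum;
}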
+
+SIMD_INLINE uint64_t v256_hadd_u8(v256 a) {
+  v256 t = _mm256_sad_epu8(a, _mm256_setzero_si256());
+  v128 lo = v256_low_v128(t);
+  v128 hi = v256_high_v128(t);
+  lo = v128_add_32(lo, hi);
+  return v64_low_u32(v128_low_v64(lo)) + v64_low_u32(v128_high_v64(lo));
+}
+
+typedef v256 sad256_internal;
+
+SIMD_INLINE sad256_internal v256_sad_u8_init() {
+  return _mm256_setzero_si256();
+}
+
+/* Implementation dependent return value.  Result must be finalised with
+   v256_sad_u8_sum().
+   The result for more than 32 v256_sad_u8() calls is undefined. */
+SIMD_INLINE sad256_internal v256_sad_u8(sad256_internal s, v256 a, v256 b) {
+  return _mm256_add_epi64(s, _mm256_sad_epu8(a, b));
+}
+
+SIMD_INLINE uint32_t v256_sad_u8_sum(sad256_internal s) {
+  v256 t = _mm256_add_epi32(s, _mm256_unpackhi_epi64(s, s));
+  return v128_low_u32(_mm_add_epi32(v256_high_v128(t), v256_low_v128(t)));
+}
+
+typedef v256 ssd256_internal;
+
+SIMD_INLINE ssd256_internal v256_ssd_u8_init() {
+  return _mm256_setzero_si256();
+}
+
+/* Implementation dependent return value.  Result must be finalised with
+ * v256_ssd_u8_sum(). */
+SIMD_INLINE ssd256_internal v256_ssd_u8(ssd256_internal s, v256 a, v256 b) {
+  v256 l = _mm256_sub_epi16(_mm256_unpacklo_epi8(a, _mm256_setzero_si256()),
+                            _mm256_unpacklo_epi8(b, _mm256_setzero_si256()));
+  v256 h = _mm256_sub_epi16(_mm256_unpackhi_epi8(a, _mm256_setzero_si256()),
+                            _mm256_unpackhi_epi8(b, _mm256_setzero_si256()));
+  v256 rl = _mm256_madd_epi16(l, l);
+  v256 rh = _mm256_madd_epi16(h, h);
+  v128 c = _mm_cvtsi32_si128(32);
+  rl = _mm256_add_epi32(rl, _mm256_srli_si256(rl, 8));
+  rl = _mm256_add_epi32(rl, _mm256_srli_si256(rl, 4));
+  rh = _mm256_add_epi32(rh, _mm256_srli_si256(rh, 8));
+  rh = _mm256_add_epi32(rh, _mm256_srli_si256(rh, 4));
+  return _mm256_add_epi64(
+      s,
+      _mm256_srl_epi64(_mm256_sll_epi64(_mm256_unpacklo_epi64(rl, rh), c), c));
+}
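Editor's note: the sll/srl pair by the `c` register (32) clears the upper 32 bits of each 64-bit lane: after the two in-lane reductions, only the low dword of each qword holds a complete partial SSD, and the stale partial sums above it must not leak into the 64-bit accumulation. Under that reading, an AND mask would be equivalent (a sketch):

#include <immintrin.h>

/* Equivalent upper-half clear for the accumulator update above. */
static __m256i clear_upper32(__m256i x) {
  return _mm256_and_si256(x, _mm256_set1_epi64x(0xffffffffLL));
}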
+
+SIMD_INLINE uint32_t v256_ssd_u8_sum(ssd256_internal s) {
+  v256 t = _mm256_add_epi32(s, _mm256_unpackhi_epi64(s, s));
+  return v128_low_u32(_mm_add_epi32(v256_high_v128(t), v256_low_v128(t)));
+}
+
+SIMD_INLINE v256 v256_or(v256 a, v256 b) { return _mm256_or_si256(a, b); }
+
+SIMD_INLINE v256 v256_xor(v256 a, v256 b) { return _mm256_xor_si256(a, b); }
+
+SIMD_INLINE v256 v256_and(v256 a, v256 b) { return _mm256_and_si256(a, b); }
+
+SIMD_INLINE v256 v256_andn(v256 a, v256 b) { return _mm256_andnot_si256(b, a); }
+
+SIMD_INLINE v256 v256_mul_s16(v128 a, v128 b) {
+  v128 lo_bits = v128_mullo_s16(a, b);
+  v128 hi_bits = v128_mulhi_s16(a, b);
+  return v256_from_v128(v128_ziphi_16(hi_bits, lo_bits),
+                        v128_ziplo_16(hi_bits, lo_bits));
+}
+
+SIMD_INLINE v256 v256_mullo_s16(v256 a, v256 b) {
+  return _mm256_mullo_epi16(a, b);
+}
+
+SIMD_INLINE v256 v256_mulhi_s16(v256 a, v256 b) {
+  return _mm256_mulhi_epi16(a, b);
+}
+
+SIMD_INLINE v256 v256_mullo_s32(v256 a, v256 b) {
+  return _mm256_mullo_epi32(a, b);
+}
+
+SIMD_INLINE v256 v256_madd_s16(v256 a, v256 b) {
+  return _mm256_madd_epi16(a, b);
+}
+
+SIMD_INLINE v256 v256_madd_us8(v256 a, v256 b) {
+  return _mm256_maddubs_epi16(a, b);
+}
+
+SIMD_INLINE v256 v256_avg_u8(v256 a, v256 b) { return _mm256_avg_epu8(a, b); }
+
+SIMD_INLINE v256 v256_rdavg_u8(v256 a, v256 b) {
+  return _mm256_sub_epi8(
+      _mm256_avg_epu8(a, b),
+      _mm256_and_si256(_mm256_xor_si256(a, b), v256_dup_8(1)));
+}
+
+SIMD_INLINE v256 v256_avg_u16(v256 a, v256 b) { return _mm256_avg_epu16(a, b); }
+
+SIMD_INLINE v256 v256_min_u8(v256 a, v256 b) { return _mm256_min_epu8(a, b); }
+
+SIMD_INLINE v256 v256_max_u8(v256 a, v256 b) { return _mm256_max_epu8(a, b); }
+
+SIMD_INLINE v256 v256_min_s8(v256 a, v256 b) { return _mm256_min_epi8(a, b); }
+
+SIMD_INLINE v256 v256_max_s8(v256 a, v256 b) { return _mm256_max_epi8(a, b); }
+
+SIMD_INLINE v256 v256_min_s16(v256 a, v256 b) { return _mm256_min_epi16(a, b); }
+
+SIMD_INLINE v256 v256_max_s16(v256 a, v256 b) { return _mm256_max_epi16(a, b); }
+
+SIMD_INLINE v256 v256_cmpgt_s8(v256 a, v256 b) {
+  return _mm256_cmpgt_epi8(a, b);
+}
+
+SIMD_INLINE v256 v256_cmplt_s8(v256 a, v256 b) {
+  return v256_andn(_mm256_cmpgt_epi8(b, a), _mm256_cmpeq_epi8(b, a));
+}
+
+SIMD_INLINE v256 v256_cmpeq_8(v256 a, v256 b) {
+  return _mm256_cmpeq_epi8(a, b);
+}
+
+SIMD_INLINE v256 v256_cmpgt_s16(v256 a, v256 b) {
+  return _mm256_cmpgt_epi16(a, b);
+}
+
+SIMD_INLINE v256 v256_cmplt_s16(v256 a, v256 b) {
+  return v256_andn(_mm256_cmpgt_epi16(b, a), _mm256_cmpeq_epi16(b, a));
+}
+
+SIMD_INLINE v256 v256_cmpeq_16(v256 a, v256 b) {
+  return _mm256_cmpeq_epi16(a, b);
+}
+
+SIMD_INLINE v256 v256_shl_8(v256 a, unsigned int c) {
+  return _mm256_and_si256(_mm256_set1_epi8((uint8_t)(0xff << c)),
+                          _mm256_sll_epi16(a, _mm_cvtsi32_si128(c)));
+}
+
+SIMD_INLINE v256 v256_shr_u8(v256 a, unsigned int c) {
+  return _mm256_and_si256(_mm256_set1_epi8(0xff >> c),
+                          _mm256_srl_epi16(a, _mm_cvtsi32_si128(c)));
+}
+
+SIMD_INLINE v256 v256_shr_s8(v256 a, unsigned int c) {
+  __m128i x = _mm_cvtsi32_si128(c + 8);
+  return _mm256_packs_epi16(_mm256_sra_epi16(_mm256_unpacklo_epi8(a, a), x),
+                            _mm256_sra_epi16(_mm256_unpackhi_epi8(a, a), x));
+}
+
+SIMD_INLINE v256 v256_shl_16(v256 a, unsigned int c) {
+  return _mm256_sll_epi16(a, _mm_cvtsi32_si128(c));
+}
+
+SIMD_INLINE v256 v256_shr_u16(v256 a, unsigned int c) {
+  return _mm256_srl_epi16(a, _mm_cvtsi32_si128(c));
+}
+
+SIMD_INLINE v256 v256_shr_s16(v256 a, unsigned int c) {
+  return _mm256_sra_epi16(a, _mm_cvtsi32_si128(c));
+}
+
+SIMD_INLINE v256 v256_shl_32(v256 a, unsigned int c) {
+  return _mm256_sll_epi32(a, _mm_cvtsi32_si128(c));
+}
+
+SIMD_INLINE v256 v256_shr_u32(v256 a, unsigned int c) {
+  return _mm256_srl_epi32(a, _mm_cvtsi32_si128(c));
+}
+
+SIMD_INLINE v256 v256_shr_s32(v256 a, unsigned int c) {
+  return _mm256_sra_epi32(a, _mm_cvtsi32_si128(c));
+}
+
+/* These intrinsics require immediate values, so we must use #defines
+   to enforce that. */
+// _mm256_slli_si256 shifts each 128-bit lane independently, so it can't be used
+#define v256_shl_n_byte(a, n)                                                 \
+  ((n) < 16                                                                   \
+       ? v256_from_v128(v128_or(v128_shl_n_byte(v256_high_v128(a), n),        \
+                                v128_shr_n_byte(v256_low_v128(a), 16 - (n))), \
+                        v128_shl_n_byte(v256_low_v128(a), n))                 \
+       : v256_from_v128(v128_shl_n_byte(v256_low_v128(a), (n)-16),            \
+                        v128_zero()))
+
+// _mm256_srli_si256 shifts each 128-bit lane independently, so it can't be used
+#define v256_shr_n_byte(a, n)                                                 \
+  ((n) < 16                                                                   \
+       ? _mm256_alignr_epi8(                                                  \
+             _mm256_permute2x128_si256(a, a, _MM_SHUFFLE(2, 0, 0, 1)), a, n)  \
+       : ((n) > 16                                                            \
+              ? _mm256_srli_si256(                                            \
+                    _mm256_permute2x128_si256(a, a, _MM_SHUFFLE(2, 0, 0, 1)), \
+                    (n)-16)                                                   \
+              : _mm256_permute2x128_si256(a, a, _MM_SHUFFLE(2, 0, 0, 1))))
+
+// _mm256_alignr_epi8 aligns each pair of 128-bit lanes independently, so it
+// can't implement a full 256-bit align on its own
+#define v256_align(a, b, c) \
+  ((c) ? v256_or(v256_shr_n_byte(b, c), v256_shl_n_byte(a, 32 - c)) : b)
+
+#define v256_shl_n_8(a, c)                                   \
+  _mm256_and_si256(_mm256_set1_epi8((uint8_t)(0xff << (c))), \
+                   _mm256_slli_epi16(a, c))
+#define v256_shr_n_u8(a, c) \
+  _mm256_and_si256(_mm256_set1_epi8(0xff >> (c)), _mm256_srli_epi16(a, c))
+#define v256_shr_n_s8(a, c)                                                  \
+  _mm256_packs_epi16(_mm256_srai_epi16(_mm256_unpacklo_epi8(a, a), (c) + 8), \
+                     _mm256_srai_epi16(_mm256_unpackhi_epi8(a, a), (c) + 8))
+#define v256_shl_n_16(a, c) _mm256_slli_epi16(a, c)
+#define v256_shr_n_u16(a, c) _mm256_srli_epi16(a, c)
+#define v256_shr_n_s16(a, c) _mm256_srai_epi16(a, c)
+#define v256_shl_n_32(a, c) _mm256_slli_epi32(a, c)
+#define v256_shr_n_u32(a, c) _mm256_srli_epi32(a, c)
+#define v256_shr_n_s32(a, c) _mm256_srai_epi32(a, c)
+#endif
+
+#endif /* _V256_INTRINSICS_H */
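Editor's note: the lane-crossing byte shift deserves a worked expansion. Since _mm256_srli_si256 shifts its two 128-bit lanes independently, bytes can never cross the middle of the register; v256_shr_n_byte therefore first builds [high lane of a, zero] with the 0x81 permute and lets _mm256_alignr_epi8 pull each lane's spill from its upper neighbour. The n == 4 instance, unrolled (a sketch; alignr needs a compile-time constant, hence the fixed shift):

#include <immintrin.h>

/* v256_shr_n_byte(a, 4), written out.  imm 0x81 selects source lane 1 for
   the low result lane and zeroes the high result lane, giving t = [a.hi, 0]. */
static __m256i shr_4_bytes(__m256i a) {
  const __m256i t = _mm256_permute2x128_si256(a, a, 0x81);
  /* Per lane: (t_lane : a_lane) >> 32 bits, so lane 0 receives the four
     bytes spilled out of lane 1, a genuine 256-bit byte shift. */
  return _mm256_alignr_epi8(t, a, 4);
}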
diff --git a/aom_dsp/x86/convolve.h b/aom_dsp/x86/convolve.h
index ffaed02..87ff34b 100644
--- a/aom_dsp/x86/convolve.h
+++ b/aom_dsp/x86/convolve.h
@@ -27,6 +27,10 @@
       const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,                \
       ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4,          \
       const int16_t *filter_y, int y_step_q4, int w, int h) {                \
+    (void)filter_x;                                                          \
+    (void)x_step_q4;                                                         \
+    (void)filter_y;                                                          \
+    (void)y_step_q4;                                                         \
     assert(filter[3] != 128);                                                \
     assert(step_q4 == 16);                                                   \
     if (filter[0] | filter[1] | filter[2]) {                                 \
diff --git a/aom_scale/generic/aom_scale.c b/aom_scale/generic/aom_scale.c
index 49084db..28604ac 100644
--- a/aom_scale/generic/aom_scale.c
+++ b/aom_scale/generic/aom_scale.c
@@ -68,24 +68,25 @@
                           unsigned int source_scale, unsigned int source_length,
                           unsigned char *dest, int dest_step,
                           unsigned int dest_scale, unsigned int dest_length) {
-  unsigned int i, j;
-  unsigned int temp;
-  int source_pitch = source_step;
+  const int source_pitch = source_step;  // Signed: used as a negative index.
+  const unsigned char *const dest_end = dest + dest_length * dest_step;
   (void)source_length;
   (void)source_scale;
   (void)dest_scale;
 
-  source_step *= 2;
-  dest[0] = source[0];
+  source_step *= 2;  // Every other source sample.
 
-  for (i = dest_step, j = source_step; i < dest_length * dest_step;
-       i += dest_step, j += source_step) {
-    temp = 8;
-    temp += 3 * source[j - source_pitch];
-    temp += 10 * source[j];
-    temp += 3 * source[j + source_pitch];
-    temp >>= 4;
-    dest[i] = (char)(temp);
+  dest[0] = source[0];  // Special case: 1st pixel.
+  source += source_step;
+  dest += dest_step;
+
+  while (dest < dest_end) {
+    const unsigned int a = 3 * source[-source_pitch];
+    const unsigned int b = 10 * source[0];
+    const unsigned int c = 3 * source[source_pitch];
+    *dest = (unsigned char)((8 + a + b + c) >> 4);
+    source += source_step;
+    dest += dest_step;
   }
 }
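Editor's note: the rewritten loop keeps the original 2-to-1 interpolating kernel, (3, 10, 3)/16 with a rounding bias of 8. A scalar reference for one output sample (illustrative):

#include <stdint.h>

static uint8_t filter_3_10_3(uint8_t p0, uint8_t p1, uint8_t p2) {
  return (uint8_t)((8 + 3 * p0 + 10 * p1 + 3 * p2) >> 4);
}
/* Sanity check: a flat region p0 == p1 == p2 == v gives (8 + 16v) >> 4 == v. */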
 
@@ -119,17 +120,18 @@
                            unsigned int source_length, unsigned char *dest,
                            int dest_step, unsigned int dest_scale,
                            unsigned int dest_length) {
-  unsigned int i, j;
-
+  const unsigned char *const dest_end = dest + dest_length * dest_step;
   (void)source_length;
   (void)source_scale;
   (void)dest_scale;
 
-  source_step *= 2;
-  j = 0;
+  source_step *= 2;  // Every other source sample.
 
-  for (i = 0; i < dest_length * dest_step; i += dest_step, j += source_step)
-    dest[i] = source[j];
+  while (dest < dest_end) {
+    *dest = *source;
+    source += source_step;
+    dest += dest_step;
+  }
 }
 /****************************************************************************
  *
@@ -159,12 +161,12 @@
                       unsigned int source_scale, unsigned int source_length,
                       unsigned char *dest, int dest_step,
                       unsigned int dest_scale, unsigned int dest_length) {
-  unsigned int i;
-  unsigned int round_value = dest_scale / 2;
+  const unsigned char *const dest_end = dest + dest_length * dest_step;
+  const unsigned int round_value = dest_scale / 2;
   unsigned int left_modifier = dest_scale;
   unsigned int right_modifier = 0;
-  unsigned char left_pixel = *source;
-  unsigned char right_pixel = *(source + source_step);
+  unsigned char left_pixel = source[0];
+  unsigned char right_pixel = source[source_step];
 
   (void)source_length;
 
@@ -173,18 +175,18 @@
   /* assert ( (source_length - 1) * dest_scale >= (dest_length - 1) *
    * source_scale);*/
 
-  for (i = 0; i < dest_length * dest_step; i += dest_step) {
-    dest[i] = (char)((left_modifier * left_pixel +
-                      right_modifier * right_pixel + round_value) /
-                     dest_scale);
+  while (dest < dest_end) {
+    *dest = (unsigned char)((left_modifier * left_pixel +
+                             right_modifier * right_pixel + round_value) /
+                            dest_scale);
 
     right_modifier += source_scale;
 
     while (right_modifier > dest_scale) {
       right_modifier -= dest_scale;
       source += source_step;
-      left_pixel = *source;
-      right_pixel = *(source + source_step);
+      left_pixel = source[0];
+      right_pixel = source[source_step];
     }
 
     left_modifier = dest_scale - right_modifier;
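Editor's note: this scaler implements linear interpolation with a Bresenham-style accumulator: right_modifier advances by source_scale per output sample, the source pointer steps whenever it exceeds dest_scale, and left_modifier is the complement. Tracing a 4-to-5 upscale (source_scale = 4, dest_scale = 5) shows the weight schedule (a demo, not library code):

#include <stdio.h>

int main(void) {
  unsigned int left = 5, right = 0;  /* left_modifier, right_modifier */
  for (int i = 0; i < 5; i++) {
    printf("sample %d: left=%u right=%u\n", i, left, right);
    right += 4;                    /* += source_scale */
    while (right > 5) right -= 5;  /* step the source pointer */
    left = 5 - right;
  }
  return 0;  /* prints (5,0) (1,4) (2,3) (3,2) (4,1): positions 0, 0.8, ... */
}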
@@ -236,11 +238,10 @@
     unsigned int dest_width, unsigned int dest_height, unsigned char *temp_area,
     unsigned char temp_area_height, unsigned int hscale, unsigned int hratio,
     unsigned int vscale, unsigned int vratio, unsigned int interlaced) {
-  /*unsigned*/
-  int i, j, k;
-  int bands;
-  int dest_band_height;
-  int source_band_height;
+  unsigned int i, j, k;
+  unsigned int bands;
+  unsigned int dest_band_height;
+  unsigned int source_band_height;
 
   typedef void (*Scale1D)(const unsigned char *source, int source_step,
                           unsigned int source_scale, unsigned int source_length,
@@ -331,7 +332,7 @@
   if (ratio_scalable) {
     if (source_height == dest_height) {
       /* for each band of the image */
-      for (k = 0; k < (int)dest_height; k++) {
+      for (k = 0; k < dest_height; ++k) {
         horiz_line_scale(source, source_width, dest, dest_width);
         source += source_pitch;
         dest += dest_pitch;
@@ -346,14 +347,13 @@
       horiz_line_scale(source, source_width, temp_area, dest_width);
     }
 
-    for (k = 0;
-         k < (int)(dest_height + dest_band_height - 1) / dest_band_height;
-         k++) {
+    for (k = 0; k < (dest_height + dest_band_height - 1) / dest_band_height;
+         ++k) {
       /* scale one band horizontally */
-      for (i = 0; i < source_band_height; i++) {
+      for (i = 0; i < source_band_height; ++i) {
         /* Trap case where we could read off the base of the source buffer */
 
-        line_src = (unsigned char *)source + i * source_pitch;
+        line_src = source + i * source_pitch;
 
         if (line_src < source_base) line_src = source_base;
 
@@ -388,7 +388,7 @@
 
   if (source_height == dest_height) {
     /* for each band of the image */
-    for (k = 0; k < (int)dest_height; k++) {
+    for (k = 0; k < dest_height; ++k) {
       Scale1Dh(source, 1, hscale, source_width + 1, dest, 1, hratio,
                dest_width);
       source += source_pitch;
@@ -414,10 +414,10 @@
   /* for each band of the image */
   bands = (dest_height + dest_band_height - 1) / dest_band_height;
 
-  for (k = 0; k < bands; k++) {
+  for (k = 0; k < bands; ++k) {
     /* scale one band horizontally */
-    for (i = 1; i < source_band_height + 1; i++) {
-      if (k * source_band_height + i < (int)source_height) {
+    for (i = 1; i < source_band_height + 1; ++i) {
+      if (k * source_band_height + i < source_height) {
         Scale1Dh(source + i * source_pitch, 1, hscale, source_width + 1,
                  temp_area + i * dest_pitch, 1, hratio, dest_width);
       } else { /*  Duplicate the last row */
@@ -428,7 +428,7 @@
     }
 
     /* scale one band vertically */
-    for (j = 0; j < (int)dest_width; j++) {
+    for (j = 0; j < dest_width; ++j) {
       Scale1Dv(&temp_area[j], dest_pitch, vscale, source_band_height + 1,
                &dest[j], dest_pitch, vratio, dest_band_height);
     }
@@ -487,12 +487,12 @@
           temp_area, temp_height, hscale, hratio, vscale, vratio, interlaced);
 
   if (dw < (int)dst->y_width)
-    for (i = 0; i < dh; i++)
+    for (i = 0; i < dh; ++i)
       memset(dst->y_buffer + i * dst->y_stride + dw - 1,
              dst->y_buffer[i * dst->y_stride + dw - 2], dst->y_width - dw + 1);
 
   if (dh < (int)dst->y_height)
-    for (i = dh - 1; i < (int)dst->y_height; i++)
+    for (i = dh - 1; i < (int)dst->y_height; ++i)
       memcpy(dst->y_buffer + i * dst->y_stride,
              dst->y_buffer + (dh - 2) * dst->y_stride, dst->y_width + 1);
 
@@ -502,13 +502,13 @@
           vratio, interlaced);
 
   if (dw / 2 < (int)dst->uv_width)
-    for (i = 0; i < dst->uv_height; i++)
+    for (i = 0; i < dst->uv_height; ++i)
       memset(dst->u_buffer + i * dst->uv_stride + dw / 2 - 1,
              dst->u_buffer[i * dst->uv_stride + dw / 2 - 2],
              dst->uv_width - dw / 2 + 1);
 
   if (dh / 2 < (int)dst->uv_height)
-    for (i = dh / 2 - 1; i < (int)dst->y_height / 2; i++)
+    for (i = dh / 2 - 1; i < (int)dst->y_height / 2; ++i)
       memcpy(dst->u_buffer + i * dst->uv_stride,
              dst->u_buffer + (dh / 2 - 2) * dst->uv_stride, dst->uv_width);
 
@@ -518,13 +518,13 @@
           vratio, interlaced);
 
   if (dw / 2 < (int)dst->uv_width)
-    for (i = 0; i < dst->uv_height; i++)
+    for (i = 0; i < dst->uv_height; ++i)
       memset(dst->v_buffer + i * dst->uv_stride + dw / 2 - 1,
              dst->v_buffer[i * dst->uv_stride + dw / 2 - 2],
              dst->uv_width - dw / 2 + 1);
 
   if (dh / 2 < (int)dst->uv_height)
-    for (i = dh / 2 - 1; i < (int)dst->y_height / 2; i++)
+    for (i = dh / 2 - 1; i < (int)dst->y_height / 2; ++i)
       memcpy(dst->v_buffer + i * dst->uv_stride,
              dst->v_buffer + (dh / 2 - 2) * dst->uv_stride, dst->uv_width);
 }
diff --git a/aom_scale/generic/gen_scalers.c b/aom_scale/generic/gen_scalers.c
index 57c464d..fd638bd 100644
--- a/aom_scale/generic/gen_scalers.c
+++ b/aom_scale/generic/gen_scalers.c
@@ -39,27 +39,23 @@
                                      unsigned int source_width,
                                      unsigned char *dest,
                                      unsigned int dest_width) {
-  unsigned i;
-  unsigned int a, b, c, d, e;
-  unsigned char *des = dest;
-  const unsigned char *src = source;
-
+  const unsigned char *const source_end = source + source_width;
   (void)dest_width;
 
-  for (i = 0; i < source_width; i += 5) {
-    a = src[0];
-    b = src[1];
-    c = src[2];
-    d = src[3];
-    e = src[4];
+  while (source < source_end) {
+    const unsigned int a = source[0];
+    const unsigned int b = source[1];
+    const unsigned int c = source[2];
+    const unsigned int d = source[3];
+    const unsigned int e = source[4];
 
-    des[0] = (unsigned char)a;
-    des[1] = (unsigned char)((b * 192 + c * 64 + 128) >> 8);
-    des[2] = (unsigned char)((c * 128 + d * 128 + 128) >> 8);
-    des[3] = (unsigned char)((d * 64 + e * 192 + 128) >> 8);
+    dest[0] = (unsigned char)a;
+    dest[1] = (unsigned char)((b * 192 + c * 64 + 128) >> 8);
+    dest[2] = (unsigned char)((c * 128 + d * 128 + 128) >> 8);
+    dest[3] = (unsigned char)((d * 64 + e * 192 + 128) >> 8);
 
-    src += 5;
-    des += 4;
+    source += 5;
+    dest += 4;
   }
 }
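Editor's note: the 5-to-4 kernels are linear interpolation at source positions 0, 1.25, 2.5 and 3.75, with the fractional weights quantized to 1/256ths and rounded. One group, as a scalar reference (illustrative):

#include <stdint.h>

static void scale_5_to_4(const uint8_t s[5], uint8_t d[4]) {
  d[0] = s[0];                                             /* pos 0.00 */
  d[1] = (uint8_t)((s[1] * 192 + s[2] * 64 + 128) >> 8);   /* 0.75b + 0.25c */
  d[2] = (uint8_t)((s[2] * 128 + s[3] * 128 + 128) >> 8);  /* 0.50c + 0.50d */
  d[3] = (uint8_t)((s[3] * 64 + s[4] * 192 + 128) >> 8);   /* 0.25d + 0.75e */
}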
 
@@ -67,25 +63,21 @@
                                    unsigned int src_pitch, unsigned char *dest,
                                    unsigned int dest_pitch,
                                    unsigned int dest_width) {
-  unsigned int i;
-  unsigned int a, b, c, d, e;
-  unsigned char *des = dest;
-  unsigned char *src = source;
+  const unsigned char *const dest_end = dest + dest_width;
+  while (dest < dest_end) {
+    const unsigned int a = source[0 * src_pitch];
+    const unsigned int b = source[1 * src_pitch];
+    const unsigned int c = source[2 * src_pitch];
+    const unsigned int d = source[3 * src_pitch];
+    const unsigned int e = source[4 * src_pitch];
 
-  for (i = 0; i < dest_width; i++) {
-    a = src[0 * src_pitch];
-    b = src[1 * src_pitch];
-    c = src[2 * src_pitch];
-    d = src[3 * src_pitch];
-    e = src[4 * src_pitch];
+    dest[0 * dest_pitch] = (unsigned char)a;
+    dest[1 * dest_pitch] = (unsigned char)((b * 192 + c * 64 + 128) >> 8);
+    dest[2 * dest_pitch] = (unsigned char)((c * 128 + d * 128 + 128) >> 8);
+    dest[3 * dest_pitch] = (unsigned char)((d * 64 + e * 192 + 128) >> 8);
 
-    des[0 * dest_pitch] = (unsigned char)a;
-    des[1 * dest_pitch] = (unsigned char)((b * 192 + c * 64 + 128) >> 8);
-    des[2 * dest_pitch] = (unsigned char)((c * 128 + d * 128 + 128) >> 8);
-    des[3 * dest_pitch] = (unsigned char)((d * 64 + e * 192 + 128) >> 8);
-
-    src++;
-    des++;
+    ++source;
+    ++dest;
   }
 }
 
@@ -114,26 +106,21 @@
                                      unsigned int source_width,
                                      unsigned char *dest,
                                      unsigned int dest_width) {
-  unsigned int i;
-  unsigned int a, b, c, d, e;
-  unsigned char *des = dest;
-  const unsigned char *src = source;
-
+  const unsigned char *const source_end = source + source_width;
   (void)dest_width;
+  while (source < source_end) {
+    const unsigned int a = source[0];
+    const unsigned int b = source[1];
+    const unsigned int c = source[2];
+    const unsigned int d = source[3];
+    const unsigned int e = source[4];
 
-  for (i = 0; i < source_width; i += 5) {
-    a = src[0];
-    b = src[1];
-    c = src[2];
-    d = src[3];
-    e = src[4];
+    dest[0] = (unsigned char)a;
+    dest[1] = (unsigned char)((b * 85 + c * 171 + 128) >> 8);
+    dest[2] = (unsigned char)((d * 171 + e * 85 + 128) >> 8);
 
-    des[0] = (unsigned char)a;
-    des[1] = (unsigned char)((b * 85 + c * 171 + 128) >> 8);
-    des[2] = (unsigned char)((d * 171 + e * 85 + 128) >> 8);
-
-    src += 5;
-    des += 3;
+    source += 5;
+    dest += 3;
   }
 }
 
@@ -141,24 +128,20 @@
                                    unsigned int src_pitch, unsigned char *dest,
                                    unsigned int dest_pitch,
                                    unsigned int dest_width) {
-  unsigned int i;
-  unsigned int a, b, c, d, e;
-  unsigned char *des = dest;
-  unsigned char *src = source;
+  const unsigned char *const dest_end = dest + dest_width;
+  while (dest < dest_end) {
+    const unsigned int a = source[0 * src_pitch];
+    const unsigned int b = source[1 * src_pitch];
+    const unsigned int c = source[2 * src_pitch];
+    const unsigned int d = source[3 * src_pitch];
+    const unsigned int e = source[4 * src_pitch];
 
-  for (i = 0; i < dest_width; i++) {
-    a = src[0 * src_pitch];
-    b = src[1 * src_pitch];
-    c = src[2 * src_pitch];
-    d = src[3 * src_pitch];
-    e = src[4 * src_pitch];
+    dest[0 * dest_pitch] = (unsigned char)a;
+    dest[1 * dest_pitch] = (unsigned char)((b * 85 + c * 171 + 128) >> 8);
+    dest[2 * dest_pitch] = (unsigned char)((d * 171 + e * 85 + 128) >> 8);
 
-    des[0 * dest_pitch] = (unsigned char)a;
-    des[1 * dest_pitch] = (unsigned char)((b * 85 + c * 171 + 128) >> 8);
-    des[2 * dest_pitch] = (unsigned char)((d * 171 + e * 85 + 128) >> 8);
-
-    src++;
-    des++;
+    ++source;
+    ++dest;
   }
 }
 
@@ -186,18 +169,12 @@
                                      unsigned int source_width,
                                      unsigned char *dest,
                                      unsigned int dest_width) {
-  unsigned int i;
-  unsigned int a;
-  unsigned char *des = dest;
-  const unsigned char *src = source;
-
+  const unsigned char *const source_end = source + source_width;
   (void)dest_width;
-
-  for (i = 0; i < source_width; i += 2) {
-    a = src[0];
-    des[0] = (unsigned char)(a);
-    src += 2;
-    des += 1;
+  while (source < source_end) {
+    dest[0] = source[0];
+    source += 2;
+    ++dest;
   }
 }
 
@@ -215,18 +192,14 @@
                                      unsigned char *dest,
                                      unsigned int dest_pitch,
                                      unsigned int dest_width) {
-  int i;
-  int temp;
-  int width = dest_width;
-
+  const unsigned char *const dest_end = dest + dest_width;
   (void)dest_pitch;
-
-  for (i = 0; i < width; i++) {
-    temp = 8;
-    temp += source[i - (int)src_pitch] * 3;
-    temp += source[i] * 10;
-    temp += source[i + src_pitch] * 3;
-    temp >>= 4;
-    dest[i] = (unsigned char)(temp);
+  while (dest < dest_end) {
+    const unsigned int a = source[-(int)src_pitch] * 3;
+    const unsigned int b = source[0] * 10;
+    const unsigned int c = source[src_pitch] * 3;
+    dest[0] = (unsigned char)((8 + a + b + c) >> 4);
+    ++source;
+    ++dest;
   }
 }
diff --git a/aomenc.c b/aomenc.c
index 8eb30ed..497c8d5 100644
--- a/aomenc.c
+++ b/aomenc.c
@@ -1415,9 +1415,8 @@
 #if CONFIG_WEBM_IO
   if (stream->config.write_webm) {
     stream->webm_ctx.stream = stream->file;
-    write_webm_file_header(&stream->webm_ctx, cfg, &global->framerate,
-                           stream->config.stereo_fmt, global->codec->fourcc,
-                           pixel_aspect_ratio);
+    write_webm_file_header(&stream->webm_ctx, cfg, stream->config.stereo_fmt,
+                           global->codec->fourcc, pixel_aspect_ratio);
   }
 #else
   (void)pixel_aspect_ratio;
diff --git a/av1/av1_dx_iface.c b/av1/av1_dx_iface.c
index 7da80f0..43cc3a2 100644
--- a/av1/av1_dx_iface.c
+++ b/av1/av1_dx_iface.c
@@ -828,7 +828,7 @@
 
 static aom_codec_err_t ctrl_copy_reference(aom_codec_alg_priv_t *ctx,
                                            va_list args) {
-  aom_ref_frame_t *data = va_arg(args, aom_ref_frame_t *);
+  const aom_ref_frame_t *const frame = va_arg(args, aom_ref_frame_t *);
 
   // Only support this function in serial decode.
   if (ctx->frame_parallel_decode) {
@@ -836,8 +836,7 @@
     return AOM_CODEC_INCAPABLE;
   }
 
-  if (data) {
-    aom_ref_frame_t *frame = (aom_ref_frame_t *)data;
+  if (frame) {
     YV12_BUFFER_CONFIG sd;
     AVxWorker *const worker = ctx->frame_workers;
     FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1;
diff --git a/av1/common/blockd.c b/av1/common/blockd.c
index 8938bbf..6332fed 100644
--- a/av1/common/blockd.c
+++ b/av1/common/blockd.c
@@ -92,40 +92,38 @@
 }
 
 void av1_set_contexts(const MACROBLOCKD *xd, struct macroblockd_plane *pd,
-                      BLOCK_SIZE plane_bsize, TX_SIZE tx_size, int has_eob,
-                      int aoff, int loff) {
+                      TX_SIZE tx_size, int has_eob, int aoff, int loff) {
   ENTROPY_CONTEXT *const a = pd->above_context + aoff;
   ENTROPY_CONTEXT *const l = pd->left_context + loff;
-  const int tx_w_in_blocks = num_4x4_blocks_wide_txsize_lookup[tx_size];
-  const int tx_h_in_blocks = num_4x4_blocks_high_txsize_lookup[tx_size];
+  const int tx_size_in_blocks = 1 << tx_size;
 
   // above
   if (has_eob && xd->mb_to_right_edge < 0) {
     int i;
-    const int blocks_wide = num_4x4_blocks_wide_lookup[plane_bsize] +
-                            (xd->mb_to_right_edge >> (5 + pd->subsampling_x));
-    int above_contexts = tx_w_in_blocks;
+    const int blocks_wide =
+        pd->n4_w + (xd->mb_to_right_edge >> (5 + pd->subsampling_x));
+    int above_contexts = tx_size_in_blocks;
     if (above_contexts + aoff > blocks_wide)
       above_contexts = blocks_wide - aoff;
 
     for (i = 0; i < above_contexts; ++i) a[i] = has_eob;
-    for (i = above_contexts; i < tx_w_in_blocks; ++i) a[i] = 0;
+    for (i = above_contexts; i < tx_size_in_blocks; ++i) a[i] = 0;
   } else {
-    memset(a, has_eob, sizeof(ENTROPY_CONTEXT) * tx_w_in_blocks);
+    memset(a, has_eob, sizeof(ENTROPY_CONTEXT) * tx_size_in_blocks);
   }
 
   // left
   if (has_eob && xd->mb_to_bottom_edge < 0) {
     int i;
-    const int blocks_high = num_4x4_blocks_high_lookup[plane_bsize] +
-                            (xd->mb_to_bottom_edge >> (5 + pd->subsampling_y));
-    int left_contexts = tx_h_in_blocks;
+    const int blocks_high =
+        pd->n4_h + (xd->mb_to_bottom_edge >> (5 + pd->subsampling_y));
+    int left_contexts = tx_size_in_blocks;
     if (left_contexts + loff > blocks_high) left_contexts = blocks_high - loff;
 
     for (i = 0; i < left_contexts; ++i) l[i] = has_eob;
-    for (i = left_contexts; i < tx_h_in_blocks; ++i) l[i] = 0;
+    for (i = left_contexts; i < tx_size_in_blocks; ++i) l[i] = 0;
   } else {
-    memset(l, has_eob, sizeof(ENTROPY_CONTEXT) * tx_h_in_blocks);
+    memset(l, has_eob, sizeof(ENTROPY_CONTEXT) * tx_size_in_blocks);
   }
 }
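Editor's note: the edge clamp in av1_set_contexts is worth a concrete instance. mb_to_right_edge is in 1/8-pel units, so with no subsampling >> 5 converts it to a (negative) count of 4x4 columns. A worked example under those assumptions:

/* Block overhangs the right frame edge by 12 pixels; the plane is 8 4x4
   columns wide; TX_32X32 spans 8 columns (tx_size_in_blocks = 1 << 3). */
static void set_contexts_edge_example(void) {
  const int n4_w = 8;
  const int mb_to_right_edge = -12 * 8;                    /* 1/8-pel units */
  const int blocks_wide = n4_w + (mb_to_right_edge >> 5);  /* 8 - 3 = 5 */
  int above_contexts = 8;                                  /* 1 << TX_32X32 */
  if (above_contexts > blocks_wide) above_contexts = blocks_wide;
  /* a[0..4] receive has_eob; a[5..7] are zeroed. */
  (void)above_contexts;
}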
 
diff --git a/av1/common/blockd.h b/av1/common/blockd.h
index 6296faa..bc1970c 100644
--- a/av1/common/blockd.h
+++ b/av1/common/blockd.h
@@ -191,7 +191,6 @@
   TX_SIZE inter_tx_size[MAX_MIB_SIZE][MAX_MIB_SIZE];
 #endif
   int8_t skip;
-  int8_t has_no_coeffs;
   int8_t segment_id;
 #if CONFIG_SUPERTX
   // Minimum of all segment IDs under the current supertx block.
@@ -757,8 +756,7 @@
                                    void *arg);
 
 void av1_set_contexts(const MACROBLOCKD *xd, struct macroblockd_plane *pd,
-                      BLOCK_SIZE plane_bsize, TX_SIZE tx_size, int has_eob,
-                      int aoff, int loff);
+                      TX_SIZE tx_size, int has_eob, int aoff, int loff);
 
 #if CONFIG_EXT_INTER
 static INLINE int is_interintra_allowed_bsize(const BLOCK_SIZE bsize) {
diff --git a/av1/common/entropymode.c b/av1/common/entropymode.c
index 20e8904..78f4ffe 100644
--- a/av1/common/entropymode.c
+++ b/av1/common/entropymode.c
@@ -892,13 +892,14 @@
 
 #if CONFIG_PALETTE
 int av1_get_palette_color_context(const uint8_t *color_map, int cols, int r,
-                                  int c, int n, int *color_order) {
+                                  int c, int n, uint8_t *color_order,
+                                  int *color_idx) {
   int i, j, max, max_idx, temp;
   int scores[PALETTE_MAX_SIZE + 10];
   int weights[4] = { 3, 2, 3, 2 };
   int color_ctx = 0;
   int color_neighbors[4];
-
+  int inverse_color_order[PALETTE_MAX_SIZE];
   assert(n <= PALETTE_MAX_SIZE);
 
   if (c - 1 >= 0)
@@ -918,7 +919,10 @@
   else
     color_neighbors[3] = -1;
 
-  for (i = 0; i < PALETTE_MAX_SIZE; ++i) color_order[i] = i;
+  for (i = 0; i < PALETTE_MAX_SIZE; ++i) {
+    color_order[i] = i;
+    inverse_color_order[i] = i;
+  }
   memset(scores, 0, PALETTE_MAX_SIZE * sizeof(scores[0]));
   for (i = 0; i < 4; ++i) {
     if (color_neighbors[i] >= 0) scores[color_neighbors[i]] += weights[i];
@@ -944,6 +948,8 @@
       temp = color_order[i];
       color_order[i] = color_order[max_idx];
       color_order[max_idx] = temp;
+      inverse_color_order[color_order[i]] = i;
+      inverse_color_order[color_order[max_idx]] = max_idx;
     }
   }
 
@@ -956,7 +962,9 @@
     }
 
   if (color_ctx >= PALETTE_COLOR_CONTEXTS) color_ctx = 0;
-
+  if (color_idx != NULL) {
+    *color_idx = inverse_color_order[color_map[r * cols + c]];
+  }
   return color_ctx;
 }
 #endif  // CONFIG_PALETTE
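Editor's note: the palette context is derived from neighbor agreement: each previously decoded neighbor votes for its color with weights {3, 2, 3, 2}, and the colors are then partially sorted by score. A sketch of the scoring step, assuming the left / top-left / top / top-right neighborhood implied by the weights (the exact neighbor order is elided above):

/* scores[] has one entry per palette color, zero-initialized;
   color_neighbors[i] is a palette index, or -1 when unavailable. */
static void score_palette_neighbors(const int color_neighbors[4],
                                    int *scores) {
  static const int weights[4] = { 3, 2, 3, 2 };
  int i;
  for (i = 0; i < 4; i++)
    if (color_neighbors[i] >= 0) scores[color_neighbors[i]] += weights[i];
}
/* The added inverse_color_order[] then maps a literal color index back to
   its rank in one step, which is what the new color_idx out-parameter
   returns to the caller. */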
diff --git a/av1/common/entropymode.h b/av1/common/entropymode.h
index 85c68e1..68a6400 100644
--- a/av1/common/entropymode.h
+++ b/av1/common/entropymode.h
@@ -359,7 +359,8 @@
 
 #if CONFIG_PALETTE
 int av1_get_palette_color_context(const uint8_t *color_map, int cols, int r,
-                                  int c, int n, int *color_order);
+                                  int c, int n, uint8_t *color_order,
+                                  int *color_idx);
 #endif  // CONFIG_PALETTE
 
 #ifdef __cplusplus
diff --git a/av1/common/loopfilter.c b/av1/common/loopfilter.c
index c8022f2..d0b897c 100644
--- a/av1/common/loopfilter.c
+++ b/av1/common/loopfilter.c
@@ -753,7 +753,7 @@
 
   // If the block has no coefficients and is not intra we skip applying
   // the loop filter on block edges.
-  if ((mbmi->skip || mbmi->has_no_coeffs) && is_inter_block(mbmi)) return;
+  if (mbmi->skip && is_inter_block(mbmi)) return;
 
   // Here we are adding a mask for the transform size. The transform
   // size mask is set to be correct for a 64x64 prediction block size. We
@@ -818,7 +818,7 @@
   *above_y |= above_prediction_mask[block_size] << shift_y;
   *left_y |= left_prediction_mask[block_size] << shift_y;
 
-  if ((mbmi->skip || mbmi->has_no_coeffs) && is_inter_block(mbmi)) return;
+  if (mbmi->skip && is_inter_block(mbmi)) return;
 
   *above_y |= (size_mask[block_size] & above_64x64_txform_mask[tx_size_y])
               << shift_y;
diff --git a/av1/common/onyxc_int.h b/av1/common/onyxc_int.h
index afc9da4..3c8eac8 100644
--- a/av1/common/onyxc_int.h
+++ b/av1/common/onyxc_int.h
@@ -520,6 +520,17 @@
   return len + MAX_MIB_SIZE;
 }
 
+static INLINE void set_plane_n4(MACROBLOCKD *const xd, int bw, int bh, int bwl,
+                                int bhl) {
+  int i;
+  for (i = 0; i < MAX_MB_PLANE; i++) {
+    xd->plane[i].n4_w = (bw << 1) >> xd->plane[i].subsampling_x;
+    xd->plane[i].n4_h = (bh << 1) >> xd->plane[i].subsampling_y;
+    xd->plane[i].n4_wl = bwl - xd->plane[i].subsampling_x;
+    xd->plane[i].n4_hl = bhl - xd->plane[i].subsampling_y;
+  }
+}
+
 static INLINE void set_mi_row_col(MACROBLOCKD *xd, const TileInfo *const tile,
                                   int mi_row, int bh, int mi_col, int bw,
                                   int mi_rows, int mi_cols) {
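Editor's note: set_plane_n4 caches each plane's block geometry in 4x4 units. For BLOCK_16X16 with 4:2:0 subsampling (bw = bh = 2 in 8x8 units, bwl = bhl = 2), the values work out as below (a worked example, not library code):

static void set_plane_n4_example(void) {
  const int bw = 2, bwl = 2;              /* BLOCK_16X16 */
  const int luma_n4_w = (bw << 1) >> 0;   /* 4: luma spans 4 4x4 columns */
  const int chroma_n4_w = (bw << 1) >> 1; /* 2: 8x8 chroma, 2 columns */
  const int chroma_n4_wl = bwl - 1;       /* 1 == log2(chroma_n4_w) */
  (void)luma_n4_w;
  (void)chroma_n4_w;
  (void)chroma_n4_wl;
}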
diff --git a/av1/common/reconinter.c b/av1/common/reconinter.c
index c0fc494..b07a8bd 100644
--- a/av1/common/reconinter.c
+++ b/av1/common/reconinter.c
@@ -647,6 +647,87 @@
   }
 #endif
 
+#if CONFIG_SUB8X8_MC
+  if (mi->mbmi.sb_type < BLOCK_8X8 && plane > 0) {
+    // block size in log2
+    const int b4_wl = b_width_log2_lookup[mi->mbmi.sb_type];
+    const int b4_hl = b_height_log2_lookup[mi->mbmi.sb_type];
+    const int b8_sl = b_width_log2_lookup[BLOCK_8X8];
+
+    // block size
+    const int b4_w = 1 << b4_wl;
+    const int b4_h = 1 << b4_hl;
+    const int b8_s = 1 << b8_sl;
+    int idx, idy;
+
+    const int x_base = x;
+    const int y_base = y;
+
+    // processing unit size
+    const int x_step = w >> (b8_sl - b4_wl);
+    const int y_step = h >> (b8_sl - b4_hl);
+
+    for (idy = 0; idy < b8_s; idy += b4_h) {
+      for (idx = 0; idx < b8_s; idx += b4_w) {
+        const int chr_idx = (idy * 2) + idx;
+        for (ref = 0; ref < 1 + is_compound; ++ref) {
+          const struct scale_factors *const sf = &xd->block_refs[ref]->sf;
+          struct buf_2d *const pre_buf = &pd->pre[ref];
+          struct buf_2d *const dst_buf = &pd->dst;
+          uint8_t *dst = dst_buf->buf;
+          const MV mv = mi->bmi[chr_idx].as_mv[ref].as_mv;
+          const MV mv_q4 = clamp_mv_to_umv_border_sb(
+              xd, &mv, bw, bh, pd->subsampling_x, pd->subsampling_y);
+          uint8_t *pre;
+          MV32 scaled_mv;
+          int xs, ys, subpel_x, subpel_y;
+          const int is_scaled = av1_is_scaled(sf);
+
+          x = x_base + idx * x_step;
+          y = y_base + idy * y_step;
+
+          dst += dst_buf->stride * y + x;
+
+          if (is_scaled) {
+            pre =
+                pre_buf->buf + scaled_buffer_offset(x, y, pre_buf->stride, sf);
+            scaled_mv = av1_scale_mv(&mv_q4, mi_x + x, mi_y + y, sf);
+            xs = sf->x_step_q4;
+            ys = sf->y_step_q4;
+          } else {
+            pre = pre_buf->buf + y * pre_buf->stride + x;
+            scaled_mv.row = mv_q4.row;
+            scaled_mv.col = mv_q4.col;
+            xs = ys = 16;
+          }
+
+          subpel_x = scaled_mv.col & SUBPEL_MASK;
+          subpel_y = scaled_mv.row & SUBPEL_MASK;
+          pre += (scaled_mv.row >> SUBPEL_BITS) * pre_buf->stride +
+                 (scaled_mv.col >> SUBPEL_BITS);
+
+#if CONFIG_AOM_HIGHBITDEPTH
+          if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+            high_inter_predictor(pre, pre_buf->stride, dst, dst_buf->stride,
+                                 subpel_x, subpel_y, sf, x_step, y_step, ref,
+                                 &mi->mbmi.interp_filter, xs, ys, xd->bd);
+          } else {
+            inter_predictor(pre, pre_buf->stride, dst, dst_buf->stride,
+                            subpel_x, subpel_y, sf, x_step, y_step, ref,
+                            &mi->mbmi.interp_filter, xs, ys);
+          }
+#else
+          inter_predictor(pre, pre_buf->stride, dst, dst_buf->stride, subpel_x,
+                          subpel_y, sf, x_step, y_step, ref,
+                          &mi->mbmi.interp_filter, xs, ys);
+#endif
+        }
+      }
+    }
+    return;
+  }
+#endif
+
   for (ref = 0; ref < 1 + is_compound; ++ref) {
     const struct scale_factors *const sf = &xd->block_refs[ref]->sf;
     struct buf_2d *const pre_buf = &pd->pre[ref];
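Editor's note: for sub-8x8 partitions each 4x4 luma sub-block carries its own motion vector, and with 4:2:0 subsampling the matching chroma area is smaller than one transform block, so the new CONFIG_SUB8X8_MC path predicts chroma piecewise from the per-sub-block MVs. The chr_idx = (idy * 2) + idx expression walks the 2x2 grid of bmi[] entries; enumerating it for the three sub-8x8 sizes (a demo):

#include <stdio.h>

/* b4_w/b4_h are one partition's dimensions in 4x4 units. */
static void print_chr_idx(int b4_w, int b4_h) {
  int idy, idx;
  for (idy = 0; idy < 2; idy += b4_h)
    for (idx = 0; idx < 2; idx += b4_w) printf("%d ", idy * 2 + idx);
  printf("\n");
}

int main(void) {
  print_chr_idx(1, 1); /* BLOCK_4X4: 0 1 2 3 */
  print_chr_idx(2, 1); /* BLOCK_8X4: 0 2 (left column of bmi[]) */
  print_chr_idx(1, 2); /* BLOCK_4X8: 0 1 (top row of bmi[]) */
  return 0;
}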
diff --git a/av1/common/reconinter.h b/av1/common/reconinter.h
index bfa7e95..5f62f0a 100644
--- a/av1/common/reconinter.h
+++ b/av1/common/reconinter.h
@@ -50,7 +50,7 @@
     const int16_t *kernel_y =
         av1_get_interp_filter_subpel_kernel(interp_filter_params_y, subpel_y);
 #else
-  if (interp_filter_params.taps == SUBPEL_TAPS) {
+  if (interp_filter_params.taps == SUBPEL_TAPS && w > 2 && h > 2) {
     const int16_t *kernel_x =
         av1_get_interp_filter_subpel_kernel(interp_filter_params, subpel_x);
     const int16_t *kernel_y =
@@ -109,7 +109,7 @@
     const int16_t *kernel_y =
         av1_get_interp_filter_subpel_kernel(interp_filter_params_y, subpel_y);
 #else
-  if (interp_filter_params.taps == SUBPEL_TAPS) {
+  if (interp_filter_params.taps == SUBPEL_TAPS && w > 2 && h > 2) {
     const int16_t *kernel_x =
         av1_get_interp_filter_subpel_kernel(interp_filter_params, subpel_x);
     const int16_t *kernel_y =
diff --git a/av1/decoder/decodeframe.c b/av1/decoder/decodeframe.c
index 27640b7..146ca23 100644
--- a/av1/decoder/decodeframe.c
+++ b/av1/decoder/decodeframe.c
@@ -383,17 +383,6 @@
   }
 }
 
-static void set_plane_n4(MACROBLOCKD *const xd, int bw, int bh, int bwl,
-                         int bhl) {
-  int i;
-  for (i = 0; i < MAX_MB_PLANE; i++) {
-    xd->plane[i].n4_w = (bw << 1) >> xd->plane[i].subsampling_x;
-    xd->plane[i].n4_h = (bh << 1) >> xd->plane[i].subsampling_y;
-    xd->plane[i].n4_wl = bwl - xd->plane[i].subsampling_x;
-    xd->plane[i].n4_hl = bhl - xd->plane[i].subsampling_y;
-  }
-}
-
 static MB_MODE_INFO *set_offsets(AV1_COMMON *const cm, MACROBLOCKD *const xd,
                                  BLOCK_SIZE bsize, int mi_row, int mi_col,
                                  int bw, int bh, int x_mis, int y_mis, int bwl,
@@ -1153,7 +1142,6 @@
 #endif  // CONFIG_EXT_PARTITION_TYPES
                          BLOCK_SIZE bsize, int bwl, int bhl) {
   AV1_COMMON *const cm = &pbi->common;
-  const int less8x8 = bsize < BLOCK_8X8;
   const int bw = 1 << (bwl - 1);
   const int bh = 1 << (bhl - 1);
   const int x_mis = AOMMIN(bw, cm->mi_cols - mi_col);
@@ -1373,9 +1361,6 @@
                                                 row, col, tx_size);
 #endif
       }
-
-      if (!less8x8 && eobtotal == 0)
-        mbmi->has_no_coeffs = 1;  // skip loopfilter
     }
   }
 
diff --git a/av1/decoder/decodemv.c b/av1/decoder/decodemv.c
index 4d181fd..8260f9d 100644
--- a/av1/decoder/decodemv.c
+++ b/av1/decoder/decodemv.c
@@ -718,9 +718,15 @@
         !segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) {
       FRAME_COUNTS *counts = xd->counts;
       TX_TYPE tx_type_nom = intra_mode_to_tx_type_context[mbmi->mode];
+#if CONFIG_DAALA_EC
+      mbmi->tx_type = av1_ext_tx_inv[aom_read_symbol(
+          r, cm->fc->intra_ext_tx_cdf[mbmi->tx_size][tx_type_nom], TX_TYPES,
+          ACCT_STR)];
+#else
       mbmi->tx_type = aom_read_tree(
           r, av1_ext_tx_tree,
           cm->fc->intra_ext_tx_prob[mbmi->tx_size][tx_type_nom], ACCT_STR);
+#endif
       if (counts)
         ++counts->intra_ext_tx[mbmi->tx_size][tx_type_nom][mbmi->tx_type];
     } else {
@@ -740,7 +746,7 @@
 
   // Integer part
   if (class0) {
-    d = aom_read_tree(r, av1_mv_class0_tree, mvcomp->class0, ACCT_STR);
+    d = aom_read(r, mvcomp->class0[0], ACCT_STR);
     mag = 0;
   } else {
     int i;
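Editor's note: the class0 change here (and its encoder counterpart in encodemv.c below) relies on CLASS0_SIZE being 2: the removed av1_mv_class0_tree had exactly two leaves, so walking it consumed a single binary decision with probability mvcomp->class0[0], and reading that bit directly is equivalent. A sketch against the project's reader API:

/* Equivalent to aom_read_tree(r, av1_mv_class0_tree, mvcomp->class0, ...)
   for a two-leaf tree: one bit selects the class-0 integer part d. */
static int read_class0_bit(aom_reader *r, const aom_prob *class0) {
  return aom_read(r, class0[0], ACCT_STR);
}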
diff --git a/av1/decoder/detokenize.c b/av1/decoder/detokenize.c
index 7077788..f2f74f5 100644
--- a/av1/decoder/detokenize.c
+++ b/av1/decoder/detokenize.c
@@ -294,59 +294,18 @@
   return c;
 }
 
-// TODO(slavarnway): Decode version of av1_set_context.  Modify
-// av1_set_context
-// after testing is complete, then delete this version.
-static void dec_set_contexts(const MACROBLOCKD *xd,
-                             struct macroblockd_plane *pd, TX_SIZE tx_size,
-                             int has_eob, int aoff, int loff) {
-  ENTROPY_CONTEXT *const a = pd->above_context + aoff;
-  ENTROPY_CONTEXT *const l = pd->left_context + loff;
-  const int tx_w_in_blocks = num_4x4_blocks_wide_txsize_lookup[tx_size];
-  const int tx_h_in_blocks = num_4x4_blocks_high_txsize_lookup[tx_size];
-
-  // above
-  if (has_eob && xd->mb_to_right_edge < 0) {
-    int i;
-    const int blocks_wide =
-        pd->n4_w + (xd->mb_to_right_edge >> (5 + pd->subsampling_x));
-    int above_contexts = tx_w_in_blocks;
-    if (above_contexts + aoff > blocks_wide)
-      above_contexts = blocks_wide - aoff;
-
-    for (i = 0; i < above_contexts; ++i) a[i] = has_eob;
-    for (i = above_contexts; i < tx_w_in_blocks; ++i) a[i] = 0;
-  } else {
-    memset(a, has_eob, sizeof(ENTROPY_CONTEXT) * tx_w_in_blocks);
-  }
-
-  // left
-  if (has_eob && xd->mb_to_bottom_edge < 0) {
-    int i;
-    const int blocks_high =
-        pd->n4_h + (xd->mb_to_bottom_edge >> (5 + pd->subsampling_y));
-    int left_contexts = tx_h_in_blocks;
-    if (left_contexts + loff > blocks_high) left_contexts = blocks_high - loff;
-
-    for (i = 0; i < left_contexts; ++i) l[i] = has_eob;
-    for (i = left_contexts; i < tx_h_in_blocks; ++i) l[i] = 0;
-  } else {
-    memset(l, has_eob, sizeof(ENTROPY_CONTEXT) * tx_h_in_blocks);
-  }
-}
-
 #if CONFIG_PALETTE
 void av1_decode_palette_tokens(MACROBLOCKD *const xd, int plane,
                                aom_reader *r) {
-  MODE_INFO *const mi = xd->mi[0];
-  MB_MODE_INFO *const mbmi = &mi->mbmi;
+  const MODE_INFO *const mi = xd->mi[0];
+  const MB_MODE_INFO *const mbmi = &mi->mbmi;
   const BLOCK_SIZE bsize = mbmi->sb_type;
   const int rows = (4 * num_4x4_blocks_high_lookup[bsize]) >>
                    (xd->plane[plane != 0].subsampling_y);
   const int cols = (4 * num_4x4_blocks_wide_lookup[bsize]) >>
                    (xd->plane[plane != 0].subsampling_x);
-  int color_idx, color_ctx, color_order[PALETTE_MAX_SIZE];
-  int n = mbmi->palette_mode_info.palette_size[plane != 0];
+  uint8_t color_order[PALETTE_MAX_SIZE];
+  const int n = mbmi->palette_mode_info.palette_size[plane != 0];
   int i, j;
   uint8_t *color_map = xd->plane[plane != 0].color_index_map;
   const aom_prob(*const prob)[PALETTE_COLOR_CONTEXTS][PALETTE_COLORS - 1] =
@@ -355,10 +314,10 @@
 
   for (i = 0; i < rows; ++i) {
     for (j = (i == 0 ? 1 : 0); j < cols; ++j) {
-      color_ctx =
-          av1_get_palette_color_context(color_map, cols, i, j, n, color_order);
-      color_idx = aom_read_tree(r, av1_palette_color_tree[n - 2],
-                                prob[n - 2][color_ctx], ACCT_STR);
+      const int color_ctx = av1_get_palette_color_context(color_map, cols, i, j,
+                                                          n, color_order, NULL);
+      const int color_idx = aom_read_tree(r, av1_palette_color_tree[n - 2],
+                                          prob[n - 2][color_ctx], ACCT_STR);
       assert(color_idx >= 0 && color_idx < n);
       color_map[i * cols + j] = color_order[color_idx];
     }
@@ -391,11 +350,6 @@
 #endif  // CONFIG_NEW_QUANT
                    ctx, sc->scan, sc->neighbors, r);
 #endif  // CONFIG_AOM_QM
-  dec_set_contexts(xd, pd, tx_size, eob > 0, x, y);
-  /*
-  av1_set_contexts(xd, pd,
-                    get_plane_block_size(xd->mi[0]->mbmi.sb_type, pd),
-                    tx_size, eob > 0, x, y);
-                    */
+  av1_set_contexts(xd, pd, tx_size, eob > 0, x, y);
   return eob;
 }
diff --git a/av1/encoder/bitstream.c b/av1/encoder/bitstream.c
index 7068604..e0fb7ec 100644
--- a/av1/encoder/bitstream.c
+++ b/av1/encoder/bitstream.c
@@ -1666,11 +1666,19 @@
 #else
     if (mbmi->tx_size < TX_32X32 && cm->base_qindex > 0 && !mbmi->skip &&
         !segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) {
+#if CONFIG_DAALA_EC
+      aom_write_symbol(
+          w, av1_ext_tx_ind[mbmi->tx_type],
+          cm->fc->intra_ext_tx_cdf[mbmi->tx_size]
+                                  [intra_mode_to_tx_type_context[mbmi->mode]],
+          TX_TYPES);
+#else
       av1_write_token(
           w, av1_ext_tx_tree,
           cm->fc->intra_ext_tx_prob[mbmi->tx_size]
                                    [intra_mode_to_tx_type_context[mbmi->mode]],
           &ext_tx_encodings[mbmi->tx_type]);
+#endif
     }
 #endif  // CONFIG_EXT_TX
   }
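Under CONFIG_DAALA_EC the intra tx_type is now written as one multisymbol with aom_write_symbol(), which consumes a cumulative distribution (cdf) instead of walking av1_ext_tx_tree node by node; av1_ext_tx_ind maps the tx_type to its symbol index. A minimal sketch (not the library routine) of the cdf shape such a writer expects, assuming this codebase's convention that cdf values are scaled so the last entry is 1 << 15:

    #include <stdint.h>

    typedef uint16_t aom_cdf_prob; /* as in aom_dsp/prob.h */

    #define CDF_TOP (1 << 15) /* assumed cdf scale */

    /* Turn per-symbol probabilities that sum to CDF_TOP into the
     * cumulative form a multisymbol writer like aom_write_symbol()
     * consumes. */
    static void probs_to_cdf_sketch(const aom_cdf_prob *probs, int nsymbs,
                                    aom_cdf_prob *cdf) {
      int i;
      aom_cdf_prob sum = 0;
      for (i = 0; i < nsymbs; ++i) {
        sum += probs[i];
        cdf[i] = sum; /* cdf[nsymbs - 1] ends up equal to CDF_TOP */
      }
    }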
diff --git a/av1/encoder/encodeframe.c b/av1/encoder/encodeframe.c
index 04d5282..d3b97d6 100644
--- a/av1/encoder/encodeframe.c
+++ b/av1/encoder/encodeframe.c
@@ -262,6 +262,8 @@
   MACROBLOCKD *const xd = &x->e_mbd;
   const int mi_width = num_8x8_blocks_wide_lookup[bsize];
   const int mi_height = num_8x8_blocks_high_lookup[bsize];
+  const int bwl = b_width_log2_lookup[AOMMAX(bsize, BLOCK_8X8)];
+  const int bhl = b_height_log2_lookup[AOMMAX(bsize, BLOCK_8X8)];
 
   set_skip_context(xd, mi_row, mi_col);
 
@@ -284,6 +286,8 @@
   x->mv_row_max = (cm->mi_rows - mi_row) * MI_SIZE + AOM_INTERP_EXTEND;
   x->mv_col_max = (cm->mi_cols - mi_col) * MI_SIZE + AOM_INTERP_EXTEND;
 
+  set_plane_n4(xd, mi_width, mi_height, bwl, bhl);
+
   // Set up distance of MB to edge of frame in 1/8th pel units.
   assert(!(mi_col & (mi_width - 1)) && !(mi_row & (mi_height - 1)));
   set_mi_row_col(xd, tile, mi_row, mi_height, mi_col, mi_width, cm->mi_rows,
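The encoder's set_offsets() now calls set_plane_n4() before use, so the per-plane n4_w/n4_h fields (block dimensions in 4x4 units, the same fields the deleted dec_set_contexts() read) are populated from the luma size and each plane's chroma subsampling, presumably matching the decoder's setup. A hedged sketch of the kind of bookkeeping involved; field and parameter names follow the usage visible in this patch, not necessarily the exact libaom definition:

    /* Sketch: per-plane block dimensions in 4x4 units and their log2,
     * derived from the luma size (mi units are 8x8, i.e. two 4x4 blocks
     * per axis) and each plane's chroma subsampling. */
    struct plane_dims { int n4_w, n4_h, n4_wl, n4_hl; };

    static void set_plane_n4_sketch(struct plane_dims *planes, int num_planes,
                                    const int *ss_x, const int *ss_y,
                                    int mi_width, int mi_height, int bwl,
                                    int bhl) {
      int i;
      for (i = 0; i < num_planes; ++i) {
        planes[i].n4_w = (mi_width << 1) >> ss_x[i];
        planes[i].n4_h = (mi_height << 1) >> ss_y[i];
        planes[i].n4_wl = bwl - ss_x[i];
        planes[i].n4_hl = bhl - ss_y[i];
      }
    }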
diff --git a/av1/encoder/encodemv.c b/av1/encoder/encodemv.c
index da6f35c..7276fee 100644
--- a/av1/encoder/encodemv.c
+++ b/av1/encoder/encodemv.c
@@ -23,12 +23,10 @@
 static struct av1_token mv_joint_encodings[MV_JOINTS];
 static struct av1_token mv_class_encodings[MV_CLASSES];
 static struct av1_token mv_fp_encodings[MV_FP_SIZE];
-static struct av1_token mv_class0_encodings[CLASS0_SIZE];
 
 void av1_entropy_mv_init(void) {
   av1_tokens_from_tree(mv_joint_encodings, av1_mv_joint_tree);
   av1_tokens_from_tree(mv_class_encodings, av1_mv_class_tree);
-  av1_tokens_from_tree(mv_class0_encodings, av1_mv_class0_tree);
   av1_tokens_from_tree(mv_fp_encodings, av1_mv_fp_tree);
 }
 
@@ -53,8 +51,7 @@
 
   // Integer bits
   if (mv_class == MV_CLASS_0) {
-    av1_write_token(w, av1_mv_class0_tree, mvcomp->class0,
-                    &mv_class0_encodings[d]);
+    aom_write(w, d, mvcomp->class0[0]);
   } else {
     int i;
     const int n = mv_class + CLASS0_BITS - 1;  // number of bits
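Dropping mv_class0_encodings works because av1_mv_class0_tree has only CLASS0_SIZE (= 1 << CLASS0_BITS = 2) leaves: a two-leaf tree is a single binary decision, so writing token d reduces to one aom_write() of the bit d against the lone class0 probability, mvcomp->class0[0]. A compilable sketch of that equivalence, with a stub standing in for the writer:

    #include <assert.h>

    /* Stub standing in for aom_write(): codes one binary symbol, where
     * prob/256 is the probability of coding a zero. */
    static void write_bit_stub(int bit, int prob) {
      (void)bit;
      (void)prob;
    }

    /* With exactly two leaves, walking the token tree for d in {0, 1}
     * visits one internal node and emits a single bit. */
    static void write_class0_sketch(int d, int class0_prob) {
      assert(d == 0 || d == 1);
      write_bit_stub(d, class0_prob);
    }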
diff --git a/av1/encoder/encoder.c b/av1/encoder/encoder.c
index c22c5a8..f1a6f72 100644
--- a/av1/encoder/encoder.c
+++ b/av1/encoder/encoder.c
@@ -422,39 +422,6 @@
   aom_free(cpi->segmentation_map);
   cpi->segmentation_map = NULL;
 
-#if CONFIG_REF_MV
-  for (i = 0; i < NMV_CONTEXTS; ++i) {
-    aom_free(cpi->nmv_costs[i][0]);
-    aom_free(cpi->nmv_costs[i][1]);
-    aom_free(cpi->nmv_costs_hp[i][0]);
-    aom_free(cpi->nmv_costs_hp[i][1]);
-    cpi->nmv_costs[i][0] = NULL;
-    cpi->nmv_costs[i][1] = NULL;
-    cpi->nmv_costs_hp[i][0] = NULL;
-    cpi->nmv_costs_hp[i][1] = NULL;
-  }
-#endif
-
-  aom_free(cpi->nmvcosts[0]);
-  aom_free(cpi->nmvcosts[1]);
-  cpi->nmvcosts[0] = NULL;
-  cpi->nmvcosts[1] = NULL;
-
-  aom_free(cpi->nmvcosts_hp[0]);
-  aom_free(cpi->nmvcosts_hp[1]);
-  cpi->nmvcosts_hp[0] = NULL;
-  cpi->nmvcosts_hp[1] = NULL;
-
-  aom_free(cpi->nmvsadcosts[0]);
-  aom_free(cpi->nmvsadcosts[1]);
-  cpi->nmvsadcosts[0] = NULL;
-  cpi->nmvsadcosts[1] = NULL;
-
-  aom_free(cpi->nmvsadcosts_hp[0]);
-  aom_free(cpi->nmvsadcosts_hp[1]);
-  cpi->nmvsadcosts_hp[0] = NULL;
-  cpi->nmvsadcosts_hp[1] = NULL;
-
   av1_cyclic_refresh_free(cpi->cyclic_refresh);
   cpi->cyclic_refresh = NULL;
 
@@ -512,27 +479,15 @@
 #if CONFIG_REF_MV
   for (i = 0; i < NMV_CONTEXTS; ++i) {
     av1_copy(cc->nmv_vec_cost[i], cpi->td.mb.nmv_vec_cost[i]);
-    memcpy(cc->nmv_costs[i][0], cpi->nmv_costs[i][0],
-           MV_VALS * sizeof(*cpi->nmv_costs[i][0]));
-    memcpy(cc->nmv_costs[i][1], cpi->nmv_costs[i][1],
-           MV_VALS * sizeof(*cpi->nmv_costs[i][1]));
-    memcpy(cc->nmv_costs_hp[i][0], cpi->nmv_costs_hp[i][0],
-           MV_VALS * sizeof(*cpi->nmv_costs_hp[i][0]));
-    memcpy(cc->nmv_costs_hp[i][1], cpi->nmv_costs_hp[i][1],
-           MV_VALS * sizeof(*cpi->nmv_costs_hp[i][1]));
+    av1_copy(cc->nmv_costs, cpi->nmv_costs);
+    av1_copy(cc->nmv_costs_hp, cpi->nmv_costs_hp);
   }
 #else
   av1_copy(cc->nmvjointcost, cpi->td.mb.nmvjointcost);
 #endif
 
-  memcpy(cc->nmvcosts[0], cpi->nmvcosts[0],
-         MV_VALS * sizeof(*cpi->nmvcosts[0]));
-  memcpy(cc->nmvcosts[1], cpi->nmvcosts[1],
-         MV_VALS * sizeof(*cpi->nmvcosts[1]));
-  memcpy(cc->nmvcosts_hp[0], cpi->nmvcosts_hp[0],
-         MV_VALS * sizeof(*cpi->nmvcosts_hp[0]));
-  memcpy(cc->nmvcosts_hp[1], cpi->nmvcosts_hp[1],
-         MV_VALS * sizeof(*cpi->nmvcosts_hp[1]));
+  av1_copy(cc->nmvcosts, cpi->nmvcosts);
+  av1_copy(cc->nmvcosts_hp, cpi->nmvcosts_hp);
 
   av1_copy(cc->last_ref_lf_deltas, cm->lf.last_ref_deltas);
   av1_copy(cc->last_mode_lf_deltas, cm->lf.last_mode_deltas);
@@ -552,25 +507,15 @@
 #if CONFIG_REF_MV
   for (i = 0; i < NMV_CONTEXTS; ++i) {
     av1_copy(cpi->td.mb.nmv_vec_cost[i], cc->nmv_vec_cost[i]);
-    memcpy(cpi->nmv_costs[i][0], cc->nmv_costs[i][0],
-           MV_VALS * sizeof(*cc->nmv_costs[i][0]));
-    memcpy(cpi->nmv_costs[i][1], cc->nmv_costs[i][1],
-           MV_VALS * sizeof(*cc->nmv_costs[i][1]));
-    memcpy(cpi->nmv_costs_hp[i][0], cc->nmv_costs_hp[i][0],
-           MV_VALS * sizeof(*cc->nmv_costs_hp[i][0]));
-    memcpy(cpi->nmv_costs_hp[i][1], cc->nmv_costs_hp[i][1],
-           MV_VALS * sizeof(*cc->nmv_costs_hp[i][1]));
+    av1_copy(cpi->nmv_costs, cc->nmv_costs);
+    av1_copy(cpi->nmv_costs_hp, cc->nmv_costs_hp);
   }
 #else
   av1_copy(cpi->td.mb.nmvjointcost, cc->nmvjointcost);
 #endif
 
-  memcpy(cpi->nmvcosts[0], cc->nmvcosts[0], MV_VALS * sizeof(*cc->nmvcosts[0]));
-  memcpy(cpi->nmvcosts[1], cc->nmvcosts[1], MV_VALS * sizeof(*cc->nmvcosts[1]));
-  memcpy(cpi->nmvcosts_hp[0], cc->nmvcosts_hp[0],
-         MV_VALS * sizeof(*cc->nmvcosts_hp[0]));
-  memcpy(cpi->nmvcosts_hp[1], cc->nmvcosts_hp[1],
-         MV_VALS * sizeof(*cc->nmvcosts_hp[1]));
+  av1_copy(cpi->nmvcosts, cc->nmvcosts);
+  av1_copy(cpi->nmvcosts_hp, cc->nmvcosts_hp);
 
   av1_copy(cm->lf.last_ref_deltas, cc->last_ref_lf_deltas);
   av1_copy(cm->lf.last_mode_deltas, cc->last_mode_lf_deltas);
@@ -2117,33 +2062,15 @@
 
 #if CONFIG_REF_MV
   for (i = 0; i < NMV_CONTEXTS; ++i) {
-    CHECK_MEM_ERROR(cm, cpi->nmv_costs[i][0],
-                    aom_calloc(MV_VALS, sizeof(*cpi->nmv_costs[i][0])));
-    CHECK_MEM_ERROR(cm, cpi->nmv_costs[i][1],
-                    aom_calloc(MV_VALS, sizeof(*cpi->nmv_costs[i][1])));
-    CHECK_MEM_ERROR(cm, cpi->nmv_costs_hp[i][0],
-                    aom_calloc(MV_VALS, sizeof(*cpi->nmv_costs_hp[i][0])));
-    CHECK_MEM_ERROR(cm, cpi->nmv_costs_hp[i][1],
-                    aom_calloc(MV_VALS, sizeof(*cpi->nmv_costs_hp[i][1])));
+    memset(cpi->nmv_costs, 0, sizeof(cpi->nmv_costs));
+    memset(cpi->nmv_costs_hp, 0, sizeof(cpi->nmv_costs_hp));
   }
 #endif
 
-  CHECK_MEM_ERROR(cm, cpi->nmvcosts[0],
-                  aom_calloc(MV_VALS, sizeof(*cpi->nmvcosts[0])));
-  CHECK_MEM_ERROR(cm, cpi->nmvcosts[1],
-                  aom_calloc(MV_VALS, sizeof(*cpi->nmvcosts[1])));
-  CHECK_MEM_ERROR(cm, cpi->nmvcosts_hp[0],
-                  aom_calloc(MV_VALS, sizeof(*cpi->nmvcosts_hp[0])));
-  CHECK_MEM_ERROR(cm, cpi->nmvcosts_hp[1],
-                  aom_calloc(MV_VALS, sizeof(*cpi->nmvcosts_hp[1])));
-  CHECK_MEM_ERROR(cm, cpi->nmvsadcosts[0],
-                  aom_calloc(MV_VALS, sizeof(*cpi->nmvsadcosts[0])));
-  CHECK_MEM_ERROR(cm, cpi->nmvsadcosts[1],
-                  aom_calloc(MV_VALS, sizeof(*cpi->nmvsadcosts[1])));
-  CHECK_MEM_ERROR(cm, cpi->nmvsadcosts_hp[0],
-                  aom_calloc(MV_VALS, sizeof(*cpi->nmvsadcosts_hp[0])));
-  CHECK_MEM_ERROR(cm, cpi->nmvsadcosts_hp[1],
-                  aom_calloc(MV_VALS, sizeof(*cpi->nmvsadcosts_hp[1])));
+  memset(cpi->nmvcosts, 0, sizeof(cpi->nmvcosts));
+  memset(cpi->nmvcosts_hp, 0, sizeof(cpi->nmvcosts_hp));
+  memset(cpi->nmvsadcosts, 0, sizeof(cpi->nmvsadcosts));
+  memset(cpi->nmvsadcosts_hp, 0, sizeof(cpi->nmvsadcosts_hp));
 
   for (i = 0; i < (sizeof(cpi->mbgraph_stats) / sizeof(cpi->mbgraph_stats[0]));
        i++) {
diff --git a/av1/encoder/encoder.h b/av1/encoder/encoder.h
index b55481b..0c66905 100644
--- a/av1/encoder/encoder.h
+++ b/av1/encoder/encoder.h
@@ -414,14 +414,14 @@
   CODING_CONTEXT coding_context;
 
 #if CONFIG_REF_MV
-  int *nmv_costs[NMV_CONTEXTS][2];
-  int *nmv_costs_hp[NMV_CONTEXTS][2];
+  int nmv_costs[NMV_CONTEXTS][2][MV_VALS];
+  int nmv_costs_hp[NMV_CONTEXTS][2][MV_VALS];
 #endif
 
-  int *nmvcosts[2];
-  int *nmvcosts_hp[2];
-  int *nmvsadcosts[2];
-  int *nmvsadcosts_hp[2];
+  int nmvcosts[2][MV_VALS];
+  int nmvcosts_hp[2][MV_VALS];
+  int nmvsadcosts[2][MV_VALS];
+  int nmvsadcosts_hp[2][MV_VALS];
 
   int64_t last_time_stamp_seen;
   int64_t last_end_time_stamp_seen;
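Turning the MV cost tables from heap pointers (int *nmvcosts[2]) into fixed-size members (int nmvcosts[2][MV_VALS]) is what lets the allocation, free, and copy boilerplate in encoder.c collapse into memset() and av1_copy(): sizeof now sees the whole extent, and the CHECK_MEM_ERROR failure paths disappear. In this tree av1_copy() is essentially a size-checked whole-object memcpy; a minimal equivalent:

    #include <assert.h>
    #include <string.h>

    /* Minimal equivalent of the av1_copy() macro used above: copy a whole
     * array (or struct) in one memcpy, with a sizeof consistency check.
     * This only works on true arrays, never on pointers. */
    #define COPY_WHOLE(dest, src)            \
      do {                                   \
        assert(sizeof(dest) == sizeof(src)); \
        memcpy((dest), (src), sizeof(src));  \
      } while (0)

One detail worth noting: the whole-array av1_copy()/memset() calls cover every context at once yet still sit inside the NMV_CONTEXTS loops, so they execute NMV_CONTEXTS times; redundant, though harmless.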
diff --git a/av1/encoder/rdopt.c b/av1/encoder/rdopt.c
index 8d151a7..8ba6b7b 100644
--- a/av1/encoder/rdopt.c
+++ b/av1/encoder/rdopt.c
@@ -1771,8 +1771,7 @@
   if (colors > 1 && colors <= 64) {
     int r, c, i, j, k;
     const int max_itr = 50;
-    int color_ctx, color_idx = 0;
-    int color_order[PALETTE_MAX_SIZE];
+    uint8_t color_order[PALETTE_MAX_SIZE];
     float *const data = x->palette_buffer->kmeans_data_buf;
     float centroids[PALETTE_MAX_SIZE];
     uint8_t *const color_map = xd->plane[0].color_index_map;
@@ -1856,13 +1855,9 @@
               1);
       for (i = 0; i < rows; ++i) {
         for (j = (i == 0 ? 1 : 0); j < cols; ++j) {
-          color_ctx = av1_get_palette_color_context(color_map, cols, i, j, k,
-                                                    color_order);
-          for (r = 0; r < k; ++r)
-            if (color_map[i * cols + j] == color_order[r]) {
-              color_idx = r;
-              break;
-            }
+          int color_idx;
+          const int color_ctx = av1_get_palette_color_context(
+              color_map, cols, i, j, k, color_order, &color_idx);
           assert(color_idx >= 0 && color_idx < k);
           this_rate += cpi->palette_y_color_cost[k - 2][color_ctx][color_idx];
         }
@@ -2507,7 +2502,7 @@
 
 static void angle_estimation(const uint8_t *src, int src_stride, int rows,
                              int cols, uint8_t *directional_mode_skip_mask) {
-  int i, r, c, dx, dy, temp, sn, remd, quot;
+  int i, r, c, index, dx, dy, temp, sn, remd, quot;
   uint64_t hist[DIRECTIONAL_MODES];
   uint64_t hist_sum = 0;
 
@@ -2515,7 +2510,6 @@
   src += src_stride;
   for (r = 1; r < rows; ++r) {
     for (c = 1; c < cols; ++c) {
-      uint8_t index;
       dx = src[c] - src[c - 1];
       dy = src[c] - src[c - src_stride];
       temp = dx * dx + dy * dy;
@@ -2538,16 +2532,16 @@
   for (i = 0; i < DIRECTIONAL_MODES; ++i) hist_sum += hist[i];
   for (i = 0; i < INTRA_MODES; ++i) {
     if (i != DC_PRED && i != TM_PRED) {
-      const uint8_t index = mode_to_angle_bin[i];
-      uint64_t score = 2 * hist[index];
+      const uint8_t angle_bin = mode_to_angle_bin[i];
+      uint64_t score = 2 * hist[angle_bin];
       int weight = 2;
-      if (index > 0) {
-        score += hist[index - 1];
-        weight += 1;
+      if (angle_bin > 0) {
+        score += hist[angle_bin - 1];
+        ++weight;
       }
-      if (index < DIRECTIONAL_MODES - 1) {
-        score += hist[index + 1];
-        weight += 1;
+      if (angle_bin < DIRECTIONAL_MODES - 1) {
+        score += hist[angle_bin + 1];
+        ++weight;
       }
       if (score * ANGLE_SKIP_THRESH < hist_sum * weight)
         directional_mode_skip_mask[i] = 1;
@@ -2559,7 +2553,7 @@
 static void highbd_angle_estimation(const uint8_t *src8, int src_stride,
                                     int rows, int cols,
                                     uint8_t *directional_mode_skip_mask) {
-  int i, r, c, dx, dy, temp, sn, remd, quot;
+  int i, r, c, index, dx, dy, temp, sn, remd, quot;
   uint64_t hist[DIRECTIONAL_MODES];
   uint64_t hist_sum = 0;
   uint16_t *src = CONVERT_TO_SHORTPTR(src8);
@@ -2568,7 +2562,6 @@
   src += src_stride;
   for (r = 1; r < rows; ++r) {
     for (c = 1; c < cols; ++c) {
-      uint8_t index;
       dx = src[c] - src[c - 1];
       dy = src[c] - src[c - src_stride];
       temp = dx * dx + dy * dy;
@@ -2591,16 +2584,16 @@
   for (i = 0; i < DIRECTIONAL_MODES; ++i) hist_sum += hist[i];
   for (i = 0; i < INTRA_MODES; ++i) {
     if (i != DC_PRED && i != TM_PRED) {
-      const uint8_t index = mode_to_angle_bin[i];
-      uint64_t score = 2 * hist[index];
+      const uint8_t angle_bin = mode_to_angle_bin[i];
+      uint64_t score = 2 * hist[angle_bin];
       int weight = 2;
-      if (index > 0) {
-        score += hist[index - 1];
-        weight += 1;
+      if (angle_bin > 0) {
+        score += hist[angle_bin - 1];
+        ++weight;
       }
-      if (index < DIRECTIONAL_MODES - 1) {
-        score += hist[index + 1];
-        weight += 1;
+      if (angle_bin < DIRECTIONAL_MODES - 1) {
+        score += hist[angle_bin + 1];
+        ++weight;
       }
       if (score * ANGLE_SKIP_THRESH < hist_sum * weight)
         directional_mode_skip_mask[i] = 1;
@@ -3654,8 +3647,7 @@
   if (colors > 1 && colors <= 64) {
     int r, c, n, i, j;
     const int max_itr = 50;
-    int color_ctx, color_idx = 0;
-    int color_order[PALETTE_MAX_SIZE];
+    uint8_t color_order[PALETTE_MAX_SIZE];
     int64_t this_sse;
     float lb_u, ub_u, val_u;
     float lb_v, ub_v, val_v;
@@ -3748,13 +3740,9 @@
 
       for (i = 0; i < rows; ++i) {
         for (j = (i == 0 ? 1 : 0); j < cols; ++j) {
-          color_ctx = av1_get_palette_color_context(color_map, cols, i, j, n,
-                                                    color_order);
-          for (r = 0; r < n; ++r)
-            if (color_map[i * cols + j] == color_order[r]) {
-              color_idx = r;
-              break;
-            }
+          int color_idx;
+          const int color_ctx = av1_get_palette_color_context(
+              color_map, cols, i, j, n, color_order, &color_idx);
           assert(color_idx >= 0 && color_idx < n);
           this_rate += cpi->palette_uv_color_cost[n - 2][color_ctx][color_idx];
         }
@@ -9385,7 +9373,7 @@
     int best_rate_nocoef;
 #endif
     int64_t distortion2 = 0, distortion_y = 0, dummy_rd = best_rd, this_rd;
-    int skippable = 0;
+    int skippable = 0, rate_overhead = 0;
     TX_SIZE best_tx_size, uv_tx;
     TX_TYPE best_tx_type;
     PALETTE_MODE_INFO palette_mode_info;
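In angle_estimation() and its high-bit-depth twin, hoisting index and renaming the second loop's variable to angle_bin avoids the inner declarations shadowing it (configure now adds -Wshadow for C). The smoothing-and-threshold logic itself remains duplicated verbatim in both paths; a sketch of what one shared helper for that test could look like, with placeholder constants mirroring DIRECTIONAL_MODES and ANGLE_SKIP_THRESH:

    #include <stdint.h>

    #define N_BINS 8       /* placeholder for DIRECTIONAL_MODES */
    #define SKIP_THRESH 10 /* placeholder for ANGLE_SKIP_THRESH */

    /* One mode's smoothed score: its own histogram bin counted twice plus
     * each available neighbor. The mode is pruned when the per-bin average
     * falls below hist_sum / SKIP_THRESH, written multiplicatively to
     * avoid division: score * SKIP_THRESH < hist_sum * weight. */
    static int should_skip_angle(const uint64_t *hist, uint64_t hist_sum,
                                 int bin) {
      uint64_t score = 2 * hist[bin];
      int weight = 2;
      if (bin > 0) {
        score += hist[bin - 1];
        ++weight;
      }
      if (bin < N_BINS - 1) {
        score += hist[bin + 1];
        ++weight;
      }
      return score * SKIP_THRESH < hist_sum * weight;
    }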
diff --git a/av1/encoder/tokenize.c b/av1/encoder/tokenize.c
index fd0f76b..67f4b5d 100644
--- a/av1/encoder/tokenize.c
+++ b/av1/encoder/tokenize.c
@@ -369,8 +369,8 @@
   int rate = av1_cost_coeffs(cm, x, plane, block, pt, tx_size, scan_order->scan,
                              scan_order->neighbors, 0);
   args->this_rate += rate;
-  av1_set_contexts(xd, pd, plane_bsize, tx_size, p->eobs[block] > 0, blk_col,
-                   blk_row);
+  (void)plane_bsize;
+  av1_set_contexts(xd, pd, tx_size, p->eobs[block] > 0, blk_col, blk_row);
 }
 
 static void set_entropy_context_b(int plane, int block, int blk_row,
@@ -382,8 +382,8 @@
   MACROBLOCKD *const xd = &x->e_mbd;
   struct macroblock_plane *p = &x->plane[plane];
   struct macroblockd_plane *pd = &xd->plane[plane];
-  av1_set_contexts(xd, pd, plane_bsize, tx_size, p->eobs[block] > 0, blk_col,
-                   blk_row);
+  (void)plane_bsize;
+  av1_set_contexts(xd, pd, tx_size, p->eobs[block] > 0, blk_col, blk_row);
 }
 
 static INLINE void add_token(TOKENEXTRA **t, const aom_prob *context_tree,
@@ -410,18 +410,19 @@
 }
 
 #if CONFIG_PALETTE
-void av1_tokenize_palette_sb(const AV1_COMP *cpi, struct ThreadData *const td,
-                             int plane, TOKENEXTRA **t, RUN_TYPE dry_run,
-                             BLOCK_SIZE bsize, int *rate) {
-  MACROBLOCK *const x = &td->mb;
-  MACROBLOCKD *const xd = &x->e_mbd;
-  MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
-  uint8_t *color_map = xd->plane[plane != 0].color_index_map;
-  PALETTE_MODE_INFO *pmi = &mbmi->palette_mode_info;
-  int n = pmi->palette_size[plane != 0];
-  int i, j, k;
+void av1_tokenize_palette_sb(const AV1_COMP *cpi,
+                             const struct ThreadData *const td, int plane,
+                             TOKENEXTRA **t, RUN_TYPE dry_run, BLOCK_SIZE bsize,
+                             int *rate) {
+  const MACROBLOCK *const x = &td->mb;
+  const MACROBLOCKD *const xd = &x->e_mbd;
+  const MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+  const uint8_t *const color_map = xd->plane[plane != 0].color_index_map;
+  const PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
+  const int n = pmi->palette_size[plane != 0];
+  int i, j;
   int this_rate = 0;
-  int color_idx = -1, color_ctx, color_order[PALETTE_MAX_SIZE];
+  uint8_t color_order[PALETTE_MAX_SIZE];
   const int rows = (4 * num_4x4_blocks_high_lookup[bsize]) >>
                    (xd->plane[plane != 0].subsampling_y);
   const int cols = (4 * num_4x4_blocks_wide_lookup[bsize]) >>
@@ -432,17 +433,13 @@
 
   for (i = 0; i < rows; ++i) {
     for (j = (i == 0 ? 1 : 0); j < cols; ++j) {
-      color_ctx =
-          av1_get_palette_color_context(color_map, cols, i, j, n, color_order);
-      for (k = 0; k < n; ++k)
-        if (color_map[i * cols + j] == color_order[k]) {
-          color_idx = k;
-          break;
-        }
-      assert(color_idx >= 0 && color_idx < n);
+      int color_new_idx;
+      const int color_ctx = av1_get_palette_color_context(
+          color_map, cols, i, j, n, color_order, &color_new_idx);
+      assert(color_new_idx >= 0 && color_new_idx < n);
       if (dry_run == DRY_RUN_COSTCOEFFS)
-        this_rate += cpi->palette_y_color_cost[n - 2][color_ctx][color_idx];
-      (*t)->token = color_idx;
+        this_rate += cpi->palette_y_color_cost[n - 2][color_ctx][color_new_idx];
+      (*t)->token = color_new_idx;
       (*t)->context_tree = probs[n - 2][color_ctx];
       (*t)->skip_eob_node = 0;
       ++(*t);
@@ -501,6 +498,7 @@
   int skip_eob = 0;
   int16_t token;
   EXTRABIT extra;
+  (void)plane_bsize;
   pt = get_entropy_context(tx_size, pd->above_context + blk_col,
                            pd->left_context + blk_row);
   scan = scan_order->scan;
@@ -535,7 +533,7 @@
 
   *tp = t;
 
-  av1_set_contexts(xd, pd, plane_bsize, tx_size, c > 0, blk_col, blk_row);
+  av1_set_contexts(xd, pd, tx_size, c > 0, blk_col, blk_row);
 }
 
 struct is_skippable_args {
diff --git a/av1/encoder/tokenize.h b/av1/encoder/tokenize.h
index ae896a6..89610df 100644
--- a/av1/encoder/tokenize.h
+++ b/av1/encoder/tokenize.h
@@ -72,7 +72,7 @@
 #endif
 #if CONFIG_PALETTE
 void av1_tokenize_palette_sb(const struct AV1_COMP *cpi,
-                             struct ThreadData *const td, int plane,
+                             const struct ThreadData *const td, int plane,
                              TOKENEXTRA **t, RUN_TYPE dry_run, BLOCK_SIZE bsize,
                              int *rate);
 #endif  // CONFIG_PALETTE
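The (void)plane_bsize casts above exist because these functions are installed as foreach-block callbacks, so the parameter cannot simply be deleted from the signature; the cast marks it as deliberately unused and keeps the newly enabled -Wunused/-Wextra quiet. (C++ files in this patch, such as dct16x16_test.cc below, use the other idiom: commenting out the parameter name.) A minimal C illustration:

    #include <stdio.h>

    /* The callback type fixes the signature, so an argument a particular
     * callback no longer needs is consumed with a void cast rather than
     * removed from the prototype. */
    typedef void (*block_visitor)(int plane, int block);

    static void visit_plane_only(int plane, int block) {
      (void)block; /* kept for the callback signature; silences -Wunused */
      printf("visiting plane %d\n", plane);
    }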
diff --git a/configure b/configure
index 99d2bb8..2659d37 100755
--- a/configure
+++ b/configure
@@ -616,20 +616,18 @@
         check_add_cflags -Wvla
         check_add_cflags -Wimplicit-function-declaration
         check_add_cflags -Wuninitialized
-        check_add_cflags -Wunused-variable
+        check_add_cflags -Wunused
         check_add_cflags -Wsign-compare
+        # Enabling the following warning (in combination with -Wunused above)
+        # for C++ generates errors in third_party code including googletest and
+        # libyuv. So enable it only for C code.
+        check_cflags "-Wextra" && add_cflags_only "-Wextra"
         # Enabling the following warning for C++ generates some useless warnings
         # about some function parameters shadowing class member function names.
         # So, only enable this warning for C code.
         check_cflags "-Wshadow" && add_cflags_only "-Wshadow"
-        case ${CC} in
-          *clang*) ;;
-          *) check_add_cflags -Wunused-but-set-variable ;;
-        esac
         if enabled mips || [ -z "${INLINE}" ]; then
           enabled extra_warnings || check_add_cflags -Wno-unused-function
-        else
-          check_add_cflags -Wunused-function
         fi
     fi
 
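Replacing -Wunused-variable (plus the compiler-specific -Wunused-but-set-variable branch) with plain -Wunused broadens coverage: on GCC, -Wunused turns on the whole -Wunused-* family, including variables that are assigned but never read, which is the kind of dead code being removed throughout this patch (testMaxBlk, mv_class0_encodings, the unused cfg and fps parameters). An example of code the wider flag now rejects, assuming GCC semantics (Clang's coverage differs slightly):

    /* `sum` is assigned on every iteration but never read, so GCC's
     * -Wunused (via -Wunused-but-set-variable) warns about it. */
    static int count_positive(const int *v, int n) {
      int i, count = 0;
      int sum = 0; /* set but never read: warned under -Wunused */
      for (i = 0; i < n; ++i) {
        sum += v[i];
        if (v[i] > 0) ++count;
      }
      return count;
    }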
diff --git a/examples/aom_cx_set_ref.c b/examples/aom_cx_set_ref.c
index fdb9739..6beb4fb 100644
--- a/examples/aom_cx_set_ref.c
+++ b/examples/aom_cx_set_ref.c
@@ -191,8 +191,7 @@
 }
 
 static void testing_decode(aom_codec_ctx_t *encoder, aom_codec_ctx_t *decoder,
-                           aom_codec_enc_cfg_t *cfg, unsigned int frame_out,
-                           int *mismatch_seen) {
+                           unsigned int frame_out, int *mismatch_seen) {
   aom_image_t enc_img, dec_img;
   struct av1_ref_frame ref_enc, ref_dec;
 
@@ -226,11 +225,10 @@
   aom_img_free(&dec_img);
 }
 
-static int encode_frame(aom_codec_ctx_t *ecodec, aom_codec_enc_cfg_t *cfg,
-                        aom_image_t *img, unsigned int frame_in,
-                        AvxVideoWriter *writer, int test_decode,
-                        aom_codec_ctx_t *dcodec, unsigned int *frame_out,
-                        int *mismatch_seen) {
+static int encode_frame(aom_codec_ctx_t *ecodec, aom_image_t *img,
+                        unsigned int frame_in, AvxVideoWriter *writer,
+                        int test_decode, aom_codec_ctx_t *dcodec,
+                        unsigned int *frame_out, int *mismatch_seen) {
   int got_pkts = 0;
   aom_codec_iter_t iter = NULL;
   const aom_codec_cx_pkt_t *pkt = NULL;
@@ -271,7 +269,7 @@
 
   // Mismatch checking
   if (got_data && test_decode) {
-    testing_decode(ecodec, dcodec, cfg, *frame_out, mismatch_seen);
+    testing_decode(ecodec, dcodec, *frame_out, mismatch_seen);
   }
 
   return got_pkts;
@@ -280,12 +278,12 @@
 int main(int argc, char **argv) {
   FILE *infile = NULL;
   // Encoder
-  aom_codec_ctx_t ecodec = { 0 };
-  aom_codec_enc_cfg_t cfg = { 0 };
+  aom_codec_ctx_t ecodec;
+  aom_codec_enc_cfg_t cfg;
   unsigned int frame_in = 0;
   aom_image_t raw;
   aom_codec_err_t res;
-  AvxVideoInfo info = { 0 };
+  AvxVideoInfo info;
   AvxVideoWriter *writer = NULL;
   const AvxInterface *encoder = NULL;
 
@@ -311,6 +309,12 @@
   unsigned int limit = 0;
   exec_name = argv[0];
 
+  // Clear explicitly: initializing with "{ 0 }" generates a
+  // "missing-field-initializers" warning in some compilers.
+  memset(&ecodec, 0, sizeof(ecodec));
+  memset(&cfg, 0, sizeof(cfg));
+  memset(&info, 0, sizeof(info));
+
   if (argc < 7) die("Invalid number of arguments");
 
   codec_arg = argv[1];
@@ -404,7 +408,7 @@
       }
     }
 
-    encode_frame(&ecodec, &cfg, &raw, frame_in, writer, test_decode, &dcodec,
+    encode_frame(&ecodec, &raw, frame_in, writer, test_decode, &dcodec,
                  &frame_out, &mismatch_seen);
     frame_in++;
     if (mismatch_seen) break;
@@ -412,8 +416,8 @@
 
   // Flush encoder.
   if (!mismatch_seen)
-    while (encode_frame(&ecodec, &cfg, NULL, frame_in, writer, test_decode,
-                        &dcodec, &frame_out, &mismatch_seen)) {
+    while (encode_frame(&ecodec, NULL, frame_in, writer, test_decode, &dcodec,
+                        &frame_out, &mismatch_seen)) {
     }
 
   printf("\n");
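The { 0 } initializers in the examples are replaced because, with -Wextra now on, some compilers emit -Wmissing-field-initializers for a brace list that names fewer initializers than the struct has fields; memset() zeroes the object without going through initializer rules at all. A small illustration of the two forms (the struct name is illustrative only):

    #include <string.h>

    struct video_info_example { int codec_fourcc; int width; int height; };

    static void make_info(struct video_info_example *out) {
      /* struct video_info_example info = { 0 };  // may trip
       * -Wmissing-field-initializers under -Wextra on some compilers */
      memset(out, 0, sizeof(*out)); /* warning-free equivalent */
    }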
diff --git a/examples/lossless_encoder.c b/examples/lossless_encoder.c
index 069e35e..1abeb27 100644
--- a/examples/lossless_encoder.c
+++ b/examples/lossless_encoder.c
@@ -63,13 +63,17 @@
   int frame_count = 0;
   aom_image_t raw;
   aom_codec_err_t res;
-  AvxVideoInfo info = { 0 };
+  AvxVideoInfo info;
   AvxVideoWriter *writer = NULL;
   const AvxInterface *encoder = NULL;
   const int fps = 30;
 
   exec_name = argv[0];
 
+  // Clear explicitly: initializing with "{ 0 }" generates a
+  // "missing-field-initializers" warning in some compilers.
+  memset(&info, 0, sizeof(info));
+
   if (argc < 5) die("Invalid number of arguments");
 
   encoder = get_aom_encoder_by_name("av1");
diff --git a/examples/simple_encoder.c b/examples/simple_encoder.c
index 418757d..1d2b51e 100644
--- a/examples/simple_encoder.c
+++ b/examples/simple_encoder.c
@@ -151,7 +151,7 @@
   int frame_count = 0;
   aom_image_t raw;
   aom_codec_err_t res;
-  AvxVideoInfo info = { 0 };
+  AvxVideoInfo info;
   AvxVideoWriter *writer = NULL;
   const AvxInterface *encoder = NULL;
   const int fps = 30;
@@ -168,6 +168,10 @@
 
   exec_name = argv[0];
 
+  // Clear explicitly: initializing with "{ 0 }" generates a
+  // "missing-field-initializers" warning in some compilers.
+  memset(&info, 0, sizeof(info));
+
   if (argc != 9) die("Invalid number of arguments");
 
   codec_arg = argv[1];
diff --git a/test/av1_convolve_optimz_test.cc b/test/av1_convolve_optimz_test.cc
index b83ae94..b891e99 100644
--- a/test/av1_convolve_optimz_test.cc
+++ b/test/av1_convolve_optimz_test.cc
@@ -54,7 +54,6 @@
 const size_t maxBlockSize = maxWidth * maxHeight;
 const int horizOffset = 32;
 const int vertiOffset = 32;
-const size_t testMaxBlk = 128;
 const int stride = 128;
 const int x_step_q4 = 16;
 
@@ -90,7 +89,7 @@
   void RunVertFilterBitExactCheck();
 
  private:
-  void PrepFilterBuffer(int w, int h);
+  void PrepFilterBuffer();
   void DiffFilterBuffer();
   conv_filter_t conv_horiz_;
   conv_filter_t conv_vert_;
@@ -106,7 +105,7 @@
   int avg_;
 };
 
-void AV1ConvolveOptimzTest::PrepFilterBuffer(int w, int h) {
+void AV1ConvolveOptimzTest::PrepFilterBuffer() {
   int r, c;
   ACMRandom rnd(ACMRandom::DeterministicSeed());
 
@@ -150,7 +149,7 @@
 }
 
 void AV1ConvolveOptimzTest::RunHorizFilterBitExactCheck() {
-  PrepFilterBuffer(testMaxBlk, testMaxBlk);
+  PrepFilterBuffer();
 
   InterpFilterParams filter_params = av1_get_interp_filter_params(filter_);
 
@@ -167,7 +166,7 @@
   // and test again.
   int intermediate_height =
       (((height_ - 1) * 16 + subpel_) >> SUBPEL_BITS) + filter_params.taps;
-  PrepFilterBuffer(testMaxBlk, testMaxBlk);
+  PrepFilterBuffer();
 
   av1_convolve_horiz_c(src_ref_, stride, dst_ref_, stride, width_,
                        intermediate_height, filter_params, subpel_, x_step_q4,
@@ -180,7 +179,7 @@
 }
 
 void AV1ConvolveOptimzTest::RunVertFilterBitExactCheck() {
-  PrepFilterBuffer(testMaxBlk, testMaxBlk);
+  PrepFilterBuffer();
 
   InterpFilterParams filter_params = av1_get_interp_filter_params(filter_);
 
@@ -266,7 +265,7 @@
   void RunVertFilterBitExactCheck();
 
  private:
-  void PrepFilterBuffer(int w, int h);
+  void PrepFilterBuffer();
   void DiffFilterBuffer();
   hbd_conv_filter_t conv_horiz_;
   hbd_conv_filter_t conv_vert_;
@@ -283,7 +282,7 @@
   int bit_depth_;
 };
 
-void AV1HbdConvolveOptimzTest::PrepFilterBuffer(int w, int h) {
+void AV1HbdConvolveOptimzTest::PrepFilterBuffer() {
   int r, c;
   ACMRandom rnd(ACMRandom::DeterministicSeed());
 
@@ -326,7 +325,7 @@
 }
 
 void AV1HbdConvolveOptimzTest::RunHorizFilterBitExactCheck() {
-  PrepFilterBuffer(testMaxBlk, testMaxBlk);
+  PrepFilterBuffer();
 
   InterpFilterParams filter_params = av1_get_interp_filter_params(filter_);
 
@@ -344,7 +343,7 @@
   // and test again.
   int intermediate_height =
       (((height_ - 1) * 16 + subpel_) >> SUBPEL_BITS) + filter_params.taps;
-  PrepFilterBuffer(testMaxBlk, testMaxBlk);
+  PrepFilterBuffer();
 
   av1_highbd_convolve_horiz_c(src_, stride, dst_ref_, stride, width_,
                               intermediate_height, filter_params, subpel_,
@@ -357,7 +356,7 @@
 }
 
 void AV1HbdConvolveOptimzTest::RunVertFilterBitExactCheck() {
-  PrepFilterBuffer(testMaxBlk, testMaxBlk);
+  PrepFilterBuffer();
 
   InterpFilterParams filter_params = av1_get_interp_filter_params(filter_);
 
diff --git a/test/codec_factory.h b/test/codec_factory.h
index c92d5c1..b645102 100644
--- a/test/codec_factory.h
+++ b/test/codec_factory.h
@@ -123,6 +123,9 @@
 #if CONFIG_AV1_DECODER
     return new AV1Decoder(cfg, flags, deadline);
 #else
+    (void)cfg;
+    (void)flags;
+    (void)deadline;
     return NULL;
 #endif
   }
@@ -134,6 +137,10 @@
 #if CONFIG_AV1_ENCODER
     return new AV1Encoder(cfg, deadline, init_flags, stats);
 #else
+    (void)cfg;
+    (void)deadline;
+    (void)init_flags;
+    (void)stats;
     return NULL;
 #endif
   }
@@ -143,6 +150,8 @@
 #if CONFIG_AV1_ENCODER
     return aom_codec_enc_config_default(&aom_codec_av1_cx_algo, cfg, usage);
 #else
+    (void)cfg;
+    (void)usage;
     return AOM_CODEC_INCAPABLE;
 #endif
   }
diff --git a/test/dct16x16_test.cc b/test/dct16x16_test.cc
index e73daa5..9811955 100644
--- a/test/dct16x16_test.cc
+++ b/test/dct16x16_test.cc
@@ -264,12 +264,12 @@
 }
 
 void idct16x16_10_ref(const tran_low_t *in, uint8_t *out, int stride,
-                      int tx_type) {
+                      int /*tx_type*/) {
   idct16x16_10(in, out, stride);
 }
 
 void idct16x16_12_ref(const tran_low_t *in, uint8_t *out, int stride,
-                      int tx_type) {
+                      int /*tx_type*/) {
   idct16x16_12(in, out, stride);
 }
 
@@ -727,7 +727,7 @@
   virtual void TearDown() { libaom_test::ClearSystemState(); }
 
  protected:
-  void RunFwdTxfm(int16_t *in, tran_low_t *out, int stride) {}
+  void RunFwdTxfm(int16_t * /*in*/, tran_low_t * /*out*/, int /*stride*/) {}
   void RunInvTxfm(tran_low_t *out, uint8_t *dst, int stride) {
     inv_txfm_(out, dst, stride);
   }
diff --git a/test/decode_test_driver.cc b/test/decode_test_driver.cc
index 95a0eb5..7adb9d6 100644
--- a/test/decode_test_driver.cc
+++ b/test/decode_test_driver.cc
@@ -92,7 +92,7 @@
 
       aom_codec_err_t res_dec =
           decoder->DecodeFrame(video->cxdata(), video->frame_size());
-      if (!HandleDecodeResult(res_dec, *video, decoder)) break;
+      if (!HandleDecodeResult(res_dec, decoder)) break;
     } else {
       // Signal end of the file to the decoder.
       const aom_codec_err_t res_dec = decoder->DecodeFrame(NULL, 0);
diff --git a/test/decode_test_driver.h b/test/decode_test_driver.h
index aabca40..b8f8d1a 100644
--- a/test/decode_test_driver.h
+++ b/test/decode_test_driver.h
@@ -141,7 +141,6 @@
 
   // Hook to be called to handle decode result. Return true to continue.
   virtual bool HandleDecodeResult(const aom_codec_err_t res_dec,
-                                  const CompressedVideoSource & /*video*/,
                                   Decoder *decoder) {
     EXPECT_EQ(AOM_CODEC_OK, res_dec) << decoder->DecodeError();
     return AOM_CODEC_OK == res_dec;
diff --git a/test/encode_test_driver.cc b/test/encode_test_driver.cc
index c1a0cb7..092e669 100644
--- a/test/encode_test_driver.cc
+++ b/test/encode_test_driver.cc
@@ -275,7 +275,7 @@
               aom_codec_err_t res_dec = decoder->DecodeFrame(
                   (const uint8_t *)pkt->data.frame.buf, pkt->data.frame.sz);
 
-              if (!HandleDecodeResult(res_dec, *video, decoder.get())) break;
+              if (!HandleDecodeResult(res_dec, decoder.get())) break;
 
               has_dxdata = true;
             }
@@ -293,7 +293,7 @@
       // Flush the decoder when there are no more fragments.
       if ((init_flags_ & AOM_CODEC_USE_OUTPUT_PARTITION) && has_dxdata) {
         const aom_codec_err_t res_dec = decoder->DecodeFrame(NULL, 0);
-        if (!HandleDecodeResult(res_dec, *video, decoder.get())) break;
+        if (!HandleDecodeResult(res_dec, decoder.get())) break;
       }
 
       if (has_dxdata && has_cxdata) {
diff --git a/test/encode_test_driver.h b/test/encode_test_driver.h
index 11c387a..45a080e 100644
--- a/test/encode_test_driver.h
+++ b/test/encode_test_driver.h
@@ -228,7 +228,6 @@
 
   // Hook to be called to handle decode result. Return true to continue.
   virtual bool HandleDecodeResult(const aom_codec_err_t res_dec,
-                                  const VideoSource & /*video*/,
                                   Decoder *decoder) {
     EXPECT_EQ(AOM_CODEC_OK, res_dec) << decoder->DecodeError();
     return AOM_CODEC_OK == res_dec;
diff --git a/test/encoder_parms_get_to_decoder.cc b/test/encoder_parms_get_to_decoder.cc
index 640e12f..52d68b1 100644
--- a/test/encoder_parms_get_to_decoder.cc
+++ b/test/encoder_parms_get_to_decoder.cc
@@ -94,7 +94,6 @@
   }
 
   virtual bool HandleDecodeResult(const aom_codec_err_t res_dec,
-                                  const libaom_test::VideoSource & /*video*/,
                                   libaom_test::Decoder *decoder) {
     aom_codec_ctx_t *const av1_decoder = decoder->GetDecoder();
     aom_codec_alg_priv_t *const priv =
diff --git a/test/error_resilience_test.cc b/test/error_resilience_test.cc
index 951c47f..07b6039 100644
--- a/test/error_resilience_test.cc
+++ b/test/error_resilience_test.cc
@@ -55,8 +55,7 @@
     nframes_++;
   }
 
-  virtual void PreEncodeFrameHook(libaom_test::VideoSource *video,
-                                  ::libaom_test::Encoder * /*encoder*/) {
+  virtual void PreEncodeFrameHook(libaom_test::VideoSource *video) {
     frame_flags_ &=
         ~(AOM_EFLAG_NO_UPD_LAST | AOM_EFLAG_NO_UPD_GF | AOM_EFLAG_NO_UPD_ARF);
     if (droppable_nframes_ > 0 &&
diff --git a/test/frame_size_tests.cc b/test/frame_size_tests.cc
index f1fad70..25b8718 100644
--- a/test/frame_size_tests.cc
+++ b/test/frame_size_tests.cc
@@ -28,7 +28,6 @@
   }
 
   virtual bool HandleDecodeResult(const aom_codec_err_t res_dec,
-                                  const libaom_test::VideoSource & /*video*/,
                                   libaom_test::Decoder *decoder) {
     EXPECT_EQ(expected_res_, res_dec) << decoder->DecodeError();
     return !::testing::Test::HasFailure();
diff --git a/test/variance_test.cc b/test/variance_test.cc
index 5ff5090..7848e20 100644
--- a/test/variance_test.cc
+++ b/test/variance_test.cc
@@ -101,8 +101,7 @@
   }
   RoundHighBitDepth(bit_depth, &se, &sse);
   *sse_ptr = static_cast<uint32_t>(sse);
-  return static_cast<uint32_t>(
-      sse - ((static_cast<int64_t>(se) * se) >> (l2w + l2h)));
+  return static_cast<uint32_t>(sse - ((se * se) >> (l2w + l2h)));
 }
 
 /* The subpel reference functions differ from the codec version in one aspect:
@@ -157,8 +156,7 @@
   }
   RoundHighBitDepth(bit_depth, &se, &sse);
   *sse_ptr = static_cast<uint32_t>(sse);
-  return static_cast<uint32_t>(
-      sse - ((static_cast<int64_t>(se) * se) >> (l2w + l2h)));
+  return static_cast<uint32_t>(sse - ((se * se) >> (l2w + l2h)));
 }
 
 static uint32_t subpel_avg_variance_ref(const uint8_t *ref, const uint8_t *src,
@@ -211,8 +209,7 @@
   }
   RoundHighBitDepth(bit_depth, &se, &sse);
   *sse_ptr = static_cast<uint32_t>(sse);
-  return static_cast<uint32_t>(
-      sse - ((static_cast<int64_t>(se) * se) >> (l2w + l2h)));
+  return static_cast<uint32_t>(sse - ((se * se) >> (l2w + l2h)));
 }
 
 ////////////////////////////////////////////////////////////////////////////////
diff --git a/webmenc.cc b/webmenc.cc
index f78f027..e3d209a 100644
--- a/webmenc.cc
+++ b/webmenc.cc
@@ -24,7 +24,6 @@
 
 void write_webm_file_header(struct WebmOutputContext *webm_ctx,
                             const aom_codec_enc_cfg_t *cfg,
-                            const struct aom_rational *fps,
                             stereo_format_t stereo_fmt, unsigned int fourcc,
                             const struct AvxRational *par) {
   mkvmuxer::MkvWriter *const writer = new mkvmuxer::MkvWriter(webm_ctx->stream);
diff --git a/webmenc.h b/webmenc.h
index 90211ff..74387fb 100644
--- a/webmenc.h
+++ b/webmenc.h
@@ -40,7 +40,6 @@
 
 void write_webm_file_header(struct WebmOutputContext *webm_ctx,
                             const aom_codec_enc_cfg_t *cfg,
-                            const struct aom_rational *fps,
                             stereo_format_t stereo_fmt, unsigned int fourcc,
                             const struct AvxRational *par);