Merge "Fix ubsan left shift warnings in warped motion library" into nextgenv2
diff --git a/CMakeLists.txt b/CMakeLists.txt
new file mode 100644
index 0000000..bfaa1f6
--- /dev/null
+++ b/CMakeLists.txt
@@ -0,0 +1,264 @@
+##
+## Copyright (c) 2016, Alliance for Open Media. All rights reserved
+##
+## This source code is subject to the terms of the BSD 2 Clause License and
+## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+## was not distributed with this source code in the LICENSE file, you can
+## obtain it at www.aomedia.org/license/software. If the Alliance for Open
+## Media Patent License 1.0 was not distributed with this source code in the
+## PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+##
+cmake_minimum_required(VERSION 3.2)
+project(AOM C CXX)
+
+set(AOM_ROOT "${CMAKE_CURRENT_SOURCE_DIR}")
+
+set(AOM_SRCS
+    "${AOM_ROOT}/aom/aom.h"
+    "${AOM_ROOT}/aom/aom_codec.h"
+    "${AOM_ROOT}/aom/aom_decoder.h"
+    "${AOM_ROOT}/aom/aom_encoder.h"
+    "${AOM_ROOT}/aom/aom_frame_buffer.h"
+    "${AOM_ROOT}/aom/aom_image.h"
+    "${AOM_ROOT}/aom/aom_integer.h"
+    "${AOM_ROOT}/aom/aomcx.h"
+    "${AOM_ROOT}/aom/aomdx.h"
+    "${AOM_ROOT}/aom/internal/aom_codec_internal.h"
+    "${AOM_ROOT}/aom/src/aom_codec.c"
+    "${AOM_ROOT}/aom/src/aom_decoder.c"
+    "${AOM_ROOT}/aom/src/aom_encoder.c"
+    "${AOM_ROOT}/aom/src/aom_image.c")
+
+set(AOM_DSP_SRCS
+    "${AOM_ROOT}/aom_dsp/aom_convolve.c"
+    "${AOM_ROOT}/aom_dsp/aom_convolve.h"
+    "${AOM_ROOT}/aom_dsp/aom_dsp_common.h"
+    "${AOM_ROOT}/aom_dsp/aom_dsp_rtcd.c"
+    "${AOM_ROOT}/aom_dsp/aom_filter.h"
+    "${AOM_ROOT}/aom_dsp/aom_simd.c"
+    "${AOM_ROOT}/aom_dsp/aom_simd.h"
+    "${AOM_ROOT}/aom_dsp/aom_simd_inline.h"
+    "${AOM_ROOT}/aom_dsp/avg.c"
+    "${AOM_ROOT}/aom_dsp/bitreader.h"
+    "${AOM_ROOT}/aom_dsp/bitreader_buffer.c"
+    "${AOM_ROOT}/aom_dsp/bitreader_buffer.h"
+    "${AOM_ROOT}/aom_dsp/bitwriter.h"
+    "${AOM_ROOT}/aom_dsp/bitwriter_buffer.c"
+    "${AOM_ROOT}/aom_dsp/bitwriter_buffer.h"
+    "${AOM_ROOT}/aom_dsp/blend.h"
+    "${AOM_ROOT}/aom_dsp/blend_a64_hmask.c"
+    "${AOM_ROOT}/aom_dsp/blend_a64_mask.c"
+    "${AOM_ROOT}/aom_dsp/blend_a64_vmask.c"
+    "${AOM_ROOT}/aom_dsp/dkboolreader.c"
+    "${AOM_ROOT}/aom_dsp/dkboolreader.h"
+    "${AOM_ROOT}/aom_dsp/dkboolwriter.c"
+    "${AOM_ROOT}/aom_dsp/dkboolwriter.h"
+    "${AOM_ROOT}/aom_dsp/fwd_txfm.c"
+    "${AOM_ROOT}/aom_dsp/fwd_txfm.h"
+    "${AOM_ROOT}/aom_dsp/intrapred.c"
+    "${AOM_ROOT}/aom_dsp/inv_txfm.c"
+    "${AOM_ROOT}/aom_dsp/inv_txfm.h"
+    "${AOM_ROOT}/aom_dsp/loopfilter.c"
+    "${AOM_ROOT}/aom_dsp/prob.c"
+    "${AOM_ROOT}/aom_dsp/prob.h"
+    "${AOM_ROOT}/aom_dsp/psnr.c"
+    "${AOM_ROOT}/aom_dsp/psnr.h"
+    "${AOM_ROOT}/aom_dsp/quantize.c"
+    "${AOM_ROOT}/aom_dsp/quantize.h"
+    "${AOM_ROOT}/aom_dsp/sad.c"
+    "${AOM_ROOT}/aom_dsp/simd/v128_intrinsics.h"
+    "${AOM_ROOT}/aom_dsp/simd/v128_intrinsics_c.h"
+    "${AOM_ROOT}/aom_dsp/simd/v256_intrinsics.h"
+    "${AOM_ROOT}/aom_dsp/simd/v256_intrinsics_c.h"
+    "${AOM_ROOT}/aom_dsp/simd/v64_intrinsics.h"
+    "${AOM_ROOT}/aom_dsp/simd/v64_intrinsics_c.h"
+    "${AOM_ROOT}/aom_dsp/subtract.c"
+    "${AOM_ROOT}/aom_dsp/txfm_common.h"
+    "${AOM_ROOT}/aom_dsp/variance.c"
+    "${AOM_ROOT}/aom_dsp/variance.h")
+
+set(AOM_MEM_SRCS
+    "${AOM_ROOT}/aom_mem/aom_mem.c"
+    "${AOM_ROOT}/aom_mem/aom_mem.h"
+    "${AOM_ROOT}/aom_mem/include/aom_mem_intrnl.h")
+
+set(AOM_SCALE_SRCS
+    "${AOM_ROOT}/aom_scale/aom_scale.h"
+    "${AOM_ROOT}/aom_scale/aom_scale_rtcd.c"
+    "${AOM_ROOT}/aom_scale/generic/aom_scale.c"
+    "${AOM_ROOT}/aom_scale/generic/gen_scalers.c"
+    "${AOM_ROOT}/aom_scale/generic/yv12config.c"
+    "${AOM_ROOT}/aom_scale/generic/yv12extend.c"
+    "${AOM_ROOT}/aom_scale/yv12config.h")
+
+# TODO(tomfinegan): Extract aom_ports from aom_util if possible.
+set(AOM_UTIL_SRCS
+    "${AOM_ROOT}/aom_ports/aom_once.h"
+    "${AOM_ROOT}/aom_ports/aom_timer.h"
+    "${AOM_ROOT}/aom_ports/bitops.h"
+    "${AOM_ROOT}/aom_ports/emmintrin_compat.h"
+    "${AOM_ROOT}/aom_ports/mem.h"
+    "${AOM_ROOT}/aom_ports/mem_ops.h"
+    "${AOM_ROOT}/aom_ports/mem_ops_aligned.h"
+    "${AOM_ROOT}/aom_ports/msvc.h"
+    "${AOM_ROOT}/aom_ports/system_state.h"
+    "${AOM_ROOT}/aom_util/aom_thread.c"
+    "${AOM_ROOT}/aom_util/aom_thread.h"
+    "${AOM_ROOT}/aom_util/endian_inl.h")
+
+set(AOM_AV1_COMMON_SRCS
+    "${AOM_ROOT}/av1/av1_iface_common.h"
+    "${AOM_ROOT}/av1/common/alloccommon.c"
+    "${AOM_ROOT}/av1/common/alloccommon.h"
+    "${AOM_ROOT}/av1/common/av1_fwd_txfm.c"
+    "${AOM_ROOT}/av1/common/av1_fwd_txfm.h"
+    "${AOM_ROOT}/av1/common/av1_inv_txfm.c"
+    "${AOM_ROOT}/av1/common/av1_inv_txfm.h"
+    "${AOM_ROOT}/av1/common/av1_rtcd.c"
+    "${AOM_ROOT}/av1/common/blockd.c"
+    "${AOM_ROOT}/av1/common/blockd.h"
+    "${AOM_ROOT}/av1/common/common.h"
+    "${AOM_ROOT}/av1/common/common_data.h"
+    "${AOM_ROOT}/av1/common/convolve.c"
+    "${AOM_ROOT}/av1/common/convolve.h"
+    "${AOM_ROOT}/av1/common/debugmodes.c"
+    "${AOM_ROOT}/av1/common/entropy.c"
+    "${AOM_ROOT}/av1/common/entropy.h"
+    "${AOM_ROOT}/av1/common/entropymode.c"
+    "${AOM_ROOT}/av1/common/entropymode.h"
+    "${AOM_ROOT}/av1/common/entropymv.c"
+    "${AOM_ROOT}/av1/common/entropymv.h"
+    "${AOM_ROOT}/av1/common/enums.h"
+    "${AOM_ROOT}/av1/common/filter.c"
+    "${AOM_ROOT}/av1/common/filter.h"
+    "${AOM_ROOT}/av1/common/frame_buffers.c"
+    "${AOM_ROOT}/av1/common/frame_buffers.h"
+    "${AOM_ROOT}/av1/common/idct.c"
+    "${AOM_ROOT}/av1/common/idct.h"
+    "${AOM_ROOT}/av1/common/loopfilter.c"
+    "${AOM_ROOT}/av1/common/loopfilter.h"
+    "${AOM_ROOT}/av1/common/mv.h"
+    "${AOM_ROOT}/av1/common/mvref_common.c"
+    "${AOM_ROOT}/av1/common/mvref_common.h"
+    "${AOM_ROOT}/av1/common/odintrin.c"
+    "${AOM_ROOT}/av1/common/odintrin.h"
+    "${AOM_ROOT}/av1/common/onyxc_int.h"
+    "${AOM_ROOT}/av1/common/pred_common.c"
+    "${AOM_ROOT}/av1/common/pred_common.h"
+    "${AOM_ROOT}/av1/common/quant_common.c"
+    "${AOM_ROOT}/av1/common/quant_common.h"
+    "${AOM_ROOT}/av1/common/reconinter.c"
+    "${AOM_ROOT}/av1/common/reconinter.h"
+    "${AOM_ROOT}/av1/common/reconintra.c"
+    "${AOM_ROOT}/av1/common/reconintra.h"
+    "${AOM_ROOT}/av1/common/scale.c"
+    "${AOM_ROOT}/av1/common/scale.h"
+    "${AOM_ROOT}/av1/common/scan.c"
+    "${AOM_ROOT}/av1/common/scan.h"
+    "${AOM_ROOT}/av1/common/seg_common.c"
+    "${AOM_ROOT}/av1/common/seg_common.h"
+    "${AOM_ROOT}/av1/common/thread_common.c"
+    "${AOM_ROOT}/av1/common/thread_common.h"
+    "${AOM_ROOT}/av1/common/tile_common.c"
+    "${AOM_ROOT}/av1/common/tile_common.h")
+
+set(AOM_AV1_DECODER_SRCS
+    "${AOM_ROOT}/av1/av1_dx_iface.c"
+    "${AOM_ROOT}/av1/decoder/decodeframe.c"
+    "${AOM_ROOT}/av1/decoder/decodeframe.h"
+    "${AOM_ROOT}/av1/decoder/decodemv.c"
+    "${AOM_ROOT}/av1/decoder/decodemv.h"
+    "${AOM_ROOT}/av1/decoder/decoder.c"
+    "${AOM_ROOT}/av1/decoder/decoder.h"
+    "${AOM_ROOT}/av1/decoder/detokenize.c"
+    "${AOM_ROOT}/av1/decoder/detokenize.h"
+    "${AOM_ROOT}/av1/decoder/dsubexp.c"
+    "${AOM_ROOT}/av1/decoder/dsubexp.h"
+    "${AOM_ROOT}/av1/decoder/dthread.c"
+    "${AOM_ROOT}/av1/decoder/dthread.h")
+
+set(AOM_AV1_ENCODER_SRCS
+    "${AOM_ROOT}/av1/av1_cx_iface.c"
+    "${AOM_ROOT}/av1/encoder/aq_complexity.c"
+    "${AOM_ROOT}/av1/encoder/aq_complexity.h"
+    "${AOM_ROOT}/av1/encoder/aq_cyclicrefresh.c"
+    "${AOM_ROOT}/av1/encoder/aq_cyclicrefresh.h"
+    "${AOM_ROOT}/av1/encoder/aq_variance.c"
+    "${AOM_ROOT}/av1/encoder/aq_variance.h"
+    "${AOM_ROOT}/av1/encoder/bitstream.c"
+    "${AOM_ROOT}/av1/encoder/bitstream.h"
+    "${AOM_ROOT}/av1/encoder/block.h"
+    "${AOM_ROOT}/av1/encoder/context_tree.c"
+    "${AOM_ROOT}/av1/encoder/context_tree.h"
+    "${AOM_ROOT}/av1/encoder/cost.c"
+    "${AOM_ROOT}/av1/encoder/cost.h"
+    "${AOM_ROOT}/av1/encoder/dct.c"
+    "${AOM_ROOT}/av1/encoder/encodeframe.c"
+    "${AOM_ROOT}/av1/encoder/encodeframe.h"
+    "${AOM_ROOT}/av1/encoder/encodemb.c"
+    "${AOM_ROOT}/av1/encoder/encodemb.h"
+    "${AOM_ROOT}/av1/encoder/encodemv.c"
+    "${AOM_ROOT}/av1/encoder/encodemv.h"
+    "${AOM_ROOT}/av1/encoder/encoder.c"
+    "${AOM_ROOT}/av1/encoder/encoder.h"
+    "${AOM_ROOT}/av1/encoder/ethread.c"
+    "${AOM_ROOT}/av1/encoder/ethread.h"
+    "${AOM_ROOT}/av1/encoder/extend.c"
+    "${AOM_ROOT}/av1/encoder/extend.h"
+    "${AOM_ROOT}/av1/encoder/firstpass.c"
+    "${AOM_ROOT}/av1/encoder/firstpass.h"
+    "${AOM_ROOT}/av1/encoder/hybrid_fwd_txfm.c"
+    "${AOM_ROOT}/av1/encoder/hybrid_fwd_txfm.h"
+    "${AOM_ROOT}/av1/encoder/lookahead.c"
+    "${AOM_ROOT}/av1/encoder/lookahead.h"
+    "${AOM_ROOT}/av1/encoder/mbgraph.c"
+    "${AOM_ROOT}/av1/encoder/mbgraph.h"
+    "${AOM_ROOT}/av1/encoder/mcomp.c"
+    "${AOM_ROOT}/av1/encoder/mcomp.h"
+    "${AOM_ROOT}/av1/encoder/picklpf.c"
+    "${AOM_ROOT}/av1/encoder/picklpf.h"
+    "${AOM_ROOT}/av1/encoder/quantize.c"
+    "${AOM_ROOT}/av1/encoder/quantize.h"
+    "${AOM_ROOT}/av1/encoder/ratectrl.c"
+    "${AOM_ROOT}/av1/encoder/ratectrl.h"
+    "${AOM_ROOT}/av1/encoder/rd.c"
+    "${AOM_ROOT}/av1/encoder/rd.h"
+    "${AOM_ROOT}/av1/encoder/rdopt.c"
+    "${AOM_ROOT}/av1/encoder/rdopt.h"
+    "${AOM_ROOT}/av1/encoder/resize.c"
+    "${AOM_ROOT}/av1/encoder/resize.h"
+    "${AOM_ROOT}/av1/encoder/segmentation.c"
+    "${AOM_ROOT}/av1/encoder/segmentation.h"
+    "${AOM_ROOT}/av1/encoder/speed_features.c"
+    "${AOM_ROOT}/av1/encoder/speed_features.h"
+    "${AOM_ROOT}/av1/encoder/subexp.c"
+    "${AOM_ROOT}/av1/encoder/subexp.h"
+    "${AOM_ROOT}/av1/encoder/temporal_filter.c"
+    "${AOM_ROOT}/av1/encoder/temporal_filter.h"
+    "${AOM_ROOT}/av1/encoder/tokenize.c"
+    "${AOM_ROOT}/av1/encoder/tokenize.h"
+    "${AOM_ROOT}/av1/encoder/treewriter.c"
+    "${AOM_ROOT}/av1/encoder/treewriter.h")
+
+# Targets
+add_library(aom_dsp ${AOM_DSP_SRCS})
+add_library(aom_mem ${AOM_MEM_SRCS})
+add_library(aom_scale ${AOM_SCALE_SRCS})
+add_library(aom_util ${AOM_UTIL_SRCS})
+add_library(aom_av1_decoder ${AOM_AV1_DECODER_SRCS})
+add_library(aom_av1_encoder ${AOM_AV1_ENCODER_SRCS})
+add_library(aom ${AOM_SRCS})
+target_link_libraries(aom LINK_PUBLIC
+                      aom_dsp
+                      aom_mem
+                      aom_scale
+                      aom_util
+                      aom_av1_decoder
+                      aom_av1_encoder)
+include_directories(${AOM_ROOT})
+add_executable(simple_decoder examples/simple_decoder.c)
+target_link_libraries(simple_decoder LINK_PUBLIC aom)
+add_executable(simple_encoder examples/simple_encoder.c)
+target_link_libraries(simple_encoder LINK_PUBLIC aom)
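+# Illustrative out-of-tree usage (an assumption, not enforced by this file):
+#   mkdir aom-build && cd aom-build && cmake path/to/aom && cmake --build .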
+
diff --git a/aom_dsp/ansreader.h b/aom_dsp/ansreader.h
index c46778b..0e9a671 100644
--- a/aom_dsp/ansreader.h
+++ b/aom_dsp/ansreader.h
@@ -20,6 +20,9 @@
 #include "aom_dsp/prob.h"
 #include "aom_dsp/ans.h"
 #include "aom_ports/mem_ops.h"
+#if CONFIG_ACCOUNTING
+#include "av1/common/accounting.h"
+#endif
 
 #ifdef __cplusplus
 extern "C" {
@@ -29,6 +32,9 @@
   const uint8_t *buf;
   int buf_offset;
   uint32_t state;
+#if CONFIG_ACCOUNTING
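+  // Optional bit-accounting hook; reader init resets this to NULL and the
+  // decoder may attach a live Accounting instance afterwards.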
+  Accounting *accounting;
+#endif
 };
 
 static INLINE int uabs_read(struct AnsDecoder *ans, AnsP8 p0) {
@@ -119,6 +125,9 @@
     // 110xxxxx implies this byte is a superframe marker
     return 1;
   }
+#if CONFIG_ACCOUNTING
+  ans->accounting = NULL;
+#endif
   ans->state += L_BASE;
   if (ans->state >= L_BASE * IO_BASE) return 1;
   return 0;
diff --git a/aom_dsp/aom_dsp.mk b/aom_dsp/aom_dsp.mk
index 4735199..eebdc0c 100644
--- a/aom_dsp/aom_dsp.mk
+++ b/aom_dsp/aom_dsp.mk
@@ -191,6 +191,7 @@
 endif  # CONFIG_AOM_HIGHBITDEPTH
 
 DSP_SRCS-yes            += txfm_common.h
+DSP_SRCS-yes            += x86/txfm_common_intrin.h
 DSP_SRCS-$(HAVE_SSE2)   += x86/txfm_common_sse2.h
 DSP_SRCS-$(HAVE_MSA)    += mips/txfm_macros_msa.h
 # forward transform
diff --git a/aom_dsp/aom_dsp_rtcd_defs.pl b/aom_dsp/aom_dsp_rtcd_defs.pl
index 6d873bc..b073b1b 100644
--- a/aom_dsp/aom_dsp_rtcd_defs.pl
+++ b/aom_dsp/aom_dsp_rtcd_defs.pl
@@ -701,6 +701,34 @@
 #
 if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
   if (aom_config("CONFIG_AOM_HIGHBITDEPTH") eq "yes") {
+    add_proto qw/void aom_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride";
+    specialize qw/aom_fdct4x4 sse2/;
+
+    add_proto qw/void aom_fdct4x4_1/, "const int16_t *input, tran_low_t *output, int stride";
+    specialize qw/aom_fdct4x4_1 sse2/;
+
+    add_proto qw/void aom_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride";
+    specialize qw/aom_fdct8x8 sse2/, "$ssse3_x86_64";
+
+    add_proto qw/void aom_fdct8x8_1/, "const int16_t *input, tran_low_t *output, int stride";
+    specialize qw/aom_fdct8x8_1 sse2/;
+
+    add_proto qw/void aom_fdct16x16/, "const int16_t *input, tran_low_t *output, int stride";
+    specialize qw/aom_fdct16x16 sse2/;
+
+    add_proto qw/void aom_fdct16x16_1/, "const int16_t *input, tran_low_t *output, int stride";
+    specialize qw/aom_fdct16x16_1 sse2 avx2/;
+
+    add_proto qw/void aom_fdct32x32/, "const int16_t *input, tran_low_t *output, int stride";
+    specialize qw/aom_fdct32x32 sse2 avx2/;
+
+    add_proto qw/void aom_fdct32x32_rd/, "const int16_t *input, tran_low_t *output, int stride";
+    specialize qw/aom_fdct32x32_rd sse2 avx2/;
+
+    add_proto qw/void aom_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride";
+    specialize qw/aom_fdct32x32_1 sse2 avx2/;
+
+    # High bit depth
     add_proto qw/void aom_highbd_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride";
     specialize qw/aom_highbd_fdct4x4 sse2/;
 
@@ -724,33 +752,34 @@
 
     add_proto qw/void aom_highbd_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride";
     specialize qw/aom_highbd_fdct32x32_1/;
-  }   # CONFIG_AOM_HIGHBITDEPTH
-  add_proto qw/void aom_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride";
-  specialize qw/aom_fdct4x4 sse2 msa/;
+  } else {
+    add_proto qw/void aom_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride";
+    specialize qw/aom_fdct4x4 sse2 msa/;
 
-  add_proto qw/void aom_fdct4x4_1/, "const int16_t *input, tran_low_t *output, int stride";
-  specialize qw/aom_fdct4x4_1 sse2/;
+    add_proto qw/void aom_fdct4x4_1/, "const int16_t *input, tran_low_t *output, int stride";
+    specialize qw/aom_fdct4x4_1 sse2/;
 
-  add_proto qw/void aom_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride";
-  specialize qw/aom_fdct8x8 sse2 neon msa/, "$ssse3_x86_64";
+    add_proto qw/void aom_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride";
+    specialize qw/aom_fdct8x8 sse2 neon msa/, "$ssse3_x86_64";
 
-  add_proto qw/void aom_fdct8x8_1/, "const int16_t *input, tran_low_t *output, int stride";
-  specialize qw/aom_fdct8x8_1 sse2 neon msa/;
+    add_proto qw/void aom_fdct8x8_1/, "const int16_t *input, tran_low_t *output, int stride";
+    specialize qw/aom_fdct8x8_1 sse2 neon msa/;
 
-  add_proto qw/void aom_fdct16x16/, "const int16_t *input, tran_low_t *output, int stride";
-  specialize qw/aom_fdct16x16 sse2 msa/;
+    add_proto qw/void aom_fdct16x16/, "const int16_t *input, tran_low_t *output, int stride";
+    specialize qw/aom_fdct16x16 sse2 msa/;
 
-  add_proto qw/void aom_fdct16x16_1/, "const int16_t *input, tran_low_t *output, int stride";
-  specialize qw/aom_fdct16x16_1 sse2 avx2 msa/;
+    add_proto qw/void aom_fdct16x16_1/, "const int16_t *input, tran_low_t *output, int stride";
+    specialize qw/aom_fdct16x16_1 sse2 avx2 msa/;
 
-  add_proto qw/void aom_fdct32x32/, "const int16_t *input, tran_low_t *output, int stride";
-  specialize qw/aom_fdct32x32 sse2 avx2 msa/;
+    add_proto qw/void aom_fdct32x32/, "const int16_t *input, tran_low_t *output, int stride";
+    specialize qw/aom_fdct32x32 sse2 avx2 msa/;
 
-  add_proto qw/void aom_fdct32x32_rd/, "const int16_t *input, tran_low_t *output, int stride";
-  specialize qw/aom_fdct32x32_rd sse2 avx2 msa/;
+    add_proto qw/void aom_fdct32x32_rd/, "const int16_t *input, tran_low_t *output, int stride";
+    specialize qw/aom_fdct32x32_rd sse2 avx2 msa/;
 
-  add_proto qw/void aom_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride";
-  specialize qw/aom_fdct32x32_1 sse2 avx2 msa/;
+    add_proto qw/void aom_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride";
+    specialize qw/aom_fdct32x32_1 sse2 avx2 msa/;
+  }  # CONFIG_AOM_HIGHBITDEPTH
 }  # CONFIG_AV1_ENCODER
 
 #
diff --git a/aom_dsp/bitreader.h b/aom_dsp/bitreader.h
index ef2e5e9..478945b 100644
--- a/aom_dsp/bitreader.h
+++ b/aom_dsp/bitreader.h
@@ -16,6 +16,10 @@
 #include <limits.h>
 
 #include "./aom_config.h"
+#if CONFIG_EC_ADAPT && !CONFIG_EC_MULTISYMBOL
+#error "CONFIG_EC_ADAPT is enabled without enabling CONFIG_EC_MULTISYMBOL."
+#endif
+
 #include "aom/aomdx.h"
 #include "aom/aom_integer.h"
 #if CONFIG_ANS
@@ -203,7 +207,8 @@
   return ret;
 }
 
-static INLINE int aom_read_symbol_(aom_reader *r, const aom_cdf_prob *cdf,
+#if CONFIG_EC_MULTISYMBOL
+static INLINE int aom_read_symbol_(aom_reader *r, aom_cdf_prob *cdf,
                                    int nsymbs ACCT_STR_PARAM) {
   int ret;
 #if CONFIG_RANS
@@ -212,17 +217,21 @@
 #elif CONFIG_DAALA_EC
   ret = daala_read_symbol(r, cdf, nsymbs);
 #else
-  (void)r;
-  (void)cdf;
-  (void)nsymbs;
-  assert(0 && "Unsupported bitreader operation");
-  ret = -1;
+#error \
+    "CONFIG_EC_MULTISYMBOL is selected without a valid backing entropy " \
+  "coder. Enable daala_ec or ans for a valid configuration."
 #endif
+
+#if CONFIG_EC_ADAPT
+  update_cdf(cdf, ret, nsymbs);
+#endif
+
 #if CONFIG_ACCOUNTING
   if (ACCT_STR_NAME) aom_process_accounting(r, ACCT_STR_NAME);
 #endif
   return ret;
 }
+#endif  // CONFIG_EC_MULTISYMBOL
 
 #ifdef __cplusplus
 }  // extern "C"
diff --git a/aom_dsp/bitwriter.h b/aom_dsp/bitwriter.h
index 841a171..ef529fc 100644
--- a/aom_dsp/bitwriter.h
+++ b/aom_dsp/bitwriter.h
@@ -14,6 +14,10 @@
 
 #include <assert.h>
 #include "./aom_config.h"
+#if CONFIG_EC_ADAPT && !CONFIG_EC_MULTISYMBOL
+#error "CONFIG_EC_ADAPT is enabled without enabling CONFIG_EC_MULTISYMBOL"
+#endif
+
 #if CONFIG_ANS
 #include "aom_dsp/buf_ans.h"
 #elif CONFIG_DAALA_EC
@@ -98,8 +102,9 @@
 #endif
 }
 
-static INLINE void aom_write_symbol(aom_writer *w, int symb,
-                                    const aom_cdf_prob *cdf, int nsymbs) {
+#if CONFIG_EC_MULTISYMBOL
+static INLINE void aom_write_symbol(aom_writer *w, int symb, aom_cdf_prob *cdf,
+                                    int nsymbs) {
 #if CONFIG_RANS
   struct rans_sym s;
   (void)nsymbs;
@@ -110,13 +115,16 @@
 #elif CONFIG_DAALA_EC
   daala_write_symbol(w, symb, cdf, nsymbs);
 #else
-  (void)w;
-  (void)symb;
-  (void)cdf;
-  (void)nsymbs;
-  assert(0 && "Unsupported bitwriter operation");
+#error \
+    "CONFIG_EC_MULTISYMBOL is selected without a valid backing entropy " \
+  "coder. Enable daala_ec or ans for a valid configuration."
+#endif
+
+#if CONFIG_EC_ADAPT
+  update_cdf(cdf, symb, nsymbs);
 #endif
 }
+#endif  // CONFIG_EC_MULTISYMBOL
 
 #ifdef __cplusplus
 }  // extern "C"
diff --git a/aom_dsp/inv_txfm.c b/aom_dsp/inv_txfm.c
index e0dda12..4bb656b 100644
--- a/aom_dsp/inv_txfm.c
+++ b/aom_dsp/inv_txfm.c
@@ -93,7 +93,7 @@
   }
 }
 
-void idct4_c(const tran_low_t *input, tran_low_t *output) {
+void aom_idct4_c(const tran_low_t *input, tran_low_t *output) {
   tran_low_t step[4];
   tran_high_t temp1, temp2;
   // stage 1
@@ -121,7 +121,7 @@
 
   // Rows
   for (i = 0; i < 4; ++i) {
-    idct4_c(input, outptr);
+    aom_idct4_c(input, outptr);
     input += 4;
     outptr += 4;
   }
@@ -129,7 +129,7 @@
   // Columns
   for (i = 0; i < 4; ++i) {
     for (j = 0; j < 4; ++j) temp_in[j] = out[j * 4 + i];
-    idct4_c(temp_in, temp_out);
+    aom_idct4_c(temp_in, temp_out);
     for (j = 0; j < 4; ++j) {
       dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
                                             ROUND_POWER_OF_TWO(temp_out[j], 4));
@@ -154,7 +154,7 @@
   }
 }
 
-void idct8_c(const tran_low_t *input, tran_low_t *output) {
+void aom_idct8_c(const tran_low_t *input, tran_low_t *output) {
   tran_low_t step1[8], step2[8];
   tran_high_t temp1, temp2;
   // stage 1
@@ -216,7 +216,7 @@
 
   // First transform rows
   for (i = 0; i < 8; ++i) {
-    idct8_c(input, outptr);
+    aom_idct8_c(input, outptr);
     input += 8;
     outptr += 8;
   }
@@ -224,7 +224,7 @@
   // Then transform columns
   for (i = 0; i < 8; ++i) {
     for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i];
-    idct8_c(temp_in, temp_out);
+    aom_idct8_c(temp_in, temp_out);
     for (j = 0; j < 8; ++j) {
       dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
                                             ROUND_POWER_OF_TWO(temp_out[j], 5));
@@ -244,7 +244,7 @@
   }
 }
 
-void iadst4_c(const tran_low_t *input, tran_low_t *output) {
+void aom_iadst4_c(const tran_low_t *input, tran_low_t *output) {
   tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;
 
   tran_low_t x0 = input[0];
@@ -281,7 +281,7 @@
   output[3] = WRAPLOW(dct_const_round_shift(s0 + s1 - s3));
 }
 
-void iadst8_c(const tran_low_t *input, tran_low_t *output) {
+void aom_iadst8_c(const tran_low_t *input, tran_low_t *output) {
   int s0, s1, s2, s3, s4, s5, s6, s7;
 
   tran_high_t x0 = input[7];
@@ -367,7 +367,7 @@
   // First transform rows
   // only first 4 row has non-zero coefs
   for (i = 0; i < 4; ++i) {
-    idct8_c(input, outptr);
+    aom_idct8_c(input, outptr);
     input += 8;
     outptr += 8;
   }
@@ -375,7 +375,7 @@
   // Then transform columns
   for (i = 0; i < 8; ++i) {
     for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i];
-    idct8_c(temp_in, temp_out);
+    aom_idct8_c(temp_in, temp_out);
     for (j = 0; j < 8; ++j) {
       dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
                                             ROUND_POWER_OF_TWO(temp_out[j], 5));
@@ -383,7 +383,7 @@
   }
 }
 
-void idct16_c(const tran_low_t *input, tran_low_t *output) {
+void aom_idct16_c(const tran_low_t *input, tran_low_t *output) {
   tran_low_t step1[16], step2[16];
   tran_high_t temp1, temp2;
 
@@ -557,7 +557,7 @@
 
   // First transform rows
   for (i = 0; i < 16; ++i) {
-    idct16_c(input, outptr);
+    aom_idct16_c(input, outptr);
     input += 16;
     outptr += 16;
   }
@@ -565,7 +565,7 @@
   // Then transform columns
   for (i = 0; i < 16; ++i) {
     for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i];
-    idct16_c(temp_in, temp_out);
+    aom_idct16_c(temp_in, temp_out);
     for (j = 0; j < 16; ++j) {
       dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
                                             ROUND_POWER_OF_TWO(temp_out[j], 6));
@@ -573,7 +573,7 @@
   }
 }
 
-void iadst16_c(const tran_low_t *input, tran_low_t *output) {
+void aom_iadst16_c(const tran_low_t *input, tran_low_t *output) {
   tran_high_t s0, s1, s2, s3, s4, s5, s6, s7, s8;
   tran_high_t s9, s10, s11, s12, s13, s14, s15;
 
@@ -754,7 +754,7 @@
   // First transform rows. Since all non-zero dct coefficients are in
   // upper-left 4x4 area, we only need to calculate first 4 rows here.
   for (i = 0; i < 4; ++i) {
-    idct16_c(input, outptr);
+    aom_idct16_c(input, outptr);
     input += 16;
     outptr += 16;
   }
@@ -762,7 +762,7 @@
   // Then transform columns
   for (i = 0; i < 16; ++i) {
     for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i];
-    idct16_c(temp_in, temp_out);
+    aom_idct16_c(temp_in, temp_out);
     for (j = 0; j < 16; ++j) {
       dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
                                             ROUND_POWER_OF_TWO(temp_out[j], 6));
@@ -782,7 +782,7 @@
   }
 }
 
-void idct32_c(const tran_low_t *input, tran_low_t *output) {
+void aom_idct32_c(const tran_low_t *input, tran_low_t *output) {
   tran_low_t step1[32], step2[32];
   tran_high_t temp1, temp2;
 
@@ -1168,7 +1168,7 @@
       zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
 
     if (zero_coeff[0] | zero_coeff[1])
-      idct32_c(input, outptr);
+      aom_idct32_c(input, outptr);
     else
       memset(outptr, 0, sizeof(tran_low_t) * 32);
     input += 32;
@@ -1178,7 +1178,7 @@
   // Columns
   for (i = 0; i < 32; ++i) {
     for (j = 0; j < 32; ++j) temp_in[j] = out[j * 32 + i];
-    idct32_c(temp_in, temp_out);
+    aom_idct32_c(temp_in, temp_out);
     for (j = 0; j < 32; ++j) {
       dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
                                             ROUND_POWER_OF_TWO(temp_out[j], 6));
@@ -1196,7 +1196,7 @@
   // Rows
   // only upper-left 16x16 has non-zero coeff
   for (i = 0; i < 16; ++i) {
-    idct32_c(input, outptr);
+    aom_idct32_c(input, outptr);
     input += 32;
     outptr += 32;
   }
@@ -1204,7 +1204,7 @@
   // Columns
   for (i = 0; i < 32; ++i) {
     for (j = 0; j < 32; ++j) temp_in[j] = out[j * 32 + i];
-    idct32_c(temp_in, temp_out);
+    aom_idct32_c(temp_in, temp_out);
     for (j = 0; j < 32; ++j) {
       dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
                                             ROUND_POWER_OF_TWO(temp_out[j], 6));
@@ -1222,7 +1222,7 @@
   // Rows
   // only upper-left 8x8 has non-zero coeff
   for (i = 0; i < 8; ++i) {
-    idct32_c(input, outptr);
+    aom_idct32_c(input, outptr);
     input += 32;
     outptr += 32;
   }
@@ -1230,7 +1230,7 @@
   // Columns
   for (i = 0; i < 32; ++i) {
     for (j = 0; j < 32; ++j) temp_in[j] = out[j * 32 + i];
-    idct32_c(temp_in, temp_out);
+    aom_idct32_c(temp_in, temp_out);
     for (j = 0; j < 32; ++j) {
       dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
                                             ROUND_POWER_OF_TWO(temp_out[j], 6));
diff --git a/aom_dsp/inv_txfm.h b/aom_dsp/inv_txfm.h
index 0f84e38..c3d794e 100644
--- a/aom_dsp/inv_txfm.h
+++ b/aom_dsp/inv_txfm.h
@@ -97,13 +97,13 @@
 #endif  // CONFIG_AOM_HIGHBITDEPTH
 #endif  // CONFIG_EMULATE_HARDWARE
 
-void idct4_c(const tran_low_t *input, tran_low_t *output);
-void idct8_c(const tran_low_t *input, tran_low_t *output);
-void idct16_c(const tran_low_t *input, tran_low_t *output);
-void idct32_c(const tran_low_t *input, tran_low_t *output);
-void iadst4_c(const tran_low_t *input, tran_low_t *output);
-void iadst8_c(const tran_low_t *input, tran_low_t *output);
-void iadst16_c(const tran_low_t *input, tran_low_t *output);
+void aom_idct4_c(const tran_low_t *input, tran_low_t *output);
+void aom_idct8_c(const tran_low_t *input, tran_low_t *output);
+void aom_idct16_c(const tran_low_t *input, tran_low_t *output);
+void aom_idct32_c(const tran_low_t *input, tran_low_t *output);
+void aom_iadst4_c(const tran_low_t *input, tran_low_t *output);
+void aom_iadst8_c(const tran_low_t *input, tran_low_t *output);
+void aom_iadst16_c(const tran_low_t *input, tran_low_t *output);
 
 #if CONFIG_AOM_HIGHBITDEPTH
 void aom_highbd_idct4_c(const tran_low_t *input, tran_low_t *output, int bd);
diff --git a/aom_dsp/prob.c b/aom_dsp/prob.c
index d3556cb..a98a4bc 100644
--- a/aom_dsp/prob.c
+++ b/aom_dsp/prob.c
@@ -11,7 +11,7 @@
 
 #include "./aom_config.h"
 
-#if CONFIG_DAALA_EC
+#if CONFIG_EC_MULTISYMBOL
 #include <string.h>
 #endif
 
@@ -57,7 +57,7 @@
   tree_merge_probs_impl(0, tree, pre_probs, counts, probs);
 }
 
-#if CONFIG_DAALA_EC
+#if CONFIG_EC_MULTISYMBOL
 typedef struct tree_node tree_node;
 
 struct tree_node {
@@ -86,7 +86,7 @@
   int i;
   uint32_t pa;
   uint32_t pb;
-  for (i = 0; i < OD_MINI(a.len, b.len) && a.probs[i] == b.probs[i]; i++) {
+  for (i = 0; i < AOMMIN(a.len, b.len) && a.probs[i] == b.probs[i]; i++) {
   }
   pa = tree_node_prob(a, i);
   pb = tree_node_prob(b, i);
diff --git a/aom_dsp/prob.h b/aom_dsp/prob.h
index bf9abbf..9384ffe 100644
--- a/aom_dsp/prob.h
+++ b/aom_dsp/prob.h
@@ -15,6 +15,7 @@
 #include "./aom_config.h"
 #include "./aom_dsp_common.h"
 
+#include "aom_ports/bitops.h"
 #include "aom_ports/mem.h"
 
 #ifdef __cplusplus
@@ -32,7 +33,7 @@
 
 typedef int8_t aom_tree_index;
 
-#define TREE_SIZE(leaf_count) (2 * (leaf_count)-2)
+#define TREE_SIZE(leaf_count) (-2 + 2 * (leaf_count))
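+// A tree with leaf_count leaves has (leaf_count - 1) internal nodes, each
+// holding two child indices, hence 2 * (leaf_count - 1) array entries.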
 
 #define aom_complement(x) (255 - x)
 
@@ -96,7 +97,7 @@
 void aom_tree_merge_probs(const aom_tree_index *tree, const aom_prob *pre_probs,
                           const unsigned int *counts, aom_prob *probs);
 
-#if CONFIG_DAALA_EC
+#if CONFIG_EC_MULTISYMBOL
 int tree_to_cdf(const aom_tree_index *tree, const aom_prob *probs,
                 aom_tree_index root, aom_cdf_prob *cdf, aom_tree_index *ind,
                 int *pth, int *len);
@@ -134,6 +135,22 @@
 
 DECLARE_ALIGNED(16, extern const uint8_t, aom_norm[256]);
 
+#if CONFIG_EC_ADAPT
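+// Adapts a Q15 CDF (cdf[nsymbs - 1] == 32768) toward an observed symbol:
+// every entry first decays toward a small per-index floor, then the mass
+// shaved off the top is added back to entries at and above `val`, widening
+// the coded symbol's interval while keeping the total at 32768.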
+static INLINE void update_cdf(aom_cdf_prob *cdf, int val, int nsymbs) {
+  const int rate = 4 + get_msb(nsymbs);
+  int i, diff, tmp;
+  for (i = 0; i < nsymbs; ++i) {
+    tmp = (i + 1) << (12 - rate);
+    cdf[i] -= ((cdf[i] - tmp) >> rate);
+  }
+  diff = 32768 - cdf[nsymbs - 1];
+
+  for (i = val; i < nsymbs; ++i) {
+    cdf[i] += diff;
+  }
+}
+#endif
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/aom_dsp/x86/fwd_dct32x32_impl_avx2.h b/aom_dsp/x86/fwd_dct32x32_impl_avx2.h
index 8b136e7..2167395 100644
--- a/aom_dsp/x86/fwd_dct32x32_impl_avx2.h
+++ b/aom_dsp/x86/fwd_dct32x32_impl_avx2.h
@@ -12,6 +12,7 @@
 #include <immintrin.h>  // AVX2
 
 #include "aom_dsp/txfm_common.h"
+#include "aom_dsp/x86/txfm_common_intrin.h"
 #include "aom_dsp/x86/txfm_common_avx2.h"
 
 #if FDCT32x32_HIGH_PRECISION
@@ -31,7 +32,19 @@
 }
 #endif
 
-void FDCT32x32_2D_AVX2(const int16_t *input, int16_t *output_org, int stride) {
+#ifndef STORE_COEFF_FUNC
+#define STORE_COEFF_FUNC
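+// Writes one 256-bit row of coefficients as two 128-bit halves: the low
+// half to *curr and the high half to *next. storeu_output() widens the
+// 16-bit lanes to tran_low_t when CONFIG_AOM_HIGHBITDEPTH is enabled.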
+static void store_coeff(const __m256i *coeff, tran_low_t *curr,
+                        tran_low_t *next) {
+  __m128i u = _mm256_castsi256_si128(*coeff);
+  storeu_output(&u, curr);
+  u = _mm256_extractf128_si256(*coeff, 1);
+  storeu_output(&u, next);
+}
+#endif
+
+void FDCT32x32_2D_AVX2(const int16_t *input, tran_low_t *output_org,
+                       int stride) {
   // Calculate pre-multiplied strides
   const int str1 = stride;
   const int str2 = 2 * stride;
@@ -2842,13 +2855,14 @@
       {
         int transpose_block;
         int16_t *output_currStep, *output_nextStep;
-        if (0 == pass) {
-          output_currStep = &intermediate[column_start * 32];
-          output_nextStep = &intermediate[(column_start + 8) * 32];
-        } else {
-          output_currStep = &output_org[column_start * 32];
-          output_nextStep = &output_org[(column_start + 8) * 32];
-        }
+        tran_low_t *curr_out, *next_out;
+        // Pass 0
+        output_currStep = &intermediate[column_start * 32];
+        output_nextStep = &intermediate[(column_start + 8) * 32];
+        // Pass 1
+        curr_out = &output_org[column_start * 32];
+        next_out = &output_org[(column_start + 8) * 32];
+
         for (transpose_block = 0; transpose_block < 4; ++transpose_block) {
           __m256i *this_out = &out[8 * transpose_block];
           // 00  01  02  03  04  05  06  07  08  09  10  11  12  13  14  15
@@ -2948,44 +2962,58 @@
             tr2_6 = _mm256_srai_epi16(tr2_6, 2);
             tr2_7 = _mm256_srai_epi16(tr2_7, 2);
           }
-          // Note: even though all these stores are aligned, using the aligned
-          //       intrinsic make the code slightly slower.
-          _mm_storeu_si128((__m128i *)(output_currStep + 0 * 32),
-                           _mm256_castsi256_si128(tr2_0));
-          _mm_storeu_si128((__m128i *)(output_currStep + 1 * 32),
-                           _mm256_castsi256_si128(tr2_1));
-          _mm_storeu_si128((__m128i *)(output_currStep + 2 * 32),
-                           _mm256_castsi256_si128(tr2_2));
-          _mm_storeu_si128((__m128i *)(output_currStep + 3 * 32),
-                           _mm256_castsi256_si128(tr2_3));
-          _mm_storeu_si128((__m128i *)(output_currStep + 4 * 32),
-                           _mm256_castsi256_si128(tr2_4));
-          _mm_storeu_si128((__m128i *)(output_currStep + 5 * 32),
-                           _mm256_castsi256_si128(tr2_5));
-          _mm_storeu_si128((__m128i *)(output_currStep + 6 * 32),
-                           _mm256_castsi256_si128(tr2_6));
-          _mm_storeu_si128((__m128i *)(output_currStep + 7 * 32),
-                           _mm256_castsi256_si128(tr2_7));
+          if (0 == pass) {
+            // Note: even though all these stores are aligned, using the aligned
+            //       intrinsic makes the code slightly slower.
+            _mm_storeu_si128((__m128i *)(output_currStep + 0 * 32),
+                             _mm256_castsi256_si128(tr2_0));
+            _mm_storeu_si128((__m128i *)(output_currStep + 1 * 32),
+                             _mm256_castsi256_si128(tr2_1));
+            _mm_storeu_si128((__m128i *)(output_currStep + 2 * 32),
+                             _mm256_castsi256_si128(tr2_2));
+            _mm_storeu_si128((__m128i *)(output_currStep + 3 * 32),
+                             _mm256_castsi256_si128(tr2_3));
+            _mm_storeu_si128((__m128i *)(output_currStep + 4 * 32),
+                             _mm256_castsi256_si128(tr2_4));
+            _mm_storeu_si128((__m128i *)(output_currStep + 5 * 32),
+                             _mm256_castsi256_si128(tr2_5));
+            _mm_storeu_si128((__m128i *)(output_currStep + 6 * 32),
+                             _mm256_castsi256_si128(tr2_6));
+            _mm_storeu_si128((__m128i *)(output_currStep + 7 * 32),
+                             _mm256_castsi256_si128(tr2_7));
 
-          _mm_storeu_si128((__m128i *)(output_nextStep + 0 * 32),
-                           _mm256_extractf128_si256(tr2_0, 1));
-          _mm_storeu_si128((__m128i *)(output_nextStep + 1 * 32),
-                           _mm256_extractf128_si256(tr2_1, 1));
-          _mm_storeu_si128((__m128i *)(output_nextStep + 2 * 32),
-                           _mm256_extractf128_si256(tr2_2, 1));
-          _mm_storeu_si128((__m128i *)(output_nextStep + 3 * 32),
-                           _mm256_extractf128_si256(tr2_3, 1));
-          _mm_storeu_si128((__m128i *)(output_nextStep + 4 * 32),
-                           _mm256_extractf128_si256(tr2_4, 1));
-          _mm_storeu_si128((__m128i *)(output_nextStep + 5 * 32),
-                           _mm256_extractf128_si256(tr2_5, 1));
-          _mm_storeu_si128((__m128i *)(output_nextStep + 6 * 32),
-                           _mm256_extractf128_si256(tr2_6, 1));
-          _mm_storeu_si128((__m128i *)(output_nextStep + 7 * 32),
-                           _mm256_extractf128_si256(tr2_7, 1));
-          // Process next 8x8
-          output_currStep += 8;
-          output_nextStep += 8;
+            _mm_storeu_si128((__m128i *)(output_nextStep + 0 * 32),
+                             _mm256_extractf128_si256(tr2_0, 1));
+            _mm_storeu_si128((__m128i *)(output_nextStep + 1 * 32),
+                             _mm256_extractf128_si256(tr2_1, 1));
+            _mm_storeu_si128((__m128i *)(output_nextStep + 2 * 32),
+                             _mm256_extractf128_si256(tr2_2, 1));
+            _mm_storeu_si128((__m128i *)(output_nextStep + 3 * 32),
+                             _mm256_extractf128_si256(tr2_3, 1));
+            _mm_storeu_si128((__m128i *)(output_nextStep + 4 * 32),
+                             _mm256_extractf128_si256(tr2_4, 1));
+            _mm_storeu_si128((__m128i *)(output_nextStep + 5 * 32),
+                             _mm256_extractf128_si256(tr2_5, 1));
+            _mm_storeu_si128((__m128i *)(output_nextStep + 6 * 32),
+                             _mm256_extractf128_si256(tr2_6, 1));
+            _mm_storeu_si128((__m128i *)(output_nextStep + 7 * 32),
+                             _mm256_extractf128_si256(tr2_7, 1));
+            // Process next 8x8
+            output_currStep += 8;
+            output_nextStep += 8;
+          }
+          if (1 == pass) {
+            store_coeff(&tr2_0, curr_out + 0 * 32, next_out + 0 * 32);
+            store_coeff(&tr2_1, curr_out + 1 * 32, next_out + 1 * 32);
+            store_coeff(&tr2_2, curr_out + 2 * 32, next_out + 2 * 32);
+            store_coeff(&tr2_3, curr_out + 3 * 32, next_out + 3 * 32);
+            store_coeff(&tr2_4, curr_out + 4 * 32, next_out + 4 * 32);
+            store_coeff(&tr2_5, curr_out + 5 * 32, next_out + 5 * 32);
+            store_coeff(&tr2_6, curr_out + 6 * 32, next_out + 6 * 32);
+            store_coeff(&tr2_7, curr_out + 7 * 32, next_out + 7 * 32);
+            curr_out += 8;
+            next_out += 8;
+          }
         }
       }
     }
diff --git a/aom_dsp/x86/fwd_txfm_avx2.c b/aom_dsp/x86/fwd_txfm_avx2.c
index d381a6e..670f864 100644
--- a/aom_dsp/x86/fwd_txfm_avx2.c
+++ b/aom_dsp/x86/fwd_txfm_avx2.c
@@ -17,14 +17,6 @@
 #undef FDCT32x32_2D_AVX2
 #undef FDCT32x32_HIGH_PRECISION
 
-// TODO(luoyi): The following macro hides an error. The second parameter type of
-// function,
-//   void FDCT32x32_2D_AVX2(const int16_t *, int16_t*, int);
-// is different from the one in,
-//   void aom_fdct32x32_avx2(const int16_t *, tran_low_t*, int);
-// In CONFIG_AOM_HIGHBITDEPTH=1 build, the second parameter type should be
-// int32_t.
-// This function should be removed after av1_fht32x32 scaling/rounding fix.
 #define FDCT32x32_2D_AVX2 aom_fdct32x32_avx2
 #define FDCT32x32_HIGH_PRECISION 1
 #include "aom_dsp/x86/fwd_dct32x32_impl_avx2.h"  // NOLINT
diff --git a/aom_dsp/x86/fwd_txfm_sse2.h b/aom_dsp/x86/fwd_txfm_sse2.h
index 3261584..fe3e446 100644
--- a/aom_dsp/x86/fwd_txfm_sse2.h
+++ b/aom_dsp/x86/fwd_txfm_sse2.h
@@ -12,6 +12,8 @@
 #ifndef AOM_DSP_X86_FWD_TXFM_SSE2_H_
 #define AOM_DSP_X86_FWD_TXFM_SSE2_H_
 
+#include "aom_dsp/x86/txfm_common_intrin.h"
+
 #ifdef __cplusplus
 extern "C" {
 #endif
@@ -257,19 +259,6 @@
 #endif  // CONFIG_AOM_HIGHBITDEPTH
 }
 
-static INLINE void storeu_output(const __m128i *poutput, tran_low_t *dst_ptr) {
-#if CONFIG_AOM_HIGHBITDEPTH
-  const __m128i zero = _mm_setzero_si128();
-  const __m128i sign_bits = _mm_cmplt_epi16(*poutput, zero);
-  __m128i out0 = _mm_unpacklo_epi16(*poutput, sign_bits);
-  __m128i out1 = _mm_unpackhi_epi16(*poutput, sign_bits);
-  _mm_storeu_si128((__m128i *)(dst_ptr), out0);
-  _mm_storeu_si128((__m128i *)(dst_ptr + 4), out1);
-#else
-  _mm_storeu_si128((__m128i *)(dst_ptr), *poutput);
-#endif  // CONFIG_AOM_HIGHBITDEPTH
-}
-
 static INLINE __m128i mult_round_shift(const __m128i *pin0, const __m128i *pin1,
                                        const __m128i *pmultiplier,
                                        const __m128i *prounding,
diff --git a/aom_dsp/x86/fwd_txfm_ssse3_x86_64.asm b/aom_dsp/x86/fwd_txfm_ssse3_x86_64.asm
index 6f3c470..5b2aab2 100644
--- a/aom_dsp/x86/fwd_txfm_ssse3_x86_64.asm
+++ b/aom_dsp/x86/fwd_txfm_ssse3_x86_64.asm
@@ -130,12 +130,30 @@
   psraw              m%2, 1
 %endmacro
 
+%macro STORE_OUTPUT 2 ; index, result
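+; In CONFIG_AOM_HIGHBITDEPTH builds tran_low_t is 32 bits wide, so each
+; 16-bit result is sign-extended into a pair of dwords and the byte offset
+; becomes 4*%1 instead of 2*%1.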
+%if CONFIG_AOM_HIGHBITDEPTH
+  ; const __m128i sign_bits = _mm_cmplt_epi16(*poutput, zero);
+  ; __m128i out0 = _mm_unpacklo_epi16(*poutput, sign_bits);
+  ; __m128i out1 = _mm_unpackhi_epi16(*poutput, sign_bits);
+  ; _mm_store_si128((__m128i *)(dst_ptr), out0);
+  ; _mm_store_si128((__m128i *)(dst_ptr + 4), out1);
+  pxor               m11, m11
+  pcmpgtw            m11, m%2
+  movdqa             m12, m%2
+  punpcklwd          m%2, m11
+  punpckhwd          m12, m11
+  mova               [outputq + 4*%1 +  0], m%2
+  mova               [outputq + 4*%1 + 16], m12
+%else
+  mova               [outputq + 2*%1], m%2
+%endif
+%endmacro
+
 INIT_XMM ssse3
 cglobal fdct8x8, 3, 5, 13, input, output, stride
 
   mova               m8, [pd_8192]
   mova              m12, [pw_11585x2]
-  pxor              m11, m11
 
   lea                r3, [2 * strideq]
   lea                r4, [4 * strideq]
@@ -173,14 +191,14 @@
   DIVIDE_ROUND_2X   4, 5, 9, 10
   DIVIDE_ROUND_2X   6, 7, 9, 10
 
-  mova              [outputq +   0], m0
-  mova              [outputq +  16], m1
-  mova              [outputq +  32], m2
-  mova              [outputq +  48], m3
-  mova              [outputq +  64], m4
-  mova              [outputq +  80], m5
-  mova              [outputq +  96], m6
-  mova              [outputq + 112], m7
+  STORE_OUTPUT       0, 0
+  STORE_OUTPUT       8, 1
+  STORE_OUTPUT      16, 2
+  STORE_OUTPUT      24, 3
+  STORE_OUTPUT      32, 4
+  STORE_OUTPUT      40, 5
+  STORE_OUTPUT      48, 6
+  STORE_OUTPUT      56, 7
 
   RET
 %endif
diff --git a/aom_dsp/x86/inv_txfm_sse2.c b/aom_dsp/x86/inv_txfm_sse2.c
index 4735d97..2217a46 100644
--- a/aom_dsp/x86/inv_txfm_sse2.c
+++ b/aom_dsp/x86/inv_txfm_sse2.c
@@ -171,7 +171,7 @@
   RECON_AND_STORE4X4(dest + 3 * stride, dc_value);
 }
 
-void idct4_sse2(__m128i *in) {
+void aom_idct4_sse2(__m128i *in) {
   const __m128i k__cospi_p16_p16 = pair_set_epi16(cospi_16_64, cospi_16_64);
   const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
   const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
@@ -207,7 +207,7 @@
   in[1] = _mm_shuffle_epi32(in[1], 0x4E);
 }
 
-void iadst4_sse2(__m128i *in) {
+void aom_iadst4_sse2(__m128i *in) {
   const __m128i k__sinpi_p01_p04 = pair_set_epi16(sinpi_1_9, sinpi_4_9);
   const __m128i k__sinpi_p03_p02 = pair_set_epi16(sinpi_3_9, sinpi_2_9);
   const __m128i k__sinpi_p02_m01 = pair_set_epi16(sinpi_2_9, -sinpi_1_9);
@@ -533,7 +533,7 @@
   RECON_AND_STORE(dest + 7 * stride, dc_value);
 }
 
-void idct8_sse2(__m128i *in) {
+void aom_idct8_sse2(__m128i *in) {
   const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
   const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
   const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
@@ -558,7 +558,7 @@
         in[4], in[5], in[6], in[7]);
 }
 
-void iadst8_sse2(__m128i *in) {
+void aom_iadst8_sse2(__m128i *in) {
   const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64);
   const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64);
   const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64);
@@ -2114,13 +2114,13 @@
   in[15] = _mm_sub_epi16(s[0], s[15]);
 }
 
-void idct16_sse2(__m128i *in0, __m128i *in1) {
+void aom_idct16_sse2(__m128i *in0, __m128i *in1) {
   array_transpose_16x16(in0, in1);
   idct16_8col(in0);
   idct16_8col(in1);
 }
 
-void iadst16_sse2(__m128i *in0, __m128i *in1) {
+void aom_iadst16_sse2(__m128i *in0, __m128i *in1) {
   array_transpose_16x16(in0, in1);
   iadst16_8col(in0);
   iadst16_8col(in1);
@@ -3596,7 +3596,7 @@
 
   if (!test) {
     // Do the row transform
-    idct4_sse2(inptr);
+    aom_idct4_sse2(inptr);
 
     // Check the min & max values
     max_input = _mm_max_epi16(inptr[0], inptr[1]);
@@ -3632,7 +3632,7 @@
   }
 
   if (optimised_cols) {
-    idct4_sse2(inptr);
+    aom_idct4_sse2(inptr);
 
     // Final round and shift
     inptr[0] = _mm_add_epi16(inptr[0], eight);
@@ -3712,7 +3712,7 @@
 
   if (!test) {
     // Do the row transform
-    idct8_sse2(inptr);
+    aom_idct8_sse2(inptr);
 
     // Find the min & max for the column transform
     max_input = _mm_max_epi16(inptr[0], inptr[1]);
@@ -3749,7 +3749,7 @@
   }
 
   if (optimised_cols) {
-    idct8_sse2(inptr);
+    aom_idct8_sse2(inptr);
 
     // Final round & shift and Reconstruction and Store
     {
@@ -3813,7 +3813,7 @@
 
   if (!test) {
     // Do the row transform
-    idct8_sse2(inptr);
+    aom_idct8_sse2(inptr);
 
     // Find the min & max for the column transform
     // N.B. Only first 4 cols contain non-zero coeffs
@@ -3852,7 +3852,7 @@
   }
 
   if (optimised_cols) {
-    idct8_sse2(inptr);
+    aom_idct8_sse2(inptr);
 
     // Final round & shift and Reconstruction and Store
     {
@@ -3918,7 +3918,7 @@
 
   if (!test) {
     // Do the row transform
-    idct16_sse2(inptr, inptr + 16);
+    aom_idct16_sse2(inptr, inptr + 16);
 
     // Find the min & max for the column transform
     max_input = _mm_max_epi16(inptr[0], inptr[1]);
@@ -3960,7 +3960,7 @@
   }
 
   if (optimised_cols) {
-    idct16_sse2(inptr, inptr + 16);
+    aom_idct16_sse2(inptr, inptr + 16);
 
     // Final round & shift and Reconstruction and Store
     {
@@ -4033,7 +4033,7 @@
 
   if (!test) {
     // Do the row transform (N.B. This transposes inptr)
-    idct16_sse2(inptr, inptr + 16);
+    aom_idct16_sse2(inptr, inptr + 16);
 
     // Find the min & max for the column transform
     // N.B. Only first 4 cols contain non-zero coeffs
@@ -4078,7 +4078,7 @@
   }
 
   if (optimised_cols) {
-    idct16_sse2(inptr, inptr + 16);
+    aom_idct16_sse2(inptr, inptr + 16);
 
     // Final round & shift and Reconstruction and Store
     {
diff --git a/aom_dsp/x86/inv_txfm_sse2.h b/aom_dsp/x86/inv_txfm_sse2.h
index c271b28..4ebb34d 100644
--- a/aom_dsp/x86/inv_txfm_sse2.h
+++ b/aom_dsp/x86/inv_txfm_sse2.h
@@ -197,12 +197,12 @@
 
 void iadst16_8col(__m128i *in);
 void idct16_8col(__m128i *in);
-void idct4_sse2(__m128i *in);
-void idct8_sse2(__m128i *in);
-void idct16_sse2(__m128i *in0, __m128i *in1);
-void iadst4_sse2(__m128i *in);
-void iadst8_sse2(__m128i *in);
-void iadst16_sse2(__m128i *in0, __m128i *in1);
+void aom_idct4_sse2(__m128i *in);
+void aom_idct8_sse2(__m128i *in);
+void aom_idct16_sse2(__m128i *in0, __m128i *in1);
+void aom_iadst4_sse2(__m128i *in);
+void aom_iadst8_sse2(__m128i *in);
+void aom_iadst16_sse2(__m128i *in0, __m128i *in1);
 void idct32_8col(__m128i *in0, __m128i *in1);
 
 #endif  // AOM_DSP_X86_INV_TXFM_SSE2_H_
diff --git a/aom_dsp/x86/txfm_common_avx2.h b/aom_dsp/x86/txfm_common_avx2.h
index 7dc17f0..39e9b8e 100644
--- a/aom_dsp/x86/txfm_common_avx2.h
+++ b/aom_dsp/x86/txfm_common_avx2.h
@@ -14,6 +14,8 @@
 
 #include <immintrin.h>
 
+#include "aom_dsp/txfm_common.h"
+
 #define pair256_set_epi16(a, b)                                            \
   _mm256_set_epi16((int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a), \
                    (int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a), \
@@ -24,4 +26,179 @@
   _mm256_set_epi32((int)(b), (int)(a), (int)(b), (int)(a), (int)(b), (int)(a), \
                    (int)(b), (int)(a))
 
+static INLINE void mm256_reverse_epi16(__m256i *u) {
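+  // The byte shuffle reverses the 16-bit lanes within each 128-bit half;
+  // the permute then swaps the halves, reversing all sixteen lanes of *u.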
+  const __m256i control = _mm256_set_epi16(
+      0x0100, 0x0302, 0x0504, 0x0706, 0x0908, 0x0B0A, 0x0D0C, 0x0F0E, 0x0100,
+      0x0302, 0x0504, 0x0706, 0x0908, 0x0B0A, 0x0D0C, 0x0F0E);
+  __m256i v = _mm256_shuffle_epi8(*u, control);
+  *u = _mm256_permute2x128_si256(v, v, 1);
+}
+
+static INLINE void mm256_transpose_16x16(__m256i *in) {
+  __m256i tr0_0 = _mm256_unpacklo_epi16(in[0], in[1]);
+  __m256i tr0_1 = _mm256_unpackhi_epi16(in[0], in[1]);
+  __m256i tr0_2 = _mm256_unpacklo_epi16(in[2], in[3]);
+  __m256i tr0_3 = _mm256_unpackhi_epi16(in[2], in[3]);
+  __m256i tr0_4 = _mm256_unpacklo_epi16(in[4], in[5]);
+  __m256i tr0_5 = _mm256_unpackhi_epi16(in[4], in[5]);
+  __m256i tr0_6 = _mm256_unpacklo_epi16(in[6], in[7]);
+  __m256i tr0_7 = _mm256_unpackhi_epi16(in[6], in[7]);
+
+  __m256i tr0_8 = _mm256_unpacklo_epi16(in[8], in[9]);
+  __m256i tr0_9 = _mm256_unpackhi_epi16(in[8], in[9]);
+  __m256i tr0_a = _mm256_unpacklo_epi16(in[10], in[11]);
+  __m256i tr0_b = _mm256_unpackhi_epi16(in[10], in[11]);
+  __m256i tr0_c = _mm256_unpacklo_epi16(in[12], in[13]);
+  __m256i tr0_d = _mm256_unpackhi_epi16(in[12], in[13]);
+  __m256i tr0_e = _mm256_unpacklo_epi16(in[14], in[15]);
+  __m256i tr0_f = _mm256_unpackhi_epi16(in[14], in[15]);
+
+  // 00 10 01 11 02 12 03 13  08 18 09 19 0a 1a 0b 1b
+  // 04 14 05 15 06 16 07 17  0c 1c 0d 1d 0e 1e 0f 1f
+  // 20 30 21 31 22 32 23 33  28 38 29 39 2a 3a 2b 3b
+  // 24 34 25 35 26 36 27 37  2c 3c 2d 3d 2e 3e 2f 3f
+  // 40 50 41 51 42 52 43 53  48 58 49 59 4a 5a 4b 5b
+  // 44 54 45 55 46 56 47 57  4c 5c 4d 5d 4e 5e 4f 5f
+  // 60 70 61 71 62 72 63 73  68 78 69 79 6a 7a 6b 7b
+  // 64 74 65 75 66 76 67 77  6c 7c 6d 7d 6e 7e 6f 7f
+
+  // 80 90 81 91 82 92 83 93  88 98 89 99 8a 9a 8b 9b
+  // 84 94 85 95 86 96 87 97  8c 9c 8d 9d 8e 9e 8f 9f
+  // a0 b0 a1 b1 a2 b2 a3 b3  a8 b8 a9 b9 aa ba ab bb
+  // a4 b4 a5 b5 a6 b6 a7 b7  ac bc ad bd ae be af bf
+  // c0 d0 c1 d1 c2 d2 c3 d3  c8 d8 c9 d9 ca da cb db
+  // c4 d4 c5 d5 c6 d6 c7 d7  cc dc cd dd ce de cf df
+  // e0 f0 e1 f1 e2 f2 e3 f3  e8 f8 e9 f9 ea fa eb fb
+  // e4 f4 e5 f5 e6 f6 e7 f7  ec fc ed fd ee fe ef ff
+
+  __m256i tr1_0 = _mm256_unpacklo_epi32(tr0_0, tr0_2);
+  __m256i tr1_1 = _mm256_unpackhi_epi32(tr0_0, tr0_2);
+  __m256i tr1_2 = _mm256_unpacklo_epi32(tr0_1, tr0_3);
+  __m256i tr1_3 = _mm256_unpackhi_epi32(tr0_1, tr0_3);
+  __m256i tr1_4 = _mm256_unpacklo_epi32(tr0_4, tr0_6);
+  __m256i tr1_5 = _mm256_unpackhi_epi32(tr0_4, tr0_6);
+  __m256i tr1_6 = _mm256_unpacklo_epi32(tr0_5, tr0_7);
+  __m256i tr1_7 = _mm256_unpackhi_epi32(tr0_5, tr0_7);
+
+  __m256i tr1_8 = _mm256_unpacklo_epi32(tr0_8, tr0_a);
+  __m256i tr1_9 = _mm256_unpackhi_epi32(tr0_8, tr0_a);
+  __m256i tr1_a = _mm256_unpacklo_epi32(tr0_9, tr0_b);
+  __m256i tr1_b = _mm256_unpackhi_epi32(tr0_9, tr0_b);
+  __m256i tr1_c = _mm256_unpacklo_epi32(tr0_c, tr0_e);
+  __m256i tr1_d = _mm256_unpackhi_epi32(tr0_c, tr0_e);
+  __m256i tr1_e = _mm256_unpacklo_epi32(tr0_d, tr0_f);
+  __m256i tr1_f = _mm256_unpackhi_epi32(tr0_d, tr0_f);
+
+  // 00 10 20 30 01 11 21 31  08 18 28 38 09 19 29 39
+  // 02 12 22 32 03 13 23 33  0a 1a 2a 3a 0b 1b 2b 3b
+  // 04 14 24 34 05 15 25 35  0c 1c 2c 3c 0d 1d 2d 3d
+  // 06 16 26 36 07 17 27 37  0e 1e 2e 3e 0f 1f 2f 3f
+  // 40 50 60 70 41 51 61 71  48 58 68 78 49 59 69 79
+  // 42 52 62 72 43 53 63 73  4a 5a 6a 7a 4b 5b 6b 7b
+  // 44 54 64 74 45 55 65 75  4c 5c 6c 7c 4d 5d 6d 7d
+  // 46 56 66 76 47 57 67 77  4e 5e 6e 7e 4f 5f 6f 7f
+
+  // 80 90 a0 b0 81 91 a1 b1  88 98 a8 b8 89 99 a9 b9
+  // 82 92 a2 b2 83 93 a3 b3  8a 9a aa ba 8b 9b ab bb
+  // 84 94 a4 b4 85 95 a5 b5  8c 9c ac bc 8d 9d ad bd
+  // 86 96 a6 b6 87 97 a7 b7  8e 9e ae be 8f 9f af bf
+  // c0 d0 e0 f0 c1 d1 e1 f1  c8 d8 e8 f8 c9 d9 e9 f9
+  // c2 d2 e2 f2 c3 d3 e3 f3  ca da ea fa cb db eb fb
+  // c4 d4 e4 f4 c5 d5 e5 f5  cc dc ec fc cd dd ed fd
+  // c6 d6 e6 f6 c7 d7 e7 f7  ce de ee fe cf df ef ff
+
+  tr0_0 = _mm256_unpacklo_epi64(tr1_0, tr1_4);
+  tr0_1 = _mm256_unpackhi_epi64(tr1_0, tr1_4);
+  tr0_2 = _mm256_unpacklo_epi64(tr1_1, tr1_5);
+  tr0_3 = _mm256_unpackhi_epi64(tr1_1, tr1_5);
+  tr0_4 = _mm256_unpacklo_epi64(tr1_2, tr1_6);
+  tr0_5 = _mm256_unpackhi_epi64(tr1_2, tr1_6);
+  tr0_6 = _mm256_unpacklo_epi64(tr1_3, tr1_7);
+  tr0_7 = _mm256_unpackhi_epi64(tr1_3, tr1_7);
+
+  tr0_8 = _mm256_unpacklo_epi64(tr1_8, tr1_c);
+  tr0_9 = _mm256_unpackhi_epi64(tr1_8, tr1_c);
+  tr0_a = _mm256_unpacklo_epi64(tr1_9, tr1_d);
+  tr0_b = _mm256_unpackhi_epi64(tr1_9, tr1_d);
+  tr0_c = _mm256_unpacklo_epi64(tr1_a, tr1_e);
+  tr0_d = _mm256_unpackhi_epi64(tr1_a, tr1_e);
+  tr0_e = _mm256_unpacklo_epi64(tr1_b, tr1_f);
+  tr0_f = _mm256_unpackhi_epi64(tr1_b, tr1_f);
+
+  // 00 10 20 30 40 50 60 70  08 18 28 38 48 58 68 78
+  // 01 11 21 31 41 51 61 71  09 19 29 39 49 59 69 79
+  // 02 12 22 32 42 52 62 72  0a 1a 2a 3a 4a 5a 6a 7a
+  // 03 13 23 33 43 53 63 73  0b 1b 2b 3b 4b 5b 6b 7b
+  // 04 14 24 34 44 54 64 74  0c 1c 2c 3c 4c 5c 6c 7c
+  // 05 15 25 35 45 55 65 75  0d 1d 2d 3d 4d 5d 6d 7d
+  // 06 16 26 36 46 56 66 76  0e 1e 2e 3e 4e 5e 6e 7e
+  // 07 17 27 37 47 57 67 77  0f 1f 2f 3f 4f 5f 6f 7f
+
+  // 80 90 a0 b0 c0 d0 e0 f0  88 98 a8 b8 c8 d8 e8 f8
+  // 81 91 a1 b1 c1 d1 e1 f1  89 99 a9 b9 c9 d9 e9 f9
+  // 82 92 a2 b2 c2 d2 e2 f2  8a 9a aa ba ca da ea fa
+  // 83 93 a3 b3 c3 d3 e3 f3  8b 9b ab bb cb db eb fb
+  // 84 94 a4 b4 c4 d4 e4 f4  8c 9c ac bc cc dc ec fc
+  // 85 95 a5 b5 c5 d5 e5 f5  8d 9d ad bd cd dd ed fd
+  // 86 96 a6 b6 c6 d6 e6 f6  8e 9e ae be ce de ee fe
+  // 87 97 a7 b7 c7 d7 e7 f7  8f 9f af bf cf df ef ff
+
+  in[0] = _mm256_permute2x128_si256(tr0_0, tr0_8, 0x20);  // 0010 0000
+  in[8] = _mm256_permute2x128_si256(tr0_0, tr0_8, 0x31);  // 0011 0001
+  in[1] = _mm256_permute2x128_si256(tr0_1, tr0_9, 0x20);
+  in[9] = _mm256_permute2x128_si256(tr0_1, tr0_9, 0x31);
+  in[2] = _mm256_permute2x128_si256(tr0_2, tr0_a, 0x20);
+  in[10] = _mm256_permute2x128_si256(tr0_2, tr0_a, 0x31);
+  in[3] = _mm256_permute2x128_si256(tr0_3, tr0_b, 0x20);
+  in[11] = _mm256_permute2x128_si256(tr0_3, tr0_b, 0x31);
+
+  in[4] = _mm256_permute2x128_si256(tr0_4, tr0_c, 0x20);
+  in[12] = _mm256_permute2x128_si256(tr0_4, tr0_c, 0x31);
+  in[5] = _mm256_permute2x128_si256(tr0_5, tr0_d, 0x20);
+  in[13] = _mm256_permute2x128_si256(tr0_5, tr0_d, 0x31);
+  in[6] = _mm256_permute2x128_si256(tr0_6, tr0_e, 0x20);
+  in[14] = _mm256_permute2x128_si256(tr0_6, tr0_e, 0x31);
+  in[7] = _mm256_permute2x128_si256(tr0_7, tr0_f, 0x20);
+  in[15] = _mm256_permute2x128_si256(tr0_7, tr0_f, 0x31);
+}
+
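+// Classic DCT butterfly: each madd lane computes x * c1 + y * c2 from an
+// interleaved coefficient pair, which is then rounded and shifted back to
+// 16-bit precision.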
+static INLINE __m256i butter_fly(__m256i a0, __m256i a1, const __m256i cospi) {
+  const __m256i dct_rounding = _mm256_set1_epi32(DCT_CONST_ROUNDING);
+  __m256i y0 = _mm256_madd_epi16(a0, cospi);
+  __m256i y1 = _mm256_madd_epi16(a1, cospi);
+
+  y0 = _mm256_add_epi32(y0, dct_rounding);
+  y1 = _mm256_add_epi32(y1, dct_rounding);
+  y0 = _mm256_srai_epi32(y0, DCT_CONST_BITS);
+  y1 = _mm256_srai_epi32(y1, DCT_CONST_BITS);
+
+  return _mm256_packs_epi32(y0, y1);
+}
+
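+// Scales all 16 rows by c / 2^13: each lane is doubled, widened by the
+// madd against c, then rounded and shifted by DCT_CONST_BITS (14). The
+// local name sqrt2_epi16 suggests c is normally a Q14 sqrt(2) constant.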
+static INLINE void txfm_scaling16_avx2(const int16_t c, __m256i *in) {
+  const __m256i zero = _mm256_setzero_si256();
+  const __m256i sqrt2_epi16 = _mm256_set1_epi16(c);
+  const __m256i dct_const_rounding = _mm256_set1_epi32(DCT_CONST_ROUNDING);
+  __m256i u0, u1;
+  int i = 0;
+
+  while (i < 16) {
+    in[i] = _mm256_slli_epi16(in[i], 1);
+
+    u0 = _mm256_unpacklo_epi16(zero, in[i]);
+    u1 = _mm256_unpackhi_epi16(zero, in[i]);
+
+    u0 = _mm256_madd_epi16(u0, sqrt2_epi16);
+    u1 = _mm256_madd_epi16(u1, sqrt2_epi16);
+
+    u0 = _mm256_add_epi32(u0, dct_const_rounding);
+    u1 = _mm256_add_epi32(u1, dct_const_rounding);
+
+    u0 = _mm256_srai_epi32(u0, DCT_CONST_BITS);
+    u1 = _mm256_srai_epi32(u1, DCT_CONST_BITS);
+    in[i] = _mm256_packs_epi32(u0, u1);
+    i++;
+  }
+}
+
 #endif  // AOM_DSP_X86_TXFM_COMMON_AVX2_H
diff --git a/aom_dsp/x86/txfm_common_intrin.h b/aom_dsp/x86/txfm_common_intrin.h
new file mode 100644
index 0000000..890e048
--- /dev/null
+++ b/aom_dsp/x86/txfm_common_intrin.h
@@ -0,0 +1,31 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_DSP_X86_TXFM_COMMON_INTRIN_H_
+#define AOM_DSP_X86_TXFM_COMMON_INTRIN_H_
+
+// Note:
+//  This header should be included after any x86 intrinsics headers, since
+//  it uses __m128i and the _mm_* store intrinsics.
+
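+// Stores 8 coefficients with unaligned writes. With CONFIG_AOM_HIGHBITDEPTH
+// tran_low_t is 32 bits, so the 16-bit lanes are sign-extended into two
+// 128-bit stores; otherwise a single 128-bit store suffices.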
+static INLINE void storeu_output(const __m128i *poutput, tran_low_t *dst_ptr) {
+#if CONFIG_AOM_HIGHBITDEPTH
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i sign_bits = _mm_cmplt_epi16(*poutput, zero);
+  __m128i out0 = _mm_unpacklo_epi16(*poutput, sign_bits);
+  __m128i out1 = _mm_unpackhi_epi16(*poutput, sign_bits);
+  _mm_storeu_si128((__m128i *)(dst_ptr), out0);
+  _mm_storeu_si128((__m128i *)(dst_ptr + 4), out1);
+#else
+  _mm_storeu_si128((__m128i *)(dst_ptr), *poutput);
+#endif  // CONFIG_AOM_HIGHBITDEPTH
+}
+
+#endif  // AOM_DSP_X86_TXFM_COMMON_INTRIN_H_
diff --git a/aomenc.c b/aomenc.c
index e32a922..ef174c2 100644
--- a/aomenc.c
+++ b/aomenc.c
@@ -1964,6 +1964,7 @@
           { stream->config.cfg.g_input_bit_depth = input.bit_depth; });
     }
 
+#if CONFIG_AOM_HIGHBITDEPTH
     FOREACH_STREAM({
       if (input.fmt != AOM_IMG_FMT_I420 && input.fmt != AOM_IMG_FMT_I42016) {
         /* Automatically upgrade if input is non-4:2:0 but a 4:2:0 profile
@@ -1980,7 +1981,6 @@
           default: break;
         }
       }
-#if CONFIG_AOM_HIGHBITDEPTH
       /* Automatically set the codec bit depth to match the input bit depth.
        * Upgrade the profile if required. */
       if (stream->config.cfg.g_input_bit_depth >
@@ -2003,7 +2003,6 @@
       if (stream->config.cfg.g_profile > 1) {
         stream->config.use_16bit_internal = 1;
       }
-#endif
       if (profile_updated) {
         fprintf(stderr,
                 "Warning: automatically upgrading to profile %d to "
@@ -2011,6 +2010,31 @@
                 stream->config.cfg.g_profile);
       }
     });
+#else
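+    /* Without CONFIG_AOM_HIGHBITDEPTH only the chroma-format-driven profile
+       upgrade applies; there is no bit-depth-driven upgrade path. */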
+    FOREACH_STREAM({
+      if (input.fmt != AOM_IMG_FMT_I420 && input.fmt != AOM_IMG_FMT_I42016) {
+        /* Automatically upgrade if input is non-4:2:0 but a 4:2:0 profile
+           was selected. */
+        switch (stream->config.cfg.g_profile) {
+          case 0:
+            stream->config.cfg.g_profile = 1;
+            profile_updated = 1;
+            break;
+          case 2:
+            stream->config.cfg.g_profile = 3;
+            profile_updated = 1;
+            break;
+          default: break;
+        }
+      }
+      if (profile_updated) {
+        fprintf(stderr,
+                "Warning: automatically upgrading to profile %d to "
+                "match input format.\n",
+                stream->config.cfg.g_profile);
+      }
+    });
+#endif
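
The restructuring above duplicates the subsampling-driven profile upgrade so that non-high-bit-depth builds keep it while the bit-depth logic stays guarded. The rule itself is a small mapping; a hypothetical standalone form, assuming (as the switch implies) that profiles 1 and 3 are the non-4:2:0 counterparts of profiles 0 and 2:

// Hypothetical helper, for illustration only: map a 4:2:0 profile to its
// non-4:2:0 counterpart; other profiles are returned unchanged.
static int upgrade_profile_for_non_420(int profile) {
  switch (profile) {
    case 0: return 1;  // 8-bit 4:2:0 -> 8-bit non-4:2:0
    case 2: return 3;  // high-bit-depth 4:2:0 -> high-bit-depth non-4:2:0
    default: return profile;
  }
}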
 
     FOREACH_STREAM(set_stream_dimensions(stream, input.width, input.height));
     FOREACH_STREAM(validate_stream_config(stream, &global));
diff --git a/av1/av1_common.mk b/av1/av1_common.mk
index 38fb6fd..0fe4a89 100644
--- a/av1/av1_common.mk
+++ b/av1/av1_common.mk
@@ -30,8 +30,6 @@
 AV1_COMMON_SRCS-yes += common/filter.c
 AV1_COMMON_SRCS-yes += common/idct.h
 AV1_COMMON_SRCS-yes += common/idct.c
-AV1_COMMON_SRCS-yes += common/av1_inv_txfm.h
-AV1_COMMON_SRCS-yes += common/av1_inv_txfm.c
 AV1_COMMON_SRCS-yes += common/loopfilter.h
 AV1_COMMON_SRCS-yes += common/thread_common.h
 AV1_COMMON_SRCS-yes += common/mv.h
@@ -61,8 +59,6 @@
 AV1_COMMON_SRCS-yes += common/scan.c
 AV1_COMMON_SRCS-yes += common/scan.h
 # TODO(angiebird) the forward transform belongs under encoder/
-AV1_COMMON_SRCS-$(CONFIG_AV1_ENCODER) += common/av1_fwd_txfm.h
-AV1_COMMON_SRCS-$(CONFIG_AV1_ENCODER) += common/av1_fwd_txfm.c
 AV1_COMMON_SRCS-yes += common/av1_txfm.h
 AV1_COMMON_SRCS-yes += common/av1_fwd_txfm1d.h
 AV1_COMMON_SRCS-yes += common/av1_fwd_txfm1d.c
@@ -78,8 +74,8 @@
 AV1_COMMON_SRCS-$(HAVE_SSE4_1) += common/x86/av1_highbd_convolve_sse4.c
 AV1_COMMON_SRCS-$(HAVE_SSE4_1) += common/x86/av1_highbd_convolve_filters_sse4.h
 endif
-AV1_COMMON_SRCS-yes += common/av1_convolve.c
-AV1_COMMON_SRCS-yes += common/av1_convolve.h
+AV1_COMMON_SRCS-yes += common/convolve.c
+AV1_COMMON_SRCS-yes += common/convolve.h
 AV1_COMMON_SRCS-$(CONFIG_LOOP_RESTORATION) += common/restoration.h
 AV1_COMMON_SRCS-$(CONFIG_LOOP_RESTORATION) += common/restoration.c
 ifeq (yes,$(filter $(CONFIG_GLOBAL_MOTION) $(CONFIG_WARPED_MOTION),yes))
@@ -122,10 +118,9 @@
 AV1_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/idct16x16_msa.c
 
 AV1_COMMON_SRCS-$(HAVE_SSE2) += common/x86/idct_intrin_sse2.c
+AV1_COMMON_SRCS-$(HAVE_AVX2) += common/x86/hybrid_inv_txfm_avx2.c
+
 ifeq ($(CONFIG_AV1_ENCODER),yes)
-AV1_COMMON_SRCS-$(HAVE_SSE2) += common/x86/av1_fwd_txfm_sse2.c
-AV1_COMMON_SRCS-$(HAVE_SSE2) += common/x86/av1_fwd_dct32x32_impl_sse2.h
-AV1_COMMON_SRCS-$(HAVE_SSE2) += common/x86/av1_fwd_txfm_impl_sse2.h
 AV1_COMMON_SRCS-$(HAVE_SSE4_1) += common/x86/av1_txfm1d_sse4.h
 AV1_COMMON_SRCS-$(HAVE_SSE4_1) += common/x86/av1_fwd_txfm1d_sse4.c
 AV1_COMMON_SRCS-$(HAVE_SSE4_1) += common/x86/av1_fwd_txfm2d_sse4.c
@@ -143,7 +138,4 @@
 AV1_COMMON_SRCS-$(HAVE_SSE4_1) += common/x86/filterintra_sse4.c
 endif
 
-AV1_COMMON_SRCS-$(HAVE_SSE2) += common/x86/av1_inv_txfm_sse2.c
-AV1_COMMON_SRCS-$(HAVE_SSE2) += common/x86/av1_inv_txfm_sse2.h
-
 $(eval $(call rtcd_h_template,av1_rtcd,av1/common/av1_rtcd_defs.pl))
diff --git a/av1/av1_cx_iface.c b/av1/av1_cx_iface.c
index a7eb71e..e8069d6 100644
--- a/av1/av1_cx_iface.c
+++ b/av1/av1_cx_iface.c
@@ -142,10 +142,10 @@
     return AOM_CODEC_INVALID_PARAM; \
   } while (0)
 
-#define RANGE_CHECK(p, memb, lo, hi)                                 \
-  do {                                                               \
-    if (!(((p)->memb == lo || (p)->memb > (lo)) && (p)->memb <= hi)) \
-      ERROR(#memb " out of range [" #lo ".." #hi "]");               \
+#define RANGE_CHECK(p, memb, lo, hi)                   \
+  do {                                                 \
+    if (!((p)->memb >= (lo) && (p)->memb <= (hi)))     \
+      ERROR(#memb " out of range [" #lo ".." #hi "]"); \
   } while (0)
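
The rewritten RANGE_CHECK is both clearer and safer: the old body left `lo` unparenthesized in its `==` arm, so a bound built with any operator of lower precedence than `==` mis-parsed. A minimal demonstration, with hypothetical macro names and values:

#include <stdio.h>

// Hypothetical macros reproducing just the lower-bound arm (illustration).
#define OLD_GE(x, lo) ((x) == lo || (x) > (lo))  // bare `lo` in the == arm
#define NEW_GE(x, lo) ((x) >= (lo))

int main(void) {
  // With the bound written as a bitwise-or expression, the old arm parses
  // as ((x == 1) | 2), which is nonzero for every x.
  const int x = 0;
  printf("old: %d, new: %d\n", OLD_GE(x, 1 | 2), NEW_GE(x, 1 | 2));  // old: 1, new: 0
  return 0;
}

The RANGE_CHECK -> RANGE_CHECK_HI conversions later in this hunk are the companion cleanup: for fields whose lower bound is 0 and which cannot be negative, only the upper-bound test carries information.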
 
 #define RANGE_CHECK_HI(p, memb, hi)                                     \
@@ -176,7 +176,7 @@
   RANGE_CHECK_HI(cfg, rc_min_quantizer, cfg->rc_max_quantizer);
   RANGE_CHECK_BOOL(extra_cfg, lossless);
   RANGE_CHECK(extra_cfg, aq_mode, 0, AQ_MODE_COUNT - 1);
-  RANGE_CHECK(extra_cfg, frame_periodic_boost, 0, 1);
+  RANGE_CHECK_HI(extra_cfg, frame_periodic_boost, 1);
   RANGE_CHECK_HI(cfg, g_threads, 64);
   RANGE_CHECK_HI(cfg, g_lag_in_frames, MAX_LAG_BUFFERS);
   RANGE_CHECK(cfg, rc_end_usage, AOM_VBR, AOM_Q);
@@ -189,8 +189,8 @@
   RANGE_CHECK_HI(cfg, rc_resize_up_thresh, 100);
   RANGE_CHECK_HI(cfg, rc_resize_down_thresh, 100);
   RANGE_CHECK(cfg, g_pass, AOM_RC_ONE_PASS, AOM_RC_LAST_PASS);
-  RANGE_CHECK(extra_cfg, min_gf_interval, 0, (MAX_LAG_BUFFERS - 1));
-  RANGE_CHECK(extra_cfg, max_gf_interval, 0, (MAX_LAG_BUFFERS - 1));
+  RANGE_CHECK_HI(extra_cfg, min_gf_interval, MAX_LAG_BUFFERS - 1);
+  RANGE_CHECK_HI(extra_cfg, max_gf_interval, MAX_LAG_BUFFERS - 1);
   if (extra_cfg->max_gf_interval > 0) {
     RANGE_CHECK(extra_cfg, max_gf_interval, 2, (MAX_LAG_BUFFERS - 1));
   }
@@ -200,8 +200,8 @@
   }
 
   if (cfg->rc_resize_allowed == 1) {
-    RANGE_CHECK(cfg, rc_scaled_width, 0, cfg->g_w);
-    RANGE_CHECK(cfg, rc_scaled_height, 0, cfg->g_h);
+    RANGE_CHECK_HI(cfg, rc_scaled_width, cfg->g_w);
+    RANGE_CHECK_HI(cfg, rc_scaled_height, cfg->g_h);
   }
 
   // AV1 does not support a lower bound on the keyframe interval in
@@ -212,9 +212,9 @@
         "kf_min_dist not supported in auto mode, use 0 "
         "or kf_max_dist instead.");
 
-  RANGE_CHECK(extra_cfg, enable_auto_alt_ref, 0, 2);
+  RANGE_CHECK_HI(extra_cfg, enable_auto_alt_ref, 2);
 #if CONFIG_EXT_REFS
-  RANGE_CHECK(extra_cfg, enable_auto_bwd_ref, 0, 2);
+  RANGE_CHECK_HI(extra_cfg, enable_auto_bwd_ref, 2);
 #endif  // CONFIG_EXT_REFS
   RANGE_CHECK(extra_cfg, cpu_used, -8, 8);
   RANGE_CHECK_HI(extra_cfg, noise_sensitivity, 6);
@@ -239,13 +239,13 @@
       RANGE_CHECK(extra_cfg, tile_rows, 1, 64);
   }
 #else
-  RANGE_CHECK(extra_cfg, tile_columns, 0, 6);
-  RANGE_CHECK(extra_cfg, tile_rows, 0, 2);
+  RANGE_CHECK_HI(extra_cfg, tile_columns, 6);
+  RANGE_CHECK_HI(extra_cfg, tile_rows, 2);
 #endif  // CONFIG_EXT_TILE
   RANGE_CHECK_HI(extra_cfg, sharpness, 7);
-  RANGE_CHECK(extra_cfg, arnr_max_frames, 0, 15);
+  RANGE_CHECK_HI(extra_cfg, arnr_max_frames, 15);
   RANGE_CHECK_HI(extra_cfg, arnr_strength, 6);
-  RANGE_CHECK(extra_cfg, cq_level, 0, 63);
+  RANGE_CHECK_HI(extra_cfg, cq_level, 63);
   RANGE_CHECK(cfg, g_bit_depth, AOM_BITS_8, AOM_BITS_12);
   RANGE_CHECK(cfg, g_input_bit_depth, 8, 12);
   RANGE_CHECK(extra_cfg, content, AOM_CONTENT_DEFAULT, AOM_CONTENT_INVALID - 1);
diff --git a/av1/common/av1_fwd_txfm.c b/av1/common/av1_fwd_txfm.c
deleted file mode 100644
index 84a3876..0000000
--- a/av1/common/av1_fwd_txfm.c
+++ /dev/null
@@ -1,813 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include "av1/common/av1_fwd_txfm.h"
-#include <assert.h>
-#include "./av1_rtcd.h"
-
-void av1_fdct4x4_c(const int16_t *input, tran_low_t *output, int stride) {
-  // The 2D transform is done with two passes which are actually pretty
-  // similar. In the first one, we transform the columns and transpose
-  // the results. In the second one, we transform the rows. To achieve that,
-  // as the first pass results are transposed, we transpose the columns (that
-  // is the transposed rows) and transpose the results (so that it goes back
-  // in normal/row positions).
-  int pass;
-  // We need an intermediate buffer between passes.
-  tran_low_t intermediate[4 * 4];
-  const tran_low_t *in_low = NULL;
-  tran_low_t *out = intermediate;
-  // Do the two transform/transpose passes
-  for (pass = 0; pass < 2; ++pass) {
-    tran_high_t in_high[4];    // canbe16
-    tran_high_t step[4];       // canbe16
-    tran_high_t temp1, temp2;  // needs32
-    int i;
-    for (i = 0; i < 4; ++i) {
-      // Load inputs.
-      if (0 == pass) {
-        in_high[0] = input[0 * stride] * 16;
-        in_high[1] = input[1 * stride] * 16;
-        in_high[2] = input[2 * stride] * 16;
-        in_high[3] = input[3 * stride] * 16;
-        if (i == 0 && in_high[0]) {
-          in_high[0] += 1;
-        }
-      } else {
-        assert(in_low != NULL);
-        in_high[0] = in_low[0 * 4];
-        in_high[1] = in_low[1 * 4];
-        in_high[2] = in_low[2 * 4];
-        in_high[3] = in_low[3 * 4];
-        in_low++;
-      }
-      // Transform.
-      step[0] = in_high[0] + in_high[3];
-      step[1] = in_high[1] + in_high[2];
-      step[2] = in_high[1] - in_high[2];
-      step[3] = in_high[0] - in_high[3];
-      temp1 = (step[0] + step[1]) * cospi_16_64;
-      temp2 = (step[0] - step[1]) * cospi_16_64;
-      out[0] = (tran_low_t)fdct_round_shift(temp1);
-      out[2] = (tran_low_t)fdct_round_shift(temp2);
-      temp1 = step[2] * cospi_24_64 + step[3] * cospi_8_64;
-      temp2 = -step[2] * cospi_8_64 + step[3] * cospi_24_64;
-      out[1] = (tran_low_t)fdct_round_shift(temp1);
-      out[3] = (tran_low_t)fdct_round_shift(temp2);
-      // Do next column (which is a transposed row in second/horizontal pass)
-      input++;
-      out += 4;
-    }
-    // Setup in_low/out for next pass.
-    in_low = intermediate;
-    out = output;
-  }
-
-  {
-    int i, j;
-    for (i = 0; i < 4; ++i) {
-      for (j = 0; j < 4; ++j) output[j + i * 4] = (output[j + i * 4] + 1) >> 2;
-    }
-  }
-}
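
As a reminder of the fixed-point convention this deleted code used (fdct_round_shift(x) rounds and shifts by DCT_CONST_BITS == 14, and the cospi_*_64 tables hold Q14 cosines, e.g. cospi_16_64 == 11585 ~= cos(pi/4) * 2^14), one butterfly output worked through:

// For step[0] == step[1] == 64 (after the *16 input scaling above):
//   temp1  = (64 + 64) * 11585 = 1482880
//   out[0] = (1482880 + 8192) >> 14 = 91  ~= 128 * cos(pi/4)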
-
-void av1_fdct4x4_1_c(const int16_t *input, tran_low_t *output, int stride) {
-  int r, c;
-  tran_low_t sum = 0;
-  for (r = 0; r < 4; ++r)
-    for (c = 0; c < 4; ++c) sum += input[r * stride + c];
-
-  output[0] = sum << 1;
-  output[1] = 0;
-}
-
-void av1_fdct8x8_c(const int16_t *input, tran_low_t *final_output, int stride) {
-  int i, j;
-  tran_low_t intermediate[64];
-  int pass;
-  tran_low_t *output = intermediate;
-  const tran_low_t *in = NULL;
-
-  // Transform columns
-  for (pass = 0; pass < 2; ++pass) {
-    tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;  // canbe16
-    tran_high_t t0, t1, t2, t3;                  // needs32
-    tran_high_t x0, x1, x2, x3;                  // canbe16
-
-    for (i = 0; i < 8; i++) {
-      // stage 1
-      if (pass == 0) {
-        s0 = (input[0 * stride] + input[7 * stride]) * 4;
-        s1 = (input[1 * stride] + input[6 * stride]) * 4;
-        s2 = (input[2 * stride] + input[5 * stride]) * 4;
-        s3 = (input[3 * stride] + input[4 * stride]) * 4;
-        s4 = (input[3 * stride] - input[4 * stride]) * 4;
-        s5 = (input[2 * stride] - input[5 * stride]) * 4;
-        s6 = (input[1 * stride] - input[6 * stride]) * 4;
-        s7 = (input[0 * stride] - input[7 * stride]) * 4;
-        ++input;
-      } else {
-        s0 = in[0 * 8] + in[7 * 8];
-        s1 = in[1 * 8] + in[6 * 8];
-        s2 = in[2 * 8] + in[5 * 8];
-        s3 = in[3 * 8] + in[4 * 8];
-        s4 = in[3 * 8] - in[4 * 8];
-        s5 = in[2 * 8] - in[5 * 8];
-        s6 = in[1 * 8] - in[6 * 8];
-        s7 = in[0 * 8] - in[7 * 8];
-        ++in;
-      }
-
-      // fdct4(step, step);
-      x0 = s0 + s3;
-      x1 = s1 + s2;
-      x2 = s1 - s2;
-      x3 = s0 - s3;
-      t0 = (x0 + x1) * cospi_16_64;
-      t1 = (x0 - x1) * cospi_16_64;
-      t2 = x2 * cospi_24_64 + x3 * cospi_8_64;
-      t3 = -x2 * cospi_8_64 + x3 * cospi_24_64;
-      output[0] = (tran_low_t)fdct_round_shift(t0);
-      output[2] = (tran_low_t)fdct_round_shift(t2);
-      output[4] = (tran_low_t)fdct_round_shift(t1);
-      output[6] = (tran_low_t)fdct_round_shift(t3);
-
-      // Stage 2
-      t0 = (s6 - s5) * cospi_16_64;
-      t1 = (s6 + s5) * cospi_16_64;
-      t2 = fdct_round_shift(t0);
-      t3 = fdct_round_shift(t1);
-
-      // Stage 3
-      x0 = s4 + t2;
-      x1 = s4 - t2;
-      x2 = s7 - t3;
-      x3 = s7 + t3;
-
-      // Stage 4
-      t0 = x0 * cospi_28_64 + x3 * cospi_4_64;
-      t1 = x1 * cospi_12_64 + x2 * cospi_20_64;
-      t2 = x2 * cospi_12_64 + x1 * -cospi_20_64;
-      t3 = x3 * cospi_28_64 + x0 * -cospi_4_64;
-      output[1] = (tran_low_t)fdct_round_shift(t0);
-      output[3] = (tran_low_t)fdct_round_shift(t2);
-      output[5] = (tran_low_t)fdct_round_shift(t1);
-      output[7] = (tran_low_t)fdct_round_shift(t3);
-      output += 8;
-    }
-    in = intermediate;
-    output = final_output;
-  }
-
-  // Rows
-  for (i = 0; i < 8; ++i) {
-    for (j = 0; j < 8; ++j) final_output[j + i * 8] /= 2;
-  }
-}
-
-void av1_fdct8x8_1_c(const int16_t *input, tran_low_t *output, int stride) {
-  int r, c;
-  tran_low_t sum = 0;
-  for (r = 0; r < 8; ++r)
-    for (c = 0; c < 8; ++c) sum += input[r * stride + c];
-
-  output[0] = sum;
-  output[1] = 0;
-}
-
-void av1_fdct16x16_c(const int16_t *input, tran_low_t *output, int stride) {
-  // The 2D transform is done with two passes which are actually pretty
-  // similar. In the first one, we transform the columns and transpose
-  // the results. In the second one, we transform the rows. To achieve that,
-  // as the first pass results are transposed, we transpose the columns (that
-  // is the transposed rows) and transpose the results (so that it goes back
-  // in normal/row positions).
-  int pass;
-  // We need an intermediate buffer between passes.
-  tran_low_t intermediate[256];
-  const tran_low_t *in_low = NULL;
-  tran_low_t *out = intermediate;
-  // Do the two transform/transpose passes
-  for (pass = 0; pass < 2; ++pass) {
-    tran_high_t step1[8];      // canbe16
-    tran_high_t step2[8];      // canbe16
-    tran_high_t step3[8];      // canbe16
-    tran_high_t in_high[8];    // canbe16
-    tran_high_t temp1, temp2;  // needs32
-    int i;
-    for (i = 0; i < 16; i++) {
-      if (0 == pass) {
-        // Calculate input for the first 8 results.
-        in_high[0] = (input[0 * stride] + input[15 * stride]) * 4;
-        in_high[1] = (input[1 * stride] + input[14 * stride]) * 4;
-        in_high[2] = (input[2 * stride] + input[13 * stride]) * 4;
-        in_high[3] = (input[3 * stride] + input[12 * stride]) * 4;
-        in_high[4] = (input[4 * stride] + input[11 * stride]) * 4;
-        in_high[5] = (input[5 * stride] + input[10 * stride]) * 4;
-        in_high[6] = (input[6 * stride] + input[9 * stride]) * 4;
-        in_high[7] = (input[7 * stride] + input[8 * stride]) * 4;
-        // Calculate input for the next 8 results.
-        step1[0] = (input[7 * stride] - input[8 * stride]) * 4;
-        step1[1] = (input[6 * stride] - input[9 * stride]) * 4;
-        step1[2] = (input[5 * stride] - input[10 * stride]) * 4;
-        step1[3] = (input[4 * stride] - input[11 * stride]) * 4;
-        step1[4] = (input[3 * stride] - input[12 * stride]) * 4;
-        step1[5] = (input[2 * stride] - input[13 * stride]) * 4;
-        step1[6] = (input[1 * stride] - input[14 * stride]) * 4;
-        step1[7] = (input[0 * stride] - input[15 * stride]) * 4;
-      } else {
-        // Calculate input for the first 8 results.
-        assert(in_low != NULL);
-        in_high[0] = ((in_low[0 * 16] + 1) >> 2) + ((in_low[15 * 16] + 1) >> 2);
-        in_high[1] = ((in_low[1 * 16] + 1) >> 2) + ((in_low[14 * 16] + 1) >> 2);
-        in_high[2] = ((in_low[2 * 16] + 1) >> 2) + ((in_low[13 * 16] + 1) >> 2);
-        in_high[3] = ((in_low[3 * 16] + 1) >> 2) + ((in_low[12 * 16] + 1) >> 2);
-        in_high[4] = ((in_low[4 * 16] + 1) >> 2) + ((in_low[11 * 16] + 1) >> 2);
-        in_high[5] = ((in_low[5 * 16] + 1) >> 2) + ((in_low[10 * 16] + 1) >> 2);
-        in_high[6] = ((in_low[6 * 16] + 1) >> 2) + ((in_low[9 * 16] + 1) >> 2);
-        in_high[7] = ((in_low[7 * 16] + 1) >> 2) + ((in_low[8 * 16] + 1) >> 2);
-        // Calculate input for the next 8 results.
-        step1[0] = ((in_low[7 * 16] + 1) >> 2) - ((in_low[8 * 16] + 1) >> 2);
-        step1[1] = ((in_low[6 * 16] + 1) >> 2) - ((in_low[9 * 16] + 1) >> 2);
-        step1[2] = ((in_low[5 * 16] + 1) >> 2) - ((in_low[10 * 16] + 1) >> 2);
-        step1[3] = ((in_low[4 * 16] + 1) >> 2) - ((in_low[11 * 16] + 1) >> 2);
-        step1[4] = ((in_low[3 * 16] + 1) >> 2) - ((in_low[12 * 16] + 1) >> 2);
-        step1[5] = ((in_low[2 * 16] + 1) >> 2) - ((in_low[13 * 16] + 1) >> 2);
-        step1[6] = ((in_low[1 * 16] + 1) >> 2) - ((in_low[14 * 16] + 1) >> 2);
-        step1[7] = ((in_low[0 * 16] + 1) >> 2) - ((in_low[15 * 16] + 1) >> 2);
-        in_low++;
-      }
-      // Work on the first eight values; fdct8(input, even_results);
-      {
-        tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;  // canbe16
-        tran_high_t t0, t1, t2, t3;                  // needs32
-        tran_high_t x0, x1, x2, x3;                  // canbe16
-
-        // stage 1
-        s0 = in_high[0] + in_high[7];
-        s1 = in_high[1] + in_high[6];
-        s2 = in_high[2] + in_high[5];
-        s3 = in_high[3] + in_high[4];
-        s4 = in_high[3] - in_high[4];
-        s5 = in_high[2] - in_high[5];
-        s6 = in_high[1] - in_high[6];
-        s7 = in_high[0] - in_high[7];
-
-        // fdct4(step, step);
-        x0 = s0 + s3;
-        x1 = s1 + s2;
-        x2 = s1 - s2;
-        x3 = s0 - s3;
-        t0 = (x0 + x1) * cospi_16_64;
-        t1 = (x0 - x1) * cospi_16_64;
-        t2 = x3 * cospi_8_64 + x2 * cospi_24_64;
-        t3 = x3 * cospi_24_64 - x2 * cospi_8_64;
-        out[0] = (tran_low_t)fdct_round_shift(t0);
-        out[4] = (tran_low_t)fdct_round_shift(t2);
-        out[8] = (tran_low_t)fdct_round_shift(t1);
-        out[12] = (tran_low_t)fdct_round_shift(t3);
-
-        // Stage 2
-        t0 = (s6 - s5) * cospi_16_64;
-        t1 = (s6 + s5) * cospi_16_64;
-        t2 = fdct_round_shift(t0);
-        t3 = fdct_round_shift(t1);
-
-        // Stage 3
-        x0 = s4 + t2;
-        x1 = s4 - t2;
-        x2 = s7 - t3;
-        x3 = s7 + t3;
-
-        // Stage 4
-        t0 = x0 * cospi_28_64 + x3 * cospi_4_64;
-        t1 = x1 * cospi_12_64 + x2 * cospi_20_64;
-        t2 = x2 * cospi_12_64 + x1 * -cospi_20_64;
-        t3 = x3 * cospi_28_64 + x0 * -cospi_4_64;
-        out[2] = (tran_low_t)fdct_round_shift(t0);
-        out[6] = (tran_low_t)fdct_round_shift(t2);
-        out[10] = (tran_low_t)fdct_round_shift(t1);
-        out[14] = (tran_low_t)fdct_round_shift(t3);
-      }
-      // Work on the next eight values; step1 -> odd_results
-      {
-        // step 2
-        temp1 = (step1[5] - step1[2]) * cospi_16_64;
-        temp2 = (step1[4] - step1[3]) * cospi_16_64;
-        step2[2] = fdct_round_shift(temp1);
-        step2[3] = fdct_round_shift(temp2);
-        temp1 = (step1[4] + step1[3]) * cospi_16_64;
-        temp2 = (step1[5] + step1[2]) * cospi_16_64;
-        step2[4] = fdct_round_shift(temp1);
-        step2[5] = fdct_round_shift(temp2);
-        // step 3
-        step3[0] = step1[0] + step2[3];
-        step3[1] = step1[1] + step2[2];
-        step3[2] = step1[1] - step2[2];
-        step3[3] = step1[0] - step2[3];
-        step3[4] = step1[7] - step2[4];
-        step3[5] = step1[6] - step2[5];
-        step3[6] = step1[6] + step2[5];
-        step3[7] = step1[7] + step2[4];
-        // step 4
-        temp1 = step3[1] * -cospi_8_64 + step3[6] * cospi_24_64;
-        temp2 = step3[2] * cospi_24_64 + step3[5] * cospi_8_64;
-        step2[1] = fdct_round_shift(temp1);
-        step2[2] = fdct_round_shift(temp2);
-        temp1 = step3[2] * cospi_8_64 - step3[5] * cospi_24_64;
-        temp2 = step3[1] * cospi_24_64 + step3[6] * cospi_8_64;
-        step2[5] = fdct_round_shift(temp1);
-        step2[6] = fdct_round_shift(temp2);
-        // step 5
-        step1[0] = step3[0] + step2[1];
-        step1[1] = step3[0] - step2[1];
-        step1[2] = step3[3] + step2[2];
-        step1[3] = step3[3] - step2[2];
-        step1[4] = step3[4] - step2[5];
-        step1[5] = step3[4] + step2[5];
-        step1[6] = step3[7] - step2[6];
-        step1[7] = step3[7] + step2[6];
-        // step 6
-        temp1 = step1[0] * cospi_30_64 + step1[7] * cospi_2_64;
-        temp2 = step1[1] * cospi_14_64 + step1[6] * cospi_18_64;
-        out[1] = (tran_low_t)fdct_round_shift(temp1);
-        out[9] = (tran_low_t)fdct_round_shift(temp2);
-        temp1 = step1[2] * cospi_22_64 + step1[5] * cospi_10_64;
-        temp2 = step1[3] * cospi_6_64 + step1[4] * cospi_26_64;
-        out[5] = (tran_low_t)fdct_round_shift(temp1);
-        out[13] = (tran_low_t)fdct_round_shift(temp2);
-        temp1 = step1[3] * -cospi_26_64 + step1[4] * cospi_6_64;
-        temp2 = step1[2] * -cospi_10_64 + step1[5] * cospi_22_64;
-        out[3] = (tran_low_t)fdct_round_shift(temp1);
-        out[11] = (tran_low_t)fdct_round_shift(temp2);
-        temp1 = step1[1] * -cospi_18_64 + step1[6] * cospi_14_64;
-        temp2 = step1[0] * -cospi_2_64 + step1[7] * cospi_30_64;
-        out[7] = (tran_low_t)fdct_round_shift(temp1);
-        out[15] = (tran_low_t)fdct_round_shift(temp2);
-      }
-      // Do next column (which is a transposed row in second/horizontal pass)
-      input++;
-      out += 16;
-    }
-    // Setup in/out for next pass.
-    in_low = intermediate;
-    out = output;
-  }
-}
-
-void av1_fdct16x16_1_c(const int16_t *input, tran_low_t *output, int stride) {
-  int r, c;
-  tran_low_t sum = 0;
-  for (r = 0; r < 16; ++r)
-    for (c = 0; c < 16; ++c) sum += input[r * stride + c];
-
-  output[0] = sum >> 1;
-  output[1] = 0;
-}
-
-static INLINE tran_high_t dct_32_round(tran_high_t input) {
-  tran_high_t rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS);
-  // TODO(debargha, peter.derivaz): Find new bounds for this assert,
-  // and make the bounds consts.
-  // assert(-131072 <= rv && rv <= 131071);
-  return rv;
-}
-
-static INLINE tran_high_t half_round_shift(tran_high_t input) {
-  tran_high_t rv = (input + 1 + (input < 0)) >> 2;
-  return rv;
-}
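
half_round_shift divides by 4 while rounding symmetrically about zero; worked through for +/-5:

//   half_round_shift(5)  == (5 + 1 + 0) >> 2 ==  6 >> 2 ==  1
//   half_round_shift(-5) == (-5 + 1 + 1) >> 2 == -3 >> 2 == -1
// so x and -x always land on y and -y, keeping the scaling symmetric.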
-
-void av1_fdct32(const tran_high_t *input, tran_high_t *output, int round) {
-  tran_high_t step[32];
-  // Stage 1
-  step[0] = input[0] + input[(32 - 1)];
-  step[1] = input[1] + input[(32 - 2)];
-  step[2] = input[2] + input[(32 - 3)];
-  step[3] = input[3] + input[(32 - 4)];
-  step[4] = input[4] + input[(32 - 5)];
-  step[5] = input[5] + input[(32 - 6)];
-  step[6] = input[6] + input[(32 - 7)];
-  step[7] = input[7] + input[(32 - 8)];
-  step[8] = input[8] + input[(32 - 9)];
-  step[9] = input[9] + input[(32 - 10)];
-  step[10] = input[10] + input[(32 - 11)];
-  step[11] = input[11] + input[(32 - 12)];
-  step[12] = input[12] + input[(32 - 13)];
-  step[13] = input[13] + input[(32 - 14)];
-  step[14] = input[14] + input[(32 - 15)];
-  step[15] = input[15] + input[(32 - 16)];
-  step[16] = -input[16] + input[(32 - 17)];
-  step[17] = -input[17] + input[(32 - 18)];
-  step[18] = -input[18] + input[(32 - 19)];
-  step[19] = -input[19] + input[(32 - 20)];
-  step[20] = -input[20] + input[(32 - 21)];
-  step[21] = -input[21] + input[(32 - 22)];
-  step[22] = -input[22] + input[(32 - 23)];
-  step[23] = -input[23] + input[(32 - 24)];
-  step[24] = -input[24] + input[(32 - 25)];
-  step[25] = -input[25] + input[(32 - 26)];
-  step[26] = -input[26] + input[(32 - 27)];
-  step[27] = -input[27] + input[(32 - 28)];
-  step[28] = -input[28] + input[(32 - 29)];
-  step[29] = -input[29] + input[(32 - 30)];
-  step[30] = -input[30] + input[(32 - 31)];
-  step[31] = -input[31] + input[(32 - 32)];
-
-  // Stage 2
-  output[0] = step[0] + step[16 - 1];
-  output[1] = step[1] + step[16 - 2];
-  output[2] = step[2] + step[16 - 3];
-  output[3] = step[3] + step[16 - 4];
-  output[4] = step[4] + step[16 - 5];
-  output[5] = step[5] + step[16 - 6];
-  output[6] = step[6] + step[16 - 7];
-  output[7] = step[7] + step[16 - 8];
-  output[8] = -step[8] + step[16 - 9];
-  output[9] = -step[9] + step[16 - 10];
-  output[10] = -step[10] + step[16 - 11];
-  output[11] = -step[11] + step[16 - 12];
-  output[12] = -step[12] + step[16 - 13];
-  output[13] = -step[13] + step[16 - 14];
-  output[14] = -step[14] + step[16 - 15];
-  output[15] = -step[15] + step[16 - 16];
-
-  output[16] = step[16];
-  output[17] = step[17];
-  output[18] = step[18];
-  output[19] = step[19];
-
-  output[20] = dct_32_round((-step[20] + step[27]) * cospi_16_64);
-  output[21] = dct_32_round((-step[21] + step[26]) * cospi_16_64);
-  output[22] = dct_32_round((-step[22] + step[25]) * cospi_16_64);
-  output[23] = dct_32_round((-step[23] + step[24]) * cospi_16_64);
-
-  output[24] = dct_32_round((step[24] + step[23]) * cospi_16_64);
-  output[25] = dct_32_round((step[25] + step[22]) * cospi_16_64);
-  output[26] = dct_32_round((step[26] + step[21]) * cospi_16_64);
-  output[27] = dct_32_round((step[27] + step[20]) * cospi_16_64);
-
-  output[28] = step[28];
-  output[29] = step[29];
-  output[30] = step[30];
-  output[31] = step[31];
-
-  // Damp the magnitude by 4 so that the intermediate values stay within
-  // the range of 16 bits.
-  if (round) {
-    output[0] = half_round_shift(output[0]);
-    output[1] = half_round_shift(output[1]);
-    output[2] = half_round_shift(output[2]);
-    output[3] = half_round_shift(output[3]);
-    output[4] = half_round_shift(output[4]);
-    output[5] = half_round_shift(output[5]);
-    output[6] = half_round_shift(output[6]);
-    output[7] = half_round_shift(output[7]);
-    output[8] = half_round_shift(output[8]);
-    output[9] = half_round_shift(output[9]);
-    output[10] = half_round_shift(output[10]);
-    output[11] = half_round_shift(output[11]);
-    output[12] = half_round_shift(output[12]);
-    output[13] = half_round_shift(output[13]);
-    output[14] = half_round_shift(output[14]);
-    output[15] = half_round_shift(output[15]);
-
-    output[16] = half_round_shift(output[16]);
-    output[17] = half_round_shift(output[17]);
-    output[18] = half_round_shift(output[18]);
-    output[19] = half_round_shift(output[19]);
-    output[20] = half_round_shift(output[20]);
-    output[21] = half_round_shift(output[21]);
-    output[22] = half_round_shift(output[22]);
-    output[23] = half_round_shift(output[23]);
-    output[24] = half_round_shift(output[24]);
-    output[25] = half_round_shift(output[25]);
-    output[26] = half_round_shift(output[26]);
-    output[27] = half_round_shift(output[27]);
-    output[28] = half_round_shift(output[28]);
-    output[29] = half_round_shift(output[29]);
-    output[30] = half_round_shift(output[30]);
-    output[31] = half_round_shift(output[31]);
-  }
-
-  // Stage 3
-  step[0] = output[0] + output[(8 - 1)];
-  step[1] = output[1] + output[(8 - 2)];
-  step[2] = output[2] + output[(8 - 3)];
-  step[3] = output[3] + output[(8 - 4)];
-  step[4] = -output[4] + output[(8 - 5)];
-  step[5] = -output[5] + output[(8 - 6)];
-  step[6] = -output[6] + output[(8 - 7)];
-  step[7] = -output[7] + output[(8 - 8)];
-  step[8] = output[8];
-  step[9] = output[9];
-  step[10] = dct_32_round((-output[10] + output[13]) * cospi_16_64);
-  step[11] = dct_32_round((-output[11] + output[12]) * cospi_16_64);
-  step[12] = dct_32_round((output[12] + output[11]) * cospi_16_64);
-  step[13] = dct_32_round((output[13] + output[10]) * cospi_16_64);
-  step[14] = output[14];
-  step[15] = output[15];
-
-  step[16] = output[16] + output[23];
-  step[17] = output[17] + output[22];
-  step[18] = output[18] + output[21];
-  step[19] = output[19] + output[20];
-  step[20] = -output[20] + output[19];
-  step[21] = -output[21] + output[18];
-  step[22] = -output[22] + output[17];
-  step[23] = -output[23] + output[16];
-  step[24] = -output[24] + output[31];
-  step[25] = -output[25] + output[30];
-  step[26] = -output[26] + output[29];
-  step[27] = -output[27] + output[28];
-  step[28] = output[28] + output[27];
-  step[29] = output[29] + output[26];
-  step[30] = output[30] + output[25];
-  step[31] = output[31] + output[24];
-
-  // Stage 4
-  output[0] = step[0] + step[3];
-  output[1] = step[1] + step[2];
-  output[2] = -step[2] + step[1];
-  output[3] = -step[3] + step[0];
-  output[4] = step[4];
-  output[5] = dct_32_round((-step[5] + step[6]) * cospi_16_64);
-  output[6] = dct_32_round((step[6] + step[5]) * cospi_16_64);
-  output[7] = step[7];
-  output[8] = step[8] + step[11];
-  output[9] = step[9] + step[10];
-  output[10] = -step[10] + step[9];
-  output[11] = -step[11] + step[8];
-  output[12] = -step[12] + step[15];
-  output[13] = -step[13] + step[14];
-  output[14] = step[14] + step[13];
-  output[15] = step[15] + step[12];
-
-  output[16] = step[16];
-  output[17] = step[17];
-  output[18] = dct_32_round(step[18] * -cospi_8_64 + step[29] * cospi_24_64);
-  output[19] = dct_32_round(step[19] * -cospi_8_64 + step[28] * cospi_24_64);
-  output[20] = dct_32_round(step[20] * -cospi_24_64 + step[27] * -cospi_8_64);
-  output[21] = dct_32_round(step[21] * -cospi_24_64 + step[26] * -cospi_8_64);
-  output[22] = step[22];
-  output[23] = step[23];
-  output[24] = step[24];
-  output[25] = step[25];
-  output[26] = dct_32_round(step[26] * cospi_24_64 + step[21] * -cospi_8_64);
-  output[27] = dct_32_round(step[27] * cospi_24_64 + step[20] * -cospi_8_64);
-  output[28] = dct_32_round(step[28] * cospi_8_64 + step[19] * cospi_24_64);
-  output[29] = dct_32_round(step[29] * cospi_8_64 + step[18] * cospi_24_64);
-  output[30] = step[30];
-  output[31] = step[31];
-
-  // Stage 5
-  step[0] = dct_32_round((output[0] + output[1]) * cospi_16_64);
-  step[1] = dct_32_round((-output[1] + output[0]) * cospi_16_64);
-  step[2] = dct_32_round(output[2] * cospi_24_64 + output[3] * cospi_8_64);
-  step[3] = dct_32_round(output[3] * cospi_24_64 - output[2] * cospi_8_64);
-  step[4] = output[4] + output[5];
-  step[5] = -output[5] + output[4];
-  step[6] = -output[6] + output[7];
-  step[7] = output[7] + output[6];
-  step[8] = output[8];
-  step[9] = dct_32_round(output[9] * -cospi_8_64 + output[14] * cospi_24_64);
-  step[10] = dct_32_round(output[10] * -cospi_24_64 + output[13] * -cospi_8_64);
-  step[11] = output[11];
-  step[12] = output[12];
-  step[13] = dct_32_round(output[13] * cospi_24_64 + output[10] * -cospi_8_64);
-  step[14] = dct_32_round(output[14] * cospi_8_64 + output[9] * cospi_24_64);
-  step[15] = output[15];
-
-  step[16] = output[16] + output[19];
-  step[17] = output[17] + output[18];
-  step[18] = -output[18] + output[17];
-  step[19] = -output[19] + output[16];
-  step[20] = -output[20] + output[23];
-  step[21] = -output[21] + output[22];
-  step[22] = output[22] + output[21];
-  step[23] = output[23] + output[20];
-  step[24] = output[24] + output[27];
-  step[25] = output[25] + output[26];
-  step[26] = -output[26] + output[25];
-  step[27] = -output[27] + output[24];
-  step[28] = -output[28] + output[31];
-  step[29] = -output[29] + output[30];
-  step[30] = output[30] + output[29];
-  step[31] = output[31] + output[28];
-
-  // Stage 6
-  output[0] = step[0];
-  output[1] = step[1];
-  output[2] = step[2];
-  output[3] = step[3];
-  output[4] = dct_32_round(step[4] * cospi_28_64 + step[7] * cospi_4_64);
-  output[5] = dct_32_round(step[5] * cospi_12_64 + step[6] * cospi_20_64);
-  output[6] = dct_32_round(step[6] * cospi_12_64 + step[5] * -cospi_20_64);
-  output[7] = dct_32_round(step[7] * cospi_28_64 + step[4] * -cospi_4_64);
-  output[8] = step[8] + step[9];
-  output[9] = -step[9] + step[8];
-  output[10] = -step[10] + step[11];
-  output[11] = step[11] + step[10];
-  output[12] = step[12] + step[13];
-  output[13] = -step[13] + step[12];
-  output[14] = -step[14] + step[15];
-  output[15] = step[15] + step[14];
-
-  output[16] = step[16];
-  output[17] = dct_32_round(step[17] * -cospi_4_64 + step[30] * cospi_28_64);
-  output[18] = dct_32_round(step[18] * -cospi_28_64 + step[29] * -cospi_4_64);
-  output[19] = step[19];
-  output[20] = step[20];
-  output[21] = dct_32_round(step[21] * -cospi_20_64 + step[26] * cospi_12_64);
-  output[22] = dct_32_round(step[22] * -cospi_12_64 + step[25] * -cospi_20_64);
-  output[23] = step[23];
-  output[24] = step[24];
-  output[25] = dct_32_round(step[25] * cospi_12_64 + step[22] * -cospi_20_64);
-  output[26] = dct_32_round(step[26] * cospi_20_64 + step[21] * cospi_12_64);
-  output[27] = step[27];
-  output[28] = step[28];
-  output[29] = dct_32_round(step[29] * cospi_28_64 + step[18] * -cospi_4_64);
-  output[30] = dct_32_round(step[30] * cospi_4_64 + step[17] * cospi_28_64);
-  output[31] = step[31];
-
-  // Stage 7
-  step[0] = output[0];
-  step[1] = output[1];
-  step[2] = output[2];
-  step[3] = output[3];
-  step[4] = output[4];
-  step[5] = output[5];
-  step[6] = output[6];
-  step[7] = output[7];
-  step[8] = dct_32_round(output[8] * cospi_30_64 + output[15] * cospi_2_64);
-  step[9] = dct_32_round(output[9] * cospi_14_64 + output[14] * cospi_18_64);
-  step[10] = dct_32_round(output[10] * cospi_22_64 + output[13] * cospi_10_64);
-  step[11] = dct_32_round(output[11] * cospi_6_64 + output[12] * cospi_26_64);
-  step[12] = dct_32_round(output[12] * cospi_6_64 + output[11] * -cospi_26_64);
-  step[13] = dct_32_round(output[13] * cospi_22_64 + output[10] * -cospi_10_64);
-  step[14] = dct_32_round(output[14] * cospi_14_64 + output[9] * -cospi_18_64);
-  step[15] = dct_32_round(output[15] * cospi_30_64 + output[8] * -cospi_2_64);
-
-  step[16] = output[16] + output[17];
-  step[17] = -output[17] + output[16];
-  step[18] = -output[18] + output[19];
-  step[19] = output[19] + output[18];
-  step[20] = output[20] + output[21];
-  step[21] = -output[21] + output[20];
-  step[22] = -output[22] + output[23];
-  step[23] = output[23] + output[22];
-  step[24] = output[24] + output[25];
-  step[25] = -output[25] + output[24];
-  step[26] = -output[26] + output[27];
-  step[27] = output[27] + output[26];
-  step[28] = output[28] + output[29];
-  step[29] = -output[29] + output[28];
-  step[30] = -output[30] + output[31];
-  step[31] = output[31] + output[30];
-
-  // Final stage --- output indices are bit-reversed.
-  output[0] = step[0];
-  output[16] = step[1];
-  output[8] = step[2];
-  output[24] = step[3];
-  output[4] = step[4];
-  output[20] = step[5];
-  output[12] = step[6];
-  output[28] = step[7];
-  output[2] = step[8];
-  output[18] = step[9];
-  output[10] = step[10];
-  output[26] = step[11];
-  output[6] = step[12];
-  output[22] = step[13];
-  output[14] = step[14];
-  output[30] = step[15];
-
-  output[1] = dct_32_round(step[16] * cospi_31_64 + step[31] * cospi_1_64);
-  output[17] = dct_32_round(step[17] * cospi_15_64 + step[30] * cospi_17_64);
-  output[9] = dct_32_round(step[18] * cospi_23_64 + step[29] * cospi_9_64);
-  output[25] = dct_32_round(step[19] * cospi_7_64 + step[28] * cospi_25_64);
-  output[5] = dct_32_round(step[20] * cospi_27_64 + step[27] * cospi_5_64);
-  output[21] = dct_32_round(step[21] * cospi_11_64 + step[26] * cospi_21_64);
-  output[13] = dct_32_round(step[22] * cospi_19_64 + step[25] * cospi_13_64);
-  output[29] = dct_32_round(step[23] * cospi_3_64 + step[24] * cospi_29_64);
-  output[3] = dct_32_round(step[24] * cospi_3_64 + step[23] * -cospi_29_64);
-  output[19] = dct_32_round(step[25] * cospi_19_64 + step[22] * -cospi_13_64);
-  output[11] = dct_32_round(step[26] * cospi_11_64 + step[21] * -cospi_21_64);
-  output[27] = dct_32_round(step[27] * cospi_27_64 + step[20] * -cospi_5_64);
-  output[7] = dct_32_round(step[28] * cospi_7_64 + step[19] * -cospi_25_64);
-  output[23] = dct_32_round(step[29] * cospi_23_64 + step[18] * -cospi_9_64);
-  output[15] = dct_32_round(step[30] * cospi_15_64 + step[17] * -cospi_17_64);
-  output[31] = dct_32_round(step[31] * cospi_31_64 + step[16] * -cospi_1_64);
-}
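
The final-stage "bit reversal" is a reversal of the 5-bit step index; for example:

//   step[1]  (binary 00001) -> output[16] (binary 10000)
//   step[3]  (binary 00011) -> output[24] (binary 11000)
//   step[12] (binary 01100) -> output[6]  (binary 00110)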
-
-void av1_fdct32x32_c(const int16_t *input, tran_low_t *out, int stride) {
-  int i, j;
-  tran_high_t output[32 * 32];
-
-  // Columns
-  for (i = 0; i < 32; ++i) {
-    tran_high_t temp_in[32], temp_out[32];
-    for (j = 0; j < 32; ++j) temp_in[j] = input[j * stride + i] * 4;
-    av1_fdct32(temp_in, temp_out, 0);
-    for (j = 0; j < 32; ++j)
-      output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2;
-  }
-
-  // Rows
-  for (i = 0; i < 32; ++i) {
-    tran_high_t temp_in[32], temp_out[32];
-    for (j = 0; j < 32; ++j) temp_in[j] = output[j + i * 32];
-    av1_fdct32(temp_in, temp_out, 0);
-    for (j = 0; j < 32; ++j)
-      out[j + i * 32] =
-          (tran_low_t)((temp_out[j] + 1 + (temp_out[j] < 0)) >> 2);
-  }
-}
-
-// Note that although dct_32_round is used in the dct32 computation flow,
-// this 2D fdct32x32 for the rate-distortion optimization loop operates
-// within 16-bit precision.
-void av1_fdct32x32_rd_c(const int16_t *input, tran_low_t *out, int stride) {
-  int i, j;
-  tran_high_t output[32 * 32];
-
-  // Columns
-  for (i = 0; i < 32; ++i) {
-    tran_high_t temp_in[32], temp_out[32];
-    for (j = 0; j < 32; ++j) temp_in[j] = input[j * stride + i] * 4;
-    av1_fdct32(temp_in, temp_out, 0);
-    for (j = 0; j < 32; ++j)
-      // TODO(cd): see quality impact of only doing
-      //           output[j * 32 + i] = (temp_out[j] + 1) >> 2;
-      //           PS: also change code in av1_dsp/x86/av1_dct_sse2.c
-      output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2;
-  }
-
-  // Rows
-  for (i = 0; i < 32; ++i) {
-    tran_high_t temp_in[32], temp_out[32];
-    for (j = 0; j < 32; ++j) temp_in[j] = output[j + i * 32];
-    av1_fdct32(temp_in, temp_out, 1);
-    for (j = 0; j < 32; ++j) out[j + i * 32] = (tran_low_t)temp_out[j];
-  }
-}
-
-void av1_fdct32x32_1_c(const int16_t *input, tran_low_t *output, int stride) {
-  int r, c;
-  tran_low_t sum = 0;
-  for (r = 0; r < 32; ++r)
-    for (c = 0; c < 32; ++c) sum += input[r * stride + c];
-
-  output[0] = sum >> 3;
-  output[1] = 0;
-}
-
-#if CONFIG_AOM_HIGHBITDEPTH
-void av1_highbd_fdct4x4_c(const int16_t *input, tran_low_t *output,
-                          int stride) {
-  av1_fdct4x4_c(input, output, stride);
-}
-
-void av1_highbd_fdct8x8_c(const int16_t *input, tran_low_t *final_output,
-                          int stride) {
-  av1_fdct8x8_c(input, final_output, stride);
-}
-
-void av1_highbd_fdct8x8_1_c(const int16_t *input, tran_low_t *final_output,
-                            int stride) {
-  av1_fdct8x8_1_c(input, final_output, stride);
-}
-
-void av1_highbd_fdct16x16_c(const int16_t *input, tran_low_t *output,
-                            int stride) {
-  av1_fdct16x16_c(input, output, stride);
-}
-
-void av1_highbd_fdct16x16_1_c(const int16_t *input, tran_low_t *output,
-                              int stride) {
-  av1_fdct16x16_1_c(input, output, stride);
-}
-
-void av1_highbd_fdct32x32_c(const int16_t *input, tran_low_t *out, int stride) {
-  av1_fdct32x32_c(input, out, stride);
-}
-
-void av1_highbd_fdct32x32_rd_c(const int16_t *input, tran_low_t *out,
-                               int stride) {
-  av1_fdct32x32_rd_c(input, out, stride);
-}
-
-void av1_highbd_fdct32x32_1_c(const int16_t *input, tran_low_t *out,
-                              int stride) {
-  av1_fdct32x32_1_c(input, out, stride);
-}
-#endif  // CONFIG_AOM_HIGHBITDEPTH
diff --git a/av1/common/av1_fwd_txfm.h b/av1/common/av1_fwd_txfm.h
deleted file mode 100644
index db763e5..0000000
--- a/av1/common/av1_fwd_txfm.h
+++ /dev/null
@@ -1,19 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#ifndef AV1_COMMON_AV1_FWD_TXFM_H_
-#define AV1_COMMON_AV1_FWD_TXFM_H_
-
-#include "aom_dsp/txfm_common.h"
-#include "aom_dsp/fwd_txfm.h"
-
-void av1_fdct32(const tran_high_t *input, tran_high_t *output, int round);
-#endif  // AV1_COMMON_AV1_FWD_TXFM_H_
diff --git a/av1/common/av1_fwd_txfm2d_cfg.h b/av1/common/av1_fwd_txfm2d_cfg.h
index 49d324d..5a7c218 100644
--- a/av1/common/av1_fwd_txfm2d_cfg.h
+++ b/av1/common/av1_fwd_txfm2d_cfg.h
@@ -109,7 +109,7 @@
 };  // .txfm_type_row
 
 //  ---------------- config fwd_dct_dct_64 ----------------
-static const int8_t fwd_shift_dct_dct_64[3] = { 2, -2, -2 };
+static const int8_t fwd_shift_dct_dct_64[3] = { 0, -2, -2 };
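
Assuming the three entries are, as the fwd_shift naming suggests, the shifts applied to the input, after the column pass, and after the row pass, the change reads as:

// Old: net scaling 2^(2 - 2 - 2) = 2^-2, with a *4 pre-shift feeding the
//      64-point column transform.
// New: net scaling 2^(0 - 2 - 2) = 2^-4, with no pre-shift, i.e. two fewer
//      bits of dynamic range entering the column pass.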
 static const int8_t fwd_stage_range_col_dct_dct_64[12] = {
   13, 14, 15, 16, 17, 18, 19, 19, 19, 19, 19, 19
 };
diff --git a/av1/common/av1_inv_txfm.c b/av1/common/av1_inv_txfm.c
deleted file mode 100644
index 4b2f061..0000000
--- a/av1/common/av1_inv_txfm.c
+++ /dev/null
@@ -1,2468 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <math.h>
-#include <string.h>
-
-#include "./av1_rtcd.h"
-#include "av1/common/av1_inv_txfm.h"
-
-void av1_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
-  /* 4-point reversible, orthonormal inverse Walsh-Hadamard in 3.5 adds,
-     0.5 shifts per pixel. */
-  int i;
-  tran_low_t output[16];
-  tran_high_t a1, b1, c1, d1, e1;
-  const tran_low_t *ip = input;
-  tran_low_t *op = output;
-
-  for (i = 0; i < 4; i++) {
-    a1 = ip[0] >> UNIT_QUANT_SHIFT;
-    c1 = ip[1] >> UNIT_QUANT_SHIFT;
-    d1 = ip[2] >> UNIT_QUANT_SHIFT;
-    b1 = ip[3] >> UNIT_QUANT_SHIFT;
-    a1 += c1;
-    d1 -= b1;
-    e1 = (a1 - d1) >> 1;
-    b1 = e1 - b1;
-    c1 = e1 - c1;
-    a1 -= b1;
-    d1 += c1;
-    op[0] = WRAPLOW(a1);
-    op[1] = WRAPLOW(b1);
-    op[2] = WRAPLOW(c1);
-    op[3] = WRAPLOW(d1);
-    ip += 4;
-    op += 4;
-  }
-
-  ip = output;
-  for (i = 0; i < 4; i++) {
-    a1 = ip[4 * 0];
-    c1 = ip[4 * 1];
-    d1 = ip[4 * 2];
-    b1 = ip[4 * 3];
-    a1 += c1;
-    d1 -= b1;
-    e1 = (a1 - d1) >> 1;
-    b1 = e1 - b1;
-    c1 = e1 - c1;
-    a1 -= b1;
-    d1 += c1;
-    dest[stride * 0] = clip_pixel_add(dest[stride * 0], WRAPLOW(a1));
-    dest[stride * 1] = clip_pixel_add(dest[stride * 1], WRAPLOW(b1));
-    dest[stride * 2] = clip_pixel_add(dest[stride * 2], WRAPLOW(c1));
-    dest[stride * 3] = clip_pixel_add(dest[stride * 3], WRAPLOW(d1));
-
-    ip++;
-    dest++;
-  }
-}
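
The operation count claimed in the comment above checks out: each 4-sample pass of either loop body uses 7 additions/subtractions and 1 shift, and every pixel goes through both the row and the column pass:

//   add/subs per pixel = (2 passes * 7 add/sub) / 4 samples = 3.5
//   shifts per pixel   = (2 passes * 1 shift)   / 4 samples = 0.5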
-
-void av1_iwht4x4_1_add_c(const tran_low_t *in, uint8_t *dest, int dest_stride) {
-  int i;
-  tran_high_t a1, e1;
-  tran_low_t tmp[4];
-  const tran_low_t *ip = in;
-  tran_low_t *op = tmp;
-
-  a1 = ip[0] >> UNIT_QUANT_SHIFT;
-  e1 = a1 >> 1;
-  a1 -= e1;
-  op[0] = WRAPLOW(a1);
-  op[1] = op[2] = op[3] = WRAPLOW(e1);
-
-  ip = tmp;
-  for (i = 0; i < 4; i++) {
-    e1 = ip[0] >> 1;
-    a1 = ip[0] - e1;
-    dest[dest_stride * 0] = clip_pixel_add(dest[dest_stride * 0], a1);
-    dest[dest_stride * 1] = clip_pixel_add(dest[dest_stride * 1], e1);
-    dest[dest_stride * 2] = clip_pixel_add(dest[dest_stride * 2], e1);
-    dest[dest_stride * 3] = clip_pixel_add(dest[dest_stride * 3], e1);
-    ip++;
-    dest++;
-  }
-}
-
-void av1_idct4_c(const tran_low_t *input, tran_low_t *output) {
-  tran_low_t step[4];
-  tran_high_t temp1, temp2;
-  // stage 1
-  temp1 = (input[0] + input[2]) * cospi_16_64;
-  temp2 = (input[0] - input[2]) * cospi_16_64;
-  step[0] = WRAPLOW(dct_const_round_shift(temp1));
-  step[1] = WRAPLOW(dct_const_round_shift(temp2));
-  temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64;
-  temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64;
-  step[2] = WRAPLOW(dct_const_round_shift(temp1));
-  step[3] = WRAPLOW(dct_const_round_shift(temp2));
-
-  // stage 2
-  output[0] = WRAPLOW(step[0] + step[3]);
-  output[1] = WRAPLOW(step[1] + step[2]);
-  output[2] = WRAPLOW(step[1] - step[2]);
-  output[3] = WRAPLOW(step[0] - step[3]);
-}
-
-void av1_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
-  tran_low_t out[4 * 4];
-  tran_low_t *outptr = out;
-  int i, j;
-  tran_low_t temp_in[4], temp_out[4];
-
-  // Rows
-  for (i = 0; i < 4; ++i) {
-    av1_idct4_c(input, outptr);
-    input += 4;
-    outptr += 4;
-  }
-
-  // Columns
-  for (i = 0; i < 4; ++i) {
-    for (j = 0; j < 4; ++j) temp_in[j] = out[j * 4 + i];
-    av1_idct4_c(temp_in, temp_out);
-    for (j = 0; j < 4; ++j) {
-      dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
-                                            ROUND_POWER_OF_TWO(temp_out[j], 4));
-    }
-  }
-}
-
-void av1_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest,
-                         int dest_stride) {
-  int i;
-  tran_high_t a1;
-  tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64));
-  out = WRAPLOW(dct_const_round_shift(out * cospi_16_64));
-  a1 = ROUND_POWER_OF_TWO(out, 4);
-
-  for (i = 0; i < 4; i++) {
-    dest[0] = clip_pixel_add(dest[0], a1);
-    dest[1] = clip_pixel_add(dest[1], a1);
-    dest[2] = clip_pixel_add(dest[2], a1);
-    dest[3] = clip_pixel_add(dest[3], a1);
-    dest += dest_stride;
-  }
-}
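
The DC-only path is plain arithmetic once the constants are substituted (cospi_16_64 == 11585, ~0.7071 in Q14, so two successive multiplies halve the input):

// For input[0] == 1024:
//   out = (1024 * 11585 + 8192) >> 14 = 724   ~= 1024 * 0.7071
//   out = ( 724 * 11585 + 8192) >> 14 = 512   ~= 1024 * 0.5
//   a1  = ROUND_POWER_OF_TWO(512, 4)  = 32, added to all 16 pixels.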
-
-void av1_idct8_c(const tran_low_t *input, tran_low_t *output) {
-  tran_low_t step1[8], step2[8];
-  tran_high_t temp1, temp2;
-  // stage 1
-  step1[0] = input[0];
-  step1[2] = input[4];
-  step1[1] = input[2];
-  step1[3] = input[6];
-  temp1 = input[1] * cospi_28_64 - input[7] * cospi_4_64;
-  temp2 = input[1] * cospi_4_64 + input[7] * cospi_28_64;
-  step1[4] = WRAPLOW(dct_const_round_shift(temp1));
-  step1[7] = WRAPLOW(dct_const_round_shift(temp2));
-  temp1 = input[5] * cospi_12_64 - input[3] * cospi_20_64;
-  temp2 = input[5] * cospi_20_64 + input[3] * cospi_12_64;
-  step1[5] = WRAPLOW(dct_const_round_shift(temp1));
-  step1[6] = WRAPLOW(dct_const_round_shift(temp2));
-
-  // stage 2
-  temp1 = (step1[0] + step1[2]) * cospi_16_64;
-  temp2 = (step1[0] - step1[2]) * cospi_16_64;
-  step2[0] = WRAPLOW(dct_const_round_shift(temp1));
-  step2[1] = WRAPLOW(dct_const_round_shift(temp2));
-  temp1 = step1[1] * cospi_24_64 - step1[3] * cospi_8_64;
-  temp2 = step1[1] * cospi_8_64 + step1[3] * cospi_24_64;
-  step2[2] = WRAPLOW(dct_const_round_shift(temp1));
-  step2[3] = WRAPLOW(dct_const_round_shift(temp2));
-  step2[4] = WRAPLOW(step1[4] + step1[5]);
-  step2[5] = WRAPLOW(step1[4] - step1[5]);
-  step2[6] = WRAPLOW(-step1[6] + step1[7]);
-  step2[7] = WRAPLOW(step1[6] + step1[7]);
-
-  // stage 3
-  step1[0] = WRAPLOW(step2[0] + step2[3]);
-  step1[1] = WRAPLOW(step2[1] + step2[2]);
-  step1[2] = WRAPLOW(step2[1] - step2[2]);
-  step1[3] = WRAPLOW(step2[0] - step2[3]);
-  step1[4] = step2[4];
-  temp1 = (step2[6] - step2[5]) * cospi_16_64;
-  temp2 = (step2[5] + step2[6]) * cospi_16_64;
-  step1[5] = WRAPLOW(dct_const_round_shift(temp1));
-  step1[6] = WRAPLOW(dct_const_round_shift(temp2));
-  step1[7] = step2[7];
-
-  // stage 4
-  output[0] = WRAPLOW(step1[0] + step1[7]);
-  output[1] = WRAPLOW(step1[1] + step1[6]);
-  output[2] = WRAPLOW(step1[2] + step1[5]);
-  output[3] = WRAPLOW(step1[3] + step1[4]);
-  output[4] = WRAPLOW(step1[3] - step1[4]);
-  output[5] = WRAPLOW(step1[2] - step1[5]);
-  output[6] = WRAPLOW(step1[1] - step1[6]);
-  output[7] = WRAPLOW(step1[0] - step1[7]);
-}
-
-void av1_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
-  tran_low_t out[8 * 8];
-  tran_low_t *outptr = out;
-  int i, j;
-  tran_low_t temp_in[8], temp_out[8];
-
-  // First transform rows
-  for (i = 0; i < 8; ++i) {
-    av1_idct8_c(input, outptr);
-    input += 8;
-    outptr += 8;
-  }
-
-  // Then transform columns
-  for (i = 0; i < 8; ++i) {
-    for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i];
-    av1_idct8_c(temp_in, temp_out);
-    for (j = 0; j < 8; ++j) {
-      dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
-                                            ROUND_POWER_OF_TWO(temp_out[j], 5));
-    }
-  }
-}
-
-void av1_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
-  int i, j;
-  tran_high_t a1;
-  tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64));
-  out = WRAPLOW(dct_const_round_shift(out * cospi_16_64));
-  a1 = ROUND_POWER_OF_TWO(out, 5);
-  for (j = 0; j < 8; ++j) {
-    for (i = 0; i < 8; ++i) dest[i] = clip_pixel_add(dest[i], a1);
-    dest += stride;
-  }
-}
-
-void av1_iadst4_c(const tran_low_t *input, tran_low_t *output) {
-  tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;
-
-  tran_low_t x0 = input[0];
-  tran_low_t x1 = input[1];
-  tran_low_t x2 = input[2];
-  tran_low_t x3 = input[3];
-
-  if (!(x0 | x1 | x2 | x3)) {
-    output[0] = output[1] = output[2] = output[3] = 0;
-    return;
-  }
-
-  s0 = sinpi_1_9 * x0;
-  s1 = sinpi_2_9 * x0;
-  s2 = sinpi_3_9 * x1;
-  s3 = sinpi_4_9 * x2;
-  s4 = sinpi_1_9 * x2;
-  s5 = sinpi_2_9 * x3;
-  s6 = sinpi_4_9 * x3;
-  s7 = WRAPLOW(x0 - x2 + x3);
-
-  s0 = s0 + s3 + s5;
-  s1 = s1 - s4 - s6;
-  s3 = s2;
-  s2 = sinpi_3_9 * s7;
-
-  // 1-D transform scaling factor is sqrt(2).
-  // The overall dynamic range is 14b (input) + 14b (multiplication scaling)
-  // + 1b (addition) = 29b.
-  // Hence the output bit depth is 15b.
-  output[0] = WRAPLOW(dct_const_round_shift(s0 + s3));
-  output[1] = WRAPLOW(dct_const_round_shift(s1 + s3));
-  output[2] = WRAPLOW(dct_const_round_shift(s2));
-  output[3] = WRAPLOW(dct_const_round_shift(s0 + s1 - s3));
-}
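
In numbers, the dynamic-range comment above:

//   14b (input) * 14b (Q14 sinpi constant) -> 28b product
//   + 1b from the additions                -> 29b intermediate
//   dct_const_round_shift: >> 14           -> 15b output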
-
-void av1_iadst8_c(const tran_low_t *input, tran_low_t *output) {
-  int s0, s1, s2, s3, s4, s5, s6, s7;
-
-  tran_high_t x0 = input[7];
-  tran_high_t x1 = input[0];
-  tran_high_t x2 = input[5];
-  tran_high_t x3 = input[2];
-  tran_high_t x4 = input[3];
-  tran_high_t x5 = input[4];
-  tran_high_t x6 = input[1];
-  tran_high_t x7 = input[6];
-
-  if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7)) {
-    output[0] = output[1] = output[2] = output[3] = output[4] = output[5] =
-        output[6] = output[7] = 0;
-    return;
-  }
-
-  // stage 1
-  s0 = (int)(cospi_2_64 * x0 + cospi_30_64 * x1);
-  s1 = (int)(cospi_30_64 * x0 - cospi_2_64 * x1);
-  s2 = (int)(cospi_10_64 * x2 + cospi_22_64 * x3);
-  s3 = (int)(cospi_22_64 * x2 - cospi_10_64 * x3);
-  s4 = (int)(cospi_18_64 * x4 + cospi_14_64 * x5);
-  s5 = (int)(cospi_14_64 * x4 - cospi_18_64 * x5);
-  s6 = (int)(cospi_26_64 * x6 + cospi_6_64 * x7);
-  s7 = (int)(cospi_6_64 * x6 - cospi_26_64 * x7);
-
-  x0 = WRAPLOW(dct_const_round_shift(s0 + s4));
-  x1 = WRAPLOW(dct_const_round_shift(s1 + s5));
-  x2 = WRAPLOW(dct_const_round_shift(s2 + s6));
-  x3 = WRAPLOW(dct_const_round_shift(s3 + s7));
-  x4 = WRAPLOW(dct_const_round_shift(s0 - s4));
-  x5 = WRAPLOW(dct_const_round_shift(s1 - s5));
-  x6 = WRAPLOW(dct_const_round_shift(s2 - s6));
-  x7 = WRAPLOW(dct_const_round_shift(s3 - s7));
-
-  // stage 2
-  s0 = (int)x0;
-  s1 = (int)x1;
-  s2 = (int)x2;
-  s3 = (int)x3;
-  s4 = (int)(cospi_8_64 * x4 + cospi_24_64 * x5);
-  s5 = (int)(cospi_24_64 * x4 - cospi_8_64 * x5);
-  s6 = (int)(-cospi_24_64 * x6 + cospi_8_64 * x7);
-  s7 = (int)(cospi_8_64 * x6 + cospi_24_64 * x7);
-
-  x0 = WRAPLOW(s0 + s2);
-  x1 = WRAPLOW(s1 + s3);
-  x2 = WRAPLOW(s0 - s2);
-  x3 = WRAPLOW(s1 - s3);
-  x4 = WRAPLOW(dct_const_round_shift(s4 + s6));
-  x5 = WRAPLOW(dct_const_round_shift(s5 + s7));
-  x6 = WRAPLOW(dct_const_round_shift(s4 - s6));
-  x7 = WRAPLOW(dct_const_round_shift(s5 - s7));
-
-  // stage 3
-  s2 = (int)(cospi_16_64 * (x2 + x3));
-  s3 = (int)(cospi_16_64 * (x2 - x3));
-  s6 = (int)(cospi_16_64 * (x6 + x7));
-  s7 = (int)(cospi_16_64 * (x6 - x7));
-
-  x2 = WRAPLOW(dct_const_round_shift(s2));
-  x3 = WRAPLOW(dct_const_round_shift(s3));
-  x6 = WRAPLOW(dct_const_round_shift(s6));
-  x7 = WRAPLOW(dct_const_round_shift(s7));
-
-  output[0] = WRAPLOW(x0);
-  output[1] = WRAPLOW(-x4);
-  output[2] = WRAPLOW(x6);
-  output[3] = WRAPLOW(-x2);
-  output[4] = WRAPLOW(x3);
-  output[5] = WRAPLOW(-x7);
-  output[6] = WRAPLOW(x5);
-  output[7] = WRAPLOW(-x1);
-}
-
-void av1_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
-  tran_low_t out[8 * 8] = { 0 };
-  tran_low_t *outptr = out;
-  int i, j;
-  tran_low_t temp_in[8], temp_out[8];
-
-  // First transform rows
-  // Only the first 4 rows have non-zero coefficients.
-  for (i = 0; i < 4; ++i) {
-    av1_idct8_c(input, outptr);
-    input += 8;
-    outptr += 8;
-  }
-
-  // Then transform columns
-  for (i = 0; i < 8; ++i) {
-    for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i];
-    av1_idct8_c(temp_in, temp_out);
-    for (j = 0; j < 8; ++j) {
-      dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
-                                            ROUND_POWER_OF_TWO(temp_out[j], 5));
-    }
-  }
-}
-
-void av1_idct16_c(const tran_low_t *input, tran_low_t *output) {
-  tran_low_t step1[16], step2[16];
-  tran_high_t temp1, temp2;
-
-  // stage 1
-  step1[0] = input[0 / 2];
-  step1[1] = input[16 / 2];
-  step1[2] = input[8 / 2];
-  step1[3] = input[24 / 2];
-  step1[4] = input[4 / 2];
-  step1[5] = input[20 / 2];
-  step1[6] = input[12 / 2];
-  step1[7] = input[28 / 2];
-  step1[8] = input[2 / 2];
-  step1[9] = input[18 / 2];
-  step1[10] = input[10 / 2];
-  step1[11] = input[26 / 2];
-  step1[12] = input[6 / 2];
-  step1[13] = input[22 / 2];
-  step1[14] = input[14 / 2];
-  step1[15] = input[30 / 2];
-
-  // stage 2
-  step2[0] = step1[0];
-  step2[1] = step1[1];
-  step2[2] = step1[2];
-  step2[3] = step1[3];
-  step2[4] = step1[4];
-  step2[5] = step1[5];
-  step2[6] = step1[6];
-  step2[7] = step1[7];
-
-  temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
-  temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
-  step2[8] = WRAPLOW(dct_const_round_shift(temp1));
-  step2[15] = WRAPLOW(dct_const_round_shift(temp2));
-
-  temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
-  temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
-  step2[9] = WRAPLOW(dct_const_round_shift(temp1));
-  step2[14] = WRAPLOW(dct_const_round_shift(temp2));
-
-  temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
-  temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
-  step2[10] = WRAPLOW(dct_const_round_shift(temp1));
-  step2[13] = WRAPLOW(dct_const_round_shift(temp2));
-
-  temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
-  temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
-  step2[11] = WRAPLOW(dct_const_round_shift(temp1));
-  step2[12] = WRAPLOW(dct_const_round_shift(temp2));
-
-  // stage 3
-  step1[0] = step2[0];
-  step1[1] = step2[1];
-  step1[2] = step2[2];
-  step1[3] = step2[3];
-
-  temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
-  temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
-  step1[4] = WRAPLOW(dct_const_round_shift(temp1));
-  step1[7] = WRAPLOW(dct_const_round_shift(temp2));
-  temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
-  temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
-  step1[5] = WRAPLOW(dct_const_round_shift(temp1));
-  step1[6] = WRAPLOW(dct_const_round_shift(temp2));
-
-  step1[8] = WRAPLOW(step2[8] + step2[9]);
-  step1[9] = WRAPLOW(step2[8] - step2[9]);
-  step1[10] = WRAPLOW(-step2[10] + step2[11]);
-  step1[11] = WRAPLOW(step2[10] + step2[11]);
-  step1[12] = WRAPLOW(step2[12] + step2[13]);
-  step1[13] = WRAPLOW(step2[12] - step2[13]);
-  step1[14] = WRAPLOW(-step2[14] + step2[15]);
-  step1[15] = WRAPLOW(step2[14] + step2[15]);
-
-  // stage 4
-  temp1 = (step1[0] + step1[1]) * cospi_16_64;
-  temp2 = (step1[0] - step1[1]) * cospi_16_64;
-  step2[0] = WRAPLOW(dct_const_round_shift(temp1));
-  step2[1] = WRAPLOW(dct_const_round_shift(temp2));
-  temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
-  temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
-  step2[2] = WRAPLOW(dct_const_round_shift(temp1));
-  step2[3] = WRAPLOW(dct_const_round_shift(temp2));
-  step2[4] = WRAPLOW(step1[4] + step1[5]);
-  step2[5] = WRAPLOW(step1[4] - step1[5]);
-  step2[6] = WRAPLOW(-step1[6] + step1[7]);
-  step2[7] = WRAPLOW(step1[6] + step1[7]);
-
-  step2[8] = step1[8];
-  step2[15] = step1[15];
-  temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
-  temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
-  step2[9] = WRAPLOW(dct_const_round_shift(temp1));
-  step2[14] = WRAPLOW(dct_const_round_shift(temp2));
-  temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
-  temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
-  step2[10] = WRAPLOW(dct_const_round_shift(temp1));
-  step2[13] = WRAPLOW(dct_const_round_shift(temp2));
-  step2[11] = step1[11];
-  step2[12] = step1[12];
-
-  // stage 5
-  step1[0] = WRAPLOW(step2[0] + step2[3]);
-  step1[1] = WRAPLOW(step2[1] + step2[2]);
-  step1[2] = WRAPLOW(step2[1] - step2[2]);
-  step1[3] = WRAPLOW(step2[0] - step2[3]);
-  step1[4] = step2[4];
-  temp1 = (step2[6] - step2[5]) * cospi_16_64;
-  temp2 = (step2[5] + step2[6]) * cospi_16_64;
-  step1[5] = WRAPLOW(dct_const_round_shift(temp1));
-  step1[6] = WRAPLOW(dct_const_round_shift(temp2));
-  step1[7] = step2[7];
-
-  step1[8] = WRAPLOW(step2[8] + step2[11]);
-  step1[9] = WRAPLOW(step2[9] + step2[10]);
-  step1[10] = WRAPLOW(step2[9] - step2[10]);
-  step1[11] = WRAPLOW(step2[8] - step2[11]);
-  step1[12] = WRAPLOW(-step2[12] + step2[15]);
-  step1[13] = WRAPLOW(-step2[13] + step2[14]);
-  step1[14] = WRAPLOW(step2[13] + step2[14]);
-  step1[15] = WRAPLOW(step2[12] + step2[15]);
-
-  // stage 6
-  step2[0] = WRAPLOW(step1[0] + step1[7]);
-  step2[1] = WRAPLOW(step1[1] + step1[6]);
-  step2[2] = WRAPLOW(step1[2] + step1[5]);
-  step2[3] = WRAPLOW(step1[3] + step1[4]);
-  step2[4] = WRAPLOW(step1[3] - step1[4]);
-  step2[5] = WRAPLOW(step1[2] - step1[5]);
-  step2[6] = WRAPLOW(step1[1] - step1[6]);
-  step2[7] = WRAPLOW(step1[0] - step1[7]);
-  step2[8] = step1[8];
-  step2[9] = step1[9];
-  temp1 = (-step1[10] + step1[13]) * cospi_16_64;
-  temp2 = (step1[10] + step1[13]) * cospi_16_64;
-  step2[10] = WRAPLOW(dct_const_round_shift(temp1));
-  step2[13] = WRAPLOW(dct_const_round_shift(temp2));
-  temp1 = (-step1[11] + step1[12]) * cospi_16_64;
-  temp2 = (step1[11] + step1[12]) * cospi_16_64;
-  step2[11] = WRAPLOW(dct_const_round_shift(temp1));
-  step2[12] = WRAPLOW(dct_const_round_shift(temp2));
-  step2[14] = step1[14];
-  step2[15] = step1[15];
-
-  // stage 7
-  output[0] = WRAPLOW(step2[0] + step2[15]);
-  output[1] = WRAPLOW(step2[1] + step2[14]);
-  output[2] = WRAPLOW(step2[2] + step2[13]);
-  output[3] = WRAPLOW(step2[3] + step2[12]);
-  output[4] = WRAPLOW(step2[4] + step2[11]);
-  output[5] = WRAPLOW(step2[5] + step2[10]);
-  output[6] = WRAPLOW(step2[6] + step2[9]);
-  output[7] = WRAPLOW(step2[7] + step2[8]);
-  output[8] = WRAPLOW(step2[7] - step2[8]);
-  output[9] = WRAPLOW(step2[6] - step2[9]);
-  output[10] = WRAPLOW(step2[5] - step2[10]);
-  output[11] = WRAPLOW(step2[4] - step2[11]);
-  output[12] = WRAPLOW(step2[3] - step2[12]);
-  output[13] = WRAPLOW(step2[2] - step2[13]);
-  output[14] = WRAPLOW(step2[1] - step2[14]);
-  output[15] = WRAPLOW(step2[0] - step2[15]);
-}
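
Every butterfly in the kernel above multiplies by a cospi_*_64 constant, a cosine in Q14 fixed point, then calls dct_const_round_shift() to round the 14 fractional bits back off; WRAPLOW() narrows the result toward the 16-bit accumulator a hardware implementation would keep. A minimal sketch of those helpers, assuming the usual libaom definitions in aom_dsp/txfm_common.h and aom_dsp/inv_txfm.h (the CONFIG_EMULATE_HARDWARE variant of WRAPLOW, which actually wraps, is elided):

    #include <stdint.h>

    typedef int32_t tran_high_t; /* int64_t in high-bit-depth builds */

    #define DCT_CONST_BITS 14
    #define ROUND_POWER_OF_TWO(value, n) (((value) + (1 << ((n)-1))) >> (n))

    /* Round off the Q14 scaling that one cospi_*_64 multiply introduces;
     * e.g. x * cospi_16_64 with cospi_16_64 == round(16384 / sqrt(2))
     * comes back as approximately x / sqrt(2). */
    static inline tran_high_t dct_const_round_shift(tran_high_t input) {
      return ROUND_POWER_OF_TWO(input, DCT_CONST_BITS);
    }

    /* Pass-through form of WRAPLOW(); range checking and wrapping only
     * happen in debug or hardware-emulation builds. */
    #define WRAPLOW(x) ((int32_t)(x))
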
-
-void av1_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest,
-                             int stride) {
-  tran_low_t out[16 * 16];
-  tran_low_t *outptr = out;
-  int i, j;
-  tran_low_t temp_in[16], temp_out[16];
-
-  // First transform rows
-  for (i = 0; i < 16; ++i) {
-    av1_idct16_c(input, outptr);
-    input += 16;
-    outptr += 16;
-  }
-
-  // Then transform columns
-  for (i = 0; i < 16; ++i) {
-    for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i];
-    av1_idct16_c(temp_in, temp_out);
-    for (j = 0; j < 16; ++j) {
-      dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
-                                            ROUND_POWER_OF_TWO(temp_out[j], 6));
-    }
-  }
-}
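
The 2-D inverse is computed separably: sixteen 1-D row transforms fill the out[] scratch buffer, then sixteen 1-D column transforms produce residuals, each rounded by ROUND_POWER_OF_TWO(., 6) to undo the 2-D scaling before being added onto the predictor. The clamp-and-add step is roughly the following, a sketch of clip_pixel_add() assuming the conventional aom_dsp/aom_dsp_common.h definition:

    #include <stdint.h>

    /* Clamp an intermediate value to the valid 8-bit pixel range. */
    static inline uint8_t clip_pixel(int val) {
      return (uint8_t)((val > 255) ? 255 : (val < 0) ? 0 : val);
    }

    /* Add a reconstructed residual onto the predictor with saturation. */
    static inline uint8_t clip_pixel_add(uint8_t dest, int32_t trans) {
      return clip_pixel((int)dest + (int)trans);
    }
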
-
-void av1_iadst16_c(const tran_low_t *input, tran_low_t *output) {
-  tran_high_t s0, s1, s2, s3, s4, s5, s6, s7, s8;
-  tran_high_t s9, s10, s11, s12, s13, s14, s15;
-
-  tran_high_t x0 = input[15];
-  tran_high_t x1 = input[0];
-  tran_high_t x2 = input[13];
-  tran_high_t x3 = input[2];
-  tran_high_t x4 = input[11];
-  tran_high_t x5 = input[4];
-  tran_high_t x6 = input[9];
-  tran_high_t x7 = input[6];
-  tran_high_t x8 = input[7];
-  tran_high_t x9 = input[8];
-  tran_high_t x10 = input[5];
-  tran_high_t x11 = input[10];
-  tran_high_t x12 = input[3];
-  tran_high_t x13 = input[12];
-  tran_high_t x14 = input[1];
-  tran_high_t x15 = input[14];
-
-  if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8 | x9 | x10 | x11 | x12 |
-        x13 | x14 | x15)) {
-    output[0] = output[1] = output[2] = output[3] = output[4] = output[5] =
-        output[6] = output[7] = output[8] = output[9] = output[10] =
-            output[11] = output[12] = output[13] = output[14] = output[15] = 0;
-    return;
-  }
-
-  // stage 1
-  s0 = x0 * cospi_1_64 + x1 * cospi_31_64;
-  s1 = x0 * cospi_31_64 - x1 * cospi_1_64;
-  s2 = x2 * cospi_5_64 + x3 * cospi_27_64;
-  s3 = x2 * cospi_27_64 - x3 * cospi_5_64;
-  s4 = x4 * cospi_9_64 + x5 * cospi_23_64;
-  s5 = x4 * cospi_23_64 - x5 * cospi_9_64;
-  s6 = x6 * cospi_13_64 + x7 * cospi_19_64;
-  s7 = x6 * cospi_19_64 - x7 * cospi_13_64;
-  s8 = x8 * cospi_17_64 + x9 * cospi_15_64;
-  s9 = x8 * cospi_15_64 - x9 * cospi_17_64;
-  s10 = x10 * cospi_21_64 + x11 * cospi_11_64;
-  s11 = x10 * cospi_11_64 - x11 * cospi_21_64;
-  s12 = x12 * cospi_25_64 + x13 * cospi_7_64;
-  s13 = x12 * cospi_7_64 - x13 * cospi_25_64;
-  s14 = x14 * cospi_29_64 + x15 * cospi_3_64;
-  s15 = x14 * cospi_3_64 - x15 * cospi_29_64;
-
-  x0 = WRAPLOW(dct_const_round_shift(s0 + s8));
-  x1 = WRAPLOW(dct_const_round_shift(s1 + s9));
-  x2 = WRAPLOW(dct_const_round_shift(s2 + s10));
-  x3 = WRAPLOW(dct_const_round_shift(s3 + s11));
-  x4 = WRAPLOW(dct_const_round_shift(s4 + s12));
-  x5 = WRAPLOW(dct_const_round_shift(s5 + s13));
-  x6 = WRAPLOW(dct_const_round_shift(s6 + s14));
-  x7 = WRAPLOW(dct_const_round_shift(s7 + s15));
-  x8 = WRAPLOW(dct_const_round_shift(s0 - s8));
-  x9 = WRAPLOW(dct_const_round_shift(s1 - s9));
-  x10 = WRAPLOW(dct_const_round_shift(s2 - s10));
-  x11 = WRAPLOW(dct_const_round_shift(s3 - s11));
-  x12 = WRAPLOW(dct_const_round_shift(s4 - s12));
-  x13 = WRAPLOW(dct_const_round_shift(s5 - s13));
-  x14 = WRAPLOW(dct_const_round_shift(s6 - s14));
-  x15 = WRAPLOW(dct_const_round_shift(s7 - s15));
-
-  // stage 2
-  s0 = x0;
-  s1 = x1;
-  s2 = x2;
-  s3 = x3;
-  s4 = x4;
-  s5 = x5;
-  s6 = x6;
-  s7 = x7;
-  s8 = x8 * cospi_4_64 + x9 * cospi_28_64;
-  s9 = x8 * cospi_28_64 - x9 * cospi_4_64;
-  s10 = x10 * cospi_20_64 + x11 * cospi_12_64;
-  s11 = x10 * cospi_12_64 - x11 * cospi_20_64;
-  s12 = -x12 * cospi_28_64 + x13 * cospi_4_64;
-  s13 = x12 * cospi_4_64 + x13 * cospi_28_64;
-  s14 = -x14 * cospi_12_64 + x15 * cospi_20_64;
-  s15 = x14 * cospi_20_64 + x15 * cospi_12_64;
-
-  x0 = WRAPLOW(s0 + s4);
-  x1 = WRAPLOW(s1 + s5);
-  x2 = WRAPLOW(s2 + s6);
-  x3 = WRAPLOW(s3 + s7);
-  x4 = WRAPLOW(s0 - s4);
-  x5 = WRAPLOW(s1 - s5);
-  x6 = WRAPLOW(s2 - s6);
-  x7 = WRAPLOW(s3 - s7);
-  x8 = WRAPLOW(dct_const_round_shift(s8 + s12));
-  x9 = WRAPLOW(dct_const_round_shift(s9 + s13));
-  x10 = WRAPLOW(dct_const_round_shift(s10 + s14));
-  x11 = WRAPLOW(dct_const_round_shift(s11 + s15));
-  x12 = WRAPLOW(dct_const_round_shift(s8 - s12));
-  x13 = WRAPLOW(dct_const_round_shift(s9 - s13));
-  x14 = WRAPLOW(dct_const_round_shift(s10 - s14));
-  x15 = WRAPLOW(dct_const_round_shift(s11 - s15));
-
-  // stage 3
-  s0 = x0;
-  s1 = x1;
-  s2 = x2;
-  s3 = x3;
-  s4 = x4 * cospi_8_64 + x5 * cospi_24_64;
-  s5 = x4 * cospi_24_64 - x5 * cospi_8_64;
-  s6 = -x6 * cospi_24_64 + x7 * cospi_8_64;
-  s7 = x6 * cospi_8_64 + x7 * cospi_24_64;
-  s8 = x8;
-  s9 = x9;
-  s10 = x10;
-  s11 = x11;
-  s12 = x12 * cospi_8_64 + x13 * cospi_24_64;
-  s13 = x12 * cospi_24_64 - x13 * cospi_8_64;
-  s14 = -x14 * cospi_24_64 + x15 * cospi_8_64;
-  s15 = x14 * cospi_8_64 + x15 * cospi_24_64;
-
-  x0 = WRAPLOW(s0 + s2);
-  x1 = WRAPLOW(s1 + s3);
-  x2 = WRAPLOW(s0 - s2);
-  x3 = WRAPLOW(s1 - s3);
-  x4 = WRAPLOW(dct_const_round_shift(s4 + s6));
-  x5 = WRAPLOW(dct_const_round_shift(s5 + s7));
-  x6 = WRAPLOW(dct_const_round_shift(s4 - s6));
-  x7 = WRAPLOW(dct_const_round_shift(s5 - s7));
-  x8 = WRAPLOW(s8 + s10);
-  x9 = WRAPLOW(s9 + s11);
-  x10 = WRAPLOW(s8 - s10);
-  x11 = WRAPLOW(s9 - s11);
-  x12 = WRAPLOW(dct_const_round_shift(s12 + s14));
-  x13 = WRAPLOW(dct_const_round_shift(s13 + s15));
-  x14 = WRAPLOW(dct_const_round_shift(s12 - s14));
-  x15 = WRAPLOW(dct_const_round_shift(s13 - s15));
-
-  // stage 4
-  s2 = (-cospi_16_64) * (x2 + x3);
-  s3 = cospi_16_64 * (x2 - x3);
-  s6 = cospi_16_64 * (x6 + x7);
-  s7 = cospi_16_64 * (-x6 + x7);
-  s10 = cospi_16_64 * (x10 + x11);
-  s11 = cospi_16_64 * (-x10 + x11);
-  s14 = (-cospi_16_64) * (x14 + x15);
-  s15 = cospi_16_64 * (x14 - x15);
-
-  x2 = WRAPLOW(dct_const_round_shift(s2));
-  x3 = WRAPLOW(dct_const_round_shift(s3));
-  x6 = WRAPLOW(dct_const_round_shift(s6));
-  x7 = WRAPLOW(dct_const_round_shift(s7));
-  x10 = WRAPLOW(dct_const_round_shift(s10));
-  x11 = WRAPLOW(dct_const_round_shift(s11));
-  x14 = WRAPLOW(dct_const_round_shift(s14));
-  x15 = WRAPLOW(dct_const_round_shift(s15));
-
-  output[0] = WRAPLOW(x0);
-  output[1] = WRAPLOW(-x8);
-  output[2] = WRAPLOW(x12);
-  output[3] = WRAPLOW(-x4);
-  output[4] = WRAPLOW(x6);
-  output[5] = WRAPLOW(x14);
-  output[6] = WRAPLOW(x10);
-  output[7] = WRAPLOW(x2);
-  output[8] = WRAPLOW(x3);
-  output[9] = WRAPLOW(x11);
-  output[10] = WRAPLOW(x15);
-  output[11] = WRAPLOW(x7);
-  output[12] = WRAPLOW(x5);
-  output[13] = WRAPLOW(-x13);
-  output[14] = WRAPLOW(x9);
-  output[15] = WRAPLOW(-x1);
-}
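
All of the cospi_*_64 constants consumed by these kernels are cosines sampled at k*pi/64 and scaled into Q14; the checked-in values in aom_dsp/txfm_common.h are pre-rounded integers (for instance cospi_16_64 == 11585, i.e. round(16384 / sqrt(2))). A throwaway generator that reproduces them, up to the rounding convention used when the table was made, looks like:

    #include <math.h>
    #include <stdio.h>

    /* Regenerate the Q14 cosine table assumed by the butterflies above. */
    int main(void) {
      int k;
      for (k = 1; k < 32; ++k) {
        printf("cospi_%d_64 = %d\n", k,
               (int)floor(cos(k * 3.14159265358979323846 / 64.0) * 16384.0 +
                          0.5));
      }
      return 0;
    }
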
-
-void av1_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest,
-                            int stride) {
-  tran_low_t out[16 * 16] = { 0 };
-  tran_low_t *outptr = out;
-  int i, j;
-  tran_low_t temp_in[16], temp_out[16];
-
-  // First transform rows. Since all non-zero DCT coefficients are in the
-  // upper-left 4x4 area, we only need to calculate the first 4 rows here.
-  for (i = 0; i < 4; ++i) {
-    av1_idct16_c(input, outptr);
-    input += 16;
-    outptr += 16;
-  }
-
-  // Then transform columns
-  for (i = 0; i < 16; ++i) {
-    for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i];
-    av1_idct16_c(temp_in, temp_out);
-    for (j = 0; j < 16; ++j) {
-      dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
-                                            ROUND_POWER_OF_TWO(temp_out[j], 6));
-    }
-  }
-}
-
-void av1_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
-  int i, j;
-  tran_high_t a1;
-  tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64));
-  out = WRAPLOW(dct_const_round_shift(out * cospi_16_64));
-  a1 = ROUND_POWER_OF_TWO(out, 6);
-  for (j = 0; j < 16; ++j) {
-    for (i = 0; i < 16; ++i) dest[i] = clip_pixel_add(dest[i], a1);
-    dest += stride;
-  }
-}
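
The _1_add variant handles blocks whose only non-zero coefficient is the DC term, so the whole 16x16 residual is one constant. Multiplying twice by cospi_16_64 (Q14 for 1/sqrt(2)) halves the coefficient, and the final shift by 6 applies the 2-D scaling. A worked example with a hypothetical DC value of 512:

    #include <stdio.h>

    int main(void) {
      const long cospi_16_64 = 11585; /* round(16384 / sqrt(2)) */
      long dc = 512;                  /* hypothetical input[0] */
      long out = (dc * cospi_16_64 + 8192) >> 14; /* ~dc/sqrt(2) -> 362 */
      out = (out * cospi_16_64 + 8192) >> 14;     /* ~dc/2       -> 256 */
      long a1 = (out + 32) >> 6;                  /* final round -> 4   */
      printf("a1 = %ld\n", a1); /* added to all 256 pixels of the block */
      return 0;
    }
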
-
-void av1_idct32_c(const tran_low_t *input, tran_low_t *output) {
-  tran_low_t step1[32], step2[32];
-  tran_high_t temp1, temp2;
-
-  // stage 1
-  step1[0] = input[0];
-  step1[1] = input[16];
-  step1[2] = input[8];
-  step1[3] = input[24];
-  step1[4] = input[4];
-  step1[5] = input[20];
-  step1[6] = input[12];
-  step1[7] = input[28];
-  step1[8] = input[2];
-  step1[9] = input[18];
-  step1[10] = input[10];
-  step1[11] = input[26];
-  step1[12] = input[6];
-  step1[13] = input[22];
-  step1[14] = input[14];
-  step1[15] = input[30];
-
-  temp1 = input[1] * cospi_31_64 - input[31] * cospi_1_64;
-  temp2 = input[1] * cospi_1_64 + input[31] * cospi_31_64;
-  step1[16] = WRAPLOW(dct_const_round_shift(temp1));
-  step1[31] = WRAPLOW(dct_const_round_shift(temp2));
-
-  temp1 = input[17] * cospi_15_64 - input[15] * cospi_17_64;
-  temp2 = input[17] * cospi_17_64 + input[15] * cospi_15_64;
-  step1[17] = WRAPLOW(dct_const_round_shift(temp1));
-  step1[30] = WRAPLOW(dct_const_round_shift(temp2));
-
-  temp1 = input[9] * cospi_23_64 - input[23] * cospi_9_64;
-  temp2 = input[9] * cospi_9_64 + input[23] * cospi_23_64;
-  step1[18] = WRAPLOW(dct_const_round_shift(temp1));
-  step1[29] = WRAPLOW(dct_const_round_shift(temp2));
-
-  temp1 = input[25] * cospi_7_64 - input[7] * cospi_25_64;
-  temp2 = input[25] * cospi_25_64 + input[7] * cospi_7_64;
-  step1[19] = WRAPLOW(dct_const_round_shift(temp1));
-  step1[28] = WRAPLOW(dct_const_round_shift(temp2));
-
-  temp1 = input[5] * cospi_27_64 - input[27] * cospi_5_64;
-  temp2 = input[5] * cospi_5_64 + input[27] * cospi_27_64;
-  step1[20] = WRAPLOW(dct_const_round_shift(temp1));
-  step1[27] = WRAPLOW(dct_const_round_shift(temp2));
-
-  temp1 = input[21] * cospi_11_64 - input[11] * cospi_21_64;
-  temp2 = input[21] * cospi_21_64 + input[11] * cospi_11_64;
-  step1[21] = WRAPLOW(dct_const_round_shift(temp1));
-  step1[26] = WRAPLOW(dct_const_round_shift(temp2));
-
-  temp1 = input[13] * cospi_19_64 - input[19] * cospi_13_64;
-  temp2 = input[13] * cospi_13_64 + input[19] * cospi_19_64;
-  step1[22] = WRAPLOW(dct_const_round_shift(temp1));
-  step1[25] = WRAPLOW(dct_const_round_shift(temp2));
-
-  temp1 = input[29] * cospi_3_64 - input[3] * cospi_29_64;
-  temp2 = input[29] * cospi_29_64 + input[3] * cospi_3_64;
-  step1[23] = WRAPLOW(dct_const_round_shift(temp1));
-  step1[24] = WRAPLOW(dct_const_round_shift(temp2));
-
-  // stage 2
-  step2[0] = step1[0];
-  step2[1] = step1[1];
-  step2[2] = step1[2];
-  step2[3] = step1[3];
-  step2[4] = step1[4];
-  step2[5] = step1[5];
-  step2[6] = step1[6];
-  step2[7] = step1[7];
-
-  temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
-  temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
-  step2[8] = WRAPLOW(dct_const_round_shift(temp1));
-  step2[15] = WRAPLOW(dct_const_round_shift(temp2));
-
-  temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
-  temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
-  step2[9] = WRAPLOW(dct_const_round_shift(temp1));
-  step2[14] = WRAPLOW(dct_const_round_shift(temp2));
-
-  temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
-  temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
-  step2[10] = WRAPLOW(dct_const_round_shift(temp1));
-  step2[13] = WRAPLOW(dct_const_round_shift(temp2));
-
-  temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
-  temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
-  step2[11] = WRAPLOW(dct_const_round_shift(temp1));
-  step2[12] = WRAPLOW(dct_const_round_shift(temp2));
-
-  step2[16] = WRAPLOW(step1[16] + step1[17]);
-  step2[17] = WRAPLOW(step1[16] - step1[17]);
-  step2[18] = WRAPLOW(-step1[18] + step1[19]);
-  step2[19] = WRAPLOW(step1[18] + step1[19]);
-  step2[20] = WRAPLOW(step1[20] + step1[21]);
-  step2[21] = WRAPLOW(step1[20] - step1[21]);
-  step2[22] = WRAPLOW(-step1[22] + step1[23]);
-  step2[23] = WRAPLOW(step1[22] + step1[23]);
-  step2[24] = WRAPLOW(step1[24] + step1[25]);
-  step2[25] = WRAPLOW(step1[24] - step1[25]);
-  step2[26] = WRAPLOW(-step1[26] + step1[27]);
-  step2[27] = WRAPLOW(step1[26] + step1[27]);
-  step2[28] = WRAPLOW(step1[28] + step1[29]);
-  step2[29] = WRAPLOW(step1[28] - step1[29]);
-  step2[30] = WRAPLOW(-step1[30] + step1[31]);
-  step2[31] = WRAPLOW(step1[30] + step1[31]);
-
-  // stage 3
-  step1[0] = step2[0];
-  step1[1] = step2[1];
-  step1[2] = step2[2];
-  step1[3] = step2[3];
-
-  temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
-  temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
-  step1[4] = WRAPLOW(dct_const_round_shift(temp1));
-  step1[7] = WRAPLOW(dct_const_round_shift(temp2));
-  temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
-  temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
-  step1[5] = WRAPLOW(dct_const_round_shift(temp1));
-  step1[6] = WRAPLOW(dct_const_round_shift(temp2));
-
-  step1[8] = WRAPLOW(step2[8] + step2[9]);
-  step1[9] = WRAPLOW(step2[8] - step2[9]);
-  step1[10] = WRAPLOW(-step2[10] + step2[11]);
-  step1[11] = WRAPLOW(step2[10] + step2[11]);
-  step1[12] = WRAPLOW(step2[12] + step2[13]);
-  step1[13] = WRAPLOW(step2[12] - step2[13]);
-  step1[14] = WRAPLOW(-step2[14] + step2[15]);
-  step1[15] = WRAPLOW(step2[14] + step2[15]);
-
-  step1[16] = step2[16];
-  step1[31] = step2[31];
-  temp1 = -step2[17] * cospi_4_64 + step2[30] * cospi_28_64;
-  temp2 = step2[17] * cospi_28_64 + step2[30] * cospi_4_64;
-  step1[17] = WRAPLOW(dct_const_round_shift(temp1));
-  step1[30] = WRAPLOW(dct_const_round_shift(temp2));
-  temp1 = -step2[18] * cospi_28_64 - step2[29] * cospi_4_64;
-  temp2 = -step2[18] * cospi_4_64 + step2[29] * cospi_28_64;
-  step1[18] = WRAPLOW(dct_const_round_shift(temp1));
-  step1[29] = WRAPLOW(dct_const_round_shift(temp2));
-  step1[19] = step2[19];
-  step1[20] = step2[20];
-  temp1 = -step2[21] * cospi_20_64 + step2[26] * cospi_12_64;
-  temp2 = step2[21] * cospi_12_64 + step2[26] * cospi_20_64;
-  step1[21] = WRAPLOW(dct_const_round_shift(temp1));
-  step1[26] = WRAPLOW(dct_const_round_shift(temp2));
-  temp1 = -step2[22] * cospi_12_64 - step2[25] * cospi_20_64;
-  temp2 = -step2[22] * cospi_20_64 + step2[25] * cospi_12_64;
-  step1[22] = WRAPLOW(dct_const_round_shift(temp1));
-  step1[25] = WRAPLOW(dct_const_round_shift(temp2));
-  step1[23] = step2[23];
-  step1[24] = step2[24];
-  step1[27] = step2[27];
-  step1[28] = step2[28];
-
-  // stage 4
-  temp1 = (step1[0] + step1[1]) * cospi_16_64;
-  temp2 = (step1[0] - step1[1]) * cospi_16_64;
-  step2[0] = WRAPLOW(dct_const_round_shift(temp1));
-  step2[1] = WRAPLOW(dct_const_round_shift(temp2));
-  temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
-  temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
-  step2[2] = WRAPLOW(dct_const_round_shift(temp1));
-  step2[3] = WRAPLOW(dct_const_round_shift(temp2));
-  step2[4] = WRAPLOW(step1[4] + step1[5]);
-  step2[5] = WRAPLOW(step1[4] - step1[5]);
-  step2[6] = WRAPLOW(-step1[6] + step1[7]);
-  step2[7] = WRAPLOW(step1[6] + step1[7]);
-
-  step2[8] = step1[8];
-  step2[15] = step1[15];
-  temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
-  temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
-  step2[9] = WRAPLOW(dct_const_round_shift(temp1));
-  step2[14] = WRAPLOW(dct_const_round_shift(temp2));
-  temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
-  temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
-  step2[10] = WRAPLOW(dct_const_round_shift(temp1));
-  step2[13] = WRAPLOW(dct_const_round_shift(temp2));
-  step2[11] = step1[11];
-  step2[12] = step1[12];
-
-  step2[16] = WRAPLOW(step1[16] + step1[19]);
-  step2[17] = WRAPLOW(step1[17] + step1[18]);
-  step2[18] = WRAPLOW(step1[17] - step1[18]);
-  step2[19] = WRAPLOW(step1[16] - step1[19]);
-  step2[20] = WRAPLOW(-step1[20] + step1[23]);
-  step2[21] = WRAPLOW(-step1[21] + step1[22]);
-  step2[22] = WRAPLOW(step1[21] + step1[22]);
-  step2[23] = WRAPLOW(step1[20] + step1[23]);
-
-  step2[24] = WRAPLOW(step1[24] + step1[27]);
-  step2[25] = WRAPLOW(step1[25] + step1[26]);
-  step2[26] = WRAPLOW(step1[25] - step1[26]);
-  step2[27] = WRAPLOW(step1[24] - step1[27]);
-  step2[28] = WRAPLOW(-step1[28] + step1[31]);
-  step2[29] = WRAPLOW(-step1[29] + step1[30]);
-  step2[30] = WRAPLOW(step1[29] + step1[30]);
-  step2[31] = WRAPLOW(step1[28] + step1[31]);
-
-  // stage 5
-  step1[0] = WRAPLOW(step2[0] + step2[3]);
-  step1[1] = WRAPLOW(step2[1] + step2[2]);
-  step1[2] = WRAPLOW(step2[1] - step2[2]);
-  step1[3] = WRAPLOW(step2[0] - step2[3]);
-  step1[4] = step2[4];
-  temp1 = (step2[6] - step2[5]) * cospi_16_64;
-  temp2 = (step2[5] + step2[6]) * cospi_16_64;
-  step1[5] = WRAPLOW(dct_const_round_shift(temp1));
-  step1[6] = WRAPLOW(dct_const_round_shift(temp2));
-  step1[7] = step2[7];
-
-  step1[8] = WRAPLOW(step2[8] + step2[11]);
-  step1[9] = WRAPLOW(step2[9] + step2[10]);
-  step1[10] = WRAPLOW(step2[9] - step2[10]);
-  step1[11] = WRAPLOW(step2[8] - step2[11]);
-  step1[12] = WRAPLOW(-step2[12] + step2[15]);
-  step1[13] = WRAPLOW(-step2[13] + step2[14]);
-  step1[14] = WRAPLOW(step2[13] + step2[14]);
-  step1[15] = WRAPLOW(step2[12] + step2[15]);
-
-  step1[16] = step2[16];
-  step1[17] = step2[17];
-  temp1 = -step2[18] * cospi_8_64 + step2[29] * cospi_24_64;
-  temp2 = step2[18] * cospi_24_64 + step2[29] * cospi_8_64;
-  step1[18] = WRAPLOW(dct_const_round_shift(temp1));
-  step1[29] = WRAPLOW(dct_const_round_shift(temp2));
-  temp1 = -step2[19] * cospi_8_64 + step2[28] * cospi_24_64;
-  temp2 = step2[19] * cospi_24_64 + step2[28] * cospi_8_64;
-  step1[19] = WRAPLOW(dct_const_round_shift(temp1));
-  step1[28] = WRAPLOW(dct_const_round_shift(temp2));
-  temp1 = -step2[20] * cospi_24_64 - step2[27] * cospi_8_64;
-  temp2 = -step2[20] * cospi_8_64 + step2[27] * cospi_24_64;
-  step1[20] = WRAPLOW(dct_const_round_shift(temp1));
-  step1[27] = WRAPLOW(dct_const_round_shift(temp2));
-  temp1 = -step2[21] * cospi_24_64 - step2[26] * cospi_8_64;
-  temp2 = -step2[21] * cospi_8_64 + step2[26] * cospi_24_64;
-  step1[21] = WRAPLOW(dct_const_round_shift(temp1));
-  step1[26] = WRAPLOW(dct_const_round_shift(temp2));
-  step1[22] = step2[22];
-  step1[23] = step2[23];
-  step1[24] = step2[24];
-  step1[25] = step2[25];
-  step1[30] = step2[30];
-  step1[31] = step2[31];
-
-  // stage 6
-  step2[0] = WRAPLOW(step1[0] + step1[7]);
-  step2[1] = WRAPLOW(step1[1] + step1[6]);
-  step2[2] = WRAPLOW(step1[2] + step1[5]);
-  step2[3] = WRAPLOW(step1[3] + step1[4]);
-  step2[4] = WRAPLOW(step1[3] - step1[4]);
-  step2[5] = WRAPLOW(step1[2] - step1[5]);
-  step2[6] = WRAPLOW(step1[1] - step1[6]);
-  step2[7] = WRAPLOW(step1[0] - step1[7]);
-  step2[8] = step1[8];
-  step2[9] = step1[9];
-  temp1 = (-step1[10] + step1[13]) * cospi_16_64;
-  temp2 = (step1[10] + step1[13]) * cospi_16_64;
-  step2[10] = WRAPLOW(dct_const_round_shift(temp1));
-  step2[13] = WRAPLOW(dct_const_round_shift(temp2));
-  temp1 = (-step1[11] + step1[12]) * cospi_16_64;
-  temp2 = (step1[11] + step1[12]) * cospi_16_64;
-  step2[11] = WRAPLOW(dct_const_round_shift(temp1));
-  step2[12] = WRAPLOW(dct_const_round_shift(temp2));
-  step2[14] = step1[14];
-  step2[15] = step1[15];
-
-  step2[16] = WRAPLOW(step1[16] + step1[23]);
-  step2[17] = WRAPLOW(step1[17] + step1[22]);
-  step2[18] = WRAPLOW(step1[18] + step1[21]);
-  step2[19] = WRAPLOW(step1[19] + step1[20]);
-  step2[20] = WRAPLOW(step1[19] - step1[20]);
-  step2[21] = WRAPLOW(step1[18] - step1[21]);
-  step2[22] = WRAPLOW(step1[17] - step1[22]);
-  step2[23] = WRAPLOW(step1[16] - step1[23]);
-
-  step2[24] = WRAPLOW(-step1[24] + step1[31]);
-  step2[25] = WRAPLOW(-step1[25] + step1[30]);
-  step2[26] = WRAPLOW(-step1[26] + step1[29]);
-  step2[27] = WRAPLOW(-step1[27] + step1[28]);
-  step2[28] = WRAPLOW(step1[27] + step1[28]);
-  step2[29] = WRAPLOW(step1[26] + step1[29]);
-  step2[30] = WRAPLOW(step1[25] + step1[30]);
-  step2[31] = WRAPLOW(step1[24] + step1[31]);
-
-  // stage 7
-  step1[0] = WRAPLOW(step2[0] + step2[15]);
-  step1[1] = WRAPLOW(step2[1] + step2[14]);
-  step1[2] = WRAPLOW(step2[2] + step2[13]);
-  step1[3] = WRAPLOW(step2[3] + step2[12]);
-  step1[4] = WRAPLOW(step2[4] + step2[11]);
-  step1[5] = WRAPLOW(step2[5] + step2[10]);
-  step1[6] = WRAPLOW(step2[6] + step2[9]);
-  step1[7] = WRAPLOW(step2[7] + step2[8]);
-  step1[8] = WRAPLOW(step2[7] - step2[8]);
-  step1[9] = WRAPLOW(step2[6] - step2[9]);
-  step1[10] = WRAPLOW(step2[5] - step2[10]);
-  step1[11] = WRAPLOW(step2[4] - step2[11]);
-  step1[12] = WRAPLOW(step2[3] - step2[12]);
-  step1[13] = WRAPLOW(step2[2] - step2[13]);
-  step1[14] = WRAPLOW(step2[1] - step2[14]);
-  step1[15] = WRAPLOW(step2[0] - step2[15]);
-
-  step1[16] = step2[16];
-  step1[17] = step2[17];
-  step1[18] = step2[18];
-  step1[19] = step2[19];
-  temp1 = (-step2[20] + step2[27]) * cospi_16_64;
-  temp2 = (step2[20] + step2[27]) * cospi_16_64;
-  step1[20] = WRAPLOW(dct_const_round_shift(temp1));
-  step1[27] = WRAPLOW(dct_const_round_shift(temp2));
-  temp1 = (-step2[21] + step2[26]) * cospi_16_64;
-  temp2 = (step2[21] + step2[26]) * cospi_16_64;
-  step1[21] = WRAPLOW(dct_const_round_shift(temp1));
-  step1[26] = WRAPLOW(dct_const_round_shift(temp2));
-  temp1 = (-step2[22] + step2[25]) * cospi_16_64;
-  temp2 = (step2[22] + step2[25]) * cospi_16_64;
-  step1[22] = WRAPLOW(dct_const_round_shift(temp1));
-  step1[25] = WRAPLOW(dct_const_round_shift(temp2));
-  temp1 = (-step2[23] + step2[24]) * cospi_16_64;
-  temp2 = (step2[23] + step2[24]) * cospi_16_64;
-  step1[23] = WRAPLOW(dct_const_round_shift(temp1));
-  step1[24] = WRAPLOW(dct_const_round_shift(temp2));
-  step1[28] = step2[28];
-  step1[29] = step2[29];
-  step1[30] = step2[30];
-  step1[31] = step2[31];
-
-  // final stage
-  output[0] = WRAPLOW(step1[0] + step1[31]);
-  output[1] = WRAPLOW(step1[1] + step1[30]);
-  output[2] = WRAPLOW(step1[2] + step1[29]);
-  output[3] = WRAPLOW(step1[3] + step1[28]);
-  output[4] = WRAPLOW(step1[4] + step1[27]);
-  output[5] = WRAPLOW(step1[5] + step1[26]);
-  output[6] = WRAPLOW(step1[6] + step1[25]);
-  output[7] = WRAPLOW(step1[7] + step1[24]);
-  output[8] = WRAPLOW(step1[8] + step1[23]);
-  output[9] = WRAPLOW(step1[9] + step1[22]);
-  output[10] = WRAPLOW(step1[10] + step1[21]);
-  output[11] = WRAPLOW(step1[11] + step1[20]);
-  output[12] = WRAPLOW(step1[12] + step1[19]);
-  output[13] = WRAPLOW(step1[13] + step1[18]);
-  output[14] = WRAPLOW(step1[14] + step1[17]);
-  output[15] = WRAPLOW(step1[15] + step1[16]);
-  output[16] = WRAPLOW(step1[15] - step1[16]);
-  output[17] = WRAPLOW(step1[14] - step1[17]);
-  output[18] = WRAPLOW(step1[13] - step1[18]);
-  output[19] = WRAPLOW(step1[12] - step1[19]);
-  output[20] = WRAPLOW(step1[11] - step1[20]);
-  output[21] = WRAPLOW(step1[10] - step1[21]);
-  output[22] = WRAPLOW(step1[9] - step1[22]);
-  output[23] = WRAPLOW(step1[8] - step1[23]);
-  output[24] = WRAPLOW(step1[7] - step1[24]);
-  output[25] = WRAPLOW(step1[6] - step1[25]);
-  output[26] = WRAPLOW(step1[5] - step1[26]);
-  output[27] = WRAPLOW(step1[4] - step1[27]);
-  output[28] = WRAPLOW(step1[3] - step1[28]);
-  output[29] = WRAPLOW(step1[2] - step1[29]);
-  output[30] = WRAPLOW(step1[1] - step1[30]);
-  output[31] = WRAPLOW(step1[0] - step1[31]);
-}
-
-void av1_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest,
-                              int stride) {
-  tran_low_t out[32 * 32];
-  tran_low_t *outptr = out;
-  int i, j;
-  tran_low_t temp_in[32], temp_out[32];
-
-  // Rows
-  for (i = 0; i < 32; ++i) {
-    int16_t zero_coeff[16];
-    for (j = 0; j < 16; ++j) zero_coeff[j] = input[2 * j] | input[2 * j + 1];
-    for (j = 0; j < 8; ++j)
-      zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
-    for (j = 0; j < 4; ++j)
-      zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
-    for (j = 0; j < 2; ++j)
-      zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
-
-    if (zero_coeff[0] | zero_coeff[1])
-      av1_idct32_c(input, outptr);
-    else
-      memset(outptr, 0, sizeof(tran_low_t) * 32);
-    input += 32;
-    outptr += 32;
-  }
-
-  // Columns
-  for (i = 0; i < 32; ++i) {
-    for (j = 0; j < 32; ++j) temp_in[j] = out[j * 32 + i];
-    av1_idct32_c(temp_in, temp_out);
-    for (j = 0; j < 32; ++j) {
-      dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
-                                            ROUND_POWER_OF_TWO(temp_out[j], 6));
-    }
-  }
-}
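
Before running each 32-point row transform, the loop above ORs the row's coefficients together through a small reduction tree and skips the transform entirely when everything is zero, since an all-zero input row inverts to an all-zero output row. The test is equivalent to this more direct sketch (row_is_zero is a name introduced here, not a library function):

    #include <stdint.h>

    typedef int32_t tran_low_t; /* int16_t in low-bit-depth builds */

    /* OR-reduce a 32-coefficient row; all-zero rows can skip the idct. */
    static int row_is_zero(const tran_low_t *row) {
      int j;
      tran_low_t acc = 0;
      for (j = 0; j < 32; ++j) acc |= row[j];
      return acc == 0;
    }
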
-
-void av1_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest,
-                            int stride) {
-  tran_low_t out[32 * 32] = { 0 };
-  tran_low_t *outptr = out;
-  int i, j;
-  tran_low_t temp_in[32], temp_out[32];
-
-  // Rows
-  // Only the upper-left 8x8 block has non-zero coefficients.
-  for (i = 0; i < 8; ++i) {
-    av1_idct32_c(input, outptr);
-    input += 32;
-    outptr += 32;
-  }
-
-  // Columns
-  for (i = 0; i < 32; ++i) {
-    for (j = 0; j < 32; ++j) temp_in[j] = out[j * 32 + i];
-    av1_idct32_c(temp_in, temp_out);
-    for (j = 0; j < 32; ++j) {
-      dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
-                                            ROUND_POWER_OF_TWO(temp_out[j], 6));
-    }
-  }
-}
-
-void av1_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
-  int i, j;
-  tran_high_t a1;
-
-  tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64));
-  out = WRAPLOW(dct_const_round_shift(out * cospi_16_64));
-  a1 = ROUND_POWER_OF_TWO(out, 6);
-
-  for (j = 0; j < 32; ++j) {
-    for (i = 0; i < 32; ++i) dest[i] = clip_pixel_add(dest[i], a1);
-    dest += stride;
-  }
-}
-
-#if CONFIG_AOM_HIGHBITDEPTH
-void av1_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest8,
-                                 int stride, int bd) {
-  /* 4-point reversible, orthonormal inverse Walsh-Hadamard in 3.5 adds,
-     0.5 shifts per pixel. */
-  int i;
-  tran_low_t output[16];
-  tran_high_t a1, b1, c1, d1, e1;
-  const tran_low_t *ip = input;
-  tran_low_t *op = output;
-  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
-
-  for (i = 0; i < 4; i++) {
-    a1 = ip[0] >> UNIT_QUANT_SHIFT;
-    c1 = ip[1] >> UNIT_QUANT_SHIFT;
-    d1 = ip[2] >> UNIT_QUANT_SHIFT;
-    b1 = ip[3] >> UNIT_QUANT_SHIFT;
-    a1 += c1;
-    d1 -= b1;
-    e1 = (a1 - d1) >> 1;
-    b1 = e1 - b1;
-    c1 = e1 - c1;
-    a1 -= b1;
-    d1 += c1;
-    op[0] = HIGHBD_WRAPLOW(a1, bd);
-    op[1] = HIGHBD_WRAPLOW(b1, bd);
-    op[2] = HIGHBD_WRAPLOW(c1, bd);
-    op[3] = HIGHBD_WRAPLOW(d1, bd);
-    ip += 4;
-    op += 4;
-  }
-
-  ip = output;
-  for (i = 0; i < 4; i++) {
-    a1 = ip[4 * 0];
-    c1 = ip[4 * 1];
-    d1 = ip[4 * 2];
-    b1 = ip[4 * 3];
-    a1 += c1;
-    d1 -= b1;
-    e1 = (a1 - d1) >> 1;
-    b1 = e1 - b1;
-    c1 = e1 - c1;
-    a1 -= b1;
-    d1 += c1;
-    dest[stride * 0] =
-        highbd_clip_pixel_add(dest[stride * 0], HIGHBD_WRAPLOW(a1, bd), bd);
-    dest[stride * 1] =
-        highbd_clip_pixel_add(dest[stride * 1], HIGHBD_WRAPLOW(b1, bd), bd);
-    dest[stride * 2] =
-        highbd_clip_pixel_add(dest[stride * 2], HIGHBD_WRAPLOW(c1, bd), bd);
-    dest[stride * 3] =
-        highbd_clip_pixel_add(dest[stride * 3], HIGHBD_WRAPLOW(d1, bd), bd);
-    ip++;
-    dest++;
-  }
-}
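
The "3.5 adds, 0.5 shifts per pixel" figure in the comment follows from the kernel itself: each 4-sample lifting pass costs 7 adds and 1 shift, and the 2-D transform runs one pass over rows and one over columns, i.e. 14 adds and 2 shifts per 4 pixels. Pulled out on its own (iwht4_1d is a name made up for this sketch), the 1-D step is:

    #include <stdint.h>

    typedef int32_t tran_high_t;

    /* One 4-point inverse Walsh-Hadamard lifting pass: 7 adds, 1 shift.
     * Every step is an exact integer operation, which is what makes the
     * forward/inverse pair perfectly reversible. */
    static void iwht4_1d(const tran_high_t in[4], tran_high_t out[4]) {
      tran_high_t a1 = in[0], c1 = in[1], d1 = in[2], b1 = in[3];
      tran_high_t e1;
      a1 += c1;            /* add 1 */
      d1 -= b1;            /* add 2 */
      e1 = (a1 - d1) >> 1; /* add 3, the single shift */
      b1 = e1 - b1;        /* add 4 */
      c1 = e1 - c1;        /* add 5 */
      a1 -= b1;            /* add 6 */
      d1 += c1;            /* add 7 */
      out[0] = a1;
      out[1] = b1;
      out[2] = c1;
      out[3] = d1;
    }
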
-
-void av1_highbd_iwht4x4_1_add_c(const tran_low_t *in, uint8_t *dest8,
-                                int dest_stride, int bd) {
-  int i;
-  tran_high_t a1, e1;
-  tran_low_t tmp[4];
-  const tran_low_t *ip = in;
-  tran_low_t *op = tmp;
-  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
-  (void)bd;
-
-  a1 = ip[0] >> UNIT_QUANT_SHIFT;
-  e1 = a1 >> 1;
-  a1 -= e1;
-  op[0] = HIGHBD_WRAPLOW(a1, bd);
-  op[1] = op[2] = op[3] = HIGHBD_WRAPLOW(e1, bd);
-
-  ip = tmp;
-  for (i = 0; i < 4; i++) {
-    e1 = ip[0] >> 1;
-    a1 = ip[0] - e1;
-    dest[dest_stride * 0] =
-        highbd_clip_pixel_add(dest[dest_stride * 0], a1, bd);
-    dest[dest_stride * 1] =
-        highbd_clip_pixel_add(dest[dest_stride * 1], e1, bd);
-    dest[dest_stride * 2] =
-        highbd_clip_pixel_add(dest[dest_stride * 2], e1, bd);
-    dest[dest_stride * 3] =
-        highbd_clip_pixel_add(dest[dest_stride * 3], e1, bd);
-    ip++;
-    dest++;
-  }
-}
-
-void av1_highbd_idct4_c(const tran_low_t *input, tran_low_t *output, int bd) {
-  tran_low_t step[4];
-  tran_high_t temp1, temp2;
-  (void)bd;
-  // stage 1
-  temp1 = (input[0] + input[2]) * cospi_16_64;
-  temp2 = (input[0] - input[2]) * cospi_16_64;
-  step[0] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
-  step[1] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
-  temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64;
-  temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64;
-  step[2] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
-  step[3] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
-
-  // stage 2
-  output[0] = HIGHBD_WRAPLOW(step[0] + step[3], bd);
-  output[1] = HIGHBD_WRAPLOW(step[1] + step[2], bd);
-  output[2] = HIGHBD_WRAPLOW(step[1] - step[2], bd);
-  output[3] = HIGHBD_WRAPLOW(step[0] - step[3], bd);
-}
-
-void av1_highbd_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest8,
-                                 int stride, int bd) {
-  tran_low_t out[4 * 4];
-  tran_low_t *outptr = out;
-  int i, j;
-  tran_low_t temp_in[4], temp_out[4];
-  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
-
-  // Rows
-  for (i = 0; i < 4; ++i) {
-    av1_highbd_idct4_c(input, outptr, bd);
-    input += 4;
-    outptr += 4;
-  }
-
-  // Columns
-  for (i = 0; i < 4; ++i) {
-    for (j = 0; j < 4; ++j) temp_in[j] = out[j * 4 + i];
-    av1_highbd_idct4_c(temp_in, temp_out, bd);
-    for (j = 0; j < 4; ++j) {
-      dest[j * stride + i] = highbd_clip_pixel_add(
-          dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 4), bd);
-    }
-  }
-}
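
The high-bit-depth variants mirror the 8-bit functions but write uint16_t samples: CONVERT_TO_SHORTPTR() reinterprets the opaque dest8 handle as a 16-bit buffer, and the clamp tracks the bit depth instead of the fixed 255 ceiling. A sketch of that clamp, assuming the usual definition (the real code switches explicitly over bd = 8/10/12):

    #include <stdint.h>

    /* Clamp to [0, 2^bd - 1]: 255, 1023, or 4095 for 8-, 10-, 12-bit. */
    static inline uint16_t clip_pixel_highbd(int val, int bd) {
      const int max = (1 << bd) - 1;
      return (uint16_t)((val > max) ? max : (val < 0) ? 0 : val);
    }

    static inline uint16_t highbd_clip_pixel_add(uint16_t dest, int32_t trans,
                                                 int bd) {
      return clip_pixel_highbd((int)dest + (int)trans, bd);
    }
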
-
-void av1_highbd_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest8,
-                                int dest_stride, int bd) {
-  int i;
-  tran_high_t a1;
-  tran_low_t out =
-      HIGHBD_WRAPLOW(highbd_dct_const_round_shift(input[0] * cospi_16_64), bd);
-  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
-
-  out = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(out * cospi_16_64), bd);
-  a1 = ROUND_POWER_OF_TWO(out, 4);
-
-  for (i = 0; i < 4; i++) {
-    dest[0] = highbd_clip_pixel_add(dest[0], a1, bd);
-    dest[1] = highbd_clip_pixel_add(dest[1], a1, bd);
-    dest[2] = highbd_clip_pixel_add(dest[2], a1, bd);
-    dest[3] = highbd_clip_pixel_add(dest[3], a1, bd);
-    dest += dest_stride;
-  }
-}
-
-void av1_highbd_idct8_c(const tran_low_t *input, tran_low_t *output, int bd) {
-  tran_low_t step1[8], step2[8];
-  tran_high_t temp1, temp2;
-  // stage 1
-  step1[0] = input[0];
-  step1[2] = input[4];
-  step1[1] = input[2];
-  step1[3] = input[6];
-  temp1 = input[1] * cospi_28_64 - input[7] * cospi_4_64;
-  temp2 = input[1] * cospi_4_64 + input[7] * cospi_28_64;
-  step1[4] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
-  step1[7] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
-  temp1 = input[5] * cospi_12_64 - input[3] * cospi_20_64;
-  temp2 = input[5] * cospi_20_64 + input[3] * cospi_12_64;
-  step1[5] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
-  step1[6] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
-
-  // stage 2 & stage 3 - even half
-  av1_highbd_idct4_c(step1, step1, bd);
-
-  // stage 2 - odd half
-  step2[4] = HIGHBD_WRAPLOW(step1[4] + step1[5], bd);
-  step2[5] = HIGHBD_WRAPLOW(step1[4] - step1[5], bd);
-  step2[6] = HIGHBD_WRAPLOW(-step1[6] + step1[7], bd);
-  step2[7] = HIGHBD_WRAPLOW(step1[6] + step1[7], bd);
-
-  // stage 3 - odd half
-  step1[4] = step2[4];
-  temp1 = (step2[6] - step2[5]) * cospi_16_64;
-  temp2 = (step2[5] + step2[6]) * cospi_16_64;
-  step1[5] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
-  step1[6] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
-  step1[7] = step2[7];
-
-  // stage 4
-  output[0] = HIGHBD_WRAPLOW(step1[0] + step1[7], bd);
-  output[1] = HIGHBD_WRAPLOW(step1[1] + step1[6], bd);
-  output[2] = HIGHBD_WRAPLOW(step1[2] + step1[5], bd);
-  output[3] = HIGHBD_WRAPLOW(step1[3] + step1[4], bd);
-  output[4] = HIGHBD_WRAPLOW(step1[3] - step1[4], bd);
-  output[5] = HIGHBD_WRAPLOW(step1[2] - step1[5], bd);
-  output[6] = HIGHBD_WRAPLOW(step1[1] - step1[6], bd);
-  output[7] = HIGHBD_WRAPLOW(step1[0] - step1[7], bd);
-}
-
-void av1_highbd_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest8,
-                                 int stride, int bd) {
-  tran_low_t out[8 * 8];
-  tran_low_t *outptr = out;
-  int i, j;
-  tran_low_t temp_in[8], temp_out[8];
-  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
-
-  // First transform rows.
-  for (i = 0; i < 8; ++i) {
-    av1_highbd_idct8_c(input, outptr, bd);
-    input += 8;
-    outptr += 8;
-  }
-
-  // Then transform columns.
-  for (i = 0; i < 8; ++i) {
-    for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i];
-    av1_highbd_idct8_c(temp_in, temp_out, bd);
-    for (j = 0; j < 8; ++j) {
-      dest[j * stride + i] = highbd_clip_pixel_add(
-          dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd);
-    }
-  }
-}
-
-void av1_highbd_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest8,
-                                int stride, int bd) {
-  int i, j;
-  tran_high_t a1;
-  tran_low_t out =
-      HIGHBD_WRAPLOW(highbd_dct_const_round_shift(input[0] * cospi_16_64), bd);
-  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
-  out = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(out * cospi_16_64), bd);
-  a1 = ROUND_POWER_OF_TWO(out, 5);
-  for (j = 0; j < 8; ++j) {
-    for (i = 0; i < 8; ++i) dest[i] = highbd_clip_pixel_add(dest[i], a1, bd);
-    dest += stride;
-  }
-}
-
-void av1_highbd_iadst4_c(const tran_low_t *input, tran_low_t *output, int bd) {
-  tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;
-
-  tran_low_t x0 = input[0];
-  tran_low_t x1 = input[1];
-  tran_low_t x2 = input[2];
-  tran_low_t x3 = input[3];
-  (void)bd;
-
-  if (!(x0 | x1 | x2 | x3)) {
-    memset(output, 0, 4 * sizeof(*output));
-    return;
-  }
-
-  s0 = sinpi_1_9 * x0;
-  s1 = sinpi_2_9 * x0;
-  s2 = sinpi_3_9 * x1;
-  s3 = sinpi_4_9 * x2;
-  s4 = sinpi_1_9 * x2;
-  s5 = sinpi_2_9 * x3;
-  s6 = sinpi_4_9 * x3;
-  s7 = (tran_high_t)HIGHBD_WRAPLOW(x0 - x2 + x3, bd);
-
-  s0 = s0 + s3 + s5;
-  s1 = s1 - s4 - s6;
-  s3 = s2;
-  s2 = sinpi_3_9 * s7;
-
-  // The 1-D transform scaling factor is sqrt(2).
-  // The overall dynamic range is 14b (input) + 14b (multiplication scaling)
-  // + 1b (addition) = 29b.
-  // Hence the output bit depth is 15b.
-  output[0] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s0 + s3), bd);
-  output[1] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s1 + s3), bd);
-  output[2] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s2), bd);
-  output[3] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s0 + s1 - s3), bd);
-}
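
The dynamic-range comment above can be sanity-checked with the stated bounds: coefficients and the sinpi_*_9 constants are both under 2^14 in magnitude, so one product needs at most 28 bits, one extra addition bit gives the quoted 29, and rounding off DCT_CONST_BITS = 14 leaves a 15-bit output. In code form (bounds only, not exact maxima):

    #include <assert.h>

    int main(void) {
      const long long max_coeff = (1LL << 14) - 1; /* 14b input bound */
      const long long max_const = (1LL << 14) - 1; /* 14b sinpi bound */
      /* one product: 28b; one addition on top: 29b */
      const long long max_sum = 2 * max_coeff * max_const;
      assert(max_sum < (1LL << 29));
      /* after >> DCT_CONST_BITS the value fits in 15 bits */
      assert((max_sum >> 14) < (1LL << 15));
      return 0;
    }
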
-
-void av1_highbd_iadst8_c(const tran_low_t *input, tran_low_t *output, int bd) {
-  tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;
-
-  tran_low_t x0 = input[7];
-  tran_low_t x1 = input[0];
-  tran_low_t x2 = input[5];
-  tran_low_t x3 = input[2];
-  tran_low_t x4 = input[3];
-  tran_low_t x5 = input[4];
-  tran_low_t x6 = input[1];
-  tran_low_t x7 = input[6];
-  (void)bd;
-
-  if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7)) {
-    memset(output, 0, 8 * sizeof(*output));
-    return;
-  }
-
-  // stage 1
-  s0 = cospi_2_64 * x0 + cospi_30_64 * x1;
-  s1 = cospi_30_64 * x0 - cospi_2_64 * x1;
-  s2 = cospi_10_64 * x2 + cospi_22_64 * x3;
-  s3 = cospi_22_64 * x2 - cospi_10_64 * x3;
-  s4 = cospi_18_64 * x4 + cospi_14_64 * x5;
-  s5 = cospi_14_64 * x4 - cospi_18_64 * x5;
-  s6 = cospi_26_64 * x6 + cospi_6_64 * x7;
-  s7 = cospi_6_64 * x6 - cospi_26_64 * x7;
-
-  x0 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s0 + s4), bd);
-  x1 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s1 + s5), bd);
-  x2 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s2 + s6), bd);
-  x3 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s3 + s7), bd);
-  x4 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s0 - s4), bd);
-  x5 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s1 - s5), bd);
-  x6 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s2 - s6), bd);
-  x7 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s3 - s7), bd);
-
-  // stage 2
-  s0 = x0;
-  s1 = x1;
-  s2 = x2;
-  s3 = x3;
-  s4 = cospi_8_64 * x4 + cospi_24_64 * x5;
-  s5 = cospi_24_64 * x4 - cospi_8_64 * x5;
-  s6 = -cospi_24_64 * x6 + cospi_8_64 * x7;
-  s7 = cospi_8_64 * x6 + cospi_24_64 * x7;
-
-  x0 = HIGHBD_WRAPLOW(s0 + s2, bd);
-  x1 = HIGHBD_WRAPLOW(s1 + s3, bd);
-  x2 = HIGHBD_WRAPLOW(s0 - s2, bd);
-  x3 = HIGHBD_WRAPLOW(s1 - s3, bd);
-  x4 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s4 + s6), bd);
-  x5 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s5 + s7), bd);
-  x6 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s4 - s6), bd);
-  x7 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s5 - s7), bd);
-
-  // stage 3
-  s2 = cospi_16_64 * (x2 + x3);
-  s3 = cospi_16_64 * (x2 - x3);
-  s6 = cospi_16_64 * (x6 + x7);
-  s7 = cospi_16_64 * (x6 - x7);
-
-  x2 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s2), bd);
-  x3 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s3), bd);
-  x6 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s6), bd);
-  x7 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s7), bd);
-
-  output[0] = HIGHBD_WRAPLOW(x0, bd);
-  output[1] = HIGHBD_WRAPLOW(-x4, bd);
-  output[2] = HIGHBD_WRAPLOW(x6, bd);
-  output[3] = HIGHBD_WRAPLOW(-x2, bd);
-  output[4] = HIGHBD_WRAPLOW(x3, bd);
-  output[5] = HIGHBD_WRAPLOW(-x7, bd);
-  output[6] = HIGHBD_WRAPLOW(x5, bd);
-  output[7] = HIGHBD_WRAPLOW(-x1, bd);
-}
-
-void av1_highbd_idct8x8_10_add_c(const tran_low_t *input, uint8_t *dest8,
-                                 int stride, int bd) {
-  tran_low_t out[8 * 8] = { 0 };
-  tran_low_t *outptr = out;
-  int i, j;
-  tran_low_t temp_in[8], temp_out[8];
-  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
-
-  // First transform rows.
-  // Only the first 4 rows have non-zero coefficients.
-  for (i = 0; i < 4; ++i) {
-    av1_highbd_idct8_c(input, outptr, bd);
-    input += 8;
-    outptr += 8;
-  }
-  // Then transform columns.
-  for (i = 0; i < 8; ++i) {
-    for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i];
-    av1_highbd_idct8_c(temp_in, temp_out, bd);
-    for (j = 0; j < 8; ++j) {
-      dest[j * stride + i] = highbd_clip_pixel_add(
-          dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd);
-    }
-  }
-}
-
-void av1_highbd_idct16_c(const tran_low_t *input, tran_low_t *output, int bd) {
-  tran_low_t step1[16], step2[16];
-  tran_high_t temp1, temp2;
-  (void)bd;
-
-  // stage 1
-  step1[0] = input[0 / 2];
-  step1[1] = input[16 / 2];
-  step1[2] = input[8 / 2];
-  step1[3] = input[24 / 2];
-  step1[4] = input[4 / 2];
-  step1[5] = input[20 / 2];
-  step1[6] = input[12 / 2];
-  step1[7] = input[28 / 2];
-  step1[8] = input[2 / 2];
-  step1[9] = input[18 / 2];
-  step1[10] = input[10 / 2];
-  step1[11] = input[26 / 2];
-  step1[12] = input[6 / 2];
-  step1[13] = input[22 / 2];
-  step1[14] = input[14 / 2];
-  step1[15] = input[30 / 2];
-
-  // stage 2
-  step2[0] = step1[0];
-  step2[1] = step1[1];
-  step2[2] = step1[2];
-  step2[3] = step1[3];
-  step2[4] = step1[4];
-  step2[5] = step1[5];
-  step2[6] = step1[6];
-  step2[7] = step1[7];
-
-  temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
-  temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
-  step2[8] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
-  step2[15] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
-
-  temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
-  temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
-  step2[9] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
-  step2[14] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
-
-  temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
-  temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
-  step2[10] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
-  step2[13] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
-
-  temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
-  temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
-  step2[11] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
-  step2[12] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
-
-  // stage 3
-  step1[0] = step2[0];
-  step1[1] = step2[1];
-  step1[2] = step2[2];
-  step1[3] = step2[3];
-
-  temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
-  temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
-  step1[4] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
-  step1[7] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
-  temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
-  temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
-  step1[5] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
-  step1[6] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
-
-  step1[8] = HIGHBD_WRAPLOW(step2[8] + step2[9], bd);
-  step1[9] = HIGHBD_WRAPLOW(step2[8] - step2[9], bd);
-  step1[10] = HIGHBD_WRAPLOW(-step2[10] + step2[11], bd);
-  step1[11] = HIGHBD_WRAPLOW(step2[10] + step2[11], bd);
-  step1[12] = HIGHBD_WRAPLOW(step2[12] + step2[13], bd);
-  step1[13] = HIGHBD_WRAPLOW(step2[12] - step2[13], bd);
-  step1[14] = HIGHBD_WRAPLOW(-step2[14] + step2[15], bd);
-  step1[15] = HIGHBD_WRAPLOW(step2[14] + step2[15], bd);
-
-  // stage 4
-  temp1 = (step1[0] + step1[1]) * cospi_16_64;
-  temp2 = (step1[0] - step1[1]) * cospi_16_64;
-  step2[0] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
-  step2[1] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
-  temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
-  temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
-  step2[2] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
-  step2[3] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
-  step2[4] = HIGHBD_WRAPLOW(step1[4] + step1[5], bd);
-  step2[5] = HIGHBD_WRAPLOW(step1[4] - step1[5], bd);
-  step2[6] = HIGHBD_WRAPLOW(-step1[6] + step1[7], bd);
-  step2[7] = HIGHBD_WRAPLOW(step1[6] + step1[7], bd);
-
-  step2[8] = step1[8];
-  step2[15] = step1[15];
-  temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
-  temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
-  step2[9] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
-  step2[14] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
-  temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
-  temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
-  step2[10] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
-  step2[13] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
-  step2[11] = step1[11];
-  step2[12] = step1[12];
-
-  // stage 5
-  step1[0] = HIGHBD_WRAPLOW(step2[0] + step2[3], bd);
-  step1[1] = HIGHBD_WRAPLOW(step2[1] + step2[2], bd);
-  step1[2] = HIGHBD_WRAPLOW(step2[1] - step2[2], bd);
-  step1[3] = HIGHBD_WRAPLOW(step2[0] - step2[3], bd);
-  step1[4] = step2[4];
-  temp1 = (step2[6] - step2[5]) * cospi_16_64;
-  temp2 = (step2[5] + step2[6]) * cospi_16_64;
-  step1[5] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
-  step1[6] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
-  step1[7] = step2[7];
-
-  step1[8] = HIGHBD_WRAPLOW(step2[8] + step2[11], bd);
-  step1[9] = HIGHBD_WRAPLOW(step2[9] + step2[10], bd);
-  step1[10] = HIGHBD_WRAPLOW(step2[9] - step2[10], bd);
-  step1[11] = HIGHBD_WRAPLOW(step2[8] - step2[11], bd);
-  step1[12] = HIGHBD_WRAPLOW(-step2[12] + step2[15], bd);
-  step1[13] = HIGHBD_WRAPLOW(-step2[13] + step2[14], bd);
-  step1[14] = HIGHBD_WRAPLOW(step2[13] + step2[14], bd);
-  step1[15] = HIGHBD_WRAPLOW(step2[12] + step2[15], bd);
-
-  // stage 6
-  step2[0] = HIGHBD_WRAPLOW(step1[0] + step1[7], bd);
-  step2[1] = HIGHBD_WRAPLOW(step1[1] + step1[6], bd);
-  step2[2] = HIGHBD_WRAPLOW(step1[2] + step1[5], bd);
-  step2[3] = HIGHBD_WRAPLOW(step1[3] + step1[4], bd);
-  step2[4] = HIGHBD_WRAPLOW(step1[3] - step1[4], bd);
-  step2[5] = HIGHBD_WRAPLOW(step1[2] - step1[5], bd);
-  step2[6] = HIGHBD_WRAPLOW(step1[1] - step1[6], bd);
-  step2[7] = HIGHBD_WRAPLOW(step1[0] - step1[7], bd);
-  step2[8] = step1[8];
-  step2[9] = step1[9];
-  temp1 = (-step1[10] + step1[13]) * cospi_16_64;
-  temp2 = (step1[10] + step1[13]) * cospi_16_64;
-  step2[10] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
-  step2[13] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
-  temp1 = (-step1[11] + step1[12]) * cospi_16_64;
-  temp2 = (step1[11] + step1[12]) * cospi_16_64;
-  step2[11] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
-  step2[12] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
-  step2[14] = step1[14];
-  step2[15] = step1[15];
-
-  // stage 7
-  output[0] = HIGHBD_WRAPLOW(step2[0] + step2[15], bd);
-  output[1] = HIGHBD_WRAPLOW(step2[1] + step2[14], bd);
-  output[2] = HIGHBD_WRAPLOW(step2[2] + step2[13], bd);
-  output[3] = HIGHBD_WRAPLOW(step2[3] + step2[12], bd);
-  output[4] = HIGHBD_WRAPLOW(step2[4] + step2[11], bd);
-  output[5] = HIGHBD_WRAPLOW(step2[5] + step2[10], bd);
-  output[6] = HIGHBD_WRAPLOW(step2[6] + step2[9], bd);
-  output[7] = HIGHBD_WRAPLOW(step2[7] + step2[8], bd);
-  output[8] = HIGHBD_WRAPLOW(step2[7] - step2[8], bd);
-  output[9] = HIGHBD_WRAPLOW(step2[6] - step2[9], bd);
-  output[10] = HIGHBD_WRAPLOW(step2[5] - step2[10], bd);
-  output[11] = HIGHBD_WRAPLOW(step2[4] - step2[11], bd);
-  output[12] = HIGHBD_WRAPLOW(step2[3] - step2[12], bd);
-  output[13] = HIGHBD_WRAPLOW(step2[2] - step2[13], bd);
-  output[14] = HIGHBD_WRAPLOW(step2[1] - step2[14], bd);
-  output[15] = HIGHBD_WRAPLOW(step2[0] - step2[15], bd);
-}
-
-void av1_highbd_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest8,
-                                    int stride, int bd) {
-  tran_low_t out[16 * 16];
-  tran_low_t *outptr = out;
-  int i, j;
-  tran_low_t temp_in[16], temp_out[16];
-  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
-
-  // First transform rows.
-  for (i = 0; i < 16; ++i) {
-    av1_highbd_idct16_c(input, outptr, bd);
-    input += 16;
-    outptr += 16;
-  }
-
-  // Then transform columns.
-  for (i = 0; i < 16; ++i) {
-    for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i];
-    av1_highbd_idct16_c(temp_in, temp_out, bd);
-    for (j = 0; j < 16; ++j) {
-      dest[j * stride + i] = highbd_clip_pixel_add(
-          dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
-    }
-  }
-}
-
-void av1_highbd_iadst16_c(const tran_low_t *input, tran_low_t *output, int bd) {
-  tran_high_t s0, s1, s2, s3, s4, s5, s6, s7, s8;
-  tran_high_t s9, s10, s11, s12, s13, s14, s15;
-
-  tran_low_t x0 = input[15];
-  tran_low_t x1 = input[0];
-  tran_low_t x2 = input[13];
-  tran_low_t x3 = input[2];
-  tran_low_t x4 = input[11];
-  tran_low_t x5 = input[4];
-  tran_low_t x6 = input[9];
-  tran_low_t x7 = input[6];
-  tran_low_t x8 = input[7];
-  tran_low_t x9 = input[8];
-  tran_low_t x10 = input[5];
-  tran_low_t x11 = input[10];
-  tran_low_t x12 = input[3];
-  tran_low_t x13 = input[12];
-  tran_low_t x14 = input[1];
-  tran_low_t x15 = input[14];
-  (void)bd;
-
-  if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8 | x9 | x10 | x11 | x12 |
-        x13 | x14 | x15)) {
-    memset(output, 0, 16 * sizeof(*output));
-    return;
-  }
-
-  // stage 1
-  s0 = x0 * cospi_1_64 + x1 * cospi_31_64;
-  s1 = x0 * cospi_31_64 - x1 * cospi_1_64;
-  s2 = x2 * cospi_5_64 + x3 * cospi_27_64;
-  s3 = x2 * cospi_27_64 - x3 * cospi_5_64;
-  s4 = x4 * cospi_9_64 + x5 * cospi_23_64;
-  s5 = x4 * cospi_23_64 - x5 * cospi_9_64;
-  s6 = x6 * cospi_13_64 + x7 * cospi_19_64;
-  s7 = x6 * cospi_19_64 - x7 * cospi_13_64;
-  s8 = x8 * cospi_17_64 + x9 * cospi_15_64;
-  s9 = x8 * cospi_15_64 - x9 * cospi_17_64;
-  s10 = x10 * cospi_21_64 + x11 * cospi_11_64;
-  s11 = x10 * cospi_11_64 - x11 * cospi_21_64;
-  s12 = x12 * cospi_25_64 + x13 * cospi_7_64;
-  s13 = x12 * cospi_7_64 - x13 * cospi_25_64;
-  s14 = x14 * cospi_29_64 + x15 * cospi_3_64;
-  s15 = x14 * cospi_3_64 - x15 * cospi_29_64;
-
-  x0 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s0 + s8), bd);
-  x1 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s1 + s9), bd);
-  x2 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s2 + s10), bd);
-  x3 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s3 + s11), bd);
-  x4 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s4 + s12), bd);
-  x5 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s5 + s13), bd);
-  x6 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s6 + s14), bd);
-  x7 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s7 + s15), bd);
-  x8 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s0 - s8), bd);
-  x9 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s1 - s9), bd);
-  x10 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s2 - s10), bd);
-  x11 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s3 - s11), bd);
-  x12 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s4 - s12), bd);
-  x13 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s5 - s13), bd);
-  x14 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s6 - s14), bd);
-  x15 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s7 - s15), bd);
-
-  // stage 2
-  s0 = x0;
-  s1 = x1;
-  s2 = x2;
-  s3 = x3;
-  s4 = x4;
-  s5 = x5;
-  s6 = x6;
-  s7 = x7;
-  s8 = x8 * cospi_4_64 + x9 * cospi_28_64;
-  s9 = x8 * cospi_28_64 - x9 * cospi_4_64;
-  s10 = x10 * cospi_20_64 + x11 * cospi_12_64;
-  s11 = x10 * cospi_12_64 - x11 * cospi_20_64;
-  s12 = -x12 * cospi_28_64 + x13 * cospi_4_64;
-  s13 = x12 * cospi_4_64 + x13 * cospi_28_64;
-  s14 = -x14 * cospi_12_64 + x15 * cospi_20_64;
-  s15 = x14 * cospi_20_64 + x15 * cospi_12_64;
-
-  x0 = HIGHBD_WRAPLOW(s0 + s4, bd);
-  x1 = HIGHBD_WRAPLOW(s1 + s5, bd);
-  x2 = HIGHBD_WRAPLOW(s2 + s6, bd);
-  x3 = HIGHBD_WRAPLOW(s3 + s7, bd);
-  x4 = HIGHBD_WRAPLOW(s0 - s4, bd);
-  x5 = HIGHBD_WRAPLOW(s1 - s5, bd);
-  x6 = HIGHBD_WRAPLOW(s2 - s6, bd);
-  x7 = HIGHBD_WRAPLOW(s3 - s7, bd);
-  x8 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s8 + s12), bd);
-  x9 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s9 + s13), bd);
-  x10 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s10 + s14), bd);
-  x11 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s11 + s15), bd);
-  x12 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s8 - s12), bd);
-  x13 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s9 - s13), bd);
-  x14 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s10 - s14), bd);
-  x15 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s11 - s15), bd);
-
-  // stage 3
-  s0 = x0;
-  s1 = x1;
-  s2 = x2;
-  s3 = x3;
-  s4 = x4 * cospi_8_64 + x5 * cospi_24_64;
-  s5 = x4 * cospi_24_64 - x5 * cospi_8_64;
-  s6 = -x6 * cospi_24_64 + x7 * cospi_8_64;
-  s7 = x6 * cospi_8_64 + x7 * cospi_24_64;
-  s8 = x8;
-  s9 = x9;
-  s10 = x10;
-  s11 = x11;
-  s12 = x12 * cospi_8_64 + x13 * cospi_24_64;
-  s13 = x12 * cospi_24_64 - x13 * cospi_8_64;
-  s14 = -x14 * cospi_24_64 + x15 * cospi_8_64;
-  s15 = x14 * cospi_8_64 + x15 * cospi_24_64;
-
-  x0 = HIGHBD_WRAPLOW(s0 + s2, bd);
-  x1 = HIGHBD_WRAPLOW(s1 + s3, bd);
-  x2 = HIGHBD_WRAPLOW(s0 - s2, bd);
-  x3 = HIGHBD_WRAPLOW(s1 - s3, bd);
-  x4 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s4 + s6), bd);
-  x5 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s5 + s7), bd);
-  x6 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s4 - s6), bd);
-  x7 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s5 - s7), bd);
-  x8 = HIGHBD_WRAPLOW(s8 + s10, bd);
-  x9 = HIGHBD_WRAPLOW(s9 + s11, bd);
-  x10 = HIGHBD_WRAPLOW(s8 - s10, bd);
-  x11 = HIGHBD_WRAPLOW(s9 - s11, bd);
-  x12 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s12 + s14), bd);
-  x13 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s13 + s15), bd);
-  x14 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s12 - s14), bd);
-  x15 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s13 - s15), bd);
-
-  // stage 4
-  s2 = (-cospi_16_64) * (x2 + x3);
-  s3 = cospi_16_64 * (x2 - x3);
-  s6 = cospi_16_64 * (x6 + x7);
-  s7 = cospi_16_64 * (-x6 + x7);
-  s10 = cospi_16_64 * (x10 + x11);
-  s11 = cospi_16_64 * (-x10 + x11);
-  s14 = (-cospi_16_64) * (x14 + x15);
-  s15 = cospi_16_64 * (x14 - x15);
-
-  x2 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s2), bd);
-  x3 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s3), bd);
-  x6 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s6), bd);
-  x7 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s7), bd);
-  x10 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s10), bd);
-  x11 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s11), bd);
-  x14 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s14), bd);
-  x15 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s15), bd);
-
-  output[0] = HIGHBD_WRAPLOW(x0, bd);
-  output[1] = HIGHBD_WRAPLOW(-x8, bd);
-  output[2] = HIGHBD_WRAPLOW(x12, bd);
-  output[3] = HIGHBD_WRAPLOW(-x4, bd);
-  output[4] = HIGHBD_WRAPLOW(x6, bd);
-  output[5] = HIGHBD_WRAPLOW(x14, bd);
-  output[6] = HIGHBD_WRAPLOW(x10, bd);
-  output[7] = HIGHBD_WRAPLOW(x2, bd);
-  output[8] = HIGHBD_WRAPLOW(x3, bd);
-  output[9] = HIGHBD_WRAPLOW(x11, bd);
-  output[10] = HIGHBD_WRAPLOW(x15, bd);
-  output[11] = HIGHBD_WRAPLOW(x7, bd);
-  output[12] = HIGHBD_WRAPLOW(x5, bd);
-  output[13] = HIGHBD_WRAPLOW(-x13, bd);
-  output[14] = HIGHBD_WRAPLOW(x9, bd);
-  output[15] = HIGHBD_WRAPLOW(-x1, bd);
-}
-
-void av1_highbd_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest8,
-                                   int stride, int bd) {
-  tran_low_t out[16 * 16] = { 0 };
-  tran_low_t *outptr = out;
-  int i, j;
-  tran_low_t temp_in[16], temp_out[16];
-  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
-
-  // First transform rows. Since all non-zero DCT coefficients are in the
-  // upper-left 4x4 area, we only need to calculate the first 4 rows here.
-  for (i = 0; i < 4; ++i) {
-    av1_highbd_idct16_c(input, outptr, bd);
-    input += 16;
-    outptr += 16;
-  }
-
-  // Then transform columns.
-  for (i = 0; i < 16; ++i) {
-    for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i];
-    av1_highbd_idct16_c(temp_in, temp_out, bd);
-    for (j = 0; j < 16; ++j) {
-      dest[j * stride + i] = highbd_clip_pixel_add(
-          dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
-    }
-  }
-}
-
-void av1_highbd_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest8,
-                                  int stride, int bd) {
-  int i, j;
-  tran_high_t a1;
-  tran_low_t out =
-      HIGHBD_WRAPLOW(highbd_dct_const_round_shift(input[0] * cospi_16_64), bd);
-  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
-
-  out = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(out * cospi_16_64), bd);
-  a1 = ROUND_POWER_OF_TWO(out, 6);
-  for (j = 0; j < 16; ++j) {
-    for (i = 0; i < 16; ++i) dest[i] = highbd_clip_pixel_add(dest[i], a1, bd);
-    dest += stride;
-  }
-}
-
-static void highbd_idct32_c(const tran_low_t *input, tran_low_t *output,
-                            int bd) {
-  tran_low_t step1[32], step2[32];
-  tran_high_t temp1, temp2;
-  (void)bd;
-
-  // stage 1
-  step1[0] = input[0];
-  step1[1] = input[16];
-  step1[2] = input[8];
-  step1[3] = input[24];
-  step1[4] = input[4];
-  step1[5] = input[20];
-  step1[6] = input[12];
-  step1[7] = input[28];
-  step1[8] = input[2];
-  step1[9] = input[18];
-  step1[10] = input[10];
-  step1[11] = input[26];
-  step1[12] = input[6];
-  step1[13] = input[22];
-  step1[14] = input[14];
-  step1[15] = input[30];
-
-  temp1 = input[1] * cospi_31_64 - input[31] * cospi_1_64;
-  temp2 = input[1] * cospi_1_64 + input[31] * cospi_31_64;
-  step1[16] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
-  step1[31] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
-
-  temp1 = input[17] * cospi_15_64 - input[15] * cospi_17_64;
-  temp2 = input[17] * cospi_17_64 + input[15] * cospi_15_64;
-  step1[17] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
-  step1[30] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
-
-  temp1 = input[9] * cospi_23_64 - input[23] * cospi_9_64;
-  temp2 = input[9] * cospi_9_64 + input[23] * cospi_23_64;
-  step1[18] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
-  step1[29] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
-
-  temp1 = input[25] * cospi_7_64 - input[7] * cospi_25_64;
-  temp2 = input[25] * cospi_25_64 + input[7] * cospi_7_64;
-  step1[19] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
-  step1[28] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
-
-  temp1 = input[5] * cospi_27_64 - input[27] * cospi_5_64;
-  temp2 = input[5] * cospi_5_64 + input[27] * cospi_27_64;
-  step1[20] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
-  step1[27] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
-
-  temp1 = input[21] * cospi_11_64 - input[11] * cospi_21_64;
-  temp2 = input[21] * cospi_21_64 + input[11] * cospi_11_64;
-  step1[21] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
-  step1[26] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
-
-  temp1 = input[13] * cospi_19_64 - input[19] * cospi_13_64;
-  temp2 = input[13] * cospi_13_64 + input[19] * cospi_19_64;
-  step1[22] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
-  step1[25] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
-
-  temp1 = input[29] * cospi_3_64 - input[3] * cospi_29_64;
-  temp2 = input[29] * cospi_29_64 + input[3] * cospi_3_64;
-  step1[23] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
-  step1[24] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
-
-  // stage 2
-  step2[0] = step1[0];
-  step2[1] = step1[1];
-  step2[2] = step1[2];
-  step2[3] = step1[3];
-  step2[4] = step1[4];
-  step2[5] = step1[5];
-  step2[6] = step1[6];
-  step2[7] = step1[7];
-
-  temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
-  temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
-  step2[8] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
-  step2[15] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
-
-  temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
-  temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
-  step2[9] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
-  step2[14] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
-
-  temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
-  temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
-  step2[10] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
-  step2[13] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
-
-  temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
-  temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
-  step2[11] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
-  step2[12] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
-
-  step2[16] = HIGHBD_WRAPLOW(step1[16] + step1[17], bd);
-  step2[17] = HIGHBD_WRAPLOW(step1[16] - step1[17], bd);
-  step2[18] = HIGHBD_WRAPLOW(-step1[18] + step1[19], bd);
-  step2[19] = HIGHBD_WRAPLOW(step1[18] + step1[19], bd);
-  step2[20] = HIGHBD_WRAPLOW(step1[20] + step1[21], bd);
-  step2[21] = HIGHBD_WRAPLOW(step1[20] - step1[21], bd);
-  step2[22] = HIGHBD_WRAPLOW(-step1[22] + step1[23], bd);
-  step2[23] = HIGHBD_WRAPLOW(step1[22] + step1[23], bd);
-  step2[24] = HIGHBD_WRAPLOW(step1[24] + step1[25], bd);
-  step2[25] = HIGHBD_WRAPLOW(step1[24] - step1[25], bd);
-  step2[26] = HIGHBD_WRAPLOW(-step1[26] + step1[27], bd);
-  step2[27] = HIGHBD_WRAPLOW(step1[26] + step1[27], bd);
-  step2[28] = HIGHBD_WRAPLOW(step1[28] + step1[29], bd);
-  step2[29] = HIGHBD_WRAPLOW(step1[28] - step1[29], bd);
-  step2[30] = HIGHBD_WRAPLOW(-step1[30] + step1[31], bd);
-  step2[31] = HIGHBD_WRAPLOW(step1[30] + step1[31], bd);
-
-  // stage 3
-  step1[0] = step2[0];
-  step1[1] = step2[1];
-  step1[2] = step2[2];
-  step1[3] = step2[3];
-
-  temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
-  temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
-  step1[4] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
-  step1[7] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
-  temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
-  temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
-  step1[5] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
-  step1[6] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
-
-  step1[8] = HIGHBD_WRAPLOW(step2[8] + step2[9], bd);
-  step1[9] = HIGHBD_WRAPLOW(step2[8] - step2[9], bd);
-  step1[10] = HIGHBD_WRAPLOW(-step2[10] + step2[11], bd);
-  step1[11] = HIGHBD_WRAPLOW(step2[10] + step2[11], bd);
-  step1[12] = HIGHBD_WRAPLOW(step2[12] + step2[13], bd);
-  step1[13] = HIGHBD_WRAPLOW(step2[12] - step2[13], bd);
-  step1[14] = HIGHBD_WRAPLOW(-step2[14] + step2[15], bd);
-  step1[15] = HIGHBD_WRAPLOW(step2[14] + step2[15], bd);
-
-  step1[16] = step2[16];
-  step1[31] = step2[31];
-  temp1 = -step2[17] * cospi_4_64 + step2[30] * cospi_28_64;
-  temp2 = step2[17] * cospi_28_64 + step2[30] * cospi_4_64;
-  step1[17] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
-  step1[30] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
-  temp1 = -step2[18] * cospi_28_64 - step2[29] * cospi_4_64;
-  temp2 = -step2[18] * cospi_4_64 + step2[29] * cospi_28_64;
-  step1[18] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
-  step1[29] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
-  step1[19] = step2[19];
-  step1[20] = step2[20];
-  temp1 = -step2[21] * cospi_20_64 + step2[26] * cospi_12_64;
-  temp2 = step2[21] * cospi_12_64 + step2[26] * cospi_20_64;
-  step1[21] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
-  step1[26] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
-  temp1 = -step2[22] * cospi_12_64 - step2[25] * cospi_20_64;
-  temp2 = -step2[22] * cospi_20_64 + step2[25] * cospi_12_64;
-  step1[22] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
-  step1[25] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
-  step1[23] = step2[23];
-  step1[24] = step2[24];
-  step1[27] = step2[27];
-  step1[28] = step2[28];
-
-  // stage 4
-  temp1 = (step1[0] + step1[1]) * cospi_16_64;
-  temp2 = (step1[0] - step1[1]) * cospi_16_64;
-  step2[0] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
-  step2[1] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
-  temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
-  temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
-  step2[2] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
-  step2[3] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
-  step2[4] = HIGHBD_WRAPLOW(step1[4] + step1[5], bd);
-  step2[5] = HIGHBD_WRAPLOW(step1[4] - step1[5], bd);
-  step2[6] = HIGHBD_WRAPLOW(-step1[6] + step1[7], bd);
-  step2[7] = HIGHBD_WRAPLOW(step1[6] + step1[7], bd);
-
-  step2[8] = step1[8];
-  step2[15] = step1[15];
-  temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
-  temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
-  step2[9] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
-  step2[14] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
-  temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
-  temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
-  step2[10] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
-  step2[13] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
-  step2[11] = step1[11];
-  step2[12] = step1[12];
-
-  step2[16] = HIGHBD_WRAPLOW(step1[16] + step1[19], bd);
-  step2[17] = HIGHBD_WRAPLOW(step1[17] + step1[18], bd);
-  step2[18] = HIGHBD_WRAPLOW(step1[17] - step1[18], bd);
-  step2[19] = HIGHBD_WRAPLOW(step1[16] - step1[19], bd);
-  step2[20] = HIGHBD_WRAPLOW(-step1[20] + step1[23], bd);
-  step2[21] = HIGHBD_WRAPLOW(-step1[21] + step1[22], bd);
-  step2[22] = HIGHBD_WRAPLOW(step1[21] + step1[22], bd);
-  step2[23] = HIGHBD_WRAPLOW(step1[20] + step1[23], bd);
-
-  step2[24] = HIGHBD_WRAPLOW(step1[24] + step1[27], bd);
-  step2[25] = HIGHBD_WRAPLOW(step1[25] + step1[26], bd);
-  step2[26] = HIGHBD_WRAPLOW(step1[25] - step1[26], bd);
-  step2[27] = HIGHBD_WRAPLOW(step1[24] - step1[27], bd);
-  step2[28] = HIGHBD_WRAPLOW(-step1[28] + step1[31], bd);
-  step2[29] = HIGHBD_WRAPLOW(-step1[29] + step1[30], bd);
-  step2[30] = HIGHBD_WRAPLOW(step1[29] + step1[30], bd);
-  step2[31] = HIGHBD_WRAPLOW(step1[28] + step1[31], bd);
-
-  // stage 5
-  step1[0] = HIGHBD_WRAPLOW(step2[0] + step2[3], bd);
-  step1[1] = HIGHBD_WRAPLOW(step2[1] + step2[2], bd);
-  step1[2] = HIGHBD_WRAPLOW(step2[1] - step2[2], bd);
-  step1[3] = HIGHBD_WRAPLOW(step2[0] - step2[3], bd);
-  step1[4] = step2[4];
-  temp1 = (step2[6] - step2[5]) * cospi_16_64;
-  temp2 = (step2[5] + step2[6]) * cospi_16_64;
-  step1[5] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
-  step1[6] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
-  step1[7] = step2[7];
-
-  step1[8] = HIGHBD_WRAPLOW(step2[8] + step2[11], bd);
-  step1[9] = HIGHBD_WRAPLOW(step2[9] + step2[10], bd);
-  step1[10] = HIGHBD_WRAPLOW(step2[9] - step2[10], bd);
-  step1[11] = HIGHBD_WRAPLOW(step2[8] - step2[11], bd);
-  step1[12] = HIGHBD_WRAPLOW(-step2[12] + step2[15], bd);
-  step1[13] = HIGHBD_WRAPLOW(-step2[13] + step2[14], bd);
-  step1[14] = HIGHBD_WRAPLOW(step2[13] + step2[14], bd);
-  step1[15] = HIGHBD_WRAPLOW(step2[12] + step2[15], bd);
-
-  step1[16] = step2[16];
-  step1[17] = step2[17];
-  temp1 = -step2[18] * cospi_8_64 + step2[29] * cospi_24_64;
-  temp2 = step2[18] * cospi_24_64 + step2[29] * cospi_8_64;
-  step1[18] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
-  step1[29] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
-  temp1 = -step2[19] * cospi_8_64 + step2[28] * cospi_24_64;
-  temp2 = step2[19] * cospi_24_64 + step2[28] * cospi_8_64;
-  step1[19] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
-  step1[28] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
-  temp1 = -step2[20] * cospi_24_64 - step2[27] * cospi_8_64;
-  temp2 = -step2[20] * cospi_8_64 + step2[27] * cospi_24_64;
-  step1[20] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
-  step1[27] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
-  temp1 = -step2[21] * cospi_24_64 - step2[26] * cospi_8_64;
-  temp2 = -step2[21] * cospi_8_64 + step2[26] * cospi_24_64;
-  step1[21] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
-  step1[26] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
-  step1[22] = step2[22];
-  step1[23] = step2[23];
-  step1[24] = step2[24];
-  step1[25] = step2[25];
-  step1[30] = step2[30];
-  step1[31] = step2[31];
-
-  // stage 6
-  step2[0] = HIGHBD_WRAPLOW(step1[0] + step1[7], bd);
-  step2[1] = HIGHBD_WRAPLOW(step1[1] + step1[6], bd);
-  step2[2] = HIGHBD_WRAPLOW(step1[2] + step1[5], bd);
-  step2[3] = HIGHBD_WRAPLOW(step1[3] + step1[4], bd);
-  step2[4] = HIGHBD_WRAPLOW(step1[3] - step1[4], bd);
-  step2[5] = HIGHBD_WRAPLOW(step1[2] - step1[5], bd);
-  step2[6] = HIGHBD_WRAPLOW(step1[1] - step1[6], bd);
-  step2[7] = HIGHBD_WRAPLOW(step1[0] - step1[7], bd);
-  step2[8] = step1[8];
-  step2[9] = step1[9];
-  temp1 = (-step1[10] + step1[13]) * cospi_16_64;
-  temp2 = (step1[10] + step1[13]) * cospi_16_64;
-  step2[10] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
-  step2[13] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
-  temp1 = (-step1[11] + step1[12]) * cospi_16_64;
-  temp2 = (step1[11] + step1[12]) * cospi_16_64;
-  step2[11] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
-  step2[12] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
-  step2[14] = step1[14];
-  step2[15] = step1[15];
-
-  step2[16] = HIGHBD_WRAPLOW(step1[16] + step1[23], bd);
-  step2[17] = HIGHBD_WRAPLOW(step1[17] + step1[22], bd);
-  step2[18] = HIGHBD_WRAPLOW(step1[18] + step1[21], bd);
-  step2[19] = HIGHBD_WRAPLOW(step1[19] + step1[20], bd);
-  step2[20] = HIGHBD_WRAPLOW(step1[19] - step1[20], bd);
-  step2[21] = HIGHBD_WRAPLOW(step1[18] - step1[21], bd);
-  step2[22] = HIGHBD_WRAPLOW(step1[17] - step1[22], bd);
-  step2[23] = HIGHBD_WRAPLOW(step1[16] - step1[23], bd);
-
-  step2[24] = HIGHBD_WRAPLOW(-step1[24] + step1[31], bd);
-  step2[25] = HIGHBD_WRAPLOW(-step1[25] + step1[30], bd);
-  step2[26] = HIGHBD_WRAPLOW(-step1[26] + step1[29], bd);
-  step2[27] = HIGHBD_WRAPLOW(-step1[27] + step1[28], bd);
-  step2[28] = HIGHBD_WRAPLOW(step1[27] + step1[28], bd);
-  step2[29] = HIGHBD_WRAPLOW(step1[26] + step1[29], bd);
-  step2[30] = HIGHBD_WRAPLOW(step1[25] + step1[30], bd);
-  step2[31] = HIGHBD_WRAPLOW(step1[24] + step1[31], bd);
-
-  // stage 7
-  step1[0] = HIGHBD_WRAPLOW(step2[0] + step2[15], bd);
-  step1[1] = HIGHBD_WRAPLOW(step2[1] + step2[14], bd);
-  step1[2] = HIGHBD_WRAPLOW(step2[2] + step2[13], bd);
-  step1[3] = HIGHBD_WRAPLOW(step2[3] + step2[12], bd);
-  step1[4] = HIGHBD_WRAPLOW(step2[4] + step2[11], bd);
-  step1[5] = HIGHBD_WRAPLOW(step2[5] + step2[10], bd);
-  step1[6] = HIGHBD_WRAPLOW(step2[6] + step2[9], bd);
-  step1[7] = HIGHBD_WRAPLOW(step2[7] + step2[8], bd);
-  step1[8] = HIGHBD_WRAPLOW(step2[7] - step2[8], bd);
-  step1[9] = HIGHBD_WRAPLOW(step2[6] - step2[9], bd);
-  step1[10] = HIGHBD_WRAPLOW(step2[5] - step2[10], bd);
-  step1[11] = HIGHBD_WRAPLOW(step2[4] - step2[11], bd);
-  step1[12] = HIGHBD_WRAPLOW(step2[3] - step2[12], bd);
-  step1[13] = HIGHBD_WRAPLOW(step2[2] - step2[13], bd);
-  step1[14] = HIGHBD_WRAPLOW(step2[1] - step2[14], bd);
-  step1[15] = HIGHBD_WRAPLOW(step2[0] - step2[15], bd);
-
-  step1[16] = step2[16];
-  step1[17] = step2[17];
-  step1[18] = step2[18];
-  step1[19] = step2[19];
-  temp1 = (-step2[20] + step2[27]) * cospi_16_64;
-  temp2 = (step2[20] + step2[27]) * cospi_16_64;
-  step1[20] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
-  step1[27] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
-  temp1 = (-step2[21] + step2[26]) * cospi_16_64;
-  temp2 = (step2[21] + step2[26]) * cospi_16_64;
-  step1[21] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
-  step1[26] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
-  temp1 = (-step2[22] + step2[25]) * cospi_16_64;
-  temp2 = (step2[22] + step2[25]) * cospi_16_64;
-  step1[22] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
-  step1[25] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
-  temp1 = (-step2[23] + step2[24]) * cospi_16_64;
-  temp2 = (step2[23] + step2[24]) * cospi_16_64;
-  step1[23] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
-  step1[24] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
-  step1[28] = step2[28];
-  step1[29] = step2[29];
-  step1[30] = step2[30];
-  step1[31] = step2[31];
-
-  // final stage
-  output[0] = HIGHBD_WRAPLOW(step1[0] + step1[31], bd);
-  output[1] = HIGHBD_WRAPLOW(step1[1] + step1[30], bd);
-  output[2] = HIGHBD_WRAPLOW(step1[2] + step1[29], bd);
-  output[3] = HIGHBD_WRAPLOW(step1[3] + step1[28], bd);
-  output[4] = HIGHBD_WRAPLOW(step1[4] + step1[27], bd);
-  output[5] = HIGHBD_WRAPLOW(step1[5] + step1[26], bd);
-  output[6] = HIGHBD_WRAPLOW(step1[6] + step1[25], bd);
-  output[7] = HIGHBD_WRAPLOW(step1[7] + step1[24], bd);
-  output[8] = HIGHBD_WRAPLOW(step1[8] + step1[23], bd);
-  output[9] = HIGHBD_WRAPLOW(step1[9] + step1[22], bd);
-  output[10] = HIGHBD_WRAPLOW(step1[10] + step1[21], bd);
-  output[11] = HIGHBD_WRAPLOW(step1[11] + step1[20], bd);
-  output[12] = HIGHBD_WRAPLOW(step1[12] + step1[19], bd);
-  output[13] = HIGHBD_WRAPLOW(step1[13] + step1[18], bd);
-  output[14] = HIGHBD_WRAPLOW(step1[14] + step1[17], bd);
-  output[15] = HIGHBD_WRAPLOW(step1[15] + step1[16], bd);
-  output[16] = HIGHBD_WRAPLOW(step1[15] - step1[16], bd);
-  output[17] = HIGHBD_WRAPLOW(step1[14] - step1[17], bd);
-  output[18] = HIGHBD_WRAPLOW(step1[13] - step1[18], bd);
-  output[19] = HIGHBD_WRAPLOW(step1[12] - step1[19], bd);
-  output[20] = HIGHBD_WRAPLOW(step1[11] - step1[20], bd);
-  output[21] = HIGHBD_WRAPLOW(step1[10] - step1[21], bd);
-  output[22] = HIGHBD_WRAPLOW(step1[9] - step1[22], bd);
-  output[23] = HIGHBD_WRAPLOW(step1[8] - step1[23], bd);
-  output[24] = HIGHBD_WRAPLOW(step1[7] - step1[24], bd);
-  output[25] = HIGHBD_WRAPLOW(step1[6] - step1[25], bd);
-  output[26] = HIGHBD_WRAPLOW(step1[5] - step1[26], bd);
-  output[27] = HIGHBD_WRAPLOW(step1[4] - step1[27], bd);
-  output[28] = HIGHBD_WRAPLOW(step1[3] - step1[28], bd);
-  output[29] = HIGHBD_WRAPLOW(step1[2] - step1[29], bd);
-  output[30] = HIGHBD_WRAPLOW(step1[1] - step1[30], bd);
-  output[31] = HIGHBD_WRAPLOW(step1[0] - step1[31], bd);
-}
-
-void av1_highbd_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest8,
-                                     int stride, int bd) {
-  tran_low_t out[32 * 32];
-  tran_low_t *outptr = out;
-  int i, j;
-  tran_low_t temp_in[32], temp_out[32];
-  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
-
-  // Rows
-  for (i = 0; i < 32; ++i) {
-    tran_low_t zero_coeff[16];
-    for (j = 0; j < 16; ++j) zero_coeff[j] = input[2 * j] | input[2 * j + 1];
-    for (j = 0; j < 8; ++j)
-      zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
-    for (j = 0; j < 4; ++j)
-      zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
-    for (j = 0; j < 2; ++j)
-      zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
-
-    if (zero_coeff[0] | zero_coeff[1])
-      highbd_idct32_c(input, outptr, bd);
-    else
-      memset(outptr, 0, sizeof(tran_low_t) * 32);
-    input += 32;
-    outptr += 32;
-  }
-
-  // Columns
-  for (i = 0; i < 32; ++i) {
-    for (j = 0; j < 32; ++j) temp_in[j] = out[j * 32 + i];
-    highbd_idct32_c(temp_in, temp_out, bd);
-    for (j = 0; j < 32; ++j) {
-      dest[j * stride + i] = highbd_clip_pixel_add(
-          dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
-    }
-  }
-}
-
-void av1_highbd_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest8,
-                                   int stride, int bd) {
-  tran_low_t out[32 * 32] = { 0 };
-  tran_low_t *outptr = out;
-  int i, j;
-  tran_low_t temp_in[32], temp_out[32];
-  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
-
-  // Rows
-  // Only the upper-left 8x8 area has non-zero coefficients.
-  for (i = 0; i < 8; ++i) {
-    highbd_idct32_c(input, outptr, bd);
-    input += 32;
-    outptr += 32;
-  }
-  // Columns
-  for (i = 0; i < 32; ++i) {
-    for (j = 0; j < 32; ++j) temp_in[j] = out[j * 32 + i];
-    highbd_idct32_c(temp_in, temp_out, bd);
-    for (j = 0; j < 32; ++j) {
-      dest[j * stride + i] = highbd_clip_pixel_add(
-          dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
-    }
-  }
-}
-
-void av1_highbd_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest8,
-                                  int stride, int bd) {
-  int i, j;
-  int a1;
-  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
-
-  tran_low_t out =
-      HIGHBD_WRAPLOW(highbd_dct_const_round_shift(input[0] * cospi_16_64), bd);
-  out = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(out * cospi_16_64), bd);
-  a1 = ROUND_POWER_OF_TWO(out, 6);
-
-  for (j = 0; j < 32; ++j) {
-    for (i = 0; i < 32; ++i) dest[i] = highbd_clip_pixel_add(dest[i], a1, bd);
-    dest += stride;
-  }
-}
-#endif  // CONFIG_AOM_HIGHBITDEPTH
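Note on the code removed above: av1_highbd_idct32x32_1_add_c is the DC-only shortcut, where a single non-zero coefficient collapses the separable 2-D transform into one constant added to every pixel, while the _1024 variant skips all-zero rows via the hierarchical OR-reduction over zero_coeff. A minimal sketch of the DC-only arithmetic follows (assuming DCT_CONST_BITS == 14 and cospi_16_64 == 11585 == round(2^14 * cos(pi/4)), as defined in aom_dsp/txfm_common.h; names here are illustrative, not the library's):

#include <stdint.h>

#define DCT_CONST_BITS 14
#define ROUND_POWER_OF_TWO(value, n) (((value) + (1 << ((n)-1))) >> (n))

/* Sketch: per-pixel offset computed by the DC-only 32x32 inverse
 * transform. Each of the two 1-D passes scales the DC term by
 * cospi_16_64 with rounding. */
static int dc_only_offset(int32_t dc_coeff) {
  int64_t out = ROUND_POWER_OF_TWO((int64_t)dc_coeff * 11585, DCT_CONST_BITS);
  out = ROUND_POWER_OF_TWO(out * 11585, DCT_CONST_BITS);
  /* The 2-D transform carries 6 extra bits of precision at 32x32. */
  return (int)ROUND_POWER_OF_TWO(out, 6);
}

The same row/column structure explains the _10 and _34 variants above: only the rows that can hold non-zero coefficients are transformed, followed by all columns.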
diff --git a/av1/common/av1_inv_txfm.h b/av1/common/av1_inv_txfm.h
deleted file mode 100644
index c57e888..0000000
--- a/av1/common/av1_inv_txfm.h
+++ /dev/null
@@ -1,133 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#ifndef AOM_DSP_INV_TXFM_H_
-#define AOM_DSP_INV_TXFM_H_
-
-#include <assert.h>
-
-#include "./aom_config.h"
-#include "aom_dsp/txfm_common.h"
-#include "aom_dsp/inv_txfm.h"
-#include "aom_ports/mem.h"
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-static INLINE tran_high_t check_range(tran_high_t input) {
-#if CONFIG_COEFFICIENT_RANGE_CHECKING
-  // For valid input streams, intermediate stage coefficients should always
-  // stay within the range of a signed 16 bit integer. Coefficients can go out
-  // of this range for invalid/corrupt streams. However, strictly checking
-  // this range for every intermediate coefficient can be burdensome for a decoder,
-  // therefore the following assertion is only enabled when configured with
-  // --enable-coefficient-range-checking.
-  assert(INT16_MIN <= input);
-  assert(input <= INT16_MAX);
-#endif  // CONFIG_COEFFICIENT_RANGE_CHECKING
-  return input;
-}
-
-static INLINE tran_high_t dct_const_round_shift(tran_high_t input) {
-  tran_high_t rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS);
-  return rv;
-}
-
-#if CONFIG_AOM_HIGHBITDEPTH
-static INLINE tran_high_t highbd_check_range(tran_high_t input, int bd) {
-#if CONFIG_COEFFICIENT_RANGE_CHECKING
-  // For valid highbitdepth streams, intermediate stage coefficients will
-  // stay within the ranges:
-  // - 8 bit: signed 16 bit integer
-  // - 10 bit: signed 18 bit integer
-  // - 12 bit: signed 20 bit integer
-  const int32_t int_max = (1 << (7 + bd)) - 1;
-  const int32_t int_min = -int_max - 1;
-  assert(int_min <= input);
-  assert(input <= int_max);
-  (void)int_min;
-#endif  // CONFIG_COEFFICIENT_RANGE_CHECKING
-  (void)bd;
-  return input;
-}
-
-static INLINE tran_high_t highbd_dct_const_round_shift(tran_high_t input) {
-  tran_high_t rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS);
-  return rv;
-}
-#endif  // CONFIG_AOM_HIGHBITDEPTH
-
-#if CONFIG_EMULATE_HARDWARE
-// When CONFIG_EMULATE_HARDWARE is 1, the transform uses a
-// non-normative method to handle overflows. A stream that causes
-// overflows in the inverse transform is considered invalid, and a
-// hardware implementer is free to choose any reasonable method to
-// handle them. However, to aid hardware verification, implementers
-// can use a WRAPLOW() definition below that is identical to their
-// intended hardware implementation (and also use configure options
-// to select the C implementation of the transform).
-//
-// The particular WRAPLOW implementation below performs strict
-// overflow wrapping to match common hardware implementations.
-// bd of 8 uses tran_low with 16 bits, need to remove 16 bits
-// bd of 10 uses tran_low with 18 bits, need to remove 14 bits
-// bd of 12 uses tran_low with 20 bits, need to remove 12 bits
-// bd of x uses tran_low with 8+x bits, need to remove 24-x bits
-
-#define WRAPLOW(x) ((((int32_t)check_range(x)) << 16) >> 16)
-#if CONFIG_AOM_HIGHBITDEPTH
-#define HIGHBD_WRAPLOW(x, bd) \
-  ((((int32_t)highbd_check_range((x), bd)) << (24 - bd)) >> (24 - bd))
-#endif  // CONFIG_AOM_HIGHBITDEPTH
-
-#else  // CONFIG_EMULATE_HARDWARE
-
-#define WRAPLOW(x) ((int32_t)check_range(x))
-#if CONFIG_AOM_HIGHBITDEPTH
-#define HIGHBD_WRAPLOW(x, bd) ((int32_t)highbd_check_range((x), bd))
-#endif  // CONFIG_AOM_HIGHBITDEPTH
-
-#endif  // CONFIG_EMULATE_HARDWARE
-
-void av1_idct4_c(const tran_low_t *input, tran_low_t *output);
-void av1_idct8_c(const tran_low_t *input, tran_low_t *output);
-void av1_idct16_c(const tran_low_t *input, tran_low_t *output);
-void av1_idct32_c(const tran_low_t *input, tran_low_t *output);
-void av1_iadst4_c(const tran_low_t *input, tran_low_t *output);
-void av1_iadst8_c(const tran_low_t *input, tran_low_t *output);
-void av1_iadst16_c(const tran_low_t *input, tran_low_t *output);
-
-#if CONFIG_AOM_HIGHBITDEPTH
-void av1_highbd_idct4_c(const tran_low_t *input, tran_low_t *output, int bd);
-void av1_highbd_idct8_c(const tran_low_t *input, tran_low_t *output, int bd);
-void av1_highbd_idct16_c(const tran_low_t *input, tran_low_t *output, int bd);
-
-void av1_highbd_iadst4_c(const tran_low_t *input, tran_low_t *output, int bd);
-void av1_highbd_iadst8_c(const tran_low_t *input, tran_low_t *output, int bd);
-void av1_highbd_iadst16_c(const tran_low_t *input, tran_low_t *output, int bd);
-
-static INLINE uint16_t highbd_clip_pixel_add(uint16_t dest, tran_high_t trans,
-                                             int bd) {
-  trans = HIGHBD_WRAPLOW(trans, bd);
-  return clip_pixel_highbd(dest + (int)trans, bd);
-}
-#endif
-
-static INLINE uint8_t clip_pixel_add(uint8_t dest, tran_high_t trans) {
-  trans = WRAPLOW(trans);
-  return clip_pixel(dest + (int)trans);
-}
-#ifdef __cplusplus
-}  // extern "C"
-#endif
-#endif  // AOM_DSP_INV_TXFM_H_
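The HIGHBD_WRAPLOW macro in the header deleted above wraps an intermediate value to a signed (8 + bd)-bit range by shifting left and back right by (24 - bd). Left-shifting a negative signed integer is undefined behavior in C, the same class of issue the ubsan fix in this merge targets. A sketch of an equivalent, ubsan-clean wrap using unsigned arithmetic (not the library's code, just the standard mask-and-sign-extend idiom):

#include <stdint.h>

/* Sketch: wrap x to a signed (8 + bd)-bit value without shifting a
 * negative signed integer (which ubsan flags as undefined behavior). */
static int32_t highbd_wraplow_sketch(int32_t x, int bd) {
  const uint32_t mask = ((uint32_t)1 << (8 + bd)) - 1; /* low 8+bd bits */
  const uint32_t sign = (uint32_t)1 << (7 + bd);       /* sign bit */
  const uint32_t u = (uint32_t)x & mask;
  /* Sign-extend: subtract 2^(8+bd) when the sign bit is set. */
  return (int32_t)((u ^ sign) - sign);
}

For bd == 8 this keeps the low 16 bits, matching the non-highbd WRAPLOW(x) of ((((int32_t)check_range(x)) << 16) >> 16).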
diff --git a/av1/common/av1_rtcd_defs.pl b/av1/common/av1_rtcd_defs.pl
index e66826f..af98f79 100644
--- a/av1/common/av1_rtcd_defs.pl
+++ b/av1/common/av1_rtcd_defs.pl
@@ -61,25 +61,23 @@
     add_proto qw/void av1_iht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
     specialize qw/av1_iht4x4_16_add/;
 
-    if (aom_config("CONFIG_EXT_TX") eq "yes") {
-      add_proto qw/void av1_iht4x8_32_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
+    add_proto qw/void av1_iht4x8_32_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
       specialize qw/av1_iht4x8_32_add/;
 
-      add_proto qw/void av1_iht8x4_32_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
+    add_proto qw/void av1_iht8x4_32_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
       specialize qw/av1_iht8x4_32_add/;
 
-      add_proto qw/void av1_iht8x16_128_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
+    add_proto qw/void av1_iht8x16_128_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
       specialize qw/av1_iht8x16_128_add/;
 
-      add_proto qw/void av1_iht16x8_128_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
+    add_proto qw/void av1_iht16x8_128_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
       specialize qw/av1_iht16x8_128_add/;
 
-      add_proto qw/void av1_iht16x32_512_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
+    add_proto qw/void av1_iht16x32_512_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
       specialize qw/av1_iht16x32_512_add/;
 
-      add_proto qw/void av1_iht32x16_512_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
+    add_proto qw/void av1_iht32x16_512_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
       specialize qw/av1_iht32x16_512_add/;
-    }
 
     add_proto qw/void av1_iht8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
     specialize qw/av1_iht8x8_64_add/;
@@ -90,31 +88,29 @@
     add_proto qw/void av1_iht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
     specialize qw/av1_iht4x4_16_add sse2/;
 
-    if (aom_config("CONFIG_EXT_TX") eq "yes") {
-      add_proto qw/void av1_iht4x8_32_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
+    add_proto qw/void av1_iht4x8_32_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
       specialize qw/av1_iht4x8_32_add sse2/;
 
-      add_proto qw/void av1_iht8x4_32_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
+    add_proto qw/void av1_iht8x4_32_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
       specialize qw/av1_iht8x4_32_add sse2/;
 
-      add_proto qw/void av1_iht8x16_128_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
+    add_proto qw/void av1_iht8x16_128_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
       specialize qw/av1_iht8x16_128_add sse2/;
 
-      add_proto qw/void av1_iht16x8_128_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
+    add_proto qw/void av1_iht16x8_128_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
       specialize qw/av1_iht16x8_128_add sse2/;
 
-      add_proto qw/void av1_iht16x32_512_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
+    add_proto qw/void av1_iht16x32_512_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
       specialize qw/av1_iht16x32_512_add sse2/;
 
-      add_proto qw/void av1_iht32x16_512_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
+    add_proto qw/void av1_iht32x16_512_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
       specialize qw/av1_iht32x16_512_add sse2/;
-    }
 
     add_proto qw/void av1_iht8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
     specialize qw/av1_iht8x8_64_add sse2/;
 
     add_proto qw/void av1_iht16x16_256_add/, "const tran_low_t *input, uint8_t *output, int pitch, int tx_type";
-    specialize qw/av1_iht16x16_256_add sse2/;
+    specialize qw/av1_iht16x16_256_add sse2 avx2/;
   }
 } else {
   # Force C versions if CONFIG_EMULATE_HARDWARE is 1
@@ -122,25 +118,23 @@
     add_proto qw/void av1_iht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
     specialize qw/av1_iht4x4_16_add/;
 
-    if (aom_config("CONFIG_EXT_TX") eq "yes") {
-      add_proto qw/void av1_iht4x8_32_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
+    add_proto qw/void av1_iht4x8_32_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
       specialize qw/av1_iht4x8_32_add/;
 
-      add_proto qw/void av1_iht8x4_32_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
+    add_proto qw/void av1_iht8x4_32_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
       specialize qw/av1_iht8x4_32_add/;
 
-      add_proto qw/void av1_iht8x16_128_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
+    add_proto qw/void av1_iht8x16_128_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
       specialize qw/av1_iht8x16_128_add/;
 
-      add_proto qw/void av1_iht16x8_128_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
+    add_proto qw/void av1_iht16x8_128_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
       specialize qw/av1_iht16x8_128_add/;
 
-      add_proto qw/void av1_iht16x32_512_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
+    add_proto qw/void av1_iht16x32_512_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
       specialize qw/av1_iht16x32_512_add/;
 
-      add_proto qw/void av1_iht32x16_512_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
+    add_proto qw/void av1_iht32x16_512_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
       specialize qw/av1_iht32x16_512_add/;
-    }
 
     add_proto qw/void av1_iht8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
     specialize qw/av1_iht8x8_64_add/;
@@ -151,31 +145,29 @@
     add_proto qw/void av1_iht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
     specialize qw/av1_iht4x4_16_add sse2 neon dspr2/;
 
-    if (aom_config("CONFIG_EXT_TX") eq "yes") {
-      add_proto qw/void av1_iht4x8_32_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
+    add_proto qw/void av1_iht4x8_32_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
       specialize qw/av1_iht4x8_32_add sse2/;
 
-      add_proto qw/void av1_iht8x4_32_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
+    add_proto qw/void av1_iht8x4_32_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
       specialize qw/av1_iht8x4_32_add sse2/;
 
-      add_proto qw/void av1_iht8x16_128_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
+    add_proto qw/void av1_iht8x16_128_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
       specialize qw/av1_iht8x16_128_add sse2/;
 
-      add_proto qw/void av1_iht16x8_128_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
+    add_proto qw/void av1_iht16x8_128_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
       specialize qw/av1_iht16x8_128_add sse2/;
 
-      add_proto qw/void av1_iht16x32_512_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
+    add_proto qw/void av1_iht16x32_512_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
       specialize qw/av1_iht16x32_512_add sse2/;
 
-      add_proto qw/void av1_iht32x16_512_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
+    add_proto qw/void av1_iht32x16_512_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
       specialize qw/av1_iht32x16_512_add sse2/;
-    }
 
     add_proto qw/void av1_iht8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
     specialize qw/av1_iht8x8_64_add sse2 neon dspr2/;
 
     add_proto qw/void av1_iht16x16_256_add/, "const tran_low_t *input, uint8_t *output, int pitch, int tx_type";
-    specialize qw/av1_iht16x16_256_add sse2 dspr2/;
+    specialize qw/av1_iht16x16_256_add sse2 avx2 dspr2/;
 
     if (aom_config("CONFIG_EXT_TX") ne "yes") {
       specialize qw/av1_iht4x4_16_add msa/;
@@ -283,25 +275,23 @@
   add_proto qw/void av1_highbd_iht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type, int bd";
   specialize qw/av1_highbd_iht4x4_16_add/;
 
-  if (aom_config("CONFIG_EXT_TX") eq "yes") {
-    add_proto qw/void av1_highbd_iht4x8_32_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type, int bd";
+  add_proto qw/void av1_highbd_iht4x8_32_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type, int bd";
     specialize qw/av1_highbd_iht4x8_32_add/;
 
-    add_proto qw/void av1_highbd_iht8x4_32_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type, int bd";
+  add_proto qw/void av1_highbd_iht8x4_32_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type, int bd";
     specialize qw/av1_highbd_iht8x4_32_add/;
 
-    add_proto qw/void av1_highbd_iht8x16_128_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type, int bd";
+  add_proto qw/void av1_highbd_iht8x16_128_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type, int bd";
     specialize qw/av1_highbd_iht8x16_128_add/;
 
-    add_proto qw/void av1_highbd_iht16x8_128_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type, int bd";
+  add_proto qw/void av1_highbd_iht16x8_128_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type, int bd";
     specialize qw/av1_highbd_iht16x8_128_add/;
 
-    add_proto qw/void av1_highbd_iht16x32_512_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type, int bd";
+  add_proto qw/void av1_highbd_iht16x32_512_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type, int bd";
     specialize qw/av1_highbd_iht16x32_512_add/;
 
-    add_proto qw/void av1_highbd_iht32x16_512_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type, int bd";
+  add_proto qw/void av1_highbd_iht32x16_512_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type, int bd";
     specialize qw/av1_highbd_iht32x16_512_add/;
-  }
 
   add_proto qw/void av1_highbd_iht8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type, int bd";
   specialize qw/av1_highbd_iht8x8_64_add/;
@@ -394,6 +384,11 @@
 add_proto qw/void av1_fht32x32/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
 specialize qw/av1_fht32x32 avx2/;
 
+if (aom_config("CONFIG_TX64X64") eq "yes") {
+  add_proto qw/void av1_fht64x64/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
+  specialize qw/av1_fht64x64/;
+}
+
 if (aom_config("CONFIG_EXT_TX") eq "yes") {
   add_proto qw/void av1_fht4x8/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
   specialize qw/av1_fht4x8 sse2/;
@@ -414,62 +409,6 @@
   specialize qw/av1_fht32x16 sse2/;
 }
 
-if (aom_config("CONFIG_EMULATE_HARDWARE") eq "yes") {
-  add_proto qw/void av1_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride";
-  specialize qw/av1_fdct4x4/;
-
-  add_proto qw/void av1_fdct4x4_1/, "const int16_t *input, tran_low_t *output, int stride";
-  specialize qw/av1_fdct4x4_1/;
-
-  add_proto qw/void av1_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride";
-  specialize qw/av1_fdct8x8/;
-
-  add_proto qw/void av1_fdct8x8_1/, "const int16_t *input, tran_low_t *output, int stride";
-  specialize qw/av1_fdct8x8_1/;
-
-  add_proto qw/void av1_fdct16x16/, "const int16_t *input, tran_low_t *output, int stride";
-  specialize qw/av1_fdct16x16/;
-
-  add_proto qw/void av1_fdct16x16_1/, "const int16_t *input, tran_low_t *output, int stride";
-  specialize qw/av1_fdct16x16_1/;
-
-  add_proto qw/void av1_fdct32x32/, "const int16_t *input, tran_low_t *output, int stride";
-  specialize qw/av1_fdct32x32/;
-
-  add_proto qw/void av1_fdct32x32_rd/, "const int16_t *input, tran_low_t *output, int stride";
-  specialize qw/av1_fdct32x32_rd/;
-
-  add_proto qw/void av1_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride";
-  specialize qw/av1_fdct32x32_1/;
-} else {
-  add_proto qw/void av1_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride";
-  specialize qw/av1_fdct4x4 sse2/;
-
-  add_proto qw/void av1_fdct4x4_1/, "const int16_t *input, tran_low_t *output, int stride";
-  specialize qw/av1_fdct4x4_1 sse2/;
-
-  add_proto qw/void av1_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride";
-  specialize qw/av1_fdct8x8 sse2/;
-
-  add_proto qw/void av1_fdct8x8_1/, "const int16_t *input, tran_low_t *output, int stride";
-  specialize qw/av1_fdct8x8_1 sse2/;
-
-  add_proto qw/void av1_fdct16x16/, "const int16_t *input, tran_low_t *output, int stride";
-  specialize qw/av1_fdct16x16 sse2/;
-
-  add_proto qw/void av1_fdct16x16_1/, "const int16_t *input, tran_low_t *output, int stride";
-  specialize qw/av1_fdct16x16_1 sse2/;
-
-  add_proto qw/void av1_fdct32x32/, "const int16_t *input, tran_low_t *output, int stride";
-  specialize qw/av1_fdct32x32 sse2/;
-
-  add_proto qw/void av1_fdct32x32_rd/, "const int16_t *input, tran_low_t *output, int stride";
-  specialize qw/av1_fdct32x32_rd sse2/;
-
-  add_proto qw/void av1_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride";
-  specialize qw/av1_fdct32x32_1 sse2/;
-}
-
 if (aom_config("CONFIG_AOM_HIGHBITDEPTH") ne "yes") {
   if (aom_config("CONFIG_EXT_TX") ne "yes") {
     specialize qw/av1_fht4x4 msa/;
@@ -478,243 +417,9 @@
   }
 }
 
-if (aom_config("CONFIG_AOM_HIGHBITDEPTH") eq "yes") {
-  if (aom_config("CONFIG_EMULATE_HARDWARE") eq "yes") {
-    add_proto qw/void av1_highbd_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride";
-    specialize qw/av1_highbd_fdct4x4/;
-
-    add_proto qw/void av1_highbd_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride";
-    specialize qw/av1_highbd_fdct8x8/;
-
-    add_proto qw/void av1_highbd_fdct8x8_1/, "const int16_t *input, tran_low_t *output, int stride";
-    specialize qw/av1_highbd_fdct8x8_1/;
-
-    add_proto qw/void av1_highbd_fdct16x16/, "const int16_t *input, tran_low_t *output, int stride";
-    specialize qw/av1_highbd_fdct16x16/;
-
-    add_proto qw/void av1_highbd_fdct16x16_1/, "const int16_t *input, tran_low_t *output, int stride";
-    specialize qw/av1_highbd_fdct16x16_1/;
-
-    add_proto qw/void av1_highbd_fdct32x32/, "const int16_t *input, tran_low_t *output, int stride";
-    specialize qw/av1_highbd_fdct32x32/;
-
-    add_proto qw/void av1_highbd_fdct32x32_rd/, "const int16_t *input, tran_low_t *output, int stride";
-    specialize qw/av1_highbd_fdct32x32_rd/;
-
-    add_proto qw/void av1_highbd_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride";
-    specialize qw/av1_highbd_fdct32x32_1/;
-  } else {
-    add_proto qw/void av1_highbd_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride";
-    specialize qw/av1_highbd_fdct4x4 sse2/;
-
-    add_proto qw/void av1_highbd_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride";
-    specialize qw/av1_highbd_fdct8x8 sse2/;
-
-    add_proto qw/void av1_highbd_fdct8x8_1/, "const int16_t *input, tran_low_t *output, int stride";
-    specialize qw/av1_highbd_fdct8x8_1/;
-
-    add_proto qw/void av1_highbd_fdct16x16/, "const int16_t *input, tran_low_t *output, int stride";
-    specialize qw/av1_highbd_fdct16x16 sse2/;
-
-    add_proto qw/void av1_highbd_fdct16x16_1/, "const int16_t *input, tran_low_t *output, int stride";
-    specialize qw/av1_highbd_fdct16x16_1/;
-
-    add_proto qw/void av1_highbd_fdct32x32/, "const int16_t *input, tran_low_t *output, int stride";
-    specialize qw/av1_highbd_fdct32x32 sse2/;
-
-    add_proto qw/void av1_highbd_fdct32x32_rd/, "const int16_t *input, tran_low_t *output, int stride";
-    specialize qw/av1_highbd_fdct32x32_rd sse2/;
-
-    add_proto qw/void av1_highbd_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride";
-    specialize qw/av1_highbd_fdct32x32_1/;
-  }
-}
-
 add_proto qw/void av1_fwd_idtx/, "const int16_t *src_diff, tran_low_t *coeff, int stride, int bs, int tx_type";
   specialize qw/av1_fwd_idtx/;
 
-# Inverse transform
-if (aom_config("CONFIG_AOM_HIGHBITDEPTH") eq "yes") {
-  # Note: as optimized versions of these functions are added, we need a check to
-  # ensure that when CONFIG_EMULATE_HARDWARE is on, they default to the C versions only.
-  add_proto qw/void av1_idct4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
-  specialize qw/av1_idct4x4_1_add/;
-
-  add_proto qw/void av1_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
-  specialize qw/av1_idct4x4_16_add/;
-
-  add_proto qw/void av1_idct8x8_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
-  specialize qw/av1_idct8x8_1_add/;
-
-  add_proto qw/void av1_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
-  specialize qw/av1_idct8x8_64_add/;
-
-  add_proto qw/void av1_idct8x8_12_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
-  specialize qw/av1_idct8x8_12_add/;
-
-  add_proto qw/void av1_idct16x16_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
-  specialize qw/av1_idct16x16_1_add/;
-
-  add_proto qw/void av1_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
-  specialize qw/av1_idct16x16_256_add/;
-
-  add_proto qw/void av1_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
-  specialize qw/av1_idct16x16_10_add/;
-
-  add_proto qw/void av1_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
-  specialize qw/av1_idct32x32_1024_add/;
-
-  add_proto qw/void av1_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
-  specialize qw/av1_idct32x32_34_add/;
-
-  add_proto qw/void av1_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
-  specialize qw/av1_idct32x32_1_add/;
-
-  add_proto qw/void av1_iwht4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
-  specialize qw/av1_iwht4x4_1_add/;
-
-  add_proto qw/void av1_iwht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
-  specialize qw/av1_iwht4x4_16_add/;
-
-  add_proto qw/void av1_highbd_idct4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
-  specialize qw/av1_highbd_idct4x4_1_add/;
-
-  add_proto qw/void av1_highbd_idct8x8_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
-  specialize qw/av1_highbd_idct8x8_1_add/;
-
-  add_proto qw/void av1_highbd_idct16x16_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
-  specialize qw/av1_highbd_idct16x16_1_add/;
-
-  add_proto qw/void av1_highbd_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
-  specialize qw/av1_highbd_idct32x32_1024_add/;
-
-  add_proto qw/void av1_highbd_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
-  specialize qw/av1_highbd_idct32x32_34_add/;
-
-  add_proto qw/void av1_highbd_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
-  specialize qw/av1_highbd_idct32x32_1_add/;
-
-  add_proto qw/void av1_highbd_iwht4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
-  specialize qw/av1_highbd_iwht4x4_1_add/;
-
-  add_proto qw/void av1_highbd_iwht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
-  specialize qw/av1_highbd_iwht4x4_16_add/;
-
-  # Force C versions if CONFIG_EMULATE_HARDWARE is 1
-  if (aom_config("CONFIG_EMULATE_HARDWARE") eq "yes") {
-    add_proto qw/void av1_highbd_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
-    specialize qw/av1_highbd_idct4x4_16_add/;
-
-    add_proto qw/void av1_highbd_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
-    specialize qw/av1_highbd_idct8x8_64_add/;
-
-    add_proto qw/void av1_highbd_idct8x8_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
-    specialize qw/av1_highbd_idct8x8_10_add/;
-
-    add_proto qw/void av1_highbd_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
-    specialize qw/av1_highbd_idct16x16_256_add/;
-
-    add_proto qw/void av1_highbd_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
-    specialize qw/av1_highbd_idct16x16_10_add/;
-  } else {
-    add_proto qw/void av1_highbd_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
-    specialize qw/av1_highbd_idct4x4_16_add sse2/;
-
-    add_proto qw/void av1_highbd_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
-    specialize qw/av1_highbd_idct8x8_64_add sse2/;
-
-    add_proto qw/void av1_highbd_idct8x8_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
-    specialize qw/av1_highbd_idct8x8_10_add sse2/;
-
-    add_proto qw/void av1_highbd_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
-    specialize qw/av1_highbd_idct16x16_256_add sse2/;
-
-    add_proto qw/void av1_highbd_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
-    specialize qw/av1_highbd_idct16x16_10_add sse2/;
-  }  # CONFIG_EMULATE_HARDWARE
-} else {
-  # Force C versions if CONFIG_EMULATE_HARDWARE is 1
-  if (aom_config("CONFIG_EMULATE_HARDWARE") eq "yes") {
-    add_proto qw/void av1_idct4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
-    specialize qw/av1_idct4x4_1_add/;
-
-    add_proto qw/void av1_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
-    specialize qw/av1_idct4x4_16_add/;
-
-    add_proto qw/void av1_idct8x8_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
-    specialize qw/av1_idct8x8_1_add/;
-
-    add_proto qw/void av1_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
-    specialize qw/av1_idct8x8_64_add/;
-
-    add_proto qw/void av1_idct8x8_12_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
-    specialize qw/av1_idct8x8_12_add/;
-
-    add_proto qw/void av1_idct16x16_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
-    specialize qw/av1_idct16x16_1_add/;
-
-    add_proto qw/void av1_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
-    specialize qw/av1_idct16x16_256_add/;
-
-    add_proto qw/void av1_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
-    specialize qw/av1_idct16x16_10_add/;
-
-    add_proto qw/void av1_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
-    specialize qw/av1_idct32x32_1024_add/;
-
-    add_proto qw/void av1_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
-    specialize qw/av1_idct32x32_34_add/;
-
-    add_proto qw/void av1_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
-    specialize qw/av1_idct32x32_1_add/;
-
-    add_proto qw/void av1_iwht4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
-    specialize qw/av1_iwht4x4_1_add/;
-
-    add_proto qw/void av1_iwht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
-    specialize qw/av1_iwht4x4_16_add/;
-  } else {
-    add_proto qw/void av1_idct4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
-    specialize qw/av1_idct4x4_1_add sse2/;
-
-    add_proto qw/void av1_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
-    specialize qw/av1_idct4x4_16_add sse2/;
-
-    add_proto qw/void av1_idct8x8_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
-    specialize qw/av1_idct8x8_1_add sse2/;
-
-    add_proto qw/void av1_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
-    specialize qw/av1_idct8x8_64_add sse2/;
-
-    add_proto qw/void av1_idct8x8_12_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
-    specialize qw/av1_idct8x8_12_add sse2/;
-
-    add_proto qw/void av1_idct16x16_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
-    specialize qw/av1_idct16x16_1_add sse2/;
-
-    add_proto qw/void av1_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
-    specialize qw/av1_idct16x16_256_add sse2/;
-
-    add_proto qw/void av1_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
-    specialize qw/av1_idct16x16_10_add sse2/;
-
-    add_proto qw/void av1_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
-    specialize qw/av1_idct32x32_1024_add sse2/;
-
-    add_proto qw/void av1_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
-    specialize qw/av1_idct32x32_34_add sse2/;
-
-    add_proto qw/void av1_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
-    specialize qw/av1_idct32x32_1_add sse2/;
-
-    add_proto qw/void av1_iwht4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
-    specialize qw/av1_iwht4x4_1_add/;
-
-    add_proto qw/void av1_iwht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
-    specialize qw/av1_iwht4x4_16_add/;
-  }  # CONFIG_EMULATE_HARDWARE
-}  # CONFIG_AOM_HIGHBITDEPTH
-
 if (aom_config("CONFIG_AOM_HIGHBITDEPTH") eq "yes") {
   #fwd txfm
   add_proto qw/void av1_fwd_txfm2d_4x4/, "const int16_t *input, int32_t *output, int stride, int tx_type, int bd";
@@ -826,6 +531,11 @@
   add_proto qw/void av1_highbd_fht32x32/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
   specialize qw/av1_highbd_fht32x32/;
 
+  if (aom_config("CONFIG_TX64X64") eq "yes") {
+    add_proto qw/void av1_highbd_fht64x64/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
+    specialize qw/av1_highbd_fht64x64/;
+  }
+
   add_proto qw/void av1_highbd_fwht4x4/, "const int16_t *input, tran_low_t *output, int stride";
   specialize qw/av1_highbd_fwht4x4/;
 
diff --git a/av1/common/blockd.h b/av1/common/blockd.h
index 8d7c7f8..4d8f5e2 100644
--- a/av1/common/blockd.h
+++ b/av1/common/blockd.h
@@ -443,18 +443,6 @@
 }
 #endif  // CONFIG_SUPERTX
 
-static INLINE int get_tx1d_width(TX_SIZE tx_size) {
-  return num_4x4_blocks_wide_txsize_lookup[tx_size] << 2;
-}
-
-static INLINE int get_tx1d_height(TX_SIZE tx_size) {
-  return num_4x4_blocks_high_txsize_lookup[tx_size] << 2;
-}
-
-static INLINE int get_tx2d_size(TX_SIZE tx_size) {
-  return num_4x4_blocks_txsize_lookup[tx_size] << 4;
-}
-
 #if CONFIG_EXT_TX
 #define ALLOW_INTRA_EXT_TX 1
 
@@ -712,6 +700,14 @@
 
 void av1_setup_block_planes(MACROBLOCKD *xd, int ss_x, int ss_y);
 
+static INLINE int tx_size_to_depth(const TX_SIZE tx_size) {
+  return (int)(tx_size - TX_4X4);
+}
+
+static INLINE TX_SIZE depth_to_tx_size(const int depth) {
+  return (TX_SIZE)(depth + TX_4X4);
+}
+
 static INLINE TX_SIZE get_uv_tx_size(const MB_MODE_INFO *mbmi,
                                      const struct macroblockd_plane *pd) {
   TX_SIZE uv_txsize;
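The tx_size_to_depth/depth_to_tx_size helpers added to blockd.h above are inverse mappings over the square transform sizes, with depth 0 at TX_4X4. A quick usage sketch (assuming the contiguous TX_4X4 < TX_8X8 < TX_16X16 < TX_32X32 enum ordering; the check function is hypothetical):

#include <assert.h>
#include "av1/common/blockd.h"

static void tx_depth_sanity_check(void) {
  /* depth counts size steps above TX_4X4... */
  assert(tx_size_to_depth(TX_4X4) == 0);
  assert(tx_size_to_depth(TX_16X16) == 2);
  /* ...and the two helpers round-trip. */
  assert(depth_to_tx_size(tx_size_to_depth(TX_32X32)) == TX_32X32);
}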
diff --git a/av1/common/common_data.h b/av1/common/common_data.h
index 9d851e2..4f799db 100644
--- a/av1/common/common_data.h
+++ b/av1/common/common_data.h
@@ -67,44 +67,6 @@
   1, 1, 1, 1, 1, 1, 1, 2, 1, 2, 4, 2, 4, IF_EXT_PARTITION(8, 4, 8)
 };
 
-static const uint8_t num_4x4_blocks_txsize_lookup[TX_SIZES_ALL] = {
-  1, 4, 16, 64,
-#if CONFIG_EXT_TX
-  2, 2, 8,  8,  32, 32
-#endif  // CONFIG_EXT_TX
-};
-static const uint8_t num_4x4_blocks_wide_txsize_lookup[TX_SIZES_ALL] = {
-  1, 2, 4, 8,
-#if CONFIG_EXT_TX
-  1, 2, 2, 4, 4, 8
-#endif  // CONFIG_EXT_TX
-};
-static const uint8_t num_4x4_blocks_high_txsize_lookup[TX_SIZES_ALL] = {
-  1, 2, 4, 8,
-#if CONFIG_EXT_TX
-  2, 1, 4, 2, 8, 4
-#endif  // CONFIG_EXT_TX
-};
-
-static const uint8_t num_4x4_blocks_txsize_log2_lookup[TX_SIZES_ALL] = {
-  0, 2, 4, 6,
-#if CONFIG_EXT_TX
-  1, 1, 3, 3, 5, 5
-#endif  // CONFIG_EXT_TX
-};
-static const uint8_t num_4x4_blocks_wide_txsize_log2_lookup[TX_SIZES_ALL] = {
-  0, 1, 2, 3,
-#if CONFIG_EXT_TX
-  0, 1, 1, 2, 2, 3
-#endif  // CONFIG_EXT_TX
-};
-static const uint8_t num_4x4_blocks_high_txsize_log2_lookup[TX_SIZES_ALL] = {
-  0, 1, 2, 3,
-#if CONFIG_EXT_TX
-  1, 0, 2, 1, 3, 2
-#endif  // CONFIG_EXT_TX
-};
-
 // AOMMIN(3, AOMMIN(b_width_log2(bsize), b_height_log2(bsize)))
 static const uint8_t size_group_lookup[BLOCK_SIZES] = {
   0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3, IF_EXT_PARTITION(3, 3, 3)
@@ -419,6 +381,9 @@
 /* clang-format on */
 
 static const TX_SIZE txsize_horz_map[TX_SIZES_ALL] = {
+#if CONFIG_CB4X4
+  TX_2X2,  // TX_2X2
+#endif
   TX_4X4,    // TX_4X4
   TX_8X8,    // TX_8X8
   TX_16X16,  // TX_16X16
@@ -434,6 +399,9 @@
 };
 
 static const TX_SIZE txsize_vert_map[TX_SIZES_ALL] = {
+#if CONFIG_CB4X4
+  TX_2X2,  // TX_2X2
+#endif
   TX_4X4,    // TX_4X4
   TX_8X8,    // TX_8X8
   TX_16X16,  // TX_16X16
@@ -450,6 +418,9 @@
 
 // Transform block width in pixels
 static const int tx_size_wide[TX_SIZES_ALL] = {
+#if CONFIG_CB4X4
+  2,
+#endif
   4, 8, 16, 32,
 #if CONFIG_EXT_TX
   4, 8, 8,  16, 16, 32,
@@ -458,6 +429,9 @@
 
 // Transform block height in pixels
 static const int tx_size_high[TX_SIZES_ALL] = {
+#if CONFIG_CB4X4
+  2,
+#endif
   4, 8, 16, 32,
 #if CONFIG_EXT_TX
   8, 4, 16, 8,  32, 16,
@@ -466,6 +440,9 @@
 
 // Transform block width in unit
 static const int tx_size_wide_unit[TX_SIZES_ALL] = {
+#if CONFIG_CB4X4
+  1,
+#endif
   1, 2, 4, 8,
 #if CONFIG_EXT_TX
   1, 2, 2, 4, 4, 8,
@@ -474,6 +451,9 @@
 
 // Transform block height in unit
 static const int tx_size_high_unit[TX_SIZES_ALL] = {
+#if CONFIG_CB4X4
+  1,
+#endif
   1, 2, 4, 8,
 #if CONFIG_EXT_TX
   2, 1, 4, 2, 8, 4,
@@ -482,6 +462,9 @@
 
 // Transform block width in log2
 static const int tx_size_wide_log2[TX_SIZES_ALL] = {
+#if CONFIG_CB4X4
+  2,
+#endif
   2, 3, 4, 5,
 #if CONFIG_EXT_TX
   2, 3, 3, 4, 4, 5,
@@ -490,6 +473,9 @@
 
 // Transform block height in log2
 static const int tx_size_high_log2[TX_SIZES_ALL] = {
+#if CONFIG_CB4X4
+  2,
+#endif
   2, 3, 4, 5,
 #if CONFIG_EXT_TX
   3, 2, 4, 3, 5, 4,
@@ -497,6 +483,9 @@
 };
 
 static const int tx_size_2d[TX_SIZES_ALL] = {
+#if CONFIG_CB4X4
+  4,
+#endif
   16, 64, 256, 1024,
 #if CONFIG_EXT_TX
   32, 32, 128, 128,  512, 512,
@@ -509,6 +498,9 @@
 static const int tx_size_1d_in_unit_log2[TX_SIZES] = { 0, 1, 2, 3 };
 
 static const BLOCK_SIZE txsize_to_bsize[TX_SIZES_ALL] = {
+#if CONFIG_CB4X4
+  BLOCK_4X4,  // TX_2X2
+#endif
   BLOCK_4X4,    // TX_4X4
   BLOCK_8X8,    // TX_8X8
   BLOCK_16X16,  // TX_16X16
@@ -524,6 +516,9 @@
 };
 
 static const TX_SIZE txsize_sqr_map[TX_SIZES_ALL] = {
+#if CONFIG_CB4X4
+  TX_2X2,  // TX_2X2
+#endif
   TX_4X4,    // TX_4X4
   TX_8X8,    // TX_8X8
   TX_16X16,  // TX_16X16
@@ -539,6 +534,9 @@
 };
 
 static const TX_SIZE txsize_sqr_up_map[TX_SIZES_ALL] = {
+#if CONFIG_CB4X4
+  TX_2X2,  // TX_2X2
+#endif
   TX_4X4,    // TX_4X4
   TX_8X8,    // TX_8X8
   TX_16X16,  // TX_16X16
@@ -589,7 +587,10 @@
   //  ss_x == 0    ss_x == 0        ss_x == 1      ss_x == 1
   //  ss_y == 0    ss_y == 1        ss_y == 0      ss_y == 1
   {
-      // BLOCK_4X4
+// BLOCK_4X4
+#if CONFIG_CB4X4
+      { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
+#endif
       { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
       { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
       { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
@@ -604,7 +605,10 @@
 #endif  // CONFIG_EXT_TX
   },
   {
-      // BLOCK_4X8
+// BLOCK_4X8
+#if CONFIG_CB4X4
+      { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
+#endif
       { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
       { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
       { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
@@ -623,7 +627,10 @@
 #endif  // CONFIG_EXT_TX
   },
   {
-      // BLOCK_8X4
+// BLOCK_8X4
+#if CONFIG_CB4X4
+      { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
+#endif
       { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
       { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
       { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
@@ -642,7 +649,10 @@
 #endif  // CONFIG_EXT_TX
   },
   {
-      // BLOCK_8X8
+// BLOCK_8X8
+#if CONFIG_CB4X4
+      { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
+#endif
       { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
       { { TX_8X8, TX_4X4 }, { TX_4X4, TX_4X4 } },
       { { TX_8X8, TX_8X8 }, { TX_4X4, TX_4X4 } },
@@ -657,7 +667,10 @@
 #endif  // CONFIG_EXT_TX
   },
   {
-      // BLOCK_8X16
+// BLOCK_8X16
+#if CONFIG_CB4X4
+      { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
+#endif
       { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
       { { TX_8X8, TX_8X8 }, { TX_4X4, TX_4X4 } },
       { { TX_8X8, TX_8X8 }, { TX_4X4, TX_4X4 } },
@@ -676,7 +689,10 @@
 #endif  // CONFIG_EXT_TX
   },
   {
-      // BLOCK_16X8
+// BLOCK_16X8
+#if CONFIG_CB4X4
+      { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
+#endif
       { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
       { { TX_8X8, TX_4X4 }, { TX_8X8, TX_4X4 } },
       { { TX_8X8, TX_4X4 }, { TX_8X8, TX_8X8 } },
@@ -695,7 +711,10 @@
 #endif  // CONFIG_EXT_TX
   },
   {
-      // BLOCK_16X16
+// BLOCK_16X16
+#if CONFIG_CB4X4
+      { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
+#endif
       { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
       { { TX_8X8, TX_8X8 }, { TX_8X8, TX_8X8 } },
       { { TX_16X16, TX_8X8 }, { TX_8X8, TX_8X8 } },
@@ -710,7 +729,10 @@
 #endif  // CONFIG_EXT_TX
   },
   {
-      // BLOCK_16X32
+// BLOCK_16X32
+#if CONFIG_CB4X4
+      { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
+#endif
       { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
       { { TX_8X8, TX_8X8 }, { TX_8X8, TX_8X8 } },
       { { TX_16X16, TX_16X16 }, { TX_8X8, TX_8X8 } },
@@ -729,7 +751,10 @@
 #endif  // CONFIG_EXT_TX
   },
   {
-      // BLOCK_32X16
+// BLOCK_32X16
+#if CONFIG_CB4X4
+      { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
+#endif
       { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
       { { TX_8X8, TX_8X8 }, { TX_8X8, TX_8X8 } },
       { { TX_16X16, TX_8X8 }, { TX_16X16, TX_8X8 } },
@@ -748,7 +773,10 @@
 #endif  // CONFIG_EXT_TX
   },
   {
-      // BLOCK_32X32
+// BLOCK_32X32
+#if CONFIG_CB4X4
+      { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
+#endif
       { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
       { { TX_8X8, TX_8X8 }, { TX_8X8, TX_8X8 } },
       { { TX_16X16, TX_16X16 }, { TX_16X16, TX_16X16 } },
@@ -763,7 +791,10 @@
 #endif  // CONFIG_EXT_TX
   },
   {
-      // BLOCK_32X64
+// BLOCK_32X64
+#if CONFIG_CB4X4
+      { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
+#endif
       { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
       { { TX_8X8, TX_8X8 }, { TX_8X8, TX_8X8 } },
       { { TX_16X16, TX_16X16 }, { TX_16X16, TX_16X16 } },
@@ -778,7 +809,10 @@
 #endif  // CONFIG_EXT_TX
   },
   {
-      // BLOCK_64X32
+// BLOCK_64X32
+#if CONFIG_CB4X4
+      { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
+#endif
       { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
       { { TX_8X8, TX_8X8 }, { TX_8X8, TX_8X8 } },
       { { TX_16X16, TX_16X16 }, { TX_16X16, TX_16X16 } },
@@ -793,7 +827,10 @@
 #endif  // CONFIG_EXT_TX
   },
   {
-      // BLOCK_64X64
+// BLOCK_64X64
+#if CONFIG_CB4X4
+      { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
+#endif
       { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
       { { TX_8X8, TX_8X8 }, { TX_8X8, TX_8X8 } },
       { { TX_16X16, TX_16X16 }, { TX_16X16, TX_16X16 } },
@@ -801,7 +838,10 @@
 #if CONFIG_EXT_PARTITION
   },
   {
-      // BLOCK_64X128
+// BLOCK_64X128
+#if CONFIG_CB4X4
+      { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
+#endif
       { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
       { { TX_8X8, TX_8X8 }, { TX_8X8, TX_8X8 } },
       { { TX_16X16, TX_16X16 }, { TX_16X16, TX_16X16 } },
@@ -816,7 +856,10 @@
 #endif  // CONFIG_EXT_TX
   },
   {
-      // BLOCK_128X64
+// BLOCK_128X64
+#if CONFIG_CB4X4
+      { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
+#endif
       { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
       { { TX_8X8, TX_8X8 }, { TX_8X8, TX_8X8 } },
       { { TX_16X16, TX_16X16 }, { TX_16X16, TX_16X16 } },
@@ -831,7 +874,10 @@
 #endif  // CONFIG_EXT_TX
   },
   {
-      // BLOCK_128X128
+// BLOCK_128X128
+#if CONFIG_CB4X4
+      { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
+#endif
       { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
       { { TX_8X8, TX_8X8 }, { TX_8X8, TX_8X8 } },
       { { TX_16X16, TX_16X16 }, { TX_16X16, TX_16X16 } },
@@ -891,8 +937,11 @@
 
 #if CONFIG_SUPERTX
 static const TX_SIZE uvsupertx_size_lookup[TX_SIZES][2][2] = {
-  //  ss_x == 0 ss_x == 0   ss_x == 1 ss_x == 1
-  //  ss_y == 0 ss_y == 1   ss_y == 0 ss_y == 1
+//  ss_x == 0 ss_x == 0   ss_x == 1 ss_x == 1
+//  ss_y == 0 ss_y == 1   ss_y == 0 ss_y == 1
+#if CONFIG_CB4X4
+  { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
+#endif
   { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
   { { TX_8X8, TX_4X4 }, { TX_4X4, TX_4X4 } },
   { { TX_16X16, TX_8X8 }, { TX_8X8, TX_8X8 } },
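With the num_4x4_* lookup tables removed, callers are expected to read block
dimensions straight from the pixel-unit tables added above. A hedged sketch of
the correspondence (the helper names below are hypothetical; the patch itself
only provides the tables):

// Illustration only: how the removed blockd.h helpers map onto the new
// common_data.h tables. With CONFIG_CB4X4, every TX_SIZES_ALL table gains a
// leading TX_2X2 entry, so indexing must always go through the TX_SIZE enum
// rather than hard-coded integers.
static INLINE int tx1d_width_sketch(TX_SIZE tx_size) {
  return tx_size_wide[tx_size];  // was: num_4x4_blocks_wide_txsize_lookup << 2
}
static INLINE int tx1d_height_sketch(TX_SIZE tx_size) {
  return tx_size_high[tx_size];  // was: num_4x4_blocks_high_txsize_lookup << 2
}
static INLINE int tx2d_size_sketch(TX_SIZE tx_size) {
  return tx_size_2d[tx_size];  // was: num_4x4_blocks_txsize_lookup << 4
}
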
diff --git a/av1/common/av1_convolve.c b/av1/common/convolve.c
similarity index 95%
rename from av1/common/av1_convolve.c
rename to av1/common/convolve.c
index 1f8d623..eef629e 100644
--- a/av1/common/av1_convolve.c
+++ b/av1/common/convolve.c
@@ -1,8 +1,19 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
 #include <assert.h>
 #include <string.h>
 
 #include "./av1_rtcd.h"
-#include "av1/common/av1_convolve.h"
+#include "av1/common/convolve.h"
 #include "av1/common/filter.h"
 #include "aom_dsp/aom_dsp_common.h"
 #include "aom_ports/mem.h"
diff --git a/av1/common/av1_convolve.h b/av1/common/convolve.h
similarity index 67%
rename from av1/common/av1_convolve.h
rename to av1/common/convolve.h
index 804c102..dafa032 100644
--- a/av1/common/av1_convolve.h
+++ b/av1/common/convolve.h
@@ -1,3 +1,14 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
 #ifndef AV1_COMMON_AV1_CONVOLVE_H_
 #define AV1_COMMON_AV1_CONVOLVE_H_
 #include "av1/common/filter.h"
diff --git a/av1/common/dering.c b/av1/common/dering.c
index c21d4e5..908c588 100644
--- a/av1/common/dering.c
+++ b/av1/common/dering.c
@@ -9,6 +9,8 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
+// clang-format off
+
 #include <string.h>
 #include <math.h>
 
@@ -45,22 +47,94 @@
   return skip;
 }
 
+// Builds the list of non-skip blocks in the superblock at (mi_row, mi_col):
+// records each block's (row, col) offset in bskip and the total in *count_ptr.
+// Returns 1 iff every block in the superblock is skipped.
+int sb_all_skip_out(const AV1_COMMON *const cm, int mi_row, int mi_col,
+    unsigned char (*bskip)[2], int *count_ptr) {
+  int r, c;
+  int maxc, maxr;
+  int skip = 1;
+  MODE_INFO **grid;
+  int count = 0;
+  grid = cm->mi_grid_visible;
+  maxc = cm->mi_cols - mi_col;
+  maxr = cm->mi_rows - mi_row;
+  if (maxr > MAX_MIB_SIZE) maxr = MAX_MIB_SIZE;
+  if (maxc > MAX_MIB_SIZE) maxc = MAX_MIB_SIZE;
+  for (r = 0; r < maxr; r++) {
+    MODE_INFO **grid_row;
+    grid_row = &grid[(mi_row + r) * cm->mi_stride + mi_col];
+    for (c = 0; c < maxc; c++) {
+      if (!grid_row[c]->mbmi.skip) {
+        skip = 0;
+        bskip[count][0] = r;
+        bskip[count][1] = c;
+        count++;
+      }
+    }
+  }
+  *count_ptr = count;
+  return skip;
+}
+
+// Copies an 8x8 block of 16-bit samples into an 8-bit destination buffer.
+static INLINE void copy_8x8_16_8bit(uint8_t *dst, int dstride, int16_t *src,
+                                    int sstride) {
+  int i, j;
+  for (i = 0; i < 8; i++)
+    for (j = 0; j < 8; j++)
+      dst[i * dstride + j] = src[i * sstride + j];
+}
+
+// Copies a 4x4 block of 16-bit samples into an 8-bit destination buffer.
+static INLINE void copy_4x4_16_8bit(uint8_t *dst, int dstride, int16_t *src,
+                                    int sstride) {
+  int i, j;
+  for (i = 0; i < 4; i++)
+    for (j = 0; j < 4; j++)
+      dst[i * dstride + j] = src[i * sstride + j];
+}
+
+/* TODO: Optimize this function for SSE. */
+// Copies only the de-ringed blocks listed in bskip back into the 8-bit frame:
+// bsize == 3 copies 8x8 blocks, any other value copies 4x4 blocks.
+void copy_blocks_16_8bit(uint8_t *dst, int dstride, int16_t *src, int sstride,
+    unsigned char (*bskip)[2], int dering_count, int bsize) {
+  int bi, bx, by;
+  if (bsize == 3) {
+    for (bi = 0; bi < dering_count; bi++) {
+      by = bskip[bi][0];
+      bx = bskip[bi][1];
+      copy_8x8_16_8bit(&dst[(by << 3) * dstride + (bx << 3)],
+                     dstride,
+                     &src[(by << 3) * sstride + (bx << 3)], sstride);
+    }
+  } else {
+    for (bi = 0; bi < dering_count; bi++) {
+      by = bskip[bi][0];
+      bx = bskip[bi][1];
+      copy_4x4_16_8bit(&dst[(by << 2) * dstride + (bx << 2)],
+                     dstride,
+                     &src[(by << 2) * sstride + (bx << 2)], sstride);
+    }
+  }
+}
+
 void av1_dering_frame(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm,
                       MACROBLOCKD *xd, int global_level) {
   int r, c;
   int sbr, sbc;
   int nhsb, nvsb;
   od_dering_in *src[3];
-  unsigned char *bskip;
+  unsigned char bskip[MAX_MIB_SIZE * MAX_MIB_SIZE][2];
+  int dering_count;
   int dir[OD_DERING_NBLOCKS][OD_DERING_NBLOCKS] = { { 0 } };
   int stride;
   int bsize[3];
   int dec[3];
   int pli;
   int coeff_shift = AOMMAX(cm->bit_depth - 8, 0);
+  int nplanes;
+  // De-ring the chroma planes only when their horizontal and vertical
+  // subsampling factors match (e.g. 4:2:0 or 4:4:4); otherwise luma only.
+  if (xd->plane[1].subsampling_x == xd->plane[1].subsampling_y &&
+      xd->plane[2].subsampling_x == xd->plane[2].subsampling_y)
+    nplanes = 3;
+  else
+    nplanes = 1;
   nvsb = (cm->mi_rows + MAX_MIB_SIZE - 1) / MAX_MIB_SIZE;
   nhsb = (cm->mi_cols + MAX_MIB_SIZE - 1) / MAX_MIB_SIZE;
-  bskip = aom_malloc(sizeof(*bskip) * cm->mi_rows * cm->mi_cols);
   av1_setup_dst_planes(xd->plane, frame, 0, 0);
   for (pli = 0; pli < 3; pli++) {
     dec[pli] = xd->plane[pli].subsampling_x;
@@ -85,13 +159,6 @@
       }
     }
   }
-  for (r = 0; r < cm->mi_rows; ++r) {
-    for (c = 0; c < cm->mi_cols; ++c) {
-      const MB_MODE_INFO *mbmi =
-          &cm->mi_grid_visible[r * cm->mi_stride + c]->mbmi;
-      bskip[r * cm->mi_cols + c] = mbmi->skip;
-    }
-  }
   for (sbr = 0; sbr < nvsb; sbr++) {
     for (sbc = 0; sbc < nhsb; sbc++) {
       int level;
@@ -102,9 +169,10 @@
           global_level, cm->mi_grid_visible[MAX_MIB_SIZE * sbr * cm->mi_stride +
                                             MAX_MIB_SIZE * sbc]
                             ->mbmi.dering_gain);
-      if (level == 0 || sb_all_skip(cm, sbr * MAX_MIB_SIZE, sbc * MAX_MIB_SIZE))
+      if (level == 0 ||
+          sb_all_skip_out(cm, sbr * MAX_MIB_SIZE, sbc * MAX_MIB_SIZE,
+                          bskip, &dering_count))
         continue;
-      for (pli = 0; pli < 3; pli++) {
+      for (pli = 0; pli < nplanes; pli++) {
         int16_t dst[MAX_MIB_SIZE * MAX_MIB_SIZE * 8 * 8];
         int threshold;
         /* FIXME: This is a temporary hack that uses more conservative
@@ -118,33 +186,31 @@
                   &src[pli][sbr * stride * bsize[pli] * MAX_MIB_SIZE +
                             sbc * bsize[pli] * MAX_MIB_SIZE],
                   stride, nhb, nvb, sbc, sbr, nhsb, nvsb, dec[pli], dir, pli,
-                  &bskip[MAX_MIB_SIZE * sbr * cm->mi_cols + MAX_MIB_SIZE * sbc],
-                  cm->mi_cols, threshold, coeff_shift);
-        for (r = 0; r < bsize[pli] * nvb; ++r) {
-          for (c = 0; c < bsize[pli] * nhb; ++c) {
+                  bskip, dering_count, threshold, coeff_shift);
 #if CONFIG_AOM_HIGHBITDEPTH
-            if (cm->use_highbitdepth) {
-              CONVERT_TO_SHORTPTR(xd->plane[pli].dst.buf)
-              [xd->plane[pli].dst.stride *
-                   (bsize[pli] * MAX_MIB_SIZE * sbr + r) +
-               sbc * bsize[pli] * MAX_MIB_SIZE + c] =
-                  dst[r * MAX_MIB_SIZE * bsize[pli] + c];
-            } else {
+        if (cm->use_highbitdepth) {
+          copy_blocks_16bit(
+              (int16_t*)&CONVERT_TO_SHORTPTR(
+                  xd->plane[pli].dst.buf)[xd->plane[pli].dst.stride *
+                  (bsize[pli] * MAX_MIB_SIZE * sbr) +
+                  sbc * bsize[pli] * MAX_MIB_SIZE],
+              xd->plane[pli].dst.stride, dst, MAX_MIB_SIZE * bsize[pli], bskip,
+              dering_count, 3 - dec[pli]);
+        } else {
 #endif
-              xd->plane[pli].dst.buf[xd->plane[pli].dst.stride *
-                                         (bsize[pli] * MAX_MIB_SIZE * sbr + r) +
-                                     sbc * bsize[pli] * MAX_MIB_SIZE + c] =
-                  dst[r * MAX_MIB_SIZE * bsize[pli] + c];
+          copy_blocks_16_8bit(
+              &xd->plane[pli].dst.buf[xd->plane[pli].dst.stride *
+                                    (bsize[pli] * MAX_MIB_SIZE * sbr) +
+                                    sbc * bsize[pli] * MAX_MIB_SIZE],
+              xd->plane[pli].dst.stride, dst, MAX_MIB_SIZE * bsize[pli], bskip,
+              dering_count, 3 - dec[pli]);
 #if CONFIG_AOM_HIGHBITDEPTH
-            }
-#endif
-          }
         }
+#endif
       }
     }
   }
-  for (pli = 0; pli < 3; pli++) {
+  for (pli = 0; pli < nplanes; pli++) {
     aom_free(src[pli]);
   }
-  aom_free(bskip);
 }
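The rewritten loop above replaces the full-frame skip map with a
per-superblock list: sb_all_skip_out records the offsets of the non-skip
blocks once, and the copy_blocks_* helpers then touch only those blocks
instead of iterating over every pixel. A simplified sketch of the consumer
side (the function name and fixed 8x8 block size here are illustrative):

// Sketch: consume the (row, col) list produced by sb_all_skip_out(). Only
// blocks that were actually de-ringed are copied back to the frame buffer.
void copy_listed_blocks_sketch(uint8_t *dst, int dstride, const int16_t *src,
                               int sstride, unsigned char (*bskip)[2],
                               int dering_count) {
  int bi;
  for (bi = 0; bi < dering_count; bi++) {
    const int by = bskip[bi][0], bx = bskip[bi][1];
    int i, j;
    for (i = 0; i < 8; i++)
      for (j = 0; j < 8; j++)
        dst[((by << 3) + i) * dstride + (bx << 3) + j] =
            (uint8_t)src[((by << 3) + i) * sstride + (bx << 3) + j];
  }
}
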
diff --git a/av1/common/dering.h b/av1/common/dering.h
index 7c93f8b..c906994 100644
--- a/av1/common/dering.h
+++ b/av1/common/dering.h
@@ -11,6 +11,8 @@
 #ifndef AV1_COMMON_DERING_H_
 #define AV1_COMMON_DERING_H_
 
+// clang-format off
+
 #include "av1/common/od_dering.h"
 #include "av1/common/onyxc_int.h"
 #include "aom/aom_integer.h"
@@ -29,6 +31,8 @@
 
 int compute_level_from_index(int global_level, int gi);
 int sb_all_skip(const AV1_COMMON *const cm, int mi_row, int mi_col);
+int sb_all_skip_out(const AV1_COMMON *const cm, int mi_row, int mi_col,
+    unsigned char (*bskip)[2], int *count_ptr);
 void av1_dering_frame(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm,
                       MACROBLOCKD *xd, int global_level);
 
diff --git a/av1/common/entropy.c b/av1/common/entropy.c
index 870632d..7e66e93 100644
--- a/av1/common/entropy.c
+++ b/av1/common/entropy.c
@@ -9,11 +9,12 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
+#include "./aom_config.h"
 #include "av1/common/entropy.h"
 #include "av1/common/blockd.h"
 #include "av1/common/onyxc_int.h"
-#include "av1/common/scan.h"
 #include "av1/common/entropymode.h"
+#include "av1/common/scan.h"
 #include "aom_mem/aom_mem.h"
 #include "aom/aom_integer.h"
 
@@ -58,6 +59,9 @@
 #endif
 
 const uint16_t band_count_table[TX_SIZES_ALL][8] = {
+#if CONFIG_CB4X4
+  { 1, 2, 2, 3, 0, 0, 0 },
+#endif
   { 1, 2, 3, 4, 3, 16 - 13, 0 },   { 1, 2, 3, 4, 11, 64 - 21, 0 },
   { 1, 2, 3, 4, 11, 256 - 21, 0 }, { 1, 2, 3, 4, 11, 1024 - 21, 0 },
 #if CONFIG_EXT_TX
@@ -68,6 +72,9 @@
 };
 
 const uint16_t band_cum_count_table[TX_SIZES_ALL][8] = {
+#if CONFIG_CB4X4
+  { 0, 1, 3, 6, 10, 13, 16, 0 },
+#endif
   { 0, 1, 3, 6, 10, 13, 16, 0 },  { 0, 1, 3, 6, 10, 21, 64, 0 },
   { 0, 1, 3, 6, 10, 21, 256, 0 }, { 0, 1, 3, 6, 10, 21, 1024, 0 },
 #if CONFIG_EXT_TX
@@ -407,7 +414,7 @@
   { 255, 246, 247, 255, 239, 255, 253, 255 },
 };
 
-#if CONFIG_RANS || CONFIG_DAALA_EC
+#if CONFIG_EC_MULTISYMBOL
 // Model obtained from a 2-sided zero-centered distribution derived
 // from a Pareto distribution. The cdf of the distribution is:
 // cdf(x) = 0.5 + 0.5 * sgn(x) * [1 - {alpha/(alpha + |x|)} ^ beta]
@@ -679,7 +686,7 @@
       { 32512, 238, 11, 1, 1, 1, 1, 1, 1, 1 },
       { 32640, 117, 4, 1, 1, 1, 1, 1, 1, 1 },
     };
-#endif  // CONFIG_RANS
+#endif  // CONFIG_EC_MULTISYMBOL
 
 /* clang-format off */
 #if CONFIG_ENTROPY
@@ -2833,6 +2840,9 @@
       ROUND_POWER_OF_TWO(cm->base_qindex, 8 - QCTX_BIN_BITS), QCTX_BINS - 1);
   av1_copy(cm->fc->coef_probs, default_qctx_coef_probs[index]);
 #else
+#if CONFIG_CB4X4
+  av1_copy(cm->fc->coef_probs[TX_2X2], default_coef_probs_4x4);
+#endif
   av1_copy(cm->fc->coef_probs[TX_4X4], default_coef_probs_4x4);
   av1_copy(cm->fc->coef_probs[TX_8X8], default_coef_probs_8x8);
   av1_copy(cm->fc->coef_probs[TX_16X16], default_coef_probs_16x16);
@@ -2913,11 +2923,8 @@
     count_sat = COEF_COUNT_SAT;
   }
 #endif  // CONFIG_ENTROPY
-  for (tx_size = TX_4X4; tx_size <= TX_32X32; tx_size++)
+  for (tx_size = 0; tx_size < TX_SIZES; tx_size++)
     adapt_coef_probs(cm, tx_size, count_sat, update_factor);
-#if CONFIG_RANS
-  av1_coef_pareto_cdfs(cm->fc);
-#endif  // CONFIG_RANS
 
 #if CONFIG_ADAPT_SCAN
   for (tx_size = TX_4X4; tx_size < TX_SIZES; ++tx_size)
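The token-probability model that the CONFIG_EC_MULTISYMBOL block above guards
is built from the two-sided Pareto-derived cdf quoted in the entropy.c
comment, cdf(x) = 0.5 + 0.5 * sgn(x) * [1 - (alpha/(alpha + |x|))^beta]. A
small numeric sketch of that formula (the alpha and beta arguments are
placeholders, not the fitted constants the library's tables were derived
from):

#include <math.h>

// Sketch: evaluate the two-sided, zero-centered Pareto-style cdf from the
// entropy.c comment. alpha and beta are illustrative inputs only.
static double pareto_cdf_sketch(double x, double alpha, double beta) {
  const double sgn = (x > 0) - (x < 0);
  return 0.5 + 0.5 * sgn * (1.0 - pow(alpha / (alpha + fabs(x)), beta));
}
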
diff --git a/av1/common/entropy.h b/av1/common/entropy.h
index 469b484..423b35c 100644
--- a/av1/common/entropy.h
+++ b/av1/common/entropy.h
@@ -14,9 +14,6 @@
 
 #include "./aom_config.h"
 #include "aom/aom_integer.h"
-#if CONFIG_RANS
-#include "aom_dsp/ans.h"
-#endif  // CONFIG_RANS
 #include "aom_dsp/prob.h"
 
 #include "av1/common/common.h"
@@ -200,14 +197,14 @@
 
 void av1_model_to_full_probs(const aom_prob *model, aom_prob *full);
 
-#if CONFIG_RANS || CONFIG_DAALA_EC
+#if CONFIG_EC_MULTISYMBOL
 typedef aom_cdf_prob coeff_cdf_model[REF_TYPES][COEF_BANDS][COEFF_CONTEXTS]
                                     [ENTROPY_TOKENS];
 extern const aom_cdf_prob av1_pareto8_token_probs[COEFF_PROB_MODELS]
                                                  [ENTROPY_TOKENS - 2];
 struct frame_contexts;
 void av1_coef_pareto_cdfs(struct frame_contexts *fc);
-#endif  // CONFIG_RANS
+#endif  // CONFIG_EC_MULTISYMBOL
 
 typedef char ENTROPY_CONTEXT;
 
diff --git a/av1/common/entropymode.c b/av1/common/entropymode.c
index ecad3f4..910ba0f 100644
--- a/av1/common/entropymode.c
+++ b/av1/common/entropymode.c
@@ -561,280 +561,348 @@
         -PALETTE_COLOR_SEVEN, -PALETTE_COLOR_EIGHT },
     };
 
-const aom_prob
-    av1_default_palette_y_color_prob[PALETTE_MAX_SIZE - 1]
-                                    [PALETTE_COLOR_CONTEXTS]
-                                    [PALETTE_COLORS - 1] = {
-                                      {
-                                          // 2 colors
-                                          { 230, 255, 128, 128, 128, 128, 128 },
-                                          { 214, 255, 128, 128, 128, 128, 128 },
-                                          { 128, 128, 128, 128, 128, 128, 128 },
-                                          { 128, 128, 128, 128, 128, 128, 128 },
-                                          { 128, 128, 128, 128, 128, 128, 128 },
-                                          { 240, 255, 128, 128, 128, 128, 128 },
-                                          { 73, 255, 128, 128, 128, 128, 128 },
-                                          { 128, 128, 128, 128, 128, 128, 128 },
-                                          { 130, 255, 128, 128, 128, 128, 128 },
-                                          { 227, 255, 128, 128, 128, 128, 128 },
-                                          { 128, 128, 128, 128, 128, 128, 128 },
-                                          { 188, 255, 128, 128, 128, 128, 128 },
-                                          { 75, 255, 128, 128, 128, 128, 128 },
-                                          { 250, 255, 128, 128, 128, 128, 128 },
-                                          { 223, 255, 128, 128, 128, 128, 128 },
-                                          { 252, 255, 128, 128, 128, 128, 128 },
-                                      },
-                                      {
-                                          // 3 colors
-                                          { 229, 137, 255, 128, 128, 128, 128 },
-                                          { 197, 120, 255, 128, 128, 128, 128 },
-                                          { 107, 195, 255, 128, 128, 128, 128 },
-                                          { 128, 128, 128, 128, 128, 128, 128 },
-                                          { 27, 151, 255, 128, 128, 128, 128 },
-                                          { 230, 130, 255, 128, 128, 128, 128 },
-                                          { 37, 230, 255, 128, 128, 128, 128 },
-                                          { 67, 221, 255, 128, 128, 128, 128 },
-                                          { 124, 230, 255, 128, 128, 128, 128 },
-                                          { 195, 109, 255, 128, 128, 128, 128 },
-                                          { 99, 122, 255, 128, 128, 128, 128 },
-                                          { 205, 208, 255, 128, 128, 128, 128 },
-                                          { 40, 235, 255, 128, 128, 128, 128 },
-                                          { 251, 132, 255, 128, 128, 128, 128 },
-                                          { 237, 186, 255, 128, 128, 128, 128 },
-                                          { 253, 112, 255, 128, 128, 128, 128 },
-                                      },
-                                      {
-                                          // 4 colors
-                                          { 195, 87, 128, 255, 128, 128, 128 },
-                                          { 143, 100, 123, 255, 128, 128, 128 },
-                                          { 94, 124, 119, 255, 128, 128, 128 },
-                                          { 77, 91, 130, 255, 128, 128, 128 },
-                                          { 39, 114, 178, 255, 128, 128, 128 },
-                                          { 222, 94, 125, 255, 128, 128, 128 },
-                                          { 44, 203, 132, 255, 128, 128, 128 },
-                                          { 68, 175, 122, 255, 128, 128, 128 },
-                                          { 110, 187, 124, 255, 128, 128, 128 },
-                                          { 152, 91, 128, 255, 128, 128, 128 },
-                                          { 70, 109, 181, 255, 128, 128, 128 },
-                                          { 133, 113, 164, 255, 128, 128, 128 },
-                                          { 47, 205, 133, 255, 128, 128, 128 },
-                                          { 247, 94, 136, 255, 128, 128, 128 },
-                                          { 205, 122, 146, 255, 128, 128, 128 },
-                                          { 251, 100, 141, 255, 128, 128, 128 },
-                                      },
-                                      {
-                                          // 5 colors
-                                          { 195, 65, 84, 125, 255, 128, 128 },
-                                          { 150, 76, 84, 121, 255, 128, 128 },
-                                          { 94, 110, 81, 117, 255, 128, 128 },
-                                          { 79, 85, 91, 139, 255, 128, 128 },
-                                          { 26, 102, 139, 127, 255, 128, 128 },
-                                          { 220, 73, 91, 119, 255, 128, 128 },
-                                          { 38, 203, 86, 127, 255, 128, 128 },
-                                          { 61, 186, 72, 124, 255, 128, 128 },
-                                          { 132, 199, 84, 128, 255, 128, 128 },
-                                          { 172, 52, 62, 120, 255, 128, 128 },
-                                          { 102, 89, 121, 122, 255, 128, 128 },
-                                          { 182, 48, 69, 186, 255, 128, 128 },
-                                          { 36, 206, 87, 126, 255, 128, 128 },
-                                          { 249, 55, 67, 122, 255, 128, 128 },
-                                          { 218, 88, 75, 122, 255, 128, 128 },
-                                          { 253, 64, 80, 119, 255, 128, 128 },
-                                      },
-                                      {
-                                          // 6 colors
-                                          { 182, 54, 64, 75, 118, 255, 128 },
-                                          { 126, 67, 70, 76, 116, 255, 128 },
-                                          { 79, 92, 67, 85, 120, 255, 128 },
-                                          { 63, 61, 81, 118, 132, 255, 128 },
-                                          { 21, 80, 105, 83, 119, 255, 128 },
-                                          { 215, 72, 74, 74, 111, 255, 128 },
-                                          { 50, 176, 63, 79, 120, 255, 128 },
-                                          { 72, 148, 66, 77, 120, 255, 128 },
-                                          { 105, 177, 57, 78, 130, 255, 128 },
-                                          { 150, 66, 66, 80, 127, 255, 128 },
-                                          { 81, 76, 109, 85, 116, 255, 128 },
-                                          { 113, 81, 62, 96, 148, 255, 128 },
-                                          { 54, 179, 69, 82, 121, 255, 128 },
-                                          { 244, 47, 48, 67, 118, 255, 128 },
-                                          { 198, 83, 53, 65, 121, 255, 128 },
-                                          { 250, 42, 51, 69, 110, 255, 128 },
-                                      },
-                                      {
-                                          // 7 colors
-                                          { 182, 45, 54, 62, 74, 113, 255 },
-                                          { 124, 63, 57, 62, 77, 114, 255 },
-                                          { 77, 80, 56, 66, 76, 117, 255 },
-                                          { 63, 57, 69, 98, 85, 131, 255 },
-                                          { 19, 81, 98, 63, 80, 116, 255 },
-                                          { 215, 56, 60, 63, 68, 105, 255 },
-                                          { 50, 174, 50, 60, 79, 118, 255 },
-                                          { 68, 151, 50, 58, 73, 117, 255 },
-                                          { 104, 182, 53, 57, 79, 127, 255 },
-                                          { 156, 50, 51, 63, 77, 111, 255 },
-                                          { 88, 67, 97, 59, 82, 120, 255 },
-                                          { 114, 81, 46, 65, 103, 132, 255 },
-                                          { 55, 166, 57, 66, 82, 120, 255 },
-                                          { 245, 34, 38, 43, 63, 114, 255 },
-                                          { 203, 68, 45, 47, 60, 118, 255 },
-                                          { 250, 35, 37, 47, 66, 110, 255 },
-                                      },
-                                      {
-                                          // 8 colors
-                                          { 180, 43, 46, 50, 56, 69, 109 },
-                                          { 116, 53, 51, 49, 57, 73, 115 },
-                                          { 79, 70, 49, 50, 59, 74, 117 },
-                                          { 60, 54, 57, 70, 62, 83, 129 },
-                                          { 20, 73, 85, 52, 66, 81, 119 },
-                                          { 213, 56, 52, 49, 53, 62, 104 },
-                                          { 48, 161, 41, 45, 56, 77, 116 },
-                                          { 68, 139, 40, 47, 54, 71, 116 },
-                                          { 123, 166, 42, 43, 52, 76, 130 },
-                                          { 153, 44, 44, 47, 54, 79, 129 },
-                                          { 87, 64, 83, 49, 60, 75, 127 },
-                                          { 131, 68, 43, 48, 73, 96, 130 },
-                                          { 55, 152, 45, 51, 64, 77, 113 },
-                                          { 243, 30, 28, 33, 41, 65, 114 },
-                                          { 202, 56, 35, 36, 42, 63, 123 },
-                                          { 249, 31, 29, 32, 45, 68, 111 },
-                                      }
-                                    };
+// Note: Must be non-zero to avoid triggering any asserts.
+#define UNUSED_PROB 128
+
+const aom_prob av1_default_palette_y_color_prob
+    [PALETTE_MAX_SIZE - 1][PALETTE_COLOR_CONTEXTS][PALETTE_COLORS - 1] = {
+      {
+          // 2 colors
+          { 230, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+            UNUSED_PROB, UNUSED_PROB },
+          { 214, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+            UNUSED_PROB, UNUSED_PROB },
+          { UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+            UNUSED_PROB, UNUSED_PROB },
+          { UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+            UNUSED_PROB, UNUSED_PROB },
+          { UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+            UNUSED_PROB, UNUSED_PROB },
+          { 240, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+            UNUSED_PROB, UNUSED_PROB },
+          { 73, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+            UNUSED_PROB },
+          { UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+            UNUSED_PROB, UNUSED_PROB },
+          { 130, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+            UNUSED_PROB, UNUSED_PROB },
+          { 227, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+            UNUSED_PROB, UNUSED_PROB },
+          { UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+            UNUSED_PROB, UNUSED_PROB },
+          { 188, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+            UNUSED_PROB, UNUSED_PROB },
+          { 75, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+            UNUSED_PROB },
+          { 250, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+            UNUSED_PROB, UNUSED_PROB },
+          { 223, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+            UNUSED_PROB, UNUSED_PROB },
+          { 252, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+            UNUSED_PROB, UNUSED_PROB },
+      },
+      {
+          // 3 colors
+          { 229, 137, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+            UNUSED_PROB },
+          { 197, 120, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+            UNUSED_PROB },
+          { 107, 195, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+            UNUSED_PROB },
+          { UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+            UNUSED_PROB, UNUSED_PROB },
+          { 27, 151, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+            UNUSED_PROB },
+          { 230, 130, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+            UNUSED_PROB },
+          { 37, 230, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+            UNUSED_PROB },
+          { 67, 221, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+            UNUSED_PROB },
+          { 124, 230, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+            UNUSED_PROB },
+          { 195, 109, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+            UNUSED_PROB },
+          { 99, 122, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+            UNUSED_PROB },
+          { 205, 208, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+            UNUSED_PROB },
+          { 40, 235, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+            UNUSED_PROB },
+          { 251, 132, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+            UNUSED_PROB },
+          { 237, 186, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+            UNUSED_PROB },
+          { 253, 112, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+            UNUSED_PROB },
+      },
+      {
+          // 4 colors
+          { 195, 87, 128, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+          { 143, 100, 123, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+          { 94, 124, 119, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+          { 77, 91, 130, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+          { 39, 114, 178, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+          { 222, 94, 125, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+          { 44, 203, 132, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+          { 68, 175, 122, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+          { 110, 187, 124, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+          { 152, 91, 128, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+          { 70, 109, 181, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+          { 133, 113, 164, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+          { 47, 205, 133, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+          { 247, 94, 136, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+          { 205, 122, 146, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+          { 251, 100, 141, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+      },
+      {
+          // 5 colors
+          { 195, 65, 84, 125, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+          { 150, 76, 84, 121, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+          { 94, 110, 81, 117, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+          { 79, 85, 91, 139, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+          { 26, 102, 139, 127, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+          { 220, 73, 91, 119, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+          { 38, 203, 86, 127, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+          { 61, 186, 72, 124, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+          { 132, 199, 84, 128, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+          { 172, 52, 62, 120, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+          { 102, 89, 121, 122, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+          { 182, 48, 69, 186, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+          { 36, 206, 87, 126, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+          { 249, 55, 67, 122, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+          { 218, 88, 75, 122, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+          { 253, 64, 80, 119, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+      },
+      {
+          // 6 colors
+          { 182, 54, 64, 75, 118, UNUSED_PROB, UNUSED_PROB },
+          { 126, 67, 70, 76, 116, UNUSED_PROB, UNUSED_PROB },
+          { 79, 92, 67, 85, 120, UNUSED_PROB, UNUSED_PROB },
+          { 63, 61, 81, 118, 132, UNUSED_PROB, UNUSED_PROB },
+          { 21, 80, 105, 83, 119, UNUSED_PROB, UNUSED_PROB },
+          { 215, 72, 74, 74, 111, UNUSED_PROB, UNUSED_PROB },
+          { 50, 176, 63, 79, 120, UNUSED_PROB, UNUSED_PROB },
+          { 72, 148, 66, 77, 120, UNUSED_PROB, UNUSED_PROB },
+          { 105, 177, 57, 78, 130, UNUSED_PROB, UNUSED_PROB },
+          { 150, 66, 66, 80, 127, UNUSED_PROB, UNUSED_PROB },
+          { 81, 76, 109, 85, 116, UNUSED_PROB, UNUSED_PROB },
+          { 113, 81, 62, 96, 148, UNUSED_PROB, UNUSED_PROB },
+          { 54, 179, 69, 82, 121, UNUSED_PROB, UNUSED_PROB },
+          { 244, 47, 48, 67, 118, UNUSED_PROB, UNUSED_PROB },
+          { 198, 83, 53, 65, 121, UNUSED_PROB, UNUSED_PROB },
+          { 250, 42, 51, 69, 110, UNUSED_PROB, UNUSED_PROB },
+      },
+      {
+          // 7 colors
+          { 182, 45, 54, 62, 74, 113, UNUSED_PROB },
+          { 124, 63, 57, 62, 77, 114, UNUSED_PROB },
+          { 77, 80, 56, 66, 76, 117, UNUSED_PROB },
+          { 63, 57, 69, 98, 85, 131, UNUSED_PROB },
+          { 19, 81, 98, 63, 80, 116, UNUSED_PROB },
+          { 215, 56, 60, 63, 68, 105, UNUSED_PROB },
+          { 50, 174, 50, 60, 79, 118, UNUSED_PROB },
+          { 68, 151, 50, 58, 73, 117, UNUSED_PROB },
+          { 104, 182, 53, 57, 79, 127, UNUSED_PROB },
+          { 156, 50, 51, 63, 77, 111, UNUSED_PROB },
+          { 88, 67, 97, 59, 82, 120, UNUSED_PROB },
+          { 114, 81, 46, 65, 103, 132, UNUSED_PROB },
+          { 55, 166, 57, 66, 82, 120, UNUSED_PROB },
+          { 245, 34, 38, 43, 63, 114, UNUSED_PROB },
+          { 203, 68, 45, 47, 60, 118, UNUSED_PROB },
+          { 250, 35, 37, 47, 66, 110, UNUSED_PROB },
+      },
+      {
+          // 8 colors
+          { 180, 43, 46, 50, 56, 69, 109 },
+          { 116, 53, 51, 49, 57, 73, 115 },
+          { 79, 70, 49, 50, 59, 74, 117 },
+          { 60, 54, 57, 70, 62, 83, 129 },
+          { 20, 73, 85, 52, 66, 81, 119 },
+          { 213, 56, 52, 49, 53, 62, 104 },
+          { 48, 161, 41, 45, 56, 77, 116 },
+          { 68, 139, 40, 47, 54, 71, 116 },
+          { 123, 166, 42, 43, 52, 76, 130 },
+          { 153, 44, 44, 47, 54, 79, 129 },
+          { 87, 64, 83, 49, 60, 75, 127 },
+          { 131, 68, 43, 48, 73, 96, 130 },
+          { 55, 152, 45, 51, 64, 77, 113 },
+          { 243, 30, 28, 33, 41, 65, 114 },
+          { 202, 56, 35, 36, 42, 63, 123 },
+          { 249, 31, 29, 32, 45, 68, 111 },
+      }
+    };
 
 const aom_prob av1_default_palette_uv_color_prob
-    [PALETTE_MAX_SIZE - 1][PALETTE_COLOR_CONTEXTS]
-    [PALETTE_COLORS - 1] = { {
-                                 // 2 colors
-                                 { 228, 255, 128, 128, 128, 128, 128 },
-                                 { 195, 255, 128, 128, 128, 128, 128 },
-                                 { 128, 128, 128, 128, 128, 128, 128 },
-                                 { 128, 128, 128, 128, 128, 128, 128 },
-                                 { 128, 128, 128, 128, 128, 128, 128 },
-                                 { 228, 255, 128, 128, 128, 128, 128 },
-                                 { 71, 255, 128, 128, 128, 128, 128 },
-                                 { 128, 128, 128, 128, 128, 128, 128 },
-                                 { 129, 255, 128, 128, 128, 128, 128 },
-                                 { 206, 255, 128, 128, 128, 128, 128 },
-                                 { 128, 128, 128, 128, 128, 128, 128 },
-                                 { 136, 255, 128, 128, 128, 128, 128 },
-                                 { 98, 255, 128, 128, 128, 128, 128 },
-                                 { 236, 255, 128, 128, 128, 128, 128 },
-                                 { 222, 255, 128, 128, 128, 128, 128 },
-                                 { 249, 255, 128, 128, 128, 128, 128 },
-                             },
-                             {
-                                 // 3 colors
-                                 { 198, 136, 255, 128, 128, 128, 128 },
-                                 { 178, 105, 255, 128, 128, 128, 128 },
-                                 { 100, 206, 255, 128, 128, 128, 128 },
-                                 { 128, 128, 128, 128, 128, 128, 128 },
-                                 { 12, 136, 255, 128, 128, 128, 128 },
-                                 { 219, 134, 255, 128, 128, 128, 128 },
-                                 { 50, 198, 255, 128, 128, 128, 128 },
-                                 { 61, 231, 255, 128, 128, 128, 128 },
-                                 { 110, 209, 255, 128, 128, 128, 128 },
-                                 { 173, 106, 255, 128, 128, 128, 128 },
-                                 { 145, 166, 255, 128, 128, 128, 128 },
-                                 { 156, 175, 255, 128, 128, 128, 128 },
-                                 { 69, 183, 255, 128, 128, 128, 128 },
-                                 { 241, 163, 255, 128, 128, 128, 128 },
-                                 { 224, 160, 255, 128, 128, 128, 128 },
-                                 { 246, 154, 255, 128, 128, 128, 128 },
-                             },
-                             {
-                                 // 4 colors
-                                 { 173, 88, 143, 255, 128, 128, 128 },
-                                 { 146, 81, 127, 255, 128, 128, 128 },
-                                 { 84, 134, 102, 255, 128, 128, 128 },
-                                 { 69, 138, 140, 255, 128, 128, 128 },
-                                 { 31, 103, 200, 255, 128, 128, 128 },
-                                 { 217, 101, 139, 255, 128, 128, 128 },
-                                 { 51, 174, 121, 255, 128, 128, 128 },
-                                 { 64, 177, 109, 255, 128, 128, 128 },
-                                 { 96, 179, 145, 255, 128, 128, 128 },
-                                 { 164, 77, 114, 255, 128, 128, 128 },
-                                 { 87, 94, 156, 255, 128, 128, 128 },
-                                 { 105, 57, 173, 255, 128, 128, 128 },
-                                 { 63, 158, 137, 255, 128, 128, 128 },
-                                 { 236, 102, 156, 255, 128, 128, 128 },
-                                 { 197, 115, 153, 255, 128, 128, 128 },
-                                 { 245, 106, 154, 255, 128, 128, 128 },
-                             },
-                             {
-                                 // 5 colors
-                                 { 179, 64, 97, 129, 255, 128, 128 },
-                                 { 137, 56, 88, 125, 255, 128, 128 },
-                                 { 82, 107, 61, 118, 255, 128, 128 },
-                                 { 59, 113, 86, 115, 255, 128, 128 },
-                                 { 23, 88, 118, 130, 255, 128, 128 },
-                                 { 213, 66, 90, 125, 255, 128, 128 },
-                                 { 37, 181, 103, 121, 255, 128, 128 },
-                                 { 47, 188, 61, 131, 255, 128, 128 },
-                                 { 104, 185, 103, 144, 255, 128, 128 },
-                                 { 163, 39, 76, 112, 255, 128, 128 },
-                                 { 94, 74, 131, 126, 255, 128, 128 },
-                                 { 142, 42, 103, 163, 255, 128, 128 },
-                                 { 53, 162, 99, 149, 255, 128, 128 },
-                                 { 239, 54, 84, 108, 255, 128, 128 },
-                                 { 203, 84, 110, 147, 255, 128, 128 },
-                                 { 248, 70, 105, 151, 255, 128, 128 },
-                             },
-                             {
-                                 // 6 colors
-                                 { 189, 50, 67, 90, 130, 255, 128 },
-                                 { 114, 50, 55, 90, 123, 255, 128 },
-                                 { 66, 76, 54, 82, 128, 255, 128 },
-                                 { 43, 69, 69, 80, 129, 255, 128 },
-                                 { 22, 59, 87, 88, 141, 255, 128 },
-                                 { 203, 49, 68, 87, 122, 255, 128 },
-                                 { 43, 157, 74, 104, 146, 255, 128 },
-                                 { 54, 138, 51, 95, 138, 255, 128 },
-                                 { 82, 171, 58, 102, 146, 255, 128 },
-                                 { 129, 38, 59, 64, 168, 255, 128 },
-                                 { 56, 67, 119, 92, 112, 255, 128 },
-                                 { 96, 62, 53, 132, 82, 255, 128 },
-                                 { 60, 147, 77, 108, 145, 255, 128 },
-                                 { 238, 76, 73, 93, 148, 255, 128 },
-                                 { 189, 86, 73, 103, 157, 255, 128 },
-                                 { 246, 62, 75, 83, 167, 255, 128 },
-                             },
-                             {
-                                 // 7 colors
-                                 { 179, 42, 51, 73, 99, 134, 255 },
-                                 { 119, 52, 52, 61, 64, 114, 255 },
-                                 { 53, 77, 35, 65, 71, 131, 255 },
-                                 { 38, 70, 51, 68, 89, 144, 255 },
-                                 { 23, 65, 128, 73, 97, 131, 255 },
-                                 { 210, 47, 52, 63, 81, 143, 255 },
-                                 { 42, 159, 57, 68, 98, 143, 255 },
-                                 { 49, 153, 45, 82, 93, 143, 255 },
-                                 { 81, 169, 52, 72, 113, 151, 255 },
-                                 { 136, 46, 35, 56, 75, 96, 255 },
-                                 { 57, 84, 109, 47, 107, 131, 255 },
-                                 { 128, 78, 57, 36, 128, 85, 255 },
-                                 { 54, 149, 68, 77, 94, 153, 255 },
-                                 { 243, 58, 50, 71, 81, 167, 255 },
-                                 { 189, 92, 64, 70, 121, 173, 255 },
-                                 { 248, 35, 38, 51, 82, 201, 255 },
-                             },
-                             {
-                                 // 8 colors
-                                 { 201, 40, 36, 42, 64, 92, 123 },
-                                 { 116, 43, 33, 43, 73, 102, 128 },
-                                 { 46, 77, 37, 69, 62, 78, 150 },
-                                 { 40, 65, 52, 50, 76, 89, 133 },
-                                 { 28, 48, 91, 17, 64, 77, 133 },
-                                 { 218, 43, 43, 37, 56, 72, 163 },
-                                 { 41, 155, 44, 83, 82, 129, 180 },
-                                 { 44, 141, 29, 55, 64, 89, 147 },
-                                 { 92, 166, 48, 45, 59, 126, 179 },
-                                 { 169, 35, 49, 41, 36, 99, 139 },
-                                 { 55, 77, 77, 56, 60, 75, 156 },
-                                 { 155, 81, 51, 64, 57, 182, 255 },
-                                 { 60, 134, 49, 49, 93, 128, 174 },
-                                 { 244, 98, 51, 46, 22, 73, 238 },
-                                 { 189, 70, 40, 87, 93, 79, 201 },
-                                 { 248, 54, 49, 40, 29, 42, 227 },
-                             } };
+    [PALETTE_MAX_SIZE - 1][PALETTE_COLOR_CONTEXTS][PALETTE_COLORS - 1] = {
+      {
+          // 2 colors
+          { 228, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+            UNUSED_PROB, UNUSED_PROB },
+          { 195, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+            UNUSED_PROB, UNUSED_PROB },
+          { UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+            UNUSED_PROB, UNUSED_PROB },
+          { UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+            UNUSED_PROB, UNUSED_PROB },
+          { UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+            UNUSED_PROB, UNUSED_PROB },
+          { 228, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+            UNUSED_PROB, UNUSED_PROB },
+          { 71, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+            UNUSED_PROB },
+          { UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+            UNUSED_PROB, UNUSED_PROB },
+          { 129, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+            UNUSED_PROB, UNUSED_PROB },
+          { 206, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+            UNUSED_PROB, UNUSED_PROB },
+          { UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+            UNUSED_PROB, UNUSED_PROB },
+          { 136, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+            UNUSED_PROB, UNUSED_PROB },
+          { 98, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+            UNUSED_PROB },
+          { 236, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+            UNUSED_PROB, UNUSED_PROB },
+          { 222, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+            UNUSED_PROB, UNUSED_PROB },
+          { 249, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+            UNUSED_PROB, UNUSED_PROB },
+      },
+      {
+          // 3 colors
+          { 198, 136, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+            UNUSED_PROB },
+          { 178, 105, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+            UNUSED_PROB },
+          { 100, 206, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+            UNUSED_PROB },
+          { UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+            UNUSED_PROB, UNUSED_PROB },
+          { 12, 136, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+            UNUSED_PROB },
+          { 219, 134, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+            UNUSED_PROB },
+          { 50, 198, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+            UNUSED_PROB },
+          { 61, 231, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+            UNUSED_PROB },
+          { 110, 209, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+            UNUSED_PROB },
+          { 173, 106, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+            UNUSED_PROB },
+          { 145, 166, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+            UNUSED_PROB },
+          { 156, 175, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+            UNUSED_PROB },
+          { 69, 183, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+            UNUSED_PROB },
+          { 241, 163, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+            UNUSED_PROB },
+          { 224, 160, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+            UNUSED_PROB },
+          { 246, 154, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+            UNUSED_PROB },
+      },
+      {
+          // 4 colors
+          { 173, 88, 143, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+          { 146, 81, 127, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+          { 84, 134, 102, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+          { 69, 138, 140, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+          { 31, 103, 200, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+          { 217, 101, 139, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+          { 51, 174, 121, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+          { 64, 177, 109, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+          { 96, 179, 145, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+          { 164, 77, 114, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+          { 87, 94, 156, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+          { 105, 57, 173, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+          { 63, 158, 137, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+          { 236, 102, 156, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+          { 197, 115, 153, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+          { 245, 106, 154, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+      },
+      {
+          // 5 colors
+          { 179, 64, 97, 129, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+          { 137, 56, 88, 125, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+          { 82, 107, 61, 118, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+          { 59, 113, 86, 115, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+          { 23, 88, 118, 130, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+          { 213, 66, 90, 125, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+          { 37, 181, 103, 121, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+          { 47, 188, 61, 131, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+          { 104, 185, 103, 144, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+          { 163, 39, 76, 112, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+          { 94, 74, 131, 126, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+          { 142, 42, 103, 163, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+          { 53, 162, 99, 149, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+          { 239, 54, 84, 108, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+          { 203, 84, 110, 147, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+          { 248, 70, 105, 151, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+      },
+      {
+          // 6 colors
+          { 189, 50, 67, 90, 130, UNUSED_PROB, UNUSED_PROB },
+          { 114, 50, 55, 90, 123, UNUSED_PROB, UNUSED_PROB },
+          { 66, 76, 54, 82, 128, UNUSED_PROB, UNUSED_PROB },
+          { 43, 69, 69, 80, 129, UNUSED_PROB, UNUSED_PROB },
+          { 22, 59, 87, 88, 141, UNUSED_PROB, UNUSED_PROB },
+          { 203, 49, 68, 87, 122, UNUSED_PROB, UNUSED_PROB },
+          { 43, 157, 74, 104, 146, UNUSED_PROB, UNUSED_PROB },
+          { 54, 138, 51, 95, 138, UNUSED_PROB, UNUSED_PROB },
+          { 82, 171, 58, 102, 146, UNUSED_PROB, UNUSED_PROB },
+          { 129, 38, 59, 64, 168, UNUSED_PROB, UNUSED_PROB },
+          { 56, 67, 119, 92, 112, UNUSED_PROB, UNUSED_PROB },
+          { 96, 62, 53, 132, 82, UNUSED_PROB, UNUSED_PROB },
+          { 60, 147, 77, 108, 145, UNUSED_PROB, UNUSED_PROB },
+          { 238, 76, 73, 93, 148, UNUSED_PROB, UNUSED_PROB },
+          { 189, 86, 73, 103, 157, UNUSED_PROB, UNUSED_PROB },
+          { 246, 62, 75, 83, 167, UNUSED_PROB, UNUSED_PROB },
+      },
+      {
+          // 7 colors
+          { 179, 42, 51, 73, 99, 134, UNUSED_PROB },
+          { 119, 52, 52, 61, 64, 114, UNUSED_PROB },
+          { 53, 77, 35, 65, 71, 131, UNUSED_PROB },
+          { 38, 70, 51, 68, 89, 144, UNUSED_PROB },
+          { 23, 65, 128, 73, 97, 131, UNUSED_PROB },
+          { 210, 47, 52, 63, 81, 143, UNUSED_PROB },
+          { 42, 159, 57, 68, 98, 143, UNUSED_PROB },
+          { 49, 153, 45, 82, 93, 143, UNUSED_PROB },
+          { 81, 169, 52, 72, 113, 151, UNUSED_PROB },
+          { 136, 46, 35, 56, 75, 96, UNUSED_PROB },
+          { 57, 84, 109, 47, 107, 131, UNUSED_PROB },
+          { 128, 78, 57, 36, 128, 85, UNUSED_PROB },
+          { 54, 149, 68, 77, 94, 153, UNUSED_PROB },
+          { 243, 58, 50, 71, 81, 167, UNUSED_PROB },
+          { 189, 92, 64, 70, 121, 173, UNUSED_PROB },
+          { 248, 35, 38, 51, 82, 201, UNUSED_PROB },
+      },
+      {
+          // 8 colors
+          { 201, 40, 36, 42, 64, 92, 123 },
+          { 116, 43, 33, 43, 73, 102, 128 },
+          { 46, 77, 37, 69, 62, 78, 150 },
+          { 40, 65, 52, 50, 76, 89, 133 },
+          { 28, 48, 91, 17, 64, 77, 133 },
+          { 218, 43, 43, 37, 56, 72, 163 },
+          { 41, 155, 44, 83, 82, 129, 180 },
+          { 44, 141, 29, 55, 64, 89, 147 },
+          { 92, 166, 48, 45, 59, 126, 179 },
+          { 169, 35, 49, 41, 36, 99, 139 },
+          { 55, 77, 77, 56, 60, 75, 156 },
+          { 155, 81, 51, 64, 57, 182, 255 },
+          { 60, 134, 49, 49, 93, 128, 174 },
+          { 244, 98, 51, 46, 22, 73, 238 },
+          { 189, 70, 40, 87, 93, 79, 201 },
+          { 248, 54, 49, 40, 29, 42, 227 },
+      }
+    };
+
+#undef UNUSED_PROB
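
Each row above holds the binary split probabilities for one palette-size/context pair; a row for an n-color palette only consumes n - 1 of its PALETTE_MAX_SIZE - 1 slots, and UNUSED_PROB pads the rest so the array stays rectangular. A minimal sketch of the consumer's view, with a hypothetical helper name (the real reader walks av1_palette_color_tree, declared in entropymode.h further down, with exactly those n - 1 probabilities):

// Hypothetical accessor: pick the probability row for a palette of
// `palette_size` colors under color context `ctx`, and report how many
// of its entries are live; the trailing UNUSED_PROB slots are never read.
static const aom_prob *palette_prob_row_sketch(
    const aom_prob table[PALETTE_MAX_SIZE - 1][PALETTE_COLOR_CONTEXTS]
                        [PALETTE_MAX_SIZE - 1],
    int palette_size, int ctx, int *live_probs) {
  *live_probs = palette_size - 1;       // one prob per tree split
  return table[palette_size - 2][ctx];  // size axis starts at 2 colors
}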
 
 static const int palette_color_context_lookup[PALETTE_COLOR_CONTEXTS] = {
   // (3, 0, 0, 0), (3, 2, 0, 0), (3, 3, 2, 0), (3, 3, 2, 2),
@@ -848,18 +916,18 @@
 };
 #endif  // CONFIG_PALETTE
 
-const aom_tree_index av1_tx_size_tree[TX_SIZES - 1][TREE_SIZE(TX_SIZES)] = {
+const aom_tree_index av1_tx_size_tree[MAX_TX_DEPTH][TREE_SIZE(TX_SIZES)] = {
   {
       // Max tx_size is 8X8
-      -TX_4X4, -TX_8X8,
+      -0, -1,
   },
   {
       // Max tx_size is 16X16
-      -TX_4X4, 2, -TX_8X8, -TX_16X16,
+      -0, 2, -1, -2,
   },
   {
       // Max tx_size is 32X32
-      -TX_4X4, 2, -TX_8X8, 4, -TX_16X16, -TX_32X32,
+      -0, 2, -1, 4, -2, -3,
   },
 };
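
The leaves switch from TX_SIZE enum names to literal depths because CONFIG_CB4X4 (see enums.h below) prepends TX_2X2 to the enum, so TX_4X4 no longer equals 0; the tree now codes a depth in [0, MAX_TX_DEPTH] rather than a TX_SIZE. A -0 leaf still works because any non-positive entry terminates the walk. A minimal sketch of that walk, assuming the usual convention that internal entries are positive offsets and the symbol is the negated leaf (read_bit stands in for the entropy decoder; the _sketch types are stand-ins):

#include <stdint.h>

typedef int16_t tree_index_sketch;  // stand-in for aom_tree_index

static int read_tree_symbol_sketch(const tree_index_sketch *tree,
                                   int (*read_bit)(void *), void *dec) {
  tree_index_sketch i = 0;
  // Internal nodes are positive offsets; stop at the first leaf.
  while ((i = tree[i + read_bit(dec)]) > 0) {
  }
  return -i;  // for av1_tx_size_tree this is now a depth, not a TX_SIZE
}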
 
@@ -902,30 +970,26 @@
 int av1_get_palette_color_context(const uint8_t *color_map, int cols, int r,
                                   int c, int n, uint8_t *color_order,
                                   int *color_idx) {
-  int i, j, max, max_idx, temp;
+  int i;
+  // The +10 below should not be needed. But we get a warning "array subscript
+  // is above array bounds [-Werror=array-bounds]" without it, possibly due to
+  // this (or similar) bug: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=59124
   int scores[PALETTE_MAX_SIZE + 10];
-  int weights[4] = { 3, 2, 3, 2 };
-  int color_ctx = 0;
+  const int weights[4] = { 3, 2, 3, 2 };
+  int color_ctx_hash;
+  int color_ctx;
   int color_neighbors[4];
   int inverse_color_order[PALETTE_MAX_SIZE];
   assert(n <= PALETTE_MAX_SIZE);
 
-  if (c - 1 >= 0)
-    color_neighbors[0] = color_map[r * cols + c - 1];
-  else
-    color_neighbors[0] = -1;
-  if (c - 1 >= 0 && r - 1 >= 0)
-    color_neighbors[1] = color_map[(r - 1) * cols + c - 1];
-  else
-    color_neighbors[1] = -1;
-  if (r - 1 >= 0)
-    color_neighbors[2] = color_map[(r - 1) * cols + c];
-  else
-    color_neighbors[2] = -1;
-  if (r - 1 >= 0 && c + 1 <= cols - 1)
-    color_neighbors[3] = color_map[(r - 1) * cols + c + 1];
-  else
-    color_neighbors[3] = -1;
+  // Get color indices of neighbors.
+  color_neighbors[0] = (c - 1 >= 0) ? color_map[r * cols + c - 1] : -1;
+  color_neighbors[1] =
+      (c - 1 >= 0 && r - 1 >= 0) ? color_map[(r - 1) * cols + c - 1] : -1;
+  color_neighbors[2] = (r - 1 >= 0) ? color_map[(r - 1) * cols + c] : -1;
+  color_neighbors[3] = (r - 1 >= 0 && c + 1 <= cols - 1)
+                           ? color_map[(r - 1) * cols + c + 1]
+                           : -1;
 
   for (i = 0; i < PALETTE_MAX_SIZE; ++i) {
     color_order[i] = i;
@@ -933,23 +997,25 @@
   }
   memset(scores, 0, PALETTE_MAX_SIZE * sizeof(scores[0]));
   for (i = 0; i < 4; ++i) {
-    if (color_neighbors[i] >= 0) scores[color_neighbors[i]] += weights[i];
+    if (color_neighbors[i] >= 0) {
+      scores[color_neighbors[i]] += weights[i];
+    }
   }
 
+  // Get the top 4 scores (sorted from largest to smallest).
   for (i = 0; i < 4; ++i) {
-    max = scores[i];
-    max_idx = i;
-    j = i + 1;
-    while (j < n) {
+    int max = scores[i];
+    int max_idx = i;
+    int j;
+    for (j = i + 1; j < n; ++j) {
       if (scores[j] > max) {
         max = scores[j];
         max_idx = j;
       }
-      ++j;
     }
 
     if (max_idx != i) {
-      temp = scores[i];
+      int temp = scores[i];
       scores[i] = scores[max_idx];
       scores[max_idx] = temp;
 
@@ -961,15 +1027,19 @@
     }
   }
 
-  for (i = 0; i < 4; ++i) color_ctx = color_ctx * 11 + scores[i];
+  // Get hash value of context.
+  color_ctx_hash = 0;
+  for (i = 0; i < 4; ++i) color_ctx_hash = color_ctx_hash * 11 + scores[i];
 
-  for (i = 0; i < PALETTE_COLOR_CONTEXTS; ++i)
-    if (color_ctx == palette_color_context_lookup[i]) {
+  // Lookup context from hash.
+  color_ctx = 0;  // Default.
+  for (i = 0; i < PALETTE_COLOR_CONTEXTS; ++i) {
+    if (color_ctx_hash == palette_color_context_lookup[i]) {
       color_ctx = i;
       break;
     }
+  }
 
-  if (color_ctx >= PALETTE_COLOR_CONTEXTS) color_ctx = 0;
   if (color_idx != NULL) {
     *color_idx = inverse_color_order[color_map[r * cols + c]];
   }
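
The rewritten function is easier to follow as three steps: weight the up-to-four neighbor colors with { 3, 2, 3, 2 }, partially sort out the four largest scores, then hash them base-11 and look the hash up in palette_color_context_lookup. A standalone worked example of those steps, assuming PALETTE_MAX_SIZE is 8 and a pixel whose left, top-left, top and top-right neighbors carry palette indices 2, 2, 0 and 0:

#include <stdio.h>

int main(void) {
  const int weights[4] = { 3, 2, 3, 2 };  // same table as above
  const int color_neighbors[4] = { 2, 2, 0, 0 };
  int scores[8] = { 0 };
  int i, j, hash = 0;
  for (i = 0; i < 4; ++i) scores[color_neighbors[i]] += weights[i];
  // Partial selection sort: only the top four scores matter.
  for (i = 0; i < 4; ++i) {
    for (j = i + 1; j < 8; ++j) {
      if (scores[j] > scores[i]) {
        const int t = scores[i];
        scores[i] = scores[j];
        scores[j] = t;
      }
    }
  }
  for (i = 0; i < 4; ++i) hash = hash * 11 + scores[i];
  printf("hash = %d\n", hash);  // scores (5, 5, 0, 0) give 7260
  return 0;
}

The same structure explains the lookup table's comment above: each listed tuple is one reachable sorted score combination.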
@@ -979,7 +1049,7 @@
 
 #if CONFIG_VAR_TX
 static const aom_prob default_txfm_partition_probs[TXFM_PARTITION_CONTEXTS] = {
-  192, 128, 64, 192, 128, 64, 192, 128, 64,
+  250, 231, 212, 241, 166, 66, 241, 230, 135, 243, 154, 64, 248, 161, 63, 128,
 };
 #endif
 
@@ -1310,13 +1380,21 @@
 
 static const aom_prob
     default_intra_ext_tx_prob[EXT_TX_SIZES][TX_TYPES][TX_TYPES - 1] = {
+#if CONFIG_CB4X4
+      { { 240, 85, 128 }, { 4, 1, 248 }, { 4, 1, 8 }, { 4, 248, 128 } },
+#endif
       { { 240, 85, 128 }, { 4, 1, 248 }, { 4, 1, 8 }, { 4, 248, 128 } },
       { { 244, 85, 128 }, { 8, 2, 248 }, { 8, 2, 8 }, { 8, 248, 128 } },
       { { 248, 85, 128 }, { 16, 4, 248 }, { 16, 4, 8 }, { 16, 248, 128 } },
     };
 
 static const aom_prob default_inter_ext_tx_prob[EXT_TX_SIZES][TX_TYPES - 1] = {
-  { 160, 85, 128 }, { 176, 85, 128 }, { 192, 85, 128 },
+#if CONFIG_CB4X4
+  { 160, 85, 128 },
+#endif
+  { 160, 85, 128 },
+  { 176, 85, 128 },
+  { 192, 85, 128 },
 };
 #endif  // CONFIG_EXT_TX
 
@@ -1440,6 +1518,48 @@
 #if CONFIG_DAALA_EC
 int av1_switchable_interp_ind[SWITCHABLE_FILTERS];
 int av1_switchable_interp_inv[SWITCHABLE_FILTERS];
+
+void av1_set_mode_cdfs(struct AV1Common *cm) {
+  FRAME_CONTEXT *fc = cm->fc;
+  int i, j;
+  if (cm->seg.enabled && cm->seg.update_map) {
+    av1_tree_to_cdf(av1_segment_tree, cm->fc->seg.tree_probs,
+                    cm->fc->seg.tree_cdf);
+  }
+
+  for (i = 0; i < INTRA_MODES; ++i)
+    av1_tree_to_cdf(av1_intra_mode_tree, fc->uv_mode_prob[i],
+                    fc->uv_mode_cdf[i]);
+
+  for (i = 0; i < PARTITION_CONTEXTS; ++i)
+    av1_tree_to_cdf(av1_partition_tree, fc->partition_prob[i],
+                    fc->partition_cdf[i]);
+
+  for (i = 0; i < INTRA_MODES; ++i)
+    for (j = 0; j < INTRA_MODES; ++j)
+      av1_tree_to_cdf(av1_intra_mode_tree, cm->kf_y_prob[i][j],
+                      cm->kf_y_cdf[i][j]);
+
+  for (j = 0; j < SWITCHABLE_FILTER_CONTEXTS; ++j)
+    av1_tree_to_cdf(av1_switchable_interp_tree, fc->switchable_interp_prob[j],
+                    fc->switchable_interp_cdf[j]);
+
+  for (i = 0; i < INTER_MODE_CONTEXTS; ++i)
+    av1_tree_to_cdf(av1_inter_mode_tree, fc->inter_mode_probs[i],
+                    fc->inter_mode_cdf[i]);
+
+  for (i = 0; i < BLOCK_SIZE_GROUPS; ++i)
+    av1_tree_to_cdf(av1_intra_mode_tree, fc->y_mode_prob[i], fc->y_mode_cdf[i]);
+
+  for (i = TX_4X4; i < EXT_TX_SIZES; ++i)
+    for (j = 0; j < TX_TYPES; ++j)
+      av1_tree_to_cdf(av1_ext_tx_tree, fc->intra_ext_tx_prob[i][j],
+                      fc->intra_ext_tx_cdf[i][j]);
+
+  for (i = TX_4X4; i < EXT_TX_SIZES; ++i)
+    av1_tree_to_cdf(av1_ext_tx_tree, fc->inter_ext_tx_prob[i],
+                    fc->inter_ext_tx_cdf[i]);
+}
 #endif
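
av1_set_mode_cdfs() centralizes what used to be scattered av1_tree_to_cdf() calls (several are deleted from the adaptation code below): once the binary tree probabilities have been updated, every multisymbol CDF is rebuilt from them in one place. A sketch of what such a conversion involves, assuming 8-bit node probabilities, the probs[i >> 1] node-pair indexing used by the tree coder, and 15-bit CDFs; the exact fixed-point rounding inside av1_tree_to_cdf() may differ:

#include <stdint.h>

typedef int16_t tree_index_sketch;  // stand-in for aom_tree_index
typedef uint8_t prob8_sketch;       // stand-in for aom_prob
typedef uint16_t cdf16_sketch;      // stand-in for aom_cdf_prob

// Distribute `mass` (in 1/32768 units) over the leaves below node i:
// the left branch keeps probs[i >> 1] / 256 of it, the right the rest.
static void leaf_mass(const tree_index_sketch *tree,
                      const prob8_sketch *probs, tree_index_sketch i,
                      uint32_t mass, uint32_t *pdf) {
  const uint32_t left = (mass * probs[i >> 1]) >> 8;
  if (tree[i] <= 0)
    pdf[-tree[i]] += left;
  else
    leaf_mass(tree, probs, tree[i], left, pdf);
  if (tree[i + 1] <= 0)
    pdf[-tree[i + 1]] += mass - left;
  else
    leaf_mass(tree, probs, tree[i + 1], mass - left, pdf);
}

static void tree_to_cdf_sketch(const tree_index_sketch *tree,
                               const prob8_sketch *probs, int nsymbs,
                               cdf16_sketch *cdf) {
  uint32_t pdf[32] = { 0 };  // large enough for the alphabets used here
  uint32_t acc = 0;
  int s;
  leaf_mass(tree, probs, 0, 32768, pdf);
  for (s = 0; s < nsymbs; ++s) {
    acc += pdf[s];
    cdf[s] = (cdf16_sketch)acc;  // last entry lands near full mass
  }
}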
 
 #if CONFIG_EXT_INTERP
@@ -1561,7 +1681,7 @@
 
 #if CONFIG_VAR_TX && CONFIG_EXT_TX && CONFIG_RECT_TX
   if (cm->tx_mode == TX_MODE_SELECT) {
-    for (i = 0; i < TX_SIZES - 1; ++i) {
+    for (i = 0; i < MAX_TX_DEPTH; ++i) {
       fc->rect_tx_prob[i] =
           av1_mode_mv_merge_probs(pre_fc->rect_tx_prob[i], counts->rect_tx[i]);
     }
@@ -1637,19 +1757,11 @@
       aom_tree_merge_probs(av1_ext_tx_tree, pre_fc->intra_ext_tx_prob[i][j],
                            counts->intra_ext_tx[i][j],
                            fc->intra_ext_tx_prob[i][j]);
-#if CONFIG_DAALA_EC
-      av1_tree_to_cdf(av1_ext_tx_tree, fc->intra_ext_tx_prob[i][j],
-                      fc->intra_ext_tx_cdf[i][j]);
-#endif
     }
   }
   for (i = TX_4X4; i < EXT_TX_SIZES; ++i) {
     aom_tree_merge_probs(av1_ext_tx_tree, pre_fc->inter_ext_tx_prob[i],
                          counts->inter_ext_tx[i], fc->inter_ext_tx_prob[i]);
-#if CONFIG_DAALA_EC
-    av1_tree_to_cdf(av1_ext_tx_tree, fc->inter_ext_tx_prob[i],
-                    fc->inter_ext_tx_cdf[i]);
-#endif
   }
 #endif  // CONFIG_EXT_TX
 
@@ -1679,10 +1791,6 @@
   for (i = 0; i < PARTITION_CONTEXTS; i++) {
     aom_tree_merge_probs(av1_partition_tree, pre_fc->partition_prob[i],
                          counts->partition[i], fc->partition_prob[i]);
-#if CONFIG_DAALA_EC
-    av1_tree_to_cdf(av1_partition_tree, fc->partition_prob[i],
-                    fc->partition_cdf[i]);
-#endif
   }
 #endif  // CONFIG_EXT_PARTITION_TYPES
 #if CONFIG_DELTA_Q
diff --git a/av1/common/entropymode.h b/av1/common/entropymode.h
index 31ab65d..1d95cdc 100644
--- a/av1/common/entropymode.h
+++ b/av1/common/entropymode.h
@@ -62,9 +62,9 @@
   aom_prob partition_prob[PARTITION_CONTEXTS][PARTITION_TYPES - 1];
 #endif
   av1_coeff_probs_model coef_probs[TX_SIZES][PLANE_TYPES];
-#if CONFIG_RANS || CONFIG_DAALA_EC
+#if CONFIG_EC_MULTISYMBOL
   coeff_cdf_model coef_cdfs[TX_SIZES][PLANE_TYPES];
-#endif  // CONFIG_RANS
+#endif  // CONFIG_EC_MULTISYMBOL
   aom_prob switchable_interp_prob[SWITCHABLE_FILTER_CONTEXTS]
                                  [SWITCHABLE_FILTERS - 1];
 
@@ -324,7 +324,7 @@
 extern const aom_tree_index av1_palette_color_tree[PALETTE_MAX_SIZE - 1]
                                                   [TREE_SIZE(PALETTE_COLORS)];
 #endif  // CONFIG_PALETTE
-extern const aom_tree_index av1_tx_size_tree[TX_SIZES - 1][TREE_SIZE(TX_SIZES)];
+extern const aom_tree_index av1_tx_size_tree[MAX_TX_DEPTH][TREE_SIZE(TX_SIZES)];
 #if CONFIG_EXT_INTRA
 extern const aom_tree_index av1_intra_filter_tree[TREE_SIZE(INTRA_FILTERS)];
 #endif  // CONFIG_EXT_INTRA
@@ -349,6 +349,8 @@
 #if CONFIG_DAALA_EC
 extern int av1_switchable_interp_ind[SWITCHABLE_FILTERS];
 extern int av1_switchable_interp_inv[SWITCHABLE_FILTERS];
+
+void av1_set_mode_cdfs(struct AV1Common *cm);
 #endif
 
 void av1_setup_past_independence(struct AV1Common *cm);
diff --git a/av1/common/entropymv.c b/av1/common/entropymv.c
index a80165e..029f9f6 100644
--- a/av1/common/entropymv.c
+++ b/av1/common/entropymv.c
@@ -43,21 +43,21 @@
 
 static const nmv_context default_nmv_context = {
   { 32, 64, 96 },  // joints
-#if CONFIG_DAALA_EC
+#if CONFIG_EC_MULTISYMBOL
   { 0, 0, 0, 0 },  // joint_cdf is computed from joints in av1_init_mv_probs()
 #endif
   { {
         // Vertical component
         128,                                                   // sign
         { 224, 144, 192, 168, 192, 176, 192, 198, 198, 245 },  // class
-#if CONFIG_DAALA_EC
+#if CONFIG_EC_MULTISYMBOL
         { 0 },  // class_cdf is computed from class in av1_init_mv_probs()
 #endif
         { 216 },                                               // class0
         { 136, 140, 148, 160, 176, 192, 224, 234, 234, 240 },  // bits
         { { 128, 128, 64 }, { 96, 112, 64 } },                 // class0_fp
         { 64, 96, 64 },                                        // fp
-#if CONFIG_DAALA_EC
+#if CONFIG_EC_MULTISYMBOL
         { { 0 }, { 0 } },  // class0_fp_cdf is computed in av1_init_mv_probs()
         { 0 },             // fp_cdf is computed from fp in av1_init_mv_probs()
 #endif
@@ -68,14 +68,14 @@
         // Horizontal component
         128,                                                   // sign
         { 216, 128, 176, 160, 176, 176, 192, 198, 198, 208 },  // class
-#if CONFIG_DAALA_EC
+#if CONFIG_EC_MULTISYMBOL
         { 0 },  // class_cdf is computed from class in av1_init_mv_probs()
 #endif
         { 208 },                                               // class0
         { 136, 140, 148, 160, 176, 192, 224, 234, 234, 240 },  // bits
         { { 128, 128, 64 }, { 96, 112, 64 } },                 // class0_fp
         { 64, 96, 64 },                                        // fp
-#if CONFIG_DAALA_EC
+#if CONFIG_EC_MULTISYMBOL
         { { 0 }, { 0 } },  // class0_fp_cdf is computed in av1_init_mv_probs()
         { 0 },             // fp_cdf is computed from fp in av1_init_mv_probs()
 #endif
@@ -149,13 +149,6 @@
   return c;
 }
 
-// TODO(jingning): This idle function is intentionally left as is for
-// experimental purpose.
-int av1_use_mv_hp(const MV *ref) {
-  (void)ref;
-  return 1;
-}
-
 static void inc_mv_component(int v, nmv_component_counts *comp_counts, int incr,
                              int usehp) {
   int s, z, c, o, d, e, f;
@@ -273,28 +266,33 @@
 #endif
 }
 
+#if CONFIG_EC_MULTISYMBOL
+void av1_set_mv_cdfs(nmv_context *ctx) {
+  int i;
+  int j;
+  av1_tree_to_cdf(av1_mv_joint_tree, ctx->joints, ctx->joint_cdf);
+
+  for (i = 0; i < 2; ++i) {
+    nmv_component *const comp_ctx = &ctx->comps[i];
+    av1_tree_to_cdf(av1_mv_class_tree, comp_ctx->classes, comp_ctx->class_cdf);
+
+    for (j = 0; j < CLASS0_SIZE; ++j) {
+      av1_tree_to_cdf(av1_mv_fp_tree, comp_ctx->class0_fp[j],
+                      comp_ctx->class0_fp_cdf[j]);
+    }
+    av1_tree_to_cdf(av1_mv_fp_tree, comp_ctx->fp, comp_ctx->fp_cdf);
+  }
+}
+#endif
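
av1_set_mv_cdfs() gives the motion-vector side the same treatment: joint, class and fractional-position CDFs are all rebuilt from their tree probabilities, replacing the inline block deleted from av1_init_mv_probs() below. A small debug-style check one might pair with it, under the assumption that these CDFs are 15-bit and terminate at 32768 (the helper name is hypothetical):

// Sanity-check sketch: a freshly built CDF must be nondecreasing and
// carry the full probability mass in its final entry.
static int cdf_is_valid_sketch(const aom_cdf_prob *cdf, int nsymbs) {
  int i;
  for (i = 1; i < nsymbs; ++i)
    if (cdf[i] < cdf[i - 1]) return 0;
  return cdf[nsymbs - 1] == 32768;
}

For example, assert(cdf_is_valid_sketch(ctx->joint_cdf, MV_JOINTS)) could follow the av1_tree_to_cdf() call on the joints.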
+
 void av1_init_mv_probs(AV1_COMMON *cm) {
 #if CONFIG_REF_MV
   int i;
   for (i = 0; i < NMV_CONTEXTS; ++i) cm->fc->nmvc[i] = default_nmv_context;
 #else
   cm->fc->nmvc = default_nmv_context;
-#if CONFIG_DAALA_EC
-  {
-    int i, j;
-    av1_tree_to_cdf(av1_mv_joint_tree, cm->fc->nmvc.joints,
-                    cm->fc->nmvc.joint_cdf);
-    for (i = 0; i < 2; i++) {
-      av1_tree_to_cdf(av1_mv_class_tree, cm->fc->nmvc.comps[i].classes,
-                      cm->fc->nmvc.comps[i].class_cdf);
-      av1_tree_to_cdf(av1_mv_fp_tree, cm->fc->nmvc.comps[i].fp,
-                      cm->fc->nmvc.comps[i].fp_cdf);
-      for (j = 0; j < CLASS0_SIZE; j++) {
-        av1_tree_to_cdf(av1_mv_fp_tree, cm->fc->nmvc.comps[i].class0_fp[j],
-                        cm->fc->nmvc.comps[i].class0_fp_cdf[j]);
-      }
-    }
-  }
+#if CONFIG_EC_MULTISYMBOL
+  av1_set_mv_cdfs(&cm->fc->nmvc);
 #endif
 #endif
 #if CONFIG_GLOBAL_MOTION
diff --git a/av1/common/entropymv.h b/av1/common/entropymv.h
index f308ef3..1ebbdb2 100644
--- a/av1/common/entropymv.h
+++ b/av1/common/entropymv.h
@@ -27,7 +27,6 @@
 void av1_init_mv_probs(struct AV1Common *cm);
 
 void av1_adapt_mv_probs(struct AV1Common *cm, int usehp);
-int av1_use_mv_hp(const MV *ref);
 
 #define MV_UPDATE_PROB 252
 
@@ -85,14 +84,14 @@
 typedef struct {
   aom_prob sign;
   aom_prob classes[MV_CLASSES - 1];
-#if CONFIG_DAALA_EC
+#if CONFIG_EC_MULTISYMBOL
   aom_cdf_prob class_cdf[MV_CLASSES];
 #endif
   aom_prob class0[CLASS0_SIZE - 1];
   aom_prob bits[MV_OFFSET_BITS];
   aom_prob class0_fp[CLASS0_SIZE][MV_FP_SIZE - 1];
   aom_prob fp[MV_FP_SIZE - 1];
-#if CONFIG_DAALA_EC
+#if CONFIG_EC_MULTISYMBOL
   aom_cdf_prob class0_fp_cdf[CLASS0_SIZE][MV_FP_SIZE];
   aom_cdf_prob fp_cdf[MV_FP_SIZE];
 #endif
@@ -102,7 +101,7 @@
 
 typedef struct {
   aom_prob joints[MV_JOINTS - 1];
-#if CONFIG_DAALA_EC
+#if CONFIG_EC_MULTISYMBOL
   aom_cdf_prob joint_cdf[MV_JOINTS];
 #endif
   nmv_component comps[2];
@@ -135,11 +134,13 @@
 } nmv_context_counts;
 
 void av1_inc_mv(const MV *mv, nmv_context_counts *mvctx, const int usehp);
-
 #if CONFIG_GLOBAL_MOTION
 extern const aom_tree_index
     av1_global_motion_types_tree[TREE_SIZE(GLOBAL_MOTION_TYPES)];
 #endif  // CONFIG_GLOBAL_MOTION
+#if CONFIG_EC_MULTISYMBOL
+void av1_set_mv_cdfs(nmv_context *ctx);
+#endif
 
 #ifdef __cplusplus
 }  // extern "C"
diff --git a/av1/common/enums.h b/av1/common/enums.h
index 701d4b9..2ec83ec 100644
--- a/av1/common/enums.h
+++ b/av1/common/enums.h
@@ -131,18 +131,19 @@
 
 // block transform size
 typedef enum ATTRIBUTE_PACKED {
-  TX_4X4,    // 4x4 transform
-  TX_8X8,    // 8x8 transform
-  TX_16X16,  // 16x16 transform
-  TX_32X32,  // 32x32 transform
-#if CONFIG_EXT_TX
+#if CONFIG_CB4X4
+  TX_2X2,  // 2x2 transform
+#endif
+  TX_4X4,                   // 4x4 transform
+  TX_8X8,                   // 8x8 transform
+  TX_16X16,                 // 16x16 transform
+  TX_32X32,                 // 32x32 transform
   TX_4X8,                   // 4x8 transform
   TX_8X4,                   // 8x4 transform
   TX_8X16,                  // 8x16 transform
   TX_16X8,                  // 16x8 transform
   TX_16X32,                 // 16x32 transform
   TX_32X16,                 // 32x16 transform
-#endif                      // CONFIG_EXT_TX
   TX_SIZES_ALL,             // Includes rectangular transforms
   TX_SIZES = TX_32X32 + 1,  // Does NOT include rectangular transforms
   TX_INVALID = 255          // Invalid transform size
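
The reshuffled enum makes the two counts explicit: TX_SIZES (= TX_32X32 + 1) stops just before the rectangular members, which now exist unconditionally, while TX_SIZES_ALL counts everything. Square-only tables (such as the loopfilter masks later in this patch) are sized TX_SIZES; lookups that must also cover rectangles are sized TX_SIZES_ALL, as in this illustrative width table (the real tx_size_wide/tx_size_wide_unit lookups live elsewhere in the tree):

static const int tx_size_wide_sketch[TX_SIZES_ALL] = {
#if CONFIG_CB4X4
  2,                    // TX_2X2
#endif
  4, 8, 16, 32,         // square: TX_4X4 .. TX_32X32
  4, 8, 8, 16, 16, 32,  // rect: 4x8, 8x4, 8x16, 16x8, 16x32, 32x16
};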
@@ -208,8 +209,12 @@
 #define EXT_TX_SETS_INTER 4  // Sets of transform selections for INTER
 #define EXT_TX_SETS_INTRA 3  // Sets of transform selections for INTRA
 #else
+#if CONFIG_CB4X4
+#define EXT_TX_SIZES 4  // number of sizes that use extended transforms
+#else
 #define EXT_TX_SIZES 3  // number of sizes that use extended transforms
-#endif                  // CONFIG_EXT_TX
+#endif
+#endif  // CONFIG_EXT_TX
 
 typedef enum {
   AOM_LAST_FLAG = 1 << 0,
@@ -401,8 +406,8 @@
 #define REF_CONTEXTS 5
 
 #if CONFIG_VAR_TX
-#define TXFM_PARTITION_CONTEXTS 9
-typedef TX_SIZE TXFM_CONTEXT;
+#define TXFM_PARTITION_CONTEXTS 16
+typedef uint8_t TXFM_CONTEXT;
 #endif
 
 #define NONE -1
diff --git a/av1/common/filter.h b/av1/common/filter.h
index 15fc806..eb39a7f 100644
--- a/av1/common/filter.h
+++ b/av1/common/filter.h
@@ -38,8 +38,8 @@
 #else
 #define SWITCHABLE_FILTERS 3 /* Number of switchable filters */
 #define LOG_SWITCHABLE_FILTERS \
-  2 /* (1 << LOG_SWITCHABLE_FILTERS) > SWITCHABLE_FILTERS */
-#endif                       // CONFIG_EXT_INTERP
+  2     /* (1 << LOG_SWITCHABLE_FILTERS) > SWITCHABLE_FILTERS */
+#endif  // CONFIG_EXT_INTERP
 
 #define USE_TEMPORALFILTER_12TAP 1
 #if USE_TEMPORALFILTER_12TAP
diff --git a/av1/common/idct.c b/av1/common/idct.c
index 4f33f9b..b5e3742 100644
--- a/av1/common/idct.c
+++ b/av1/common/idct.c
@@ -58,6 +58,7 @@
   int i;
   for (i = 0; i < 32; ++i) output[i] = input[i] * 4;
 }
+#endif  // CONFIG_EXT_TX
 
 // For use in lieu of ADST
 static void ihalfright32_c(const tran_low_t *input, tran_low_t *output) {
@@ -70,11 +71,53 @@
   for (i = 0; i < 16; ++i) {
     output[i] = input[16 + i] * 4;
   }
-  idct16_c(inputhalf, output + 16);
+  aom_idct16_c(inputhalf, output + 16);
   // Note overall scaling factor is 4 times orthogonal
 }
 
+#if CONFIG_TX64X64
+static void idct64_col_c(const tran_low_t *input, tran_low_t *output) {
+  int32_t in[64], out[64];
+  int i;
+  for (i = 0; i < 64; ++i) in[i] = (int32_t)input[i];
+  av1_idct64_new(in, out, inv_cos_bit_col_dct_dct_64,
+                 inv_stage_range_col_dct_dct_64);
+  for (i = 0; i < 64; ++i) output[i] = (tran_low_t)out[i];
+}
+
+static void idct64_row_c(const tran_low_t *input, tran_low_t *output) {
+  int32_t in[64], out[64];
+  int i;
+  for (i = 0; i < 64; ++i) in[i] = (int32_t)input[i];
+  av1_idct64_new(in, out, inv_cos_bit_row_dct_dct_64,
+                 inv_stage_range_row_dct_dct_64);
+  for (i = 0; i < 64; ++i) output[i] = (tran_low_t)out[i];
+}
+
+static void iidtx64_c(const tran_low_t *input, tran_low_t *output) {
+  int i;
+  for (i = 0; i < 64; ++i)
+    output[i] = (tran_low_t)dct_const_round_shift(input[i] * 4 * Sqrt2);
+}
+
+// For use in lieu of ADST
+static void ihalfright64_c(const tran_low_t *input, tran_low_t *output) {
+  int i;
+  tran_low_t inputhalf[32];
+  // Multiply input by sqrt(2)
+  for (i = 0; i < 32; ++i) {
+    inputhalf[i] = (tran_low_t)dct_const_round_shift(input[i] * Sqrt2);
+  }
+  for (i = 0; i < 32; ++i) {
+    output[i] = (tran_low_t)dct_const_round_shift(input[32 + i] * 4 * Sqrt2);
+  }
+  aom_idct32_c(inputhalf, output + 32);
+  // Note overall scaling factor is 4 * sqrt(2) times orthogonal
+}
+#endif  // CONFIG_TX64X64
+
 #if CONFIG_AOM_HIGHBITDEPTH
+#if CONFIG_EXT_TX
 static void highbd_iidtx4_c(const tran_low_t *input, tran_low_t *output,
                             int bd) {
   int i;
@@ -120,9 +163,61 @@
   aom_highbd_idct16_c(inputhalf, output + 16, bd);
   // Note overall scaling factor is 4 times orthogonal
 }
+
+#if CONFIG_TX64X64
+static void highbd_iidtx64_c(const tran_low_t *input, tran_low_t *output,
+                             int bd) {
+  int i;
+  for (i = 0; i < 64; ++i)
+    output[i] =
+        HIGHBD_WRAPLOW(highbd_dct_const_round_shift(input[i] * 4 * Sqrt2), bd);
+}
+
+// For use in lieu of ADST
+static void highbd_ihalfright64_c(const tran_low_t *input, tran_low_t *output,
+                                  int bd) {
+  int i;
+  tran_low_t inputhalf[32];
+  // Multiply input by sqrt(2)
+  for (i = 0; i < 32; ++i) {
+    inputhalf[i] =
+        HIGHBD_WRAPLOW(highbd_dct_const_round_shift(input[i] * Sqrt2), bd);
+  }
+  for (i = 0; i < 32; ++i) {
+    output[i] = HIGHBD_WRAPLOW(
+        highbd_dct_const_round_shift(input[32 + i] * 4 * Sqrt2), bd);
+  }
+  aom_highbd_idct32_c(inputhalf, output + 32, bd);
+  // Note overall scaling factor is 4 * sqrt(2) times orthogonal
+}
+
+static void highbd_idct64_col_c(const tran_low_t *input, tran_low_t *output,
+                                int bd) {
+  int32_t in[64], out[64];
+  int i;
+  (void)bd;
+  for (i = 0; i < 64; ++i) in[i] = (int32_t)input[i];
+  av1_idct64_new(in, out, inv_cos_bit_col_dct_dct_64,
+                 inv_stage_range_col_dct_dct_64);
+  for (i = 0; i < 64; ++i) output[i] = (tran_low_t)out[i];
+}
+
+static void highbd_idct64_row_c(const tran_low_t *input, tran_low_t *output,
+                                int bd) {
+  int32_t in[64], out[64];
+  int i;
+  (void)bd;
+  for (i = 0; i < 64; ++i) in[i] = (int32_t)input[i];
+  av1_idct64_new(in, out, inv_cos_bit_row_dct_dct_64,
+                 inv_stage_range_row_dct_dct_64);
+  for (i = 0; i < 64; ++i) output[i] = (tran_low_t)out[i];
+}
+#endif  // CONFIG_TX64X64
+#endif  // CONFIG_EXT_TX
 #endif  // CONFIG_AOM_HIGHBITDEPTH
 
 // Inverse identity transform and add.
+#if CONFIG_EXT_TX
 static void inv_idtx_add_c(const tran_low_t *input, uint8_t *dest, int stride,
                            int bs, int tx_type) {
   int r, c;
@@ -136,6 +231,7 @@
     }
   }
 }
+#endif  // CONFIG_EXT_TX
 
 #define FLIPUD_PTR(dest, stride, size)       \
   do {                                       \
@@ -143,6 +239,7 @@
     (stride) = -(stride);                    \
   } while (0)
 
+#if CONFIG_EXT_TX
 static void maybe_flip_strides(uint8_t **dst, int *dstride, tran_low_t **src,
                                int *sstride, int tx_type, int sizey,
                                int sizex) {
@@ -180,8 +277,10 @@
     default: assert(0); break;
   }
 }
+#endif  // CONFIG_EXT_TX
 
 #if CONFIG_AOM_HIGHBITDEPTH
+#if CONFIG_EXT_TX
 static void highbd_inv_idtx_add_c(const tran_low_t *input, uint8_t *dest8,
                                   int stride, int bs, int tx_type, int bd) {
   int r, c;
@@ -235,30 +334,30 @@
     default: assert(0); break;
   }
 }
-#endif  // CONFIG_AOM_HIGHBITDEPTH
 #endif  // CONFIG_EXT_TX
+#endif  // CONFIG_AOM_HIGHBITDEPTH
 
 void av1_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride,
                          int tx_type) {
   static const transform_2d IHT_4[] = {
-    { idct4_c, idct4_c },    // DCT_DCT
-    { iadst4_c, idct4_c },   // ADST_DCT
-    { idct4_c, iadst4_c },   // DCT_ADST
-    { iadst4_c, iadst4_c },  // ADST_ADST
+    { aom_idct4_c, aom_idct4_c },    // DCT_DCT  = 0
+    { aom_iadst4_c, aom_idct4_c },   // ADST_DCT = 1
+    { aom_idct4_c, aom_iadst4_c },   // DCT_ADST = 2
+    { aom_iadst4_c, aom_iadst4_c },  // ADST_ADST = 3
 #if CONFIG_EXT_TX
-    { iadst4_c, idct4_c },   // FLIPADST_DCT
-    { idct4_c, iadst4_c },   // DCT_FLIPADST
-    { iadst4_c, iadst4_c },  // FLIPADST_FLIPADST
-    { iadst4_c, iadst4_c },  // ADST_FLIPADST
-    { iadst4_c, iadst4_c },  // FLIPADST_ADST
-    { iidtx4_c, iidtx4_c },  // IDTX
-    { idct4_c, iidtx4_c },   // V_DCT
-    { iidtx4_c, idct4_c },   // H_DCT
-    { iadst4_c, iidtx4_c },  // V_ADST
-    { iidtx4_c, iadst4_c },  // H_ADST
-    { iadst4_c, iidtx4_c },  // V_FLIPADST
-    { iidtx4_c, iadst4_c },  // H_FLIPADST
-#endif                       // CONFIG_EXT_TX
+    { aom_iadst4_c, aom_idct4_c },   // FLIPADST_DCT
+    { aom_idct4_c, aom_iadst4_c },   // DCT_FLIPADST
+    { aom_iadst4_c, aom_iadst4_c },  // FLIPADST_FLIPADST
+    { aom_iadst4_c, aom_iadst4_c },  // ADST_FLIPADST
+    { aom_iadst4_c, aom_iadst4_c },  // FLIPADST_ADST
+    { iidtx4_c, iidtx4_c },          // IDTX
+    { aom_idct4_c, iidtx4_c },       // V_DCT
+    { iidtx4_c, aom_idct4_c },       // H_DCT
+    { aom_iadst4_c, iidtx4_c },      // V_ADST
+    { iidtx4_c, aom_iadst4_c },      // H_ADST
+    { aom_iadst4_c, iidtx4_c },      // V_FLIPADST
+    { iidtx4_c, aom_iadst4_c },      // H_FLIPADST
+#endif                               // CONFIG_EXT_TX
   };
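
Tables like IHT_4 pair a row kernel with a column kernel per tx_type; note that the FLIPADST variants reuse the plain iadst kernels, with the flip applied afterwards to the destination pointer via maybe_flip_strides(). A sketch of how such a table is consumed, following the rows / transpose / columns order the functions below spell out (tran_low_t as in aom_dsp; names with _sketch are illustrative):

typedef void (*itx_kernel_sketch)(const tran_low_t *input,
                                  tran_low_t *output);
typedef struct {
  itx_kernel_sketch cols, rows;
} transform_2d_sketch;

static void apply_2d_sketch(const transform_2d_sketch *t,
                            const tran_low_t *in, tran_low_t out[4][4]) {
  tran_low_t tmp[4][4];
  int i, j;
  for (i = 0; i < 4; ++i) t->rows(in + 4 * i, tmp[i]);  // row pass
  for (i = 0; i < 4; ++i)                               // transpose
    for (j = 0; j < 4; ++j) out[i][j] = tmp[j][i];
  for (i = 0; i < 4; ++i) t->cols(out[i], out[i]);      // column pass
}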
 
   int i, j;
@@ -301,26 +400,27 @@
   }
 }
 
-#if CONFIG_EXT_TX
 void av1_iht4x8_32_add_c(const tran_low_t *input, uint8_t *dest, int stride,
                          int tx_type) {
   static const transform_2d IHT_4x8[] = {
-    { idct8_c, idct4_c },    // DCT_DCT
-    { iadst8_c, idct4_c },   // ADST_DCT
-    { idct8_c, iadst4_c },   // DCT_ADST
-    { iadst8_c, iadst4_c },  // ADST_ADST
-    { iadst8_c, idct4_c },   // FLIPADST_DCT
-    { idct8_c, iadst4_c },   // DCT_FLIPADST
-    { iadst8_c, iadst4_c },  // FLIPADST_FLIPADST
-    { iadst8_c, iadst4_c },  // ADST_FLIPADST
-    { iadst8_c, iadst4_c },  // FLIPADST_ADST
-    { iidtx8_c, iidtx4_c },  // IDTX
-    { idct8_c, iidtx4_c },   // V_DCT
-    { iidtx8_c, idct4_c },   // H_DCT
-    { iadst8_c, iidtx4_c },  // V_ADST
-    { iidtx8_c, iadst4_c },  // H_ADST
-    { iadst8_c, iidtx4_c },  // V_FLIPADST
-    { iidtx8_c, iadst4_c },  // H_FLIPADST
+    { aom_idct8_c, aom_idct4_c },    // DCT_DCT
+    { aom_iadst8_c, aom_idct4_c },   // ADST_DCT
+    { aom_idct8_c, aom_iadst4_c },   // DCT_ADST
+    { aom_iadst8_c, aom_iadst4_c },  // ADST_ADST
+#if CONFIG_EXT_TX
+    { aom_iadst8_c, aom_idct4_c },   // FLIPADST_DCT
+    { aom_idct8_c, aom_iadst4_c },   // DCT_FLIPADST
+    { aom_iadst8_c, aom_iadst4_c },  // FLIPADST_FLIPADST
+    { aom_iadst8_c, aom_iadst4_c },  // ADST_FLIPADST
+    { aom_iadst8_c, aom_iadst4_c },  // FLIPADST_ADST
+    { iidtx8_c, iidtx4_c },          // IDTX
+    { aom_idct8_c, iidtx4_c },       // V_DCT
+    { iidtx8_c, aom_idct4_c },       // H_DCT
+    { aom_iadst8_c, iidtx4_c },      // V_ADST
+    { iidtx8_c, aom_iadst4_c },      // H_ADST
+    { aom_iadst8_c, iidtx4_c },      // V_FLIPADST
+    { iidtx8_c, aom_iadst4_c },      // H_FLIPADST
+#endif
   };
 
   const int n = 4;
@@ -343,7 +443,9 @@
     IHT_4x8[tx_type].cols(out[i], out[i]);
   }
 
+#if CONFIG_EXT_TX
   maybe_flip_strides(&dest, &stride, &outp, &outstride, tx_type, n2, n);
+#endif
 
   // Sum with the destination
   for (i = 0; i < n2; ++i) {
@@ -358,22 +460,24 @@
 void av1_iht8x4_32_add_c(const tran_low_t *input, uint8_t *dest, int stride,
                          int tx_type) {
   static const transform_2d IHT_8x4[] = {
-    { idct4_c, idct8_c },    // DCT_DCT
-    { iadst4_c, idct8_c },   // ADST_DCT
-    { idct4_c, iadst8_c },   // DCT_ADST
-    { iadst4_c, iadst8_c },  // ADST_ADST
-    { iadst4_c, idct8_c },   // FLIPADST_DCT
-    { idct4_c, iadst8_c },   // DCT_FLIPADST
-    { iadst4_c, iadst8_c },  // FLIPADST_FLIPADST
-    { iadst4_c, iadst8_c },  // ADST_FLIPADST
-    { iadst4_c, iadst8_c },  // FLIPADST_ADST
-    { iidtx4_c, iidtx8_c },  // IDTX
-    { idct4_c, iidtx8_c },   // V_DCT
-    { iidtx4_c, idct8_c },   // H_DCT
-    { iadst4_c, iidtx8_c },  // V_ADST
-    { iidtx4_c, iadst8_c },  // H_ADST
-    { iadst4_c, iidtx8_c },  // V_FLIPADST
-    { iidtx4_c, iadst8_c },  // H_FLIPADST
+    { aom_idct4_c, aom_idct8_c },    // DCT_DCT
+    { aom_iadst4_c, aom_idct8_c },   // ADST_DCT
+    { aom_idct4_c, aom_iadst8_c },   // DCT_ADST
+    { aom_iadst4_c, aom_iadst8_c },  // ADST_ADST
+#if CONFIG_EXT_TX
+    { aom_iadst4_c, aom_idct8_c },   // FLIPADST_DCT
+    { aom_idct4_c, aom_iadst8_c },   // DCT_FLIPADST
+    { aom_iadst4_c, aom_iadst8_c },  // FLIPADST_FLIPADST
+    { aom_iadst4_c, aom_iadst8_c },  // ADST_FLIPADST
+    { aom_iadst4_c, aom_iadst8_c },  // FLIPADST_ADST
+    { iidtx4_c, iidtx8_c },          // IDTX
+    { aom_idct4_c, iidtx8_c },       // V_DCT
+    { iidtx4_c, aom_idct8_c },       // H_DCT
+    { aom_iadst4_c, iidtx8_c },      // V_ADST
+    { iidtx4_c, aom_iadst8_c },      // H_ADST
+    { aom_iadst4_c, iidtx8_c },      // V_FLIPADST
+    { iidtx4_c, aom_iadst8_c },      // H_FLIPADST
+#endif
   };
   const int n = 4;
   const int n2 = 8;
@@ -396,7 +500,9 @@
     IHT_8x4[tx_type].cols(out[i], out[i]);
   }
 
+#if CONFIG_EXT_TX
   maybe_flip_strides(&dest, &stride, &outp, &outstride, tx_type, n, n2);
+#endif
 
   // Sum with the destination
   for (i = 0; i < n; ++i) {
@@ -411,22 +517,24 @@
 void av1_iht8x16_128_add_c(const tran_low_t *input, uint8_t *dest, int stride,
                            int tx_type) {
   static const transform_2d IHT_8x16[] = {
-    { idct16_c, idct8_c },    // DCT_DCT
-    { iadst16_c, idct8_c },   // ADST_DCT
-    { idct16_c, iadst8_c },   // DCT_ADST
-    { iadst16_c, iadst8_c },  // ADST_ADST
-    { iadst16_c, idct8_c },   // FLIPADST_DCT
-    { idct16_c, iadst8_c },   // DCT_FLIPADST
-    { iadst16_c, iadst8_c },  // FLIPADST_FLIPADST
-    { iadst16_c, iadst8_c },  // ADST_FLIPADST
-    { iadst16_c, iadst8_c },  // FLIPADST_ADST
-    { iidtx16_c, iidtx8_c },  // IDTX
-    { idct16_c, iidtx8_c },   // V_DCT
-    { iidtx16_c, idct8_c },   // H_DCT
-    { iadst16_c, iidtx8_c },  // V_ADST
-    { iidtx16_c, iadst8_c },  // H_ADST
-    { iadst16_c, iidtx8_c },  // V_FLIPADST
-    { iidtx16_c, iadst8_c },  // H_FLIPADST
+    { aom_idct16_c, aom_idct8_c },    // DCT_DCT
+    { aom_iadst16_c, aom_idct8_c },   // ADST_DCT
+    { aom_idct16_c, aom_iadst8_c },   // DCT_ADST
+    { aom_iadst16_c, aom_iadst8_c },  // ADST_ADST
+#if CONFIG_EXT_TX
+    { aom_iadst16_c, aom_idct8_c },   // FLIPADST_DCT
+    { aom_idct16_c, aom_iadst8_c },   // DCT_FLIPADST
+    { aom_iadst16_c, aom_iadst8_c },  // FLIPADST_FLIPADST
+    { aom_iadst16_c, aom_iadst8_c },  // ADST_FLIPADST
+    { aom_iadst16_c, aom_iadst8_c },  // FLIPADST_ADST
+    { iidtx16_c, iidtx8_c },          // IDTX
+    { aom_idct16_c, iidtx8_c },       // V_DCT
+    { iidtx16_c, aom_idct8_c },       // H_DCT
+    { aom_iadst16_c, iidtx8_c },      // V_ADST
+    { iidtx16_c, aom_iadst8_c },      // H_ADST
+    { aom_iadst16_c, iidtx8_c },      // V_FLIPADST
+    { iidtx16_c, aom_iadst8_c },      // H_FLIPADST
+#endif
   };
 
   const int n = 8;
@@ -449,7 +557,9 @@
     IHT_8x16[tx_type].cols(out[i], out[i]);
   }
 
+#if CONFIG_EXT_TX
   maybe_flip_strides(&dest, &stride, &outp, &outstride, tx_type, n2, n);
+#endif
 
   // Sum with the destination
   for (i = 0; i < n2; ++i) {
@@ -464,22 +574,24 @@
 void av1_iht16x8_128_add_c(const tran_low_t *input, uint8_t *dest, int stride,
                            int tx_type) {
   static const transform_2d IHT_16x8[] = {
-    { idct8_c, idct16_c },    // DCT_DCT
-    { iadst8_c, idct16_c },   // ADST_DCT
-    { idct8_c, iadst16_c },   // DCT_ADST
-    { iadst8_c, iadst16_c },  // ADST_ADST
-    { iadst8_c, idct16_c },   // FLIPADST_DCT
-    { idct8_c, iadst16_c },   // DCT_FLIPADST
-    { iadst8_c, iadst16_c },  // FLIPADST_FLIPADST
-    { iadst8_c, iadst16_c },  // ADST_FLIPADST
-    { iadst8_c, iadst16_c },  // FLIPADST_ADST
-    { iidtx8_c, iidtx16_c },  // IDTX
-    { idct8_c, iidtx16_c },   // V_DCT
-    { iidtx8_c, idct16_c },   // H_DCT
-    { iadst8_c, iidtx16_c },  // V_ADST
-    { iidtx8_c, iadst16_c },  // H_ADST
-    { iadst8_c, iidtx16_c },  // V_FLIPADST
-    { iidtx8_c, iadst16_c },  // H_FLIPADST
+    { aom_idct8_c, aom_idct16_c },    // DCT_DCT
+    { aom_iadst8_c, aom_idct16_c },   // ADST_DCT
+    { aom_idct8_c, aom_iadst16_c },   // DCT_ADST
+    { aom_iadst8_c, aom_iadst16_c },  // ADST_ADST
+#if CONFIG_EXT_TX
+    { aom_iadst8_c, aom_idct16_c },   // FLIPADST_DCT
+    { aom_idct8_c, aom_iadst16_c },   // DCT_FLIPADST
+    { aom_iadst8_c, aom_iadst16_c },  // FLIPADST_FLIPADST
+    { aom_iadst8_c, aom_iadst16_c },  // ADST_FLIPADST
+    { aom_iadst8_c, aom_iadst16_c },  // FLIPADST_ADST
+    { iidtx8_c, iidtx16_c },          // IDTX
+    { aom_idct8_c, iidtx16_c },       // V_DCT
+    { iidtx8_c, aom_idct16_c },       // H_DCT
+    { aom_iadst8_c, iidtx16_c },      // V_ADST
+    { iidtx8_c, aom_iadst16_c },      // H_ADST
+    { aom_iadst8_c, iidtx16_c },      // V_FLIPADST
+    { iidtx8_c, aom_iadst16_c },      // H_FLIPADST
+#endif
   };
   const int n = 8;
   const int n2 = 16;
@@ -502,7 +614,9 @@
     IHT_16x8[tx_type].cols(out[i], out[i]);
   }
 
+#if CONFIG_EXT_TX
   maybe_flip_strides(&dest, &stride, &outp, &outstride, tx_type, n, n2);
+#endif
 
   // Sum with the destination
   for (i = 0; i < n; ++i) {
@@ -517,22 +631,24 @@
 void av1_iht16x32_512_add_c(const tran_low_t *input, uint8_t *dest, int stride,
                             int tx_type) {
   static const transform_2d IHT_16x32[] = {
-    { idct32_c, idct16_c },         // DCT_DCT
-    { ihalfright32_c, idct16_c },   // ADST_DCT
-    { idct32_c, iadst16_c },        // DCT_ADST
-    { ihalfright32_c, iadst16_c },  // ADST_ADST
-    { ihalfright32_c, idct16_c },   // FLIPADST_DCT
-    { idct32_c, iadst16_c },        // DCT_FLIPADST
-    { ihalfright32_c, iadst16_c },  // FLIPADST_FLIPADST
-    { ihalfright32_c, iadst16_c },  // ADST_FLIPADST
-    { ihalfright32_c, iadst16_c },  // FLIPADST_ADST
-    { iidtx32_c, iidtx16_c },       // IDTX
-    { idct32_c, iidtx16_c },        // V_DCT
-    { iidtx32_c, idct16_c },        // H_DCT
-    { ihalfright32_c, iidtx16_c },  // V_ADST
-    { iidtx32_c, iadst16_c },       // H_ADST
-    { ihalfright32_c, iidtx16_c },  // V_FLIPADST
-    { iidtx32_c, iadst16_c },       // H_FLIPADST
+    { aom_idct32_c, aom_idct16_c },     // DCT_DCT
+    { ihalfright32_c, aom_idct16_c },   // ADST_DCT
+    { aom_idct32_c, aom_iadst16_c },    // DCT_ADST
+    { ihalfright32_c, aom_iadst16_c },  // ADST_ADST
+#if CONFIG_EXT_TX
+    { ihalfright32_c, aom_idct16_c },   // FLIPADST_DCT
+    { aom_idct32_c, aom_iadst16_c },    // DCT_FLIPADST
+    { ihalfright32_c, aom_iadst16_c },  // FLIPADST_FLIPADST
+    { ihalfright32_c, aom_iadst16_c },  // ADST_FLIPADST
+    { ihalfright32_c, aom_iadst16_c },  // FLIPADST_ADST
+    { iidtx32_c, iidtx16_c },           // IDTX
+    { aom_idct32_c, iidtx16_c },        // V_DCT
+    { iidtx32_c, aom_idct16_c },        // H_DCT
+    { ihalfright32_c, iidtx16_c },      // V_ADST
+    { iidtx32_c, aom_iadst16_c },       // H_ADST
+    { ihalfright32_c, iidtx16_c },      // V_FLIPADST
+    { iidtx32_c, aom_iadst16_c },       // H_FLIPADST
+#endif
   };
 
   const int n = 16;
@@ -555,7 +671,9 @@
     IHT_16x32[tx_type].cols(out[i], out[i]);
   }
 
+#if CONFIG_EXT_TX
   maybe_flip_strides(&dest, &stride, &outp, &outstride, tx_type, n2, n);
+#endif
 
   // Sum with the destination
   for (i = 0; i < n2; ++i) {
@@ -570,22 +688,24 @@
 void av1_iht32x16_512_add_c(const tran_low_t *input, uint8_t *dest, int stride,
                             int tx_type) {
   static const transform_2d IHT_32x16[] = {
-    { idct16_c, idct32_c },         // DCT_DCT
-    { iadst16_c, idct32_c },        // ADST_DCT
-    { idct16_c, ihalfright32_c },   // DCT_ADST
-    { iadst16_c, ihalfright32_c },  // ADST_ADST
-    { iadst16_c, idct32_c },        // FLIPADST_DCT
-    { idct16_c, ihalfright32_c },   // DCT_FLIPADST
-    { iadst16_c, ihalfright32_c },  // FLIPADST_FLIPADST
-    { iadst16_c, ihalfright32_c },  // ADST_FLIPADST
-    { iadst16_c, ihalfright32_c },  // FLIPADST_ADST
-    { iidtx16_c, iidtx32_c },       // IDTX
-    { idct16_c, iidtx32_c },        // V_DCT
-    { iidtx16_c, idct32_c },        // H_DCT
-    { iadst16_c, iidtx32_c },       // V_ADST
-    { iidtx16_c, ihalfright32_c },  // H_ADST
-    { iadst16_c, iidtx32_c },       // V_FLIPADST
-    { iidtx16_c, ihalfright32_c },  // H_FLIPADST
+    { aom_idct16_c, aom_idct32_c },     // DCT_DCT
+    { aom_iadst16_c, aom_idct32_c },    // ADST_DCT
+    { aom_idct16_c, ihalfright32_c },   // DCT_ADST
+    { aom_iadst16_c, ihalfright32_c },  // ADST_ADST
+#if CONFIG_EXT_TX
+    { aom_iadst16_c, aom_idct32_c },    // FLIPADST_DCT
+    { aom_idct16_c, ihalfright32_c },   // DCT_FLIPADST
+    { aom_iadst16_c, ihalfright32_c },  // FLIPADST_FLIPADST
+    { aom_iadst16_c, ihalfright32_c },  // ADST_FLIPADST
+    { aom_iadst16_c, ihalfright32_c },  // FLIPADST_ADST
+    { iidtx16_c, iidtx32_c },           // IDTX
+    { aom_idct16_c, iidtx32_c },        // V_DCT
+    { iidtx16_c, aom_idct32_c },        // H_DCT
+    { aom_iadst16_c, iidtx32_c },       // V_ADST
+    { iidtx16_c, ihalfright32_c },      // H_ADST
+    { aom_iadst16_c, iidtx32_c },       // V_FLIPADST
+    { iidtx16_c, ihalfright32_c },      // H_FLIPADST
+#endif
   };
   const int n = 16;
   const int n2 = 32;
@@ -608,7 +728,9 @@
     IHT_32x16[tx_type].cols(out[i], out[i]);
   }
 
+#if CONFIG_EXT_TX
   maybe_flip_strides(&dest, &stride, &outp, &outstride, tx_type, n, n2);
+#endif
 
   // Sum with the destination
   for (i = 0; i < n; ++i) {
@@ -619,29 +741,28 @@
     }
   }
 }
-#endif  // CONFIG_EXT_TX
 
 void av1_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride,
                          int tx_type) {
   static const transform_2d IHT_8[] = {
-    { idct8_c, idct8_c },    // DCT_DCT
-    { iadst8_c, idct8_c },   // ADST_DCT
-    { idct8_c, iadst8_c },   // DCT_ADST
-    { iadst8_c, iadst8_c },  // ADST_ADST
+    { aom_idct8_c, aom_idct8_c },    // DCT_DCT  = 0
+    { aom_iadst8_c, aom_idct8_c },   // ADST_DCT = 1
+    { aom_idct8_c, aom_iadst8_c },   // DCT_ADST = 2
+    { aom_iadst8_c, aom_iadst8_c },  // ADST_ADST = 3
 #if CONFIG_EXT_TX
-    { iadst8_c, idct8_c },   // FLIPADST_DCT
-    { idct8_c, iadst8_c },   // DCT_FLIPADST
-    { iadst8_c, iadst8_c },  // FLIPADST_FLIPADST
-    { iadst8_c, iadst8_c },  // ADST_FLIPADST
-    { iadst8_c, iadst8_c },  // FLIPADST_ADST
-    { iidtx8_c, iidtx8_c },  // IDTX
-    { idct8_c, iidtx8_c },   // V_DCT
-    { iidtx8_c, idct8_c },   // H_DCT
-    { iadst8_c, iidtx8_c },  // V_ADST
-    { iidtx8_c, iadst8_c },  // H_ADST
-    { iadst8_c, iidtx8_c },  // V_FLIPADST
-    { iidtx8_c, iadst8_c },  // H_FLIPADST
-#endif                       // CONFIG_EXT_TX
+    { aom_iadst8_c, aom_idct8_c },   // FLIPADST_DCT
+    { aom_idct8_c, aom_iadst8_c },   // DCT_FLIPADST
+    { aom_iadst8_c, aom_iadst8_c },  // FLIPADST_FLIPADST
+    { aom_iadst8_c, aom_iadst8_c },  // ADST_FLIPADST
+    { aom_iadst8_c, aom_iadst8_c },  // FLIPADST_ADST
+    { iidtx8_c, iidtx8_c },          // IDTX
+    { aom_idct8_c, iidtx8_c },       // V_DCT
+    { iidtx8_c, aom_idct8_c },       // H_DCT
+    { aom_iadst8_c, iidtx8_c },      // V_ADST
+    { iidtx8_c, aom_iadst8_c },      // H_ADST
+    { aom_iadst8_c, iidtx8_c },      // V_FLIPADST
+    { iidtx8_c, aom_iadst8_c },      // H_FLIPADST
+#endif                               // CONFIG_EXT_TX
   };
 
   int i, j;
@@ -687,24 +808,24 @@
 void av1_iht16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int stride,
                             int tx_type) {
   static const transform_2d IHT_16[] = {
-    { idct16_c, idct16_c },    // DCT_DCT
-    { iadst16_c, idct16_c },   // ADST_DCT
-    { idct16_c, iadst16_c },   // DCT_ADST
-    { iadst16_c, iadst16_c },  // ADST_ADST
+    { aom_idct16_c, aom_idct16_c },    // DCT_DCT  = 0
+    { aom_iadst16_c, aom_idct16_c },   // ADST_DCT = 1
+    { aom_idct16_c, aom_iadst16_c },   // DCT_ADST = 2
+    { aom_iadst16_c, aom_iadst16_c },  // ADST_ADST = 3
 #if CONFIG_EXT_TX
-    { iadst16_c, idct16_c },   // FLIPADST_DCT
-    { idct16_c, iadst16_c },   // DCT_FLIPADST
-    { iadst16_c, iadst16_c },  // FLIPADST_FLIPADST
-    { iadst16_c, iadst16_c },  // ADST_FLIPADST
-    { iadst16_c, iadst16_c },  // FLIPADST_ADST
-    { iidtx16_c, iidtx16_c },  // IDTX
-    { idct16_c, iidtx16_c },   // V_DCT
-    { iidtx16_c, idct16_c },   // H_DCT
-    { iadst16_c, iidtx16_c },  // V_ADST
-    { iidtx16_c, iadst16_c },  // H_ADST
-    { iadst16_c, iidtx16_c },  // V_FLIPADST
-    { iidtx16_c, iadst16_c },  // H_FLIPADST
-#endif                         // CONFIG_EXT_TX
+    { aom_iadst16_c, aom_idct16_c },   // FLIPADST_DCT
+    { aom_idct16_c, aom_iadst16_c },   // DCT_FLIPADST
+    { aom_iadst16_c, aom_iadst16_c },  // FLIPADST_FLIPADST
+    { aom_iadst16_c, aom_iadst16_c },  // ADST_FLIPADST
+    { aom_iadst16_c, aom_iadst16_c },  // FLIPADST_ADST
+    { iidtx16_c, iidtx16_c },          // IDTX
+    { aom_idct16_c, iidtx16_c },       // V_DCT
+    { iidtx16_c, aom_idct16_c },       // H_DCT
+    { aom_iadst16_c, iidtx16_c },      // V_ADST
+    { iidtx16_c, aom_iadst16_c },      // H_ADST
+    { aom_iadst16_c, iidtx16_c },      // V_FLIPADST
+    { iidtx16_c, aom_iadst16_c },      // H_FLIPADST
+#endif                                 // CONFIG_EXT_TX
   };
 
   int i, j;
@@ -751,22 +872,22 @@
 void av1_iht32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, int stride,
                              int tx_type) {
   static const transform_2d IHT_32[] = {
-    { idct32_c, idct32_c },              // DCT_DCT
-    { ihalfright32_c, idct32_c },        // ADST_DCT
-    { idct32_c, ihalfright32_c },        // DCT_ADST
+    { aom_idct32_c, aom_idct32_c },      // DCT_DCT
+    { ihalfright32_c, aom_idct32_c },    // ADST_DCT
+    { aom_idct32_c, ihalfright32_c },    // DCT_ADST
     { ihalfright32_c, ihalfright32_c },  // ADST_ADST
-    { ihalfright32_c, idct32_c },        // FLIPADST_DCT
-    { idct32_c, ihalfright32_c },        // DCT_FLIPADST
+    { ihalfright32_c, aom_idct32_c },    // FLIPADST_DCT
+    { aom_idct32_c, ihalfright32_c },    // DCT_FLIPADST
     { ihalfright32_c, ihalfright32_c },  // FLIPADST_FLIPADST
     { ihalfright32_c, ihalfright32_c },  // ADST_FLIPADST
     { ihalfright32_c, ihalfright32_c },  // FLIPADST_ADST
     { iidtx32_c, iidtx32_c },            // IDTX
-    { idct32_c, iidtx32_c },             // V_DCT
-    { iidtx32_c, idct32_c },             // H_DCT
-    { ihalfright32_c, iidtx16_c },       // V_ADST
-    { iidtx16_c, ihalfright32_c },       // H_ADST
-    { ihalfright32_c, iidtx16_c },       // V_FLIPADST
-    { iidtx16_c, ihalfright32_c },       // H_FLIPADST
+    { aom_idct32_c, iidtx32_c },         // V_DCT
+    { iidtx32_c, aom_idct32_c },         // H_DCT
+    { ihalfright32_c, iidtx32_c },       // V_ADST
+    { iidtx32_c, ihalfright32_c },       // H_ADST
+    { ihalfright32_c, iidtx32_c },       // V_FLIPADST
+    { iidtx32_c, ihalfright32_c },       // H_FLIPADST
   };
 
   int i, j;
@@ -806,6 +927,68 @@
     }
   }
 }
+
+#if CONFIG_TX64X64
+void av1_iht64x64_4096_add_c(const tran_low_t *input, uint8_t *dest, int stride,
+                             int tx_type) {
+  static const transform_2d IHT_64[] = {
+    { idct64_col_c, idct64_row_c },      // DCT_DCT
+    { ihalfright64_c, idct64_row_c },    // ADST_DCT
+    { idct64_col_c, ihalfright64_c },    // DCT_ADST
+    { ihalfright64_c, ihalfright64_c },  // ADST_ADST
+    { ihalfright64_c, idct64_row_c },    // FLIPADST_DCT
+    { idct64_col_c, ihalfright64_c },    // DCT_FLIPADST
+    { ihalfright64_c, ihalfright64_c },  // FLIPADST_FLIPADST
+    { ihalfright64_c, ihalfright64_c },  // ADST_FLIPADST
+    { ihalfright64_c, ihalfright64_c },  // FLIPADST_ADST
+    { iidtx64_c, iidtx64_c },            // IDTX
+    { idct64_col_c, iidtx64_c },         // V_DCT
+    { iidtx64_c, idct64_row_c },         // H_DCT
+    { ihalfright64_c, iidtx64_c },       // V_ADST
+    { iidtx64_c, ihalfright64_c },       // H_ADST
+    { ihalfright64_c, iidtx64_c },       // V_FLIPADST
+    { iidtx64_c, ihalfright64_c },       // H_FLIPADST
+  };
+
+  int i, j;
+  tran_low_t tmp;
+  tran_low_t out[64][64];
+  tran_low_t *outp = &out[0][0];
+  int outstride = 64;
+
+  // inverse transform row vectors
+  for (i = 0; i < 64; ++i) {
+    IHT_64[tx_type].rows(input, out[i]);
+    for (j = 0; j < 64; ++j) out[i][j] = ROUND_POWER_OF_TWO(out[i][j], 1);
+    input += 64;
+  }
+
+  // transpose
+  for (i = 1; i < 64; i++) {
+    for (j = 0; j < i; j++) {
+      tmp = out[i][j];
+      out[i][j] = out[j][i];
+      out[j][i] = tmp;
+    }
+  }
+
+  // inverse transform column vectors
+  for (i = 0; i < 64; ++i) {
+    IHT_64[tx_type].cols(out[i], out[i]);
+  }
+
+  maybe_flip_strides(&dest, &stride, &outp, &outstride, tx_type, 64, 64);
+
+  // Sum with the destination
+  for (i = 0; i < 64; ++i) {
+    for (j = 0; j < 64; ++j) {
+      int d = i * stride + j;
+      int s = j * outstride + i;
+      dest[d] = clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 5));
+    }
+  }
+}
+#endif  // CONFIG_TX64X64
 #endif  // CONFIG_EXT_TX
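
The 64x64 functions above normalize in two stages: ROUND_POWER_OF_TWO(out[i][j], 1) drops one bit after the row pass and the final add drops five more, a combined 2^6 = 64. A tiny standalone demonstration, assuming the usual aom definition of the macro (add half the step, then shift):

#include <stdio.h>

#define ROUND_POWER_OF_TWO_SKETCH(value, n) \
  (((value) + (1 << ((n)-1))) >> (n))

int main(void) {
  const int after_rows = ROUND_POWER_OF_TWO_SKETCH(4096, 1);  // 2048
  const int residual = ROUND_POWER_OF_TWO_SKETCH(after_rows, 5);
  printf("%d\n", residual);  // 64, i.e. 4096 scaled by 1/64 overall
  return 0;
}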
 
 // idct
@@ -905,7 +1088,6 @@
   }
 }
 
-#if CONFIG_EXT_TX
 void av1_inv_txfm_add_4x8(const tran_low_t *input, uint8_t *dest, int stride,
                           int eob, TX_TYPE tx_type) {
   (void)eob;
@@ -941,7 +1123,6 @@
   (void)eob;
   av1_iht32x16_512_add(input, dest, stride, tx_type);
 }
-#endif  // CONFIG_EXT_TX
 
 void av1_inv_txfm_add_8x8(const tran_low_t *input, uint8_t *dest, int stride,
                           int eob, TX_TYPE tx_type) {
@@ -984,17 +1165,12 @@
     case FLIPADST_FLIPADST:
     case ADST_FLIPADST:
     case FLIPADST_ADST:
-      av1_iht16x16_256_add(input, dest, stride, tx_type);
-      break;
     case V_DCT:
     case H_DCT:
     case V_ADST:
     case H_ADST:
     case V_FLIPADST:
-    case H_FLIPADST:
-      // Use C version since DST only exists in C code
-      av1_iht16x16_256_add_c(input, dest, stride, tx_type);
-      break;
+    case H_FLIPADST: av1_iht16x16_256_add(input, dest, stride, tx_type); break;
     case IDTX: inv_idtx_add_c(input, dest, stride, 16, tx_type); break;
 #endif  // CONFIG_EXT_TX
     default: assert(0); break;
@@ -1635,6 +1811,71 @@
     }
   }
 }
+
+#if CONFIG_TX64X64
+void av1_highbd_iht64x64_4096_add_c(const tran_low_t *input, uint8_t *dest8,
+                                    int stride, int tx_type, int bd) {
+  static const highbd_transform_2d HIGH_IHT_64[] = {
+    { highbd_idct64_col_c, highbd_idct64_row_c },      // DCT_DCT
+    { highbd_ihalfright64_c, highbd_idct64_row_c },    // ADST_DCT
+    { highbd_idct64_col_c, highbd_ihalfright64_c },    // DCT_ADST
+    { highbd_ihalfright64_c, highbd_ihalfright64_c },  // ADST_ADST
+    { highbd_ihalfright64_c, highbd_idct64_row_c },    // FLIPADST_DCT
+    { highbd_idct64_col_c, highbd_ihalfright64_c },    // DCT_FLIPADST
+    { highbd_ihalfright64_c, highbd_ihalfright64_c },  // FLIPADST_FLIPADST
+    { highbd_ihalfright64_c, highbd_ihalfright64_c },  // ADST_FLIPADST
+    { highbd_ihalfright64_c, highbd_ihalfright64_c },  // FLIPADST_ADST
+    { highbd_iidtx64_c, highbd_iidtx64_c },            // IDTX
+    { highbd_idct64_col_c, highbd_iidtx64_c },         // V_DCT
+    { highbd_iidtx64_c, highbd_idct64_row_c },         // H_DCT
+    { highbd_ihalfright64_c, highbd_iidtx64_c },       // V_ADST
+    { highbd_iidtx64_c, highbd_ihalfright64_c },       // H_ADST
+    { highbd_ihalfright64_c, highbd_iidtx64_c },       // V_FLIPADST
+    { highbd_iidtx64_c, highbd_ihalfright64_c },       // H_FLIPADST
+  };
+
+  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+
+  int i, j;
+  tran_low_t tmp;
+  tran_low_t out[64][64];
+  tran_low_t *outp = &out[0][0];
+  int outstride = 64;
+
+  // inverse transform row vectors
+  for (i = 0; i < 64; ++i) {
+    HIGH_IHT_64[tx_type].rows(input, out[i], bd);
+    for (j = 0; j < 64; ++j) out[i][j] = ROUND_POWER_OF_TWO(out[i][j], 1);
+    input += 64;
+  }
+
+  // transpose
+  for (i = 1; i < 64; i++) {
+    for (j = 0; j < i; j++) {
+      tmp = out[i][j];
+      out[i][j] = out[j][i];
+      out[j][i] = tmp;
+    }
+  }
+
+  // inverse transform column vectors
+  for (i = 0; i < 64; ++i) {
+    HIGH_IHT_64[tx_type].cols(out[i], out[i], bd);
+  }
+
+  maybe_flip_strides16(&dest, &stride, &outp, &outstride, tx_type, 64, 64);
+
+  // Sum with the destination
+  for (i = 0; i < 64; ++i) {
+    for (j = 0; j < 64; ++j) {
+      int d = i * stride + j;
+      int s = j * outstride + i;
+      dest[d] =
+          highbd_clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 5), bd);
+    }
+  }
+}
+#endif  // CONFIG_TX64X64
 #endif  // CONFIG_EXT_TX
 
 // idct
@@ -1909,7 +2150,6 @@
       av1_inv_txfm_add_16x16(input, dest, stride, eob, tx_type);
       break;
     case TX_8X8: av1_inv_txfm_add_8x8(input, dest, stride, eob, tx_type); break;
-#if CONFIG_EXT_TX
     case TX_4X8: av1_inv_txfm_add_4x8(input, dest, stride, eob, tx_type); break;
     case TX_8X4: av1_inv_txfm_add_8x4(input, dest, stride, eob, tx_type); break;
     case TX_8X16:
@@ -1924,7 +2164,6 @@
     case TX_32X16:
       av1_inv_txfm_add_32x16(input, dest, stride, eob, tx_type);
       break;
-#endif  // CONFIG_EXT_TX
     case TX_4X4:
       // this is like av1_short_idct4x4 but has a special case around eob<=1
       // which is significant (not just an optimization) for the lossless
diff --git a/av1/common/idct.h b/av1/common/idct.h
index 1acc825..db9a6e2 100644
--- a/av1/common/idct.h
+++ b/av1/common/idct.h
@@ -67,12 +67,10 @@
 
 void av1_inv_txfm_add_4x4(const tran_low_t *input, uint8_t *dest, int stride,
                           int eob, TX_TYPE tx_type, int lossless);
-#if CONFIG_EXT_TX
 void av1_inv_txfm_add_8x4(const tran_low_t *input, uint8_t *dest, int stride,
                           int eob, TX_TYPE tx_type);
 void av1_inv_txfm_add_4x8(const tran_low_t *input, uint8_t *dest, int stride,
                           int eob, TX_TYPE tx_type);
-#endif  // CONFIG_EXT_TX
 void av1_inv_txfm_add_8x8(const tran_low_t *input, uint8_t *dest, int stride,
                           int eob, TX_TYPE tx_type);
 void av1_inv_txfm_add_16x16(const tran_low_t *input, uint8_t *dest, int stride,
@@ -95,12 +93,10 @@
 void av1_highbd_inv_txfm_add_4x4(const tran_low_t *input, uint8_t *dest,
                                  int stride, int eob, int bd, TX_TYPE tx_type,
                                  int lossless);
-#if CONFIG_EXT_TX
 void av1_highbd_inv_txfm_add_4x8(const tran_low_t *input, uint8_t *dest,
                                  int stride, int eob, int bd, TX_TYPE tx_type);
 void av1_highbd_inv_txfm_add_8x4(const tran_low_t *input, uint8_t *dest,
                                  int stride, int eob, int bd, TX_TYPE tx_type);
-#endif  // CONFIG_EXT_TX
 void av1_highbd_inv_txfm_add_8x8(const tran_low_t *input, uint8_t *dest,
                                  int stride, int eob, int bd, TX_TYPE tx_type);
 void av1_highbd_inv_txfm_add_16x16(const tran_low_t *input, uint8_t *dest,
diff --git a/av1/common/loopfilter.c b/av1/common/loopfilter.c
index d0b897c..25ce24a 100644
--- a/av1/common/loopfilter.c
+++ b/av1/common/loopfilter.c
@@ -40,6 +40,9 @@
 //
 // A loopfilter should be applied to every other 8x8 horizontally.
 static const uint64_t left_64x64_txform_mask[TX_SIZES] = {
+#if CONFIG_CB4X4
+  0xffffffffffffffffULL,  // TX_2X2
+#endif
   0xffffffffffffffffULL,  // TX_4X4
   0xffffffffffffffffULL,  // TX_8x8
   0x5555555555555555ULL,  // TX_16x16
@@ -64,6 +67,9 @@
 //
 // A loopfilter should be applied to every other 8x8 vertically.
 static const uint64_t above_64x64_txform_mask[TX_SIZES] = {
+#if CONFIG_CB4X4
+  0xffffffffffffffffULL,  // TX_2X2
+#endif
   0xffffffffffffffffULL,  // TX_4X4
   0xffffffffffffffffULL,  // TX_8x8
   0x00ff00ff00ff00ffULL,  // TX_16x16
@@ -142,6 +148,9 @@
 
 // 16 bit masks for uv transform sizes.
 static const uint16_t left_64x64_txform_mask_uv[TX_SIZES] = {
+#if CONFIG_CB4X4
+  0xffff,  // TX_2X2
+#endif
   0xffff,  // TX_4X4
   0xffff,  // TX_8x8
   0x5555,  // TX_16x16
@@ -149,6 +158,9 @@
 };
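
Each 16-bit uv mask appears to carry one bit per 8x8 chroma filter position, a 4x4 grid over the subsampled 64x64 superblock: 0x5555 for TX_16X16 keeps every other position, matching the "every other 8x8" rule in the comments above, while 0xffff (TX_4X4, TX_8X8, and the new TX_2X2 row) filters everywhere. A quick standalone way to see the pattern:

#include <stdio.h>

int main(void) {
  const unsigned mask = 0x5555;  // left_64x64_txform_mask_uv[TX_16X16]
  int bit;
  for (bit = 0; bit < 16; ++bit)  // LSB first: 1 0 1 0 ...
    printf("%u%c", (mask >> bit) & 1u, bit == 15 ? '\n' : ' ');
  return 0;
}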
 
 static const uint16_t above_64x64_txform_mask_uv[TX_SIZES] = {
+#if CONFIG_CB4X4
+  0xffff,  // TX_2X2
+#endif
   0xffff,  // TX_4X4
   0xffff,  // TX_8x8
   0x0f0f,  // TX_16x16
@@ -1054,6 +1066,8 @@
       lfm->above_uv[TX_8X8] |= lfm->above_uv[TX_16X16] & 0xff00;
       lfm->above_uv[TX_16X16] &= ~(lfm->above_uv[TX_16X16] & 0xff00);
     }
+  } else {
+    lfm->above_int_4x4_uv = lfm->left_int_4x4_uv;
   }
 
   if (mi_col + MAX_MIB_SIZE > cm->mi_cols) {
@@ -1183,21 +1197,24 @@
 }
 #endif  // CONFIG_AOM_HIGHBITDEPTH
 
-void av1_filter_block_plane_non420(AV1_COMMON *cm,
-                                   struct macroblockd_plane *plane,
-                                   MODE_INFO **mib, int mi_row, int mi_col) {
+void av1_filter_block_plane_non420_ver(AV1_COMMON *cm,
+                                       struct macroblockd_plane *plane,
+                                       MODE_INFO **mib, int mi_row,
+                                       int mi_col) {
   const int ss_x = plane->subsampling_x;
   const int ss_y = plane->subsampling_y;
   const int row_step = 1 << ss_y;
   const int col_step = 1 << ss_x;
+  const int row_step_stride = cm->mi_stride * row_step;
   struct buf_2d *const dst = &plane->dst;
   uint8_t *const dst0 = dst->buf;
   unsigned int mask_16x16[MAX_MIB_SIZE] = { 0 };
   unsigned int mask_8x8[MAX_MIB_SIZE] = { 0 };
   unsigned int mask_4x4[MAX_MIB_SIZE] = { 0 };
   unsigned int mask_4x4_int[MAX_MIB_SIZE] = { 0 };
-  uint8_t lfl[MAX_MIB_SIZE][MAX_MIB_SIZE];
+  uint8_t lfl[MAX_MIB_SIZE][MAX_MIB_SIZE] = { { 0 } };
   int r, c;
+  MODE_INFO **tmp_mi = mib;
 
   for (r = 0; r < cm->mib_size && mi_row + r < cm->mi_rows; r += row_step) {
     unsigned int mask_16x16_c = 0;
@@ -1207,7 +1224,7 @@
 
     // Determine the vertical edges that need filtering
     for (c = 0; c < cm->mib_size && mi_col + c < cm->mi_cols; c += col_step) {
-      const MODE_INFO *mi = mib[c];
+      const MODE_INFO *mi = tmp_mi[c];
       const MB_MODE_INFO *mbmi = &mi[0].mbmi;
       const BLOCK_SIZE sb_type = mbmi->sb_type;
       const int skip_this = mbmi->skip && is_inter_block(mbmi);
@@ -1224,24 +1241,39 @@
       const int skip_this_r = skip_this && !block_edge_above;
 
 #if CONFIG_VAR_TX
+#if CONFIG_EXT_TX && CONFIG_RECT_TX
+      TX_SIZE mb_tx_size = is_rect_tx(mbmi->tx_size)
+                               ? mbmi->tx_size
+                               : mbmi->inter_tx_size[blk_row][blk_col];
+#else
+      const TX_SIZE mb_tx_size = mbmi->inter_tx_size[blk_row][blk_col];
+#endif
+#endif
+
       TX_SIZE tx_size = (plane->plane_type == PLANE_TYPE_UV)
                             ? get_uv_tx_size(mbmi, plane)
                             : mbmi->tx_size;
-#else
-      const TX_SIZE tx_size = (plane->plane_type == PLANE_TYPE_UV)
-                                  ? get_uv_tx_size(mbmi, plane)
-                                  : mbmi->tx_size;
-#endif
 
       const int skip_border_4x4_c = ss_x && mi_col + c == cm->mi_cols - 1;
       const int skip_border_4x4_r = ss_y && mi_row + r == cm->mi_rows - 1;
 
-      TX_SIZE tx_size_c = num_4x4_blocks_wide_txsize_log2_lookup[tx_size];
-      TX_SIZE tx_size_r = num_4x4_blocks_high_txsize_log2_lookup[tx_size];
+      TX_SIZE tx_size_c = tx_size_wide_unit[tx_size];
+      TX_SIZE tx_size_r = tx_size_high_unit[tx_size];
 
       int tx_size_mask = 0;
+      const int c_step = (c >> ss_x);
+      const int r_step = (r >> ss_y);
+      const int col_mask = 1 << c_step;
+
+#if CONFIG_VAR_TX
+      if (is_inter_block(mbmi) && !mbmi->skip)
+        tx_size = (plane->plane_type == PLANE_TYPE_UV)
+                      ? uv_txsize_lookup[sb_type][mb_tx_size][ss_x][ss_y]
+                      : mb_tx_size;
+#endif
+
       // Filter level can vary per MI
-      if (!(lfl[r][c >> ss_x] = get_filter_level(&cm->lf_info, mbmi))) continue;
+      if (!(lfl[r][c_step] = get_filter_level(&cm->lf_info, mbmi))) continue;
 
       if (txsize_sqr_up_map[tx_size] == TX_32X32)
         tx_size_mask = 3;
@@ -1251,19 +1283,6 @@
         tx_size_mask = 0;
 
 #if CONFIG_VAR_TX
-      if (is_inter_block(mbmi) && !mbmi->skip) {
-#if CONFIG_EXT_TX && CONFIG_RECT_TX
-        TX_SIZE mb_tx_size = is_rect_tx(mbmi->tx_size)
-                                 ? mbmi->tx_size
-                                 : mbmi->inter_tx_size[blk_row][blk_col];
-#else
-        TX_SIZE mb_tx_size = mbmi->inter_tx_size[blk_row][blk_col];
-#endif
-        tx_size = (plane->plane_type == PLANE_TYPE_UV)
-                      ? uv_txsize_lookup[sb_type][mb_tx_size][ss_x][ss_y]
-                      : mb_tx_size;
-      }
-
 #if CONFIG_EXT_TX && CONFIG_RECT_TX
       tx_size_r =
           AOMMIN(txsize_horz_map[tx_size], cm->above_txfm_context[mi_col + c]);
@@ -1286,60 +1305,60 @@
       // Build masks based on the transform size of each block
       // handle vertical mask
       if (tx_size_c == TX_32X32) {
-        if (!skip_this_c && ((c >> ss_x) & tx_size_mask) == 0) {
+        if (!skip_this_c && (c_step & tx_size_mask) == 0) {
           if (!skip_border_4x4_c)
-            mask_16x16_c |= 1 << (c >> ss_x);
+            mask_16x16_c |= col_mask;
           else
-            mask_8x8_c |= 1 << (c >> ss_x);
+            mask_8x8_c |= col_mask;
         }
       } else if (tx_size_c == TX_16X16) {
-        if (!skip_this_c && ((c >> ss_x) & tx_size_mask) == 0) {
+        if (!skip_this_c && (c_step & tx_size_mask) == 0) {
           if (!skip_border_4x4_c)
-            mask_16x16_c |= 1 << (c >> ss_x);
+            mask_16x16_c |= col_mask;
           else
-            mask_8x8_c |= 1 << (c >> ss_x);
+            mask_8x8_c |= col_mask;
         }
       } else {
         // force 8x8 filtering on 32x32 boundaries
-        if (!skip_this_c && ((c >> ss_x) & tx_size_mask) == 0) {
+        if (!skip_this_c && (c_step & tx_size_mask) == 0) {
           if (tx_size_c == TX_8X8 || ((c >> ss_x) & 3) == 0)
-            mask_8x8_c |= 1 << (c >> ss_x);
+            mask_8x8_c |= col_mask;
           else
-            mask_4x4_c |= 1 << (c >> ss_x);
+            mask_4x4_c |= col_mask;
         }
 
         if (!skip_this && tx_size_c < TX_8X8 && !skip_border_4x4_c &&
-            ((c >> ss_x) & tx_size_mask) == 0)
-          mask_4x4_int[r] |= 1 << (c >> ss_x);
+            (c_step & tx_size_mask) == 0)
+          mask_4x4_int[r] |= col_mask;
       }
 
       // set horizontal mask
       if (tx_size_r == TX_32X32) {
-        if (!skip_this_r && ((r >> ss_y) & tx_size_mask) == 0) {
+        if (!skip_this_r && (r_step & tx_size_mask) == 0) {
           if (!skip_border_4x4_r)
-            mask_16x16[r] |= 1 << (c >> ss_x);
+            mask_16x16[r] |= col_mask;
           else
-            mask_8x8[r] |= 1 << (c >> ss_x);
+            mask_8x8[r] |= col_mask;
         }
       } else if (tx_size_r == TX_16X16) {
-        if (!skip_this_r && ((r >> ss_y) & tx_size_mask) == 0) {
+        if (!skip_this_r && (r_step & tx_size_mask) == 0) {
           if (!skip_border_4x4_r)
-            mask_16x16[r] |= 1 << (c >> ss_x);
+            mask_16x16[r] |= col_mask;
           else
-            mask_8x8[r] |= 1 << (c >> ss_x);
+            mask_8x8[r] |= col_mask;
         }
       } else {
         // force 8x8 filtering on 32x32 boundaries
-        if (!skip_this_r && ((r >> ss_y) & tx_size_mask) == 0) {
-          if (tx_size_r == TX_8X8 || ((r >> ss_y) & 3) == 0)
-            mask_8x8[r] |= 1 << (c >> ss_x);
+        if (!skip_this_r && (r_step & tx_size_mask) == 0) {
+          if (tx_size_r == TX_8X8 || (r_step & 3) == 0)
+            mask_8x8[r] |= col_mask;
           else
-            mask_4x4[r] |= 1 << (c >> ss_x);
+            mask_4x4[r] |= col_mask;
         }
 
         if (!skip_this && tx_size_r < TX_8X8 && !skip_border_4x4_c &&
             ((r >> ss_y) & tx_size_mask) == 0)
-          mask_4x4_int[r] |= 1 << (c >> ss_x);
+          mask_4x4_int[r] |= col_mask;
       }
     }
 
@@ -1364,11 +1383,176 @@
                             mask_4x4_int[r], &cm->lf_info, &lfl[r][0]);
 #endif  // CONFIG_AOM_HIGHBITDEPTH
     dst->buf += MI_SIZE * dst->stride;
-    mib += row_step * cm->mi_stride;
+    tmp_mi += row_step_stride;
   }
 
   // Now do horizontal pass
   dst->buf = dst0;
+}
+
+void av1_filter_block_plane_non420_hor(AV1_COMMON *cm,
+                                       struct macroblockd_plane *plane,
+                                       MODE_INFO **mib, int mi_row,
+                                       int mi_col) {
+  const int ss_x = plane->subsampling_x;
+  const int ss_y = plane->subsampling_y;
+  const int row_step = 1 << ss_y;
+  const int col_step = 1 << ss_x;
+  const int row_step_stride = cm->mi_stride * row_step;
+  struct buf_2d *const dst = &plane->dst;
+  uint8_t *const dst0 = dst->buf;
+  unsigned int mask_16x16[MAX_MIB_SIZE] = { 0 };
+  unsigned int mask_8x8[MAX_MIB_SIZE] = { 0 };
+  unsigned int mask_4x4[MAX_MIB_SIZE] = { 0 };
+  unsigned int mask_4x4_int[MAX_MIB_SIZE] = { 0 };
+  uint8_t lfl[MAX_MIB_SIZE][MAX_MIB_SIZE] = { { 0 } };
+  int r, c;
+  MODE_INFO **tmp_mi = mib;
+  for (r = 0; r < cm->mib_size && mi_row + r < cm->mi_rows; r += row_step) {
+    unsigned int mask_16x16_c = 0;
+    unsigned int mask_8x8_c = 0;
+    unsigned int mask_4x4_c = 0;
+
+    // Determine the vertical edges that need filtering
+    for (c = 0; c < cm->mib_size && mi_col + c < cm->mi_cols; c += col_step) {
+      const MODE_INFO *mi = tmp_mi[c];
+      const MB_MODE_INFO *mbmi = &mi[0].mbmi;
+      const BLOCK_SIZE sb_type = mbmi->sb_type;
+      const int skip_this = mbmi->skip && is_inter_block(mbmi);
+      const int blk_row = r & (num_8x8_blocks_high_lookup[sb_type] - 1);
+      const int blk_col = c & (num_8x8_blocks_wide_lookup[sb_type] - 1);
+
+      // left edge of current unit is block/partition edge -> no skip
+      const int block_edge_left =
+          (num_4x4_blocks_wide_lookup[sb_type] > 1) ? !blk_col : 1;
+      const int skip_this_c = skip_this && !block_edge_left;
+      // top edge of current unit is block/partition edge -> no skip
+      const int block_edge_above =
+          (num_4x4_blocks_high_lookup[sb_type] > 1) ? !blk_row : 1;
+      const int skip_this_r = skip_this && !block_edge_above;
+
+      TX_SIZE tx_size = (plane->plane_type == PLANE_TYPE_UV)
+                            ? get_uv_tx_size(mbmi, plane)
+                            : mbmi->tx_size;
+#if CONFIG_VAR_TX
+#if CONFIG_EXT_TX && CONFIG_RECT_TX
+      TX_SIZE mb_tx_size = is_rect_tx(mbmi->tx_size)
+                               ? mbmi->tx_size
+                               : mbmi->inter_tx_size[blk_row][blk_col];
+#else
+      TX_SIZE mb_tx_size = mbmi->inter_tx_size[blk_row][blk_col];
+#endif
+#endif
+      const int skip_border_4x4_c = ss_x && mi_col + c == cm->mi_cols - 1;
+      const int skip_border_4x4_r = ss_y && mi_row + r == cm->mi_rows - 1;
+
+      TX_SIZE tx_size_c = tx_size_wide_unit[tx_size];
+      TX_SIZE tx_size_r = tx_size_high_unit[tx_size];
+
+      int tx_size_mask = 0;
+      const int c_step = (c >> ss_x);
+      const int r_step = (r >> ss_y);
+      const int col_mask = 1 << c_step;
+
+#if CONFIG_VAR_TX
+      if (is_inter_block(mbmi) && !mbmi->skip) {
+        tx_size = (plane->plane_type == PLANE_TYPE_UV)
+                      ? uv_txsize_lookup[sb_type][mb_tx_size][ss_x][ss_y]
+                      : mb_tx_size;
+      }
+#endif
+
+      // Filter level can vary per MI
+      if (!(lfl[r][c_step] = get_filter_level(&cm->lf_info, mbmi))) continue;
+
+      if (txsize_sqr_up_map[tx_size] == TX_32X32)
+        tx_size_mask = 3;
+      else if (txsize_sqr_up_map[tx_size] == TX_16X16)
+        tx_size_mask = 1;
+      else
+        tx_size_mask = 0;
+
+#if CONFIG_VAR_TX
+#if CONFIG_EXT_TX && CONFIG_RECT_TX
+      tx_size_r =
+          AOMMIN(txsize_horz_map[tx_size], cm->above_txfm_context[mi_col + c]);
+      tx_size_c = AOMMIN(txsize_vert_map[tx_size],
+                         cm->left_txfm_context[(mi_row + r) & MAX_MIB_MASK]);
+
+      cm->above_txfm_context[mi_col + c] = txsize_horz_map[tx_size];
+      cm->left_txfm_context[(mi_row + r) & MAX_MIB_MASK] =
+          txsize_vert_map[tx_size];
+#else
+      tx_size_r = AOMMIN(tx_size, cm->above_txfm_context[mi_col + c]);
+      tx_size_c =
+          AOMMIN(tx_size, cm->left_txfm_context[(mi_row + r) & MAX_MIB_MASK]);
+
+      cm->above_txfm_context[mi_col + c] = tx_size;
+      cm->left_txfm_context[(mi_row + r) & MAX_MIB_MASK] = tx_size;
+#endif
+#endif
+
+      // Build masks based on the transform size of each block
+      // handle vertical mask
+      if (tx_size_c == TX_32X32) {
+        if (!skip_this_c && (c_step & tx_size_mask) == 0) {
+          if (!skip_border_4x4_c)
+            mask_16x16_c |= col_mask;
+          else
+            mask_8x8_c |= col_mask;
+        }
+      } else if (tx_size_c == TX_16X16) {
+        if (!skip_this_c && (c_step & tx_size_mask) == 0) {
+          if (!skip_border_4x4_c)
+            mask_16x16_c |= col_mask;
+          else
+            mask_8x8_c |= col_mask;
+        }
+      } else {
+        // force 8x8 filtering on 32x32 boundaries
+        if (!skip_this_c && (c_step & tx_size_mask) == 0) {
+          if (tx_size_c == TX_8X8 || (c_step & 3) == 0)
+            mask_8x8_c |= col_mask;
+          else
+            mask_4x4_c |= col_mask;
+        }
+
+        if (!skip_this && tx_size_c < TX_8X8 && !skip_border_4x4_c &&
+            (c_step & tx_size_mask) == 0)
+          mask_4x4_int[r] |= col_mask;
+      }
+
+      // set horizontal mask
+      if (tx_size_r == TX_32X32) {
+        if (!skip_this_r && (r_step & tx_size_mask) == 0) {
+          if (!skip_border_4x4_r)
+            mask_16x16[r] |= col_mask;
+          else
+            mask_8x8[r] |= col_mask;
+        }
+      } else if (tx_size_r == TX_16X16) {
+        if (!skip_this_r && (r_step & tx_size_mask) == 0) {
+          if (!skip_border_4x4_r)
+            mask_16x16[r] |= col_mask;
+          else
+            mask_8x8[r] |= col_mask;
+        }
+      } else {
+        // force 8x8 filtering on 32x32 boundaries
+        if (!skip_this_r && (r_step & tx_size_mask) == 0) {
+          if (tx_size_r == TX_8X8 || (r_step & 3) == 0)
+            mask_8x8[r] |= col_mask;
+          else
+            mask_4x4[r] |= col_mask;
+        }
+
+        if (!skip_this && tx_size_r < TX_8X8 && !skip_border_4x4_c &&
+            (r_step & tx_size_mask) == 0)
+          mask_4x4_int[r] |= col_mask;
+      }
+    }
+    tmp_mi += row_step_stride;
+  }
   for (r = 0; r < cm->mib_size && mi_row + r < cm->mi_rows; r += row_step) {
     const int skip_border_4x4_r = ss_y && mi_row + r == cm->mi_rows - 1;
     const unsigned int mask_4x4_int_r = skip_border_4x4_r ? 0 : mask_4x4_int[r];
@@ -1404,11 +1588,12 @@
 #endif  // CONFIG_AOM_HIGHBITDEPTH
     dst->buf += MI_SIZE * dst->stride;
   }
+  dst->buf = dst0;
 }
 
-void av1_filter_block_plane_ss00(AV1_COMMON *const cm,
-                                 struct macroblockd_plane *const plane,
-                                 int mi_row, LOOP_FILTER_MASK *lfm) {
+void av1_filter_block_plane_ss00_ver(AV1_COMMON *const cm,
+                                     struct macroblockd_plane *const plane,
+                                     int mi_row, LOOP_FILTER_MASK *lfm) {
   struct buf_2d *const dst = &plane->dst;
   uint8_t *const dst0 = dst->buf;
   int r;
@@ -1452,10 +1637,20 @@
 
   // Horizontal pass
   dst->buf = dst0;
-  mask_16x16 = lfm->above_y[TX_16X16];
-  mask_8x8 = lfm->above_y[TX_8X8];
-  mask_4x4 = lfm->above_y[TX_4X4];
-  mask_4x4_int = lfm->int_4x4_y;
+}
+
+void av1_filter_block_plane_ss00_hor(AV1_COMMON *const cm,
+                                     struct macroblockd_plane *const plane,
+                                     int mi_row, LOOP_FILTER_MASK *lfm) {
+  struct buf_2d *const dst = &plane->dst;
+  uint8_t *const dst0 = dst->buf;
+  int r;
+  uint64_t mask_16x16 = lfm->above_y[TX_16X16];
+  uint64_t mask_8x8 = lfm->above_y[TX_8X8];
+  uint64_t mask_4x4 = lfm->above_y[TX_4X4];
+  uint64_t mask_4x4_int = lfm->int_4x4_y;
+
+  assert(plane->subsampling_x == 0 && plane->subsampling_y == 0);
 
   for (r = 0; r < cm->mib_size && mi_row + r < cm->mi_rows; r++) {
     unsigned int mask_16x16_r;
@@ -1495,11 +1690,13 @@
     mask_4x4 >>= MI_SIZE;
     mask_4x4_int >>= MI_SIZE;
   }
+  // Restore the buf pointer in case there is an additional filter pass.
+  dst->buf = dst0;
 }
 
-void av1_filter_block_plane_ss11(AV1_COMMON *const cm,
-                                 struct macroblockd_plane *const plane,
-                                 int mi_row, LOOP_FILTER_MASK *lfm) {
+void av1_filter_block_plane_ss11_ver(AV1_COMMON *const cm,
+                                     struct macroblockd_plane *const plane,
+                                     int mi_row, LOOP_FILTER_MASK *lfm) {
   struct buf_2d *const dst = &plane->dst;
   uint8_t *const dst0 = dst->buf;
   int r, c;
@@ -1511,6 +1708,7 @@
 
   assert(plane->subsampling_x == 1 && plane->subsampling_y == 1);
   assert(plane->plane_type == PLANE_TYPE_UV);
+  memset(lfm->lfl_uv, 0, sizeof(lfm->lfl_uv));
 
   // Vertical pass: do 2 rows at one time
   for (r = 0; r < cm->mib_size && mi_row + r < cm->mi_rows; r += 4) {
@@ -1554,10 +1752,30 @@
 
   // Horizontal pass
   dst->buf = dst0;
-  mask_16x16 = lfm->above_uv[TX_16X16];
-  mask_8x8 = lfm->above_uv[TX_8X8];
-  mask_4x4 = lfm->above_uv[TX_4X4];
-  mask_4x4_int = lfm->above_int_4x4_uv;
+}
+
+void av1_filter_block_plane_ss11_hor(AV1_COMMON *const cm,
+                                     struct macroblockd_plane *const plane,
+                                     int mi_row, LOOP_FILTER_MASK *lfm) {
+  struct buf_2d *const dst = &plane->dst;
+  uint8_t *const dst0 = dst->buf;
+  int r, c;
+  uint64_t mask_16x16 = lfm->above_uv[TX_16X16];
+  uint64_t mask_8x8 = lfm->above_uv[TX_8X8];
+  uint64_t mask_4x4 = lfm->above_uv[TX_4X4];
+  uint64_t mask_4x4_int = lfm->above_int_4x4_uv;
+
+  assert(plane->subsampling_x == 1 && plane->subsampling_y == 1);
+  memset(lfm->lfl_uv, 0, sizeof(lfm->lfl_uv));
+
+  // Re-populate the filter levels for uv, mirroring the vertical filter
+  // pass in av1_filter_block_plane_ss11_ver.
+  for (r = 0; r < cm->mib_size && mi_row + r < cm->mi_rows; r += 4) {
+    for (c = 0; c < (cm->mib_size >> 1); c++) {
+      lfm->lfl_uv[r >> 1][c] = lfm->lfl_y[r][c << 1];
+      lfm->lfl_uv[(r + 2) >> 1][c] = lfm->lfl_y[r + 2][c << 1];
+    }
+  }
 
   for (r = 0; r < cm->mib_size && mi_row + r < cm->mi_rows; r += 2) {
     const int skip_border_4x4_r = mi_row + r == cm->mi_rows - 1;
@@ -1600,6 +1818,8 @@
     mask_4x4 >>= MI_SIZE / 2;
     mask_4x4_int >>= MI_SIZE / 2;
   }
+  // Restore the buf pointer in case there is an additional filter pass.
+  dst->buf = dst0;
 }
 
 void av1_loop_filter_rows(YV12_BUFFER_CONFIG *frame_buffer, AV1_COMMON *cm,
@@ -1622,12 +1842,15 @@
 
       av1_setup_dst_planes(planes, frame_buffer, mi_row, mi_col);
 
-      for (plane = 0; plane < num_planes; ++plane)
-        av1_filter_block_plane_non420(cm, &planes[plane], mi + mi_col, mi_row,
-                                      mi_col);
+      for (plane = 0; plane < num_planes; ++plane) {
+        av1_filter_block_plane_non420_ver(cm, &planes[plane], mi + mi_col,
+                                          mi_row, mi_col);
+        av1_filter_block_plane_non420_hor(cm, &planes[plane], mi + mi_col,
+                                          mi_row, mi_col);
+      }
     }
   }
-#else
+#else  // CONFIG_VAR_TX || CONFIG_EXT_PARTITION || CONFIG_EXT_PARTITION_TYPES
   const int num_planes = y_only ? 1 : MAX_MB_PLANE;
   int mi_row, mi_col;
   enum lf_path path;
@@ -1641,7 +1864,7 @@
     path = LF_PATH_444;
   else
     path = LF_PATH_SLOW;
-
+#if CONFIG_PARALLEL_DEBLOCKING
   for (mi_row = start; mi_row < stop; mi_row += MAX_MIB_SIZE) {
     MODE_INFO **mi = cm->mi_grid_visible + mi_row * cm->mi_stride;
     for (mi_col = 0; mi_col < cm->mi_cols; mi_col += MAX_MIB_SIZE) {
@@ -1652,23 +1875,85 @@
       // TODO(JBB): Make setup_mask work for non 420.
       av1_setup_mask(cm, mi_row, mi_col, mi + mi_col, cm->mi_stride, &lfm);
 
-      av1_filter_block_plane_ss00(cm, &planes[0], mi_row, &lfm);
+      av1_filter_block_plane_ss00_ver(cm, &planes[0], mi_row, &lfm);
       for (plane = 1; plane < num_planes; ++plane) {
         switch (path) {
           case LF_PATH_420:
-            av1_filter_block_plane_ss11(cm, &planes[plane], mi_row, &lfm);
+            av1_filter_block_plane_ss11_ver(cm, &planes[plane], mi_row, &lfm);
             break;
           case LF_PATH_444:
-            av1_filter_block_plane_ss00(cm, &planes[plane], mi_row, &lfm);
+            av1_filter_block_plane_ss00_ver(cm, &planes[plane], mi_row, &lfm);
             break;
           case LF_PATH_SLOW:
-            av1_filter_block_plane_non420(cm, &planes[plane], mi + mi_col,
-                                          mi_row, mi_col);
+            av1_filter_block_plane_non420_ver(cm, &planes[plane], mi + mi_col,
+                                              mi_row, mi_col);
             break;
         }
       }
     }
   }
+  for (mi_row = start; mi_row < stop; mi_row += MAX_MIB_SIZE) {
+    MODE_INFO **mi = cm->mi_grid_visible + mi_row * cm->mi_stride;
+    for (mi_col = 0; mi_col < cm->mi_cols; mi_col += MAX_MIB_SIZE) {
+      int plane;
+
+      av1_setup_dst_planes(planes, frame_buffer, mi_row, mi_col);
+
+      // TODO(JBB): Make setup_mask work for non 420.
+      av1_setup_mask(cm, mi_row, mi_col, mi + mi_col, cm->mi_stride, &lfm);
+
+      av1_filter_block_plane_ss00_hor(cm, &planes[0], mi_row, &lfm);
+      for (plane = 1; plane < num_planes; ++plane) {
+        switch (path) {
+          case LF_PATH_420:
+            av1_filter_block_plane_ss11_hor(cm, &planes[plane], mi_row, &lfm);
+            break;
+          case LF_PATH_444:
+            av1_filter_block_plane_ss00_hor(cm, &planes[plane], mi_row, &lfm);
+            break;
+          case LF_PATH_SLOW:
+            av1_filter_block_plane_non420_hor(cm, &planes[plane], mi + mi_col,
+                                              mi_row, mi_col);
+            break;
+        }
+      }
+    }
+  }
+#else   // CONFIG_PARALLEL_DEBLOCKING
+  for (mi_row = start; mi_row < stop; mi_row += MAX_MIB_SIZE) {
+    MODE_INFO **mi = cm->mi_grid_visible + mi_row * cm->mi_stride;
+    for (mi_col = 0; mi_col < cm->mi_cols; mi_col += MAX_MIB_SIZE) {
+      int plane;
+
+      av1_setup_dst_planes(planes, frame_buffer, mi_row, mi_col);
+
+      // TODO(JBB): Make setup_mask work for non 420.
+      av1_setup_mask(cm, mi_row, mi_col, mi + mi_col, cm->mi_stride, &lfm);
+
+      av1_filter_block_plane_ss00_ver(cm, &planes[0], mi_row, &lfm);
+      av1_filter_block_plane_ss00_hor(cm, &planes[0], mi_row, &lfm);
+      for (plane = 1; plane < num_planes; ++plane) {
+        switch (path) {
+          case LF_PATH_420:
+            av1_filter_block_plane_ss11_ver(cm, &planes[plane], mi_row, &lfm);
+            av1_filter_block_plane_ss11_hor(cm, &planes[plane], mi_row, &lfm);
+            break;
+          case LF_PATH_444:
+            av1_filter_block_plane_ss00_ver(cm, &planes[plane], mi_row, &lfm);
+            av1_filter_block_plane_ss00_hor(cm, &planes[plane], mi_row, &lfm);
+            break;
+          case LF_PATH_SLOW:
+            av1_filter_block_plane_non420_ver(cm, &planes[plane], mi + mi_col,
+                                              mi_row, mi_col);
+            av1_filter_block_plane_non420_hor(cm, &planes[plane], mi + mi_col,
+                                              mi_row, mi_col);
+
+            break;
+        }
+      }
+    }
+  }
+#endif  // CONFIG_PARALLEL_DEBLOCKING
 #endif  // CONFIG_VAR_TX || CONFIG_EXT_PARTITION || CONFIG_EXT_PARTITION_TYPES
 }
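Splitting each block-plane filter into a _ver and a _hor routine means every vertical edge in a superblock row is filtered before any horizontal edge is touched, which is the ordering CONFIG_PARALLEL_DEBLOCKING relies on to hand rows to separate workers. A minimal, self-contained sketch of that two-pass ordering (toy 3-tap smoother on a hypothetical 16x16 buffer, not the codec's actual deblocking filters):

#include <stdio.h>

#define W 16
#define H 16
#define BLK 8

/* Pass 1 analogue of *_ver: smooth across every vertical block edge. */
static void filter_ver_edges(unsigned char img[H][W]) {
  int x, y;
  for (y = 0; y < H; y++)
    for (x = BLK; x < W; x += BLK)
      img[y][x] = (unsigned char)(
          (img[y][x - 1] + 2 * img[y][x] + img[y][x + 1] + 2) >> 2);
}

/* Pass 2 analogue of *_hor: runs only after all vertical edges are done,
 * so no horizontal filter reads pixels a vertical filter may still write. */
static void filter_hor_edges(unsigned char img[H][W]) {
  int x, y;
  for (x = 0; x < W; x++)
    for (y = BLK; y < H; y += BLK)
      img[y][x] = (unsigned char)(
          (img[y - 1][x] + 2 * img[y][x] + img[y + 1][x] + 2) >> 2);
}

int main(void) {
  unsigned char img[H][W];
  int x, y;
  for (y = 0; y < H; y++)
    for (x = 0; x < W; x++) img[y][x] = (unsigned char)((x ^ y) << 3);
  filter_ver_edges(img); /* whole-frame vertical pass first */
  filter_hor_edges(img); /* then the horizontal pass */
  printf("%d\n", img[BLK][BLK]);
  return 0;
}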
 
diff --git a/av1/common/loopfilter.h b/av1/common/loopfilter.h
index 975cbdf..0f70672 100644
--- a/av1/common/loopfilter.h
+++ b/av1/common/loopfilter.h
@@ -99,17 +99,27 @@
                     const int mi_col, MODE_INFO **mi_8x8,
                     const int mode_info_stride, LOOP_FILTER_MASK *lfm);
 
-void av1_filter_block_plane_ss00(struct AV1Common *const cm,
-                                 struct macroblockd_plane *const plane,
-                                 int mi_row, LOOP_FILTER_MASK *lfm);
+void av1_filter_block_plane_ss00_ver(struct AV1Common *const cm,
+                                     struct macroblockd_plane *const plane,
+                                     int mi_row, LOOP_FILTER_MASK *lfm);
+void av1_filter_block_plane_ss00_hor(struct AV1Common *const cm,
+                                     struct macroblockd_plane *const plane,
+                                     int mi_row, LOOP_FILTER_MASK *lfm);
+void av1_filter_block_plane_ss11_ver(struct AV1Common *const cm,
+                                     struct macroblockd_plane *const plane,
+                                     int mi_row, LOOP_FILTER_MASK *lfm);
+void av1_filter_block_plane_ss11_hor(struct AV1Common *const cm,
+                                     struct macroblockd_plane *const plane,
+                                     int mi_row, LOOP_FILTER_MASK *lfm);
 
-void av1_filter_block_plane_ss11(struct AV1Common *const cm,
-                                 struct macroblockd_plane *const plane,
-                                 int mi_row, LOOP_FILTER_MASK *lfm);
-
-void av1_filter_block_plane_non420(struct AV1Common *cm,
-                                   struct macroblockd_plane *plane,
-                                   MODE_INFO **mi_8x8, int mi_row, int mi_col);
+void av1_filter_block_plane_non420_ver(struct AV1Common *cm,
+                                       struct macroblockd_plane *plane,
+                                       MODE_INFO **mi_8x8, int mi_row,
+                                       int mi_col);
+void av1_filter_block_plane_non420_hor(struct AV1Common *cm,
+                                       struct macroblockd_plane *plane,
+                                       MODE_INFO **mi_8x8, int mi_row,
+                                       int mi_col);
 
 void av1_loop_filter_init(struct AV1Common *cm);
 
diff --git a/av1/common/mvref_common.c b/av1/common/mvref_common.c
index 2344bc1..f9402e9 100644
--- a/av1/common/mvref_common.c
+++ b/av1/common/mvref_common.c
@@ -350,7 +350,11 @@
 
       if (idx == *refmv_count && *refmv_count < MAX_REF_MV_STACK_SIZE) {
         ref_mv_stack[idx].this_mv.as_int = this_refmv.as_int;
+#if CONFIG_SIMP_MV_PRED
+        ref_mv_stack[idx].pred_mv[0] = prev_frame_mvs->mv[ref];
+#else
         ref_mv_stack[idx].pred_mv[0] = prev_frame_mvs->pred_mv[ref];
+#endif
         ref_mv_stack[idx].weight = 2;
         ++(*refmv_count);
       }
@@ -394,8 +398,8 @@
 
   // Check top-right boundary
   if (has_tr)
-    newmv_count += scan_blk_mbmi(cm, xd, mi_row, mi_col, block, rf, -1, 1,
-                                 ref_mv_stack, refmv_count);
+    newmv_count += scan_blk_mbmi(cm, xd, mi_row, mi_col, block, rf, -1,
+                                 xd->n8_w, ref_mv_stack, refmv_count);
 
   nearest_refmv_count = *refmv_count;
 
@@ -419,13 +423,26 @@
     mode_context[ref_frame] |= (1 << ZEROMV_OFFSET);
   }
 
-  // Scan the second outer area.
+// Scan the second outer area.
+#if CONFIG_SIMP_MV_PRED
+  scan_blk_mbmi(cm, xd, mi_row, mi_col, block, rf, -1, -1, ref_mv_stack,
+                refmv_count);
+  for (idx = 2; idx <= 3; ++idx) {
+    scan_row_mbmi(cm, xd, mi_row, mi_col, block, rf, -idx, ref_mv_stack,
+                  refmv_count);
+    scan_col_mbmi(cm, xd, mi_row, mi_col, block, rf, -idx, ref_mv_stack,
+                  refmv_count);
+  }
+  scan_col_mbmi(cm, xd, mi_row, mi_col, block, rf, -4, ref_mv_stack,
+                refmv_count);
+#else
   for (idx = 2; idx <= 4; ++idx) {
     scan_row_mbmi(cm, xd, mi_row, mi_col, block, rf, -idx, ref_mv_stack,
                   refmv_count);
     scan_col_mbmi(cm, xd, mi_row, mi_col, block, rf, -idx, ref_mv_stack,
                   refmv_count);
   }
+#endif
 
   switch (nearest_refmv_count) {
     case 0:
@@ -530,7 +547,9 @@
                              void *const data, int16_t *mode_context) {
   const int *ref_sign_bias = cm->ref_frame_sign_bias;
   int i, refmv_count = 0;
+#if !CONFIG_SIMP_MV_PRED
   const POSITION *const mv_ref_search = mv_ref_blocks[mi->mbmi.sb_type];
+#endif
   int different_ref_found = 0;
   int context_counter = 0;
   const MV_REF *const prev_frame_mvs =
@@ -540,6 +559,29 @@
   const TileInfo *const tile = &xd->tile;
   const int bw = num_8x8_blocks_wide_lookup[mi->mbmi.sb_type] << 3;
   const int bh = num_8x8_blocks_high_lookup[mi->mbmi.sb_type] << 3;
+#if CONFIG_SIMP_MV_PRED
+  POSITION mv_ref_search[MVREF_NEIGHBOURS];
+  const int num_8x8_blocks_wide = bw >> 3;
+  const int num_8x8_blocks_high = bh >> 3;
+  mv_ref_search[0].row = num_8x8_blocks_high - 1;
+  mv_ref_search[0].col = -1;
+  mv_ref_search[1].row = -1;
+  mv_ref_search[1].col = num_8x8_blocks_wide - 1;
+  mv_ref_search[2].row = -1;
+  mv_ref_search[2].col = (num_8x8_blocks_wide - 1) >> 1;
+  mv_ref_search[3].row = (num_8x8_blocks_high - 1) >> 1;
+  mv_ref_search[3].col = -1;
+  mv_ref_search[4].row = -1;
+  mv_ref_search[4].col = -1;
+  mv_ref_search[5].row = -1;
+  mv_ref_search[5].col = num_8x8_blocks_wide;
+  mv_ref_search[6].row = num_8x8_blocks_high;
+  mv_ref_search[6].col = -1;
+  mv_ref_search[7].row = -1;
+  mv_ref_search[7].col = -3;
+  mv_ref_search[8].row = num_8x8_blocks_high - 1;
+  mv_ref_search[8].col = -3;
+#endif
 
   // The nearest 2 blocks are treated differently
   // if the size < 8x8 we get the mv from the bmi substructure,
@@ -571,6 +613,11 @@
     if (is_inside(tile, mi_col, mi_row, mv_ref)) {
       const MB_MODE_INFO *const candidate =
           &xd->mi[mv_ref->col + mv_ref->row * xd->mi_stride]->mbmi;
+#if CONFIG_SIMP_MV_PRED
+      if (candidate == NULL) continue;
+      if ((mi_row % 8) + mv_ref->row >= 8 || (mi_col % 8) + mv_ref->col >= 8)
+        continue;
+#endif
       different_ref_found = 1;
 
       if (candidate->ref_frame[0] == ref_frame)
@@ -617,6 +664,11 @@
       if (is_inside(tile, mi_col, mi_row, mv_ref)) {
         const MB_MODE_INFO *const candidate =
             &xd->mi[mv_ref->col + mv_ref->row * xd->mi_stride]->mbmi;
+#if CONFIG_SIMP_MV_PRED
+        if (candidate == NULL) continue;
+        if ((mi_row % 8) + mv_ref->row >= 8 || (mi_col % 8) + mv_ref->col >= 8)
+          continue;
+#endif
 
         // If the candidate is INTRA we don't want to consider its mv.
         IF_DIFF_REF_FRAME_ADD_MV(candidate, ref_frame, ref_sign_bias,
diff --git a/av1/common/mvref_common.h b/av1/common/mvref_common.h
index 25ebbfd..a9478a6 100644
--- a/av1/common/mvref_common.h
+++ b/av1/common/mvref_common.h
@@ -18,7 +18,11 @@
 extern "C" {
 #endif
 
+#if CONFIG_SIMP_MV_PRED
+#define MVREF_NEIGHBOURS 9
+#else
 #define MVREF_NEIGHBOURS 8
+#endif
 
 typedef struct position {
   int row;
@@ -96,6 +100,7 @@
   BOTH_INTRA             // 18
 };
 
+#if !CONFIG_SIMP_MV_PRED
 static const POSITION mv_ref_blocks[BLOCK_SIZES][MVREF_NEIGHBOURS] = {
   // 4X4
   { { -1, 0 },
@@ -245,6 +250,7 @@
     { -2, 12 } },
 #endif  // CONFIG_EXT_PARTITION
 };
+#endif
 
 static const int idx_n_column_to_subblock[4][2] = {
   { 1, 2 }, { 1, 3 }, { 3, 2 }, { 3, 3 }
@@ -268,22 +274,30 @@
 // on whether the block_size < 8x8 and we have check_sub_blocks set.
 static INLINE int_mv get_sub_block_mv(const MODE_INFO *candidate, int which_mv,
                                       int search_col, int block_idx) {
+#if CONFIG_SIMP_MV_PRED
+  return candidate->mbmi.mv[which_mv];
+#else
   return block_idx >= 0 && candidate->mbmi.sb_type < BLOCK_8X8
              ? candidate
                    ->bmi[idx_n_column_to_subblock[block_idx][search_col == 0]]
                    .as_mv[which_mv]
              : candidate->mbmi.mv[which_mv];
+#endif
 }
 
 #if CONFIG_REF_MV
 static INLINE int_mv get_sub_block_pred_mv(const MODE_INFO *candidate,
                                            int which_mv, int search_col,
                                            int block_idx) {
+#if CONFIG_SIMP_MV_PRED
+  return candidate->mbmi.mv[which_mv];
+#else
   return block_idx >= 0 && candidate->mbmi.sb_type < BLOCK_8X8
              ? candidate
                    ->bmi[idx_n_column_to_subblock[block_idx][search_col == 0]]
                    .pred_mv[which_mv]
              : candidate->mbmi.pred_mv[which_mv];
+#endif
 }
 #endif
 
@@ -341,8 +355,7 @@
 }
 
 static INLINE void lower_mv_precision(MV *mv, int allow_hp) {
-  const int use_hp = allow_hp && av1_use_mv_hp(mv);
-  if (!use_hp) {
+  if (!allow_hp) {
     if (mv->row & 1) mv->row += (mv->row > 0 ? -1 : 1);
     if (mv->col & 1) mv->col += (mv->col > 0 ? -1 : 1);
   }
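With the av1_use_mv_hp() check dropped, lower_mv_precision rounds purely on allow_hp: any odd (eighth-pel) component is snapped one unit toward zero. A self-contained sketch of the simplified rounding (the MV struct here is a stand-in for illustration, not the codec's type):

#include <stdio.h>

typedef struct { int row, col; } MV;

/* Mirror of the simplified lower_mv_precision above: with allow_hp == 0,
 * odd components are snapped one unit toward zero; the per-vector
 * av1_use_mv_hp() check no longer participates. */
static void lower_mv_precision_sketch(MV *mv, int allow_hp) {
  if (!allow_hp) {
    if (mv->row & 1) mv->row += (mv->row > 0 ? -1 : 1);
    if (mv->col & 1) mv->col += (mv->col > 0 ? -1 : 1);
  }
}

int main(void) {
  MV mv = { 5, -7 };                 /* eighth-pel units */
  lower_mv_precision_sketch(&mv, 0);
  printf("%d %d\n", mv.row, mv.col); /* prints "4 -6" */
  return 0;
}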
diff --git a/av1/common/od_dering.c b/av1/common/od_dering.c
index 7aa704f..f19291c 100644
--- a/av1/common/od_dering.c
+++ b/av1/common/od_dering.c
@@ -262,13 +262,52 @@
   return (threshold * OD_THRESH_TABLE_Q8[OD_ILOG(v1)] + 128) >> 8;
 }
 
+static INLINE void copy_8x8_16bit(int16_t *dst, int dstride, int16_t *src,
+                                  int sstride) {
+  int i, j;
+  for (i = 0; i < 8; i++)
+    for (j = 0; j < 8; j++)
+      dst[i * dstride + j] = src[i * sstride + j];
+}
+
+static INLINE void copy_4x4_16bit(int16_t *dst, int dstride, int16_t *src,
+                                  int sstride) {
+  int i, j;
+  for (i = 0; i < 4; i++)
+    for (j = 0; j < 4; j++)
+      dst[i * dstride + j] = src[i * sstride + j];
+}
+
+/* TODO: Optimize this function for SSE. */
+void copy_blocks_16bit(int16_t *dst, int dstride, int16_t *src, int sstride,
+                       unsigned char (*bskip)[2], int dering_count,
+                       int bsize) {
+  int bi, bx, by;
+  if (bsize == 3) {
+    for (bi = 0; bi < dering_count; bi++) {
+      by = bskip[bi][0];
+      bx = bskip[bi][1];
+      copy_8x8_16bit(&dst[(by << 3) * dstride + (bx << 3)],
+                     dstride,
+                     &src[(by << 3) * sstride + (bx << 3)], sstride);
+    }
+  } else {
+    for (bi = 0; bi < dering_count; bi++) {
+      by = bskip[bi][0];
+      bx = bskip[bi][1];
+      copy_4x4_16bit(&dst[(by << 2) * dstride + (bx << 2)],
+                     dstride,
+                     &src[(by << 2) * sstride + (bx << 2)], sstride);
+    }
+  }
+}
+
 void od_dering(int16_t *y, int ystride, const od_dering_in *x, int xstride,
                int nhb, int nvb, int sbx, int sby, int nhsb, int nvsb, int xdec,
                int dir[OD_DERING_NBLOCKS][OD_DERING_NBLOCKS], int pli,
-               unsigned char *bskip, int skip_stride, int threshold,
+               unsigned char (*bskip)[2], int dering_count, int threshold,
                int coeff_shift) {
   int i;
   int j;
+  int bi;
   int bx;
   int by;
   int16_t inbuf[OD_DERING_INBUF_SIZE];
@@ -295,56 +334,44 @@
       in[i * OD_FILT_BSTRIDE + j] = x[i * xstride + j];
     }
   }
-  /* Assume deringing filter is sparsely applied, so do one large copy rather
-     than small copies later if deringing is skipped. */
-  for (i = 0; i < nvb << bsize; i++) {
-    for (j = 0; j < nhb << bsize; j++) {
-      y[i * ystride + j] = in[i * OD_FILT_BSTRIDE + j];
-    }
-  }
   if (pli == 0) {
-    for (by = 0; by < nvb; by++) {
-      for (bx = 0; bx < nhb; bx++) {
-        if (bskip[by * skip_stride + bx]) continue;
-        dir[by][bx] = od_dir_find8(&x[8 * by * xstride + 8 * bx], xstride,
-                                   &var[by][bx], coeff_shift);
-        /* Deringing orthogonal to the direction uses a tighter threshold
-           because we want to be conservative. We've presumably already
-           achieved some deringing, so the amount of change is expected
-           to be low. Also, since we might be filtering across an edge, we
-           want to make sure not to blur it. That being said, we might want
-           to be a little bit more aggressive on pure horizontal/vertical
-           since the ringing there tends to be directional, so it doesn't
-           get removed by the directional filtering. */
-        filter2_thresh[by][bx] = (filter_dering_direction[bsize - OD_LOG_BSIZE0])(
-            &y[(by * ystride << bsize) + (bx << bsize)], ystride,
-            &in[(by * OD_FILT_BSTRIDE << bsize) + (bx << bsize)],
-            od_adjust_thresh(threshold, var[by][bx]), dir[by][bx]);
-      }
+    for (bi = 0; bi < dering_count; bi++) {
+      by = bskip[bi][0];
+      bx = bskip[bi][1];
+      dir[by][bx] = od_dir_find8(&x[8 * by * xstride + 8 * bx], xstride,
+                                 &var[by][bx], coeff_shift);
+      /* Deringing orthogonal to the direction uses a tighter threshold
+         because we want to be conservative. We've presumably already
+         achieved some deringing, so the amount of change is expected
+         to be low. Also, since we might be filtering across an edge, we
+         want to make sure not to blur it. That being said, we might want
+         to be a little bit more aggressive on pure horizontal/vertical
+         since the ringing there tends to be directional, so it doesn't
+         get removed by the directional filtering. */
+      filter2_thresh[by][bx] = (filter_dering_direction[bsize - OD_LOG_BSIZE0])(
+          &y[(by * ystride << bsize) + (bx << bsize)], ystride,
+          &in[(by * OD_FILT_BSTRIDE << bsize) + (bx << bsize)],
+          od_adjust_thresh(threshold, var[by][bx]), dir[by][bx]);
     }
   } else {
-    for (by = 0; by < nvb; by++) {
-      for (bx = 0; bx < nhb; bx++) {
-        if (bskip[by * skip_stride + bx]) continue;
-        filter2_thresh[by][bx] = (filter_dering_direction[bsize - OD_LOG_BSIZE0])(
-            &y[(by * ystride << bsize) + (bx << bsize)], ystride,
-            &in[(by * OD_FILT_BSTRIDE << bsize) + (bx << bsize)], threshold,
-            dir[by][bx]);
-      }
-    }
-  }
-  for (i = 0; i < nvb << bsize; i++) {
-    for (j = 0; j < nhb << bsize; j++) {
-      in[i * OD_FILT_BSTRIDE + j] = y[i * ystride + j];
-    }
-  }
-  for (by = 0; by < nvb; by++) {
-    for (bx = 0; bx < nhb; bx++) {
-      if (bskip[by * skip_stride + bx] || filter2_thresh[by][bx] == 0) continue;
-      (filter_dering_orthogonal[bsize - OD_LOG_BSIZE0])(
+    for (bi = 0; bi < dering_count; bi++) {
+      by = bskip[bi][0];
+      bx = bskip[bi][1];
+      filter2_thresh[by][bx] = (filter_dering_direction[bsize - OD_LOG_BSIZE0])(
           &y[(by * ystride << bsize) + (bx << bsize)], ystride,
-          &in[(by * OD_FILT_BSTRIDE << bsize) + (bx << bsize)], filter2_thresh[by][bx],
+          &in[(by * OD_FILT_BSTRIDE << bsize) + (bx << bsize)], threshold,
           dir[by][bx]);
     }
   }
+  copy_blocks_16bit(in, OD_FILT_BSTRIDE, y, ystride, bskip, dering_count,
+                    bsize);
+  for (bi = 0; bi < dering_count; bi++) {
+    by = bskip[bi][0];
+    bx = bskip[bi][1];
+    if (filter2_thresh[by][bx] == 0) continue;
+    (filter_dering_orthogonal[bsize - OD_LOG_BSIZE0])(
+        &y[(by * ystride << bsize) + (bx << bsize)], ystride,
+        &in[(by * OD_FILT_BSTRIDE << bsize) + (bx << bsize)],
+        filter2_thresh[by][bx], dir[by][bx]);
+  }
 }
diff --git a/av1/common/od_dering.h b/av1/common/od_dering.h
index c64439f..97090e5 100644
--- a/av1/common/od_dering.h
+++ b/av1/common/od_dering.h
@@ -12,6 +12,8 @@
 #if !defined(_dering_H)
 #define _dering_H (1)
 
+// clang-format off
+
 #include "odintrin.h"
 
 #if defined(DAALA_ODINTRIN)
@@ -34,10 +36,13 @@
 typedef void (*od_filter_dering_orthogonal_func)(int16_t *y, int ystride,
                                                  const int16_t *in,
                                                  int threshold, int dir);
+void copy_blocks_16bit(int16_t *dst, int dstride, int16_t *src, int sstride,
+    unsigned char (*bskip)[2], int dering_count, int bsize);
+
 void od_dering(int16_t *y, int ystride, const od_dering_in *x, int xstride,
                int nvb, int nhb, int sbx, int sby, int nhsb, int nvsb, int xdec,
                int dir[OD_DERING_NBLOCKS][OD_DERING_NBLOCKS], int pli,
-               unsigned char *bskip, int skip_stride, int threshold,
+               unsigned char (*bskip)[2], int dering_count, int threshold,
                int coeff_shift);
 int od_filter_dering_direction_4x4_c(int16_t *y, int ystride, const int16_t *in,
                                      int threshold, int dir);
diff --git a/av1/common/onyxc_int.h b/av1/common/onyxc_int.h
index be1cbc1..464314e 100644
--- a/av1/common/onyxc_int.h
+++ b/av1/common/onyxc_int.h
@@ -391,11 +391,15 @@
 #if CONFIG_DERING
   int dering_level;
 #endif
+
 #if CONFIG_DELTA_Q
   int delta_q_present_flag;
   // Resolution of delta quant
   int delta_q_res;
 #endif
+#if CONFIG_TILE_GROUPS
+  int num_tg;
+#endif
 } AV1_COMMON;
 
 // TODO(hkuang): Don't need to lock the whole pool after implementing atomic
@@ -588,11 +592,10 @@
 }
 
 #if CONFIG_DAALA_EC
-static INLINE const aom_cdf_prob *get_y_mode_cdf(const AV1_COMMON *cm,
-                                                 const MODE_INFO *mi,
-                                                 const MODE_INFO *above_mi,
-                                                 const MODE_INFO *left_mi,
-                                                 int block) {
+static INLINE aom_cdf_prob *get_y_mode_cdf(AV1_COMMON *cm, const MODE_INFO *mi,
+                                           const MODE_INFO *above_mi,
+                                           const MODE_INFO *left_mi,
+                                           int block) {
   const PREDICTION_MODE above = av1_above_block_mode(mi, above_mi, block);
   const PREDICTION_MODE left = av1_left_block_mode(mi, left_mi, block);
   return cm->kf_y_cdf[above][left];
@@ -675,6 +678,30 @@
   return (left * 2 + above) + bsl * PARTITION_PLOFFSET;
 }
 
+static INLINE int max_block_wide(const MACROBLOCKD *xd, const BLOCK_SIZE bsize,
+                                 const int plane) {
+  int max_blocks_wide = block_size_wide[bsize];
+  const struct macroblockd_plane *const pd = &xd->plane[plane];
+
+  if (xd->mb_to_right_edge < 0)
+    max_blocks_wide += xd->mb_to_right_edge >> (3 + pd->subsampling_x);
+
+  // Scale the width in the transform block unit.
+  return max_blocks_wide >> tx_size_wide_log2[0];
+}
+
+static INLINE int max_block_high(const MACROBLOCKD *xd, const BLOCK_SIZE bsize,
+                                 const int plane) {
+  int max_blocks_high = block_size_high[bsize];
+  const struct macroblockd_plane *const pd = &xd->plane[plane];
+
+  if (xd->mb_to_bottom_edge < 0)
+    max_blocks_high += xd->mb_to_bottom_edge >> (3 + pd->subsampling_y);
+
+  // Scale the height to transform-block units.
+  return max_blocks_high >> tx_size_wide_log2[0];
+}
+
 static INLINE void av1_zero_above_context(AV1_COMMON *const cm,
                                           int mi_col_start, int mi_col_end) {
   const int width = mi_col_end - mi_col_start;
@@ -704,36 +731,60 @@
 }
 
 #if CONFIG_VAR_TX
-static INLINE void set_txfm_ctx(TXFM_CONTEXT *txfm_ctx, TX_SIZE tx_size,
-                                int len) {
+static INLINE void set_txfm_ctx(TXFM_CONTEXT *txfm_ctx, uint8_t txs, int len) {
   int i;
-  for (i = 0; i < len; ++i) txfm_ctx[i] = tx_size;
+  for (i = 0; i < len; ++i) txfm_ctx[i] = txs;
 }
 
 static INLINE void set_txfm_ctxs(TX_SIZE tx_size, int n8_w, int n8_h,
                                  const MACROBLOCKD *xd) {
-  set_txfm_ctx(xd->above_txfm_context, txsize_horz_map[tx_size], n8_w);
-  set_txfm_ctx(xd->left_txfm_context, txsize_vert_map[tx_size], n8_h);
+  uint8_t bw = tx_size_wide[tx_size];
+  uint8_t bh = tx_size_high[tx_size];
+  set_txfm_ctx(xd->above_txfm_context, bw, n8_w);
+  set_txfm_ctx(xd->left_txfm_context, bh, n8_h);
 }
 
 static INLINE void txfm_partition_update(TXFM_CONTEXT *above_ctx,
                                          TXFM_CONTEXT *left_ctx,
                                          TX_SIZE tx_size) {
   BLOCK_SIZE bsize = txsize_to_bsize[tx_size];
-  int bs = num_8x8_blocks_high_lookup[bsize];
+  int bh = num_8x8_blocks_high_lookup[bsize];
+  int bw = num_8x8_blocks_wide_lookup[bsize];
+  uint8_t txw = tx_size_wide[tx_size];
+  uint8_t txh = tx_size_high[tx_size];
   int i;
-  for (i = 0; i < bs; ++i) {
-    above_ctx[i] = tx_size;
-    left_ctx[i] = tx_size;
-  }
+  for (i = 0; i < bh; ++i) left_ctx[i] = txh;
+  for (i = 0; i < bw; ++i) above_ctx[i] = txw;
 }
 
 static INLINE int txfm_partition_context(TXFM_CONTEXT *above_ctx,
                                          TXFM_CONTEXT *left_ctx,
-                                         TX_SIZE tx_size) {
-  int above = *above_ctx < tx_size;
-  int left = *left_ctx < tx_size;
-  return (tx_size - 1) * 3 + above + left;
+                                         const BLOCK_SIZE bsize,
+                                         const TX_SIZE tx_size) {
+  const uint8_t txw = tx_size_wide[tx_size];
+  const uint8_t txh = tx_size_high[tx_size];
+  const int above = *above_ctx < txw;
+  const int left = *left_ctx < txh;
+  TX_SIZE max_tx_size = max_txsize_lookup[bsize];
+  int category = 15;
+
+  if (max_tx_size == TX_32X32) {
+    if (tx_size == TX_32X32)
+      category = 0;
+    else
+      category = 1;
+  } else if (max_tx_size == TX_16X16) {
+    if (tx_size == TX_16X16)
+      category = 2;
+    else
+      category = 3;
+  } else if (max_tx_size == TX_8X8) {
+    category = 4;
+  }
+
+  if (category == 15) return category;
+
+  return category * 3 + above + left;
 }
 #endif
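txfm_partition_context now maps (max_txsize_lookup[bsize], tx_size) to a category and offsets it by the above/left comparisons, replacing the old (tx_size - 1) * 3 formula. A self-contained sketch of the category mapping with one worked context value (mock TX_SIZE enum carrying only the sizes used here):

#include <stdio.h>

typedef enum { TX_4X4, TX_8X8, TX_16X16, TX_32X32 } TX_SIZE;

/* Mirror of the category selection in txfm_partition_context above. */
static int category_sketch(TX_SIZE max_tx_size, TX_SIZE tx_size) {
  if (max_tx_size == TX_32X32) return tx_size == TX_32X32 ? 0 : 1;
  if (max_tx_size == TX_16X16) return tx_size == TX_16X16 ? 2 : 3;
  if (max_tx_size == TX_8X8) return 4;
  return 15; /* out-of-range bucket: returned as the context itself */
}

int main(void) {
  /* A block whose max transform is 16x16, coded with an 8x8 transform,
   * falls in category 3; with above = 1 and left = 0 the context is
   * 3 * 3 + 1 + 0 = 10. */
  int category = category_sketch(TX_16X16, TX_8X8);
  printf("%d\n", category * 3 + 1 + 0); /* prints "10" */
  return 0;
}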
 
diff --git a/av1/common/pred_common.c b/av1/common/pred_common.c
index 35067f2..5b7c2ec 100644
--- a/av1/common/pred_common.c
+++ b/av1/common/pred_common.c
@@ -756,36 +756,24 @@
     } else if (above_intra || left_intra) {  // intra/inter or inter/intra
       const MB_MODE_INFO *edge_mbmi = above_intra ? left_mbmi : above_mbmi;
 
-      if (!has_second_ref(edge_mbmi))
+      if (!has_second_ref(edge_mbmi))  // single
         pred_context = 4 * (!CHECK_BACKWARD_REFS(edge_mbmi->ref_frame[0]));
-      else
-        pred_context = 1 + (!CHECK_BACKWARD_REFS(edge_mbmi->ref_frame[0]) ||
-                            !CHECK_BACKWARD_REFS(edge_mbmi->ref_frame[1]));
+      else  // comp
+        pred_context = 2;
     } else {  // inter/inter
       const int above_has_second = has_second_ref(above_mbmi);
       const int left_has_second = has_second_ref(left_mbmi);
 
       const MV_REFERENCE_FRAME above0 = above_mbmi->ref_frame[0];
-      const MV_REFERENCE_FRAME above1 = above_mbmi->ref_frame[1];
       const MV_REFERENCE_FRAME left0 = left_mbmi->ref_frame[0];
-      const MV_REFERENCE_FRAME left1 = left_mbmi->ref_frame[1];
 
-      if (above_has_second && left_has_second) {
-        pred_context =
-            1 + (!CHECK_BACKWARD_REFS(above0) || !CHECK_BACKWARD_REFS(above1) ||
-                 !CHECK_BACKWARD_REFS(left0) || !CHECK_BACKWARD_REFS(left1));
-      } else if (above_has_second || left_has_second) {
+      if (above_has_second && left_has_second) {  // comp/comp
+        pred_context = 2;
+      } else if (above_has_second || left_has_second) {  // single/comp
         const MV_REFERENCE_FRAME rfs = !above_has_second ? above0 : left0;
-        const MV_REFERENCE_FRAME crf1 = above_has_second ? above0 : left0;
-        const MV_REFERENCE_FRAME crf2 = above_has_second ? above1 : left1;
 
-        if (!CHECK_BACKWARD_REFS(rfs))
-          pred_context =
-              3 + (!CHECK_BACKWARD_REFS(crf1) || !CHECK_BACKWARD_REFS(crf2));
-        else
-          pred_context =
-              !CHECK_BACKWARD_REFS(crf1) || !CHECK_BACKWARD_REFS(crf2);
-      } else {
+        pred_context = (!CHECK_BACKWARD_REFS(rfs)) ? 4 : 1;
+      } else {  // single/single
         pred_context = 2 * (!CHECK_BACKWARD_REFS(above0)) +
                        2 * (!CHECK_BACKWARD_REFS(left0));
       }
@@ -794,12 +782,11 @@
     const MB_MODE_INFO *edge_mbmi = has_above ? above_mbmi : left_mbmi;
     if (!is_inter_block(edge_mbmi)) {  // intra
       pred_context = 2;
-    } else {  // inter
-      if (!has_second_ref(edge_mbmi))
+    } else {                           // inter
+      if (!has_second_ref(edge_mbmi))  // single
         pred_context = 4 * (!CHECK_BACKWARD_REFS(edge_mbmi->ref_frame[0]));
-      else
-        pred_context = 1 + (!CHECK_BACKWARD_REFS(edge_mbmi->ref_frame[0]) ||
-                            !CHECK_BACKWARD_REFS(edge_mbmi->ref_frame[1]));
+      else  // comp
+        pred_context = 2;
     }
   } else {  // no edges available
     pred_context = 2;
@@ -833,12 +820,12 @@
       pred_context = 2;
     } else if (above_intra || left_intra) {  // intra/inter or inter/intra
       const MB_MODE_INFO *edge_mbmi = above_intra ? left_mbmi : above_mbmi;
-      if (!has_second_ref(edge_mbmi)) {
+      if (!has_second_ref(edge_mbmi)) {  // single
         if (!CHECK_BACKWARD_REFS(edge_mbmi->ref_frame[0]))
           pred_context = 3;
         else
           pred_context = 4 * (edge_mbmi->ref_frame[0] == BWDREF_FRAME);
-      } else {
+      } else {  // comp
         pred_context = 1 +
                        2 * (edge_mbmi->ref_frame[0] == BWDREF_FRAME ||
                             edge_mbmi->ref_frame[1] == BWDREF_FRAME);
@@ -851,14 +838,14 @@
       const MV_REFERENCE_FRAME left0 = left_mbmi->ref_frame[0];
       const MV_REFERENCE_FRAME left1 = left_mbmi->ref_frame[1];
 
-      if (above_has_second && left_has_second) {
+      if (above_has_second && left_has_second) {  // comp/comp
         if (above0 == left0 && above1 == left1)
           pred_context =
               3 * (above0 == BWDREF_FRAME || above1 == BWDREF_FRAME ||
                    left0 == BWDREF_FRAME || left1 == BWDREF_FRAME);
         else
           pred_context = 2;
-      } else if (above_has_second || left_has_second) {
+      } else if (above_has_second || left_has_second) {  // single/comp
         const MV_REFERENCE_FRAME rfs = !above_has_second ? above0 : left0;
         const MV_REFERENCE_FRAME crf1 = above_has_second ? above0 : left0;
         const MV_REFERENCE_FRAME crf2 = above_has_second ? above1 : left1;
@@ -869,7 +856,7 @@
           pred_context = (crf1 == BWDREF_FRAME || crf2 == BWDREF_FRAME);
         else
           pred_context = 1 + 2 * (crf1 == BWDREF_FRAME || crf2 == BWDREF_FRAME);
-      } else {
+      } else {  // single/single
         if (!CHECK_BACKWARD_REFS(above0) && !CHECK_BACKWARD_REFS(left0)) {
           pred_context = 2 + (above0 == left0);
         } else if (!CHECK_BACKWARD_REFS(above0) ||
@@ -890,9 +877,9 @@
         (!CHECK_BACKWARD_REFS(edge_mbmi->ref_frame[0]) &&
          !has_second_ref(edge_mbmi)))
       pred_context = 2;
-    else if (!has_second_ref(edge_mbmi))
+    else if (!has_second_ref(edge_mbmi))  // single
       pred_context = 4 * (edge_mbmi->ref_frame[0] == BWDREF_FRAME);
-    else
+    else  // comp
       pred_context = 3 * (edge_mbmi->ref_frame[0] == BWDREF_FRAME ||
                           edge_mbmi->ref_frame[1] == BWDREF_FRAME);
   } else {  // no edges available (2)
@@ -927,12 +914,12 @@
       pred_context = 2;
     } else if (above_intra || left_intra) {  // intra/inter or inter/intra
       const MB_MODE_INFO *edge_mbmi = above_intra ? left_mbmi : above_mbmi;
-      if (!has_second_ref(edge_mbmi)) {
+      if (!has_second_ref(edge_mbmi)) {  // single
         if (CHECK_BACKWARD_REFS(edge_mbmi->ref_frame[0]))
           pred_context = 3;
         else
           pred_context = 4 * CHECK_LAST_OR_LAST2(edge_mbmi->ref_frame[0]);
-      } else {
+      } else {  // comp
         pred_context = 1 +
                        2 * (CHECK_LAST_OR_LAST2(edge_mbmi->ref_frame[0]) ||
                             CHECK_LAST_OR_LAST2(edge_mbmi->ref_frame[1]));
@@ -945,14 +932,14 @@
       const MV_REFERENCE_FRAME left0 = left_mbmi->ref_frame[0];
       const MV_REFERENCE_FRAME left1 = left_mbmi->ref_frame[1];
 
-      if (above_has_second && left_has_second) {
+      if (above_has_second && left_has_second) {  // comp/comp
         if (above0 == left0 && above1 == left1)
           pred_context =
               3 * (CHECK_LAST_OR_LAST2(above0) || CHECK_LAST_OR_LAST2(above1) ||
                    CHECK_LAST_OR_LAST2(left0) || CHECK_LAST_OR_LAST2(left1));
         else
           pred_context = 2;
-      } else if (above_has_second || left_has_second) {
+      } else if (above_has_second || left_has_second) {  // single/comp
         const MV_REFERENCE_FRAME rfs = !above_has_second ? above0 : left0;
         const MV_REFERENCE_FRAME crf1 = above_has_second ? above0 : left0;
         const MV_REFERENCE_FRAME crf2 = above_has_second ? above1 : left1;
@@ -966,7 +953,7 @@
         else
           pred_context =
               1 + 2 * (CHECK_LAST_OR_LAST2(crf1) || CHECK_LAST_OR_LAST2(crf2));
-      } else {
+      } else {  // single/single
         if (CHECK_BACKWARD_REFS(above0) && CHECK_BACKWARD_REFS(left0)) {
           pred_context = 2 + (above0 == left0);
         } else if (CHECK_BACKWARD_REFS(above0) || CHECK_BACKWARD_REFS(left0)) {
@@ -986,9 +973,9 @@
         (CHECK_BACKWARD_REFS(edge_mbmi->ref_frame[0]) &&
          !has_second_ref(edge_mbmi)))
       pred_context = 2;
-    else if (!has_second_ref(edge_mbmi))
+    else if (!has_second_ref(edge_mbmi))  // single
       pred_context = 4 * (CHECK_LAST_OR_LAST2(edge_mbmi->ref_frame[0]));
-    else
+    else  // comp
       pred_context = 3 * (CHECK_LAST_OR_LAST2(edge_mbmi->ref_frame[0]) ||
                           CHECK_LAST_OR_LAST2(edge_mbmi->ref_frame[1]));
   } else {  // no edges available (2)
@@ -1023,12 +1010,12 @@
       pred_context = 2;
     } else if (above_intra || left_intra) {  // intra/inter or inter/intra
       const MB_MODE_INFO *edge_mbmi = above_intra ? left_mbmi : above_mbmi;
-      if (!has_second_ref(edge_mbmi)) {
+      if (!has_second_ref(edge_mbmi)) {  // single
         if (!CHECK_LAST_OR_LAST2(edge_mbmi->ref_frame[0]))
           pred_context = 3;
         else
           pred_context = 4 * (edge_mbmi->ref_frame[0] == LAST_FRAME);
-      } else {
+      } else {  // comp
         pred_context = 1 +
                        2 * (edge_mbmi->ref_frame[0] == LAST_FRAME ||
                             edge_mbmi->ref_frame[1] == LAST_FRAME);
@@ -1041,13 +1028,13 @@
       const MV_REFERENCE_FRAME left0 = left_mbmi->ref_frame[0];
       const MV_REFERENCE_FRAME left1 = left_mbmi->ref_frame[1];
 
-      if (above_has_second && left_has_second) {
+      if (above_has_second && left_has_second) {  // comp/comp
         if (above0 == left0 && above1 == left1)
           pred_context = 3 * (above0 == LAST_FRAME || above1 == LAST_FRAME ||
                               left0 == LAST_FRAME || left1 == LAST_FRAME);
         else
           pred_context = 2;
-      } else if (above_has_second || left_has_second) {
+      } else if (above_has_second || left_has_second) {  // single/comp
         const MV_REFERENCE_FRAME rfs = !above_has_second ? above0 : left0;
         const MV_REFERENCE_FRAME crf1 = above_has_second ? above0 : left0;
         const MV_REFERENCE_FRAME crf2 = above_has_second ? above1 : left1;
@@ -1058,7 +1045,7 @@
           pred_context = (crf1 == LAST_FRAME || crf2 == LAST_FRAME);
         else
           pred_context = 1 + 2 * (crf1 == LAST_FRAME || crf2 == LAST_FRAME);
-      } else {
+      } else {  // single/single
         if (!CHECK_LAST_OR_LAST2(above0) && !CHECK_LAST_OR_LAST2(left0)) {
           pred_context = 2 + (above0 == left0);
         } else if (!CHECK_LAST_OR_LAST2(above0) ||
@@ -1078,9 +1065,9 @@
         (!CHECK_LAST_OR_LAST2(edge_mbmi->ref_frame[0]) &&
          !has_second_ref(edge_mbmi)))
       pred_context = 2;
-    else if (!has_second_ref(edge_mbmi))
+    else if (!has_second_ref(edge_mbmi))  // single
       pred_context = 4 * (edge_mbmi->ref_frame[0] == LAST_FRAME);
-    else
+    else  // comp
       pred_context = 3 * (edge_mbmi->ref_frame[0] == LAST_FRAME ||
                           edge_mbmi->ref_frame[1] == LAST_FRAME);
   } else {  // no edges available (2)
@@ -1115,12 +1102,12 @@
       pred_context = 2;
     } else if (above_intra || left_intra) {  // intra/inter or inter/intra
       const MB_MODE_INFO *edge_mbmi = above_intra ? left_mbmi : above_mbmi;
-      if (!has_second_ref(edge_mbmi)) {
+      if (!has_second_ref(edge_mbmi)) {  // single
         if (!CHECK_GOLDEN_OR_LAST3(edge_mbmi->ref_frame[0]))
           pred_context = 3;
         else
           pred_context = 4 * (edge_mbmi->ref_frame[0] == LAST3_FRAME);
-      } else {
+      } else {  // comp
         pred_context = 1 +
                        2 * (edge_mbmi->ref_frame[0] == LAST3_FRAME ||
                             edge_mbmi->ref_frame[1] == LAST3_FRAME);
@@ -1133,13 +1120,13 @@
       const MV_REFERENCE_FRAME left0 = left_mbmi->ref_frame[0];
       const MV_REFERENCE_FRAME left1 = left_mbmi->ref_frame[1];
 
-      if (above_has_second && left_has_second) {
+      if (above_has_second && left_has_second) {  // comp/comp
         if (above0 == left0 && above1 == left1)
           pred_context = 3 * (above0 == LAST3_FRAME || above1 == LAST3_FRAME ||
                               left0 == LAST3_FRAME || left1 == LAST3_FRAME);
         else
           pred_context = 2;
-      } else if (above_has_second || left_has_second) {
+      } else if (above_has_second || left_has_second) {  // single/comp
         const MV_REFERENCE_FRAME rfs = !above_has_second ? above0 : left0;
         const MV_REFERENCE_FRAME crf1 = above_has_second ? above0 : left0;
         const MV_REFERENCE_FRAME crf2 = above_has_second ? above1 : left1;
@@ -1150,7 +1137,7 @@
           pred_context = (crf1 == LAST3_FRAME || crf2 == LAST3_FRAME);
         else
           pred_context = 1 + 2 * (crf1 == LAST3_FRAME || crf2 == LAST3_FRAME);
-      } else {
+      } else {  // single/single
         if (!CHECK_GOLDEN_OR_LAST3(above0) && !CHECK_GOLDEN_OR_LAST3(left0)) {
           pred_context = 2 + (above0 == left0);
         } else if (!CHECK_GOLDEN_OR_LAST3(above0) ||
@@ -1171,9 +1158,9 @@
         (!CHECK_GOLDEN_OR_LAST3(edge_mbmi->ref_frame[0]) &&
          !has_second_ref(edge_mbmi)))
       pred_context = 2;
-    else if (!has_second_ref(edge_mbmi))
+    else if (!has_second_ref(edge_mbmi))  // single
       pred_context = 4 * (edge_mbmi->ref_frame[0] == LAST3_FRAME);
-    else
+    else  // comp
       pred_context = 3 * (edge_mbmi->ref_frame[0] == LAST3_FRAME ||
                           edge_mbmi->ref_frame[1] == LAST3_FRAME);
   } else {  // no edges available (2)
diff --git a/av1/common/pred_common.h b/av1/common/pred_common.h
index b3ef1c4..6b47ed2 100644
--- a/av1/common/pred_common.h
+++ b/av1/common/pred_common.h
@@ -186,8 +186,13 @@
   if (!has_left) left_ctx = above_ctx;
 
   if (!has_above) above_ctx = left_ctx;
-
+#if CONFIG_CB4X4
+  // TODO(jingning): Temporary setup. Will rework this after the cb4x4
+  // framework is up and running.
+  return (above_ctx + left_ctx) > max_tx_size + 1;
+#else
   return (above_ctx + left_ctx) > max_tx_size;
+#endif
 }
 
 #if CONFIG_VAR_TX
@@ -200,13 +205,8 @@
   const int tx_row = blk_row >> (1 - pd->subsampling_y);
   const int tx_col = blk_col >> (1 - pd->subsampling_x);
   const TX_SIZE plane_tx_size = mbmi->inter_tx_size[tx_row][tx_col];
-  int max_blocks_high = num_4x4_blocks_high_lookup[plane_bsize];
-  int max_blocks_wide = num_4x4_blocks_wide_lookup[plane_bsize];
-
-  if (xd->mb_to_bottom_edge < 0)
-    max_blocks_high += xd->mb_to_bottom_edge >> (5 + pd->subsampling_y);
-  if (xd->mb_to_right_edge < 0)
-    max_blocks_wide += xd->mb_to_right_edge >> (5 + pd->subsampling_x);
+  const int max_blocks_high = max_block_high(xd, plane_bsize, 0);
+  const int max_blocks_wide = max_block_wide(xd, plane_bsize, 0);
 
   if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide) return;
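The context code above now clamps via max_block_wide()/max_block_high(), which fold the (negative) distance to the frame edge into the block size before converting to 4x4 transform units. A self-contained sketch of that clamping, assuming the AV1 conventions that mb_to_right_edge is in 1/8-pel units (negative when the block overhangs the frame) and that tx_size_wide_log2[0] == 2:

#include <stdio.h>

/* Sketch of the max_block_wide() clamping introduced above. */
static int max_block_wide_sketch(int block_width_px, int mb_to_right_edge,
                                 int subsampling_x) {
  int max_blocks_wide = block_width_px;
  if (mb_to_right_edge < 0)
    max_blocks_wide += mb_to_right_edge >> (3 + subsampling_x);
  return max_blocks_wide >> 2; /* pixels -> 4x4 transform units */
}

int main(void) {
  /* A 64-pixel-wide luma block whose last 16 pixel columns lie outside the
   * frame: mb_to_right_edge = -16 * 8 eighth-pels. Visible width is 48px,
   * i.e. 12 transform units. */
  printf("%d\n", max_block_wide_sketch(64, -16 * 8, 0)); /* prints "12" */
  return 0;
}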
 
diff --git a/av1/common/quant_common.c b/av1/common/quant_common.c
index b3228b7..69d0cc0 100644
--- a/av1/common/quant_common.c
+++ b/av1/common/quant_common.c
@@ -16,11 +16,6 @@
 #include "av1/common/seg_common.h"
 #include "av1/common/blockd.h"
 
-#if CONFIG_AOM_QM
-static void make_qmatrices(qm_val_t *wmatrix[NUM_QM_LEVELS][2][2][TX_SIZES],
-                           qm_val_t *iwmatrix[NUM_QM_LEVELS][2][2][TX_SIZES]);
-#endif
-
 #if CONFIG_NEW_QUANT
 // Bin widths expressed as a fraction over 128 of the quant stepsize,
 // for the quantization bins 0-4.
diff --git a/av1/common/reconinter.c b/av1/common/reconinter.c
index 6c4ae2a..66b6bfd 100644
--- a/av1/common/reconinter.c
+++ b/av1/common/reconinter.c
@@ -1889,6 +1889,7 @@
                                                   int dst_stride,
                                                   PREDICTION_MODE mode,
                                                   BLOCK_SIZE bsize, int plane) {
+  struct macroblockd_plane *const pd = &xd->plane[plane];
   BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, &xd->plane[plane]);
   const int bwl = b_width_log2_lookup[plane_bsize];
   const int bhl = b_height_log2_lookup[plane_bsize];
@@ -1897,14 +1898,14 @@
   TX_SIZE max_tx_size = max_txsize_lookup[plane_bsize];
 
   if (bwl == bhl) {
-    av1_predict_intra_block(xd, bwl, bhl, max_tx_size, mode, ref, ref_stride,
-                            dst, dst_stride, 0, 0, plane);
+    av1_predict_intra_block(xd, pd->width, pd->height, max_tx_size, mode, ref,
+                            ref_stride, dst, dst_stride, 0, 0, plane);
 
   } else if (bwl < bhl) {
     uint8_t *src_2 = ref + pxbw * ref_stride;
     uint8_t *dst_2 = dst + pxbw * dst_stride;
-    av1_predict_intra_block(xd, bwl, bhl, max_tx_size, mode, ref, ref_stride,
-                            dst, dst_stride, 0, 0, plane);
+    av1_predict_intra_block(xd, pd->width, pd->height, max_tx_size, mode, ref,
+                            ref_stride, dst, dst_stride, 0, 0, plane);
 #if CONFIG_AOM_HIGHBITDEPTH
     if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
       uint16_t *src_216 = CONVERT_TO_SHORTPTR(src_2);
@@ -1916,14 +1917,14 @@
     {
       memcpy(src_2 - ref_stride, dst_2 - dst_stride, sizeof(*src_2) * pxbw);
     }
-    av1_predict_intra_block(xd, bwl, bhl, max_tx_size, mode, src_2, ref_stride,
-                            dst_2, dst_stride, 0, 1 << bwl, plane);
+    av1_predict_intra_block(xd, pd->width, pd->height, max_tx_size, mode, src_2,
+                            ref_stride, dst_2, dst_stride, 0, 1 << bwl, plane);
   } else {  // bwl > bhl
     int i;
     uint8_t *src_2 = ref + pxbh;
     uint8_t *dst_2 = dst + pxbh;
-    av1_predict_intra_block(xd, bwl, bhl, max_tx_size, mode, ref, ref_stride,
-                            dst, dst_stride, 0, 0, plane);
+    av1_predict_intra_block(xd, pd->width, pd->height, max_tx_size, mode, ref,
+                            ref_stride, dst, dst_stride, 0, 0, plane);
 #if CONFIG_AOM_HIGHBITDEPTH
     if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
       uint16_t *src_216 = CONVERT_TO_SHORTPTR(src_2);
@@ -1936,8 +1937,8 @@
       for (i = 0; i < pxbh; ++i)
         src_2[i * ref_stride - 1] = dst_2[i * dst_stride - 1];
     }
-    av1_predict_intra_block(xd, bwl, bhl, max_tx_size, mode, src_2, ref_stride,
-                            dst_2, dst_stride, 1 << bhl, 0, plane);
+    av1_predict_intra_block(xd, pd->width, pd->height, max_tx_size, mode, src_2,
+                            ref_stride, dst_2, dst_stride, 1 << bhl, 0, plane);
   }
 }
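
The reconinter.c hunks above replace the log2 block-size arguments with pixel dimensions taken from the plane descriptor. A one-line sketch of the assumed relation between the two encodings (b_width_log2_lookup counting 4x4 units):

/* Sketch (assumed relation): bwl is the log2 of the width in 4x4
 * units, pd->width is the same width in pixels. */
static int bwl_to_pixels(int bwl) { return 4 << bwl; }
/* e.g. bwl = 2 -> a 16-pixel-wide block */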
 
diff --git a/av1/common/reconinter.h b/av1/common/reconinter.h
index 3eec384..13f581e 100644
--- a/av1/common/reconinter.h
+++ b/av1/common/reconinter.h
@@ -14,7 +14,7 @@
 
 #include "av1/common/filter.h"
 #include "av1/common/onyxc_int.h"
-#include "av1/common/av1_convolve.h"
+#include "av1/common/convolve.h"
 #include "aom/aom_integer.h"
 
 #ifdef __cplusplus
@@ -418,7 +418,6 @@
                           const YV12_BUFFER_CONFIG *src, int mi_row, int mi_col,
                           const struct scale_factors *sf);
 
-#if CONFIG_DUAL_FILTER
 // Detect if the block has sub-pixel level motion vectors
 // per component.
 static INLINE int has_subpel_mv_component(const MODE_INFO *const mi,
@@ -460,60 +459,28 @@
 
   return 0;
 }
-#endif
 
-#if CONFIG_EXT_INTERP
+#define CHECK_SUBPEL 0
 static INLINE int av1_is_interp_needed(const MACROBLOCKD *const xd) {
+#if CHECK_SUBPEL
   MODE_INFO *const mi = xd->mi[0];
-  MB_MODE_INFO *const mbmi = &mi->mbmi;
-  const BLOCK_SIZE bsize = mbmi->sb_type;
-  const int is_compound = has_second_ref(mbmi);
-  int intpel_mv = 1;
-  int plane;
-
-#if SUPPORT_NONINTERPOLATING_FILTERS
-  // TODO(debargha): This is currently only for experimentation
-  // with non-interpolating filters. Remove later.
-  // If any of the filters are non-interpolating, then indicate the
-  // interpolation filter always.
-  int i;
-  for (i = 0; i < SWITCHABLE_FILTERS; ++i) {
-    if (!IsInterpolatingFilter(i)) return 1;
-  }
-#endif
-
-  // For scaled references, interpolation filter is indicated all the time.
-  if (av1_is_scaled(&xd->block_refs[0]->sf)) return 1;
-  if (is_compound && av1_is_scaled(&xd->block_refs[1]->sf)) return 1;
-
-  if (bsize < BLOCK_8X8) {
-    for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
-      const PARTITION_TYPE bp = BLOCK_8X8 - bsize;
-      const struct macroblockd_plane *const pd = &xd->plane[plane];
-      const int have_vsplit = bp != PARTITION_HORZ;
-      const int have_hsplit = bp != PARTITION_VERT;
-      const int num_4x4_w = 2 >> ((!have_vsplit) | pd->subsampling_x);
-      const int num_4x4_h = 2 >> ((!have_hsplit) | pd->subsampling_y);
-      int ref;
-      for (ref = 0; ref < 1 + is_compound; ++ref) {
-        int x, y;
-        for (y = 0; y < num_4x4_h; ++y)
-          for (x = 0; x < num_4x4_w; ++x) {
-            const MV mv = average_split_mvs(pd, mi, ref, y * 2 + x);
-            if (mv_has_subpel(&mv)) return 1;
-          }
+  const int is_compound = has_second_ref(&mi->mbmi);
+  int ref;
+  for (ref = 0; ref < 1 + is_compound; ++ref) {
+    int row_col;
+    for (row_col = 0; row_col < 2; ++row_col) {
+      const int dir = (ref << 1) + row_col;
+      if (has_subpel_mv_component(mi, xd, dir)) {
+        return 1;
       }
     }
-    return 0;
-  } else {
-    intpel_mv = !mv_has_subpel(&mbmi->mv[0].as_mv);
-    if (is_compound && intpel_mv) {
-      intpel_mv &= !mv_has_subpel(&mbmi->mv[1].as_mv);
-    }
   }
-  return !intpel_mv;
+  return 0;
+#else
+  (void)xd;
+  return 1;
+#endif
 }
-#endif  // CONFIG_EXT_INTERP
 
 #if CONFIG_MOTION_VAR
 const uint8_t *av1_get_obmc_mask(int length);
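
The rewritten av1_is_interp_needed above, when CHECK_SUBPEL is enabled, reduces to per-component sub-pel tests. A self-contained sketch of that test, assuming 1/8-pel motion vector units (SUBPEL_BITS == 3), which is what mv_has_subpel checked in the removed branch:

typedef struct { short row, col; } MV_SKETCH; /* stand-in for MV */

/* A component has sub-pel motion iff its low three bits are nonzero. */
static int mv_has_subpel_sketch(const MV_SKETCH *mv) {
  return (mv->row & 7) || (mv->col & 7);
}
/* e.g. row = 16 (2 full pels) -> 0;  row = 12 (1.5 pels) -> 1 */
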
diff --git a/av1/common/scan.c b/av1/common/scan.c
index 1281843..b2386b9 100644
--- a/av1/common/scan.c
+++ b/av1/common/scan.c
@@ -36,12 +36,12 @@
   0, 1, 4, 2, 5, 3, 6, 8, 9, 7, 12, 10, 13, 11, 14, 15,
 };
 
-#if CONFIG_EXT_TX
 DECLARE_ALIGNED(16, static const int16_t, default_scan_4x8[32]) = {
   0,  1,  4,  5,  2,  8,  6,  9,  10, 3,  12, 7,  13, 11, 14, 16,
   17, 15, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
 };
 
+#if CONFIG_EXT_TX
 DECLARE_ALIGNED(16, static const int16_t, mcol_scan_4x8[32]) = {
   0, 4, 8,  12, 16, 20, 24, 28, 1, 5, 9,  13, 17, 21, 25, 29,
   2, 6, 10, 14, 18, 22, 26, 30, 3, 7, 11, 15, 19, 23, 27, 31,
@@ -51,12 +51,14 @@
   0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15,
   16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
 };
+#endif
 
 DECLARE_ALIGNED(16, static const int16_t, default_scan_8x4[32]) = {
   0,  1,  8,  9, 2,  16, 10, 17, 18, 3,  24, 11, 25, 19, 26, 4,
   12, 27, 20, 5, 28, 13, 21, 29, 6,  14, 22, 30, 7,  15, 23, 31,
 };
 
+#if CONFIG_EXT_TX
 DECLARE_ALIGNED(16, static const int16_t, mcol_scan_8x4[32]) = {
   0, 8,  16, 24, 1, 9,  17, 25, 2, 10, 18, 26, 3, 11, 19, 27,
   4, 12, 20, 28, 5, 13, 21, 29, 6, 14, 22, 30, 7, 15, 23, 31,
@@ -66,7 +68,7 @@
   0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15,
   16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
 };
-#endif  // CONFIG_EXT_TX
+#endif
 
 DECLARE_ALIGNED(16, static const int16_t, default_scan_8x8[64]) = {
   0,  8,  1,  16, 9,  2,  17, 24, 10, 3,  18, 25, 32, 11, 4,  26,
@@ -105,7 +107,6 @@
   58, 45, 38, 52, 31, 59, 53, 46, 60, 39, 61, 47, 54, 55, 62, 63,
 };
 
-#if CONFIG_EXT_TX
 DECLARE_ALIGNED(16, static const int16_t, default_scan_8x16[128]) = {
   0,   1,   8,   2,   9,   16,  3,   10,  17,  24,  4,   11,  18,  25,  32,
   5,   12,  19,  26,  33,  40,  6,   13,  20,  27,  34,  41,  48,  7,   14,
@@ -129,6 +130,7 @@
   122, 63, 78,  93,  108, 123, 79, 94, 109, 124, 95,  110, 125, 111, 126, 127,
 };
 
+#if CONFIG_EXT_TX
 DECLARE_ALIGNED(16, static const int16_t, mcol_scan_8x16[128]) = {
   0, 8,  16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96,  104, 112, 120,
   1, 9,  17, 25, 33, 41, 49, 57, 65, 73, 81, 89, 97,  105, 113, 121,
@@ -174,6 +176,7 @@
   105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119,
   120, 121, 122, 123, 124, 125, 126, 127,
 };
+#endif
 
 DECLARE_ALIGNED(16, static const int16_t, default_scan_16x32[512]) = {
   0,   1,   16,  2,   17,  32,  3,   18,  33,  48,  4,   19,  34,  49,  64,
@@ -251,6 +254,7 @@
   510, 511,
 };
 
+#if CONFIG_EXT_TX
 DECLARE_ALIGNED(16, static const int16_t, mcol_scan_16x32[512]) = {
   0,   16,  32,  48,  64,  80,  96,  112, 128, 144, 160, 176, 192, 208, 224,
   240, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, 464,
@@ -1034,7 +1038,6 @@
   8, 3, 6, 8, 9, 6, 9, 9, 12, 7, 10, 10, 13, 11, 14, 0, 0,
 };
 
-#if CONFIG_EXT_TX
 DECLARE_ALIGNED(16, static const int16_t,
                 default_scan_4x8_neighbors[33 * MAX_NEIGHBORS]) = {
   0,  0,  0,  0,  0,  0,  1,  4,  1,  1,  4,  4,  2,  5,  5,  8,  6,
@@ -1043,6 +1046,7 @@
   24, 22, 25, 23, 26, 24, 24, 25, 28, 26, 29, 27, 30, 0,  0
 };
 
+#if CONFIG_EXT_TX
 DECLARE_ALIGNED(16, static const int16_t,
                 mcol_scan_4x8_neighbors[33 * MAX_NEIGHBORS]) = {
   0, 0, 0,  0,  4,  4,  8,  8,  12, 12, 16, 16, 20, 20, 24, 24, 0,
@@ -1058,6 +1062,7 @@
   13, 16, 14, 17, 15, 18, 16, 16, 17, 20, 18, 21, 19, 22, 20, 20, 21,
   24, 22, 25, 23, 26, 24, 24, 25, 28, 26, 29, 27, 30, 0,  0
 };
+#endif
 
 DECLARE_ALIGNED(16, static const int16_t,
                 default_scan_8x4_neighbors[33 * MAX_NEIGHBORS]) = {
@@ -1067,6 +1072,7 @@
   13, 14, 21, 22, 29, 6, 6,  7,  14, 15, 22, 23, 30, 0,  0
 };
 
+#if CONFIG_EXT_TX
 DECLARE_ALIGNED(16, static const int16_t,
                 mcol_scan_8x4_neighbors[33 * MAX_NEIGHBORS]) = {
   0,  0,  0,  0,  8,  8,  16, 16, 0,  0,  1,  8,  9,  16, 17, 24, 1,
@@ -1141,7 +1147,6 @@
   31, 38, 53, 60, 46, 53, 39, 46, 54, 61, 47, 54, 55, 62, 0,  0,
 };
 
-#if CONFIG_EXT_TX
 DECLARE_ALIGNED(16, static const int16_t,
                 default_scan_8x16_neighbors[129 * MAX_NEIGHBORS]) = {
   0,   0,   0,   0,   0,   0,   1,   1,   1,   8,   8,   8,   2,   2,   2,
@@ -1186,6 +1191,7 @@
   126, 0,   0
 };
 
+#if CONFIG_EXT_TX
 DECLARE_ALIGNED(16, static const int16_t,
                 mcol_scan_8x16_neighbors[129 * MAX_NEIGHBORS]) = {
   0,  0,  0,  0,  8,  8,  16, 16, 24, 24,  32,  32,  40,  40,  48,  48,
@@ -1271,6 +1277,7 @@
   104, 119, 105, 120, 106, 121, 107, 122, 108, 123, 109, 124, 110, 125, 111,
   126, 0,   0
 };
+#endif
 
 DECLARE_ALIGNED(16, static const int16_t,
                 default_scan_16x32_neighbors[513 * MAX_NEIGHBORS]) = {
@@ -1418,6 +1425,7 @@
   478, 509, 479, 510, 0,   0
 };
 
+#if CONFIG_EXT_TX
 DECLARE_ALIGNED(16, static const int16_t,
                 mcol_scan_16x32_neighbors[513 * MAX_NEIGHBORS]) = {
   0,   0,   0,   0,   16,  16,  32,  32,  48,  48,  64,  64,  80,  80,  96,
@@ -2841,12 +2849,12 @@
   0, 1, 3, 5, 2, 4, 6, 9, 7, 8, 11, 13, 10, 12, 14, 15,
 };
 
-#if CONFIG_EXT_TX
 DECLARE_ALIGNED(16, static const int16_t, av1_default_iscan_4x8[32]) = {
   0,  1,  4,  9,  2,  3,  6,  11, 5,  7,  8,  13, 10, 12, 14, 17,
   15, 16, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
 };
 
+#if CONFIG_EXT_TX
 DECLARE_ALIGNED(16, static const int16_t, av1_mcol_iscan_4x8[32]) = {
   0, 8,  16, 24, 1, 9,  17, 25, 2, 10, 18, 26, 3, 11, 19, 27,
   4, 12, 20, 28, 5, 13, 21, 29, 6, 14, 22, 30, 7, 15, 23, 31,
@@ -2856,12 +2864,14 @@
   0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15,
   16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
 };
+#endif
 
 DECLARE_ALIGNED(16, static const int16_t, av1_default_iscan_8x4[32]) = {
   0, 1, 4, 9,  15, 19, 24, 28, 2,  3,  6,  11, 16, 21, 25, 29,
   5, 7, 8, 13, 18, 22, 26, 30, 10, 12, 14, 17, 20, 23, 27, 31,
 };
 
+#if CONFIG_EXT_TX
 DECLARE_ALIGNED(16, static const int16_t, av1_mcol_iscan_8x4[32]) = {
   0, 4, 8,  12, 16, 20, 24, 28, 1, 5, 9,  13, 17, 21, 25, 29,
   2, 6, 10, 14, 18, 22, 26, 30, 3, 7, 11, 15, 19, 23, 27, 31,
@@ -2910,7 +2920,6 @@
   25, 32, 39, 45, 50, 55, 59, 62, 33, 40, 46, 51, 54, 58, 61, 63,
 };
 
-#if CONFIG_EXT_TX
 DECLARE_ALIGNED(16, static const int16_t, av1_default_iscan_8x16[128]) = {
   0,  1,  3,   6,   10,  15,  21,  28,  2,  4,   7,   11,  16,  22,  29,  36,
   5,  8,  12,  17,  23,  30,  37,  44,  9,  13,  18,  24,  31,  38,  45,  52,
@@ -2933,6 +2942,7 @@
   35, 43, 51, 59, 67, 75, 83, 91, 99, 106, 112, 117, 121, 124, 126, 127,
 };
 
+#if CONFIG_EXT_TX
 DECLARE_ALIGNED(16, static const int16_t, av1_mcol_iscan_8x16[128]) = {
   0,  16, 32, 48, 64, 80, 96,  112, 1,  17, 33, 49, 65, 81, 97,  113,
   2,  18, 34, 50, 66, 82, 98,  114, 3,  19, 35, 51, 67, 83, 99,  115,
@@ -2978,6 +2988,7 @@
   105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119,
   120, 121, 122, 123, 124, 125, 126, 127,
 };
+#endif
 
 DECLARE_ALIGNED(16, static const int16_t, av1_default_iscan_16x32[512]) = {
   0,   1,   3,   6,   10,  15,  21,  28,  36,  45,  55,  66,  78,  91,  105,
@@ -3055,6 +3066,7 @@
   510, 511,
 };
 
+#if CONFIG_EXT_TX
 DECLARE_ALIGNED(16, static const int16_t, av1_mcol_iscan_16x32[512]) = {
   0,  32, 64, 96,  128, 160, 192, 224, 256, 288, 320, 352, 384, 416, 448, 480,
   1,  33, 65, 97,  129, 161, 193, 225, 257, 289, 321, 353, 385, 417, 449, 481,
@@ -3801,20 +3813,24 @@
 #endif  // CONFIG_EXT_TX
 
 const SCAN_ORDER av1_default_scan_orders[TX_SIZES] = {
+#if CONFIG_CB4X4
+  { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors },
+#endif
   { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors },
   { default_scan_8x8, av1_default_iscan_8x8, default_scan_8x8_neighbors },
   { default_scan_16x16, av1_default_iscan_16x16, default_scan_16x16_neighbors },
   { default_scan_32x32, av1_default_iscan_32x32, default_scan_32x32_neighbors },
 };
 
-#if CONFIG_EXT_TX
 const SCAN_ORDER av1_intra_scan_orders[TX_SIZES][TX_TYPES] = {
+#if CONFIG_CB4X4
   {
-      // TX_4X4
+      // TX_2X2
       { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors },
       { row_scan_4x4, av1_row_iscan_4x4, row_scan_4x4_neighbors },
       { col_scan_4x4, av1_col_iscan_4x4, col_scan_4x4_neighbors },
       { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors },
+#if CONFIG_EXT_TX
       { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors },
       { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors },
       { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors },
@@ -3827,6 +3843,29 @@
       { col_scan_4x4, av1_col_iscan_4x4, col_scan_4x4_neighbors },
       { row_scan_4x4, av1_row_iscan_4x4, row_scan_4x4_neighbors },
       { col_scan_4x4, av1_col_iscan_4x4, col_scan_4x4_neighbors },
+#endif  // CONFIG_EXT_TX
+  },
+#endif
+  {
+      // TX_4X4
+      { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors },
+      { row_scan_4x4, av1_row_iscan_4x4, row_scan_4x4_neighbors },
+      { col_scan_4x4, av1_col_iscan_4x4, col_scan_4x4_neighbors },
+      { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors },
+#if CONFIG_EXT_TX
+      { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors },
+      { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors },
+      { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors },
+      { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors },
+      { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors },
+      { mrow_scan_4x4, av1_mrow_iscan_4x4, mrow_scan_4x4_neighbors },
+      { row_scan_4x4, av1_row_iscan_4x4, row_scan_4x4_neighbors },
+      { col_scan_4x4, av1_col_iscan_4x4, col_scan_4x4_neighbors },
+      { row_scan_4x4, av1_row_iscan_4x4, row_scan_4x4_neighbors },
+      { col_scan_4x4, av1_col_iscan_4x4, col_scan_4x4_neighbors },
+      { row_scan_4x4, av1_row_iscan_4x4, row_scan_4x4_neighbors },
+      { col_scan_4x4, av1_col_iscan_4x4, col_scan_4x4_neighbors },
+#endif  // CONFIG_EXT_TX
   },
   {
       // TX_8X8
@@ -3834,6 +3873,7 @@
       { row_scan_8x8, av1_row_iscan_8x8, row_scan_8x8_neighbors },
       { col_scan_8x8, av1_col_iscan_8x8, col_scan_8x8_neighbors },
       { default_scan_8x8, av1_default_iscan_8x8, default_scan_8x8_neighbors },
+#if CONFIG_EXT_TX
       { default_scan_8x8, av1_default_iscan_8x8, default_scan_8x8_neighbors },
       { default_scan_8x8, av1_default_iscan_8x8, default_scan_8x8_neighbors },
       { default_scan_8x8, av1_default_iscan_8x8, default_scan_8x8_neighbors },
@@ -3846,6 +3886,7 @@
       { col_scan_8x8, av1_col_iscan_8x8, col_scan_8x8_neighbors },
       { row_scan_8x8, av1_row_iscan_8x8, row_scan_8x8_neighbors },
       { col_scan_8x8, av1_col_iscan_8x8, col_scan_8x8_neighbors },
+#endif  // CONFIG_EXT_TX
   },
   {
       // TX_16X16
@@ -3855,6 +3896,7 @@
       { col_scan_16x16, av1_col_iscan_16x16, col_scan_16x16_neighbors },
       { default_scan_16x16, av1_default_iscan_16x16,
         default_scan_16x16_neighbors },
+#if CONFIG_EXT_TX
       { default_scan_16x16, av1_default_iscan_16x16,
         default_scan_16x16_neighbors },
       { default_scan_16x16, av1_default_iscan_16x16,
@@ -3872,11 +3914,13 @@
       { col_scan_16x16, av1_col_iscan_16x16, col_scan_16x16_neighbors },
       { row_scan_16x16, av1_row_iscan_16x16, row_scan_16x16_neighbors },
       { col_scan_16x16, av1_col_iscan_16x16, col_scan_16x16_neighbors },
+#endif  // CONFIG_EXT_TX
   },
   {
       // TX_32X32
       { default_scan_32x32, av1_default_iscan_32x32,
         default_scan_32x32_neighbors },
+#if CONFIG_EXT_TX
       { h2_scan_32x32, av1_h2_iscan_32x32, h2_scan_32x32_neighbors },
       { v2_scan_32x32, av1_v2_iscan_32x32, v2_scan_32x32_neighbors },
       { qtr_scan_32x32, av1_qtr_iscan_32x32, qtr_scan_32x32_neighbors },
@@ -3892,16 +3936,41 @@
       { mcol_scan_32x32, av1_mcol_iscan_32x32, mcol_scan_32x32_neighbors },
       { mrow_scan_32x32, av1_mrow_iscan_32x32, mrow_scan_32x32_neighbors },
       { mcol_scan_32x32, av1_mcol_iscan_32x32, mcol_scan_32x32_neighbors },
+#endif  // CONFIG_EXT_TX
   }
 };
 
 const SCAN_ORDER av1_inter_scan_orders[TX_SIZES_ALL][TX_TYPES] = {
+#if CONFIG_CB4X4
+  {
+      // TX_2X2
+      { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors },
+      { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors },
+      { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors },
+      { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors },
+#if CONFIG_EXT_TX
+      { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors },
+      { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors },
+      { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors },
+      { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors },
+      { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors },
+      { mrow_scan_4x4, av1_mrow_iscan_4x4, mrow_scan_4x4_neighbors },
+      { mrow_scan_4x4, av1_mrow_iscan_4x4, mrow_scan_4x4_neighbors },
+      { mcol_scan_4x4, av1_mcol_iscan_4x4, mcol_scan_4x4_neighbors },
+      { mrow_scan_4x4, av1_mrow_iscan_4x4, mrow_scan_4x4_neighbors },
+      { mcol_scan_4x4, av1_mcol_iscan_4x4, mcol_scan_4x4_neighbors },
+      { mrow_scan_4x4, av1_mrow_iscan_4x4, mrow_scan_4x4_neighbors },
+      { mcol_scan_4x4, av1_mcol_iscan_4x4, mcol_scan_4x4_neighbors },
+#endif  // CONFIG_EXT_TX
+  },
+#endif
   {
       // TX_4X4
       { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors },
       { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors },
       { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors },
       { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors },
+#if CONFIG_EXT_TX
       { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors },
       { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors },
       { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors },
@@ -3914,6 +3983,7 @@
       { mcol_scan_4x4, av1_mcol_iscan_4x4, mcol_scan_4x4_neighbors },
       { mrow_scan_4x4, av1_mrow_iscan_4x4, mrow_scan_4x4_neighbors },
       { mcol_scan_4x4, av1_mcol_iscan_4x4, mcol_scan_4x4_neighbors },
+#endif  // CONFIG_EXT_TX
   },
   {
       // TX_8X8
@@ -3921,6 +3991,7 @@
       { default_scan_8x8, av1_default_iscan_8x8, default_scan_8x8_neighbors },
       { default_scan_8x8, av1_default_iscan_8x8, default_scan_8x8_neighbors },
       { default_scan_8x8, av1_default_iscan_8x8, default_scan_8x8_neighbors },
+#if CONFIG_EXT_TX
       { default_scan_8x8, av1_default_iscan_8x8, default_scan_8x8_neighbors },
       { default_scan_8x8, av1_default_iscan_8x8, default_scan_8x8_neighbors },
       { default_scan_8x8, av1_default_iscan_8x8, default_scan_8x8_neighbors },
@@ -3933,6 +4004,7 @@
       { mcol_scan_8x8, av1_mcol_iscan_8x8, mcol_scan_8x8_neighbors },
       { mrow_scan_8x8, av1_mrow_iscan_8x8, mrow_scan_8x8_neighbors },
       { mcol_scan_8x8, av1_mcol_iscan_8x8, mcol_scan_8x8_neighbors },
+#endif  // CONFIG_EXT_TX
   },
   {
       // TX_16X16
@@ -3944,6 +4016,7 @@
         default_scan_16x16_neighbors },
       { default_scan_16x16, av1_default_iscan_16x16,
         default_scan_16x16_neighbors },
+#if CONFIG_EXT_TX
       { default_scan_16x16, av1_default_iscan_16x16,
         default_scan_16x16_neighbors },
       { default_scan_16x16, av1_default_iscan_16x16,
@@ -3961,11 +4034,13 @@
       { mcol_scan_16x16, av1_mcol_iscan_16x16, mcol_scan_16x16_neighbors },
       { mrow_scan_16x16, av1_mrow_iscan_16x16, mrow_scan_16x16_neighbors },
       { mcol_scan_16x16, av1_mcol_iscan_16x16, mcol_scan_16x16_neighbors },
+#endif  // CONFIG_EXT_TX
   },
   {
       // TX_32X32
       { default_scan_32x32, av1_default_iscan_32x32,
         default_scan_32x32_neighbors },
+#if CONFIG_EXT_TX
       { h2_scan_32x32, av1_h2_iscan_32x32, h2_scan_32x32_neighbors },
       { v2_scan_32x32, av1_v2_iscan_32x32, v2_scan_32x32_neighbors },
       { qtr_scan_32x32, av1_qtr_iscan_32x32, qtr_scan_32x32_neighbors },
@@ -3981,6 +4056,7 @@
       { mcol_scan_32x32, av1_mcol_iscan_32x32, mcol_scan_32x32_neighbors },
       { mrow_scan_32x32, av1_mrow_iscan_32x32, mrow_scan_32x32_neighbors },
       { mcol_scan_32x32, av1_mcol_iscan_32x32, mcol_scan_32x32_neighbors },
+#endif  // CONFIG_EXT_TX
   },
   {
       // TX_4X8
@@ -3988,6 +4064,7 @@
       { default_scan_4x8, av1_default_iscan_4x8, default_scan_4x8_neighbors },
       { default_scan_4x8, av1_default_iscan_4x8, default_scan_4x8_neighbors },
       { default_scan_4x8, av1_default_iscan_4x8, default_scan_4x8_neighbors },
+#if CONFIG_EXT_TX
       { default_scan_4x8, av1_default_iscan_4x8, default_scan_4x8_neighbors },
       { default_scan_4x8, av1_default_iscan_4x8, default_scan_4x8_neighbors },
       { default_scan_4x8, av1_default_iscan_4x8, default_scan_4x8_neighbors },
@@ -4000,6 +4077,7 @@
       { mcol_scan_4x8, av1_mcol_iscan_4x8, mcol_scan_4x8_neighbors },
       { mrow_scan_4x8, av1_mrow_iscan_4x8, mrow_scan_4x8_neighbors },
       { mcol_scan_4x8, av1_mcol_iscan_4x8, mcol_scan_4x8_neighbors },
+#endif  // CONFIG_EXT_TX
   },
   {
       // TX_8X4
@@ -4007,6 +4085,7 @@
       { default_scan_8x4, av1_default_iscan_8x4, default_scan_8x4_neighbors },
       { default_scan_8x4, av1_default_iscan_8x4, default_scan_8x4_neighbors },
       { default_scan_8x4, av1_default_iscan_8x4, default_scan_8x4_neighbors },
+#if CONFIG_EXT_TX
       { default_scan_8x4, av1_default_iscan_8x4, default_scan_8x4_neighbors },
       { default_scan_8x4, av1_default_iscan_8x4, default_scan_8x4_neighbors },
       { default_scan_8x4, av1_default_iscan_8x4, default_scan_8x4_neighbors },
@@ -4019,6 +4098,7 @@
       { mcol_scan_8x4, av1_mcol_iscan_8x4, mcol_scan_8x4_neighbors },
       { mrow_scan_8x4, av1_mrow_iscan_8x4, mrow_scan_8x4_neighbors },
       { mcol_scan_8x4, av1_mcol_iscan_8x4, mcol_scan_8x4_neighbors },
+#endif  // CONFIG_EXT_TX
   },
   {
       // TX_8X16
@@ -4030,6 +4110,7 @@
         default_scan_8x16_neighbors },
       { default_scan_8x16, av1_default_iscan_8x16,
         default_scan_8x16_neighbors },
+#if CONFIG_EXT_TX
       { default_scan_8x16, av1_default_iscan_8x16,
         default_scan_8x16_neighbors },
       { default_scan_8x16, av1_default_iscan_8x16,
@@ -4047,6 +4128,7 @@
       { mcol_scan_8x16, av1_mcol_iscan_8x16, mcol_scan_8x16_neighbors },
       { mrow_scan_8x16, av1_mrow_iscan_8x16, mrow_scan_8x16_neighbors },
       { mcol_scan_8x16, av1_mcol_iscan_8x16, mcol_scan_8x16_neighbors },
+#endif  // CONFIG_EXT_TX
   },
   {
       // TX_16X8
@@ -4058,6 +4140,7 @@
         default_scan_16x8_neighbors },
       { default_scan_16x8, av1_default_iscan_16x8,
         default_scan_16x8_neighbors },
+#if CONFIG_EXT_TX
       { default_scan_16x8, av1_default_iscan_16x8,
         default_scan_16x8_neighbors },
       { default_scan_16x8, av1_default_iscan_16x8,
@@ -4075,6 +4158,7 @@
       { mcol_scan_16x8, av1_mcol_iscan_16x8, mcol_scan_16x8_neighbors },
       { mrow_scan_16x8, av1_mrow_iscan_16x8, mrow_scan_16x8_neighbors },
       { mcol_scan_16x8, av1_mcol_iscan_16x8, mcol_scan_16x8_neighbors },
+#endif  // CONFIG_EXT_TX
   },
   {
       // TX_16X32
@@ -4086,6 +4170,7 @@
         default_scan_16x32_neighbors },
       { default_scan_16x32, av1_default_iscan_16x32,
         default_scan_16x32_neighbors },
+#if CONFIG_EXT_TX
       { default_scan_16x32, av1_default_iscan_16x32,
         default_scan_16x32_neighbors },
       { default_scan_16x32, av1_default_iscan_16x32,
@@ -4103,6 +4188,7 @@
       { mcol_scan_16x32, av1_mcol_iscan_16x32, mcol_scan_16x32_neighbors },
       { mrow_scan_16x32, av1_mrow_iscan_16x32, mrow_scan_16x32_neighbors },
       { mcol_scan_16x32, av1_mcol_iscan_16x32, mcol_scan_16x32_neighbors },
+#endif  // CONFIG_EXT_TX
   },
   {
       // TX_32X16
@@ -4114,6 +4200,7 @@
         default_scan_32x16_neighbors },
       { default_scan_32x16, av1_default_iscan_32x16,
         default_scan_32x16_neighbors },
+#if CONFIG_EXT_TX
       { default_scan_32x16, av1_default_iscan_32x16,
         default_scan_32x16_neighbors },
       { default_scan_32x16, av1_default_iscan_32x16,
@@ -4131,42 +4218,9 @@
       { mcol_scan_32x16, av1_mcol_iscan_32x16, mcol_scan_32x16_neighbors },
       { mrow_scan_32x16, av1_mrow_iscan_32x16, mrow_scan_32x16_neighbors },
       { mcol_scan_32x16, av1_mcol_iscan_32x16, mcol_scan_32x16_neighbors },
-  }
-};
-
-#else   // CONFIG_EXT_TX
-
-const SCAN_ORDER av1_intra_scan_orders[TX_SIZES][TX_TYPES] = {
-  { // TX_4X4
-    { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors },
-    { row_scan_4x4, av1_row_iscan_4x4, row_scan_4x4_neighbors },
-    { col_scan_4x4, av1_col_iscan_4x4, col_scan_4x4_neighbors },
-    { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors } },
-  { // TX_8X8
-    { default_scan_8x8, av1_default_iscan_8x8, default_scan_8x8_neighbors },
-    { row_scan_8x8, av1_row_iscan_8x8, row_scan_8x8_neighbors },
-    { col_scan_8x8, av1_col_iscan_8x8, col_scan_8x8_neighbors },
-    { default_scan_8x8, av1_default_iscan_8x8, default_scan_8x8_neighbors } },
-  { // TX_16X16
-    { default_scan_16x16, av1_default_iscan_16x16,
-      default_scan_16x16_neighbors },
-    { row_scan_16x16, av1_row_iscan_16x16, row_scan_16x16_neighbors },
-    { col_scan_16x16, av1_col_iscan_16x16, col_scan_16x16_neighbors },
-    { default_scan_16x16, av1_default_iscan_16x16,
-      default_scan_16x16_neighbors } },
-  {
-      // TX_32X32
-      { default_scan_32x32, av1_default_iscan_32x32,
-        default_scan_32x32_neighbors },
-      { default_scan_32x32, av1_default_iscan_32x32,
-        default_scan_32x32_neighbors },
-      { default_scan_32x32, av1_default_iscan_32x32,
-        default_scan_32x32_neighbors },
-      { default_scan_32x32, av1_default_iscan_32x32,
-        default_scan_32x32_neighbors },
-  }
-};
 #endif  // CONFIG_EXT_TX
+  }
+};
 
 #if CONFIG_ADAPT_SCAN
 // TX_32X32 has 1024 coefficients whose indexes can be represented in 10
@@ -4375,8 +4429,7 @@
   int16_t *scan = get_adapt_scan(cm->fc, tx_size, tx_type);
   int16_t *iscan = get_adapt_iscan(cm->fc, tx_size, tx_type);
   int16_t *nb = get_adapt_nb(cm->fc, tx_size, tx_type);
-  const int tx2d_size = tx_size_2d[tx_size];
-  assert(tx2d_size <= 1024);
+  assert(tx_size_2d[tx_size] <= 1024);
   av1_update_sort_order(tx_size, non_zero_prob, sort_order);
   av1_update_scan_order(tx_size, sort_order, scan, iscan);
   av1_update_neighbors(tx_size, scan, iscan, nb);
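
Since the hunks above reshuffle which scan tables exist under each config flag, the scan/iscan contract they all share is worth making explicit. A sketch under the assumed semantics that iscan is the inverse permutation of scan:

#include <stdint.h>

/* scan[i] is the raster position visited i-th; iscan maps a raster
 * position back to its scan index, so the two must be inverses. */
static int scan_iscan_consistent(const int16_t *scan, const int16_t *iscan,
                                 int n) {
  int i;
  for (i = 0; i < n; ++i)
    if (iscan[scan[i]] != i) return 0;
  return 1;
}
/* e.g. scan_iscan_consistent(default_scan_4x8, av1_default_iscan_4x8, 32) */
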
diff --git a/av1/common/scan.h b/av1/common/scan.h
index af39993..2078e99 100644
--- a/av1/common/scan.h
+++ b/av1/common/scan.h
@@ -27,6 +27,7 @@
 
 extern const SCAN_ORDER av1_default_scan_orders[TX_SIZES];
 extern const SCAN_ORDER av1_intra_scan_orders[TX_SIZES][TX_TYPES];
+extern const SCAN_ORDER av1_inter_scan_orders[TX_SIZES_ALL][TX_TYPES];
 
 #if CONFIG_ADAPT_SCAN
 void av1_update_scan_prob(AV1_COMMON *cm, TX_SIZE tx_size, TX_TYPE tx_type,
@@ -87,7 +88,7 @@
   return &cm->fc->sc[tx_size][tx_type];
 #else  // CONFIG_ADAPT_SCAN
   (void)cm;
-#if CONFIG_EXT_TX
+#if CONFIG_EXT_TX || CONFIG_VAR_TX
   return is_inter ? &av1_inter_scan_orders[tx_size][tx_type]
                   : &av1_intra_scan_orders[tx_size][tx_type];
 #else
diff --git a/av1/common/thread_common.c b/av1/common/thread_common.c
index eeaeb21..541db1d 100644
--- a/av1/common/thread_common.c
+++ b/av1/common/thread_common.c
@@ -85,25 +85,155 @@
 #endif  // CONFIG_MULTITHREAD
 }
 
-// Implement row loopfiltering for each thread.
-static INLINE void thread_loop_filter_rows(
-    const YV12_BUFFER_CONFIG *const frame_buffer, AV1_COMMON *const cm,
-    struct macroblockd_plane planes[MAX_MB_PLANE], int start, int stop,
-    int y_only, AV1LfSync *const lf_sync) {
-  const int num_planes = y_only ? 1 : MAX_MB_PLANE;
-  const int sb_cols = mi_cols_aligned_to_sb(cm) >> cm->mib_size_log2;
+#if !CONFIG_EXT_PARTITION_TYPES
+static INLINE enum lf_path get_loop_filter_path(
+    int y_only, struct macroblockd_plane planes[MAX_MB_PLANE]) {
+  if (y_only)
+    return LF_PATH_444;
+  else if (planes[1].subsampling_y == 1 && planes[1].subsampling_x == 1)
+    return LF_PATH_420;
+  else if (planes[1].subsampling_y == 0 && planes[1].subsampling_x == 0)
+    return LF_PATH_444;
+  else
+    return LF_PATH_SLOW;
+}
+
+static INLINE void loop_filter_block_plane_ver(
+    AV1_COMMON *cm, struct macroblockd_plane planes[MAX_MB_PLANE], int plane,
+    MODE_INFO **mi, int mi_row, int mi_col, enum lf_path path,
+    LOOP_FILTER_MASK *lfm) {
+  if (plane == 0) {
+    av1_filter_block_plane_ss00_ver(cm, &planes[0], mi_row, lfm);
+  } else {
+    switch (path) {
+      case LF_PATH_420:
+        av1_filter_block_plane_ss11_ver(cm, &planes[plane], mi_row, lfm);
+        break;
+      case LF_PATH_444:
+        av1_filter_block_plane_ss00_ver(cm, &planes[plane], mi_row, lfm);
+        break;
+      case LF_PATH_SLOW:
+        av1_filter_block_plane_non420_ver(cm, &planes[plane], mi, mi_row,
+                                          mi_col);
+        break;
+    }
+  }
+}
+
+static INLINE void loop_filter_block_plane_hor(
+    AV1_COMMON *cm, struct macroblockd_plane planes[MAX_MB_PLANE], int plane,
+    MODE_INFO **mi, int mi_row, int mi_col, enum lf_path path,
+    LOOP_FILTER_MASK *lfm) {
+  if (plane == 0) {
+    av1_filter_block_plane_ss00_hor(cm, &planes[0], mi_row, lfm);
+  } else {
+    switch (path) {
+      case LF_PATH_420:
+        av1_filter_block_plane_ss11_hor(cm, &planes[plane], mi_row, lfm);
+        break;
+      case LF_PATH_444:
+        av1_filter_block_plane_ss00_hor(cm, &planes[plane], mi_row, lfm);
+        break;
+      case LF_PATH_SLOW:
+        av1_filter_block_plane_non420_hor(cm, &planes[plane], mi, mi_row,
+                                          mi_col);
+        break;
+    }
+  }
+}
+#endif  // !CONFIG_EXT_PARTITION_TYPES
+// Row-based multi-threaded loopfilter hook
+#if CONFIG_PARALLEL_DEBLOCKING
+static int loop_filter_ver_row_worker(AV1LfSync *const lf_sync,
+                                      LFWorkerData *const lf_data) {
+  const int num_planes = lf_data->y_only ? 1 : MAX_MB_PLANE;
   int mi_row, mi_col;
 #if !CONFIG_EXT_PARTITION_TYPES
-  enum lf_path path;
-  LOOP_FILTER_MASK lfm;
-  if (y_only)
-    path = LF_PATH_444;
-  else if (planes[1].subsampling_y == 1 && planes[1].subsampling_x == 1)
-    path = LF_PATH_420;
-  else if (planes[1].subsampling_y == 0 && planes[1].subsampling_x == 0)
-    path = LF_PATH_444;
-  else
-    path = LF_PATH_SLOW;
+  enum lf_path path = get_loop_filter_path(lf_data->y_only, lf_data->planes);
+#endif
+  for (mi_row = lf_data->start; mi_row < lf_data->stop;
+       mi_row += lf_sync->num_workers * lf_data->cm->mib_size) {
+    MODE_INFO **const mi =
+        lf_data->cm->mi_grid_visible + mi_row * lf_data->cm->mi_stride;
+
+    for (mi_col = 0; mi_col < lf_data->cm->mi_cols;
+         mi_col += lf_data->cm->mib_size) {
+      LOOP_FILTER_MASK lfm;
+      int plane;
+
+      av1_setup_dst_planes(lf_data->planes, lf_data->frame_buffer, mi_row,
+                           mi_col);
+      av1_setup_mask(lf_data->cm, mi_row, mi_col, mi + mi_col,
+                     lf_data->cm->mi_stride, &lfm);
+
+#if CONFIG_EXT_PARTITION_TYPES
+      for (plane = 0; plane < num_planes; ++plane)
+        av1_filter_block_plane_non420_ver(lf_data->cm, &lf_data->planes[plane],
+                                          mi + mi_col, mi_row, mi_col);
+#else
+
+      for (plane = 0; plane < num_planes; ++plane)
+        loop_filter_block_plane_ver(lf_data->cm, lf_data->planes, plane,
+                                    mi + mi_col, mi_row, mi_col, path, &lfm);
+#endif
+    }
+  }
+  return 1;
+}
+
+static int loop_filter_hor_row_worker(AV1LfSync *const lf_sync,
+                                      LFWorkerData *const lf_data) {
+  const int num_planes = lf_data->y_only ? 1 : MAX_MB_PLANE;
+  const int sb_cols =
+      mi_cols_aligned_to_sb(lf_data->cm) >> lf_data->cm->mib_size_log2;
+  int mi_row, mi_col;
+#if !CONFIG_EXT_PARTITION_TYPES
+  enum lf_path path = get_loop_filter_path(lf_data->y_only, lf_data->planes);
+#endif
+
+  for (mi_row = lf_data->start; mi_row < lf_data->stop;
+       mi_row += lf_sync->num_workers * lf_data->cm->mib_size) {
+    MODE_INFO **const mi =
+        lf_data->cm->mi_grid_visible + mi_row * lf_data->cm->mi_stride;
+
+    for (mi_col = 0; mi_col < lf_data->cm->mi_cols;
+         mi_col += lf_data->cm->mib_size) {
+      const int r = mi_row >> lf_data->cm->mib_size_log2;
+      const int c = mi_col >> lf_data->cm->mib_size_log2;
+      LOOP_FILTER_MASK lfm;
+      int plane;
+
+      // TODO(wenhao.zhang@intel.com): For better parallelization, make
+      // the outer loop column-based and remove the synchronization calls here.
+      sync_read(lf_sync, r, c);
+
+      av1_setup_dst_planes(lf_data->planes, lf_data->frame_buffer, mi_row,
+                           mi_col);
+      av1_setup_mask(lf_data->cm, mi_row, mi_col, mi + mi_col,
+                     lf_data->cm->mi_stride, &lfm);
+#if CONFIG_EXT_PARTITION_TYPES
+      for (plane = 0; plane < num_planes; ++plane)
+        av1_filter_block_plane_non420_hor(lf_data->cm, &lf_data->planes[plane],
+                                          mi + mi_col, mi_row, mi_col);
+#else
+      for (plane = 0; plane < num_planes; ++plane)
+        loop_filter_block_plane_hor(lf_data->cm, lf_data->planes, plane,
+                                    mi + mi_col, mi_row, mi_col, path, &lfm);
+#endif
+      sync_write(lf_sync, r, c, sb_cols);
+    }
+  }
+  return 1;
+}
+#else  //  CONFIG_PARALLEL_DEBLOCKING
+static int loop_filter_row_worker(AV1LfSync *const lf_sync,
+                                  LFWorkerData *const lf_data) {
+  const int num_planes = lf_data->y_only ? 1 : MAX_MB_PLANE;
+  const int sb_cols =
+      mi_cols_aligned_to_sb(lf_data->cm) >> lf_data->cm->mib_size_log2;
+  int mi_row, mi_col;
+#if !CONFIG_EXT_PARTITION_TYPES
+  enum lf_path path = get_loop_filter_path(lf_data->y_only, lf_data->planes);
 #endif  // !CONFIG_EXT_PARTITION_TYPES
 
 #if CONFIG_EXT_PARTITION
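
The horizontal worker above orders superblock rows through sync_read/sync_write. A condensed, self-contained sketch of that contract using C11 atomics; the spin loop stands in for the condition-variable wait the real code uses, and SYNC_RANGE plus the fixed array size are hypothetical:

#include <stdatomic.h>

#define SYNC_RANGE 16               /* hypothetical look-ahead distance */
static _Atomic int cur_sb_col[64];  /* last finished SB column per row */

/* Row r may process column c only once row r - 1 is far enough ahead.
 * (The driver resets cur_sb_col to -1 before each pass, as above.) */
static void sync_read_sketch(int r, int c) {
  if (r == 0) return;
  while (atomic_load(&cur_sb_col[r - 1]) < c + SYNC_RANGE)
    ; /* spin; the real code blocks on a condition variable */
}

static void sync_write_sketch(int r, int c, int sb_cols) {
  /* On the last column, publish "past the end" so waiters never stall. */
  atomic_store(&cur_sb_col[r], c == sb_cols - 1 ? c + SYNC_RANGE : c);
}
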
@@ -113,56 +243,48 @@
   exit(EXIT_FAILURE);
 #endif  // CONFIG_EXT_PARTITION
 
-  for (mi_row = start; mi_row < stop;
-       mi_row += lf_sync->num_workers * cm->mib_size) {
-    MODE_INFO **const mi = cm->mi_grid_visible + mi_row * cm->mi_stride;
+  for (mi_row = lf_data->start; mi_row < lf_data->stop;
+       mi_row += lf_sync->num_workers * lf_data->cm->mib_size) {
+    MODE_INFO **const mi =
+        lf_data->cm->mi_grid_visible + mi_row * lf_data->cm->mi_stride;
 
-    for (mi_col = 0; mi_col < cm->mi_cols; mi_col += cm->mib_size) {
-      const int r = mi_row >> cm->mib_size_log2;
-      const int c = mi_col >> cm->mib_size_log2;
+    for (mi_col = 0; mi_col < lf_data->cm->mi_cols;
+         mi_col += lf_data->cm->mib_size) {
+      const int r = mi_row >> lf_data->cm->mib_size_log2;
+      const int c = mi_col >> lf_data->cm->mib_size_log2;
+#if !CONFIG_EXT_PARTITION_TYPES
+      LOOP_FILTER_MASK lfm;
+#endif
       int plane;
 
       sync_read(lf_sync, r, c);
 
-      av1_setup_dst_planes(planes, frame_buffer, mi_row, mi_col);
-
+      av1_setup_dst_planes(lf_data->planes, lf_data->frame_buffer, mi_row,
+                           mi_col);
 #if CONFIG_EXT_PARTITION_TYPES
-      for (plane = 0; plane < num_planes; ++plane)
-        av1_filter_block_plane_non420(cm, &planes[plane], mi + mi_col, mi_row,
-                                      mi_col);
+      for (plane = 0; plane < num_planes; ++plane) {
+        av1_filter_block_plane_non420_ver(lf_data->cm, &lf_data->planes[plane],
+                                          mi + mi_col, mi_row, mi_col);
+        av1_filter_block_plane_non420_hor(lf_data->cm, &lf_data->planes[plane],
+                                          mi + mi_col, mi_row, mi_col);
+      }
 #else
-      // TODO(JBB): Make setup_mask work for non-420.
-      av1_setup_mask(cm, mi_row, mi_col, mi + mi_col, cm->mi_stride, &lfm);
+      av1_setup_mask(lf_data->cm, mi_row, mi_col, mi + mi_col,
+                     lf_data->cm->mi_stride, &lfm);
 
-      av1_filter_block_plane_ss00(cm, &planes[0], mi_row, &lfm);
-      for (plane = 1; plane < num_planes; ++plane) {
-        switch (path) {
-          case LF_PATH_420:
-            av1_filter_block_plane_ss11(cm, &planes[plane], mi_row, &lfm);
-            break;
-          case LF_PATH_444:
-            av1_filter_block_plane_ss00(cm, &planes[plane], mi_row, &lfm);
-            break;
-          case LF_PATH_SLOW:
-            av1_filter_block_plane_non420(cm, &planes[plane], mi + mi_col,
-                                          mi_row, mi_col);
-            break;
-        }
+      for (plane = 0; plane < num_planes; ++plane) {
+        loop_filter_block_plane_ver(lf_data->cm, lf_data->planes, plane,
+                                    mi + mi_col, mi_row, mi_col, path, &lfm);
+        loop_filter_block_plane_hor(lf_data->cm, lf_data->planes, plane,
+                                    mi + mi_col, mi_row, mi_col, path, &lfm);
       }
 #endif  // CONFIG_EXT_PARTITION_TYPES
       sync_write(lf_sync, r, c, sb_cols);
     }
   }
-}
-
-// Row-based multi-threaded loopfilter hook
-static int loop_filter_row_worker(AV1LfSync *const lf_sync,
-                                  LFWorkerData *const lf_data) {
-  thread_loop_filter_rows(lf_data->frame_buffer, lf_data->cm, lf_data->planes,
-                          lf_data->start, lf_data->stop, lf_data->y_only,
-                          lf_sync);
   return 1;
 }
+#endif  //  CONFIG_PARALLEL_DEBLOCKING
 
 static void loop_filter_rows_mt(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm,
                                 struct macroblockd_plane planes[MAX_MB_PLANE],
@@ -191,17 +313,79 @@
     av1_loop_filter_alloc(lf_sync, cm, sb_rows, cm->width, num_workers);
   }
 
+// Set up loopfilter thread data.
+// The decoder is capping num_workers because it has been observed that using
+// more threads on the loopfilter than there are cores will hurt performance
+// on Android. This is because the system will only schedule the tile decode
+// workers on cores equal to the number of tile columns. Then if the decoder
+// tries to use more threads for the loopfilter, it will hurt performance
+// because of contention. If the multithreading code changes in the future
+// then the number of workers used by the loopfilter should be revisited.
+
+#if CONFIG_PARALLEL_DEBLOCKING
   // Initialize cur_sb_col to -1 for all SB rows.
   memset(lf_sync->cur_sb_col, -1, sizeof(*lf_sync->cur_sb_col) * sb_rows);
 
-  // Set up loopfilter thread data.
-  // The decoder is capping num_workers because it has been observed that using
-  // more threads on the loopfilter than there are cores will hurt performance
-  // on Android. This is because the system will only schedule the tile decode
-  // workers on cores equal to the number of tile columns. Then if the decoder
-  // tries to use more threads for the loopfilter, it will hurt performance
-  // because of contention. If the multithreading code changes in the future
-  // then the number of workers used by the loopfilter should be revisited.
+  // Filter all the vertical edges in the whole frame
+  for (i = 0; i < num_workers; ++i) {
+    AVxWorker *const worker = &workers[i];
+    LFWorkerData *const lf_data = &lf_sync->lfdata[i];
+
+    worker->hook = (AVxWorkerHook)loop_filter_ver_row_worker;
+    worker->data1 = lf_sync;
+    worker->data2 = lf_data;
+
+    // Loopfilter data
+    av1_loop_filter_data_reset(lf_data, frame, cm, planes);
+    lf_data->start = start + i * cm->mib_size;
+    lf_data->stop = stop;
+    lf_data->y_only = y_only;
+
+    // Start loopfiltering
+    if (i == num_workers - 1) {
+      winterface->execute(worker);
+    } else {
+      winterface->launch(worker);
+    }
+  }
+
+  // Wait till all rows are finished
+  for (i = 0; i < num_workers; ++i) {
+    winterface->sync(&workers[i]);
+  }
+
+  memset(lf_sync->cur_sb_col, -1, sizeof(*lf_sync->cur_sb_col) * sb_rows);
+  // Filter all the horizontal edges in the whole frame
+  for (i = 0; i < num_workers; ++i) {
+    AVxWorker *const worker = &workers[i];
+    LFWorkerData *const lf_data = &lf_sync->lfdata[i];
+
+    worker->hook = (AVxWorkerHook)loop_filter_hor_row_worker;
+    worker->data1 = lf_sync;
+    worker->data2 = lf_data;
+
+    // Loopfilter data
+    av1_loop_filter_data_reset(lf_data, frame, cm, planes);
+    lf_data->start = start + i * cm->mib_size;
+    lf_data->stop = stop;
+    lf_data->y_only = y_only;
+
+    // Start loopfiltering
+    if (i == num_workers - 1) {
+      winterface->execute(worker);
+    } else {
+      winterface->launch(worker);
+    }
+  }
+
+  // Wait till all rows are finished
+  for (i = 0; i < num_workers; ++i) {
+    winterface->sync(&workers[i]);
+  }
+#else   // CONFIG_PARALLEL_DEBLOCKING
+  // Initialize cur_sb_col to -1 for all SB rows.
+  memset(lf_sync->cur_sb_col, -1, sizeof(*lf_sync->cur_sb_col) * sb_rows);
+
   for (i = 0; i < num_workers; ++i) {
     AVxWorker *const worker = &workers[i];
     LFWorkerData *const lf_data = &lf_sync->lfdata[i];
@@ -228,6 +412,7 @@
   for (i = 0; i < num_workers; ++i) {
     winterface->sync(&workers[i]);
   }
+#endif  // CONFIG_PARALLEL_DEBLOCKING
 }
 
 void av1_loop_filter_frame_mt(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm,
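
Under CONFIG_PARALLEL_DEBLOCKING the driver above makes two full passes over the frame. A sequential sketch of the ordering idea, with hypothetical callbacks standing in for the worker hooks:

/* Pass 1 filters every vertical edge; those rows are independent, which
 * is why loop_filter_ver_row_worker above needs no sync_read. Pass 2
 * filters horizontal edges, which read pixels the row above wrote, so
 * that pass keeps the row ordering. The MT driver syncs all workers
 * between the passes. */
static void two_pass_deblock_sketch(int mi_rows, int step,
                                    void (*filter_ver_row)(int mi_row),
                                    void (*filter_hor_row)(int mi_row)) {
  int mi_row;
  for (mi_row = 0; mi_row < mi_rows; mi_row += step) filter_ver_row(mi_row);
  /* barrier: winterface->sync() on every worker in the MT version */
  for (mi_row = 0; mi_row < mi_rows; mi_row += step) filter_hor_row(mi_row);
}
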
diff --git a/av1/common/tile_common.h b/av1/common/tile_common.h
index 9fed2d6..d63d260 100644
--- a/av1/common/tile_common.h
+++ b/av1/common/tile_common.h
@@ -18,6 +18,10 @@
 
 struct AV1Common;
 
+#if CONFIG_TILE_GROUPS
+#define MAX_NUM_TG 3
+#endif
+
 typedef struct TileInfo {
   int mi_row_start, mi_row_end;
   int mi_col_start, mi_col_end;
diff --git a/av1/common/x86/av1_fwd_dct32x32_impl_sse2.h b/av1/common/x86/av1_fwd_dct32x32_impl_sse2.h
deleted file mode 100644
index 876e579..0000000
--- a/av1/common/x86/av1_fwd_dct32x32_impl_sse2.h
+++ /dev/null
@@ -1,3202 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <emmintrin.h>  // SSE2
-
-#include "./av1_rtcd.h"
-#include "av1/common/av1_fwd_txfm.h"
-#include "aom_dsp/txfm_common.h"
-#include "aom_dsp/x86/txfm_common_sse2.h"
-
-// TODO(jingning) The high bit-depth version needs re-work for performance.
-// The current SSE2 implementation also cross-references the static
-// functions in the C implementation file.
-#if DCT_HIGH_BIT_DEPTH
-#define ADD_EPI16 _mm_adds_epi16
-#define SUB_EPI16 _mm_subs_epi16
-#if FDCT32x32_HIGH_PRECISION
-void av1_fdct32x32_rows_c(const int16_t *intermediate, tran_low_t *out) {
-  int i, j;
-  for (i = 0; i < 32; ++i) {
-    tran_high_t temp_in[32], temp_out[32];
-    for (j = 0; j < 32; ++j) temp_in[j] = intermediate[j * 32 + i];
-    av1_fdct32(temp_in, temp_out, 0);
-    for (j = 0; j < 32; ++j)
-      out[j + i * 32] =
-          (tran_low_t)((temp_out[j] + 1 + (temp_out[j] < 0)) >> 2);
-  }
-}
-#define HIGH_FDCT32x32_2D_C av1_highbd_fdct32x32_c
-#define HIGH_FDCT32x32_2D_ROWS_C av1_fdct32x32_rows_c
-#else
-void av1_fdct32x32_rd_rows_c(const int16_t *intermediate, tran_low_t *out) {
-  int i, j;
-  for (i = 0; i < 32; ++i) {
-    tran_high_t temp_in[32], temp_out[32];
-    for (j = 0; j < 32; ++j) temp_in[j] = intermediate[j * 32 + i];
-    av1_fdct32(temp_in, temp_out, 1);
-    for (j = 0; j < 32; ++j) out[j + i * 32] = (tran_low_t)temp_out[j];
-  }
-}
-#define HIGH_FDCT32x32_2D_C av1_highbd_fdct32x32_rd_c
-#define HIGH_FDCT32x32_2D_ROWS_C av1_fdct32x32_rd_rows_c
-#endif  // FDCT32x32_HIGH_PRECISION
-#else
-#define ADD_EPI16 _mm_add_epi16
-#define SUB_EPI16 _mm_sub_epi16
-#endif  // DCT_HIGH_BIT_DEPTH
-
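
The deleted av1_fdct32x32_rows_c above rounds with (x + 1 + (x < 0)) >> 2. A worked sketch of why the extra (x < 0) term makes the shift symmetric, assuming arithmetic right shift of negative values:

#include <assert.h>

static int round_shift2_sketch(int x) { return (x + 1 + (x < 0)) >> 2; }

int main(void) {
  assert(round_shift2_sketch(5) == 1);   /*  1.25 ->  1 */
  assert(round_shift2_sketch(-5) == -1); /* -1.25 -> -1 (x >> 2 alone floors to -2) */
  assert(round_shift2_sketch(6) == 1);   /*  1.5 rounds toward zero */
  assert(round_shift2_sketch(-6) == -1); /* -1.5 too: the (x < 0) term keeps symmetry */
  return 0;
}
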
-void FDCT32x32_2D(const int16_t *input, tran_low_t *output_org, int stride) {
-  // Calculate pre-multiplied strides
-  const int str1 = stride;
-  const int str2 = 2 * stride;
-  const int str3 = 2 * stride + str1;
-  // We need an intermediate buffer between passes.
-  DECLARE_ALIGNED(16, int16_t, intermediate[32 * 32]);
-  // Constants
-  //    When we use them, in one case, they are all the same. In all others
-  //    it's a pair of them that we need to repeat four times. This is done
-  //    by constructing the 32 bit constant corresponding to that pair.
-  const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
-  const __m128i k__cospi_p16_m16 = pair_set_epi16(+cospi_16_64, -cospi_16_64);
-  const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
-  const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
-  const __m128i k__cospi_p24_p08 = pair_set_epi16(+cospi_24_64, cospi_8_64);
-  const __m128i k__cospi_p12_p20 = pair_set_epi16(+cospi_12_64, cospi_20_64);
-  const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64);
-  const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64);
-  const __m128i k__cospi_p28_p04 = pair_set_epi16(+cospi_28_64, cospi_4_64);
-  const __m128i k__cospi_m28_m04 = pair_set_epi16(-cospi_28_64, -cospi_4_64);
-  const __m128i k__cospi_m12_m20 = pair_set_epi16(-cospi_12_64, -cospi_20_64);
-  const __m128i k__cospi_p30_p02 = pair_set_epi16(+cospi_30_64, cospi_2_64);
-  const __m128i k__cospi_p14_p18 = pair_set_epi16(+cospi_14_64, cospi_18_64);
-  const __m128i k__cospi_p22_p10 = pair_set_epi16(+cospi_22_64, cospi_10_64);
-  const __m128i k__cospi_p06_p26 = pair_set_epi16(+cospi_6_64, cospi_26_64);
-  const __m128i k__cospi_m26_p06 = pair_set_epi16(-cospi_26_64, cospi_6_64);
-  const __m128i k__cospi_m10_p22 = pair_set_epi16(-cospi_10_64, cospi_22_64);
-  const __m128i k__cospi_m18_p14 = pair_set_epi16(-cospi_18_64, cospi_14_64);
-  const __m128i k__cospi_m02_p30 = pair_set_epi16(-cospi_2_64, cospi_30_64);
-  const __m128i k__cospi_p31_p01 = pair_set_epi16(+cospi_31_64, cospi_1_64);
-  const __m128i k__cospi_p15_p17 = pair_set_epi16(+cospi_15_64, cospi_17_64);
-  const __m128i k__cospi_p23_p09 = pair_set_epi16(+cospi_23_64, cospi_9_64);
-  const __m128i k__cospi_p07_p25 = pair_set_epi16(+cospi_7_64, cospi_25_64);
-  const __m128i k__cospi_m25_p07 = pair_set_epi16(-cospi_25_64, cospi_7_64);
-  const __m128i k__cospi_m09_p23 = pair_set_epi16(-cospi_9_64, cospi_23_64);
-  const __m128i k__cospi_m17_p15 = pair_set_epi16(-cospi_17_64, cospi_15_64);
-  const __m128i k__cospi_m01_p31 = pair_set_epi16(-cospi_1_64, cospi_31_64);
-  const __m128i k__cospi_p27_p05 = pair_set_epi16(+cospi_27_64, cospi_5_64);
-  const __m128i k__cospi_p11_p21 = pair_set_epi16(+cospi_11_64, cospi_21_64);
-  const __m128i k__cospi_p19_p13 = pair_set_epi16(+cospi_19_64, cospi_13_64);
-  const __m128i k__cospi_p03_p29 = pair_set_epi16(+cospi_3_64, cospi_29_64);
-  const __m128i k__cospi_m29_p03 = pair_set_epi16(-cospi_29_64, cospi_3_64);
-  const __m128i k__cospi_m13_p19 = pair_set_epi16(-cospi_13_64, cospi_19_64);
-  const __m128i k__cospi_m21_p11 = pair_set_epi16(-cospi_21_64, cospi_11_64);
-  const __m128i k__cospi_m05_p27 = pair_set_epi16(-cospi_5_64, cospi_27_64);
-  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
-  const __m128i kZero = _mm_set1_epi16(0);
-  const __m128i kOne = _mm_set1_epi16(1);
-  // Do the two transform/transpose passes
-  int pass;
-#if DCT_HIGH_BIT_DEPTH
-  int overflow;
-#endif
-  for (pass = 0; pass < 2; ++pass) {
-    // We process eight columns (transposed rows in second pass) at a time.
-    int column_start;
-    for (column_start = 0; column_start < 32; column_start += 8) {
-      __m128i step1[32];
-      __m128i step2[32];
-      __m128i step3[32];
-      __m128i out[32];
-      // Stage 1
-      // Note: even though all the loads below are aligned, using the aligned
-      //       intrinsic makes the code slightly slower.
-      if (0 == pass) {
-        const int16_t *in = &input[column_start];
-        // step1[i] =  (in[ 0 * stride] + in[(32 -  1) * stride]) << 2;
-        // Note: the next four blocks could be in a loop. That would help the
-        //       instruction cache but is actually slower.
-        {
-          const int16_t *ina = in + 0 * str1;
-          const int16_t *inb = in + 31 * str1;
-          __m128i *step1a = &step1[0];
-          __m128i *step1b = &step1[31];
-          const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina));
-          const __m128i ina1 = _mm_loadu_si128((const __m128i *)(ina + str1));
-          const __m128i ina2 = _mm_loadu_si128((const __m128i *)(ina + str2));
-          const __m128i ina3 = _mm_loadu_si128((const __m128i *)(ina + str3));
-          const __m128i inb3 = _mm_loadu_si128((const __m128i *)(inb - str3));
-          const __m128i inb2 = _mm_loadu_si128((const __m128i *)(inb - str2));
-          const __m128i inb1 = _mm_loadu_si128((const __m128i *)(inb - str1));
-          const __m128i inb0 = _mm_loadu_si128((const __m128i *)(inb));
-          step1a[0] = _mm_add_epi16(ina0, inb0);
-          step1a[1] = _mm_add_epi16(ina1, inb1);
-          step1a[2] = _mm_add_epi16(ina2, inb2);
-          step1a[3] = _mm_add_epi16(ina3, inb3);
-          step1b[-3] = _mm_sub_epi16(ina3, inb3);
-          step1b[-2] = _mm_sub_epi16(ina2, inb2);
-          step1b[-1] = _mm_sub_epi16(ina1, inb1);
-          step1b[-0] = _mm_sub_epi16(ina0, inb0);
-          step1a[0] = _mm_slli_epi16(step1a[0], 2);
-          step1a[1] = _mm_slli_epi16(step1a[1], 2);
-          step1a[2] = _mm_slli_epi16(step1a[2], 2);
-          step1a[3] = _mm_slli_epi16(step1a[3], 2);
-          step1b[-3] = _mm_slli_epi16(step1b[-3], 2);
-          step1b[-2] = _mm_slli_epi16(step1b[-2], 2);
-          step1b[-1] = _mm_slli_epi16(step1b[-1], 2);
-          step1b[-0] = _mm_slli_epi16(step1b[-0], 2);
-        }
-        {
-          const int16_t *ina = in + 4 * str1;
-          const int16_t *inb = in + 27 * str1;
-          __m128i *step1a = &step1[4];
-          __m128i *step1b = &step1[27];
-          const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina));
-          const __m128i ina1 = _mm_loadu_si128((const __m128i *)(ina + str1));
-          const __m128i ina2 = _mm_loadu_si128((const __m128i *)(ina + str2));
-          const __m128i ina3 = _mm_loadu_si128((const __m128i *)(ina + str3));
-          const __m128i inb3 = _mm_loadu_si128((const __m128i *)(inb - str3));
-          const __m128i inb2 = _mm_loadu_si128((const __m128i *)(inb - str2));
-          const __m128i inb1 = _mm_loadu_si128((const __m128i *)(inb - str1));
-          const __m128i inb0 = _mm_loadu_si128((const __m128i *)(inb));
-          step1a[0] = _mm_add_epi16(ina0, inb0);
-          step1a[1] = _mm_add_epi16(ina1, inb1);
-          step1a[2] = _mm_add_epi16(ina2, inb2);
-          step1a[3] = _mm_add_epi16(ina3, inb3);
-          step1b[-3] = _mm_sub_epi16(ina3, inb3);
-          step1b[-2] = _mm_sub_epi16(ina2, inb2);
-          step1b[-1] = _mm_sub_epi16(ina1, inb1);
-          step1b[-0] = _mm_sub_epi16(ina0, inb0);
-          step1a[0] = _mm_slli_epi16(step1a[0], 2);
-          step1a[1] = _mm_slli_epi16(step1a[1], 2);
-          step1a[2] = _mm_slli_epi16(step1a[2], 2);
-          step1a[3] = _mm_slli_epi16(step1a[3], 2);
-          step1b[-3] = _mm_slli_epi16(step1b[-3], 2);
-          step1b[-2] = _mm_slli_epi16(step1b[-2], 2);
-          step1b[-1] = _mm_slli_epi16(step1b[-1], 2);
-          step1b[-0] = _mm_slli_epi16(step1b[-0], 2);
-        }
-        {
-          const int16_t *ina = in + 8 * str1;
-          const int16_t *inb = in + 23 * str1;
-          __m128i *step1a = &step1[8];
-          __m128i *step1b = &step1[23];
-          const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina));
-          const __m128i ina1 = _mm_loadu_si128((const __m128i *)(ina + str1));
-          const __m128i ina2 = _mm_loadu_si128((const __m128i *)(ina + str2));
-          const __m128i ina3 = _mm_loadu_si128((const __m128i *)(ina + str3));
-          const __m128i inb3 = _mm_loadu_si128((const __m128i *)(inb - str3));
-          const __m128i inb2 = _mm_loadu_si128((const __m128i *)(inb - str2));
-          const __m128i inb1 = _mm_loadu_si128((const __m128i *)(inb - str1));
-          const __m128i inb0 = _mm_loadu_si128((const __m128i *)(inb));
-          step1a[0] = _mm_add_epi16(ina0, inb0);
-          step1a[1] = _mm_add_epi16(ina1, inb1);
-          step1a[2] = _mm_add_epi16(ina2, inb2);
-          step1a[3] = _mm_add_epi16(ina3, inb3);
-          step1b[-3] = _mm_sub_epi16(ina3, inb3);
-          step1b[-2] = _mm_sub_epi16(ina2, inb2);
-          step1b[-1] = _mm_sub_epi16(ina1, inb1);
-          step1b[-0] = _mm_sub_epi16(ina0, inb0);
-          step1a[0] = _mm_slli_epi16(step1a[0], 2);
-          step1a[1] = _mm_slli_epi16(step1a[1], 2);
-          step1a[2] = _mm_slli_epi16(step1a[2], 2);
-          step1a[3] = _mm_slli_epi16(step1a[3], 2);
-          step1b[-3] = _mm_slli_epi16(step1b[-3], 2);
-          step1b[-2] = _mm_slli_epi16(step1b[-2], 2);
-          step1b[-1] = _mm_slli_epi16(step1b[-1], 2);
-          step1b[-0] = _mm_slli_epi16(step1b[-0], 2);
-        }
-        {
-          const int16_t *ina = in + 12 * str1;
-          const int16_t *inb = in + 19 * str1;
-          __m128i *step1a = &step1[12];
-          __m128i *step1b = &step1[19];
-          const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina));
-          const __m128i ina1 = _mm_loadu_si128((const __m128i *)(ina + str1));
-          const __m128i ina2 = _mm_loadu_si128((const __m128i *)(ina + str2));
-          const __m128i ina3 = _mm_loadu_si128((const __m128i *)(ina + str3));
-          const __m128i inb3 = _mm_loadu_si128((const __m128i *)(inb - str3));
-          const __m128i inb2 = _mm_loadu_si128((const __m128i *)(inb - str2));
-          const __m128i inb1 = _mm_loadu_si128((const __m128i *)(inb - str1));
-          const __m128i inb0 = _mm_loadu_si128((const __m128i *)(inb));
-          step1a[0] = _mm_add_epi16(ina0, inb0);
-          step1a[1] = _mm_add_epi16(ina1, inb1);
-          step1a[2] = _mm_add_epi16(ina2, inb2);
-          step1a[3] = _mm_add_epi16(ina3, inb3);
-          step1b[-3] = _mm_sub_epi16(ina3, inb3);
-          step1b[-2] = _mm_sub_epi16(ina2, inb2);
-          step1b[-1] = _mm_sub_epi16(ina1, inb1);
-          step1b[-0] = _mm_sub_epi16(ina0, inb0);
-          step1a[0] = _mm_slli_epi16(step1a[0], 2);
-          step1a[1] = _mm_slli_epi16(step1a[1], 2);
-          step1a[2] = _mm_slli_epi16(step1a[2], 2);
-          step1a[3] = _mm_slli_epi16(step1a[3], 2);
-          step1b[-3] = _mm_slli_epi16(step1b[-3], 2);
-          step1b[-2] = _mm_slli_epi16(step1b[-2], 2);
-          step1b[-1] = _mm_slli_epi16(step1b[-1], 2);
-          step1b[-0] = _mm_slli_epi16(step1b[-0], 2);
-        }
-      } else {
-        int16_t *in = &intermediate[column_start];
-        // step1[i] =  in[ 0 * 32] + in[(32 -  1) * 32];
-        // Note: using the same approach as above to have a common offset is
-        //       counter-productive as all offsets can be calculated at compile
-        //       time.
-        // Note: the next four blocks could be in a loop. That would help the
-        //       instruction cache but is actually slower.
-        {
-          __m128i in00 = _mm_loadu_si128((const __m128i *)(in + 0 * 32));
-          __m128i in01 = _mm_loadu_si128((const __m128i *)(in + 1 * 32));
-          __m128i in02 = _mm_loadu_si128((const __m128i *)(in + 2 * 32));
-          __m128i in03 = _mm_loadu_si128((const __m128i *)(in + 3 * 32));
-          __m128i in28 = _mm_loadu_si128((const __m128i *)(in + 28 * 32));
-          __m128i in29 = _mm_loadu_si128((const __m128i *)(in + 29 * 32));
-          __m128i in30 = _mm_loadu_si128((const __m128i *)(in + 30 * 32));
-          __m128i in31 = _mm_loadu_si128((const __m128i *)(in + 31 * 32));
-          step1[0] = ADD_EPI16(in00, in31);
-          step1[1] = ADD_EPI16(in01, in30);
-          step1[2] = ADD_EPI16(in02, in29);
-          step1[3] = ADD_EPI16(in03, in28);
-          step1[28] = SUB_EPI16(in03, in28);
-          step1[29] = SUB_EPI16(in02, in29);
-          step1[30] = SUB_EPI16(in01, in30);
-          step1[31] = SUB_EPI16(in00, in31);
-#if DCT_HIGH_BIT_DEPTH
-          overflow = check_epi16_overflow_x8(&step1[0], &step1[1], &step1[2],
-                                             &step1[3], &step1[28], &step1[29],
-                                             &step1[30], &step1[31]);
-          if (overflow) {
-            HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
-            return;
-          }
-#endif  // DCT_HIGH_BIT_DEPTH
-        }
-        {
-          __m128i in04 = _mm_loadu_si128((const __m128i *)(in + 4 * 32));
-          __m128i in05 = _mm_loadu_si128((const __m128i *)(in + 5 * 32));
-          __m128i in06 = _mm_loadu_si128((const __m128i *)(in + 6 * 32));
-          __m128i in07 = _mm_loadu_si128((const __m128i *)(in + 7 * 32));
-          __m128i in24 = _mm_loadu_si128((const __m128i *)(in + 24 * 32));
-          __m128i in25 = _mm_loadu_si128((const __m128i *)(in + 25 * 32));
-          __m128i in26 = _mm_loadu_si128((const __m128i *)(in + 26 * 32));
-          __m128i in27 = _mm_loadu_si128((const __m128i *)(in + 27 * 32));
-          step1[4] = ADD_EPI16(in04, in27);
-          step1[5] = ADD_EPI16(in05, in26);
-          step1[6] = ADD_EPI16(in06, in25);
-          step1[7] = ADD_EPI16(in07, in24);
-          step1[24] = SUB_EPI16(in07, in24);
-          step1[25] = SUB_EPI16(in06, in25);
-          step1[26] = SUB_EPI16(in05, in26);
-          step1[27] = SUB_EPI16(in04, in27);
-#if DCT_HIGH_BIT_DEPTH
-          overflow = check_epi16_overflow_x8(&step1[4], &step1[5], &step1[6],
-                                             &step1[7], &step1[24], &step1[25],
-                                             &step1[26], &step1[27]);
-          if (overflow) {
-            HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
-            return;
-          }
-#endif  // DCT_HIGH_BIT_DEPTH
-        }
-        {
-          __m128i in08 = _mm_loadu_si128((const __m128i *)(in + 8 * 32));
-          __m128i in09 = _mm_loadu_si128((const __m128i *)(in + 9 * 32));
-          __m128i in10 = _mm_loadu_si128((const __m128i *)(in + 10 * 32));
-          __m128i in11 = _mm_loadu_si128((const __m128i *)(in + 11 * 32));
-          __m128i in20 = _mm_loadu_si128((const __m128i *)(in + 20 * 32));
-          __m128i in21 = _mm_loadu_si128((const __m128i *)(in + 21 * 32));
-          __m128i in22 = _mm_loadu_si128((const __m128i *)(in + 22 * 32));
-          __m128i in23 = _mm_loadu_si128((const __m128i *)(in + 23 * 32));
-          step1[8] = ADD_EPI16(in08, in23);
-          step1[9] = ADD_EPI16(in09, in22);
-          step1[10] = ADD_EPI16(in10, in21);
-          step1[11] = ADD_EPI16(in11, in20);
-          step1[20] = SUB_EPI16(in11, in20);
-          step1[21] = SUB_EPI16(in10, in21);
-          step1[22] = SUB_EPI16(in09, in22);
-          step1[23] = SUB_EPI16(in08, in23);
-#if DCT_HIGH_BIT_DEPTH
-          overflow = check_epi16_overflow_x8(&step1[8], &step1[9], &step1[10],
-                                             &step1[11], &step1[20], &step1[21],
-                                             &step1[22], &step1[23]);
-          if (overflow) {
-            HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
-            return;
-          }
-#endif  // DCT_HIGH_BIT_DEPTH
-        }
-        {
-          __m128i in12 = _mm_loadu_si128((const __m128i *)(in + 12 * 32));
-          __m128i in13 = _mm_loadu_si128((const __m128i *)(in + 13 * 32));
-          __m128i in14 = _mm_loadu_si128((const __m128i *)(in + 14 * 32));
-          __m128i in15 = _mm_loadu_si128((const __m128i *)(in + 15 * 32));
-          __m128i in16 = _mm_loadu_si128((const __m128i *)(in + 16 * 32));
-          __m128i in17 = _mm_loadu_si128((const __m128i *)(in + 17 * 32));
-          __m128i in18 = _mm_loadu_si128((const __m128i *)(in + 18 * 32));
-          __m128i in19 = _mm_loadu_si128((const __m128i *)(in + 19 * 32));
-          step1[12] = ADD_EPI16(in12, in19);
-          step1[13] = ADD_EPI16(in13, in18);
-          step1[14] = ADD_EPI16(in14, in17);
-          step1[15] = ADD_EPI16(in15, in16);
-          step1[16] = SUB_EPI16(in15, in16);
-          step1[17] = SUB_EPI16(in14, in17);
-          step1[18] = SUB_EPI16(in13, in18);
-          step1[19] = SUB_EPI16(in12, in19);
-#if DCT_HIGH_BIT_DEPTH
-          overflow = check_epi16_overflow_x8(&step1[12], &step1[13], &step1[14],
-                                             &step1[15], &step1[16], &step1[17],
-                                             &step1[18], &step1[19]);
-          if (overflow) {
-            HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
-            return;
-          }
-#endif  // DCT_HIGH_BIT_DEPTH
-        }
-      }
-      // Stage 2
-      {
-        step2[0] = ADD_EPI16(step1[0], step1[15]);
-        step2[1] = ADD_EPI16(step1[1], step1[14]);
-        step2[2] = ADD_EPI16(step1[2], step1[13]);
-        step2[3] = ADD_EPI16(step1[3], step1[12]);
-        step2[4] = ADD_EPI16(step1[4], step1[11]);
-        step2[5] = ADD_EPI16(step1[5], step1[10]);
-        step2[6] = ADD_EPI16(step1[6], step1[9]);
-        step2[7] = ADD_EPI16(step1[7], step1[8]);
-        step2[8] = SUB_EPI16(step1[7], step1[8]);
-        step2[9] = SUB_EPI16(step1[6], step1[9]);
-        step2[10] = SUB_EPI16(step1[5], step1[10]);
-        step2[11] = SUB_EPI16(step1[4], step1[11]);
-        step2[12] = SUB_EPI16(step1[3], step1[12]);
-        step2[13] = SUB_EPI16(step1[2], step1[13]);
-        step2[14] = SUB_EPI16(step1[1], step1[14]);
-        step2[15] = SUB_EPI16(step1[0], step1[15]);
-#if DCT_HIGH_BIT_DEPTH
-        overflow = check_epi16_overflow_x16(
-            &step2[0], &step2[1], &step2[2], &step2[3], &step2[4], &step2[5],
-            &step2[6], &step2[7], &step2[8], &step2[9], &step2[10], &step2[11],
-            &step2[12], &step2[13], &step2[14], &step2[15]);
-        if (overflow) {
-          if (pass == 0)
-            HIGH_FDCT32x32_2D_C(input, output_org, stride);
-          else
-            HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
-          return;
-        }
-#endif  // DCT_HIGH_BIT_DEPTH
-      }
-      {
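-        // Interleaving step1[27 - k] with step1[20 + k] lets _mm_madd_epi16
-        // form both halves of the butterfly in one multiply: the
-        // (cospi_16_64, -cospi_16_64) pair yields the scaled difference for
-        // step2[20 + k], and the (cospi_16_64, cospi_16_64) pair the scaled
-        // sum for step2[27 - k].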
-        const __m128i s2_20_0 = _mm_unpacklo_epi16(step1[27], step1[20]);
-        const __m128i s2_20_1 = _mm_unpackhi_epi16(step1[27], step1[20]);
-        const __m128i s2_21_0 = _mm_unpacklo_epi16(step1[26], step1[21]);
-        const __m128i s2_21_1 = _mm_unpackhi_epi16(step1[26], step1[21]);
-        const __m128i s2_22_0 = _mm_unpacklo_epi16(step1[25], step1[22]);
-        const __m128i s2_22_1 = _mm_unpackhi_epi16(step1[25], step1[22]);
-        const __m128i s2_23_0 = _mm_unpacklo_epi16(step1[24], step1[23]);
-        const __m128i s2_23_1 = _mm_unpackhi_epi16(step1[24], step1[23]);
-        const __m128i s2_20_2 = _mm_madd_epi16(s2_20_0, k__cospi_p16_m16);
-        const __m128i s2_20_3 = _mm_madd_epi16(s2_20_1, k__cospi_p16_m16);
-        const __m128i s2_21_2 = _mm_madd_epi16(s2_21_0, k__cospi_p16_m16);
-        const __m128i s2_21_3 = _mm_madd_epi16(s2_21_1, k__cospi_p16_m16);
-        const __m128i s2_22_2 = _mm_madd_epi16(s2_22_0, k__cospi_p16_m16);
-        const __m128i s2_22_3 = _mm_madd_epi16(s2_22_1, k__cospi_p16_m16);
-        const __m128i s2_23_2 = _mm_madd_epi16(s2_23_0, k__cospi_p16_m16);
-        const __m128i s2_23_3 = _mm_madd_epi16(s2_23_1, k__cospi_p16_m16);
-        const __m128i s2_24_2 = _mm_madd_epi16(s2_23_0, k__cospi_p16_p16);
-        const __m128i s2_24_3 = _mm_madd_epi16(s2_23_1, k__cospi_p16_p16);
-        const __m128i s2_25_2 = _mm_madd_epi16(s2_22_0, k__cospi_p16_p16);
-        const __m128i s2_25_3 = _mm_madd_epi16(s2_22_1, k__cospi_p16_p16);
-        const __m128i s2_26_2 = _mm_madd_epi16(s2_21_0, k__cospi_p16_p16);
-        const __m128i s2_26_3 = _mm_madd_epi16(s2_21_1, k__cospi_p16_p16);
-        const __m128i s2_27_2 = _mm_madd_epi16(s2_20_0, k__cospi_p16_p16);
-        const __m128i s2_27_3 = _mm_madd_epi16(s2_20_1, k__cospi_p16_p16);
-        // dct_const_round_shift
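-        // (add 1 << (DCT_CONST_BITS - 1), then shift right arithmetically by
-        // DCT_CONST_BITS, rounding the 32-bit products back to 16-bit
-        // precision)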
-        const __m128i s2_20_4 = _mm_add_epi32(s2_20_2, k__DCT_CONST_ROUNDING);
-        const __m128i s2_20_5 = _mm_add_epi32(s2_20_3, k__DCT_CONST_ROUNDING);
-        const __m128i s2_21_4 = _mm_add_epi32(s2_21_2, k__DCT_CONST_ROUNDING);
-        const __m128i s2_21_5 = _mm_add_epi32(s2_21_3, k__DCT_CONST_ROUNDING);
-        const __m128i s2_22_4 = _mm_add_epi32(s2_22_2, k__DCT_CONST_ROUNDING);
-        const __m128i s2_22_5 = _mm_add_epi32(s2_22_3, k__DCT_CONST_ROUNDING);
-        const __m128i s2_23_4 = _mm_add_epi32(s2_23_2, k__DCT_CONST_ROUNDING);
-        const __m128i s2_23_5 = _mm_add_epi32(s2_23_3, k__DCT_CONST_ROUNDING);
-        const __m128i s2_24_4 = _mm_add_epi32(s2_24_2, k__DCT_CONST_ROUNDING);
-        const __m128i s2_24_5 = _mm_add_epi32(s2_24_3, k__DCT_CONST_ROUNDING);
-        const __m128i s2_25_4 = _mm_add_epi32(s2_25_2, k__DCT_CONST_ROUNDING);
-        const __m128i s2_25_5 = _mm_add_epi32(s2_25_3, k__DCT_CONST_ROUNDING);
-        const __m128i s2_26_4 = _mm_add_epi32(s2_26_2, k__DCT_CONST_ROUNDING);
-        const __m128i s2_26_5 = _mm_add_epi32(s2_26_3, k__DCT_CONST_ROUNDING);
-        const __m128i s2_27_4 = _mm_add_epi32(s2_27_2, k__DCT_CONST_ROUNDING);
-        const __m128i s2_27_5 = _mm_add_epi32(s2_27_3, k__DCT_CONST_ROUNDING);
-        const __m128i s2_20_6 = _mm_srai_epi32(s2_20_4, DCT_CONST_BITS);
-        const __m128i s2_20_7 = _mm_srai_epi32(s2_20_5, DCT_CONST_BITS);
-        const __m128i s2_21_6 = _mm_srai_epi32(s2_21_4, DCT_CONST_BITS);
-        const __m128i s2_21_7 = _mm_srai_epi32(s2_21_5, DCT_CONST_BITS);
-        const __m128i s2_22_6 = _mm_srai_epi32(s2_22_4, DCT_CONST_BITS);
-        const __m128i s2_22_7 = _mm_srai_epi32(s2_22_5, DCT_CONST_BITS);
-        const __m128i s2_23_6 = _mm_srai_epi32(s2_23_4, DCT_CONST_BITS);
-        const __m128i s2_23_7 = _mm_srai_epi32(s2_23_5, DCT_CONST_BITS);
-        const __m128i s2_24_6 = _mm_srai_epi32(s2_24_4, DCT_CONST_BITS);
-        const __m128i s2_24_7 = _mm_srai_epi32(s2_24_5, DCT_CONST_BITS);
-        const __m128i s2_25_6 = _mm_srai_epi32(s2_25_4, DCT_CONST_BITS);
-        const __m128i s2_25_7 = _mm_srai_epi32(s2_25_5, DCT_CONST_BITS);
-        const __m128i s2_26_6 = _mm_srai_epi32(s2_26_4, DCT_CONST_BITS);
-        const __m128i s2_26_7 = _mm_srai_epi32(s2_26_5, DCT_CONST_BITS);
-        const __m128i s2_27_6 = _mm_srai_epi32(s2_27_4, DCT_CONST_BITS);
-        const __m128i s2_27_7 = _mm_srai_epi32(s2_27_5, DCT_CONST_BITS);
-        // Combine
-        step2[20] = _mm_packs_epi32(s2_20_6, s2_20_7);
-        step2[21] = _mm_packs_epi32(s2_21_6, s2_21_7);
-        step2[22] = _mm_packs_epi32(s2_22_6, s2_22_7);
-        step2[23] = _mm_packs_epi32(s2_23_6, s2_23_7);
-        step2[24] = _mm_packs_epi32(s2_24_6, s2_24_7);
-        step2[25] = _mm_packs_epi32(s2_25_6, s2_25_7);
-        step2[26] = _mm_packs_epi32(s2_26_6, s2_26_7);
-        step2[27] = _mm_packs_epi32(s2_27_6, s2_27_7);
-#if DCT_HIGH_BIT_DEPTH
-        overflow = check_epi16_overflow_x8(&step2[20], &step2[21], &step2[22],
-                                           &step2[23], &step2[24], &step2[25],
-                                           &step2[26], &step2[27]);
-        if (overflow) {
-          if (pass == 0)
-            HIGH_FDCT32x32_2D_C(input, output_org, stride);
-          else
-            HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
-          return;
-        }
-#endif  // DCT_HIGH_BIT_DEPTH
-      }
-
-#if !FDCT32x32_HIGH_PRECISION
-      // Scale the magnitude down (dividing by 4 with rounding) so that the
-      // intermediate values stay within the range of 16 bits.
-      if (1 == pass) {
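-        // _mm_cmplt_epi16 sets a lane to -1 where the value is negative;
-        // subtracting that mask adds 1 to the negative lanes, so together
-        // with the unconditional kOne add and the arithmetic shift below,
-        // this computes (x + 1 + (x < 0)) >> 2 per lane.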
-        __m128i s3_00_0 = _mm_cmplt_epi16(step2[0], kZero);
-        __m128i s3_01_0 = _mm_cmplt_epi16(step2[1], kZero);
-        __m128i s3_02_0 = _mm_cmplt_epi16(step2[2], kZero);
-        __m128i s3_03_0 = _mm_cmplt_epi16(step2[3], kZero);
-        __m128i s3_04_0 = _mm_cmplt_epi16(step2[4], kZero);
-        __m128i s3_05_0 = _mm_cmplt_epi16(step2[5], kZero);
-        __m128i s3_06_0 = _mm_cmplt_epi16(step2[6], kZero);
-        __m128i s3_07_0 = _mm_cmplt_epi16(step2[7], kZero);
-        __m128i s3_08_0 = _mm_cmplt_epi16(step2[8], kZero);
-        __m128i s3_09_0 = _mm_cmplt_epi16(step2[9], kZero);
-        __m128i s3_10_0 = _mm_cmplt_epi16(step2[10], kZero);
-        __m128i s3_11_0 = _mm_cmplt_epi16(step2[11], kZero);
-        __m128i s3_12_0 = _mm_cmplt_epi16(step2[12], kZero);
-        __m128i s3_13_0 = _mm_cmplt_epi16(step2[13], kZero);
-        __m128i s3_14_0 = _mm_cmplt_epi16(step2[14], kZero);
-        __m128i s3_15_0 = _mm_cmplt_epi16(step2[15], kZero);
-        __m128i s3_16_0 = _mm_cmplt_epi16(step1[16], kZero);
-        __m128i s3_17_0 = _mm_cmplt_epi16(step1[17], kZero);
-        __m128i s3_18_0 = _mm_cmplt_epi16(step1[18], kZero);
-        __m128i s3_19_0 = _mm_cmplt_epi16(step1[19], kZero);
-        __m128i s3_20_0 = _mm_cmplt_epi16(step2[20], kZero);
-        __m128i s3_21_0 = _mm_cmplt_epi16(step2[21], kZero);
-        __m128i s3_22_0 = _mm_cmplt_epi16(step2[22], kZero);
-        __m128i s3_23_0 = _mm_cmplt_epi16(step2[23], kZero);
-        __m128i s3_24_0 = _mm_cmplt_epi16(step2[24], kZero);
-        __m128i s3_25_0 = _mm_cmplt_epi16(step2[25], kZero);
-        __m128i s3_26_0 = _mm_cmplt_epi16(step2[26], kZero);
-        __m128i s3_27_0 = _mm_cmplt_epi16(step2[27], kZero);
-        __m128i s3_28_0 = _mm_cmplt_epi16(step1[28], kZero);
-        __m128i s3_29_0 = _mm_cmplt_epi16(step1[29], kZero);
-        __m128i s3_30_0 = _mm_cmplt_epi16(step1[30], kZero);
-        __m128i s3_31_0 = _mm_cmplt_epi16(step1[31], kZero);
-
-        step2[0] = SUB_EPI16(step2[0], s3_00_0);
-        step2[1] = SUB_EPI16(step2[1], s3_01_0);
-        step2[2] = SUB_EPI16(step2[2], s3_02_0);
-        step2[3] = SUB_EPI16(step2[3], s3_03_0);
-        step2[4] = SUB_EPI16(step2[4], s3_04_0);
-        step2[5] = SUB_EPI16(step2[5], s3_05_0);
-        step2[6] = SUB_EPI16(step2[6], s3_06_0);
-        step2[7] = SUB_EPI16(step2[7], s3_07_0);
-        step2[8] = SUB_EPI16(step2[8], s3_08_0);
-        step2[9] = SUB_EPI16(step2[9], s3_09_0);
-        step2[10] = SUB_EPI16(step2[10], s3_10_0);
-        step2[11] = SUB_EPI16(step2[11], s3_11_0);
-        step2[12] = SUB_EPI16(step2[12], s3_12_0);
-        step2[13] = SUB_EPI16(step2[13], s3_13_0);
-        step2[14] = SUB_EPI16(step2[14], s3_14_0);
-        step2[15] = SUB_EPI16(step2[15], s3_15_0);
-        step1[16] = SUB_EPI16(step1[16], s3_16_0);
-        step1[17] = SUB_EPI16(step1[17], s3_17_0);
-        step1[18] = SUB_EPI16(step1[18], s3_18_0);
-        step1[19] = SUB_EPI16(step1[19], s3_19_0);
-        step2[20] = SUB_EPI16(step2[20], s3_20_0);
-        step2[21] = SUB_EPI16(step2[21], s3_21_0);
-        step2[22] = SUB_EPI16(step2[22], s3_22_0);
-        step2[23] = SUB_EPI16(step2[23], s3_23_0);
-        step2[24] = SUB_EPI16(step2[24], s3_24_0);
-        step2[25] = SUB_EPI16(step2[25], s3_25_0);
-        step2[26] = SUB_EPI16(step2[26], s3_26_0);
-        step2[27] = SUB_EPI16(step2[27], s3_27_0);
-        step1[28] = SUB_EPI16(step1[28], s3_28_0);
-        step1[29] = SUB_EPI16(step1[29], s3_29_0);
-        step1[30] = SUB_EPI16(step1[30], s3_30_0);
-        step1[31] = SUB_EPI16(step1[31], s3_31_0);
-#if DCT_HIGH_BIT_DEPTH
-        overflow = check_epi16_overflow_x32(
-            &step2[0], &step2[1], &step2[2], &step2[3], &step2[4], &step2[5],
-            &step2[6], &step2[7], &step2[8], &step2[9], &step2[10], &step2[11],
-            &step2[12], &step2[13], &step2[14], &step2[15], &step1[16],
-            &step1[17], &step1[18], &step1[19], &step2[20], &step2[21],
-            &step2[22], &step2[23], &step2[24], &step2[25], &step2[26],
-            &step2[27], &step1[28], &step1[29], &step1[30], &step1[31]);
-        if (overflow) {
-          HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
-          return;
-        }
-#endif  // DCT_HIGH_BIT_DEPTH
-        step2[0] = _mm_add_epi16(step2[0], kOne);
-        step2[1] = _mm_add_epi16(step2[1], kOne);
-        step2[2] = _mm_add_epi16(step2[2], kOne);
-        step2[3] = _mm_add_epi16(step2[3], kOne);
-        step2[4] = _mm_add_epi16(step2[4], kOne);
-        step2[5] = _mm_add_epi16(step2[5], kOne);
-        step2[6] = _mm_add_epi16(step2[6], kOne);
-        step2[7] = _mm_add_epi16(step2[7], kOne);
-        step2[8] = _mm_add_epi16(step2[8], kOne);
-        step2[9] = _mm_add_epi16(step2[9], kOne);
-        step2[10] = _mm_add_epi16(step2[10], kOne);
-        step2[11] = _mm_add_epi16(step2[11], kOne);
-        step2[12] = _mm_add_epi16(step2[12], kOne);
-        step2[13] = _mm_add_epi16(step2[13], kOne);
-        step2[14] = _mm_add_epi16(step2[14], kOne);
-        step2[15] = _mm_add_epi16(step2[15], kOne);
-        step1[16] = _mm_add_epi16(step1[16], kOne);
-        step1[17] = _mm_add_epi16(step1[17], kOne);
-        step1[18] = _mm_add_epi16(step1[18], kOne);
-        step1[19] = _mm_add_epi16(step1[19], kOne);
-        step2[20] = _mm_add_epi16(step2[20], kOne);
-        step2[21] = _mm_add_epi16(step2[21], kOne);
-        step2[22] = _mm_add_epi16(step2[22], kOne);
-        step2[23] = _mm_add_epi16(step2[23], kOne);
-        step2[24] = _mm_add_epi16(step2[24], kOne);
-        step2[25] = _mm_add_epi16(step2[25], kOne);
-        step2[26] = _mm_add_epi16(step2[26], kOne);
-        step2[27] = _mm_add_epi16(step2[27], kOne);
-        step1[28] = _mm_add_epi16(step1[28], kOne);
-        step1[29] = _mm_add_epi16(step1[29], kOne);
-        step1[30] = _mm_add_epi16(step1[30], kOne);
-        step1[31] = _mm_add_epi16(step1[31], kOne);
-
-        step2[0] = _mm_srai_epi16(step2[0], 2);
-        step2[1] = _mm_srai_epi16(step2[1], 2);
-        step2[2] = _mm_srai_epi16(step2[2], 2);
-        step2[3] = _mm_srai_epi16(step2[3], 2);
-        step2[4] = _mm_srai_epi16(step2[4], 2);
-        step2[5] = _mm_srai_epi16(step2[5], 2);
-        step2[6] = _mm_srai_epi16(step2[6], 2);
-        step2[7] = _mm_srai_epi16(step2[7], 2);
-        step2[8] = _mm_srai_epi16(step2[8], 2);
-        step2[9] = _mm_srai_epi16(step2[9], 2);
-        step2[10] = _mm_srai_epi16(step2[10], 2);
-        step2[11] = _mm_srai_epi16(step2[11], 2);
-        step2[12] = _mm_srai_epi16(step2[12], 2);
-        step2[13] = _mm_srai_epi16(step2[13], 2);
-        step2[14] = _mm_srai_epi16(step2[14], 2);
-        step2[15] = _mm_srai_epi16(step2[15], 2);
-        step1[16] = _mm_srai_epi16(step1[16], 2);
-        step1[17] = _mm_srai_epi16(step1[17], 2);
-        step1[18] = _mm_srai_epi16(step1[18], 2);
-        step1[19] = _mm_srai_epi16(step1[19], 2);
-        step2[20] = _mm_srai_epi16(step2[20], 2);
-        step2[21] = _mm_srai_epi16(step2[21], 2);
-        step2[22] = _mm_srai_epi16(step2[22], 2);
-        step2[23] = _mm_srai_epi16(step2[23], 2);
-        step2[24] = _mm_srai_epi16(step2[24], 2);
-        step2[25] = _mm_srai_epi16(step2[25], 2);
-        step2[26] = _mm_srai_epi16(step2[26], 2);
-        step2[27] = _mm_srai_epi16(step2[27], 2);
-        step1[28] = _mm_srai_epi16(step1[28], 2);
-        step1[29] = _mm_srai_epi16(step1[29], 2);
-        step1[30] = _mm_srai_epi16(step1[30], 2);
-        step1[31] = _mm_srai_epi16(step1[31], 2);
-      }
-#endif  // !FDCT32x32_HIGH_PRECISION
-
-#if FDCT32x32_HIGH_PRECISION
-      if (pass == 0) {
-#endif
-        // Stage 3
-        {
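-          // step3[i] = step2[7 - i] + step2[i] for i < 4, and
-          // step3[i] = step2[7 - i] - step2[i] for 4 <= i < 8.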
-          step3[0] = ADD_EPI16(step2[(8 - 1)], step2[0]);
-          step3[1] = ADD_EPI16(step2[(8 - 2)], step2[1]);
-          step3[2] = ADD_EPI16(step2[(8 - 3)], step2[2]);
-          step3[3] = ADD_EPI16(step2[(8 - 4)], step2[3]);
-          step3[4] = SUB_EPI16(step2[(8 - 5)], step2[4]);
-          step3[5] = SUB_EPI16(step2[(8 - 6)], step2[5]);
-          step3[6] = SUB_EPI16(step2[(8 - 7)], step2[6]);
-          step3[7] = SUB_EPI16(step2[(8 - 8)], step2[7]);
-#if DCT_HIGH_BIT_DEPTH
-          overflow = check_epi16_overflow_x8(&step3[0], &step3[1], &step3[2],
-                                             &step3[3], &step3[4], &step3[5],
-                                             &step3[6], &step3[7]);
-          if (overflow) {
-            if (pass == 0)
-              HIGH_FDCT32x32_2D_C(input, output_org, stride);
-            else
-              HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
-            return;
-          }
-#endif  // DCT_HIGH_BIT_DEPTH
-        }
-        {
-          const __m128i s3_10_0 = _mm_unpacklo_epi16(step2[13], step2[10]);
-          const __m128i s3_10_1 = _mm_unpackhi_epi16(step2[13], step2[10]);
-          const __m128i s3_11_0 = _mm_unpacklo_epi16(step2[12], step2[11]);
-          const __m128i s3_11_1 = _mm_unpackhi_epi16(step2[12], step2[11]);
-          const __m128i s3_10_2 = _mm_madd_epi16(s3_10_0, k__cospi_p16_m16);
-          const __m128i s3_10_3 = _mm_madd_epi16(s3_10_1, k__cospi_p16_m16);
-          const __m128i s3_11_2 = _mm_madd_epi16(s3_11_0, k__cospi_p16_m16);
-          const __m128i s3_11_3 = _mm_madd_epi16(s3_11_1, k__cospi_p16_m16);
-          const __m128i s3_12_2 = _mm_madd_epi16(s3_11_0, k__cospi_p16_p16);
-          const __m128i s3_12_3 = _mm_madd_epi16(s3_11_1, k__cospi_p16_p16);
-          const __m128i s3_13_2 = _mm_madd_epi16(s3_10_0, k__cospi_p16_p16);
-          const __m128i s3_13_3 = _mm_madd_epi16(s3_10_1, k__cospi_p16_p16);
-          // dct_const_round_shift
-          const __m128i s3_10_4 = _mm_add_epi32(s3_10_2, k__DCT_CONST_ROUNDING);
-          const __m128i s3_10_5 = _mm_add_epi32(s3_10_3, k__DCT_CONST_ROUNDING);
-          const __m128i s3_11_4 = _mm_add_epi32(s3_11_2, k__DCT_CONST_ROUNDING);
-          const __m128i s3_11_5 = _mm_add_epi32(s3_11_3, k__DCT_CONST_ROUNDING);
-          const __m128i s3_12_4 = _mm_add_epi32(s3_12_2, k__DCT_CONST_ROUNDING);
-          const __m128i s3_12_5 = _mm_add_epi32(s3_12_3, k__DCT_CONST_ROUNDING);
-          const __m128i s3_13_4 = _mm_add_epi32(s3_13_2, k__DCT_CONST_ROUNDING);
-          const __m128i s3_13_5 = _mm_add_epi32(s3_13_3, k__DCT_CONST_ROUNDING);
-          const __m128i s3_10_6 = _mm_srai_epi32(s3_10_4, DCT_CONST_BITS);
-          const __m128i s3_10_7 = _mm_srai_epi32(s3_10_5, DCT_CONST_BITS);
-          const __m128i s3_11_6 = _mm_srai_epi32(s3_11_4, DCT_CONST_BITS);
-          const __m128i s3_11_7 = _mm_srai_epi32(s3_11_5, DCT_CONST_BITS);
-          const __m128i s3_12_6 = _mm_srai_epi32(s3_12_4, DCT_CONST_BITS);
-          const __m128i s3_12_7 = _mm_srai_epi32(s3_12_5, DCT_CONST_BITS);
-          const __m128i s3_13_6 = _mm_srai_epi32(s3_13_4, DCT_CONST_BITS);
-          const __m128i s3_13_7 = _mm_srai_epi32(s3_13_5, DCT_CONST_BITS);
-          // Combine
-          step3[10] = _mm_packs_epi32(s3_10_6, s3_10_7);
-          step3[11] = _mm_packs_epi32(s3_11_6, s3_11_7);
-          step3[12] = _mm_packs_epi32(s3_12_6, s3_12_7);
-          step3[13] = _mm_packs_epi32(s3_13_6, s3_13_7);
-#if DCT_HIGH_BIT_DEPTH
-          overflow = check_epi16_overflow_x4(&step3[10], &step3[11], &step3[12],
-                                             &step3[13]);
-          if (overflow) {
-            if (pass == 0)
-              HIGH_FDCT32x32_2D_C(input, output_org, stride);
-            else
-              HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
-            return;
-          }
-#endif  // DCT_HIGH_BIT_DEPTH
-        }
-        {
-          step3[16] = ADD_EPI16(step2[23], step1[16]);
-          step3[17] = ADD_EPI16(step2[22], step1[17]);
-          step3[18] = ADD_EPI16(step2[21], step1[18]);
-          step3[19] = ADD_EPI16(step2[20], step1[19]);
-          step3[20] = SUB_EPI16(step1[19], step2[20]);
-          step3[21] = SUB_EPI16(step1[18], step2[21]);
-          step3[22] = SUB_EPI16(step1[17], step2[22]);
-          step3[23] = SUB_EPI16(step1[16], step2[23]);
-          step3[24] = SUB_EPI16(step1[31], step2[24]);
-          step3[25] = SUB_EPI16(step1[30], step2[25]);
-          step3[26] = SUB_EPI16(step1[29], step2[26]);
-          step3[27] = SUB_EPI16(step1[28], step2[27]);
-          step3[28] = ADD_EPI16(step2[27], step1[28]);
-          step3[29] = ADD_EPI16(step2[26], step1[29]);
-          step3[30] = ADD_EPI16(step2[25], step1[30]);
-          step3[31] = ADD_EPI16(step2[24], step1[31]);
-#if DCT_HIGH_BIT_DEPTH
-          overflow = check_epi16_overflow_x16(
-              &step3[16], &step3[17], &step3[18], &step3[19], &step3[20],
-              &step3[21], &step3[22], &step3[23], &step3[24], &step3[25],
-              &step3[26], &step3[27], &step3[28], &step3[29], &step3[30],
-              &step3[31]);
-          if (overflow) {
-            if (pass == 0)
-              HIGH_FDCT32x32_2D_C(input, output_org, stride);
-            else
-              HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
-            return;
-          }
-#endif  // DCT_HIGH_BIT_DEPTH
-        }
-
-        // Stage 4
-        {
-          step1[0] = ADD_EPI16(step3[3], step3[0]);
-          step1[1] = ADD_EPI16(step3[2], step3[1]);
-          step1[2] = SUB_EPI16(step3[1], step3[2]);
-          step1[3] = SUB_EPI16(step3[0], step3[3]);
-          step1[8] = ADD_EPI16(step3[11], step2[8]);
-          step1[9] = ADD_EPI16(step3[10], step2[9]);
-          step1[10] = SUB_EPI16(step2[9], step3[10]);
-          step1[11] = SUB_EPI16(step2[8], step3[11]);
-          step1[12] = SUB_EPI16(step2[15], step3[12]);
-          step1[13] = SUB_EPI16(step2[14], step3[13]);
-          step1[14] = ADD_EPI16(step3[13], step2[14]);
-          step1[15] = ADD_EPI16(step3[12], step2[15]);
-#if DCT_HIGH_BIT_DEPTH
-          overflow = check_epi16_overflow_x16(
-              &step1[0], &step1[1], &step1[2], &step1[3], &step1[4], &step1[5],
-              &step1[6], &step1[7], &step1[8], &step1[9], &step1[10],
-              &step1[11], &step1[12], &step1[13], &step1[14], &step1[15]);
-          if (overflow) {
-            if (pass == 0)
-              HIGH_FDCT32x32_2D_C(input, output_org, stride);
-            else
-              HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
-            return;
-          }
-#endif  // DCT_HIGH_BIT_DEPTH
-        }
-        {
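-          // step1[5] = (step3[6] - step3[5]) * cospi_16_64 and
-          // step1[6] = (step3[6] + step3[5]) * cospi_16_64, after rounding.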
-          const __m128i s1_05_0 = _mm_unpacklo_epi16(step3[6], step3[5]);
-          const __m128i s1_05_1 = _mm_unpackhi_epi16(step3[6], step3[5]);
-          const __m128i s1_05_2 = _mm_madd_epi16(s1_05_0, k__cospi_p16_m16);
-          const __m128i s1_05_3 = _mm_madd_epi16(s1_05_1, k__cospi_p16_m16);
-          const __m128i s1_06_2 = _mm_madd_epi16(s1_05_0, k__cospi_p16_p16);
-          const __m128i s1_06_3 = _mm_madd_epi16(s1_05_1, k__cospi_p16_p16);
-          // dct_const_round_shift
-          const __m128i s1_05_4 = _mm_add_epi32(s1_05_2, k__DCT_CONST_ROUNDING);
-          const __m128i s1_05_5 = _mm_add_epi32(s1_05_3, k__DCT_CONST_ROUNDING);
-          const __m128i s1_06_4 = _mm_add_epi32(s1_06_2, k__DCT_CONST_ROUNDING);
-          const __m128i s1_06_5 = _mm_add_epi32(s1_06_3, k__DCT_CONST_ROUNDING);
-          const __m128i s1_05_6 = _mm_srai_epi32(s1_05_4, DCT_CONST_BITS);
-          const __m128i s1_05_7 = _mm_srai_epi32(s1_05_5, DCT_CONST_BITS);
-          const __m128i s1_06_6 = _mm_srai_epi32(s1_06_4, DCT_CONST_BITS);
-          const __m128i s1_06_7 = _mm_srai_epi32(s1_06_5, DCT_CONST_BITS);
-          // Combine
-          step1[5] = _mm_packs_epi32(s1_05_6, s1_05_7);
-          step1[6] = _mm_packs_epi32(s1_06_6, s1_06_7);
-#if DCT_HIGH_BIT_DEPTH
-          overflow = check_epi16_overflow_x2(&step1[5], &step1[6]);
-          if (overflow) {
-            if (pass == 0)
-              HIGH_FDCT32x32_2D_C(input, output_org, stride);
-            else
-              HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
-            return;
-          }
-#endif  // DCT_HIGH_BIT_DEPTH
-        }
-        {
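-          // Odd-tree rotations, e.g. step1[18] = step3[29] * cospi_24_64 -
-          // step3[18] * cospi_8_64; the mirrored entries step1[26..29] reuse
-          // the same interleaved inputs with the complementary coefficient
-          // pairs.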
-          const __m128i s1_18_0 = _mm_unpacklo_epi16(step3[18], step3[29]);
-          const __m128i s1_18_1 = _mm_unpackhi_epi16(step3[18], step3[29]);
-          const __m128i s1_19_0 = _mm_unpacklo_epi16(step3[19], step3[28]);
-          const __m128i s1_19_1 = _mm_unpackhi_epi16(step3[19], step3[28]);
-          const __m128i s1_20_0 = _mm_unpacklo_epi16(step3[20], step3[27]);
-          const __m128i s1_20_1 = _mm_unpackhi_epi16(step3[20], step3[27]);
-          const __m128i s1_21_0 = _mm_unpacklo_epi16(step3[21], step3[26]);
-          const __m128i s1_21_1 = _mm_unpackhi_epi16(step3[21], step3[26]);
-          const __m128i s1_18_2 = _mm_madd_epi16(s1_18_0, k__cospi_m08_p24);
-          const __m128i s1_18_3 = _mm_madd_epi16(s1_18_1, k__cospi_m08_p24);
-          const __m128i s1_19_2 = _mm_madd_epi16(s1_19_0, k__cospi_m08_p24);
-          const __m128i s1_19_3 = _mm_madd_epi16(s1_19_1, k__cospi_m08_p24);
-          const __m128i s1_20_2 = _mm_madd_epi16(s1_20_0, k__cospi_m24_m08);
-          const __m128i s1_20_3 = _mm_madd_epi16(s1_20_1, k__cospi_m24_m08);
-          const __m128i s1_21_2 = _mm_madd_epi16(s1_21_0, k__cospi_m24_m08);
-          const __m128i s1_21_3 = _mm_madd_epi16(s1_21_1, k__cospi_m24_m08);
-          const __m128i s1_26_2 = _mm_madd_epi16(s1_21_0, k__cospi_m08_p24);
-          const __m128i s1_26_3 = _mm_madd_epi16(s1_21_1, k__cospi_m08_p24);
-          const __m128i s1_27_2 = _mm_madd_epi16(s1_20_0, k__cospi_m08_p24);
-          const __m128i s1_27_3 = _mm_madd_epi16(s1_20_1, k__cospi_m08_p24);
-          const __m128i s1_28_2 = _mm_madd_epi16(s1_19_0, k__cospi_p24_p08);
-          const __m128i s1_28_3 = _mm_madd_epi16(s1_19_1, k__cospi_p24_p08);
-          const __m128i s1_29_2 = _mm_madd_epi16(s1_18_0, k__cospi_p24_p08);
-          const __m128i s1_29_3 = _mm_madd_epi16(s1_18_1, k__cospi_p24_p08);
-          // dct_const_round_shift
-          const __m128i s1_18_4 = _mm_add_epi32(s1_18_2, k__DCT_CONST_ROUNDING);
-          const __m128i s1_18_5 = _mm_add_epi32(s1_18_3, k__DCT_CONST_ROUNDING);
-          const __m128i s1_19_4 = _mm_add_epi32(s1_19_2, k__DCT_CONST_ROUNDING);
-          const __m128i s1_19_5 = _mm_add_epi32(s1_19_3, k__DCT_CONST_ROUNDING);
-          const __m128i s1_20_4 = _mm_add_epi32(s1_20_2, k__DCT_CONST_ROUNDING);
-          const __m128i s1_20_5 = _mm_add_epi32(s1_20_3, k__DCT_CONST_ROUNDING);
-          const __m128i s1_21_4 = _mm_add_epi32(s1_21_2, k__DCT_CONST_ROUNDING);
-          const __m128i s1_21_5 = _mm_add_epi32(s1_21_3, k__DCT_CONST_ROUNDING);
-          const __m128i s1_26_4 = _mm_add_epi32(s1_26_2, k__DCT_CONST_ROUNDING);
-          const __m128i s1_26_5 = _mm_add_epi32(s1_26_3, k__DCT_CONST_ROUNDING);
-          const __m128i s1_27_4 = _mm_add_epi32(s1_27_2, k__DCT_CONST_ROUNDING);
-          const __m128i s1_27_5 = _mm_add_epi32(s1_27_3, k__DCT_CONST_ROUNDING);
-          const __m128i s1_28_4 = _mm_add_epi32(s1_28_2, k__DCT_CONST_ROUNDING);
-          const __m128i s1_28_5 = _mm_add_epi32(s1_28_3, k__DCT_CONST_ROUNDING);
-          const __m128i s1_29_4 = _mm_add_epi32(s1_29_2, k__DCT_CONST_ROUNDING);
-          const __m128i s1_29_5 = _mm_add_epi32(s1_29_3, k__DCT_CONST_ROUNDING);
-          const __m128i s1_18_6 = _mm_srai_epi32(s1_18_4, DCT_CONST_BITS);
-          const __m128i s1_18_7 = _mm_srai_epi32(s1_18_5, DCT_CONST_BITS);
-          const __m128i s1_19_6 = _mm_srai_epi32(s1_19_4, DCT_CONST_BITS);
-          const __m128i s1_19_7 = _mm_srai_epi32(s1_19_5, DCT_CONST_BITS);
-          const __m128i s1_20_6 = _mm_srai_epi32(s1_20_4, DCT_CONST_BITS);
-          const __m128i s1_20_7 = _mm_srai_epi32(s1_20_5, DCT_CONST_BITS);
-          const __m128i s1_21_6 = _mm_srai_epi32(s1_21_4, DCT_CONST_BITS);
-          const __m128i s1_21_7 = _mm_srai_epi32(s1_21_5, DCT_CONST_BITS);
-          const __m128i s1_26_6 = _mm_srai_epi32(s1_26_4, DCT_CONST_BITS);
-          const __m128i s1_26_7 = _mm_srai_epi32(s1_26_5, DCT_CONST_BITS);
-          const __m128i s1_27_6 = _mm_srai_epi32(s1_27_4, DCT_CONST_BITS);
-          const __m128i s1_27_7 = _mm_srai_epi32(s1_27_5, DCT_CONST_BITS);
-          const __m128i s1_28_6 = _mm_srai_epi32(s1_28_4, DCT_CONST_BITS);
-          const __m128i s1_28_7 = _mm_srai_epi32(s1_28_5, DCT_CONST_BITS);
-          const __m128i s1_29_6 = _mm_srai_epi32(s1_29_4, DCT_CONST_BITS);
-          const __m128i s1_29_7 = _mm_srai_epi32(s1_29_5, DCT_CONST_BITS);
-          // Combine
-          step1[18] = _mm_packs_epi32(s1_18_6, s1_18_7);
-          step1[19] = _mm_packs_epi32(s1_19_6, s1_19_7);
-          step1[20] = _mm_packs_epi32(s1_20_6, s1_20_7);
-          step1[21] = _mm_packs_epi32(s1_21_6, s1_21_7);
-          step1[26] = _mm_packs_epi32(s1_26_6, s1_26_7);
-          step1[27] = _mm_packs_epi32(s1_27_6, s1_27_7);
-          step1[28] = _mm_packs_epi32(s1_28_6, s1_28_7);
-          step1[29] = _mm_packs_epi32(s1_29_6, s1_29_7);
-#if DCT_HIGH_BIT_DEPTH
-          overflow = check_epi16_overflow_x8(&step1[18], &step1[19], &step1[20],
-                                             &step1[21], &step1[26], &step1[27],
-                                             &step1[28], &step1[29]);
-          if (overflow) {
-            if (pass == 0)
-              HIGH_FDCT32x32_2D_C(input, output_org, stride);
-            else
-              HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
-            return;
-          }
-#endif  // DCT_HIGH_BIT_DEPTH
-        }
-        // Stage 5
-        {
-          step2[4] = ADD_EPI16(step1[5], step3[4]);
-          step2[5] = SUB_EPI16(step3[4], step1[5]);
-          step2[6] = SUB_EPI16(step3[7], step1[6]);
-          step2[7] = ADD_EPI16(step1[6], step3[7]);
-#if DCT_HIGH_BIT_DEPTH
-          overflow = check_epi16_overflow_x4(&step2[4], &step2[5], &step2[6],
-                                             &step2[7]);
-          if (overflow) {
-            if (pass == 0)
-              HIGH_FDCT32x32_2D_C(input, output_org, stride);
-            else
-              HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
-            return;
-          }
-#endif  // DCT_HIGH_BIT_DEPTH
-        }
-        {
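-          // Produces DCT outputs 0, 8, 16 and 24: out[0] and out[16] are the
-          // sum and difference of step1[0] and step1[1] scaled by
-          // cospi_16_64; out[8] and out[24] rotate (step1[2], step1[3]) by
-          // the (cospi_24_64, cospi_8_64) pair.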
-          const __m128i out_00_0 = _mm_unpacklo_epi16(step1[0], step1[1]);
-          const __m128i out_00_1 = _mm_unpackhi_epi16(step1[0], step1[1]);
-          const __m128i out_08_0 = _mm_unpacklo_epi16(step1[2], step1[3]);
-          const __m128i out_08_1 = _mm_unpackhi_epi16(step1[2], step1[3]);
-          const __m128i out_00_2 = _mm_madd_epi16(out_00_0, k__cospi_p16_p16);
-          const __m128i out_00_3 = _mm_madd_epi16(out_00_1, k__cospi_p16_p16);
-          const __m128i out_16_2 = _mm_madd_epi16(out_00_0, k__cospi_p16_m16);
-          const __m128i out_16_3 = _mm_madd_epi16(out_00_1, k__cospi_p16_m16);
-          const __m128i out_08_2 = _mm_madd_epi16(out_08_0, k__cospi_p24_p08);
-          const __m128i out_08_3 = _mm_madd_epi16(out_08_1, k__cospi_p24_p08);
-          const __m128i out_24_2 = _mm_madd_epi16(out_08_0, k__cospi_m08_p24);
-          const __m128i out_24_3 = _mm_madd_epi16(out_08_1, k__cospi_m08_p24);
-          // dct_const_round_shift
-          const __m128i out_00_4 =
-              _mm_add_epi32(out_00_2, k__DCT_CONST_ROUNDING);
-          const __m128i out_00_5 =
-              _mm_add_epi32(out_00_3, k__DCT_CONST_ROUNDING);
-          const __m128i out_16_4 =
-              _mm_add_epi32(out_16_2, k__DCT_CONST_ROUNDING);
-          const __m128i out_16_5 =
-              _mm_add_epi32(out_16_3, k__DCT_CONST_ROUNDING);
-          const __m128i out_08_4 =
-              _mm_add_epi32(out_08_2, k__DCT_CONST_ROUNDING);
-          const __m128i out_08_5 =
-              _mm_add_epi32(out_08_3, k__DCT_CONST_ROUNDING);
-          const __m128i out_24_4 =
-              _mm_add_epi32(out_24_2, k__DCT_CONST_ROUNDING);
-          const __m128i out_24_5 =
-              _mm_add_epi32(out_24_3, k__DCT_CONST_ROUNDING);
-          const __m128i out_00_6 = _mm_srai_epi32(out_00_4, DCT_CONST_BITS);
-          const __m128i out_00_7 = _mm_srai_epi32(out_00_5, DCT_CONST_BITS);
-          const __m128i out_16_6 = _mm_srai_epi32(out_16_4, DCT_CONST_BITS);
-          const __m128i out_16_7 = _mm_srai_epi32(out_16_5, DCT_CONST_BITS);
-          const __m128i out_08_6 = _mm_srai_epi32(out_08_4, DCT_CONST_BITS);
-          const __m128i out_08_7 = _mm_srai_epi32(out_08_5, DCT_CONST_BITS);
-          const __m128i out_24_6 = _mm_srai_epi32(out_24_4, DCT_CONST_BITS);
-          const __m128i out_24_7 = _mm_srai_epi32(out_24_5, DCT_CONST_BITS);
-          // Combine
-          out[0] = _mm_packs_epi32(out_00_6, out_00_7);
-          out[16] = _mm_packs_epi32(out_16_6, out_16_7);
-          out[8] = _mm_packs_epi32(out_08_6, out_08_7);
-          out[24] = _mm_packs_epi32(out_24_6, out_24_7);
-#if DCT_HIGH_BIT_DEPTH
-          overflow =
-              check_epi16_overflow_x4(&out[0], &out[16], &out[8], &out[24]);
-          if (overflow) {
-            if (pass == 0)
-              HIGH_FDCT32x32_2D_C(input, output_org, stride);
-            else
-              HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
-            return;
-          }
-#endif  // DCT_HIGH_BIT_DEPTH
-        }
-        {
-          const __m128i s2_09_0 = _mm_unpacklo_epi16(step1[9], step1[14]);
-          const __m128i s2_09_1 = _mm_unpackhi_epi16(step1[9], step1[14]);
-          const __m128i s2_10_0 = _mm_unpacklo_epi16(step1[10], step1[13]);
-          const __m128i s2_10_1 = _mm_unpackhi_epi16(step1[10], step1[13]);
-          const __m128i s2_09_2 = _mm_madd_epi16(s2_09_0, k__cospi_m08_p24);
-          const __m128i s2_09_3 = _mm_madd_epi16(s2_09_1, k__cospi_m08_p24);
-          const __m128i s2_10_2 = _mm_madd_epi16(s2_10_0, k__cospi_m24_m08);
-          const __m128i s2_10_3 = _mm_madd_epi16(s2_10_1, k__cospi_m24_m08);
-          const __m128i s2_13_2 = _mm_madd_epi16(s2_10_0, k__cospi_m08_p24);
-          const __m128i s2_13_3 = _mm_madd_epi16(s2_10_1, k__cospi_m08_p24);
-          const __m128i s2_14_2 = _mm_madd_epi16(s2_09_0, k__cospi_p24_p08);
-          const __m128i s2_14_3 = _mm_madd_epi16(s2_09_1, k__cospi_p24_p08);
-          // dct_const_round_shift
-          const __m128i s2_09_4 = _mm_add_epi32(s2_09_2, k__DCT_CONST_ROUNDING);
-          const __m128i s2_09_5 = _mm_add_epi32(s2_09_3, k__DCT_CONST_ROUNDING);
-          const __m128i s2_10_4 = _mm_add_epi32(s2_10_2, k__DCT_CONST_ROUNDING);
-          const __m128i s2_10_5 = _mm_add_epi32(s2_10_3, k__DCT_CONST_ROUNDING);
-          const __m128i s2_13_4 = _mm_add_epi32(s2_13_2, k__DCT_CONST_ROUNDING);
-          const __m128i s2_13_5 = _mm_add_epi32(s2_13_3, k__DCT_CONST_ROUNDING);
-          const __m128i s2_14_4 = _mm_add_epi32(s2_14_2, k__DCT_CONST_ROUNDING);
-          const __m128i s2_14_5 = _mm_add_epi32(s2_14_3, k__DCT_CONST_ROUNDING);
-          const __m128i s2_09_6 = _mm_srai_epi32(s2_09_4, DCT_CONST_BITS);
-          const __m128i s2_09_7 = _mm_srai_epi32(s2_09_5, DCT_CONST_BITS);
-          const __m128i s2_10_6 = _mm_srai_epi32(s2_10_4, DCT_CONST_BITS);
-          const __m128i s2_10_7 = _mm_srai_epi32(s2_10_5, DCT_CONST_BITS);
-          const __m128i s2_13_6 = _mm_srai_epi32(s2_13_4, DCT_CONST_BITS);
-          const __m128i s2_13_7 = _mm_srai_epi32(s2_13_5, DCT_CONST_BITS);
-          const __m128i s2_14_6 = _mm_srai_epi32(s2_14_4, DCT_CONST_BITS);
-          const __m128i s2_14_7 = _mm_srai_epi32(s2_14_5, DCT_CONST_BITS);
-          // Combine
-          step2[9] = _mm_packs_epi32(s2_09_6, s2_09_7);
-          step2[10] = _mm_packs_epi32(s2_10_6, s2_10_7);
-          step2[13] = _mm_packs_epi32(s2_13_6, s2_13_7);
-          step2[14] = _mm_packs_epi32(s2_14_6, s2_14_7);
-#if DCT_HIGH_BIT_DEPTH
-          overflow = check_epi16_overflow_x4(&step2[9], &step2[10], &step2[13],
-                                             &step2[14]);
-          if (overflow) {
-            if (pass == 0)
-              HIGH_FDCT32x32_2D_C(input, output_org, stride);
-            else
-              HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
-            return;
-          }
-#endif  // DCT_HIGH_BIT_DEPTH
-        }
-        {
-          step2[16] = ADD_EPI16(step1[19], step3[16]);
-          step2[17] = ADD_EPI16(step1[18], step3[17]);
-          step2[18] = SUB_EPI16(step3[17], step1[18]);
-          step2[19] = SUB_EPI16(step3[16], step1[19]);
-          step2[20] = SUB_EPI16(step3[23], step1[20]);
-          step2[21] = SUB_EPI16(step3[22], step1[21]);
-          step2[22] = ADD_EPI16(step1[21], step3[22]);
-          step2[23] = ADD_EPI16(step1[20], step3[23]);
-          step2[24] = ADD_EPI16(step1[27], step3[24]);
-          step2[25] = ADD_EPI16(step1[26], step3[25]);
-          step2[26] = SUB_EPI16(step3[25], step1[26]);
-          step2[27] = SUB_EPI16(step3[24], step1[27]);
-          step2[28] = SUB_EPI16(step3[31], step1[28]);
-          step2[29] = SUB_EPI16(step3[30], step1[29]);
-          step2[30] = ADD_EPI16(step1[29], step3[30]);
-          step2[31] = ADD_EPI16(step1[28], step3[31]);
-#if DCT_HIGH_BIT_DEPTH
-          overflow = check_epi16_overflow_x16(
-              &step2[16], &step2[17], &step2[18], &step2[19], &step2[20],
-              &step2[21], &step2[22], &step2[23], &step2[24], &step2[25],
-              &step2[26], &step2[27], &step2[28], &step2[29], &step2[30],
-              &step2[31]);
-          if (overflow) {
-            if (pass == 0)
-              HIGH_FDCT32x32_2D_C(input, output_org, stride);
-            else
-              HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
-            return;
-          }
-#endif  // DCT_HIGH_BIT_DEPTH
-        }
-        // Stage 6
-        {
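-          // Produces DCT outputs 4, 12, 20 and 28 by rotating
-          // (step2[4], step2[7]) with the (cospi_28_64, cospi_4_64) pair and
-          // (step2[5], step2[6]) with the (cospi_12_64, cospi_20_64) pair.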
-          const __m128i out_04_0 = _mm_unpacklo_epi16(step2[4], step2[7]);
-          const __m128i out_04_1 = _mm_unpackhi_epi16(step2[4], step2[7]);
-          const __m128i out_20_0 = _mm_unpacklo_epi16(step2[5], step2[6]);
-          const __m128i out_20_1 = _mm_unpackhi_epi16(step2[5], step2[6]);
-          const __m128i out_12_0 = _mm_unpacklo_epi16(step2[5], step2[6]);
-          const __m128i out_12_1 = _mm_unpackhi_epi16(step2[5], step2[6]);
-          const __m128i out_28_0 = _mm_unpacklo_epi16(step2[4], step2[7]);
-          const __m128i out_28_1 = _mm_unpackhi_epi16(step2[4], step2[7]);
-          const __m128i out_04_2 = _mm_madd_epi16(out_04_0, k__cospi_p28_p04);
-          const __m128i out_04_3 = _mm_madd_epi16(out_04_1, k__cospi_p28_p04);
-          const __m128i out_20_2 = _mm_madd_epi16(out_20_0, k__cospi_p12_p20);
-          const __m128i out_20_3 = _mm_madd_epi16(out_20_1, k__cospi_p12_p20);
-          const __m128i out_12_2 = _mm_madd_epi16(out_12_0, k__cospi_m20_p12);
-          const __m128i out_12_3 = _mm_madd_epi16(out_12_1, k__cospi_m20_p12);
-          const __m128i out_28_2 = _mm_madd_epi16(out_28_0, k__cospi_m04_p28);
-          const __m128i out_28_3 = _mm_madd_epi16(out_28_1, k__cospi_m04_p28);
-          // dct_const_round_shift
-          const __m128i out_04_4 =
-              _mm_add_epi32(out_04_2, k__DCT_CONST_ROUNDING);
-          const __m128i out_04_5 =
-              _mm_add_epi32(out_04_3, k__DCT_CONST_ROUNDING);
-          const __m128i out_20_4 =
-              _mm_add_epi32(out_20_2, k__DCT_CONST_ROUNDING);
-          const __m128i out_20_5 =
-              _mm_add_epi32(out_20_3, k__DCT_CONST_ROUNDING);
-          const __m128i out_12_4 =
-              _mm_add_epi32(out_12_2, k__DCT_CONST_ROUNDING);
-          const __m128i out_12_5 =
-              _mm_add_epi32(out_12_3, k__DCT_CONST_ROUNDING);
-          const __m128i out_28_4 =
-              _mm_add_epi32(out_28_2, k__DCT_CONST_ROUNDING);
-          const __m128i out_28_5 =
-              _mm_add_epi32(out_28_3, k__DCT_CONST_ROUNDING);
-          const __m128i out_04_6 = _mm_srai_epi32(out_04_4, DCT_CONST_BITS);
-          const __m128i out_04_7 = _mm_srai_epi32(out_04_5, DCT_CONST_BITS);
-          const __m128i out_20_6 = _mm_srai_epi32(out_20_4, DCT_CONST_BITS);
-          const __m128i out_20_7 = _mm_srai_epi32(out_20_5, DCT_CONST_BITS);
-          const __m128i out_12_6 = _mm_srai_epi32(out_12_4, DCT_CONST_BITS);
-          const __m128i out_12_7 = _mm_srai_epi32(out_12_5, DCT_CONST_BITS);
-          const __m128i out_28_6 = _mm_srai_epi32(out_28_4, DCT_CONST_BITS);
-          const __m128i out_28_7 = _mm_srai_epi32(out_28_5, DCT_CONST_BITS);
-          // Combine
-          out[4] = _mm_packs_epi32(out_04_6, out_04_7);
-          out[20] = _mm_packs_epi32(out_20_6, out_20_7);
-          out[12] = _mm_packs_epi32(out_12_6, out_12_7);
-          out[28] = _mm_packs_epi32(out_28_6, out_28_7);
-#if DCT_HIGH_BIT_DEPTH
-          overflow =
-              check_epi16_overflow_x4(&out[4], &out[20], &out[12], &out[28]);
-          if (overflow) {
-            if (pass == 0)
-              HIGH_FDCT32x32_2D_C(input, output_org, stride);
-            else
-              HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
-            return;
-          }
-#endif  // DCT_HIGH_BIT_DEPTH
-        }
-        {
-          step3[8] = ADD_EPI16(step2[9], step1[8]);
-          step3[9] = SUB_EPI16(step1[8], step2[9]);
-          step3[10] = SUB_EPI16(step1[11], step2[10]);
-          step3[11] = ADD_EPI16(step2[10], step1[11]);
-          step3[12] = ADD_EPI16(step2[13], step1[12]);
-          step3[13] = SUB_EPI16(step1[12], step2[13]);
-          step3[14] = SUB_EPI16(step1[15], step2[14]);
-          step3[15] = ADD_EPI16(step2[14], step1[15]);
-#if DCT_HIGH_BIT_DEPTH
-          overflow = check_epi16_overflow_x8(&step3[8], &step3[9], &step3[10],
-                                             &step3[11], &step3[12], &step3[13],
-                                             &step3[14], &step3[15]);
-          if (overflow) {
-            if (pass == 0)
-              HIGH_FDCT32x32_2D_C(input, output_org, stride);
-            else
-              HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
-            return;
-          }
-#endif  // DCT_HIGH_BIT_DEPTH
-        }
-        {
-          const __m128i s3_17_0 = _mm_unpacklo_epi16(step2[17], step2[30]);
-          const __m128i s3_17_1 = _mm_unpackhi_epi16(step2[17], step2[30]);
-          const __m128i s3_18_0 = _mm_unpacklo_epi16(step2[18], step2[29]);
-          const __m128i s3_18_1 = _mm_unpackhi_epi16(step2[18], step2[29]);
-          const __m128i s3_21_0 = _mm_unpacklo_epi16(step2[21], step2[26]);
-          const __m128i s3_21_1 = _mm_unpackhi_epi16(step2[21], step2[26]);
-          const __m128i s3_22_0 = _mm_unpacklo_epi16(step2[22], step2[25]);
-          const __m128i s3_22_1 = _mm_unpackhi_epi16(step2[22], step2[25]);
-          const __m128i s3_17_2 = _mm_madd_epi16(s3_17_0, k__cospi_m04_p28);
-          const __m128i s3_17_3 = _mm_madd_epi16(s3_17_1, k__cospi_m04_p28);
-          const __m128i s3_18_2 = _mm_madd_epi16(s3_18_0, k__cospi_m28_m04);
-          const __m128i s3_18_3 = _mm_madd_epi16(s3_18_1, k__cospi_m28_m04);
-          const __m128i s3_21_2 = _mm_madd_epi16(s3_21_0, k__cospi_m20_p12);
-          const __m128i s3_21_3 = _mm_madd_epi16(s3_21_1, k__cospi_m20_p12);
-          const __m128i s3_22_2 = _mm_madd_epi16(s3_22_0, k__cospi_m12_m20);
-          const __m128i s3_22_3 = _mm_madd_epi16(s3_22_1, k__cospi_m12_m20);
-          const __m128i s3_25_2 = _mm_madd_epi16(s3_22_0, k__cospi_m20_p12);
-          const __m128i s3_25_3 = _mm_madd_epi16(s3_22_1, k__cospi_m20_p12);
-          const __m128i s3_26_2 = _mm_madd_epi16(s3_21_0, k__cospi_p12_p20);
-          const __m128i s3_26_3 = _mm_madd_epi16(s3_21_1, k__cospi_p12_p20);
-          const __m128i s3_29_2 = _mm_madd_epi16(s3_18_0, k__cospi_m04_p28);
-          const __m128i s3_29_3 = _mm_madd_epi16(s3_18_1, k__cospi_m04_p28);
-          const __m128i s3_30_2 = _mm_madd_epi16(s3_17_0, k__cospi_p28_p04);
-          const __m128i s3_30_3 = _mm_madd_epi16(s3_17_1, k__cospi_p28_p04);
-          // dct_const_round_shift
-          const __m128i s3_17_4 = _mm_add_epi32(s3_17_2, k__DCT_CONST_ROUNDING);
-          const __m128i s3_17_5 = _mm_add_epi32(s3_17_3, k__DCT_CONST_ROUNDING);
-          const __m128i s3_18_4 = _mm_add_epi32(s3_18_2, k__DCT_CONST_ROUNDING);
-          const __m128i s3_18_5 = _mm_add_epi32(s3_18_3, k__DCT_CONST_ROUNDING);
-          const __m128i s3_21_4 = _mm_add_epi32(s3_21_2, k__DCT_CONST_ROUNDING);
-          const __m128i s3_21_5 = _mm_add_epi32(s3_21_3, k__DCT_CONST_ROUNDING);
-          const __m128i s3_22_4 = _mm_add_epi32(s3_22_2, k__DCT_CONST_ROUNDING);
-          const __m128i s3_22_5 = _mm_add_epi32(s3_22_3, k__DCT_CONST_ROUNDING);
-          const __m128i s3_17_6 = _mm_srai_epi32(s3_17_4, DCT_CONST_BITS);
-          const __m128i s3_17_7 = _mm_srai_epi32(s3_17_5, DCT_CONST_BITS);
-          const __m128i s3_18_6 = _mm_srai_epi32(s3_18_4, DCT_CONST_BITS);
-          const __m128i s3_18_7 = _mm_srai_epi32(s3_18_5, DCT_CONST_BITS);
-          const __m128i s3_21_6 = _mm_srai_epi32(s3_21_4, DCT_CONST_BITS);
-          const __m128i s3_21_7 = _mm_srai_epi32(s3_21_5, DCT_CONST_BITS);
-          const __m128i s3_22_6 = _mm_srai_epi32(s3_22_4, DCT_CONST_BITS);
-          const __m128i s3_22_7 = _mm_srai_epi32(s3_22_5, DCT_CONST_BITS);
-          const __m128i s3_25_4 = _mm_add_epi32(s3_25_2, k__DCT_CONST_ROUNDING);
-          const __m128i s3_25_5 = _mm_add_epi32(s3_25_3, k__DCT_CONST_ROUNDING);
-          const __m128i s3_26_4 = _mm_add_epi32(s3_26_2, k__DCT_CONST_ROUNDING);
-          const __m128i s3_26_5 = _mm_add_epi32(s3_26_3, k__DCT_CONST_ROUNDING);
-          const __m128i s3_29_4 = _mm_add_epi32(s3_29_2, k__DCT_CONST_ROUNDING);
-          const __m128i s3_29_5 = _mm_add_epi32(s3_29_3, k__DCT_CONST_ROUNDING);
-          const __m128i s3_30_4 = _mm_add_epi32(s3_30_2, k__DCT_CONST_ROUNDING);
-          const __m128i s3_30_5 = _mm_add_epi32(s3_30_3, k__DCT_CONST_ROUNDING);
-          const __m128i s3_25_6 = _mm_srai_epi32(s3_25_4, DCT_CONST_BITS);
-          const __m128i s3_25_7 = _mm_srai_epi32(s3_25_5, DCT_CONST_BITS);
-          const __m128i s3_26_6 = _mm_srai_epi32(s3_26_4, DCT_CONST_BITS);
-          const __m128i s3_26_7 = _mm_srai_epi32(s3_26_5, DCT_CONST_BITS);
-          const __m128i s3_29_6 = _mm_srai_epi32(s3_29_4, DCT_CONST_BITS);
-          const __m128i s3_29_7 = _mm_srai_epi32(s3_29_5, DCT_CONST_BITS);
-          const __m128i s3_30_6 = _mm_srai_epi32(s3_30_4, DCT_CONST_BITS);
-          const __m128i s3_30_7 = _mm_srai_epi32(s3_30_5, DCT_CONST_BITS);
-          // Combine
-          step3[17] = _mm_packs_epi32(s3_17_6, s3_17_7);
-          step3[18] = _mm_packs_epi32(s3_18_6, s3_18_7);
-          step3[21] = _mm_packs_epi32(s3_21_6, s3_21_7);
-          step3[22] = _mm_packs_epi32(s3_22_6, s3_22_7);
-          step3[25] = _mm_packs_epi32(s3_25_6, s3_25_7);
-          step3[26] = _mm_packs_epi32(s3_26_6, s3_26_7);
-          step3[29] = _mm_packs_epi32(s3_29_6, s3_29_7);
-          step3[30] = _mm_packs_epi32(s3_30_6, s3_30_7);
-#if DCT_HIGH_BIT_DEPTH
-          overflow = check_epi16_overflow_x8(&step3[17], &step3[18], &step3[21],
-                                             &step3[22], &step3[25], &step3[26],
-                                             &step3[29], &step3[30]);
-          if (overflow) {
-            if (pass == 0)
-              HIGH_FDCT32x32_2D_C(input, output_org, stride);
-            else
-              HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
-            return;
-          }
-#endif  // DCT_HIGH_BIT_DEPTH
-        }
-        // Stage 7
-        {
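-          // Produces the outputs congruent to 2 mod 4 (2, 6, 10, 14, 18, 22,
-          // 26, 30) from the eight values step3[8..15], pairing step3[8 + k]
-          // with step3[15 - k].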
-          const __m128i out_02_0 = _mm_unpacklo_epi16(step3[8], step3[15]);
-          const __m128i out_02_1 = _mm_unpackhi_epi16(step3[8], step3[15]);
-          const __m128i out_18_0 = _mm_unpacklo_epi16(step3[9], step3[14]);
-          const __m128i out_18_1 = _mm_unpackhi_epi16(step3[9], step3[14]);
-          const __m128i out_10_0 = _mm_unpacklo_epi16(step3[10], step3[13]);
-          const __m128i out_10_1 = _mm_unpackhi_epi16(step3[10], step3[13]);
-          const __m128i out_26_0 = _mm_unpacklo_epi16(step3[11], step3[12]);
-          const __m128i out_26_1 = _mm_unpackhi_epi16(step3[11], step3[12]);
-          const __m128i out_02_2 = _mm_madd_epi16(out_02_0, k__cospi_p30_p02);
-          const __m128i out_02_3 = _mm_madd_epi16(out_02_1, k__cospi_p30_p02);
-          const __m128i out_18_2 = _mm_madd_epi16(out_18_0, k__cospi_p14_p18);
-          const __m128i out_18_3 = _mm_madd_epi16(out_18_1, k__cospi_p14_p18);
-          const __m128i out_10_2 = _mm_madd_epi16(out_10_0, k__cospi_p22_p10);
-          const __m128i out_10_3 = _mm_madd_epi16(out_10_1, k__cospi_p22_p10);
-          const __m128i out_26_2 = _mm_madd_epi16(out_26_0, k__cospi_p06_p26);
-          const __m128i out_26_3 = _mm_madd_epi16(out_26_1, k__cospi_p06_p26);
-          const __m128i out_06_2 = _mm_madd_epi16(out_26_0, k__cospi_m26_p06);
-          const __m128i out_06_3 = _mm_madd_epi16(out_26_1, k__cospi_m26_p06);
-          const __m128i out_22_2 = _mm_madd_epi16(out_10_0, k__cospi_m10_p22);
-          const __m128i out_22_3 = _mm_madd_epi16(out_10_1, k__cospi_m10_p22);
-          const __m128i out_14_2 = _mm_madd_epi16(out_18_0, k__cospi_m18_p14);
-          const __m128i out_14_3 = _mm_madd_epi16(out_18_1, k__cospi_m18_p14);
-          const __m128i out_30_2 = _mm_madd_epi16(out_02_0, k__cospi_m02_p30);
-          const __m128i out_30_3 = _mm_madd_epi16(out_02_1, k__cospi_m02_p30);
-          // dct_const_round_shift
-          const __m128i out_02_4 =
-              _mm_add_epi32(out_02_2, k__DCT_CONST_ROUNDING);
-          const __m128i out_02_5 =
-              _mm_add_epi32(out_02_3, k__DCT_CONST_ROUNDING);
-          const __m128i out_18_4 =
-              _mm_add_epi32(out_18_2, k__DCT_CONST_ROUNDING);
-          const __m128i out_18_5 =
-              _mm_add_epi32(out_18_3, k__DCT_CONST_ROUNDING);
-          const __m128i out_10_4 =
-              _mm_add_epi32(out_10_2, k__DCT_CONST_ROUNDING);
-          const __m128i out_10_5 =
-              _mm_add_epi32(out_10_3, k__DCT_CONST_ROUNDING);
-          const __m128i out_26_4 =
-              _mm_add_epi32(out_26_2, k__DCT_CONST_ROUNDING);
-          const __m128i out_26_5 =
-              _mm_add_epi32(out_26_3, k__DCT_CONST_ROUNDING);
-          const __m128i out_06_4 =
-              _mm_add_epi32(out_06_2, k__DCT_CONST_ROUNDING);
-          const __m128i out_06_5 =
-              _mm_add_epi32(out_06_3, k__DCT_CONST_ROUNDING);
-          const __m128i out_22_4 =
-              _mm_add_epi32(out_22_2, k__DCT_CONST_ROUNDING);
-          const __m128i out_22_5 =
-              _mm_add_epi32(out_22_3, k__DCT_CONST_ROUNDING);
-          const __m128i out_14_4 =
-              _mm_add_epi32(out_14_2, k__DCT_CONST_ROUNDING);
-          const __m128i out_14_5 =
-              _mm_add_epi32(out_14_3, k__DCT_CONST_ROUNDING);
-          const __m128i out_30_4 =
-              _mm_add_epi32(out_30_2, k__DCT_CONST_ROUNDING);
-          const __m128i out_30_5 =
-              _mm_add_epi32(out_30_3, k__DCT_CONST_ROUNDING);
-          const __m128i out_02_6 = _mm_srai_epi32(out_02_4, DCT_CONST_BITS);
-          const __m128i out_02_7 = _mm_srai_epi32(out_02_5, DCT_CONST_BITS);
-          const __m128i out_18_6 = _mm_srai_epi32(out_18_4, DCT_CONST_BITS);
-          const __m128i out_18_7 = _mm_srai_epi32(out_18_5, DCT_CONST_BITS);
-          const __m128i out_10_6 = _mm_srai_epi32(out_10_4, DCT_CONST_BITS);
-          const __m128i out_10_7 = _mm_srai_epi32(out_10_5, DCT_CONST_BITS);
-          const __m128i out_26_6 = _mm_srai_epi32(out_26_4, DCT_CONST_BITS);
-          const __m128i out_26_7 = _mm_srai_epi32(out_26_5, DCT_CONST_BITS);
-          const __m128i out_06_6 = _mm_srai_epi32(out_06_4, DCT_CONST_BITS);
-          const __m128i out_06_7 = _mm_srai_epi32(out_06_5, DCT_CONST_BITS);
-          const __m128i out_22_6 = _mm_srai_epi32(out_22_4, DCT_CONST_BITS);
-          const __m128i out_22_7 = _mm_srai_epi32(out_22_5, DCT_CONST_BITS);
-          const __m128i out_14_6 = _mm_srai_epi32(out_14_4, DCT_CONST_BITS);
-          const __m128i out_14_7 = _mm_srai_epi32(out_14_5, DCT_CONST_BITS);
-          const __m128i out_30_6 = _mm_srai_epi32(out_30_4, DCT_CONST_BITS);
-          const __m128i out_30_7 = _mm_srai_epi32(out_30_5, DCT_CONST_BITS);
-          // Combine
-          out[2] = _mm_packs_epi32(out_02_6, out_02_7);
-          out[18] = _mm_packs_epi32(out_18_6, out_18_7);
-          out[10] = _mm_packs_epi32(out_10_6, out_10_7);
-          out[26] = _mm_packs_epi32(out_26_6, out_26_7);
-          out[6] = _mm_packs_epi32(out_06_6, out_06_7);
-          out[22] = _mm_packs_epi32(out_22_6, out_22_7);
-          out[14] = _mm_packs_epi32(out_14_6, out_14_7);
-          out[30] = _mm_packs_epi32(out_30_6, out_30_7);
-#if DCT_HIGH_BIT_DEPTH
-          overflow =
-              check_epi16_overflow_x8(&out[2], &out[18], &out[10], &out[26],
-                                      &out[6], &out[22], &out[14], &out[30]);
-          if (overflow) {
-            if (pass == 0)
-              HIGH_FDCT32x32_2D_C(input, output_org, stride);
-            else
-              HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
-            return;
-          }
-#endif  // DCT_HIGH_BIT_DEPTH
-        }
-        {
-          step1[16] = ADD_EPI16(step3[17], step2[16]);
-          step1[17] = SUB_EPI16(step2[16], step3[17]);
-          step1[18] = SUB_EPI16(step2[19], step3[18]);
-          step1[19] = ADD_EPI16(step3[18], step2[19]);
-          step1[20] = ADD_EPI16(step3[21], step2[20]);
-          step1[21] = SUB_EPI16(step2[20], step3[21]);
-          step1[22] = SUB_EPI16(step2[23], step3[22]);
-          step1[23] = ADD_EPI16(step3[22], step2[23]);
-          step1[24] = ADD_EPI16(step3[25], step2[24]);
-          step1[25] = SUB_EPI16(step2[24], step3[25]);
-          step1[26] = SUB_EPI16(step2[27], step3[26]);
-          step1[27] = ADD_EPI16(step3[26], step2[27]);
-          step1[28] = ADD_EPI16(step3[29], step2[28]);
-          step1[29] = SUB_EPI16(step2[28], step3[29]);
-          step1[30] = SUB_EPI16(step2[31], step3[30]);
-          step1[31] = ADD_EPI16(step3[30], step2[31]);
-#if DCT_HIGH_BIT_DEPTH
-          overflow = check_epi16_overflow_x16(
-              &step1[16], &step1[17], &step1[18], &step1[19], &step1[20],
-              &step1[21], &step1[22], &step1[23], &step1[24], &step1[25],
-              &step1[26], &step1[27], &step1[28], &step1[29], &step1[30],
-              &step1[31]);
-          if (overflow) {
-            if (pass == 0)
-              HIGH_FDCT32x32_2D_C(input, output_org, stride);
-            else
-              HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
-            return;
-          }
-#endif  // DCT_HIGH_BIT_DEPTH
-        }
-        // Final stage --- output indices are bit-reversed.
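-        // (the 5-bit reversal of the step1 index 16..31 gives the output
-        // slot: 16 -> 1, 17 -> 17, 18 -> 9, 19 -> 25, 20 -> 5, 21 -> 21, ...)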
-        {
-          const __m128i out_01_0 = _mm_unpacklo_epi16(step1[16], step1[31]);
-          const __m128i out_01_1 = _mm_unpackhi_epi16(step1[16], step1[31]);
-          const __m128i out_17_0 = _mm_unpacklo_epi16(step1[17], step1[30]);
-          const __m128i out_17_1 = _mm_unpackhi_epi16(step1[17], step1[30]);
-          const __m128i out_09_0 = _mm_unpacklo_epi16(step1[18], step1[29]);
-          const __m128i out_09_1 = _mm_unpackhi_epi16(step1[18], step1[29]);
-          const __m128i out_25_0 = _mm_unpacklo_epi16(step1[19], step1[28]);
-          const __m128i out_25_1 = _mm_unpackhi_epi16(step1[19], step1[28]);
-          const __m128i out_01_2 = _mm_madd_epi16(out_01_0, k__cospi_p31_p01);
-          const __m128i out_01_3 = _mm_madd_epi16(out_01_1, k__cospi_p31_p01);
-          const __m128i out_17_2 = _mm_madd_epi16(out_17_0, k__cospi_p15_p17);
-          const __m128i out_17_3 = _mm_madd_epi16(out_17_1, k__cospi_p15_p17);
-          const __m128i out_09_2 = _mm_madd_epi16(out_09_0, k__cospi_p23_p09);
-          const __m128i out_09_3 = _mm_madd_epi16(out_09_1, k__cospi_p23_p09);
-          const __m128i out_25_2 = _mm_madd_epi16(out_25_0, k__cospi_p07_p25);
-          const __m128i out_25_3 = _mm_madd_epi16(out_25_1, k__cospi_p07_p25);
-          const __m128i out_07_2 = _mm_madd_epi16(out_25_0, k__cospi_m25_p07);
-          const __m128i out_07_3 = _mm_madd_epi16(out_25_1, k__cospi_m25_p07);
-          const __m128i out_23_2 = _mm_madd_epi16(out_09_0, k__cospi_m09_p23);
-          const __m128i out_23_3 = _mm_madd_epi16(out_09_1, k__cospi_m09_p23);
-          const __m128i out_15_2 = _mm_madd_epi16(out_17_0, k__cospi_m17_p15);
-          const __m128i out_15_3 = _mm_madd_epi16(out_17_1, k__cospi_m17_p15);
-          const __m128i out_31_2 = _mm_madd_epi16(out_01_0, k__cospi_m01_p31);
-          const __m128i out_31_3 = _mm_madd_epi16(out_01_1, k__cospi_m01_p31);
-          // dct_const_round_shift
-          const __m128i out_01_4 =
-              _mm_add_epi32(out_01_2, k__DCT_CONST_ROUNDING);
-          const __m128i out_01_5 =
-              _mm_add_epi32(out_01_3, k__DCT_CONST_ROUNDING);
-          const __m128i out_17_4 =
-              _mm_add_epi32(out_17_2, k__DCT_CONST_ROUNDING);
-          const __m128i out_17_5 =
-              _mm_add_epi32(out_17_3, k__DCT_CONST_ROUNDING);
-          const __m128i out_09_4 =
-              _mm_add_epi32(out_09_2, k__DCT_CONST_ROUNDING);
-          const __m128i out_09_5 =
-              _mm_add_epi32(out_09_3, k__DCT_CONST_ROUNDING);
-          const __m128i out_25_4 =
-              _mm_add_epi32(out_25_2, k__DCT_CONST_ROUNDING);
-          const __m128i out_25_5 =
-              _mm_add_epi32(out_25_3, k__DCT_CONST_ROUNDING);
-          const __m128i out_07_4 =
-              _mm_add_epi32(out_07_2, k__DCT_CONST_ROUNDING);
-          const __m128i out_07_5 =
-              _mm_add_epi32(out_07_3, k__DCT_CONST_ROUNDING);
-          const __m128i out_23_4 =
-              _mm_add_epi32(out_23_2, k__DCT_CONST_ROUNDING);
-          const __m128i out_23_5 =
-              _mm_add_epi32(out_23_3, k__DCT_CONST_ROUNDING);
-          const __m128i out_15_4 =
-              _mm_add_epi32(out_15_2, k__DCT_CONST_ROUNDING);
-          const __m128i out_15_5 =
-              _mm_add_epi32(out_15_3, k__DCT_CONST_ROUNDING);
-          const __m128i out_31_4 =
-              _mm_add_epi32(out_31_2, k__DCT_CONST_ROUNDING);
-          const __m128i out_31_5 =
-              _mm_add_epi32(out_31_3, k__DCT_CONST_ROUNDING);
-          const __m128i out_01_6 = _mm_srai_epi32(out_01_4, DCT_CONST_BITS);
-          const __m128i out_01_7 = _mm_srai_epi32(out_01_5, DCT_CONST_BITS);
-          const __m128i out_17_6 = _mm_srai_epi32(out_17_4, DCT_CONST_BITS);
-          const __m128i out_17_7 = _mm_srai_epi32(out_17_5, DCT_CONST_BITS);
-          const __m128i out_09_6 = _mm_srai_epi32(out_09_4, DCT_CONST_BITS);
-          const __m128i out_09_7 = _mm_srai_epi32(out_09_5, DCT_CONST_BITS);
-          const __m128i out_25_6 = _mm_srai_epi32(out_25_4, DCT_CONST_BITS);
-          const __m128i out_25_7 = _mm_srai_epi32(out_25_5, DCT_CONST_BITS);
-          const __m128i out_07_6 = _mm_srai_epi32(out_07_4, DCT_CONST_BITS);
-          const __m128i out_07_7 = _mm_srai_epi32(out_07_5, DCT_CONST_BITS);
-          const __m128i out_23_6 = _mm_srai_epi32(out_23_4, DCT_CONST_BITS);
-          const __m128i out_23_7 = _mm_srai_epi32(out_23_5, DCT_CONST_BITS);
-          const __m128i out_15_6 = _mm_srai_epi32(out_15_4, DCT_CONST_BITS);
-          const __m128i out_15_7 = _mm_srai_epi32(out_15_5, DCT_CONST_BITS);
-          const __m128i out_31_6 = _mm_srai_epi32(out_31_4, DCT_CONST_BITS);
-          const __m128i out_31_7 = _mm_srai_epi32(out_31_5, DCT_CONST_BITS);
-          // Combine
-          out[1] = _mm_packs_epi32(out_01_6, out_01_7);
-          out[17] = _mm_packs_epi32(out_17_6, out_17_7);
-          out[9] = _mm_packs_epi32(out_09_6, out_09_7);
-          out[25] = _mm_packs_epi32(out_25_6, out_25_7);
-          out[7] = _mm_packs_epi32(out_07_6, out_07_7);
-          out[23] = _mm_packs_epi32(out_23_6, out_23_7);
-          out[15] = _mm_packs_epi32(out_15_6, out_15_7);
-          out[31] = _mm_packs_epi32(out_31_6, out_31_7);
-#if DCT_HIGH_BIT_DEPTH
-          overflow =
-              check_epi16_overflow_x8(&out[1], &out[17], &out[9], &out[25],
-                                      &out[7], &out[23], &out[15], &out[31]);
-          if (overflow) {
-            if (pass == 0)
-              HIGH_FDCT32x32_2D_C(input, output_org, stride);
-            else
-              HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
-            return;
-          }
-#endif  // DCT_HIGH_BIT_DEPTH
-        }
-        {
-          const __m128i out_05_0 = _mm_unpacklo_epi16(step1[20], step1[27]);
-          const __m128i out_05_1 = _mm_unpackhi_epi16(step1[20], step1[27]);
-          const __m128i out_21_0 = _mm_unpacklo_epi16(step1[21], step1[26]);
-          const __m128i out_21_1 = _mm_unpackhi_epi16(step1[21], step1[26]);
-          const __m128i out_13_0 = _mm_unpacklo_epi16(step1[22], step1[25]);
-          const __m128i out_13_1 = _mm_unpackhi_epi16(step1[22], step1[25]);
-          const __m128i out_29_0 = _mm_unpacklo_epi16(step1[23], step1[24]);
-          const __m128i out_29_1 = _mm_unpackhi_epi16(step1[23], step1[24]);
-          const __m128i out_05_2 = _mm_madd_epi16(out_05_0, k__cospi_p27_p05);
-          const __m128i out_05_3 = _mm_madd_epi16(out_05_1, k__cospi_p27_p05);
-          const __m128i out_21_2 = _mm_madd_epi16(out_21_0, k__cospi_p11_p21);
-          const __m128i out_21_3 = _mm_madd_epi16(out_21_1, k__cospi_p11_p21);
-          const __m128i out_13_2 = _mm_madd_epi16(out_13_0, k__cospi_p19_p13);
-          const __m128i out_13_3 = _mm_madd_epi16(out_13_1, k__cospi_p19_p13);
-          const __m128i out_29_2 = _mm_madd_epi16(out_29_0, k__cospi_p03_p29);
-          const __m128i out_29_3 = _mm_madd_epi16(out_29_1, k__cospi_p03_p29);
-          const __m128i out_03_2 = _mm_madd_epi16(out_29_0, k__cospi_m29_p03);
-          const __m128i out_03_3 = _mm_madd_epi16(out_29_1, k__cospi_m29_p03);
-          const __m128i out_19_2 = _mm_madd_epi16(out_13_0, k__cospi_m13_p19);
-          const __m128i out_19_3 = _mm_madd_epi16(out_13_1, k__cospi_m13_p19);
-          const __m128i out_11_2 = _mm_madd_epi16(out_21_0, k__cospi_m21_p11);
-          const __m128i out_11_3 = _mm_madd_epi16(out_21_1, k__cospi_m21_p11);
-          const __m128i out_27_2 = _mm_madd_epi16(out_05_0, k__cospi_m05_p27);
-          const __m128i out_27_3 = _mm_madd_epi16(out_05_1, k__cospi_m05_p27);
-          // dct_const_round_shift
-          const __m128i out_05_4 =
-              _mm_add_epi32(out_05_2, k__DCT_CONST_ROUNDING);
-          const __m128i out_05_5 =
-              _mm_add_epi32(out_05_3, k__DCT_CONST_ROUNDING);
-          const __m128i out_21_4 =
-              _mm_add_epi32(out_21_2, k__DCT_CONST_ROUNDING);
-          const __m128i out_21_5 =
-              _mm_add_epi32(out_21_3, k__DCT_CONST_ROUNDING);
-          const __m128i out_13_4 =
-              _mm_add_epi32(out_13_2, k__DCT_CONST_ROUNDING);
-          const __m128i out_13_5 =
-              _mm_add_epi32(out_13_3, k__DCT_CONST_ROUNDING);
-          const __m128i out_29_4 =
-              _mm_add_epi32(out_29_2, k__DCT_CONST_ROUNDING);
-          const __m128i out_29_5 =
-              _mm_add_epi32(out_29_3, k__DCT_CONST_ROUNDING);
-          const __m128i out_03_4 =
-              _mm_add_epi32(out_03_2, k__DCT_CONST_ROUNDING);
-          const __m128i out_03_5 =
-              _mm_add_epi32(out_03_3, k__DCT_CONST_ROUNDING);
-          const __m128i out_19_4 =
-              _mm_add_epi32(out_19_2, k__DCT_CONST_ROUNDING);
-          const __m128i out_19_5 =
-              _mm_add_epi32(out_19_3, k__DCT_CONST_ROUNDING);
-          const __m128i out_11_4 =
-              _mm_add_epi32(out_11_2, k__DCT_CONST_ROUNDING);
-          const __m128i out_11_5 =
-              _mm_add_epi32(out_11_3, k__DCT_CONST_ROUNDING);
-          const __m128i out_27_4 =
-              _mm_add_epi32(out_27_2, k__DCT_CONST_ROUNDING);
-          const __m128i out_27_5 =
-              _mm_add_epi32(out_27_3, k__DCT_CONST_ROUNDING);
-          const __m128i out_05_6 = _mm_srai_epi32(out_05_4, DCT_CONST_BITS);
-          const __m128i out_05_7 = _mm_srai_epi32(out_05_5, DCT_CONST_BITS);
-          const __m128i out_21_6 = _mm_srai_epi32(out_21_4, DCT_CONST_BITS);
-          const __m128i out_21_7 = _mm_srai_epi32(out_21_5, DCT_CONST_BITS);
-          const __m128i out_13_6 = _mm_srai_epi32(out_13_4, DCT_CONST_BITS);
-          const __m128i out_13_7 = _mm_srai_epi32(out_13_5, DCT_CONST_BITS);
-          const __m128i out_29_6 = _mm_srai_epi32(out_29_4, DCT_CONST_BITS);
-          const __m128i out_29_7 = _mm_srai_epi32(out_29_5, DCT_CONST_BITS);
-          const __m128i out_03_6 = _mm_srai_epi32(out_03_4, DCT_CONST_BITS);
-          const __m128i out_03_7 = _mm_srai_epi32(out_03_5, DCT_CONST_BITS);
-          const __m128i out_19_6 = _mm_srai_epi32(out_19_4, DCT_CONST_BITS);
-          const __m128i out_19_7 = _mm_srai_epi32(out_19_5, DCT_CONST_BITS);
-          const __m128i out_11_6 = _mm_srai_epi32(out_11_4, DCT_CONST_BITS);
-          const __m128i out_11_7 = _mm_srai_epi32(out_11_5, DCT_CONST_BITS);
-          const __m128i out_27_6 = _mm_srai_epi32(out_27_4, DCT_CONST_BITS);
-          const __m128i out_27_7 = _mm_srai_epi32(out_27_5, DCT_CONST_BITS);
-          // Combine
-          out[5] = _mm_packs_epi32(out_05_6, out_05_7);
-          out[21] = _mm_packs_epi32(out_21_6, out_21_7);
-          out[13] = _mm_packs_epi32(out_13_6, out_13_7);
-          out[29] = _mm_packs_epi32(out_29_6, out_29_7);
-          out[3] = _mm_packs_epi32(out_03_6, out_03_7);
-          out[19] = _mm_packs_epi32(out_19_6, out_19_7);
-          out[11] = _mm_packs_epi32(out_11_6, out_11_7);
-          out[27] = _mm_packs_epi32(out_27_6, out_27_7);
-#if DCT_HIGH_BIT_DEPTH
-          overflow =
-              check_epi16_overflow_x8(&out[5], &out[21], &out[13], &out[29],
-                                      &out[3], &out[19], &out[11], &out[27]);
-          if (overflow) {
-            if (pass == 0)
-              HIGH_FDCT32x32_2D_C(input, output_org, stride);
-            else
-              HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
-            return;
-          }
-#endif  // DCT_HIGH_BIT_DEPTH
-        }
-#if FDCT32x32_HIGH_PRECISION
-      } else {
-        __m128i lstep1[64], lstep2[64], lstep3[64];
-        __m128i u[32], v[32], sign[16];
-        const __m128i K32One = _mm_set_epi32(1, 1, 1, 1);
-        // start using 32-bit operations
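-        // In the high-precision path the remaining stages are computed in
-        // 32-bit lanes, so intermediate values cannot saturate at 16 bits.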
-        // stage 3
-        {
-          // expanding to 32-bit lanes prior to the addition operations
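-          // (unpacking against zero and then taking madd with kOne
-          // sign-extends each int16 lane to int32 -- an SSE2-only
-          // substitute for the SSE4.1 _mm_cvtepi16_epi32)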
-          lstep2[0] = _mm_unpacklo_epi16(step2[0], kZero);
-          lstep2[1] = _mm_unpackhi_epi16(step2[0], kZero);
-          lstep2[2] = _mm_unpacklo_epi16(step2[1], kZero);
-          lstep2[3] = _mm_unpackhi_epi16(step2[1], kZero);
-          lstep2[4] = _mm_unpacklo_epi16(step2[2], kZero);
-          lstep2[5] = _mm_unpackhi_epi16(step2[2], kZero);
-          lstep2[6] = _mm_unpacklo_epi16(step2[3], kZero);
-          lstep2[7] = _mm_unpackhi_epi16(step2[3], kZero);
-          lstep2[8] = _mm_unpacklo_epi16(step2[4], kZero);
-          lstep2[9] = _mm_unpackhi_epi16(step2[4], kZero);
-          lstep2[10] = _mm_unpacklo_epi16(step2[5], kZero);
-          lstep2[11] = _mm_unpackhi_epi16(step2[5], kZero);
-          lstep2[12] = _mm_unpacklo_epi16(step2[6], kZero);
-          lstep2[13] = _mm_unpackhi_epi16(step2[6], kZero);
-          lstep2[14] = _mm_unpacklo_epi16(step2[7], kZero);
-          lstep2[15] = _mm_unpackhi_epi16(step2[7], kZero);
-          lstep2[0] = _mm_madd_epi16(lstep2[0], kOne);
-          lstep2[1] = _mm_madd_epi16(lstep2[1], kOne);
-          lstep2[2] = _mm_madd_epi16(lstep2[2], kOne);
-          lstep2[3] = _mm_madd_epi16(lstep2[3], kOne);
-          lstep2[4] = _mm_madd_epi16(lstep2[4], kOne);
-          lstep2[5] = _mm_madd_epi16(lstep2[5], kOne);
-          lstep2[6] = _mm_madd_epi16(lstep2[6], kOne);
-          lstep2[7] = _mm_madd_epi16(lstep2[7], kOne);
-          lstep2[8] = _mm_madd_epi16(lstep2[8], kOne);
-          lstep2[9] = _mm_madd_epi16(lstep2[9], kOne);
-          lstep2[10] = _mm_madd_epi16(lstep2[10], kOne);
-          lstep2[11] = _mm_madd_epi16(lstep2[11], kOne);
-          lstep2[12] = _mm_madd_epi16(lstep2[12], kOne);
-          lstep2[13] = _mm_madd_epi16(lstep2[13], kOne);
-          lstep2[14] = _mm_madd_epi16(lstep2[14], kOne);
-          lstep2[15] = _mm_madd_epi16(lstep2[15], kOne);
-
-          lstep3[0] = _mm_add_epi32(lstep2[14], lstep2[0]);
-          lstep3[1] = _mm_add_epi32(lstep2[15], lstep2[1]);
-          lstep3[2] = _mm_add_epi32(lstep2[12], lstep2[2]);
-          lstep3[3] = _mm_add_epi32(lstep2[13], lstep2[3]);
-          lstep3[4] = _mm_add_epi32(lstep2[10], lstep2[4]);
-          lstep3[5] = _mm_add_epi32(lstep2[11], lstep2[5]);
-          lstep3[6] = _mm_add_epi32(lstep2[8], lstep2[6]);
-          lstep3[7] = _mm_add_epi32(lstep2[9], lstep2[7]);
-          lstep3[8] = _mm_sub_epi32(lstep2[6], lstep2[8]);
-          lstep3[9] = _mm_sub_epi32(lstep2[7], lstep2[9]);
-          lstep3[10] = _mm_sub_epi32(lstep2[4], lstep2[10]);
-          lstep3[11] = _mm_sub_epi32(lstep2[5], lstep2[11]);
-          lstep3[12] = _mm_sub_epi32(lstep2[2], lstep2[12]);
-          lstep3[13] = _mm_sub_epi32(lstep2[3], lstep2[13]);
-          lstep3[14] = _mm_sub_epi32(lstep2[0], lstep2[14]);
-          lstep3[15] = _mm_sub_epi32(lstep2[1], lstep2[15]);
-        }
-        {
-          const __m128i s3_10_0 = _mm_unpacklo_epi16(step2[13], step2[10]);
-          const __m128i s3_10_1 = _mm_unpackhi_epi16(step2[13], step2[10]);
-          const __m128i s3_11_0 = _mm_unpacklo_epi16(step2[12], step2[11]);
-          const __m128i s3_11_1 = _mm_unpackhi_epi16(step2[12], step2[11]);
-          const __m128i s3_10_2 = _mm_madd_epi16(s3_10_0, k__cospi_p16_m16);
-          const __m128i s3_10_3 = _mm_madd_epi16(s3_10_1, k__cospi_p16_m16);
-          const __m128i s3_11_2 = _mm_madd_epi16(s3_11_0, k__cospi_p16_m16);
-          const __m128i s3_11_3 = _mm_madd_epi16(s3_11_1, k__cospi_p16_m16);
-          const __m128i s3_12_2 = _mm_madd_epi16(s3_11_0, k__cospi_p16_p16);
-          const __m128i s3_12_3 = _mm_madd_epi16(s3_11_1, k__cospi_p16_p16);
-          const __m128i s3_13_2 = _mm_madd_epi16(s3_10_0, k__cospi_p16_p16);
-          const __m128i s3_13_3 = _mm_madd_epi16(s3_10_1, k__cospi_p16_p16);
-          // dct_const_round_shift
-          const __m128i s3_10_4 = _mm_add_epi32(s3_10_2, k__DCT_CONST_ROUNDING);
-          const __m128i s3_10_5 = _mm_add_epi32(s3_10_3, k__DCT_CONST_ROUNDING);
-          const __m128i s3_11_4 = _mm_add_epi32(s3_11_2, k__DCT_CONST_ROUNDING);
-          const __m128i s3_11_5 = _mm_add_epi32(s3_11_3, k__DCT_CONST_ROUNDING);
-          const __m128i s3_12_4 = _mm_add_epi32(s3_12_2, k__DCT_CONST_ROUNDING);
-          const __m128i s3_12_5 = _mm_add_epi32(s3_12_3, k__DCT_CONST_ROUNDING);
-          const __m128i s3_13_4 = _mm_add_epi32(s3_13_2, k__DCT_CONST_ROUNDING);
-          const __m128i s3_13_5 = _mm_add_epi32(s3_13_3, k__DCT_CONST_ROUNDING);
-          lstep3[20] = _mm_srai_epi32(s3_10_4, DCT_CONST_BITS);
-          lstep3[21] = _mm_srai_epi32(s3_10_5, DCT_CONST_BITS);
-          lstep3[22] = _mm_srai_epi32(s3_11_4, DCT_CONST_BITS);
-          lstep3[23] = _mm_srai_epi32(s3_11_5, DCT_CONST_BITS);
-          lstep3[24] = _mm_srai_epi32(s3_12_4, DCT_CONST_BITS);
-          lstep3[25] = _mm_srai_epi32(s3_12_5, DCT_CONST_BITS);
-          lstep3[26] = _mm_srai_epi32(s3_13_4, DCT_CONST_BITS);
-          lstep3[27] = _mm_srai_epi32(s3_13_5, DCT_CONST_BITS);
-        }
-        {
-          lstep2[40] = _mm_unpacklo_epi16(step2[20], kZero);
-          lstep2[41] = _mm_unpackhi_epi16(step2[20], kZero);
-          lstep2[42] = _mm_unpacklo_epi16(step2[21], kZero);
-          lstep2[43] = _mm_unpackhi_epi16(step2[21], kZero);
-          lstep2[44] = _mm_unpacklo_epi16(step2[22], kZero);
-          lstep2[45] = _mm_unpackhi_epi16(step2[22], kZero);
-          lstep2[46] = _mm_unpacklo_epi16(step2[23], kZero);
-          lstep2[47] = _mm_unpackhi_epi16(step2[23], kZero);
-          lstep2[48] = _mm_unpacklo_epi16(step2[24], kZero);
-          lstep2[49] = _mm_unpackhi_epi16(step2[24], kZero);
-          lstep2[50] = _mm_unpacklo_epi16(step2[25], kZero);
-          lstep2[51] = _mm_unpackhi_epi16(step2[25], kZero);
-          lstep2[52] = _mm_unpacklo_epi16(step2[26], kZero);
-          lstep2[53] = _mm_unpackhi_epi16(step2[26], kZero);
-          lstep2[54] = _mm_unpacklo_epi16(step2[27], kZero);
-          lstep2[55] = _mm_unpackhi_epi16(step2[27], kZero);
-          lstep2[40] = _mm_madd_epi16(lstep2[40], kOne);
-          lstep2[41] = _mm_madd_epi16(lstep2[41], kOne);
-          lstep2[42] = _mm_madd_epi16(lstep2[42], kOne);
-          lstep2[43] = _mm_madd_epi16(lstep2[43], kOne);
-          lstep2[44] = _mm_madd_epi16(lstep2[44], kOne);
-          lstep2[45] = _mm_madd_epi16(lstep2[45], kOne);
-          lstep2[46] = _mm_madd_epi16(lstep2[46], kOne);
-          lstep2[47] = _mm_madd_epi16(lstep2[47], kOne);
-          lstep2[48] = _mm_madd_epi16(lstep2[48], kOne);
-          lstep2[49] = _mm_madd_epi16(lstep2[49], kOne);
-          lstep2[50] = _mm_madd_epi16(lstep2[50], kOne);
-          lstep2[51] = _mm_madd_epi16(lstep2[51], kOne);
-          lstep2[52] = _mm_madd_epi16(lstep2[52], kOne);
-          lstep2[53] = _mm_madd_epi16(lstep2[53], kOne);
-          lstep2[54] = _mm_madd_epi16(lstep2[54], kOne);
-          lstep2[55] = _mm_madd_epi16(lstep2[55], kOne);
-
-          lstep1[32] = _mm_unpacklo_epi16(step1[16], kZero);
-          lstep1[33] = _mm_unpackhi_epi16(step1[16], kZero);
-          lstep1[34] = _mm_unpacklo_epi16(step1[17], kZero);
-          lstep1[35] = _mm_unpackhi_epi16(step1[17], kZero);
-          lstep1[36] = _mm_unpacklo_epi16(step1[18], kZero);
-          lstep1[37] = _mm_unpackhi_epi16(step1[18], kZero);
-          lstep1[38] = _mm_unpacklo_epi16(step1[19], kZero);
-          lstep1[39] = _mm_unpackhi_epi16(step1[19], kZero);
-          lstep1[56] = _mm_unpacklo_epi16(step1[28], kZero);
-          lstep1[57] = _mm_unpackhi_epi16(step1[28], kZero);
-          lstep1[58] = _mm_unpacklo_epi16(step1[29], kZero);
-          lstep1[59] = _mm_unpackhi_epi16(step1[29], kZero);
-          lstep1[60] = _mm_unpacklo_epi16(step1[30], kZero);
-          lstep1[61] = _mm_unpackhi_epi16(step1[30], kZero);
-          lstep1[62] = _mm_unpacklo_epi16(step1[31], kZero);
-          lstep1[63] = _mm_unpackhi_epi16(step1[31], kZero);
-          lstep1[32] = _mm_madd_epi16(lstep1[32], kOne);
-          lstep1[33] = _mm_madd_epi16(lstep1[33], kOne);
-          lstep1[34] = _mm_madd_epi16(lstep1[34], kOne);
-          lstep1[35] = _mm_madd_epi16(lstep1[35], kOne);
-          lstep1[36] = _mm_madd_epi16(lstep1[36], kOne);
-          lstep1[37] = _mm_madd_epi16(lstep1[37], kOne);
-          lstep1[38] = _mm_madd_epi16(lstep1[38], kOne);
-          lstep1[39] = _mm_madd_epi16(lstep1[39], kOne);
-          lstep1[56] = _mm_madd_epi16(lstep1[56], kOne);
-          lstep1[57] = _mm_madd_epi16(lstep1[57], kOne);
-          lstep1[58] = _mm_madd_epi16(lstep1[58], kOne);
-          lstep1[59] = _mm_madd_epi16(lstep1[59], kOne);
-          lstep1[60] = _mm_madd_epi16(lstep1[60], kOne);
-          lstep1[61] = _mm_madd_epi16(lstep1[61], kOne);
-          lstep1[62] = _mm_madd_epi16(lstep1[62], kOne);
-          lstep1[63] = _mm_madd_epi16(lstep1[63], kOne);
-
-          lstep3[32] = _mm_add_epi32(lstep2[46], lstep1[32]);
-          lstep3[33] = _mm_add_epi32(lstep2[47], lstep1[33]);
-
-          lstep3[34] = _mm_add_epi32(lstep2[44], lstep1[34]);
-          lstep3[35] = _mm_add_epi32(lstep2[45], lstep1[35]);
-          lstep3[36] = _mm_add_epi32(lstep2[42], lstep1[36]);
-          lstep3[37] = _mm_add_epi32(lstep2[43], lstep1[37]);
-          lstep3[38] = _mm_add_epi32(lstep2[40], lstep1[38]);
-          lstep3[39] = _mm_add_epi32(lstep2[41], lstep1[39]);
-          lstep3[40] = _mm_sub_epi32(lstep1[38], lstep2[40]);
-          lstep3[41] = _mm_sub_epi32(lstep1[39], lstep2[41]);
-          lstep3[42] = _mm_sub_epi32(lstep1[36], lstep2[42]);
-          lstep3[43] = _mm_sub_epi32(lstep1[37], lstep2[43]);
-          lstep3[44] = _mm_sub_epi32(lstep1[34], lstep2[44]);
-          lstep3[45] = _mm_sub_epi32(lstep1[35], lstep2[45]);
-          lstep3[46] = _mm_sub_epi32(lstep1[32], lstep2[46]);
-          lstep3[47] = _mm_sub_epi32(lstep1[33], lstep2[47]);
-          lstep3[48] = _mm_sub_epi32(lstep1[62], lstep2[48]);
-          lstep3[49] = _mm_sub_epi32(lstep1[63], lstep2[49]);
-          lstep3[50] = _mm_sub_epi32(lstep1[60], lstep2[50]);
-          lstep3[51] = _mm_sub_epi32(lstep1[61], lstep2[51]);
-          lstep3[52] = _mm_sub_epi32(lstep1[58], lstep2[52]);
-          lstep3[53] = _mm_sub_epi32(lstep1[59], lstep2[53]);
-          lstep3[54] = _mm_sub_epi32(lstep1[56], lstep2[54]);
-          lstep3[55] = _mm_sub_epi32(lstep1[57], lstep2[55]);
-          lstep3[56] = _mm_add_epi32(lstep2[54], lstep1[56]);
-          lstep3[57] = _mm_add_epi32(lstep2[55], lstep1[57]);
-          lstep3[58] = _mm_add_epi32(lstep2[52], lstep1[58]);
-          lstep3[59] = _mm_add_epi32(lstep2[53], lstep1[59]);
-          lstep3[60] = _mm_add_epi32(lstep2[50], lstep1[60]);
-          lstep3[61] = _mm_add_epi32(lstep2[51], lstep1[61]);
-          lstep3[62] = _mm_add_epi32(lstep2[48], lstep1[62]);
-          lstep3[63] = _mm_add_epi32(lstep2[49], lstep1[63]);
-        }
-
-        // stage 4
-        {
-          // expanding to 32-bit lanes prior to the addition operations
-          lstep2[16] = _mm_unpacklo_epi16(step2[8], kZero);
-          lstep2[17] = _mm_unpackhi_epi16(step2[8], kZero);
-          lstep2[18] = _mm_unpacklo_epi16(step2[9], kZero);
-          lstep2[19] = _mm_unpackhi_epi16(step2[9], kZero);
-          lstep2[28] = _mm_unpacklo_epi16(step2[14], kZero);
-          lstep2[29] = _mm_unpackhi_epi16(step2[14], kZero);
-          lstep2[30] = _mm_unpacklo_epi16(step2[15], kZero);
-          lstep2[31] = _mm_unpackhi_epi16(step2[15], kZero);
-          lstep2[16] = _mm_madd_epi16(lstep2[16], kOne);
-          lstep2[17] = _mm_madd_epi16(lstep2[17], kOne);
-          lstep2[18] = _mm_madd_epi16(lstep2[18], kOne);
-          lstep2[19] = _mm_madd_epi16(lstep2[19], kOne);
-          lstep2[28] = _mm_madd_epi16(lstep2[28], kOne);
-          lstep2[29] = _mm_madd_epi16(lstep2[29], kOne);
-          lstep2[30] = _mm_madd_epi16(lstep2[30], kOne);
-          lstep2[31] = _mm_madd_epi16(lstep2[31], kOne);
-
-          lstep1[0] = _mm_add_epi32(lstep3[6], lstep3[0]);
-          lstep1[1] = _mm_add_epi32(lstep3[7], lstep3[1]);
-          lstep1[2] = _mm_add_epi32(lstep3[4], lstep3[2]);
-          lstep1[3] = _mm_add_epi32(lstep3[5], lstep3[3]);
-          lstep1[4] = _mm_sub_epi32(lstep3[2], lstep3[4]);
-          lstep1[5] = _mm_sub_epi32(lstep3[3], lstep3[5]);
-          lstep1[6] = _mm_sub_epi32(lstep3[0], lstep3[6]);
-          lstep1[7] = _mm_sub_epi32(lstep3[1], lstep3[7]);
-          lstep1[16] = _mm_add_epi32(lstep3[22], lstep2[16]);
-          lstep1[17] = _mm_add_epi32(lstep3[23], lstep2[17]);
-          lstep1[18] = _mm_add_epi32(lstep3[20], lstep2[18]);
-          lstep1[19] = _mm_add_epi32(lstep3[21], lstep2[19]);
-          lstep1[20] = _mm_sub_epi32(lstep2[18], lstep3[20]);
-          lstep1[21] = _mm_sub_epi32(lstep2[19], lstep3[21]);
-          lstep1[22] = _mm_sub_epi32(lstep2[16], lstep3[22]);
-          lstep1[23] = _mm_sub_epi32(lstep2[17], lstep3[23]);
-          lstep1[24] = _mm_sub_epi32(lstep2[30], lstep3[24]);
-          lstep1[25] = _mm_sub_epi32(lstep2[31], lstep3[25]);
-          lstep1[26] = _mm_sub_epi32(lstep2[28], lstep3[26]);
-          lstep1[27] = _mm_sub_epi32(lstep2[29], lstep3[27]);
-          lstep1[28] = _mm_add_epi32(lstep3[26], lstep2[28]);
-          lstep1[29] = _mm_add_epi32(lstep3[27], lstep2[29]);
-          lstep1[30] = _mm_add_epi32(lstep3[24], lstep2[30]);
-          lstep1[31] = _mm_add_epi32(lstep3[25], lstep2[31]);
-        }
-        {
-          // to be continued...
-          //
-          const __m128i k32_p16_p16 = pair_set_epi32(cospi_16_64, cospi_16_64);
-          const __m128i k32_p16_m16 = pair_set_epi32(cospi_16_64, -cospi_16_64);
-
-          u[0] = _mm_unpacklo_epi32(lstep3[12], lstep3[10]);
-          u[1] = _mm_unpackhi_epi32(lstep3[12], lstep3[10]);
-          u[2] = _mm_unpacklo_epi32(lstep3[13], lstep3[11]);
-          u[3] = _mm_unpackhi_epi32(lstep3[13], lstep3[11]);
-
-          // TODO(jingning): manually inline k_madd_epi32_ to further hide
-          // instruction latency.
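-          // k_madd_epi32 is the 32-bit analogue of _mm_madd_epi16: per pair
-          // it forms u0 * k0 + u1 * k1 with 64-bit intermediates, and
-          // k_packs_epi64 narrows the 64-bit sums back into 32-bit lanes.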
-          v[0] = k_madd_epi32(u[0], k32_p16_m16);
-          v[1] = k_madd_epi32(u[1], k32_p16_m16);
-          v[2] = k_madd_epi32(u[2], k32_p16_m16);
-          v[3] = k_madd_epi32(u[3], k32_p16_m16);
-          v[4] = k_madd_epi32(u[0], k32_p16_p16);
-          v[5] = k_madd_epi32(u[1], k32_p16_p16);
-          v[6] = k_madd_epi32(u[2], k32_p16_p16);
-          v[7] = k_madd_epi32(u[3], k32_p16_p16);
-#if DCT_HIGH_BIT_DEPTH
-          overflow = k_check_epi32_overflow_8(&v[0], &v[1], &v[2], &v[3], &v[4],
-                                              &v[5], &v[6], &v[7], &kZero);
-          if (overflow) {
-            HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
-            return;
-          }
-#endif  // DCT_HIGH_BIT_DEPTH
-          u[0] = k_packs_epi64(v[0], v[1]);
-          u[1] = k_packs_epi64(v[2], v[3]);
-          u[2] = k_packs_epi64(v[4], v[5]);
-          u[3] = k_packs_epi64(v[6], v[7]);
-
-          v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
-          v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
-          v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
-          v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
-
-          lstep1[10] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
-          lstep1[11] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
-          lstep1[12] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
-          lstep1[13] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
-        }
-        {
-          const __m128i k32_m08_p24 = pair_set_epi32(-cospi_8_64, cospi_24_64);
-          const __m128i k32_m24_m08 = pair_set_epi32(-cospi_24_64, -cospi_8_64);
-          const __m128i k32_p24_p08 = pair_set_epi32(cospi_24_64, cospi_8_64);
-
-          u[0] = _mm_unpacklo_epi32(lstep3[36], lstep3[58]);
-          u[1] = _mm_unpackhi_epi32(lstep3[36], lstep3[58]);
-          u[2] = _mm_unpacklo_epi32(lstep3[37], lstep3[59]);
-          u[3] = _mm_unpackhi_epi32(lstep3[37], lstep3[59]);
-          u[4] = _mm_unpacklo_epi32(lstep3[38], lstep3[56]);
-          u[5] = _mm_unpackhi_epi32(lstep3[38], lstep3[56]);
-          u[6] = _mm_unpacklo_epi32(lstep3[39], lstep3[57]);
-          u[7] = _mm_unpackhi_epi32(lstep3[39], lstep3[57]);
-          u[8] = _mm_unpacklo_epi32(lstep3[40], lstep3[54]);
-          u[9] = _mm_unpackhi_epi32(lstep3[40], lstep3[54]);
-          u[10] = _mm_unpacklo_epi32(lstep3[41], lstep3[55]);
-          u[11] = _mm_unpackhi_epi32(lstep3[41], lstep3[55]);
-          u[12] = _mm_unpacklo_epi32(lstep3[42], lstep3[52]);
-          u[13] = _mm_unpackhi_epi32(lstep3[42], lstep3[52]);
-          u[14] = _mm_unpacklo_epi32(lstep3[43], lstep3[53]);
-          u[15] = _mm_unpackhi_epi32(lstep3[43], lstep3[53]);
-
-          v[0] = k_madd_epi32(u[0], k32_m08_p24);
-          v[1] = k_madd_epi32(u[1], k32_m08_p24);
-          v[2] = k_madd_epi32(u[2], k32_m08_p24);
-          v[3] = k_madd_epi32(u[3], k32_m08_p24);
-          v[4] = k_madd_epi32(u[4], k32_m08_p24);
-          v[5] = k_madd_epi32(u[5], k32_m08_p24);
-          v[6] = k_madd_epi32(u[6], k32_m08_p24);
-          v[7] = k_madd_epi32(u[7], k32_m08_p24);
-          v[8] = k_madd_epi32(u[8], k32_m24_m08);
-          v[9] = k_madd_epi32(u[9], k32_m24_m08);
-          v[10] = k_madd_epi32(u[10], k32_m24_m08);
-          v[11] = k_madd_epi32(u[11], k32_m24_m08);
-          v[12] = k_madd_epi32(u[12], k32_m24_m08);
-          v[13] = k_madd_epi32(u[13], k32_m24_m08);
-          v[14] = k_madd_epi32(u[14], k32_m24_m08);
-          v[15] = k_madd_epi32(u[15], k32_m24_m08);
-          v[16] = k_madd_epi32(u[12], k32_m08_p24);
-          v[17] = k_madd_epi32(u[13], k32_m08_p24);
-          v[18] = k_madd_epi32(u[14], k32_m08_p24);
-          v[19] = k_madd_epi32(u[15], k32_m08_p24);
-          v[20] = k_madd_epi32(u[8], k32_m08_p24);
-          v[21] = k_madd_epi32(u[9], k32_m08_p24);
-          v[22] = k_madd_epi32(u[10], k32_m08_p24);
-          v[23] = k_madd_epi32(u[11], k32_m08_p24);
-          v[24] = k_madd_epi32(u[4], k32_p24_p08);
-          v[25] = k_madd_epi32(u[5], k32_p24_p08);
-          v[26] = k_madd_epi32(u[6], k32_p24_p08);
-          v[27] = k_madd_epi32(u[7], k32_p24_p08);
-          v[28] = k_madd_epi32(u[0], k32_p24_p08);
-          v[29] = k_madd_epi32(u[1], k32_p24_p08);
-          v[30] = k_madd_epi32(u[2], k32_p24_p08);
-          v[31] = k_madd_epi32(u[3], k32_p24_p08);
-
-#if DCT_HIGH_BIT_DEPTH
-          overflow = k_check_epi32_overflow_32(
-              &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7], &v[8],
-              &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15], &v[16],
-              &v[17], &v[18], &v[19], &v[20], &v[21], &v[22], &v[23], &v[24],
-              &v[25], &v[26], &v[27], &v[28], &v[29], &v[30], &v[31], &kZero);
-          if (overflow) {
-            HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
-            return;
-          }
-#endif  // DCT_HIGH_BIT_DEPTH
-          u[0] = k_packs_epi64(v[0], v[1]);
-          u[1] = k_packs_epi64(v[2], v[3]);
-          u[2] = k_packs_epi64(v[4], v[5]);
-          u[3] = k_packs_epi64(v[6], v[7]);
-          u[4] = k_packs_epi64(v[8], v[9]);
-          u[5] = k_packs_epi64(v[10], v[11]);
-          u[6] = k_packs_epi64(v[12], v[13]);
-          u[7] = k_packs_epi64(v[14], v[15]);
-          u[8] = k_packs_epi64(v[16], v[17]);
-          u[9] = k_packs_epi64(v[18], v[19]);
-          u[10] = k_packs_epi64(v[20], v[21]);
-          u[11] = k_packs_epi64(v[22], v[23]);
-          u[12] = k_packs_epi64(v[24], v[25]);
-          u[13] = k_packs_epi64(v[26], v[27]);
-          u[14] = k_packs_epi64(v[28], v[29]);
-          u[15] = k_packs_epi64(v[30], v[31]);
-
-          v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
-          v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
-          v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
-          v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
-          v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
-          v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
-          v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
-          v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
-          v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
-          v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
-          v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
-          v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
-          v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
-          v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
-          v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
-          v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
-
-          lstep1[36] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
-          lstep1[37] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
-          lstep1[38] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
-          lstep1[39] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
-          lstep1[40] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
-          lstep1[41] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
-          lstep1[42] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
-          lstep1[43] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
-          lstep1[52] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
-          lstep1[53] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
-          lstep1[54] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
-          lstep1[55] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
-          lstep1[56] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
-          lstep1[57] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
-          lstep1[58] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
-          lstep1[59] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
-        }
-        // stage 5
-        {
-          lstep2[8] = _mm_add_epi32(lstep1[10], lstep3[8]);
-          lstep2[9] = _mm_add_epi32(lstep1[11], lstep3[9]);
-          lstep2[10] = _mm_sub_epi32(lstep3[8], lstep1[10]);
-          lstep2[11] = _mm_sub_epi32(lstep3[9], lstep1[11]);
-          lstep2[12] = _mm_sub_epi32(lstep3[14], lstep1[12]);
-          lstep2[13] = _mm_sub_epi32(lstep3[15], lstep1[13]);
-          lstep2[14] = _mm_add_epi32(lstep1[12], lstep3[14]);
-          lstep2[15] = _mm_add_epi32(lstep1[13], lstep3[15]);
-        }
-        {
-          const __m128i k32_p16_p16 = pair_set_epi32(cospi_16_64, cospi_16_64);
-          const __m128i k32_p16_m16 = pair_set_epi32(cospi_16_64, -cospi_16_64);
-          const __m128i k32_p24_p08 = pair_set_epi32(cospi_24_64, cospi_8_64);
-          const __m128i k32_m08_p24 = pair_set_epi32(-cospi_8_64, cospi_24_64);
-
-          u[0] = _mm_unpacklo_epi32(lstep1[0], lstep1[2]);
-          u[1] = _mm_unpackhi_epi32(lstep1[0], lstep1[2]);
-          u[2] = _mm_unpacklo_epi32(lstep1[1], lstep1[3]);
-          u[3] = _mm_unpackhi_epi32(lstep1[1], lstep1[3]);
-          u[4] = _mm_unpacklo_epi32(lstep1[4], lstep1[6]);
-          u[5] = _mm_unpackhi_epi32(lstep1[4], lstep1[6]);
-          u[6] = _mm_unpacklo_epi32(lstep1[5], lstep1[7]);
-          u[7] = _mm_unpackhi_epi32(lstep1[5], lstep1[7]);
-
-          // TODO(jingning): manually inline k_madd_epi32_ to further hide
-          // instruction latency.
-          v[0] = k_madd_epi32(u[0], k32_p16_p16);
-          v[1] = k_madd_epi32(u[1], k32_p16_p16);
-          v[2] = k_madd_epi32(u[2], k32_p16_p16);
-          v[3] = k_madd_epi32(u[3], k32_p16_p16);
-          v[4] = k_madd_epi32(u[0], k32_p16_m16);
-          v[5] = k_madd_epi32(u[1], k32_p16_m16);
-          v[6] = k_madd_epi32(u[2], k32_p16_m16);
-          v[7] = k_madd_epi32(u[3], k32_p16_m16);
-          v[8] = k_madd_epi32(u[4], k32_p24_p08);
-          v[9] = k_madd_epi32(u[5], k32_p24_p08);
-          v[10] = k_madd_epi32(u[6], k32_p24_p08);
-          v[11] = k_madd_epi32(u[7], k32_p24_p08);
-          v[12] = k_madd_epi32(u[4], k32_m08_p24);
-          v[13] = k_madd_epi32(u[5], k32_m08_p24);
-          v[14] = k_madd_epi32(u[6], k32_m08_p24);
-          v[15] = k_madd_epi32(u[7], k32_m08_p24);
-
-#if DCT_HIGH_BIT_DEPTH
-          overflow = k_check_epi32_overflow_16(
-              &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7], &v[8],
-              &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15], &kZero);
-          if (overflow) {
-            HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
-            return;
-          }
-#endif  // DCT_HIGH_BIT_DEPTH
-          u[0] = k_packs_epi64(v[0], v[1]);
-          u[1] = k_packs_epi64(v[2], v[3]);
-          u[2] = k_packs_epi64(v[4], v[5]);
-          u[3] = k_packs_epi64(v[6], v[7]);
-          u[4] = k_packs_epi64(v[8], v[9]);
-          u[5] = k_packs_epi64(v[10], v[11]);
-          u[6] = k_packs_epi64(v[12], v[13]);
-          u[7] = k_packs_epi64(v[14], v[15]);
-
-          v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
-          v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
-          v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
-          v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
-          v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
-          v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
-          v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
-          v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
-
-          u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
-          u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
-          u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
-          u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
-          u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
-          u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
-          u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
-          u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
-
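-          // Rounded right-shift by 2 as in the C code:
-          // (x + 1 + (x < 0)) >> 2.  cmplt yields -1 for negative lanes, so
-          // subtracting it adds the extra 1 before the +1 and the shift.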
-          sign[0] = _mm_cmplt_epi32(u[0], kZero);
-          sign[1] = _mm_cmplt_epi32(u[1], kZero);
-          sign[2] = _mm_cmplt_epi32(u[2], kZero);
-          sign[3] = _mm_cmplt_epi32(u[3], kZero);
-          sign[4] = _mm_cmplt_epi32(u[4], kZero);
-          sign[5] = _mm_cmplt_epi32(u[5], kZero);
-          sign[6] = _mm_cmplt_epi32(u[6], kZero);
-          sign[7] = _mm_cmplt_epi32(u[7], kZero);
-
-          u[0] = _mm_sub_epi32(u[0], sign[0]);
-          u[1] = _mm_sub_epi32(u[1], sign[1]);
-          u[2] = _mm_sub_epi32(u[2], sign[2]);
-          u[3] = _mm_sub_epi32(u[3], sign[3]);
-          u[4] = _mm_sub_epi32(u[4], sign[4]);
-          u[5] = _mm_sub_epi32(u[5], sign[5]);
-          u[6] = _mm_sub_epi32(u[6], sign[6]);
-          u[7] = _mm_sub_epi32(u[7], sign[7]);
-
-          u[0] = _mm_add_epi32(u[0], K32One);
-          u[1] = _mm_add_epi32(u[1], K32One);
-          u[2] = _mm_add_epi32(u[2], K32One);
-          u[3] = _mm_add_epi32(u[3], K32One);
-          u[4] = _mm_add_epi32(u[4], K32One);
-          u[5] = _mm_add_epi32(u[5], K32One);
-          u[6] = _mm_add_epi32(u[6], K32One);
-          u[7] = _mm_add_epi32(u[7], K32One);
-
-          u[0] = _mm_srai_epi32(u[0], 2);
-          u[1] = _mm_srai_epi32(u[1], 2);
-          u[2] = _mm_srai_epi32(u[2], 2);
-          u[3] = _mm_srai_epi32(u[3], 2);
-          u[4] = _mm_srai_epi32(u[4], 2);
-          u[5] = _mm_srai_epi32(u[5], 2);
-          u[6] = _mm_srai_epi32(u[6], 2);
-          u[7] = _mm_srai_epi32(u[7], 2);
-
-          // Combine
-          out[0] = _mm_packs_epi32(u[0], u[1]);
-          out[16] = _mm_packs_epi32(u[2], u[3]);
-          out[8] = _mm_packs_epi32(u[4], u[5]);
-          out[24] = _mm_packs_epi32(u[6], u[7]);
-#if DCT_HIGH_BIT_DEPTH
-          overflow =
-              check_epi16_overflow_x4(&out[0], &out[16], &out[8], &out[24]);
-          if (overflow) {
-            HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
-            return;
-          }
-#endif  // DCT_HIGH_BIT_DEPTH
-        }
-        {
-          const __m128i k32_m08_p24 = pair_set_epi32(-cospi_8_64, cospi_24_64);
-          const __m128i k32_m24_m08 = pair_set_epi32(-cospi_24_64, -cospi_8_64);
-          const __m128i k32_p24_p08 = pair_set_epi32(cospi_24_64, cospi_8_64);
-
-          u[0] = _mm_unpacklo_epi32(lstep1[18], lstep1[28]);
-          u[1] = _mm_unpackhi_epi32(lstep1[18], lstep1[28]);
-          u[2] = _mm_unpacklo_epi32(lstep1[19], lstep1[29]);
-          u[3] = _mm_unpackhi_epi32(lstep1[19], lstep1[29]);
-          u[4] = _mm_unpacklo_epi32(lstep1[20], lstep1[26]);
-          u[5] = _mm_unpackhi_epi32(lstep1[20], lstep1[26]);
-          u[6] = _mm_unpacklo_epi32(lstep1[21], lstep1[27]);
-          u[7] = _mm_unpackhi_epi32(lstep1[21], lstep1[27]);
-
-          v[0] = k_madd_epi32(u[0], k32_m08_p24);
-          v[1] = k_madd_epi32(u[1], k32_m08_p24);
-          v[2] = k_madd_epi32(u[2], k32_m08_p24);
-          v[3] = k_madd_epi32(u[3], k32_m08_p24);
-          v[4] = k_madd_epi32(u[4], k32_m24_m08);
-          v[5] = k_madd_epi32(u[5], k32_m24_m08);
-          v[6] = k_madd_epi32(u[6], k32_m24_m08);
-          v[7] = k_madd_epi32(u[7], k32_m24_m08);
-          v[8] = k_madd_epi32(u[4], k32_m08_p24);
-          v[9] = k_madd_epi32(u[5], k32_m08_p24);
-          v[10] = k_madd_epi32(u[6], k32_m08_p24);
-          v[11] = k_madd_epi32(u[7], k32_m08_p24);
-          v[12] = k_madd_epi32(u[0], k32_p24_p08);
-          v[13] = k_madd_epi32(u[1], k32_p24_p08);
-          v[14] = k_madd_epi32(u[2], k32_p24_p08);
-          v[15] = k_madd_epi32(u[3], k32_p24_p08);
-
-#if DCT_HIGH_BIT_DEPTH
-          overflow = k_check_epi32_overflow_16(
-              &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7], &v[8],
-              &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15], &kZero);
-          if (overflow) {
-            HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
-            return;
-          }
-#endif  // DCT_HIGH_BIT_DEPTH
-          u[0] = k_packs_epi64(v[0], v[1]);
-          u[1] = k_packs_epi64(v[2], v[3]);
-          u[2] = k_packs_epi64(v[4], v[5]);
-          u[3] = k_packs_epi64(v[6], v[7]);
-          u[4] = k_packs_epi64(v[8], v[9]);
-          u[5] = k_packs_epi64(v[10], v[11]);
-          u[6] = k_packs_epi64(v[12], v[13]);
-          u[7] = k_packs_epi64(v[14], v[15]);
-
-          u[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
-          u[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
-          u[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
-          u[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
-          u[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
-          u[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
-          u[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
-          u[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
-
-          lstep2[18] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
-          lstep2[19] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
-          lstep2[20] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
-          lstep2[21] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
-          lstep2[26] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
-          lstep2[27] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
-          lstep2[28] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
-          lstep2[29] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
-        }
-        {
-          lstep2[32] = _mm_add_epi32(lstep1[38], lstep3[32]);
-          lstep2[33] = _mm_add_epi32(lstep1[39], lstep3[33]);
-          lstep2[34] = _mm_add_epi32(lstep1[36], lstep3[34]);
-          lstep2[35] = _mm_add_epi32(lstep1[37], lstep3[35]);
-          lstep2[36] = _mm_sub_epi32(lstep3[34], lstep1[36]);
-          lstep2[37] = _mm_sub_epi32(lstep3[35], lstep1[37]);
-          lstep2[38] = _mm_sub_epi32(lstep3[32], lstep1[38]);
-          lstep2[39] = _mm_sub_epi32(lstep3[33], lstep1[39]);
-          lstep2[40] = _mm_sub_epi32(lstep3[46], lstep1[40]);
-          lstep2[41] = _mm_sub_epi32(lstep3[47], lstep1[41]);
-          lstep2[42] = _mm_sub_epi32(lstep3[44], lstep1[42]);
-          lstep2[43] = _mm_sub_epi32(lstep3[45], lstep1[43]);
-          lstep2[44] = _mm_add_epi32(lstep1[42], lstep3[44]);
-          lstep2[45] = _mm_add_epi32(lstep1[43], lstep3[45]);
-          lstep2[46] = _mm_add_epi32(lstep1[40], lstep3[46]);
-          lstep2[47] = _mm_add_epi32(lstep1[41], lstep3[47]);
-          lstep2[48] = _mm_add_epi32(lstep1[54], lstep3[48]);
-          lstep2[49] = _mm_add_epi32(lstep1[55], lstep3[49]);
-          lstep2[50] = _mm_add_epi32(lstep1[52], lstep3[50]);
-          lstep2[51] = _mm_add_epi32(lstep1[53], lstep3[51]);
-          lstep2[52] = _mm_sub_epi32(lstep3[50], lstep1[52]);
-          lstep2[53] = _mm_sub_epi32(lstep3[51], lstep1[53]);
-          lstep2[54] = _mm_sub_epi32(lstep3[48], lstep1[54]);
-          lstep2[55] = _mm_sub_epi32(lstep3[49], lstep1[55]);
-          lstep2[56] = _mm_sub_epi32(lstep3[62], lstep1[56]);
-          lstep2[57] = _mm_sub_epi32(lstep3[63], lstep1[57]);
-          lstep2[58] = _mm_sub_epi32(lstep3[60], lstep1[58]);
-          lstep2[59] = _mm_sub_epi32(lstep3[61], lstep1[59]);
-          lstep2[60] = _mm_add_epi32(lstep1[58], lstep3[60]);
-          lstep2[61] = _mm_add_epi32(lstep1[59], lstep3[61]);
-          lstep2[62] = _mm_add_epi32(lstep1[56], lstep3[62]);
-          lstep2[63] = _mm_add_epi32(lstep1[57], lstep3[63]);
-        }
-        // stage 6
-        {
-          const __m128i k32_p28_p04 = pair_set_epi32(cospi_28_64, cospi_4_64);
-          const __m128i k32_p12_p20 = pair_set_epi32(cospi_12_64, cospi_20_64);
-          const __m128i k32_m20_p12 = pair_set_epi32(-cospi_20_64, cospi_12_64);
-          const __m128i k32_m04_p28 = pair_set_epi32(-cospi_4_64, cospi_28_64);
-
-          u[0] = _mm_unpacklo_epi32(lstep2[8], lstep2[14]);
-          u[1] = _mm_unpackhi_epi32(lstep2[8], lstep2[14]);
-          u[2] = _mm_unpacklo_epi32(lstep2[9], lstep2[15]);
-          u[3] = _mm_unpackhi_epi32(lstep2[9], lstep2[15]);
-          u[4] = _mm_unpacklo_epi32(lstep2[10], lstep2[12]);
-          u[5] = _mm_unpackhi_epi32(lstep2[10], lstep2[12]);
-          u[6] = _mm_unpacklo_epi32(lstep2[11], lstep2[13]);
-          u[7] = _mm_unpackhi_epi32(lstep2[11], lstep2[13]);
-          u[8] = _mm_unpacklo_epi32(lstep2[10], lstep2[12]);
-          u[9] = _mm_unpackhi_epi32(lstep2[10], lstep2[12]);
-          u[10] = _mm_unpacklo_epi32(lstep2[11], lstep2[13]);
-          u[11] = _mm_unpackhi_epi32(lstep2[11], lstep2[13]);
-          u[12] = _mm_unpacklo_epi32(lstep2[8], lstep2[14]);
-          u[13] = _mm_unpackhi_epi32(lstep2[8], lstep2[14]);
-          u[14] = _mm_unpacklo_epi32(lstep2[9], lstep2[15]);
-          u[15] = _mm_unpackhi_epi32(lstep2[9], lstep2[15]);
-
-          v[0] = k_madd_epi32(u[0], k32_p28_p04);
-          v[1] = k_madd_epi32(u[1], k32_p28_p04);
-          v[2] = k_madd_epi32(u[2], k32_p28_p04);
-          v[3] = k_madd_epi32(u[3], k32_p28_p04);
-          v[4] = k_madd_epi32(u[4], k32_p12_p20);
-          v[5] = k_madd_epi32(u[5], k32_p12_p20);
-          v[6] = k_madd_epi32(u[6], k32_p12_p20);
-          v[7] = k_madd_epi32(u[7], k32_p12_p20);
-          v[8] = k_madd_epi32(u[8], k32_m20_p12);
-          v[9] = k_madd_epi32(u[9], k32_m20_p12);
-          v[10] = k_madd_epi32(u[10], k32_m20_p12);
-          v[11] = k_madd_epi32(u[11], k32_m20_p12);
-          v[12] = k_madd_epi32(u[12], k32_m04_p28);
-          v[13] = k_madd_epi32(u[13], k32_m04_p28);
-          v[14] = k_madd_epi32(u[14], k32_m04_p28);
-          v[15] = k_madd_epi32(u[15], k32_m04_p28);
-
-#if DCT_HIGH_BIT_DEPTH
-          overflow = k_check_epi32_overflow_16(
-              &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7], &v[8],
-              &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15], &kZero);
-          if (overflow) {
-            HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
-            return;
-          }
-#endif  // DCT_HIGH_BIT_DEPTH
-          u[0] = k_packs_epi64(v[0], v[1]);
-          u[1] = k_packs_epi64(v[2], v[3]);
-          u[2] = k_packs_epi64(v[4], v[5]);
-          u[3] = k_packs_epi64(v[6], v[7]);
-          u[4] = k_packs_epi64(v[8], v[9]);
-          u[5] = k_packs_epi64(v[10], v[11]);
-          u[6] = k_packs_epi64(v[12], v[13]);
-          u[7] = k_packs_epi64(v[14], v[15]);
-
-          v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
-          v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
-          v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
-          v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
-          v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
-          v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
-          v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
-          v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
-
-          u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
-          u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
-          u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
-          u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
-          u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
-          u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
-          u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
-          u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
-
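-          // Same final rounding as above: (x + 1 + (x < 0)) >> 2.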
-          sign[0] = _mm_cmplt_epi32(u[0], kZero);
-          sign[1] = _mm_cmplt_epi32(u[1], kZero);
-          sign[2] = _mm_cmplt_epi32(u[2], kZero);
-          sign[3] = _mm_cmplt_epi32(u[3], kZero);
-          sign[4] = _mm_cmplt_epi32(u[4], kZero);
-          sign[5] = _mm_cmplt_epi32(u[5], kZero);
-          sign[6] = _mm_cmplt_epi32(u[6], kZero);
-          sign[7] = _mm_cmplt_epi32(u[7], kZero);
-
-          u[0] = _mm_sub_epi32(u[0], sign[0]);
-          u[1] = _mm_sub_epi32(u[1], sign[1]);
-          u[2] = _mm_sub_epi32(u[2], sign[2]);
-          u[3] = _mm_sub_epi32(u[3], sign[3]);
-          u[4] = _mm_sub_epi32(u[4], sign[4]);
-          u[5] = _mm_sub_epi32(u[5], sign[5]);
-          u[6] = _mm_sub_epi32(u[6], sign[6]);
-          u[7] = _mm_sub_epi32(u[7], sign[7]);
-
-          u[0] = _mm_add_epi32(u[0], K32One);
-          u[1] = _mm_add_epi32(u[1], K32One);
-          u[2] = _mm_add_epi32(u[2], K32One);
-          u[3] = _mm_add_epi32(u[3], K32One);
-          u[4] = _mm_add_epi32(u[4], K32One);
-          u[5] = _mm_add_epi32(u[5], K32One);
-          u[6] = _mm_add_epi32(u[6], K32One);
-          u[7] = _mm_add_epi32(u[7], K32One);
-
-          u[0] = _mm_srai_epi32(u[0], 2);
-          u[1] = _mm_srai_epi32(u[1], 2);
-          u[2] = _mm_srai_epi32(u[2], 2);
-          u[3] = _mm_srai_epi32(u[3], 2);
-          u[4] = _mm_srai_epi32(u[4], 2);
-          u[5] = _mm_srai_epi32(u[5], 2);
-          u[6] = _mm_srai_epi32(u[6], 2);
-          u[7] = _mm_srai_epi32(u[7], 2);
-
-          out[4] = _mm_packs_epi32(u[0], u[1]);
-          out[20] = _mm_packs_epi32(u[2], u[3]);
-          out[12] = _mm_packs_epi32(u[4], u[5]);
-          out[28] = _mm_packs_epi32(u[6], u[7]);
-#if DCT_HIGH_BIT_DEPTH
-          overflow =
-              check_epi16_overflow_x4(&out[4], &out[20], &out[12], &out[28]);
-          if (overflow) {
-            HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
-            return;
-          }
-#endif  // DCT_HIGH_BIT_DEPTH
-        }
-        {
-          lstep3[16] = _mm_add_epi32(lstep2[18], lstep1[16]);
-          lstep3[17] = _mm_add_epi32(lstep2[19], lstep1[17]);
-          lstep3[18] = _mm_sub_epi32(lstep1[16], lstep2[18]);
-          lstep3[19] = _mm_sub_epi32(lstep1[17], lstep2[19]);
-          lstep3[20] = _mm_sub_epi32(lstep1[22], lstep2[20]);
-          lstep3[21] = _mm_sub_epi32(lstep1[23], lstep2[21]);
-          lstep3[22] = _mm_add_epi32(lstep2[20], lstep1[22]);
-          lstep3[23] = _mm_add_epi32(lstep2[21], lstep1[23]);
-          lstep3[24] = _mm_add_epi32(lstep2[26], lstep1[24]);
-          lstep3[25] = _mm_add_epi32(lstep2[27], lstep1[25]);
-          lstep3[26] = _mm_sub_epi32(lstep1[24], lstep2[26]);
-          lstep3[27] = _mm_sub_epi32(lstep1[25], lstep2[27]);
-          lstep3[28] = _mm_sub_epi32(lstep1[30], lstep2[28]);
-          lstep3[29] = _mm_sub_epi32(lstep1[31], lstep2[29]);
-          lstep3[30] = _mm_add_epi32(lstep2[28], lstep1[30]);
-          lstep3[31] = _mm_add_epi32(lstep2[29], lstep1[31]);
-        }
-        {
-          const __m128i k32_m04_p28 = pair_set_epi32(-cospi_4_64, cospi_28_64);
-          const __m128i k32_m28_m04 = pair_set_epi32(-cospi_28_64, -cospi_4_64);
-          const __m128i k32_m20_p12 = pair_set_epi32(-cospi_20_64, cospi_12_64);
-          const __m128i k32_m12_m20 =
-              pair_set_epi32(-cospi_12_64, -cospi_20_64);
-          const __m128i k32_p12_p20 = pair_set_epi32(cospi_12_64, cospi_20_64);
-          const __m128i k32_p28_p04 = pair_set_epi32(cospi_28_64, cospi_4_64);
-
-          u[0] = _mm_unpacklo_epi32(lstep2[34], lstep2[60]);
-          u[1] = _mm_unpackhi_epi32(lstep2[34], lstep2[60]);
-          u[2] = _mm_unpacklo_epi32(lstep2[35], lstep2[61]);
-          u[3] = _mm_unpackhi_epi32(lstep2[35], lstep2[61]);
-          u[4] = _mm_unpacklo_epi32(lstep2[36], lstep2[58]);
-          u[5] = _mm_unpackhi_epi32(lstep2[36], lstep2[58]);
-          u[6] = _mm_unpacklo_epi32(lstep2[37], lstep2[59]);
-          u[7] = _mm_unpackhi_epi32(lstep2[37], lstep2[59]);
-          u[8] = _mm_unpacklo_epi32(lstep2[42], lstep2[52]);
-          u[9] = _mm_unpackhi_epi32(lstep2[42], lstep2[52]);
-          u[10] = _mm_unpacklo_epi32(lstep2[43], lstep2[53]);
-          u[11] = _mm_unpackhi_epi32(lstep2[43], lstep2[53]);
-          u[12] = _mm_unpacklo_epi32(lstep2[44], lstep2[50]);
-          u[13] = _mm_unpackhi_epi32(lstep2[44], lstep2[50]);
-          u[14] = _mm_unpacklo_epi32(lstep2[45], lstep2[51]);
-          u[15] = _mm_unpackhi_epi32(lstep2[45], lstep2[51]);
-
-          v[0] = k_madd_epi32(u[0], k32_m04_p28);
-          v[1] = k_madd_epi32(u[1], k32_m04_p28);
-          v[2] = k_madd_epi32(u[2], k32_m04_p28);
-          v[3] = k_madd_epi32(u[3], k32_m04_p28);
-          v[4] = k_madd_epi32(u[4], k32_m28_m04);
-          v[5] = k_madd_epi32(u[5], k32_m28_m04);
-          v[6] = k_madd_epi32(u[6], k32_m28_m04);
-          v[7] = k_madd_epi32(u[7], k32_m28_m04);
-          v[8] = k_madd_epi32(u[8], k32_m20_p12);
-          v[9] = k_madd_epi32(u[9], k32_m20_p12);
-          v[10] = k_madd_epi32(u[10], k32_m20_p12);
-          v[11] = k_madd_epi32(u[11], k32_m20_p12);
-          v[12] = k_madd_epi32(u[12], k32_m12_m20);
-          v[13] = k_madd_epi32(u[13], k32_m12_m20);
-          v[14] = k_madd_epi32(u[14], k32_m12_m20);
-          v[15] = k_madd_epi32(u[15], k32_m12_m20);
-          v[16] = k_madd_epi32(u[12], k32_m20_p12);
-          v[17] = k_madd_epi32(u[13], k32_m20_p12);
-          v[18] = k_madd_epi32(u[14], k32_m20_p12);
-          v[19] = k_madd_epi32(u[15], k32_m20_p12);
-          v[20] = k_madd_epi32(u[8], k32_p12_p20);
-          v[21] = k_madd_epi32(u[9], k32_p12_p20);
-          v[22] = k_madd_epi32(u[10], k32_p12_p20);
-          v[23] = k_madd_epi32(u[11], k32_p12_p20);
-          v[24] = k_madd_epi32(u[4], k32_m04_p28);
-          v[25] = k_madd_epi32(u[5], k32_m04_p28);
-          v[26] = k_madd_epi32(u[6], k32_m04_p28);
-          v[27] = k_madd_epi32(u[7], k32_m04_p28);
-          v[28] = k_madd_epi32(u[0], k32_p28_p04);
-          v[29] = k_madd_epi32(u[1], k32_p28_p04);
-          v[30] = k_madd_epi32(u[2], k32_p28_p04);
-          v[31] = k_madd_epi32(u[3], k32_p28_p04);
-
-#if DCT_HIGH_BIT_DEPTH
-          overflow = k_check_epi32_overflow_32(
-              &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7], &v[8],
-              &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15], &v[16],
-              &v[17], &v[18], &v[19], &v[20], &v[21], &v[22], &v[23], &v[24],
-              &v[25], &v[26], &v[27], &v[28], &v[29], &v[30], &v[31], &kZero);
-          if (overflow) {
-            HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
-            return;
-          }
-#endif  // DCT_HIGH_BIT_DEPTH
-          u[0] = k_packs_epi64(v[0], v[1]);
-          u[1] = k_packs_epi64(v[2], v[3]);
-          u[2] = k_packs_epi64(v[4], v[5]);
-          u[3] = k_packs_epi64(v[6], v[7]);
-          u[4] = k_packs_epi64(v[8], v[9]);
-          u[5] = k_packs_epi64(v[10], v[11]);
-          u[6] = k_packs_epi64(v[12], v[13]);
-          u[7] = k_packs_epi64(v[14], v[15]);
-          u[8] = k_packs_epi64(v[16], v[17]);
-          u[9] = k_packs_epi64(v[18], v[19]);
-          u[10] = k_packs_epi64(v[20], v[21]);
-          u[11] = k_packs_epi64(v[22], v[23]);
-          u[12] = k_packs_epi64(v[24], v[25]);
-          u[13] = k_packs_epi64(v[26], v[27]);
-          u[14] = k_packs_epi64(v[28], v[29]);
-          u[15] = k_packs_epi64(v[30], v[31]);
-
-          v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
-          v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
-          v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
-          v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
-          v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
-          v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
-          v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
-          v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
-          v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
-          v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
-          v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
-          v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
-          v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
-          v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
-          v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
-          v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
-
-          lstep3[34] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
-          lstep3[35] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
-          lstep3[36] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
-          lstep3[37] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
-          lstep3[42] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
-          lstep3[43] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
-          lstep3[44] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
-          lstep3[45] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
-          lstep3[50] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
-          lstep3[51] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
-          lstep3[52] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
-          lstep3[53] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
-          lstep3[58] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
-          lstep3[59] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
-          lstep3[60] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
-          lstep3[61] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
-        }
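/*
 * Sketch (editorial annotation, not part of the original patch): every block
 * like the one above is the same fixed-point rotation pipeline.
 * pair_set_epi32(c0, c1) broadcasts a coefficient pair,
 * _mm_unpack{lo,hi}_epi32(x, y) interleaves the inputs into (x, y) pairs,
 * k_madd_epi32 forms the 64-bit dot products x*c0 + y*c1, and k_packs_epi64
 * keeps their low 32 bits before rounding. Assuming the usual txfm_common.h
 * conventions (DCT_CONST_BITS == 14, cospi_k_64 == round(cos(k*pi/64) *
 * (1 << 14)), DCT_CONST_ROUNDING == 1 << 13), one output lane reduces to the
 * scalar sketch below; the narrowing cast models k_packs_epi64 and is exact
 * whenever the DCT_HIGH_BIT_DEPTH overflow checks pass.
 */
#include <stdint.h>
static int32_t rotate_lane_sketch(int32_t x, int32_t y, int32_t c0,
                                  int32_t c1) {
  const int64_t dot = (int64_t)x * c0 + (int64_t)y * c1;  /* k_madd_epi32 */
  const int32_t narrowed = (int32_t)dot;                  /* k_packs_epi64 */
  /* add DCT_CONST_ROUNDING, then arithmetic shift as _mm_srai_epi32 does */
  return (narrowed + (1 << 13)) >> 14;
}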
-        // stage 7
-        {
-          const __m128i k32_p30_p02 = pair_set_epi32(cospi_30_64, cospi_2_64);
-          const __m128i k32_p14_p18 = pair_set_epi32(cospi_14_64, cospi_18_64);
-          const __m128i k32_p22_p10 = pair_set_epi32(cospi_22_64, cospi_10_64);
-          const __m128i k32_p06_p26 = pair_set_epi32(cospi_6_64, cospi_26_64);
-          const __m128i k32_m26_p06 = pair_set_epi32(-cospi_26_64, cospi_6_64);
-          const __m128i k32_m10_p22 = pair_set_epi32(-cospi_10_64, cospi_22_64);
-          const __m128i k32_m18_p14 = pair_set_epi32(-cospi_18_64, cospi_14_64);
-          const __m128i k32_m02_p30 = pair_set_epi32(-cospi_2_64, cospi_30_64);
-
-          u[0] = _mm_unpacklo_epi32(lstep3[16], lstep3[30]);
-          u[1] = _mm_unpackhi_epi32(lstep3[16], lstep3[30]);
-          u[2] = _mm_unpacklo_epi32(lstep3[17], lstep3[31]);
-          u[3] = _mm_unpackhi_epi32(lstep3[17], lstep3[31]);
-          u[4] = _mm_unpacklo_epi32(lstep3[18], lstep3[28]);
-          u[5] = _mm_unpackhi_epi32(lstep3[18], lstep3[28]);
-          u[6] = _mm_unpacklo_epi32(lstep3[19], lstep3[29]);
-          u[7] = _mm_unpackhi_epi32(lstep3[19], lstep3[29]);
-          u[8] = _mm_unpacklo_epi32(lstep3[20], lstep3[26]);
-          u[9] = _mm_unpackhi_epi32(lstep3[20], lstep3[26]);
-          u[10] = _mm_unpacklo_epi32(lstep3[21], lstep3[27]);
-          u[11] = _mm_unpackhi_epi32(lstep3[21], lstep3[27]);
-          u[12] = _mm_unpacklo_epi32(lstep3[22], lstep3[24]);
-          u[13] = _mm_unpackhi_epi32(lstep3[22], lstep3[24]);
-          u[14] = _mm_unpacklo_epi32(lstep3[23], lstep3[25]);
-          u[15] = _mm_unpackhi_epi32(lstep3[23], lstep3[25]);
-
-          v[0] = k_madd_epi32(u[0], k32_p30_p02);
-          v[1] = k_madd_epi32(u[1], k32_p30_p02);
-          v[2] = k_madd_epi32(u[2], k32_p30_p02);
-          v[3] = k_madd_epi32(u[3], k32_p30_p02);
-          v[4] = k_madd_epi32(u[4], k32_p14_p18);
-          v[5] = k_madd_epi32(u[5], k32_p14_p18);
-          v[6] = k_madd_epi32(u[6], k32_p14_p18);
-          v[7] = k_madd_epi32(u[7], k32_p14_p18);
-          v[8] = k_madd_epi32(u[8], k32_p22_p10);
-          v[9] = k_madd_epi32(u[9], k32_p22_p10);
-          v[10] = k_madd_epi32(u[10], k32_p22_p10);
-          v[11] = k_madd_epi32(u[11], k32_p22_p10);
-          v[12] = k_madd_epi32(u[12], k32_p06_p26);
-          v[13] = k_madd_epi32(u[13], k32_p06_p26);
-          v[14] = k_madd_epi32(u[14], k32_p06_p26);
-          v[15] = k_madd_epi32(u[15], k32_p06_p26);
-          v[16] = k_madd_epi32(u[12], k32_m26_p06);
-          v[17] = k_madd_epi32(u[13], k32_m26_p06);
-          v[18] = k_madd_epi32(u[14], k32_m26_p06);
-          v[19] = k_madd_epi32(u[15], k32_m26_p06);
-          v[20] = k_madd_epi32(u[8], k32_m10_p22);
-          v[21] = k_madd_epi32(u[9], k32_m10_p22);
-          v[22] = k_madd_epi32(u[10], k32_m10_p22);
-          v[23] = k_madd_epi32(u[11], k32_m10_p22);
-          v[24] = k_madd_epi32(u[4], k32_m18_p14);
-          v[25] = k_madd_epi32(u[5], k32_m18_p14);
-          v[26] = k_madd_epi32(u[6], k32_m18_p14);
-          v[27] = k_madd_epi32(u[7], k32_m18_p14);
-          v[28] = k_madd_epi32(u[0], k32_m02_p30);
-          v[29] = k_madd_epi32(u[1], k32_m02_p30);
-          v[30] = k_madd_epi32(u[2], k32_m02_p30);
-          v[31] = k_madd_epi32(u[3], k32_m02_p30);
-
-#if DCT_HIGH_BIT_DEPTH
-          overflow = k_check_epi32_overflow_32(
-              &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7], &v[8],
-              &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15], &v[16],
-              &v[17], &v[18], &v[19], &v[20], &v[21], &v[22], &v[23], &v[24],
-              &v[25], &v[26], &v[27], &v[28], &v[29], &v[30], &v[31], &kZero);
-          if (overflow) {
-            HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
-            return;
-          }
-#endif  // DCT_HIGH_BIT_DEPTH
-          u[0] = k_packs_epi64(v[0], v[1]);
-          u[1] = k_packs_epi64(v[2], v[3]);
-          u[2] = k_packs_epi64(v[4], v[5]);
-          u[3] = k_packs_epi64(v[6], v[7]);
-          u[4] = k_packs_epi64(v[8], v[9]);
-          u[5] = k_packs_epi64(v[10], v[11]);
-          u[6] = k_packs_epi64(v[12], v[13]);
-          u[7] = k_packs_epi64(v[14], v[15]);
-          u[8] = k_packs_epi64(v[16], v[17]);
-          u[9] = k_packs_epi64(v[18], v[19]);
-          u[10] = k_packs_epi64(v[20], v[21]);
-          u[11] = k_packs_epi64(v[22], v[23]);
-          u[12] = k_packs_epi64(v[24], v[25]);
-          u[13] = k_packs_epi64(v[26], v[27]);
-          u[14] = k_packs_epi64(v[28], v[29]);
-          u[15] = k_packs_epi64(v[30], v[31]);
-
-          v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
-          v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
-          v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
-          v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
-          v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
-          v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
-          v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
-          v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
-          v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
-          v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
-          v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
-          v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
-          v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
-          v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
-          v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
-          v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
-
-          u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
-          u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
-          u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
-          u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
-          u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
-          u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
-          u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
-          u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
-          u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
-          u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
-          u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
-          u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
-          u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
-          u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
-          u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
-          u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
-
-          v[0] = _mm_cmplt_epi32(u[0], kZero);
-          v[1] = _mm_cmplt_epi32(u[1], kZero);
-          v[2] = _mm_cmplt_epi32(u[2], kZero);
-          v[3] = _mm_cmplt_epi32(u[3], kZero);
-          v[4] = _mm_cmplt_epi32(u[4], kZero);
-          v[5] = _mm_cmplt_epi32(u[5], kZero);
-          v[6] = _mm_cmplt_epi32(u[6], kZero);
-          v[7] = _mm_cmplt_epi32(u[7], kZero);
-          v[8] = _mm_cmplt_epi32(u[8], kZero);
-          v[9] = _mm_cmplt_epi32(u[9], kZero);
-          v[10] = _mm_cmplt_epi32(u[10], kZero);
-          v[11] = _mm_cmplt_epi32(u[11], kZero);
-          v[12] = _mm_cmplt_epi32(u[12], kZero);
-          v[13] = _mm_cmplt_epi32(u[13], kZero);
-          v[14] = _mm_cmplt_epi32(u[14], kZero);
-          v[15] = _mm_cmplt_epi32(u[15], kZero);
-
-          u[0] = _mm_sub_epi32(u[0], v[0]);
-          u[1] = _mm_sub_epi32(u[1], v[1]);
-          u[2] = _mm_sub_epi32(u[2], v[2]);
-          u[3] = _mm_sub_epi32(u[3], v[3]);
-          u[4] = _mm_sub_epi32(u[4], v[4]);
-          u[5] = _mm_sub_epi32(u[5], v[5]);
-          u[6] = _mm_sub_epi32(u[6], v[6]);
-          u[7] = _mm_sub_epi32(u[7], v[7]);
-          u[8] = _mm_sub_epi32(u[8], v[8]);
-          u[9] = _mm_sub_epi32(u[9], v[9]);
-          u[10] = _mm_sub_epi32(u[10], v[10]);
-          u[11] = _mm_sub_epi32(u[11], v[11]);
-          u[12] = _mm_sub_epi32(u[12], v[12]);
-          u[13] = _mm_sub_epi32(u[13], v[13]);
-          u[14] = _mm_sub_epi32(u[14], v[14]);
-          u[15] = _mm_sub_epi32(u[15], v[15]);
-
-          v[0] = _mm_add_epi32(u[0], K32One);
-          v[1] = _mm_add_epi32(u[1], K32One);
-          v[2] = _mm_add_epi32(u[2], K32One);
-          v[3] = _mm_add_epi32(u[3], K32One);
-          v[4] = _mm_add_epi32(u[4], K32One);
-          v[5] = _mm_add_epi32(u[5], K32One);
-          v[6] = _mm_add_epi32(u[6], K32One);
-          v[7] = _mm_add_epi32(u[7], K32One);
-          v[8] = _mm_add_epi32(u[8], K32One);
-          v[9] = _mm_add_epi32(u[9], K32One);
-          v[10] = _mm_add_epi32(u[10], K32One);
-          v[11] = _mm_add_epi32(u[11], K32One);
-          v[12] = _mm_add_epi32(u[12], K32One);
-          v[13] = _mm_add_epi32(u[13], K32One);
-          v[14] = _mm_add_epi32(u[14], K32One);
-          v[15] = _mm_add_epi32(u[15], K32One);
-
-          u[0] = _mm_srai_epi32(v[0], 2);
-          u[1] = _mm_srai_epi32(v[1], 2);
-          u[2] = _mm_srai_epi32(v[2], 2);
-          u[3] = _mm_srai_epi32(v[3], 2);
-          u[4] = _mm_srai_epi32(v[4], 2);
-          u[5] = _mm_srai_epi32(v[5], 2);
-          u[6] = _mm_srai_epi32(v[6], 2);
-          u[7] = _mm_srai_epi32(v[7], 2);
-          u[8] = _mm_srai_epi32(v[8], 2);
-          u[9] = _mm_srai_epi32(v[9], 2);
-          u[10] = _mm_srai_epi32(v[10], 2);
-          u[11] = _mm_srai_epi32(v[11], 2);
-          u[12] = _mm_srai_epi32(v[12], 2);
-          u[13] = _mm_srai_epi32(v[13], 2);
-          u[14] = _mm_srai_epi32(v[14], 2);
-          u[15] = _mm_srai_epi32(v[15], 2);
-
-          out[2] = _mm_packs_epi32(u[0], u[1]);
-          out[18] = _mm_packs_epi32(u[2], u[3]);
-          out[10] = _mm_packs_epi32(u[4], u[5]);
-          out[26] = _mm_packs_epi32(u[6], u[7]);
-          out[6] = _mm_packs_epi32(u[8], u[9]);
-          out[22] = _mm_packs_epi32(u[10], u[11]);
-          out[14] = _mm_packs_epi32(u[12], u[13]);
-          out[30] = _mm_packs_epi32(u[14], u[15]);
-#if DCT_HIGH_BIT_DEPTH
-          overflow =
-              check_epi16_overflow_x8(&out[2], &out[18], &out[10], &out[26],
-                                      &out[6], &out[22], &out[14], &out[30]);
-          if (overflow) {
-            HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
-            return;
-          }
-#endif  // DCT_HIGH_BIT_DEPTH
-        }
-        {
-          lstep1[32] = _mm_add_epi32(lstep3[34], lstep2[32]);
-          lstep1[33] = _mm_add_epi32(lstep3[35], lstep2[33]);
-          lstep1[34] = _mm_sub_epi32(lstep2[32], lstep3[34]);
-          lstep1[35] = _mm_sub_epi32(lstep2[33], lstep3[35]);
-          lstep1[36] = _mm_sub_epi32(lstep2[38], lstep3[36]);
-          lstep1[37] = _mm_sub_epi32(lstep2[39], lstep3[37]);
-          lstep1[38] = _mm_add_epi32(lstep3[36], lstep2[38]);
-          lstep1[39] = _mm_add_epi32(lstep3[37], lstep2[39]);
-          lstep1[40] = _mm_add_epi32(lstep3[42], lstep2[40]);
-          lstep1[41] = _mm_add_epi32(lstep3[43], lstep2[41]);
-          lstep1[42] = _mm_sub_epi32(lstep2[40], lstep3[42]);
-          lstep1[43] = _mm_sub_epi32(lstep2[41], lstep3[43]);
-          lstep1[44] = _mm_sub_epi32(lstep2[46], lstep3[44]);
-          lstep1[45] = _mm_sub_epi32(lstep2[47], lstep3[45]);
-          lstep1[46] = _mm_add_epi32(lstep3[44], lstep2[46]);
-          lstep1[47] = _mm_add_epi32(lstep3[45], lstep2[47]);
-          lstep1[48] = _mm_add_epi32(lstep3[50], lstep2[48]);
-          lstep1[49] = _mm_add_epi32(lstep3[51], lstep2[49]);
-          lstep1[50] = _mm_sub_epi32(lstep2[48], lstep3[50]);
-          lstep1[51] = _mm_sub_epi32(lstep2[49], lstep3[51]);
-          lstep1[52] = _mm_sub_epi32(lstep2[54], lstep3[52]);
-          lstep1[53] = _mm_sub_epi32(lstep2[55], lstep3[53]);
-          lstep1[54] = _mm_add_epi32(lstep3[52], lstep2[54]);
-          lstep1[55] = _mm_add_epi32(lstep3[53], lstep2[55]);
-          lstep1[56] = _mm_add_epi32(lstep3[58], lstep2[56]);
-          lstep1[57] = _mm_add_epi32(lstep3[59], lstep2[57]);
-          lstep1[58] = _mm_sub_epi32(lstep2[56], lstep3[58]);
-          lstep1[59] = _mm_sub_epi32(lstep2[57], lstep3[59]);
-          lstep1[60] = _mm_sub_epi32(lstep2[62], lstep3[60]);
-          lstep1[61] = _mm_sub_epi32(lstep2[63], lstep3[61]);
-          lstep1[62] = _mm_add_epi32(lstep3[60], lstep2[62]);
-          lstep1[63] = _mm_add_epi32(lstep3[61], lstep2[63]);
-        }
-        // stage 8
-        {
-          const __m128i k32_p31_p01 = pair_set_epi32(cospi_31_64, cospi_1_64);
-          const __m128i k32_p15_p17 = pair_set_epi32(cospi_15_64, cospi_17_64);
-          const __m128i k32_p23_p09 = pair_set_epi32(cospi_23_64, cospi_9_64);
-          const __m128i k32_p07_p25 = pair_set_epi32(cospi_7_64, cospi_25_64);
-          const __m128i k32_m25_p07 = pair_set_epi32(-cospi_25_64, cospi_7_64);
-          const __m128i k32_m09_p23 = pair_set_epi32(-cospi_9_64, cospi_23_64);
-          const __m128i k32_m17_p15 = pair_set_epi32(-cospi_17_64, cospi_15_64);
-          const __m128i k32_m01_p31 = pair_set_epi32(-cospi_1_64, cospi_31_64);
-
-          u[0] = _mm_unpacklo_epi32(lstep1[32], lstep1[62]);
-          u[1] = _mm_unpackhi_epi32(lstep1[32], lstep1[62]);
-          u[2] = _mm_unpacklo_epi32(lstep1[33], lstep1[63]);
-          u[3] = _mm_unpackhi_epi32(lstep1[33], lstep1[63]);
-          u[4] = _mm_unpacklo_epi32(lstep1[34], lstep1[60]);
-          u[5] = _mm_unpackhi_epi32(lstep1[34], lstep1[60]);
-          u[6] = _mm_unpacklo_epi32(lstep1[35], lstep1[61]);
-          u[7] = _mm_unpackhi_epi32(lstep1[35], lstep1[61]);
-          u[8] = _mm_unpacklo_epi32(lstep1[36], lstep1[58]);
-          u[9] = _mm_unpackhi_epi32(lstep1[36], lstep1[58]);
-          u[10] = _mm_unpacklo_epi32(lstep1[37], lstep1[59]);
-          u[11] = _mm_unpackhi_epi32(lstep1[37], lstep1[59]);
-          u[12] = _mm_unpacklo_epi32(lstep1[38], lstep1[56]);
-          u[13] = _mm_unpackhi_epi32(lstep1[38], lstep1[56]);
-          u[14] = _mm_unpacklo_epi32(lstep1[39], lstep1[57]);
-          u[15] = _mm_unpackhi_epi32(lstep1[39], lstep1[57]);
-
-          v[0] = k_madd_epi32(u[0], k32_p31_p01);
-          v[1] = k_madd_epi32(u[1], k32_p31_p01);
-          v[2] = k_madd_epi32(u[2], k32_p31_p01);
-          v[3] = k_madd_epi32(u[3], k32_p31_p01);
-          v[4] = k_madd_epi32(u[4], k32_p15_p17);
-          v[5] = k_madd_epi32(u[5], k32_p15_p17);
-          v[6] = k_madd_epi32(u[6], k32_p15_p17);
-          v[7] = k_madd_epi32(u[7], k32_p15_p17);
-          v[8] = k_madd_epi32(u[8], k32_p23_p09);
-          v[9] = k_madd_epi32(u[9], k32_p23_p09);
-          v[10] = k_madd_epi32(u[10], k32_p23_p09);
-          v[11] = k_madd_epi32(u[11], k32_p23_p09);
-          v[12] = k_madd_epi32(u[12], k32_p07_p25);
-          v[13] = k_madd_epi32(u[13], k32_p07_p25);
-          v[14] = k_madd_epi32(u[14], k32_p07_p25);
-          v[15] = k_madd_epi32(u[15], k32_p07_p25);
-          v[16] = k_madd_epi32(u[12], k32_m25_p07);
-          v[17] = k_madd_epi32(u[13], k32_m25_p07);
-          v[18] = k_madd_epi32(u[14], k32_m25_p07);
-          v[19] = k_madd_epi32(u[15], k32_m25_p07);
-          v[20] = k_madd_epi32(u[8], k32_m09_p23);
-          v[21] = k_madd_epi32(u[9], k32_m09_p23);
-          v[22] = k_madd_epi32(u[10], k32_m09_p23);
-          v[23] = k_madd_epi32(u[11], k32_m09_p23);
-          v[24] = k_madd_epi32(u[4], k32_m17_p15);
-          v[25] = k_madd_epi32(u[5], k32_m17_p15);
-          v[26] = k_madd_epi32(u[6], k32_m17_p15);
-          v[27] = k_madd_epi32(u[7], k32_m17_p15);
-          v[28] = k_madd_epi32(u[0], k32_m01_p31);
-          v[29] = k_madd_epi32(u[1], k32_m01_p31);
-          v[30] = k_madd_epi32(u[2], k32_m01_p31);
-          v[31] = k_madd_epi32(u[3], k32_m01_p31);
-
-#if DCT_HIGH_BIT_DEPTH
-          overflow = k_check_epi32_overflow_32(
-              &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7], &v[8],
-              &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15], &v[16],
-              &v[17], &v[18], &v[19], &v[20], &v[21], &v[22], &v[23], &v[24],
-              &v[25], &v[26], &v[27], &v[28], &v[29], &v[30], &v[31], &kZero);
-          if (overflow) {
-            HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
-            return;
-          }
-#endif  // DCT_HIGH_BIT_DEPTH
-          u[0] = k_packs_epi64(v[0], v[1]);
-          u[1] = k_packs_epi64(v[2], v[3]);
-          u[2] = k_packs_epi64(v[4], v[5]);
-          u[3] = k_packs_epi64(v[6], v[7]);
-          u[4] = k_packs_epi64(v[8], v[9]);
-          u[5] = k_packs_epi64(v[10], v[11]);
-          u[6] = k_packs_epi64(v[12], v[13]);
-          u[7] = k_packs_epi64(v[14], v[15]);
-          u[8] = k_packs_epi64(v[16], v[17]);
-          u[9] = k_packs_epi64(v[18], v[19]);
-          u[10] = k_packs_epi64(v[20], v[21]);
-          u[11] = k_packs_epi64(v[22], v[23]);
-          u[12] = k_packs_epi64(v[24], v[25]);
-          u[13] = k_packs_epi64(v[26], v[27]);
-          u[14] = k_packs_epi64(v[28], v[29]);
-          u[15] = k_packs_epi64(v[30], v[31]);
-
-          v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
-          v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
-          v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
-          v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
-          v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
-          v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
-          v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
-          v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
-          v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
-          v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
-          v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
-          v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
-          v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
-          v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
-          v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
-          v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
-
-          u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
-          u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
-          u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
-          u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
-          u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
-          u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
-          u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
-          u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
-          u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
-          u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
-          u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
-          u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
-          u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
-          u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
-          u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
-          u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
-
-          v[0] = _mm_cmplt_epi32(u[0], kZero);
-          v[1] = _mm_cmplt_epi32(u[1], kZero);
-          v[2] = _mm_cmplt_epi32(u[2], kZero);
-          v[3] = _mm_cmplt_epi32(u[3], kZero);
-          v[4] = _mm_cmplt_epi32(u[4], kZero);
-          v[5] = _mm_cmplt_epi32(u[5], kZero);
-          v[6] = _mm_cmplt_epi32(u[6], kZero);
-          v[7] = _mm_cmplt_epi32(u[7], kZero);
-          v[8] = _mm_cmplt_epi32(u[8], kZero);
-          v[9] = _mm_cmplt_epi32(u[9], kZero);
-          v[10] = _mm_cmplt_epi32(u[10], kZero);
-          v[11] = _mm_cmplt_epi32(u[11], kZero);
-          v[12] = _mm_cmplt_epi32(u[12], kZero);
-          v[13] = _mm_cmplt_epi32(u[13], kZero);
-          v[14] = _mm_cmplt_epi32(u[14], kZero);
-          v[15] = _mm_cmplt_epi32(u[15], kZero);
-
-          u[0] = _mm_sub_epi32(u[0], v[0]);
-          u[1] = _mm_sub_epi32(u[1], v[1]);
-          u[2] = _mm_sub_epi32(u[2], v[2]);
-          u[3] = _mm_sub_epi32(u[3], v[3]);
-          u[4] = _mm_sub_epi32(u[4], v[4]);
-          u[5] = _mm_sub_epi32(u[5], v[5]);
-          u[6] = _mm_sub_epi32(u[6], v[6]);
-          u[7] = _mm_sub_epi32(u[7], v[7]);
-          u[8] = _mm_sub_epi32(u[8], v[8]);
-          u[9] = _mm_sub_epi32(u[9], v[9]);
-          u[10] = _mm_sub_epi32(u[10], v[10]);
-          u[11] = _mm_sub_epi32(u[11], v[11]);
-          u[12] = _mm_sub_epi32(u[12], v[12]);
-          u[13] = _mm_sub_epi32(u[13], v[13]);
-          u[14] = _mm_sub_epi32(u[14], v[14]);
-          u[15] = _mm_sub_epi32(u[15], v[15]);
-
-          v[0] = _mm_add_epi32(u[0], K32One);
-          v[1] = _mm_add_epi32(u[1], K32One);
-          v[2] = _mm_add_epi32(u[2], K32One);
-          v[3] = _mm_add_epi32(u[3], K32One);
-          v[4] = _mm_add_epi32(u[4], K32One);
-          v[5] = _mm_add_epi32(u[5], K32One);
-          v[6] = _mm_add_epi32(u[6], K32One);
-          v[7] = _mm_add_epi32(u[7], K32One);
-          v[8] = _mm_add_epi32(u[8], K32One);
-          v[9] = _mm_add_epi32(u[9], K32One);
-          v[10] = _mm_add_epi32(u[10], K32One);
-          v[11] = _mm_add_epi32(u[11], K32One);
-          v[12] = _mm_add_epi32(u[12], K32One);
-          v[13] = _mm_add_epi32(u[13], K32One);
-          v[14] = _mm_add_epi32(u[14], K32One);
-          v[15] = _mm_add_epi32(u[15], K32One);
-
-          u[0] = _mm_srai_epi32(v[0], 2);
-          u[1] = _mm_srai_epi32(v[1], 2);
-          u[2] = _mm_srai_epi32(v[2], 2);
-          u[3] = _mm_srai_epi32(v[3], 2);
-          u[4] = _mm_srai_epi32(v[4], 2);
-          u[5] = _mm_srai_epi32(v[5], 2);
-          u[6] = _mm_srai_epi32(v[6], 2);
-          u[7] = _mm_srai_epi32(v[7], 2);
-          u[8] = _mm_srai_epi32(v[8], 2);
-          u[9] = _mm_srai_epi32(v[9], 2);
-          u[10] = _mm_srai_epi32(v[10], 2);
-          u[11] = _mm_srai_epi32(v[11], 2);
-          u[12] = _mm_srai_epi32(v[12], 2);
-          u[13] = _mm_srai_epi32(v[13], 2);
-          u[14] = _mm_srai_epi32(v[14], 2);
-          u[15] = _mm_srai_epi32(v[15], 2);
-
-          out[1] = _mm_packs_epi32(u[0], u[1]);
-          out[17] = _mm_packs_epi32(u[2], u[3]);
-          out[9] = _mm_packs_epi32(u[4], u[5]);
-          out[25] = _mm_packs_epi32(u[6], u[7]);
-          out[7] = _mm_packs_epi32(u[8], u[9]);
-          out[23] = _mm_packs_epi32(u[10], u[11]);
-          out[15] = _mm_packs_epi32(u[12], u[13]);
-          out[31] = _mm_packs_epi32(u[14], u[15]);
-#if DCT_HIGH_BIT_DEPTH
-          overflow =
-              check_epi16_overflow_x8(&out[1], &out[17], &out[9], &out[25],
-                                      &out[7], &out[23], &out[15], &out[31]);
-          if (overflow) {
-            HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
-            return;
-          }
-#endif  // DCT_HIGH_BIT_DEPTH
-        }
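/*
 * Sketch (editorial annotation, not part of the original patch): the
 * DCT_HIGH_BIT_DEPTH blocks repeated above all follow one policy: after each
 * batch of multiply-adds, test every lane for 32-bit overflow and, on
 * failure, redo the whole row pass with the C reference
 * (HIGH_FDCT32x32_2D_ROWS_C) instead of continuing with wrapped values. A
 * scalar sketch of the per-lane test; the helper name is illustrative, not
 * the library's:
 */
#include <limits.h>
#include <stdint.h>
static int madd_lane_overflows(int32_t x, int32_t c0, int32_t y, int32_t c1) {
  const int64_t dot = (int64_t)x * c0 + (int64_t)y * c1;
  return dot < INT32_MIN || dot > INT32_MAX;  /* => fall back to C rows */
}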
-        {
-          const __m128i k32_p27_p05 = pair_set_epi32(cospi_27_64, cospi_5_64);
-          const __m128i k32_p11_p21 = pair_set_epi32(cospi_11_64, cospi_21_64);
-          const __m128i k32_p19_p13 = pair_set_epi32(cospi_19_64, cospi_13_64);
-          const __m128i k32_p03_p29 = pair_set_epi32(cospi_3_64, cospi_29_64);
-          const __m128i k32_m29_p03 = pair_set_epi32(-cospi_29_64, cospi_3_64);
-          const __m128i k32_m13_p19 = pair_set_epi32(-cospi_13_64, cospi_19_64);
-          const __m128i k32_m21_p11 = pair_set_epi32(-cospi_21_64, cospi_11_64);
-          const __m128i k32_m05_p27 = pair_set_epi32(-cospi_5_64, cospi_27_64);
-
-          u[0] = _mm_unpacklo_epi32(lstep1[40], lstep1[54]);
-          u[1] = _mm_unpackhi_epi32(lstep1[40], lstep1[54]);
-          u[2] = _mm_unpacklo_epi32(lstep1[41], lstep1[55]);
-          u[3] = _mm_unpackhi_epi32(lstep1[41], lstep1[55]);
-          u[4] = _mm_unpacklo_epi32(lstep1[42], lstep1[52]);
-          u[5] = _mm_unpackhi_epi32(lstep1[42], lstep1[52]);
-          u[6] = _mm_unpacklo_epi32(lstep1[43], lstep1[53]);
-          u[7] = _mm_unpackhi_epi32(lstep1[43], lstep1[53]);
-          u[8] = _mm_unpacklo_epi32(lstep1[44], lstep1[50]);
-          u[9] = _mm_unpackhi_epi32(lstep1[44], lstep1[50]);
-          u[10] = _mm_unpacklo_epi32(lstep1[45], lstep1[51]);
-          u[11] = _mm_unpackhi_epi32(lstep1[45], lstep1[51]);
-          u[12] = _mm_unpacklo_epi32(lstep1[46], lstep1[48]);
-          u[13] = _mm_unpackhi_epi32(lstep1[46], lstep1[48]);
-          u[14] = _mm_unpacklo_epi32(lstep1[47], lstep1[49]);
-          u[15] = _mm_unpackhi_epi32(lstep1[47], lstep1[49]);
-
-          v[0] = k_madd_epi32(u[0], k32_p27_p05);
-          v[1] = k_madd_epi32(u[1], k32_p27_p05);
-          v[2] = k_madd_epi32(u[2], k32_p27_p05);
-          v[3] = k_madd_epi32(u[3], k32_p27_p05);
-          v[4] = k_madd_epi32(u[4], k32_p11_p21);
-          v[5] = k_madd_epi32(u[5], k32_p11_p21);
-          v[6] = k_madd_epi32(u[6], k32_p11_p21);
-          v[7] = k_madd_epi32(u[7], k32_p11_p21);
-          v[8] = k_madd_epi32(u[8], k32_p19_p13);
-          v[9] = k_madd_epi32(u[9], k32_p19_p13);
-          v[10] = k_madd_epi32(u[10], k32_p19_p13);
-          v[11] = k_madd_epi32(u[11], k32_p19_p13);
-          v[12] = k_madd_epi32(u[12], k32_p03_p29);
-          v[13] = k_madd_epi32(u[13], k32_p03_p29);
-          v[14] = k_madd_epi32(u[14], k32_p03_p29);
-          v[15] = k_madd_epi32(u[15], k32_p03_p29);
-          v[16] = k_madd_epi32(u[12], k32_m29_p03);
-          v[17] = k_madd_epi32(u[13], k32_m29_p03);
-          v[18] = k_madd_epi32(u[14], k32_m29_p03);
-          v[19] = k_madd_epi32(u[15], k32_m29_p03);
-          v[20] = k_madd_epi32(u[8], k32_m13_p19);
-          v[21] = k_madd_epi32(u[9], k32_m13_p19);
-          v[22] = k_madd_epi32(u[10], k32_m13_p19);
-          v[23] = k_madd_epi32(u[11], k32_m13_p19);
-          v[24] = k_madd_epi32(u[4], k32_m21_p11);
-          v[25] = k_madd_epi32(u[5], k32_m21_p11);
-          v[26] = k_madd_epi32(u[6], k32_m21_p11);
-          v[27] = k_madd_epi32(u[7], k32_m21_p11);
-          v[28] = k_madd_epi32(u[0], k32_m05_p27);
-          v[29] = k_madd_epi32(u[1], k32_m05_p27);
-          v[30] = k_madd_epi32(u[2], k32_m05_p27);
-          v[31] = k_madd_epi32(u[3], k32_m05_p27);
-
-#if DCT_HIGH_BIT_DEPTH
-          overflow = k_check_epi32_overflow_32(
-              &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7], &v[8],
-              &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15], &v[16],
-              &v[17], &v[18], &v[19], &v[20], &v[21], &v[22], &v[23], &v[24],
-              &v[25], &v[26], &v[27], &v[28], &v[29], &v[30], &v[31], &kZero);
-          if (overflow) {
-            HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
-            return;
-          }
-#endif  // DCT_HIGH_BIT_DEPTH
-          u[0] = k_packs_epi64(v[0], v[1]);
-          u[1] = k_packs_epi64(v[2], v[3]);
-          u[2] = k_packs_epi64(v[4], v[5]);
-          u[3] = k_packs_epi64(v[6], v[7]);
-          u[4] = k_packs_epi64(v[8], v[9]);
-          u[5] = k_packs_epi64(v[10], v[11]);
-          u[6] = k_packs_epi64(v[12], v[13]);
-          u[7] = k_packs_epi64(v[14], v[15]);
-          u[8] = k_packs_epi64(v[16], v[17]);
-          u[9] = k_packs_epi64(v[18], v[19]);
-          u[10] = k_packs_epi64(v[20], v[21]);
-          u[11] = k_packs_epi64(v[22], v[23]);
-          u[12] = k_packs_epi64(v[24], v[25]);
-          u[13] = k_packs_epi64(v[26], v[27]);
-          u[14] = k_packs_epi64(v[28], v[29]);
-          u[15] = k_packs_epi64(v[30], v[31]);
-
-          v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
-          v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
-          v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
-          v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
-          v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
-          v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
-          v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
-          v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
-          v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
-          v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
-          v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
-          v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
-          v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
-          v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
-          v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
-          v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
-
-          u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
-          u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
-          u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
-          u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
-          u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
-          u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
-          u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
-          u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
-          u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
-          u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
-          u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
-          u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
-          u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
-          u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
-          u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
-          u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
-
-          v[0] = _mm_cmplt_epi32(u[0], kZero);
-          v[1] = _mm_cmplt_epi32(u[1], kZero);
-          v[2] = _mm_cmplt_epi32(u[2], kZero);
-          v[3] = _mm_cmplt_epi32(u[3], kZero);
-          v[4] = _mm_cmplt_epi32(u[4], kZero);
-          v[5] = _mm_cmplt_epi32(u[5], kZero);
-          v[6] = _mm_cmplt_epi32(u[6], kZero);
-          v[7] = _mm_cmplt_epi32(u[7], kZero);
-          v[8] = _mm_cmplt_epi32(u[8], kZero);
-          v[9] = _mm_cmplt_epi32(u[9], kZero);
-          v[10] = _mm_cmplt_epi32(u[10], kZero);
-          v[11] = _mm_cmplt_epi32(u[11], kZero);
-          v[12] = _mm_cmplt_epi32(u[12], kZero);
-          v[13] = _mm_cmplt_epi32(u[13], kZero);
-          v[14] = _mm_cmplt_epi32(u[14], kZero);
-          v[15] = _mm_cmplt_epi32(u[15], kZero);
-
-          u[0] = _mm_sub_epi32(u[0], v[0]);
-          u[1] = _mm_sub_epi32(u[1], v[1]);
-          u[2] = _mm_sub_epi32(u[2], v[2]);
-          u[3] = _mm_sub_epi32(u[3], v[3]);
-          u[4] = _mm_sub_epi32(u[4], v[4]);
-          u[5] = _mm_sub_epi32(u[5], v[5]);
-          u[6] = _mm_sub_epi32(u[6], v[6]);
-          u[7] = _mm_sub_epi32(u[7], v[7]);
-          u[8] = _mm_sub_epi32(u[8], v[8]);
-          u[9] = _mm_sub_epi32(u[9], v[9]);
-          u[10] = _mm_sub_epi32(u[10], v[10]);
-          u[11] = _mm_sub_epi32(u[11], v[11]);
-          u[12] = _mm_sub_epi32(u[12], v[12]);
-          u[13] = _mm_sub_epi32(u[13], v[13]);
-          u[14] = _mm_sub_epi32(u[14], v[14]);
-          u[15] = _mm_sub_epi32(u[15], v[15]);
-
-          v[0] = _mm_add_epi32(u[0], K32One);
-          v[1] = _mm_add_epi32(u[1], K32One);
-          v[2] = _mm_add_epi32(u[2], K32One);
-          v[3] = _mm_add_epi32(u[3], K32One);
-          v[4] = _mm_add_epi32(u[4], K32One);
-          v[5] = _mm_add_epi32(u[5], K32One);
-          v[6] = _mm_add_epi32(u[6], K32One);
-          v[7] = _mm_add_epi32(u[7], K32One);
-          v[8] = _mm_add_epi32(u[8], K32One);
-          v[9] = _mm_add_epi32(u[9], K32One);
-          v[10] = _mm_add_epi32(u[10], K32One);
-          v[11] = _mm_add_epi32(u[11], K32One);
-          v[12] = _mm_add_epi32(u[12], K32One);
-          v[13] = _mm_add_epi32(u[13], K32One);
-          v[14] = _mm_add_epi32(u[14], K32One);
-          v[15] = _mm_add_epi32(u[15], K32One);
-
-          u[0] = _mm_srai_epi32(v[0], 2);
-          u[1] = _mm_srai_epi32(v[1], 2);
-          u[2] = _mm_srai_epi32(v[2], 2);
-          u[3] = _mm_srai_epi32(v[3], 2);
-          u[4] = _mm_srai_epi32(v[4], 2);
-          u[5] = _mm_srai_epi32(v[5], 2);
-          u[6] = _mm_srai_epi32(v[6], 2);
-          u[7] = _mm_srai_epi32(v[7], 2);
-          u[8] = _mm_srai_epi32(v[8], 2);
-          u[9] = _mm_srai_epi32(v[9], 2);
-          u[10] = _mm_srai_epi32(v[10], 2);
-          u[11] = _mm_srai_epi32(v[11], 2);
-          u[12] = _mm_srai_epi32(v[12], 2);
-          u[13] = _mm_srai_epi32(v[13], 2);
-          u[14] = _mm_srai_epi32(v[14], 2);
-          u[15] = _mm_srai_epi32(v[15], 2);
-
-          out[5] = _mm_packs_epi32(u[0], u[1]);
-          out[21] = _mm_packs_epi32(u[2], u[3]);
-          out[13] = _mm_packs_epi32(u[4], u[5]);
-          out[29] = _mm_packs_epi32(u[6], u[7]);
-          out[3] = _mm_packs_epi32(u[8], u[9]);
-          out[19] = _mm_packs_epi32(u[10], u[11]);
-          out[11] = _mm_packs_epi32(u[12], u[13]);
-          out[27] = _mm_packs_epi32(u[14], u[15]);
-#if DCT_HIGH_BIT_DEPTH
-          overflow =
-              check_epi16_overflow_x8(&out[5], &out[21], &out[13], &out[29],
-                                      &out[3], &out[19], &out[11], &out[27]);
-          if (overflow) {
-            HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
-            return;
-          }
-#endif  // DCT_HIGH_BIT_DEPTH
-        }
-      }
-#endif  // FDCT32x32_HIGH_PRECISION
-      // Transpose the results, do it as four 8x8 transposes.
-      {
-        int transpose_block;
-        int16_t *output0 = &intermediate[column_start * 32];
-        tran_low_t *output1 = &output_org[column_start * 32];
-        for (transpose_block = 0; transpose_block < 4; ++transpose_block) {
-          __m128i *this_out = &out[8 * transpose_block];
-          // 00 01 02 03 04 05 06 07
-          // 10 11 12 13 14 15 16 17
-          // 20 21 22 23 24 25 26 27
-          // 30 31 32 33 34 35 36 37
-          // 40 41 42 43 44 45 46 47
-          // 50 51 52 53 54 55 56 57
-          // 60 61 62 63 64 65 66 67
-          // 70 71 72 73 74 75 76 77
-          const __m128i tr0_0 = _mm_unpacklo_epi16(this_out[0], this_out[1]);
-          const __m128i tr0_1 = _mm_unpacklo_epi16(this_out[2], this_out[3]);
-          const __m128i tr0_2 = _mm_unpackhi_epi16(this_out[0], this_out[1]);
-          const __m128i tr0_3 = _mm_unpackhi_epi16(this_out[2], this_out[3]);
-          const __m128i tr0_4 = _mm_unpacklo_epi16(this_out[4], this_out[5]);
-          const __m128i tr0_5 = _mm_unpacklo_epi16(this_out[6], this_out[7]);
-          const __m128i tr0_6 = _mm_unpackhi_epi16(this_out[4], this_out[5]);
-          const __m128i tr0_7 = _mm_unpackhi_epi16(this_out[6], this_out[7]);
-          // 00 10 01 11 02 12 03 13
-          // 20 30 21 31 22 32 23 33
-          // 04 14 05 15 06 16 07 17
-          // 24 34 25 35 26 36 27 37
-          // 40 50 41 51 42 52 43 53
-          // 60 70 61 71 62 72 63 73
-          // 44 54 45 55 46 56 47 57
-          // 64 74 65 75 66 76 67 77
-          const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
-          const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3);
-          const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
-          const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3);
-          const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5);
-          const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);
-          const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5);
-          const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);
-          // 00 10 20 30 01 11 21 31
-          // 40 50 60 70 41 51 61 71
-          // 02 12 22 32 03 13 23 33
-          // 42 52 62 72 43 53 63 73
-          // 04 14 24 34 05 15 25 35
-          // 44 54 64 74 45 55 65 75
-          // 06 16 26 36 07 17 27 37
-          // 46 56 66 76 47 57 67 77
-          __m128i tr2_0 = _mm_unpacklo_epi64(tr1_0, tr1_4);
-          __m128i tr2_1 = _mm_unpackhi_epi64(tr1_0, tr1_4);
-          __m128i tr2_2 = _mm_unpacklo_epi64(tr1_2, tr1_6);
-          __m128i tr2_3 = _mm_unpackhi_epi64(tr1_2, tr1_6);
-          __m128i tr2_4 = _mm_unpacklo_epi64(tr1_1, tr1_5);
-          __m128i tr2_5 = _mm_unpackhi_epi64(tr1_1, tr1_5);
-          __m128i tr2_6 = _mm_unpacklo_epi64(tr1_3, tr1_7);
-          __m128i tr2_7 = _mm_unpackhi_epi64(tr1_3, tr1_7);
-          // 00 10 20 30 40 50 60 70
-          // 01 11 21 31 41 51 61 71
-          // 02 12 22 32 42 52 62 72
-          // 03 13 23 33 43 53 63 73
-          // 04 14 24 34 44 54 64 74
-          // 05 15 25 35 45 55 65 75
-          // 06 16 26 36 46 56 66 76
-          // 07 17 27 37 47 57 67 77
-          if (0 == pass) {
-            // output[j] = (output[j] + 1 + (output[j] > 0)) >> 2;
-            // TODO(cd): see quality impact of only doing
-            //           output[j] = (output[j] + 1) >> 2;
-            //           which would remove the code between here ...
-            __m128i tr2_0_0 = _mm_cmpgt_epi16(tr2_0, kZero);
-            __m128i tr2_1_0 = _mm_cmpgt_epi16(tr2_1, kZero);
-            __m128i tr2_2_0 = _mm_cmpgt_epi16(tr2_2, kZero);
-            __m128i tr2_3_0 = _mm_cmpgt_epi16(tr2_3, kZero);
-            __m128i tr2_4_0 = _mm_cmpgt_epi16(tr2_4, kZero);
-            __m128i tr2_5_0 = _mm_cmpgt_epi16(tr2_5, kZero);
-            __m128i tr2_6_0 = _mm_cmpgt_epi16(tr2_6, kZero);
-            __m128i tr2_7_0 = _mm_cmpgt_epi16(tr2_7, kZero);
-            tr2_0 = _mm_sub_epi16(tr2_0, tr2_0_0);
-            tr2_1 = _mm_sub_epi16(tr2_1, tr2_1_0);
-            tr2_2 = _mm_sub_epi16(tr2_2, tr2_2_0);
-            tr2_3 = _mm_sub_epi16(tr2_3, tr2_3_0);
-            tr2_4 = _mm_sub_epi16(tr2_4, tr2_4_0);
-            tr2_5 = _mm_sub_epi16(tr2_5, tr2_5_0);
-            tr2_6 = _mm_sub_epi16(tr2_6, tr2_6_0);
-            tr2_7 = _mm_sub_epi16(tr2_7, tr2_7_0);
-            //           ... and here.
-            //           PS: also change code in av1/encoder/dct.c
-            tr2_0 = _mm_add_epi16(tr2_0, kOne);
-            tr2_1 = _mm_add_epi16(tr2_1, kOne);
-            tr2_2 = _mm_add_epi16(tr2_2, kOne);
-            tr2_3 = _mm_add_epi16(tr2_3, kOne);
-            tr2_4 = _mm_add_epi16(tr2_4, kOne);
-            tr2_5 = _mm_add_epi16(tr2_5, kOne);
-            tr2_6 = _mm_add_epi16(tr2_6, kOne);
-            tr2_7 = _mm_add_epi16(tr2_7, kOne);
-            tr2_0 = _mm_srai_epi16(tr2_0, 2);
-            tr2_1 = _mm_srai_epi16(tr2_1, 2);
-            tr2_2 = _mm_srai_epi16(tr2_2, 2);
-            tr2_3 = _mm_srai_epi16(tr2_3, 2);
-            tr2_4 = _mm_srai_epi16(tr2_4, 2);
-            tr2_5 = _mm_srai_epi16(tr2_5, 2);
-            tr2_6 = _mm_srai_epi16(tr2_6, 2);
-            tr2_7 = _mm_srai_epi16(tr2_7, 2);
-          }
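/*
 * Sketch (editorial annotation, not part of the original patch): the
 * cmpgt/sub pair above is the branch-free form of the rounding quoted in the
 * TODO comment. _mm_cmpgt_epi16 writes -1 in every lane where the value is
 * positive, so subtracting that mask adds 1 exactly in those lanes:
 */
#include <stdint.h>
static int16_t round2_sketch(int16_t x) {
  const int16_t mask = (int16_t)-(x > 0);  /* -1 if x > 0, else 0 (the mask) */
  x = (int16_t)(x - mask + 1);             /* x + (x > 0) + 1 */
  return (int16_t)(x >> 2);                /* _mm_srai_epi16(x, 2) */
}
/* The 32-bit high-precision blocks earlier use the mirrored form with
 * _mm_cmplt_epi32, computing (x + (x < 0) + 1) >> 2 the same way. */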
-          // Note: even though all these stores are aligned, using the aligned
-          //       intrinsic makes the code slightly slower.
-          if (pass == 0) {
-            _mm_storeu_si128((__m128i *)(output0 + 0 * 32), tr2_0);
-            _mm_storeu_si128((__m128i *)(output0 + 1 * 32), tr2_1);
-            _mm_storeu_si128((__m128i *)(output0 + 2 * 32), tr2_2);
-            _mm_storeu_si128((__m128i *)(output0 + 3 * 32), tr2_3);
-            _mm_storeu_si128((__m128i *)(output0 + 4 * 32), tr2_4);
-            _mm_storeu_si128((__m128i *)(output0 + 5 * 32), tr2_5);
-            _mm_storeu_si128((__m128i *)(output0 + 6 * 32), tr2_6);
-            _mm_storeu_si128((__m128i *)(output0 + 7 * 32), tr2_7);
-            // Process next 8x8
-            output0 += 8;
-          } else {
-            storeu_output(&tr2_0, (output1 + 0 * 32));
-            storeu_output(&tr2_1, (output1 + 1 * 32));
-            storeu_output(&tr2_2, (output1 + 2 * 32));
-            storeu_output(&tr2_3, (output1 + 3 * 32));
-            storeu_output(&tr2_4, (output1 + 4 * 32));
-            storeu_output(&tr2_5, (output1 + 5 * 32));
-            storeu_output(&tr2_6, (output1 + 6 * 32));
-            storeu_output(&tr2_7, (output1 + 7 * 32));
-            // Process next 8x8
-            output1 += 8;
-          }
-        }
-      }
-    }
-  }
-}  // NOLINT
-
-#undef ADD_EPI16
-#undef SUB_EPI16
-#undef HIGH_FDCT32x32_2D_C
-#undef HIGH_FDCT32x32_2D_ROWS_C
diff --git a/av1/common/x86/av1_fwd_txfm1d_sse4.c b/av1/common/x86/av1_fwd_txfm1d_sse4.c
index f0bcef9..c09a019 100644
--- a/av1/common/x86/av1_fwd_txfm1d_sse4.c
+++ b/av1/common/x86/av1_fwd_txfm1d_sse4.c
@@ -1,354 +1,5 @@
 #include "av1/common/x86/av1_txfm1d_sse4.h"
 
-void av1_fdct4_new_sse4_1(const __m128i *input, __m128i *output,
-                          const int8_t *cos_bit, const int8_t *stage_range) {
-  const int txfm_size = 4;
-  const int num_per_128 = 4;
-  const int32_t *cospi;
-  __m128i buf0[4];
-  __m128i buf1[4];
-  int col_num = txfm_size / num_per_128;
-  int bit;
-  int col;
-  (void)stage_range;
-  for (col = 0; col < col_num; col++) {
-    // stage 0
-    int32_t stage_idx = 0;
-    buf0[0] = input[0 * col_num + col];
-    buf0[1] = input[1 * col_num + col];
-    buf0[2] = input[2 * col_num + col];
-    buf0[3] = input[3 * col_num + col];
-
-    // stage 1
-    stage_idx++;
-    bit = cos_bit[stage_idx];
-    cospi = cospi_arr[bit - cos_bit_min];
-    buf1[0] = _mm_add_epi32(buf0[0], buf0[3]);
-    buf1[3] = _mm_sub_epi32(buf0[0], buf0[3]);
-    buf1[1] = _mm_add_epi32(buf0[1], buf0[2]);
-    buf1[2] = _mm_sub_epi32(buf0[1], buf0[2]);
-
-    // stage 2
-    stage_idx++;
-    bit = cos_bit[stage_idx];
-    cospi = cospi_arr[bit - cos_bit_min];
-    btf_32_sse4_1_type0(cospi[32], cospi[32], buf1[0], buf1[1], buf0[0],
-                        buf0[1], bit);
-    btf_32_sse4_1_type1(cospi[48], cospi[16], buf1[2], buf1[3], buf0[2],
-                        buf0[3], bit);
-
-    // stage 3
-    stage_idx++;
-    bit = cos_bit[stage_idx];
-    cospi = cospi_arr[bit - cos_bit_min];
-    buf1[0] = buf0[0];
-    buf1[1] = buf0[2];
-    buf1[2] = buf0[1];
-    buf1[3] = buf0[3];
-
-    output[0 * col_num + col] = buf1[0];
-    output[1 * col_num + col] = buf1[1];
-    output[2 * col_num + col] = buf1[2];
-    output[3 * col_num + col] = buf1[3];
-  }
-}
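/*
 * Sketch (editorial annotation, not part of the original patch):
 * av1_fdct4_new_sse4_1 above runs the scalar 4-point forward DCT on four
 * columns at once. Assuming the btf_32_sse4_1 helpers match the reference
 * half_btf(w0, a0, w1, a1, bit) == round_shift(w0*a0 + w1*a1, bit)
 * convention of av1_fwd_txfm1d.c (an assumption; the authoritative
 * definitions live in av1_txfm1d_sse4.h), one column reduces to:
 */
#include <stdint.h>
static void fdct4_sketch(const int32_t in[4], int32_t out[4],
                         const int32_t *cospi, int bit) {
  const int64_t rnd = (int64_t)1 << (bit - 1);
  /* stage 1: add/sub butterflies */
  const int32_t s0 = in[0] + in[3], s3 = in[0] - in[3];
  const int32_t s1 = in[1] + in[2], s2 = in[1] - in[2];
  /* stage 2: rotations; stage 3 only reorders outputs, folded in here */
  out[0] = (int32_t)(((int64_t)cospi[32] * (s0 + s1) + rnd) >> bit);
  out[2] = (int32_t)(((int64_t)cospi[32] * (s0 - s1) + rnd) >> bit);
  out[1] = (int32_t)(((int64_t)cospi[48] * s2 + (int64_t)cospi[16] * s3 + rnd) >> bit);
  out[3] = (int32_t)(((int64_t)cospi[48] * s3 - (int64_t)cospi[16] * s2 + rnd) >> bit);
}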
-
-void av1_fdct8_new_sse4_1(const __m128i *input, __m128i *output,
-                          const int8_t *cos_bit, const int8_t *stage_range) {
-  const int txfm_size = 8;
-  const int num_per_128 = 4;
-  const int32_t *cospi;
-  __m128i buf0[8];
-  __m128i buf1[8];
-  int col_num = txfm_size / num_per_128;
-  int bit;
-  int col;
-  (void)stage_range;
-  for (col = 0; col < col_num; col++) {
-    // stage 0
-    int32_t stage_idx = 0;
-    buf0[0] = input[0 * col_num + col];
-    buf0[1] = input[1 * col_num + col];
-    buf0[2] = input[2 * col_num + col];
-    buf0[3] = input[3 * col_num + col];
-    buf0[4] = input[4 * col_num + col];
-    buf0[5] = input[5 * col_num + col];
-    buf0[6] = input[6 * col_num + col];
-    buf0[7] = input[7 * col_num + col];
-
-    // stage 1
-    stage_idx++;
-    bit = cos_bit[stage_idx];
-    cospi = cospi_arr[bit - cos_bit_min];
-    buf1[0] = _mm_add_epi32(buf0[0], buf0[7]);
-    buf1[7] = _mm_sub_epi32(buf0[0], buf0[7]);
-    buf1[1] = _mm_add_epi32(buf0[1], buf0[6]);
-    buf1[6] = _mm_sub_epi32(buf0[1], buf0[6]);
-    buf1[2] = _mm_add_epi32(buf0[2], buf0[5]);
-    buf1[5] = _mm_sub_epi32(buf0[2], buf0[5]);
-    buf1[3] = _mm_add_epi32(buf0[3], buf0[4]);
-    buf1[4] = _mm_sub_epi32(buf0[3], buf0[4]);
-
-    // stage 2
-    stage_idx++;
-    bit = cos_bit[stage_idx];
-    cospi = cospi_arr[bit - cos_bit_min];
-    buf0[0] = _mm_add_epi32(buf1[0], buf1[3]);
-    buf0[3] = _mm_sub_epi32(buf1[0], buf1[3]);
-    buf0[1] = _mm_add_epi32(buf1[1], buf1[2]);
-    buf0[2] = _mm_sub_epi32(buf1[1], buf1[2]);
-    buf0[4] = buf1[4];
-    btf_32_sse4_1_type0(-cospi[32], cospi[32], buf1[5], buf1[6], buf0[5],
-                        buf0[6], bit);
-    buf0[7] = buf1[7];
-
-    // stage 3
-    stage_idx++;
-    bit = cos_bit[stage_idx];
-    cospi = cospi_arr[bit - cos_bit_min];
-    btf_32_sse4_1_type0(cospi[32], cospi[32], buf0[0], buf0[1], buf1[0],
-                        buf1[1], bit);
-    btf_32_sse4_1_type1(cospi[48], cospi[16], buf0[2], buf0[3], buf1[2],
-                        buf1[3], bit);
-    buf1[4] = _mm_add_epi32(buf0[4], buf0[5]);
-    buf1[5] = _mm_sub_epi32(buf0[4], buf0[5]);
-    buf1[6] = _mm_sub_epi32(buf0[7], buf0[6]);
-    buf1[7] = _mm_add_epi32(buf0[7], buf0[6]);
-
-    // stage 4
-    stage_idx++;
-    bit = cos_bit[stage_idx];
-    cospi = cospi_arr[bit - cos_bit_min];
-    buf0[0] = buf1[0];
-    buf0[1] = buf1[1];
-    buf0[2] = buf1[2];
-    buf0[3] = buf1[3];
-    btf_32_sse4_1_type1(cospi[56], cospi[8], buf1[4], buf1[7], buf0[4], buf0[7],
-                        bit);
-    btf_32_sse4_1_type1(cospi[24], cospi[40], buf1[5], buf1[6], buf0[5],
-                        buf0[6], bit);
-
-    // stage 5
-    stage_idx++;
-    bit = cos_bit[stage_idx];
-    cospi = cospi_arr[bit - cos_bit_min];
-    buf1[0] = buf0[0];
-    buf1[1] = buf0[4];
-    buf1[2] = buf0[2];
-    buf1[3] = buf0[6];
-    buf1[4] = buf0[1];
-    buf1[5] = buf0[5];
-    buf1[6] = buf0[3];
-    buf1[7] = buf0[7];
-
-    output[0 * col_num + col] = buf1[0];
-    output[1 * col_num + col] = buf1[1];
-    output[2 * col_num + col] = buf1[2];
-    output[3 * col_num + col] = buf1[3];
-    output[4 * col_num + col] = buf1[4];
-    output[5 * col_num + col] = buf1[5];
-    output[6 * col_num + col] = buf1[6];
-    output[7 * col_num + col] = buf1[7];
-  }
-}
-
-void av1_fdct16_new_sse4_1(const __m128i *input, __m128i *output,
-                           const int8_t *cos_bit, const int8_t *stage_range) {
-  const int txfm_size = 16;
-  const int num_per_128 = 4;
-  const int32_t *cospi;
-  __m128i buf0[16];
-  __m128i buf1[16];
-  int col_num = txfm_size / num_per_128;
-  int bit;
-  int col;
-  (void)stage_range;
-  for (col = 0; col < col_num; col++) {
-    // stage 0
-    int32_t stage_idx = 0;
-    buf0[0] = input[0 * col_num + col];
-    buf0[1] = input[1 * col_num + col];
-    buf0[2] = input[2 * col_num + col];
-    buf0[3] = input[3 * col_num + col];
-    buf0[4] = input[4 * col_num + col];
-    buf0[5] = input[5 * col_num + col];
-    buf0[6] = input[6 * col_num + col];
-    buf0[7] = input[7 * col_num + col];
-    buf0[8] = input[8 * col_num + col];
-    buf0[9] = input[9 * col_num + col];
-    buf0[10] = input[10 * col_num + col];
-    buf0[11] = input[11 * col_num + col];
-    buf0[12] = input[12 * col_num + col];
-    buf0[13] = input[13 * col_num + col];
-    buf0[14] = input[14 * col_num + col];
-    buf0[15] = input[15 * col_num + col];
-
-    // stage 1
-    stage_idx++;
-    bit = cos_bit[stage_idx];
-    cospi = cospi_arr[bit - cos_bit_min];
-    buf1[0] = _mm_add_epi32(buf0[0], buf0[15]);
-    buf1[15] = _mm_sub_epi32(buf0[0], buf0[15]);
-    buf1[1] = _mm_add_epi32(buf0[1], buf0[14]);
-    buf1[14] = _mm_sub_epi32(buf0[1], buf0[14]);
-    buf1[2] = _mm_add_epi32(buf0[2], buf0[13]);
-    buf1[13] = _mm_sub_epi32(buf0[2], buf0[13]);
-    buf1[3] = _mm_add_epi32(buf0[3], buf0[12]);
-    buf1[12] = _mm_sub_epi32(buf0[3], buf0[12]);
-    buf1[4] = _mm_add_epi32(buf0[4], buf0[11]);
-    buf1[11] = _mm_sub_epi32(buf0[4], buf0[11]);
-    buf1[5] = _mm_add_epi32(buf0[5], buf0[10]);
-    buf1[10] = _mm_sub_epi32(buf0[5], buf0[10]);
-    buf1[6] = _mm_add_epi32(buf0[6], buf0[9]);
-    buf1[9] = _mm_sub_epi32(buf0[6], buf0[9]);
-    buf1[7] = _mm_add_epi32(buf0[7], buf0[8]);
-    buf1[8] = _mm_sub_epi32(buf0[7], buf0[8]);
-
-    // stage 2
-    stage_idx++;
-    bit = cos_bit[stage_idx];
-    cospi = cospi_arr[bit - cos_bit_min];
-    buf0[0] = _mm_add_epi32(buf1[0], buf1[7]);
-    buf0[7] = _mm_sub_epi32(buf1[0], buf1[7]);
-    buf0[1] = _mm_add_epi32(buf1[1], buf1[6]);
-    buf0[6] = _mm_sub_epi32(buf1[1], buf1[6]);
-    buf0[2] = _mm_add_epi32(buf1[2], buf1[5]);
-    buf0[5] = _mm_sub_epi32(buf1[2], buf1[5]);
-    buf0[3] = _mm_add_epi32(buf1[3], buf1[4]);
-    buf0[4] = _mm_sub_epi32(buf1[3], buf1[4]);
-    buf0[8] = buf1[8];
-    buf0[9] = buf1[9];
-    btf_32_sse4_1_type0(-cospi[32], cospi[32], buf1[10], buf1[13], buf0[10],
-                        buf0[13], bit);
-    btf_32_sse4_1_type0(-cospi[32], cospi[32], buf1[11], buf1[12], buf0[11],
-                        buf0[12], bit);
-    buf0[14] = buf1[14];
-    buf0[15] = buf1[15];
-
-    // stage 3
-    stage_idx++;
-    bit = cos_bit[stage_idx];
-    cospi = cospi_arr[bit - cos_bit_min];
-    buf1[0] = _mm_add_epi32(buf0[0], buf0[3]);
-    buf1[3] = _mm_sub_epi32(buf0[0], buf0[3]);
-    buf1[1] = _mm_add_epi32(buf0[1], buf0[2]);
-    buf1[2] = _mm_sub_epi32(buf0[1], buf0[2]);
-    buf1[4] = buf0[4];
-    btf_32_sse4_1_type0(-cospi[32], cospi[32], buf0[5], buf0[6], buf1[5],
-                        buf1[6], bit);
-    buf1[7] = buf0[7];
-    buf1[8] = _mm_add_epi32(buf0[8], buf0[11]);
-    buf1[11] = _mm_sub_epi32(buf0[8], buf0[11]);
-    buf1[9] = _mm_add_epi32(buf0[9], buf0[10]);
-    buf1[10] = _mm_sub_epi32(buf0[9], buf0[10]);
-    buf1[12] = _mm_sub_epi32(buf0[15], buf0[12]);
-    buf1[15] = _mm_add_epi32(buf0[15], buf0[12]);
-    buf1[13] = _mm_sub_epi32(buf0[14], buf0[13]);
-    buf1[14] = _mm_add_epi32(buf0[14], buf0[13]);
-
-    // stage 4
-    stage_idx++;
-    bit = cos_bit[stage_idx];
-    cospi = cospi_arr[bit - cos_bit_min];
-    btf_32_sse4_1_type0(cospi[32], cospi[32], buf1[0], buf1[1], buf0[0],
-                        buf0[1], bit);
-    btf_32_sse4_1_type1(cospi[48], cospi[16], buf1[2], buf1[3], buf0[2],
-                        buf0[3], bit);
-    buf0[4] = _mm_add_epi32(buf1[4], buf1[5]);
-    buf0[5] = _mm_sub_epi32(buf1[4], buf1[5]);
-    buf0[6] = _mm_sub_epi32(buf1[7], buf1[6]);
-    buf0[7] = _mm_add_epi32(buf1[7], buf1[6]);
-    buf0[8] = buf1[8];
-    btf_32_sse4_1_type0(-cospi[16], cospi[48], buf1[9], buf1[14], buf0[9],
-                        buf0[14], bit);
-    btf_32_sse4_1_type0(-cospi[48], -cospi[16], buf1[10], buf1[13], buf0[10],
-                        buf0[13], bit);
-    buf0[11] = buf1[11];
-    buf0[12] = buf1[12];
-    buf0[15] = buf1[15];
-
-    // stage 5
-    stage_idx++;
-    bit = cos_bit[stage_idx];
-    cospi = cospi_arr[bit - cos_bit_min];
-    buf1[0] = buf0[0];
-    buf1[1] = buf0[1];
-    buf1[2] = buf0[2];
-    buf1[3] = buf0[3];
-    btf_32_sse4_1_type1(cospi[56], cospi[8], buf0[4], buf0[7], buf1[4], buf1[7],
-                        bit);
-    btf_32_sse4_1_type1(cospi[24], cospi[40], buf0[5], buf0[6], buf1[5],
-                        buf1[6], bit);
-    buf1[8] = _mm_add_epi32(buf0[8], buf0[9]);
-    buf1[9] = _mm_sub_epi32(buf0[8], buf0[9]);
-    buf1[10] = _mm_sub_epi32(buf0[11], buf0[10]);
-    buf1[11] = _mm_add_epi32(buf0[11], buf0[10]);
-    buf1[12] = _mm_add_epi32(buf0[12], buf0[13]);
-    buf1[13] = _mm_sub_epi32(buf0[12], buf0[13]);
-    buf1[14] = _mm_sub_epi32(buf0[15], buf0[14]);
-    buf1[15] = _mm_add_epi32(buf0[15], buf0[14]);
-
-    // stage 6
-    stage_idx++;
-    bit = cos_bit[stage_idx];
-    cospi = cospi_arr[bit - cos_bit_min];
-    buf0[0] = buf1[0];
-    buf0[1] = buf1[1];
-    buf0[2] = buf1[2];
-    buf0[3] = buf1[3];
-    buf0[4] = buf1[4];
-    buf0[5] = buf1[5];
-    buf0[6] = buf1[6];
-    buf0[7] = buf1[7];
-    btf_32_sse4_1_type1(cospi[60], cospi[4], buf1[8], buf1[15], buf0[8],
-                        buf0[15], bit);
-    btf_32_sse4_1_type1(cospi[28], cospi[36], buf1[9], buf1[14], buf0[9],
-                        buf0[14], bit);
-    btf_32_sse4_1_type1(cospi[44], cospi[20], buf1[10], buf1[13], buf0[10],
-                        buf0[13], bit);
-    btf_32_sse4_1_type1(cospi[12], cospi[52], buf1[11], buf1[12], buf0[11],
-                        buf0[12], bit);
-
-    // stage 7
-    stage_idx++;
-    bit = cos_bit[stage_idx];
-    cospi = cospi_arr[bit - cos_bit_min];
-    buf1[0] = buf0[0];
-    buf1[1] = buf0[8];
-    buf1[2] = buf0[4];
-    buf1[3] = buf0[12];
-    buf1[4] = buf0[2];
-    buf1[5] = buf0[10];
-    buf1[6] = buf0[6];
-    buf1[7] = buf0[14];
-    buf1[8] = buf0[1];
-    buf1[9] = buf0[9];
-    buf1[10] = buf0[5];
-    buf1[11] = buf0[13];
-    buf1[12] = buf0[3];
-    buf1[13] = buf0[11];
-    buf1[14] = buf0[7];
-    buf1[15] = buf0[15];
-
-    output[0 * col_num + col] = buf1[0];
-    output[1 * col_num + col] = buf1[1];
-    output[2 * col_num + col] = buf1[2];
-    output[3 * col_num + col] = buf1[3];
-    output[4 * col_num + col] = buf1[4];
-    output[5 * col_num + col] = buf1[5];
-    output[6 * col_num + col] = buf1[6];
-    output[7 * col_num + col] = buf1[7];
-    output[8 * col_num + col] = buf1[8];
-    output[9 * col_num + col] = buf1[9];
-    output[10 * col_num + col] = buf1[10];
-    output[11 * col_num + col] = buf1[11];
-    output[12 * col_num + col] = buf1[12];
-    output[13 * col_num + col] = buf1[13];
-    output[14 * col_num + col] = buf1[14];
-    output[15 * col_num + col] = buf1[15];
-  }
-}
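/*
 * Sketch (editorial annotation, not part of the original patch): stage 7 of
 * av1_fdct16_new_sse4_1 above is a pure permutation, re-sorting the
 * butterfly results (which come out in bit-reversed order) into natural
 * frequency order. The index map read directly off the code:
 */
static const int kFdct16OutOrder[16] = { 0, 8, 4, 12, 2, 10, 6, 14,
                                         1, 9, 5, 13, 3, 11, 7, 15 };
/* i.e. output[i] = stage6_result[kFdct16OutOrder[i]], where
 * kFdct16OutOrder[i] is the 4-bit bit-reversal of i. */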
-
 void av1_fdct32_new_sse4_1(const __m128i *input, __m128i *output,
                            const int8_t *cos_bit, const int8_t *stage_range) {
   const int txfm_size = 32;
@@ -835,370 +486,6 @@
   }
 }
 
-void av1_fadst8_new_sse4_1(const __m128i *input, __m128i *output,
-                           const int8_t *cos_bit, const int8_t *stage_range) {
-  const int txfm_size = 8;
-  const int num_per_128 = 4;
-  const int32_t *cospi;
-  __m128i buf0[8];
-  __m128i buf1[8];
-  int col_num = txfm_size / num_per_128;
-  int bit;
-  int col;
-  (void)stage_range;
-  for (col = 0; col < col_num; col++) {
-    // stage 0
-    int32_t stage_idx = 0;
-    buf0[0] = input[0 * col_num + col];
-    buf0[1] = input[1 * col_num + col];
-    buf0[2] = input[2 * col_num + col];
-    buf0[3] = input[3 * col_num + col];
-    buf0[4] = input[4 * col_num + col];
-    buf0[5] = input[5 * col_num + col];
-    buf0[6] = input[6 * col_num + col];
-    buf0[7] = input[7 * col_num + col];
-
-    // stage 1
-    stage_idx++;
-    bit = cos_bit[stage_idx];
-    cospi = cospi_arr[bit - cos_bit_min];
-    buf1[0] = buf0[7];
-    buf1[1] = buf0[0];
-    buf1[2] = buf0[5];
-    buf1[3] = buf0[2];
-    buf1[4] = buf0[3];
-    buf1[5] = buf0[4];
-    buf1[6] = buf0[1];
-    buf1[7] = buf0[6];
-
-    // stage 2
-    stage_idx++;
-    bit = cos_bit[stage_idx];
-    cospi = cospi_arr[bit - cos_bit_min];
-    btf_32_sse4_1_type0(cospi[4], cospi[60], buf1[0], buf1[1], buf0[0], buf0[1],
-                        bit);
-    btf_32_sse4_1_type0(cospi[20], cospi[44], buf1[2], buf1[3], buf0[2],
-                        buf0[3], bit);
-    btf_32_sse4_1_type0(cospi[36], cospi[28], buf1[4], buf1[5], buf0[4],
-                        buf0[5], bit);
-    btf_32_sse4_1_type0(cospi[52], cospi[12], buf1[6], buf1[7], buf0[6],
-                        buf0[7], bit);
-
-    // stage 3
-    stage_idx++;
-    bit = cos_bit[stage_idx];
-    cospi = cospi_arr[bit - cos_bit_min];
-    buf1[0] = _mm_add_epi32(buf0[0], buf0[4]);
-    buf1[4] = _mm_sub_epi32(buf0[0], buf0[4]);
-    buf1[1] = _mm_add_epi32(buf0[1], buf0[5]);
-    buf1[5] = _mm_sub_epi32(buf0[1], buf0[5]);
-    buf1[2] = _mm_add_epi32(buf0[2], buf0[6]);
-    buf1[6] = _mm_sub_epi32(buf0[2], buf0[6]);
-    buf1[3] = _mm_add_epi32(buf0[3], buf0[7]);
-    buf1[7] = _mm_sub_epi32(buf0[3], buf0[7]);
-
-    // stage 4
-    stage_idx++;
-    bit = cos_bit[stage_idx];
-    cospi = cospi_arr[bit - cos_bit_min];
-    buf0[0] = buf1[0];
-    buf0[1] = buf1[1];
-    buf0[2] = buf1[2];
-    buf0[3] = buf1[3];
-    btf_32_sse4_1_type0(cospi[16], cospi[48], buf1[4], buf1[5], buf0[4],
-                        buf0[5], bit);
-    btf_32_sse4_1_type0(-cospi[48], cospi[16], buf1[6], buf1[7], buf0[6],
-                        buf0[7], bit);
-
-    // stage 5
-    stage_idx++;
-    bit = cos_bit[stage_idx];
-    cospi = cospi_arr[bit - cos_bit_min];
-    buf1[0] = _mm_add_epi32(buf0[0], buf0[2]);
-    buf1[2] = _mm_sub_epi32(buf0[0], buf0[2]);
-    buf1[1] = _mm_add_epi32(buf0[1], buf0[3]);
-    buf1[3] = _mm_sub_epi32(buf0[1], buf0[3]);
-    buf1[4] = _mm_add_epi32(buf0[4], buf0[6]);
-    buf1[6] = _mm_sub_epi32(buf0[4], buf0[6]);
-    buf1[5] = _mm_add_epi32(buf0[5], buf0[7]);
-    buf1[7] = _mm_sub_epi32(buf0[5], buf0[7]);
-
-    // stage 6
-    stage_idx++;
-    bit = cos_bit[stage_idx];
-    cospi = cospi_arr[bit - cos_bit_min];
-    buf0[0] = buf1[0];
-    buf0[1] = buf1[1];
-    btf_32_sse4_1_type0(cospi[32], cospi[32], buf1[2], buf1[3], buf0[2],
-                        buf0[3], bit);
-    buf0[4] = buf1[4];
-    buf0[5] = buf1[5];
-    btf_32_sse4_1_type0(cospi[32], cospi[32], buf1[6], buf1[7], buf0[6],
-                        buf0[7], bit);
-
-    // stage 7
-    stage_idx++;
-    bit = cos_bit[stage_idx];
-    cospi = cospi_arr[bit - cos_bit_min];
-    buf1[0] = buf0[0];
-    buf1[1] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[4]);
-    buf1[2] = buf0[6];
-    buf1[3] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[2]);
-    buf1[4] = buf0[3];
-    buf1[5] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[7]);
-    buf1[6] = buf0[5];
-    buf1[7] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[1]);
-
-    output[0 * col_num + col] = buf1[0];
-    output[1 * col_num + col] = buf1[1];
-    output[2 * col_num + col] = buf1[2];
-    output[3 * col_num + col] = buf1[3];
-    output[4 * col_num + col] = buf1[4];
-    output[5 * col_num + col] = buf1[5];
-    output[6 * col_num + col] = buf1[6];
-    output[7 * col_num + col] = buf1[7];
-  }
-}
-
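Each stage of these kernels re-reads its coefficients through cospi = cospi_arr[bit - cos_bit_min], so the same code runs at whatever per-stage fixed-point precision the cos_bit array requests. Assuming the standard layout, entry i of a row holds cos(i*pi/64) scaled to `bit` fractional bits; a sketch that would regenerate one such row (the real table is precomputed):

    #include <math.h>
    #include <stdint.h>

    #ifndef M_PI
    #define M_PI 3.14159265358979323846
    #endif

    /* Regenerate one row of the cosine table: entry i is assumed to hold
     * round(cos(i * pi / 64) * 2^bit), i.e. a cosine with `bit` fractional
     * bits; cospi_arr[bit - cos_bit_min] is assumed to be exactly this row. */
    static void fill_cospi_row(int32_t row[64], int bit) {
      int i;
      for (i = 0; i < 64; ++i)
        row[i] = (int32_t)lround(cos(i * M_PI / 64.0) * (double)(1 << bit));
    }
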
-void av1_fadst16_new_sse4_1(const __m128i *input, __m128i *output,
-                            const int8_t *cos_bit, const int8_t *stage_range) {
-  const int txfm_size = 16;
-  const int num_per_128 = 4;
-  const int32_t *cospi;
-  __m128i buf0[16];
-  __m128i buf1[16];
-  int col_num = txfm_size / num_per_128;
-  int bit;
-  int col;
-  (void)stage_range;
-  for (col = 0; col < col_num; col++) {
-    // stage 0
-    int32_t stage_idx = 0;
-    buf0[0] = input[0 * col_num + col];
-    buf0[1] = input[1 * col_num + col];
-    buf0[2] = input[2 * col_num + col];
-    buf0[3] = input[3 * col_num + col];
-    buf0[4] = input[4 * col_num + col];
-    buf0[5] = input[5 * col_num + col];
-    buf0[6] = input[6 * col_num + col];
-    buf0[7] = input[7 * col_num + col];
-    buf0[8] = input[8 * col_num + col];
-    buf0[9] = input[9 * col_num + col];
-    buf0[10] = input[10 * col_num + col];
-    buf0[11] = input[11 * col_num + col];
-    buf0[12] = input[12 * col_num + col];
-    buf0[13] = input[13 * col_num + col];
-    buf0[14] = input[14 * col_num + col];
-    buf0[15] = input[15 * col_num + col];
-
-    // stage 1
-    stage_idx++;
-    bit = cos_bit[stage_idx];
-    cospi = cospi_arr[bit - cos_bit_min];
-    buf1[0] = buf0[15];
-    buf1[1] = buf0[0];
-    buf1[2] = buf0[13];
-    buf1[3] = buf0[2];
-    buf1[4] = buf0[11];
-    buf1[5] = buf0[4];
-    buf1[6] = buf0[9];
-    buf1[7] = buf0[6];
-    buf1[8] = buf0[7];
-    buf1[9] = buf0[8];
-    buf1[10] = buf0[5];
-    buf1[11] = buf0[10];
-    buf1[12] = buf0[3];
-    buf1[13] = buf0[12];
-    buf1[14] = buf0[1];
-    buf1[15] = buf0[14];
-
-    // stage 2
-    stage_idx++;
-    bit = cos_bit[stage_idx];
-    cospi = cospi_arr[bit - cos_bit_min];
-    btf_32_sse4_1_type0(cospi[2], cospi[62], buf1[0], buf1[1], buf0[0], buf0[1],
-                        bit);
-    btf_32_sse4_1_type0(cospi[10], cospi[54], buf1[2], buf1[3], buf0[2],
-                        buf0[3], bit);
-    btf_32_sse4_1_type0(cospi[18], cospi[46], buf1[4], buf1[5], buf0[4],
-                        buf0[5], bit);
-    btf_32_sse4_1_type0(cospi[26], cospi[38], buf1[6], buf1[7], buf0[6],
-                        buf0[7], bit);
-    btf_32_sse4_1_type0(cospi[34], cospi[30], buf1[8], buf1[9], buf0[8],
-                        buf0[9], bit);
-    btf_32_sse4_1_type0(cospi[42], cospi[22], buf1[10], buf1[11], buf0[10],
-                        buf0[11], bit);
-    btf_32_sse4_1_type0(cospi[50], cospi[14], buf1[12], buf1[13], buf0[12],
-                        buf0[13], bit);
-    btf_32_sse4_1_type0(cospi[58], cospi[6], buf1[14], buf1[15], buf0[14],
-                        buf0[15], bit);
-
-    // stage 3
-    stage_idx++;
-    bit = cos_bit[stage_idx];
-    cospi = cospi_arr[bit - cos_bit_min];
-    buf1[0] = _mm_add_epi32(buf0[0], buf0[8]);
-    buf1[8] = _mm_sub_epi32(buf0[0], buf0[8]);
-    buf1[1] = _mm_add_epi32(buf0[1], buf0[9]);
-    buf1[9] = _mm_sub_epi32(buf0[1], buf0[9]);
-    buf1[2] = _mm_add_epi32(buf0[2], buf0[10]);
-    buf1[10] = _mm_sub_epi32(buf0[2], buf0[10]);
-    buf1[3] = _mm_add_epi32(buf0[3], buf0[11]);
-    buf1[11] = _mm_sub_epi32(buf0[3], buf0[11]);
-    buf1[4] = _mm_add_epi32(buf0[4], buf0[12]);
-    buf1[12] = _mm_sub_epi32(buf0[4], buf0[12]);
-    buf1[5] = _mm_add_epi32(buf0[5], buf0[13]);
-    buf1[13] = _mm_sub_epi32(buf0[5], buf0[13]);
-    buf1[6] = _mm_add_epi32(buf0[6], buf0[14]);
-    buf1[14] = _mm_sub_epi32(buf0[6], buf0[14]);
-    buf1[7] = _mm_add_epi32(buf0[7], buf0[15]);
-    buf1[15] = _mm_sub_epi32(buf0[7], buf0[15]);
-
-    // stage 4
-    stage_idx++;
-    bit = cos_bit[stage_idx];
-    cospi = cospi_arr[bit - cos_bit_min];
-    buf0[0] = buf1[0];
-    buf0[1] = buf1[1];
-    buf0[2] = buf1[2];
-    buf0[3] = buf1[3];
-    buf0[4] = buf1[4];
-    buf0[5] = buf1[5];
-    buf0[6] = buf1[6];
-    buf0[7] = buf1[7];
-    btf_32_sse4_1_type0(cospi[8], cospi[56], buf1[8], buf1[9], buf0[8], buf0[9],
-                        bit);
-    btf_32_sse4_1_type0(cospi[40], cospi[24], buf1[10], buf1[11], buf0[10],
-                        buf0[11], bit);
-    btf_32_sse4_1_type0(-cospi[56], cospi[8], buf1[12], buf1[13], buf0[12],
-                        buf0[13], bit);
-    btf_32_sse4_1_type0(-cospi[24], cospi[40], buf1[14], buf1[15], buf0[14],
-                        buf0[15], bit);
-
-    // stage 5
-    stage_idx++;
-    bit = cos_bit[stage_idx];
-    cospi = cospi_arr[bit - cos_bit_min];
-    buf1[0] = _mm_add_epi32(buf0[0], buf0[4]);
-    buf1[4] = _mm_sub_epi32(buf0[0], buf0[4]);
-    buf1[1] = _mm_add_epi32(buf0[1], buf0[5]);
-    buf1[5] = _mm_sub_epi32(buf0[1], buf0[5]);
-    buf1[2] = _mm_add_epi32(buf0[2], buf0[6]);
-    buf1[6] = _mm_sub_epi32(buf0[2], buf0[6]);
-    buf1[3] = _mm_add_epi32(buf0[3], buf0[7]);
-    buf1[7] = _mm_sub_epi32(buf0[3], buf0[7]);
-    buf1[8] = _mm_add_epi32(buf0[8], buf0[12]);
-    buf1[12] = _mm_sub_epi32(buf0[8], buf0[12]);
-    buf1[9] = _mm_add_epi32(buf0[9], buf0[13]);
-    buf1[13] = _mm_sub_epi32(buf0[9], buf0[13]);
-    buf1[10] = _mm_add_epi32(buf0[10], buf0[14]);
-    buf1[14] = _mm_sub_epi32(buf0[10], buf0[14]);
-    buf1[11] = _mm_add_epi32(buf0[11], buf0[15]);
-    buf1[15] = _mm_sub_epi32(buf0[11], buf0[15]);
-
-    // stage 6
-    stage_idx++;
-    bit = cos_bit[stage_idx];
-    cospi = cospi_arr[bit - cos_bit_min];
-    buf0[0] = buf1[0];
-    buf0[1] = buf1[1];
-    buf0[2] = buf1[2];
-    buf0[3] = buf1[3];
-    btf_32_sse4_1_type0(cospi[16], cospi[48], buf1[4], buf1[5], buf0[4],
-                        buf0[5], bit);
-    btf_32_sse4_1_type0(-cospi[48], cospi[16], buf1[6], buf1[7], buf0[6],
-                        buf0[7], bit);
-    buf0[8] = buf1[8];
-    buf0[9] = buf1[9];
-    buf0[10] = buf1[10];
-    buf0[11] = buf1[11];
-    btf_32_sse4_1_type0(cospi[16], cospi[48], buf1[12], buf1[13], buf0[12],
-                        buf0[13], bit);
-    btf_32_sse4_1_type0(-cospi[48], cospi[16], buf1[14], buf1[15], buf0[14],
-                        buf0[15], bit);
-
-    // stage 7
-    stage_idx++;
-    bit = cos_bit[stage_idx];
-    cospi = cospi_arr[bit - cos_bit_min];
-    buf1[0] = _mm_add_epi32(buf0[0], buf0[2]);
-    buf1[2] = _mm_sub_epi32(buf0[0], buf0[2]);
-    buf1[1] = _mm_add_epi32(buf0[1], buf0[3]);
-    buf1[3] = _mm_sub_epi32(buf0[1], buf0[3]);
-    buf1[4] = _mm_add_epi32(buf0[4], buf0[6]);
-    buf1[6] = _mm_sub_epi32(buf0[4], buf0[6]);
-    buf1[5] = _mm_add_epi32(buf0[5], buf0[7]);
-    buf1[7] = _mm_sub_epi32(buf0[5], buf0[7]);
-    buf1[8] = _mm_add_epi32(buf0[8], buf0[10]);
-    buf1[10] = _mm_sub_epi32(buf0[8], buf0[10]);
-    buf1[9] = _mm_add_epi32(buf0[9], buf0[11]);
-    buf1[11] = _mm_sub_epi32(buf0[9], buf0[11]);
-    buf1[12] = _mm_add_epi32(buf0[12], buf0[14]);
-    buf1[14] = _mm_sub_epi32(buf0[12], buf0[14]);
-    buf1[13] = _mm_add_epi32(buf0[13], buf0[15]);
-    buf1[15] = _mm_sub_epi32(buf0[13], buf0[15]);
-
-    // stage 8
-    stage_idx++;
-    bit = cos_bit[stage_idx];
-    cospi = cospi_arr[bit - cos_bit_min];
-    buf0[0] = buf1[0];
-    buf0[1] = buf1[1];
-    btf_32_sse4_1_type0(cospi[32], cospi[32], buf1[2], buf1[3], buf0[2],
-                        buf0[3], bit);
-    buf0[4] = buf1[4];
-    buf0[5] = buf1[5];
-    btf_32_sse4_1_type0(cospi[32], cospi[32], buf1[6], buf1[7], buf0[6],
-                        buf0[7], bit);
-    buf0[8] = buf1[8];
-    buf0[9] = buf1[9];
-    btf_32_sse4_1_type0(cospi[32], cospi[32], buf1[10], buf1[11], buf0[10],
-                        buf0[11], bit);
-    buf0[12] = buf1[12];
-    buf0[13] = buf1[13];
-    btf_32_sse4_1_type0(cospi[32], cospi[32], buf1[14], buf1[15], buf0[14],
-                        buf0[15], bit);
-
-    // stage 9
-    stage_idx++;
-    bit = cos_bit[stage_idx];
-    cospi = cospi_arr[bit - cos_bit_min];
-    buf1[0] = buf0[0];
-    buf1[1] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[8]);
-    buf1[2] = buf0[12];
-    buf1[3] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[4]);
-    buf1[4] = buf0[6];
-    buf1[5] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[14]);
-    buf1[6] = buf0[10];
-    buf1[7] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[2]);
-    buf1[8] = buf0[3];
-    buf1[9] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[11]);
-    buf1[10] = buf0[15];
-    buf1[11] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[7]);
-    buf1[12] = buf0[5];
-    buf1[13] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[13]);
-    buf1[14] = buf0[9];
-    buf1[15] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[1]);
-
-    output[0 * col_num + col] = buf1[0];
-    output[1 * col_num + col] = buf1[1];
-    output[2 * col_num + col] = buf1[2];
-    output[3 * col_num + col] = buf1[3];
-    output[4 * col_num + col] = buf1[4];
-    output[5 * col_num + col] = buf1[5];
-    output[6 * col_num + col] = buf1[6];
-    output[7 * col_num + col] = buf1[7];
-    output[8 * col_num + col] = buf1[8];
-    output[9 * col_num + col] = buf1[9];
-    output[10 * col_num + col] = buf1[10];
-    output[11 * col_num + col] = buf1[11];
-    output[12 * col_num + col] = buf1[12];
-    output[13 * col_num + col] = buf1[13];
-    output[14 * col_num + col] = buf1[14];
-    output[15 * col_num + col] = buf1[15];
-  }
-}
-
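The final stage of both ADST kernels negates alternating outputs, and since SSE has no packed-integer negate, the code subtracts from zero. A minimal helper naming the idiom behind the _mm_sub_epi32(_mm_set1_epi32(0), ...) calls above:

    #include <emmintrin.h> /* SSE2 is enough for this particular idiom */

    /* Negate four packed 32-bit lanes. SSE has no vector negate, so 0 - x
     * stands in for it, exactly as in the final ADST stages above. */
    static __m128i neg_epi32(__m128i x) {
      return _mm_sub_epi32(_mm_setzero_si128(), x);
    }

_mm_setzero_si128() lowers to a register self-xor, so the negation costs a single subtract per vector.
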
 void av1_fadst32_new_sse4_1(const __m128i *input, __m128i *output,
                             const int8_t *cos_bit, const int8_t *stage_range) {
   const int txfm_size = 32;
diff --git a/av1/common/x86/av1_fwd_txfm2d_sse4.c b/av1/common/x86/av1_fwd_txfm2d_sse4.c
index 07c283e..3d60b36 100644
--- a/av1/common/x86/av1_fwd_txfm2d_sse4.c
+++ b/av1/common/x86/av1_fwd_txfm2d_sse4.c
@@ -28,13 +28,7 @@
 
 static INLINE TxfmFuncSSE2 fwd_txfm_type_to_func(TXFM_TYPE txfm_type) {
   switch (txfm_type) {
-    case TXFM_TYPE_DCT4: return av1_fdct4_new_sse4_1; break;
-    case TXFM_TYPE_DCT8: return av1_fdct8_new_sse4_1; break;
-    case TXFM_TYPE_DCT16: return av1_fdct16_new_sse4_1; break;
     case TXFM_TYPE_DCT32: return av1_fdct32_new_sse4_1; break;
-    case TXFM_TYPE_ADST4: return av1_fadst4_new_sse4_1; break;
-    case TXFM_TYPE_ADST8: return av1_fadst8_new_sse4_1; break;
-    case TXFM_TYPE_ADST16: return av1_fadst16_new_sse4_1; break;
     case TXFM_TYPE_ADST32: return av1_fadst32_new_sse4_1; break;
     default: assert(0);
   }
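With this change only the 32-point kernels remain behind the SSE4.1 dispatcher. The pattern itself is a plain enum-to-function-pointer map; a self-contained analogue is sketched below with purely illustrative names, none of which come from this patch. Note that the break after each return is unreachable and can be dropped, as the sketch does.

    #include <assert.h>
    #include <stdio.h>

    typedef int (*kernel_fn)(int);

    enum txfm_kind { KIND_DCT32, KIND_ADST32 };

    static int dct32_stub(int x) { return x + 32; }
    static int adst32_stub(int x) { return x - 32; }

    /* Same shape as fwd_txfm_type_to_func: enum in, function pointer out,
     * assert on anything unhandled. */
    static kernel_fn kind_to_func(enum txfm_kind k) {
      switch (k) {
        case KIND_DCT32: return dct32_stub;
        case KIND_ADST32: return adst32_stub;
        default: assert(0); return NULL;
      }
    }

    int main(void) {
      printf("%d\n", kind_to_func(KIND_DCT32)(0)); /* prints 32 */
      return 0;
    }
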
diff --git a/av1/common/x86/av1_fwd_txfm_impl_sse2.h b/av1/common/x86/av1_fwd_txfm_impl_sse2.h
deleted file mode 100644
index 0e341ac..0000000
--- a/av1/common/x86/av1_fwd_txfm_impl_sse2.h
+++ /dev/null
@@ -1,1014 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <emmintrin.h>  // SSE2
-
-#include "./aom_dsp_rtcd.h"
-#include "aom_dsp/txfm_common.h"
-#include "aom_dsp/x86/fwd_txfm_sse2.h"
-#include "aom_dsp/x86/txfm_common_sse2.h"
-#include "aom_ports/mem.h"
-
-// TODO(jingning) The high bit-depth functions need rework for performance.
-// After we properly fix the high bit-depth function implementations, this
-// file's dependencies should be substantially simplified.
-#if DCT_HIGH_BIT_DEPTH
-#define ADD_EPI16 _mm_adds_epi16
-#define SUB_EPI16 _mm_subs_epi16
-
-#else
-#define ADD_EPI16 _mm_add_epi16
-#define SUB_EPI16 _mm_sub_epi16
-#endif
-
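Under DCT_HIGH_BIT_DEPTH the macros above switch to the saturating adds and subtracts, so an overflowing 16-bit sum is pinned at INT16_MAX/INT16_MIN instead of silently wrapping; the check_epi16_overflow_* helpers used throughout the functions below are then assumed to test for those pinned values and fall back to the C implementation. A scalar illustration of why wrapping arithmetic could not support that check:

    #include <stdint.h>
    #include <stdio.h>

    /* Wrapping add: overflow destroys the evidence. */
    static int16_t add_wrap(int16_t a, int16_t b) {
      return (int16_t)((uint16_t)a + (uint16_t)b);
    }

    /* Saturating add, the scalar picture of _mm_adds_epi16: overflow is
     * pinned at the 16-bit limits, where it can still be detected. */
    static int16_t add_sat(int16_t a, int16_t b) {
      const int32_t s = (int32_t)a + b;
      if (s > INT16_MAX) return INT16_MAX;
      if (s < INT16_MIN) return INT16_MIN;
      return (int16_t)s;
    }

    int main(void) {
      printf("wrap: %d  sat: %d\n", add_wrap(30000, 10000),
             add_sat(30000, 10000));
      /* wrap: -25536  sat: 32767 -- only the saturated result still
       * signals that an overflow happened. */
      return 0;
    }
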
-void FDCT4x4_2D(const int16_t *input, tran_low_t *output, int stride) {
-  // This 2D transform implements 4 vertical 1D transforms followed
-  // by 4 horizontal 1D transforms.  The multiplies and adds are as given
-  // by Chen, Smith and Fralick ('77).  The commands for moving the data
-  // around have been minimized by hand.
-  // For the purposes of the comments, the 16 inputs are referred to as i0
-  // through iF (in raster order), intermediate variables are a0, b0, c0
-  // through f, and correspond to the in-place computations mapped to input
-  // locations.  The outputs, o0 through oF, are labeled according to the
-  // output locations.
-
-  // Constants
-  // These are the coefficients used for the multiplies.
-  // In the comments, pN means cos(N pi /64) and mN is -cos(N pi /64),
-  // where cospi_N_64 = cos(N pi /64)
-  const __m128i k__cospi_A =
-      octa_set_epi16(cospi_16_64, cospi_16_64, cospi_16_64, cospi_16_64,
-                     cospi_16_64, -cospi_16_64, cospi_16_64, -cospi_16_64);
-  const __m128i k__cospi_B =
-      octa_set_epi16(cospi_16_64, -cospi_16_64, cospi_16_64, -cospi_16_64,
-                     cospi_16_64, cospi_16_64, cospi_16_64, cospi_16_64);
-  const __m128i k__cospi_C =
-      octa_set_epi16(cospi_8_64, cospi_24_64, cospi_8_64, cospi_24_64,
-                     cospi_24_64, -cospi_8_64, cospi_24_64, -cospi_8_64);
-  const __m128i k__cospi_D =
-      octa_set_epi16(cospi_24_64, -cospi_8_64, cospi_24_64, -cospi_8_64,
-                     cospi_8_64, cospi_24_64, cospi_8_64, cospi_24_64);
-  const __m128i k__cospi_E =
-      octa_set_epi16(cospi_16_64, cospi_16_64, cospi_16_64, cospi_16_64,
-                     cospi_16_64, cospi_16_64, cospi_16_64, cospi_16_64);
-  const __m128i k__cospi_F =
-      octa_set_epi16(cospi_16_64, -cospi_16_64, cospi_16_64, -cospi_16_64,
-                     cospi_16_64, -cospi_16_64, cospi_16_64, -cospi_16_64);
-  const __m128i k__cospi_G =
-      octa_set_epi16(cospi_8_64, cospi_24_64, cospi_8_64, cospi_24_64,
-                     -cospi_8_64, -cospi_24_64, -cospi_8_64, -cospi_24_64);
-  const __m128i k__cospi_H =
-      octa_set_epi16(cospi_24_64, -cospi_8_64, cospi_24_64, -cospi_8_64,
-                     -cospi_24_64, cospi_8_64, -cospi_24_64, cospi_8_64);
-
-  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
-  // This second rounding constant saves doing some extra adds at the end
-  const __m128i k__DCT_CONST_ROUNDING2 =
-      _mm_set1_epi32(DCT_CONST_ROUNDING + (DCT_CONST_ROUNDING << 1));
-  const int DCT_CONST_BITS2 = DCT_CONST_BITS + 2;
-  const __m128i k__nonzero_bias_a = _mm_setr_epi16(0, 1, 1, 1, 1, 1, 1, 1);
-  const __m128i k__nonzero_bias_b = _mm_setr_epi16(1, 0, 0, 0, 0, 0, 0, 0);
-  __m128i in0, in1;
-#if DCT_HIGH_BIT_DEPTH
-  __m128i cmp0, cmp1;
-  int test, overflow;
-#endif
-
-  // Load inputs.
-  in0 = _mm_loadl_epi64((const __m128i *)(input + 0 * stride));
-  in1 = _mm_loadl_epi64((const __m128i *)(input + 1 * stride));
-  in1 = _mm_unpacklo_epi64(
-      in1, _mm_loadl_epi64((const __m128i *)(input + 2 * stride)));
-  in0 = _mm_unpacklo_epi64(
-      in0, _mm_loadl_epi64((const __m128i *)(input + 3 * stride)));
-// in0 = [i0 i1 i2 i3 iC iD iE iF]
-// in1 = [i4 i5 i6 i7 i8 i9 iA iB]
-#if DCT_HIGH_BIT_DEPTH
-  // Check that the inputs are small enough to use the optimised code
-  cmp0 = _mm_xor_si128(_mm_cmpgt_epi16(in0, _mm_set1_epi16(0x3ff)),
-                       _mm_cmplt_epi16(in0, _mm_set1_epi16(0xfc00)));
-  cmp1 = _mm_xor_si128(_mm_cmpgt_epi16(in1, _mm_set1_epi16(0x3ff)),
-                       _mm_cmplt_epi16(in1, _mm_set1_epi16(0xfc00)));
-  test = _mm_movemask_epi8(_mm_or_si128(cmp0, cmp1));
-  if (test) {
-    aom_highbd_fdct4x4_c(input, output, stride);
-    return;
-  }
-#endif  // DCT_HIGH_BIT_DEPTH
-
-  // multiply by 16 to give some extra precision
-  in0 = _mm_slli_epi16(in0, 4);
-  in1 = _mm_slli_epi16(in1, 4);
-  // if (i == 0 && input[0]) input[0] += 1;
-  // add 1 to the upper left pixel if it is non-zero, which helps reduce
-  // the round-trip error
-  {
-    // The mask will only record whether the first value is zero; all
-    // other comparisons will fail, as a value shifted left by 4 (the << 4
-    // above) can never equal one. To increment in the non-zero case, we
-    // add the mask and one for the first element:
-    //   - if zero, mask = -1, v = v - 1 + 1 = v
-    //   - if non-zero, mask = 0, v = v + 0 + 1 = v + 1
-    __m128i mask = _mm_cmpeq_epi16(in0, k__nonzero_bias_a);
-    in0 = _mm_add_epi16(in0, mask);
-    in0 = _mm_add_epi16(in0, k__nonzero_bias_b);
-  }
-  // There are 4 stages in total, alternating between an add/subtract stage
-  // and a multiply-and-add stage.
-  {
-    // Stage 1: Add/subtract
-
-    // in0 = [i0 i1 i2 i3 iC iD iE iF]
-    // in1 = [i4 i5 i6 i7 i8 i9 iA iB]
-    const __m128i r0 = _mm_unpacklo_epi16(in0, in1);
-    const __m128i r1 = _mm_unpackhi_epi16(in0, in1);
-    // r0 = [i0 i4 i1 i5 i2 i6 i3 i7]
-    // r1 = [iC i8 iD i9 iE iA iF iB]
-    const __m128i r2 = _mm_shuffle_epi32(r0, 0xB4);
-    const __m128i r3 = _mm_shuffle_epi32(r1, 0xB4);
-    // r2 = [i0 i4 i1 i5 i3 i7 i2 i6]
-    // r3 = [iC i8 iD i9 iF iB iE iA]
-
-    const __m128i t0 = _mm_add_epi16(r2, r3);
-    const __m128i t1 = _mm_sub_epi16(r2, r3);
-    // t0 = [a0 a4 a1 a5 a3 a7 a2 a6]
-    // t1 = [aC a8 aD a9 aF aB aE aA]
-
-    // Stage 2: multiply by constants (which gets us into 32 bits).
-    // The constants needed here are:
-    // k__cospi_A = [p16 p16 p16 p16 p16 m16 p16 m16]
-    // k__cospi_B = [p16 m16 p16 m16 p16 p16 p16 p16]
-    // k__cospi_C = [p08 p24 p08 p24 p24 m08 p24 m08]
-    // k__cospi_D = [p24 m08 p24 m08 p08 p24 p08 p24]
-    const __m128i u0 = _mm_madd_epi16(t0, k__cospi_A);
-    const __m128i u2 = _mm_madd_epi16(t0, k__cospi_B);
-    const __m128i u1 = _mm_madd_epi16(t1, k__cospi_C);
-    const __m128i u3 = _mm_madd_epi16(t1, k__cospi_D);
-    // Then add and right-shift to get back to 16-bit range
-    const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
-    const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
-    const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
-    const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
-    const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
-    const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
-    const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
-    const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
-    // w0 = [b0 b1 b7 b6]
-    // w1 = [b8 b9 bF bE]
-    // w2 = [b4 b5 b3 b2]
-    // w3 = [bC bD bB bA]
-    const __m128i x0 = _mm_packs_epi32(w0, w1);
-    const __m128i x1 = _mm_packs_epi32(w2, w3);
-#if DCT_HIGH_BIT_DEPTH
-    overflow = check_epi16_overflow_x2(&x0, &x1);
-    if (overflow) {
-      aom_highbd_fdct4x4_c(input, output, stride);
-      return;
-    }
-#endif  // DCT_HIGH_BIT_DEPTH
-    // x0 = [b0 b1 b7 b6 b8 b9 bF bE]
-    // x1 = [b4 b5 b3 b2 bC bD bB bA]
-    in0 = _mm_shuffle_epi32(x0, 0xD8);
-    in1 = _mm_shuffle_epi32(x1, 0x8D);
-    // in0 = [b0 b1 b8 b9 b7 b6 bF bE]
-    // in1 = [b3 b2 bB bA b4 b5 bC bD]
-  }
-  {
-    // Vertical DCTs finished. Now we do the horizontal DCTs.
-    // Stage 3: Add/subtract
-
-    // t0 = [c0 c1 c8 c9  c4  c5  cC  cD]
-    // t1 = [c3 c2 cB cA -c7 -c6 -cF -cE]
-    const __m128i t0 = ADD_EPI16(in0, in1);
-    const __m128i t1 = SUB_EPI16(in0, in1);
-#if DCT_HIGH_BIT_DEPTH
-    overflow = check_epi16_overflow_x2(&t0, &t1);
-    if (overflow) {
-      aom_highbd_fdct4x4_c(input, output, stride);
-      return;
-    }
-#endif  // DCT_HIGH_BIT_DEPTH
-
-    // Stage 4: multiply by constants (which gets us into 32 bits).
-    {
-      // The constants needed here are:
-      // k__cospi_E = [p16 p16 p16 p16 p16 p16 p16 p16]
-      // k__cospi_F = [p16 m16 p16 m16 p16 m16 p16 m16]
-      // k__cospi_G = [p08 p24 p08 p24 m08 m24 m08 m24]
-      // k__cospi_H = [p24 m08 p24 m08 m24 p08 m24 p08]
-      const __m128i u0 = _mm_madd_epi16(t0, k__cospi_E);
-      const __m128i u1 = _mm_madd_epi16(t0, k__cospi_F);
-      const __m128i u2 = _mm_madd_epi16(t1, k__cospi_G);
-      const __m128i u3 = _mm_madd_epi16(t1, k__cospi_H);
-      // Then add and right-shift to get back to 16-bit range,
-      // combining the final right-shift as well to save operations.
-      // This unusual rounding operation maintains bit-accurate
-      // compatibility with the C version of this function, which has two
-      // rounding steps in a row.
-      const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING2);
-      const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING2);
-      const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING2);
-      const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING2);
-      const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS2);
-      const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS2);
-      const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS2);
-      const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS2);
-      // w0 = [o0 o4 o8 oC]
-      // w1 = [o2 o6 oA oE]
-      // w2 = [o1 o5 o9 oD]
-      // w3 = [o3 o7 oB oF]
-      // remember the o's are numbered according to the correct output location
-      const __m128i x0 = _mm_packs_epi32(w0, w1);
-      const __m128i x1 = _mm_packs_epi32(w2, w3);
-#if DCT_HIGH_BIT_DEPTH
-      overflow = check_epi16_overflow_x2(&x0, &x1);
-      if (overflow) {
-        aom_highbd_fdct4x4_c(input, output, stride);
-        return;
-      }
-#endif  // DCT_HIGH_BIT_DEPTH
-      {
-        // x0 = [o0 o4 o8 oC o2 o6 oA oE]
-        // x1 = [o1 o5 o9 oD o3 o7 oB oF]
-        const __m128i y0 = _mm_unpacklo_epi16(x0, x1);
-        const __m128i y1 = _mm_unpackhi_epi16(x0, x1);
-        // y0 = [o0 o1 o4 o5 o8 o9 oC oD]
-        // y1 = [o2 o3 o6 o7 oA oB oE oF]
-        in0 = _mm_unpacklo_epi32(y0, y1);
-        // in0 = [o0 o1 o2 o3 o4 o5 o6 o7]
-        in1 = _mm_unpackhi_epi32(y0, y1);
-        // in1 = [o8 o9 oA oB oC oD oE oF]
-      }
-    }
-  }
-  // The post-condition (v + 1) >> 2 is now incorporated into the previous
-  // add and right-shift commands.  Only 2 store instructions are needed
-  // because rows 1/3 are stored just after rows 0/2.
-  storeu_output(&in0, output + 0 * 4);
-  storeu_output(&in1, output + 2 * 4);
-}
-
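The DCT_CONST_ROUNDING2 trick in FDCT4x4_2D deserves one worked line of arithmetic: with R = 1 << (DCT_CONST_BITS - 1), the constant R + (R << 1) = 3R equals R + (1 << DCT_CONST_BITS), which is exactly what lets a single add-and-shift by DCT_CONST_BITS + 2 reproduce the C reference's round-shift followed by (v + 1) >> 2. A brute-force check, assuming DCT_CONST_BITS == 14 as in txfm_common.h and arithmetic right shifts (which the SIMD code already relies on):

    #include <assert.h>
    #include <stdio.h>

    #define B 14             /* DCT_CONST_BITS (assumed, per txfm_common.h) */
    #define R (1 << (B - 1)) /* DCT_CONST_ROUNDING */

    /* Two-step rounding of the C reference: dct_const_round_shift, then
     * the (v + 1) >> 2 post-condition. */
    static int two_step(int u) { return (((u + R) >> B) + 1) >> 2; }

    /* One-step version with DCT_CONST_ROUNDING2 = R + (R << 1) = 3R and
     * DCT_CONST_BITS2 = B + 2, as in FDCT4x4_2D above. */
    static int one_step(int u) { return (u + R + (R << 1)) >> (B + 2); }

    int main(void) {
      int u;
      for (u = -(1 << 20); u <= (1 << 20); ++u)
        assert(two_step(u) == one_step(u));
      puts("rounding fold verified");
      return 0;
    }
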
-void FDCT8x8_2D(const int16_t *input, tran_low_t *output, int stride) {
-  int pass;
-  // Constants
-  //    In one case the coefficients we use are all the same. In all others
-  //    it is a pair of them that we need to repeat four times. This is done
-  //    by constructing the 32 bit constant corresponding to that pair.
-  const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
-  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
-  const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
-  const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
-  const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64);
-  const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64);
-  const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64);
-  const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64);
-  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
-#if DCT_HIGH_BIT_DEPTH
-  int overflow;
-#endif
-  // Load input
-  __m128i in0 = _mm_load_si128((const __m128i *)(input + 0 * stride));
-  __m128i in1 = _mm_load_si128((const __m128i *)(input + 1 * stride));
-  __m128i in2 = _mm_load_si128((const __m128i *)(input + 2 * stride));
-  __m128i in3 = _mm_load_si128((const __m128i *)(input + 3 * stride));
-  __m128i in4 = _mm_load_si128((const __m128i *)(input + 4 * stride));
-  __m128i in5 = _mm_load_si128((const __m128i *)(input + 5 * stride));
-  __m128i in6 = _mm_load_si128((const __m128i *)(input + 6 * stride));
-  __m128i in7 = _mm_load_si128((const __m128i *)(input + 7 * stride));
-  // Pre-condition input (shift by two)
-  in0 = _mm_slli_epi16(in0, 2);
-  in1 = _mm_slli_epi16(in1, 2);
-  in2 = _mm_slli_epi16(in2, 2);
-  in3 = _mm_slli_epi16(in3, 2);
-  in4 = _mm_slli_epi16(in4, 2);
-  in5 = _mm_slli_epi16(in5, 2);
-  in6 = _mm_slli_epi16(in6, 2);
-  in7 = _mm_slli_epi16(in7, 2);
-
-  // We do two passes, first the columns, then the rows. The results of the
-  // first pass are transposed so that the same column code can be reused. The
-  // results of the second pass are also transposed so that the rows (processed
-  // as columns) are put back in row positions.
-  for (pass = 0; pass < 2; pass++) {
-    // To store results of each pass before the transpose.
-    __m128i res0, res1, res2, res3, res4, res5, res6, res7;
-    // Add/subtract
-    const __m128i q0 = ADD_EPI16(in0, in7);
-    const __m128i q1 = ADD_EPI16(in1, in6);
-    const __m128i q2 = ADD_EPI16(in2, in5);
-    const __m128i q3 = ADD_EPI16(in3, in4);
-    const __m128i q4 = SUB_EPI16(in3, in4);
-    const __m128i q5 = SUB_EPI16(in2, in5);
-    const __m128i q6 = SUB_EPI16(in1, in6);
-    const __m128i q7 = SUB_EPI16(in0, in7);
-#if DCT_HIGH_BIT_DEPTH
-    if (pass == 1) {
-      overflow =
-          check_epi16_overflow_x8(&q0, &q1, &q2, &q3, &q4, &q5, &q6, &q7);
-      if (overflow) {
-        aom_highbd_fdct8x8_c(input, output, stride);
-        return;
-      }
-    }
-#endif  // DCT_HIGH_BIT_DEPTH
-    // Work on first four results
-    {
-      // Add/subtract
-      const __m128i r0 = ADD_EPI16(q0, q3);
-      const __m128i r1 = ADD_EPI16(q1, q2);
-      const __m128i r2 = SUB_EPI16(q1, q2);
-      const __m128i r3 = SUB_EPI16(q0, q3);
-#if DCT_HIGH_BIT_DEPTH
-      overflow = check_epi16_overflow_x4(&r0, &r1, &r2, &r3);
-      if (overflow) {
-        aom_highbd_fdct8x8_c(input, output, stride);
-        return;
-      }
-#endif  // DCT_HIGH_BIT_DEPTH
-      // Interleave to do the multiply by constants which gets us into 32 bits
-      {
-        const __m128i t0 = _mm_unpacklo_epi16(r0, r1);
-        const __m128i t1 = _mm_unpackhi_epi16(r0, r1);
-        const __m128i t2 = _mm_unpacklo_epi16(r2, r3);
-        const __m128i t3 = _mm_unpackhi_epi16(r2, r3);
-        const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_p16);
-        const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p16_p16);
-        const __m128i u2 = _mm_madd_epi16(t0, k__cospi_p16_m16);
-        const __m128i u3 = _mm_madd_epi16(t1, k__cospi_p16_m16);
-        const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p24_p08);
-        const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p24_p08);
-        const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m08_p24);
-        const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m08_p24);
-        // dct_const_round_shift
-        const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
-        const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
-        const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
-        const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
-        const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
-        const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
-        const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
-        const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);
-        const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
-        const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
-        const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
-        const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
-        const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
-        const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
-        const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
-        const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
-        // Combine
-        res0 = _mm_packs_epi32(w0, w1);
-        res4 = _mm_packs_epi32(w2, w3);
-        res2 = _mm_packs_epi32(w4, w5);
-        res6 = _mm_packs_epi32(w6, w7);
-#if DCT_HIGH_BIT_DEPTH
-        overflow = check_epi16_overflow_x4(&res0, &res4, &res2, &res6);
-        if (overflow) {
-          aom_highbd_fdct8x8_c(input, output, stride);
-          return;
-        }
-#endif  // DCT_HIGH_BIT_DEPTH
-      }
-    }
-    // Work on next four results
-    {
-      // Interleave to do the multiply by constants which gets us into 32 bits
-      const __m128i d0 = _mm_unpacklo_epi16(q6, q5);
-      const __m128i d1 = _mm_unpackhi_epi16(q6, q5);
-      const __m128i e0 = _mm_madd_epi16(d0, k__cospi_p16_m16);
-      const __m128i e1 = _mm_madd_epi16(d1, k__cospi_p16_m16);
-      const __m128i e2 = _mm_madd_epi16(d0, k__cospi_p16_p16);
-      const __m128i e3 = _mm_madd_epi16(d1, k__cospi_p16_p16);
-      // dct_const_round_shift
-      const __m128i f0 = _mm_add_epi32(e0, k__DCT_CONST_ROUNDING);
-      const __m128i f1 = _mm_add_epi32(e1, k__DCT_CONST_ROUNDING);
-      const __m128i f2 = _mm_add_epi32(e2, k__DCT_CONST_ROUNDING);
-      const __m128i f3 = _mm_add_epi32(e3, k__DCT_CONST_ROUNDING);
-      const __m128i s0 = _mm_srai_epi32(f0, DCT_CONST_BITS);
-      const __m128i s1 = _mm_srai_epi32(f1, DCT_CONST_BITS);
-      const __m128i s2 = _mm_srai_epi32(f2, DCT_CONST_BITS);
-      const __m128i s3 = _mm_srai_epi32(f3, DCT_CONST_BITS);
-      // Combine
-      const __m128i r0 = _mm_packs_epi32(s0, s1);
-      const __m128i r1 = _mm_packs_epi32(s2, s3);
-#if DCT_HIGH_BIT_DEPTH
-      overflow = check_epi16_overflow_x2(&r0, &r1);
-      if (overflow) {
-        aom_highbd_fdct8x8_c(input, output, stride);
-        return;
-      }
-#endif  // DCT_HIGH_BIT_DEPTH
-      {
-        // Add/subtract
-        const __m128i x0 = ADD_EPI16(q4, r0);
-        const __m128i x1 = SUB_EPI16(q4, r0);
-        const __m128i x2 = SUB_EPI16(q7, r1);
-        const __m128i x3 = ADD_EPI16(q7, r1);
-#if DCT_HIGH_BIT_DEPTH
-        overflow = check_epi16_overflow_x4(&x0, &x1, &x2, &x3);
-        if (overflow) {
-          aom_highbd_fdct8x8_c(input, output, stride);
-          return;
-        }
-#endif  // DCT_HIGH_BIT_DEPTH
-        // Interleave to do the multiply by constants which gets us into 32 bits
-        {
-          const __m128i t0 = _mm_unpacklo_epi16(x0, x3);
-          const __m128i t1 = _mm_unpackhi_epi16(x0, x3);
-          const __m128i t2 = _mm_unpacklo_epi16(x1, x2);
-          const __m128i t3 = _mm_unpackhi_epi16(x1, x2);
-          const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p28_p04);
-          const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p28_p04);
-          const __m128i u2 = _mm_madd_epi16(t0, k__cospi_m04_p28);
-          const __m128i u3 = _mm_madd_epi16(t1, k__cospi_m04_p28);
-          const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p12_p20);
-          const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p12_p20);
-          const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m20_p12);
-          const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m20_p12);
-          // dct_const_round_shift
-          const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
-          const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
-          const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
-          const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
-          const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
-          const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
-          const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
-          const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);
-          const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
-          const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
-          const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
-          const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
-          const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
-          const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
-          const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
-          const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
-          // Combine
-          res1 = _mm_packs_epi32(w0, w1);
-          res7 = _mm_packs_epi32(w2, w3);
-          res5 = _mm_packs_epi32(w4, w5);
-          res3 = _mm_packs_epi32(w6, w7);
-#if DCT_HIGH_BIT_DEPTH
-          overflow = check_epi16_overflow_x4(&res1, &res7, &res5, &res3);
-          if (overflow) {
-            aom_highbd_fdct8x8_c(input, output, stride);
-            return;
-          }
-#endif  // DCT_HIGH_BIT_DEPTH
-        }
-      }
-    }
-    // Transpose the 8x8.
-    {
-      // 00 01 02 03 04 05 06 07
-      // 10 11 12 13 14 15 16 17
-      // 20 21 22 23 24 25 26 27
-      // 30 31 32 33 34 35 36 37
-      // 40 41 42 43 44 45 46 47
-      // 50 51 52 53 54 55 56 57
-      // 60 61 62 63 64 65 66 67
-      // 70 71 72 73 74 75 76 77
-      const __m128i tr0_0 = _mm_unpacklo_epi16(res0, res1);
-      const __m128i tr0_1 = _mm_unpacklo_epi16(res2, res3);
-      const __m128i tr0_2 = _mm_unpackhi_epi16(res0, res1);
-      const __m128i tr0_3 = _mm_unpackhi_epi16(res2, res3);
-      const __m128i tr0_4 = _mm_unpacklo_epi16(res4, res5);
-      const __m128i tr0_5 = _mm_unpacklo_epi16(res6, res7);
-      const __m128i tr0_6 = _mm_unpackhi_epi16(res4, res5);
-      const __m128i tr0_7 = _mm_unpackhi_epi16(res6, res7);
-      // 00 10 01 11 02 12 03 13
-      // 20 30 21 31 22 32 23 33
-      // 04 14 05 15 06 16 07 17
-      // 24 34 25 35 26 36 27 37
-      // 40 50 41 51 42 52 43 53
-      // 60 70 61 71 62 72 63 73
-      // 44 54 45 55 46 56 47 57
-      // 64 74 65 75 66 76 67 77
-      const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
-      const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3);
-      const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
-      const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3);
-      const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5);
-      const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);
-      const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5);
-      const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);
-      // 00 10 20 30 01 11 21 31
-      // 40 50 60 70 41 51 61 71
-      // 02 12 22 32 03 13 23 33
-      // 42 52 62 72 43 53 63 73
-      // 04 14 24 34 05 15 25 35
-      // 44 54 64 74 45 55 65 75
-      // 06 16 26 36 07 17 27 37
-      // 46 56 66 76 47 57 67 77
-      in0 = _mm_unpacklo_epi64(tr1_0, tr1_4);
-      in1 = _mm_unpackhi_epi64(tr1_0, tr1_4);
-      in2 = _mm_unpacklo_epi64(tr1_2, tr1_6);
-      in3 = _mm_unpackhi_epi64(tr1_2, tr1_6);
-      in4 = _mm_unpacklo_epi64(tr1_1, tr1_5);
-      in5 = _mm_unpackhi_epi64(tr1_1, tr1_5);
-      in6 = _mm_unpacklo_epi64(tr1_3, tr1_7);
-      in7 = _mm_unpackhi_epi64(tr1_3, tr1_7);
-      // 00 10 20 30 40 50 60 70
-      // 01 11 21 31 41 51 61 71
-      // 02 12 22 32 42 52 62 72
-      // 03 13 23 33 43 53 63 73
-      // 04 14 24 34 44 54 64 74
-      // 05 15 25 35 45 55 65 75
-      // 06 16 26 36 46 56 66 76
-      // 07 17 27 37 47 57 67 77
-    }
-  }
-  // Post-condition output and store it
-  {
-    // Post-condition (division by two)
-    //    division of a 16-bit signed number by two, using shifts:
-    //    n / 2 = (n - (n >> 15)) >> 1
-    const __m128i sign_in0 = _mm_srai_epi16(in0, 15);
-    const __m128i sign_in1 = _mm_srai_epi16(in1, 15);
-    const __m128i sign_in2 = _mm_srai_epi16(in2, 15);
-    const __m128i sign_in3 = _mm_srai_epi16(in3, 15);
-    const __m128i sign_in4 = _mm_srai_epi16(in4, 15);
-    const __m128i sign_in5 = _mm_srai_epi16(in5, 15);
-    const __m128i sign_in6 = _mm_srai_epi16(in6, 15);
-    const __m128i sign_in7 = _mm_srai_epi16(in7, 15);
-    in0 = _mm_sub_epi16(in0, sign_in0);
-    in1 = _mm_sub_epi16(in1, sign_in1);
-    in2 = _mm_sub_epi16(in2, sign_in2);
-    in3 = _mm_sub_epi16(in3, sign_in3);
-    in4 = _mm_sub_epi16(in4, sign_in4);
-    in5 = _mm_sub_epi16(in5, sign_in5);
-    in6 = _mm_sub_epi16(in6, sign_in6);
-    in7 = _mm_sub_epi16(in7, sign_in7);
-    in0 = _mm_srai_epi16(in0, 1);
-    in1 = _mm_srai_epi16(in1, 1);
-    in2 = _mm_srai_epi16(in2, 1);
-    in3 = _mm_srai_epi16(in3, 1);
-    in4 = _mm_srai_epi16(in4, 1);
-    in5 = _mm_srai_epi16(in5, 1);
-    in6 = _mm_srai_epi16(in6, 1);
-    in7 = _mm_srai_epi16(in7, 1);
-    // store results
-    store_output(&in0, (output + 0 * 8));
-    store_output(&in1, (output + 1 * 8));
-    store_output(&in2, (output + 2 * 8));
-    store_output(&in3, (output + 3 * 8));
-    store_output(&in4, (output + 4 * 8));
-    store_output(&in5, (output + 5 * 8));
-    store_output(&in6, (output + 6 * 8));
-    store_output(&in7, (output + 7 * 8));
-  }
-}
-
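The divide-by-two identity in FDCT8x8_2D's post-condition can be verified exhaustively: n >> 15 is 0 for non-negative int16 values and -1 otherwise, so subtracting it adds 1 to negative inputs first, and the flooring arithmetic shift then matches C's truncate-toward-zero division.

    #include <assert.h>
    #include <stdint.h>
    #include <stdio.h>

    /* Scalar form of the post-condition's halving:
     *   n / 2 == (n - (n >> 15)) >> 1   for all int16_t n. */
    static int16_t half_shift(int16_t n) {
      return (int16_t)((n - (n >> 15)) >> 1);
    }

    int main(void) {
      int v;
      for (v = INT16_MIN; v <= INT16_MAX; ++v)
        assert(half_shift((int16_t)v) == (int16_t)(v / 2));
      puts("halving trick verified");
      return 0;
    }
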
-void FDCT16x16_2D(const int16_t *input, tran_low_t *output, int stride) {
-  // The 2D transform is done with two passes which are actually pretty
-  // similar. In the first one, we transform the columns and transpose
-  // the results. In the second one, we transform the rows. Because the
-  // first pass's results are transposed, the second pass again operates on
-  // columns (that is, the transposed rows) and transposes its results so
-  // that they go back into normal/row positions.
-  int pass;
-  // We need an intermediate buffer between passes.
-  DECLARE_ALIGNED(16, int16_t, intermediate[256]);
-  const int16_t *in = input;
-  int16_t *out0 = intermediate;
-  tran_low_t *out1 = output;
-  // Constants
-  //    In one case the coefficients we use are all the same. In all others
-  //    it is a pair of them that we need to repeat four times. This is done
-  //    by constructing the 32 bit constant corresponding to that pair.
-  const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
-  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
-  const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
-  const __m128i k__cospi_p08_m24 = pair_set_epi16(cospi_8_64, -cospi_24_64);
-  const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
-  const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64);
-  const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64);
-  const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64);
-  const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64);
-  const __m128i k__cospi_p30_p02 = pair_set_epi16(cospi_30_64, cospi_2_64);
-  const __m128i k__cospi_p14_p18 = pair_set_epi16(cospi_14_64, cospi_18_64);
-  const __m128i k__cospi_m02_p30 = pair_set_epi16(-cospi_2_64, cospi_30_64);
-  const __m128i k__cospi_m18_p14 = pair_set_epi16(-cospi_18_64, cospi_14_64);
-  const __m128i k__cospi_p22_p10 = pair_set_epi16(cospi_22_64, cospi_10_64);
-  const __m128i k__cospi_p06_p26 = pair_set_epi16(cospi_6_64, cospi_26_64);
-  const __m128i k__cospi_m10_p22 = pair_set_epi16(-cospi_10_64, cospi_22_64);
-  const __m128i k__cospi_m26_p06 = pair_set_epi16(-cospi_26_64, cospi_6_64);
-  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
-  const __m128i kOne = _mm_set1_epi16(1);
-  // Do the two transform/transpose passes
-  for (pass = 0; pass < 2; ++pass) {
-    // We process eight columns (transposed rows in second pass) at a time.
-    int column_start;
-#if DCT_HIGH_BIT_DEPTH
-    int overflow;
-#endif
-    for (column_start = 0; column_start < 16; column_start += 8) {
-      __m128i in00, in01, in02, in03, in04, in05, in06, in07;
-      __m128i in08, in09, in10, in11, in12, in13, in14, in15;
-      __m128i input0, input1, input2, input3, input4, input5, input6, input7;
-      __m128i step1_0, step1_1, step1_2, step1_3;
-      __m128i step1_4, step1_5, step1_6, step1_7;
-      __m128i step2_1, step2_2, step2_3, step2_4, step2_5, step2_6;
-      __m128i step3_0, step3_1, step3_2, step3_3;
-      __m128i step3_4, step3_5, step3_6, step3_7;
-      __m128i res00, res01, res02, res03, res04, res05, res06, res07;
-      __m128i res08, res09, res10, res11, res12, res13, res14, res15;
-      // Load and pre-condition input.
-      if (0 == pass) {
-        in00 = _mm_load_si128((const __m128i *)(in + 0 * stride));
-        in01 = _mm_load_si128((const __m128i *)(in + 1 * stride));
-        in02 = _mm_load_si128((const __m128i *)(in + 2 * stride));
-        in03 = _mm_load_si128((const __m128i *)(in + 3 * stride));
-        in04 = _mm_load_si128((const __m128i *)(in + 4 * stride));
-        in05 = _mm_load_si128((const __m128i *)(in + 5 * stride));
-        in06 = _mm_load_si128((const __m128i *)(in + 6 * stride));
-        in07 = _mm_load_si128((const __m128i *)(in + 7 * stride));
-        in08 = _mm_load_si128((const __m128i *)(in + 8 * stride));
-        in09 = _mm_load_si128((const __m128i *)(in + 9 * stride));
-        in10 = _mm_load_si128((const __m128i *)(in + 10 * stride));
-        in11 = _mm_load_si128((const __m128i *)(in + 11 * stride));
-        in12 = _mm_load_si128((const __m128i *)(in + 12 * stride));
-        in13 = _mm_load_si128((const __m128i *)(in + 13 * stride));
-        in14 = _mm_load_si128((const __m128i *)(in + 14 * stride));
-        in15 = _mm_load_si128((const __m128i *)(in + 15 * stride));
-        // x = x << 2
-        in00 = _mm_slli_epi16(in00, 2);
-        in01 = _mm_slli_epi16(in01, 2);
-        in02 = _mm_slli_epi16(in02, 2);
-        in03 = _mm_slli_epi16(in03, 2);
-        in04 = _mm_slli_epi16(in04, 2);
-        in05 = _mm_slli_epi16(in05, 2);
-        in06 = _mm_slli_epi16(in06, 2);
-        in07 = _mm_slli_epi16(in07, 2);
-        in08 = _mm_slli_epi16(in08, 2);
-        in09 = _mm_slli_epi16(in09, 2);
-        in10 = _mm_slli_epi16(in10, 2);
-        in11 = _mm_slli_epi16(in11, 2);
-        in12 = _mm_slli_epi16(in12, 2);
-        in13 = _mm_slli_epi16(in13, 2);
-        in14 = _mm_slli_epi16(in14, 2);
-        in15 = _mm_slli_epi16(in15, 2);
-      } else {
-        in00 = _mm_load_si128((const __m128i *)(in + 0 * 16));
-        in01 = _mm_load_si128((const __m128i *)(in + 1 * 16));
-        in02 = _mm_load_si128((const __m128i *)(in + 2 * 16));
-        in03 = _mm_load_si128((const __m128i *)(in + 3 * 16));
-        in04 = _mm_load_si128((const __m128i *)(in + 4 * 16));
-        in05 = _mm_load_si128((const __m128i *)(in + 5 * 16));
-        in06 = _mm_load_si128((const __m128i *)(in + 6 * 16));
-        in07 = _mm_load_si128((const __m128i *)(in + 7 * 16));
-        in08 = _mm_load_si128((const __m128i *)(in + 8 * 16));
-        in09 = _mm_load_si128((const __m128i *)(in + 9 * 16));
-        in10 = _mm_load_si128((const __m128i *)(in + 10 * 16));
-        in11 = _mm_load_si128((const __m128i *)(in + 11 * 16));
-        in12 = _mm_load_si128((const __m128i *)(in + 12 * 16));
-        in13 = _mm_load_si128((const __m128i *)(in + 13 * 16));
-        in14 = _mm_load_si128((const __m128i *)(in + 14 * 16));
-        in15 = _mm_load_si128((const __m128i *)(in + 15 * 16));
-        // x = (x + 1) >> 2
-        in00 = _mm_add_epi16(in00, kOne);
-        in01 = _mm_add_epi16(in01, kOne);
-        in02 = _mm_add_epi16(in02, kOne);
-        in03 = _mm_add_epi16(in03, kOne);
-        in04 = _mm_add_epi16(in04, kOne);
-        in05 = _mm_add_epi16(in05, kOne);
-        in06 = _mm_add_epi16(in06, kOne);
-        in07 = _mm_add_epi16(in07, kOne);
-        in08 = _mm_add_epi16(in08, kOne);
-        in09 = _mm_add_epi16(in09, kOne);
-        in10 = _mm_add_epi16(in10, kOne);
-        in11 = _mm_add_epi16(in11, kOne);
-        in12 = _mm_add_epi16(in12, kOne);
-        in13 = _mm_add_epi16(in13, kOne);
-        in14 = _mm_add_epi16(in14, kOne);
-        in15 = _mm_add_epi16(in15, kOne);
-        in00 = _mm_srai_epi16(in00, 2);
-        in01 = _mm_srai_epi16(in01, 2);
-        in02 = _mm_srai_epi16(in02, 2);
-        in03 = _mm_srai_epi16(in03, 2);
-        in04 = _mm_srai_epi16(in04, 2);
-        in05 = _mm_srai_epi16(in05, 2);
-        in06 = _mm_srai_epi16(in06, 2);
-        in07 = _mm_srai_epi16(in07, 2);
-        in08 = _mm_srai_epi16(in08, 2);
-        in09 = _mm_srai_epi16(in09, 2);
-        in10 = _mm_srai_epi16(in10, 2);
-        in11 = _mm_srai_epi16(in11, 2);
-        in12 = _mm_srai_epi16(in12, 2);
-        in13 = _mm_srai_epi16(in13, 2);
-        in14 = _mm_srai_epi16(in14, 2);
-        in15 = _mm_srai_epi16(in15, 2);
-      }
-      in += 8;
-      // Calculate input for the first 8 results.
-      {
-        input0 = ADD_EPI16(in00, in15);
-        input1 = ADD_EPI16(in01, in14);
-        input2 = ADD_EPI16(in02, in13);
-        input3 = ADD_EPI16(in03, in12);
-        input4 = ADD_EPI16(in04, in11);
-        input5 = ADD_EPI16(in05, in10);
-        input6 = ADD_EPI16(in06, in09);
-        input7 = ADD_EPI16(in07, in08);
-#if DCT_HIGH_BIT_DEPTH
-        overflow = check_epi16_overflow_x8(&input0, &input1, &input2, &input3,
-                                           &input4, &input5, &input6, &input7);
-        if (overflow) {
-          aom_highbd_fdct16x16_c(input, output, stride);
-          return;
-        }
-#endif  // DCT_HIGH_BIT_DEPTH
-      }
-      // Calculate input for the next 8 results.
-      {
-        step1_0 = SUB_EPI16(in07, in08);
-        step1_1 = SUB_EPI16(in06, in09);
-        step1_2 = SUB_EPI16(in05, in10);
-        step1_3 = SUB_EPI16(in04, in11);
-        step1_4 = SUB_EPI16(in03, in12);
-        step1_5 = SUB_EPI16(in02, in13);
-        step1_6 = SUB_EPI16(in01, in14);
-        step1_7 = SUB_EPI16(in00, in15);
-#if DCT_HIGH_BIT_DEPTH
-        overflow =
-            check_epi16_overflow_x8(&step1_0, &step1_1, &step1_2, &step1_3,
-                                    &step1_4, &step1_5, &step1_6, &step1_7);
-        if (overflow) {
-          aom_highbd_fdct16x16_c(input, output, stride);
-          return;
-        }
-#endif  // DCT_HIGH_BIT_DEPTH
-      }
-      // Work on the first eight values; fdct8(input, even_results);
-      {
-        // Add/subtract
-        const __m128i q0 = ADD_EPI16(input0, input7);
-        const __m128i q1 = ADD_EPI16(input1, input6);
-        const __m128i q2 = ADD_EPI16(input2, input5);
-        const __m128i q3 = ADD_EPI16(input3, input4);
-        const __m128i q4 = SUB_EPI16(input3, input4);
-        const __m128i q5 = SUB_EPI16(input2, input5);
-        const __m128i q6 = SUB_EPI16(input1, input6);
-        const __m128i q7 = SUB_EPI16(input0, input7);
-#if DCT_HIGH_BIT_DEPTH
-        overflow =
-            check_epi16_overflow_x8(&q0, &q1, &q2, &q3, &q4, &q5, &q6, &q7);
-        if (overflow) {
-          aom_highbd_fdct16x16_c(input, output, stride);
-          return;
-        }
-#endif  // DCT_HIGH_BIT_DEPTH
-        // Work on first four results
-        {
-          // Add/subtract
-          const __m128i r0 = ADD_EPI16(q0, q3);
-          const __m128i r1 = ADD_EPI16(q1, q2);
-          const __m128i r2 = SUB_EPI16(q1, q2);
-          const __m128i r3 = SUB_EPI16(q0, q3);
-#if DCT_HIGH_BIT_DEPTH
-          overflow = check_epi16_overflow_x4(&r0, &r1, &r2, &r3);
-          if (overflow) {
-            aom_highbd_fdct16x16_c(input, output, stride);
-            return;
-          }
-#endif  // DCT_HIGH_BIT_DEPTH
-          // Interleave to do the multiply by constants which gets us
-          // into 32 bits.
-          {
-            const __m128i t0 = _mm_unpacklo_epi16(r0, r1);
-            const __m128i t1 = _mm_unpackhi_epi16(r0, r1);
-            const __m128i t2 = _mm_unpacklo_epi16(r2, r3);
-            const __m128i t3 = _mm_unpackhi_epi16(r2, r3);
-            res00 = mult_round_shift(&t0, &t1, &k__cospi_p16_p16,
-                                     &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
-            res08 = mult_round_shift(&t0, &t1, &k__cospi_p16_m16,
-                                     &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
-            res04 = mult_round_shift(&t2, &t3, &k__cospi_p24_p08,
-                                     &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
-            res12 = mult_round_shift(&t2, &t3, &k__cospi_m08_p24,
-                                     &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
-#if DCT_HIGH_BIT_DEPTH
-            overflow = check_epi16_overflow_x4(&res00, &res08, &res04, &res12);
-            if (overflow) {
-              aom_highbd_fdct16x16_c(input, output, stride);
-              return;
-            }
-#endif  // DCT_HIGH_BIT_DEPTH
-          }
-        }
-        // Work on next four results
-        {
-          // Interleave to do the multiply by constants which gets us
-          // into 32 bits.
-          const __m128i d0 = _mm_unpacklo_epi16(q6, q5);
-          const __m128i d1 = _mm_unpackhi_epi16(q6, q5);
-          const __m128i r0 =
-              mult_round_shift(&d0, &d1, &k__cospi_p16_m16,
-                               &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
-          const __m128i r1 =
-              mult_round_shift(&d0, &d1, &k__cospi_p16_p16,
-                               &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
-#if DCT_HIGH_BIT_DEPTH
-          overflow = check_epi16_overflow_x2(&r0, &r1);
-          if (overflow) {
-            aom_highbd_fdct16x16_c(input, output, stride);
-            return;
-          }
-#endif  // DCT_HIGH_BIT_DEPTH
-          {
-            // Add/subtract
-            const __m128i x0 = ADD_EPI16(q4, r0);
-            const __m128i x1 = SUB_EPI16(q4, r0);
-            const __m128i x2 = SUB_EPI16(q7, r1);
-            const __m128i x3 = ADD_EPI16(q7, r1);
-#if DCT_HIGH_BIT_DEPTH
-            overflow = check_epi16_overflow_x4(&x0, &x1, &x2, &x3);
-            if (overflow) {
-              aom_highbd_fdct16x16_c(input, output, stride);
-              return;
-            }
-#endif  // DCT_HIGH_BIT_DEPTH
-            // Interleave to do the multiply by constants which gets us
-            // into 32 bits.
-            {
-              const __m128i t0 = _mm_unpacklo_epi16(x0, x3);
-              const __m128i t1 = _mm_unpackhi_epi16(x0, x3);
-              const __m128i t2 = _mm_unpacklo_epi16(x1, x2);
-              const __m128i t3 = _mm_unpackhi_epi16(x1, x2);
-              res02 = mult_round_shift(&t0, &t1, &k__cospi_p28_p04,
-                                       &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
-              res14 = mult_round_shift(&t0, &t1, &k__cospi_m04_p28,
-                                       &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
-              res10 = mult_round_shift(&t2, &t3, &k__cospi_p12_p20,
-                                       &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
-              res06 = mult_round_shift(&t2, &t3, &k__cospi_m20_p12,
-                                       &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
-#if DCT_HIGH_BIT_DEPTH
-              overflow =
-                  check_epi16_overflow_x4(&res02, &res14, &res10, &res06);
-              if (overflow) {
-                aom_highbd_fdct16x16_c(input, output, stride);
-                return;
-              }
-#endif  // DCT_HIGH_BIT_DEPTH
-            }
-          }
-        }
-      }
-      // Work on the next eight values; step1 -> odd_results
-      {
-        // step 2
-        {
-          const __m128i t0 = _mm_unpacklo_epi16(step1_5, step1_2);
-          const __m128i t1 = _mm_unpackhi_epi16(step1_5, step1_2);
-          const __m128i t2 = _mm_unpacklo_epi16(step1_4, step1_3);
-          const __m128i t3 = _mm_unpackhi_epi16(step1_4, step1_3);
-          step2_2 = mult_round_shift(&t0, &t1, &k__cospi_p16_m16,
-                                     &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
-          step2_3 = mult_round_shift(&t2, &t3, &k__cospi_p16_m16,
-                                     &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
-          step2_5 = mult_round_shift(&t0, &t1, &k__cospi_p16_p16,
-                                     &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
-          step2_4 = mult_round_shift(&t2, &t3, &k__cospi_p16_p16,
-                                     &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
-#if DCT_HIGH_BIT_DEPTH
-          overflow =
-              check_epi16_overflow_x4(&step2_2, &step2_3, &step2_5, &step2_4);
-          if (overflow) {
-            aom_highbd_fdct16x16_c(input, output, stride);
-            return;
-          }
-#endif  // DCT_HIGH_BIT_DEPTH
-        }
-        // step 3
-        {
-          step3_0 = ADD_EPI16(step1_0, step2_3);
-          step3_1 = ADD_EPI16(step1_1, step2_2);
-          step3_2 = SUB_EPI16(step1_1, step2_2);
-          step3_3 = SUB_EPI16(step1_0, step2_3);
-          step3_4 = SUB_EPI16(step1_7, step2_4);
-          step3_5 = SUB_EPI16(step1_6, step2_5);
-          step3_6 = ADD_EPI16(step1_6, step2_5);
-          step3_7 = ADD_EPI16(step1_7, step2_4);
-#if DCT_HIGH_BIT_DEPTH
-          overflow =
-              check_epi16_overflow_x8(&step3_0, &step3_1, &step3_2, &step3_3,
-                                      &step3_4, &step3_5, &step3_6, &step3_7);
-          if (overflow) {
-            aom_highbd_fdct16x16_c(input, output, stride);
-            return;
-          }
-#endif  // DCT_HIGH_BIT_DEPTH
-        }
-        // step 4
-        {
-          const __m128i t0 = _mm_unpacklo_epi16(step3_1, step3_6);
-          const __m128i t1 = _mm_unpackhi_epi16(step3_1, step3_6);
-          const __m128i t2 = _mm_unpacklo_epi16(step3_2, step3_5);
-          const __m128i t3 = _mm_unpackhi_epi16(step3_2, step3_5);
-          step2_1 = mult_round_shift(&t0, &t1, &k__cospi_m08_p24,
-                                     &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
-          step2_2 = mult_round_shift(&t2, &t3, &k__cospi_p24_p08,
-                                     &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
-          step2_6 = mult_round_shift(&t0, &t1, &k__cospi_p24_p08,
-                                     &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
-          step2_5 = mult_round_shift(&t2, &t3, &k__cospi_p08_m24,
-                                     &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
-#if DCT_HIGH_BIT_DEPTH
-          overflow =
-              check_epi16_overflow_x4(&step2_1, &step2_2, &step2_6, &step2_5);
-          if (overflow) {
-            aom_highbd_fdct16x16_c(input, output, stride);
-            return;
-          }
-#endif  // DCT_HIGH_BIT_DEPTH
-        }
-        // step 5
-        {
-          step1_0 = ADD_EPI16(step3_0, step2_1);
-          step1_1 = SUB_EPI16(step3_0, step2_1);
-          step1_2 = ADD_EPI16(step3_3, step2_2);
-          step1_3 = SUB_EPI16(step3_3, step2_2);
-          step1_4 = SUB_EPI16(step3_4, step2_5);
-          step1_5 = ADD_EPI16(step3_4, step2_5);
-          step1_6 = SUB_EPI16(step3_7, step2_6);
-          step1_7 = ADD_EPI16(step3_7, step2_6);
-#if DCT_HIGH_BIT_DEPTH
-          overflow =
-              check_epi16_overflow_x8(&step1_0, &step1_1, &step1_2, &step1_3,
-                                      &step1_4, &step1_5, &step1_6, &step1_7);
-          if (overflow) {
-            aom_highbd_fdct16x16_c(input, output, stride);
-            return;
-          }
-#endif  // DCT_HIGH_BIT_DEPTH
-        }
-        // step 6
-        {
-          const __m128i t0 = _mm_unpacklo_epi16(step1_0, step1_7);
-          const __m128i t1 = _mm_unpackhi_epi16(step1_0, step1_7);
-          const __m128i t2 = _mm_unpacklo_epi16(step1_1, step1_6);
-          const __m128i t3 = _mm_unpackhi_epi16(step1_1, step1_6);
-          res01 = mult_round_shift(&t0, &t1, &k__cospi_p30_p02,
-                                   &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
-          res09 = mult_round_shift(&t2, &t3, &k__cospi_p14_p18,
-                                   &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
-          res15 = mult_round_shift(&t0, &t1, &k__cospi_m02_p30,
-                                   &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
-          res07 = mult_round_shift(&t2, &t3, &k__cospi_m18_p14,
-                                   &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
-#if DCT_HIGH_BIT_DEPTH
-          overflow = check_epi16_overflow_x4(&res01, &res09, &res15, &res07);
-          if (overflow) {
-            aom_highbd_fdct16x16_c(input, output, stride);
-            return;
-          }
-#endif  // DCT_HIGH_BIT_DEPTH
-        }
-        {
-          const __m128i t0 = _mm_unpacklo_epi16(step1_2, step1_5);
-          const __m128i t1 = _mm_unpackhi_epi16(step1_2, step1_5);
-          const __m128i t2 = _mm_unpacklo_epi16(step1_3, step1_4);
-          const __m128i t3 = _mm_unpackhi_epi16(step1_3, step1_4);
-          res05 = mult_round_shift(&t0, &t1, &k__cospi_p22_p10,
-                                   &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
-          res13 = mult_round_shift(&t2, &t3, &k__cospi_p06_p26,
-                                   &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
-          res11 = mult_round_shift(&t0, &t1, &k__cospi_m10_p22,
-                                   &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
-          res03 = mult_round_shift(&t2, &t3, &k__cospi_m26_p06,
-                                   &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
-#if DCT_HIGH_BIT_DEPTH
-          overflow = check_epi16_overflow_x4(&res05, &res13, &res11, &res03);
-          if (overflow) {
-            aom_highbd_fdct16x16_c(input, output, stride);
-            return;
-          }
-#endif  // DCT_HIGH_BIT_DEPTH
-        }
-      }
-      // Transpose the results as two 8x8 transposes.
-      transpose_and_output8x8(&res00, &res01, &res02, &res03, &res04, &res05,
-                              &res06, &res07, pass, out0, out1);
-      transpose_and_output8x8(&res08, &res09, &res10, &res11, &res12, &res13,
-                              &res14, &res15, pass, out0 + 8, out1 + 8);
-      if (pass == 0) {
-        out0 += 8 * 16;
-      } else {
-        out1 += 8 * 16;
-      }
-    }
-    // Set up in/out for the next pass.
-    in = intermediate;
-  }
-}
-
-#undef ADD_EPI16
-#undef SUB_EPI16
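
Note on the deleted block above: the 16x16 forward DCT runs in saturating 16-bit arithmetic, and when DCT_HIGH_BIT_DEPTH is set it re-checks for saturation after every butterfly stage, bailing out to the scalar aom_highbd_fdct16x16_c on the first hit. A minimal sketch of how such a check can be built, assuming the same saturation semantics as check_epi16_overflow_x4():

#include <emmintrin.h>

/* Sketch: detect lanes that hit the int16 saturation limits, which is the
 * condition the overflow checks above use to fall back to the C transform. */
static int any_epi16_saturated(__m128i v) {
  const __m128i pos_max = _mm_set1_epi16(0x7fff);
  const __m128i neg_max = _mm_set1_epi16((short)0x8000);
  const __m128i hit = _mm_or_si128(_mm_cmpeq_epi16(v, pos_max),
                                   _mm_cmpeq_epi16(v, neg_max));
  return _mm_movemask_epi8(hit) != 0;
}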
diff --git a/av1/common/x86/av1_fwd_txfm_sse2.c b/av1/common/x86/av1_fwd_txfm_sse2.c
deleted file mode 100644
index 081fe08..0000000
--- a/av1/common/x86/av1_fwd_txfm_sse2.c
+++ /dev/null
@@ -1,272 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <emmintrin.h>  // SSE2
-
-#include "./aom_config.h"
-#include "aom_dsp/aom_dsp_common.h"
-#include "aom_dsp/x86/fwd_txfm_sse2.h"
-
-void av1_fdct4x4_1_sse2(const int16_t *input, tran_low_t *output, int stride) {
-  __m128i in0, in1;
-  __m128i tmp;
-  const __m128i zero = _mm_setzero_si128();
-  in0 = _mm_loadl_epi64((const __m128i *)(input + 0 * stride));
-  in1 = _mm_loadl_epi64((const __m128i *)(input + 1 * stride));
-  in1 = _mm_unpacklo_epi64(
-      in1, _mm_loadl_epi64((const __m128i *)(input + 2 * stride)));
-  in0 = _mm_unpacklo_epi64(
-      in0, _mm_loadl_epi64((const __m128i *)(input + 3 * stride)));
-
-  tmp = _mm_add_epi16(in0, in1);
-  in0 = _mm_unpacklo_epi16(zero, tmp);
-  in1 = _mm_unpackhi_epi16(zero, tmp);
-  in0 = _mm_srai_epi32(in0, 16);
-  in1 = _mm_srai_epi32(in1, 16);
-
-  tmp = _mm_add_epi32(in0, in1);
-  in0 = _mm_unpacklo_epi32(tmp, zero);
-  in1 = _mm_unpackhi_epi32(tmp, zero);
-
-  tmp = _mm_add_epi32(in0, in1);
-  in0 = _mm_srli_si128(tmp, 8);
-
-  in1 = _mm_add_epi32(tmp, in0);
-  in0 = _mm_slli_epi32(in1, 1);
-  store_output(&in0, output);
-}
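
For reference, the function above is the DC-only forward path: it sums all sixteen residuals and doubles the result. A scalar sketch of the same computation (mirroring what I understand the C reference av1_fdct4x4_1_c to do; treat it as illustrative):

#include <stdint.h>

typedef int32_t tran_low_t; /* assumption: high-bitdepth build */

static void fdct4x4_1_ref(const int16_t *input, tran_low_t *output,
                          int stride) {
  int32_t sum = 0;
  for (int r = 0; r < 4; ++r)
    for (int c = 0; c < 4; ++c) sum += input[r * stride + c];
  output[0] = sum * 2; /* same <<1 scaling as the SIMD store above */
}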
-
-void av1_fdct8x8_1_sse2(const int16_t *input, tran_low_t *output, int stride) {
-  __m128i in0 = _mm_load_si128((const __m128i *)(input + 0 * stride));
-  __m128i in1 = _mm_load_si128((const __m128i *)(input + 1 * stride));
-  __m128i in2 = _mm_load_si128((const __m128i *)(input + 2 * stride));
-  __m128i in3 = _mm_load_si128((const __m128i *)(input + 3 * stride));
-  __m128i u0, u1, sum;
-
-  u0 = _mm_add_epi16(in0, in1);
-  u1 = _mm_add_epi16(in2, in3);
-
-  in0 = _mm_load_si128((const __m128i *)(input + 4 * stride));
-  in1 = _mm_load_si128((const __m128i *)(input + 5 * stride));
-  in2 = _mm_load_si128((const __m128i *)(input + 6 * stride));
-  in3 = _mm_load_si128((const __m128i *)(input + 7 * stride));
-
-  sum = _mm_add_epi16(u0, u1);
-
-  in0 = _mm_add_epi16(in0, in1);
-  in2 = _mm_add_epi16(in2, in3);
-  sum = _mm_add_epi16(sum, in0);
-
-  u0 = _mm_setzero_si128();
-  sum = _mm_add_epi16(sum, in2);
-
-  in0 = _mm_unpacklo_epi16(u0, sum);
-  in1 = _mm_unpackhi_epi16(u0, sum);
-  in0 = _mm_srai_epi32(in0, 16);
-  in1 = _mm_srai_epi32(in1, 16);
-
-  sum = _mm_add_epi32(in0, in1);
-  in0 = _mm_unpacklo_epi32(sum, u0);
-  in1 = _mm_unpackhi_epi32(sum, u0);
-
-  sum = _mm_add_epi32(in0, in1);
-  in0 = _mm_srli_si128(sum, 8);
-
-  in1 = _mm_add_epi32(sum, in0);
-  store_output(&in1, output);
-}
-
-void av1_fdct16x16_1_sse2(const int16_t *input, tran_low_t *output,
-                          int stride) {
-  __m128i in0, in1, in2, in3;
-  __m128i u0, u1;
-  __m128i sum = _mm_setzero_si128();
-  int i;
-
-  for (i = 0; i < 2; ++i) {
-    input += 8 * i;
-    in0 = _mm_load_si128((const __m128i *)(input + 0 * stride));
-    in1 = _mm_load_si128((const __m128i *)(input + 1 * stride));
-    in2 = _mm_load_si128((const __m128i *)(input + 2 * stride));
-    in3 = _mm_load_si128((const __m128i *)(input + 3 * stride));
-
-    u0 = _mm_add_epi16(in0, in1);
-    u1 = _mm_add_epi16(in2, in3);
-    sum = _mm_add_epi16(sum, u0);
-
-    in0 = _mm_load_si128((const __m128i *)(input + 4 * stride));
-    in1 = _mm_load_si128((const __m128i *)(input + 5 * stride));
-    in2 = _mm_load_si128((const __m128i *)(input + 6 * stride));
-    in3 = _mm_load_si128((const __m128i *)(input + 7 * stride));
-
-    sum = _mm_add_epi16(sum, u1);
-    u0 = _mm_add_epi16(in0, in1);
-    u1 = _mm_add_epi16(in2, in3);
-    sum = _mm_add_epi16(sum, u0);
-
-    in0 = _mm_load_si128((const __m128i *)(input + 8 * stride));
-    in1 = _mm_load_si128((const __m128i *)(input + 9 * stride));
-    in2 = _mm_load_si128((const __m128i *)(input + 10 * stride));
-    in3 = _mm_load_si128((const __m128i *)(input + 11 * stride));
-
-    sum = _mm_add_epi16(sum, u1);
-    u0 = _mm_add_epi16(in0, in1);
-    u1 = _mm_add_epi16(in2, in3);
-    sum = _mm_add_epi16(sum, u0);
-
-    in0 = _mm_load_si128((const __m128i *)(input + 12 * stride));
-    in1 = _mm_load_si128((const __m128i *)(input + 13 * stride));
-    in2 = _mm_load_si128((const __m128i *)(input + 14 * stride));
-    in3 = _mm_load_si128((const __m128i *)(input + 15 * stride));
-
-    sum = _mm_add_epi16(sum, u1);
-    u0 = _mm_add_epi16(in0, in1);
-    u1 = _mm_add_epi16(in2, in3);
-    sum = _mm_add_epi16(sum, u0);
-
-    sum = _mm_add_epi16(sum, u1);
-  }
-
-  u0 = _mm_setzero_si128();
-  in0 = _mm_unpacklo_epi16(u0, sum);
-  in1 = _mm_unpackhi_epi16(u0, sum);
-  in0 = _mm_srai_epi32(in0, 16);
-  in1 = _mm_srai_epi32(in1, 16);
-
-  sum = _mm_add_epi32(in0, in1);
-  in0 = _mm_unpacklo_epi32(sum, u0);
-  in1 = _mm_unpackhi_epi32(sum, u0);
-
-  sum = _mm_add_epi32(in0, in1);
-  in0 = _mm_srli_si128(sum, 8);
-
-  in1 = _mm_add_epi32(sum, in0);
-  in1 = _mm_srai_epi32(in1, 1);
-  store_output(&in1, output);
-}
-
-void av1_fdct32x32_1_sse2(const int16_t *input, tran_low_t *output,
-                          int stride) {
-  __m128i in0, in1, in2, in3;
-  __m128i u0, u1;
-  __m128i sum = _mm_setzero_si128();
-  int i;
-
-  for (i = 0; i < 8; ++i) {
-    in0 = _mm_load_si128((const __m128i *)(input + 0));
-    in1 = _mm_load_si128((const __m128i *)(input + 8));
-    in2 = _mm_load_si128((const __m128i *)(input + 16));
-    in3 = _mm_load_si128((const __m128i *)(input + 24));
-
-    input += stride;
-    u0 = _mm_add_epi16(in0, in1);
-    u1 = _mm_add_epi16(in2, in3);
-    sum = _mm_add_epi16(sum, u0);
-
-    in0 = _mm_load_si128((const __m128i *)(input + 0));
-    in1 = _mm_load_si128((const __m128i *)(input + 8));
-    in2 = _mm_load_si128((const __m128i *)(input + 16));
-    in3 = _mm_load_si128((const __m128i *)(input + 24));
-
-    input += stride;
-    sum = _mm_add_epi16(sum, u1);
-    u0 = _mm_add_epi16(in0, in1);
-    u1 = _mm_add_epi16(in2, in3);
-    sum = _mm_add_epi16(sum, u0);
-
-    in0 = _mm_load_si128((const __m128i *)(input + 0));
-    in1 = _mm_load_si128((const __m128i *)(input + 8));
-    in2 = _mm_load_si128((const __m128i *)(input + 16));
-    in3 = _mm_load_si128((const __m128i *)(input + 24));
-
-    input += stride;
-    sum = _mm_add_epi16(sum, u1);
-    u0 = _mm_add_epi16(in0, in1);
-    u1 = _mm_add_epi16(in2, in3);
-    sum = _mm_add_epi16(sum, u0);
-
-    in0 = _mm_load_si128((const __m128i *)(input + 0));
-    in1 = _mm_load_si128((const __m128i *)(input + 8));
-    in2 = _mm_load_si128((const __m128i *)(input + 16));
-    in3 = _mm_load_si128((const __m128i *)(input + 24));
-
-    input += stride;
-    sum = _mm_add_epi16(sum, u1);
-    u0 = _mm_add_epi16(in0, in1);
-    u1 = _mm_add_epi16(in2, in3);
-    sum = _mm_add_epi16(sum, u0);
-
-    sum = _mm_add_epi16(sum, u1);
-  }
-
-  u0 = _mm_setzero_si128();
-  in0 = _mm_unpacklo_epi16(u0, sum);
-  in1 = _mm_unpackhi_epi16(u0, sum);
-  in0 = _mm_srai_epi32(in0, 16);
-  in1 = _mm_srai_epi32(in1, 16);
-
-  sum = _mm_add_epi32(in0, in1);
-  in0 = _mm_unpacklo_epi32(sum, u0);
-  in1 = _mm_unpackhi_epi32(sum, u0);
-
-  sum = _mm_add_epi32(in0, in1);
-  in0 = _mm_srli_si128(sum, 8);
-
-  in1 = _mm_add_epi32(sum, in0);
-  in1 = _mm_srai_epi32(in1, 3);
-  store_output(&in1, output);
-}
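
All four `_1` kernels in this file end with the same widening horizontal reduction: unpack the 16-bit accumulator against zero, arithmetic-shift right by 16 to sign-extend, then fold the 32-bit lanes down to one. Only the final scaling differs per size: <<1 for 4x4, none for 8x8, >>1 for 16x16, >>3 for 32x32. Pulled out as a standalone helper (a sketch; the deleted code inlines and interleaves these steps):

#include <emmintrin.h>
#include <stdint.h>

static int32_t hsum_widen_epi16(__m128i sum) {
  const __m128i zero = _mm_setzero_si128();
  /* unpack-with-zero puts each int16 in the high half of an int32 lane;
   * >>16 then sign-extends it, exactly as in the code above */
  __m128i lo = _mm_srai_epi32(_mm_unpacklo_epi16(zero, sum), 16);
  __m128i hi = _mm_srai_epi32(_mm_unpackhi_epi16(zero, sum), 16);
  __m128i s = _mm_add_epi32(lo, hi);
  s = _mm_add_epi32(s, _mm_srli_si128(s, 8)); /* fold upper two lanes */
  s = _mm_add_epi32(s, _mm_srli_si128(s, 4)); /* fold remaining lane */
  return _mm_cvtsi128_si32(s);
}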
-
-#define DCT_HIGH_BIT_DEPTH 0
-#define FDCT4x4_2D av1_fdct4x4_sse2
-#define FDCT8x8_2D av1_fdct8x8_sse2
-#define FDCT16x16_2D av1_fdct16x16_sse2
-#include "av1/common/x86/av1_fwd_txfm_impl_sse2.h"
-#undef FDCT4x4_2D
-#undef FDCT8x8_2D
-#undef FDCT16x16_2D
-
-#define FDCT32x32_2D av1_fdct32x32_rd_sse2
-#define FDCT32x32_HIGH_PRECISION 0
-#include "av1/common/x86/av1_fwd_dct32x32_impl_sse2.h"
-#undef FDCT32x32_2D
-#undef FDCT32x32_HIGH_PRECISION
-
-#define FDCT32x32_2D av1_fdct32x32_sse2
-#define FDCT32x32_HIGH_PRECISION 1
-#include "av1/common/x86/av1_fwd_dct32x32_impl_sse2.h"  // NOLINT
-#undef FDCT32x32_2D
-#undef FDCT32x32_HIGH_PRECISION
-#undef DCT_HIGH_BIT_DEPTH
-
-#if CONFIG_AOM_HIGHBITDEPTH
-#define DCT_HIGH_BIT_DEPTH 1
-#define FDCT4x4_2D av1_highbd_fdct4x4_sse2
-#define FDCT8x8_2D av1_highbd_fdct8x8_sse2
-#define FDCT16x16_2D av1_highbd_fdct16x16_sse2
-#include "av1/common/x86/av1_fwd_txfm_impl_sse2.h"  // NOLINT
-#undef FDCT4x4_2D
-#undef FDCT8x8_2D
-#undef FDCT16x16_2D
-
-#define FDCT32x32_2D av1_highbd_fdct32x32_rd_sse2
-#define FDCT32x32_HIGH_PRECISION 0
-#include "av1/common/x86/av1_fwd_dct32x32_impl_sse2.h"  // NOLINT
-#undef FDCT32x32_2D
-#undef FDCT32x32_HIGH_PRECISION
-
-#define FDCT32x32_2D av1_highbd_fdct32x32_sse2
-#define FDCT32x32_HIGH_PRECISION 1
-#include "av1/common/x86/av1_fwd_dct32x32_impl_sse2.h"  // NOLINT
-#undef FDCT32x32_2D
-#undef FDCT32x32_HIGH_PRECISION
-#undef DCT_HIGH_BIT_DEPTH
-#endif  // CONFIG_AOM_HIGHBITDEPTH
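
The tail of this deleted file uses the "template by include" idiom: each #include of av1_fwd_txfm_impl_sse2.h or av1_fwd_dct32x32_impl_sse2.h stamps out the same implementation under a different function name and precision, controlled by the FDCT*_2D, FDCT32x32_HIGH_PRECISION, and DCT_HIGH_BIT_DEPTH macros. A stripped-down illustration of the idiom (file and macro names here are hypothetical):

/* impl.inc -- included multiple times, never compiled on its own */
#ifndef KERNEL_NAME
#error "define KERNEL_NAME and KERNEL_SHIFT before including impl.inc"
#endif
void KERNEL_NAME(const short *in, int *out) { *out = *in >> KERNEL_SHIFT; }

/* user.c -- each include instantiates a new kernel */
#define KERNEL_NAME kernel_lbd
#define KERNEL_SHIFT 0
#include "impl.inc"
#undef KERNEL_NAME
#undef KERNEL_SHIFT

#define KERNEL_NAME kernel_hbd
#define KERNEL_SHIFT 2
#include "impl.inc"
#undef KERNEL_NAME
#undef KERNEL_SHIFT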
diff --git a/av1/common/x86/av1_inv_txfm_sse2.c b/av1/common/x86/av1_inv_txfm_sse2.c
deleted file mode 100644
index 365c124..0000000
--- a/av1/common/x86/av1_inv_txfm_sse2.c
+++ /dev/null
@@ -1,4028 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include "./av1_rtcd.h"
-#include "av1/common/x86/av1_inv_txfm_sse2.h"
-#include "aom_dsp/x86/txfm_common_sse2.h"
-
-#define RECON_AND_STORE4X4(dest, in_x)                    \
-  {                                                       \
-    __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest)); \
-    d0 = _mm_unpacklo_epi8(d0, zero);                     \
-    d0 = _mm_add_epi16(in_x, d0);                         \
-    d0 = _mm_packus_epi16(d0, d0);                        \
-    *(int *)(dest) = _mm_cvtsi128_si32(d0);               \
-  }
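
RECON_AND_STORE4X4 above widens four destination bytes to 16 bits, adds the residual row, and packs back with unsigned saturation. A scalar sketch of the same reconstruction step:

#include <stdint.h>

static void recon_row4_ref(uint8_t *dest, const int16_t *residual) {
  for (int i = 0; i < 4; ++i) {
    int v = dest[i] + residual[i];
    dest[i] = (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v)); /* packus clamp */
  }
}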
-
-void av1_idct4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
-  const __m128i zero = _mm_setzero_si128();
-  const __m128i eight = _mm_set1_epi16(8);
-  const __m128i cst = _mm_setr_epi16(
-      (int16_t)cospi_16_64, (int16_t)cospi_16_64, (int16_t)cospi_16_64,
-      (int16_t)-cospi_16_64, (int16_t)cospi_24_64, (int16_t)-cospi_8_64,
-      (int16_t)cospi_8_64, (int16_t)cospi_24_64);
-  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
-  __m128i input0, input1, input2, input3;
-
-  // Rows
-  input0 = _mm_load_si128((const __m128i *)input);
-  input2 = _mm_load_si128((const __m128i *)(input + 8));
-
-  // Construct i3, i1, i3, i1, i2, i0, i2, i0
-  input0 = _mm_shufflelo_epi16(input0, 0xd8);
-  input0 = _mm_shufflehi_epi16(input0, 0xd8);
-  input2 = _mm_shufflelo_epi16(input2, 0xd8);
-  input2 = _mm_shufflehi_epi16(input2, 0xd8);
-
-  input1 = _mm_unpackhi_epi32(input0, input0);
-  input0 = _mm_unpacklo_epi32(input0, input0);
-  input3 = _mm_unpackhi_epi32(input2, input2);
-  input2 = _mm_unpacklo_epi32(input2, input2);
-
-  // Stage 1
-  input0 = _mm_madd_epi16(input0, cst);
-  input1 = _mm_madd_epi16(input1, cst);
-  input2 = _mm_madd_epi16(input2, cst);
-  input3 = _mm_madd_epi16(input3, cst);
-
-  input0 = _mm_add_epi32(input0, rounding);
-  input1 = _mm_add_epi32(input1, rounding);
-  input2 = _mm_add_epi32(input2, rounding);
-  input3 = _mm_add_epi32(input3, rounding);
-
-  input0 = _mm_srai_epi32(input0, DCT_CONST_BITS);
-  input1 = _mm_srai_epi32(input1, DCT_CONST_BITS);
-  input2 = _mm_srai_epi32(input2, DCT_CONST_BITS);
-  input3 = _mm_srai_epi32(input3, DCT_CONST_BITS);
-
-  // Stage 2
-  input0 = _mm_packs_epi32(input0, input1);
-  input1 = _mm_packs_epi32(input2, input3);
-
-  // Transpose
-  input2 = _mm_unpacklo_epi16(input0, input1);
-  input3 = _mm_unpackhi_epi16(input0, input1);
-  input0 = _mm_unpacklo_epi32(input2, input3);
-  input1 = _mm_unpackhi_epi32(input2, input3);
-
-  // Swap columns 2 and 3; the add/sub below then yields:
-  // input2: column 1, column 0;  input3: column 2, column 3.
-  input1 = _mm_shuffle_epi32(input1, 0x4e);
-  input2 = _mm_add_epi16(input0, input1);
-  input3 = _mm_sub_epi16(input0, input1);
-
-  // Columns
-  // Construct i3, i1, i3, i1, i2, i0, i2, i0
-  input0 = _mm_unpacklo_epi32(input2, input2);
-  input1 = _mm_unpackhi_epi32(input2, input2);
-  input2 = _mm_unpackhi_epi32(input3, input3);
-  input3 = _mm_unpacklo_epi32(input3, input3);
-
-  // Stage 1
-  input0 = _mm_madd_epi16(input0, cst);
-  input1 = _mm_madd_epi16(input1, cst);
-  input2 = _mm_madd_epi16(input2, cst);
-  input3 = _mm_madd_epi16(input3, cst);
-
-  input0 = _mm_add_epi32(input0, rounding);
-  input1 = _mm_add_epi32(input1, rounding);
-  input2 = _mm_add_epi32(input2, rounding);
-  input3 = _mm_add_epi32(input3, rounding);
-
-  input0 = _mm_srai_epi32(input0, DCT_CONST_BITS);
-  input1 = _mm_srai_epi32(input1, DCT_CONST_BITS);
-  input2 = _mm_srai_epi32(input2, DCT_CONST_BITS);
-  input3 = _mm_srai_epi32(input3, DCT_CONST_BITS);
-
-  // Stage 2
-  input0 = _mm_packs_epi32(input0, input2);
-  input1 = _mm_packs_epi32(input1, input3);
-
-  // Transpose
-  input2 = _mm_unpacklo_epi16(input0, input1);
-  input3 = _mm_unpackhi_epi16(input0, input1);
-  input0 = _mm_unpacklo_epi32(input2, input3);
-  input1 = _mm_unpackhi_epi32(input2, input3);
-
-  // Swap columns 2 and 3; the add/sub below then yields:
-  // input2: column 1, column 0;  input3: column 2, column 3.
-  input1 = _mm_shuffle_epi32(input1, 0x4e);
-  input2 = _mm_add_epi16(input0, input1);
-  input3 = _mm_sub_epi16(input0, input1);
-
-  // Final round and shift
-  input2 = _mm_add_epi16(input2, eight);
-  input3 = _mm_add_epi16(input3, eight);
-
-  input2 = _mm_srai_epi16(input2, 4);
-  input3 = _mm_srai_epi16(input3, 4);
-
-  // Reconstruction and Store
-  {
-    __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest));
-    __m128i d2 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 2));
-    d0 = _mm_unpacklo_epi32(d0,
-                            _mm_cvtsi32_si128(*(const int *)(dest + stride)));
-    d2 = _mm_unpacklo_epi32(
-        _mm_cvtsi32_si128(*(const int *)(dest + stride * 3)), d2);
-    d0 = _mm_unpacklo_epi8(d0, zero);
-    d2 = _mm_unpacklo_epi8(d2, zero);
-    d0 = _mm_add_epi16(d0, input2);
-    d2 = _mm_add_epi16(d2, input3);
-    d0 = _mm_packus_epi16(d0, d2);
-    // store input0
-    *(int *)dest = _mm_cvtsi128_si32(d0);
-    // store input1
-    d0 = _mm_srli_si128(d0, 4);
-    *(int *)(dest + stride) = _mm_cvtsi128_si32(d0);
-    // store input2
-    d0 = _mm_srli_si128(d0, 4);
-    *(int *)(dest + stride * 3) = _mm_cvtsi128_si32(d0);
-    // store input3
-    d0 = _mm_srli_si128(d0, 4);
-    *(int *)(dest + stride * 2) = _mm_cvtsi128_si32(d0);
-  }
-}
-
-void av1_idct4x4_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
-  __m128i dc_value;
-  const __m128i zero = _mm_setzero_si128();
-  int a;
-
-  a = (int)dct_const_round_shift(input[0] * cospi_16_64);
-  a = (int)dct_const_round_shift(a * cospi_16_64);
-  a = ROUND_POWER_OF_TWO(a, 4);
-
-  dc_value = _mm_set1_epi16(a);
-
-  RECON_AND_STORE4X4(dest + 0 * stride, dc_value);
-  RECON_AND_STORE4X4(dest + 1 * stride, dc_value);
-  RECON_AND_STORE4X4(dest + 2 * stride, dc_value);
-  RECON_AND_STORE4X4(dest + 3 * stride, dc_value);
-}
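
The DC-only inverse path above runs the single coefficient through both 1-D stages (each a multiply by cospi_16_64 plus a rounding shift) and then the final output rounding. Written out in scalar form, with the constants this codebase uses (cospi_16_64 = 11585, DCT_CONST_BITS = 14) as stated assumptions:

#include <stdint.h>

#define DCT_CONST_BITS 14
#define COSPI_16_64 11585

static int32_t round_shift_ref(int64_t v) {
  return (int32_t)((v + (1 << (DCT_CONST_BITS - 1))) >> DCT_CONST_BITS);
}

static int dc_value_ref(int16_t dc) {
  int a = round_shift_ref((int64_t)dc * COSPI_16_64);
  a = round_shift_ref((int64_t)a * COSPI_16_64);
  return (a + 8) >> 4; /* ROUND_POWER_OF_TWO(a, 4) for the 4x4 output */
}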
-
-static INLINE void transpose_4x4(__m128i *res) {
-  const __m128i tr0_0 = _mm_unpacklo_epi16(res[0], res[1]);
-  const __m128i tr0_1 = _mm_unpackhi_epi16(res[0], res[1]);
-
-  res[0] = _mm_unpacklo_epi16(tr0_0, tr0_1);
-  res[1] = _mm_unpackhi_epi16(tr0_0, tr0_1);
-}
-
-void av1_idct4_sse2(__m128i *in) {
-  const __m128i k__cospi_p16_p16 = pair_set_epi16(cospi_16_64, cospi_16_64);
-  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
-  const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
-  const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
-  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
-  __m128i u[8], v[8];
-
-  transpose_4x4(in);
-  // stage 1
-  u[0] = _mm_unpacklo_epi16(in[0], in[1]);
-  u[1] = _mm_unpackhi_epi16(in[0], in[1]);
-  v[0] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
-  v[1] = _mm_madd_epi16(u[0], k__cospi_p16_m16);
-  v[2] = _mm_madd_epi16(u[1], k__cospi_p24_m08);
-  v[3] = _mm_madd_epi16(u[1], k__cospi_p08_p24);
-
-  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
-  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
-  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
-  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
-
-  v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
-  v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
-  v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
-  v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
-
-  u[0] = _mm_packs_epi32(v[0], v[1]);
-  u[1] = _mm_packs_epi32(v[3], v[2]);
-
-  // stage 2
-  in[0] = _mm_add_epi16(u[0], u[1]);
-  in[1] = _mm_sub_epi16(u[0], u[1]);
-  in[1] = _mm_shuffle_epi32(in[1], 0x4E);
-}
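
Every rotation in this file follows the same five-step shape: unpack two rows into interleaved 16-bit pairs, _mm_madd_epi16 against a (c0, c1) coefficient pair to get a*c0 + b*c1 per 32-bit lane, add the rounding constant, shift right by DCT_CONST_BITS, and pack back to 16 bits. The pattern in isolation (a sketch; the helper name is mine):

#include <emmintrin.h>

#define DCT_CONST_BITS 14

static __m128i butterfly_madd(__m128i a, __m128i b, __m128i coeff_pair) {
  const __m128i rounding = _mm_set1_epi32(1 << (DCT_CONST_BITS - 1));
  const __m128i lo = _mm_unpacklo_epi16(a, b); /* a0 b0 a1 b1 ... */
  const __m128i hi = _mm_unpackhi_epi16(a, b);
  __m128i plo = _mm_madd_epi16(lo, coeff_pair); /* a*c0 + b*c1 per lane */
  __m128i phi = _mm_madd_epi16(hi, coeff_pair);
  plo = _mm_srai_epi32(_mm_add_epi32(plo, rounding), DCT_CONST_BITS);
  phi = _mm_srai_epi32(_mm_add_epi32(phi, rounding), DCT_CONST_BITS);
  return _mm_packs_epi32(plo, phi); /* back to 8 x int16 */
}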
-
-void av1_iadst4_sse2(__m128i *in) {
-  const __m128i k__sinpi_p01_p04 = pair_set_epi16(sinpi_1_9, sinpi_4_9);
-  const __m128i k__sinpi_p03_p02 = pair_set_epi16(sinpi_3_9, sinpi_2_9);
-  const __m128i k__sinpi_p02_m01 = pair_set_epi16(sinpi_2_9, -sinpi_1_9);
-  const __m128i k__sinpi_p03_m04 = pair_set_epi16(sinpi_3_9, -sinpi_4_9);
-  const __m128i k__sinpi_p03_p03 = _mm_set1_epi16((int16_t)sinpi_3_9);
-  const __m128i kZero = _mm_set1_epi16(0);
-  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
-  __m128i u[8], v[8], in7;
-
-  transpose_4x4(in);
-  in7 = _mm_srli_si128(in[1], 8);
-  in7 = _mm_add_epi16(in7, in[0]);
-  in7 = _mm_sub_epi16(in7, in[1]);
-
-  u[0] = _mm_unpacklo_epi16(in[0], in[1]);
-  u[1] = _mm_unpackhi_epi16(in[0], in[1]);
-  u[2] = _mm_unpacklo_epi16(in7, kZero);
-  u[3] = _mm_unpackhi_epi16(in[0], kZero);
-
-  v[0] = _mm_madd_epi16(u[0], k__sinpi_p01_p04);  // s0 + s3
-  v[1] = _mm_madd_epi16(u[1], k__sinpi_p03_p02);  // s2 + s5
-  v[2] = _mm_madd_epi16(u[2], k__sinpi_p03_p03);  // x2
-  v[3] = _mm_madd_epi16(u[0], k__sinpi_p02_m01);  // s1 - s4
-  v[4] = _mm_madd_epi16(u[1], k__sinpi_p03_m04);  // s2 - s6
-  v[5] = _mm_madd_epi16(u[3], k__sinpi_p03_p03);  // s2
-
-  u[0] = _mm_add_epi32(v[0], v[1]);
-  u[1] = _mm_add_epi32(v[3], v[4]);
-  u[2] = v[2];
-  u[3] = _mm_add_epi32(u[0], u[1]);
-  u[4] = _mm_slli_epi32(v[5], 2);
-  u[5] = _mm_add_epi32(u[3], v[5]);
-  u[6] = _mm_sub_epi32(u[5], u[4]);
-
-  v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
-  v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
-  v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
-  v[3] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
-
-  u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
-  u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
-  u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
-  u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
-
-  in[0] = _mm_packs_epi32(u[0], u[1]);
-  in[1] = _mm_packs_epi32(u[2], u[3]);
-}
-
-#define TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
-                      out2, out3, out4, out5, out6, out7)                 \
-  {                                                                       \
-    const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1);                   \
-    const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3);                   \
-    const __m128i tr0_2 = _mm_unpackhi_epi16(in0, in1);                   \
-    const __m128i tr0_3 = _mm_unpackhi_epi16(in2, in3);                   \
-    const __m128i tr0_4 = _mm_unpacklo_epi16(in4, in5);                   \
-    const __m128i tr0_5 = _mm_unpacklo_epi16(in6, in7);                   \
-    const __m128i tr0_6 = _mm_unpackhi_epi16(in4, in5);                   \
-    const __m128i tr0_7 = _mm_unpackhi_epi16(in6, in7);                   \
-                                                                          \
-    const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);               \
-    const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3);               \
-    const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);               \
-    const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3);               \
-    const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5);               \
-    const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);               \
-    const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5);               \
-    const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);               \
-                                                                          \
-    out0 = _mm_unpacklo_epi64(tr1_0, tr1_4);                              \
-    out1 = _mm_unpackhi_epi64(tr1_0, tr1_4);                              \
-    out2 = _mm_unpacklo_epi64(tr1_2, tr1_6);                              \
-    out3 = _mm_unpackhi_epi64(tr1_2, tr1_6);                              \
-    out4 = _mm_unpacklo_epi64(tr1_1, tr1_5);                              \
-    out5 = _mm_unpackhi_epi64(tr1_1, tr1_5);                              \
-    out6 = _mm_unpacklo_epi64(tr1_3, tr1_7);                              \
-    out7 = _mm_unpackhi_epi64(tr1_3, tr1_7);                              \
-  }
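
TRANSPOSE_8X8 above is the standard three-level unpack transpose. Traced for the first output row (a sketch; lanes are numbered 0..7 left to right):

/* tr0_0 = unpacklo16(in0, in1)     -> in0[0] in1[0] in0[1] in1[1] ...
 * tr1_0 = unpacklo32(tr0_0, tr0_1) -> in0[0] in1[0] in2[0] in3[0] ...
 * out0  = unpacklo64(tr1_0, tr1_4) -> in0[0] in1[0] in2[0] ... in7[0]
 * Each unpack level doubles the run of consecutive rows, so three levels
 * move lane j of every input row into output row j. */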
-
-#define TRANSPOSE_4X8_10(tmp0, tmp1, tmp2, tmp3, out0, out1, out2, out3) \
-  {                                                                      \
-    const __m128i tr0_0 = _mm_unpackhi_epi16(tmp0, tmp1);                \
-    const __m128i tr0_1 = _mm_unpacklo_epi16(tmp1, tmp0);                \
-    const __m128i tr0_4 = _mm_unpacklo_epi16(tmp2, tmp3);                \
-    const __m128i tr0_5 = _mm_unpackhi_epi16(tmp3, tmp2);                \
-                                                                         \
-    const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);              \
-    const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);              \
-    const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5);              \
-    const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5);              \
-                                                                         \
-    out0 = _mm_unpacklo_epi64(tr1_0, tr1_4);                             \
-    out1 = _mm_unpackhi_epi64(tr1_0, tr1_4);                             \
-    out2 = _mm_unpacklo_epi64(tr1_2, tr1_6);                             \
-    out3 = _mm_unpackhi_epi64(tr1_2, tr1_6);                             \
-  }
-
-#define TRANSPOSE_8X8_10(in0, in1, in2, in3, out0, out1) \
-  {                                                      \
-    const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1);  \
-    const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3);  \
-    out0 = _mm_unpacklo_epi32(tr0_0, tr0_1);             \
-    out1 = _mm_unpackhi_epi32(tr0_0, tr0_1);             \
-  }
-
-// Macro for multiplying elements by constants and adding the products
-// together.
-#define MULTIPLICATION_AND_ADD(lo_0, hi_0, lo_1, hi_1, cst0, cst1, cst2, cst3, \
-                               res0, res1, res2, res3)                         \
-  {                                                                            \
-    tmp0 = _mm_madd_epi16(lo_0, cst0);                                         \
-    tmp1 = _mm_madd_epi16(hi_0, cst0);                                         \
-    tmp2 = _mm_madd_epi16(lo_0, cst1);                                         \
-    tmp3 = _mm_madd_epi16(hi_0, cst1);                                         \
-    tmp4 = _mm_madd_epi16(lo_1, cst2);                                         \
-    tmp5 = _mm_madd_epi16(hi_1, cst2);                                         \
-    tmp6 = _mm_madd_epi16(lo_1, cst3);                                         \
-    tmp7 = _mm_madd_epi16(hi_1, cst3);                                         \
-                                                                               \
-    tmp0 = _mm_add_epi32(tmp0, rounding);                                      \
-    tmp1 = _mm_add_epi32(tmp1, rounding);                                      \
-    tmp2 = _mm_add_epi32(tmp2, rounding);                                      \
-    tmp3 = _mm_add_epi32(tmp3, rounding);                                      \
-    tmp4 = _mm_add_epi32(tmp4, rounding);                                      \
-    tmp5 = _mm_add_epi32(tmp5, rounding);                                      \
-    tmp6 = _mm_add_epi32(tmp6, rounding);                                      \
-    tmp7 = _mm_add_epi32(tmp7, rounding);                                      \
-                                                                               \
-    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);                               \
-    tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS);                               \
-    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);                               \
-    tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);                               \
-    tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);                               \
-    tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS);                               \
-    tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS);                               \
-    tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS);                               \
-                                                                               \
-    res0 = _mm_packs_epi32(tmp0, tmp1);                                        \
-    res1 = _mm_packs_epi32(tmp2, tmp3);                                        \
-    res2 = _mm_packs_epi32(tmp4, tmp5);                                        \
-    res3 = _mm_packs_epi32(tmp6, tmp7);                                        \
-  }
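
MULTIPLICATION_AND_ADD is two of the rotations sketched earlier, fused so they share one rounding/shift/pack tail. Its data flow, written out (a sketch of the structure only):

/* res0 = pack(round_shift(madd(lo_0, cst0)), round_shift(madd(hi_0, cst0)))
 * res1 = the same (lo_0, hi_0) inputs against cst1
 * res2 = pack(round_shift(madd(lo_1, cst2)), round_shift(madd(hi_1, cst2)))
 * res3 = the same (lo_1, hi_1) inputs against cst3 */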
-
-#define MULTIPLICATION_AND_ADD_2(lo_0, hi_0, cst0, cst1, res0, res1) \
-  {                                                                  \
-    tmp0 = _mm_madd_epi16(lo_0, cst0);                               \
-    tmp1 = _mm_madd_epi16(hi_0, cst0);                               \
-    tmp2 = _mm_madd_epi16(lo_0, cst1);                               \
-    tmp3 = _mm_madd_epi16(hi_0, cst1);                               \
-                                                                     \
-    tmp0 = _mm_add_epi32(tmp0, rounding);                            \
-    tmp1 = _mm_add_epi32(tmp1, rounding);                            \
-    tmp2 = _mm_add_epi32(tmp2, rounding);                            \
-    tmp3 = _mm_add_epi32(tmp3, rounding);                            \
-                                                                     \
-    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);                     \
-    tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS);                     \
-    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);                     \
-    tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);                     \
-                                                                     \
-    res0 = _mm_packs_epi32(tmp0, tmp1);                              \
-    res1 = _mm_packs_epi32(tmp2, tmp3);                              \
-  }
-
-#define IDCT8(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3, \
-              out4, out5, out6, out7)                                         \
-  {                                                                           \
-    /* Stage1 */                                                              \
-    {                                                                         \
-      const __m128i lo_17 = _mm_unpacklo_epi16(in1, in7);                     \
-      const __m128i hi_17 = _mm_unpackhi_epi16(in1, in7);                     \
-      const __m128i lo_35 = _mm_unpacklo_epi16(in3, in5);                     \
-      const __m128i hi_35 = _mm_unpackhi_epi16(in3, in5);                     \
-                                                                              \
-      MULTIPLICATION_AND_ADD(lo_17, hi_17, lo_35, hi_35, stg1_0, stg1_1,      \
-                             stg1_2, stg1_3, stp1_4, stp1_7, stp1_5, stp1_6)  \
-    }                                                                         \
-                                                                              \
-    /* Stage2 */                                                              \
-    {                                                                         \
-      const __m128i lo_04 = _mm_unpacklo_epi16(in0, in4);                     \
-      const __m128i hi_04 = _mm_unpackhi_epi16(in0, in4);                     \
-      const __m128i lo_26 = _mm_unpacklo_epi16(in2, in6);                     \
-      const __m128i hi_26 = _mm_unpackhi_epi16(in2, in6);                     \
-                                                                              \
-      MULTIPLICATION_AND_ADD(lo_04, hi_04, lo_26, hi_26, stg2_0, stg2_1,      \
-                             stg2_2, stg2_3, stp2_0, stp2_1, stp2_2, stp2_3)  \
-                                                                              \
-      stp2_4 = _mm_adds_epi16(stp1_4, stp1_5);                                \
-      stp2_5 = _mm_subs_epi16(stp1_4, stp1_5);                                \
-      stp2_6 = _mm_subs_epi16(stp1_7, stp1_6);                                \
-      stp2_7 = _mm_adds_epi16(stp1_7, stp1_6);                                \
-    }                                                                         \
-                                                                              \
-    /* Stage3 */                                                              \
-    {                                                                         \
-      const __m128i lo_56 = _mm_unpacklo_epi16(stp2_6, stp2_5);               \
-      const __m128i hi_56 = _mm_unpackhi_epi16(stp2_6, stp2_5);               \
-                                                                              \
-      stp1_0 = _mm_adds_epi16(stp2_0, stp2_3);                                \
-      stp1_1 = _mm_adds_epi16(stp2_1, stp2_2);                                \
-      stp1_2 = _mm_subs_epi16(stp2_1, stp2_2);                                \
-      stp1_3 = _mm_subs_epi16(stp2_0, stp2_3);                                \
-                                                                              \
-      tmp0 = _mm_madd_epi16(lo_56, stg2_1);                                   \
-      tmp1 = _mm_madd_epi16(hi_56, stg2_1);                                   \
-      tmp2 = _mm_madd_epi16(lo_56, stg2_0);                                   \
-      tmp3 = _mm_madd_epi16(hi_56, stg2_0);                                   \
-                                                                              \
-      tmp0 = _mm_add_epi32(tmp0, rounding);                                   \
-      tmp1 = _mm_add_epi32(tmp1, rounding);                                   \
-      tmp2 = _mm_add_epi32(tmp2, rounding);                                   \
-      tmp3 = _mm_add_epi32(tmp3, rounding);                                   \
-                                                                              \
-      tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);                            \
-      tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS);                            \
-      tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);                            \
-      tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);                            \
-                                                                              \
-      stp1_5 = _mm_packs_epi32(tmp0, tmp1);                                   \
-      stp1_6 = _mm_packs_epi32(tmp2, tmp3);                                   \
-    }                                                                         \
-                                                                              \
-    /* Stage4  */                                                             \
-    out0 = _mm_adds_epi16(stp1_0, stp2_7);                                    \
-    out1 = _mm_adds_epi16(stp1_1, stp1_6);                                    \
-    out2 = _mm_adds_epi16(stp1_2, stp1_5);                                    \
-    out3 = _mm_adds_epi16(stp1_3, stp2_4);                                    \
-    out4 = _mm_subs_epi16(stp1_3, stp2_4);                                    \
-    out5 = _mm_subs_epi16(stp1_2, stp1_5);                                    \
-    out6 = _mm_subs_epi16(stp1_1, stp1_6);                                    \
-    out7 = _mm_subs_epi16(stp1_0, stp2_7);                                    \
-  }
-
-void av1_idct8x8_64_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
-  const __m128i zero = _mm_setzero_si128();
-  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
-  const __m128i final_rounding = _mm_set1_epi16(1 << 4);
-  const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
-  const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
-  const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64);
-  const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64);
-  const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
-  const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
-  const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
-  const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
-
-  __m128i in0, in1, in2, in3, in4, in5, in6, in7;
-  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7;
-  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7;
-  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
-  int i;
-
-  // Load input data.
-  in0 = _mm_load_si128((const __m128i *)input);
-  in1 = _mm_load_si128((const __m128i *)(input + 8 * 1));
-  in2 = _mm_load_si128((const __m128i *)(input + 8 * 2));
-  in3 = _mm_load_si128((const __m128i *)(input + 8 * 3));
-  in4 = _mm_load_si128((const __m128i *)(input + 8 * 4));
-  in5 = _mm_load_si128((const __m128i *)(input + 8 * 5));
-  in6 = _mm_load_si128((const __m128i *)(input + 8 * 6));
-  in7 = _mm_load_si128((const __m128i *)(input + 8 * 7));
-
-  // 2-D
-  for (i = 0; i < 2; i++) {
-    // 8x8 Transpose is copied from av1_fdct8x8_sse2()
-    TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
-                  in4, in5, in6, in7);
-
-    // 4-stage 1D av1_idct8x8
-    IDCT8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4, in5,
-          in6, in7);
-  }
-
-  // Final rounding and shift
-  in0 = _mm_adds_epi16(in0, final_rounding);
-  in1 = _mm_adds_epi16(in1, final_rounding);
-  in2 = _mm_adds_epi16(in2, final_rounding);
-  in3 = _mm_adds_epi16(in3, final_rounding);
-  in4 = _mm_adds_epi16(in4, final_rounding);
-  in5 = _mm_adds_epi16(in5, final_rounding);
-  in6 = _mm_adds_epi16(in6, final_rounding);
-  in7 = _mm_adds_epi16(in7, final_rounding);
-
-  in0 = _mm_srai_epi16(in0, 5);
-  in1 = _mm_srai_epi16(in1, 5);
-  in2 = _mm_srai_epi16(in2, 5);
-  in3 = _mm_srai_epi16(in3, 5);
-  in4 = _mm_srai_epi16(in4, 5);
-  in5 = _mm_srai_epi16(in5, 5);
-  in6 = _mm_srai_epi16(in6, 5);
-  in7 = _mm_srai_epi16(in7, 5);
-
-  RECON_AND_STORE(dest + 0 * stride, in0);
-  RECON_AND_STORE(dest + 1 * stride, in1);
-  RECON_AND_STORE(dest + 2 * stride, in2);
-  RECON_AND_STORE(dest + 3 * stride, in3);
-  RECON_AND_STORE(dest + 4 * stride, in4);
-  RECON_AND_STORE(dest + 5 * stride, in5);
-  RECON_AND_STORE(dest + 6 * stride, in6);
-  RECON_AND_STORE(dest + 7 * stride, in7);
-}
-
-void av1_idct8x8_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
-  __m128i dc_value;
-  const __m128i zero = _mm_setzero_si128();
-  int a;
-
-  a = (int)dct_const_round_shift(input[0] * cospi_16_64);
-  a = (int)dct_const_round_shift(a * cospi_16_64);
-  a = ROUND_POWER_OF_TWO(a, 5);
-
-  dc_value = _mm_set1_epi16(a);
-
-  RECON_AND_STORE(dest + 0 * stride, dc_value);
-  RECON_AND_STORE(dest + 1 * stride, dc_value);
-  RECON_AND_STORE(dest + 2 * stride, dc_value);
-  RECON_AND_STORE(dest + 3 * stride, dc_value);
-  RECON_AND_STORE(dest + 4 * stride, dc_value);
-  RECON_AND_STORE(dest + 5 * stride, dc_value);
-  RECON_AND_STORE(dest + 6 * stride, dc_value);
-  RECON_AND_STORE(dest + 7 * stride, dc_value);
-}
-
-void av1_idct8_sse2(__m128i *in) {
-  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
-  const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
-  const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
-  const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64);
-  const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64);
-  const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
-  const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
-  const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
-  const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
-
-  __m128i in0, in1, in2, in3, in4, in5, in6, in7;
-  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7;
-  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7;
-  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
-
-  // 8x8 Transpose is copied from av1_fdct8x8_sse2()
-  TRANSPOSE_8X8(in[0], in[1], in[2], in[3], in[4], in[5], in[6], in[7], in0,
-                in1, in2, in3, in4, in5, in6, in7);
-
-  // 4-stage 1D av1_idct8x8
-  IDCT8(in0, in1, in2, in3, in4, in5, in6, in7, in[0], in[1], in[2], in[3],
-        in[4], in[5], in[6], in[7]);
-}
-
-void av1_iadst8_sse2(__m128i *in) {
-  const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64);
-  const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64);
-  const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64);
-  const __m128i k__cospi_p22_m10 = pair_set_epi16(cospi_22_64, -cospi_10_64);
-  const __m128i k__cospi_p18_p14 = pair_set_epi16(cospi_18_64, cospi_14_64);
-  const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64);
-  const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64);
-  const __m128i k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64);
-  const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
-  const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
-  const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64);
-  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
-  const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
-  const __m128i k__const_0 = _mm_set1_epi16(0);
-  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
-
-  __m128i u0, u1, u2, u3, u4, u5, u6, u7, u8, u9, u10, u11, u12, u13, u14, u15;
-  __m128i v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15;
-  __m128i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9, w10, w11, w12, w13, w14, w15;
-  __m128i s0, s1, s2, s3, s4, s5, s6, s7;
-  __m128i in0, in1, in2, in3, in4, in5, in6, in7;
-
-  // transpose
-  array_transpose_8x8(in, in);
-
-  // properly aligned for butterfly input
-  in0 = in[7];
-  in1 = in[0];
-  in2 = in[5];
-  in3 = in[2];
-  in4 = in[3];
-  in5 = in[4];
-  in6 = in[1];
-  in7 = in[6];
-
-  // column transformation
-  // stage 1
-  // interleave and multiply/add into 32-bit integers
-  s0 = _mm_unpacklo_epi16(in0, in1);
-  s1 = _mm_unpackhi_epi16(in0, in1);
-  s2 = _mm_unpacklo_epi16(in2, in3);
-  s3 = _mm_unpackhi_epi16(in2, in3);
-  s4 = _mm_unpacklo_epi16(in4, in5);
-  s5 = _mm_unpackhi_epi16(in4, in5);
-  s6 = _mm_unpacklo_epi16(in6, in7);
-  s7 = _mm_unpackhi_epi16(in6, in7);
-
-  u0 = _mm_madd_epi16(s0, k__cospi_p02_p30);
-  u1 = _mm_madd_epi16(s1, k__cospi_p02_p30);
-  u2 = _mm_madd_epi16(s0, k__cospi_p30_m02);
-  u3 = _mm_madd_epi16(s1, k__cospi_p30_m02);
-  u4 = _mm_madd_epi16(s2, k__cospi_p10_p22);
-  u5 = _mm_madd_epi16(s3, k__cospi_p10_p22);
-  u6 = _mm_madd_epi16(s2, k__cospi_p22_m10);
-  u7 = _mm_madd_epi16(s3, k__cospi_p22_m10);
-  u8 = _mm_madd_epi16(s4, k__cospi_p18_p14);
-  u9 = _mm_madd_epi16(s5, k__cospi_p18_p14);
-  u10 = _mm_madd_epi16(s4, k__cospi_p14_m18);
-  u11 = _mm_madd_epi16(s5, k__cospi_p14_m18);
-  u12 = _mm_madd_epi16(s6, k__cospi_p26_p06);
-  u13 = _mm_madd_epi16(s7, k__cospi_p26_p06);
-  u14 = _mm_madd_epi16(s6, k__cospi_p06_m26);
-  u15 = _mm_madd_epi16(s7, k__cospi_p06_m26);
-
-  // addition
-  w0 = _mm_add_epi32(u0, u8);
-  w1 = _mm_add_epi32(u1, u9);
-  w2 = _mm_add_epi32(u2, u10);
-  w3 = _mm_add_epi32(u3, u11);
-  w4 = _mm_add_epi32(u4, u12);
-  w5 = _mm_add_epi32(u5, u13);
-  w6 = _mm_add_epi32(u6, u14);
-  w7 = _mm_add_epi32(u7, u15);
-  w8 = _mm_sub_epi32(u0, u8);
-  w9 = _mm_sub_epi32(u1, u9);
-  w10 = _mm_sub_epi32(u2, u10);
-  w11 = _mm_sub_epi32(u3, u11);
-  w12 = _mm_sub_epi32(u4, u12);
-  w13 = _mm_sub_epi32(u5, u13);
-  w14 = _mm_sub_epi32(u6, u14);
-  w15 = _mm_sub_epi32(u7, u15);
-
-  // shift and rounding
-  v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING);
-  v1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING);
-  v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING);
-  v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING);
-  v4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING);
-  v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING);
-  v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING);
-  v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING);
-  v8 = _mm_add_epi32(w8, k__DCT_CONST_ROUNDING);
-  v9 = _mm_add_epi32(w9, k__DCT_CONST_ROUNDING);
-  v10 = _mm_add_epi32(w10, k__DCT_CONST_ROUNDING);
-  v11 = _mm_add_epi32(w11, k__DCT_CONST_ROUNDING);
-  v12 = _mm_add_epi32(w12, k__DCT_CONST_ROUNDING);
-  v13 = _mm_add_epi32(w13, k__DCT_CONST_ROUNDING);
-  v14 = _mm_add_epi32(w14, k__DCT_CONST_ROUNDING);
-  v15 = _mm_add_epi32(w15, k__DCT_CONST_ROUNDING);
-
-  u0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
-  u1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
-  u2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
-  u3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
-  u4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
-  u5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
-  u6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
-  u7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
-  u8 = _mm_srai_epi32(v8, DCT_CONST_BITS);
-  u9 = _mm_srai_epi32(v9, DCT_CONST_BITS);
-  u10 = _mm_srai_epi32(v10, DCT_CONST_BITS);
-  u11 = _mm_srai_epi32(v11, DCT_CONST_BITS);
-  u12 = _mm_srai_epi32(v12, DCT_CONST_BITS);
-  u13 = _mm_srai_epi32(v13, DCT_CONST_BITS);
-  u14 = _mm_srai_epi32(v14, DCT_CONST_BITS);
-  u15 = _mm_srai_epi32(v15, DCT_CONST_BITS);
-
-  // back to 16-bit and pack 8 integers into __m128i
-  in[0] = _mm_packs_epi32(u0, u1);
-  in[1] = _mm_packs_epi32(u2, u3);
-  in[2] = _mm_packs_epi32(u4, u5);
-  in[3] = _mm_packs_epi32(u6, u7);
-  in[4] = _mm_packs_epi32(u8, u9);
-  in[5] = _mm_packs_epi32(u10, u11);
-  in[6] = _mm_packs_epi32(u12, u13);
-  in[7] = _mm_packs_epi32(u14, u15);
-
-  // stage 2
-  s0 = _mm_add_epi16(in[0], in[2]);
-  s1 = _mm_add_epi16(in[1], in[3]);
-  s2 = _mm_sub_epi16(in[0], in[2]);
-  s3 = _mm_sub_epi16(in[1], in[3]);
-  u0 = _mm_unpacklo_epi16(in[4], in[5]);
-  u1 = _mm_unpackhi_epi16(in[4], in[5]);
-  u2 = _mm_unpacklo_epi16(in[6], in[7]);
-  u3 = _mm_unpackhi_epi16(in[6], in[7]);
-
-  v0 = _mm_madd_epi16(u0, k__cospi_p08_p24);
-  v1 = _mm_madd_epi16(u1, k__cospi_p08_p24);
-  v2 = _mm_madd_epi16(u0, k__cospi_p24_m08);
-  v3 = _mm_madd_epi16(u1, k__cospi_p24_m08);
-  v4 = _mm_madd_epi16(u2, k__cospi_m24_p08);
-  v5 = _mm_madd_epi16(u3, k__cospi_m24_p08);
-  v6 = _mm_madd_epi16(u2, k__cospi_p08_p24);
-  v7 = _mm_madd_epi16(u3, k__cospi_p08_p24);
-
-  w0 = _mm_add_epi32(v0, v4);
-  w1 = _mm_add_epi32(v1, v5);
-  w2 = _mm_add_epi32(v2, v6);
-  w3 = _mm_add_epi32(v3, v7);
-  w4 = _mm_sub_epi32(v0, v4);
-  w5 = _mm_sub_epi32(v1, v5);
-  w6 = _mm_sub_epi32(v2, v6);
-  w7 = _mm_sub_epi32(v3, v7);
-
-  v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING);
-  v1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING);
-  v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING);
-  v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING);
-  v4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING);
-  v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING);
-  v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING);
-  v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING);
-
-  u0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
-  u1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
-  u2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
-  u3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
-  u4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
-  u5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
-  u6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
-  u7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
-
-  // back to 16-bit integers
-  s4 = _mm_packs_epi32(u0, u1);
-  s5 = _mm_packs_epi32(u2, u3);
-  s6 = _mm_packs_epi32(u4, u5);
-  s7 = _mm_packs_epi32(u6, u7);
-
-  // stage 3
-  u0 = _mm_unpacklo_epi16(s2, s3);
-  u1 = _mm_unpackhi_epi16(s2, s3);
-  u2 = _mm_unpacklo_epi16(s6, s7);
-  u3 = _mm_unpackhi_epi16(s6, s7);
-
-  v0 = _mm_madd_epi16(u0, k__cospi_p16_p16);
-  v1 = _mm_madd_epi16(u1, k__cospi_p16_p16);
-  v2 = _mm_madd_epi16(u0, k__cospi_p16_m16);
-  v3 = _mm_madd_epi16(u1, k__cospi_p16_m16);
-  v4 = _mm_madd_epi16(u2, k__cospi_p16_p16);
-  v5 = _mm_madd_epi16(u3, k__cospi_p16_p16);
-  v6 = _mm_madd_epi16(u2, k__cospi_p16_m16);
-  v7 = _mm_madd_epi16(u3, k__cospi_p16_m16);
-
-  u0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING);
-  u1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING);
-  u2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING);
-  u3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING);
-  u4 = _mm_add_epi32(v4, k__DCT_CONST_ROUNDING);
-  u5 = _mm_add_epi32(v5, k__DCT_CONST_ROUNDING);
-  u6 = _mm_add_epi32(v6, k__DCT_CONST_ROUNDING);
-  u7 = _mm_add_epi32(v7, k__DCT_CONST_ROUNDING);
-
-  v0 = _mm_srai_epi32(u0, DCT_CONST_BITS);
-  v1 = _mm_srai_epi32(u1, DCT_CONST_BITS);
-  v2 = _mm_srai_epi32(u2, DCT_CONST_BITS);
-  v3 = _mm_srai_epi32(u3, DCT_CONST_BITS);
-  v4 = _mm_srai_epi32(u4, DCT_CONST_BITS);
-  v5 = _mm_srai_epi32(u5, DCT_CONST_BITS);
-  v6 = _mm_srai_epi32(u6, DCT_CONST_BITS);
-  v7 = _mm_srai_epi32(u7, DCT_CONST_BITS);
-
-  s2 = _mm_packs_epi32(v0, v1);
-  s3 = _mm_packs_epi32(v2, v3);
-  s6 = _mm_packs_epi32(v4, v5);
-  s7 = _mm_packs_epi32(v6, v7);
-
-  in[0] = s0;
-  in[1] = _mm_sub_epi16(k__const_0, s4);
-  in[2] = s6;
-  in[3] = _mm_sub_epi16(k__const_0, s2);
-  in[4] = s3;
-  in[5] = _mm_sub_epi16(k__const_0, s7);
-  in[6] = s5;
-  in[7] = _mm_sub_epi16(k__const_0, s1);
-}
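
One SSE2 detail in the output stage above: there is no 16-bit negate instruction at this ISA level, so -x is spelled 0 - x. In isolation:

#include <emmintrin.h>

static __m128i neg_epi16(__m128i x) {
  /* SSE2 idiom for per-lane negation; SSSE3's _mm_sign_epi16 is not
   * available in a plain SSE2 build */
  return _mm_sub_epi16(_mm_setzero_si128(), x);
}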
-
-void av1_idct8x8_12_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
-  const __m128i zero = _mm_setzero_si128();
-  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
-  const __m128i final_rounding = _mm_set1_epi16(1 << 4);
-  const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
-  const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
-  const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64);
-  const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64);
-  const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
-  const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
-  const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
-  const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
-  const __m128i stg3_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
-
-  __m128i in0, in1, in2, in3, in4, in5, in6, in7;
-  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7;
-  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7;
-  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
-
-  // Rows. Load 4-row input data.
-  in0 = _mm_load_si128((const __m128i *)input);
-  in1 = _mm_load_si128((const __m128i *)(input + 8 * 1));
-  in2 = _mm_load_si128((const __m128i *)(input + 8 * 2));
-  in3 = _mm_load_si128((const __m128i *)(input + 8 * 3));
-
-  // 8x4 Transpose
-  TRANSPOSE_8X8_10(in0, in1, in2, in3, in0, in1);
-  // Stage1
-  {
-    const __m128i lo_17 = _mm_unpackhi_epi16(in0, zero);
-    const __m128i lo_35 = _mm_unpackhi_epi16(in1, zero);
-
-    tmp0 = _mm_madd_epi16(lo_17, stg1_0);
-    tmp2 = _mm_madd_epi16(lo_17, stg1_1);
-    tmp4 = _mm_madd_epi16(lo_35, stg1_2);
-    tmp6 = _mm_madd_epi16(lo_35, stg1_3);
-
-    tmp0 = _mm_add_epi32(tmp0, rounding);
-    tmp2 = _mm_add_epi32(tmp2, rounding);
-    tmp4 = _mm_add_epi32(tmp4, rounding);
-    tmp6 = _mm_add_epi32(tmp6, rounding);
-    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
-    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
-    tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);
-    tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS);
-
-    stp1_4 = _mm_packs_epi32(tmp0, tmp2);
-    stp1_5 = _mm_packs_epi32(tmp4, tmp6);
-  }
-
-  // Stage2
-  {
-    const __m128i lo_04 = _mm_unpacklo_epi16(in0, zero);
-    const __m128i lo_26 = _mm_unpacklo_epi16(in1, zero);
-
-    tmp0 = _mm_madd_epi16(lo_04, stg2_0);
-    tmp2 = _mm_madd_epi16(lo_04, stg2_1);
-    tmp4 = _mm_madd_epi16(lo_26, stg2_2);
-    tmp6 = _mm_madd_epi16(lo_26, stg2_3);
-
-    tmp0 = _mm_add_epi32(tmp0, rounding);
-    tmp2 = _mm_add_epi32(tmp2, rounding);
-    tmp4 = _mm_add_epi32(tmp4, rounding);
-    tmp6 = _mm_add_epi32(tmp6, rounding);
-    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
-    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
-    tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);
-    tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS);
-
-    stp2_0 = _mm_packs_epi32(tmp0, tmp2);
-    stp2_2 = _mm_packs_epi32(tmp6, tmp4);
-
-    tmp0 = _mm_adds_epi16(stp1_4, stp1_5);
-    tmp1 = _mm_subs_epi16(stp1_4, stp1_5);
-
-    stp2_4 = tmp0;
-    stp2_5 = _mm_unpacklo_epi64(tmp1, zero);
-    stp2_6 = _mm_unpackhi_epi64(tmp1, zero);
-  }
-
-  // Stage3
-  {
-    const __m128i lo_56 = _mm_unpacklo_epi16(stp2_5, stp2_6);
-
-    tmp4 = _mm_adds_epi16(stp2_0, stp2_2);
-    tmp6 = _mm_subs_epi16(stp2_0, stp2_2);
-
-    stp1_2 = _mm_unpackhi_epi64(tmp6, tmp4);
-    stp1_3 = _mm_unpacklo_epi64(tmp6, tmp4);
-
-    tmp0 = _mm_madd_epi16(lo_56, stg3_0);
-    tmp2 = _mm_madd_epi16(lo_56, stg2_0);  // stg3_1 = stg2_0
-
-    tmp0 = _mm_add_epi32(tmp0, rounding);
-    tmp2 = _mm_add_epi32(tmp2, rounding);
-    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
-    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
-
-    stp1_5 = _mm_packs_epi32(tmp0, tmp2);
-  }
-
-  // Stage4
-  tmp0 = _mm_adds_epi16(stp1_3, stp2_4);
-  tmp1 = _mm_adds_epi16(stp1_2, stp1_5);
-  tmp2 = _mm_subs_epi16(stp1_3, stp2_4);
-  tmp3 = _mm_subs_epi16(stp1_2, stp1_5);
-
-  TRANSPOSE_4X8_10(tmp0, tmp1, tmp2, tmp3, in0, in1, in2, in3)
-
-  IDCT8(in0, in1, in2, in3, zero, zero, zero, zero, in0, in1, in2, in3, in4,
-        in5, in6, in7);
-  // Final rounding and shift
-  in0 = _mm_adds_epi16(in0, final_rounding);
-  in1 = _mm_adds_epi16(in1, final_rounding);
-  in2 = _mm_adds_epi16(in2, final_rounding);
-  in3 = _mm_adds_epi16(in3, final_rounding);
-  in4 = _mm_adds_epi16(in4, final_rounding);
-  in5 = _mm_adds_epi16(in5, final_rounding);
-  in6 = _mm_adds_epi16(in6, final_rounding);
-  in7 = _mm_adds_epi16(in7, final_rounding);
-
-  in0 = _mm_srai_epi16(in0, 5);
-  in1 = _mm_srai_epi16(in1, 5);
-  in2 = _mm_srai_epi16(in2, 5);
-  in3 = _mm_srai_epi16(in3, 5);
-  in4 = _mm_srai_epi16(in4, 5);
-  in5 = _mm_srai_epi16(in5, 5);
-  in6 = _mm_srai_epi16(in6, 5);
-  in7 = _mm_srai_epi16(in7, 5);
-
-  RECON_AND_STORE(dest + 0 * stride, in0);
-  RECON_AND_STORE(dest + 1 * stride, in1);
-  RECON_AND_STORE(dest + 2 * stride, in2);
-  RECON_AND_STORE(dest + 3 * stride, in3);
-  RECON_AND_STORE(dest + 4 * stride, in4);
-  RECON_AND_STORE(dest + 5 * stride, in5);
-  RECON_AND_STORE(dest + 6 * stride, in6);
-  RECON_AND_STORE(dest + 7 * stride, in7);
-}
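
The `_12` variant above loads only the first four coefficient rows and treats everything else as zero, which is valid when the nonzero coefficients are confined to the top-left corner of the block. A sketch of how a caller might select among the three 8x8 kernels by eob (a hypothetical wrapper; the real dispatch lives in the decoder and its exact thresholds should be checked there):

#include <stdint.h>

void av1_idct8x8_1_add_sse2(const int16_t *input, uint8_t *dest, int stride);
void av1_idct8x8_12_add_sse2(const int16_t *input, uint8_t *dest, int stride);
void av1_idct8x8_64_add_sse2(const int16_t *input, uint8_t *dest, int stride);

static void idct8x8_add_dispatch(const int16_t *input, uint8_t *dest,
                                 int stride, int eob) {
  if (eob == 1)
    av1_idct8x8_1_add_sse2(input, dest, stride);  /* DC only */
  else if (eob <= 12)
    av1_idct8x8_12_add_sse2(input, dest, stride); /* top-left coeffs only */
  else
    av1_idct8x8_64_add_sse2(input, dest, stride); /* full transform */
}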
-
-#define IDCT16                                                                 \
-  /* Stage2 */                                                                 \
-  {                                                                            \
-    const __m128i lo_1_15 = _mm_unpacklo_epi16(in[1], in[15]);                 \
-    const __m128i hi_1_15 = _mm_unpackhi_epi16(in[1], in[15]);                 \
-    const __m128i lo_9_7 = _mm_unpacklo_epi16(in[9], in[7]);                   \
-    const __m128i hi_9_7 = _mm_unpackhi_epi16(in[9], in[7]);                   \
-    const __m128i lo_5_11 = _mm_unpacklo_epi16(in[5], in[11]);                 \
-    const __m128i hi_5_11 = _mm_unpackhi_epi16(in[5], in[11]);                 \
-    const __m128i lo_13_3 = _mm_unpacklo_epi16(in[13], in[3]);                 \
-    const __m128i hi_13_3 = _mm_unpackhi_epi16(in[13], in[3]);                 \
-                                                                               \
-    MULTIPLICATION_AND_ADD(lo_1_15, hi_1_15, lo_9_7, hi_9_7, stg2_0, stg2_1,   \
-                           stg2_2, stg2_3, stp2_8, stp2_15, stp2_9, stp2_14)   \
-                                                                               \
-    MULTIPLICATION_AND_ADD(lo_5_11, hi_5_11, lo_13_3, hi_13_3, stg2_4, stg2_5, \
-                           stg2_6, stg2_7, stp2_10, stp2_13, stp2_11, stp2_12) \
-  }                                                                            \
-                                                                               \
-  /* Stage3 */                                                                 \
-  {                                                                            \
-    const __m128i lo_2_14 = _mm_unpacklo_epi16(in[2], in[14]);                 \
-    const __m128i hi_2_14 = _mm_unpackhi_epi16(in[2], in[14]);                 \
-    const __m128i lo_10_6 = _mm_unpacklo_epi16(in[10], in[6]);                 \
-    const __m128i hi_10_6 = _mm_unpackhi_epi16(in[10], in[6]);                 \
-                                                                               \
-    MULTIPLICATION_AND_ADD(lo_2_14, hi_2_14, lo_10_6, hi_10_6, stg3_0, stg3_1, \
-                           stg3_2, stg3_3, stp1_4, stp1_7, stp1_5, stp1_6)     \
-                                                                               \
-    stp1_8_0 = _mm_add_epi16(stp2_8, stp2_9);                                  \
-    stp1_9 = _mm_sub_epi16(stp2_8, stp2_9);                                    \
-    stp1_10 = _mm_sub_epi16(stp2_11, stp2_10);                                 \
-    stp1_11 = _mm_add_epi16(stp2_11, stp2_10);                                 \
-                                                                               \
-    stp1_12_0 = _mm_add_epi16(stp2_12, stp2_13);                               \
-    stp1_13 = _mm_sub_epi16(stp2_12, stp2_13);                                 \
-    stp1_14 = _mm_sub_epi16(stp2_15, stp2_14);                                 \
-    stp1_15 = _mm_add_epi16(stp2_15, stp2_14);                                 \
-  }                                                                            \
-                                                                               \
-  /* Stage4 */                                                                 \
-  {                                                                            \
-    const __m128i lo_0_8 = _mm_unpacklo_epi16(in[0], in[8]);                   \
-    const __m128i hi_0_8 = _mm_unpackhi_epi16(in[0], in[8]);                   \
-    const __m128i lo_4_12 = _mm_unpacklo_epi16(in[4], in[12]);                 \
-    const __m128i hi_4_12 = _mm_unpackhi_epi16(in[4], in[12]);                 \
-                                                                               \
-    const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14);               \
-    const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14);               \
-    const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13);             \
-    const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13);             \
-                                                                               \
-    MULTIPLICATION_AND_ADD(lo_0_8, hi_0_8, lo_4_12, hi_4_12, stg4_0, stg4_1,   \
-                           stg4_2, stg4_3, stp2_0, stp2_1, stp2_2, stp2_3)     \
-                                                                               \
-    stp2_4 = _mm_add_epi16(stp1_4, stp1_5);                                    \
-    stp2_5 = _mm_sub_epi16(stp1_4, stp1_5);                                    \
-    stp2_6 = _mm_sub_epi16(stp1_7, stp1_6);                                    \
-    stp2_7 = _mm_add_epi16(stp1_7, stp1_6);                                    \
-                                                                               \
-    MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4,       \
-                           stg4_5, stg4_6, stg4_7, stp2_9, stp2_14, stp2_10,   \
-                           stp2_13)                                            \
-  }                                                                            \
-                                                                               \
-  /* Stage5 */                                                                 \
-  {                                                                            \
-    const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5);                 \
-    const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5);                 \
-                                                                               \
-    stp1_0 = _mm_add_epi16(stp2_0, stp2_3);                                    \
-    stp1_1 = _mm_add_epi16(stp2_1, stp2_2);                                    \
-    stp1_2 = _mm_sub_epi16(stp2_1, stp2_2);                                    \
-    stp1_3 = _mm_sub_epi16(stp2_0, stp2_3);                                    \
-                                                                               \
-    tmp0 = _mm_madd_epi16(lo_6_5, stg4_1);                                     \
-    tmp1 = _mm_madd_epi16(hi_6_5, stg4_1);                                     \
-    tmp2 = _mm_madd_epi16(lo_6_5, stg4_0);                                     \
-    tmp3 = _mm_madd_epi16(hi_6_5, stg4_0);                                     \
-                                                                               \
-    tmp0 = _mm_add_epi32(tmp0, rounding);                                      \
-    tmp1 = _mm_add_epi32(tmp1, rounding);                                      \
-    tmp2 = _mm_add_epi32(tmp2, rounding);                                      \
-    tmp3 = _mm_add_epi32(tmp3, rounding);                                      \
-                                                                               \
-    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);                               \
-    tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS);                               \
-    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);                               \
-    tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);                               \
-                                                                               \
-    stp1_5 = _mm_packs_epi32(tmp0, tmp1);                                      \
-    stp1_6 = _mm_packs_epi32(tmp2, tmp3);                                      \
-                                                                               \
-    stp1_8 = _mm_add_epi16(stp1_8_0, stp1_11);                                 \
-    stp1_9 = _mm_add_epi16(stp2_9, stp2_10);                                   \
-    stp1_10 = _mm_sub_epi16(stp2_9, stp2_10);                                  \
-    stp1_11 = _mm_sub_epi16(stp1_8_0, stp1_11);                                \
-                                                                               \
-    stp1_12 = _mm_sub_epi16(stp1_15, stp1_12_0);                               \
-    stp1_13 = _mm_sub_epi16(stp2_14, stp2_13);                                 \
-    stp1_14 = _mm_add_epi16(stp2_14, stp2_13);                                 \
-    stp1_15 = _mm_add_epi16(stp1_15, stp1_12_0);                               \
-  }                                                                            \
-                                                                               \
-  /* Stage6 */                                                                 \
-  {                                                                            \
-    const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13);             \
-    const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13);             \
-    const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12);             \
-    const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12);             \
-                                                                               \
-    stp2_0 = _mm_add_epi16(stp1_0, stp2_7);                                    \
-    stp2_1 = _mm_add_epi16(stp1_1, stp1_6);                                    \
-    stp2_2 = _mm_add_epi16(stp1_2, stp1_5);                                    \
-    stp2_3 = _mm_add_epi16(stp1_3, stp2_4);                                    \
-    stp2_4 = _mm_sub_epi16(stp1_3, stp2_4);                                    \
-    stp2_5 = _mm_sub_epi16(stp1_2, stp1_5);                                    \
-    stp2_6 = _mm_sub_epi16(stp1_1, stp1_6);                                    \
-    stp2_7 = _mm_sub_epi16(stp1_0, stp2_7);                                    \
-                                                                               \
-    MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, stg6_0,     \
-                           stg4_0, stg6_0, stg4_0, stp2_10, stp2_13, stp2_11,  \
-                           stp2_12)                                            \
-  }
-
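-// Each MULTIPLICATION_AND_ADD() in the IDCT16 macros is a pair of butterfly
-// rotations: interleaving two coefficient vectors with unpacklo/unpackhi
-// lets _mm_madd_epi16() form a*c0 + b*c1 in every 32-bit lane. One rotation
-// in scalar form (a sketch, using this file's rounding convention):
-//
-//   out0 = (int16_t)dct_const_round_shift(a * c0 + b * c1);
-//   out1 = (int16_t)dct_const_round_shift(a * c2 + b * c3);
-//
-// where (c0, c1) and (c2, c3) are the pair_set_epi16() constants and
-// dct_const_round_shift() adds DCT_CONST_ROUNDING, then shifts right by
-// DCT_CONST_BITS.
-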
-#define IDCT16_10                                                              \
-  /* Stage2 */                                                                 \
-  {                                                                            \
-    const __m128i lo_1_15 = _mm_unpacklo_epi16(in[1], zero);                   \
-    const __m128i hi_1_15 = _mm_unpackhi_epi16(in[1], zero);                   \
-    const __m128i lo_13_3 = _mm_unpacklo_epi16(zero, in[3]);                   \
-    const __m128i hi_13_3 = _mm_unpackhi_epi16(zero, in[3]);                   \
-                                                                               \
-    MULTIPLICATION_AND_ADD(lo_1_15, hi_1_15, lo_13_3, hi_13_3, stg2_0, stg2_1, \
-                           stg2_6, stg2_7, stp1_8_0, stp1_15, stp1_11,         \
-                           stp1_12_0)                                          \
-  }                                                                            \
-                                                                               \
-  /* Stage3 */                                                                 \
-  {                                                                            \
-    const __m128i lo_2_14 = _mm_unpacklo_epi16(in[2], zero);                   \
-    const __m128i hi_2_14 = _mm_unpackhi_epi16(in[2], zero);                   \
-                                                                               \
-    MULTIPLICATION_AND_ADD_2(lo_2_14, hi_2_14, stg3_0, stg3_1, stp2_4, stp2_7) \
-                                                                               \
-    stp1_9 = stp1_8_0;                                                         \
-    stp1_10 = stp1_11;                                                         \
-                                                                               \
-    stp1_13 = stp1_12_0;                                                       \
-    stp1_14 = stp1_15;                                                         \
-  }                                                                            \
-                                                                               \
-  /* Stage4 */                                                                 \
-  {                                                                            \
-    const __m128i lo_0_8 = _mm_unpacklo_epi16(in[0], zero);                    \
-    const __m128i hi_0_8 = _mm_unpackhi_epi16(in[0], zero);                    \
-                                                                               \
-    const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14);               \
-    const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14);               \
-    const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13);             \
-    const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13);             \
-                                                                               \
-    MULTIPLICATION_AND_ADD_2(lo_0_8, hi_0_8, stg4_0, stg4_1, stp1_0, stp1_1)   \
-    stp2_5 = stp2_4;                                                           \
-    stp2_6 = stp2_7;                                                           \
-                                                                               \
-    MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4,       \
-                           stg4_5, stg4_6, stg4_7, stp2_9, stp2_14, stp2_10,   \
-                           stp2_13)                                            \
-  }                                                                            \
-                                                                               \
-  /* Stage5 */                                                                 \
-  {                                                                            \
-    const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5);                 \
-    const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5);                 \
-                                                                               \
-    stp1_2 = stp1_1;                                                           \
-    stp1_3 = stp1_0;                                                           \
-                                                                               \
-    tmp0 = _mm_madd_epi16(lo_6_5, stg4_1);                                     \
-    tmp1 = _mm_madd_epi16(hi_6_5, stg4_1);                                     \
-    tmp2 = _mm_madd_epi16(lo_6_5, stg4_0);                                     \
-    tmp3 = _mm_madd_epi16(hi_6_5, stg4_0);                                     \
-                                                                               \
-    tmp0 = _mm_add_epi32(tmp0, rounding);                                      \
-    tmp1 = _mm_add_epi32(tmp1, rounding);                                      \
-    tmp2 = _mm_add_epi32(tmp2, rounding);                                      \
-    tmp3 = _mm_add_epi32(tmp3, rounding);                                      \
-                                                                               \
-    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);                               \
-    tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS);                               \
-    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);                               \
-    tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);                               \
-                                                                               \
-    stp1_5 = _mm_packs_epi32(tmp0, tmp1);                                      \
-    stp1_6 = _mm_packs_epi32(tmp2, tmp3);                                      \
-                                                                               \
-    stp1_8 = _mm_add_epi16(stp1_8_0, stp1_11);                                 \
-    stp1_9 = _mm_add_epi16(stp2_9, stp2_10);                                   \
-    stp1_10 = _mm_sub_epi16(stp2_9, stp2_10);                                  \
-    stp1_11 = _mm_sub_epi16(stp1_8_0, stp1_11);                                \
-                                                                               \
-    stp1_12 = _mm_sub_epi16(stp1_15, stp1_12_0);                               \
-    stp1_13 = _mm_sub_epi16(stp2_14, stp2_13);                                 \
-    stp1_14 = _mm_add_epi16(stp2_14, stp2_13);                                 \
-    stp1_15 = _mm_add_epi16(stp1_15, stp1_12_0);                               \
-  }                                                                            \
-                                                                               \
-  /* Stage6 */                                                                 \
-  {                                                                            \
-    const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13);             \
-    const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13);             \
-    const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12);             \
-    const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12);             \
-                                                                               \
-    stp2_0 = _mm_add_epi16(stp1_0, stp2_7);                                    \
-    stp2_1 = _mm_add_epi16(stp1_1, stp1_6);                                    \
-    stp2_2 = _mm_add_epi16(stp1_2, stp1_5);                                    \
-    stp2_3 = _mm_add_epi16(stp1_3, stp2_4);                                    \
-    stp2_4 = _mm_sub_epi16(stp1_3, stp2_4);                                    \
-    stp2_5 = _mm_sub_epi16(stp1_2, stp1_5);                                    \
-    stp2_6 = _mm_sub_epi16(stp1_1, stp1_6);                                    \
-    stp2_7 = _mm_sub_epi16(stp1_0, stp2_7);                                    \
-                                                                               \
-    MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, stg6_0,     \
-                           stg4_0, stg6_0, stg4_0, stp2_10, stp2_13, stp2_11,  \
-                           stp2_12)                                            \
-  }
-
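-// IDCT16_10 is the shortcut for blocks where only in[0]..in[3] hold
-// non-zero coefficients (the "10" of the _10_add entry point below).
-// Butterflies whose partner is zero collapse — adding or subtracting 0
-// returns the other operand unchanged — so Stage3/Stage4 turn into plain
-// copies (stp1_9 = stp1_8_0, stp2_5 = stp2_4, ...) and each
-// MULTIPLICATION_AND_ADD_2() is a single rotation whose scalar effect is
-// (sketch):
-//
-//   out0 = (int16_t)dct_const_round_shift(x * c0);
-//   out1 = (int16_t)dct_const_round_shift(x * c1);
-//
-// with c0/c1 the constant lanes that multiply the non-zero input x.
-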
-void av1_idct16x16_256_add_sse2(const int16_t *input, uint8_t *dest,
-                                int stride) {
-  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
-  const __m128i final_rounding = _mm_set1_epi16(1 << 5);
-  const __m128i zero = _mm_setzero_si128();
-
-  const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
-  const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
-  const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64);
-  const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64);
-  const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64);
-  const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64);
-  const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
-  const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);
-
-  const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
-  const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
-  const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64);
-  const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64);
-
-  const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
-  const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
-  const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
-  const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
-  const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
-  const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
-  const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
-  const __m128i stg4_7 = pair_set_epi16(-cospi_8_64, cospi_24_64);
-
-  const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
-
-  __m128i in[16], l[16], r[16], *curr1;
-  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7,
-      stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
-      stp1_8_0, stp1_12_0;
-  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
-      stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15;
-  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
-  int i;
-
-  curr1 = l;
-  for (i = 0; i < 2; i++) {
-    // First 1-D IDCT pass (rows).
-
-    // Load input data.
-    in[0] = _mm_load_si128((const __m128i *)input);
-    in[8] = _mm_load_si128((const __m128i *)(input + 8 * 1));
-    in[1] = _mm_load_si128((const __m128i *)(input + 8 * 2));
-    in[9] = _mm_load_si128((const __m128i *)(input + 8 * 3));
-    in[2] = _mm_load_si128((const __m128i *)(input + 8 * 4));
-    in[10] = _mm_load_si128((const __m128i *)(input + 8 * 5));
-    in[3] = _mm_load_si128((const __m128i *)(input + 8 * 6));
-    in[11] = _mm_load_si128((const __m128i *)(input + 8 * 7));
-    in[4] = _mm_load_si128((const __m128i *)(input + 8 * 8));
-    in[12] = _mm_load_si128((const __m128i *)(input + 8 * 9));
-    in[5] = _mm_load_si128((const __m128i *)(input + 8 * 10));
-    in[13] = _mm_load_si128((const __m128i *)(input + 8 * 11));
-    in[6] = _mm_load_si128((const __m128i *)(input + 8 * 12));
-    in[14] = _mm_load_si128((const __m128i *)(input + 8 * 13));
-    in[7] = _mm_load_si128((const __m128i *)(input + 8 * 14));
-    in[15] = _mm_load_si128((const __m128i *)(input + 8 * 15));
-
-    array_transpose_8x8(in, in);
-    array_transpose_8x8(in + 8, in + 8);
-
-    IDCT16
-
-    // Stage7
-    curr1[0] = _mm_add_epi16(stp2_0, stp1_15);
-    curr1[1] = _mm_add_epi16(stp2_1, stp1_14);
-    curr1[2] = _mm_add_epi16(stp2_2, stp2_13);
-    curr1[3] = _mm_add_epi16(stp2_3, stp2_12);
-    curr1[4] = _mm_add_epi16(stp2_4, stp2_11);
-    curr1[5] = _mm_add_epi16(stp2_5, stp2_10);
-    curr1[6] = _mm_add_epi16(stp2_6, stp1_9);
-    curr1[7] = _mm_add_epi16(stp2_7, stp1_8);
-    curr1[8] = _mm_sub_epi16(stp2_7, stp1_8);
-    curr1[9] = _mm_sub_epi16(stp2_6, stp1_9);
-    curr1[10] = _mm_sub_epi16(stp2_5, stp2_10);
-    curr1[11] = _mm_sub_epi16(stp2_4, stp2_11);
-    curr1[12] = _mm_sub_epi16(stp2_3, stp2_12);
-    curr1[13] = _mm_sub_epi16(stp2_2, stp2_13);
-    curr1[14] = _mm_sub_epi16(stp2_1, stp1_14);
-    curr1[15] = _mm_sub_epi16(stp2_0, stp1_15);
-
-    curr1 = r;
-    input += 128;
-  }
-  for (i = 0; i < 2; i++) {
-    int j;
-    // Second 1-D IDCT pass (columns).
-    array_transpose_8x8(l + i * 8, in);
-    array_transpose_8x8(r + i * 8, in + 8);
-
-    IDCT16
-
-    // Stage7: combine into the final 2-D result.
-    in[0] = _mm_add_epi16(stp2_0, stp1_15);
-    in[1] = _mm_add_epi16(stp2_1, stp1_14);
-    in[2] = _mm_add_epi16(stp2_2, stp2_13);
-    in[3] = _mm_add_epi16(stp2_3, stp2_12);
-    in[4] = _mm_add_epi16(stp2_4, stp2_11);
-    in[5] = _mm_add_epi16(stp2_5, stp2_10);
-    in[6] = _mm_add_epi16(stp2_6, stp1_9);
-    in[7] = _mm_add_epi16(stp2_7, stp1_8);
-    in[8] = _mm_sub_epi16(stp2_7, stp1_8);
-    in[9] = _mm_sub_epi16(stp2_6, stp1_9);
-    in[10] = _mm_sub_epi16(stp2_5, stp2_10);
-    in[11] = _mm_sub_epi16(stp2_4, stp2_11);
-    in[12] = _mm_sub_epi16(stp2_3, stp2_12);
-    in[13] = _mm_sub_epi16(stp2_2, stp2_13);
-    in[14] = _mm_sub_epi16(stp2_1, stp1_14);
-    in[15] = _mm_sub_epi16(stp2_0, stp1_15);
-
-    for (j = 0; j < 16; ++j) {
-      // Final rounding and shift
-      in[j] = _mm_adds_epi16(in[j], final_rounding);
-      in[j] = _mm_srai_epi16(in[j], 6);
-      RECON_AND_STORE(dest + j * stride, in[j]);
-    }
-
-    dest += 8;
-  }
-}
-
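-// Shape of av1_idct16x16_256_add_sse2() above, for reference: pass 1 runs
-// IDCT16 over the two 8-row halves of the input and parks the results in
-// l[] and r[]; pass 2 transposes them back, runs IDCT16 again for the other
-// dimension, then rounds with ROUND_POWER_OF_TWO(x, 6) (add 1 << 5, shift
-// right by 6) and accumulates into the prediction via RECON_AND_STORE().
-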
-void av1_idct16x16_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
-  __m128i dc_value;
-  const __m128i zero = _mm_setzero_si128();
-  int a, i;
-
-  a = (int)dct_const_round_shift(input[0] * cospi_16_64);
-  a = (int)dct_const_round_shift(a * cospi_16_64);
-  a = ROUND_POWER_OF_TWO(a, 6);
-
-  dc_value = _mm_set1_epi16(a);
-
-  for (i = 0; i < 2; ++i) {
-    RECON_AND_STORE(dest + 0 * stride, dc_value);
-    RECON_AND_STORE(dest + 1 * stride, dc_value);
-    RECON_AND_STORE(dest + 2 * stride, dc_value);
-    RECON_AND_STORE(dest + 3 * stride, dc_value);
-    RECON_AND_STORE(dest + 4 * stride, dc_value);
-    RECON_AND_STORE(dest + 5 * stride, dc_value);
-    RECON_AND_STORE(dest + 6 * stride, dc_value);
-    RECON_AND_STORE(dest + 7 * stride, dc_value);
-    RECON_AND_STORE(dest + 8 * stride, dc_value);
-    RECON_AND_STORE(dest + 9 * stride, dc_value);
-    RECON_AND_STORE(dest + 10 * stride, dc_value);
-    RECON_AND_STORE(dest + 11 * stride, dc_value);
-    RECON_AND_STORE(dest + 12 * stride, dc_value);
-    RECON_AND_STORE(dest + 13 * stride, dc_value);
-    RECON_AND_STORE(dest + 14 * stride, dc_value);
-    RECON_AND_STORE(dest + 15 * stride, dc_value);
-    dest += 8;
-  }
-}
-
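-// DC-only path above: with a single non-zero coefficient, both 1-D passes
-// reduce to a multiply by cospi_16_64 (11585 == round(16384 * cos(pi/4))),
-// so the whole block reconstructs to prediction + a, where (sketch of the
-// arithmetic performed above):
-//
-//   a = ROUND_POWER_OF_TWO(
-//       dct_const_round_shift(dct_const_round_shift(input[0] * cospi_16_64) *
-//                             cospi_16_64),
-//       6);
-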
-static void av1_iadst16_8col(__m128i *in) {
-  // Perform a 16-point 1-D ADST on 8 columns at a time.
-  __m128i s[16], x[16], u[32], v[32];
-  const __m128i k__cospi_p01_p31 = pair_set_epi16(cospi_1_64, cospi_31_64);
-  const __m128i k__cospi_p31_m01 = pair_set_epi16(cospi_31_64, -cospi_1_64);
-  const __m128i k__cospi_p05_p27 = pair_set_epi16(cospi_5_64, cospi_27_64);
-  const __m128i k__cospi_p27_m05 = pair_set_epi16(cospi_27_64, -cospi_5_64);
-  const __m128i k__cospi_p09_p23 = pair_set_epi16(cospi_9_64, cospi_23_64);
-  const __m128i k__cospi_p23_m09 = pair_set_epi16(cospi_23_64, -cospi_9_64);
-  const __m128i k__cospi_p13_p19 = pair_set_epi16(cospi_13_64, cospi_19_64);
-  const __m128i k__cospi_p19_m13 = pair_set_epi16(cospi_19_64, -cospi_13_64);
-  const __m128i k__cospi_p17_p15 = pair_set_epi16(cospi_17_64, cospi_15_64);
-  const __m128i k__cospi_p15_m17 = pair_set_epi16(cospi_15_64, -cospi_17_64);
-  const __m128i k__cospi_p21_p11 = pair_set_epi16(cospi_21_64, cospi_11_64);
-  const __m128i k__cospi_p11_m21 = pair_set_epi16(cospi_11_64, -cospi_21_64);
-  const __m128i k__cospi_p25_p07 = pair_set_epi16(cospi_25_64, cospi_7_64);
-  const __m128i k__cospi_p07_m25 = pair_set_epi16(cospi_7_64, -cospi_25_64);
-  const __m128i k__cospi_p29_p03 = pair_set_epi16(cospi_29_64, cospi_3_64);
-  const __m128i k__cospi_p03_m29 = pair_set_epi16(cospi_3_64, -cospi_29_64);
-  const __m128i k__cospi_p04_p28 = pair_set_epi16(cospi_4_64, cospi_28_64);
-  const __m128i k__cospi_p28_m04 = pair_set_epi16(cospi_28_64, -cospi_4_64);
-  const __m128i k__cospi_p20_p12 = pair_set_epi16(cospi_20_64, cospi_12_64);
-  const __m128i k__cospi_p12_m20 = pair_set_epi16(cospi_12_64, -cospi_20_64);
-  const __m128i k__cospi_m28_p04 = pair_set_epi16(-cospi_28_64, cospi_4_64);
-  const __m128i k__cospi_m12_p20 = pair_set_epi16(-cospi_12_64, cospi_20_64);
-  const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
-  const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
-  const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64);
-  const __m128i k__cospi_m16_m16 = _mm_set1_epi16((int16_t)-cospi_16_64);
-  const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
-  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
-  const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64);
-  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
-  const __m128i kZero = _mm_set1_epi16(0);
-
-  u[0] = _mm_unpacklo_epi16(in[15], in[0]);
-  u[1] = _mm_unpackhi_epi16(in[15], in[0]);
-  u[2] = _mm_unpacklo_epi16(in[13], in[2]);
-  u[3] = _mm_unpackhi_epi16(in[13], in[2]);
-  u[4] = _mm_unpacklo_epi16(in[11], in[4]);
-  u[5] = _mm_unpackhi_epi16(in[11], in[4]);
-  u[6] = _mm_unpacklo_epi16(in[9], in[6]);
-  u[7] = _mm_unpackhi_epi16(in[9], in[6]);
-  u[8] = _mm_unpacklo_epi16(in[7], in[8]);
-  u[9] = _mm_unpackhi_epi16(in[7], in[8]);
-  u[10] = _mm_unpacklo_epi16(in[5], in[10]);
-  u[11] = _mm_unpackhi_epi16(in[5], in[10]);
-  u[12] = _mm_unpacklo_epi16(in[3], in[12]);
-  u[13] = _mm_unpackhi_epi16(in[3], in[12]);
-  u[14] = _mm_unpacklo_epi16(in[1], in[14]);
-  u[15] = _mm_unpackhi_epi16(in[1], in[14]);
-
-  v[0] = _mm_madd_epi16(u[0], k__cospi_p01_p31);
-  v[1] = _mm_madd_epi16(u[1], k__cospi_p01_p31);
-  v[2] = _mm_madd_epi16(u[0], k__cospi_p31_m01);
-  v[3] = _mm_madd_epi16(u[1], k__cospi_p31_m01);
-  v[4] = _mm_madd_epi16(u[2], k__cospi_p05_p27);
-  v[5] = _mm_madd_epi16(u[3], k__cospi_p05_p27);
-  v[6] = _mm_madd_epi16(u[2], k__cospi_p27_m05);
-  v[7] = _mm_madd_epi16(u[3], k__cospi_p27_m05);
-  v[8] = _mm_madd_epi16(u[4], k__cospi_p09_p23);
-  v[9] = _mm_madd_epi16(u[5], k__cospi_p09_p23);
-  v[10] = _mm_madd_epi16(u[4], k__cospi_p23_m09);
-  v[11] = _mm_madd_epi16(u[5], k__cospi_p23_m09);
-  v[12] = _mm_madd_epi16(u[6], k__cospi_p13_p19);
-  v[13] = _mm_madd_epi16(u[7], k__cospi_p13_p19);
-  v[14] = _mm_madd_epi16(u[6], k__cospi_p19_m13);
-  v[15] = _mm_madd_epi16(u[7], k__cospi_p19_m13);
-  v[16] = _mm_madd_epi16(u[8], k__cospi_p17_p15);
-  v[17] = _mm_madd_epi16(u[9], k__cospi_p17_p15);
-  v[18] = _mm_madd_epi16(u[8], k__cospi_p15_m17);
-  v[19] = _mm_madd_epi16(u[9], k__cospi_p15_m17);
-  v[20] = _mm_madd_epi16(u[10], k__cospi_p21_p11);
-  v[21] = _mm_madd_epi16(u[11], k__cospi_p21_p11);
-  v[22] = _mm_madd_epi16(u[10], k__cospi_p11_m21);
-  v[23] = _mm_madd_epi16(u[11], k__cospi_p11_m21);
-  v[24] = _mm_madd_epi16(u[12], k__cospi_p25_p07);
-  v[25] = _mm_madd_epi16(u[13], k__cospi_p25_p07);
-  v[26] = _mm_madd_epi16(u[12], k__cospi_p07_m25);
-  v[27] = _mm_madd_epi16(u[13], k__cospi_p07_m25);
-  v[28] = _mm_madd_epi16(u[14], k__cospi_p29_p03);
-  v[29] = _mm_madd_epi16(u[15], k__cospi_p29_p03);
-  v[30] = _mm_madd_epi16(u[14], k__cospi_p03_m29);
-  v[31] = _mm_madd_epi16(u[15], k__cospi_p03_m29);
-
-  u[0] = _mm_add_epi32(v[0], v[16]);
-  u[1] = _mm_add_epi32(v[1], v[17]);
-  u[2] = _mm_add_epi32(v[2], v[18]);
-  u[3] = _mm_add_epi32(v[3], v[19]);
-  u[4] = _mm_add_epi32(v[4], v[20]);
-  u[5] = _mm_add_epi32(v[5], v[21]);
-  u[6] = _mm_add_epi32(v[6], v[22]);
-  u[7] = _mm_add_epi32(v[7], v[23]);
-  u[8] = _mm_add_epi32(v[8], v[24]);
-  u[9] = _mm_add_epi32(v[9], v[25]);
-  u[10] = _mm_add_epi32(v[10], v[26]);
-  u[11] = _mm_add_epi32(v[11], v[27]);
-  u[12] = _mm_add_epi32(v[12], v[28]);
-  u[13] = _mm_add_epi32(v[13], v[29]);
-  u[14] = _mm_add_epi32(v[14], v[30]);
-  u[15] = _mm_add_epi32(v[15], v[31]);
-  u[16] = _mm_sub_epi32(v[0], v[16]);
-  u[17] = _mm_sub_epi32(v[1], v[17]);
-  u[18] = _mm_sub_epi32(v[2], v[18]);
-  u[19] = _mm_sub_epi32(v[3], v[19]);
-  u[20] = _mm_sub_epi32(v[4], v[20]);
-  u[21] = _mm_sub_epi32(v[5], v[21]);
-  u[22] = _mm_sub_epi32(v[6], v[22]);
-  u[23] = _mm_sub_epi32(v[7], v[23]);
-  u[24] = _mm_sub_epi32(v[8], v[24]);
-  u[25] = _mm_sub_epi32(v[9], v[25]);
-  u[26] = _mm_sub_epi32(v[10], v[26]);
-  u[27] = _mm_sub_epi32(v[11], v[27]);
-  u[28] = _mm_sub_epi32(v[12], v[28]);
-  u[29] = _mm_sub_epi32(v[13], v[29]);
-  u[30] = _mm_sub_epi32(v[14], v[30]);
-  u[31] = _mm_sub_epi32(v[15], v[31]);
-
-  v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
-  v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
-  v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
-  v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
-  v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
-  v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
-  v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
-  v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
-  v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
-  v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
-  v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
-  v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
-  v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
-  v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
-  v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
-  v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
-  v[16] = _mm_add_epi32(u[16], k__DCT_CONST_ROUNDING);
-  v[17] = _mm_add_epi32(u[17], k__DCT_CONST_ROUNDING);
-  v[18] = _mm_add_epi32(u[18], k__DCT_CONST_ROUNDING);
-  v[19] = _mm_add_epi32(u[19], k__DCT_CONST_ROUNDING);
-  v[20] = _mm_add_epi32(u[20], k__DCT_CONST_ROUNDING);
-  v[21] = _mm_add_epi32(u[21], k__DCT_CONST_ROUNDING);
-  v[22] = _mm_add_epi32(u[22], k__DCT_CONST_ROUNDING);
-  v[23] = _mm_add_epi32(u[23], k__DCT_CONST_ROUNDING);
-  v[24] = _mm_add_epi32(u[24], k__DCT_CONST_ROUNDING);
-  v[25] = _mm_add_epi32(u[25], k__DCT_CONST_ROUNDING);
-  v[26] = _mm_add_epi32(u[26], k__DCT_CONST_ROUNDING);
-  v[27] = _mm_add_epi32(u[27], k__DCT_CONST_ROUNDING);
-  v[28] = _mm_add_epi32(u[28], k__DCT_CONST_ROUNDING);
-  v[29] = _mm_add_epi32(u[29], k__DCT_CONST_ROUNDING);
-  v[30] = _mm_add_epi32(u[30], k__DCT_CONST_ROUNDING);
-  v[31] = _mm_add_epi32(u[31], k__DCT_CONST_ROUNDING);
-
-  u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
-  u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
-  u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
-  u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
-  u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
-  u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
-  u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
-  u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
-  u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
-  u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
-  u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
-  u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
-  u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
-  u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
-  u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
-  u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
-  u[16] = _mm_srai_epi32(v[16], DCT_CONST_BITS);
-  u[17] = _mm_srai_epi32(v[17], DCT_CONST_BITS);
-  u[18] = _mm_srai_epi32(v[18], DCT_CONST_BITS);
-  u[19] = _mm_srai_epi32(v[19], DCT_CONST_BITS);
-  u[20] = _mm_srai_epi32(v[20], DCT_CONST_BITS);
-  u[21] = _mm_srai_epi32(v[21], DCT_CONST_BITS);
-  u[22] = _mm_srai_epi32(v[22], DCT_CONST_BITS);
-  u[23] = _mm_srai_epi32(v[23], DCT_CONST_BITS);
-  u[24] = _mm_srai_epi32(v[24], DCT_CONST_BITS);
-  u[25] = _mm_srai_epi32(v[25], DCT_CONST_BITS);
-  u[26] = _mm_srai_epi32(v[26], DCT_CONST_BITS);
-  u[27] = _mm_srai_epi32(v[27], DCT_CONST_BITS);
-  u[28] = _mm_srai_epi32(v[28], DCT_CONST_BITS);
-  u[29] = _mm_srai_epi32(v[29], DCT_CONST_BITS);
-  u[30] = _mm_srai_epi32(v[30], DCT_CONST_BITS);
-  u[31] = _mm_srai_epi32(v[31], DCT_CONST_BITS);
-
-  s[0] = _mm_packs_epi32(u[0], u[1]);
-  s[1] = _mm_packs_epi32(u[2], u[3]);
-  s[2] = _mm_packs_epi32(u[4], u[5]);
-  s[3] = _mm_packs_epi32(u[6], u[7]);
-  s[4] = _mm_packs_epi32(u[8], u[9]);
-  s[5] = _mm_packs_epi32(u[10], u[11]);
-  s[6] = _mm_packs_epi32(u[12], u[13]);
-  s[7] = _mm_packs_epi32(u[14], u[15]);
-  s[8] = _mm_packs_epi32(u[16], u[17]);
-  s[9] = _mm_packs_epi32(u[18], u[19]);
-  s[10] = _mm_packs_epi32(u[20], u[21]);
-  s[11] = _mm_packs_epi32(u[22], u[23]);
-  s[12] = _mm_packs_epi32(u[24], u[25]);
-  s[13] = _mm_packs_epi32(u[26], u[27]);
-  s[14] = _mm_packs_epi32(u[28], u[29]);
-  s[15] = _mm_packs_epi32(u[30], u[31]);
-
-  // stage 2
-  u[0] = _mm_unpacklo_epi16(s[8], s[9]);
-  u[1] = _mm_unpackhi_epi16(s[8], s[9]);
-  u[2] = _mm_unpacklo_epi16(s[10], s[11]);
-  u[3] = _mm_unpackhi_epi16(s[10], s[11]);
-  u[4] = _mm_unpacklo_epi16(s[12], s[13]);
-  u[5] = _mm_unpackhi_epi16(s[12], s[13]);
-  u[6] = _mm_unpacklo_epi16(s[14], s[15]);
-  u[7] = _mm_unpackhi_epi16(s[14], s[15]);
-
-  v[0] = _mm_madd_epi16(u[0], k__cospi_p04_p28);
-  v[1] = _mm_madd_epi16(u[1], k__cospi_p04_p28);
-  v[2] = _mm_madd_epi16(u[0], k__cospi_p28_m04);
-  v[3] = _mm_madd_epi16(u[1], k__cospi_p28_m04);
-  v[4] = _mm_madd_epi16(u[2], k__cospi_p20_p12);
-  v[5] = _mm_madd_epi16(u[3], k__cospi_p20_p12);
-  v[6] = _mm_madd_epi16(u[2], k__cospi_p12_m20);
-  v[7] = _mm_madd_epi16(u[3], k__cospi_p12_m20);
-  v[8] = _mm_madd_epi16(u[4], k__cospi_m28_p04);
-  v[9] = _mm_madd_epi16(u[5], k__cospi_m28_p04);
-  v[10] = _mm_madd_epi16(u[4], k__cospi_p04_p28);
-  v[11] = _mm_madd_epi16(u[5], k__cospi_p04_p28);
-  v[12] = _mm_madd_epi16(u[6], k__cospi_m12_p20);
-  v[13] = _mm_madd_epi16(u[7], k__cospi_m12_p20);
-  v[14] = _mm_madd_epi16(u[6], k__cospi_p20_p12);
-  v[15] = _mm_madd_epi16(u[7], k__cospi_p20_p12);
-
-  u[0] = _mm_add_epi32(v[0], v[8]);
-  u[1] = _mm_add_epi32(v[1], v[9]);
-  u[2] = _mm_add_epi32(v[2], v[10]);
-  u[3] = _mm_add_epi32(v[3], v[11]);
-  u[4] = _mm_add_epi32(v[4], v[12]);
-  u[5] = _mm_add_epi32(v[5], v[13]);
-  u[6] = _mm_add_epi32(v[6], v[14]);
-  u[7] = _mm_add_epi32(v[7], v[15]);
-  u[8] = _mm_sub_epi32(v[0], v[8]);
-  u[9] = _mm_sub_epi32(v[1], v[9]);
-  u[10] = _mm_sub_epi32(v[2], v[10]);
-  u[11] = _mm_sub_epi32(v[3], v[11]);
-  u[12] = _mm_sub_epi32(v[4], v[12]);
-  u[13] = _mm_sub_epi32(v[5], v[13]);
-  u[14] = _mm_sub_epi32(v[6], v[14]);
-  u[15] = _mm_sub_epi32(v[7], v[15]);
-
-  v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
-  v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
-  v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
-  v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
-  v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
-  v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
-  v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
-  v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
-  v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
-  v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
-  v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
-  v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
-  v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
-  v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
-  v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
-  v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
-
-  u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
-  u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
-  u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
-  u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
-  u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
-  u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
-  u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
-  u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
-  u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
-  u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
-  u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
-  u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
-  u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
-  u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
-  u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
-  u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
-
-  x[0] = _mm_add_epi16(s[0], s[4]);
-  x[1] = _mm_add_epi16(s[1], s[5]);
-  x[2] = _mm_add_epi16(s[2], s[6]);
-  x[3] = _mm_add_epi16(s[3], s[7]);
-  x[4] = _mm_sub_epi16(s[0], s[4]);
-  x[5] = _mm_sub_epi16(s[1], s[5]);
-  x[6] = _mm_sub_epi16(s[2], s[6]);
-  x[7] = _mm_sub_epi16(s[3], s[7]);
-  x[8] = _mm_packs_epi32(u[0], u[1]);
-  x[9] = _mm_packs_epi32(u[2], u[3]);
-  x[10] = _mm_packs_epi32(u[4], u[5]);
-  x[11] = _mm_packs_epi32(u[6], u[7]);
-  x[12] = _mm_packs_epi32(u[8], u[9]);
-  x[13] = _mm_packs_epi32(u[10], u[11]);
-  x[14] = _mm_packs_epi32(u[12], u[13]);
-  x[15] = _mm_packs_epi32(u[14], u[15]);
-
-  // stage 3
-  u[0] = _mm_unpacklo_epi16(x[4], x[5]);
-  u[1] = _mm_unpackhi_epi16(x[4], x[5]);
-  u[2] = _mm_unpacklo_epi16(x[6], x[7]);
-  u[3] = _mm_unpackhi_epi16(x[6], x[7]);
-  u[4] = _mm_unpacklo_epi16(x[12], x[13]);
-  u[5] = _mm_unpackhi_epi16(x[12], x[13]);
-  u[6] = _mm_unpacklo_epi16(x[14], x[15]);
-  u[7] = _mm_unpackhi_epi16(x[14], x[15]);
-
-  v[0] = _mm_madd_epi16(u[0], k__cospi_p08_p24);
-  v[1] = _mm_madd_epi16(u[1], k__cospi_p08_p24);
-  v[2] = _mm_madd_epi16(u[0], k__cospi_p24_m08);
-  v[3] = _mm_madd_epi16(u[1], k__cospi_p24_m08);
-  v[4] = _mm_madd_epi16(u[2], k__cospi_m24_p08);
-  v[5] = _mm_madd_epi16(u[3], k__cospi_m24_p08);
-  v[6] = _mm_madd_epi16(u[2], k__cospi_p08_p24);
-  v[7] = _mm_madd_epi16(u[3], k__cospi_p08_p24);
-  v[8] = _mm_madd_epi16(u[4], k__cospi_p08_p24);
-  v[9] = _mm_madd_epi16(u[5], k__cospi_p08_p24);
-  v[10] = _mm_madd_epi16(u[4], k__cospi_p24_m08);
-  v[11] = _mm_madd_epi16(u[5], k__cospi_p24_m08);
-  v[12] = _mm_madd_epi16(u[6], k__cospi_m24_p08);
-  v[13] = _mm_madd_epi16(u[7], k__cospi_m24_p08);
-  v[14] = _mm_madd_epi16(u[6], k__cospi_p08_p24);
-  v[15] = _mm_madd_epi16(u[7], k__cospi_p08_p24);
-
-  u[0] = _mm_add_epi32(v[0], v[4]);
-  u[1] = _mm_add_epi32(v[1], v[5]);
-  u[2] = _mm_add_epi32(v[2], v[6]);
-  u[3] = _mm_add_epi32(v[3], v[7]);
-  u[4] = _mm_sub_epi32(v[0], v[4]);
-  u[5] = _mm_sub_epi32(v[1], v[5]);
-  u[6] = _mm_sub_epi32(v[2], v[6]);
-  u[7] = _mm_sub_epi32(v[3], v[7]);
-  u[8] = _mm_add_epi32(v[8], v[12]);
-  u[9] = _mm_add_epi32(v[9], v[13]);
-  u[10] = _mm_add_epi32(v[10], v[14]);
-  u[11] = _mm_add_epi32(v[11], v[15]);
-  u[12] = _mm_sub_epi32(v[8], v[12]);
-  u[13] = _mm_sub_epi32(v[9], v[13]);
-  u[14] = _mm_sub_epi32(v[10], v[14]);
-  u[15] = _mm_sub_epi32(v[11], v[15]);
-
-  u[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
-  u[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
-  u[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
-  u[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
-  u[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
-  u[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
-  u[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
-  u[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
-  u[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
-  u[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
-  u[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
-  u[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
-  u[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
-  u[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
-  u[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
-  u[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
-
-  v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
-  v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
-  v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
-  v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
-  v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
-  v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
-  v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
-  v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
-  v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
-  v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
-  v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
-  v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
-  v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
-  v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
-  v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
-  v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
-
-  s[0] = _mm_add_epi16(x[0], x[2]);
-  s[1] = _mm_add_epi16(x[1], x[3]);
-  s[2] = _mm_sub_epi16(x[0], x[2]);
-  s[3] = _mm_sub_epi16(x[1], x[3]);
-  s[4] = _mm_packs_epi32(v[0], v[1]);
-  s[5] = _mm_packs_epi32(v[2], v[3]);
-  s[6] = _mm_packs_epi32(v[4], v[5]);
-  s[7] = _mm_packs_epi32(v[6], v[7]);
-  s[8] = _mm_add_epi16(x[8], x[10]);
-  s[9] = _mm_add_epi16(x[9], x[11]);
-  s[10] = _mm_sub_epi16(x[8], x[10]);
-  s[11] = _mm_sub_epi16(x[9], x[11]);
-  s[12] = _mm_packs_epi32(v[8], v[9]);
-  s[13] = _mm_packs_epi32(v[10], v[11]);
-  s[14] = _mm_packs_epi32(v[12], v[13]);
-  s[15] = _mm_packs_epi32(v[14], v[15]);
-
-  // stage 4
-  u[0] = _mm_unpacklo_epi16(s[2], s[3]);
-  u[1] = _mm_unpackhi_epi16(s[2], s[3]);
-  u[2] = _mm_unpacklo_epi16(s[6], s[7]);
-  u[3] = _mm_unpackhi_epi16(s[6], s[7]);
-  u[4] = _mm_unpacklo_epi16(s[10], s[11]);
-  u[5] = _mm_unpackhi_epi16(s[10], s[11]);
-  u[6] = _mm_unpacklo_epi16(s[14], s[15]);
-  u[7] = _mm_unpackhi_epi16(s[14], s[15]);
-
-  v[0] = _mm_madd_epi16(u[0], k__cospi_m16_m16);
-  v[1] = _mm_madd_epi16(u[1], k__cospi_m16_m16);
-  v[2] = _mm_madd_epi16(u[0], k__cospi_p16_m16);
-  v[3] = _mm_madd_epi16(u[1], k__cospi_p16_m16);
-  v[4] = _mm_madd_epi16(u[2], k__cospi_p16_p16);
-  v[5] = _mm_madd_epi16(u[3], k__cospi_p16_p16);
-  v[6] = _mm_madd_epi16(u[2], k__cospi_m16_p16);
-  v[7] = _mm_madd_epi16(u[3], k__cospi_m16_p16);
-  v[8] = _mm_madd_epi16(u[4], k__cospi_p16_p16);
-  v[9] = _mm_madd_epi16(u[5], k__cospi_p16_p16);
-  v[10] = _mm_madd_epi16(u[4], k__cospi_m16_p16);
-  v[11] = _mm_madd_epi16(u[5], k__cospi_m16_p16);
-  v[12] = _mm_madd_epi16(u[6], k__cospi_m16_m16);
-  v[13] = _mm_madd_epi16(u[7], k__cospi_m16_m16);
-  v[14] = _mm_madd_epi16(u[6], k__cospi_p16_m16);
-  v[15] = _mm_madd_epi16(u[7], k__cospi_p16_m16);
-
-  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
-  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
-  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
-  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
-  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
-  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
-  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
-  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
-  u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
-  u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
-  u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
-  u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
-  u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
-  u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
-  u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
-  u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);
-
-  v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
-  v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
-  v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
-  v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
-  v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
-  v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
-  v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
-  v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
-  v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
-  v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
-  v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
-  v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
-  v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
-  v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
-  v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
-  v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
-
-  in[0] = s[0];
-  in[1] = _mm_sub_epi16(kZero, s[8]);
-  in[2] = s[12];
-  in[3] = _mm_sub_epi16(kZero, s[4]);
-  in[4] = _mm_packs_epi32(v[4], v[5]);
-  in[5] = _mm_packs_epi32(v[12], v[13]);
-  in[6] = _mm_packs_epi32(v[8], v[9]);
-  in[7] = _mm_packs_epi32(v[0], v[1]);
-  in[8] = _mm_packs_epi32(v[2], v[3]);
-  in[9] = _mm_packs_epi32(v[10], v[11]);
-  in[10] = _mm_packs_epi32(v[14], v[15]);
-  in[11] = _mm_packs_epi32(v[6], v[7]);
-  in[12] = s[5];
-  in[13] = _mm_sub_epi16(kZero, s[13]);
-  in[14] = s[9];
-  in[15] = _mm_sub_epi16(kZero, s[1]);
-}
-
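-// av1_iadst16_8col() above is the usual four-stage iadst16 lattice. Stage 1
-// rotates the eight reversed input pairs with the odd constants
-// cospi_1_64 ... cospi_31_64; one such rotation, in scalar form (sketch):
-//
-//   s0 = dct_const_round_shift(a * cospi_k_64 + b * cospi_(32-k)_64);
-//   s1 = dct_const_round_shift(a * cospi_(32-k)_64 - b * cospi_k_64);
-//
-// Stages 2-4 repeat the pattern with the 4/28, 20/12, 8/24 and 16/16 pairs
-// on progressively smaller halves, and the output permutation applies the
-// ADST sign flips (in[1] = -s[8], in[3] = -s[4], ... via subtraction from
-// kZero).
-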
-static void av1_idct16_8col(__m128i *in) {
-  const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64);
-  const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64);
-  const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64);
-  const __m128i k__cospi_p18_p14 = pair_set_epi16(cospi_18_64, cospi_14_64);
-  const __m128i k__cospi_p22_m10 = pair_set_epi16(cospi_22_64, -cospi_10_64);
-  const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64);
-  const __m128i k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64);
-  const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64);
-  const __m128i k__cospi_p28_m04 = pair_set_epi16(cospi_28_64, -cospi_4_64);
-  const __m128i k__cospi_p04_p28 = pair_set_epi16(cospi_4_64, cospi_28_64);
-  const __m128i k__cospi_p12_m20 = pair_set_epi16(cospi_12_64, -cospi_20_64);
-  const __m128i k__cospi_p20_p12 = pair_set_epi16(cospi_20_64, cospi_12_64);
-  const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
-  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
-  const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
-  const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
-  const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
-  const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
-  const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
-  const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64);
-  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
-  __m128i v[16], u[16], s[16], t[16];
-
-  // stage 1
-  s[0] = in[0];
-  s[1] = in[8];
-  s[2] = in[4];
-  s[3] = in[12];
-  s[4] = in[2];
-  s[5] = in[10];
-  s[6] = in[6];
-  s[7] = in[14];
-  s[8] = in[1];
-  s[9] = in[9];
-  s[10] = in[5];
-  s[11] = in[13];
-  s[12] = in[3];
-  s[13] = in[11];
-  s[14] = in[7];
-  s[15] = in[15];
-
-  // stage 2
-  u[0] = _mm_unpacklo_epi16(s[8], s[15]);
-  u[1] = _mm_unpackhi_epi16(s[8], s[15]);
-  u[2] = _mm_unpacklo_epi16(s[9], s[14]);
-  u[3] = _mm_unpackhi_epi16(s[9], s[14]);
-  u[4] = _mm_unpacklo_epi16(s[10], s[13]);
-  u[5] = _mm_unpackhi_epi16(s[10], s[13]);
-  u[6] = _mm_unpacklo_epi16(s[11], s[12]);
-  u[7] = _mm_unpackhi_epi16(s[11], s[12]);
-
-  v[0] = _mm_madd_epi16(u[0], k__cospi_p30_m02);
-  v[1] = _mm_madd_epi16(u[1], k__cospi_p30_m02);
-  v[2] = _mm_madd_epi16(u[0], k__cospi_p02_p30);
-  v[3] = _mm_madd_epi16(u[1], k__cospi_p02_p30);
-  v[4] = _mm_madd_epi16(u[2], k__cospi_p14_m18);
-  v[5] = _mm_madd_epi16(u[3], k__cospi_p14_m18);
-  v[6] = _mm_madd_epi16(u[2], k__cospi_p18_p14);
-  v[7] = _mm_madd_epi16(u[3], k__cospi_p18_p14);
-  v[8] = _mm_madd_epi16(u[4], k__cospi_p22_m10);
-  v[9] = _mm_madd_epi16(u[5], k__cospi_p22_m10);
-  v[10] = _mm_madd_epi16(u[4], k__cospi_p10_p22);
-  v[11] = _mm_madd_epi16(u[5], k__cospi_p10_p22);
-  v[12] = _mm_madd_epi16(u[6], k__cospi_p06_m26);
-  v[13] = _mm_madd_epi16(u[7], k__cospi_p06_m26);
-  v[14] = _mm_madd_epi16(u[6], k__cospi_p26_p06);
-  v[15] = _mm_madd_epi16(u[7], k__cospi_p26_p06);
-
-  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
-  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
-  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
-  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
-  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
-  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
-  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
-  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
-  u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
-  u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
-  u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
-  u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
-  u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
-  u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
-  u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
-  u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);
-
-  u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
-  u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
-  u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
-  u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
-  u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
-  u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
-  u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
-  u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
-  u[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
-  u[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
-  u[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
-  u[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
-  u[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
-  u[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
-  u[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
-  u[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
-
-  s[8] = _mm_packs_epi32(u[0], u[1]);
-  s[15] = _mm_packs_epi32(u[2], u[3]);
-  s[9] = _mm_packs_epi32(u[4], u[5]);
-  s[14] = _mm_packs_epi32(u[6], u[7]);
-  s[10] = _mm_packs_epi32(u[8], u[9]);
-  s[13] = _mm_packs_epi32(u[10], u[11]);
-  s[11] = _mm_packs_epi32(u[12], u[13]);
-  s[12] = _mm_packs_epi32(u[14], u[15]);
-
-  // stage 3
-  t[0] = s[0];
-  t[1] = s[1];
-  t[2] = s[2];
-  t[3] = s[3];
-  u[0] = _mm_unpacklo_epi16(s[4], s[7]);
-  u[1] = _mm_unpackhi_epi16(s[4], s[7]);
-  u[2] = _mm_unpacklo_epi16(s[5], s[6]);
-  u[3] = _mm_unpackhi_epi16(s[5], s[6]);
-
-  v[0] = _mm_madd_epi16(u[0], k__cospi_p28_m04);
-  v[1] = _mm_madd_epi16(u[1], k__cospi_p28_m04);
-  v[2] = _mm_madd_epi16(u[0], k__cospi_p04_p28);
-  v[3] = _mm_madd_epi16(u[1], k__cospi_p04_p28);
-  v[4] = _mm_madd_epi16(u[2], k__cospi_p12_m20);
-  v[5] = _mm_madd_epi16(u[3], k__cospi_p12_m20);
-  v[6] = _mm_madd_epi16(u[2], k__cospi_p20_p12);
-  v[7] = _mm_madd_epi16(u[3], k__cospi_p20_p12);
-
-  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
-  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
-  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
-  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
-  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
-  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
-  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
-  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
-
-  u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
-  u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
-  u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
-  u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
-  u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
-  u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
-  u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
-  u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
-
-  t[4] = _mm_packs_epi32(u[0], u[1]);
-  t[7] = _mm_packs_epi32(u[2], u[3]);
-  t[5] = _mm_packs_epi32(u[4], u[5]);
-  t[6] = _mm_packs_epi32(u[6], u[7]);
-  t[8] = _mm_add_epi16(s[8], s[9]);
-  t[9] = _mm_sub_epi16(s[8], s[9]);
-  t[10] = _mm_sub_epi16(s[11], s[10]);
-  t[11] = _mm_add_epi16(s[10], s[11]);
-  t[12] = _mm_add_epi16(s[12], s[13]);
-  t[13] = _mm_sub_epi16(s[12], s[13]);
-  t[14] = _mm_sub_epi16(s[15], s[14]);
-  t[15] = _mm_add_epi16(s[14], s[15]);
-
-  // stage 4
-  u[0] = _mm_unpacklo_epi16(t[0], t[1]);
-  u[1] = _mm_unpackhi_epi16(t[0], t[1]);
-  u[2] = _mm_unpacklo_epi16(t[2], t[3]);
-  u[3] = _mm_unpackhi_epi16(t[2], t[3]);
-  u[4] = _mm_unpacklo_epi16(t[9], t[14]);
-  u[5] = _mm_unpackhi_epi16(t[9], t[14]);
-  u[6] = _mm_unpacklo_epi16(t[10], t[13]);
-  u[7] = _mm_unpackhi_epi16(t[10], t[13]);
-
-  v[0] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
-  v[1] = _mm_madd_epi16(u[1], k__cospi_p16_p16);
-  v[2] = _mm_madd_epi16(u[0], k__cospi_p16_m16);
-  v[3] = _mm_madd_epi16(u[1], k__cospi_p16_m16);
-  v[4] = _mm_madd_epi16(u[2], k__cospi_p24_m08);
-  v[5] = _mm_madd_epi16(u[3], k__cospi_p24_m08);
-  v[6] = _mm_madd_epi16(u[2], k__cospi_p08_p24);
-  v[7] = _mm_madd_epi16(u[3], k__cospi_p08_p24);
-  v[8] = _mm_madd_epi16(u[4], k__cospi_m08_p24);
-  v[9] = _mm_madd_epi16(u[5], k__cospi_m08_p24);
-  v[10] = _mm_madd_epi16(u[4], k__cospi_p24_p08);
-  v[11] = _mm_madd_epi16(u[5], k__cospi_p24_p08);
-  v[12] = _mm_madd_epi16(u[6], k__cospi_m24_m08);
-  v[13] = _mm_madd_epi16(u[7], k__cospi_m24_m08);
-  v[14] = _mm_madd_epi16(u[6], k__cospi_m08_p24);
-  v[15] = _mm_madd_epi16(u[7], k__cospi_m08_p24);
-
-  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
-  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
-  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
-  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
-  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
-  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
-  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
-  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
-  u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
-  u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
-  u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
-  u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
-  u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
-  u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
-  u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
-  u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);
-
-  u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
-  u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
-  u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
-  u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
-  u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
-  u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
-  u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
-  u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
-  u[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
-  u[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
-  u[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
-  u[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
-  u[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
-  u[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
-  u[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
-  u[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
-
-  s[0] = _mm_packs_epi32(u[0], u[1]);
-  s[1] = _mm_packs_epi32(u[2], u[3]);
-  s[2] = _mm_packs_epi32(u[4], u[5]);
-  s[3] = _mm_packs_epi32(u[6], u[7]);
-  s[4] = _mm_add_epi16(t[4], t[5]);
-  s[5] = _mm_sub_epi16(t[4], t[5]);
-  s[6] = _mm_sub_epi16(t[7], t[6]);
-  s[7] = _mm_add_epi16(t[6], t[7]);
-  s[8] = t[8];
-  s[15] = t[15];
-  s[9] = _mm_packs_epi32(u[8], u[9]);
-  s[14] = _mm_packs_epi32(u[10], u[11]);
-  s[10] = _mm_packs_epi32(u[12], u[13]);
-  s[13] = _mm_packs_epi32(u[14], u[15]);
-  s[11] = t[11];
-  s[12] = t[12];
-
-  // stage 5
-  t[0] = _mm_add_epi16(s[0], s[3]);
-  t[1] = _mm_add_epi16(s[1], s[2]);
-  t[2] = _mm_sub_epi16(s[1], s[2]);
-  t[3] = _mm_sub_epi16(s[0], s[3]);
-  t[4] = s[4];
-  t[7] = s[7];
-
-  u[0] = _mm_unpacklo_epi16(s[5], s[6]);
-  u[1] = _mm_unpackhi_epi16(s[5], s[6]);
-  v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16);
-  v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16);
-  v[2] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
-  v[3] = _mm_madd_epi16(u[1], k__cospi_p16_p16);
-  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
-  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
-  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
-  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
-  u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
-  u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
-  u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
-  u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
-  t[5] = _mm_packs_epi32(u[0], u[1]);
-  t[6] = _mm_packs_epi32(u[2], u[3]);
-
-  t[8] = _mm_add_epi16(s[8], s[11]);
-  t[9] = _mm_add_epi16(s[9], s[10]);
-  t[10] = _mm_sub_epi16(s[9], s[10]);
-  t[11] = _mm_sub_epi16(s[8], s[11]);
-  t[12] = _mm_sub_epi16(s[15], s[12]);
-  t[13] = _mm_sub_epi16(s[14], s[13]);
-  t[14] = _mm_add_epi16(s[13], s[14]);
-  t[15] = _mm_add_epi16(s[12], s[15]);
-
-  // stage 6
-  s[0] = _mm_add_epi16(t[0], t[7]);
-  s[1] = _mm_add_epi16(t[1], t[6]);
-  s[2] = _mm_add_epi16(t[2], t[5]);
-  s[3] = _mm_add_epi16(t[3], t[4]);
-  s[4] = _mm_sub_epi16(t[3], t[4]);
-  s[5] = _mm_sub_epi16(t[2], t[5]);
-  s[6] = _mm_sub_epi16(t[1], t[6]);
-  s[7] = _mm_sub_epi16(t[0], t[7]);
-  s[8] = t[8];
-  s[9] = t[9];
-
-  u[0] = _mm_unpacklo_epi16(t[10], t[13]);
-  u[1] = _mm_unpackhi_epi16(t[10], t[13]);
-  u[2] = _mm_unpacklo_epi16(t[11], t[12]);
-  u[3] = _mm_unpackhi_epi16(t[11], t[12]);
-
-  v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16);
-  v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16);
-  v[2] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
-  v[3] = _mm_madd_epi16(u[1], k__cospi_p16_p16);
-  v[4] = _mm_madd_epi16(u[2], k__cospi_m16_p16);
-  v[5] = _mm_madd_epi16(u[3], k__cospi_m16_p16);
-  v[6] = _mm_madd_epi16(u[2], k__cospi_p16_p16);
-  v[7] = _mm_madd_epi16(u[3], k__cospi_p16_p16);
-
-  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
-  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
-  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
-  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
-  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
-  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
-  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
-  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
-
-  u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
-  u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
-  u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
-  u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
-  u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
-  u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
-  u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
-  u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
-
-  s[10] = _mm_packs_epi32(u[0], u[1]);
-  s[13] = _mm_packs_epi32(u[2], u[3]);
-  s[11] = _mm_packs_epi32(u[4], u[5]);
-  s[12] = _mm_packs_epi32(u[6], u[7]);
-  s[14] = t[14];
-  s[15] = t[15];
-
-  // stage 7
-  in[0] = _mm_add_epi16(s[0], s[15]);
-  in[1] = _mm_add_epi16(s[1], s[14]);
-  in[2] = _mm_add_epi16(s[2], s[13]);
-  in[3] = _mm_add_epi16(s[3], s[12]);
-  in[4] = _mm_add_epi16(s[4], s[11]);
-  in[5] = _mm_add_epi16(s[5], s[10]);
-  in[6] = _mm_add_epi16(s[6], s[9]);
-  in[7] = _mm_add_epi16(s[7], s[8]);
-  in[8] = _mm_sub_epi16(s[7], s[8]);
-  in[9] = _mm_sub_epi16(s[6], s[9]);
-  in[10] = _mm_sub_epi16(s[5], s[10]);
-  in[11] = _mm_sub_epi16(s[4], s[11]);
-  in[12] = _mm_sub_epi16(s[3], s[12]);
-  in[13] = _mm_sub_epi16(s[2], s[13]);
-  in[14] = _mm_sub_epi16(s[1], s[14]);
-  in[15] = _mm_sub_epi16(s[0], s[15]);
-}
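
Every stage above follows the same fixed-point idiom: interleave two 8-lane vectors with an unpack, take per-lane dot products against a pair of 14-bit cosine constants via _mm_madd_epi16, add k__DCT_CONST_ROUNDING, arithmetic-shift right by DCT_CONST_BITS, and saturate back to 16 bits with _mm_packs_epi32. A minimal scalar sketch of one such lane follows; the helper names are illustrative, not part of the library.

#include <stdint.h>

#define DCT_CONST_BITS 14
#define DCT_CONST_ROUNDING (1 << (DCT_CONST_BITS - 1))

/* Saturate a 32-bit intermediate to int16, as _mm_packs_epi32 does. */
static int16_t saturate16(int32_t v) {
  return (int16_t)(v > 32767 ? 32767 : (v < -32768 ? -32768 : v));
}

/* One 32-bit lane of _mm_madd_epi16 applied to an unpacked (a, b) pair
 * and a constant pair (ka, kb), followed by the round/shift/pack steps. */
static int16_t butterfly_lane(int16_t a, int16_t b, int16_t ka, int16_t kb) {
  const int32_t sum = a * (int32_t)ka + b * (int32_t)kb;
  return saturate16((sum + DCT_CONST_ROUNDING) >> DCT_CONST_BITS);
}
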
-
-void av1_idct16_sse2(__m128i *in0, __m128i *in1) {
-  array_transpose_16x16(in0, in1);
-  av1_idct16_8col(in0);
-  av1_idct16_8col(in1);
-}
-
-void av1_iadst16_sse2(__m128i *in0, __m128i *in1) {
-  array_transpose_16x16(in0, in1);
-  av1_iadst16_8col(in0);
-  av1_iadst16_8col(in1);
-}
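
Both 16-point drivers share one shape: a full 16x16 transpose turns rows into columns, after which each 8-register half is transformed independently by the *_8col kernel; the driver is invoked once per dimension to realize the separable 2-D transform. A plain-C sketch of one such pass over a 16x16 tile, with illustrative function and type names:

#include <string.h>
#include <stdint.h>

typedef void (*tx1d_fn)(int16_t col[16]); /* stands in for the 8col kernels */

/* One 1-D pass of a separable 16x16 transform: transpose, then run the
 * 1-D kernel down every column, mirroring array_transpose_16x16 followed
 * by the two *_8col calls above. */
static void tx16x16_pass(int16_t m[16][16], tx1d_fn tx) {
  int16_t t[16][16];
  int r, c;
  for (r = 0; r < 16; ++r)
    for (c = 0; c < 16; ++c) t[c][r] = m[r][c];
  for (c = 0; c < 16; ++c) tx(t[c]);
  memcpy(m, t, sizeof(t));
}
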
-
-void av1_idct16x16_10_add_sse2(const int16_t *input, uint8_t *dest,
-                               int stride) {
-  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
-  const __m128i final_rounding = _mm_set1_epi16(1 << 5);
-  const __m128i zero = _mm_setzero_si128();
-
-  const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
-  const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
-  const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
-  const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);
-
-  const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
-  const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
-
-  const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
-  const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
-  const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
-  const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
-  const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
-  const __m128i stg4_7 = pair_set_epi16(-cospi_8_64, cospi_24_64);
-
-  const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
-  __m128i in[16], l[16];
-  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_8,
-      stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15, stp1_8_0,
-      stp1_12_0;
-  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
-      stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14;
-  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
-  int i;
-  // First 1-D inverse DCT
-  // Load input data.
-  in[0] = _mm_load_si128((const __m128i *)input);
-  in[1] = _mm_load_si128((const __m128i *)(input + 8 * 2));
-  in[2] = _mm_load_si128((const __m128i *)(input + 8 * 4));
-  in[3] = _mm_load_si128((const __m128i *)(input + 8 * 6));
-
-  TRANSPOSE_8X4(in[0], in[1], in[2], in[3], in[0], in[1]);
-
-  // Stage2
-  {
-    const __m128i lo_1_15 = _mm_unpackhi_epi16(in[0], zero);
-    const __m128i lo_13_3 = _mm_unpackhi_epi16(zero, in[1]);
-
-    tmp0 = _mm_madd_epi16(lo_1_15, stg2_0);
-    tmp2 = _mm_madd_epi16(lo_1_15, stg2_1);
-    tmp5 = _mm_madd_epi16(lo_13_3, stg2_6);
-    tmp7 = _mm_madd_epi16(lo_13_3, stg2_7);
-
-    tmp0 = _mm_add_epi32(tmp0, rounding);
-    tmp2 = _mm_add_epi32(tmp2, rounding);
-    tmp5 = _mm_add_epi32(tmp5, rounding);
-    tmp7 = _mm_add_epi32(tmp7, rounding);
-
-    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
-    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
-    tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS);
-    tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS);
-
-    stp2_8 = _mm_packs_epi32(tmp0, tmp2);
-    stp2_11 = _mm_packs_epi32(tmp5, tmp7);
-  }
-
-  // Stage3
-  {
-    const __m128i lo_2_14 = _mm_unpacklo_epi16(in[1], zero);
-
-    tmp0 = _mm_madd_epi16(lo_2_14, stg3_0);
-    tmp2 = _mm_madd_epi16(lo_2_14, stg3_1);
-
-    tmp0 = _mm_add_epi32(tmp0, rounding);
-    tmp2 = _mm_add_epi32(tmp2, rounding);
-    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
-    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
-
-    stp1_13 = _mm_unpackhi_epi64(stp2_11, zero);
-    stp1_14 = _mm_unpackhi_epi64(stp2_8, zero);
-
-    stp1_4 = _mm_packs_epi32(tmp0, tmp2);
-  }
-
-  // Stage4
-  {
-    const __m128i lo_0_8 = _mm_unpacklo_epi16(in[0], zero);
-    const __m128i lo_9_14 = _mm_unpacklo_epi16(stp2_8, stp1_14);
-    const __m128i lo_10_13 = _mm_unpacklo_epi16(stp2_11, stp1_13);
-
-    tmp0 = _mm_madd_epi16(lo_0_8, stg4_0);
-    tmp2 = _mm_madd_epi16(lo_0_8, stg4_1);
-    tmp1 = _mm_madd_epi16(lo_9_14, stg4_4);
-    tmp3 = _mm_madd_epi16(lo_9_14, stg4_5);
-    tmp5 = _mm_madd_epi16(lo_10_13, stg4_6);
-    tmp7 = _mm_madd_epi16(lo_10_13, stg4_7);
-
-    tmp0 = _mm_add_epi32(tmp0, rounding);
-    tmp2 = _mm_add_epi32(tmp2, rounding);
-    tmp1 = _mm_add_epi32(tmp1, rounding);
-    tmp3 = _mm_add_epi32(tmp3, rounding);
-    tmp5 = _mm_add_epi32(tmp5, rounding);
-    tmp7 = _mm_add_epi32(tmp7, rounding);
-
-    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
-    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
-    tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS);
-    tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);
-    tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS);
-    tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS);
-
-    stp1_0 = _mm_packs_epi32(tmp0, tmp0);
-    stp1_1 = _mm_packs_epi32(tmp2, tmp2);
-    stp2_9 = _mm_packs_epi32(tmp1, tmp3);
-    stp2_10 = _mm_packs_epi32(tmp5, tmp7);
-
-    stp2_6 = _mm_unpackhi_epi64(stp1_4, zero);
-  }
-
-  // Stage5 and Stage6
-  {
-    tmp0 = _mm_add_epi16(stp2_8, stp2_11);
-    tmp1 = _mm_sub_epi16(stp2_8, stp2_11);
-    tmp2 = _mm_add_epi16(stp2_9, stp2_10);
-    tmp3 = _mm_sub_epi16(stp2_9, stp2_10);
-
-    stp1_9 = _mm_unpacklo_epi64(tmp2, zero);
-    stp1_10 = _mm_unpacklo_epi64(tmp3, zero);
-    stp1_8 = _mm_unpacklo_epi64(tmp0, zero);
-    stp1_11 = _mm_unpacklo_epi64(tmp1, zero);
-
-    stp1_13 = _mm_unpackhi_epi64(tmp3, zero);
-    stp1_14 = _mm_unpackhi_epi64(tmp2, zero);
-    stp1_12 = _mm_unpackhi_epi64(tmp1, zero);
-    stp1_15 = _mm_unpackhi_epi64(tmp0, zero);
-  }
-
-  // Stage6
-  {
-    const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp1_4);
-    const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13);
-    const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12);
-
-    tmp1 = _mm_madd_epi16(lo_6_5, stg4_1);
-    tmp3 = _mm_madd_epi16(lo_6_5, stg4_0);
-    tmp0 = _mm_madd_epi16(lo_10_13, stg6_0);
-    tmp2 = _mm_madd_epi16(lo_10_13, stg4_0);
-    tmp4 = _mm_madd_epi16(lo_11_12, stg6_0);
-    tmp6 = _mm_madd_epi16(lo_11_12, stg4_0);
-
-    tmp1 = _mm_add_epi32(tmp1, rounding);
-    tmp3 = _mm_add_epi32(tmp3, rounding);
-    tmp0 = _mm_add_epi32(tmp0, rounding);
-    tmp2 = _mm_add_epi32(tmp2, rounding);
-    tmp4 = _mm_add_epi32(tmp4, rounding);
-    tmp6 = _mm_add_epi32(tmp6, rounding);
-
-    tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS);
-    tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);
-    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
-    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
-    tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);
-    tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS);
-
-    stp1_6 = _mm_packs_epi32(tmp3, tmp1);
-
-    stp2_10 = _mm_packs_epi32(tmp0, zero);
-    stp2_13 = _mm_packs_epi32(tmp2, zero);
-    stp2_11 = _mm_packs_epi32(tmp4, zero);
-    stp2_12 = _mm_packs_epi32(tmp6, zero);
-
-    tmp0 = _mm_add_epi16(stp1_0, stp1_4);
-    tmp1 = _mm_sub_epi16(stp1_0, stp1_4);
-    tmp2 = _mm_add_epi16(stp1_1, stp1_6);
-    tmp3 = _mm_sub_epi16(stp1_1, stp1_6);
-
-    stp2_0 = _mm_unpackhi_epi64(tmp0, zero);
-    stp2_1 = _mm_unpacklo_epi64(tmp2, zero);
-    stp2_2 = _mm_unpackhi_epi64(tmp2, zero);
-    stp2_3 = _mm_unpacklo_epi64(tmp0, zero);
-    stp2_4 = _mm_unpacklo_epi64(tmp1, zero);
-    stp2_5 = _mm_unpackhi_epi64(tmp3, zero);
-    stp2_6 = _mm_unpacklo_epi64(tmp3, zero);
-    stp2_7 = _mm_unpackhi_epi64(tmp1, zero);
-  }
-
-  // Stage7. Left 8x16 only.
-  l[0] = _mm_add_epi16(stp2_0, stp1_15);
-  l[1] = _mm_add_epi16(stp2_1, stp1_14);
-  l[2] = _mm_add_epi16(stp2_2, stp2_13);
-  l[3] = _mm_add_epi16(stp2_3, stp2_12);
-  l[4] = _mm_add_epi16(stp2_4, stp2_11);
-  l[5] = _mm_add_epi16(stp2_5, stp2_10);
-  l[6] = _mm_add_epi16(stp2_6, stp1_9);
-  l[7] = _mm_add_epi16(stp2_7, stp1_8);
-  l[8] = _mm_sub_epi16(stp2_7, stp1_8);
-  l[9] = _mm_sub_epi16(stp2_6, stp1_9);
-  l[10] = _mm_sub_epi16(stp2_5, stp2_10);
-  l[11] = _mm_sub_epi16(stp2_4, stp2_11);
-  l[12] = _mm_sub_epi16(stp2_3, stp2_12);
-  l[13] = _mm_sub_epi16(stp2_2, stp2_13);
-  l[14] = _mm_sub_epi16(stp2_1, stp1_14);
-  l[15] = _mm_sub_epi16(stp2_0, stp1_15);
-
-  // Second 1-D inverse transform, performed per 8x16 block
-  for (i = 0; i < 2; i++) {
-    int j;
-    array_transpose_4X8(l + 8 * i, in);
-
-    IDCT16_10
-
-    // Stage7
-    in[0] = _mm_add_epi16(stp2_0, stp1_15);
-    in[1] = _mm_add_epi16(stp2_1, stp1_14);
-    in[2] = _mm_add_epi16(stp2_2, stp2_13);
-    in[3] = _mm_add_epi16(stp2_3, stp2_12);
-    in[4] = _mm_add_epi16(stp2_4, stp2_11);
-    in[5] = _mm_add_epi16(stp2_5, stp2_10);
-    in[6] = _mm_add_epi16(stp2_6, stp1_9);
-    in[7] = _mm_add_epi16(stp2_7, stp1_8);
-    in[8] = _mm_sub_epi16(stp2_7, stp1_8);
-    in[9] = _mm_sub_epi16(stp2_6, stp1_9);
-    in[10] = _mm_sub_epi16(stp2_5, stp2_10);
-    in[11] = _mm_sub_epi16(stp2_4, stp2_11);
-    in[12] = _mm_sub_epi16(stp2_3, stp2_12);
-    in[13] = _mm_sub_epi16(stp2_2, stp2_13);
-    in[14] = _mm_sub_epi16(stp2_1, stp1_14);
-    in[15] = _mm_sub_epi16(stp2_0, stp1_15);
-
-    for (j = 0; j < 16; ++j) {
-      // Final rounding and shift
-      in[j] = _mm_adds_epi16(in[j], final_rounding);
-      in[j] = _mm_srai_epi16(in[j], 6);
-      RECON_AND_STORE(dest + j * stride, in[j]);
-    }
-
-    dest += 8;
-  }
-}
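
The closing loop adds a bias of 1 << 5 and shifts by 6 to undo the transform scaling, after which RECON_AND_STORE adds the residual to the prediction in dest with unsigned 8-bit saturation. A scalar sketch of that final step; clip_pixel here is a local stand-in rather than the library helper.

#include <stdint.h>

static uint8_t clip_pixel(int v) {
  return (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
}

/* Final rounding plus reconstruction for one row of n residuals,
 * matching _mm_adds_epi16(in, 1 << 5), _mm_srai_epi16(in, 6) and the
 * saturating pack inside RECON_AND_STORE. */
static void recon_row(const int16_t *residual, uint8_t *dest, int n) {
  int j;
  for (j = 0; j < n; ++j) {
    const int r = (residual[j] + (1 << 5)) >> 6;
    dest[j] = clip_pixel(dest[j] + r);
  }
}
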
-
-#define LOAD_DQCOEFF(reg, input)                  \
-  {                                               \
-    reg = _mm_load_si128((const __m128i *)input); \
-    input += 8;                                   \
-  }
-
-#define IDCT32_34                                                              \
-  /* Stage1 */                                                                 \
-  {                                                                            \
-    const __m128i lo_1_31 = _mm_unpacklo_epi16(in[1], zero);                   \
-    const __m128i hi_1_31 = _mm_unpackhi_epi16(in[1], zero);                   \
-                                                                               \
-    const __m128i lo_25_7 = _mm_unpacklo_epi16(zero, in[7]);                   \
-    const __m128i hi_25_7 = _mm_unpackhi_epi16(zero, in[7]);                   \
-                                                                               \
-    const __m128i lo_5_27 = _mm_unpacklo_epi16(in[5], zero);                   \
-    const __m128i hi_5_27 = _mm_unpackhi_epi16(in[5], zero);                   \
-                                                                               \
-    const __m128i lo_29_3 = _mm_unpacklo_epi16(zero, in[3]);                   \
-    const __m128i hi_29_3 = _mm_unpackhi_epi16(zero, in[3]);                   \
-                                                                               \
-    MULTIPLICATION_AND_ADD_2(lo_1_31, hi_1_31, stg1_0, stg1_1, stp1_16,        \
-                             stp1_31);                                         \
-    MULTIPLICATION_AND_ADD_2(lo_25_7, hi_25_7, stg1_6, stg1_7, stp1_19,        \
-                             stp1_28);                                         \
-    MULTIPLICATION_AND_ADD_2(lo_5_27, hi_5_27, stg1_8, stg1_9, stp1_20,        \
-                             stp1_27);                                         \
-    MULTIPLICATION_AND_ADD_2(lo_29_3, hi_29_3, stg1_14, stg1_15, stp1_23,      \
-                             stp1_24);                                         \
-  }                                                                            \
-                                                                               \
-  /* Stage2 */                                                                 \
-  {                                                                            \
-    const __m128i lo_2_30 = _mm_unpacklo_epi16(in[2], zero);                   \
-    const __m128i hi_2_30 = _mm_unpackhi_epi16(in[2], zero);                   \
-                                                                               \
-    const __m128i lo_26_6 = _mm_unpacklo_epi16(zero, in[6]);                   \
-    const __m128i hi_26_6 = _mm_unpackhi_epi16(zero, in[6]);                   \
-                                                                               \
-    MULTIPLICATION_AND_ADD_2(lo_2_30, hi_2_30, stg2_0, stg2_1, stp2_8,         \
-                             stp2_15);                                         \
-    MULTIPLICATION_AND_ADD_2(lo_26_6, hi_26_6, stg2_6, stg2_7, stp2_11,        \
-                             stp2_12);                                         \
-                                                                               \
-    stp2_16 = stp1_16;                                                         \
-    stp2_19 = stp1_19;                                                         \
-                                                                               \
-    stp2_20 = stp1_20;                                                         \
-    stp2_23 = stp1_23;                                                         \
-                                                                               \
-    stp2_24 = stp1_24;                                                         \
-    stp2_27 = stp1_27;                                                         \
-                                                                               \
-    stp2_28 = stp1_28;                                                         \
-    stp2_31 = stp1_31;                                                         \
-  }                                                                            \
-                                                                               \
-  /* Stage3 */                                                                 \
-  {                                                                            \
-    const __m128i lo_4_28 = _mm_unpacklo_epi16(in[4], zero);                   \
-    const __m128i hi_4_28 = _mm_unpackhi_epi16(in[4], zero);                   \
-                                                                               \
-    const __m128i lo_17_30 = _mm_unpacklo_epi16(stp1_16, stp1_31);             \
-    const __m128i hi_17_30 = _mm_unpackhi_epi16(stp1_16, stp1_31);             \
-    const __m128i lo_18_29 = _mm_unpacklo_epi16(stp1_19, stp1_28);             \
-    const __m128i hi_18_29 = _mm_unpackhi_epi16(stp1_19, stp1_28);             \
-                                                                               \
-    const __m128i lo_21_26 = _mm_unpacklo_epi16(stp1_20, stp1_27);             \
-    const __m128i hi_21_26 = _mm_unpackhi_epi16(stp1_20, stp1_27);             \
-    const __m128i lo_22_25 = _mm_unpacklo_epi16(stp1_23, stp1_24);             \
-    const __m128i hi_22_25 = _mm_unpackhi_epi16(stp1_23, stp1_24);             \
-                                                                               \
-    MULTIPLICATION_AND_ADD_2(lo_4_28, hi_4_28, stg3_0, stg3_1, stp1_4,         \
-                             stp1_7);                                          \
-                                                                               \
-    stp1_8 = stp2_8;                                                           \
-    stp1_11 = stp2_11;                                                         \
-    stp1_12 = stp2_12;                                                         \
-    stp1_15 = stp2_15;                                                         \
-                                                                               \
-    MULTIPLICATION_AND_ADD(lo_17_30, hi_17_30, lo_18_29, hi_18_29, stg3_4,     \
-                           stg3_5, stg3_6, stg3_4, stp1_17, stp1_30, stp1_18,  \
-                           stp1_29)                                            \
-    MULTIPLICATION_AND_ADD(lo_21_26, hi_21_26, lo_22_25, hi_22_25, stg3_8,     \
-                           stg3_9, stg3_10, stg3_8, stp1_21, stp1_26, stp1_22, \
-                           stp1_25)                                            \
-                                                                               \
-    stp1_16 = stp2_16;                                                         \
-    stp1_31 = stp2_31;                                                         \
-    stp1_19 = stp2_19;                                                         \
-    stp1_20 = stp2_20;                                                         \
-    stp1_23 = stp2_23;                                                         \
-    stp1_24 = stp2_24;                                                         \
-    stp1_27 = stp2_27;                                                         \
-    stp1_28 = stp2_28;                                                         \
-  }                                                                            \
-                                                                               \
-  /* Stage4 */                                                                 \
-  {                                                                            \
-    const __m128i lo_0_16 = _mm_unpacklo_epi16(in[0], zero);                   \
-    const __m128i hi_0_16 = _mm_unpackhi_epi16(in[0], zero);                   \
-                                                                               \
-    const __m128i lo_9_14 = _mm_unpacklo_epi16(stp2_8, stp2_15);               \
-    const __m128i hi_9_14 = _mm_unpackhi_epi16(stp2_8, stp2_15);               \
-    const __m128i lo_10_13 = _mm_unpacklo_epi16(stp2_11, stp2_12);             \
-    const __m128i hi_10_13 = _mm_unpackhi_epi16(stp2_11, stp2_12);             \
-                                                                               \
-    MULTIPLICATION_AND_ADD_2(lo_0_16, hi_0_16, stg4_0, stg4_1, stp2_0,         \
-                             stp2_1);                                          \
-                                                                               \
-    stp2_4 = stp1_4;                                                           \
-    stp2_5 = stp1_4;                                                           \
-    stp2_6 = stp1_7;                                                           \
-    stp2_7 = stp1_7;                                                           \
-                                                                               \
-    MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4,       \
-                           stg4_5, stg4_6, stg4_4, stp2_9, stp2_14, stp2_10,   \
-                           stp2_13)                                            \
-                                                                               \
-    stp2_8 = stp1_8;                                                           \
-    stp2_15 = stp1_15;                                                         \
-    stp2_11 = stp1_11;                                                         \
-    stp2_12 = stp1_12;                                                         \
-                                                                               \
-    stp2_16 = _mm_add_epi16(stp1_16, stp1_19);                                 \
-    stp2_17 = _mm_add_epi16(stp1_17, stp1_18);                                 \
-    stp2_18 = _mm_sub_epi16(stp1_17, stp1_18);                                 \
-    stp2_19 = _mm_sub_epi16(stp1_16, stp1_19);                                 \
-    stp2_20 = _mm_sub_epi16(stp1_23, stp1_20);                                 \
-    stp2_21 = _mm_sub_epi16(stp1_22, stp1_21);                                 \
-    stp2_22 = _mm_add_epi16(stp1_22, stp1_21);                                 \
-    stp2_23 = _mm_add_epi16(stp1_23, stp1_20);                                 \
-                                                                               \
-    stp2_24 = _mm_add_epi16(stp1_24, stp1_27);                                 \
-    stp2_25 = _mm_add_epi16(stp1_25, stp1_26);                                 \
-    stp2_26 = _mm_sub_epi16(stp1_25, stp1_26);                                 \
-    stp2_27 = _mm_sub_epi16(stp1_24, stp1_27);                                 \
-    stp2_28 = _mm_sub_epi16(stp1_31, stp1_28);                                 \
-    stp2_29 = _mm_sub_epi16(stp1_30, stp1_29);                                 \
-    stp2_30 = _mm_add_epi16(stp1_29, stp1_30);                                 \
-    stp2_31 = _mm_add_epi16(stp1_28, stp1_31);                                 \
-  }                                                                            \
-                                                                               \
-  /* Stage5 */                                                                 \
-  {                                                                            \
-    const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5);                 \
-    const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5);                 \
-    const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29);             \
-    const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29);             \
-                                                                               \
-    const __m128i lo_19_28 = _mm_unpacklo_epi16(stp2_19, stp2_28);             \
-    const __m128i hi_19_28 = _mm_unpackhi_epi16(stp2_19, stp2_28);             \
-    const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27);             \
-    const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27);             \
-                                                                               \
-    const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26);             \
-    const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26);             \
-                                                                               \
-    stp1_0 = stp2_0;                                                           \
-    stp1_1 = stp2_1;                                                           \
-    stp1_2 = stp2_1;                                                           \
-    stp1_3 = stp2_0;                                                           \
-                                                                               \
-    tmp0 = _mm_madd_epi16(lo_6_5, stg4_1);                                     \
-    tmp1 = _mm_madd_epi16(hi_6_5, stg4_1);                                     \
-    tmp2 = _mm_madd_epi16(lo_6_5, stg4_0);                                     \
-    tmp3 = _mm_madd_epi16(hi_6_5, stg4_0);                                     \
-                                                                               \
-    tmp0 = _mm_add_epi32(tmp0, rounding);                                      \
-    tmp1 = _mm_add_epi32(tmp1, rounding);                                      \
-    tmp2 = _mm_add_epi32(tmp2, rounding);                                      \
-    tmp3 = _mm_add_epi32(tmp3, rounding);                                      \
-                                                                               \
-    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);                               \
-    tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS);                               \
-    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);                               \
-    tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);                               \
-                                                                               \
-    stp1_5 = _mm_packs_epi32(tmp0, tmp1);                                      \
-    stp1_6 = _mm_packs_epi32(tmp2, tmp3);                                      \
-                                                                               \
-    stp1_4 = stp2_4;                                                           \
-    stp1_7 = stp2_7;                                                           \
-                                                                               \
-    stp1_8 = _mm_add_epi16(stp2_8, stp2_11);                                   \
-    stp1_9 = _mm_add_epi16(stp2_9, stp2_10);                                   \
-    stp1_10 = _mm_sub_epi16(stp2_9, stp2_10);                                  \
-    stp1_11 = _mm_sub_epi16(stp2_8, stp2_11);                                  \
-    stp1_12 = _mm_sub_epi16(stp2_15, stp2_12);                                 \
-    stp1_13 = _mm_sub_epi16(stp2_14, stp2_13);                                 \
-    stp1_14 = _mm_add_epi16(stp2_14, stp2_13);                                 \
-    stp1_15 = _mm_add_epi16(stp2_15, stp2_12);                                 \
-                                                                               \
-    stp1_16 = stp2_16;                                                         \
-    stp1_17 = stp2_17;                                                         \
-                                                                               \
-    MULTIPLICATION_AND_ADD(lo_18_29, hi_18_29, lo_19_28, hi_19_28, stg4_4,     \
-                           stg4_5, stg4_4, stg4_5, stp1_18, stp1_29, stp1_19,  \
-                           stp1_28)                                            \
-    MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg4_6,     \
-                           stg4_4, stg4_6, stg4_4, stp1_20, stp1_27, stp1_21,  \
-                           stp1_26)                                            \
-                                                                               \
-    stp1_22 = stp2_22;                                                         \
-    stp1_23 = stp2_23;                                                         \
-    stp1_24 = stp2_24;                                                         \
-    stp1_25 = stp2_25;                                                         \
-    stp1_30 = stp2_30;                                                         \
-    stp1_31 = stp2_31;                                                         \
-  }                                                                            \
-                                                                               \
-  /* Stage6 */                                                                 \
-  {                                                                            \
-    const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13);             \
-    const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13);             \
-    const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12);             \
-    const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12);             \
-                                                                               \
-    stp2_0 = _mm_add_epi16(stp1_0, stp1_7);                                    \
-    stp2_1 = _mm_add_epi16(stp1_1, stp1_6);                                    \
-    stp2_2 = _mm_add_epi16(stp1_2, stp1_5);                                    \
-    stp2_3 = _mm_add_epi16(stp1_3, stp1_4);                                    \
-    stp2_4 = _mm_sub_epi16(stp1_3, stp1_4);                                    \
-    stp2_5 = _mm_sub_epi16(stp1_2, stp1_5);                                    \
-    stp2_6 = _mm_sub_epi16(stp1_1, stp1_6);                                    \
-    stp2_7 = _mm_sub_epi16(stp1_0, stp1_7);                                    \
-                                                                               \
-    stp2_8 = stp1_8;                                                           \
-    stp2_9 = stp1_9;                                                           \
-    stp2_14 = stp1_14;                                                         \
-    stp2_15 = stp1_15;                                                         \
-                                                                               \
-    MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, stg6_0,     \
-                           stg4_0, stg6_0, stg4_0, stp2_10, stp2_13, stp2_11,  \
-                           stp2_12)                                            \
-                                                                               \
-    stp2_16 = _mm_add_epi16(stp1_16, stp1_23);                                 \
-    stp2_17 = _mm_add_epi16(stp1_17, stp1_22);                                 \
-    stp2_18 = _mm_add_epi16(stp1_18, stp1_21);                                 \
-    stp2_19 = _mm_add_epi16(stp1_19, stp1_20);                                 \
-    stp2_20 = _mm_sub_epi16(stp1_19, stp1_20);                                 \
-    stp2_21 = _mm_sub_epi16(stp1_18, stp1_21);                                 \
-    stp2_22 = _mm_sub_epi16(stp1_17, stp1_22);                                 \
-    stp2_23 = _mm_sub_epi16(stp1_16, stp1_23);                                 \
-                                                                               \
-    stp2_24 = _mm_sub_epi16(stp1_31, stp1_24);                                 \
-    stp2_25 = _mm_sub_epi16(stp1_30, stp1_25);                                 \
-    stp2_26 = _mm_sub_epi16(stp1_29, stp1_26);                                 \
-    stp2_27 = _mm_sub_epi16(stp1_28, stp1_27);                                 \
-    stp2_28 = _mm_add_epi16(stp1_27, stp1_28);                                 \
-    stp2_29 = _mm_add_epi16(stp1_26, stp1_29);                                 \
-    stp2_30 = _mm_add_epi16(stp1_25, stp1_30);                                 \
-    stp2_31 = _mm_add_epi16(stp1_24, stp1_31);                                 \
-  }                                                                            \
-                                                                               \
-  /* Stage7 */                                                                 \
-  {                                                                            \
-    const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27);             \
-    const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27);             \
-    const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26);             \
-    const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26);             \
-                                                                               \
-    const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25);             \
-    const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25);             \
-    const __m128i lo_23_24 = _mm_unpacklo_epi16(stp2_23, stp2_24);             \
-    const __m128i hi_23_24 = _mm_unpackhi_epi16(stp2_23, stp2_24);             \
-                                                                               \
-    stp1_0 = _mm_add_epi16(stp2_0, stp2_15);                                   \
-    stp1_1 = _mm_add_epi16(stp2_1, stp2_14);                                   \
-    stp1_2 = _mm_add_epi16(stp2_2, stp2_13);                                   \
-    stp1_3 = _mm_add_epi16(stp2_3, stp2_12);                                   \
-    stp1_4 = _mm_add_epi16(stp2_4, stp2_11);                                   \
-    stp1_5 = _mm_add_epi16(stp2_5, stp2_10);                                   \
-    stp1_6 = _mm_add_epi16(stp2_6, stp2_9);                                    \
-    stp1_7 = _mm_add_epi16(stp2_7, stp2_8);                                    \
-    stp1_8 = _mm_sub_epi16(stp2_7, stp2_8);                                    \
-    stp1_9 = _mm_sub_epi16(stp2_6, stp2_9);                                    \
-    stp1_10 = _mm_sub_epi16(stp2_5, stp2_10);                                  \
-    stp1_11 = _mm_sub_epi16(stp2_4, stp2_11);                                  \
-    stp1_12 = _mm_sub_epi16(stp2_3, stp2_12);                                  \
-    stp1_13 = _mm_sub_epi16(stp2_2, stp2_13);                                  \
-    stp1_14 = _mm_sub_epi16(stp2_1, stp2_14);                                  \
-    stp1_15 = _mm_sub_epi16(stp2_0, stp2_15);                                  \
-                                                                               \
-    stp1_16 = stp2_16;                                                         \
-    stp1_17 = stp2_17;                                                         \
-    stp1_18 = stp2_18;                                                         \
-    stp1_19 = stp2_19;                                                         \
-                                                                               \
-    MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg6_0,     \
-                           stg4_0, stg6_0, stg4_0, stp1_20, stp1_27, stp1_21,  \
-                           stp1_26)                                            \
-    MULTIPLICATION_AND_ADD(lo_22_25, hi_22_25, lo_23_24, hi_23_24, stg6_0,     \
-                           stg4_0, stg6_0, stg4_0, stp1_22, stp1_25, stp1_23,  \
-                           stp1_24)                                            \
-                                                                               \
-    stp1_28 = stp2_28;                                                         \
-    stp1_29 = stp2_29;                                                         \
-    stp1_30 = stp2_30;                                                         \
-    stp1_31 = stp2_31;                                                         \
-  }
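
IDCT32_34 is the reduced path for blocks whose nonzero coefficients all fall in the first eight input rows (the eob <= 34 case), which is why every Stage1/Stage2 input above is interleaved against zero: one lane of each madd vanishes, and MULTIPLICATION_AND_ADD_2 on unpack(x, 0) degenerates to two independent products of x with a single constant each. A scalar sketch of that degenerate butterfly, with illustrative names:

#include <stdint.h>

#define DCT_CONST_BITS 14
#define DCT_CONST_ROUNDING (1 << (DCT_CONST_BITS - 1))

/* With the partner coefficient known to be zero, the paired madd
 * reduces to x * c per output, then the usual round and shift. */
static void half_butterfly(int16_t x, int16_t c0, int16_t c1,
                           int16_t *out0, int16_t *out1) {
  *out0 = (int16_t)((x * (int32_t)c0 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS);
  *out1 = (int16_t)((x * (int32_t)c1 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS);
}
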
-
-#define IDCT32                                                                 \
-  /* Stage1 */                                                                 \
-  {                                                                            \
-    const __m128i lo_1_31 = _mm_unpacklo_epi16(in[1], in[31]);                 \
-    const __m128i hi_1_31 = _mm_unpackhi_epi16(in[1], in[31]);                 \
-    const __m128i lo_17_15 = _mm_unpacklo_epi16(in[17], in[15]);               \
-    const __m128i hi_17_15 = _mm_unpackhi_epi16(in[17], in[15]);               \
-                                                                               \
-    const __m128i lo_9_23 = _mm_unpacklo_epi16(in[9], in[23]);                 \
-    const __m128i hi_9_23 = _mm_unpackhi_epi16(in[9], in[23]);                 \
-    const __m128i lo_25_7 = _mm_unpacklo_epi16(in[25], in[7]);                 \
-    const __m128i hi_25_7 = _mm_unpackhi_epi16(in[25], in[7]);                 \
-                                                                               \
-    const __m128i lo_5_27 = _mm_unpacklo_epi16(in[5], in[27]);                 \
-    const __m128i hi_5_27 = _mm_unpackhi_epi16(in[5], in[27]);                 \
-    const __m128i lo_21_11 = _mm_unpacklo_epi16(in[21], in[11]);               \
-    const __m128i hi_21_11 = _mm_unpackhi_epi16(in[21], in[11]);               \
-                                                                               \
-    const __m128i lo_13_19 = _mm_unpacklo_epi16(in[13], in[19]);               \
-    const __m128i hi_13_19 = _mm_unpackhi_epi16(in[13], in[19]);               \
-    const __m128i lo_29_3 = _mm_unpacklo_epi16(in[29], in[3]);                 \
-    const __m128i hi_29_3 = _mm_unpackhi_epi16(in[29], in[3]);                 \
-                                                                               \
-    MULTIPLICATION_AND_ADD(lo_1_31, hi_1_31, lo_17_15, hi_17_15, stg1_0,       \
-                           stg1_1, stg1_2, stg1_3, stp1_16, stp1_31, stp1_17,  \
-                           stp1_30)                                            \
-    MULTIPLICATION_AND_ADD(lo_9_23, hi_9_23, lo_25_7, hi_25_7, stg1_4, stg1_5, \
-                           stg1_6, stg1_7, stp1_18, stp1_29, stp1_19, stp1_28) \
-    MULTIPLICATION_AND_ADD(lo_5_27, hi_5_27, lo_21_11, hi_21_11, stg1_8,       \
-                           stg1_9, stg1_10, stg1_11, stp1_20, stp1_27,         \
-                           stp1_21, stp1_26)                                   \
-    MULTIPLICATION_AND_ADD(lo_13_19, hi_13_19, lo_29_3, hi_29_3, stg1_12,      \
-                           stg1_13, stg1_14, stg1_15, stp1_22, stp1_25,        \
-                           stp1_23, stp1_24)                                   \
-  }                                                                            \
-                                                                               \
-  /* Stage2 */                                                                 \
-  {                                                                            \
-    const __m128i lo_2_30 = _mm_unpacklo_epi16(in[2], in[30]);                 \
-    const __m128i hi_2_30 = _mm_unpackhi_epi16(in[2], in[30]);                 \
-    const __m128i lo_18_14 = _mm_unpacklo_epi16(in[18], in[14]);               \
-    const __m128i hi_18_14 = _mm_unpackhi_epi16(in[18], in[14]);               \
-                                                                               \
-    const __m128i lo_10_22 = _mm_unpacklo_epi16(in[10], in[22]);               \
-    const __m128i hi_10_22 = _mm_unpackhi_epi16(in[10], in[22]);               \
-    const __m128i lo_26_6 = _mm_unpacklo_epi16(in[26], in[6]);                 \
-    const __m128i hi_26_6 = _mm_unpackhi_epi16(in[26], in[6]);                 \
-                                                                               \
-    MULTIPLICATION_AND_ADD(lo_2_30, hi_2_30, lo_18_14, hi_18_14, stg2_0,       \
-                           stg2_1, stg2_2, stg2_3, stp2_8, stp2_15, stp2_9,    \
-                           stp2_14)                                            \
-    MULTIPLICATION_AND_ADD(lo_10_22, hi_10_22, lo_26_6, hi_26_6, stg2_4,       \
-                           stg2_5, stg2_6, stg2_7, stp2_10, stp2_13, stp2_11,  \
-                           stp2_12)                                            \
-                                                                               \
-    stp2_16 = _mm_add_epi16(stp1_16, stp1_17);                                 \
-    stp2_17 = _mm_sub_epi16(stp1_16, stp1_17);                                 \
-    stp2_18 = _mm_sub_epi16(stp1_19, stp1_18);                                 \
-    stp2_19 = _mm_add_epi16(stp1_19, stp1_18);                                 \
-                                                                               \
-    stp2_20 = _mm_add_epi16(stp1_20, stp1_21);                                 \
-    stp2_21 = _mm_sub_epi16(stp1_20, stp1_21);                                 \
-    stp2_22 = _mm_sub_epi16(stp1_23, stp1_22);                                 \
-    stp2_23 = _mm_add_epi16(stp1_23, stp1_22);                                 \
-                                                                               \
-    stp2_24 = _mm_add_epi16(stp1_24, stp1_25);                                 \
-    stp2_25 = _mm_sub_epi16(stp1_24, stp1_25);                                 \
-    stp2_26 = _mm_sub_epi16(stp1_27, stp1_26);                                 \
-    stp2_27 = _mm_add_epi16(stp1_27, stp1_26);                                 \
-                                                                               \
-    stp2_28 = _mm_add_epi16(stp1_28, stp1_29);                                 \
-    stp2_29 = _mm_sub_epi16(stp1_28, stp1_29);                                 \
-    stp2_30 = _mm_sub_epi16(stp1_31, stp1_30);                                 \
-    stp2_31 = _mm_add_epi16(stp1_31, stp1_30);                                 \
-  }                                                                            \
-                                                                               \
-  /* Stage3 */                                                                 \
-  {                                                                            \
-    const __m128i lo_4_28 = _mm_unpacklo_epi16(in[4], in[28]);                 \
-    const __m128i hi_4_28 = _mm_unpackhi_epi16(in[4], in[28]);                 \
-    const __m128i lo_20_12 = _mm_unpacklo_epi16(in[20], in[12]);               \
-    const __m128i hi_20_12 = _mm_unpackhi_epi16(in[20], in[12]);               \
-                                                                               \
-    const __m128i lo_17_30 = _mm_unpacklo_epi16(stp2_17, stp2_30);             \
-    const __m128i hi_17_30 = _mm_unpackhi_epi16(stp2_17, stp2_30);             \
-    const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29);             \
-    const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29);             \
-                                                                               \
-    const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26);             \
-    const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26);             \
-    const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25);             \
-    const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25);             \
-                                                                               \
-    MULTIPLICATION_AND_ADD(lo_4_28, hi_4_28, lo_20_12, hi_20_12, stg3_0,       \
-                           stg3_1, stg3_2, stg3_3, stp1_4, stp1_7, stp1_5,     \
-                           stp1_6)                                             \
-                                                                               \
-    stp1_8 = _mm_add_epi16(stp2_8, stp2_9);                                    \
-    stp1_9 = _mm_sub_epi16(stp2_8, stp2_9);                                    \
-    stp1_10 = _mm_sub_epi16(stp2_11, stp2_10);                                 \
-    stp1_11 = _mm_add_epi16(stp2_11, stp2_10);                                 \
-    stp1_12 = _mm_add_epi16(stp2_12, stp2_13);                                 \
-    stp1_13 = _mm_sub_epi16(stp2_12, stp2_13);                                 \
-    stp1_14 = _mm_sub_epi16(stp2_15, stp2_14);                                 \
-    stp1_15 = _mm_add_epi16(stp2_15, stp2_14);                                 \
-                                                                               \
-    MULTIPLICATION_AND_ADD(lo_17_30, hi_17_30, lo_18_29, hi_18_29, stg3_4,     \
-                           stg3_5, stg3_6, stg3_4, stp1_17, stp1_30, stp1_18,  \
-                           stp1_29)                                            \
-    MULTIPLICATION_AND_ADD(lo_21_26, hi_21_26, lo_22_25, hi_22_25, stg3_8,     \
-                           stg3_9, stg3_10, stg3_8, stp1_21, stp1_26, stp1_22, \
-                           stp1_25)                                            \
-                                                                               \
-    stp1_16 = stp2_16;                                                         \
-    stp1_31 = stp2_31;                                                         \
-    stp1_19 = stp2_19;                                                         \
-    stp1_20 = stp2_20;                                                         \
-    stp1_23 = stp2_23;                                                         \
-    stp1_24 = stp2_24;                                                         \
-    stp1_27 = stp2_27;                                                         \
-    stp1_28 = stp2_28;                                                         \
-  }                                                                            \
-                                                                               \
-  /* Stage4 */                                                                 \
-  {                                                                            \
-    const __m128i lo_0_16 = _mm_unpacklo_epi16(in[0], in[16]);                 \
-    const __m128i hi_0_16 = _mm_unpackhi_epi16(in[0], in[16]);                 \
-    const __m128i lo_8_24 = _mm_unpacklo_epi16(in[8], in[24]);                 \
-    const __m128i hi_8_24 = _mm_unpackhi_epi16(in[8], in[24]);                 \
-                                                                               \
-    const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14);               \
-    const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14);               \
-    const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13);             \
-    const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13);             \
-                                                                               \
-    MULTIPLICATION_AND_ADD(lo_0_16, hi_0_16, lo_8_24, hi_8_24, stg4_0, stg4_1, \
-                           stg4_2, stg4_3, stp2_0, stp2_1, stp2_2, stp2_3)     \
-                                                                               \
-    stp2_4 = _mm_add_epi16(stp1_4, stp1_5);                                    \
-    stp2_5 = _mm_sub_epi16(stp1_4, stp1_5);                                    \
-    stp2_6 = _mm_sub_epi16(stp1_7, stp1_6);                                    \
-    stp2_7 = _mm_add_epi16(stp1_7, stp1_6);                                    \
-                                                                               \
-    MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4,       \
-                           stg4_5, stg4_6, stg4_4, stp2_9, stp2_14, stp2_10,   \
-                           stp2_13)                                            \
-                                                                               \
-    stp2_8 = stp1_8;                                                           \
-    stp2_15 = stp1_15;                                                         \
-    stp2_11 = stp1_11;                                                         \
-    stp2_12 = stp1_12;                                                         \
-                                                                               \
-    stp2_16 = _mm_add_epi16(stp1_16, stp1_19);                                 \
-    stp2_17 = _mm_add_epi16(stp1_17, stp1_18);                                 \
-    stp2_18 = _mm_sub_epi16(stp1_17, stp1_18);                                 \
-    stp2_19 = _mm_sub_epi16(stp1_16, stp1_19);                                 \
-    stp2_20 = _mm_sub_epi16(stp1_23, stp1_20);                                 \
-    stp2_21 = _mm_sub_epi16(stp1_22, stp1_21);                                 \
-    stp2_22 = _mm_add_epi16(stp1_22, stp1_21);                                 \
-    stp2_23 = _mm_add_epi16(stp1_23, stp1_20);                                 \
-                                                                               \
-    stp2_24 = _mm_add_epi16(stp1_24, stp1_27);                                 \
-    stp2_25 = _mm_add_epi16(stp1_25, stp1_26);                                 \
-    stp2_26 = _mm_sub_epi16(stp1_25, stp1_26);                                 \
-    stp2_27 = _mm_sub_epi16(stp1_24, stp1_27);                                 \
-    stp2_28 = _mm_sub_epi16(stp1_31, stp1_28);                                 \
-    stp2_29 = _mm_sub_epi16(stp1_30, stp1_29);                                 \
-    stp2_30 = _mm_add_epi16(stp1_29, stp1_30);                                 \
-    stp2_31 = _mm_add_epi16(stp1_28, stp1_31);                                 \
-  }                                                                            \
-                                                                               \
-  /* Stage5 */                                                                 \
-  {                                                                            \
-    const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5);                 \
-    const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5);                 \
-    const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29);             \
-    const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29);             \
-                                                                               \
-    const __m128i lo_19_28 = _mm_unpacklo_epi16(stp2_19, stp2_28);             \
-    const __m128i hi_19_28 = _mm_unpackhi_epi16(stp2_19, stp2_28);             \
-    const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27);             \
-    const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27);             \
-                                                                               \
-    const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26);             \
-    const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26);             \
-                                                                               \
-    stp1_0 = _mm_add_epi16(stp2_0, stp2_3);                                    \
-    stp1_1 = _mm_add_epi16(stp2_1, stp2_2);                                    \
-    stp1_2 = _mm_sub_epi16(stp2_1, stp2_2);                                    \
-    stp1_3 = _mm_sub_epi16(stp2_0, stp2_3);                                    \
-                                                                               \
-    tmp0 = _mm_madd_epi16(lo_6_5, stg4_1);                                     \
-    tmp1 = _mm_madd_epi16(hi_6_5, stg4_1);                                     \
-    tmp2 = _mm_madd_epi16(lo_6_5, stg4_0);                                     \
-    tmp3 = _mm_madd_epi16(hi_6_5, stg4_0);                                     \
-                                                                               \
-    tmp0 = _mm_add_epi32(tmp0, rounding);                                      \
-    tmp1 = _mm_add_epi32(tmp1, rounding);                                      \
-    tmp2 = _mm_add_epi32(tmp2, rounding);                                      \
-    tmp3 = _mm_add_epi32(tmp3, rounding);                                      \
-                                                                               \
-    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);                               \
-    tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS);                               \
-    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);                               \
-    tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);                               \
-                                                                               \
-    stp1_5 = _mm_packs_epi32(tmp0, tmp1);                                      \
-    stp1_6 = _mm_packs_epi32(tmp2, tmp3);                                      \
-                                                                               \
-    stp1_4 = stp2_4;                                                           \
-    stp1_7 = stp2_7;                                                           \
-                                                                               \
-    stp1_8 = _mm_add_epi16(stp2_8, stp2_11);                                   \
-    stp1_9 = _mm_add_epi16(stp2_9, stp2_10);                                   \
-    stp1_10 = _mm_sub_epi16(stp2_9, stp2_10);                                  \
-    stp1_11 = _mm_sub_epi16(stp2_8, stp2_11);                                  \
-    stp1_12 = _mm_sub_epi16(stp2_15, stp2_12);                                 \
-    stp1_13 = _mm_sub_epi16(stp2_14, stp2_13);                                 \
-    stp1_14 = _mm_add_epi16(stp2_14, stp2_13);                                 \
-    stp1_15 = _mm_add_epi16(stp2_15, stp2_12);                                 \
-                                                                               \
-    stp1_16 = stp2_16;                                                         \
-    stp1_17 = stp2_17;                                                         \
-                                                                               \
-    MULTIPLICATION_AND_ADD(lo_18_29, hi_18_29, lo_19_28, hi_19_28, stg4_4,     \
-                           stg4_5, stg4_4, stg4_5, stp1_18, stp1_29, stp1_19,  \
-                           stp1_28)                                            \
-    MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg4_6,     \
-                           stg4_4, stg4_6, stg4_4, stp1_20, stp1_27, stp1_21,  \
-                           stp1_26)                                            \
-                                                                               \
-    stp1_22 = stp2_22;                                                         \
-    stp1_23 = stp2_23;                                                         \
-    stp1_24 = stp2_24;                                                         \
-    stp1_25 = stp2_25;                                                         \
-    stp1_30 = stp2_30;                                                         \
-    stp1_31 = stp2_31;                                                         \
-  }                                                                            \
-                                                                               \
-  /* Stage6 */                                                                 \
-  {                                                                            \
-    const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13);             \
-    const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13);             \
-    const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12);             \
-    const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12);             \
-                                                                               \
-    stp2_0 = _mm_add_epi16(stp1_0, stp1_7);                                    \
-    stp2_1 = _mm_add_epi16(stp1_1, stp1_6);                                    \
-    stp2_2 = _mm_add_epi16(stp1_2, stp1_5);                                    \
-    stp2_3 = _mm_add_epi16(stp1_3, stp1_4);                                    \
-    stp2_4 = _mm_sub_epi16(stp1_3, stp1_4);                                    \
-    stp2_5 = _mm_sub_epi16(stp1_2, stp1_5);                                    \
-    stp2_6 = _mm_sub_epi16(stp1_1, stp1_6);                                    \
-    stp2_7 = _mm_sub_epi16(stp1_0, stp1_7);                                    \
-                                                                               \
-    stp2_8 = stp1_8;                                                           \
-    stp2_9 = stp1_9;                                                           \
-    stp2_14 = stp1_14;                                                         \
-    stp2_15 = stp1_15;                                                         \
-                                                                               \
-    MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, stg6_0,     \
-                           stg4_0, stg6_0, stg4_0, stp2_10, stp2_13, stp2_11,  \
-                           stp2_12)                                            \
-                                                                               \
-    stp2_16 = _mm_add_epi16(stp1_16, stp1_23);                                 \
-    stp2_17 = _mm_add_epi16(stp1_17, stp1_22);                                 \
-    stp2_18 = _mm_add_epi16(stp1_18, stp1_21);                                 \
-    stp2_19 = _mm_add_epi16(stp1_19, stp1_20);                                 \
-    stp2_20 = _mm_sub_epi16(stp1_19, stp1_20);                                 \
-    stp2_21 = _mm_sub_epi16(stp1_18, stp1_21);                                 \
-    stp2_22 = _mm_sub_epi16(stp1_17, stp1_22);                                 \
-    stp2_23 = _mm_sub_epi16(stp1_16, stp1_23);                                 \
-                                                                               \
-    stp2_24 = _mm_sub_epi16(stp1_31, stp1_24);                                 \
-    stp2_25 = _mm_sub_epi16(stp1_30, stp1_25);                                 \
-    stp2_26 = _mm_sub_epi16(stp1_29, stp1_26);                                 \
-    stp2_27 = _mm_sub_epi16(stp1_28, stp1_27);                                 \
-    stp2_28 = _mm_add_epi16(stp1_27, stp1_28);                                 \
-    stp2_29 = _mm_add_epi16(stp1_26, stp1_29);                                 \
-    stp2_30 = _mm_add_epi16(stp1_25, stp1_30);                                 \
-    stp2_31 = _mm_add_epi16(stp1_24, stp1_31);                                 \
-  }                                                                            \
-                                                                               \
-  /* Stage7 */                                                                 \
-  {                                                                            \
-    const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27);             \
-    const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27);             \
-    const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26);             \
-    const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26);             \
-                                                                               \
-    const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25);             \
-    const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25);             \
-    const __m128i lo_23_24 = _mm_unpacklo_epi16(stp2_23, stp2_24);             \
-    const __m128i hi_23_24 = _mm_unpackhi_epi16(stp2_23, stp2_24);             \
-                                                                               \
-    stp1_0 = _mm_add_epi16(stp2_0, stp2_15);                                   \
-    stp1_1 = _mm_add_epi16(stp2_1, stp2_14);                                   \
-    stp1_2 = _mm_add_epi16(stp2_2, stp2_13);                                   \
-    stp1_3 = _mm_add_epi16(stp2_3, stp2_12);                                   \
-    stp1_4 = _mm_add_epi16(stp2_4, stp2_11);                                   \
-    stp1_5 = _mm_add_epi16(stp2_5, stp2_10);                                   \
-    stp1_6 = _mm_add_epi16(stp2_6, stp2_9);                                    \
-    stp1_7 = _mm_add_epi16(stp2_7, stp2_8);                                    \
-    stp1_8 = _mm_sub_epi16(stp2_7, stp2_8);                                    \
-    stp1_9 = _mm_sub_epi16(stp2_6, stp2_9);                                    \
-    stp1_10 = _mm_sub_epi16(stp2_5, stp2_10);                                  \
-    stp1_11 = _mm_sub_epi16(stp2_4, stp2_11);                                  \
-    stp1_12 = _mm_sub_epi16(stp2_3, stp2_12);                                  \
-    stp1_13 = _mm_sub_epi16(stp2_2, stp2_13);                                  \
-    stp1_14 = _mm_sub_epi16(stp2_1, stp2_14);                                  \
-    stp1_15 = _mm_sub_epi16(stp2_0, stp2_15);                                  \
-                                                                               \
-    stp1_16 = stp2_16;                                                         \
-    stp1_17 = stp2_17;                                                         \
-    stp1_18 = stp2_18;                                                         \
-    stp1_19 = stp2_19;                                                         \
-                                                                               \
-    MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg6_0,     \
-                           stg4_0, stg6_0, stg4_0, stp1_20, stp1_27, stp1_21,  \
-                           stp1_26)                                            \
-    MULTIPLICATION_AND_ADD(lo_22_25, hi_22_25, lo_23_24, hi_23_24, stg6_0,     \
-                           stg4_0, stg6_0, stg4_0, stp1_22, stp1_25, stp1_23,  \
-                           stp1_24)                                            \
-                                                                               \
-    stp1_28 = stp2_28;                                                         \
-    stp1_29 = stp2_29;                                                         \
-    stp1_30 = stp2_30;                                                         \
-    stp1_31 = stp2_31;                                                         \
-  }
-
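Note: the stages above are straight butterfly arithmetic — each step copies a value through, adds/subtracts a symmetric pair, or rotates a pair via MULTIPLICATION_AND_ADD. A hedged scalar sketch of one lane of that rotation (helper name hypothetical; DCT_CONST_BITS is 14 in this codec, so the rounding term is 1 << 13):

#include <stdint.h>

/* One lane of the rotation MULTIPLICATION_AND_ADD vectorises with
 * _mm_madd_epi16: for a coefficient pair (w0, w1) packed by
 * pair_set_epi16, the output is dct_const_round_shift(a*w0 + b*w1). */
static int16_t rotate_lane(int16_t a, int16_t b, int w0, int w1) {
  return (int16_t)((a * w0 + b * w1 + (1 << 13)) >> 14);
}
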
-// Only the upper-left 8x8 block has non-zero coeffs
-void av1_idct32x32_34_add_sse2(const int16_t *input, uint8_t *dest,
-                               int stride) {
-  const __m128i zero = _mm_setzero_si128();
-  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
-  const __m128i final_rounding = _mm_set1_epi16(1 << 5);
-
-  // IDCT constants for each stage
-  const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64);
-  const __m128i stg1_1 = pair_set_epi16(cospi_1_64, cospi_31_64);
-  const __m128i stg1_6 = pair_set_epi16(cospi_7_64, -cospi_25_64);
-  const __m128i stg1_7 = pair_set_epi16(cospi_25_64, cospi_7_64);
-  const __m128i stg1_8 = pair_set_epi16(cospi_27_64, -cospi_5_64);
-  const __m128i stg1_9 = pair_set_epi16(cospi_5_64, cospi_27_64);
-  const __m128i stg1_14 = pair_set_epi16(cospi_3_64, -cospi_29_64);
-  const __m128i stg1_15 = pair_set_epi16(cospi_29_64, cospi_3_64);
-
-  const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
-  const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
-  const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
-  const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);
-
-  const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
-  const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
-  const __m128i stg3_4 = pair_set_epi16(-cospi_4_64, cospi_28_64);
-  const __m128i stg3_5 = pair_set_epi16(cospi_28_64, cospi_4_64);
-  const __m128i stg3_6 = pair_set_epi16(-cospi_28_64, -cospi_4_64);
-  const __m128i stg3_8 = pair_set_epi16(-cospi_20_64, cospi_12_64);
-  const __m128i stg3_9 = pair_set_epi16(cospi_12_64, cospi_20_64);
-  const __m128i stg3_10 = pair_set_epi16(-cospi_12_64, -cospi_20_64);
-
-  const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
-  const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
-  const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
-  const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
-  const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
-
-  const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
-
-  __m128i in[32], col[32];
-  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7,
-      stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
-      stp1_16, stp1_17, stp1_18, stp1_19, stp1_20, stp1_21, stp1_22, stp1_23,
-      stp1_24, stp1_25, stp1_26, stp1_27, stp1_28, stp1_29, stp1_30, stp1_31;
-  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
-      stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15,
-      stp2_16, stp2_17, stp2_18, stp2_19, stp2_20, stp2_21, stp2_22, stp2_23,
-      stp2_24, stp2_25, stp2_26, stp2_27, stp2_28, stp2_29, stp2_30, stp2_31;
-  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
-  int i;
-
-  // Load input data. Only the top-left 8x8 block needs to be loaded.
-  in[0] = _mm_load_si128((const __m128i *)input);
-  in[1] = _mm_load_si128((const __m128i *)(input + 32));
-  in[2] = _mm_load_si128((const __m128i *)(input + 64));
-  in[3] = _mm_load_si128((const __m128i *)(input + 96));
-  in[4] = _mm_load_si128((const __m128i *)(input + 128));
-  in[5] = _mm_load_si128((const __m128i *)(input + 160));
-  in[6] = _mm_load_si128((const __m128i *)(input + 192));
-  in[7] = _mm_load_si128((const __m128i *)(input + 224));
-
-  for (i = 8; i < 32; ++i) {
-    in[i] = _mm_setzero_si128();
-  }
-
-  array_transpose_8x8(in, in);
-  // TODO(hkuang): The following transposes are unnecessary, but removing
-  // them leads to a performance drop on some devices.
-  array_transpose_8x8(in + 8, in + 8);
-  array_transpose_8x8(in + 16, in + 16);
-  array_transpose_8x8(in + 24, in + 24);
-
-  IDCT32_34
-
-  // First 1-D pass: store 32 intermediate results for each 8x32 block.
-  col[0] = _mm_add_epi16(stp1_0, stp1_31);
-  col[1] = _mm_add_epi16(stp1_1, stp1_30);
-  col[2] = _mm_add_epi16(stp1_2, stp1_29);
-  col[3] = _mm_add_epi16(stp1_3, stp1_28);
-  col[4] = _mm_add_epi16(stp1_4, stp1_27);
-  col[5] = _mm_add_epi16(stp1_5, stp1_26);
-  col[6] = _mm_add_epi16(stp1_6, stp1_25);
-  col[7] = _mm_add_epi16(stp1_7, stp1_24);
-  col[8] = _mm_add_epi16(stp1_8, stp1_23);
-  col[9] = _mm_add_epi16(stp1_9, stp1_22);
-  col[10] = _mm_add_epi16(stp1_10, stp1_21);
-  col[11] = _mm_add_epi16(stp1_11, stp1_20);
-  col[12] = _mm_add_epi16(stp1_12, stp1_19);
-  col[13] = _mm_add_epi16(stp1_13, stp1_18);
-  col[14] = _mm_add_epi16(stp1_14, stp1_17);
-  col[15] = _mm_add_epi16(stp1_15, stp1_16);
-  col[16] = _mm_sub_epi16(stp1_15, stp1_16);
-  col[17] = _mm_sub_epi16(stp1_14, stp1_17);
-  col[18] = _mm_sub_epi16(stp1_13, stp1_18);
-  col[19] = _mm_sub_epi16(stp1_12, stp1_19);
-  col[20] = _mm_sub_epi16(stp1_11, stp1_20);
-  col[21] = _mm_sub_epi16(stp1_10, stp1_21);
-  col[22] = _mm_sub_epi16(stp1_9, stp1_22);
-  col[23] = _mm_sub_epi16(stp1_8, stp1_23);
-  col[24] = _mm_sub_epi16(stp1_7, stp1_24);
-  col[25] = _mm_sub_epi16(stp1_6, stp1_25);
-  col[26] = _mm_sub_epi16(stp1_5, stp1_26);
-  col[27] = _mm_sub_epi16(stp1_4, stp1_27);
-  col[28] = _mm_sub_epi16(stp1_3, stp1_28);
-  col[29] = _mm_sub_epi16(stp1_2, stp1_29);
-  col[30] = _mm_sub_epi16(stp1_1, stp1_30);
-  col[31] = _mm_sub_epi16(stp1_0, stp1_31);
-  for (i = 0; i < 4; i++) {
-    int j;
-    // Transpose 32x8 block to 8x32 block
-    array_transpose_8x8(col + i * 8, in);
-    IDCT32_34
-
-    // Second 1-D pass: calculate the results and store them to the destination.
-    in[0] = _mm_add_epi16(stp1_0, stp1_31);
-    in[1] = _mm_add_epi16(stp1_1, stp1_30);
-    in[2] = _mm_add_epi16(stp1_2, stp1_29);
-    in[3] = _mm_add_epi16(stp1_3, stp1_28);
-    in[4] = _mm_add_epi16(stp1_4, stp1_27);
-    in[5] = _mm_add_epi16(stp1_5, stp1_26);
-    in[6] = _mm_add_epi16(stp1_6, stp1_25);
-    in[7] = _mm_add_epi16(stp1_7, stp1_24);
-    in[8] = _mm_add_epi16(stp1_8, stp1_23);
-    in[9] = _mm_add_epi16(stp1_9, stp1_22);
-    in[10] = _mm_add_epi16(stp1_10, stp1_21);
-    in[11] = _mm_add_epi16(stp1_11, stp1_20);
-    in[12] = _mm_add_epi16(stp1_12, stp1_19);
-    in[13] = _mm_add_epi16(stp1_13, stp1_18);
-    in[14] = _mm_add_epi16(stp1_14, stp1_17);
-    in[15] = _mm_add_epi16(stp1_15, stp1_16);
-    in[16] = _mm_sub_epi16(stp1_15, stp1_16);
-    in[17] = _mm_sub_epi16(stp1_14, stp1_17);
-    in[18] = _mm_sub_epi16(stp1_13, stp1_18);
-    in[19] = _mm_sub_epi16(stp1_12, stp1_19);
-    in[20] = _mm_sub_epi16(stp1_11, stp1_20);
-    in[21] = _mm_sub_epi16(stp1_10, stp1_21);
-    in[22] = _mm_sub_epi16(stp1_9, stp1_22);
-    in[23] = _mm_sub_epi16(stp1_8, stp1_23);
-    in[24] = _mm_sub_epi16(stp1_7, stp1_24);
-    in[25] = _mm_sub_epi16(stp1_6, stp1_25);
-    in[26] = _mm_sub_epi16(stp1_5, stp1_26);
-    in[27] = _mm_sub_epi16(stp1_4, stp1_27);
-    in[28] = _mm_sub_epi16(stp1_3, stp1_28);
-    in[29] = _mm_sub_epi16(stp1_2, stp1_29);
-    in[30] = _mm_sub_epi16(stp1_1, stp1_30);
-    in[31] = _mm_sub_epi16(stp1_0, stp1_31);
-
-    for (j = 0; j < 32; ++j) {
-      // Final rounding and shift
-      in[j] = _mm_adds_epi16(in[j], final_rounding);
-      in[j] = _mm_srai_epi16(in[j], 6);
-      RECON_AND_STORE(dest + j * stride, in[j]);
-    }
-
-    dest += 8;
-  }
-}
-
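Note: the per-pixel tail of the function above reduces to a small scalar recipe. A sketch of what the rounding, shift, and RECON_AND_STORE do for one pixel (function name hypothetical):

#include <stdint.h>

/* Round the 2-D IDCT output (6 fractional bits), add the prediction,
 * and saturate to [0, 255] as _mm_packus_epi16 does. */
static uint8_t recon_pixel(int16_t residual, uint8_t pred) {
  int v = ((residual + 32) >> 6) + pred;
  return (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
}
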
-void av1_idct32x32_1024_add_sse2(const int16_t *input, uint8_t *dest,
-                                 int stride) {
-  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
-  const __m128i final_rounding = _mm_set1_epi16(1 << 5);
-  const __m128i zero = _mm_setzero_si128();
-
-  // IDCT constants for each stage
-  const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64);
-  const __m128i stg1_1 = pair_set_epi16(cospi_1_64, cospi_31_64);
-  const __m128i stg1_2 = pair_set_epi16(cospi_15_64, -cospi_17_64);
-  const __m128i stg1_3 = pair_set_epi16(cospi_17_64, cospi_15_64);
-  const __m128i stg1_4 = pair_set_epi16(cospi_23_64, -cospi_9_64);
-  const __m128i stg1_5 = pair_set_epi16(cospi_9_64, cospi_23_64);
-  const __m128i stg1_6 = pair_set_epi16(cospi_7_64, -cospi_25_64);
-  const __m128i stg1_7 = pair_set_epi16(cospi_25_64, cospi_7_64);
-  const __m128i stg1_8 = pair_set_epi16(cospi_27_64, -cospi_5_64);
-  const __m128i stg1_9 = pair_set_epi16(cospi_5_64, cospi_27_64);
-  const __m128i stg1_10 = pair_set_epi16(cospi_11_64, -cospi_21_64);
-  const __m128i stg1_11 = pair_set_epi16(cospi_21_64, cospi_11_64);
-  const __m128i stg1_12 = pair_set_epi16(cospi_19_64, -cospi_13_64);
-  const __m128i stg1_13 = pair_set_epi16(cospi_13_64, cospi_19_64);
-  const __m128i stg1_14 = pair_set_epi16(cospi_3_64, -cospi_29_64);
-  const __m128i stg1_15 = pair_set_epi16(cospi_29_64, cospi_3_64);
-
-  const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
-  const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
-  const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64);
-  const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64);
-  const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64);
-  const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64);
-  const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
-  const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);
-
-  const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
-  const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
-  const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64);
-  const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64);
-  const __m128i stg3_4 = pair_set_epi16(-cospi_4_64, cospi_28_64);
-  const __m128i stg3_5 = pair_set_epi16(cospi_28_64, cospi_4_64);
-  const __m128i stg3_6 = pair_set_epi16(-cospi_28_64, -cospi_4_64);
-  const __m128i stg3_8 = pair_set_epi16(-cospi_20_64, cospi_12_64);
-  const __m128i stg3_9 = pair_set_epi16(cospi_12_64, cospi_20_64);
-  const __m128i stg3_10 = pair_set_epi16(-cospi_12_64, -cospi_20_64);
-
-  const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
-  const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
-  const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
-  const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
-  const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
-  const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
-  const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
-
-  const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
-
-  __m128i in[32], col[128], zero_idx[16];
-  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7,
-      stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
-      stp1_16, stp1_17, stp1_18, stp1_19, stp1_20, stp1_21, stp1_22, stp1_23,
-      stp1_24, stp1_25, stp1_26, stp1_27, stp1_28, stp1_29, stp1_30, stp1_31;
-  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
-      stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15,
-      stp2_16, stp2_17, stp2_18, stp2_19, stp2_20, stp2_21, stp2_22, stp2_23,
-      stp2_24, stp2_25, stp2_26, stp2_27, stp2_28, stp2_29, stp2_30, stp2_31;
-  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
-  int i, j, i32;
-
-  for (i = 0; i < 4; i++) {
-    i32 = (i << 5);
-    // First 1-D IDCT pass
-    // Load input data.
-    LOAD_DQCOEFF(in[0], input);
-    LOAD_DQCOEFF(in[8], input);
-    LOAD_DQCOEFF(in[16], input);
-    LOAD_DQCOEFF(in[24], input);
-    LOAD_DQCOEFF(in[1], input);
-    LOAD_DQCOEFF(in[9], input);
-    LOAD_DQCOEFF(in[17], input);
-    LOAD_DQCOEFF(in[25], input);
-    LOAD_DQCOEFF(in[2], input);
-    LOAD_DQCOEFF(in[10], input);
-    LOAD_DQCOEFF(in[18], input);
-    LOAD_DQCOEFF(in[26], input);
-    LOAD_DQCOEFF(in[3], input);
-    LOAD_DQCOEFF(in[11], input);
-    LOAD_DQCOEFF(in[19], input);
-    LOAD_DQCOEFF(in[27], input);
-
-    LOAD_DQCOEFF(in[4], input);
-    LOAD_DQCOEFF(in[12], input);
-    LOAD_DQCOEFF(in[20], input);
-    LOAD_DQCOEFF(in[28], input);
-    LOAD_DQCOEFF(in[5], input);
-    LOAD_DQCOEFF(in[13], input);
-    LOAD_DQCOEFF(in[21], input);
-    LOAD_DQCOEFF(in[29], input);
-    LOAD_DQCOEFF(in[6], input);
-    LOAD_DQCOEFF(in[14], input);
-    LOAD_DQCOEFF(in[22], input);
-    LOAD_DQCOEFF(in[30], input);
-    LOAD_DQCOEFF(in[7], input);
-    LOAD_DQCOEFF(in[15], input);
-    LOAD_DQCOEFF(in[23], input);
-    LOAD_DQCOEFF(in[31], input);
-
-    // Check whether all entries are zero
-    zero_idx[0] = _mm_or_si128(in[0], in[1]);
-    zero_idx[1] = _mm_or_si128(in[2], in[3]);
-    zero_idx[2] = _mm_or_si128(in[4], in[5]);
-    zero_idx[3] = _mm_or_si128(in[6], in[7]);
-    zero_idx[4] = _mm_or_si128(in[8], in[9]);
-    zero_idx[5] = _mm_or_si128(in[10], in[11]);
-    zero_idx[6] = _mm_or_si128(in[12], in[13]);
-    zero_idx[7] = _mm_or_si128(in[14], in[15]);
-    zero_idx[8] = _mm_or_si128(in[16], in[17]);
-    zero_idx[9] = _mm_or_si128(in[18], in[19]);
-    zero_idx[10] = _mm_or_si128(in[20], in[21]);
-    zero_idx[11] = _mm_or_si128(in[22], in[23]);
-    zero_idx[12] = _mm_or_si128(in[24], in[25]);
-    zero_idx[13] = _mm_or_si128(in[26], in[27]);
-    zero_idx[14] = _mm_or_si128(in[28], in[29]);
-    zero_idx[15] = _mm_or_si128(in[30], in[31]);
-
-    zero_idx[0] = _mm_or_si128(zero_idx[0], zero_idx[1]);
-    zero_idx[1] = _mm_or_si128(zero_idx[2], zero_idx[3]);
-    zero_idx[2] = _mm_or_si128(zero_idx[4], zero_idx[5]);
-    zero_idx[3] = _mm_or_si128(zero_idx[6], zero_idx[7]);
-    zero_idx[4] = _mm_or_si128(zero_idx[8], zero_idx[9]);
-    zero_idx[5] = _mm_or_si128(zero_idx[10], zero_idx[11]);
-    zero_idx[6] = _mm_or_si128(zero_idx[12], zero_idx[13]);
-    zero_idx[7] = _mm_or_si128(zero_idx[14], zero_idx[15]);
-
-    zero_idx[8] = _mm_or_si128(zero_idx[0], zero_idx[1]);
-    zero_idx[9] = _mm_or_si128(zero_idx[2], zero_idx[3]);
-    zero_idx[10] = _mm_or_si128(zero_idx[4], zero_idx[5]);
-    zero_idx[11] = _mm_or_si128(zero_idx[6], zero_idx[7]);
-    zero_idx[12] = _mm_or_si128(zero_idx[8], zero_idx[9]);
-    zero_idx[13] = _mm_or_si128(zero_idx[10], zero_idx[11]);
-    zero_idx[14] = _mm_or_si128(zero_idx[12], zero_idx[13]);
-
-    if (_mm_movemask_epi8(_mm_cmpeq_epi32(zero_idx[14], zero)) == 0xFFFF) {
-      col[i32 + 0] = _mm_setzero_si128();
-      col[i32 + 1] = _mm_setzero_si128();
-      col[i32 + 2] = _mm_setzero_si128();
-      col[i32 + 3] = _mm_setzero_si128();
-      col[i32 + 4] = _mm_setzero_si128();
-      col[i32 + 5] = _mm_setzero_si128();
-      col[i32 + 6] = _mm_setzero_si128();
-      col[i32 + 7] = _mm_setzero_si128();
-      col[i32 + 8] = _mm_setzero_si128();
-      col[i32 + 9] = _mm_setzero_si128();
-      col[i32 + 10] = _mm_setzero_si128();
-      col[i32 + 11] = _mm_setzero_si128();
-      col[i32 + 12] = _mm_setzero_si128();
-      col[i32 + 13] = _mm_setzero_si128();
-      col[i32 + 14] = _mm_setzero_si128();
-      col[i32 + 15] = _mm_setzero_si128();
-      col[i32 + 16] = _mm_setzero_si128();
-      col[i32 + 17] = _mm_setzero_si128();
-      col[i32 + 18] = _mm_setzero_si128();
-      col[i32 + 19] = _mm_setzero_si128();
-      col[i32 + 20] = _mm_setzero_si128();
-      col[i32 + 21] = _mm_setzero_si128();
-      col[i32 + 22] = _mm_setzero_si128();
-      col[i32 + 23] = _mm_setzero_si128();
-      col[i32 + 24] = _mm_setzero_si128();
-      col[i32 + 25] = _mm_setzero_si128();
-      col[i32 + 26] = _mm_setzero_si128();
-      col[i32 + 27] = _mm_setzero_si128();
-      col[i32 + 28] = _mm_setzero_si128();
-      col[i32 + 29] = _mm_setzero_si128();
-      col[i32 + 30] = _mm_setzero_si128();
-      col[i32 + 31] = _mm_setzero_si128();
-      continue;
-    }
-
-    // Transpose 32x8 block to 8x32 block
-    array_transpose_8x8(in, in);
-    array_transpose_8x8(in + 8, in + 8);
-    array_transpose_8x8(in + 16, in + 16);
-    array_transpose_8x8(in + 24, in + 24);
-
-    IDCT32
-
-    // First 1-D pass: store 32 intermediate results for each 8x32 block.
-    col[i32 + 0] = _mm_add_epi16(stp1_0, stp1_31);
-    col[i32 + 1] = _mm_add_epi16(stp1_1, stp1_30);
-    col[i32 + 2] = _mm_add_epi16(stp1_2, stp1_29);
-    col[i32 + 3] = _mm_add_epi16(stp1_3, stp1_28);
-    col[i32 + 4] = _mm_add_epi16(stp1_4, stp1_27);
-    col[i32 + 5] = _mm_add_epi16(stp1_5, stp1_26);
-    col[i32 + 6] = _mm_add_epi16(stp1_6, stp1_25);
-    col[i32 + 7] = _mm_add_epi16(stp1_7, stp1_24);
-    col[i32 + 8] = _mm_add_epi16(stp1_8, stp1_23);
-    col[i32 + 9] = _mm_add_epi16(stp1_9, stp1_22);
-    col[i32 + 10] = _mm_add_epi16(stp1_10, stp1_21);
-    col[i32 + 11] = _mm_add_epi16(stp1_11, stp1_20);
-    col[i32 + 12] = _mm_add_epi16(stp1_12, stp1_19);
-    col[i32 + 13] = _mm_add_epi16(stp1_13, stp1_18);
-    col[i32 + 14] = _mm_add_epi16(stp1_14, stp1_17);
-    col[i32 + 15] = _mm_add_epi16(stp1_15, stp1_16);
-    col[i32 + 16] = _mm_sub_epi16(stp1_15, stp1_16);
-    col[i32 + 17] = _mm_sub_epi16(stp1_14, stp1_17);
-    col[i32 + 18] = _mm_sub_epi16(stp1_13, stp1_18);
-    col[i32 + 19] = _mm_sub_epi16(stp1_12, stp1_19);
-    col[i32 + 20] = _mm_sub_epi16(stp1_11, stp1_20);
-    col[i32 + 21] = _mm_sub_epi16(stp1_10, stp1_21);
-    col[i32 + 22] = _mm_sub_epi16(stp1_9, stp1_22);
-    col[i32 + 23] = _mm_sub_epi16(stp1_8, stp1_23);
-    col[i32 + 24] = _mm_sub_epi16(stp1_7, stp1_24);
-    col[i32 + 25] = _mm_sub_epi16(stp1_6, stp1_25);
-    col[i32 + 26] = _mm_sub_epi16(stp1_5, stp1_26);
-    col[i32 + 27] = _mm_sub_epi16(stp1_4, stp1_27);
-    col[i32 + 28] = _mm_sub_epi16(stp1_3, stp1_28);
-    col[i32 + 29] = _mm_sub_epi16(stp1_2, stp1_29);
-    col[i32 + 30] = _mm_sub_epi16(stp1_1, stp1_30);
-    col[i32 + 31] = _mm_sub_epi16(stp1_0, stp1_31);
-  }
-  for (i = 0; i < 4; i++) {
-    // Second 1-D IDCT pass
-    j = i << 3;
-
-    // Transpose 32x8 block to 8x32 block
-    array_transpose_8x8(col + j, in);
-    array_transpose_8x8(col + j + 32, in + 8);
-    array_transpose_8x8(col + j + 64, in + 16);
-    array_transpose_8x8(col + j + 96, in + 24);
-
-    IDCT32
-
-    // Second 1-D pass: calculate the results and store them to the destination.
-    in[0] = _mm_add_epi16(stp1_0, stp1_31);
-    in[1] = _mm_add_epi16(stp1_1, stp1_30);
-    in[2] = _mm_add_epi16(stp1_2, stp1_29);
-    in[3] = _mm_add_epi16(stp1_3, stp1_28);
-    in[4] = _mm_add_epi16(stp1_4, stp1_27);
-    in[5] = _mm_add_epi16(stp1_5, stp1_26);
-    in[6] = _mm_add_epi16(stp1_6, stp1_25);
-    in[7] = _mm_add_epi16(stp1_7, stp1_24);
-    in[8] = _mm_add_epi16(stp1_8, stp1_23);
-    in[9] = _mm_add_epi16(stp1_9, stp1_22);
-    in[10] = _mm_add_epi16(stp1_10, stp1_21);
-    in[11] = _mm_add_epi16(stp1_11, stp1_20);
-    in[12] = _mm_add_epi16(stp1_12, stp1_19);
-    in[13] = _mm_add_epi16(stp1_13, stp1_18);
-    in[14] = _mm_add_epi16(stp1_14, stp1_17);
-    in[15] = _mm_add_epi16(stp1_15, stp1_16);
-    in[16] = _mm_sub_epi16(stp1_15, stp1_16);
-    in[17] = _mm_sub_epi16(stp1_14, stp1_17);
-    in[18] = _mm_sub_epi16(stp1_13, stp1_18);
-    in[19] = _mm_sub_epi16(stp1_12, stp1_19);
-    in[20] = _mm_sub_epi16(stp1_11, stp1_20);
-    in[21] = _mm_sub_epi16(stp1_10, stp1_21);
-    in[22] = _mm_sub_epi16(stp1_9, stp1_22);
-    in[23] = _mm_sub_epi16(stp1_8, stp1_23);
-    in[24] = _mm_sub_epi16(stp1_7, stp1_24);
-    in[25] = _mm_sub_epi16(stp1_6, stp1_25);
-    in[26] = _mm_sub_epi16(stp1_5, stp1_26);
-    in[27] = _mm_sub_epi16(stp1_4, stp1_27);
-    in[28] = _mm_sub_epi16(stp1_3, stp1_28);
-    in[29] = _mm_sub_epi16(stp1_2, stp1_29);
-    in[30] = _mm_sub_epi16(stp1_1, stp1_30);
-    in[31] = _mm_sub_epi16(stp1_0, stp1_31);
-
-    for (j = 0; j < 32; ++j) {
-      // Final rounding and shift
-      in[j] = _mm_adds_epi16(in[j], final_rounding);
-      in[j] = _mm_srai_epi16(in[j], 6);
-      RECON_AND_STORE(dest + j * stride, in[j]);
-    }
-
-    dest += 8;
-  }
-}
-
-void av1_idct32x32_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
-  __m128i dc_value;
-  const __m128i zero = _mm_setzero_si128();
-  int a, i;
-
-  a = (int)dct_const_round_shift(input[0] * cospi_16_64);
-  a = (int)dct_const_round_shift(a * cospi_16_64);
-  a = ROUND_POWER_OF_TWO(a, 6);
-
-  dc_value = _mm_set1_epi16(a);
-
-  for (i = 0; i < 4; ++i) {
-    int j;
-    for (j = 0; j < 32; ++j) {
-      RECON_AND_STORE(dest + j * stride, dc_value);
-    }
-    dest += 8;
-  }
-}
-
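Note: the DC-only path is worth spelling out. The single coefficient is scaled by cospi_16_64 once per transform dimension (with dct_const_round_shift each time) and then rounded like any other output. A scalar sketch under those assumptions (function name hypothetical):

#include <stdint.h>

static int dc_block_value(int16_t dc) {
  const int c16 = 11585;                 /* cospi_16_64 == round(2^14 * cos(pi/4)) */
  int a = (dc * c16 + (1 << 13)) >> 14;  /* first (row) pass */
  a = (a * c16 + (1 << 13)) >> 14;       /* second (column) pass */
  return (a + 32) >> 6;                  /* ROUND_POWER_OF_TWO(a, 6) */
}
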
-#if CONFIG_AOM_HIGHBITDEPTH
-static INLINE __m128i clamp_high_sse2(__m128i value, int bd) {
-  __m128i ubounded, retval;
-  const __m128i zero = _mm_set1_epi16(0);
-  const __m128i one = _mm_set1_epi16(1);
-  const __m128i max = _mm_subs_epi16(_mm_slli_epi16(one, bd), one);
-  ubounded = _mm_cmpgt_epi16(value, max);
-  retval = _mm_andnot_si128(ubounded, value);
-  ubounded = _mm_and_si128(ubounded, max);
-  retval = _mm_or_si128(retval, ubounded);
-  retval = _mm_and_si128(retval, _mm_cmpgt_epi16(retval, zero));
-  return retval;
-}
-
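Note: clamp_high_sse2 is the vector form of an ordinary range clamp. A scalar sketch (name hypothetical):

#include <stdint.h>

/* Clamp a reconstructed sample to the valid range [0, (1 << bd) - 1]
 * for bit depth bd (8, 10 or 12). */
static int16_t clamp_high(int32_t value, int bd) {
  const int32_t max = (1 << bd) - 1;
  return (int16_t)(value < 0 ? 0 : (value > max ? max : value));
}
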
-void av1_highbd_idct4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest8,
-                                    int stride, int bd) {
-  tran_low_t out[4 * 4];
-  tran_low_t *outptr = out;
-  int i, j;
-  __m128i inptr[4];
-  __m128i sign_bits[2];
-  __m128i temp_mm, min_input, max_input;
-  int test;
-  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
-  int optimised_cols = 0;
-  const __m128i zero = _mm_set1_epi16(0);
-  const __m128i eight = _mm_set1_epi16(8);
-  const __m128i max = _mm_set1_epi16(12043);
-  const __m128i min = _mm_set1_epi16(-12043);
-  // Load input into __m128i
-  inptr[0] = _mm_loadu_si128((const __m128i *)input);
-  inptr[1] = _mm_loadu_si128((const __m128i *)(input + 4));
-  inptr[2] = _mm_loadu_si128((const __m128i *)(input + 8));
-  inptr[3] = _mm_loadu_si128((const __m128i *)(input + 12));
-
-  // Pack to 16 bits
-  inptr[0] = _mm_packs_epi32(inptr[0], inptr[1]);
-  inptr[1] = _mm_packs_epi32(inptr[2], inptr[3]);
-
-  max_input = _mm_max_epi16(inptr[0], inptr[1]);
-  min_input = _mm_min_epi16(inptr[0], inptr[1]);
-  max_input = _mm_cmpgt_epi16(max_input, max);
-  min_input = _mm_cmplt_epi16(min_input, min);
-  temp_mm = _mm_or_si128(max_input, min_input);
-  test = _mm_movemask_epi8(temp_mm);
-
-  if (!test) {
-    // Do the row transform
-    av1_idct4_sse2(inptr);
-
-    // Check the min & max values
-    max_input = _mm_max_epi16(inptr[0], inptr[1]);
-    min_input = _mm_min_epi16(inptr[0], inptr[1]);
-    max_input = _mm_cmpgt_epi16(max_input, max);
-    min_input = _mm_cmplt_epi16(min_input, min);
-    temp_mm = _mm_or_si128(max_input, min_input);
-    test = _mm_movemask_epi8(temp_mm);
-
-    if (test) {
-      transpose_4x4(inptr);
-      sign_bits[0] = _mm_cmplt_epi16(inptr[0], zero);
-      sign_bits[1] = _mm_cmplt_epi16(inptr[1], zero);
-      inptr[3] = _mm_unpackhi_epi16(inptr[1], sign_bits[1]);
-      inptr[2] = _mm_unpacklo_epi16(inptr[1], sign_bits[1]);
-      inptr[1] = _mm_unpackhi_epi16(inptr[0], sign_bits[0]);
-      inptr[0] = _mm_unpacklo_epi16(inptr[0], sign_bits[0]);
-      _mm_storeu_si128((__m128i *)outptr, inptr[0]);
-      _mm_storeu_si128((__m128i *)(outptr + 4), inptr[1]);
-      _mm_storeu_si128((__m128i *)(outptr + 8), inptr[2]);
-      _mm_storeu_si128((__m128i *)(outptr + 12), inptr[3]);
-    } else {
-      // Set the flag to use the optimised column transform
-      optimised_cols = 1;
-    }
-  } else {
-    // Run the un-optimised row transform
-    for (i = 0; i < 4; ++i) {
-      av1_highbd_idct4_c(input, outptr, bd);
-      input += 4;
-      outptr += 4;
-    }
-  }
-
-  if (optimised_cols) {
-    av1_idct4_sse2(inptr);
-
-    // Final round and shift
-    inptr[0] = _mm_add_epi16(inptr[0], eight);
-    inptr[1] = _mm_add_epi16(inptr[1], eight);
-
-    inptr[0] = _mm_srai_epi16(inptr[0], 4);
-    inptr[1] = _mm_srai_epi16(inptr[1], 4);
-
-    // Reconstruct and store
-    {
-      __m128i d0 = _mm_loadl_epi64((const __m128i *)dest);
-      __m128i d2 = _mm_loadl_epi64((const __m128i *)(dest + stride * 2));
-      d0 = _mm_unpacklo_epi64(
-          d0, _mm_loadl_epi64((const __m128i *)(dest + stride)));
-      d2 = _mm_unpacklo_epi64(
-          d2, _mm_loadl_epi64((const __m128i *)(dest + stride * 3)));
-      d0 = clamp_high_sse2(_mm_adds_epi16(d0, inptr[0]), bd);
-      d2 = clamp_high_sse2(_mm_adds_epi16(d2, inptr[1]), bd);
-      // Store row 0
-      _mm_storel_epi64((__m128i *)dest, d0);
-      // Store row 1
-      d0 = _mm_srli_si128(d0, 8);
-      _mm_storel_epi64((__m128i *)(dest + stride), d0);
-      // Store row 2
-      _mm_storel_epi64((__m128i *)(dest + stride * 2), d2);
-      // Store row 3
-      d2 = _mm_srli_si128(d2, 8);
-      _mm_storel_epi64((__m128i *)(dest + stride * 3), d2);
-    }
-  } else {
-    // Run the un-optimised column transform
-    tran_low_t temp_in[4], temp_out[4];
-    // Columns
-    for (i = 0; i < 4; ++i) {
-      for (j = 0; j < 4; ++j) temp_in[j] = out[j * 4 + i];
-      av1_highbd_idct4_c(temp_in, temp_out, bd);
-      for (j = 0; j < 4; ++j) {
-        dest[j * stride + i] = highbd_clip_pixel_add(
-            dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 4), bd);
-      }
-    }
-  }
-}
-
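Note: the test/optimised_cols pattern used throughout these high-bitdepth functions is an overflow guard. If any packed 16-bit coefficient exceeds a per-size bound (12043 here, 6201 for 8x8, 3155 for 16x16 — presumably chosen so the 16-bit intermediates of the SSE2 path cannot saturate), the code falls back to the 32-bit C transform. A sketch of the check (helper name hypothetical):

#include <emmintrin.h>

/* Returns non-zero iff every 16-bit lane of rows[0..n-1] lies in
 * [-bound, bound], mirroring the movemask test above. */
static int rows_within_bound(const __m128i *rows, int n, int16_t bound) {
  __m128i mx = rows[0], mn = rows[0];
  int i;
  for (i = 1; i < n; ++i) {
    mx = _mm_max_epi16(mx, rows[i]);
    mn = _mm_min_epi16(mn, rows[i]);
  }
  mx = _mm_cmpgt_epi16(mx, _mm_set1_epi16(bound));
  mn = _mm_cmplt_epi16(mn, _mm_set1_epi16((int16_t)-bound));
  return _mm_movemask_epi8(_mm_or_si128(mx, mn)) == 0;
}
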
-void av1_highbd_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest8,
-                                    int stride, int bd) {
-  tran_low_t out[8 * 8];
-  tran_low_t *outptr = out;
-  int i, j, test;
-  __m128i inptr[8];
-  __m128i min_input, max_input, temp1, temp2, sign_bits;
-  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
-  const __m128i zero = _mm_set1_epi16(0);
-  const __m128i sixteen = _mm_set1_epi16(16);
-  const __m128i max = _mm_set1_epi16(6201);
-  const __m128i min = _mm_set1_epi16(-6201);
-  int optimised_cols = 0;
-
-  // Load input into __m128i & pack to 16 bits
-  for (i = 0; i < 8; i++) {
-    temp1 = _mm_loadu_si128((const __m128i *)(input + 8 * i));
-    temp2 = _mm_loadu_si128((const __m128i *)(input + 8 * i + 4));
-    inptr[i] = _mm_packs_epi32(temp1, temp2);
-  }
-
-  // Find the min & max for the row transform
-  max_input = _mm_max_epi16(inptr[0], inptr[1]);
-  min_input = _mm_min_epi16(inptr[0], inptr[1]);
-  for (i = 2; i < 8; i++) {
-    max_input = _mm_max_epi16(max_input, inptr[i]);
-    min_input = _mm_min_epi16(min_input, inptr[i]);
-  }
-  max_input = _mm_cmpgt_epi16(max_input, max);
-  min_input = _mm_cmplt_epi16(min_input, min);
-  temp1 = _mm_or_si128(max_input, min_input);
-  test = _mm_movemask_epi8(temp1);
-
-  if (!test) {
-    // Do the row transform
-    av1_idct8_sse2(inptr);
-
-    // Find the min & max for the column transform
-    max_input = _mm_max_epi16(inptr[0], inptr[1]);
-    min_input = _mm_min_epi16(inptr[0], inptr[1]);
-    for (i = 2; i < 8; i++) {
-      max_input = _mm_max_epi16(max_input, inptr[i]);
-      min_input = _mm_min_epi16(min_input, inptr[i]);
-    }
-    max_input = _mm_cmpgt_epi16(max_input, max);
-    min_input = _mm_cmplt_epi16(min_input, min);
-    temp1 = _mm_or_si128(max_input, min_input);
-    test = _mm_movemask_epi8(temp1);
-
-    if (test) {
-      array_transpose_8x8(inptr, inptr);
-      for (i = 0; i < 8; i++) {
-        sign_bits = _mm_cmplt_epi16(inptr[i], zero);
-        temp1 = _mm_unpackhi_epi16(inptr[i], sign_bits);
-        temp2 = _mm_unpacklo_epi16(inptr[i], sign_bits);
-        _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i + 1)), temp1);
-        _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i)), temp2);
-      }
-    } else {
-      // Set the flag to use the optimised column transform
-      optimised_cols = 1;
-    }
-  } else {
-    // Run the un-optimised row transform
-    for (i = 0; i < 8; ++i) {
-      av1_highbd_idct8_c(input, outptr, bd);
-      input += 8;
-      outptr += 8;
-    }
-  }
-
-  if (optimised_cols) {
-    av1_idct8_sse2(inptr);
-
-    // Final rounding and shift, then reconstruct and store
-    {
-      __m128i d[8];
-      for (i = 0; i < 8; i++) {
-        inptr[i] = _mm_add_epi16(inptr[i], sixteen);
-        d[i] = _mm_loadu_si128((const __m128i *)(dest + stride * i));
-        inptr[i] = _mm_srai_epi16(inptr[i], 5);
-        d[i] = clamp_high_sse2(_mm_adds_epi16(d[i], inptr[i]), bd);
-        // Store
-        _mm_storeu_si128((__m128i *)(dest + stride * i), d[i]);
-      }
-    }
-  } else {
-    // Run the un-optimised column transform
-    tran_low_t temp_in[8], temp_out[8];
-    for (i = 0; i < 8; ++i) {
-      for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i];
-      av1_highbd_idct8_c(temp_in, temp_out, bd);
-      for (j = 0; j < 8; ++j) {
-        dest[j * stride + i] = highbd_clip_pixel_add(
-            dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd);
-      }
-    }
-  }
-}
-
-void av1_highbd_idct8x8_10_add_sse2(const tran_low_t *input, uint8_t *dest8,
-                                    int stride, int bd) {
-  tran_low_t out[8 * 8] = { 0 };
-  tran_low_t *outptr = out;
-  int i, j, test;
-  __m128i inptr[8];
-  __m128i min_input, max_input, temp1, temp2, sign_bits;
-  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
-  const __m128i zero = _mm_set1_epi16(0);
-  const __m128i sixteen = _mm_set1_epi16(16);
-  const __m128i max = _mm_set1_epi16(6201);
-  const __m128i min = _mm_set1_epi16(-6201);
-  int optimised_cols = 0;
-
-  // Load input into __m128i & pack to 16 bits
-  for (i = 0; i < 8; i++) {
-    temp1 = _mm_loadu_si128((const __m128i *)(input + 8 * i));
-    temp2 = _mm_loadu_si128((const __m128i *)(input + 8 * i + 4));
-    inptr[i] = _mm_packs_epi32(temp1, temp2);
-  }
-
-  // Find the min & max for the row transform
-  // Only the first 4 rows have non-zero coeffs
-  max_input = _mm_max_epi16(inptr[0], inptr[1]);
-  min_input = _mm_min_epi16(inptr[0], inptr[1]);
-  for (i = 2; i < 4; i++) {
-    max_input = _mm_max_epi16(max_input, inptr[i]);
-    min_input = _mm_min_epi16(min_input, inptr[i]);
-  }
-  max_input = _mm_cmpgt_epi16(max_input, max);
-  min_input = _mm_cmplt_epi16(min_input, min);
-  temp1 = _mm_or_si128(max_input, min_input);
-  test = _mm_movemask_epi8(temp1);
-
-  if (!test) {
-    // Do the row transform
-    av1_idct8_sse2(inptr);
-
-    // Find the min & max for the column transform
-    // N.B. Only the first 4 cols contain non-zero coeffs
-    max_input = _mm_max_epi16(inptr[0], inptr[1]);
-    min_input = _mm_min_epi16(inptr[0], inptr[1]);
-    for (i = 2; i < 8; i++) {
-      max_input = _mm_max_epi16(max_input, inptr[i]);
-      min_input = _mm_min_epi16(min_input, inptr[i]);
-    }
-    max_input = _mm_cmpgt_epi16(max_input, max);
-    min_input = _mm_cmplt_epi16(min_input, min);
-    temp1 = _mm_or_si128(max_input, min_input);
-    test = _mm_movemask_epi8(temp1);
-
-    if (test) {
-      // Use the fact that only the first 4 rows contain non-zero coeffs
-      array_transpose_4X8(inptr, inptr);
-      for (i = 0; i < 4; i++) {
-        sign_bits = _mm_cmplt_epi16(inptr[i], zero);
-        temp1 = _mm_unpackhi_epi16(inptr[i], sign_bits);
-        temp2 = _mm_unpacklo_epi16(inptr[i], sign_bits);
-        _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i + 1)), temp1);
-        _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i)), temp2);
-      }
-    } else {
-      // Set the flag to use the optimised column transform
-      optimised_cols = 1;
-    }
-  } else {
-    // Run the un-optimised row transform
-    for (i = 0; i < 4; ++i) {
-      av1_highbd_idct8_c(input, outptr, bd);
-      input += 8;
-      outptr += 8;
-    }
-  }
-
-  if (optimised_cols) {
-    av1_idct8_sse2(inptr);
-
-    // Final rounding and shift, then reconstruct and store
-    {
-      __m128i d[8];
-      for (i = 0; i < 8; i++) {
-        inptr[i] = _mm_add_epi16(inptr[i], sixteen);
-        d[i] = _mm_loadu_si128((const __m128i *)(dest + stride * i));
-        inptr[i] = _mm_srai_epi16(inptr[i], 5);
-        d[i] = clamp_high_sse2(_mm_adds_epi16(d[i], inptr[i]), bd);
-        // Store
-        _mm_storeu_si128((__m128i *)(dest + stride * i), d[i]);
-      }
-    }
-  } else {
-    // Run the un-optimised column transform
-    tran_low_t temp_in[8], temp_out[8];
-    for (i = 0; i < 8; ++i) {
-      for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i];
-      av1_highbd_idct8_c(temp_in, temp_out, bd);
-      for (j = 0; j < 8; ++j) {
-        dest[j * stride + i] = highbd_clip_pixel_add(
-            dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd);
-      }
-    }
-  }
-}
-
-void av1_highbd_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest8,
-                                       int stride, int bd) {
-  tran_low_t out[16 * 16];
-  tran_low_t *outptr = out;
-  int i, j, test;
-  __m128i inptr[32];
-  __m128i min_input, max_input, temp1, temp2, sign_bits;
-  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
-  const __m128i zero = _mm_set1_epi16(0);
-  const __m128i rounding = _mm_set1_epi16(32);
-  const __m128i max = _mm_set1_epi16(3155);
-  const __m128i min = _mm_set1_epi16(-3155);
-  int optimised_cols = 0;
-
-  // Load input into __m128i & pack to 16 bits
-  for (i = 0; i < 16; i++) {
-    temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i));
-    temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 4));
-    inptr[i] = _mm_packs_epi32(temp1, temp2);
-    temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 8));
-    temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 12));
-    inptr[i + 16] = _mm_packs_epi32(temp1, temp2);
-  }
-
-  // Find the min & max for the row transform
-  max_input = _mm_max_epi16(inptr[0], inptr[1]);
-  min_input = _mm_min_epi16(inptr[0], inptr[1]);
-  for (i = 2; i < 32; i++) {
-    max_input = _mm_max_epi16(max_input, inptr[i]);
-    min_input = _mm_min_epi16(min_input, inptr[i]);
-  }
-  max_input = _mm_cmpgt_epi16(max_input, max);
-  min_input = _mm_cmplt_epi16(min_input, min);
-  temp1 = _mm_or_si128(max_input, min_input);
-  test = _mm_movemask_epi8(temp1);
-
-  if (!test) {
-    // Do the row transform
-    av1_idct16_sse2(inptr, inptr + 16);
-
-    // Find the min & max for the column transform
-    max_input = _mm_max_epi16(inptr[0], inptr[1]);
-    min_input = _mm_min_epi16(inptr[0], inptr[1]);
-    for (i = 2; i < 32; i++) {
-      max_input = _mm_max_epi16(max_input, inptr[i]);
-      min_input = _mm_min_epi16(min_input, inptr[i]);
-    }
-    max_input = _mm_cmpgt_epi16(max_input, max);
-    min_input = _mm_cmplt_epi16(min_input, min);
-    temp1 = _mm_or_si128(max_input, min_input);
-    test = _mm_movemask_epi8(temp1);
-
-    if (test) {
-      array_transpose_16x16(inptr, inptr + 16);
-      for (i = 0; i < 16; i++) {
-        sign_bits = _mm_cmplt_epi16(inptr[i], zero);
-        temp1 = _mm_unpacklo_epi16(inptr[i], sign_bits);
-        temp2 = _mm_unpackhi_epi16(inptr[i], sign_bits);
-        _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4)), temp1);
-        _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 1)), temp2);
-        sign_bits = _mm_cmplt_epi16(inptr[i + 16], zero);
-        temp1 = _mm_unpacklo_epi16(inptr[i + 16], sign_bits);
-        temp2 = _mm_unpackhi_epi16(inptr[i + 16], sign_bits);
-        _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 2)), temp1);
-        _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 3)), temp2);
-      }
-    } else {
-      // Set the flag to use the optimised column transform
-      optimised_cols = 1;
-    }
-  } else {
-    // Run the un-optimised row transform
-    for (i = 0; i < 16; ++i) {
-      av1_highbd_idct16_c(input, outptr, bd);
-      input += 16;
-      outptr += 16;
-    }
-  }
-
-  if (optimised_cols) {
-    av1_idct16_sse2(inptr, inptr + 16);
-
-    // Final rounding and shift, then reconstruct and store
-    {
-      __m128i d[2];
-      for (i = 0; i < 16; i++) {
-        inptr[i] = _mm_add_epi16(inptr[i], rounding);
-        inptr[i + 16] = _mm_add_epi16(inptr[i + 16], rounding);
-        d[0] = _mm_loadu_si128((const __m128i *)(dest + stride * i));
-        d[1] = _mm_loadu_si128((const __m128i *)(dest + stride * i + 8));
-        inptr[i] = _mm_srai_epi16(inptr[i], 6);
-        inptr[i + 16] = _mm_srai_epi16(inptr[i + 16], 6);
-        d[0] = clamp_high_sse2(_mm_add_epi16(d[0], inptr[i]), bd);
-        d[1] = clamp_high_sse2(_mm_add_epi16(d[1], inptr[i + 16]), bd);
-        // Store
-        _mm_storeu_si128((__m128i *)(dest + stride * i), d[0]);
-        _mm_storeu_si128((__m128i *)(dest + stride * i + 8), d[1]);
-      }
-    }
-  } else {
-    // Run the un-optimised column transform
-    tran_low_t temp_in[16], temp_out[16];
-    for (i = 0; i < 16; ++i) {
-      for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i];
-      av1_highbd_idct16_c(temp_in, temp_out, bd);
-      for (j = 0; j < 16; ++j) {
-        dest[j * stride + i] = highbd_clip_pixel_add(
-            dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
-      }
-    }
-  }
-}
-
-void av1_highbd_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest8,
-                                      int stride, int bd) {
-  tran_low_t out[16 * 16] = { 0 };
-  tran_low_t *outptr = out;
-  int i, j, test;
-  __m128i inptr[32];
-  __m128i min_input, max_input, temp1, temp2, sign_bits;
-  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
-  const __m128i zero = _mm_set1_epi16(0);
-  const __m128i rounding = _mm_set1_epi16(32);
-  const __m128i max = _mm_set1_epi16(3155);
-  const __m128i min = _mm_set1_epi16(-3155);
-  int optimised_cols = 0;
-
-  // Load input into __m128i & pack to 16 bits
-  for (i = 0; i < 16; i++) {
-    temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i));
-    temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 4));
-    inptr[i] = _mm_packs_epi32(temp1, temp2);
-    temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 8));
-    temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 12));
-    inptr[i + 16] = _mm_packs_epi32(temp1, temp2);
-  }
-
-  // Find the min & max for the row transform
-  // Since all non-zero DCT coefficients are in the upper-left 4x4 area,
-  // we only need to consider the first 4 rows here.
-  max_input = _mm_max_epi16(inptr[0], inptr[1]);
-  min_input = _mm_min_epi16(inptr[0], inptr[1]);
-  for (i = 2; i < 4; i++) {
-    max_input = _mm_max_epi16(max_input, inptr[i]);
-    min_input = _mm_min_epi16(min_input, inptr[i]);
-  }
-  max_input = _mm_cmpgt_epi16(max_input, max);
-  min_input = _mm_cmplt_epi16(min_input, min);
-  temp1 = _mm_or_si128(max_input, min_input);
-  test = _mm_movemask_epi8(temp1);
-
-  if (!test) {
-    // Do the row transform (N.B. This transposes inptr)
-    av1_idct16_sse2(inptr, inptr + 16);
-
-    // Find the min & max for the column transform
-    // N.B. Only the first 4 cols contain non-zero coeffs
-    max_input = _mm_max_epi16(inptr[0], inptr[1]);
-    min_input = _mm_min_epi16(inptr[0], inptr[1]);
-    for (i = 2; i < 16; i++) {
-      max_input = _mm_max_epi16(max_input, inptr[i]);
-      min_input = _mm_min_epi16(min_input, inptr[i]);
-    }
-    max_input = _mm_cmpgt_epi16(max_input, max);
-    min_input = _mm_cmplt_epi16(min_input, min);
-    temp1 = _mm_or_si128(max_input, min_input);
-    test = _mm_movemask_epi8(temp1);
-
-    if (test) {
-      // Use the fact that only the first 4 rows contain non-zero coeffs
-      array_transpose_8x8(inptr, inptr);
-      array_transpose_8x8(inptr + 8, inptr + 16);
-      for (i = 0; i < 4; i++) {
-        sign_bits = _mm_cmplt_epi16(inptr[i], zero);
-        temp1 = _mm_unpacklo_epi16(inptr[i], sign_bits);
-        temp2 = _mm_unpackhi_epi16(inptr[i], sign_bits);
-        _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4)), temp1);
-        _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 1)), temp2);
-        sign_bits = _mm_cmplt_epi16(inptr[i + 16], zero);
-        temp1 = _mm_unpacklo_epi16(inptr[i + 16], sign_bits);
-        temp2 = _mm_unpackhi_epi16(inptr[i + 16], sign_bits);
-        _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 2)), temp1);
-        _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 3)), temp2);
-      }
-    } else {
-      // Set the flag to use the optimised column transform
-      optimised_cols = 1;
-    }
-  } else {
-    // Run the un-optimised row transform
-    for (i = 0; i < 4; ++i) {
-      av1_highbd_idct16_c(input, outptr, bd);
-      input += 16;
-      outptr += 16;
-    }
-  }
-
-  if (optimised_cols) {
-    av1_idct16_sse2(inptr, inptr + 16);
-
-    // Final rounding and shift, then reconstruct and store
-    {
-      __m128i d[2];
-      for (i = 0; i < 16; i++) {
-        inptr[i] = _mm_add_epi16(inptr[i], rounding);
-        inptr[i + 16] = _mm_add_epi16(inptr[i + 16], rounding);
-        d[0] = _mm_loadu_si128((const __m128i *)(dest + stride * i));
-        d[1] = _mm_loadu_si128((const __m128i *)(dest + stride * i + 8));
-        inptr[i] = _mm_srai_epi16(inptr[i], 6);
-        inptr[i + 16] = _mm_srai_epi16(inptr[i + 16], 6);
-        d[0] = clamp_high_sse2(_mm_add_epi16(d[0], inptr[i]), bd);
-        d[1] = clamp_high_sse2(_mm_add_epi16(d[1], inptr[i + 16]), bd);
-        // Store
-        _mm_storeu_si128((__m128i *)(dest + stride * i), d[0]);
-        _mm_storeu_si128((__m128i *)(dest + stride * i + 8), d[1]);
-      }
-    }
-  } else {
-    // Run the un-optimised column transform
-    tran_low_t temp_in[16], temp_out[16];
-    for (i = 0; i < 16; ++i) {
-      for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i];
-      av1_highbd_idct16_c(temp_in, temp_out, bd);
-      for (j = 0; j < 16; ++j) {
-        dest[j * stride + i] = highbd_clip_pixel_add(
-            dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
-      }
-    }
-  }
-}
-#endif  // CONFIG_AOM_HIGHBITDEPTH
diff --git a/av1/common/x86/av1_inv_txfm_sse2.h b/av1/common/x86/av1_inv_txfm_sse2.h
deleted file mode 100644
index 3aab34c..0000000
--- a/av1/common/x86/av1_inv_txfm_sse2.h
+++ /dev/null
@@ -1,185 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#ifndef AOM_DSP_X86_INV_TXFM_SSE2_H_
-#define AOM_DSP_X86_INV_TXFM_SSE2_H_
-
-#include <emmintrin.h>  // SSE2
-#include "./aom_config.h"
-#include "aom/aom_integer.h"
-#include "av1/common/av1_inv_txfm.h"
-
-// perform 8x8 transpose
-static INLINE void array_transpose_8x8(__m128i *in, __m128i *res) {
-  const __m128i tr0_0 = _mm_unpacklo_epi16(in[0], in[1]);
-  const __m128i tr0_1 = _mm_unpacklo_epi16(in[2], in[3]);
-  const __m128i tr0_2 = _mm_unpackhi_epi16(in[0], in[1]);
-  const __m128i tr0_3 = _mm_unpackhi_epi16(in[2], in[3]);
-  const __m128i tr0_4 = _mm_unpacklo_epi16(in[4], in[5]);
-  const __m128i tr0_5 = _mm_unpacklo_epi16(in[6], in[7]);
-  const __m128i tr0_6 = _mm_unpackhi_epi16(in[4], in[5]);
-  const __m128i tr0_7 = _mm_unpackhi_epi16(in[6], in[7]);
-
-  const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
-  const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_4, tr0_5);
-  const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
-  const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_4, tr0_5);
-  const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_2, tr0_3);
-  const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);
-  const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_2, tr0_3);
-  const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);
-
-  res[0] = _mm_unpacklo_epi64(tr1_0, tr1_1);
-  res[1] = _mm_unpackhi_epi64(tr1_0, tr1_1);
-  res[2] = _mm_unpacklo_epi64(tr1_2, tr1_3);
-  res[3] = _mm_unpackhi_epi64(tr1_2, tr1_3);
-  res[4] = _mm_unpacklo_epi64(tr1_4, tr1_5);
-  res[5] = _mm_unpackhi_epi64(tr1_4, tr1_5);
-  res[6] = _mm_unpacklo_epi64(tr1_6, tr1_7);
-  res[7] = _mm_unpackhi_epi64(tr1_6, tr1_7);
-}
-
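Note: the three rounds of unpacks above are the classic SIMD transpose — interleave at 16-bit, then 32-bit, then 64-bit granularity, so 24 shuffle ops replace 64 scalar moves. The scalar reference it matches (a sketch for clarity):

#include <stdint.h>

static void transpose_8x8_ref(const int16_t in[8][8], int16_t out[8][8]) {
  int r, c;
  for (r = 0; r < 8; ++r)
    for (c = 0; c < 8; ++c) out[c][r] = in[r][c];
}
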
-#define TRANSPOSE_8X4(in0, in1, in2, in3, out0, out1)   \
-  {                                                     \
-    const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \
-    const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \
-                                                        \
-    in0 = _mm_unpacklo_epi32(tr0_0, tr0_1); /* i1 i0 */ \
-    in1 = _mm_unpackhi_epi32(tr0_0, tr0_1); /* i3 i2 */ \
-  }
-
-static INLINE void array_transpose_4X8(__m128i *in, __m128i *out) {
-  const __m128i tr0_0 = _mm_unpacklo_epi16(in[0], in[1]);
-  const __m128i tr0_1 = _mm_unpacklo_epi16(in[2], in[3]);
-  const __m128i tr0_4 = _mm_unpacklo_epi16(in[4], in[5]);
-  const __m128i tr0_5 = _mm_unpacklo_epi16(in[6], in[7]);
-
-  const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
-  const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
-  const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5);
-  const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5);
-
-  out[0] = _mm_unpacklo_epi64(tr1_0, tr1_4);
-  out[1] = _mm_unpackhi_epi64(tr1_0, tr1_4);
-  out[2] = _mm_unpacklo_epi64(tr1_2, tr1_6);
-  out[3] = _mm_unpackhi_epi64(tr1_2, tr1_6);
-}
-
-static INLINE void array_transpose_16x16(__m128i *res0, __m128i *res1) {
-  __m128i tbuf[8];
-  array_transpose_8x8(res0, res0);
-  array_transpose_8x8(res1, tbuf);
-  array_transpose_8x8(res0 + 8, res1);
-  array_transpose_8x8(res1 + 8, res1 + 8);
-
-  res0[8] = tbuf[0];
-  res0[9] = tbuf[1];
-  res0[10] = tbuf[2];
-  res0[11] = tbuf[3];
-  res0[12] = tbuf[4];
-  res0[13] = tbuf[5];
-  res0[14] = tbuf[6];
-  res0[15] = tbuf[7];
-}
-
-static INLINE void load_buffer_8x16(const int16_t *input, __m128i *in) {
-  in[0] = _mm_load_si128((const __m128i *)(input + 0 * 16));
-  in[1] = _mm_load_si128((const __m128i *)(input + 1 * 16));
-  in[2] = _mm_load_si128((const __m128i *)(input + 2 * 16));
-  in[3] = _mm_load_si128((const __m128i *)(input + 3 * 16));
-  in[4] = _mm_load_si128((const __m128i *)(input + 4 * 16));
-  in[5] = _mm_load_si128((const __m128i *)(input + 5 * 16));
-  in[6] = _mm_load_si128((const __m128i *)(input + 6 * 16));
-  in[7] = _mm_load_si128((const __m128i *)(input + 7 * 16));
-
-  in[8] = _mm_load_si128((const __m128i *)(input + 8 * 16));
-  in[9] = _mm_load_si128((const __m128i *)(input + 9 * 16));
-  in[10] = _mm_load_si128((const __m128i *)(input + 10 * 16));
-  in[11] = _mm_load_si128((const __m128i *)(input + 11 * 16));
-  in[12] = _mm_load_si128((const __m128i *)(input + 12 * 16));
-  in[13] = _mm_load_si128((const __m128i *)(input + 13 * 16));
-  in[14] = _mm_load_si128((const __m128i *)(input + 14 * 16));
-  in[15] = _mm_load_si128((const __m128i *)(input + 15 * 16));
-}
-
-#define RECON_AND_STORE(dest, in_x)                  \
-  {                                                  \
-    __m128i d0 = _mm_loadl_epi64((__m128i *)(dest)); \
-    d0 = _mm_unpacklo_epi8(d0, zero);                \
-    d0 = _mm_add_epi16(in_x, d0);                    \
-    d0 = _mm_packus_epi16(d0, d0);                   \
-    _mm_storel_epi64((__m128i *)(dest), d0);         \
-  }
-
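Note: one hygiene point on RECON_AND_STORE — it reads a variable named zero from the enclosing scope rather than taking it as a parameter, so every caller (as write_buffer_8x16 below does) must define it. Minimal usage sketch with hypothetical buffers; `row` must already be rounded and shifted:

#include <emmintrin.h>
#include <stdint.h>

static void recon_one_row(uint8_t *dst /* 8 pixels */, __m128i row) {
  const __m128i zero = _mm_setzero_si128();  /* required by the macro */
  RECON_AND_STORE(dst, row);
}
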
-static INLINE void write_buffer_8x16(uint8_t *dest, __m128i *in, int stride) {
-  const __m128i final_rounding = _mm_set1_epi16(1 << 5);
-  const __m128i zero = _mm_setzero_si128();
-  // Final rounding and shift
-  in[0] = _mm_adds_epi16(in[0], final_rounding);
-  in[1] = _mm_adds_epi16(in[1], final_rounding);
-  in[2] = _mm_adds_epi16(in[2], final_rounding);
-  in[3] = _mm_adds_epi16(in[3], final_rounding);
-  in[4] = _mm_adds_epi16(in[4], final_rounding);
-  in[5] = _mm_adds_epi16(in[5], final_rounding);
-  in[6] = _mm_adds_epi16(in[6], final_rounding);
-  in[7] = _mm_adds_epi16(in[7], final_rounding);
-  in[8] = _mm_adds_epi16(in[8], final_rounding);
-  in[9] = _mm_adds_epi16(in[9], final_rounding);
-  in[10] = _mm_adds_epi16(in[10], final_rounding);
-  in[11] = _mm_adds_epi16(in[11], final_rounding);
-  in[12] = _mm_adds_epi16(in[12], final_rounding);
-  in[13] = _mm_adds_epi16(in[13], final_rounding);
-  in[14] = _mm_adds_epi16(in[14], final_rounding);
-  in[15] = _mm_adds_epi16(in[15], final_rounding);
-
-  in[0] = _mm_srai_epi16(in[0], 6);
-  in[1] = _mm_srai_epi16(in[1], 6);
-  in[2] = _mm_srai_epi16(in[2], 6);
-  in[3] = _mm_srai_epi16(in[3], 6);
-  in[4] = _mm_srai_epi16(in[4], 6);
-  in[5] = _mm_srai_epi16(in[5], 6);
-  in[6] = _mm_srai_epi16(in[6], 6);
-  in[7] = _mm_srai_epi16(in[7], 6);
-  in[8] = _mm_srai_epi16(in[8], 6);
-  in[9] = _mm_srai_epi16(in[9], 6);
-  in[10] = _mm_srai_epi16(in[10], 6);
-  in[11] = _mm_srai_epi16(in[11], 6);
-  in[12] = _mm_srai_epi16(in[12], 6);
-  in[13] = _mm_srai_epi16(in[13], 6);
-  in[14] = _mm_srai_epi16(in[14], 6);
-  in[15] = _mm_srai_epi16(in[15], 6);
-
-  RECON_AND_STORE(dest + 0 * stride, in[0]);
-  RECON_AND_STORE(dest + 1 * stride, in[1]);
-  RECON_AND_STORE(dest + 2 * stride, in[2]);
-  RECON_AND_STORE(dest + 3 * stride, in[3]);
-  RECON_AND_STORE(dest + 4 * stride, in[4]);
-  RECON_AND_STORE(dest + 5 * stride, in[5]);
-  RECON_AND_STORE(dest + 6 * stride, in[6]);
-  RECON_AND_STORE(dest + 7 * stride, in[7]);
-  RECON_AND_STORE(dest + 8 * stride, in[8]);
-  RECON_AND_STORE(dest + 9 * stride, in[9]);
-  RECON_AND_STORE(dest + 10 * stride, in[10]);
-  RECON_AND_STORE(dest + 11 * stride, in[11]);
-  RECON_AND_STORE(dest + 12 * stride, in[12]);
-  RECON_AND_STORE(dest + 13 * stride, in[13]);
-  RECON_AND_STORE(dest + 14 * stride, in[14]);
-  RECON_AND_STORE(dest + 15 * stride, in[15]);
-}
-
-void idct4_sse2(__m128i *in);
-void idct8_sse2(__m128i *in);
-void idct16_sse2(__m128i *in0, __m128i *in1);
-void iadst4_sse2(__m128i *in);
-void iadst8_sse2(__m128i *in);
-void iadst16_sse2(__m128i *in0, __m128i *in1);
-
-#endif  // AOM_DSP_X86_INV_TXFM_SSE2_H_
diff --git a/av1/common/x86/hybrid_inv_txfm_avx2.c b/av1/common/x86/hybrid_inv_txfm_avx2.c
new file mode 100644
index 0000000..754152c
--- /dev/null
+++ b/av1/common/x86/hybrid_inv_txfm_avx2.c
@@ -0,0 +1,503 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <immintrin.h>  // avx2
+
+#include "./aom_config.h"
+#include "./av1_rtcd.h"
+
+#include "aom_dsp/x86/txfm_common_avx2.h"
+
+static INLINE void load_coeff(const tran_low_t *coeff, __m256i *in) {
+#if CONFIG_AOM_HIGHBITDEPTH
+  *in = _mm256_setr_epi16(
+      (int16_t)coeff[0], (int16_t)coeff[1], (int16_t)coeff[2],
+      (int16_t)coeff[3], (int16_t)coeff[4], (int16_t)coeff[5],
+      (int16_t)coeff[6], (int16_t)coeff[7], (int16_t)coeff[8],
+      (int16_t)coeff[9], (int16_t)coeff[10], (int16_t)coeff[11],
+      (int16_t)coeff[12], (int16_t)coeff[13], (int16_t)coeff[14],
+      (int16_t)coeff[15]);
+#else
+  *in = _mm256_loadu_si256((const __m256i *)coeff);
+#endif
+}
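+// Note: with CONFIG_AOM_HIGHBITDEPTH, tran_low_t is 32 bits wide, so the
+// coefficients are narrowed lane by lane; otherwise a single 256-bit load
+// picks up all sixteen 16-bit coefficients at once.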
+
+static void load_buffer_16x16(const tran_low_t *coeff, __m256i *in) {
+  int i;
+  for (i = 0; i < 16; ++i) {
+    load_coeff(coeff + (i << 4), &in[i]);
+  }
+}
+
+static void recon_and_store(const __m256i *res, uint8_t *output) {
+  const __m128i zero = _mm_setzero_si128();
+  __m128i x = _mm_loadu_si128((__m128i const *)output);
+  __m128i p0 = _mm_unpacklo_epi8(x, zero);
+  __m128i p1 = _mm_unpackhi_epi8(x, zero);
+
+  p0 = _mm_add_epi16(p0, _mm256_castsi256_si128(*res));
+  p1 = _mm_add_epi16(p1, _mm256_extractf128_si256(*res, 1));
+  x = _mm_packus_epi16(p0, p1);
+  _mm_storeu_si128((__m128i *)output, x);
+}
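+// recon_and_store adds one row of sixteen 16-bit residuals to the
+// prediction pixels at `output`, clamping to [0, 255] through the
+// saturating pack before writing back.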
+
+#define IDCT_ROUNDING_POS (6)
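+// The 2-D inverse transform leaves a scale factor of 2^6 on the residual,
+// which the write-out removes with ROUND_POWER_OF_TWO(x, 6), i.e.
+// (x + 32) >> 6.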
+
+static void write_buffer_16x16(__m256i *in, const int stride, uint8_t *output) {
+  const __m256i rounding = _mm256_set1_epi16(1 << (IDCT_ROUNDING_POS - 1));
+  int i;
+
+  for (i = 0; i < 16; ++i) {
+    in[i] = _mm256_add_epi16(in[i], rounding);
+    in[i] = _mm256_srai_epi16(in[i], IDCT_ROUNDING_POS);
+    recon_and_store(&in[i], output + i * stride);
+  }
+}
+
+static INLINE void unpack_butter_fly(const __m256i *a0, const __m256i *a1,
+                                     const __m256i *c0, const __m256i *c1,
+                                     __m256i *b0, __m256i *b1) {
+  __m256i x0, x1;
+  x0 = _mm256_unpacklo_epi16(*a0, *a1);
+  x1 = _mm256_unpackhi_epi16(*a0, *a1);
+  *b0 = butter_fly(x0, x1, *c0);
+  *b1 = butter_fly(x0, x1, *c1);
+}
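+// unpack_butter_fly interleaves two input rows and applies butter_fly()
+// (from txfm_common_avx2.h), which is assumed to form the rounded dot
+// products (a * c_even + b * c_odd + DCT_CONST_ROUNDING) >> DCT_CONST_BITS.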
+
+static void idct16_avx2(__m256i *in) {
+  const __m256i cospi_p30_m02 = pair256_set_epi16(cospi_30_64, -cospi_2_64);
+  const __m256i cospi_p02_p30 = pair256_set_epi16(cospi_2_64, cospi_30_64);
+  const __m256i cospi_p14_m18 = pair256_set_epi16(cospi_14_64, -cospi_18_64);
+  const __m256i cospi_p18_p14 = pair256_set_epi16(cospi_18_64, cospi_14_64);
+  const __m256i cospi_p22_m10 = pair256_set_epi16(cospi_22_64, -cospi_10_64);
+  const __m256i cospi_p10_p22 = pair256_set_epi16(cospi_10_64, cospi_22_64);
+  const __m256i cospi_p06_m26 = pair256_set_epi16(cospi_6_64, -cospi_26_64);
+  const __m256i cospi_p26_p06 = pair256_set_epi16(cospi_26_64, cospi_6_64);
+  const __m256i cospi_p28_m04 = pair256_set_epi16(cospi_28_64, -cospi_4_64);
+  const __m256i cospi_p04_p28 = pair256_set_epi16(cospi_4_64, cospi_28_64);
+  const __m256i cospi_p12_m20 = pair256_set_epi16(cospi_12_64, -cospi_20_64);
+  const __m256i cospi_p20_p12 = pair256_set_epi16(cospi_20_64, cospi_12_64);
+  const __m256i cospi_p16_p16 = _mm256_set1_epi16((int16_t)cospi_16_64);
+  const __m256i cospi_p16_m16 = pair256_set_epi16(cospi_16_64, -cospi_16_64);
+  const __m256i cospi_p24_m08 = pair256_set_epi16(cospi_24_64, -cospi_8_64);
+  const __m256i cospi_p08_p24 = pair256_set_epi16(cospi_8_64, cospi_24_64);
+  const __m256i cospi_m08_p24 = pair256_set_epi16(-cospi_8_64, cospi_24_64);
+  const __m256i cospi_p24_p08 = pair256_set_epi16(cospi_24_64, cospi_8_64);
+  const __m256i cospi_m24_m08 = pair256_set_epi16(-cospi_24_64, -cospi_8_64);
+  __m256i u0, u1, u2, u3, u4, u5, u6, u7;
+  __m256i v0, v1, v2, v3, v4, v5, v6, v7;
+  __m256i t0, t1, t2, t3, t4, t5, t6, t7;
+
+  // stage 1, (0-7)
+  u0 = in[0];
+  u1 = in[8];
+  u2 = in[4];
+  u3 = in[12];
+  u4 = in[2];
+  u5 = in[10];
+  u6 = in[6];
+  u7 = in[14];
+
+  // stage 2, (0-7)
+  // stage 3, (0-7)
+  t0 = u0;
+  t1 = u1;
+  t2 = u2;
+  t3 = u3;
+  unpack_butter_fly(&u4, &u7, &cospi_p28_m04, &cospi_p04_p28, &t4, &t7);
+  unpack_butter_fly(&u5, &u6, &cospi_p12_m20, &cospi_p20_p12, &t5, &t6);
+
+  // stage 4, (0-7)
+  unpack_butter_fly(&t0, &t1, &cospi_p16_p16, &cospi_p16_m16, &u0, &u1);
+  unpack_butter_fly(&t2, &t3, &cospi_p24_m08, &cospi_p08_p24, &u2, &u3);
+  u4 = _mm256_add_epi16(t4, t5);
+  u5 = _mm256_sub_epi16(t4, t5);
+  u6 = _mm256_sub_epi16(t7, t6);
+  u7 = _mm256_add_epi16(t7, t6);
+
+  // stage 5, (0-7)
+  t0 = _mm256_add_epi16(u0, u3);
+  t1 = _mm256_add_epi16(u1, u2);
+  t2 = _mm256_sub_epi16(u1, u2);
+  t3 = _mm256_sub_epi16(u0, u3);
+  t4 = u4;
+  t7 = u7;
+  unpack_butter_fly(&u6, &u5, &cospi_p16_m16, &cospi_p16_p16, &t5, &t6);
+
+  // stage 6, (0-7)
+  u0 = _mm256_add_epi16(t0, t7);
+  u1 = _mm256_add_epi16(t1, t6);
+  u2 = _mm256_add_epi16(t2, t5);
+  u3 = _mm256_add_epi16(t3, t4);
+  u4 = _mm256_sub_epi16(t3, t4);
+  u5 = _mm256_sub_epi16(t2, t5);
+  u6 = _mm256_sub_epi16(t1, t6);
+  u7 = _mm256_sub_epi16(t0, t7);
+
+  // stage 1, (8-15)
+  v0 = in[1];
+  v1 = in[9];
+  v2 = in[5];
+  v3 = in[13];
+  v4 = in[3];
+  v5 = in[11];
+  v6 = in[7];
+  v7 = in[15];
+
+  // stage 2, (8-15)
+  unpack_butter_fly(&v0, &v7, &cospi_p30_m02, &cospi_p02_p30, &t0, &t7);
+  unpack_butter_fly(&v1, &v6, &cospi_p14_m18, &cospi_p18_p14, &t1, &t6);
+  unpack_butter_fly(&v2, &v5, &cospi_p22_m10, &cospi_p10_p22, &t2, &t5);
+  unpack_butter_fly(&v3, &v4, &cospi_p06_m26, &cospi_p26_p06, &t3, &t4);
+
+  // stage 3, (8-15)
+  v0 = _mm256_add_epi16(t0, t1);
+  v1 = _mm256_sub_epi16(t0, t1);
+  v2 = _mm256_sub_epi16(t3, t2);
+  v3 = _mm256_add_epi16(t2, t3);
+  v4 = _mm256_add_epi16(t4, t5);
+  v5 = _mm256_sub_epi16(t4, t5);
+  v6 = _mm256_sub_epi16(t7, t6);
+  v7 = _mm256_add_epi16(t6, t7);
+
+  // stage 4, (8-15)
+  t0 = v0;
+  t7 = v7;
+  t3 = v3;
+  t4 = v4;
+  unpack_butter_fly(&v1, &v6, &cospi_m08_p24, &cospi_p24_p08, &t1, &t6);
+  unpack_butter_fly(&v2, &v5, &cospi_m24_m08, &cospi_m08_p24, &t2, &t5);
+
+  // stage 5, (8-15)
+  v0 = _mm256_add_epi16(t0, t3);
+  v1 = _mm256_add_epi16(t1, t2);
+  v2 = _mm256_sub_epi16(t1, t2);
+  v3 = _mm256_sub_epi16(t0, t3);
+  v4 = _mm256_sub_epi16(t7, t4);
+  v5 = _mm256_sub_epi16(t6, t5);
+  v6 = _mm256_add_epi16(t6, t5);
+  v7 = _mm256_add_epi16(t7, t4);
+
+  // stage 6, (8-15)
+  t0 = v0;
+  t1 = v1;
+  t6 = v6;
+  t7 = v7;
+  unpack_butter_fly(&v5, &v2, &cospi_p16_m16, &cospi_p16_p16, &t2, &t5);
+  unpack_butter_fly(&v4, &v3, &cospi_p16_m16, &cospi_p16_p16, &t3, &t4);
+
+  // stage 7
+  in[0] = _mm256_add_epi16(u0, t7);
+  in[1] = _mm256_add_epi16(u1, t6);
+  in[2] = _mm256_add_epi16(u2, t5);
+  in[3] = _mm256_add_epi16(u3, t4);
+  in[4] = _mm256_add_epi16(u4, t3);
+  in[5] = _mm256_add_epi16(u5, t2);
+  in[6] = _mm256_add_epi16(u6, t1);
+  in[7] = _mm256_add_epi16(u7, t0);
+  in[8] = _mm256_sub_epi16(u7, t0);
+  in[9] = _mm256_sub_epi16(u6, t1);
+  in[10] = _mm256_sub_epi16(u5, t2);
+  in[11] = _mm256_sub_epi16(u4, t3);
+  in[12] = _mm256_sub_epi16(u3, t4);
+  in[13] = _mm256_sub_epi16(u2, t5);
+  in[14] = _mm256_sub_epi16(u1, t6);
+  in[15] = _mm256_sub_epi16(u0, t7);
+}
+
+static void idct16(__m256i *in) {
+  mm256_transpose_16x16(in);
+  idct16_avx2(in);
+}
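+// Each 1-D wrapper transposes first, so running two of them back to back
+// (as in av1_iht16x16_256_add_avx2 below) yields the separable 2-D inverse.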
+
+static INLINE void butterfly_32b(const __m256i *a0, const __m256i *a1,
+                                 const __m256i *c0, const __m256i *c1,
+                                 __m256i *b) {
+  __m256i x0, x1;
+  x0 = _mm256_unpacklo_epi16(*a0, *a1);
+  x1 = _mm256_unpackhi_epi16(*a0, *a1);
+  b[0] = _mm256_madd_epi16(x0, *c0);
+  b[1] = _mm256_madd_epi16(x1, *c0);
+  b[2] = _mm256_madd_epi16(x0, *c1);
+  b[3] = _mm256_madd_epi16(x1, *c1);
+}
+
+static INLINE void group_rounding(__m256i *a, int num) {
+  const __m256i dct_rounding = _mm256_set1_epi32(DCT_CONST_ROUNDING);
+  int i;
+  for (i = 0; i < num; ++i) {
+    a[i] = _mm256_add_epi32(a[i], dct_rounding);
+    a[i] = _mm256_srai_epi32(a[i], DCT_CONST_BITS);
+  }
+}
+
+static INLINE void add_rnd(const __m256i *a, const __m256i *b, __m256i *out) {
+  __m256i x[4];
+  x[0] = _mm256_add_epi32(a[0], b[0]);
+  x[1] = _mm256_add_epi32(a[1], b[1]);
+  x[2] = _mm256_add_epi32(a[2], b[2]);
+  x[3] = _mm256_add_epi32(a[3], b[3]);
+
+  group_rounding(x, 4);
+
+  out[0] = _mm256_packs_epi32(x[0], x[1]);
+  out[1] = _mm256_packs_epi32(x[2], x[3]);
+}
+
+static INLINE void sub_rnd(const __m256i *a, const __m256i *b, __m256i *out) {
+  __m256i x[4];
+  x[0] = _mm256_sub_epi32(a[0], b[0]);
+  x[1] = _mm256_sub_epi32(a[1], b[1]);
+  x[2] = _mm256_sub_epi32(a[2], b[2]);
+  x[3] = _mm256_sub_epi32(a[3], b[3]);
+
+  group_rounding(x, 4);
+
+  out[0] = _mm256_packs_epi32(x[0], x[1]);
+  out[1] = _mm256_packs_epi32(x[2], x[3]);
+}
+
+static INLINE void butterfly_rnd(__m256i *a, __m256i *out) {
+  group_rounding(a, 4);
+  out[0] = _mm256_packs_epi32(a[0], a[1]);
+  out[1] = _mm256_packs_epi32(a[2], a[3]);
+}
+
+static void iadst16_avx2(__m256i *in) {
+  const __m256i cospi_p01_p31 = pair256_set_epi16(cospi_1_64, cospi_31_64);
+  const __m256i cospi_p31_m01 = pair256_set_epi16(cospi_31_64, -cospi_1_64);
+  const __m256i cospi_p05_p27 = pair256_set_epi16(cospi_5_64, cospi_27_64);
+  const __m256i cospi_p27_m05 = pair256_set_epi16(cospi_27_64, -cospi_5_64);
+  const __m256i cospi_p09_p23 = pair256_set_epi16(cospi_9_64, cospi_23_64);
+  const __m256i cospi_p23_m09 = pair256_set_epi16(cospi_23_64, -cospi_9_64);
+  const __m256i cospi_p13_p19 = pair256_set_epi16(cospi_13_64, cospi_19_64);
+  const __m256i cospi_p19_m13 = pair256_set_epi16(cospi_19_64, -cospi_13_64);
+  const __m256i cospi_p17_p15 = pair256_set_epi16(cospi_17_64, cospi_15_64);
+  const __m256i cospi_p15_m17 = pair256_set_epi16(cospi_15_64, -cospi_17_64);
+  const __m256i cospi_p21_p11 = pair256_set_epi16(cospi_21_64, cospi_11_64);
+  const __m256i cospi_p11_m21 = pair256_set_epi16(cospi_11_64, -cospi_21_64);
+  const __m256i cospi_p25_p07 = pair256_set_epi16(cospi_25_64, cospi_7_64);
+  const __m256i cospi_p07_m25 = pair256_set_epi16(cospi_7_64, -cospi_25_64);
+  const __m256i cospi_p29_p03 = pair256_set_epi16(cospi_29_64, cospi_3_64);
+  const __m256i cospi_p03_m29 = pair256_set_epi16(cospi_3_64, -cospi_29_64);
+  const __m256i cospi_p04_p28 = pair256_set_epi16(cospi_4_64, cospi_28_64);
+  const __m256i cospi_p28_m04 = pair256_set_epi16(cospi_28_64, -cospi_4_64);
+  const __m256i cospi_p20_p12 = pair256_set_epi16(cospi_20_64, cospi_12_64);
+  const __m256i cospi_p12_m20 = pair256_set_epi16(cospi_12_64, -cospi_20_64);
+  const __m256i cospi_m28_p04 = pair256_set_epi16(-cospi_28_64, cospi_4_64);
+  const __m256i cospi_m12_p20 = pair256_set_epi16(-cospi_12_64, cospi_20_64);
+  const __m256i cospi_p08_p24 = pair256_set_epi16(cospi_8_64, cospi_24_64);
+  const __m256i cospi_p24_m08 = pair256_set_epi16(cospi_24_64, -cospi_8_64);
+  const __m256i cospi_m24_p08 = pair256_set_epi16(-cospi_24_64, cospi_8_64);
+  const __m256i cospi_m16_m16 = _mm256_set1_epi16((int16_t)-cospi_16_64);
+  const __m256i cospi_p16_p16 = _mm256_set1_epi16((int16_t)cospi_16_64);
+  const __m256i cospi_p16_m16 = pair256_set_epi16(cospi_16_64, -cospi_16_64);
+  const __m256i cospi_m16_p16 = pair256_set_epi16(-cospi_16_64, cospi_16_64);
+  const __m256i zero = _mm256_setzero_si256();
+  __m256i x[16], s[16];
+  __m256i u[4], v[4];
+
+  // stage 1
+  butterfly_32b(&in[15], &in[0], &cospi_p01_p31, &cospi_p31_m01, u);
+  butterfly_32b(&in[7], &in[8], &cospi_p17_p15, &cospi_p15_m17, v);
+  add_rnd(u, v, &x[0]);
+  sub_rnd(u, v, &x[8]);
+
+  butterfly_32b(&in[13], &in[2], &cospi_p05_p27, &cospi_p27_m05, u);
+  butterfly_32b(&in[5], &in[10], &cospi_p21_p11, &cospi_p11_m21, v);
+  add_rnd(u, v, &x[2]);
+  sub_rnd(u, v, &x[10]);
+
+  butterfly_32b(&in[11], &in[4], &cospi_p09_p23, &cospi_p23_m09, u);
+  butterfly_32b(&in[3], &in[12], &cospi_p25_p07, &cospi_p07_m25, v);
+  add_rnd(u, v, &x[4]);
+  sub_rnd(u, v, &x[12]);
+
+  butterfly_32b(&in[9], &in[6], &cospi_p13_p19, &cospi_p19_m13, u);
+  butterfly_32b(&in[1], &in[14], &cospi_p29_p03, &cospi_p03_m29, v);
+  add_rnd(u, v, &x[6]);
+  sub_rnd(u, v, &x[14]);
+
+  // stage 2
+  s[0] = _mm256_add_epi16(x[0], x[4]);
+  s[1] = _mm256_add_epi16(x[1], x[5]);
+  s[2] = _mm256_add_epi16(x[2], x[6]);
+  s[3] = _mm256_add_epi16(x[3], x[7]);
+  s[4] = _mm256_sub_epi16(x[0], x[4]);
+  s[5] = _mm256_sub_epi16(x[1], x[5]);
+  s[6] = _mm256_sub_epi16(x[2], x[6]);
+  s[7] = _mm256_sub_epi16(x[3], x[7]);
+  butterfly_32b(&x[8], &x[9], &cospi_p04_p28, &cospi_p28_m04, u);
+  butterfly_32b(&x[12], &x[13], &cospi_m28_p04, &cospi_p04_p28, v);
+  add_rnd(u, v, &s[8]);
+  sub_rnd(u, v, &s[12]);
+
+  butterfly_32b(&x[10], &x[11], &cospi_p20_p12, &cospi_p12_m20, u);
+  butterfly_32b(&x[14], &x[15], &cospi_m12_p20, &cospi_p20_p12, v);
+  add_rnd(u, v, &s[10]);
+  sub_rnd(u, v, &s[14]);
+
+  // stage 3
+  x[0] = _mm256_add_epi16(s[0], s[2]);
+  x[1] = _mm256_add_epi16(s[1], s[3]);
+  x[2] = _mm256_sub_epi16(s[0], s[2]);
+  x[3] = _mm256_sub_epi16(s[1], s[3]);
+
+  x[8] = _mm256_add_epi16(s[8], s[10]);
+  x[9] = _mm256_add_epi16(s[9], s[11]);
+  x[10] = _mm256_sub_epi16(s[8], s[10]);
+  x[11] = _mm256_sub_epi16(s[9], s[11]);
+
+  butterfly_32b(&s[4], &s[5], &cospi_p08_p24, &cospi_p24_m08, u);
+  butterfly_32b(&s[6], &s[7], &cospi_m24_p08, &cospi_p08_p24, v);
+  add_rnd(u, v, &x[4]);
+  sub_rnd(u, v, &x[6]);
+
+  butterfly_32b(&s[12], &s[13], &cospi_p08_p24, &cospi_p24_m08, u);
+  butterfly_32b(&s[14], &s[15], &cospi_m24_p08, &cospi_p08_p24, v);
+  add_rnd(u, v, &x[12]);
+  sub_rnd(u, v, &x[14]);
+
+  // stage 4
+  butterfly_32b(&x[2], &x[3], &cospi_m16_m16, &cospi_p16_m16, u);
+  butterfly_32b(&x[6], &x[7], &cospi_p16_p16, &cospi_m16_p16, v);
+  butterfly_rnd(u, &x[2]);
+  butterfly_rnd(v, &x[6]);
+
+  butterfly_32b(&x[10], &x[11], &cospi_p16_p16, &cospi_m16_p16, u);
+  butterfly_32b(&x[14], &x[15], &cospi_m16_m16, &cospi_p16_m16, v);
+  butterfly_rnd(u, &x[10]);
+  butterfly_rnd(v, &x[14]);
+
+  in[0] = x[0];
+  in[1] = _mm256_sub_epi16(zero, x[8]);
+  in[2] = x[12];
+  in[3] = _mm256_sub_epi16(zero, x[4]);
+  in[4] = x[6];
+  in[5] = x[14];
+  in[6] = x[10];
+  in[7] = x[2];
+  in[8] = x[3];
+  in[9] = x[11];
+  in[10] = x[15];
+  in[11] = x[7];
+  in[12] = x[5];
+  in[13] = _mm256_sub_epi16(zero, x[13]);
+  in[14] = x[9];
+  in[15] = _mm256_sub_epi16(zero, x[1]);
+}
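+// The reordering and sign flips above map the stage-4 results to the
+// iadst16 output order; negation is expressed as (0 - x) on 16-bit lanes.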
+
+static void iadst16(__m256i *in) {
+  mm256_transpose_16x16(in);
+  iadst16_avx2(in);
+}
+
+#if CONFIG_EXT_TX
+static void flip_row(__m256i *in, int rows) {
+  int i;
+  for (i = 0; i < rows; ++i) {
+    mm256_reverse_epi16(&in[i]);
+  }
+}
+
+static void flip_col(uint8_t **dest, int *stride, int rows) {
+  *dest = *dest + (rows - 1) * (*stride);
+  *stride = -*stride;
+}
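+// flip_col mirrors the destination vertically: writing starts at the last
+// row with a negated stride, so the FLIPADST cases can reuse the normal
+// write-out path.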
+
+static void iidtx16(__m256i *in) {
+  mm256_transpose_16x16(in);
+  txfm_scaling16_avx2(Sqrt2, in);
+}
+#endif  // CONFIG_EXT_TX
+
+void av1_iht16x16_256_add_avx2(const tran_low_t *input, uint8_t *dest,
+                               int stride, int tx_type) {
+  __m256i in[16];
+
+  load_buffer_16x16(input, in);
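+  // The two 1-D passes below invert the separable transform named by
+  // tx_type; FLIP variants reuse the ADST kernel and mirror the rows
+  // (flip_row) or the destination (flip_col) afterwards.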
+  switch (tx_type) {
+    case DCT_DCT:
+      idct16(in);
+      idct16(in);
+      break;
+    case ADST_DCT:
+      idct16(in);
+      iadst16(in);
+      break;
+    case DCT_ADST:
+      iadst16(in);
+      idct16(in);
+      break;
+    case ADST_ADST:
+      iadst16(in);
+      iadst16(in);
+      break;
+#if CONFIG_EXT_TX
+    case FLIPADST_DCT:
+      idct16(in);
+      iadst16(in);
+      flip_col(&dest, &stride, 16);
+      break;
+    case DCT_FLIPADST:
+      iadst16(in);
+      idct16(in);
+      flip_row(in, 16);
+      break;
+    case FLIPADST_FLIPADST:
+      iadst16(in);
+      iadst16(in);
+      flip_row(in, 16);
+      flip_col(&dest, &stride, 16);
+      break;
+    case ADST_FLIPADST:
+      iadst16(in);
+      iadst16(in);
+      flip_row(in, 16);
+      break;
+    case FLIPADST_ADST:
+      iadst16(in);
+      iadst16(in);
+      flip_col(&dest, &stride, 16);
+      break;
+    case V_DCT:
+      iidtx16(in);
+      idct16(in);
+      break;
+    case H_DCT:
+      idct16(in);
+      iidtx16(in);
+      break;
+    case V_ADST:
+      iidtx16(in);
+      iadst16(in);
+      break;
+    case H_ADST:
+      iadst16(in);
+      iidtx16(in);
+      break;
+    case V_FLIPADST:
+      iidtx16(in);
+      iadst16(in);
+      flip_col(&dest, &stride, 16);
+      break;
+    case H_FLIPADST:
+      iadst16(in);
+      iidtx16(in);
+      flip_row(in, 16);
+      break;
+#endif  // CONFIG_EXT_TX
+    default: assert(0); break;
+  }
+  write_buffer_16x16(in, stride, dest);
+}
diff --git a/av1/common/x86/idct_intrin_sse2.c b/av1/common/x86/idct_intrin_sse2.c
index 27cd756..a6b6e1e 100644
--- a/av1/common/x86/idct_intrin_sse2.c
+++ b/av1/common/x86/idct_intrin_sse2.c
@@ -69,46 +69,46 @@
 
   switch (tx_type) {
     case DCT_DCT:
-      idct4_sse2(in);
-      idct4_sse2(in);
+      aom_idct4_sse2(in);
+      aom_idct4_sse2(in);
       break;
     case ADST_DCT:
-      idct4_sse2(in);
-      iadst4_sse2(in);
+      aom_idct4_sse2(in);
+      aom_iadst4_sse2(in);
       break;
     case DCT_ADST:
-      iadst4_sse2(in);
-      idct4_sse2(in);
+      aom_iadst4_sse2(in);
+      aom_idct4_sse2(in);
       break;
     case ADST_ADST:
-      iadst4_sse2(in);
-      iadst4_sse2(in);
+      aom_iadst4_sse2(in);
+      aom_iadst4_sse2(in);
       break;
 #if CONFIG_EXT_TX
     case FLIPADST_DCT:
-      idct4_sse2(in);
-      iadst4_sse2(in);
+      aom_idct4_sse2(in);
+      aom_iadst4_sse2(in);
       FLIPUD_PTR(dest, stride, 4);
       break;
     case DCT_FLIPADST:
-      iadst4_sse2(in);
-      idct4_sse2(in);
+      aom_iadst4_sse2(in);
+      aom_idct4_sse2(in);
       fliplr_4x4(in);
       break;
     case FLIPADST_FLIPADST:
-      iadst4_sse2(in);
-      iadst4_sse2(in);
+      aom_iadst4_sse2(in);
+      aom_iadst4_sse2(in);
       FLIPUD_PTR(dest, stride, 4);
       fliplr_4x4(in);
       break;
     case ADST_FLIPADST:
-      iadst4_sse2(in);
-      iadst4_sse2(in);
+      aom_iadst4_sse2(in);
+      aom_iadst4_sse2(in);
       fliplr_4x4(in);
       break;
     case FLIPADST_ADST:
-      iadst4_sse2(in);
-      iadst4_sse2(in);
+      aom_iadst4_sse2(in);
+      aom_iadst4_sse2(in);
       FLIPUD_PTR(dest, stride, 4);
       break;
 #endif  // CONFIG_EXT_TX
@@ -167,46 +167,46 @@
 
   switch (tx_type) {
     case DCT_DCT:
-      idct8_sse2(in);
-      idct8_sse2(in);
+      aom_idct8_sse2(in);
+      aom_idct8_sse2(in);
       break;
     case ADST_DCT:
-      idct8_sse2(in);
-      iadst8_sse2(in);
+      aom_idct8_sse2(in);
+      aom_iadst8_sse2(in);
       break;
     case DCT_ADST:
-      iadst8_sse2(in);
-      idct8_sse2(in);
+      aom_iadst8_sse2(in);
+      aom_idct8_sse2(in);
       break;
     case ADST_ADST:
-      iadst8_sse2(in);
-      iadst8_sse2(in);
+      aom_iadst8_sse2(in);
+      aom_iadst8_sse2(in);
       break;
 #if CONFIG_EXT_TX
     case FLIPADST_DCT:
-      idct8_sse2(in);
-      iadst8_sse2(in);
+      aom_idct8_sse2(in);
+      aom_iadst8_sse2(in);
       FLIPUD_PTR(dest, stride, 8);
       break;
     case DCT_FLIPADST:
-      iadst8_sse2(in);
-      idct8_sse2(in);
+      aom_iadst8_sse2(in);
+      aom_idct8_sse2(in);
       fliplr_8x8(in);
       break;
     case FLIPADST_FLIPADST:
-      iadst8_sse2(in);
-      iadst8_sse2(in);
+      aom_iadst8_sse2(in);
+      aom_iadst8_sse2(in);
       FLIPUD_PTR(dest, stride, 8);
       fliplr_8x8(in);
       break;
     case ADST_FLIPADST:
-      iadst8_sse2(in);
-      iadst8_sse2(in);
+      aom_iadst8_sse2(in);
+      aom_iadst8_sse2(in);
       fliplr_8x8(in);
       break;
     case FLIPADST_ADST:
-      iadst8_sse2(in);
-      iadst8_sse2(in);
+      aom_iadst8_sse2(in);
+      aom_iadst8_sse2(in);
       FLIPUD_PTR(dest, stride, 8);
       break;
 #endif  // CONFIG_EXT_TX
@@ -242,69 +242,6 @@
   RECON_AND_STORE(dest + 7 * stride, in[7]);
 }
 
-void av1_iht16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest,
-                               int stride, int tx_type) {
-  __m128i in[32];
-  __m128i *in0 = &in[0];
-  __m128i *in1 = &in[16];
-
-  load_buffer_8x16(input, in0);
-  input += 8;
-  load_buffer_8x16(input, in1);
-
-  switch (tx_type) {
-    case DCT_DCT:
-      idct16_sse2(in0, in1);
-      idct16_sse2(in0, in1);
-      break;
-    case ADST_DCT:
-      idct16_sse2(in0, in1);
-      iadst16_sse2(in0, in1);
-      break;
-    case DCT_ADST:
-      iadst16_sse2(in0, in1);
-      idct16_sse2(in0, in1);
-      break;
-    case ADST_ADST:
-      iadst16_sse2(in0, in1);
-      iadst16_sse2(in0, in1);
-      break;
-#if CONFIG_EXT_TX
-    case FLIPADST_DCT:
-      idct16_sse2(in0, in1);
-      iadst16_sse2(in0, in1);
-      FLIPUD_PTR(dest, stride, 16);
-      break;
-    case DCT_FLIPADST:
-      iadst16_sse2(in0, in1);
-      idct16_sse2(in0, in1);
-      FLIPLR_16x16(in0, in1);
-      break;
-    case FLIPADST_FLIPADST:
-      iadst16_sse2(in0, in1);
-      iadst16_sse2(in0, in1);
-      FLIPUD_PTR(dest, stride, 16);
-      FLIPLR_16x16(in0, in1);
-      break;
-    case ADST_FLIPADST:
-      iadst16_sse2(in0, in1);
-      iadst16_sse2(in0, in1);
-      FLIPLR_16x16(in0, in1);
-      break;
-    case FLIPADST_ADST:
-      iadst16_sse2(in0, in1);
-      iadst16_sse2(in0, in1);
-      FLIPUD_PTR(dest, stride, 16);
-      break;
-#endif  // CONFIG_EXT_TX
-    default: assert(0); break;
-  }
-
-  write_buffer_8x16(dest, in0, stride);
-  dest += 8;
-  write_buffer_8x16(dest, in1, stride);
-}
-
 #if CONFIG_EXT_TX
 static void iidtx16_8col(__m128i *in) {
   const __m128i k__zero_epi16 = _mm_set1_epi16((int16_t)0);
@@ -501,7 +438,98 @@
   iidtx16_8col(in0);
   iidtx16_8col(in1);
 }
+#endif  // CONFIG_EXT_TX
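+// av1_iht16x16_256_add_sse2 moves below iidtx16_sse2 so that the new
+// EXT_TX identity (V_*/H_*) cases can call it.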
 
+void av1_iht16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest,
+                               int stride, int tx_type) {
+  __m128i in[32];
+  __m128i *in0 = &in[0];
+  __m128i *in1 = &in[16];
+
+  load_buffer_8x16(input, in0);
+  input += 8;
+  load_buffer_8x16(input, in1);
+
+  switch (tx_type) {
+    case DCT_DCT:
+      aom_idct16_sse2(in0, in1);
+      aom_idct16_sse2(in0, in1);
+      break;
+    case ADST_DCT:
+      aom_idct16_sse2(in0, in1);
+      aom_iadst16_sse2(in0, in1);
+      break;
+    case DCT_ADST:
+      aom_iadst16_sse2(in0, in1);
+      aom_idct16_sse2(in0, in1);
+      break;
+    case ADST_ADST:
+      aom_iadst16_sse2(in0, in1);
+      aom_iadst16_sse2(in0, in1);
+      break;
+#if CONFIG_EXT_TX
+    case FLIPADST_DCT:
+      aom_idct16_sse2(in0, in1);
+      aom_iadst16_sse2(in0, in1);
+      FLIPUD_PTR(dest, stride, 16);
+      break;
+    case DCT_FLIPADST:
+      aom_iadst16_sse2(in0, in1);
+      aom_idct16_sse2(in0, in1);
+      FLIPLR_16x16(in0, in1);
+      break;
+    case FLIPADST_FLIPADST:
+      aom_iadst16_sse2(in0, in1);
+      aom_iadst16_sse2(in0, in1);
+      FLIPUD_PTR(dest, stride, 16);
+      FLIPLR_16x16(in0, in1);
+      break;
+    case ADST_FLIPADST:
+      aom_iadst16_sse2(in0, in1);
+      aom_iadst16_sse2(in0, in1);
+      FLIPLR_16x16(in0, in1);
+      break;
+    case FLIPADST_ADST:
+      aom_iadst16_sse2(in0, in1);
+      aom_iadst16_sse2(in0, in1);
+      FLIPUD_PTR(dest, stride, 16);
+      break;
+    case V_DCT:
+      iidtx16_sse2(in0, in1);
+      aom_idct16_sse2(in0, in1);
+      break;
+    case H_DCT:
+      aom_idct16_sse2(in0, in1);
+      iidtx16_sse2(in0, in1);
+      break;
+    case V_ADST:
+      iidtx16_sse2(in0, in1);
+      aom_iadst16_sse2(in0, in1);
+      break;
+    case H_ADST:
+      aom_iadst16_sse2(in0, in1);
+      iidtx16_sse2(in0, in1);
+      break;
+    case V_FLIPADST:
+      iidtx16_sse2(in0, in1);
+      aom_iadst16_sse2(in0, in1);
+      FLIPUD_PTR(dest, stride, 16);
+      break;
+    case H_FLIPADST:
+      aom_iadst16_sse2(in0, in1);
+      iidtx16_sse2(in0, in1);
+      FLIPLR_16x16(in0, in1);
+      break;
+#endif  // CONFIG_EXT_TX
+    default: assert(0); break;
+  }
+
+  write_buffer_8x16(dest, in0, stride);
+  dest += 8;
+  write_buffer_8x16(dest, in1, stride);
+}
+
+#if CONFIG_EXT_TX
 static void iidtx8_sse2(__m128i *in) {
   in[0] = _mm_slli_epi16(in[0], 1);
   in[1] = _mm_slli_epi16(in[1], 1);
@@ -543,6 +571,7 @@
   in[6] = mm_reverse_epi16(in[6]);
   in[7] = mm_reverse_epi16(in[7]);
 }
+#endif  // CONFIG_EXT_TX
 
 static INLINE void scale_sqrt2_8x4(__m128i *in) {
   // Implements 'ROUND_POWER_OF_TWO(input * Sqrt2, DCT_CONST_BITS)'
@@ -665,26 +694,31 @@
   switch (tx_type) {
     case DCT_DCT:
     case ADST_DCT:
+#if CONFIG_EXT_TX
     case FLIPADST_DCT:
     case H_DCT:
-      idct8_sse2(in);
+#endif
+      aom_idct8_sse2(in);
       array_transpose_8x8(in, in);
-      idct8_sse2(in + 8);
+      aom_idct8_sse2(in + 8);
       array_transpose_8x8(in + 8, in + 8);
       break;
     case DCT_ADST:
     case ADST_ADST:
+#if CONFIG_EXT_TX
     case DCT_FLIPADST:
     case FLIPADST_FLIPADST:
     case ADST_FLIPADST:
     case FLIPADST_ADST:
     case H_ADST:
     case H_FLIPADST:
-      iadst8_sse2(in);
+#endif
+      aom_iadst8_sse2(in);
       array_transpose_8x8(in, in);
-      iadst8_sse2(in + 8);
+      aom_iadst8_sse2(in + 8);
       array_transpose_8x8(in + 8, in + 8);
       break;
+#if CONFIG_EXT_TX
     case V_FLIPADST:
     case V_ADST:
     case V_DCT:
@@ -692,6 +726,7 @@
       iidtx8_sse2(in);
       iidtx8_sse2(in + 8);
       break;
+#endif
     default: assert(0); break;
   }
   scale_sqrt2_8x8(in);
@@ -701,33 +736,50 @@
   switch (tx_type) {
     case DCT_DCT:
     case DCT_ADST:
+#if CONFIG_EXT_TX
     case DCT_FLIPADST:
-    case V_DCT: idct16_8col(in); break;
+    case V_DCT:
+#endif
+      idct16_8col(in);
+      break;
     case ADST_DCT:
     case ADST_ADST:
+#if CONFIG_EXT_TX
     case FLIPADST_ADST:
     case ADST_FLIPADST:
     case FLIPADST_FLIPADST:
     case FLIPADST_DCT:
     case V_ADST:
-    case V_FLIPADST: iadst16_8col(in); break;
+    case V_FLIPADST:
+#endif
+      iadst16_8col(in);
+      break;
+#if CONFIG_EXT_TX
     case H_DCT:
     case H_ADST:
     case H_FLIPADST:
     case IDTX: iidtx16_8col(in); break;
+#endif
     default: assert(0); break;
   }
 
   switch (tx_type) {
     case DCT_DCT:
     case ADST_DCT:
+#if CONFIG_EXT_TX
     case H_DCT:
+#endif
     case DCT_ADST:
     case ADST_ADST:
+#if CONFIG_EXT_TX
     case H_ADST:
     case V_ADST:
     case V_DCT:
-    case IDTX: write_buffer_8x16(dest, in, stride); break;
+    case IDTX:
+#endif
+      write_buffer_8x16(dest, in, stride);
+      break;
+#if CONFIG_EXT_TX
     case FLIPADST_DCT:
     case FLIPADST_ADST:
     case V_FLIPADST: write_buffer_8x16(dest + stride * 15, in, -stride); break;
@@ -743,6 +795,7 @@
       flip_buffer_lr_8x8(in + 8);
       write_buffer_8x16(dest + stride * 15, in, -stride);
       break;
+#endif
     default: assert(0); break;
   }
 }
@@ -809,20 +862,30 @@
   switch (tx_type) {
     case DCT_DCT:
     case ADST_DCT:
+#if CONFIG_EXT_TX
     case FLIPADST_DCT:
-    case H_DCT: idct16_8col(in); break;
+    case H_DCT:
+#endif
+      idct16_8col(in);
+      break;
     case DCT_ADST:
     case ADST_ADST:
+#if CONFIG_EXT_TX
     case DCT_FLIPADST:
     case FLIPADST_FLIPADST:
     case ADST_FLIPADST:
     case FLIPADST_ADST:
     case H_ADST:
-    case H_FLIPADST: iadst16_8col(in); break;
+    case H_FLIPADST:
+#endif
+      iadst16_8col(in);
+      break;
+#if CONFIG_EXT_TX
     case V_FLIPADST:
     case V_ADST:
     case V_DCT:
     case IDTX: iidtx16_8col(in); break;
+#endif
     default: assert(0); break;
   }
 
@@ -834,22 +897,27 @@
   switch (tx_type) {
     case DCT_DCT:
     case DCT_ADST:
+#if CONFIG_EXT_TX
     case DCT_FLIPADST:
     case V_DCT:
-      idct8_sse2(in);
-      idct8_sse2(in + 8);
+#endif
+      aom_idct8_sse2(in);
+      aom_idct8_sse2(in + 8);
       break;
     case ADST_DCT:
     case ADST_ADST:
+#if CONFIG_EXT_TX
     case FLIPADST_ADST:
     case ADST_FLIPADST:
     case FLIPADST_FLIPADST:
     case FLIPADST_DCT:
     case V_ADST:
     case V_FLIPADST:
-      iadst8_sse2(in);
-      iadst8_sse2(in + 8);
+#endif
+      aom_iadst8_sse2(in);
+      aom_iadst8_sse2(in + 8);
       break;
+#if CONFIG_EXT_TX
     case H_DCT:
     case H_ADST:
     case H_FLIPADST:
@@ -859,22 +927,26 @@
       iidtx8_sse2(in);
       iidtx8_sse2(in + 8);
       break;
+#endif
     default: assert(0); break;
   }
 
   switch (tx_type) {
     case DCT_DCT:
     case ADST_DCT:
-    case H_DCT:
     case DCT_ADST:
     case ADST_ADST:
+#if CONFIG_EXT_TX
+    case H_DCT:
     case H_ADST:
     case V_ADST:
     case V_DCT:
     case IDTX:
+#endif
       write_buffer_8x8_round6(dest, in, stride);
       write_buffer_8x8_round6(dest + 8, in + 8, stride);
       break;
+#if CONFIG_EXT_TX
     case FLIPADST_DCT:
     case FLIPADST_ADST:
     case V_FLIPADST:
@@ -895,6 +967,7 @@
       write_buffer_8x8_round6(dest + stride * 7, in + 8, -stride);
       write_buffer_8x8_round6(dest + stride * 7 + 8, in, -stride);
       break;
+#endif
     default: assert(0); break;
   }
 }
@@ -933,22 +1006,26 @@
   switch (tx_type) {
     case DCT_DCT:
     case ADST_DCT:
+#if CONFIG_EXT_TX
     case FLIPADST_DCT:
-    case H_DCT: idct8_sse2(in); break;
+    case H_DCT:
+#endif
+      aom_idct8_sse2(in);
+      break;
     case DCT_ADST:
     case ADST_ADST:
+#if CONFIG_EXT_TX
     case DCT_FLIPADST:
     case FLIPADST_FLIPADST:
     case ADST_FLIPADST:
     case FLIPADST_ADST:
     case H_ADST:
-    case H_FLIPADST: iadst8_sse2(in); break;
+    case H_FLIPADST:
+#endif
+      aom_iadst8_sse2(in);
+      break;
+#if CONFIG_EXT_TX
     case V_FLIPADST:
     case V_ADST:
     case V_DCT:
     case IDTX:
       iidtx8_sse2(in);
       array_transpose_8x8(in, in);
       break;
+#endif
     default: assert(0); break;
   }
@@ -967,22 +1044,27 @@
   switch (tx_type) {
     case DCT_DCT:
     case DCT_ADST:
+#if CONFIG_EXT_TX
     case DCT_FLIPADST:
     case V_DCT:
-      idct4_sse2(in + 4);
-      idct4_sse2(in + 6);
+#endif
+      aom_idct4_sse2(in + 4);
+      aom_idct4_sse2(in + 6);
       break;
     case ADST_DCT:
     case ADST_ADST:
+#if CONFIG_EXT_TX
     case FLIPADST_ADST:
     case ADST_FLIPADST:
     case FLIPADST_FLIPADST:
     case FLIPADST_DCT:
     case V_ADST:
     case V_FLIPADST:
-      iadst4_sse2(in + 4);
-      iadst4_sse2(in + 6);
+#endif
+      aom_iadst4_sse2(in + 4);
+      aom_iadst4_sse2(in + 6);
       break;
+#if CONFIG_EXT_TX
     case H_DCT:
     case H_ADST:
     case H_FLIPADST:
@@ -992,6 +1074,7 @@
       iidtx4_sse2(in + 6);
       array_transpose_4x4(in + 6);
       break;
+#endif
     default: assert(0); break;
   }
 
@@ -1004,9 +1087,10 @@
   switch (tx_type) {
     case DCT_DCT:
     case ADST_DCT:
-    case H_DCT:
     case DCT_ADST:
     case ADST_ADST:
+#if CONFIG_EXT_TX
+    case H_DCT:
     case H_ADST:
     case V_ADST:
     case V_DCT:
@@ -1028,6 +1112,7 @@
       in[2] = mm_reverse_epi16(in[2]);
       in[3] = mm_reverse_epi16(in[3]);
       FLIPUD_PTR(dest, stride, 4);
+#endif
       break;
     default: assert(0); break;
   }
@@ -1111,22 +1196,27 @@
   switch (tx_type) {
     case DCT_DCT:
     case ADST_DCT:
+#if CONFIG_EXT_TX
     case FLIPADST_DCT:
     case H_DCT:
-      idct4_sse2(in + 4);
-      idct4_sse2(in + 6);
+#endif
+      aom_idct4_sse2(in + 4);
+      aom_idct4_sse2(in + 6);
       break;
     case DCT_ADST:
     case ADST_ADST:
+#if CONFIG_EXT_TX
     case DCT_FLIPADST:
     case FLIPADST_FLIPADST:
     case ADST_FLIPADST:
     case FLIPADST_ADST:
     case H_ADST:
     case H_FLIPADST:
-      iadst4_sse2(in + 4);
-      iadst4_sse2(in + 6);
+#endif
+      aom_iadst4_sse2(in + 4);
+      aom_iadst4_sse2(in + 6);
       break;
+#if CONFIG_EXT_TX
     case V_FLIPADST:
     case V_ADST:
     case V_DCT:
@@ -1136,6 +1226,7 @@
       iidtx4_sse2(in + 6);
       array_transpose_4x4(in + 6);
       break;
+#endif
     default: assert(0); break;
   }
 
@@ -1149,16 +1240,25 @@
   switch (tx_type) {
     case DCT_DCT:
     case DCT_ADST:
+#if CONFIG_EXT_TX
     case DCT_FLIPADST:
-    case V_DCT: idct8_sse2(in); break;
+    case V_DCT:
+#endif
+      aom_idct8_sse2(in);
+      break;
     case ADST_DCT:
     case ADST_ADST:
+#if CONFIG_EXT_TX
     case FLIPADST_ADST:
     case ADST_FLIPADST:
     case FLIPADST_FLIPADST:
     case FLIPADST_DCT:
     case V_ADST:
-    case V_FLIPADST: iadst8_sse2(in); break;
+    case V_FLIPADST:
+#endif
+      aom_iadst8_sse2(in);
+      break;
+#if CONFIG_EXT_TX
     case H_DCT:
     case H_ADST:
     case H_FLIPADST:
@@ -1166,19 +1266,24 @@
       iidtx8_sse2(in);
       array_transpose_8x8(in, in);
       break;
+#endif
     default: assert(0); break;
   }
 
   switch (tx_type) {
     case DCT_DCT:
     case ADST_DCT:
-    case H_DCT:
     case DCT_ADST:
     case ADST_ADST:
+#if CONFIG_EXT_TX
+    case H_DCT:
     case H_ADST:
     case V_ADST:
     case V_DCT:
-    case IDTX: break;
+    case IDTX:
+#endif
+      break;
+#if CONFIG_EXT_TX
     case FLIPADST_DCT:
     case FLIPADST_ADST:
     case V_FLIPADST: FLIPUD_PTR(dest, stride, 8); break;
@@ -1205,6 +1310,7 @@
       in[7] = _mm_shufflelo_epi16(in[7], 0x1b);
       FLIPUD_PTR(dest, stride, 8);
       break;
+#endif
     default: assert(0); break;
   }
   in[0] = _mm_unpacklo_epi64(in[0], in[1]);
@@ -1252,9 +1358,10 @@
   // Generate the bottom half of the output
   scale_sqrt2_8x16(bl);
   scale_sqrt2_8x16(br);
-  idct16_sse2(bl, br);  // Includes a transposition
+  aom_idct16_sse2(bl, br);  // Includes a transposition
 }
 
+#if CONFIG_EXT_TX
 static INLINE void iidtx32_16col(__m128i *tl, __m128i *tr, __m128i *bl,
                                  __m128i *br) {
   int i;
@@ -1267,6 +1374,7 @@
     br[i] = _mm_slli_epi16(br[i], 2);
   }
 }
+#endif  // CONFIG_EXT_TX
 
 static INLINE void write_buffer_16x32_round6(uint8_t *dest, __m128i *intl,
                                              __m128i *intr, __m128i *inbl,
@@ -1307,22 +1415,27 @@
   switch (tx_type) {
     case DCT_DCT:
     case ADST_DCT:
+#if CONFIG_EXT_TX
     case FLIPADST_DCT:
     case H_DCT:
-      idct16_sse2(intl, intr);
-      idct16_sse2(inbl, inbr);
+#endif
+      aom_idct16_sse2(intl, intr);
+      aom_idct16_sse2(inbl, inbr);
       break;
     case DCT_ADST:
     case ADST_ADST:
+#if CONFIG_EXT_TX
     case DCT_FLIPADST:
     case FLIPADST_FLIPADST:
     case ADST_FLIPADST:
     case FLIPADST_ADST:
     case H_ADST:
     case H_FLIPADST:
-      iadst16_sse2(intl, intr);
-      iadst16_sse2(inbl, inbr);
+#endif
+      aom_iadst16_sse2(intl, intr);
+      aom_iadst16_sse2(inbl, inbr);
       break;
+#if CONFIG_EXT_TX
     case V_FLIPADST:
     case V_ADST:
     case V_DCT:
@@ -1330,6 +1443,7 @@
       iidtx16_sse2(intl, intr);
       iidtx16_sse2(inbl, inbr);
       break;
+#endif
     default: assert(0); break;
   }
 
@@ -1342,33 +1456,47 @@
   switch (tx_type) {
     case DCT_DCT:
     case DCT_ADST:
+#if CONFIG_EXT_TX
     case DCT_FLIPADST:
-    case V_DCT: idct32_16col(intl, intr, inbl, inbr); break;
+    case V_DCT:
+#endif
+      idct32_16col(intl, intr, inbl, inbr);
+      break;
     case ADST_DCT:
     case ADST_ADST:
+#if CONFIG_EXT_TX
     case FLIPADST_ADST:
     case ADST_FLIPADST:
     case FLIPADST_FLIPADST:
     case FLIPADST_DCT:
     case V_ADST:
-    case V_FLIPADST: ihalfright32_16col(intl, intr, inbl, inbr); break;
+    case V_FLIPADST:
+#endif
+      ihalfright32_16col(intl, intr, inbl, inbr);
+      break;
+#if CONFIG_EXT_TX
     case H_DCT:
     case H_ADST:
     case H_FLIPADST:
     case IDTX: iidtx32_16col(intl, intr, inbl, inbr); break;
+#endif
     default: assert(0); break;
   }
 
   switch (tx_type) {
     case DCT_DCT:
     case ADST_DCT:
-    case H_DCT:
     case DCT_ADST:
     case ADST_ADST:
+#if CONFIG_EXT_TX
+    case H_DCT:
     case H_ADST:
     case V_ADST:
     case V_DCT:
-    case IDTX: break;
+    case IDTX:
+#endif
+      break;
+#if CONFIG_EXT_TX
     case FLIPADST_DCT:
     case FLIPADST_ADST:
     case V_FLIPADST: FLIPUD_PTR(dest, stride, 32); break;
@@ -1395,6 +1523,7 @@
       }
       FLIPUD_PTR(dest, stride, 32);
       break;
+#endif
     default: assert(0); break;
   }
   write_buffer_16x32_round6(dest, intl, intr, inbl, inbr, stride);
@@ -1439,20 +1568,30 @@
   switch (tx_type) {
     case DCT_DCT:
     case ADST_DCT:
+#if CONFIG_EXT_TX
     case FLIPADST_DCT:
-    case H_DCT: idct32_16col(in0, in1, in2, in3); break;
+    case H_DCT:
+#endif
+      idct32_16col(in0, in1, in2, in3);
+      break;
     case DCT_ADST:
     case ADST_ADST:
+#if CONFIG_EXT_TX
     case DCT_FLIPADST:
     case FLIPADST_FLIPADST:
     case ADST_FLIPADST:
     case FLIPADST_ADST:
     case H_ADST:
-    case H_FLIPADST: ihalfright32_16col(in0, in1, in2, in3); break;
+    case H_FLIPADST:
+#endif
+      ihalfright32_16col(in0, in1, in2, in3);
+      break;
+#if CONFIG_EXT_TX
     case V_FLIPADST:
     case V_ADST:
     case V_DCT:
     case IDTX: iidtx32_16col(in0, in1, in2, in3); break;
+#endif
     default: assert(0); break;
   }
 
@@ -1465,22 +1604,27 @@
   switch (tx_type) {
     case DCT_DCT:
     case DCT_ADST:
+#if CONFIG_EXT_TX
     case DCT_FLIPADST:
     case V_DCT:
-      idct16_sse2(in0, in1);
-      idct16_sse2(in2, in3);
+#endif
+      aom_idct16_sse2(in0, in1);
+      aom_idct16_sse2(in2, in3);
       break;
     case ADST_DCT:
     case ADST_ADST:
+#if CONFIG_EXT_TX
     case FLIPADST_ADST:
     case ADST_FLIPADST:
     case FLIPADST_FLIPADST:
     case FLIPADST_DCT:
     case V_ADST:
     case V_FLIPADST:
-      iadst16_sse2(in0, in1);
-      iadst16_sse2(in2, in3);
+#endif
+      aom_iadst16_sse2(in0, in1);
+      aom_iadst16_sse2(in2, in3);
       break;
+#if CONFIG_EXT_TX
     case H_DCT:
     case H_ADST:
     case H_FLIPADST:
@@ -1488,19 +1632,24 @@
       iidtx16_sse2(in0, in1);
       iidtx16_sse2(in2, in3);
       break;
+#endif
     default: assert(0); break;
   }
 
   switch (tx_type) {
     case DCT_DCT:
     case ADST_DCT:
-    case H_DCT:
     case DCT_ADST:
     case ADST_ADST:
+#if CONFIG_EXT_TX
+    case H_DCT:
     case H_ADST:
     case V_ADST:
     case V_DCT:
-    case IDTX: break;
+    case IDTX:
+#endif
+      break;
+#if CONFIG_EXT_TX
     case FLIPADST_DCT:
     case FLIPADST_ADST:
     case V_FLIPADST: FLIPUD_PTR(dest, stride, 16); break;
@@ -1527,8 +1676,8 @@
       }
       FLIPUD_PTR(dest, stride, 16);
       break;
+#endif
     default: assert(0); break;
   }
   write_buffer_32x16_round6(dest, in0, in1, in2, in3, stride);
 }
-#endif  // CONFIG_EXT_TX
diff --git a/av1/decoder/decodeframe.c b/av1/decoder/decodeframe.c
index 0f958e4..b1de763 100644
--- a/av1/decoder/decodeframe.c
+++ b/av1/decoder/decodeframe.c
@@ -38,6 +38,7 @@
 #endif  // CONFIG_DERING
 #include "av1/common/entropy.h"
 #include "av1/common/entropymode.h"
+#include "av1/common/entropymv.h"
 #include "av1/common/idct.h"
 #include "av1/common/pred_common.h"
 #include "av1/common/quant_common.h"
@@ -56,6 +57,14 @@
 #define MAX_AV1_HEADER_SIZE 80
 #define ACCT_STR __func__
 
+static struct aom_read_bit_buffer *init_read_bit_buffer(
+    AV1Decoder *pbi, struct aom_read_bit_buffer *rb, const uint8_t *data,
+    const uint8_t *data_end, uint8_t clear_data[MAX_AV1_HEADER_SIZE]);
+static int read_compressed_header(AV1Decoder *pbi, const uint8_t *data,
+                                  size_t partition_size);
+static size_t read_uncompressed_header(AV1Decoder *pbi,
+                                       struct aom_read_bit_buffer *rb);
+
 static int is_compound_reference_allowed(const AV1_COMMON *cm) {
   int i;
   if (frame_is_intra_only(cm)) return 0;
@@ -106,21 +115,27 @@
   return aom_rb_read_bit(rb) ? TX_MODE_SELECT : aom_rb_read_literal(rb, 2);
 }
 
+static void read_tx_size_probs(FRAME_CONTEXT *fc, aom_reader *r) {
+  int i, j, k;
+  for (i = 0; i < MAX_TX_DEPTH; ++i)
+    for (j = 0; j < TX_SIZE_CONTEXTS; ++j)
+      for (k = 0; k < i + 1; ++k)
+        av1_diff_update_prob(r, &fc->tx_size_probs[i][j][k], ACCT_STR);
+}
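+// Note: depth i of the tx_size tree selects among i + 2 sizes, hence the
+// i + 1 probabilities updated per context above.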
+
+#if !CONFIG_EC_ADAPT
 static void read_switchable_interp_probs(FRAME_CONTEXT *fc, aom_reader *r) {
   int i, j;
   for (j = 0; j < SWITCHABLE_FILTER_CONTEXTS; ++j) {
     for (i = 0; i < SWITCHABLE_FILTERS - 1; ++i)
       av1_diff_update_prob(r, &fc->switchable_interp_prob[j][i], ACCT_STR);
-#if CONFIG_DAALA_EC
-    av1_tree_to_cdf(av1_switchable_interp_tree, fc->switchable_interp_prob[j],
-                    fc->switchable_interp_cdf[j]);
-#endif
   }
 }
+#endif  // !CONFIG_EC_ADAPT
 
 static void read_inter_mode_probs(FRAME_CONTEXT *fc, aom_reader *r) {
-  int i;
 #if CONFIG_REF_MV
+  int i;
   for (i = 0; i < NEWMV_MODE_CONTEXTS; ++i)
     av1_diff_update_prob(r, &fc->newmv_prob[i], ACCT_STR);
   for (i = 0; i < ZEROMV_MODE_CONTEXTS; ++i)
@@ -133,18 +148,20 @@
   av1_diff_update_prob(r, &fc->new2mv_prob, ACCT_STR);
 #endif  // CONFIG_EXT_INTER
 #else
-  int j;
+#if !CONFIG_EC_ADAPT
+  int i, j;
   for (i = 0; i < INTER_MODE_CONTEXTS; ++i) {
     for (j = 0; j < INTER_MODES - 1; ++j)
       av1_diff_update_prob(r, &fc->inter_mode_probs[i][j], ACCT_STR);
-#if CONFIG_DAALA_EC
-    av1_tree_to_cdf(av1_inter_mode_tree, fc->inter_mode_probs[i],
-                    fc->inter_mode_cdf[i]);
-#endif
   }
+#else
+  (void)fc;
+  (void)r;
+#endif
 #endif
 }
 
+#if !CONFIG_EC_ADAPT
 #if CONFIG_EXT_INTER
 static void read_inter_compound_mode_probs(FRAME_CONTEXT *fc, aom_reader *r) {
   int i, j;
@@ -157,6 +174,26 @@
   }
 }
 #endif  // CONFIG_EXT_INTER
+#if !CONFIG_EXT_TX
+static void read_ext_tx_probs(FRAME_CONTEXT *fc, aom_reader *r) {
+  int i, j, k;
+  if (aom_read(r, GROUP_DIFF_UPDATE_PROB, ACCT_STR)) {
+    for (i = TX_4X4; i < EXT_TX_SIZES; ++i) {
+      for (j = 0; j < TX_TYPES; ++j) {
+        for (k = 0; k < TX_TYPES - 1; ++k)
+          av1_diff_update_prob(r, &fc->intra_ext_tx_prob[i][j][k], ACCT_STR);
+      }
+    }
+  }
+  if (aom_read(r, GROUP_DIFF_UPDATE_PROB, ACCT_STR)) {
+    for (i = TX_4X4; i < EXT_TX_SIZES; ++i) {
+      for (k = 0; k < TX_TYPES - 1; ++k)
+        av1_diff_update_prob(r, &fc->inter_ext_tx_prob[i][k], ACCT_STR);
+    }
+  }
+}
+#endif  // !CONFIG_EXT_TX
+#endif  // !CONFIG_EC_ADAPT
 
 static REFERENCE_MODE read_frame_reference_mode(
     const AV1_COMMON *cm, struct aom_read_bit_buffer *rb) {
@@ -206,12 +243,11 @@
 }
 
 static void read_mv_probs(nmv_context *ctx, int allow_hp, aom_reader *r) {
-  int i, j;
+  int i;
 
+#if !CONFIG_EC_ADAPT
+  int j;
   update_mv_probs(ctx->joints, MV_JOINTS - 1, r);
-#if CONFIG_DAALA_EC
-  av1_tree_to_cdf(av1_mv_joint_tree, ctx->joints, ctx->joint_cdf);
-#endif
 
   for (i = 0; i < 2; ++i) {
     nmv_component *const comp_ctx = &ctx->comps[i];
@@ -219,25 +255,15 @@
     update_mv_probs(comp_ctx->classes, MV_CLASSES - 1, r);
     update_mv_probs(comp_ctx->class0, CLASS0_SIZE - 1, r);
     update_mv_probs(comp_ctx->bits, MV_OFFSET_BITS, r);
-#if CONFIG_DAALA_EC
-    av1_tree_to_cdf(av1_mv_class_tree, comp_ctx->classes, comp_ctx->class_cdf);
-#endif
   }
-
   for (i = 0; i < 2; ++i) {
     nmv_component *const comp_ctx = &ctx->comps[i];
     for (j = 0; j < CLASS0_SIZE; ++j) {
       update_mv_probs(comp_ctx->class0_fp[j], MV_FP_SIZE - 1, r);
-#if CONFIG_DAALA_EC
-      av1_tree_to_cdf(av1_mv_fp_tree, comp_ctx->class0_fp[j],
-                      comp_ctx->class0_fp_cdf[j]);
-#endif
     }
     update_mv_probs(comp_ctx->fp, MV_FP_SIZE - 1, r);
-#if CONFIG_DAALA_EC
-    av1_tree_to_cdf(av1_mv_fp_tree, comp_ctx->fp, comp_ctx->fp_cdf);
-#endif
   }
+#endif  // !CONFIG_EC_ADAPT
 
   if (allow_hp) {
     for (i = 0; i < 2; ++i) {
@@ -325,13 +351,17 @@
   const TX_SIZE plane_tx_size =
       plane ? uv_txsize_lookup[bsize][mbmi->inter_tx_size[tx_row][tx_col]][0][0]
             : mbmi->inter_tx_size[tx_row][tx_col];
-  int max_blocks_high = num_4x4_blocks_high_lookup[plane_bsize];
-  int max_blocks_wide = num_4x4_blocks_wide_lookup[plane_bsize];
+  // Block extents, first in pixels, then scaled to transform-block units.
+  int max_blocks_high = block_size_high[plane_bsize];
+  int max_blocks_wide = block_size_wide[plane_bsize];
 
   if (xd->mb_to_bottom_edge < 0)
-    max_blocks_high += xd->mb_to_bottom_edge >> (5 + pd->subsampling_y);
+    max_blocks_high += xd->mb_to_bottom_edge >> (3 + pd->subsampling_y);
   if (xd->mb_to_right_edge < 0)
-    max_blocks_wide += xd->mb_to_right_edge >> (5 + pd->subsampling_x);
+    max_blocks_wide += xd->mb_to_right_edge >> (3 + pd->subsampling_x);
+
+  max_blocks_high >>= tx_size_wide_log2[0];
+  max_blocks_wide >>= tx_size_wide_log2[0];
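+  // mb_to_*_edge is in 1/8-pel units: >> (3 + subsampling) converts it to
+  // pixels, and the tx_size_wide_log2[0] shifts above convert pixels to
+  // 4x4 transform units.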
 
   if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide) return;
 
@@ -350,15 +380,14 @@
         pd->dst.stride, max_scan_line, eob);
     *eob_total += eob;
   } else {
-    int bsl = b_width_log2_lookup[bsize];
+    int bsl = block_size_wide[bsize] >> (tx_size_wide_log2[0] + 1);
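+    // bsl is half the block width in 4x4 units: the row/column step
+    // between the four square sub-blocks recursed into below.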
     int i;
 
     assert(bsl > 0);
-    --bsl;
 
     for (i = 0; i < 4; ++i) {
-      const int offsetr = blk_row + ((i >> 1) << bsl);
-      const int offsetc = blk_col + ((i & 0x01) << bsl);
+      const int offsetr = blk_row + (i >> 1) * bsl;
+      const int offsetc = blk_col + (i & 0x01) * bsl;
 
       if (offsetr >= max_blocks_high || offsetc >= max_blocks_wide) continue;
 
@@ -1281,8 +1310,8 @@
 
       for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
         const struct macroblockd_plane *const pd = &xd->plane[plane];
-        const int num_4x4_w = pd->n4_w;
-        const int num_4x4_h = pd->n4_h;
+        int block_width = pd->width;
+        int block_height = pd->height;
         int row, col;
 #if CONFIG_VAR_TX
         // TODO(jingning): This can be simplified for decoder performance.
@@ -1297,23 +1326,26 @@
               plane ? get_uv_tx_size(mbmi, pd) : mbmi->tx_size;
           const int stepr = tx_size_high_unit[tx_size];
           const int stepc = tx_size_wide_unit[tx_size];
-          const int max_blocks_wide =
-              num_4x4_w +
+          int max_blocks_wide =
+              block_width +
               (xd->mb_to_right_edge >= 0 ? 0 : xd->mb_to_right_edge >>
-                                                   (5 + pd->subsampling_x));
-          const int max_blocks_high =
-              num_4x4_h +
+                                                   (3 + pd->subsampling_x));
+          int max_blocks_high =
+              block_height +
               (xd->mb_to_bottom_edge >= 0 ? 0 : xd->mb_to_bottom_edge >>
-                                                    (5 + pd->subsampling_y));
-
+                                                    (3 + pd->subsampling_y));
+          max_blocks_wide >>= tx_size_wide_log2[0];
+          max_blocks_high >>= tx_size_wide_log2[0];
           for (row = 0; row < max_blocks_high; row += stepr)
             for (col = 0; col < max_blocks_wide; col += stepc)
               eobtotal += reconstruct_inter_block(cm, xd, r, mbmi->segment_id,
                                                   plane, row, col, tx_size);
         } else {
 #endif
-          for (row = 0; row < num_4x4_h; row += bh_var_tx)
-            for (col = 0; col < num_4x4_w; col += bw_var_tx)
+          block_width >>= tx_size_wide_log2[0];
+          block_height >>= tx_size_wide_log2[0];
+          for (row = 0; row < block_height; row += bh_var_tx)
+            for (col = 0; col < block_width; col += bw_var_tx)
               decode_reconstruct_tx(cm, xd, r, mbmi, plane, plane_bsize, row,
                                     col, max_tx_size, &eobtotal);
 #if CONFIG_EXT_TX && CONFIG_RECT_TX
@@ -1324,15 +1356,16 @@
             plane ? get_uv_tx_size(mbmi, pd) : mbmi->tx_size;
         const int stepr = tx_size_high_unit[tx_size];
         const int stepc = tx_size_wide_unit[tx_size];
-        const int max_blocks_wide =
-            num_4x4_w + (xd->mb_to_right_edge >= 0
-                             ? 0
-                             : xd->mb_to_right_edge >> (5 + pd->subsampling_x));
-        const int max_blocks_high =
-            num_4x4_h +
+        int max_blocks_wide =
+            block_width +
+            (xd->mb_to_right_edge >= 0 ? 0 : xd->mb_to_right_edge >>
+                                                 (3 + pd->subsampling_x));
+        int max_blocks_high =
+            block_height +
             (xd->mb_to_bottom_edge >= 0 ? 0 : xd->mb_to_bottom_edge >>
-                                                  (5 + pd->subsampling_y));
-
+                                                  (3 + pd->subsampling_y));
+        max_blocks_wide >>= tx_size_wide_log2[0];
+        max_blocks_high >>= tx_size_wide_log2[0];
         for (row = 0; row < max_blocks_high; row += stepr)
           for (col = 0; col < max_blocks_wide; col += stepc)
             eobtotal += reconstruct_inter_block(cm, xd, r, mbmi->segment_id,
@@ -1715,20 +1748,21 @@
       assert(mbmi->segment_id_supertx != MAX_SEGMENTS);
       for (i = 0; i < MAX_MB_PLANE; ++i) {
         const struct macroblockd_plane *const pd = &xd->plane[i];
-        const int num_4x4_w = pd->n4_w;
-        const int num_4x4_h = pd->n4_h;
         int row, col;
         const TX_SIZE tx_size = i ? get_uv_tx_size(mbmi, pd) : mbmi->tx_size;
-        const int stepr = num_4x4_blocks_high_txsize_lookup[tx_size];
-        const int stepc = num_4x4_blocks_wide_txsize_lookup[tx_size];
-        const int max_blocks_wide =
-            num_4x4_w + (xd->mb_to_right_edge >= 0
+        const int stepr = tx_size_high_unit[tx_size];
+        const int stepc = tx_size_wide_unit[tx_size];
+        int max_blocks_wide =
+            pd->width + (xd->mb_to_right_edge >= 0
                              ? 0
-                             : xd->mb_to_right_edge >> (5 + pd->subsampling_x));
-        const int max_blocks_high =
-            num_4x4_h +
+                             : xd->mb_to_right_edge >> (3 + pd->subsampling_x));
+        int max_blocks_high =
+            pd->height +
             (xd->mb_to_bottom_edge >= 0 ? 0 : xd->mb_to_bottom_edge >>
-                                                  (5 + pd->subsampling_y));
+                                                  (3 + pd->subsampling_y));
+
+        max_blocks_wide >>= tx_size_wide_log2[0];
+        max_blocks_high >>= tx_size_wide_log2[0];
 
         for (row = 0; row < max_blocks_high; row += stepr)
           for (col = 0; col < max_blocks_wide; col += stepc)
@@ -1775,6 +1809,19 @@
   if (bsize >= BLOCK_8X8 &&
       (bsize == BLOCK_8X8 || partition != PARTITION_SPLIT))
     dec_update_partition_context(xd, mi_row, mi_col, subsize, num_8x8_wh);
+#endif  // CONFIG_EXT_PARTITION_TYPES
+
+#if CONFIG_DERING
+  if (bsize == BLOCK_64X64) {
+    if (cm->dering_level != 0 && !sb_all_skip(cm, mi_row, mi_col)) {
+      cm->mi_grid_visible[mi_row * cm->mi_stride + mi_col]->mbmi.dering_gain =
+          aom_read_literal(r, DERING_REFINEMENT_BITS, ACCT_STR);
+    } else {
+      cm->mi_grid_visible[mi_row * cm->mi_stride + mi_col]->mbmi.dering_gain =
+          0;
+    }
+  }
+#endif
 
 #if CONFIG_CLPF
   if (bsize == BLOCK_64X64 && cm->clpf_strength_y &&
@@ -1812,18 +1859,6 @@
     }
   }
 #endif
-#if CONFIG_DERING
-  if (bsize == BLOCK_64X64) {
-    if (cm->dering_level != 0 && !sb_all_skip(cm, mi_row, mi_col)) {
-      cm->mi_grid_visible[mi_row * cm->mi_stride + mi_col]->mbmi.dering_gain =
-          aom_read_literal(r, DERING_REFINEMENT_BITS, ACCT_STR);
-    } else {
-      cm->mi_grid_visible[mi_row * cm->mi_stride + mi_col]->mbmi.dering_gain =
-          0;
-    }
-  }
-#endif
-#endif  // CONFIG_EXT_PARTITION_TYPES
 }
 
 #if !CONFIG_ANS
@@ -1868,13 +1903,18 @@
 static void read_coef_probs_common(av1_coeff_probs_model *coef_probs,
                                    aom_reader *r) {
   int i, j, k, l, m;
+#if CONFIG_EC_ADAPT
+  const int node_limit = UNCONSTRAINED_NODES - 1;
+#else
+  const int node_limit = UNCONSTRAINED_NODES;
+#endif
 
   if (aom_read_bit(r, ACCT_STR))
     for (i = 0; i < PLANE_TYPES; ++i)
       for (j = 0; j < REF_TYPES; ++j)
         for (k = 0; k < COEF_BANDS; ++k)
           for (l = 0; l < BAND_COEFF_CONTEXTS(k); ++l)
-            for (m = 0; m < UNCONSTRAINED_NODES; ++m)
+            for (m = 0; m < node_limit; ++m)
               av1_diff_update_prob(r, &coef_probs[i][j][k][l][m], ACCT_STR);
 }
 
@@ -1883,9 +1923,6 @@
   TX_SIZE tx_size;
   for (tx_size = TX_4X4; tx_size <= max_tx_size; ++tx_size)
     read_coef_probs_common(fc->coef_probs[tx_size], r);
-#if CONFIG_RANS || CONFIG_DAALA_EC
-  av1_coef_pareto_cdfs(fc);
-#endif  // CONFIG_RANS
 }
 
 static void setup_segmentation(AV1_COMMON *const cm,
@@ -2473,6 +2510,17 @@
     pbi->tile_size_bytes = aom_rb_read_literal(rb, 2) + 1;
   }
 #endif  // CONFIG_EXT_TILE
+#if CONFIG_TILE_GROUPS
+  // Record the bit offset of the tile group information so it can be
+  // re-parsed when locating the tile buffers.
+  pbi->tg_size_bit_offset = rb->bit_offset;
+  pbi->tg_size = 1 << (cm->log2_tile_rows + cm->log2_tile_cols);
+  if (cm->log2_tile_rows + cm->log2_tile_cols > 0) {
+    pbi->tg_start =
+        aom_rb_read_literal(rb, cm->log2_tile_rows + cm->log2_tile_cols);
+    pbi->tg_size =
+        1 + aom_rb_read_literal(rb, cm->log2_tile_rows + cm->log2_tile_cols);
+  }
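+  // With a single tile neither syntax element is present and tg_size keeps
+  // the full tile count computed above.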
+#endif
 }
 
 static int mem_get_varsize(const uint8_t *src, const int sz) {
@@ -2670,6 +2718,41 @@
     AV1Decoder *pbi, const uint8_t *data, const uint8_t *data_end,
     TileBufferDec (*const tile_buffers)[MAX_TILE_COLS]) {
   AV1_COMMON *const cm = &pbi->common;
+#if CONFIG_TILE_GROUPS
+  int r, c;
+  const int tile_cols = cm->tile_cols;
+  const int tile_rows = cm->tile_rows;
+  int tc = 0;
+  int first_tile_in_tg = 0;
+  int hdr_offset;
+  struct aom_read_bit_buffer rb_tg_hdr;
+  uint8_t clear_data[MAX_AV1_HEADER_SIZE];
+  const int num_tiles = tile_rows * tile_cols;
+  const int num_bits = OD_ILOG(num_tiles) - 1;
+  const int hdr_size = pbi->uncomp_hdr_size + pbi->first_partition_size;
+  const int tg_size_bit_offset = pbi->tg_size_bit_offset;
+
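+  // Each tile group after the first repeats the frame headers; skip those
+  // hdr_size bytes at the first tile of the group and re-read the group's
+  // start/size from the stored bit offset within that copy.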
+  for (r = 0; r < tile_rows; ++r) {
+    for (c = 0; c < tile_cols; ++c, ++tc) {
+      TileBufferDec *const buf = &tile_buffers[r][c];
+      hdr_offset = (tc && tc == first_tile_in_tg) ? hdr_size : 0;
+
+      buf->col = c;
+      if (hdr_offset) {
+        init_read_bit_buffer(pbi, &rb_tg_hdr, data, data_end, clear_data);
+        rb_tg_hdr.bit_offset = tg_size_bit_offset;
+        if (num_tiles) {
+          pbi->tg_start = aom_rb_read_literal(&rb_tg_hdr, num_bits);
+          pbi->tg_size = 1 + aom_rb_read_literal(&rb_tg_hdr, num_bits);
+        }
+      }
+      first_tile_in_tg += tc == first_tile_in_tg ? pbi->tg_size : 0;
+      data += hdr_offset;
+      get_tile_buffer(data_end, pbi->tile_size_bytes, 0, &pbi->common.error,
+                      &data, pbi->decrypt_cb, pbi->decrypt_state, buf);
+    }
+  }
+#else
   int r, c;
   const int tile_cols = cm->tile_cols;
   const int tile_rows = cm->tile_rows;
@@ -2683,6 +2766,7 @@
                       &data, pbi->decrypt_cb, pbi->decrypt_state, buf);
     }
   }
+#endif
 }
 #endif  // CONFIG_EXT_TILE
 
@@ -2850,7 +2934,10 @@
 
     assert(mi_row > 0);
 
-#if !CONFIG_VAR_TX
+// When parallel deblocking is enabled, deblocking should not be interleaved
+// with decoding. Instead, it should be done after the entire frame has been
+// decoded.
+#if !CONFIG_VAR_TX && !CONFIG_PARALLEL_DEBLOCKING
     // Loopfilter one tile row.
     if (cm->lf.filter_level && !cm->skip_loop_filter) {
       LFWorkerData *const lf_data = (LFWorkerData *)pbi->lf_worker.data1;
@@ -2873,12 +2960,12 @@
         winterface->execute(&pbi->lf_worker);
       }
     }
+#endif  // !CONFIG_VAR_TX && !CONFIG_PARALLEL_DEBLOCKING
 
     // After loopfiltering, the last 7 row pixels in each superblock row may
     // still be changed by the longest loopfilter of the next superblock row.
     if (cm->frame_parallel_decode)
       av1_frameworker_broadcast(pbi->cur_buf, mi_row << cm->mib_size_log2);
-#endif  // !CONFIG_VAR_TX
   }
 
 #if CONFIG_VAR_TX
@@ -2886,6 +2973,16 @@
   av1_loop_filter_frame(get_frame_new_buffer(cm), cm, &pbi->mb,
                         cm->lf.filter_level, 0, 0);
 #else
+#if CONFIG_PARALLEL_DEBLOCKING
+  // Loopfilter all rows in the frame in one pass.
+  if (cm->lf.filter_level && !cm->skip_loop_filter) {
+    LFWorkerData *const lf_data = (LFWorkerData *)pbi->lf_worker.data1;
+    winterface->sync(&pbi->lf_worker);
+    lf_data->start = 0;
+    lf_data->stop = cm->mi_rows;
+    winterface->execute(&pbi->lf_worker);
+  }
+#else
   // Loopfilter remaining rows in the frame.
   if (cm->lf.filter_level && !cm->skip_loop_filter) {
     LFWorkerData *const lf_data = (LFWorkerData *)pbi->lf_worker.data1;
@@ -2894,6 +2991,7 @@
     lf_data->stop = cm->mi_rows;
     winterface->execute(&pbi->lf_worker);
   }
+#endif  // CONFIG_PARALLEL_DEBLOCKING
 #endif  // CONFIG_VAR_TX
   if (cm->frame_parallel_decode)
     av1_frameworker_broadcast(pbi->cur_buf, INT_MAX);
@@ -3436,12 +3534,12 @@
 #endif  // CONFIG_EXT_PARTITION
 
   setup_loopfilter(cm, rb);
-#if CONFIG_CLPF
-  setup_clpf(pbi, rb);
-#endif
 #if CONFIG_DERING
   setup_dering(cm, rb);
 #endif
+#if CONFIG_CLPF
+  setup_clpf(pbi, rb);
+#endif
 #if CONFIG_LOOP_RESTORATION
   decode_restoration_mode(cm, rb);
 #endif  // CONFIG_LOOP_RESTORATION
@@ -3505,11 +3603,11 @@
   if (sz == 0)
     aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
                        "Invalid header size");
-
   return sz;
 }
 
 #if CONFIG_EXT_TX
+#if !CONFIG_EC_ADAPT || !CONFIG_DAALA_EC
 static void read_ext_tx_probs(FRAME_CONTEXT *fc, aom_reader *r) {
   int i, j, k;
   int s;
@@ -3535,36 +3633,10 @@
     }
   }
 }
-
+#endif  // !CONFIG_EC_ADAPT || !CONFIG_DAALA_EC
 #else
 
-static void read_ext_tx_probs(FRAME_CONTEXT *fc, aom_reader *r) {
-  int i, j, k;
-  if (aom_read(r, GROUP_DIFF_UPDATE_PROB, ACCT_STR)) {
-    for (i = TX_4X4; i < EXT_TX_SIZES; ++i) {
-      for (j = 0; j < TX_TYPES; ++j) {
-        for (k = 0; k < TX_TYPES - 1; ++k)
-          av1_diff_update_prob(r, &fc->intra_ext_tx_prob[i][j][k], ACCT_STR);
-#if CONFIG_DAALA_EC
-        av1_tree_to_cdf(av1_ext_tx_tree, fc->intra_ext_tx_prob[i][j],
-                        fc->intra_ext_tx_cdf[i][j]);
-#endif
-      }
-    }
-  }
-  if (aom_read(r, GROUP_DIFF_UPDATE_PROB, ACCT_STR)) {
-    for (i = TX_4X4; i < EXT_TX_SIZES; ++i) {
-      for (k = 0; k < TX_TYPES - 1; ++k)
-        av1_diff_update_prob(r, &fc->inter_ext_tx_prob[i][k], ACCT_STR);
-#if CONFIG_DAALA_EC
-      av1_tree_to_cdf(av1_ext_tx_tree, fc->inter_ext_tx_prob[i],
-                      fc->inter_ext_tx_cdf[i]);
-#endif
-    }
-  }
-}
 #endif  // CONFIG_EXT_TX
-
 #if CONFIG_SUPERTX
 static void read_supertx_probs(FRAME_CONTEXT *fc, aom_reader *r) {
   int i, j;
@@ -3643,7 +3715,10 @@
 #endif
   FRAME_CONTEXT *const fc = cm->fc;
   aom_reader r;
-  int k, i, j;
+  int k, i;
+#if !CONFIG_EC_ADAPT
+  int j;
+#endif
 
 #if !CONFIG_ANS
   if (aom_reader_init(&r, data, partition_size, pbi->decrypt_cb,
@@ -3660,12 +3735,7 @@
   decode_restoration(cm, &r);
 #endif
 
-  if (cm->tx_mode == TX_MODE_SELECT) {
-    for (i = 0; i < MAX_TX_DEPTH; ++i)
-      for (j = 0; j < TX_SIZE_CONTEXTS; ++j)
-        for (k = 0; k < i + 1; ++k)
-          av1_diff_update_prob(&r, &fc->tx_size_probs[i][j][k], ACCT_STR);
-  }
+  if (cm->tx_mode == TX_MODE_SELECT) read_tx_size_probs(fc, &r);
 
   read_coef_probs(fc, cm->tx_mode, &r);
 
@@ -3674,7 +3744,7 @@
     av1_diff_update_prob(&r, &fc->txfm_partition_prob[k], ACCT_STR);
 #if CONFIG_EXT_TX && CONFIG_RECT_TX
   if (cm->tx_mode == TX_MODE_SELECT) {
-    for (i = 1; i < TX_SIZES - 1; ++i)
+    for (i = 1; i < MAX_TX_DEPTH; ++i)
       av1_diff_update_prob(&r, &fc->rect_tx_prob[i], ACCT_STR);
   }
 #endif  // CONFIG_EXT_TX && CONFIG_RECT_TX
@@ -3688,6 +3758,7 @@
     av1_diff_update_prob(&r, &fc->delta_q_prob[k], ACCT_STR);
 #endif
 
+#if !CONFIG_EC_ADAPT
   if (cm->seg.enabled && cm->seg.update_map) {
     if (cm->seg.temporal_update) {
       for (k = 0; k < PREDICTION_PROBS; k++)
@@ -3695,19 +3766,11 @@
     }
     for (k = 0; k < MAX_SEGMENTS - 1; k++)
       av1_diff_update_prob(&r, &cm->fc->seg.tree_probs[k], ACCT_STR);
-#if CONFIG_DAALA_EC
-    av1_tree_to_cdf(av1_segment_tree, cm->fc->seg.tree_probs,
-                    cm->fc->seg.tree_cdf);
-#endif
   }
 
   for (j = 0; j < INTRA_MODES; j++) {
     for (i = 0; i < INTRA_MODES - 1; ++i)
       av1_diff_update_prob(&r, &fc->uv_mode_prob[j][i], ACCT_STR);
-#if CONFIG_DAALA_EC
-    av1_tree_to_cdf(av1_intra_mode_tree, fc->uv_mode_prob[j],
-                    fc->uv_mode_cdf[j]);
-#endif
   }
 
 #if CONFIG_EXT_PARTITION_TYPES
@@ -3717,41 +3780,32 @@
     for (i = 0; i < EXT_PARTITION_TYPES - 1; ++i)
       av1_diff_update_prob(&r, &fc->partition_prob[j][i], ACCT_STR);
 #else
-  for (j = 0; j < PARTITION_CONTEXTS; ++j) {
+  for (j = 0; j < PARTITION_CONTEXTS; ++j)
     for (i = 0; i < PARTITION_TYPES - 1; ++i)
       av1_diff_update_prob(&r, &fc->partition_prob[j][i], ACCT_STR);
-#if CONFIG_DAALA_EC
-    av1_tree_to_cdf(av1_partition_tree, fc->partition_prob[j],
-                    fc->partition_cdf[j]);
-#endif
-  }
 #endif  // CONFIG_EXT_PARTITION_TYPES
-
+#endif  // !CONFIG_EC_ADAPT
 #if CONFIG_EXT_INTRA
   for (i = 0; i < INTRA_FILTERS + 1; ++i)
     for (j = 0; j < INTRA_FILTERS - 1; ++j)
       av1_diff_update_prob(&r, &fc->intra_filter_probs[i][j], ACCT_STR);
 #endif  // CONFIG_EXT_INTRA
 
   if (frame_is_intra_only(cm)) {
     av1_copy(cm->kf_y_prob, av1_kf_y_mode_prob);
 #if CONFIG_DAALA_EC
     av1_copy(cm->kf_y_cdf, av1_kf_y_mode_cdf);
 #endif
+#if !CONFIG_EC_ADAPT
     for (k = 0; k < INTRA_MODES; k++)
-      for (j = 0; j < INTRA_MODES; j++) {
+      for (j = 0; j < INTRA_MODES; j++)
         for (i = 0; i < INTRA_MODES - 1; ++i)
           av1_diff_update_prob(&r, &cm->kf_y_prob[k][j][i], ACCT_STR);
-#if CONFIG_DAALA_EC
-        av1_tree_to_cdf(av1_intra_mode_tree, cm->kf_y_prob[k][j],
-                        cm->kf_y_cdf[k][j]);
 #endif
-      }
   } else {
 #if !CONFIG_REF_MV
     nmv_context *const nmvc = &fc->nmvc;
 #endif
-
     read_inter_mode_probs(fc, &r);
 
 #if CONFIG_EXT_INTER
@@ -3788,24 +3842,23 @@
     }
 #endif  // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
 
+#if !CONFIG_EC_ADAPT
     if (cm->interp_filter == SWITCHABLE) read_switchable_interp_probs(fc, &r);
+#endif
 
     for (i = 0; i < INTRA_INTER_CONTEXTS; i++)
       av1_diff_update_prob(&r, &fc->intra_inter_prob[i], ACCT_STR);
 
     if (cm->reference_mode != SINGLE_REFERENCE)
       setup_compound_reference_mode(cm);
-
     read_frame_reference_mode_probs(cm, &r);
 
+#if !CONFIG_EC_ADAPT
     for (j = 0; j < BLOCK_SIZE_GROUPS; j++) {
       for (i = 0; i < INTRA_MODES - 1; ++i)
         av1_diff_update_prob(&r, &fc->y_mode_prob[j][i], ACCT_STR);
-#if CONFIG_DAALA_EC
-      av1_tree_to_cdf(av1_intra_mode_tree, fc->y_mode_prob[j],
-                      fc->y_mode_cdf[j]);
-#endif
     }
+#endif
 
 #if CONFIG_REF_MV
     for (i = 0; i < NMV_CONTEXTS; ++i)
@@ -3813,14 +3866,23 @@
 #else
     read_mv_probs(nmvc, cm->allow_high_precision_mv, &r);
 #endif
+#if !CONFIG_EC_ADAPT
     read_ext_tx_probs(fc, &r);
+#endif  // !CONFIG_EC_ADAPT
 #if CONFIG_SUPERTX
     if (!xd->lossless[0]) read_supertx_probs(fc, &r);
 #endif
 #if CONFIG_GLOBAL_MOTION
     read_global_motion(cm, &r);
 #endif  // CONFIG_GLOBAL_MOTION
   }
+#if CONFIG_EC_MULTISYMBOL
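+  // Rebuild the CDF tables used by the multisymbol entropy coder from the
+  // updated probability models.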
+  av1_coef_pareto_cdfs(fc);
+  av1_set_mv_cdfs(&fc->nmvc);
+#if CONFIG_DAALA_EC
+  av1_set_mode_cdfs(cm);
+#endif  // CONFIG_DAALA_EC
+#endif  // CONFIG_EC_MULTISYMBOL
 
   return aom_reader_has_error(&r);
 }
@@ -3931,7 +3993,56 @@
   if (profile > 2) profile += aom_rb_read_bit(rb);
   return (BITSTREAM_PROFILE)profile;
 }
+#if CONFIG_TILE_GROUPS
+static int read_all_headers(AV1Decoder *pbi, struct aom_read_bit_buffer *rb,
+                            const uint8_t **p_data,
+                            const uint8_t **p_data_end) {
+  AV1_COMMON *const cm = &pbi->common;
+  MACROBLOCKD *const xd = &pbi->mb;
+  YV12_BUFFER_CONFIG *fb = (YV12_BUFFER_CONFIG *)xd->cur_buf;
 
+  pbi->first_partition_size = read_uncompressed_header(pbi, rb);
+  pbi->uncomp_hdr_size = aom_rb_bytes_read(rb);
+#if CONFIG_GLOBAL_MOTION
+  xd->global_motion = cm->global_motion;
+#endif  // CONFIG_GLOBAL_MOTION
+
+  if (!pbi->first_partition_size) {
+// Showing a frame directly; no compressed frame data follows.
+#if CONFIG_EXT_REFS
+    if (cm->show_existing_frame)
+      *p_data_end = *p_data + pbi->uncomp_hdr_size;
+    else
+#endif  // CONFIG_EXT_REFS
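+      // A directly shown frame consumes 1 byte (2 bytes for profile 3).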
+      *p_data_end = *p_data + (cm->profile <= PROFILE_2 ? 1 : 2);
+    return 1;
+  }
+
+  *p_data += pbi->uncomp_hdr_size;
+
+  if (!read_is_valid(*p_data, pbi->first_partition_size, *p_data_end))
+    aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
+                       "Truncated packet or corrupt header length");
+
+  *cm->fc = cm->frame_contexts[cm->frame_context_idx];
+  if (!cm->fc->initialized)
+    aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
+                       "Uninitialized entropy context.");
+
+  av1_zero(cm->counts);
+
+  xd->corrupted = 0;
+  fb->corrupted =
+      read_compressed_header(pbi, *p_data, pbi->first_partition_size);
+  if (fb->corrupted)
+    aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
+                       "Decode failed. Frame data header is corrupted.");
+
+  *p_data += pbi->first_partition_size;
+
+  return 0;
+}
+#endif  // CONFIG_TILE_GROUPS
+
 void av1_decode_frame(AV1Decoder *pbi, const uint8_t *data,
                       const uint8_t *data_end, const uint8_t **p_data_end) {
   AV1_COMMON *const cm = &pbi->common;
@@ -3971,6 +4082,10 @@
     aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
                        "Truncated packet or corrupt header length");
 
+#if CONFIG_SIMP_MV_PRED
+  cm->setup_mi(cm);
+#endif
+
   cm->use_prev_frame_mvs =
       !cm->error_resilient_mode && cm->width == cm->last_width &&
       cm->height == cm->last_height && !cm->last_intra_only &&
@@ -4067,6 +4182,12 @@
   }
 #endif  // CONFIG_LOOP_RESTORATION
 
+#if CONFIG_DERING
+  if (cm->dering_level && !cm->skip_loop_filter) {
+    av1_dering_frame(&pbi->cur_buf->buf, cm, &pbi->mb, cm->dering_level);
+  }
+#endif  // CONFIG_DERING
+
 #if CONFIG_CLPF
   if (!cm->skip_loop_filter) {
     const YV12_BUFFER_CONFIG *const frame = &pbi->cur_buf->buf;
@@ -4088,11 +4209,6 @@
   }
   if (cm->clpf_blocks) aom_free(cm->clpf_blocks);
 #endif
-#if CONFIG_DERING
-  if (cm->dering_level && !cm->skip_loop_filter) {
-    av1_dering_frame(&pbi->cur_buf->buf, cm, &pbi->mb, cm->dering_level);
-  }
-#endif  // CONFIG_DERING
 
   if (!xd->corrupted) {
     if (cm->refresh_frame_context == REFRESH_FRAME_CONTEXT_BACKWARD) {
diff --git a/av1/decoder/decodemv.c b/av1/decoder/decodemv.c
index 3993e72..77cea8a 100644
--- a/av1/decoder/decodemv.c
+++ b/av1/decoder/decodemv.c
@@ -42,7 +42,7 @@
 #endif  // CONFIG_EXT_INTRA || CONFIG_FILTER_INTRA || CONFIG_PALETTE
 
 #if CONFIG_DAALA_EC
-static PREDICTION_MODE read_intra_mode(aom_reader *r, const aom_cdf_prob *cdf) {
+static PREDICTION_MODE read_intra_mode(aom_reader *r, aom_cdf_prob *cdf) {
   return (PREDICTION_MODE)
       av1_intra_mode_inv[aom_read_symbol(r, cdf, INTRA_MODES, ACCT_STR)];
 }
@@ -249,6 +249,24 @@
 }
 #endif
 
+#if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
+static MOTION_MODE read_motion_mode(AV1_COMMON *cm, MACROBLOCKD *xd,
+                                    MB_MODE_INFO *mbmi, aom_reader *r) {
+  if (is_motion_variation_allowed(mbmi)) {
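+    // Motion modes beyond SIMPLE_TRANSLATION are coded only when the block
+    // allows motion variation.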
+    int motion_mode;
+    FRAME_COUNTS *counts = xd->counts;
+
+    motion_mode =
+        aom_read_tree(r, av1_motion_mode_tree,
+                      cm->fc->motion_mode_prob[mbmi->sb_type], ACCT_STR);
+    if (counts) ++counts->motion_mode[mbmi->sb_type][motion_mode];
+    return (MOTION_MODE)(SIMPLE_TRANSLATION + motion_mode);
+  } else {
+    return SIMPLE_TRANSLATION;
+  }
+}
+#endif  // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
+
 #if CONFIG_EXT_INTER
 static PREDICTION_MODE read_inter_compound_mode(AV1_COMMON *cm, MACROBLOCKD *xd,
                                                 aom_reader *r, int16_t ctx) {
@@ -264,8 +282,7 @@
 }
 #endif  // CONFIG_EXT_INTER
 
-static int read_segment_id(aom_reader *r,
-                           const struct segmentation_probs *segp) {
+static int read_segment_id(aom_reader *r, struct segmentation_probs *segp) {
 #if CONFIG_DAALA_EC
   return aom_read_symbol(r, segp->tree_cdf, MAX_SEGMENTS, ACCT_STR);
 #else
@@ -281,24 +298,29 @@
   int is_split = 0;
   const int tx_row = blk_row >> 1;
   const int tx_col = blk_col >> 1;
-  int max_blocks_high = num_4x4_blocks_high_lookup[mbmi->sb_type];
-  int max_blocks_wide = num_4x4_blocks_wide_lookup[mbmi->sb_type];
+  int max_blocks_high = block_size_high[mbmi->sb_type];
+  int max_blocks_wide = block_size_wide[mbmi->sb_type];
   int ctx = txfm_partition_context(xd->above_txfm_context + tx_col,
-                                   xd->left_txfm_context + tx_row, tx_size);
+                                   xd->left_txfm_context + tx_row,
+                                   mbmi->sb_type, tx_size);
   TX_SIZE(*const inter_tx_size)
   [MAX_MIB_SIZE] =
       (TX_SIZE(*)[MAX_MIB_SIZE]) & mbmi->inter_tx_size[tx_row][tx_col];
 
-  if (xd->mb_to_bottom_edge < 0) max_blocks_high += xd->mb_to_bottom_edge >> 5;
-  if (xd->mb_to_right_edge < 0) max_blocks_wide += xd->mb_to_right_edge >> 5;
+  if (xd->mb_to_bottom_edge < 0) max_blocks_high += xd->mb_to_bottom_edge >> 3;
+  if (xd->mb_to_right_edge < 0) max_blocks_wide += xd->mb_to_right_edge >> 3;
+
+  // Scale to transform block unit.
+  max_blocks_high >>= tx_size_wide_log2[0];
+  max_blocks_wide >>= tx_size_wide_log2[0];
 
   if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide) return;
 
   if (depth == MAX_VARTX_DEPTH) {
     int idx, idy;
     inter_tx_size[0][0] = tx_size;
-    for (idy = 0; idy < num_4x4_blocks_high_txsize_lookup[tx_size] / 2; ++idy)
-      for (idx = 0; idx < num_4x4_blocks_wide_txsize_lookup[tx_size] / 2; ++idx)
+    for (idy = 0; idy < tx_size_high_unit[tx_size] / 2; ++idy)
+      for (idx = 0; idx < tx_size_wide_unit[tx_size] / 2; ++idx)
         inter_tx_size[idy][idx] = tx_size;
     mbmi->tx_size = tx_size;
     if (counts) ++counts->txfm_partition[ctx][0];
@@ -311,7 +333,8 @@
 
   if (is_split) {
     BLOCK_SIZE bsize = txsize_to_bsize[tx_size];
-    int bsl = b_width_log2_lookup[bsize];
+    // Half the block size in transform block unit.
+    int bsl = block_size_wide[bsize] >> (tx_size_wide_log2[0] + 1);
     int i;
 
     if (counts) ++counts->txfm_partition[ctx][1];
@@ -325,18 +348,17 @@
     }
 
     assert(bsl > 0);
-    --bsl;
     for (i = 0; i < 4; ++i) {
-      int offsetr = blk_row + ((i >> 1) << bsl);
-      int offsetc = blk_col + ((i & 0x01) << bsl);
+      int offsetr = blk_row + ((i >> 1) * bsl);
+      int offsetc = blk_col + ((i & 0x01) * bsl);
       read_tx_size_vartx(cm, xd, mbmi, counts, tx_size - 1, depth + 1, offsetr,
                          offsetc, r);
     }
   } else {
     int idx, idy;
     inter_tx_size[0][0] = tx_size;
-    for (idy = 0; idy < num_4x4_blocks_high_txsize_lookup[tx_size] / 2; ++idy)
-      for (idx = 0; idx < num_4x4_blocks_wide_txsize_lookup[tx_size] / 2; ++idx)
+    for (idy = 0; idy < tx_size_high_unit[tx_size] / 2; ++idy)
+      for (idx = 0; idx < tx_size_wide_unit[tx_size] / 2; ++idx)
         inter_tx_size[idy][idx] = tx_size;
     mbmi->tx_size = tx_size;
     if (counts) ++counts->txfm_partition[ctx][0];
@@ -350,11 +372,11 @@
                                      int tx_size_cat, aom_reader *r) {
   FRAME_COUNTS *counts = xd->counts;
   const int ctx = get_tx_size_context(xd);
-  int tx_size =
-      aom_read_tree(r, av1_tx_size_tree[tx_size_cat],
-                    cm->fc->tx_size_probs[tx_size_cat][ctx], ACCT_STR);
-  if (counts) ++counts->tx_size[tx_size_cat][ctx][tx_size];
-  return (TX_SIZE)tx_size;
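+  // The coded symbol is a depth within the size category; map it back to an
+  // actual transform size.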
+  int depth = aom_read_tree(r, av1_tx_size_tree[tx_size_cat],
+                            cm->fc->tx_size_probs[tx_size_cat][ctx], ACCT_STR);
+  TX_SIZE tx_size = depth_to_tx_size(depth);
+  if (counts) ++counts->tx_size[tx_size_cat][ctx][depth];
+  return tx_size;
 }
 
 static TX_SIZE read_tx_size_intra(AV1_COMMON *cm, MACROBLOCKD *xd,
@@ -792,12 +814,11 @@
   }
 }
 
-static int read_mv_component(aom_reader *r, const nmv_component *mvcomp,
-                             int usehp) {
+static int read_mv_component(aom_reader *r, nmv_component *mvcomp, int usehp) {
   int mag, d, fr, hp;
   const int sign = aom_read(r, mvcomp->sign, ACCT_STR);
   const int mv_class =
-#if CONFIG_DAALA_EC
+#if CONFIG_EC_MULTISYMBOL
       aom_read_symbol(r, mvcomp->class_cdf, MV_CLASSES, ACCT_STR);
 #else
       aom_read_tree(r, av1_mv_class_tree, mvcomp->classes, ACCT_STR);
@@ -818,7 +839,7 @@
   }
 
 // Fractional part
-#if CONFIG_DAALA_EC
+#if CONFIG_EC_MULTISYMBOL
   fr = aom_read_symbol(r, class0 ? mvcomp->class0_fp_cdf[d] : mvcomp->fp_cdf,
                        MV_FP_SIZE, ACCT_STR);
 #else
@@ -836,25 +857,24 @@
 }
 
 static INLINE void read_mv(aom_reader *r, MV *mv, const MV *ref,
-                           const nmv_context *ctx, nmv_context_counts *counts,
+                           nmv_context *ctx, nmv_context_counts *counts,
                            int allow_hp) {
   MV_JOINT_TYPE joint_type;
-  const int use_hp = allow_hp && av1_use_mv_hp(ref);
   MV diff = { 0, 0 };
   joint_type =
-#if CONFIG_DAALA_EC
+#if CONFIG_EC_MULTISYMBOL
       (MV_JOINT_TYPE)aom_read_symbol(r, ctx->joint_cdf, MV_JOINTS, ACCT_STR);
 #else
       (MV_JOINT_TYPE)aom_read_tree(r, av1_mv_joint_tree, ctx->joints, ACCT_STR);
 #endif
 
   if (mv_joint_vertical(joint_type))
-    diff.row = read_mv_component(r, &ctx->comps[0], use_hp);
+    diff.row = read_mv_component(r, &ctx->comps[0], allow_hp);
 
   if (mv_joint_horizontal(joint_type))
-    diff.col = read_mv_component(r, &ctx->comps[1], use_hp);
+    diff.col = read_mv_component(r, &ctx->comps[1], allow_hp);
 
-  av1_inc_mv(&diff, counts, use_hp);
+  av1_inc_mv(&diff, counts, allow_hp);
 
   mv->row = ref->row + diff.row;
   mv->col = ref->col + diff.col;
@@ -975,24 +995,6 @@
   }
 }
 
-#if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
-static MOTION_MODE read_motion_mode(AV1_COMMON *cm, MACROBLOCKD *xd,
-                                    MB_MODE_INFO *mbmi, aom_reader *r) {
-  if (is_motion_variation_allowed(mbmi)) {
-    int motion_mode;
-    FRAME_COUNTS *counts = xd->counts;
-
-    motion_mode =
-        aom_read_tree(r, av1_motion_mode_tree,
-                      cm->fc->motion_mode_prob[mbmi->sb_type], ACCT_STR);
-    if (counts) ++counts->motion_mode[mbmi->sb_type][motion_mode];
-    return (MOTION_MODE)(SIMPLE_TRANSLATION + motion_mode);
-  } else {
-    return SIMPLE_TRANSLATION;
-  }
-}
-#endif  // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
-
 static INLINE InterpFilter read_interp_filter(AV1_COMMON *const cm,
                                               MACROBLOCKD *const xd,
 #if CONFIG_DUAL_FILTER
@@ -1801,9 +1803,9 @@
         inter_block) {
       const TX_SIZE max_tx_size = max_txsize_lookup[bsize];
       const BLOCK_SIZE txb_size = txsize_to_bsize[max_tx_size];
-      const int bs = num_4x4_blocks_wide_lookup[txb_size];
-      const int width = num_4x4_blocks_wide_lookup[bsize];
-      const int height = num_4x4_blocks_high_lookup[bsize];
+      const int bs = block_size_wide[txb_size] >> tx_size_wide_log2[0];
+      const int width = block_size_wide[bsize] >> tx_size_wide_log2[0];
+      const int height = block_size_high[bsize] >> tx_size_wide_log2[0];
       int idx, idy;
 #if CONFIG_EXT_TX && CONFIG_RECT_TX
       int is_rect_tx_allowed = inter_block && is_rect_tx_allowed_bsize(bsize) &&
diff --git a/av1/decoder/decoder.h b/av1/decoder/decoder.h
index fd68d13..262995a 100644
--- a/av1/decoder/decoder.h
+++ b/av1/decoder/decoder.h
@@ -107,7 +107,13 @@
   int acct_enabled;
   Accounting accounting;
 #endif
-
+  size_t uncomp_hdr_size;       // Size of the uncompressed header
+  size_t first_partition_size;  // Size of the compressed header
+#if CONFIG_TILE_GROUPS
+  int tg_size;   // Number of tiles in the current tile group
+  int tg_start;  // First tile in the current tile group
+  int tg_size_bit_offset;  // Bit offset of the tile group parameters
+#endif
 } AV1Decoder;
 
 int av1_receive_compressed_data(struct AV1Decoder *pbi, size_t size,
diff --git a/av1/decoder/detokenize.c b/av1/decoder/detokenize.c
index 67af6f3..795b1b0 100644
--- a/av1/decoder/detokenize.c
+++ b/av1/decoder/detokenize.c
@@ -48,15 +48,14 @@
 }
 
 #if CONFIG_AOM_QM
-static int decode_coefs(const MACROBLOCKD *xd, PLANE_TYPE type,
-                        tran_low_t *dqcoeff, TX_SIZE tx_size, TX_TYPE tx_type,
-                        const int16_t *dq, int ctx, const int16_t *scan,
-                        const int16_t *nb, aom_reader *r,
+static int decode_coefs(MACROBLOCKD *xd, PLANE_TYPE type, tran_low_t *dqcoeff,
+                        TX_SIZE tx_size, TX_TYPE tx_type, const int16_t *dq,
+                        int ctx, const int16_t *scan, const int16_t *nb,
+                        int16_t *max_scan_line, aom_reader *r,
                         const qm_val_t *iqm[2][TX_SIZES])
 #else
-static int decode_coefs(const MACROBLOCKD *xd, PLANE_TYPE type,
-                        tran_low_t *dqcoeff, TX_SIZE tx_size, TX_TYPE tx_type,
-                        const int16_t *dq,
+static int decode_coefs(MACROBLOCKD *xd, PLANE_TYPE type, tran_low_t *dqcoeff,
+                        TX_SIZE tx_size, TX_TYPE tx_type, const int16_t *dq,
 #if CONFIG_NEW_QUANT
                         dequant_val_type_nuq *dq_val,
 #endif  // CONFIG_NEW_QUANT
@@ -65,22 +64,22 @@
 #endif
 {
   FRAME_COUNTS *counts = xd->counts;
-  const int max_eob = get_tx2d_size(tx_size);
-  const FRAME_CONTEXT *const fc = xd->fc;
+  FRAME_CONTEXT *const fc = xd->fc;
+  const int max_eob = tx_size_2d[tx_size];
   const int ref = is_inter_block(&xd->mi[0]->mbmi);
 #if CONFIG_AOM_QM
   const qm_val_t *iqmatrix = iqm[!ref][tx_size];
 #endif
   int band, c = 0;
   const int tx_size_ctx = txsize_sqr_map[tx_size];
-  const aom_prob(*coef_probs)[COEFF_CONTEXTS][UNCONSTRAINED_NODES] =
+  aom_prob(*coef_probs)[COEFF_CONTEXTS][UNCONSTRAINED_NODES] =
       fc->coef_probs[tx_size_ctx][type][ref];
   const aom_prob *prob;
-#if CONFIG_RANS || CONFIG_DAALA_EC
-  const aom_cdf_prob(*const coef_cdfs)[COEFF_CONTEXTS][ENTROPY_TOKENS] =
+#if CONFIG_EC_MULTISYMBOL
+  aom_cdf_prob(*coef_cdfs)[COEFF_CONTEXTS][ENTROPY_TOKENS] =
       fc->coef_cdfs[tx_size_ctx][type][ref];
-  const aom_cdf_prob(*cdf)[ENTROPY_TOKENS];
-#endif  // CONFIG_RANS
+  aom_cdf_prob(*cdf)[ENTROPY_TOKENS];
+#endif  // CONFIG_EC_MULTISYMBOL
   unsigned int(*coef_counts)[COEFF_CONTEXTS][UNCONSTRAINED_NODES + 1];
   unsigned int(*eob_branch_count)[COEFF_CONTEXTS];
   uint8_t token_cache[MAX_TX_SQUARE];
@@ -169,7 +168,7 @@
 
     *max_scan_line = AOMMAX(*max_scan_line, scan[c]);
 
-#if CONFIG_RANS || CONFIG_DAALA_EC
+#if CONFIG_EC_MULTISYMBOL
     cdf = &coef_cdfs[band][ctx];
     token = ONE_TOKEN +
             aom_read_symbol(r, *cdf, CATEGORY6_TOKEN - ONE_TOKEN + 1, ACCT_STR);
@@ -215,7 +214,7 @@
 #endif
       } break;
     }
-#else
+#else  // CONFIG_EC_MULTISYMBOL
     if (!aom_read(r, prob[ONE_CONTEXT_NODE], ACCT_STR)) {
       INCREMENT_COUNT(ONE_TOKEN);
       token = ONE_TOKEN;
@@ -266,7 +265,7 @@
         }
       }
     }
-#endif  // CONFIG_RANS
+#endif  // CONFIG_EC_MULTISYMBOL
 #if CONFIG_NEW_QUANT
     v = av1_dequant_abscoeff_nuq(val, dqv, dqv_val);
     v = dq_shift ? ROUND_POWER_OF_TWO(v, dq_shift) : v;
@@ -345,7 +344,7 @@
 #if CONFIG_AOM_QM
   const int eob = decode_coefs(xd, pd->plane_type, pd->dqcoeff, tx_size,
                                tx_type, dequant, ctx, sc->scan, sc->neighbors,
-                               &sc->max_scan_line, r, pd->seg_iqmatrix[seg_id]);
+                               max_scan_line, r, pd->seg_iqmatrix[seg_id]);
 #else
   const int eob =
       decode_coefs(xd, pd->plane_type, pd->dqcoeff, tx_size, tx_type, dequant,
diff --git a/av1/encoder/bitstream.c b/av1/encoder/bitstream.c
index 260319e..fcfae7c 100644
--- a/av1/encoder/bitstream.c
+++ b/av1/encoder/bitstream.c
@@ -115,6 +115,9 @@
 #if CONFIG_LOOP_RESTORATION
 static struct av1_token switchable_restore_encodings[RESTORE_SWITCHABLE_TYPES];
 #endif  // CONFIG_LOOP_RESTORATION
+static void write_uncompressed_header(AV1_COMP *cpi,
+                                      struct aom_write_bit_buffer *wb);
+static uint32_t write_compressed_header(AV1_COMP *cpi, uint8_t *data);
 
 void av1_encode_token_init(void) {
 #if CONFIG_EXT_TX || CONFIG_PALETTE
@@ -314,7 +317,7 @@
 static void prob_diff_update(const aom_tree_index *tree,
                              aom_prob probs[/*n - 1*/],
                              const unsigned int counts[/*n - 1*/], int n,
-                             aom_writer *w) {
+                             int probwt, aom_writer *w) {
   int i;
   unsigned int branch_ct[32][2];
 
@@ -323,13 +326,14 @@
 
   av1_tree_probs_from_distribution(tree, branch_ct, counts);
   for (i = 0; i < n - 1; ++i)
-    av1_cond_prob_diff_update(w, &probs[i], branch_ct[i]);
+    av1_cond_prob_diff_update(w, &probs[i], branch_ct[i], probwt);
 }
 
+#if !CONFIG_EC_ADAPT
 static int prob_diff_update_savings(const aom_tree_index *tree,
                                     aom_prob probs[/*n - 1*/],
-                                    const unsigned int counts[/*n - 1*/],
-                                    int n) {
+                                    const unsigned int counts[/*n - 1*/], int n,
+                                    int probwt) {
   int i;
   unsigned int branch_ct[32][2];
   int savings = 0;
@@ -338,10 +342,12 @@
   assert(n <= 32);
   av1_tree_probs_from_distribution(tree, branch_ct, counts);
   for (i = 0; i < n - 1; ++i) {
-    savings += av1_cond_prob_diff_update_savings(&probs[i], branch_ct[i]);
+    savings +=
+        av1_cond_prob_diff_update_savings(&probs[i], branch_ct[i], probwt);
   }
   return savings;
 }
+#endif  // !CONFIG_EC_ADAPT
 
 #if CONFIG_VAR_TX
 static void write_tx_size_vartx(const AV1_COMMON *cm, const MACROBLOCKD *xd,
@@ -350,13 +356,12 @@
                                 aom_writer *w) {
   const int tx_row = blk_row >> 1;
   const int tx_col = blk_col >> 1;
-  int max_blocks_high = num_4x4_blocks_high_lookup[mbmi->sb_type];
-  int max_blocks_wide = num_4x4_blocks_wide_lookup[mbmi->sb_type];
-  int ctx = txfm_partition_context(xd->above_txfm_context + tx_col,
-                                   xd->left_txfm_context + tx_row, tx_size);
+  const int max_blocks_high = max_block_high(xd, mbmi->sb_type, 0);
+  const int max_blocks_wide = max_block_wide(xd, mbmi->sb_type, 0);
 
-  if (xd->mb_to_bottom_edge < 0) max_blocks_high += xd->mb_to_bottom_edge >> 5;
-  if (xd->mb_to_right_edge < 0) max_blocks_wide += xd->mb_to_right_edge >> 5;
+  int ctx = txfm_partition_context(xd->above_txfm_context + tx_col,
+                                   xd->left_txfm_context + tx_row,
+                                   mbmi->sb_type, tx_size);
 
   if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide) return;
 
@@ -394,11 +399,11 @@
 }
 
 static void update_txfm_partition_probs(AV1_COMMON *cm, aom_writer *w,
-                                        FRAME_COUNTS *counts) {
+                                        FRAME_COUNTS *counts, int probwt) {
   int k;
   for (k = 0; k < TXFM_PARTITION_CONTEXTS; ++k)
     av1_cond_prob_diff_update(w, &cm->fc->txfm_partition_prob[k],
-                              counts->txfm_partition[k]);
+                              counts->txfm_partition[k], probwt);
 }
 #endif
 
@@ -414,6 +419,7 @@
     const int tx_size_cat = is_inter ? inter_tx_size_cat_lookup[bsize]
                                      : intra_tx_size_cat_lookup[bsize];
     const TX_SIZE coded_tx_size = txsize_sqr_up_map[tx_size];
+    const int depth = tx_size_to_depth(coded_tx_size);
 
 #if CONFIG_EXT_TX && CONFIG_RECT_TX
     assert(IMPLIES(is_rect_tx(tx_size), is_rect_tx_allowed(xd, mbmi)));
@@ -423,7 +429,7 @@
 
     av1_write_token(w, av1_tx_size_tree[tx_size_cat],
                     cm->fc->tx_size_probs[tx_size_cat][tx_size_ctx],
-                    &tx_size_encodings[tx_size_cat][coded_tx_size]);
+                    &tx_size_encodings[tx_size_cat][depth]);
   }
 }
 
@@ -431,23 +437,33 @@
 static void update_inter_mode_probs(AV1_COMMON *cm, aom_writer *w,
                                     FRAME_COUNTS *counts) {
   int i;
+#if CONFIG_TILE_GROUPS
+  const int probwt = cm->num_tg;
+#else
+  const int probwt = 1;
+#endif
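+  // The compressed header is repeated once per tile group, so weight the
+  // cost of each probability update by the group count.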
   for (i = 0; i < NEWMV_MODE_CONTEXTS; ++i)
-    av1_cond_prob_diff_update(w, &cm->fc->newmv_prob[i], counts->newmv_mode[i]);
+    av1_cond_prob_diff_update(w, &cm->fc->newmv_prob[i], counts->newmv_mode[i],
+                              probwt);
   for (i = 0; i < ZEROMV_MODE_CONTEXTS; ++i)
     av1_cond_prob_diff_update(w, &cm->fc->zeromv_prob[i],
-                              counts->zeromv_mode[i]);
+                              counts->zeromv_mode[i], probwt);
   for (i = 0; i < REFMV_MODE_CONTEXTS; ++i)
-    av1_cond_prob_diff_update(w, &cm->fc->refmv_prob[i], counts->refmv_mode[i]);
+    av1_cond_prob_diff_update(w, &cm->fc->refmv_prob[i], counts->refmv_mode[i],
+                              probwt);
   for (i = 0; i < DRL_MODE_CONTEXTS; ++i)
-    av1_cond_prob_diff_update(w, &cm->fc->drl_prob[i], counts->drl_mode[i]);
+    av1_cond_prob_diff_update(w, &cm->fc->drl_prob[i], counts->drl_mode[i],
+                              probwt);
 #if CONFIG_EXT_INTER
-  av1_cond_prob_diff_update(w, &cm->fc->new2mv_prob, counts->new2mv_mode);
+  av1_cond_prob_diff_update(w, &cm->fc->new2mv_prob, counts->new2mv_mode,
+                            probwt);
 #endif  // CONFIG_EXT_INTER
 }
 #endif
 
 #if CONFIG_EXT_INTER
-static void update_inter_compound_mode_probs(AV1_COMMON *cm, aom_writer *w) {
+static void update_inter_compound_mode_probs(AV1_COMMON *cm, int probwt,
+                                             aom_writer *w) {
   const int savings_thresh = av1_cost_one(GROUP_DIFF_UPDATE_PROB) -
                              av1_cost_zero(GROUP_DIFF_UPDATE_PROB);
   int i;
@@ -456,7 +472,7 @@
   for (i = 0; i < INTER_MODE_CONTEXTS; ++i) {
     savings += prob_diff_update_savings(
         av1_inter_compound_mode_tree, cm->fc->inter_compound_mode_probs[i],
-        cm->counts.inter_compound_mode[i], INTER_COMPOUND_MODES);
+        cm->counts.inter_compound_mode[i], INTER_COMPOUND_MODES, probwt);
   }
   do_update = savings > savings_thresh;
   aom_write(w, do_update, GROUP_DIFF_UPDATE_PROB);
@@ -464,7 +480,7 @@
     for (i = 0; i < INTER_MODE_CONTEXTS; ++i) {
       prob_diff_update(
           av1_inter_compound_mode_tree, cm->fc->inter_compound_mode_probs[i],
-          cm->counts.inter_compound_mode[i], INTER_COMPOUND_MODES, w);
+          cm->counts.inter_compound_mode[i], INTER_COMPOUND_MODES, probwt, w);
     }
   }
 }
@@ -509,9 +525,14 @@
 static void update_delta_q_probs(AV1_COMMON *cm, aom_writer *w,
                                  FRAME_COUNTS *counts) {
   int k;
-
+#if CONFIG_TILE_GROUPS
+  const int probwt = cm->num_tg;
+#else
+  const int probwt = 1;
+#endif
   for (k = 0; k < DELTA_Q_CONTEXTS; ++k) {
-    av1_cond_prob_diff_update(w, &cm->fc->delta_q_prob[k], counts->delta_q[k]);
+    av1_cond_prob_diff_update(w, &cm->fc->delta_q_prob[k], counts->delta_q[k],
+                              probwt);
   }
 }
 #endif
@@ -519,25 +540,33 @@
 static void update_skip_probs(AV1_COMMON *cm, aom_writer *w,
                               FRAME_COUNTS *counts) {
   int k;
-
-  for (k = 0; k < SKIP_CONTEXTS; ++k)
-    av1_cond_prob_diff_update(w, &cm->fc->skip_probs[k], counts->skip[k]);
+#if CONFIG_TILE_GROUPS
+  const int probwt = cm->num_tg;
+#else
+  const int probwt = 1;
+#endif
+  for (k = 0; k < SKIP_CONTEXTS; ++k) {
+    av1_cond_prob_diff_update(w, &cm->fc->skip_probs[k], counts->skip[k],
+                              probwt);
+  }
 }
 
+#if !CONFIG_EC_ADAPT
 static void update_switchable_interp_probs(AV1_COMMON *cm, aom_writer *w,
                                            FRAME_COUNTS *counts) {
   int j;
   for (j = 0; j < SWITCHABLE_FILTER_CONTEXTS; ++j) {
-    prob_diff_update(av1_switchable_interp_tree,
-                     cm->fc->switchable_interp_prob[j],
-                     counts->switchable_interp[j], SWITCHABLE_FILTERS, w);
-#if CONFIG_DAALA_EC
-    av1_tree_to_cdf(av1_switchable_interp_tree,
-                    cm->fc->switchable_interp_prob[j],
-                    cm->fc->switchable_interp_cdf[j]);
+#if CONFIG_TILE_GROUPS
+    const int probwt = cm->num_tg;
+#else
+    const int probwt = 1;
 #endif
+    prob_diff_update(
+        av1_switchable_interp_tree, cm->fc->switchable_interp_prob[j],
+        counts->switchable_interp[j], SWITCHABLE_FILTERS, probwt, w);
   }
 }
+#endif  // !CONFIG_EC_ADAPT
 
 #if CONFIG_EXT_TX
 static void update_ext_tx_probs(AV1_COMMON *cm, aom_writer *w) {
@@ -545,6 +574,11 @@
                              av1_cost_zero(GROUP_DIFF_UPDATE_PROB);
   int i, j;
   int s;
+#if CONFIG_TILE_GROUPS
+  const int probwt = cm->num_tg;
+#else
+  const int probwt = 1;
+#endif
   for (s = 1; s < EXT_TX_SETS_INTER; ++s) {
     int savings = 0;
     int do_update = 0;
@@ -552,7 +586,7 @@
       if (!use_inter_ext_tx_for_txsize[s][i]) continue;
       savings += prob_diff_update_savings(
           av1_ext_tx_inter_tree[s], cm->fc->inter_ext_tx_prob[s][i],
-          cm->counts.inter_ext_tx[s][i], num_ext_tx_set_inter[s]);
+          cm->counts.inter_ext_tx[s][i], num_ext_tx_set_inter[s], probwt);
     }
     do_update = savings > savings_thresh;
     aom_write(w, do_update, GROUP_DIFF_UPDATE_PROB);
@@ -561,7 +595,7 @@
         if (!use_inter_ext_tx_for_txsize[s][i]) continue;
         prob_diff_update(
             av1_ext_tx_inter_tree[s], cm->fc->inter_ext_tx_prob[s][i],
-            cm->counts.inter_ext_tx[s][i], num_ext_tx_set_inter[s], w);
+            cm->counts.inter_ext_tx[s][i], num_ext_tx_set_inter[s], probwt, w);
       }
     }
   }
@@ -574,7 +608,7 @@
       for (j = 0; j < INTRA_MODES; ++j)
         savings += prob_diff_update_savings(
             av1_ext_tx_intra_tree[s], cm->fc->intra_ext_tx_prob[s][i][j],
-            cm->counts.intra_ext_tx[s][i][j], num_ext_tx_set_intra[s]);
+            cm->counts.intra_ext_tx[s][i][j], num_ext_tx_set_intra[s], probwt);
     }
     do_update = savings > savings_thresh;
     aom_write(w, do_update, GROUP_DIFF_UPDATE_PROB);
@@ -582,16 +616,17 @@
       for (i = TX_4X4; i < EXT_TX_SIZES; ++i) {
         if (!use_intra_ext_tx_for_txsize[s][i]) continue;
         for (j = 0; j < INTRA_MODES; ++j)
-          prob_diff_update(
-              av1_ext_tx_intra_tree[s], cm->fc->intra_ext_tx_prob[s][i][j],
-              cm->counts.intra_ext_tx[s][i][j], num_ext_tx_set_intra[s], w);
+          prob_diff_update(av1_ext_tx_intra_tree[s],
+                           cm->fc->intra_ext_tx_prob[s][i][j],
+                           cm->counts.intra_ext_tx[s][i][j],
+                           num_ext_tx_set_intra[s], probwt, w);
       }
     }
   }
 }
 
 #else
-
+#if !CONFIG_EC_ADAPT
 static void update_ext_tx_probs(AV1_COMMON *cm, aom_writer *w) {
   const int savings_thresh = av1_cost_one(GROUP_DIFF_UPDATE_PROB) -
                              av1_cost_zero(GROUP_DIFF_UPDATE_PROB);
@@ -599,11 +634,16 @@
 
   int savings = 0;
   int do_update = 0;
+#if CONFIG_TILE_GROUPS
+  const int probwt = cm->num_tg;
+#else
+  const int probwt = 1;
+#endif
   for (i = TX_4X4; i < EXT_TX_SIZES; ++i) {
     for (j = 0; j < TX_TYPES; ++j)
       savings += prob_diff_update_savings(
           av1_ext_tx_tree, cm->fc->intra_ext_tx_prob[i][j],
-          cm->counts.intra_ext_tx[i][j], TX_TYPES);
+          cm->counts.intra_ext_tx[i][j], TX_TYPES, probwt);
   }
   do_update = savings > savings_thresh;
   aom_write(w, do_update, GROUP_DIFF_UPDATE_PROB);
@@ -611,35 +651,28 @@
     for (i = TX_4X4; i < EXT_TX_SIZES; ++i) {
       for (j = 0; j < TX_TYPES; ++j) {
         prob_diff_update(av1_ext_tx_tree, cm->fc->intra_ext_tx_prob[i][j],
-                         cm->counts.intra_ext_tx[i][j], TX_TYPES, w);
-#if CONFIG_DAALA_EC
-        av1_tree_to_cdf(av1_ext_tx_tree, cm->fc->intra_ext_tx_prob[i][j],
-                        cm->fc->intra_ext_tx_cdf[i][j]);
-#endif
+                         cm->counts.intra_ext_tx[i][j], TX_TYPES, probwt, w);
       }
     }
   }
+
   savings = 0;
   for (i = TX_4X4; i < EXT_TX_SIZES; ++i) {
     savings +=
         prob_diff_update_savings(av1_ext_tx_tree, cm->fc->inter_ext_tx_prob[i],
-                                 cm->counts.inter_ext_tx[i], TX_TYPES);
+                                 cm->counts.inter_ext_tx[i], TX_TYPES, probwt);
   }
   do_update = savings > savings_thresh;
   aom_write(w, do_update, GROUP_DIFF_UPDATE_PROB);
   if (do_update) {
     for (i = TX_4X4; i < EXT_TX_SIZES; ++i) {
       prob_diff_update(av1_ext_tx_tree, cm->fc->inter_ext_tx_prob[i],
-                       cm->counts.inter_ext_tx[i], TX_TYPES, w);
-#if CONFIG_DAALA_EC
-      av1_tree_to_cdf(av1_ext_tx_tree, cm->fc->inter_ext_tx_prob[i],
-                      cm->fc->inter_ext_tx_cdf[i]);
-#endif
+                       cm->counts.inter_ext_tx[i], TX_TYPES, probwt, w);
     }
   }
 }
 #endif  // CONFIG_EXT_TX
-
+#endif
 #if CONFIG_PALETTE
 static void pack_palette_tokens(aom_writer *w, const TOKENEXTRA **tp, int n,
                                 int num) {
@@ -657,7 +690,7 @@
 #endif  // CONFIG_PALETTE
 
 #if CONFIG_SUPERTX
-static void update_supertx_probs(AV1_COMMON *cm, aom_writer *w) {
+static void update_supertx_probs(AV1_COMMON *cm, int probwt, aom_writer *w) {
   const int savings_thresh = av1_cost_one(GROUP_DIFF_UPDATE_PROB) -
                              av1_cost_zero(GROUP_DIFF_UPDATE_PROB);
   int i, j;
@@ -665,8 +698,8 @@
   int do_update = 0;
   for (i = 0; i < PARTITION_SUPERTX_CONTEXTS; ++i) {
     for (j = 1; j < TX_SIZES; ++j) {
-      savings += av1_cond_prob_diff_update_savings(&cm->fc->supertx_prob[i][j],
-                                                   cm->counts.supertx[i][j]);
+      savings += av1_cond_prob_diff_update_savings(
+          &cm->fc->supertx_prob[i][j], cm->counts.supertx[i][j], probwt);
     }
   }
   do_update = savings > savings_thresh;
@@ -675,7 +708,7 @@
     for (i = 0; i < PARTITION_SUPERTX_CONTEXTS; ++i) {
       for (j = 1; j < TX_SIZES; ++j) {
         av1_cond_prob_diff_update(w, &cm->fc->supertx_prob[i][j],
-                                  cm->counts.supertx[i][j]);
+                                  cm->counts.supertx[i][j], probwt);
       }
     }
   }
@@ -684,11 +717,11 @@
 
 static void pack_mb_tokens(aom_writer *w, const TOKENEXTRA **tp,
                            const TOKENEXTRA *const stop,
-                           aom_bit_depth_t bit_depth, const TX_SIZE tx) {
+                           aom_bit_depth_t bit_depth, const TX_SIZE tx_size) {
   const TOKENEXTRA *p = *tp;
 #if CONFIG_VAR_TX
   int count = 0;
-  const int seg_eob = get_tx2d_size(tx);
+  const int seg_eob = tx_size_2d[tx_size];
 #endif
 #if CONFIG_AOM_HIGHBITDEPTH
   const av1_extra_bit *const extra_bits_table =
@@ -703,14 +736,14 @@
   while (p < stop && p->token != EOSB_TOKEN) {
     const int token = p->token;
     aom_tree_index index = 0;
-#if !CONFIG_RANS && !CONFIG_DAALA_EC
+#if !CONFIG_EC_MULTISYMBOL
     const struct av1_token *const coef_encoding = &av1_coef_encodings[token];
     int coef_value = coef_encoding->value;
     int coef_length = coef_encoding->len;
-#endif  // !CONFIG_RANS
+#endif  // !CONFIG_EC_MULTISYMBOL
     const av1_extra_bit *const extra_bits = &extra_bits_table[token];
 
-#if CONFIG_RANS || CONFIG_DAALA_EC
+#if CONFIG_EC_MULTISYMBOL
     /* skip one or two nodes */
     if (!p->skip_eob_node) aom_write(w, token != EOB_TOKEN, p->context_tree[0]);
 
@@ -743,7 +776,7 @@
         }
       }
     }
-#endif  // CONFIG_RANS
+#endif  // CONFIG_EC_MULTISYMBOL
 
     if (extra_bits->base_val) {
       const int bit_string = p->extra;
@@ -751,7 +784,7 @@
                                                       // be written excluding
                                                       // the sign bit.
       int skip_bits = (extra_bits->base_val == CAT6_MIN_VAL)
-                          ? TX_SIZES - 1 - txsize_sqr_up_map[tx]
+                          ? TX_SIZES - 1 - txsize_sqr_up_map[tx_size]
                           : 0;
 
       if (bit_string_length > 0) {
@@ -797,13 +830,8 @@
   const int tx_row = blk_row >> (1 - pd->subsampling_y);
   const int tx_col = blk_col >> (1 - pd->subsampling_x);
   TX_SIZE plane_tx_size;
-  int max_blocks_high = num_4x4_blocks_high_lookup[plane_bsize];
-  int max_blocks_wide = num_4x4_blocks_wide_lookup[plane_bsize];
-
-  if (xd->mb_to_bottom_edge < 0)
-    max_blocks_high += xd->mb_to_bottom_edge >> (5 + pd->subsampling_y);
-  if (xd->mb_to_right_edge < 0)
-    max_blocks_wide += xd->mb_to_right_edge >> (5 + pd->subsampling_x);
+  const int max_blocks_high = max_block_high(xd, plane_bsize, plane);
+  const int max_blocks_wide = max_block_wide(xd, plane_bsize, plane);
 
   if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide) return;
 
@@ -814,29 +842,28 @@
   if (tx_size == plane_tx_size) {
     pack_mb_tokens(w, tp, tok_end, bit_depth, tx_size);
   } else {
-    int bsl = b_width_log2_lookup[bsize];
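+    // Half the block width in transform block units.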
+    const int bsl = block_size_wide[bsize] >> (tx_size_wide_log2[0] + 1);
     int i;
 
     assert(bsl > 0);
-    --bsl;
 
     for (i = 0; i < 4; ++i) {
-      const int offsetr = blk_row + ((i >> 1) << bsl);
-      const int offsetc = blk_col + ((i & 0x01) << bsl);
-      int step = num_4x4_blocks_txsize_lookup[tx_size - 1];
+      const int offsetr = blk_row + (i >> 1) * bsl;
+      const int offsetc = blk_col + (i & 0x01) * bsl;
+      const TX_SIZE sub_txs = tx_size - 1;
+      const int step = tx_size_wide_unit[sub_txs] * tx_size_high_unit[sub_txs];
 
       if (offsetr >= max_blocks_high || offsetc >= max_blocks_wide) continue;
 
       pack_txb_tokens(w, tp, tok_end, xd, mbmi, plane, plane_bsize, bit_depth,
-                      block + i * step, offsetr, offsetc, tx_size - 1);
+                      block + i * step, offsetr, offsetc, sub_txs);
     }
   }
 }
 #endif
 
 static void write_segment_id(aom_writer *w, const struct segmentation *seg,
-                             const struct segmentation_probs *segp,
-                             int segment_id) {
+                             struct segmentation_probs *segp, int segment_id) {
   if (seg->enabled && seg->update_map) {
 #if CONFIG_DAALA_EC
     aom_write_symbol(w, segment_id, segp->tree_cdf, MAX_SEGMENTS);
@@ -1097,7 +1124,7 @@
                                 aom_writer *w) {
   AV1_COMMON *const cm = &cpi->common;
 #if !CONFIG_REF_MV
-  const nmv_context *nmvc = &cm->fc->nmvc;
+  nmv_context *nmvc = &cm->fc->nmvc;
 #endif
 
 #if CONFIG_DELTA_Q
@@ -1108,7 +1135,7 @@
   const MACROBLOCKD *xd = &x->e_mbd;
 #endif
   const struct segmentation *const seg = &cm->seg;
-  const struct segmentation_probs *const segp = &cm->fc->seg;
+  struct segmentation_probs *const segp = &cm->fc->seg;
   const MB_MODE_INFO *const mbmi = &mi->mbmi;
   const MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext;
   const PREDICTION_MODE mode = mbmi->mode;
@@ -1321,7 +1348,7 @@
               int nmv_ctx = av1_nmv_ctx(mbmi_ext->ref_mv_count[rf_type],
                                         mbmi_ext->ref_mv_stack[rf_type], ref,
                                         mbmi->ref_mv_idx);
-              const nmv_context *nmvc = &cm->fc->nmvc[nmv_ctx];
+              nmv_context *nmvc = &cm->fc->nmvc[nmv_ctx];
 #endif
               av1_encode_mv(cpi, w, &mi->bmi[j].as_mv[ref].as_mv,
 #if CONFIG_EXT_INTER
@@ -1346,7 +1373,7 @@
             int nmv_ctx = av1_nmv_ctx(mbmi_ext->ref_mv_count[rf_type],
                                       mbmi_ext->ref_mv_stack[rf_type], 1,
                                       mbmi->ref_mv_idx);
-            const nmv_context *nmvc = &cm->fc->nmvc[nmv_ctx];
+            nmv_context *nmvc = &cm->fc->nmvc[nmv_ctx];
 #endif
             av1_encode_mv(cpi, w, &mi->bmi[j].as_mv[1].as_mv,
                           &mi->bmi[j].ref_mv[1].as_mv,
@@ -1360,7 +1387,7 @@
             int nmv_ctx = av1_nmv_ctx(mbmi_ext->ref_mv_count[rf_type],
                                       mbmi_ext->ref_mv_stack[rf_type], 0,
                                       mbmi->ref_mv_idx);
-            const nmv_context *nmvc = &cm->fc->nmvc[nmv_ctx];
+            nmv_context *nmvc = &cm->fc->nmvc[nmv_ctx];
 #endif
             av1_encode_mv(cpi, w, &mi->bmi[j].as_mv[0].as_mv,
                           &mi->bmi[j].ref_mv[0].as_mv,
@@ -1385,7 +1412,7 @@
           int nmv_ctx = av1_nmv_ctx(mbmi_ext->ref_mv_count[rf_type],
                                     mbmi_ext->ref_mv_stack[rf_type], ref,
                                     mbmi->ref_mv_idx);
-          const nmv_context *nmvc = &cm->fc->nmvc[nmv_ctx];
+          nmv_context *nmvc = &cm->fc->nmvc[nmv_ctx];
 #endif
           ref_mv = mbmi_ext->ref_mvs[mbmi->ref_frame[ref]][0];
 #if CONFIG_EXT_INTER
@@ -1411,7 +1438,7 @@
         int nmv_ctx =
             av1_nmv_ctx(mbmi_ext->ref_mv_count[rf_type],
                         mbmi_ext->ref_mv_stack[rf_type], 1, mbmi->ref_mv_idx);
-        const nmv_context *nmvc = &cm->fc->nmvc[nmv_ctx];
+        nmv_context *nmvc = &cm->fc->nmvc[nmv_ctx];
 #endif
         av1_encode_mv(cpi, w, &mbmi->mv[1].as_mv,
                       &mbmi_ext->ref_mvs[mbmi->ref_frame[1]][0].as_mv,
@@ -1425,7 +1452,7 @@
         int nmv_ctx =
             av1_nmv_ctx(mbmi_ext->ref_mv_count[rf_type],
                         mbmi_ext->ref_mv_stack[rf_type], 0, mbmi->ref_mv_idx);
-        const nmv_context *nmvc = &cm->fc->nmvc[nmv_ctx];
+        nmv_context *nmvc = &cm->fc->nmvc[nmv_ctx];
 #endif
         av1_encode_mv(cpi, w, &mbmi->mv[0].as_mv,
                       &mbmi_ext->ref_mvs[mbmi->ref_frame[0]][0].as_mv,
@@ -1571,15 +1598,15 @@
 }
 
 #if CONFIG_DELTA_Q
-static void write_mb_modes_kf(const AV1_COMMON *cm, MACROBLOCKD *xd,
+static void write_mb_modes_kf(AV1_COMMON *cm, MACROBLOCKD *xd,
                               MODE_INFO **mi_8x8, aom_writer *w) {
   int skip;
 #else
-static void write_mb_modes_kf(const AV1_COMMON *cm, const MACROBLOCKD *xd,
+static void write_mb_modes_kf(AV1_COMMON *cm, const MACROBLOCKD *xd,
                               MODE_INFO **mi_8x8, aom_writer *w) {
 #endif
   const struct segmentation *const seg = &cm->seg;
-  const struct segmentation_probs *const segp = &cm->fc->seg;
+  struct segmentation_probs *const segp = &cm->fc->seg;
   const MODE_INFO *const mi = mi_8x8[0];
   const MODE_INFO *const above_mi = xd->above_mi;
   const MODE_INFO *const left_mi = xd->left_mi;
@@ -1800,8 +1827,10 @@
       const BLOCK_SIZE plane_bsize =
           get_plane_block_size(AOMMAX(bsize, BLOCK_8X8), pd);
 
-      const int num_4x4_w = num_4x4_blocks_wide_lookup[plane_bsize];
-      const int num_4x4_h = num_4x4_blocks_high_lookup[plane_bsize];
+      const int num_4x4_w =
+          block_size_wide[plane_bsize] >> tx_size_wide_log2[0];
+      const int num_4x4_h =
+          block_size_high[plane_bsize] >> tx_size_wide_log2[0];
       int row, col;
 #if CONFIG_EXT_TX && CONFIG_RECT_TX
       TX_SIZE tx_size =
@@ -1812,12 +1841,13 @@
       if (is_inter_block(mbmi)) {
 #endif
         const TX_SIZE max_tx_size = max_txsize_lookup[plane_bsize];
-        const BLOCK_SIZE txb_size = txsize_to_bsize[max_tx_size];
         int block = 0;
-        const int step = num_4x4_blocks_txsize_lookup[max_tx_size];
-        bw = num_4x4_blocks_wide_lookup[txb_size];
-        for (row = 0; row < num_4x4_h; row += bw) {
-          for (col = 0; col < num_4x4_w; col += bw) {
+        const int step =
+            tx_size_wide_unit[max_tx_size] * tx_size_high_unit[max_tx_size];
+        const int bkw = tx_size_wide_unit[max_tx_size];
+        const int bkh = tx_size_high_unit[max_tx_size];
+        for (row = 0; row < num_4x4_h; row += bkh) {
+          for (col = 0; col < num_4x4_w; col += bkw) {
             pack_txb_tokens(w, tok, tok_end, xd, mbmi, plane, plane_bsize,
                             cm->bit_depth, block, row, col, max_tx_size);
             block += step;
@@ -1826,12 +1856,11 @@
       } else {
         TX_SIZE tx = plane ? get_uv_tx_size(&m->mbmi, &xd->plane[plane])
                            : m->mbmi.tx_size;
-        BLOCK_SIZE txb_size = txsize_to_bsize[tx];
-        bw = num_4x4_blocks_wide_lookup[txb_size];
-        bh = num_4x4_blocks_high_lookup[txb_size];
+        const int bkw = tx_size_wide_unit[tx];
+        const int bkh = tx_size_high_unit[tx];
 
-        for (row = 0; row < num_4x4_h; row += bh)
-          for (col = 0; col < num_4x4_w; col += bw)
+        for (row = 0; row < num_4x4_h; row += bkh)
+          for (col = 0; col < num_4x4_w; col += bkw)
             pack_mb_tokens(w, tok, tok_end, cm->bit_depth, tx);
       }
 #else
@@ -2057,6 +2086,17 @@
   if (bsize >= BLOCK_8X8 &&
       (bsize == BLOCK_8X8 || partition != PARTITION_SPLIT))
     update_partition_context(xd, mi_row, mi_col, subsize, bsize);
+#endif  // CONFIG_EXT_PARTITION_TYPES
+
+#if CONFIG_DERING
+  if (bsize == BLOCK_64X64 && cm->dering_level != 0 &&
+      !sb_all_skip(cm, mi_row, mi_col)) {
+    aom_write_literal(
+        w,
+        cm->mi_grid_visible[mi_row * cm->mi_stride + mi_col]->mbmi.dering_gain,
+        DERING_REFINEMENT_BITS);
+  }
+#endif  // CONFIG_DERING
 
 #if CONFIG_CLPF
   if (bsize == BLOCK_64X64 && cm->clpf_blocks && cm->clpf_strength_y &&
@@ -2088,17 +2128,6 @@
       aom_write_literal(w, cm->clpf_blocks[br], 1);
   }
 #endif
-
-#if CONFIG_DERING
-  if (bsize == BLOCK_64X64 && cm->dering_level != 0 &&
-      !sb_all_skip(cm, mi_row, mi_col)) {
-    aom_write_literal(
-        w,
-        cm->mi_grid_visible[mi_row * cm->mi_stride + mi_col]->mbmi.dering_gain,
-        DERING_REFINEMENT_BITS);
-  }
-#endif
-#endif  // CONFIG_EXT_PARTITION_TYPES
 }
 
 static void write_modes(AV1_COMP *const cpi, const TileInfo *const tile,
@@ -2161,9 +2190,18 @@
                                      av1_coeff_probs_model *new_coef_probs) {
   av1_coeff_probs_model *old_coef_probs = cpi->common.fc->coef_probs[tx_size];
   const aom_prob upd = DIFF_UPDATE_PROB;
+#if CONFIG_EC_ADAPT
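+  // The pivot node probability is not explicitly updated in this case.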
+  const int entropy_nodes_update = UNCONSTRAINED_NODES - 1;
+#else
   const int entropy_nodes_update = UNCONSTRAINED_NODES;
+#endif
   int i, j, k, l, t;
   int stepsize = cpi->sf.coeff_prob_appx_step;
+#if CONFIG_TILE_GROUPS
+  const int probwt = cpi->common.num_tg;
+#else
+  const int probwt = 1;
+#endif
 
   switch (cpi->sf.use_fast_coef_updates) {
     case TWO_LOOP: {
@@ -2182,10 +2220,11 @@
                 if (t == PIVOT_NODE)
                   s = av1_prob_diff_update_savings_search_model(
                       frame_branch_ct[i][j][k][l][0],
-                      old_coef_probs[i][j][k][l], &newp, upd, stepsize);
+                      old_coef_probs[i][j][k][l], &newp, upd, stepsize, probwt);
                 else
                   s = av1_prob_diff_update_savings_search(
-                      frame_branch_ct[i][j][k][l][t], oldp, &newp, upd);
+                      frame_branch_ct[i][j][k][l][t], oldp, &newp, upd, probwt);
+
                 if (s > 0 && newp != oldp) u = 1;
                 if (u)
                   savings += s - (int)(av1_cost_zero(upd));
@@ -2217,10 +2256,11 @@
                 if (t == PIVOT_NODE)
                   s = av1_prob_diff_update_savings_search_model(
                       frame_branch_ct[i][j][k][l][0],
-                      old_coef_probs[i][j][k][l], &newp, upd, stepsize);
+                      old_coef_probs[i][j][k][l], &newp, upd, stepsize, probwt);
                 else
                   s = av1_prob_diff_update_savings_search(
-                      frame_branch_ct[i][j][k][l][t], *oldp, &newp, upd);
+                      frame_branch_ct[i][j][k][l][t], *oldp, &newp, upd,
+                      probwt);
                 if (s > 0 && newp != *oldp) u = 1;
                 aom_write(bc, u, upd);
                 if (u) {
@@ -2249,14 +2289,14 @@
                 aom_prob *oldp = old_coef_probs[i][j][k][l] + t;
                 int s;
                 int u = 0;
-
                 if (t == PIVOT_NODE) {
                   s = av1_prob_diff_update_savings_search_model(
                       frame_branch_ct[i][j][k][l][0],
-                      old_coef_probs[i][j][k][l], &newp, upd, stepsize);
+                      old_coef_probs[i][j][k][l], &newp, upd, stepsize, probwt);
                 } else {
                   s = av1_prob_diff_update_savings_search(
-                      frame_branch_ct[i][j][k][l][t], *oldp, &newp, upd);
+                      frame_branch_ct[i][j][k][l][t], *oldp, &newp, upd,
+                      probwt);
                 }
 
                 if (s > 0 && newp != *oldp) u = 1;
@@ -2840,8 +2880,14 @@
   }
 }
 
+#if !CONFIG_EC_ADAPT
 static void update_seg_probs(AV1_COMP *cpi, aom_writer *w) {
   AV1_COMMON *cm = &cpi->common;
+#if CONFIG_TILE_GROUPS
+  const int probwt = cm->num_tg;
+#else
+  const int probwt = 1;
+#endif
 
   if (!cm->seg.enabled || !cm->seg.update_map) return;
 
@@ -2850,19 +2896,16 @@
 
     for (i = 0; i < PREDICTION_PROBS; i++)
       av1_cond_prob_diff_update(w, &cm->fc->seg.pred_probs[i],
-                                cm->counts.seg.pred[i]);
+                                cm->counts.seg.pred[i], probwt);
 
     prob_diff_update(av1_segment_tree, cm->fc->seg.tree_probs,
-                     cm->counts.seg.tree_mispred, MAX_SEGMENTS, w);
+                     cm->counts.seg.tree_mispred, MAX_SEGMENTS, probwt, w);
   } else {
     prob_diff_update(av1_segment_tree, cm->fc->seg.tree_probs,
-                     cm->counts.seg.tree_total, MAX_SEGMENTS, w);
+                     cm->counts.seg.tree_total, MAX_SEGMENTS, probwt, w);
   }
-#if CONFIG_DAALA_EC
-  av1_tree_to_cdf(av1_segment_tree, cm->fc->seg.tree_probs,
-                  cm->fc->seg.tree_cdf);
-#endif
 }
+#endif  // !CONFIG_EC_ADAPT
 
 static void write_txfm_mode(TX_MODE mode, struct aom_write_bit_buffer *wb) {
   aom_wb_write_bit(wb, mode == TX_MODE_SELECT);
@@ -2871,12 +2914,17 @@
 
 static void update_txfm_probs(AV1_COMMON *cm, aom_writer *w,
                               FRAME_COUNTS *counts) {
+#if CONFIG_TILE_GROUPS
+  const int probwt = cm->num_tg;
+#else
+  const int probwt = 1;
+#endif
   if (cm->tx_mode == TX_MODE_SELECT) {
     int i, j;
     for (i = 0; i < MAX_TX_DEPTH; ++i)
       for (j = 0; j < TX_SIZE_CONTEXTS; ++j)
         prob_diff_update(av1_tx_size_tree[i], cm->fc->tx_size_probs[i][j],
-                         counts->tx_size[i][j], i + 2, w);
+                         counts->tx_size[i][j], i + 2, probwt, w);
   }
 }
 
@@ -3059,9 +3107,16 @@
 }
 #endif  // CONFIG_EXT_TILE
 
+#if CONFIG_TILE_GROUPS
+static uint32_t write_tiles(AV1_COMP *const cpi,
+                            struct aom_write_bit_buffer *wb,
+                            unsigned int *max_tile_size,
+                            unsigned int *max_tile_col_size) {
+#else
 static uint32_t write_tiles(AV1_COMP *const cpi, uint8_t *const dst,
                             unsigned int *max_tile_size,
                             unsigned int *max_tile_col_size) {
+#endif
   const AV1_COMMON *const cm = &cpi->common;
 #if CONFIG_ANS
   struct AnsCoder token_ans;
@@ -3074,6 +3129,20 @@
   size_t total_size = 0;
   const int tile_cols = cm->tile_cols;
   const int tile_rows = cm->tile_rows;
+#if CONFIG_TILE_GROUPS
+  const int n_log2_tiles = cm->log2_tile_rows + cm->log2_tile_cols;
+  const int have_tiles = n_log2_tiles > 0;
+  size_t comp_hdr_size;
+  // Fixed size tile groups for the moment
+  const int num_tg_hdrs = cm->num_tg;
+  const int tg_size = (tile_rows * tile_cols + num_tg_hdrs - 1) / num_tg_hdrs;
+  int tile_count = 0;
+  int uncompressed_hdr_size = 0;
+  uint8_t *dst = NULL;
+  struct aom_write_bit_buffer comp_hdr_len_wb;
+  struct aom_write_bit_buffer tg_params_wb;
+  int saved_offset;
+#endif
 #if CONFIG_EXT_TILE
   const int have_tiles = tile_cols * tile_rows > 1;
 #endif  // CONFIG_EXT_TILE
@@ -3166,6 +3235,28 @@
     }
   }
 #else
+#if CONFIG_TILE_GROUPS
+  write_uncompressed_header(cpi, wb);
+
+  // Write the tile length code. Use full 32-bit length fields for the moment.
+  if (have_tiles) aom_wb_write_literal(wb, 3, 2);
+
+  /* Write a placeholder for the number of tiles in each tile group */
+  tg_params_wb = *wb;
+  saved_offset = wb->bit_offset;
+  if (n_log2_tiles) aom_wb_write_literal(wb, 0, n_log2_tiles * 2);
+
+  /* Write a placeholder for the compressed header length */
+  comp_hdr_len_wb = *wb;
+  aom_wb_write_literal(wb, 0, 16);
+
+  uncompressed_hdr_size = aom_wb_bytes_written(wb);
+  dst = wb->bit_buffer;
+  comp_hdr_size = write_compressed_header(cpi, dst + uncompressed_hdr_size);
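+  // Backfill the 16-bit placeholder with the actual compressed header size.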
+  aom_wb_write_literal(&comp_hdr_len_wb, (int)(comp_hdr_size), 16);
+  total_size += uncompressed_hdr_size + comp_hdr_size;
+#endif
+
   for (tile_row = 0; tile_row < tile_rows; tile_row++) {
     TileInfo tile_info;
     const int is_last_row = (tile_row == tile_rows - 1);
@@ -3175,11 +3266,33 @@
     for (tile_col = 0; tile_col < tile_cols; tile_col++) {
       TileBufferEnc *const buf = &tile_buffers[tile_row][tile_col];
       const int is_last_col = (tile_col == tile_cols - 1);
-      const int is_last_tile = is_last_col && is_last_row;
       unsigned int tile_size;
       const TOKENEXTRA *tok = tok_buffers[tile_row][tile_col];
       const TOKENEXTRA *tok_end = tok + cpi->tok_count[tile_row][tile_col];
-
+#if !CONFIG_TILE_GROUPS
+      const int is_last_tile = is_last_col && is_last_row;
+#else
+      const int tile_idx = tile_row * tile_cols + tile_col;
+      // All tiles in a tile group have a length
+      const int is_last_tile = 0;
+      if (tile_count >= tg_size) {
+        // Copy uncompressed header
+        memcpy(dst + total_size, dst, uncompressed_hdr_size * sizeof(uint8_t));
+        // Write the number of tiles in the group into the last uncompressed
+        // header
+        aom_wb_write_literal(&tg_params_wb, tile_idx - tile_count,
+                             n_log2_tiles);
+        aom_wb_write_literal(&tg_params_wb, tile_count - 1, n_log2_tiles);
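+        // Reposition the placeholder writer inside the header copy just made.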
+        tg_params_wb.bit_offset = saved_offset + 8 * total_size;
+        // Copy compressed header
+        memcpy(dst + total_size + uncompressed_hdr_size,
+               dst + uncompressed_hdr_size, comp_hdr_size * sizeof(uint8_t));
+        total_size += uncompressed_hdr_size;
+        total_size += comp_hdr_size;
+        tile_count = 0;
+      }
+      tile_count++;
+#endif
       av1_tile_set_col(&tile_info, cm, tile_col);
 
       buf->data = dst + total_size;
@@ -3187,19 +3300,19 @@
       // The last tile does not have a header.
       if (!is_last_tile) total_size += 4;
 
-#if !CONFIG_ANS
-      aom_start_encode(&mode_bc, dst + total_size);
-      write_modes(cpi, &tile_info, &mode_bc, &tok, tok_end);
-      assert(tok == tok_end);
-      aom_stop_encode(&mode_bc);
-      tile_size = mode_bc.pos;
-#else
+#if CONFIG_ANS
       buf_ans_write_reset(buf_ans);
       write_modes(cpi, &tile_info, buf_ans, &tok, tok_end);
       assert(tok == tok_end);
       ans_write_init(&token_ans, dst + total_size);
       buf_ans_flush(buf_ans, &token_ans);
       tile_size = ans_write_end(&token_ans);
+#else
+      aom_start_encode(&mode_bc, dst + total_size);
+      write_modes(cpi, &tile_info, &mode_bc, &tok, tok_end);
+      assert(tok == tok_end);
+      aom_stop_encode(&mode_bc);
+      tile_size = mode_bc.pos;
-#endif  // !CONFIG_ANS
+#endif  // CONFIG_ANS
 
       assert(tile_size > 0);
@@ -3215,6 +3328,14 @@
       total_size += tile_size;
     }
   }
+#if CONFIG_TILE_GROUPS
+  // Write the final tile group size
+  if (n_log2_tiles) {
+    aom_wb_write_literal(&tg_params_wb, (1 << n_log2_tiles) - tile_count,
+                         n_log2_tiles);
+    aom_wb_write_literal(&tg_params_wb, tile_count - 1, n_log2_tiles);
+  }
+#endif
 #endif  // CONFIG_EXT_TILE
   return (uint32_t)total_size;
 }
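
The tile-group path above leans on a write-placeholder-then-backfill idiom: a copy of the bit-buffer writer is saved where a length field belongs, a zero reserves the bits, and once the payload size is known the saved copy rewrites the field in place. A minimal sketch of the idiom using the same aom_write_bit_buffer API, assuming the backfill writes exactly as many bits as the placeholder; write_payload() is a hypothetical stand-in for write_compressed_header():

void write_payload(struct aom_write_bit_buffer *wb); /* hypothetical */

static size_t write_with_length_prefix(struct aom_write_bit_buffer *wb) {
  /* A copy of the writer, frozen at the future length field. */
  struct aom_write_bit_buffer len_wb = *wb;
  size_t before, payload_size;

  aom_wb_write_literal(wb, 0, 16); /* reserve 16 bits, as above */
  before = aom_wb_bytes_written(wb);
  write_payload(wb); /* emit the real data */
  payload_size = aom_wb_bytes_written(wb) - before;

  /* Backfill: the saved copy overwrites only the reserved bits. */
  aom_wb_write_literal(&len_wb, (int)payload_size, 16);
  return aom_wb_bytes_written(wb);
}
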
@@ -3443,12 +3564,12 @@
 #endif  // CONFIG_EXT_PARTITION
 
   encode_loopfilter(cm, wb);
-#if CONFIG_CLPF
-  encode_clpf(cm, wb);
-#endif
 #if CONFIG_DERING
   encode_dering(cm->dering_level, wb);
 #endif  // CONFIG_DERING
+#if CONFIG_CLPF
+  encode_clpf(cm, wb);
+#endif
 #if CONFIG_LOOP_RESTORATION
   encode_restoration_mode(cm, wb);
 #endif  // CONFIG_LOOP_RESTORATION
@@ -3560,6 +3681,12 @@
   aom_writer *header_bc;
   int i, j;
 
+#if CONFIG_TILE_GROUPS
+  const int probwt = cm->num_tg;
+#else
+  const int probwt = 1;
+#endif
+
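probwt threads through every probability-update helper touched in this file. The rationale: each tile group repeats the compressed header, so the bits spent signalling an updated probability are paid cm->num_tg times, while the savings from coding with the better probability accrue only once per frame. A hedged sketch of the resulting decision rule; the three cost terms are illustrative stand-ins, not libaom functions:

#include <stdint.h>

/* Update only if the coding savings beat the update cost paid once per
 * tile-group header. */
static int prob_update_is_worthwhile(int64_t old_cost_bits,
                                     int64_t new_cost_bits,
                                     int64_t update_cost_bits, int probwt) {
  return old_cost_bits - new_cost_bits - update_cost_bits * probwt > 0;
}
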
 #if CONFIG_ANS
   struct AnsCoder header_ans;
   int header_size;
@@ -3574,17 +3701,16 @@
 #if CONFIG_LOOP_RESTORATION
   encode_restoration(cm, header_bc);
 #endif  // CONFIG_LOOP_RESTORATION
-
   update_txfm_probs(cm, header_bc, counts);
   update_coef_probs(cpi, header_bc);
 
 #if CONFIG_VAR_TX
-  update_txfm_partition_probs(cm, header_bc, counts);
+  update_txfm_partition_probs(cm, header_bc, counts, probwt);
 #if CONFIG_EXT_TX && CONFIG_RECT_TX
   if (cm->tx_mode == TX_MODE_SELECT) {
     for (i = 1; i < TX_SIZES - 1; ++i)
       av1_cond_prob_diff_update(header_bc, &fc->rect_tx_prob[i],
-                                counts->rect_tx[i]);
+                                counts->rect_tx[i], probwt);
   }
 #endif  // CONFIG_EXT_TX && CONFIG_RECT_TX
 #endif
@@ -3593,155 +3719,143 @@
 #if CONFIG_DELTA_Q
   update_delta_q_probs(cm, header_bc, counts);
 #endif
+#if !CONFIG_EC_ADAPT
   update_seg_probs(cpi, header_bc);
 
   for (i = 0; i < INTRA_MODES; ++i) {
     prob_diff_update(av1_intra_mode_tree, fc->uv_mode_prob[i],
-                     counts->uv_mode[i], INTRA_MODES, header_bc);
-#if CONFIG_DAALA_EC
-    av1_tree_to_cdf(av1_intra_mode_tree, fc->uv_mode_prob[i],
-                    fc->uv_mode_cdf[i]);
-#endif
+                     counts->uv_mode[i], INTRA_MODES, probwt, header_bc);
   }
 
 #if CONFIG_EXT_PARTITION_TYPES
   prob_diff_update(av1_partition_tree, fc->partition_prob[0],
-                   counts->partition[0], PARTITION_TYPES, header_bc);
+                   counts->partition[0], PARTITION_TYPES, probwt, header_bc);
   for (i = 1; i < PARTITION_CONTEXTS; ++i)
     prob_diff_update(av1_ext_partition_tree, fc->partition_prob[i],
-                     counts->partition[i], EXT_PARTITION_TYPES, header_bc);
+                     counts->partition[i], EXT_PARTITION_TYPES, probwt,
+                     header_bc);
 #else
   for (i = 0; i < PARTITION_CONTEXTS; ++i) {
     prob_diff_update(av1_partition_tree, fc->partition_prob[i],
-                     counts->partition[i], PARTITION_TYPES, header_bc);
-#if CONFIG_DAALA_EC
-    av1_tree_to_cdf(av1_partition_tree, cm->fc->partition_prob[i],
-                    cm->fc->partition_cdf[i]);
-#endif
+                     counts->partition[i], PARTITION_TYPES, probwt, header_bc);
   }
-#endif  // CONFIG_EXT_PARTITION_TYPES
+#endif  // CONFIG_EXT_PARTITION_TYPES
 
 #if CONFIG_EXT_INTRA
   for (i = 0; i < INTRA_FILTERS + 1; ++i)
     prob_diff_update(av1_intra_filter_tree, fc->intra_filter_probs[i],
-                     counts->intra_filter[i], INTRA_FILTERS, header_bc);
+                     counts->intra_filter[i], INTRA_FILTERS, probwt, header_bc);
 #endif  // CONFIG_EXT_INTRA
-
+#endif  // !CONFIG_EC_ADAPT
   if (frame_is_intra_only(cm)) {
     av1_copy(cm->kf_y_prob, av1_kf_y_mode_prob);
 #if CONFIG_DAALA_EC
     av1_copy(cm->kf_y_cdf, av1_kf_y_mode_cdf);
 #endif
+
+#if !CONFIG_EC_ADAPT
     for (i = 0; i < INTRA_MODES; ++i)
-      for (j = 0; j < INTRA_MODES; ++j) {
+      for (j = 0; j < INTRA_MODES; ++j)
         prob_diff_update(av1_intra_mode_tree, cm->kf_y_prob[i][j],
-                         counts->kf_y_mode[i][j], INTRA_MODES, header_bc);
-#if CONFIG_DAALA_EC
-        av1_tree_to_cdf(av1_intra_mode_tree, cm->kf_y_prob[i][j],
-                        cm->kf_y_cdf[i][j]);
-#endif
-      }
+                         counts->kf_y_mode[i][j], INTRA_MODES, probwt,
+                         header_bc);
+#endif  // !CONFIG_EC_ADAPT
   } else {
 #if CONFIG_REF_MV
     update_inter_mode_probs(cm, header_bc, counts);
 #else
+#if !CONFIG_EC_ADAPT
     for (i = 0; i < INTER_MODE_CONTEXTS; ++i) {
       prob_diff_update(av1_inter_mode_tree, cm->fc->inter_mode_probs[i],
-                       counts->inter_mode[i], INTER_MODES, header_bc);
-#if CONFIG_DAALA_EC
-      av1_tree_to_cdf(av1_inter_mode_tree, cm->fc->inter_mode_probs[i],
-                      cm->fc->inter_mode_cdf[i]);
-#endif
+                       counts->inter_mode[i], INTER_MODES, probwt, header_bc);
     }
 #endif
-
+#endif
 #if CONFIG_EXT_INTER
-    update_inter_compound_mode_probs(cm, header_bc);
+    update_inter_compound_mode_probs(cm, probwt, header_bc);
 
     if (cm->reference_mode != COMPOUND_REFERENCE) {
       for (i = 0; i < BLOCK_SIZE_GROUPS; i++) {
         if (is_interintra_allowed_bsize_group(i)) {
           av1_cond_prob_diff_update(header_bc, &fc->interintra_prob[i],
-                                    cm->counts.interintra[i]);
+                                    cm->counts.interintra[i], probwt);
         }
       }
       for (i = 0; i < BLOCK_SIZE_GROUPS; i++) {
         prob_diff_update(
             av1_interintra_mode_tree, cm->fc->interintra_mode_prob[i],
-            counts->interintra_mode[i], INTERINTRA_MODES, header_bc);
+            counts->interintra_mode[i], INTERINTRA_MODES, probwt, header_bc);
       }
       for (i = 0; i < BLOCK_SIZES; i++) {
         if (is_interintra_allowed_bsize(i) && is_interintra_wedge_used(i))
           av1_cond_prob_diff_update(header_bc, &fc->wedge_interintra_prob[i],
-                                    cm->counts.wedge_interintra[i]);
+                                    cm->counts.wedge_interintra[i], probwt);
       }
     }
     if (cm->reference_mode != SINGLE_REFERENCE) {
       for (i = 0; i < BLOCK_SIZES; i++)
         if (is_interinter_wedge_used(i))
           av1_cond_prob_diff_update(header_bc, &fc->wedge_interinter_prob[i],
-                                    cm->counts.wedge_interinter[i]);
+                                    cm->counts.wedge_interinter[i], probwt);
     }
 #endif  // CONFIG_EXT_INTER
 
 #if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
     for (i = BLOCK_8X8; i < BLOCK_SIZES; ++i)
       prob_diff_update(av1_motion_mode_tree, fc->motion_mode_prob[i],
-                       counts->motion_mode[i], MOTION_MODES, header_bc);
+                       counts->motion_mode[i], MOTION_MODES, probwt, header_bc);
 #endif  // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
-
+#if !CONFIG_EC_ADAPT
     if (cm->interp_filter == SWITCHABLE)
       update_switchable_interp_probs(cm, header_bc, counts);
+#endif
 
     for (i = 0; i < INTRA_INTER_CONTEXTS; i++)
       av1_cond_prob_diff_update(header_bc, &fc->intra_inter_prob[i],
-                                counts->intra_inter[i]);
+                                counts->intra_inter[i], probwt);
 
     if (cpi->allow_comp_inter_inter) {
       const int use_hybrid_pred = cm->reference_mode == REFERENCE_MODE_SELECT;
       if (use_hybrid_pred)
         for (i = 0; i < COMP_INTER_CONTEXTS; i++)
           av1_cond_prob_diff_update(header_bc, &fc->comp_inter_prob[i],
-                                    counts->comp_inter[i]);
+                                    counts->comp_inter[i], probwt);
     }
 
     if (cm->reference_mode != COMPOUND_REFERENCE) {
       for (i = 0; i < REF_CONTEXTS; i++) {
         for (j = 0; j < (SINGLE_REFS - 1); j++) {
           av1_cond_prob_diff_update(header_bc, &fc->single_ref_prob[i][j],
-                                    counts->single_ref[i][j]);
+                                    counts->single_ref[i][j], probwt);
         }
       }
     }
-
     if (cm->reference_mode != SINGLE_REFERENCE) {
       for (i = 0; i < REF_CONTEXTS; i++) {
 #if CONFIG_EXT_REFS
         for (j = 0; j < (FWD_REFS - 1); j++) {
           av1_cond_prob_diff_update(header_bc, &fc->comp_ref_prob[i][j],
-                                    counts->comp_ref[i][j]);
+                                    counts->comp_ref[i][j], probwt);
         }
         for (j = 0; j < (BWD_REFS - 1); j++) {
           av1_cond_prob_diff_update(header_bc, &fc->comp_bwdref_prob[i][j],
-                                    counts->comp_bwdref[i][j]);
+                                    counts->comp_bwdref[i][j], probwt);
         }
 #else
         for (j = 0; j < (COMP_REFS - 1); j++) {
           av1_cond_prob_diff_update(header_bc, &fc->comp_ref_prob[i][j],
-                                    counts->comp_ref[i][j]);
+                                    counts->comp_ref[i][j], probwt);
         }
 #endif  // CONFIG_EXT_REFS
       }
     }
 
+#if !CONFIG_EC_ADAPT
     for (i = 0; i < BLOCK_SIZE_GROUPS; ++i) {
       prob_diff_update(av1_intra_mode_tree, cm->fc->y_mode_prob[i],
-                       counts->y_mode[i], INTRA_MODES, header_bc);
-#if CONFIG_DAALA_EC
-      av1_tree_to_cdf(av1_intra_mode_tree, cm->fc->y_mode_prob[i],
-                      cm->fc->y_mode_cdf[i]);
-#endif
+                       counts->y_mode[i], INTRA_MODES, probwt, header_bc);
     }
+#endif
 
     av1_write_nmv_probs(cm, cm->allow_high_precision_mv, header_bc,
 #if CONFIG_REF_MV
@@ -3753,14 +3867,23 @@
     av1_tree_to_cdf(av1_mv_joint_tree, cm->fc->nmvc.joints,
                     cm->fc->nmvc.joint_cdf);
 #endif
+#if !CONFIG_EC_ADAPT
     update_ext_tx_probs(cm, header_bc);
+#endif
 #if CONFIG_SUPERTX
-    if (!xd->lossless[0]) update_supertx_probs(cm, header_bc);
+    if (!xd->lossless[0]) update_supertx_probs(cm, probwt, header_bc);
 #endif  // CONFIG_SUPERTX
 #if CONFIG_GLOBAL_MOTION
     write_global_motion(cpi, header_bc);
 #endif  // CONFIG_GLOBAL_MOTION
   }
+#if CONFIG_EC_MULTISYMBOL
+  av1_coef_pareto_cdfs(fc);
+  av1_set_mv_cdfs(&fc->nmvc);
+#if CONFIG_DAALA_EC
+  av1_set_mode_cdfs(cm);
+#endif
+#endif
 #if CONFIG_ANS
   ans_write_init(&header_ans, data);
   buf_ans_flush(header_bc, &header_ans);
@@ -3774,6 +3897,7 @@
 #endif  // CONFIG_ANS
 }
 
+#if !CONFIG_TILE_GROUPS
 static int choose_size_bytes(uint32_t size, int spare_msbs) {
   // Choose the number of bytes required to represent size, without
   // using the 'spare_msbs' number of most significant bits.
@@ -3803,7 +3927,6 @@
     default: assert("Invalid size" && 0); break;
   }
 }
-
 static int remux_tiles(const AV1_COMMON *const cm, uint8_t *dst,
                        const uint32_t data_size, const uint32_t max_tile_size,
                        const uint32_t max_tile_col_size,
@@ -3901,19 +4024,24 @@
     return wpos;
   }
 }
+#endif  // CONFIG_TILE_GROUPS
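
choose_size_bytes() above returns the narrowest byte width that can hold a value once spare_msbs are reserved, and remux_tiles() then rewrites every tile size field at that width. A small sketch of the width selection, with an illustrative value:

#include <stdint.h>

static int bytes_needed(uint32_t v) {
  if (v >> 24) return 4;
  if (v >> 16) return 3;
  if (v >> 8) return 2;
  return 1;
}

/* e.g. if the largest tile is 70000 bytes, bytes_needed(70000) == 3, so the
 * 4-byte size fields originally written can be remuxed down to 3 bytes,
 * saving a byte per tile. */
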
 
 void av1_pack_bitstream(AV1_COMP *const cpi, uint8_t *dst, size_t *size) {
   uint8_t *data = dst;
+#if !CONFIG_TILE_GROUPS
   uint32_t compressed_header_size;
   uint32_t uncompressed_header_size;
+  struct aom_write_bit_buffer saved_wb;
+#endif
   uint32_t data_size;
   struct aom_write_bit_buffer wb = { data, 0 };
-  struct aom_write_bit_buffer saved_wb;
+
   unsigned int max_tile_size;
   unsigned int max_tile_col_size;
+
+#if !CONFIG_TILE_GROUPS
   int tile_size_bytes;
   int tile_col_size_bytes;
-
   AV1_COMMON *const cm = &cpi->common;
   const int have_tiles = cm->tile_cols * cm->tile_rows > 1;
 
@@ -3958,7 +4086,10 @@
 
   // Write the encoded tile data
   data_size = write_tiles(cpi, data, &max_tile_size, &max_tile_col_size);
-
+#else
+  data_size = write_tiles(cpi, &wb, &max_tile_size, &max_tile_col_size);
+#endif
+#if !CONFIG_TILE_GROUPS
   if (have_tiles) {
     data_size =
         remux_tiles(cm, data, data_size, max_tile_size, max_tile_col_size,
@@ -3979,6 +4110,8 @@
   // TODO(jbb): Figure out what to do if compressed_header_size > 16 bits.
   assert(compressed_header_size <= 0xffff);
   aom_wb_write_literal(&saved_wb, compressed_header_size, 16);
-
+#else
+  data += data_size;
+#endif
   *size = data - dst;
 }
diff --git a/av1/encoder/dct.c b/av1/encoder/dct.c
index 221e3cd..dd4031f 100644
--- a/av1/encoder/dct.c
+++ b/av1/encoder/dct.c
@@ -18,6 +18,8 @@
 #include "aom_dsp/fwd_txfm.h"
 #include "aom_ports/mem.h"
 #include "av1/common/blockd.h"
+#include "av1/common/av1_fwd_txfm1d.h"
+#include "av1/common/av1_fwd_txfm2d_cfg.h"
 #include "av1/common/idct.h"
 
 static INLINE void range_check(const tran_low_t *input, const int size,
@@ -1874,12 +1876,103 @@
   }
 }
 
+#if CONFIG_TX64X64
+#if CONFIG_EXT_TX
+static void fidtx64(const tran_low_t *input, tran_low_t *output) {
+  int i;
+  for (i = 0; i < 64; ++i)
+    output[i] = (tran_low_t)fdct_round_shift(input[i] * 4 * Sqrt2);
+}
+
+// For use in lieu of ADST
+static void fhalfright64(const tran_low_t *input, tran_low_t *output) {
+  int i;
+  tran_low_t inputhalf[32];
+  // Scale and copy the first half of the input into the second half
+  // of the output
+  for (i = 0; i < 32; ++i) {
+    output[32 + i] = (tran_low_t)fdct_round_shift(input[i] * 4 * Sqrt2);
+  }
+  // Multiply input by sqrt(2)
+  for (i = 0; i < 32; ++i) {
+    inputhalf[i] = (tran_low_t)fdct_round_shift(input[i + 32] * Sqrt2);
+  }
+  fdct32(inputhalf, output);
+  // Note overall scaling factor is 2 times unitary
+}
+#endif  // CONFIG_EXT_TX
+
+static void fdct64_col(const tran_low_t *input, tran_low_t *output) {
+  int32_t in[64], out[64];
+  int i;
+  for (i = 0; i < 64; ++i) in[i] = (int32_t)input[i];
+  av1_fdct64_new(in, out, fwd_cos_bit_col_dct_dct_64,
+                 fwd_stage_range_col_dct_dct_64);
+  for (i = 0; i < 64; ++i) output[i] = (tran_low_t)out[i];
+}
+
+static void fdct64_row(const tran_low_t *input, tran_low_t *output) {
+  int32_t in[64], out[64];
+  int i;
+  for (i = 0; i < 64; ++i) in[i] = (int32_t)input[i];
+  av1_fdct64_new(in, out, fwd_cos_bit_row_dct_dct_64,
+                 fwd_stage_range_row_dct_dct_64);
+  for (i = 0; i < 64; ++i) output[i] = (tran_low_t)out[i];
+}
+
+void av1_fht64x64_c(const int16_t *input, tran_low_t *output, int stride,
+                    int tx_type) {
+  static const transform_2d FHT[] = {
+    { fdct64_col, fdct64_row },  // DCT_DCT
+#if CONFIG_EXT_TX
+    { fhalfright64, fdct64_row },    // ADST_DCT
+    { fdct64_col, fhalfright64 },    // DCT_ADST
+    { fhalfright64, fhalfright64 },  // ADST_ADST
+    { fhalfright64, fdct64_row },    // FLIPADST_DCT
+    { fdct64_col, fhalfright64 },    // DCT_FLIPADST
+    { fhalfright64, fhalfright64 },  // FLIPADST_FLIPADST
+    { fhalfright64, fhalfright64 },  // ADST_FLIPADST
+    { fhalfright64, fhalfright64 },  // FLIPADST_ADST
+    { fidtx64, fidtx64 },            // IDTX
+    { fdct64_col, fidtx64 },         // V_DCT
+    { fidtx64, fdct64_row },         // H_DCT
+    { fhalfright64, fidtx64 },       // V_ADST
+    { fidtx64, fhalfright64 },       // H_ADST
+    { fhalfright64, fidtx64 },       // V_FLIPADST
+    { fidtx64, fhalfright64 },       // H_FLIPADST
+#endif
+  };
+  const transform_2d ht = FHT[tx_type];
+  tran_low_t out[4096];
+  int i, j;
+  tran_low_t temp_in[64], temp_out[64];
+#if CONFIG_EXT_TX
+  int16_t flipped_input[64 * 64];
+  maybe_flip_input(&input, &stride, 64, 64, flipped_input, tx_type);
+#endif
+  // Columns
+  for (i = 0; i < 64; ++i) {
+    for (j = 0; j < 64; ++j) temp_in[j] = input[j * stride + i];
+    ht.cols(temp_in, temp_out);
+    for (j = 0; j < 64; ++j)
+      out[j * 64 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2;
+  }
+
+  // Rows
+  for (i = 0; i < 64; ++i) {
+    for (j = 0; j < 64; ++j) temp_in[j] = out[j + i * 64];
+    ht.rows(temp_in, temp_out);
+    for (j = 0; j < 64; ++j)
+      output[j + i * 64] =
+          (tran_low_t)((temp_out[j] + 1 + (temp_out[j] < 0)) >> 2);
+  }
+}
+#endif  // CONFIG_TX64X64
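
The two passes of av1_fht64x64_c round their divide-by-4 in opposite directions on ties: the column pass rounds halves away from zero, the row pass toward zero, presumably so the tie-breaking biases of the two passes do not compound. A self-checking sketch, assuming arithmetic right shift of negative ints as the rest of the codebase does:

#include <assert.h>

static int round_col(int x) { return (x + 1 + (x > 0)) >> 2; } /* ties away */
static int round_row(int x) { return (x + 1 + (x < 0)) >> 2; } /* ties toward */

static void check_rounding(void) {
  assert(round_col(2) == 1 && round_col(-2) == -1); /* +/-0.5 -> +/-1 */
  assert(round_row(2) == 0 && round_row(-2) == 0);  /* +/-0.5 -> 0 */
  assert(round_col(5) == 1 && round_row(-5) == -1); /* non-ties agree */
}
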
+
 #if CONFIG_EXT_TX
 // Forward identity transform.
 void av1_fwd_idtx_c(const int16_t *src_diff, tran_low_t *coeff, int stride,
                     int bs, int tx_type) {
   int r, c;
-  const int shift = bs < 32 ? 3 : 2;
+  const int shift = bs < 32 ? 3 : (bs < 64 ? 2 : 1);
   if (tx_type == IDTX) {
     for (r = 0; r < bs; ++r) {
       for (c = 0; c < bs; ++c) coeff[c] = src_diff[c] * (1 << shift);
@@ -1894,5 +1987,12 @@
                            int tx_type) {
   av1_fht32x32_c(input, output, stride, tx_type);
 }
+
+#if CONFIG_TX64X64
+void av1_highbd_fht64x64_c(const int16_t *input, tran_low_t *output, int stride,
+                           int tx_type) {
+  av1_fht64x64_c(input, output, stride, tx_type);
+}
+#endif  // CONFIG_TX64X64
 #endif  // CONFIG_AOM_HIGHBITDEPTH
 #endif  // CONFIG_EXT_TX
diff --git a/av1/encoder/encodeframe.c b/av1/encoder/encodeframe.c
index 3733efc..d1d6ecc 100644
--- a/av1/encoder/encodeframe.c
+++ b/av1/encoder/encodeframe.c
@@ -1305,8 +1305,8 @@
 #if CONFIG_VAR_TX
   {
     const TX_SIZE mtx = mbmi->tx_size;
-    const int num_4x4_blocks_wide = num_4x4_blocks_wide_txsize_lookup[mtx] >> 1;
-    const int num_4x4_blocks_high = num_4x4_blocks_high_txsize_lookup[mtx] >> 1;
+    const int num_4x4_blocks_wide = tx_size_wide_unit[mtx] >> 1;
+    const int num_4x4_blocks_high = tx_size_high_unit[mtx] >> 1;
     int idy, idx;
     mbmi->inter_tx_size[0][0] = mtx;
     for (idy = 0; idy < num_4x4_blocks_high; ++idy)
@@ -4442,8 +4442,10 @@
   av1_zero_above_context(cm, tile_info->mi_col_start, tile_info->mi_col_end);
 
   // Set up pointers to per thread motion search counters.
-  td->mb.m_search_count_ptr = &td->rd_counts.m_search_count;
-  td->mb.ex_search_count_ptr = &td->rd_counts.ex_search_count;
+  this_tile->m_search_count = 0;   // Count of motion search hits.
+  this_tile->ex_search_count = 0;  // Exhaustive mesh search hits.
+  td->mb.m_search_count_ptr = &this_tile->m_search_count;
+  td->mb.ex_search_count_ptr = &this_tile->ex_search_count;
 
   for (mi_row = tile_info->mi_row_start; mi_row < tile_info->mi_row_end;
        mi_row += cm->mib_size) {
@@ -4484,10 +4486,35 @@
 #define MIN_TRANS_THRESH 8
 #define GLOBAL_MOTION_ADVANTAGE_THRESH 0.60
 #define GLOBAL_MOTION_MODEL ROTZOOM
-// TODO(sarahparker) This function needs to be adjusted
-// to accomodate changes in the paraemter integerization.
-// Commenting it out until the fix is made.
-/*
+
+// Adds some offset to a global motion parameter and handles
+// all of the necessary precision shifts, clamping, and
+// zero-centering.
+static int16_t add_param_offset(int param_index, int16_t param_value,
+                                int16_t offset) {
+  const int scale_vals[2] = { GM_ALPHA_PREC_DIFF, GM_TRANS_PREC_DIFF };
+  const int clamp_vals[2] = { GM_ALPHA_MAX, GM_TRANS_MAX };
+  const int is_trans_param = param_index < 2;
+  const int is_one_centered = (!is_trans_param) && (param_index & 1);
+
+  // Make parameter zero-centered and offset the shift that was done to make
+  // it compatible with the warped model
+  param_value = (param_value - (is_one_centered << WARPEDMODEL_PREC_BITS)) >>
+                scale_vals[is_trans_param];
+  // Add desired offset to the rescaled/zero-centered parameter
+  param_value += offset;
+  // Clamp the parameter so it does not overflow the number of bits allotted
+  // to it in the bitstream
+  param_value = (int16_t)clamp(param_value, -clamp_vals[is_trans_param],
+                               clamp_vals[is_trans_param]);
+  // Rescale the parameter to WARPEDMODEL_PRECISION_BITS so it is compatible
+  // with the warped motion library
+  param_value *= (1 << scale_vals[is_trans_param]);
+
+  // Undo the zero-centering step if necessary
+  return param_value + (is_one_centered << WARPEDMODEL_PREC_BITS);
+}
+
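refine_integerized_param() below is per-parameter coordinate descent: probe one step to either side of the current value via add_param_offset(), keep whichever probe lowers the warp error, recentre, and shrink the step. A condensed sketch of one parameter's refinement; error_of() is a hypothetical stand-in for the av1_warp_erroradv() call and its argument list, and the halving schedule is illustrative:

#include <stdint.h>

double error_of(void); /* hypothetical: warp error with the current *param */

static void refine_one_param(int p, int16_t *param, int16_t step,
                             int n_refinements) {
  int16_t curr = *param, best = curr;
  double best_error = error_of();
  int i, dir;
  for (i = 0; i < n_refinements; i++) {
    for (dir = -1; dir <= 1; dir += 2) { /* look left, then right */
      double err;
      *param = add_param_offset(p, curr, (int16_t)(dir * step));
      err = error_of();
      if (err < best_error) {
        best_error = err;
        best = *param;
      }
    }
    curr = best; /* recentre on the winner */
    step /= 2;
  }
  *param = best;
}
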
 static void refine_integerized_param(WarpedMotionParams *wm,
 #if CONFIG_AOM_HIGHBITDEPTH
                                      int use_hbd, int bd,
@@ -4500,7 +4527,7 @@
   int n_params = n_trans_model_params[wm->wmtype];
   int16_t *param_mat = (int16_t *)wm->wmmat;
   double step_error;
-  int step;
+  int16_t step;
   int16_t *param;
   int16_t curr_param;
   int16_t best_param;
@@ -4519,9 +4546,7 @@
     best_param = curr_param;
     for (i = 0; i < n_refinements; i++) {
       // look to the left
-      *param =
-          (int16_t)clamp(curr_param - step, p < 2 ? GM_TRANS_MIN : GM_ALPHA_MIN,
-                         p < 2 ? GM_TRANS_MAX : GM_ALPHA_MAX);
+      *param = add_param_offset(p, curr_param, -step);
       step_error =
           av1_warp_erroradv(wm,
 #if CONFIG_AOM_HIGHBITDEPTH
@@ -4538,9 +4563,7 @@
       }
 
       // look to the right
-      *param =
-          (int16_t)clamp(curr_param + step, p < 2 ? GM_TRANS_MIN : GM_ALPHA_MIN,
-                         p < 2 ? GM_TRANS_MAX : GM_ALPHA_MAX);
+      *param = add_param_offset(p, curr_param, step);
       step_error =
           av1_warp_erroradv(wm,
 #if CONFIG_AOM_HIGHBITDEPTH
@@ -4564,7 +4587,6 @@
     *param = best_param;
   }
 }
-*/
 
 static void convert_to_params(const double *params, TransformationType type,
                               int16_t *model) {
@@ -4617,6 +4639,9 @@
 
   x->min_partition_size = AOMMIN(x->min_partition_size, cm->sb_size);
   x->max_partition_size = AOMMIN(x->max_partition_size, cm->sb_size);
+#if CONFIG_SIMP_MV_PRED
+  cm->setup_mi(cm);
+#endif
 
   xd->mi = cm->mi_grid_visible;
   xd->mi[0] = cm->mi;
@@ -4624,8 +4649,6 @@
   av1_zero(*td->counts);
   av1_zero(rdc->coef_counts);
   av1_zero(rdc->comp_pred_diff);
-  rdc->m_search_count = 0;   // Count of motion search hits.
-  rdc->ex_search_count = 0;  // Exhaustive mesh search hits.
 
 #if CONFIG_GLOBAL_MOTION
   aom_clear_system_state();
@@ -4643,6 +4666,14 @@
           convert_model_to_params(params, GLOBAL_MOTION_MODEL,
                                   &cm->global_motion[frame]);
           if (get_gmtype(&cm->global_motion[frame]) > GLOBAL_ZERO) {
+            refine_integerized_param(
+                &cm->global_motion[frame].motion_params,
+#if CONFIG_AOM_HIGHBITDEPTH
+                xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH, xd->bd,
+#endif  // CONFIG_AOM_HIGHBITDEPTH
+                ref_buf->y_buffer, ref_buf->y_width, ref_buf->y_height,
+                ref_buf->y_stride, cpi->Source->y_buffer, cpi->Source->y_width,
+                cpi->Source->y_height, cpi->Source->y_stride, 3);
             // compute the advantage of using gm parameters over 0 motion
             erroradvantage = av1_warp_erroradv(
                 &cm->global_motion[frame].motion_params,
@@ -4858,17 +4889,18 @@
       int count16x16_16x16p = 0, count16x16_lp = 0;
       int count32x32 = 0;
       for (i = 0; i < TX_SIZE_CONTEXTS; ++i) {
-        count4x4 += counts->tx_size[0][i][TX_4X4];
-        count4x4 += counts->tx_size[1][i][TX_4X4];
-        count4x4 += counts->tx_size[2][i][TX_4X4];
+        // counts->tx_size[max_depth][context_idx][this_depth_level]
+        count4x4 += counts->tx_size[0][i][0];
+        count4x4 += counts->tx_size[1][i][0];
+        count4x4 += counts->tx_size[2][i][0];
 
-        count8x8_lp += counts->tx_size[1][i][TX_8X8];
-        count8x8_lp += counts->tx_size[2][i][TX_8X8];
-        count8x8_8x8p += counts->tx_size[0][i][TX_8X8];
+        count8x8_lp += counts->tx_size[1][i][1];
+        count8x8_lp += counts->tx_size[2][i][1];
+        count8x8_8x8p += counts->tx_size[0][i][1];
 
-        count16x16_16x16p += counts->tx_size[1][i][TX_16X16];
-        count16x16_lp += counts->tx_size[2][i][TX_16X16];
-        count32x32 += counts->tx_size[2][i][TX_32X32];
+        count16x16_16x16p += counts->tx_size[1][i][2];
+        count16x16_lp += counts->tx_size[2][i][2];
+        count32x32 += counts->tx_size[2][i][3];
       }
 #if CONFIG_EXT_TX && CONFIG_RECT_TX
       count4x4 += counts->tx_size_implied[0][TX_4X4];
@@ -4955,20 +4987,18 @@
 
 #if CONFIG_VAR_TX
 static void update_txfm_count(MACROBLOCK *x, MACROBLOCKD *xd,
-                              FRAME_COUNTS *counts, TX_SIZE tx_size,
+                              FRAME_COUNTS *counts, TX_SIZE tx_size, int depth,
                               int blk_row, int blk_col) {
   MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
   const int tx_row = blk_row >> 1;
   const int tx_col = blk_col >> 1;
-  int max_blocks_high = num_4x4_blocks_high_lookup[mbmi->sb_type];
-  int max_blocks_wide = num_4x4_blocks_wide_lookup[mbmi->sb_type];
+  const int max_blocks_high = max_block_high(xd, mbmi->sb_type, 0);
+  const int max_blocks_wide = max_block_wide(xd, mbmi->sb_type, 0);
   int ctx = txfm_partition_context(xd->above_txfm_context + tx_col,
-                                   xd->left_txfm_context + tx_row, tx_size);
+                                   xd->left_txfm_context + tx_row,
+                                   mbmi->sb_type, tx_size);
   const TX_SIZE plane_tx_size = mbmi->inter_tx_size[tx_row][tx_col];
 
-  if (xd->mb_to_bottom_edge < 0) max_blocks_high += xd->mb_to_bottom_edge >> 5;
-  if (xd->mb_to_right_edge < 0) max_blocks_wide += xd->mb_to_right_edge >> 5;
-
   if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide) return;
 
   if (tx_size == plane_tx_size) {
@@ -4994,8 +5024,8 @@
     for (i = 0; i < 4; ++i) {
       int offsetr = (i >> 1) * bh / 2;
       int offsetc = (i & 0x01) * bh / 2;
-      update_txfm_count(x, xd, counts, tx_size - 1, blk_row + offsetr,
-                        blk_col + offsetc);
+      update_txfm_count(x, xd, counts, tx_size - 1, depth + 1,
+                        blk_row + offsetr, blk_col + offsetc);
     }
   }
 }
@@ -5017,7 +5047,8 @@
 
   for (idy = 0; idy < mi_height; idy += bh)
     for (idx = 0; idx < mi_width; idx += bh)
-      update_txfm_count(x, xd, td_counts, max_tx_size, idy, idx);
+      update_txfm_count(x, xd, td_counts, max_tx_size, mi_width != mi_height,
+                        idy, idx);
 }
 
 static void set_txfm_context(MACROBLOCKD *xd, TX_SIZE tx_size, int blk_row,
@@ -5025,13 +5056,10 @@
   MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
   const int tx_row = blk_row >> 1;
   const int tx_col = blk_col >> 1;
-  int max_blocks_high = num_4x4_blocks_high_lookup[mbmi->sb_type];
-  int max_blocks_wide = num_4x4_blocks_wide_lookup[mbmi->sb_type];
+  const int max_blocks_high = max_block_high(xd, mbmi->sb_type, 0);
+  const int max_blocks_wide = max_block_wide(xd, mbmi->sb_type, 0);
   const TX_SIZE plane_tx_size = mbmi->inter_tx_size[tx_row][tx_col];
 
-  if (xd->mb_to_bottom_edge < 0) max_blocks_high += xd->mb_to_bottom_edge >> 5;
-  if (xd->mb_to_right_edge < 0) max_blocks_wide += xd->mb_to_right_edge >> 5;
-
   if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide) return;
 
   if (tx_size == plane_tx_size) {
@@ -5206,6 +5234,7 @@
       const int tx_size_cat = is_inter ? inter_tx_size_cat_lookup[bsize]
                                        : intra_tx_size_cat_lookup[bsize];
       const TX_SIZE coded_tx_size = txsize_sqr_up_map[mbmi->tx_size];
+      const int depth = tx_size_to_depth(coded_tx_size);
 #if CONFIG_EXT_TX && CONFIG_RECT_TX
       assert(IMPLIES(is_rect_tx(mbmi->tx_size), is_rect_tx_allowed(xd, mbmi)));
 #endif  // CONFIG_EXT_TX && CONFIG_RECT_TX
@@ -5219,7 +5248,7 @@
         if (is_inter) {
           tx_partition_count_update(cm, x, bsize, mi_row, mi_col, td->counts);
         } else {
-          ++td->counts->tx_size[tx_size_cat][tx_size_ctx][coded_tx_size];
+          ++td->counts->tx_size[tx_size_cat][tx_size_ctx][depth];
           if (mbmi->tx_size != max_txsize_lookup[bsize]) ++x->txb_split_count;
         }
 #if CONFIG_EXT_TX && CONFIG_RECT_TX
@@ -5227,7 +5256,7 @@
 #endif
 #endif
 #if !CONFIG_VAR_TX
-      ++td->counts->tx_size[tx_size_cat][tx_size_ctx][coded_tx_size];
+      ++td->counts->tx_size[tx_size_cat][tx_size_ctx][depth];
 #endif
     } else {
       int i, j;
diff --git a/av1/encoder/encodemb.c b/av1/encoder/encodemb.c
index c5459dc..3a33f35 100644
--- a/av1/encoder/encodemb.c
+++ b/av1/encoder/encodemb.c
@@ -451,7 +451,7 @@
   const qm_val_t *iqmatrix = pd->seg_iqmatrix[seg_id][!is_inter][tx_size];
 #endif
   const int16_t *src_diff;
-  const int tx2d_size = get_tx2d_size(tx_size);
+  const int tx2d_size = tx_size_2d[tx_size];
 
   FWD_TXFM_PARAM fwd_txfm_param;
   QUANT_PARAM qparam;
@@ -542,14 +542,13 @@
     highbd_fwd_txfm(src_diff, coeff, diff_stride, &fwd_txfm_param);
     if (tx_size == TX_32X32) {
       highbd_quantize_32x32_nuq(
-          coeff, get_tx2d_size(tx_size), x->skip_block, p->quant,
-          p->quant_shift, pd->dequant,
-          (const cuml_bins_type_nuq *)p->cuml_bins_nuq[dq],
+          coeff, tx_size_2d[tx_size], x->skip_block, p->quant, p->quant_shift,
+          pd->dequant, (const cuml_bins_type_nuq *)p->cuml_bins_nuq[dq],
           (const dequant_val_type_nuq *)pd->dequant_val_nuq[dq], qcoeff,
           dqcoeff, eob, scan_order->scan, band);
     } else {
-      highbd_quantize_nuq(coeff, get_tx2d_size(tx_size), x->skip_block,
-                          p->quant, p->quant_shift, pd->dequant,
+      highbd_quantize_nuq(coeff, tx_size_2d[tx_size], x->skip_block, p->quant,
+                          p->quant_shift, pd->dequant,
                           (const cuml_bins_type_nuq *)p->cuml_bins_nuq[dq],
                           (const dequant_val_type_nuq *)pd->dequant_val_nuq[dq],
                           qcoeff, dqcoeff, eob, scan_order->scan, band);
@@ -566,7 +565,7 @@
                        (const dequant_val_type_nuq *)pd->dequant_val_nuq[dq],
                        qcoeff, dqcoeff, eob, scan_order->scan, band);
   } else {
-    quantize_nuq(coeff, get_tx2d_size(tx_size), x->skip_block, p->quant,
+    quantize_nuq(coeff, tx_size_2d[tx_size], x->skip_block, p->quant,
                  p->quant_shift, pd->dequant,
                  (const cuml_bins_type_nuq *)p->cuml_bins_nuq[dq],
                  (const dequant_val_type_nuq *)pd->dequant_val_nuq[dq], qcoeff,
@@ -613,14 +612,14 @@
     highbd_fwd_txfm(src_diff, coeff, diff_stride, &fwd_txfm_param);
     if (tx_size == TX_32X32) {
       highbd_quantize_32x32_fp_nuq(
-          coeff, get_tx2d_size(tx_size), x->skip_block, p->quant_fp,
-          pd->dequant, (const cuml_bins_type_nuq *)p->cuml_bins_nuq[dq],
+          coeff, tx_size_2d[tx_size], x->skip_block, p->quant_fp, pd->dequant,
+          (const cuml_bins_type_nuq *)p->cuml_bins_nuq[dq],
           (const dequant_val_type_nuq *)pd->dequant_val_nuq[dq], qcoeff,
           dqcoeff, eob, scan_order->scan, band);
     } else {
       highbd_quantize_fp_nuq(
-          coeff, get_tx2d_size(tx_size), x->skip_block, p->quant_fp,
-          pd->dequant, (const cuml_bins_type_nuq *)p->cuml_bins_nuq[dq],
+          coeff, tx_size_2d[tx_size], x->skip_block, p->quant_fp, pd->dequant,
+          (const cuml_bins_type_nuq *)p->cuml_bins_nuq[dq],
           (const dequant_val_type_nuq *)pd->dequant_val_nuq[dq], qcoeff,
           dqcoeff, eob, scan_order->scan, band);
     }
@@ -630,13 +629,13 @@
 
   fwd_txfm(src_diff, coeff, diff_stride, &fwd_txfm_param);
   if (tx_size == TX_32X32) {
-    quantize_32x32_fp_nuq(coeff, get_tx2d_size(tx_size), x->skip_block,
+    quantize_32x32_fp_nuq(coeff, tx_size_2d[tx_size], x->skip_block,
                           p->quant_fp, pd->dequant,
                           (const cuml_bins_type_nuq *)p->cuml_bins_nuq[dq],
                           (const dequant_val_type_nuq *)pd->dequant_val_nuq[dq],
                           qcoeff, dqcoeff, eob, scan_order->scan, band);
   } else {
-    quantize_fp_nuq(coeff, get_tx2d_size(tx_size), x->skip_block, p->quant_fp,
+    quantize_fp_nuq(coeff, tx_size_2d[tx_size], x->skip_block, p->quant_fp,
                     pd->dequant,
                     (const cuml_bins_type_nuq *)p->cuml_bins_nuq[dq],
                     (const dequant_val_type_nuq *)pd->dequant_val_nuq[dq],
@@ -681,11 +680,11 @@
     highbd_fwd_txfm(src_diff, coeff, diff_stride, &fwd_txfm_param);
     if (tx_size == TX_32X32) {
       highbd_quantize_dc_32x32_nuq(
-          coeff, get_tx2d_size(tx_size), x->skip_block, p->quant[0],
+          coeff, tx_size_2d[tx_size], x->skip_block, p->quant[0],
           p->quant_shift[0], pd->dequant[0], p->cuml_bins_nuq[dq][0],
           pd->dequant_val_nuq[dq][0], qcoeff, dqcoeff, eob);
     } else {
-      highbd_quantize_dc_nuq(coeff, get_tx2d_size(tx_size), x->skip_block,
+      highbd_quantize_dc_nuq(coeff, tx_size_2d[tx_size], x->skip_block,
                              p->quant[0], p->quant_shift[0], pd->dequant[0],
                              p->cuml_bins_nuq[dq][0],
                              pd->dequant_val_nuq[dq][0], qcoeff, dqcoeff, eob);
@@ -696,12 +695,12 @@
 
   fwd_txfm(src_diff, coeff, diff_stride, &fwd_txfm_param);
   if (tx_size == TX_32X32) {
-    quantize_dc_32x32_nuq(coeff, get_tx2d_size(tx_size), x->skip_block,
+    quantize_dc_32x32_nuq(coeff, tx_size_2d[tx_size], x->skip_block,
                           p->quant[0], p->quant_shift[0], pd->dequant[0],
                           p->cuml_bins_nuq[dq][0], pd->dequant_val_nuq[dq][0],
                           qcoeff, dqcoeff, eob);
   } else {
-    quantize_dc_nuq(coeff, get_tx2d_size(tx_size), x->skip_block, p->quant[0],
+    quantize_dc_nuq(coeff, tx_size_2d[tx_size], x->skip_block, p->quant[0],
                     p->quant_shift[0], pd->dequant[0], p->cuml_bins_nuq[dq][0],
                     pd->dequant_val_nuq[dq][0], qcoeff, dqcoeff, eob);
   }
@@ -744,12 +743,12 @@
     highbd_fwd_txfm(src_diff, coeff, diff_stride, &fwd_txfm_param);
     if (tx_size == TX_32X32) {
       highbd_quantize_dc_32x32_fp_nuq(
-          coeff, get_tx2d_size(tx_size), x->skip_block, p->quant_fp[0],
+          coeff, tx_size_2d[tx_size], x->skip_block, p->quant_fp[0],
           pd->dequant[0], p->cuml_bins_nuq[dq][0], pd->dequant_val_nuq[dq][0],
           qcoeff, dqcoeff, eob);
     } else {
       highbd_quantize_dc_fp_nuq(
-          coeff, get_tx2d_size(tx_size), x->skip_block, p->quant_fp[0],
+          coeff, tx_size_2d[tx_size], x->skip_block, p->quant_fp[0],
           pd->dequant[0], p->cuml_bins_nuq[dq][0], pd->dequant_val_nuq[dq][0],
           qcoeff, dqcoeff, eob);
     }
@@ -759,12 +758,12 @@
 
   fwd_txfm(src_diff, coeff, diff_stride, &fwd_txfm_param);
   if (tx_size == TX_32X32) {
-    quantize_dc_32x32_fp_nuq(coeff, get_tx2d_size(tx_size), x->skip_block,
+    quantize_dc_32x32_fp_nuq(coeff, tx_size_2d[tx_size], x->skip_block,
                              p->quant_fp[0], pd->dequant[0],
                              p->cuml_bins_nuq[dq][0],
                              pd->dequant_val_nuq[dq][0], qcoeff, dqcoeff, eob);
   } else {
-    quantize_dc_fp_nuq(coeff, get_tx2d_size(tx_size), x->skip_block,
+    quantize_dc_fp_nuq(coeff, tx_size_2d[tx_size], x->skip_block,
                        p->quant_fp[0], pd->dequant[0], p->cuml_bins_nuq[dq][0],
                        pd->dequant_val_nuq[dq][0], qcoeff, dqcoeff, eob);
   }
@@ -864,14 +863,8 @@
   const int tx_row = blk_row >> (1 - pd->subsampling_y);
   const int tx_col = blk_col >> (1 - pd->subsampling_x);
   TX_SIZE plane_tx_size;
-
-  int max_blocks_high = num_4x4_blocks_high_lookup[plane_bsize];
-  int max_blocks_wide = num_4x4_blocks_wide_lookup[plane_bsize];
-
-  if (xd->mb_to_bottom_edge < 0)
-    max_blocks_high += xd->mb_to_bottom_edge >> (5 + pd->subsampling_y);
-  if (xd->mb_to_right_edge < 0)
-    max_blocks_wide += xd->mb_to_right_edge >> (5 + pd->subsampling_x);
+  const int max_blocks_high = max_block_high(xd, plane_bsize, plane);
+  const int max_blocks_wide = max_block_wide(xd, plane_bsize, plane);
 
   if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide) return;
 
diff --git a/av1/encoder/encodemv.c b/av1/encoder/encodemv.c
index 53dac12..8a6ad18 100644
--- a/av1/encoder/encodemv.c
+++ b/av1/encoder/encodemv.c
@@ -30,8 +30,8 @@
   av1_tokens_from_tree(mv_fp_encodings, av1_mv_fp_tree);
 }
 
-static void encode_mv_component(aom_writer *w, int comp,
-                                const nmv_component *mvcomp, int usehp) {
+static void encode_mv_component(aom_writer *w, int comp, nmv_component *mvcomp,
+                                int usehp) {
   int offset;
   const int sign = comp < 0;
   const int mag = sign ? -comp : comp;
@@ -46,7 +46,7 @@
   aom_write(w, sign, mvcomp->sign);
 
 // Class
-#if CONFIG_DAALA_EC
+#if CONFIG_EC_MULTISYMBOL
   aom_write_symbol(w, mv_class, mvcomp->class_cdf, MV_CLASSES);
 #else
   av1_write_token(w, av1_mv_class_tree, mvcomp->classes,
@@ -63,7 +63,7 @@
   }
 
 // Fractional bits
-#if CONFIG_DAALA_EC
+#if CONFIG_EC_MULTISYMBOL
   aom_write_symbol(
       w, fr, mv_class == MV_CLASS_0 ? mvcomp->class0_fp_cdf[d] : mvcomp->fp_cdf,
       MV_FP_SIZE);
@@ -141,9 +141,16 @@
 static void update_mv(aom_writer *w, const unsigned int ct[2], aom_prob *cur_p,
                       aom_prob upd_p) {
   (void)upd_p;
-  av1_cond_prob_diff_update(w, cur_p, ct);
+#if CONFIG_TILE_GROUPS
+  // Just use the maximum number of tile groups to avoid passing in the actual
+  // number
+  av1_cond_prob_diff_update(w, cur_p, ct, MAX_NUM_TG);
+#else
+  av1_cond_prob_diff_update(w, cur_p, ct, 1);
+#endif
 }
 
+#if !CONFIG_EC_ADAPT
 static void write_mv_update(const aom_tree_index *tree,
                             aom_prob probs[/*n - 1*/],
                             const unsigned int counts[/*n - 1*/], int n,
@@ -158,19 +165,22 @@
   for (i = 0; i < n - 1; ++i)
     update_mv(w, branch_ct[i], &probs[i], MV_UPDATE_PROB);
 }
+#endif
 
 void av1_write_nmv_probs(AV1_COMMON *cm, int usehp, aom_writer *w,
                          nmv_context_counts *const nmv_counts) {
-  int i, j;
+  int i;
 #if CONFIG_REF_MV
   int nmv_ctx = 0;
   for (nmv_ctx = 0; nmv_ctx < NMV_CONTEXTS; ++nmv_ctx) {
     nmv_context *const mvc = &cm->fc->nmvc[nmv_ctx];
     nmv_context_counts *const counts = &nmv_counts[nmv_ctx];
+#if !CONFIG_EC_ADAPT
     write_mv_update(av1_mv_joint_tree, mvc->joints, counts->joints, MV_JOINTS,
                     w);
 
     for (i = 0; i < 2; ++i) {
+      int j;
       nmv_component *comp = &mvc->comps[i];
       nmv_component_counts *comp_counts = &counts->comps[i];
 
@@ -184,6 +194,7 @@
     }
 
     for (i = 0; i < 2; ++i) {
+      int j;
       for (j = 0; j < CLASS0_SIZE; ++j)
         write_mv_update(av1_mv_fp_tree, mvc->comps[i].class0_fp[j],
                         counts->comps[i].class0_fp[j], MV_FP_SIZE, w);
@@ -191,6 +202,7 @@
       write_mv_update(av1_mv_fp_tree, mvc->comps[i].fp, counts->comps[i].fp,
                       MV_FP_SIZE, w);
     }
+#endif
 
     if (usehp) {
       for (i = 0; i < 2; ++i) {
@@ -204,18 +216,17 @@
   nmv_context *const mvc = &cm->fc->nmvc;
   nmv_context_counts *const counts = nmv_counts;
 
+#if !CONFIG_EC_ADAPT
   write_mv_update(av1_mv_joint_tree, mvc->joints, counts->joints, MV_JOINTS, w);
 
   for (i = 0; i < 2; ++i) {
+    int j;
     nmv_component *comp = &mvc->comps[i];
     nmv_component_counts *comp_counts = &counts->comps[i];
 
     update_mv(w, comp_counts->sign, &comp->sign, MV_UPDATE_PROB);
     write_mv_update(av1_mv_class_tree, comp->classes, comp_counts->classes,
                     MV_CLASSES, w);
-#if CONFIG_DAALA_EC
-    av1_tree_to_cdf(av1_mv_class_tree, comp->classes, comp->class_cdf);
-#endif
     write_mv_update(av1_mv_class0_tree, comp->class0, comp_counts->class0,
                     CLASS0_SIZE, w);
     for (j = 0; j < MV_OFFSET_BITS; ++j)
@@ -223,20 +234,15 @@
   }
 
   for (i = 0; i < 2; ++i) {
+    int j;
     for (j = 0; j < CLASS0_SIZE; ++j) {
       write_mv_update(av1_mv_fp_tree, mvc->comps[i].class0_fp[j],
                       counts->comps[i].class0_fp[j], MV_FP_SIZE, w);
-#if CONFIG_DAALA_EC
-      av1_tree_to_cdf(av1_mv_fp_tree, mvc->comps[i].class0_fp[j],
-                      mvc->comps[i].class0_fp_cdf[j]);
-#endif
     }
     write_mv_update(av1_mv_fp_tree, mvc->comps[i].fp, counts->comps[i].fp,
                     MV_FP_SIZE, w);
-#if CONFIG_DAALA_EC
-    av1_tree_to_cdf(av1_mv_fp_tree, mvc->comps[i].fp, mvc->comps[i].fp_cdf);
-#endif
   }
+#endif  // !CONFIG_EC_ADAPT
 
   if (usehp) {
     for (i = 0; i < 2; ++i) {
@@ -252,14 +258,13 @@
 #if CONFIG_REF_MV
                    int is_compound,
 #endif
-                   const nmv_context *mvctx, int usehp) {
+                   nmv_context *mvctx, int usehp) {
   const MV diff = { mv->row - ref->row, mv->col - ref->col };
   const MV_JOINT_TYPE j = av1_get_mv_joint(&diff);
-  usehp = usehp && av1_use_mv_hp(ref);
 #if CONFIG_REF_MV
   (void)is_compound;
 #endif
-#if CONFIG_DAALA_EC
+#if CONFIG_EC_MULTISYMBOL
   aom_write_symbol(w, j, mvctx->joint_cdf, MV_JOINTS);
 #else
   av1_write_token(w, av1_mv_joint_tree, mvctx->joints, &mv_joint_encodings[j]);
@@ -312,7 +317,7 @@
       nmv_context_counts *counts = &nmv_counts[nmv_ctx];
       (void)pred_mvs;
 #endif
-      av1_inc_mv(&diff, counts, av1_use_mv_hp(ref));
+      av1_inc_mv(&diff, counts, 1);
     }
   } else if (mode == NEAREST_NEWMV || mode == NEAR_NEWMV) {
     const MV *ref = &mbmi_ext->ref_mvs[mbmi->ref_frame[1]][0].as_mv;
@@ -325,7 +330,7 @@
                     mbmi_ext->ref_mv_stack[rf_type], 1, mbmi->ref_mv_idx);
     nmv_context_counts *counts = &nmv_counts[nmv_ctx];
 #endif
-    av1_inc_mv(&diff, counts, av1_use_mv_hp(ref));
+    av1_inc_mv(&diff, counts, 1);
   } else if (mode == NEW_NEARESTMV || mode == NEW_NEARMV) {
     const MV *ref = &mbmi_ext->ref_mvs[mbmi->ref_frame[0]][0].as_mv;
     const MV diff = { mvs[0].as_mv.row - ref->row,
@@ -337,7 +342,7 @@
                     mbmi_ext->ref_mv_stack[rf_type], 0, mbmi->ref_mv_idx);
     nmv_context_counts *counts = &nmv_counts[nmv_ctx];
 #endif
-    av1_inc_mv(&diff, counts, av1_use_mv_hp(ref));
+    av1_inc_mv(&diff, counts, 1);
   }
 }
 
@@ -366,7 +371,7 @@
                       mbmi_ext->ref_mv_stack[rf_type], i, mbmi->ref_mv_idx);
       nmv_context_counts *counts = &nmv_counts[nmv_ctx];
 #endif
-      av1_inc_mv(&diff, counts, av1_use_mv_hp(ref));
+      av1_inc_mv(&diff, counts, 1);
     }
   } else if (mode == NEAREST_NEWMV || mode == NEAR_NEWMV) {
     const MV *ref = &mi->bmi[block].ref_mv[1].as_mv;
@@ -379,7 +384,7 @@
                     mbmi_ext->ref_mv_stack[rf_type], 1, mbmi->ref_mv_idx);
     nmv_context_counts *counts = &nmv_counts[nmv_ctx];
 #endif
-    av1_inc_mv(&diff, counts, av1_use_mv_hp(ref));
+    av1_inc_mv(&diff, counts, 1);
   } else if (mode == NEW_NEARESTMV || mode == NEW_NEARMV) {
     const MV *ref = &mi->bmi[block].ref_mv[0].as_mv;
     const MV diff = { mvs[0].as_mv.row - ref->row,
@@ -391,7 +396,7 @@
                     mbmi_ext->ref_mv_stack[rf_type], 0, mbmi->ref_mv_idx);
     nmv_context_counts *counts = &nmv_counts[nmv_ctx];
 #endif
-    av1_inc_mv(&diff, counts, av1_use_mv_hp(ref));
+    av1_inc_mv(&diff, counts, 1);
   }
 }
 #else
@@ -419,7 +424,7 @@
 #endif
     const MV diff = { mvs[i].as_mv.row - ref->row,
                       mvs[i].as_mv.col - ref->col };
-    av1_inc_mv(&diff, counts, av1_use_mv_hp(ref));
+    av1_inc_mv(&diff, counts, 1);
   }
 }
 #endif  // CONFIG_EXT_INTER
diff --git a/av1/encoder/encodemv.h b/av1/encoder/encodemv.h
index 543064e..17baa2d 100644
--- a/av1/encoder/encodemv.h
+++ b/av1/encoder/encodemv.h
@@ -27,7 +27,7 @@
 #if CONFIG_REF_MV
                    int is_compound,
 #endif
-                   const nmv_context *mvctx, int usehp);
+                   nmv_context *mvctx, int usehp);
 
 void av1_build_nmv_cost_table(int *mvjoint, int *mvcost[2],
                               const nmv_context *mvctx, int usehp);
diff --git a/av1/encoder/encoder.c b/av1/encoder/encoder.c
index 629eb46..52408b9 100644
--- a/av1/encoder/encoder.c
+++ b/av1/encoder/encoder.c
@@ -3324,6 +3324,16 @@
       av1_loop_filter_frame(cm->frame_to_show, cm, xd, lf->filter_level, 0, 0);
 #endif
   }
+#if CONFIG_DERING
+  if (is_lossless_requested(&cpi->oxcf)) {
+    cm->dering_level = 0;
+  } else {
+    cm->dering_level =
+        av1_dering_search(cm->frame_to_show, cpi->Source, cm, xd);
+    av1_dering_frame(cm->frame_to_show, cm, xd, cm->dering_level);
+  }
+#endif  // CONFIG_DERING
+
 #if CONFIG_CLPF
   cm->clpf_strength_y = cm->clpf_strength_u = cm->clpf_strength_v = 0;
   cm->clpf_size = CLPF_64X64;
@@ -3372,15 +3382,6 @@
     }
   }
 #endif
-#if CONFIG_DERING
-  if (is_lossless_requested(&cpi->oxcf)) {
-    cm->dering_level = 0;
-  } else {
-    cm->dering_level =
-        av1_dering_search(cm->frame_to_show, cpi->Source, cm, xd);
-    av1_dering_frame(cm->frame_to_show, cm, xd, cm->dering_level);
-  }
-#endif  // CONFIG_DERING
 #if CONFIG_LOOP_RESTORATION
   if (cm->rst_info.restoration_type != RESTORE_NONE) {
     av1_loop_restoration_init(&cm->rst_internal, &cm->rst_info,
@@ -3596,7 +3597,7 @@
   recon_err = aom_get_y_sse(cpi->Source, get_frame_new_buffer(cm));
 
   if (cpi->twopass.total_left_stats.coded_error != 0.0)
-    fprintf(f, "%10u %dx%d  %10d %10d %d %d %10d %10d %10d %10d"
+    fprintf(f, "%10u %dx%d %d %d %10d %10d %10d %10d"
        "%10"PRId64" %10"PRId64" %5d %5d %10"PRId64" "
        "%10"PRId64" %10"PRId64" %10d "
        "%7.2lf %7.2lf %7.2lf %7.2lf %7.2lf"
@@ -3605,8 +3606,6 @@
         "%10lf %8u %10"PRId64" %10d %10d %10d\n",
         cpi->common.current_video_frame,
         cm->width, cm->height,
-        cpi->td.rd_counts.m_search_count,
-        cpi->td.rd_counts.ex_search_count,
         cpi->rc.source_alt_ref_pending,
         cpi->rc.source_alt_ref_active,
         cpi->rc.this_frame_target,
@@ -3921,12 +3920,6 @@
 
   set_size_independent_vars(cpi);
 
-  // cpi->sf.use_upsampled_references can be different from frame to frame.
-  // Every time when cpi->sf.use_upsampled_references is changed from 0 to 1.
-  // The reference frames for this frame have to be up-sampled before encoding.
-  if (!use_upsampled_ref && cpi->sf.use_upsampled_references)
-    reset_use_upsampled_references(cpi);
-
   do {
     aom_clear_system_state();
 
@@ -3935,6 +3928,14 @@
     if (loop_count == 0 || cpi->resize_pending != 0) {
       set_size_dependent_vars(cpi, &q, &bottom_index, &top_index);
 
+      // cpi->sf.use_upsampled_references can change from frame to frame.
+      // Whenever it changes from 0 to 1, the reference frames for this frame
+      // have to be up-sampled before encoding.
+      if (!use_upsampled_ref && cpi->sf.use_upsampled_references)
+        reset_use_upsampled_references(cpi);
+
       // TODO(agrange) Scale cpi->max_mv_magnitude if frame-size has changed.
       set_mv_search_params(cpi);
 
@@ -4565,6 +4566,12 @@
       cm->reset_frame_context = RESET_FRAME_CONTEXT_CURRENT;
     }
   }
+#if CONFIG_TILE_GROUPS
+  if (cm->error_resilient_mode)
+    cm->num_tg = MAX_NUM_TG;
+  else
+    cm->num_tg = 1;
+#endif
 
   // For 1 pass CBR, check if we are dropping this frame.
   // Never drop on key frame.
diff --git a/av1/encoder/encoder.h b/av1/encoder/encoder.h
index 8738609..00abc71 100644
--- a/av1/encoder/encoder.h
+++ b/av1/encoder/encoder.h
@@ -268,13 +268,13 @@
   TileInfo tile_info;
   int thresh_freq_fact[BLOCK_SIZES][MAX_MODES];
   int mode_map[BLOCK_SIZES][MAX_MODES];
+  int m_search_count;
+  int ex_search_count;
 } TileDataEnc;
 
 typedef struct RD_COUNTS {
   av1_coeff_count coef_counts[TX_SIZES][PLANE_TYPES];
   int64_t comp_pred_diff[REFERENCE_MODES];
-  int m_search_count;
-  int ex_search_count;
 } RD_COUNTS;
 
 typedef struct ThreadData {
diff --git a/av1/encoder/ethread.c b/av1/encoder/ethread.c
index 117d0ed..5876d15 100644
--- a/av1/encoder/ethread.c
+++ b/av1/encoder/ethread.c
@@ -27,10 +27,6 @@
             for (n = 0; n < ENTROPY_TOKENS; n++)
               td->rd_counts.coef_counts[i][j][k][l][m][n] +=
                   td_t->rd_counts.coef_counts[i][j][k][l][m][n];
-
-  // Counts of all motion searches and exhuastive mesh searches.
-  td->rd_counts.m_search_count += td_t->rd_counts.m_search_count;
-  td->rd_counts.ex_search_count += td_t->rd_counts.ex_search_count;
 }
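
With the counters moved from RD_COUNTS into TileDataEnc (encoder.h hunk above), each tile owns its search statistics and a worker simply points at the tile it is encoding, so the cross-thread accumulation deleted here becomes unnecessary and workers on different tiles never share a counter. A compressed sketch of the ownership pattern, with types abbreviated from the real ones:

typedef struct {
  int m_search_count;  /* motion search hits */
  int ex_search_count; /* exhaustive mesh search hits */
} TileCounts;

static void bind_tile_counters(TileCounts *tile, int **m_ptr, int **ex_ptr) {
  tile->m_search_count = 0;
  tile->ex_search_count = 0;
  *m_ptr = &tile->m_search_count;   /* cf. td->mb.m_search_count_ptr */
  *ex_ptr = &tile->ex_search_count; /* cf. td->mb.ex_search_count_ptr */
}
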
 
 static int enc_worker_hook(EncWorkerData *const thread_data, void *unused) {
diff --git a/av1/encoder/mcomp.c b/av1/encoder/mcomp.c
index 3fbceab..4b54a2c 100644
--- a/av1/encoder/mcomp.c
+++ b/av1/encoder/mcomp.c
@@ -424,7 +424,7 @@
   tr = br;
   tc = bc;
 
-  if (allow_hp && av1_use_mv_hp(ref_mv) && forced_stop == 0) {
+  if (allow_hp && forced_stop == 0) {
     hstep >>= 1;
     FIRST_LEVEL_CHECKS;
     if (eighthiters > 1) {
@@ -484,7 +484,7 @@
     }
   }
 
-  if (allow_hp && av1_use_mv_hp(ref_mv) && forced_stop == 0) {
+  if (allow_hp && forced_stop == 0) {
     tr = br;
     tc = bc;
     hstep >>= 1;
@@ -572,7 +572,7 @@
     tc = bc;
   }
 
-  if (allow_hp && av1_use_mv_hp(ref_mv) && forced_stop == 0) {
+  if (allow_hp && forced_stop == 0) {
     hstep >>= 1;
     FIRST_LEVEL_CHECKS;
     if (eighthiters > 1) {
@@ -687,7 +687,7 @@
   unsigned int cost_array[5];
   int kr, kc;
 
-  if (!(allow_hp && av1_use_mv_hp(ref_mv)))
+  if (!allow_hp)
     if (round == 3) round = 2;
 
   bestmv->row *= 8;
@@ -2446,7 +2446,7 @@
     tc = bc;
   }
 
-  if (allow_hp && av1_use_mv_hp(ref_mv) && forced_stop == 0) {
+  if (allow_hp && forced_stop == 0) {
     hstep >>= 1;
     FIRST_LEVEL_CHECKS;
     if (eighthiters > 1) {
@@ -2581,7 +2581,7 @@
   y_stride = pd->pre[is_second].stride;
   offset = bestmv->row * y_stride + bestmv->col;
 
-  if (!(allow_hp && av1_use_mv_hp(ref_mv)))
+  if (!allow_hp)
     if (round == 3) round = 2;
 
   bestmv->row *= 8;
@@ -3083,7 +3083,7 @@
   y_stride = pd->pre[is_second].stride;
   offset = bestmv->row * y_stride + bestmv->col;
 
-  if (!(allow_hp && av1_use_mv_hp(ref_mv)))
+  if (!allow_hp)
     if (round == 3) round = 2;
 
   bestmv->row *= 8;
diff --git a/av1/encoder/pickdering.c b/av1/encoder/pickdering.c
index 4ef83cd..0c79e45 100644
--- a/av1/encoder/pickdering.c
+++ b/av1/encoder/pickdering.c
@@ -9,6 +9,8 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
+// clang-format off
+
 #include <string.h>
 #include <math.h>
 
@@ -41,7 +43,7 @@
   int nhsb, nvsb;
   od_dering_in *src;
   int16_t *ref_coeff;
-  unsigned char *bskip;
+  unsigned char bskip[MAX_MIB_SIZE * MAX_MIB_SIZE][2];
   int dir[OD_DERING_NBLOCKS][OD_DERING_NBLOCKS] = { { 0 } };
   int stride;
   int bsize[3];
@@ -49,10 +51,10 @@
   int pli;
   int level;
   int best_level;
+  int dering_count;
   int coeff_shift = AOMMAX(cm->bit_depth - 8, 0);
   src = aom_malloc(sizeof(*src) * cm->mi_rows * cm->mi_cols * 64);
   ref_coeff = aom_malloc(sizeof(*ref_coeff) * cm->mi_rows * cm->mi_cols * 64);
-  bskip = aom_malloc(sizeof(*bskip) * cm->mi_rows * cm->mi_cols);
   av1_setup_dst_planes(xd->plane, frame, 0, 0);
   for (pli = 0; pli < 3; pli++) {
     dec[pli] = xd->plane[pli].subsampling_x;
@@ -77,13 +79,6 @@
 #endif
     }
   }
-  for (r = 0; r < cm->mi_rows; ++r) {
-    for (c = 0; c < cm->mi_cols; ++c) {
-      const MB_MODE_INFO *mbmi =
-          &cm->mi_grid_visible[r * cm->mi_stride + c]->mbmi;
-      bskip[r * cm->mi_cols + c] = mbmi->skip;
-    }
-  }
   nvsb = (cm->mi_rows + MAX_MIB_SIZE - 1) / MAX_MIB_SIZE;
   nhsb = (cm->mi_cols + MAX_MIB_SIZE - 1) / MAX_MIB_SIZE;
   /* Pick a base threshold based on the quantizer. The threshold will then be
@@ -105,20 +100,28 @@
       int16_t dst[MAX_MIB_SIZE * MAX_MIB_SIZE * 8 * 8];
       nhb = AOMMIN(MAX_MIB_SIZE, cm->mi_cols - MAX_MIB_SIZE * sbc);
       nvb = AOMMIN(MAX_MIB_SIZE, cm->mi_rows - MAX_MIB_SIZE * sbr);
-      if (sb_all_skip(cm, sbr * MAX_MIB_SIZE, sbc * MAX_MIB_SIZE)) continue;
+      if (sb_all_skip_out(cm, sbr * MAX_MIB_SIZE, sbc * MAX_MIB_SIZE, bskip,
+                          &dering_count))
+        continue;
       best_gi = 0;
       for (gi = 0; gi < DERING_REFINEMENT_LEVELS; gi++) {
         int cur_mse;
         int threshold;
         level = compute_level_from_index(best_level, gi);
         threshold = level << coeff_shift;
+        for (r = 0; r < bsize[0] * nvb; r++) {
+          for (c = 0; c < bsize[0] * nhb; c++) {
+            dst[r * MAX_MIB_SIZE * bsize[0] + c] =
+                src[(sbr * bsize[0] * MAX_MIB_SIZE + r) * stride +
+                    sbc * bsize[0] * MAX_MIB_SIZE + c];
+          }
+        }
         od_dering(dst, MAX_MIB_SIZE * bsize[0],
                   &src[sbr * stride * bsize[0] * MAX_MIB_SIZE +
                        sbc * bsize[0] * MAX_MIB_SIZE],
                   cm->mi_cols * bsize[0], nhb, nvb, sbc, sbr, nhsb, nvsb, 0,
                   dir, 0,
-                  &bskip[MAX_MIB_SIZE * sbr * cm->mi_cols + MAX_MIB_SIZE * sbc],
-                  cm->mi_cols, threshold, coeff_shift);
+                  bskip, dering_count, threshold, coeff_shift);
         cur_mse = (int)compute_dist(
             dst, MAX_MIB_SIZE * bsize[0],
             &ref_coeff[sbr * stride * bsize[0] * MAX_MIB_SIZE +
@@ -136,6 +139,5 @@
   }
   aom_free(src);
   aom_free(ref_coeff);
-  aom_free(bskip);
   return best_level;
 }
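
The search above is exhaustive per superblock: each of the DERING_REFINEMENT_LEVELS candidates filters a scratch copy of the superblock, and the level with the lowest distortion against the pristine source wins. The shape of that inner loop, with filter_into_scratch() and scratch_mse() as hypothetical stand-ins for the od_dering() and compute_dist() calls:

#include <limits.h>

void filter_into_scratch(int level); /* hypothetical: od_dering() into dst[] */
int scratch_mse(void); /* hypothetical: compute_dist() vs. ref_coeff[] */

static int pick_sb_level(int n_levels) {
  int gi, best_gi = 0, best_mse = INT_MAX;
  for (gi = 0; gi < n_levels; gi++) {
    int mse;
    filter_into_scratch(gi);
    mse = scratch_mse();
    if (mse < best_mse) {
      best_mse = mse;
      best_gi = gi;
    }
  }
  return best_gi;
}
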
diff --git a/av1/encoder/rd.c b/av1/encoder/rd.c
index 4a2bc3f..f5af485 100644
--- a/av1/encoder/rd.c
+++ b/av1/encoder/rd.c
@@ -719,31 +719,42 @@
 }
 
 #if CONFIG_DUAL_FILTER
-int av1_get_switchable_rate(const AV1_COMP *cpi, const MACROBLOCKD *const xd) {
-  const MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
-  int inter_filter_cost = 0;
-  int dir;
+int av1_get_switchable_rate(const AV1_COMP *cpi, const MACROBLOCKD *xd) {
+  const AV1_COMMON *const cm = &cpi->common;
+  if (cm->interp_filter == SWITCHABLE) {
+    const MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+    int inter_filter_cost = 0;
+    int dir;
 
-  for (dir = 0; dir < 2; ++dir) {
-    if (has_subpel_mv_component(xd->mi[0], xd, dir) ||
-        (mbmi->ref_frame[1] > INTRA_FRAME &&
-         has_subpel_mv_component(xd->mi[0], xd, dir + 2))) {
-      const int ctx = av1_get_pred_context_switchable_interp(xd, dir);
-      inter_filter_cost +=
-          cpi->switchable_interp_costs[ctx][mbmi->interp_filter[dir]];
+    for (dir = 0; dir < 2; ++dir) {
+      if (has_subpel_mv_component(xd->mi[0], xd, dir) ||
+          (mbmi->ref_frame[1] > INTRA_FRAME &&
+           has_subpel_mv_component(xd->mi[0], xd, dir + 2))) {
+        const int ctx = av1_get_pred_context_switchable_interp(xd, dir);
+        inter_filter_cost +=
+            cpi->switchable_interp_costs[ctx][mbmi->interp_filter[dir]];
+      }
     }
+    return SWITCHABLE_INTERP_RATE_FACTOR * inter_filter_cost;
+  } else {
+    return 0;
   }
-  return SWITCHABLE_INTERP_RATE_FACTOR * inter_filter_cost;
 }
 #else
-int av1_get_switchable_rate(const AV1_COMP *cpi, const MACROBLOCKD *const xd) {
-  const MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
-  const int ctx = av1_get_pred_context_switchable_interp(xd);
+int av1_get_switchable_rate(const AV1_COMP *cpi, const MACROBLOCKD *xd) {
+  const AV1_COMMON *const cm = &cpi->common;
+  if (cm->interp_filter == SWITCHABLE) {
 #if CONFIG_EXT_INTERP
-  if (!av1_is_interp_needed(xd)) return 0;
-#endif  // CONFIG_EXT_INTERP
-  return SWITCHABLE_INTERP_RATE_FACTOR *
-         cpi->switchable_interp_costs[ctx][mbmi->interp_filter];
+    if (av1_is_interp_needed(xd))
+#endif
+    {
+      const MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+      const int ctx = av1_get_pred_context_switchable_interp(xd);
+      return SWITCHABLE_INTERP_RATE_FACTOR *
+             cpi->switchable_interp_costs[ctx][mbmi->interp_filter];
+    }
+  }
+  return 0;
 }
 #endif
 
diff --git a/av1/encoder/rdopt.c b/av1/encoder/rdopt.c
index 76d471e..8682a3e 100644
--- a/av1/encoder/rdopt.c
+++ b/av1/encoder/rdopt.c
@@ -707,17 +707,17 @@
   switch (cpi->sf.tx_type_search.prune_mode) {
     case NO_PRUNE: return 0; break;
     case PRUNE_ONE:
-      if ((tx_set >= 0) & !(tx_set_1D[FLIPADST_1D] & tx_set_1D[ADST_1D]))
+      if ((tx_set >= 0) && !(tx_set_1D[FLIPADST_1D] & tx_set_1D[ADST_1D]))
         return 0;
       return prune_one_for_sby(cpi, bsize, x, xd);
       break;
 #if CONFIG_EXT_TX
     case PRUNE_TWO:
-      if ((tx_set >= 0) & !(tx_set_1D[FLIPADST_1D] & tx_set_1D[ADST_1D])) {
+      if ((tx_set >= 0) && !(tx_set_1D[FLIPADST_1D] & tx_set_1D[ADST_1D])) {
         if (!(tx_set_1D[DCT_1D] & tx_set_1D[IDTX_1D])) return 0;
         return prune_two_for_sby(cpi, bsize, x, xd, 0, 1);
       }
-      if ((tx_set >= 0) & !(tx_set_1D[DCT_1D] & tx_set_1D[IDTX_1D]))
+      if ((tx_set >= 0) && !(tx_set_1D[DCT_1D] & tx_set_1D[IDTX_1D]))
         return prune_two_for_sby(cpi, bsize, x, xd, 1, 0);
       return prune_two_for_sby(cpi, bsize, x, xd, 1, 1);
       break;
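
In these prune checks, the outer bitwise '&' becomes a logical '&&'. Both operands are already 0 or 1 ('tx_set >= 0' and a '!'-negated flag), so the two forms compute the same value; '&&' additionally short-circuits and plays better with the operator-mixing diagnostics this same patch enables via -Wlogical-op in configure. A small self-contained check of the equivalence:

    #include <assert.h>

    static void and_forms_agree(int tx_set, int flags) {
      int a = (tx_set >= 0); /* always 0 or 1 */
      int b = !(flags & 1);  /* always 0 or 1 */
      assert((a & b) == (a && b)); /* identical results on 0/1 operands */
    }
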
@@ -1318,9 +1318,9 @@
   const int tx_size_cat =
       is_inter ? inter_tx_size_cat_lookup[bs] : intra_tx_size_cat_lookup[bs];
   const TX_SIZE coded_tx_size = txsize_sqr_up_map[tx_size];
+  const int depth = tx_size_to_depth(coded_tx_size);
   const int tx_select = cm->tx_mode == TX_MODE_SELECT;
-  const int r_tx_size =
-      cpi->tx_size_cost[tx_size_cat][tx_size_ctx][coded_tx_size];
+  const int r_tx_size = cpi->tx_size_cost[tx_size_cat][tx_size_ctx][depth];
 
   assert(skip_prob > 0);
 #if CONFIG_EXT_TX && CONFIG_RECT_TX
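
The tx_size_cost table's innermost index changes from the raw TX_SIZE to a depth within the size category, which keeps the table dense now that rectangular sizes make TX_SIZE values non-contiguous; note the lookup is fed coded_tx_size = txsize_sqr_up_map[tx_size] first, so only square sizes reach tx_size_to_depth(). Assuming the conventional mapping (an assumption -- the real helper lives in the common headers), the conversion is just an offset:

    /* Assumed shape of tx_size_to_depth(): square sizes TX_4X4..TX_32X32
     * map to depths 0..3. */
    enum { SK_TX_4X4, SK_TX_8X8, SK_TX_16X16, SK_TX_32X32 };

    static int tx_size_to_depth_sketch(int sq_tx_size) {
      return sq_tx_size - SK_TX_4X4; /* depth 0..3 */
    }
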
@@ -2746,7 +2746,7 @@
       // not the tokenonly rate.
       this_rate_tokenonly -=
           cpi->tx_size_cost[max_tx_size - TX_8X8][get_tx_size_context(xd)]
-                           [mic->mbmi.tx_size];
+                           [tx_size_to_depth(mic->mbmi.tx_size)];
     }
 #if CONFIG_PALETTE
     if (cpi->common.allow_screen_content_tools && mic->mbmi.mode == DC_PRED)
@@ -2918,9 +2918,12 @@
   TX_TYPE tx_type = get_tx_type(plane_type, xd, block, tx_size);
   const SCAN_ORDER *const scan_order =
       get_scan(cm, tx_size, tx_type, is_inter_block(&xd->mi[0]->mbmi));
-
   BLOCK_SIZE txm_bsize = txsize_to_bsize[tx_size];
-  int bh = 4 * num_4x4_blocks_wide_lookup[txm_bsize];
+  int bh = block_size_high[txm_bsize];
+  int bw = block_size_wide[txm_bsize];
+  int txb_h = tx_size_high_unit[tx_size];
+  int txb_w = tx_size_wide_unit[tx_size];
+
   int src_stride = p->src.stride;
   uint8_t *src = &p->src.buf[4 * blk_row * src_stride + 4 * blk_col];
   uint8_t *dst = &pd->dst.buf[4 * blk_row * pd->dst.stride + 4 * blk_col];
@@ -2930,20 +2933,21 @@
 #else
   DECLARE_ALIGNED(16, uint8_t, rec_buffer[MAX_TX_SQUARE]);
 #endif  // CONFIG_AOM_HIGHBITDEPTH
-  const int diff_stride = 4 * num_4x4_blocks_wide_lookup[plane_bsize];
+  int max_blocks_high = block_size_high[plane_bsize];
+  int max_blocks_wide = block_size_wide[plane_bsize];
+  const int diff_stride = max_blocks_wide;
   const int16_t *diff = &p->src_diff[4 * (blk_row * diff_stride + blk_col)];
-
-  int max_blocks_high = num_4x4_blocks_high_lookup[plane_bsize];
-  int max_blocks_wide = num_4x4_blocks_wide_lookup[plane_bsize];
-
 #if CONFIG_EXT_TX
   assert(tx_size < TX_SIZES);
 #endif  // CONFIG_EXT_TX
 
   if (xd->mb_to_bottom_edge < 0)
-    max_blocks_high += xd->mb_to_bottom_edge >> (5 + pd->subsampling_y);
+    max_blocks_high += xd->mb_to_bottom_edge >> (3 + pd->subsampling_y);
   if (xd->mb_to_right_edge < 0)
-    max_blocks_wide += xd->mb_to_right_edge >> (5 + pd->subsampling_x);
+    max_blocks_wide += xd->mb_to_right_edge >> (3 + pd->subsampling_x);
+
+  max_blocks_high >>= tx_size_wide_log2[0];
+  max_blocks_wide >>= tx_size_wide_log2[0];
 
 #if CONFIG_NEW_QUANT
   av1_xform_quant_fp_nuq(cm, x, plane, block, blk_row, blk_col, plane_bsize,
@@ -2960,22 +2964,21 @@
   if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
     rec_buffer = CONVERT_TO_BYTEPTR(rec_buffer16);
     aom_highbd_convolve_copy(dst, pd->dst.stride, rec_buffer, MAX_TX_SIZE, NULL,
-                             0, NULL, 0, bh, bh, xd->bd);
+                             0, NULL, 0, bw, bh, xd->bd);
   } else {
     rec_buffer = (uint8_t *)rec_buffer16;
     aom_convolve_copy(dst, pd->dst.stride, rec_buffer, MAX_TX_SIZE, NULL, 0,
-                      NULL, 0, bh, bh);
+                      NULL, 0, bw, bh);
   }
 #else
   aom_convolve_copy(dst, pd->dst.stride, rec_buffer, MAX_TX_SIZE, NULL, 0, NULL,
-                    0, bh, bh);
+                    0, bw, bh);
 #endif  // CONFIG_AOM_HIGHBITDEPTH
 
-  if (blk_row + (bh >> 2) > max_blocks_high ||
-      blk_col + (bh >> 2) > max_blocks_wide) {
+  if (blk_row + txb_h > max_blocks_high || blk_col + txb_w > max_blocks_wide) {
     int idx, idy;
-    int blocks_height = AOMMIN(bh >> 2, max_blocks_high - blk_row);
-    int blocks_width = AOMMIN(bh >> 2, max_blocks_wide - blk_col);
+    int blocks_height = AOMMIN(txb_h, max_blocks_high - blk_row);
+    int blocks_width = AOMMIN(txb_w, max_blocks_wide - blk_col);
     tmp = 0;
     for (idy = 0; idy < blocks_height; idy += 2) {
       for (idx = 0; idx < blocks_width; idx += 2) {
@@ -2984,7 +2987,7 @@
       }
     }
   } else {
-    tmp = aom_sum_squares_2d_i16(diff, diff_stride, bh);
+    tmp = sum_squares_2d(diff, diff_stride, tx_size);
   }
 
 #if CONFIG_AOM_HIGHBITDEPTH
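
aom_sum_squares_2d_i16() took a single dimension and therefore assumed a square bh x bh residual block; the sum_squares_2d() replacement is keyed on tx_size, so width and height can differ. A scalar reference for what such a helper computes (sketch only; the real one dispatches to SIMD):

    #include <stdint.h>

    /* Sum of squared residuals over a bw x bh window of the diff buffer. */
    static uint64_t sum_squares_2d_ref(const int16_t *d, int stride, int bw,
                                       int bh) {
      uint64_t ss = 0;
      for (int r = 0; r < bh; ++r)
        for (int c = 0; c < bw; ++c)
          ss += (uint64_t)(d[r * stride + c] * d[r * stride + c]);
      return ss;
    }
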
@@ -3010,12 +3013,12 @@
     inv_txfm_add(dqcoeff, rec_buffer, MAX_TX_SIZE, &inv_txfm_param);
 #endif  // CONFIG_AOM_HIGHBITDEPTH
 
-    if ((bh >> 2) + blk_col > max_blocks_wide ||
-        (bh >> 2) + blk_row > max_blocks_high) {
+    if (txb_w + blk_col > max_blocks_wide ||
+        txb_h + blk_row > max_blocks_high) {
       int idx, idy;
       unsigned int this_dist;
-      int blocks_height = AOMMIN(bh >> 2, max_blocks_high - blk_row);
-      int blocks_width = AOMMIN(bh >> 2, max_blocks_wide - blk_col);
+      int blocks_height = AOMMIN(txb_h, max_blocks_high - blk_row);
+      int blocks_width = AOMMIN(txb_w, max_blocks_wide - blk_col);
       tmp = 0;
       for (idy = 0; idy < blocks_height; idy += 2) {
         for (idx = 0; idx < blocks_width; idx += 2) {
@@ -3054,20 +3057,20 @@
   TX_SIZE(*const inter_tx_size)
   [MAX_MIB_SIZE] =
       (TX_SIZE(*)[MAX_MIB_SIZE]) & mbmi->inter_tx_size[tx_row][tx_col];
-  const int bw = num_4x4_blocks_wide_lookup[plane_bsize];
-  int max_blocks_high = num_4x4_blocks_high_lookup[plane_bsize];
-  int max_blocks_wide = bw;
+  const int max_blocks_high = max_block_high(xd, plane_bsize, plane);
+  const int max_blocks_wide = max_block_wide(xd, plane_bsize, plane);
+  const int bw = block_size_wide[plane_bsize] >> tx_size_wide_log2[0];
   int64_t this_rd = INT64_MAX;
   ENTROPY_CONTEXT *pta = ta + blk_col;
   ENTROPY_CONTEXT *ptl = tl + blk_row;
-  ENTROPY_CONTEXT stxa = 0, stxl = 0;
   int coeff_ctx, i;
-  int ctx = txfm_partition_context(tx_above + (blk_col >> 1),
-                                   tx_left + (blk_row >> 1), tx_size);
+  int ctx =
+      txfm_partition_context(tx_above + (blk_col >> 1),
+                             tx_left + (blk_row >> 1), mbmi->sb_type, tx_size);
 
   int64_t sum_dist = 0, sum_bsse = 0;
   int64_t sum_rd = INT64_MAX;
-  int sum_rate = av1_cost_bit(cpi->common.fc->txfm_partition_prob[ctx], 1);
+  int sum_rate = 0;
   int all_skip = 1;
   int tmp_eob = 0;
   int zero_blk_rate;
@@ -3081,31 +3084,7 @@
     return;
   }
 
-  switch (tx_size) {
-    case TX_4X4:
-      stxa = pta[0];
-      stxl = ptl[0];
-      break;
-    case TX_8X8:
-      stxa = !!*(const uint16_t *)&pta[0];
-      stxl = !!*(const uint16_t *)&ptl[0];
-      break;
-    case TX_16X16:
-      stxa = !!*(const uint32_t *)&pta[0];
-      stxl = !!*(const uint32_t *)&ptl[0];
-      break;
-    case TX_32X32:
-      stxa = !!*(const uint64_t *)&pta[0];
-      stxl = !!*(const uint64_t *)&ptl[0];
-      break;
-    default: assert(0 && "Invalid transform size."); break;
-  }
-  coeff_ctx = combine_entropy_contexts(stxa, stxl);
-
-  if (xd->mb_to_bottom_edge < 0)
-    max_blocks_high += xd->mb_to_bottom_edge >> (5 + pd->subsampling_y);
-  if (xd->mb_to_right_edge < 0)
-    max_blocks_wide += xd->mb_to_right_edge >> (5 + pd->subsampling_x);
+  coeff_ctx = get_entropy_context(tx_size, pta, ptl);
 
   *rate = 0;
   *dist = 0;
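
The deleted switch open-coded what get_entropy_context() already provides, using width-sized type-punned loads (uint16_t/uint32_t/uint64_t) to test whether any of the entropy-context bytes covered by the transform are set; the mb_to_*_edge clamping it carried is presumably absorbed by the max_block_high()/max_block_wide() helpers used earlier in this hunk. A portable equivalent of the "any byte nonzero" idea:

    /* E.g. '!!*(const uint32_t *)&pta[0]' for TX_16X16 is just "any of the
     * four context bytes nonzero". */
    static unsigned char any_nonzero(const unsigned char *ctx, int n) {
      unsigned char acc = 0;
      for (int i = 0; i < n; ++i) acc |= ctx[i];
      return !!acc;
    }
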
@@ -3144,8 +3123,10 @@
 
   if (tx_size > TX_4X4 && depth < MAX_VARTX_DEPTH) {
     BLOCK_SIZE bsize = txsize_to_bsize[tx_size];
-    int bsl = b_height_log2_lookup[bsize];
-    int sub_step = num_4x4_blocks_txsize_lookup[tx_size - 1];
+    int bsl = block_size_wide[bsize] >> (tx_size_wide_log2[0] + 1);
+    // TODO(jingning): Refactor this transform block size transition.
+    TX_SIZE sub_txs = tx_size - 1;
+    int sub_step = tx_size_wide_unit[sub_txs] * tx_size_high_unit[sub_txs];
     int this_rate;
     int64_t this_dist;
     int64_t this_bsse;
@@ -3153,18 +3134,17 @@
     int this_cost_valid = 1;
     int64_t tmp_rd = 0;
 
+    sum_rate = av1_cost_bit(cpi->common.fc->txfm_partition_prob[ctx], 1);
 #if CONFIG_EXT_TX
     assert(tx_size < TX_SIZES);
 #endif  // CONFIG_EXT_TX
-    --bsl;
     for (i = 0; i < 4 && this_cost_valid; ++i) {
-      int offsetr = (i >> 1) << bsl;
-      int offsetc = (i & 0x01) << bsl;
+      int offsetr = (i >> 1) * bsl;
+      int offsetc = (i & 0x01) * bsl;
       select_tx_block(cpi, x, blk_row + offsetr, blk_col + offsetc, plane,
-                      block + i * sub_step, tx_size - 1, depth + 1, plane_bsize,
-                      ta, tl, tx_above, tx_left, &this_rate, &this_dist,
-                      &this_bsse, &this_skip, ref_best_rd - tmp_rd,
-                      &this_cost_valid);
+                      block + i * sub_step, sub_txs, depth + 1, plane_bsize, ta,
+                      tl, tx_above, tx_left, &this_rate, &this_dist, &this_bsse,
+                      &this_skip, ref_best_rd - tmp_rd, &this_cost_valid);
       sum_rate += this_rate;
       sum_dist += this_dist;
       sum_bsse += this_bsse;
@@ -3177,15 +3157,13 @@
 
   if (this_rd < sum_rd) {
     int idx, idy;
-    for (i = 0; i < num_4x4_blocks_wide_txsize_lookup[tx_size]; ++i)
-      pta[i] = !(tmp_eob == 0);
-    for (i = 0; i < num_4x4_blocks_high_txsize_lookup[tx_size]; ++i)
-      ptl[i] = !(tmp_eob == 0);
+    for (i = 0; i < tx_size_wide_unit[tx_size]; ++i) pta[i] = !(tmp_eob == 0);
+    for (i = 0; i < tx_size_high_unit[tx_size]; ++i) ptl[i] = !(tmp_eob == 0);
     txfm_partition_update(tx_above + (blk_col >> 1), tx_left + (blk_row >> 1),
                           tx_size);
     inter_tx_size[0][0] = tx_size;
-    for (idy = 0; idy < num_4x4_blocks_high_txsize_lookup[tx_size] / 2; ++idy)
-      for (idx = 0; idx < num_4x4_blocks_wide_txsize_lookup[tx_size] / 2; ++idx)
+    for (idy = 0; idy < tx_size_high_unit[tx_size] / 2; ++idy)
+      for (idx = 0; idx < tx_size_wide_unit[tx_size] / 2; ++idx)
         inter_tx_size[idy][idx] = tx_size;
     mbmi->tx_size = tx_size;
     if (this_rd == INT64_MAX) *is_cost_valid = 0;
@@ -3222,7 +3200,8 @@
     int bh = num_4x4_blocks_wide_lookup[txb_size];
     int idx, idy;
     int block = 0;
-    int step = 1 << (max_txsize_lookup[plane_bsize] * 2);
+    int step = tx_size_wide_unit[max_txsize_lookup[plane_bsize]] *
+               tx_size_high_unit[max_txsize_lookup[plane_bsize]];
     ENTROPY_CONTEXT ctxa[2 * MAX_MIB_SIZE];
     ENTROPY_CONTEXT ctxl[2 * MAX_MIB_SIZE];
     TXFM_CONTEXT tx_above[MAX_MIB_SIZE];
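
The step between raster blocks is now the product of the transform's width and height in 4x4 units instead of '1 << (max_txsize * 2)', which only had an answer for square transforms. Worked numbers (assuming TX_32X32 == 3 in the TX_SIZE enum and 4x4 units throughout):

    static void step_example(void) {
      int old_sq = 1 << (3 * 2); /* TX_32X32: 64 units -- squares only    */
      int new_sq = 8 * 8;        /* 32x32:    64 units -- unchanged       */
      int new_rect = 8 * 4;      /* 32x16:    32 units -- now expressible */
      (void)old_sq; (void)new_sq; (void)new_rect;
    }
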
@@ -3478,17 +3457,20 @@
   const int tx_row = blk_row >> (1 - pd->subsampling_y);
   const int tx_col = blk_col >> (1 - pd->subsampling_x);
   TX_SIZE plane_tx_size;
-  int max_blocks_high = num_4x4_blocks_high_lookup[plane_bsize];
-  int max_blocks_wide = num_4x4_blocks_wide_lookup[plane_bsize];
+  int max_blocks_high = block_size_high[plane_bsize];
+  int max_blocks_wide = block_size_wide[plane_bsize];
 
 #if CONFIG_EXT_TX
   assert(tx_size < TX_SIZES);
 #endif  // CONFIG_EXT_TX
 
   if (xd->mb_to_bottom_edge < 0)
-    max_blocks_high += xd->mb_to_bottom_edge >> (5 + pd->subsampling_y);
+    max_blocks_high += xd->mb_to_bottom_edge >> (3 + pd->subsampling_y);
   if (xd->mb_to_right_edge < 0)
-    max_blocks_wide += xd->mb_to_right_edge >> (5 + pd->subsampling_x);
+    max_blocks_wide += xd->mb_to_right_edge >> (3 + pd->subsampling_x);
+
+  max_blocks_high >>= tx_size_wide_log2[0];
+  max_blocks_wide >>= tx_size_wide_log2[0];
 
   if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide) return;
 
@@ -3519,24 +3501,25 @@
     coeff_ctx = combine_entropy_contexts(ta[0], tl[0]);
     av1_tx_block_rd_b(cpi, x, tx_size, blk_row, blk_col, plane, block,
                       plane_bsize, coeff_ctx, rate, dist, bsse, skip);
-    for (i = 0; i < num_4x4_blocks_wide_txsize_lookup[tx_size]; ++i)
+
+    for (i = 0; i < tx_size_wide_unit[tx_size]; ++i)
       ta[i] = !(p->eobs[block] == 0);
-    for (i = 0; i < num_4x4_blocks_high_txsize_lookup[tx_size]; ++i)
+    for (i = 0; i < tx_size_high_unit[tx_size]; ++i)
       tl[i] = !(p->eobs[block] == 0);
   } else {
-    int bsl = b_width_log2_lookup[bsize];
-    int step = num_4x4_blocks_txsize_lookup[tx_size - 1];
+    const int bsl = block_size_wide[bsize] >> (1 + tx_size_wide_log2[0]);
+    const TX_SIZE sub_txs = tx_size - 1;
+    int step = tx_size_wide_unit[sub_txs] * tx_size_high_unit[sub_txs];
     int i;
 
     assert(bsl > 0);
-    --bsl;
 
     for (i = 0; i < 4; ++i) {
-      int offsetr = (i >> 1) << bsl;
-      int offsetc = (i & 0x01) << bsl;
+      int offsetr = (i >> 1) * bsl;
+      int offsetc = (i & 0x01) * bsl;
       tx_block_rd(cpi, x, blk_row + offsetr, blk_col + offsetc, plane,
-                  block + i * step, tx_size - 1, plane_bsize, above_ctx,
-                  left_ctx, rate, dist, bsse, skip);
+                  block + i * step, sub_txs, plane_bsize, above_ctx, left_ctx,
+                  rate, dist, bsse, skip);
     }
   }
 }
@@ -4218,8 +4201,7 @@
 #endif  // CONFIG_EXT_INTER
       this_mv[0].as_int = seg_mvs[mbmi->ref_frame[0]].as_int;
 #if CONFIG_EXT_INTER
-      if (!cpi->common.allow_high_precision_mv ||
-          !av1_use_mv_hp(&best_ref_mv[0]->as_mv))
+      if (!cpi->common.allow_high_precision_mv)
         lower_mv_precision(&this_mv[0].as_mv, 0);
 #endif  // CONFIG_EXT_INTER
 
@@ -4278,11 +4260,9 @@
         this_mv[0].as_int = compound_seg_newmvs[0].as_int;
         this_mv[1].as_int = compound_seg_newmvs[1].as_int;
       }
-      if (!cpi->common.allow_high_precision_mv ||
-          !av1_use_mv_hp(&best_ref_mv[0]->as_mv))
+      if (!cpi->common.allow_high_precision_mv)
         lower_mv_precision(&this_mv[0].as_mv, 0);
-      if (!cpi->common.allow_high_precision_mv ||
-          !av1_use_mv_hp(&best_ref_mv[1]->as_mv))
+      if (!cpi->common.allow_high_precision_mv)
         lower_mv_precision(&this_mv[1].as_mv, 0);
       thismvcost += av1_mv_bit_cost(&this_mv[0].as_mv, &best_ref_mv[0]->as_mv,
                                     mvjcost, mvcost, MV_COST_WEIGHT_SUB);
@@ -4292,8 +4272,7 @@
     case NEW_NEARMV:
     case NEW_NEARESTMV:
       this_mv[0].as_int = seg_mvs[mbmi->ref_frame[0]].as_int;
-      if (!cpi->common.allow_high_precision_mv ||
-          !av1_use_mv_hp(&best_ref_mv[0]->as_mv))
+      if (!cpi->common.allow_high_precision_mv)
         lower_mv_precision(&this_mv[0].as_mv, 0);
       thismvcost += av1_mv_bit_cost(&this_mv[0].as_mv, &best_ref_mv[0]->as_mv,
                                     mvjcost, mvcost, MV_COST_WEIGHT_SUB);
@@ -4303,8 +4282,7 @@
     case NEAREST_NEWMV:
       this_mv[0].as_int = frame_mv[mode][mbmi->ref_frame[0]].as_int;
       this_mv[1].as_int = seg_mvs[mbmi->ref_frame[1]].as_int;
-      if (!cpi->common.allow_high_precision_mv ||
-          !av1_use_mv_hp(&best_ref_mv[1]->as_mv))
+      if (!cpi->common.allow_high_precision_mv)
         lower_mv_precision(&this_mv[1].as_mv, 0);
       thismvcost += av1_mv_bit_cost(&this_mv[1].as_mv, &best_ref_mv[1]->as_mv,
                                     mvjcost, mvcost, MV_COST_WEIGHT_SUB);
@@ -4374,8 +4352,8 @@
   struct macroblock_plane *const p = &x->plane[0];
   MODE_INFO *const mi = xd->mi[0];
   const BLOCK_SIZE plane_bsize = get_plane_block_size(mi->mbmi.sb_type, pd);
-  const int width = 4 * num_4x4_blocks_wide_lookup[plane_bsize];
-  const int height = 4 * num_4x4_blocks_high_lookup[plane_bsize];
+  const int width = block_size_wide[plane_bsize];
+  const int height = block_size_high[plane_bsize];
   int idx, idy;
   const uint8_t *const src =
       &p->src.buf[av1_raster_block_offset(BLOCK_8X8, i, p->src.stride)];
@@ -4387,8 +4365,8 @@
 
   TX_TYPE tx_type = get_tx_type(PLANE_TYPE_Y, xd, i, tx_size);
   const SCAN_ORDER *scan_order = get_scan(cm, tx_size, tx_type, 1);
-  const int num_4x4_w = num_4x4_blocks_wide_txsize_lookup[tx_size];
-  const int num_4x4_h = num_4x4_blocks_high_txsize_lookup[tx_size];
+  const int num_4x4_w = tx_size_wide_unit[tx_size];
+  const int num_4x4_h = tx_size_high_unit[tx_size];
 
 #if CONFIG_EXT_TX && CONFIG_RECT_TX
   assert(IMPLIES(xd->lossless[mi->mbmi.segment_id], tx_size == TX_4X4));
@@ -4428,11 +4406,8 @@
         block = k;
       else
         block = (i ? 2 : 0);
-#if CONFIG_VAR_TX
-      coeff_ctx = get_entropy_context(tx_size, ta + (k & 1), tl + (k >> 1));
-#else
+
       coeff_ctx = combine_entropy_contexts(*(ta + (k & 1)), *(tl + (k >> 1)));
-#endif
 #if CONFIG_NEW_QUANT
       av1_xform_quant_fp_nuq(cm, x, 0, block, idy + (i >> 1), idx + (i & 0x01),
                              BLOCK_8X8, tx_size, coeff_ctx);
@@ -5145,8 +5120,7 @@
         if (!has_second_rf &&
 #if CONFIG_EXT_INTER
             have_newmv_in_inter_mode(this_mode) &&
-            (seg_mvs[index][mv_idx][mbmi->ref_frame[0]].as_int == INVALID_MV ||
-             av1_use_mv_hp(&bsi->ref_mv[0]->as_mv) == 0)
+            (seg_mvs[index][mv_idx][mbmi->ref_frame[0]].as_int == INVALID_MV)
 #else
             this_mode == NEWMV &&
             (seg_mvs[index][mbmi->ref_frame[0]].as_int == INVALID_MV ||
@@ -6367,10 +6341,6 @@
   const int this_mode = mbmi->mode;
   int refs[2] = { mbmi->ref_frame[0],
                   (mbmi->ref_frame[1] < 0 ? 0 : mbmi->ref_frame[1]) };
-#if CONFIG_DUAL_FILTER
-  (void)pred_filter_search;
-  return SWITCHABLE;
-#else
   if (pred_filter_search) {
     InterpFilter af = SWITCHABLE, lf = SWITCHABLE;
     if (xd->up_available) af = xd->mi[-xd->mi_stride]->mbmi.interp_filter;
@@ -6385,7 +6355,6 @@
 #endif  // CONFIG_EXT_INTER
       best_filter = af;
   }
-#endif
   if (is_comp_pred) {
     if (cpi->sf.adaptive_mode_search) {
 #if CONFIG_EXT_INTER
@@ -6448,15 +6417,8 @@
 #endif  // CONFIG_EXT_INTER
     }
   }
-  if (cm->interp_filter != BILINEAR) {
-    if (x->source_variance < cpi->sf.disable_filter_search_var_thresh) {
-      best_filter = EIGHTTAP_REGULAR;
-    }
-#if CONFIG_EXT_INTERP
-    else if (!av1_is_interp_needed(xd) && cm->interp_filter == SWITCHABLE) {
-      best_filter = EIGHTTAP_REGULAR;
-    }
-#endif
+  if (x->source_variance < cpi->sf.disable_filter_search_var_thresh) {
+    best_filter = EIGHTTAP_REGULAR;
   }
   return best_filter;
 }
@@ -6676,6 +6638,7 @@
   int_mv cur_mv[2];
   int rate_mv = 0;
 #if CONFIG_EXT_INTER
+  int pred_exists = 1;
   const int bw = 4 * num_4x4_blocks_wide_lookup[bsize];
   int mv_idx = (this_mode == NEWFROMNEARMV) ? 1 : 0;
   int_mv single_newmv[TOTAL_REFS_PER_FRAME];
@@ -6706,6 +6669,7 @@
   uint8_t best_blk_skip[MAX_MB_PLANE][MAX_MIB_SIZE * MAX_MIB_SIZE * 4];
 #endif  // CONFIG_VAR_TX
   int64_t best_distortion = INT64_MAX;
+  int64_t best_rd = INT64_MAX;
   MB_MODE_INFO best_mbmi;
 #if CONFIG_EXT_INTER
   int rate2_bmc_nocoeff;
@@ -6713,24 +6677,13 @@
   MB_MODE_INFO best_bmc_mbmi;
 #endif  // CONFIG_EXT_INTER
 #endif  // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
-
-  int pred_exists = 0;
-  int intpel_mv;
-  int64_t rd, tmp_rd, best_rd = INT64_MAX;
-  int best_needs_copy = 0;
+  int64_t rd = INT64_MAX;
   uint8_t *orig_dst[MAX_MB_PLANE];
   int orig_dst_stride[MAX_MB_PLANE];
+  uint8_t *tmp_dst[MAX_MB_PLANE];
+  int tmp_dst_stride[MAX_MB_PLANE];
   int rs = 0;
-#if CONFIG_DUAL_FILTER
-  // Index use case:
-  // {0, 1} -> (vertical, horizontal) filter types for the first ref frame
-  // {2, 3} -> (vertical, horizontal) filter types for the second ref frame
-  InterpFilter best_filter[4] = {
-    SWITCHABLE, SWITCHABLE, SWITCHABLE, SWITCHABLE,
-  };
-#else
-  InterpFilter best_filter = SWITCHABLE;
-#endif
+  InterpFilter assign_filter = SWITCHABLE;
 
   int skip_txfm_sb = 0;
   int64_t skip_sse_sb = INT64_MAX;
@@ -6792,24 +6745,24 @@
           av1_set_mvcost(x, mbmi->ref_frame[0], 0, mbmi->ref_mv_idx);
 #endif  // CONFIG_REF_MV
           rate_mv = av1_mv_bit_cost(&frame_mv[refs[0]].as_mv,
-                                    &x->mbmi_ext->ref_mvs[refs[0]][0].as_mv,
+                                    &mbmi_ext->ref_mvs[refs[0]][0].as_mv,
                                     x->nmvjointcost, x->mvcost, MV_COST_WEIGHT);
 #if CONFIG_REF_MV
           av1_set_mvcost(x, mbmi->ref_frame[1], 1, mbmi->ref_mv_idx);
 #endif  // CONFIG_REF_MV
           rate_mv += av1_mv_bit_cost(
-              &frame_mv[refs[1]].as_mv, &x->mbmi_ext->ref_mvs[refs[1]][0].as_mv,
+              &frame_mv[refs[1]].as_mv, &mbmi_ext->ref_mvs[refs[1]][0].as_mv,
               x->nmvjointcost, x->mvcost, MV_COST_WEIGHT);
         }
       } else if (this_mode == NEAREST_NEWMV || this_mode == NEAR_NEWMV) {
         frame_mv[refs[1]].as_int = single_newmv[refs[1]].as_int;
         rate_mv = av1_mv_bit_cost(&frame_mv[refs[1]].as_mv,
-                                  &x->mbmi_ext->ref_mvs[refs[1]][0].as_mv,
+                                  &mbmi_ext->ref_mvs[refs[1]][0].as_mv,
                                   x->nmvjointcost, x->mvcost, MV_COST_WEIGHT);
       } else {
         frame_mv[refs[0]].as_int = single_newmv[refs[0]].as_int;
         rate_mv = av1_mv_bit_cost(&frame_mv[refs[0]].as_mv,
-                                  &x->mbmi_ext->ref_mvs[refs[0]][0].as_mv,
+                                  &mbmi_ext->ref_mvs[refs[0]][0].as_mv,
                                   x->nmvjointcost, x->mvcost, MV_COST_WEIGHT);
       }
 #else
@@ -6825,13 +6778,13 @@
         av1_set_mvcost(x, mbmi->ref_frame[0], 0, mbmi->ref_mv_idx);
 #endif  // CONFIG_REF_MV
         rate_mv = av1_mv_bit_cost(&frame_mv[refs[0]].as_mv,
-                                  &x->mbmi_ext->ref_mvs[refs[0]][0].as_mv,
+                                  &mbmi_ext->ref_mvs[refs[0]][0].as_mv,
                                   x->nmvjointcost, x->mvcost, MV_COST_WEIGHT);
 #if CONFIG_REF_MV
         av1_set_mvcost(x, mbmi->ref_frame[1], 1, mbmi->ref_mv_idx);
 #endif  // CONFIG_REF_MV
         rate_mv += av1_mv_bit_cost(&frame_mv[refs[1]].as_mv,
-                                   &x->mbmi_ext->ref_mvs[refs[1]][0].as_mv,
+                                   &mbmi_ext->ref_mvs[refs[1]][0].as_mv,
                                    x->nmvjointcost, x->mvcost, MV_COST_WEIGHT);
       }
 #endif  // CONFIG_EXT_INTER
@@ -6966,6 +6919,10 @@
   // one for future predictions. In the end, copy from tmp_buf to
   // dst if necessary.
   for (i = 0; i < MAX_MB_PLANE; i++) {
+    tmp_dst[i] = tmp_buf + i * MAX_SB_SQUARE;
+    tmp_dst_stride[i] = MAX_SB_SIZE;
+  }
+  for (i = 0; i < MAX_MB_PLANE; i++) {
     orig_dst[i] = xd->plane[i].dst.buf;
     orig_dst_stride[i] = xd->plane[i].dst.stride;
   }
@@ -7003,135 +6960,126 @@
       )
     return INT64_MAX;
 
-  pred_exists = 0;
-  // Are all MVs integer pel for Y and UV
-  intpel_mv = !mv_has_subpel(&mbmi->mv[0].as_mv);
-  if (is_comp_pred) intpel_mv &= !mv_has_subpel(&mbmi->mv[1].as_mv);
-
+  if (cm->interp_filter == SWITCHABLE) {
 #if !CONFIG_DUAL_FILTER
-  best_filter =
-      predict_interp_filter(cpi, x, bsize, mi_row, mi_col, single_filter);
+    assign_filter =
+        predict_interp_filter(cpi, x, bsize, mi_row, mi_col, single_filter);
 #endif
+#if CONFIG_EXT_INTERP || CONFIG_DUAL_FILTER
+    if (!av1_is_interp_needed(xd)) assign_filter = EIGHTTAP_REGULAR;
+#endif
+  } else {
+    assign_filter = cm->interp_filter;
+  }
 
-  if (cm->interp_filter != BILINEAR) {
-    int newbest;
-    int tmp_rate_sum = 0;
-    int64_t tmp_dist_sum = 0;
-
+  {  // Do interpolation filter search within this block
+    int tmp_rate;
+    int64_t tmp_dist;
 #if CONFIG_DUAL_FILTER
-    for (i = 0; i < SWITCHABLE_FILTERS * SWITCHABLE_FILTERS; ++i)
+    mbmi->interp_filter[0] =
+        assign_filter == SWITCHABLE ? EIGHTTAP_REGULAR : assign_filter;
+    mbmi->interp_filter[1] =
+        assign_filter == SWITCHABLE ? EIGHTTAP_REGULAR : assign_filter;
+    mbmi->interp_filter[2] =
+        assign_filter == SWITCHABLE ? EIGHTTAP_REGULAR : assign_filter;
+    mbmi->interp_filter[3] =
+        assign_filter == SWITCHABLE ? EIGHTTAP_REGULAR : assign_filter;
 #else
-    for (i = 0; i < SWITCHABLE_FILTERS; ++i)
+    mbmi->interp_filter =
+        assign_filter == SWITCHABLE ? EIGHTTAP_REGULAR : assign_filter;
 #endif
-    {
-      int j;
-      int64_t rs_rd;
-      int tmp_skip_sb = 0;
-      int64_t tmp_skip_sse = INT64_MAX;
+    rs = av1_get_switchable_rate(cpi, xd);
+    av1_build_inter_predictors_sb(xd, mi_row, mi_col, bsize);
+    model_rd_for_sb(cpi, bsize, x, xd, 0, MAX_MB_PLANE - 1, &tmp_rate,
+                    &tmp_dist, &skip_txfm_sb, &skip_sse_sb);
+    rd = RDCOST(x->rdmult, x->rddiv, rs + tmp_rate, tmp_dist);
 
+    if (assign_filter == SWITCHABLE) {
+      // do interp_filter search
+      if (av1_is_interp_needed(xd)) {
+        int best_in_temp = 0;
 #if CONFIG_DUAL_FILTER
-      mbmi->interp_filter[0] = filter_sets[i][0];
-      mbmi->interp_filter[1] = filter_sets[i][1];
-      mbmi->interp_filter[2] = filter_sets[i][0];
-      mbmi->interp_filter[3] = filter_sets[i][1];
+        InterpFilter best_filter[4];
+        av1_copy(best_filter, mbmi->interp_filter);
 #else
-      mbmi->interp_filter = i;
+        InterpFilter best_filter = mbmi->interp_filter;
 #endif
-      rs = av1_get_switchable_rate(cpi, xd);
-      rs_rd = RDCOST(x->rdmult, x->rddiv, rs, 0);
+        restore_dst_buf(xd, tmp_dst, tmp_dst_stride);
+#if CONFIG_DUAL_FILTER
+        // EIGHTTAP_REGULAR mode is calculated beforehand
+        for (i = 1; i < SWITCHABLE_FILTERS * SWITCHABLE_FILTERS; ++i)
+#else
+        // EIGHTTAP_REGULAR mode is calculated beforehand
+        for (i = 1; i < SWITCHABLE_FILTERS; ++i)
+#endif
+        {
+          int tmp_skip_sb = 0;
+          int64_t tmp_skip_sse = INT64_MAX;
+          int tmp_rs;
+          int64_t tmp_rd;
+#if CONFIG_DUAL_FILTER
+          mbmi->interp_filter[0] = filter_sets[i][0];
+          mbmi->interp_filter[1] = filter_sets[i][1];
+          mbmi->interp_filter[2] = filter_sets[i][0];
+          mbmi->interp_filter[3] = filter_sets[i][1];
+#else
+          mbmi->interp_filter = i;
+#endif
+          tmp_rs = av1_get_switchable_rate(cpi, xd);
+          av1_build_inter_predictors_sb(xd, mi_row, mi_col, bsize);
+          model_rd_for_sb(cpi, bsize, x, xd, 0, MAX_MB_PLANE - 1, &tmp_rate,
+                          &tmp_dist, &tmp_skip_sb, &tmp_skip_sse);
+          tmp_rd = RDCOST(x->rdmult, x->rddiv, tmp_rs + tmp_rate, tmp_dist);
 
-      if (i > 0 && intpel_mv && IsInterpolatingFilter(i)) {
-        rd = RDCOST(x->rdmult, x->rddiv, tmp_rate_sum, tmp_dist_sum);
-        if (cm->interp_filter == SWITCHABLE) rd += rs_rd;
-      } else {
-        int rate_sum = 0;
-        int64_t dist_sum = 0;
-        if ((cm->interp_filter == SWITCHABLE && (!i || best_needs_copy)) ||
-#if CONFIG_EXT_INTER
-            is_comp_interintra_pred ||
-#endif  // CONFIG_EXT_INTER
-            (cm->interp_filter != SWITCHABLE &&
-             (
+          if (tmp_rd < rd) {
+            rd = tmp_rd;
+            rs = av1_get_switchable_rate(cpi, xd);
 #if CONFIG_DUAL_FILTER
-                 cm->interp_filter == mbmi->interp_filter[0]
+            av1_copy(best_filter, mbmi->interp_filter);
 #else
-                 cm->interp_filter == mbmi->interp_filter
+            best_filter = mbmi->interp_filter;
 #endif
-                 || (i == 0 && intpel_mv && IsInterpolatingFilter(i))))) {
-          restore_dst_buf(xd, orig_dst, orig_dst_stride);
-        } else {
-          for (j = 0; j < MAX_MB_PLANE; j++) {
-            xd->plane[j].dst.buf = tmp_buf + j * MAX_SB_SQUARE;
-            xd->plane[j].dst.stride = MAX_SB_SIZE;
+            skip_txfm_sb = tmp_skip_sb;
+            skip_sse_sb = tmp_skip_sse;
+            best_in_temp = !best_in_temp;
+            if (best_in_temp) {
+              restore_dst_buf(xd, orig_dst, orig_dst_stride);
+            } else {
+              restore_dst_buf(xd, tmp_dst, tmp_dst_stride);
+            }
           }
         }
-        av1_build_inter_predictors_sb(xd, mi_row, mi_col, bsize);
-        model_rd_for_sb(cpi, bsize, x, xd, 0, MAX_MB_PLANE - 1, &rate_sum,
-                        &dist_sum, &tmp_skip_sb, &tmp_skip_sse);
-
-        rd = RDCOST(x->rdmult, x->rddiv, rate_sum, dist_sum);
-        if (cm->interp_filter == SWITCHABLE) rd += rs_rd;
-
-        if (i == 0 && intpel_mv && IsInterpolatingFilter(i)) {
-          tmp_rate_sum = rate_sum;
-          tmp_dist_sum = dist_sum;
+        if (best_in_temp) {
+          restore_dst_buf(xd, tmp_dst, tmp_dst_stride);
+        } else {
+          restore_dst_buf(xd, orig_dst, orig_dst_stride);
         }
-      }
-      newbest = i == 0 || rd < best_rd;
-
-      if (newbest) {
-        best_rd = rd;
 #if CONFIG_DUAL_FILTER
-        best_filter[0] = mbmi->interp_filter[0];
-        best_filter[1] = mbmi->interp_filter[1];
-        best_filter[2] = mbmi->interp_filter[2];
-        best_filter[3] = mbmi->interp_filter[3];
+        av1_copy(mbmi->interp_filter, best_filter);
 #else
-        best_filter = mbmi->interp_filter;
+        mbmi->interp_filter = best_filter;
 #endif
-        if (cm->interp_filter == SWITCHABLE && i &&
-            !(intpel_mv && IsInterpolatingFilter(i)))
-          best_needs_copy = !best_needs_copy;
-      }
-
-      if ((cm->interp_filter == SWITCHABLE && newbest) ||
-          (cm->interp_filter != SWITCHABLE &&
-#if CONFIG_DUAL_FILTER
-           cm->interp_filter == mbmi->interp_filter[0]
-#else
-           cm->interp_filter == mbmi->interp_filter
-#endif
-           )) {
-        pred_exists = 1;
-        tmp_rd = best_rd;
-
-        skip_txfm_sb = tmp_skip_sb;
-        skip_sse_sb = tmp_skip_sse;
       } else {
-        pred_exists = 0;
+#if !CONFIG_EXT_INTERP && !CONFIG_DUAL_FILTER
+        int tmp_rs;
+        InterpFilter best_filter = mbmi->interp_filter;
+        rs = av1_get_switchable_rate(cpi, xd);
+        for (i = 1; i < SWITCHABLE_FILTERS; ++i) {
+          mbmi->interp_filter = i;
+          tmp_rs = av1_get_switchable_rate(cpi, xd);
+          if (tmp_rs < rs) {
+            rs = tmp_rs;
+            best_filter = i;
+          }
+        }
+        mbmi->interp_filter = best_filter;
+#else
+        assert(0);
+#endif
       }
     }
-    restore_dst_buf(xd, orig_dst, orig_dst_stride);
   }
 
-// Set the appropriate filter
-#if CONFIG_DUAL_FILTER
-  mbmi->interp_filter[0] =
-      cm->interp_filter != SWITCHABLE ? cm->interp_filter : best_filter[0];
-  mbmi->interp_filter[1] =
-      cm->interp_filter != SWITCHABLE ? cm->interp_filter : best_filter[1];
-  if (mbmi->ref_frame[1] > INTRA_FRAME) {
-    mbmi->interp_filter[2] =
-        cm->interp_filter != SWITCHABLE ? cm->interp_filter : best_filter[2];
-    mbmi->interp_filter[3] =
-        cm->interp_filter != SWITCHABLE ? cm->interp_filter : best_filter[3];
-  }
-#else
-  mbmi->interp_filter =
-      cm->interp_filter != SWITCHABLE ? cm->interp_filter : best_filter;
-#endif
-  rs = cm->interp_filter == SWITCHABLE ? av1_get_switchable_rate(cpi, xd) : 0;
-
 #if CONFIG_EXT_INTER
 #if CONFIG_MOTION_VAR
   best_bmc_mbmi = *mbmi;
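
The rewritten filter search above replaces the pred_exists/best_needs_copy bookkeeping with a simple double buffer: the running best prediction lives in either the real destination planes or the tmp_dst scratch planes, best_in_temp records which, and each candidate is rendered into whichever buffer does not hold the best. When a candidate wins, flipping the flag "keeps" it with no memcpy. A stripped-down sketch of the toggle (the RD comparison is a stand-in):

    static void pingpong_search(int n_candidates) {
      int best_in_temp = 0; /* 0: best is in dst planes, 1: in tmp planes */
      for (int i = 1; i < n_candidates; ++i) {
        /* predict candidate i into the non-best buffer, then compare */
        int candidate_wins = (i & 1); /* stand-in for tmp_rd < rd */
        if (candidate_wins) best_in_temp = !best_in_temp;
      }
      /* afterwards, point the dst pointers at whichever buffer won */
      (void)best_in_temp;
    }
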
@@ -7264,7 +7212,6 @@
       return INT64_MAX;
 
     pred_exists = 0;
-    tmp_rd = AOMMIN(best_rd_wedge, best_rd_nowedge);
 
     if (mbmi->use_wedge_interinter)
       *compmode_wedge_cost =
@@ -7405,7 +7352,6 @@
     }
 
     pred_exists = 0;
-    tmp_rd = best_interintra_rd;
     *compmode_interintra_cost =
         av1_cost_bit(cm->fc->interintra_prob[size_group_lookup[bsize]], 1);
     *compmode_interintra_cost += interintra_mode_cost[mbmi->interintra_mode];
@@ -7432,29 +7378,15 @@
     pred_exists = 0;
   }
 #endif  // CONFIG_EXT_INTERP
-#endif  // CONFIG_EXT_INTER
-
-  if (pred_exists) {
-    if (best_needs_copy) {
-      // again temporarily set the buffers to local memory to prevent a memcpy
-      for (i = 0; i < MAX_MB_PLANE; i++) {
-        xd->plane[i].dst.buf = tmp_buf + i * MAX_SB_SQUARE;
-        xd->plane[i].dst.stride = MAX_SB_SIZE;
-      }
-    }
-    rd = tmp_rd;
-  } else {
+  if (pred_exists == 0) {
     int tmp_rate;
     int64_t tmp_dist;
-
-    // Handles the special case when a filter that is not in the
-    // switchable list (ex. bilinear) is indicated at the frame level, or
-    // skip condition holds.
     av1_build_inter_predictors_sb(xd, mi_row, mi_col, bsize);
     model_rd_for_sb(cpi, bsize, x, xd, 0, MAX_MB_PLANE - 1, &tmp_rate,
                     &tmp_dist, &skip_txfm_sb, &skip_sse_sb);
     rd = RDCOST(x->rdmult, x->rddiv, rs + tmp_rate, tmp_dist);
   }
+#endif  // CONFIG_EXT_INTER
 
 #if CONFIG_DUAL_FILTER
   if (!is_comp_pred) single_filter[this_mode][refs[0]] = mbmi->interp_filter[0];
@@ -7498,6 +7430,7 @@
   for (mbmi->motion_mode = SIMPLE_TRANSLATION;
        mbmi->motion_mode < (allow_motvar ? MOTION_MODES : 1);
        mbmi->motion_mode++) {
+    int64_t tmp_rd = INT64_MAX;
 #if CONFIG_EXT_INTER
     int tmp_rate2 = mbmi->motion_mode != SIMPLE_TRANSLATION ? rate2_bmc_nocoeff
                                                             : rate2_nocoeff;
@@ -8054,7 +7987,7 @@
     // (prediction granularity), so we account for it in the full rate,
     // not the tokenonly rate.
     rate_y -= cpi->tx_size_cost[max_tx_size - TX_8X8][get_tx_size_context(xd)]
-                               [mbmi->tx_size];
+                               [tx_size_to_depth(mbmi->tx_size)];
   }
 
   rate2 += av1_cost_bit(cm->fc->filter_intra_probs[0],
@@ -8811,8 +8744,9 @@
         // tokenonly rate, but for intra blocks, tx_size is always coded
         // (prediction granularity), so we account for it in the full rate,
         // not the tokenonly rate.
-        rate_y -= cpi->tx_size_cost[max_tx_size - TX_8X8]
-                                   [get_tx_size_context(xd)][mbmi->tx_size];
+        rate_y -=
+            cpi->tx_size_cost[max_tx_size - TX_8X8][get_tx_size_context(xd)]
+                             [tx_size_to_depth(mbmi->tx_size)];
       }
 #if CONFIG_EXT_INTRA
       if (is_directional_mode) {
@@ -10227,14 +10161,13 @@
       this_rd_thresh = (ref_frame == LAST3_FRAME)
                            ? rd_opt->threshes[segment_id][bsize][THR_LAST3]
                            : this_rd_thresh;
+      this_rd_thresh = (ref_frame == BWDREF_FRAME)
+                           ? rd_opt->threshes[segment_id][bsize][THR_BWDR]
+                           : this_rd_thresh;
 #endif  // CONFIG_EXT_REFS
       this_rd_thresh = (ref_frame == GOLDEN_FRAME)
                            ? rd_opt->threshes[segment_id][bsize][THR_GOLD]
                            : this_rd_thresh;
-#if CONFIG_EXT_REFS
-// TODO(zoeliu): To explore whether this_rd_thresh should consider
-//               BWDREF_FRAME and ALTREF_FRAME
-#endif  // CONFIG_EXT_REFS
 
       // TODO(any): Add search of the tx_type to improve rd performance at the
       // expense of speed.
diff --git a/av1/encoder/segmentation.c b/av1/encoder/segmentation.c
index 3292da4..828b31c 100644
--- a/av1/encoder/segmentation.c
+++ b/av1/encoder/segmentation.c
@@ -51,7 +51,8 @@
 // Based on set of segment counts calculate a probability tree
 static void calc_segtree_probs(unsigned *segcounts,
                                aom_prob *segment_tree_probs,
-                               const aom_prob *cur_tree_probs) {
+                               const aom_prob *cur_tree_probs,
+                               const int probwt) {
   // Work out probabilities of each segment
   const unsigned cc[4] = { segcounts[0] + segcounts[1],
                            segcounts[2] + segcounts[3],
@@ -71,8 +72,9 @@
   for (i = 0; i < 7; i++) {
     const unsigned *ct =
         i == 0 ? ccc : i < 3 ? cc + (i & 2) : segcounts + (i - 3) * 2;
-    av1_prob_diff_update_savings_search(
-        ct, cur_tree_probs[i], &segment_tree_probs[i], DIFF_UPDATE_PROB);
+    av1_prob_diff_update_savings_search(ct, cur_tree_probs[i],
+                                        &segment_tree_probs[i],
+                                        DIFF_UPDATE_PROB, probwt);
   }
 }
 
@@ -294,6 +296,11 @@
   int t_pred_cost = INT_MAX;
 
   int i, tile_col, tile_row, mi_row, mi_col;
+#if CONFIG_TILE_GROUPS
+  const int probwt = cm->num_tg;
+#else
+  const int probwt = 1;
+#endif
 
   unsigned(*temporal_predictor_count)[2] = cm->counts.seg.pred;
   unsigned *no_pred_segcounts = cm->counts.seg.tree_total;
@@ -333,14 +340,15 @@
 
   // Work out probability tree for coding segments without prediction
   // and the cost.
-  calc_segtree_probs(no_pred_segcounts, no_pred_tree, segp->tree_probs);
+  calc_segtree_probs(no_pred_segcounts, no_pred_tree, segp->tree_probs, probwt);
   no_pred_cost = cost_segmap(no_pred_segcounts, no_pred_tree);
 
   // Key frames cannot use temporal prediction
   if (!frame_is_intra_only(cm) && !cm->error_resilient_mode) {
     // Work out probability tree for coding those segments not
     // predicted using the temporal method and the cost.
-    calc_segtree_probs(t_unpred_seg_counts, t_pred_tree, segp->tree_probs);
+    calc_segtree_probs(t_unpred_seg_counts, t_pred_tree, segp->tree_probs,
+                       probwt);
     t_pred_cost = cost_segmap(t_unpred_seg_counts, t_pred_tree);
 
     // Add in the cost of the signaling for each prediction context.
@@ -349,9 +357,9 @@
       const int count1 = temporal_predictor_count[i][1];
 
       t_nopred_prob[i] = get_binary_prob(count0, count1);
-      av1_prob_diff_update_savings_search(temporal_predictor_count[i],
-                                          segp->pred_probs[i],
-                                          &t_nopred_prob[i], DIFF_UPDATE_PROB);
+      av1_prob_diff_update_savings_search(
+          temporal_predictor_count[i], segp->pred_probs[i], &t_nopred_prob[i],
+          DIFF_UPDATE_PROB, probwt);
 
       // Add in the predictor signaling cost
       t_pred_cost += count0 * av1_cost_zero(t_nopred_prob[i]) +
@@ -366,9 +374,6 @@
   } else {
     seg->temporal_update = 0;
   }
-#if CONFIG_DAALA_EC
-  av1_tree_to_cdf(av1_segment_tree, segp->tree_probs, segp->tree_cdf);
-#endif
 }
 
 void av1_reset_segment_features(AV1_COMMON *cm) {
diff --git a/av1/encoder/speed_features.c b/av1/encoder/speed_features.c
index acdc13b..2fb651c 100644
--- a/av1/encoder/speed_features.c
+++ b/av1/encoder/speed_features.c
@@ -69,6 +69,11 @@
                                                        int speed) {
   AV1_COMMON *const cm = &cpi->common;
 
+  // Limit memory usage for high resolutions
+  if (AOMMIN(cm->width, cm->height) > 1080) {
+    sf->use_upsampled_references = 0;
+  }
+
   if (speed >= 1) {
     if (AOMMIN(cm->width, cm->height) >= 720) {
       sf->disable_split_mask =
diff --git a/av1/encoder/subexp.c b/av1/encoder/subexp.c
index 0ca5247..81bb56d 100644
--- a/av1/encoder/subexp.c
+++ b/av1/encoder/subexp.c
@@ -116,7 +116,8 @@
 }
 
 int av1_prob_diff_update_savings_search(const unsigned int *ct, aom_prob oldp,
-                                        aom_prob *bestp, aom_prob upd) {
+                                        aom_prob *bestp, aom_prob upd,
+                                        int probwt) {
   const uint32_t old_b = cost_branch256(ct, oldp);
   int bestsavings = 0;
   aom_prob newp, bestnewp = oldp;
@@ -126,7 +127,7 @@
     const uint32_t new_b = cost_branch256(ct, newp);
     const uint32_t update_b =
         prob_diff_update_cost(newp, oldp) + av1_cost_upd256;
-    const int savings = (int)((int64_t)old_b - new_b - update_b);
+    const int savings = (int)((int64_t)old_b - new_b - update_b * probwt);
     if (savings > bestsavings) {
       bestsavings = savings;
       bestnewp = newp;
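
The probwt factor scales only the update_b term: with tile groups, every accepted probability update is rewritten in each tile-group header, so its signaling cost is paid probwt (= cm->num_tg, or 1 without CONFIG_TILE_GROUPS) times, while the old_b - new_b coding gain is earned once. Worked numbers:

    /* old_b = 1000, new_b = 900, update_b = 60:
     *   probwt = 1: savings = 1000 - 900 - 60 * 1 =  40  -> update kept
     *   probwt = 3: savings = 1000 - 900 - 60 * 3 = -80  -> update skipped
     * Three tile groups triple the overhead and flip the decision. */
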
@@ -139,7 +140,7 @@
 int av1_prob_diff_update_savings_search_model(const unsigned int *ct,
                                               const aom_prob *oldp,
                                               aom_prob *bestp, aom_prob upd,
-                                              int stepsize) {
+                                              int stepsize, int probwt) {
   int i, old_b, new_b, update_b, savings, bestsavings;
   int newp;
   const int step_sign = *bestp > oldp[PIVOT_NODE] ? -1 : 1;
@@ -164,7 +165,7 @@
       new_b += cost_branch256(ct + 2 * i, newplist[i]);
     new_b += cost_branch256(ct + 2 * PIVOT_NODE, newplist[PIVOT_NODE]);
     update_b = prob_diff_update_cost(newp, oldp[PIVOT_NODE]) + av1_cost_upd256;
-    savings = old_b - new_b - update_b;
+    savings = old_b - new_b - update_b * probwt;
     if (savings > bestsavings) {
       bestsavings = savings;
       bestnewp = newp;
@@ -253,11 +254,11 @@
 #endif  // CONFIG_ENTROPY
 
 void av1_cond_prob_diff_update(aom_writer *w, aom_prob *oldp,
-                               const unsigned int ct[2]) {
+                               const unsigned int ct[2], int probwt) {
   const aom_prob upd = DIFF_UPDATE_PROB;
   aom_prob newp = get_binary_prob(ct[0], ct[1]);
   const int savings =
-      av1_prob_diff_update_savings_search(ct, *oldp, &newp, upd);
+      av1_prob_diff_update_savings_search(ct, *oldp, &newp, upd, probwt);
   assert(newp >= 1);
   if (savings > 0) {
     aom_write(w, 1, upd);
@@ -268,12 +269,12 @@
   }
 }
 
-int av1_cond_prob_diff_update_savings(aom_prob *oldp,
-                                      const unsigned int ct[2]) {
+int av1_cond_prob_diff_update_savings(aom_prob *oldp, const unsigned int ct[2],
+                                      int probwt) {
   const aom_prob upd = DIFF_UPDATE_PROB;
   aom_prob newp = get_binary_prob(ct[0], ct[1]);
   const int savings =
-      av1_prob_diff_update_savings_search(ct, *oldp, &newp, upd);
+      av1_prob_diff_update_savings_search(ct, *oldp, &newp, upd, probwt);
   return savings;
 }
 
diff --git a/av1/encoder/subexp.h b/av1/encoder/subexp.h
index 25750bb..d01dea9 100644
--- a/av1/encoder/subexp.h
+++ b/av1/encoder/subexp.h
@@ -19,20 +19,22 @@
 #include "aom_dsp/bitwriter.h"
 #include "aom_dsp/prob.h"
 
-void av1_write_prob_diff_update(aom_writer *w, aom_prob newp, aom_prob oldp);
+void av1_write_prob_diff_update(aom_writer *w, aom_prob newp, aom_prob oldpm);
 
 void av1_cond_prob_diff_update(aom_writer *w, aom_prob *oldp,
-                               const unsigned int ct[2]);
+                               const unsigned int ct[2], int probwt);
 
 int av1_prob_diff_update_savings_search(const unsigned int *ct, aom_prob oldp,
-                                        aom_prob *bestp, aom_prob upd);
+                                        aom_prob *bestp, aom_prob upd,
+                                        int probwt);
 
 int av1_prob_diff_update_savings_search_model(const unsigned int *ct,
                                               const aom_prob *oldp,
                                               aom_prob *bestp, aom_prob upd,
-                                              int stepsize);
-int av1_cond_prob_diff_update_savings(aom_prob *oldp, const unsigned int ct[2]);
+                                              int stepsize, int probwt);
 
+int av1_cond_prob_diff_update_savings(aom_prob *oldp, const unsigned int ct[2],
+                                      int probwt);
 #if CONFIG_ENTROPY
 int av1_prob_update_search_subframe(unsigned int ct[][2], aom_prob oldp,
                                     aom_prob *bestp, aom_prob upd, int n);
diff --git a/av1/encoder/tokenize.c b/av1/encoder/tokenize.c
index 47cc02a..d79763e 100644
--- a/av1/encoder/tokenize.c
+++ b/av1/encoder/tokenize.c
@@ -357,17 +357,17 @@
 }
 
 static INLINE void add_token(TOKENEXTRA **t, const aom_prob *context_tree,
-#if CONFIG_RANS || CONFIG_DAALA_EC
-                             const aom_cdf_prob (*token_cdf)[ENTROPY_TOKENS],
-#endif  // CONFIG_RANS
+#if CONFIG_EC_MULTISYMBOL
+                             aom_cdf_prob (*token_cdf)[ENTROPY_TOKENS],
+#endif  // CONFIG_EC_MULTISYMBOL
                              int32_t extra, uint8_t token,
                              uint8_t skip_eob_node, unsigned int *counts) {
   (*t)->token = token;
   (*t)->extra = extra;
   (*t)->context_tree = context_tree;
-#if CONFIG_RANS || CONFIG_DAALA_EC
+#if CONFIG_EC_MULTISYMBOL
   (*t)->token_cdf = token_cdf;
-#endif  // CONFIG_RANS
+#endif  // CONFIG_EC_MULTISYMBOL
   (*t)->skip_eob_node = skip_eob_node;
   (*t)++;
   ++counts[token];
@@ -458,7 +458,7 @@
   aom_prob(*const coef_probs)[COEFF_CONTEXTS][UNCONSTRAINED_NODES] =
       cpi->common.fc->coef_probs[txsize_sqr_map[tx_size]][type][ref];
 #endif  // CONFIG_ENTROPY
-#if CONFIG_RANS || CONFIG_DAALA_EC
+#if CONFIG_EC_MULTISYMBOL
   aom_cdf_prob(*const coef_cdfs)[COEFF_CONTEXTS][ENTROPY_TOKENS] =
       cpi->common.fc->coef_cdfs[tx_size][type][ref];
 #endif
@@ -483,8 +483,8 @@
     av1_get_token_extra(v, &token, &extra);
 
     add_token(&t, coef_probs[band[c]][pt],
-#if CONFIG_RANS || CONFIG_DAALA_EC
-              (const aom_cdf_prob(*)[ENTROPY_TOKENS]) & coef_cdfs[band[c]][pt],
+#if CONFIG_EC_MULTISYMBOL
+              &coef_cdfs[band[c]][pt],
 #endif
               extra, (uint8_t)token, (uint8_t)skip_eob, counts[band[c]][pt]);
 
@@ -495,7 +495,7 @@
   }
   if (c < seg_eob) {
     add_token(&t, coef_probs[band[c]][pt],
-#if CONFIG_RANS || CONFIG_DAALA_EC
+#if CONFIG_EC_MULTISYMBOL
               NULL,
 #endif
               0, EOB_TOKEN, 0, counts[band[c]][pt]);
diff --git a/av1/encoder/tokenize.h b/av1/encoder/tokenize.h
index b9487da..27bdef5 100644
--- a/av1/encoder/tokenize.h
+++ b/av1/encoder/tokenize.h
@@ -36,8 +36,8 @@
 
 typedef struct {
   const aom_prob *context_tree;
-#if CONFIG_RANS || CONFIG_DAALA_EC
-  const aom_cdf_prob (*token_cdf)[ENTROPY_TOKENS];
+#if CONFIG_EC_MULTISYMBOL
+  aom_cdf_prob (*token_cdf)[ENTROPY_TOKENS];
 #endif
   EXTRABIT extra;
   uint8_t token;
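
CONFIG_EC_MULTISYMBOL becomes the umbrella for both multisymbol coders, replacing the scattered 'CONFIG_RANS || CONFIG_DAALA_EC' tests, and token_cdf drops its const qualifiers. The likely reason (an inference from the type change, not stated in the patch) is that adaptive multisymbol coding updates the CDFs in place as symbols are coded, roughly:

    #include <stdint.h>

    /* Toy in-place adaptation -- the real update rule differs. A const
     * CDF pointer would forbid this. */
    static void adapt_cdf(uint16_t *cdf, int nsyms, int coded_sym) {
      for (int i = coded_sym; i < nsyms; ++i) cdf[i] += 1;
    }
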
diff --git a/av1/encoder/x86/hybrid_fwd_txfm_avx2.c b/av1/encoder/x86/hybrid_fwd_txfm_avx2.c
index f4bd142..77ae724 100644
--- a/av1/encoder/x86/hybrid_fwd_txfm_avx2.c
+++ b/av1/encoder/x86/hybrid_fwd_txfm_avx2.c
@@ -18,14 +18,6 @@
 #include "aom_dsp/txfm_common.h"
 #include "aom_dsp/x86/txfm_common_avx2.h"
 
-static INLINE void mm256_reverse_epi16(__m256i *u) {
-  const __m256i control = _mm256_set_epi16(
-      0x0100, 0x0302, 0x0504, 0x0706, 0x0908, 0x0B0A, 0x0D0C, 0x0F0E, 0x0100,
-      0x0302, 0x0504, 0x0706, 0x0908, 0x0B0A, 0x0D0C, 0x0F0E);
-  __m256i v = _mm256_shuffle_epi8(*u, control);
-  *u = _mm256_permute2x128_si256(v, v, 1);
-}
-
 static int32_t get_16x16_sum(const int16_t *input, int stride) {
   __m256i r0, r1, r2, r3, u0, u1;
   __m256i zero = _mm256_setzero_si256();
@@ -71,134 +63,6 @@
   _mm256_zeroupper();
 }
 
-static void mm256_transpose_16x16(__m256i *in) {
-  __m256i tr0_0 = _mm256_unpacklo_epi16(in[0], in[1]);
-  __m256i tr0_1 = _mm256_unpackhi_epi16(in[0], in[1]);
-  __m256i tr0_2 = _mm256_unpacklo_epi16(in[2], in[3]);
-  __m256i tr0_3 = _mm256_unpackhi_epi16(in[2], in[3]);
-  __m256i tr0_4 = _mm256_unpacklo_epi16(in[4], in[5]);
-  __m256i tr0_5 = _mm256_unpackhi_epi16(in[4], in[5]);
-  __m256i tr0_6 = _mm256_unpacklo_epi16(in[6], in[7]);
-  __m256i tr0_7 = _mm256_unpackhi_epi16(in[6], in[7]);
-
-  __m256i tr0_8 = _mm256_unpacklo_epi16(in[8], in[9]);
-  __m256i tr0_9 = _mm256_unpackhi_epi16(in[8], in[9]);
-  __m256i tr0_a = _mm256_unpacklo_epi16(in[10], in[11]);
-  __m256i tr0_b = _mm256_unpackhi_epi16(in[10], in[11]);
-  __m256i tr0_c = _mm256_unpacklo_epi16(in[12], in[13]);
-  __m256i tr0_d = _mm256_unpackhi_epi16(in[12], in[13]);
-  __m256i tr0_e = _mm256_unpacklo_epi16(in[14], in[15]);
-  __m256i tr0_f = _mm256_unpackhi_epi16(in[14], in[15]);
-
-  // 00 10 01 11 02 12 03 13  08 18 09 19 0a 1a 0b 1b
-  // 04 14 05 15 06 16 07 17  0c 1c 0d 1d 0e 1e 0f 1f
-  // 20 30 21 31 22 32 23 33  28 38 29 39 2a 3a 2b 3b
-  // 24 34 25 35 26 36 27 37  2c 3c 2d 3d 2e 3e 2f 3f
-  // 40 50 41 51 42 52 43 53  48 58 49 59 4a 5a 4b 5b
-  // 44 54 45 55 46 56 47 57  4c 5c 4d 5d 4e 5e 4f 5f
-  // 60 70 61 71 62 72 63 73  68 78 69 79 6a 7a 6b 7b
-  // 64 74 65 75 66 76 67 77  6c 7c 6d 7d 6e 7e 6f 7f
-
-  // 80 90 81 91 82 92 83 93  88 98 89 99 8a 9a 8b 9b
-  // 84 94 85 95 86 96 87 97  8c 9c 8d 9d 8e 9e 8f 9f
-  // a0 b0 a1 b1 a2 b2 a3 b3  a8 b8 a9 b9 aa ba ab bb
-  // a4 b4 a5 b5 a6 b6 a7 b7  ac bc ad bd ae be af bf
-  // c0 d0 c1 d1 c2 d2 c3 d3  c8 d8 c9 d9 ca da cb db
-  // c4 d4 c5 d5 c6 d6 c7 d7  cc dc cd dd ce de cf df
-  // e0 f0 e1 f1 e2 f2 e3 f3  e8 f8 e9 f9 ea fa eb fb
-  // e4 f4 e5 f5 e6 f6 e7 f7  ec fc ed fd ee fe ef ff
-
-  __m256i tr1_0 = _mm256_unpacklo_epi32(tr0_0, tr0_2);
-  __m256i tr1_1 = _mm256_unpackhi_epi32(tr0_0, tr0_2);
-  __m256i tr1_2 = _mm256_unpacklo_epi32(tr0_1, tr0_3);
-  __m256i tr1_3 = _mm256_unpackhi_epi32(tr0_1, tr0_3);
-  __m256i tr1_4 = _mm256_unpacklo_epi32(tr0_4, tr0_6);
-  __m256i tr1_5 = _mm256_unpackhi_epi32(tr0_4, tr0_6);
-  __m256i tr1_6 = _mm256_unpacklo_epi32(tr0_5, tr0_7);
-  __m256i tr1_7 = _mm256_unpackhi_epi32(tr0_5, tr0_7);
-
-  __m256i tr1_8 = _mm256_unpacklo_epi32(tr0_8, tr0_a);
-  __m256i tr1_9 = _mm256_unpackhi_epi32(tr0_8, tr0_a);
-  __m256i tr1_a = _mm256_unpacklo_epi32(tr0_9, tr0_b);
-  __m256i tr1_b = _mm256_unpackhi_epi32(tr0_9, tr0_b);
-  __m256i tr1_c = _mm256_unpacklo_epi32(tr0_c, tr0_e);
-  __m256i tr1_d = _mm256_unpackhi_epi32(tr0_c, tr0_e);
-  __m256i tr1_e = _mm256_unpacklo_epi32(tr0_d, tr0_f);
-  __m256i tr1_f = _mm256_unpackhi_epi32(tr0_d, tr0_f);
-
-  // 00 10 20 30 01 11 21 31  08 18 28 38 09 19 29 39
-  // 02 12 22 32 03 13 23 33  0a 1a 2a 3a 0b 1b 2b 3b
-  // 04 14 24 34 05 15 25 35  0c 1c 2c 3c 0d 1d 2d 3d
-  // 06 16 26 36 07 17 27 37  0e 1e 2e 3e 0f 1f 2f 3f
-  // 40 50 60 70 41 51 61 71  48 58 68 78 49 59 69 79
-  // 42 52 62 72 43 53 63 73  4a 5a 6a 7a 4b 5b 6b 7b
-  // 44 54 64 74 45 55 65 75  4c 5c 6c 7c 4d 5d 6d 7d
-  // 46 56 66 76 47 57 67 77  4e 5e 6e 7e 4f 5f 6f 7f
-
-  // 80 90 a0 b0 81 91 a1 b1  88 98 a8 b8 89 99 a9 b9
-  // 82 92 a2 b2 83 93 a3 b3  8a 9a aa ba 8b 9b ab bb
-  // 84 94 a4 b4 85 95 a5 b5  8c 9c ac bc 8d 9d ad bd
-  // 86 96 a6 b6 87 97 a7 b7  8e ae 9e be 8f 9f af bf
-  // c0 d0 e0 f0 c1 d1 e1 f1  c8 d8 e8 f8 c9 d9 e9 f9
-  // c2 d2 e2 f2 c3 d3 e3 f3  ca da ea fa cb db eb fb
-  // c4 d4 e4 f4 c5 d5 e5 f5  cc dc ef fc cd dd ed fd
-  // c6 d6 e6 f6 c7 d7 e7 f7  ce de ee fe cf df ef ff
-
-  tr0_0 = _mm256_unpacklo_epi64(tr1_0, tr1_4);
-  tr0_1 = _mm256_unpackhi_epi64(tr1_0, tr1_4);
-  tr0_2 = _mm256_unpacklo_epi64(tr1_1, tr1_5);
-  tr0_3 = _mm256_unpackhi_epi64(tr1_1, tr1_5);
-  tr0_4 = _mm256_unpacklo_epi64(tr1_2, tr1_6);
-  tr0_5 = _mm256_unpackhi_epi64(tr1_2, tr1_6);
-  tr0_6 = _mm256_unpacklo_epi64(tr1_3, tr1_7);
-  tr0_7 = _mm256_unpackhi_epi64(tr1_3, tr1_7);
-
-  tr0_8 = _mm256_unpacklo_epi64(tr1_8, tr1_c);
-  tr0_9 = _mm256_unpackhi_epi64(tr1_8, tr1_c);
-  tr0_a = _mm256_unpacklo_epi64(tr1_9, tr1_d);
-  tr0_b = _mm256_unpackhi_epi64(tr1_9, tr1_d);
-  tr0_c = _mm256_unpacklo_epi64(tr1_a, tr1_e);
-  tr0_d = _mm256_unpackhi_epi64(tr1_a, tr1_e);
-  tr0_e = _mm256_unpacklo_epi64(tr1_b, tr1_f);
-  tr0_f = _mm256_unpackhi_epi64(tr1_b, tr1_f);
-
-  // 00 10 20 30 40 50 60 70  08 18 28 38 48 58 68 78
-  // 01 11 21 31 41 51 61 71  09 19 29 39 49 59 69 79
-  // 02 12 22 32 42 52 62 72  0a 1a 2a 3a 4a 5a 6a 7a
-  // 03 13 23 33 43 53 63 73  0b 1b 2b 3b 4b 5b 6b 7b
-  // 04 14 24 34 44 54 64 74  0c 1c 2c 3c 4c 5c 6c 7c
-  // 05 15 25 35 45 55 65 75  0d 1d 2d 3d 4d 5d 6d 7d
-  // 06 16 26 36 46 56 66 76  0e 1e 2e 3e 4e 5e 6e 7e
-  // 07 17 27 37 47 57 67 77  0f 1f 2f 3f 4f 5f 6f 7f
-
-  // 80 90 a0 b0 c0 d0 e0 f0  88 98 a8 b8 c8 d8 e8 f8
-  // 81 91 a1 b1 c1 d1 e1 f1  89 99 a9 b9 c9 d9 e9 f9
-  // 82 92 a2 b2 c2 d2 e2 f2  8a 9a aa ba ca da ea fa
-  // 83 93 a3 b3 c3 d3 e3 f3  8b 9b ab bb cb db eb fb
-  // 84 94 a4 b4 c4 d4 e4 f4  8c 9c ac bc cc dc ef fc
-  // 85 95 a5 b5 c5 d5 e5 f5  8d 9d ad bd cd dd ed fd
-  // 86 96 a6 b6 c6 d6 e6 f6  8e ae 9e be ce de ee fe
-  // 87 97 a7 b7 c7 d7 e7 f7  8f 9f af bf cf df ef ff
-
-  in[0] = _mm256_permute2x128_si256(tr0_0, tr0_8, 0x20);  // 0010 0000
-  in[8] = _mm256_permute2x128_si256(tr0_0, tr0_8, 0x31);  // 0011 0001
-  in[1] = _mm256_permute2x128_si256(tr0_1, tr0_9, 0x20);
-  in[9] = _mm256_permute2x128_si256(tr0_1, tr0_9, 0x31);
-  in[2] = _mm256_permute2x128_si256(tr0_2, tr0_a, 0x20);
-  in[10] = _mm256_permute2x128_si256(tr0_2, tr0_a, 0x31);
-  in[3] = _mm256_permute2x128_si256(tr0_3, tr0_b, 0x20);
-  in[11] = _mm256_permute2x128_si256(tr0_3, tr0_b, 0x31);
-
-  in[4] = _mm256_permute2x128_si256(tr0_4, tr0_c, 0x20);
-  in[12] = _mm256_permute2x128_si256(tr0_4, tr0_c, 0x31);
-  in[5] = _mm256_permute2x128_si256(tr0_5, tr0_d, 0x20);
-  in[13] = _mm256_permute2x128_si256(tr0_5, tr0_d, 0x31);
-  in[6] = _mm256_permute2x128_si256(tr0_6, tr0_e, 0x20);
-  in[14] = _mm256_permute2x128_si256(tr0_6, tr0_e, 0x31);
-  in[7] = _mm256_permute2x128_si256(tr0_7, tr0_f, 0x20);
-  in[15] = _mm256_permute2x128_si256(tr0_7, tr0_f, 0x31);
-}
-
 static INLINE void load_buffer_16x16(const int16_t *input, int stride,
                                      int flipud, int fliplr, __m256i *in) {
   if (!flipud) {
@@ -352,19 +216,6 @@
   in[15] = _mm256_srai_epi16(in[15], 2);
 }
 
-static INLINE __m256i butter_fly(__m256i a0, __m256i a1, const __m256i cospi) {
-  const __m256i dct_rounding = _mm256_set1_epi32(DCT_CONST_ROUNDING);
-  __m256i y0 = _mm256_madd_epi16(a0, cospi);
-  __m256i y1 = _mm256_madd_epi16(a1, cospi);
-
-  y0 = _mm256_add_epi32(y0, dct_rounding);
-  y1 = _mm256_add_epi32(y1, dct_rounding);
-  y0 = _mm256_srai_epi32(y0, DCT_CONST_BITS);
-  y1 = _mm256_srai_epi32(y1, DCT_CONST_BITS);
-
-  return _mm256_packs_epi32(y0, y1);
-}
-
 static void fdct16_avx2(__m256i *in) {
   // sequence: cospi_L_H = pairs(L, H) and L first
   const __m256i cospi_p16_m16 = pair256_set_epi16(cospi_16_64, -cospi_16_64);
@@ -1099,31 +950,7 @@
 }
 
 #if CONFIG_EXT_TX
-static void fidtx16_avx2(__m256i *in) {
-  const __m256i zero = _mm256_setzero_si256();
-  const __m256i sqrt2_epi16 = _mm256_set1_epi16((int16_t)Sqrt2);
-  const __m256i dct_const_rounding = _mm256_set1_epi32(DCT_CONST_ROUNDING);
-  __m256i u0, u1;
-  int i = 0;
-
-  while (i < 16) {
-    in[i] = _mm256_slli_epi16(in[i], 1);
-
-    u0 = _mm256_unpacklo_epi16(zero, in[i]);
-    u1 = _mm256_unpackhi_epi16(zero, in[i]);
-
-    u0 = _mm256_madd_epi16(u0, sqrt2_epi16);
-    u1 = _mm256_madd_epi16(u1, sqrt2_epi16);
-
-    u0 = _mm256_add_epi32(u0, dct_const_rounding);
-    u1 = _mm256_add_epi32(u1, dct_const_rounding);
-
-    u0 = _mm256_srai_epi32(u0, DCT_CONST_BITS);
-    u1 = _mm256_srai_epi32(u1, DCT_CONST_BITS);
-    in[i] = _mm256_packs_epi32(u0, u1);
-    i++;
-  }
-}
+static void fidtx16_avx2(__m256i *in) { txfm_scaling16_avx2(Sqrt2, in); }
 #endif
 
 void av1_fht16x16_avx2(const int16_t *input, tran_low_t *output, int stride,
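
fidtx16_avx2() collapses into the shared txfm_scaling16_avx2() helper, which is assumed to apply the same per-coefficient scaling the deleted loop performed: shift left by one, multiply by the fixed-point Sqrt2 constant, then apply DCT rounding. A scalar model of one coefficient (constant and shift names as in aom_dsp/txfm_common.h):

    #include <stdint.h>

    static int32_t fidtx_scale_one(int16_t x, int32_t sqrt2, int bits) {
      int32_t y = (x << 1) * sqrt2;           /* slli + madd against Sqrt2 */
      return (y + (1 << (bits - 1))) >> bits; /* DCT_CONST_ROUNDING shift  */
    }
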
diff --git a/configure b/configure
index 5bfce04..3258ee3 100755
--- a/configure
+++ b/configure
@@ -272,6 +272,7 @@
     supertx
     ans
     rans
+    ec_multisymbol
     loop_restoration
     ext_partition
     ext_partition_types
@@ -289,6 +290,10 @@
     delta_q
     adapt_scan
     filter_7bit
+    parallel_deblocking
+    tile_groups
+    ec_adapt
+    simp_mv_pred
 "
 CONFIG_LIST="
     dependency_tracking
@@ -405,6 +410,7 @@
     aom_highbitdepth
     experimental
     aom_qm
+    tile_groups
 "
 
 process_cmdline() {
@@ -450,6 +456,11 @@
         enabled ${c} && enable_feature ${c##*_}s
     done
 
+    # Fix up experiment dependencies
+    enabled ec_adapt && enable_feature ec_multisymbol
+    enabled ec_multisymbol && ! enabled ans && soft_enable daala_ec
+    enabled ec_multisymbol && ! enabled daala_ec && soft_enable ans
+    enabled daala_ec && enable_feature ec_multisymbol
     if enabled global_motion && (enabled ext_inter || enabled dual_filter); then
       log_echo "global_motion currently not compatible with ext_inter"
       log_echo "and dual_filter. Disabling global_motion."
@@ -619,6 +630,7 @@
         check_add_cflags -Wuninitialized
         check_add_cflags -Wunused
         check_add_cflags -Wsign-compare
+        check_add_cflags -Wlogical-op
         # Enabling the following warning (in combination with -Wunused above)
         # for C++ generates errors in third_party code including googletest and
         # libyuv. So enable it only for C code.
diff --git a/test/av1_convolve_test.cc b/test/av1_convolve_test.cc
index 435e106..0324b8e 100644
--- a/test/av1_convolve_test.cc
+++ b/test/av1_convolve_test.cc
@@ -15,7 +15,7 @@
 #include "./aom_dsp_rtcd.h"
 #include "test/acm_random.h"
 #include "av1/common/filter.h"
-#include "av1/common/av1_convolve.h"
+#include "av1/common/convolve.h"
 #include "aom_dsp/aom_dsp_common.h"
 #include "aom_ports/mem.h"
 
diff --git a/test/av1_fht16x16_test.cc b/test/av1_fht16x16_test.cc
index 4a44e16..0b89071 100644
--- a/test/av1_fht16x16_test.cc
+++ b/test/av1_fht16x16_test.cc
@@ -33,6 +33,11 @@
   av1_fht16x16_c(in, out, stride, tx_type);
 }
 
+void iht16x16_ref(const tran_low_t *in, uint8_t *dest, int stride,
+                  int tx_type) {
+  av1_iht16x16_256_add_c(in, dest, stride, tx_type);
+}
+
 #if CONFIG_AOM_HIGHBITDEPTH
 typedef void (*IHbdHtFunc)(const tran_low_t *in, uint8_t *out, int stride,
                            int tx_type, int bd);
@@ -48,16 +53,6 @@
 }
 #endif  // CONFIG_AOM_HIGHBITDEPTH
 
-#if HAVE_AVX2
-void dummy_inv_txfm(const tran_low_t *in, uint8_t *out, int stride,
-                    int tx_type) {
-  (void)in;
-  (void)out;
-  (void)stride;
-  (void)tx_type;
-}
-#endif
-
 class AV1Trans16x16HT : public libaom_test::TransformTestBase,
                         public ::testing::TestWithParam<Ht16x16Param> {
  public:
@@ -70,6 +65,7 @@
     pitch_ = 16;
     height_ = 16;
     fwd_txfm_ref = fht16x16_ref;
+    inv_txfm_ref = iht16x16_ref;
     bit_depth_ = GET_PARAM(3);
     mask_ = (1 << bit_depth_) - 1;
     num_coeffs_ = GET_PARAM(4);
@@ -90,6 +86,7 @@
 };
 
 TEST_P(AV1Trans16x16HT, CoeffCheck) { RunCoeffCheck(); }
+TEST_P(AV1Trans16x16HT, InvCoeffCheck) { RunInvCoeffCheck(); }
 
 #if CONFIG_AOM_HIGHBITDEPTH
 class AV1HighbdTrans16x16HT
@@ -203,22 +200,27 @@
 
 #if HAVE_AVX2
 const Ht16x16Param kArrayHt16x16Param_avx2[] = {
-  make_tuple(&av1_fht16x16_avx2, dummy_inv_txfm, 0, AOM_BITS_8, 256),
-  make_tuple(&av1_fht16x16_avx2, dummy_inv_txfm, 1, AOM_BITS_8, 256),
-  make_tuple(&av1_fht16x16_avx2, dummy_inv_txfm, 2, AOM_BITS_8, 256),
-  make_tuple(&av1_fht16x16_avx2, dummy_inv_txfm, 3, AOM_BITS_8, 256),
+  make_tuple(&av1_fht16x16_avx2, av1_iht16x16_256_add_avx2, 0, AOM_BITS_8, 256),
+  make_tuple(&av1_fht16x16_avx2, av1_iht16x16_256_add_avx2, 1, AOM_BITS_8, 256),
+  make_tuple(&av1_fht16x16_avx2, av1_iht16x16_256_add_avx2, 2, AOM_BITS_8, 256),
+  make_tuple(&av1_fht16x16_avx2, av1_iht16x16_256_add_avx2, 3, AOM_BITS_8, 256),
 #if CONFIG_EXT_TX
-  make_tuple(&av1_fht16x16_avx2, dummy_inv_txfm, 4, AOM_BITS_8, 256),
-  make_tuple(&av1_fht16x16_avx2, dummy_inv_txfm, 5, AOM_BITS_8, 256),
-  make_tuple(&av1_fht16x16_avx2, dummy_inv_txfm, 6, AOM_BITS_8, 256),
-  make_tuple(&av1_fht16x16_avx2, dummy_inv_txfm, 7, AOM_BITS_8, 256),
-  make_tuple(&av1_fht16x16_avx2, dummy_inv_txfm, 8, AOM_BITS_8, 256),
-  make_tuple(&av1_fht16x16_avx2, dummy_inv_txfm, 10, AOM_BITS_8, 256),
-  make_tuple(&av1_fht16x16_avx2, dummy_inv_txfm, 11, AOM_BITS_8, 256),
-  make_tuple(&av1_fht16x16_avx2, dummy_inv_txfm, 12, AOM_BITS_8, 256),
-  make_tuple(&av1_fht16x16_avx2, dummy_inv_txfm, 13, AOM_BITS_8, 256),
-  make_tuple(&av1_fht16x16_avx2, dummy_inv_txfm, 14, AOM_BITS_8, 256),
-  make_tuple(&av1_fht16x16_avx2, dummy_inv_txfm, 15, AOM_BITS_8, 256)
+  make_tuple(&av1_fht16x16_avx2, av1_iht16x16_256_add_avx2, 4, AOM_BITS_8, 256),
+  make_tuple(&av1_fht16x16_avx2, av1_iht16x16_256_add_avx2, 5, AOM_BITS_8, 256),
+  make_tuple(&av1_fht16x16_avx2, av1_iht16x16_256_add_avx2, 6, AOM_BITS_8, 256),
+  make_tuple(&av1_fht16x16_avx2, av1_iht16x16_256_add_avx2, 7, AOM_BITS_8, 256),
+  make_tuple(&av1_fht16x16_avx2, av1_iht16x16_256_add_avx2, 8, AOM_BITS_8, 256),
+  make_tuple(&av1_fht16x16_avx2, av1_iht16x16_256_add_avx2, 10, AOM_BITS_8,
+             256),
+  make_tuple(&av1_fht16x16_avx2, av1_iht16x16_256_add_avx2, 11, AOM_BITS_8,
+             256),
+  make_tuple(&av1_fht16x16_avx2, av1_iht16x16_256_add_avx2, 12, AOM_BITS_8,
+             256),
+  make_tuple(&av1_fht16x16_avx2, av1_iht16x16_256_add_avx2, 13, AOM_BITS_8,
+             256),
+  make_tuple(&av1_fht16x16_avx2, av1_iht16x16_256_add_avx2, 14, AOM_BITS_8,
+             256),
+  make_tuple(&av1_fht16x16_avx2, av1_iht16x16_256_add_avx2, 15, AOM_BITS_8, 256)
 #endif  // CONFIG_EXT_TX
 };
 INSTANTIATE_TEST_CASE_P(AVX2, AV1Trans16x16HT,
diff --git a/test/av1_inv_txfm_test.cc b/test/av1_inv_txfm_test.cc
index 84e2402..8f6c868 100644
--- a/test/av1_inv_txfm_test.cc
+++ b/test/av1_inv_txfm_test.cc
@@ -24,7 +24,7 @@
 #include "av1/common/blockd.h"
 #include "av1/common/scan.h"
 #include "aom/aom_integer.h"
-#include "av1/common/av1_inv_txfm.h"
+#include "aom_dsp/inv_txfm.h"
 
 using libaom_test::ACMRandom;
 
@@ -104,10 +104,10 @@
 
 INSTANTIATE_TEST_CASE_P(
     C, AV1InvTxfm,
-    ::testing::Values(IdctParam(&av1_idct4_c, &reference_idct_1d, 4, 1),
-                      IdctParam(&av1_idct8_c, &reference_idct_1d, 8, 2),
-                      IdctParam(&av1_idct16_c, &reference_idct_1d, 16, 4),
-                      IdctParam(&av1_idct32_c, &reference_idct_1d, 32, 6)));
+    ::testing::Values(IdctParam(&aom_idct4_c, &reference_idct_1d, 4, 1),
+                      IdctParam(&aom_idct8_c, &reference_idct_1d, 8, 2),
+                      IdctParam(&aom_idct16_c, &reference_idct_1d, 16, 4),
+                      IdctParam(&aom_idct32_c, &reference_idct_1d, 32, 6)));
 
 #if CONFIG_AV1_ENCODER
 typedef void (*FwdTxfmFunc)(const int16_t *in, tran_low_t *out, int stride);
@@ -262,19 +262,19 @@
 
 INSTANTIATE_TEST_CASE_P(
     C, AV1PartialIDctTest,
-    ::testing::Values(make_tuple(&av1_fdct32x32_c, &av1_idct32x32_1024_add_c,
-                                 &av1_idct32x32_34_add_c, TX_32X32, 34),
-                      make_tuple(&av1_fdct32x32_c, &av1_idct32x32_1024_add_c,
-                                 &av1_idct32x32_1_add_c, TX_32X32, 1),
-                      make_tuple(&av1_fdct16x16_c, &av1_idct16x16_256_add_c,
-                                 &av1_idct16x16_10_add_c, TX_16X16, 10),
-                      make_tuple(&av1_fdct16x16_c, &av1_idct16x16_256_add_c,
-                                 &av1_idct16x16_1_add_c, TX_16X16, 1),
-                      make_tuple(&av1_fdct8x8_c, &av1_idct8x8_64_add_c,
-                                 &av1_idct8x8_12_add_c, TX_8X8, 12),
-                      make_tuple(&av1_fdct8x8_c, &av1_idct8x8_64_add_c,
-                                 &av1_idct8x8_1_add_c, TX_8X8, 1),
-                      make_tuple(&av1_fdct4x4_c, &av1_idct4x4_16_add_c,
-                                 &av1_idct4x4_1_add_c, TX_4X4, 1)));
+    ::testing::Values(make_tuple(&aom_fdct32x32_c, &aom_idct32x32_1024_add_c,
+                                 &aom_idct32x32_34_add_c, TX_32X32, 34),
+                      make_tuple(&aom_fdct32x32_c, &aom_idct32x32_1024_add_c,
+                                 &aom_idct32x32_1_add_c, TX_32X32, 1),
+                      make_tuple(&aom_fdct16x16_c, &aom_idct16x16_256_add_c,
+                                 &aom_idct16x16_10_add_c, TX_16X16, 10),
+                      make_tuple(&aom_fdct16x16_c, &aom_idct16x16_256_add_c,
+                                 &aom_idct16x16_1_add_c, TX_16X16, 1),
+                      make_tuple(&aom_fdct8x8_c, &aom_idct8x8_64_add_c,
+                                 &aom_idct8x8_12_add_c, TX_8X8, 12),
+                      make_tuple(&aom_fdct8x8_c, &aom_idct8x8_64_add_c,
+                                 &aom_idct8x8_1_add_c, TX_8X8, 1),
+                      make_tuple(&aom_fdct4x4_c, &aom_idct4x4_16_add_c,
+                                 &aom_idct4x4_1_add_c, TX_4X4, 1)));
 #endif  // CONFIG_AV1_ENCODER
 }  // namespace
diff --git a/test/dct32x32_test.cc b/test/dct32x32_test.cc
index e4179ef..cb2fbd5 100644
--- a/test/dct32x32_test.cc
+++ b/test/dct32x32_test.cc
@@ -436,6 +436,15 @@
                                  &aom_idct32x32_1024_add_sse2, 1, AOM_BITS_8)));
 #endif  // HAVE_AVX2 && !CONFIG_AOM_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
 
+#if HAVE_AVX2 && CONFIG_AOM_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
+INSTANTIATE_TEST_CASE_P(
+    AVX2, Trans32x32Test,
+    ::testing::Values(make_tuple(&aom_fdct32x32_avx2,
+                                 &aom_idct32x32_1024_add_sse2, 0, AOM_BITS_8),
+                      make_tuple(&aom_fdct32x32_rd_avx2,
+                                 &aom_idct32x32_1024_add_sse2, 1, AOM_BITS_8)));
+#endif  // HAVE_AVX2 && CONFIG_AOM_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
+
 #if HAVE_MSA && !CONFIG_AOM_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
 INSTANTIATE_TEST_CASE_P(
     MSA, Trans32x32Test,
diff --git a/test/encoder_parms_get_to_decoder.cc b/test/encoder_parms_get_to_decoder.cc
index 52d68b1..e6bf0be 100644
--- a/test/encoder_parms_get_to_decoder.cc
+++ b/test/encoder_parms_get_to_decoder.cc
@@ -49,7 +49,9 @@
   { 0, 0, 0, 1, 0, AOM_CR_STUDIO_RANGE, AOM_CS_BT_601, { 0, 0 } },
   { 0, 0, 0, 0, 0, AOM_CR_FULL_RANGE, AOM_CS_BT_709, { 0, 0 } },
   { 0, 0, 1, 0, 0, AOM_CR_FULL_RANGE, AOM_CS_BT_2020, { 0, 0 } },
+#if !CONFIG_EC_ADAPT
   { 0, 2, 0, 0, 1, AOM_CR_STUDIO_RANGE, AOM_CS_UNKNOWN, { 640, 480 } },
+#endif
   // TODO(JBB): Test profiles (requires more work).
 };
 
diff --git a/test/ethread_test.cc b/test/ethread_test.cc
index 6b2f1ea..2685f87 100644
--- a/test/ethread_test.cc
+++ b/test/ethread_test.cc
@@ -89,6 +89,7 @@
         encoder->Control(AOME_SET_ARNR_MAXFRAMES, 7);
         encoder->Control(AOME_SET_ARNR_STRENGTH, 5);
         encoder->Control(AOME_SET_ARNR_TYPE, 3);
+        encoder->Control(AV1E_SET_FRAME_PARALLEL_DECODING, 0);
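+        // Frame-parallel decoding is turned off here, presumably so that
+        // backward-adaptive probability updates stay enabled on this path.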
       } else {
         encoder->Control(AOME_SET_ENABLEAUTOALTREF, 0);
         encoder->Control(AV1E_SET_AQ_MODE, 3);
@@ -172,6 +173,10 @@
 
 TEST_P(AVxEncoderThreadTestLarge, EncoderResultTest) { DoTest(); }
 
+#if CONFIG_EC_ADAPT
+// TODO(thdavies): EC_ADAPT does not support tiles
+
+#else
 AV1_INSTANTIATE_TEST_CASE(AVxEncoderThreadTest,
                           ::testing::Values(::libaom_test::kTwoPassGood,
                                             ::libaom_test::kOnePassGood),
@@ -180,5 +185,6 @@
 AV1_INSTANTIATE_TEST_CASE(AVxEncoderThreadTestLarge,
                           ::testing::Values(::libaom_test::kTwoPassGood,
                                             ::libaom_test::kOnePassGood),
-                          ::testing::Range(1, 3));
+                          ::testing::Range(0, 3));
+#endif
 }  // namespace
diff --git a/test/fdct8x8_test.cc b/test/fdct8x8_test.cc
index 9f62ffe..bbfb7f1 100644
--- a/test/fdct8x8_test.cc
+++ b/test/fdct8x8_test.cc
@@ -728,8 +728,7 @@
         make_tuple(&idct8x8_12, &idct8x8_64_add_12_sse2, 6225, AOM_BITS_12)));
 #endif  // HAVE_SSE2 && CONFIG_AOM_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
 
-#if HAVE_SSSE3 && ARCH_X86_64 && !CONFIG_AOM_HIGHBITDEPTH && \
-    !CONFIG_EMULATE_HARDWARE
+#if HAVE_SSSE3 && ARCH_X86_64 && !CONFIG_EMULATE_HARDWARE
 INSTANTIATE_TEST_CASE_P(SSSE3, FwdTrans8x8DCT,
                         ::testing::Values(make_tuple(&aom_fdct8x8_ssse3,
                                                      &aom_idct8x8_64_add_ssse3,
diff --git a/test/fht32x32_test.cc b/test/fht32x32_test.cc
index 1f85761..8545b2c 100644
--- a/test/fht32x32_test.cc
+++ b/test/fht32x32_test.cc
@@ -90,12 +90,11 @@
   IhtFunc inv_txfm_;
 };
 
-// TODO(luoyi): Owing to the range check in DCT_DCT of av1_fht32x32_avx2, as
-// input is out of the range, we use aom_fdct32x32_avx2. However this function
-// does not support CONFIG_AOM_HIGHBITDEPTH. I need to fix the scaling/rounding
-// of av1_fht32x32_avx2 then add this test on CONFIG_AOM_HIGHBITDEPTH.
-#if !CONFIG_AOM_HIGHBITDEPTH
 TEST_P(AV1Trans32x32HT, CoeffCheck) { RunCoeffCheck(); }
+// TODO(luoyi): When CONFIG_AOM_HIGHBITDEPTH = 1, the AVX2 implementation of
+// av1_fht32x32 does not support tran_low_t (int32_t) as an intermediate
+// result, so the MemCheck test cannot pass yet for tx_type = 1, 2, ..., 8.
+#if !CONFIG_AOM_HIGHBITDEPTH
 TEST_P(AV1Trans32x32HT, MemCheck) { RunMemCheck(); }
 #endif
 
diff --git a/test/scan_test.cc b/test/scan_test.cc
new file mode 100644
index 0000000..22a6d85
--- /dev/null
+++ b/test/scan_test.cc
@@ -0,0 +1,86 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "av1/common/scan.h"
+#include "third_party/googletest/src/include/gtest/gtest.h"
+
+namespace {
+
+TEST(ScanTest, av1_augment_prob) {
+  const int tx1d_size = 4;
+  uint32_t prob[16] = { 8, 8, 7, 7, 8, 8, 4, 2, 3, 3, 2, 2, 2, 2, 2, 2 };
+  const uint32_t ref_prob[16] = {
+    8, 8, 7, 7, 8, 8, 4, 2, 3, 3, 2, 2, 2, 2, 2, 2
+  };
+  av1_augment_prob(prob, tx1d_size, tx1d_size);
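+  // The original probabilities should survive in the upper 16 bits.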
+  for (int r = 0; r < tx1d_size; ++r) {
+    for (int c = 0; c < tx1d_size; ++c) {
+      const int idx = r * tx1d_size + c;
+      EXPECT_EQ(ref_prob[idx], prob[idx] >> 16);
+    }
+  }
+
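+  // The low 10 bits are expected to hold the bitwise complement of the
+  // coefficient index, presumably so that equal probabilities sort by
+  // ascending index.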
+  const int mask = (1 << 10) - 1;
+  for (int r = 0; r < tx1d_size; ++r) {
+    for (int c = 0; c < tx1d_size; ++c) {
+      const int idx = r * tx1d_size + c;
+      EXPECT_EQ(idx, mask ^ (prob[r * tx1d_size + c] & mask));
+    }
+  }
+}
+
+TEST(ScanTest, av1_update_sort_order) {
+  const TX_SIZE tx_size = TX_4X4;
+  const uint32_t prob[16] = { 8, 8, 7, 7, 8, 8, 4, 2, 3, 3, 2, 2, 2, 2, 2, 2 };
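+  // Indices sorted by descending probability; ties are broken by
+  // av1_update_sort_order's internal secondary ordering.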
+  const int16_t ref_sort_order[16] = { 0, 1,  4, 5,  2,  3,  6,  8,
+                                       9, 12, 7, 10, 13, 11, 14, 15 };
+  int16_t sort_order[16];
+  av1_update_sort_order(tx_size, prob, sort_order);
+  for (int i = 0; i < 16; ++i) EXPECT_EQ(ref_sort_order[i], sort_order[i]);
+}
+
+TEST(ScanTest, av1_update_scan_order) {
+  TX_SIZE tx_size = TX_4X4;
+  const uint32_t prob[16] = { 4, 5, 7, 4, 5, 6, 8, 2, 3, 3, 2, 2, 2, 2, 2, 2 };
+  int16_t sort_order[16];
+  int16_t scan[16];
+  int16_t iscan[16];
+  const int16_t ref_iscan[16] = { 0, 1, 2,  6,  3, 4,  5,  10,
+                                  7, 8, 11, 13, 9, 12, 14, 15 };
+
+  av1_update_sort_order(tx_size, prob, sort_order);
+  av1_update_scan_order(tx_size, sort_order, scan, iscan);
+
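+  // iscan must match the reference, and scan must be the inverse
+  // permutation of iscan.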
+  for (int i = 0; i < 16; ++i) {
+    EXPECT_EQ(ref_iscan[i], iscan[i]);
+    EXPECT_EQ(i, scan[ref_iscan[i]]);
+  }
+}
+
+TEST(ScanTest, av1_update_neighbors) {
+  TX_SIZE tx_size = TX_4X4;
+  // raster (row-major) scan order
+  const int16_t scan[16] = { 0, 1, 2,  3,  4,  5,  6,  7,
+                             8, 9, 10, 11, 12, 13, 14, 15 };
+  int16_t nb[(16 + 1) * 2];
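+  // Each pair holds a coefficient's two coded neighbors (left and above in
+  // raster order); border positions repeat their single neighbor, and the
+  // first and final pairs are (0, 0).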
+  const int16_t ref_nb[(16 + 1) * 2] = { 0, 0,  0,  0,  1,  1, 2, 2, 0,
+                                         0, 4,  1,  5,  2,  6, 3, 4, 4,
+                                         8, 5,  9,  6,  10, 7, 8, 8, 12,
+                                         9, 13, 10, 14, 11, 0, 0 };
+
+  // For raster order, scan and iscan are identical, so scan is passed twice.
+  av1_update_neighbors(tx_size, scan, scan, nb);
+  for (int i = 0; i < (16 + 1) * 2; ++i) {
+    EXPECT_EQ(ref_nb[i], nb[i]);
+  }
+}
+
+}  // namespace
diff --git a/test/superframe_test.cc b/test/superframe_test.cc
index 339843f..94f4be9 100644
--- a/test/superframe_test.cc
+++ b/test/superframe_test.cc
@@ -134,7 +134,7 @@
                        ::testing::ValuesIn(tile_col_values),
                        ::testing::ValuesIn(tile_row_values)));
 #else
-#if !CONFIG_ANS
+#if !CONFIG_ANS && !CONFIG_DAALA_EC
 AV1_INSTANTIATE_TEST_CASE(
     SuperframeTest,
     ::testing::Combine(::testing::Values(::libaom_test::kTwoPassGood),
diff --git a/test/tile_independence_test.cc b/test/tile_independence_test.cc
index cbc27d0..57f4a60 100644
--- a/test/tile_independence_test.cc
+++ b/test/tile_independence_test.cc
@@ -126,6 +126,9 @@
 
 TEST_P(TileIndependenceTestLarge, MD5Match) { DoTest(); }
 
+#if CONFIG_EC_ADAPT
+// TODO(thdavies): EC_ADAPT does not support tiles
+#else
 #if CONFIG_EXT_TILE
 AV1_INSTANTIATE_TEST_CASE(TileIndependenceTest, ::testing::Values(1, 2, 32),
                           ::testing::Values(1, 2, 32));
@@ -138,4 +141,5 @@
 AV1_INSTANTIATE_TEST_CASE(TileIndependenceTestLarge, ::testing::Values(0, 1),
                           ::testing::Values(0, 1));
 #endif  // CONFIG_EXT_TILE
+#endif  // CONFIG_EC_ADAPT
 }  // namespace
diff --git a/test/transform_test_base.h b/test/transform_test_base.h
index 540136c..64bf2d6 100644
--- a/test/transform_test_base.h
+++ b/test/transform_test_base.h
@@ -210,7 +210,7 @@
           int out_idx = j * stride + k;
           ASSERT_EQ(output_block[out_idx], output_ref_block[out_idx])
               << "Error: not bit-exact result at index: " << out_idx
-              << " at test block: " << i;
+              << " j = " << j << " k = " << k << " at test block: " << i;
         }
       }
     }