Merge "Making names consistent with REFERENCE_MODE enum name."

diff --git a/test/dct32x32_test.cc b/test/dct32x32_test.cc
index 2df3b6f..a1e472a 100644
--- a/test/dct32x32_test.cc
+++ b/test/dct32x32_test.cc

@@ -260,4 +260,14 @@
         make_tuple(&vp9_fdct32x32_rd_sse2,
                    &vp9_idct32x32_1024_add_sse2, 1)));
 #endif
+
+#if HAVE_AVX2
+INSTANTIATE_TEST_CASE_P(
+    AVX2, Trans32x32Test,
+    ::testing::Values(
+        make_tuple(&vp9_fdct32x32_avx2,
+                   &vp9_idct32x32_1024_add_sse2, 0),
+        make_tuple(&vp9_fdct32x32_rd_avx2,
+                   &vp9_idct32x32_1024_add_sse2, 1)));
+#endif
 }  // namespace

diff --git a/vp9/common/vp9_alloccommon.c b/vp9/common/vp9_alloccommon.c
index 6e12638..f495c29 100644
--- a/vp9/common/vp9_alloccommon.c
+++ b/vp9/common/vp9_alloccommon.c

@@ -200,9 +200,6 @@
 
 void vp9_create_common(VP9_COMMON *cm) {
   vp9_machine_specific_config(cm);
-
-  cm->tx_mode = ONLY_4X4;
-  cm->comp_pred_mode = REFERENCE_MODE_SELECT;
 }
 
 void vp9_remove_common(VP9_COMMON *cm) {
@@ -211,8 +208,6 @@
 
 void vp9_initialize_common() {
   vp9_init_neighbors();
-  vp9_coef_tree_initialize();
-  vp9_entropy_mode_init();
 }
 
 void vp9_update_frame_size(VP9_COMMON *cm) {

diff --git a/vp9/common/vp9_blockd.h b/vp9/common/vp9_blockd.h
index 37da92b..993ee79 100644
--- a/vp9/common/vp9_blockd.h
+++ b/vp9/common/vp9_blockd.h

@@ -198,7 +198,6 @@
 };
 
 struct macroblockd_plane {
-  int16_t *qcoeff;
   int16_t *dqcoeff;
   uint16_t *eobs;
   PLANE_TYPE plane_type;
@@ -387,19 +386,6 @@
     foreach_transformed_block_in_plane(xd, bsize, plane, visit, arg);
 }
 
-static int raster_block_offset(BLOCK_SIZE plane_bsize,
-                               int raster_block, int stride) {
-  const int bw = b_width_log2(plane_bsize);
-  const int y = 4 * (raster_block >> bw);
-  const int x = 4 * (raster_block & ((1 << bw) - 1));
-  return y * stride + x;
-}
-static int16_t* raster_block_offset_int16(BLOCK_SIZE plane_bsize,
-                                          int raster_block, int16_t *base) {
-  const int stride = 4 << b_width_log2(plane_bsize);
-  return base + raster_block_offset(plane_bsize, raster_block, stride);
-}
-
 static void txfrm_block_to_raster_xy(BLOCK_SIZE plane_bsize,
                                      TX_SIZE tx_size, int block,
                                      int *x, int *y) {

diff --git a/vp9/common/vp9_entropy.c b/vp9/common/vp9_entropy.c
index 0f978cc..b35c43f 100644
--- a/vp9/common/vp9_entropy.c
+++ b/vp9/common/vp9_entropy.c

@@ -113,49 +113,6 @@
   0, 1, 2, 3, 3, 4, 4, 5, 5, 5, 5, 5
 };
 
-// Array indices are identical to previously-existing CONTEXT_NODE indices
-const vp9_tree_index vp9_coef_tree[TREE_SIZE(MAX_ENTROPY_TOKENS)] = {
-  -DCT_EOB_TOKEN, 2,                          /* 0 = EOB */
-  -ZERO_TOKEN, 4,                             /* 1 = ZERO */
-  -ONE_TOKEN, 6,                              /* 2 = ONE */
-  8, 12,                                      /* 3 = LOW_VAL */
-  -TWO_TOKEN, 10,                            /* 4 = TWO */
-  -THREE_TOKEN, -FOUR_TOKEN,                /* 5 = THREE */
-  14, 16,                                   /* 6 = HIGH_LOW */
-  -DCT_VAL_CATEGORY1, -DCT_VAL_CATEGORY2,   /* 7 = CAT_ONE */
-  18, 20,                                   /* 8 = CAT_THREEFOUR */
-  -DCT_VAL_CATEGORY3, -DCT_VAL_CATEGORY4,   /* 9 = CAT_THREE */
-  -DCT_VAL_CATEGORY5, -DCT_VAL_CATEGORY6    /* 10 = CAT_FIVE */
-};
-
-// Unconstrained Node Tree
-const vp9_tree_index vp9_coef_con_tree[TREE_SIZE(MAX_ENTROPY_TOKENS)] = {
-  2, 6,                                     /* 0 = LOW_VAL */
-  -TWO_TOKEN, 4,                            /* 1 = TWO */
-  -THREE_TOKEN, -FOUR_TOKEN,                /* 2 = THREE */
-  8, 10,                                    /* 3 = HIGH_LOW */
-  -DCT_VAL_CATEGORY1, -DCT_VAL_CATEGORY2,   /* 4 = CAT_ONE */
-  12, 14,                                   /* 5 = CAT_THREEFOUR */
-  -DCT_VAL_CATEGORY3, -DCT_VAL_CATEGORY4,   /* 6 = CAT_THREE */
-  -DCT_VAL_CATEGORY5, -DCT_VAL_CATEGORY6    /* 7 = CAT_FIVE */
-};
-
-
-
-struct vp9_token vp9_coef_encodings[MAX_ENTROPY_TOKENS];
-
-/* Trees for extra bits.  Probabilities are constant and
-   do not depend on previously encoded bits */
-
-static const vp9_prob Pcat1[] = { 159};
-static const vp9_prob Pcat2[] = { 165, 145};
-static const vp9_prob Pcat3[] = { 173, 148, 140};
-static const vp9_prob Pcat4[] = { 176, 155, 140, 135};
-static const vp9_prob Pcat5[] = { 180, 157, 141, 134, 130};
-static const vp9_prob Pcat6[] = {
-  254, 254, 254, 252, 249, 243, 230, 196, 177, 153, 140, 133, 130, 129
-};
-
 const vp9_tree_index vp9_coefmodel_tree[6] = {
   -DCT_EOB_MODEL_TOKEN, 2,                      /* 0 = EOB */
   -ZERO_TOKEN, 4,                               /* 1 = ZERO */
@@ -446,43 +403,6 @@
   extend_to_full_distribution(&full[UNCONSTRAINED_NODES], model[PIVOT_NODE]);
 }
 
-static vp9_tree_index cat1[2], cat2[4], cat3[6], cat4[8], cat5[10], cat6[28];
-
-static void init_bit_tree(vp9_tree_index *p, int n) {
-  int i = 0;
-
-  while (++i < n) {
-    p[0] = p[1] = i << 1;
-    p += 2;
-  }
-
-  p[0] = p[1] = 0;
-}
-
-static void init_bit_trees() {
-  init_bit_tree(cat1, 1);
-  init_bit_tree(cat2, 2);
-  init_bit_tree(cat3, 3);
-  init_bit_tree(cat4, 4);
-  init_bit_tree(cat5, 5);
-  init_bit_tree(cat6, 14);
-}
-
-const vp9_extra_bit vp9_extra_bits[MAX_ENTROPY_TOKENS] = {
-  {0, 0, 0, 0},           // ZERO_TOKEN
-  {0, 0, 0, 1},           // ONE_TOKEN
-  {0, 0, 0, 2},           // TWO_TOKEN
-  {0, 0, 0, 3},           // THREE_TOKEN
-  {0, 0, 0, 4},           // FOUR_TOKEN
-  {cat1, Pcat1, 1, 5},    // DCT_VAL_CATEGORY1
-  {cat2, Pcat2, 2, 7},    // DCT_VAL_CATEGORY2
-  {cat3, Pcat3, 3, 11},   // DCT_VAL_CATEGORY3
-  {cat4, Pcat4, 4, 19},   // DCT_VAL_CATEGORY4
-  {cat5, Pcat5, 5, 35},   // DCT_VAL_CATEGORY5
-  {cat6, Pcat6, 14, 67},  // DCT_VAL_CATEGORY6
-  {0, 0, 0, 0}            // DCT_EOB_TOKEN
-};
-
 #include "vp9/common/vp9_default_coef_probs.h"
 
 void vp9_default_coef_probs(VP9_COMMON *cm) {
@@ -492,11 +412,6 @@
   vp9_copy(cm->fc.coef_probs[TX_32X32], default_coef_probs_32x32);
 }
 
-void vp9_coef_tree_initialize() {
-  init_bit_trees();
-  vp9_tokens_from_tree(vp9_coef_encodings, vp9_coef_tree);
-}
-
 #define COEF_COUNT_SAT 24
 #define COEF_MAX_UPDATE_FACTOR 112
 #define COEF_COUNT_SAT_KEY 24

diff --git a/vp9/common/vp9_entropy.h b/vp9/common/vp9_entropy.h
index 92a6c59..941b251 100644
--- a/vp9/common/vp9_entropy.h
+++ b/vp9/common/vp9_entropy.h

@@ -44,15 +44,9 @@
 extern DECLARE_ALIGNED(16, const uint8_t,
                        vp9_pt_energy_class[MAX_ENTROPY_TOKENS]);
 
-extern const vp9_tree_index vp9_coef_tree[TREE_SIZE(MAX_ENTROPY_TOKENS)];
-
-extern const vp9_tree_index vp9_coef_con_tree[];
-
 #define DCT_EOB_MODEL_TOKEN     3      /* EOB       Extra Bits 0+0 */
 extern const vp9_tree_index vp9_coefmodel_tree[];
 
-extern struct vp9_token vp9_coef_encodings[MAX_ENTROPY_TOKENS];
-
 typedef struct {
   const vp9_tree_index *tree;
   const vp9_prob *prob;
@@ -105,8 +99,6 @@
 
 struct VP9Common;
 void vp9_default_coef_probs(struct VP9Common *cm);
-
-void vp9_coef_tree_initialize();
 void vp9_adapt_coef_probs(struct VP9Common *cm);
 
 static INLINE void reset_skip_context(MACROBLOCKD *xd, BLOCK_SIZE bsize) {

diff --git a/vp9/common/vp9_entropymode.c b/vp9/common/vp9_entropymode.c
index 5d74c69..2652421 100644
--- a/vp9/common/vp9_entropymode.c
+++ b/vp9/common/vp9_entropymode.c

@@ -232,21 +232,18 @@
   -D63_PRED, 16,                    /* 7 = D63_NODE */
   -D153_PRED, -D207_PRED             /* 8 = D153_NODE */
 };
-struct vp9_token vp9_intra_mode_encodings[INTRA_MODES];
 
 const vp9_tree_index vp9_inter_mode_tree[TREE_SIZE(INTER_MODES)] = {
   -INTER_OFFSET(ZEROMV), 2,
   -INTER_OFFSET(NEARESTMV), 4,
   -INTER_OFFSET(NEARMV), -INTER_OFFSET(NEWMV)
 };
-struct vp9_token vp9_inter_mode_encodings[INTER_MODES];
 
 const vp9_tree_index vp9_partition_tree[TREE_SIZE(PARTITION_TYPES)] = {
   -PARTITION_NONE, 2,
   -PARTITION_HORZ, 4,
   -PARTITION_VERT, -PARTITION_SPLIT
 };
-struct vp9_token vp9_partition_encodings[PARTITION_TYPES];
 
 static const vp9_prob default_intra_inter_p[INTRA_INTER_CONTEXTS] = {
   9, 102, 187, 225
@@ -337,15 +334,6 @@
   -EIGHTTAP, 2,
   -EIGHTTAP_SMOOTH, -EIGHTTAP_SHARP
 };
-struct vp9_token vp9_switchable_interp_encodings[SWITCHABLE_FILTERS];
-
-void vp9_entropy_mode_init() {
-  vp9_tokens_from_tree(vp9_intra_mode_encodings, vp9_intra_mode_tree);
-  vp9_tokens_from_tree(vp9_switchable_interp_encodings,
-                       vp9_switchable_interp_tree);
-  vp9_tokens_from_tree(vp9_partition_encodings, vp9_partition_tree);
-  vp9_tokens_from_tree(vp9_inter_mode_encodings, vp9_inter_mode_tree);
-}
 
 #define COUNT_SAT 20
 #define MAX_UPDATE_FACTOR 128

diff --git a/vp9/common/vp9_entropymode.h b/vp9/common/vp9_entropymode.h
index 38b4199..df58bea 100644
--- a/vp9/common/vp9_entropymode.h
+++ b/vp9/common/vp9_entropymode.h

@@ -37,24 +37,13 @@
 extern const vp9_prob vp9_kf_uv_mode_prob[INTRA_MODES][INTRA_MODES - 1];
 extern const vp9_prob vp9_kf_y_mode_prob[INTRA_MODES][INTRA_MODES]
                                         [INTRA_MODES - 1];
-
 extern const vp9_prob vp9_kf_partition_probs[PARTITION_CONTEXTS]
                                             [PARTITION_TYPES - 1];
-
 extern const vp9_tree_index vp9_intra_mode_tree[TREE_SIZE(INTRA_MODES)];
-extern struct vp9_token vp9_intra_mode_encodings[INTRA_MODES];
-
 extern const vp9_tree_index vp9_inter_mode_tree[TREE_SIZE(INTER_MODES)];
-extern struct vp9_token vp9_inter_mode_encodings[INTER_MODES];
-
 extern const vp9_tree_index vp9_partition_tree[TREE_SIZE(PARTITION_TYPES)];
-extern struct vp9_token vp9_partition_encodings[PARTITION_TYPES];
-
 extern const vp9_tree_index vp9_switchable_interp_tree
                                 [TREE_SIZE(SWITCHABLE_FILTERS)];
-extern struct vp9_token vp9_switchable_interp_encodings[SWITCHABLE_FILTERS];
-
-void vp9_entropy_mode_init();
 
 void vp9_setup_past_independence(struct VP9Common *cm);
 

diff --git a/vp9/common/vp9_loopfilter.c b/vp9/common/vp9_loopfilter.c
index 85dd7d8..ff2bc45 100644
--- a/vp9/common/vp9_loopfilter.c
+++ b/vp9/common/vp9_loopfilter.c

@@ -353,19 +353,10 @@
 
     // TODO(yunqingwang): count in loopfilter functions should be removed.
     if (mask & 1) {
-      if ((mask_16x16_0 | mask_16x16_1) & 1) {
-        // TODO(yunqingwang): if (mask_16x16_0 & 1), then (mask_16x16_0 & 1)
-        // is always 1. Same is true for horizontal lf.
-        if ((mask_16x16_0 & mask_16x16_1) & 1) {
-          vp9_mb_lpf_vertical_edge_w_16(s, pitch, lfi0->mblim, lfi0->lim,
-                                     lfi0->hev_thr);
-        } else if (mask_16x16_0 & 1) {
-          vp9_mb_lpf_vertical_edge_w(s, pitch, lfi0->mblim, lfi0->lim,
-                                     lfi0->hev_thr);
-        } else {
-          vp9_mb_lpf_vertical_edge_w(s + 8 *pitch, pitch, lfi1->mblim,
-                                     lfi1->lim, lfi1->hev_thr);
-        }
+      if (mask_16x16_0 & 1) {
+        // if (mask_16x16_0 & 1) is 1, then (mask_16x16_1 & 1) is 1.
+        vp9_mb_lpf_vertical_edge_w_16(s, pitch, lfi0->mblim, lfi0->lim,
+                                      lfi0->hev_thr);
       }
 
       if ((mask_8x8_0 | mask_8x8_1) & 1) {
@@ -441,14 +432,10 @@
     count = 1;
     if (mask & 1) {
       if (mask_16x16 & 1) {
-        if ((mask_16x16 & 3) == 3) {
-          vp9_mb_lpf_horizontal_edge_w(s, pitch, lfi->mblim, lfi->lim,
-                                       lfi->hev_thr, 2);
-          count = 2;
-        } else {
-          vp9_mb_lpf_horizontal_edge_w(s, pitch, lfi->mblim, lfi->lim,
-                                       lfi->hev_thr, 1);
-        }
+        // If (mask_16x16 & 1) is 1, then (mask_16x16 & 3) is 3.
+        vp9_mb_lpf_horizontal_edge_w(s, pitch, lfi->mblim, lfi->lim,
+                                     lfi->hev_thr, 2);
+        count = 2;
       } else if (mask_8x8 & 1) {
         if ((mask_8x8 & 3) == 3) {
           // Next block's thresholds

diff --git a/vp9/common/vp9_rtcd_defs.sh b/vp9/common/vp9_rtcd_defs.sh
index 28d2417..627ea31 100644
--- a/vp9/common/vp9_rtcd_defs.sh
+++ b/vp9/common/vp9_rtcd_defs.sh

@@ -707,31 +707,31 @@
 
 # fdct functions
 prototype void vp9_short_fht4x4 "const int16_t *input, int16_t *output, int stride, int tx_type"
-specialize vp9_short_fht4x4 sse2
+specialize vp9_short_fht4x4 sse2 avx2
 
 prototype void vp9_short_fht8x8 "const int16_t *input, int16_t *output, int stride, int tx_type"
-specialize vp9_short_fht8x8 sse2
+specialize vp9_short_fht8x8 sse2 avx2
 
 prototype void vp9_short_fht16x16 "const int16_t *input, int16_t *output, int stride, int tx_type"
-specialize vp9_short_fht16x16 sse2
+specialize vp9_short_fht16x16 sse2 avx2
 
 prototype void vp9_fwht4x4 "const int16_t *input, int16_t *output, int stride"
 specialize vp9_fwht4x4
 
 prototype void vp9_fdct4x4 "const int16_t *input, int16_t *output, int stride"
-specialize vp9_fdct4x4 sse2
+specialize vp9_fdct4x4 sse2 avx2
 
 prototype void vp9_fdct8x8 "const int16_t *input, int16_t *output, int stride"
-specialize vp9_fdct8x8 sse2
+specialize vp9_fdct8x8 sse2 avx2
 
 prototype void vp9_fdct16x16 "const int16_t *input, int16_t *output, int stride"
-specialize vp9_fdct16x16 sse2
+specialize vp9_fdct16x16 sse2 avx2
 
 prototype void vp9_fdct32x32 "const int16_t *input, int16_t *output, int stride"
-specialize vp9_fdct32x32 sse2
+specialize vp9_fdct32x32 sse2 avx2
 
 prototype void vp9_fdct32x32_rd "const int16_t *input, int16_t *output, int stride"
-specialize vp9_fdct32x32_rd sse2
+specialize vp9_fdct32x32_rd sse2 avx2
 
 #
 # Motion search

diff --git a/vp9/common/x86/vp9_idct_intrin_sse2.c b/vp9/common/x86/vp9_idct_intrin_sse2.c
index c65184f..8fdf19a 100644
--- a/vp9/common/x86/vp9_idct_intrin_sse2.c
+++ b/vp9/common/x86/vp9_idct_intrin_sse2.c

@@ -431,6 +431,27 @@
       res3 = _mm_packs_epi32(tmp6, tmp7); \
   }
 
+#define MULTIPLICATION_AND_ADD_2(lo_0, hi_0, cst0, cst1, res0, res1) \
+  {   \
+      tmp0 = _mm_madd_epi16(lo_0, cst0); \
+      tmp1 = _mm_madd_epi16(hi_0, cst0); \
+      tmp2 = _mm_madd_epi16(lo_0, cst1); \
+      tmp3 = _mm_madd_epi16(hi_0, cst1); \
+      \
+      tmp0 = _mm_add_epi32(tmp0, rounding); \
+      tmp1 = _mm_add_epi32(tmp1, rounding); \
+      tmp2 = _mm_add_epi32(tmp2, rounding); \
+      tmp3 = _mm_add_epi32(tmp3, rounding); \
+      \
+      tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
+      tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
+      tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
+      tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
+      \
+      res0 = _mm_packs_epi32(tmp0, tmp1); \
+      res1 = _mm_packs_epi32(tmp2, tmp3); \
+  }
+
 #define IDCT8_1D  \
   /* Stage1 */      \
   { \
@@ -2801,28 +2822,329 @@
     input += 8; \
   }  \
 
+#define IDCT32_1D_34 \
+/* Stage1 */ \
+{ \
+  const __m128i zero = _mm_setzero_si128();\
+  const __m128i lo_1_31 = _mm_unpacklo_epi16(in[1], zero); \
+  const __m128i hi_1_31 = _mm_unpackhi_epi16(in[1], zero); \
+  \
+  const __m128i lo_25_7= _mm_unpacklo_epi16(zero, in[7]); \
+  const __m128i hi_25_7 = _mm_unpackhi_epi16(zero, in[7]); \
+  \
+  const __m128i lo_5_27 = _mm_unpacklo_epi16(in[5], zero); \
+  const __m128i hi_5_27 = _mm_unpackhi_epi16(in[5], zero); \
+  \
+  const __m128i lo_29_3 = _mm_unpacklo_epi16(zero, in[3]); \
+  const __m128i hi_29_3 = _mm_unpackhi_epi16(zero, in[3]); \
+  \
+  MULTIPLICATION_AND_ADD_2(lo_1_31, hi_1_31, stg1_0, \
+                         stg1_1, stp1_16, stp1_31); \
+  MULTIPLICATION_AND_ADD_2(lo_25_7, hi_25_7, stg1_6, \
+                         stg1_7, stp1_19, stp1_28); \
+  MULTIPLICATION_AND_ADD_2(lo_5_27, hi_5_27, stg1_8, \
+                         stg1_9, stp1_20, stp1_27); \
+  MULTIPLICATION_AND_ADD_2(lo_29_3, hi_29_3, stg1_14, \
+                         stg1_15, stp1_23, stp1_24); \
+} \
+\
+/* Stage2 */ \
+{ \
+  const __m128i zero = _mm_setzero_si128();\
+  const __m128i lo_2_30 = _mm_unpacklo_epi16(in[2], zero); \
+  const __m128i hi_2_30 = _mm_unpackhi_epi16(in[2], zero); \
+  \
+  const __m128i lo_26_6 = _mm_unpacklo_epi16(zero, in[6]); \
+  const __m128i hi_26_6 = _mm_unpackhi_epi16(zero, in[6]); \
+  \
+  MULTIPLICATION_AND_ADD_2(lo_2_30, hi_2_30, stg2_0, \
+                         stg2_1, stp2_8, stp2_15); \
+  MULTIPLICATION_AND_ADD_2(lo_26_6, hi_26_6, stg2_6, \
+                         stg2_7, stp2_11, stp2_12); \
+  \
+  stp2_16 = stp1_16; \
+  stp2_19 = stp1_19; \
+  \
+  stp2_20 = stp1_20; \
+  stp2_23 = stp1_23; \
+  \
+  stp2_24 = stp1_24; \
+  stp2_27 = stp1_27; \
+  \
+  stp2_28 = stp1_28; \
+  stp2_31 = stp1_31; \
+} \
+\
+/* Stage3 */ \
+{ \
+  const __m128i zero = _mm_setzero_si128();\
+  const __m128i lo_4_28 = _mm_unpacklo_epi16(in[4], zero); \
+  const __m128i hi_4_28 = _mm_unpackhi_epi16(in[4], zero); \
+  \
+  const __m128i lo_17_30 = _mm_unpacklo_epi16(stp1_16, stp1_31); \
+  const __m128i hi_17_30 = _mm_unpackhi_epi16(stp1_16, stp1_31); \
+  const __m128i lo_18_29 = _mm_unpacklo_epi16(stp1_19, stp1_28); \
+  const __m128i hi_18_29 = _mm_unpackhi_epi16(stp1_19, stp1_28); \
+  \
+  const __m128i lo_21_26 = _mm_unpacklo_epi16(stp1_20, stp1_27); \
+  const __m128i hi_21_26 = _mm_unpackhi_epi16(stp1_20, stp1_27); \
+  const __m128i lo_22_25 = _mm_unpacklo_epi16(stp1_23, stp1_24); \
+  const __m128i hi_22_25 = _mm_unpackhi_epi16(stp1_23, stp2_24); \
+  \
+  MULTIPLICATION_AND_ADD_2(lo_4_28, hi_4_28, stg3_0, \
+                         stg3_1, stp1_4, stp1_7); \
+  \
+  stp1_8 = stp2_8; \
+  stp1_11 = stp2_11; \
+  stp1_12 = stp2_12; \
+  stp1_15 = stp2_15; \
+  \
+  MULTIPLICATION_AND_ADD(lo_17_30, hi_17_30, lo_18_29, hi_18_29, stg3_4, \
+                         stg3_5, stg3_6, stg3_4, stp1_17, stp1_30, \
+                         stp1_18, stp1_29) \
+  MULTIPLICATION_AND_ADD(lo_21_26, hi_21_26, lo_22_25, hi_22_25, stg3_8, \
+                         stg3_9, stg3_10, stg3_8, stp1_21, stp1_26, \
+                         stp1_22, stp1_25) \
+  \
+  stp1_16 = stp2_16; \
+  stp1_31 = stp2_31; \
+  stp1_19 = stp2_19; \
+  stp1_20 = stp2_20; \
+  stp1_23 = stp2_23; \
+  stp1_24 = stp2_24; \
+  stp1_27 = stp2_27; \
+  stp1_28 = stp2_28; \
+} \
+\
+/* Stage4 */ \
+{ \
+  const __m128i zero = _mm_setzero_si128();\
+  const __m128i lo_0_16 = _mm_unpacklo_epi16(in[0], zero); \
+  const __m128i hi_0_16 = _mm_unpackhi_epi16(in[0], zero); \
+  \
+  const __m128i lo_9_14 = _mm_unpacklo_epi16(stp2_8, stp2_15); \
+  const __m128i hi_9_14 = _mm_unpackhi_epi16(stp2_8, stp2_15); \
+  const __m128i lo_10_13 = _mm_unpacklo_epi16(stp2_11, stp2_12); \
+  const __m128i hi_10_13 = _mm_unpackhi_epi16(stp2_11, stp2_12); \
+  \
+  MULTIPLICATION_AND_ADD_2(lo_0_16, hi_0_16, stg4_0, \
+                         stg4_1, stp2_0, stp2_1); \
+  \
+  stp2_4 = stp1_4; \
+  stp2_5 = stp1_4; \
+  stp2_6 = stp1_7; \
+  stp2_7 = stp1_7; \
+  \
+  MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4, \
+                         stg4_5, stg4_6, stg4_4, stp2_9, stp2_14, \
+                         stp2_10, stp2_13) \
+  \
+  stp2_8 = stp1_8; \
+  stp2_15 = stp1_15; \
+  stp2_11 = stp1_11; \
+  stp2_12 = stp1_12; \
+  \
+  stp2_16 = _mm_add_epi16(stp1_16, stp1_19); \
+  stp2_17 = _mm_add_epi16(stp1_17, stp1_18); \
+  stp2_18 = _mm_sub_epi16(stp1_17, stp1_18); \
+  stp2_19 = _mm_sub_epi16(stp1_16, stp1_19); \
+  stp2_20 = _mm_sub_epi16(stp1_23, stp1_20); \
+  stp2_21 = _mm_sub_epi16(stp1_22, stp1_21); \
+  stp2_22 = _mm_add_epi16(stp1_22, stp1_21); \
+  stp2_23 = _mm_add_epi16(stp1_23, stp1_20); \
+  \
+  stp2_24 = _mm_add_epi16(stp1_24, stp1_27); \
+  stp2_25 = _mm_add_epi16(stp1_25, stp1_26); \
+  stp2_26 = _mm_sub_epi16(stp1_25, stp1_26); \
+  stp2_27 = _mm_sub_epi16(stp1_24, stp1_27); \
+  stp2_28 = _mm_sub_epi16(stp1_31, stp1_28); \
+  stp2_29 = _mm_sub_epi16(stp1_30, stp1_29); \
+  stp2_30 = _mm_add_epi16(stp1_29, stp1_30); \
+  stp2_31 = _mm_add_epi16(stp1_28, stp1_31); \
+} \
+\
+/* Stage5 */ \
+{ \
+  const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \
+  const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \
+  const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); \
+  const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); \
+  \
+  const __m128i lo_19_28 = _mm_unpacklo_epi16(stp2_19, stp2_28); \
+  const __m128i hi_19_28 = _mm_unpackhi_epi16(stp2_19, stp2_28); \
+  const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \
+  const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \
+  \
+  const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \
+  const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \
+  \
+  stp1_0 = stp2_0; \
+  stp1_1 = stp2_1; \
+  stp1_2 = stp2_1; \
+  stp1_3 = stp2_0; \
+  \
+  tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \
+  tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \
+  tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \
+  tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \
+  \
+  tmp0 = _mm_add_epi32(tmp0, rounding); \
+  tmp1 = _mm_add_epi32(tmp1, rounding); \
+  tmp2 = _mm_add_epi32(tmp2, rounding); \
+  tmp3 = _mm_add_epi32(tmp3, rounding); \
+  \
+  tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
+  tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
+  tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
+  tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
+  \
+  stp1_5 = _mm_packs_epi32(tmp0, tmp1); \
+  stp1_6 = _mm_packs_epi32(tmp2, tmp3); \
+  \
+  stp1_4 = stp2_4; \
+  stp1_7 = stp2_7; \
+  \
+  stp1_8 = _mm_add_epi16(stp2_8, stp2_11); \
+  stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \
+  stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \
+  stp1_11 = _mm_sub_epi16(stp2_8, stp2_11); \
+  stp1_12 = _mm_sub_epi16(stp2_15, stp2_12); \
+  stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \
+  stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \
+  stp1_15 = _mm_add_epi16(stp2_15, stp2_12); \
+  \
+  stp1_16 = stp2_16; \
+  stp1_17 = stp2_17; \
+  \
+  MULTIPLICATION_AND_ADD(lo_18_29, hi_18_29, lo_19_28, hi_19_28, stg4_4, \
+                         stg4_5, stg4_4, stg4_5, stp1_18, stp1_29, \
+                         stp1_19, stp1_28) \
+  MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg4_6, \
+                         stg4_4, stg4_6, stg4_4, stp1_20, stp1_27, \
+                         stp1_21, stp1_26) \
+  \
+  stp1_22 = stp2_22; \
+  stp1_23 = stp2_23; \
+  stp1_24 = stp2_24; \
+  stp1_25 = stp2_25; \
+  stp1_30 = stp2_30; \
+  stp1_31 = stp2_31; \
+} \
+\
+/* Stage6 */ \
+{ \
+  const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
+  const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
+  const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \
+  const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \
+  \
+  stp2_0 = _mm_add_epi16(stp1_0, stp1_7); \
+  stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \
+  stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \
+  stp2_3 = _mm_add_epi16(stp1_3, stp1_4); \
+  stp2_4 = _mm_sub_epi16(stp1_3, stp1_4); \
+  stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \
+  stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \
+  stp2_7 = _mm_sub_epi16(stp1_0, stp1_7); \
+  \
+  stp2_8 = stp1_8; \
+  stp2_9 = stp1_9; \
+  stp2_14 = stp1_14; \
+  stp2_15 = stp1_15; \
+  \
+  MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, \
+                         stg6_0, stg4_0, stg6_0, stg4_0, stp2_10, \
+                         stp2_13, stp2_11, stp2_12) \
+  \
+  stp2_16 = _mm_add_epi16(stp1_16, stp1_23); \
+  stp2_17 = _mm_add_epi16(stp1_17, stp1_22); \
+  stp2_18 = _mm_add_epi16(stp1_18, stp1_21); \
+  stp2_19 = _mm_add_epi16(stp1_19, stp1_20); \
+  stp2_20 = _mm_sub_epi16(stp1_19, stp1_20); \
+  stp2_21 = _mm_sub_epi16(stp1_18, stp1_21); \
+  stp2_22 = _mm_sub_epi16(stp1_17, stp1_22); \
+  stp2_23 = _mm_sub_epi16(stp1_16, stp1_23); \
+  \
+  stp2_24 = _mm_sub_epi16(stp1_31, stp1_24); \
+  stp2_25 = _mm_sub_epi16(stp1_30, stp1_25); \
+  stp2_26 = _mm_sub_epi16(stp1_29, stp1_26); \
+  stp2_27 = _mm_sub_epi16(stp1_28, stp1_27); \
+  stp2_28 = _mm_add_epi16(stp1_27, stp1_28); \
+  stp2_29 = _mm_add_epi16(stp1_26, stp1_29); \
+  stp2_30 = _mm_add_epi16(stp1_25, stp1_30); \
+  stp2_31 = _mm_add_epi16(stp1_24, stp1_31); \
+} \
+\
+/* Stage7 */ \
+{ \
+  const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \
+  const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \
+  const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \
+  const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \
+  \
+  const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); \
+  const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); \
+  const __m128i lo_23_24 = _mm_unpacklo_epi16(stp2_23, stp2_24); \
+  const __m128i hi_23_24 = _mm_unpackhi_epi16(stp2_23, stp2_24); \
+  \
+  stp1_0 = _mm_add_epi16(stp2_0, stp2_15); \
+  stp1_1 = _mm_add_epi16(stp2_1, stp2_14); \
+  stp1_2 = _mm_add_epi16(stp2_2, stp2_13); \
+  stp1_3 = _mm_add_epi16(stp2_3, stp2_12); \
+  stp1_4 = _mm_add_epi16(stp2_4, stp2_11); \
+  stp1_5 = _mm_add_epi16(stp2_5, stp2_10); \
+  stp1_6 = _mm_add_epi16(stp2_6, stp2_9); \
+  stp1_7 = _mm_add_epi16(stp2_7, stp2_8); \
+  stp1_8 = _mm_sub_epi16(stp2_7, stp2_8); \
+  stp1_9 = _mm_sub_epi16(stp2_6, stp2_9); \
+  stp1_10 = _mm_sub_epi16(stp2_5, stp2_10); \
+  stp1_11 = _mm_sub_epi16(stp2_4, stp2_11); \
+  stp1_12 = _mm_sub_epi16(stp2_3, stp2_12); \
+  stp1_13 = _mm_sub_epi16(stp2_2, stp2_13); \
+  stp1_14 = _mm_sub_epi16(stp2_1, stp2_14); \
+  stp1_15 = _mm_sub_epi16(stp2_0, stp2_15); \
+  \
+  stp1_16 = stp2_16; \
+  stp1_17 = stp2_17; \
+  stp1_18 = stp2_18; \
+  stp1_19 = stp2_19; \
+  \
+  MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg6_0, \
+                         stg4_0, stg6_0, stg4_0, stp1_20, stp1_27, \
+                         stp1_21, stp1_26) \
+  MULTIPLICATION_AND_ADD(lo_22_25, hi_22_25, lo_23_24, hi_23_24, stg6_0, \
+                         stg4_0, stg6_0, stg4_0, stp1_22, stp1_25, \
+                         stp1_23, stp1_24) \
+  \
+  stp1_28 = stp2_28; \
+  stp1_29 = stp2_29; \
+  stp1_30 = stp2_30; \
+  stp1_31 = stp2_31; \
+}
+
+
 #define IDCT32_1D \
 /* Stage1 */ \
 { \
-  const __m128i lo_1_31 = _mm_unpacklo_epi16(in1, in31); \
-  const __m128i hi_1_31 = _mm_unpackhi_epi16(in1, in31); \
-  const __m128i lo_17_15 = _mm_unpacklo_epi16(in17, in15); \
-  const __m128i hi_17_15 = _mm_unpackhi_epi16(in17, in15); \
+  const __m128i lo_1_31 = _mm_unpacklo_epi16(in[1], in[31]); \
+  const __m128i hi_1_31 = _mm_unpackhi_epi16(in[1], in[31]); \
+  const __m128i lo_17_15 = _mm_unpacklo_epi16(in[17], in[15]); \
+  const __m128i hi_17_15 = _mm_unpackhi_epi16(in[17], in[15]); \
   \
-  const __m128i lo_9_23 = _mm_unpacklo_epi16(in9, in23); \
-  const __m128i hi_9_23 = _mm_unpackhi_epi16(in9, in23); \
-  const __m128i lo_25_7= _mm_unpacklo_epi16(in25, in7); \
-  const __m128i hi_25_7 = _mm_unpackhi_epi16(in25, in7); \
+  const __m128i lo_9_23 = _mm_unpacklo_epi16(in[9], in[23]); \
+  const __m128i hi_9_23 = _mm_unpackhi_epi16(in[9], in[23]); \
+  const __m128i lo_25_7= _mm_unpacklo_epi16(in[25], in[7]); \
+  const __m128i hi_25_7 = _mm_unpackhi_epi16(in[25], in[7]); \
   \
-  const __m128i lo_5_27 = _mm_unpacklo_epi16(in5, in27); \
-  const __m128i hi_5_27 = _mm_unpackhi_epi16(in5, in27); \
-  const __m128i lo_21_11 = _mm_unpacklo_epi16(in21, in11); \
-  const __m128i hi_21_11 = _mm_unpackhi_epi16(in21, in11); \
+  const __m128i lo_5_27 = _mm_unpacklo_epi16(in[5], in[27]); \
+  const __m128i hi_5_27 = _mm_unpackhi_epi16(in[5], in[27]); \
+  const __m128i lo_21_11 = _mm_unpacklo_epi16(in[21], in[11]); \
+  const __m128i hi_21_11 = _mm_unpackhi_epi16(in[21], in[11]); \
   \
-  const __m128i lo_13_19 = _mm_unpacklo_epi16(in13, in19); \
-  const __m128i hi_13_19 = _mm_unpackhi_epi16(in13, in19); \
-  const __m128i lo_29_3 = _mm_unpacklo_epi16(in29, in3); \
-  const __m128i hi_29_3 = _mm_unpackhi_epi16(in29, in3); \
+  const __m128i lo_13_19 = _mm_unpacklo_epi16(in[13], in[19]); \
+  const __m128i hi_13_19 = _mm_unpackhi_epi16(in[13], in[19]); \
+  const __m128i lo_29_3 = _mm_unpacklo_epi16(in[29], in[3]); \
+  const __m128i hi_29_3 = _mm_unpackhi_epi16(in[29], in[3]); \
   \
   MULTIPLICATION_AND_ADD(lo_1_31, hi_1_31, lo_17_15, hi_17_15, stg1_0, \
                          stg1_1, stg1_2, stg1_3, stp1_16, stp1_31, \
@@ -2840,15 +3162,15 @@
 \
 /* Stage2 */ \
 { \
-  const __m128i lo_2_30 = _mm_unpacklo_epi16(in2, in30); \
-  const __m128i hi_2_30 = _mm_unpackhi_epi16(in2, in30); \
-  const __m128i lo_18_14 = _mm_unpacklo_epi16(in18, in14); \
-  const __m128i hi_18_14 = _mm_unpackhi_epi16(in18, in14); \
+  const __m128i lo_2_30 = _mm_unpacklo_epi16(in[2], in[30]); \
+  const __m128i hi_2_30 = _mm_unpackhi_epi16(in[2], in[30]); \
+  const __m128i lo_18_14 = _mm_unpacklo_epi16(in[18], in[14]); \
+  const __m128i hi_18_14 = _mm_unpackhi_epi16(in[18], in[14]); \
   \
-  const __m128i lo_10_22 = _mm_unpacklo_epi16(in10, in22); \
-  const __m128i hi_10_22 = _mm_unpackhi_epi16(in10, in22); \
-  const __m128i lo_26_6 = _mm_unpacklo_epi16(in26, in6); \
-  const __m128i hi_26_6 = _mm_unpackhi_epi16(in26, in6); \
+  const __m128i lo_10_22 = _mm_unpacklo_epi16(in[10], in[22]); \
+  const __m128i hi_10_22 = _mm_unpackhi_epi16(in[10], in[22]); \
+  const __m128i lo_26_6 = _mm_unpacklo_epi16(in[26], in[6]); \
+  const __m128i hi_26_6 = _mm_unpackhi_epi16(in[26], in[6]); \
   \
   MULTIPLICATION_AND_ADD(lo_2_30, hi_2_30, lo_18_14, hi_18_14, stg2_0, \
                          stg2_1, stg2_2, stg2_3, stp2_8, stp2_15, stp2_9, \
@@ -2880,10 +3202,10 @@
 \
 /* Stage3 */ \
 { \
-  const __m128i lo_4_28 = _mm_unpacklo_epi16(in4, in28); \
-  const __m128i hi_4_28 = _mm_unpackhi_epi16(in4, in28); \
-  const __m128i lo_20_12 = _mm_unpacklo_epi16(in20, in12); \
-  const __m128i hi_20_12 = _mm_unpackhi_epi16(in20, in12); \
+  const __m128i lo_4_28 = _mm_unpacklo_epi16(in[4], in[28]); \
+  const __m128i hi_4_28 = _mm_unpackhi_epi16(in[4], in[28]); \
+  const __m128i lo_20_12 = _mm_unpacklo_epi16(in[20], in[12]); \
+  const __m128i hi_20_12 = _mm_unpackhi_epi16(in[20], in[12]); \
   \
   const __m128i lo_17_30 = _mm_unpacklo_epi16(stp2_17, stp2_30); \
   const __m128i hi_17_30 = _mm_unpackhi_epi16(stp2_17, stp2_30); \
@@ -2927,10 +3249,10 @@
 \
 /* Stage4 */ \
 { \
-  const __m128i lo_0_16 = _mm_unpacklo_epi16(in0, in16); \
-  const __m128i hi_0_16 = _mm_unpackhi_epi16(in0, in16); \
-  const __m128i lo_8_24 = _mm_unpacklo_epi16(in8, in24); \
-  const __m128i hi_8_24 = _mm_unpackhi_epi16(in8, in24); \
+  const __m128i lo_0_16 = _mm_unpacklo_epi16(in[0], in[16]); \
+  const __m128i hi_0_16 = _mm_unpackhi_epi16(in[0], in[16]); \
+  const __m128i lo_8_24 = _mm_unpacklo_epi16(in[8], in[24]); \
+  const __m128i hi_8_24 = _mm_unpackhi_epi16(in[8], in[24]); \
   \
   const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \
   const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); \
@@ -3187,10 +3509,7 @@
 
   const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
 
-  __m128i in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11, in12,
-          in13, in14, in15, in16, in17, in18, in19, in20, in21, in22, in23,
-          in24, in25, in26, in27, in28, in29, in30, in31;
-  __m128i col[128];
+  __m128i in[32], col[32];
   __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7,
           stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
           stp1_16, stp1_17, stp1_18, stp1_19, stp1_20, stp1_21, stp1_22,
@@ -3202,296 +3521,225 @@
           stp2_23, stp2_24, stp2_25, stp2_26, stp2_27, stp2_28, stp2_29,
           stp2_30, stp2_31;
   __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
-  int i, j, i32;
+  int i;
+  // Load input data.
+  LOAD_DQCOEFF(in[0], input);
+  LOAD_DQCOEFF(in[8], input);
+  LOAD_DQCOEFF(in[16], input);
+  LOAD_DQCOEFF(in[24], input);
+  LOAD_DQCOEFF(in[1], input);
+  LOAD_DQCOEFF(in[9], input);
+  LOAD_DQCOEFF(in[17], input);
+  LOAD_DQCOEFF(in[25], input);
+  LOAD_DQCOEFF(in[2], input);
+  LOAD_DQCOEFF(in[10], input);
+  LOAD_DQCOEFF(in[18], input);
+  LOAD_DQCOEFF(in[26], input);
+  LOAD_DQCOEFF(in[3], input);
+  LOAD_DQCOEFF(in[11], input);
+  LOAD_DQCOEFF(in[19], input);
+  LOAD_DQCOEFF(in[27], input);
 
-  // We work on a 8x32 block each time, and loop 8 times for 2-D 32x32 idct.
-  for (i = 0; i < 8; i++) {
-    i32 = (i << 5);
-    if (i == 0) {
-      // First 1-D idct: first 8 rows
-      // Load input data.
-      LOAD_DQCOEFF(in0, input);
-      LOAD_DQCOEFF(in8, input);
-      LOAD_DQCOEFF(in16, input);
-      LOAD_DQCOEFF(in24, input);
-      LOAD_DQCOEFF(in1, input);
-      LOAD_DQCOEFF(in9, input);
-      LOAD_DQCOEFF(in17, input);
-      LOAD_DQCOEFF(in25, input);
-      LOAD_DQCOEFF(in2, input);
-      LOAD_DQCOEFF(in10, input);
-      LOAD_DQCOEFF(in18, input);
-      LOAD_DQCOEFF(in26, input);
-      LOAD_DQCOEFF(in3, input);
-      LOAD_DQCOEFF(in11, input);
-      LOAD_DQCOEFF(in19, input);
-      LOAD_DQCOEFF(in27, input);
+  LOAD_DQCOEFF(in[4], input);
+  LOAD_DQCOEFF(in[12], input);
+  LOAD_DQCOEFF(in[20], input);
+  LOAD_DQCOEFF(in[28], input);
+  LOAD_DQCOEFF(in[5], input);
+  LOAD_DQCOEFF(in[13], input);
+  LOAD_DQCOEFF(in[21], input);
+  LOAD_DQCOEFF(in[29], input);
+  LOAD_DQCOEFF(in[6], input);
+  LOAD_DQCOEFF(in[14], input);
+  LOAD_DQCOEFF(in[22], input);
+  LOAD_DQCOEFF(in[30], input);
+  LOAD_DQCOEFF(in[7], input);
+  LOAD_DQCOEFF(in[15], input);
+  LOAD_DQCOEFF(in[23], input);
+  LOAD_DQCOEFF(in[31], input);
 
-      LOAD_DQCOEFF(in4, input);
-      LOAD_DQCOEFF(in12, input);
-      LOAD_DQCOEFF(in20, input);
-      LOAD_DQCOEFF(in28, input);
-      LOAD_DQCOEFF(in5, input);
-      LOAD_DQCOEFF(in13, input);
-      LOAD_DQCOEFF(in21, input);
-      LOAD_DQCOEFF(in29, input);
-      LOAD_DQCOEFF(in6, input);
-      LOAD_DQCOEFF(in14, input);
-      LOAD_DQCOEFF(in22, input);
-      LOAD_DQCOEFF(in30, input);
-      LOAD_DQCOEFF(in7, input);
-      LOAD_DQCOEFF(in15, input);
-      LOAD_DQCOEFF(in23, input);
-      LOAD_DQCOEFF(in31, input);
+  array_transpose_8x8(in, in);
+  array_transpose_8x8(in+8, in+8);
+  array_transpose_8x8(in+16, in+16);
+  array_transpose_8x8(in+24, in+24);
 
-      // Transpose 32x8 block to 8x32 block
-      TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
-                    in4, in5, in6, in7);
-      TRANSPOSE_8X8(in8, in9, in10, in11, in12, in13, in14, in15, in8, in9,
-                    in10, in11, in12, in13, in14, in15);
-      TRANSPOSE_8X8(in16, in17, in18, in19, in20, in21, in22, in23, in16, in17,
-                    in18, in19, in20, in21, in22, in23);
-      TRANSPOSE_8X8(in24, in25, in26, in27, in28, in29, in30, in31, in24, in25,
-                    in26, in27, in28, in29, in30, in31);
-    } else if (i < 4) {
-      // First 1-D idct: next 24 zero-coeff rows
-      col[i32 + 0] = _mm_setzero_si128();
-      col[i32 + 1] = _mm_setzero_si128();
-      col[i32 + 2] = _mm_setzero_si128();
-      col[i32 + 3] = _mm_setzero_si128();
-      col[i32 + 4] = _mm_setzero_si128();
-      col[i32 + 5] = _mm_setzero_si128();
-      col[i32 + 6] = _mm_setzero_si128();
-      col[i32 + 7] = _mm_setzero_si128();
-      col[i32 + 8] = _mm_setzero_si128();
-      col[i32 + 9] = _mm_setzero_si128();
-      col[i32 + 10] = _mm_setzero_si128();
-      col[i32 + 11] = _mm_setzero_si128();
-      col[i32 + 12] = _mm_setzero_si128();
-      col[i32 + 13] = _mm_setzero_si128();
-      col[i32 + 14] = _mm_setzero_si128();
-      col[i32 + 15] = _mm_setzero_si128();
-      col[i32 + 16] = _mm_setzero_si128();
-      col[i32 + 17] = _mm_setzero_si128();
-      col[i32 + 18] = _mm_setzero_si128();
-      col[i32 + 19] = _mm_setzero_si128();
-      col[i32 + 20] = _mm_setzero_si128();
-      col[i32 + 21] = _mm_setzero_si128();
-      col[i32 + 22] = _mm_setzero_si128();
-      col[i32 + 23] = _mm_setzero_si128();
-      col[i32 + 24] = _mm_setzero_si128();
-      col[i32 + 25] = _mm_setzero_si128();
-      col[i32 + 26] = _mm_setzero_si128();
-      col[i32 + 27] = _mm_setzero_si128();
-      col[i32 + 28] = _mm_setzero_si128();
-      col[i32 + 29] = _mm_setzero_si128();
-      col[i32 + 30] = _mm_setzero_si128();
-      col[i32 + 31] = _mm_setzero_si128();
-      continue;
-    } else {
-      // Second 1-D idct
-      j = i - 4;
+  IDCT32_1D
 
-      // Transpose 32x8 block to 8x32 block
-      TRANSPOSE_8X8(col[j * 8 + 0], col[j * 8 + 1], col[j * 8 + 2],
-                    col[j * 8 + 3], col[j * 8 + 4], col[j * 8 + 5],
-                    col[j * 8 + 6], col[j * 8 + 7], in0, in1, in2, in3, in4,
-                    in5, in6, in7);
-      j += 4;
-      TRANSPOSE_8X8(col[j * 8 + 0], col[j * 8 + 1], col[j * 8 + 2],
-                    col[j * 8 + 3], col[j * 8 + 4], col[j * 8 + 5],
-                    col[j * 8 + 6], col[j * 8 + 7], in8, in9, in10,
-                    in11, in12, in13, in14, in15);
-      j += 4;
-      TRANSPOSE_8X8(col[j * 8 + 0], col[j * 8 + 1], col[j * 8 + 2],
-                    col[j * 8 + 3], col[j * 8 + 4], col[j * 8 + 5],
-                    col[j * 8 + 6], col[j * 8 + 7], in16, in17, in18,
-                    in19, in20, in21, in22, in23);
-      j += 4;
-      TRANSPOSE_8X8(col[j * 8 + 0], col[j * 8 + 1], col[j * 8 + 2],
-                    col[j * 8 + 3], col[j * 8 + 4], col[j * 8 + 5],
-                    col[j * 8 + 6], col[j * 8 + 7], in24, in25, in26, in27,
-                    in28, in29, in30, in31);
-    }
-
-    IDCT32_1D
-
-    // final stage
-    if (i < 4) {
-      // 1_D: Store 32 intermediate results for each 8x32 block.
-      col[i32 + 0] = _mm_add_epi16(stp1_0, stp1_31);
-      col[i32 + 1] = _mm_add_epi16(stp1_1, stp1_30);
-      col[i32 + 2] = _mm_add_epi16(stp1_2, stp1_29);
-      col[i32 + 3] = _mm_add_epi16(stp1_3, stp1_28);
-      col[i32 + 4] = _mm_add_epi16(stp1_4, stp1_27);
-      col[i32 + 5] = _mm_add_epi16(stp1_5, stp1_26);
-      col[i32 + 6] = _mm_add_epi16(stp1_6, stp1_25);
-      col[i32 + 7] = _mm_add_epi16(stp1_7, stp1_24);
-      col[i32 + 8] = _mm_add_epi16(stp1_8, stp1_23);
-      col[i32 + 9] = _mm_add_epi16(stp1_9, stp1_22);
-      col[i32 + 10] = _mm_add_epi16(stp1_10, stp1_21);
-      col[i32 + 11] = _mm_add_epi16(stp1_11, stp1_20);
-      col[i32 + 12] = _mm_add_epi16(stp1_12, stp1_19);
-      col[i32 + 13] = _mm_add_epi16(stp1_13, stp1_18);
-      col[i32 + 14] = _mm_add_epi16(stp1_14, stp1_17);
-      col[i32 + 15] = _mm_add_epi16(stp1_15, stp1_16);
-      col[i32 + 16] = _mm_sub_epi16(stp1_15, stp1_16);
-      col[i32 + 17] = _mm_sub_epi16(stp1_14, stp1_17);
-      col[i32 + 18] = _mm_sub_epi16(stp1_13, stp1_18);
-      col[i32 + 19] = _mm_sub_epi16(stp1_12, stp1_19);
-      col[i32 + 20] = _mm_sub_epi16(stp1_11, stp1_20);
-      col[i32 + 21] = _mm_sub_epi16(stp1_10, stp1_21);
-      col[i32 + 22] = _mm_sub_epi16(stp1_9, stp1_22);
-      col[i32 + 23] = _mm_sub_epi16(stp1_8, stp1_23);
-      col[i32 + 24] = _mm_sub_epi16(stp1_7, stp1_24);
-      col[i32 + 25] = _mm_sub_epi16(stp1_6, stp1_25);
-      col[i32 + 26] = _mm_sub_epi16(stp1_5, stp1_26);
-      col[i32 + 27] = _mm_sub_epi16(stp1_4, stp1_27);
-      col[i32 + 28] = _mm_sub_epi16(stp1_3, stp1_28);
-      col[i32 + 29] = _mm_sub_epi16(stp1_2, stp1_29);
-      col[i32 + 30] = _mm_sub_epi16(stp1_1, stp1_30);
-      col[i32 + 31] = _mm_sub_epi16(stp1_0, stp1_31);
-    } else {
+  // 1_D: Store 32 intermediate results for each 8x32 block.
+  col[0] = _mm_add_epi16(stp1_0, stp1_31);
+  col[1] = _mm_add_epi16(stp1_1, stp1_30);
+  col[2] = _mm_add_epi16(stp1_2, stp1_29);
+  col[3] = _mm_add_epi16(stp1_3, stp1_28);
+  col[4] = _mm_add_epi16(stp1_4, stp1_27);
+  col[5] = _mm_add_epi16(stp1_5, stp1_26);
+  col[6] = _mm_add_epi16(stp1_6, stp1_25);
+  col[7] = _mm_add_epi16(stp1_7, stp1_24);
+  col[8] = _mm_add_epi16(stp1_8, stp1_23);
+  col[9] = _mm_add_epi16(stp1_9, stp1_22);
+  col[10] = _mm_add_epi16(stp1_10, stp1_21);
+  col[11] = _mm_add_epi16(stp1_11, stp1_20);
+  col[12] = _mm_add_epi16(stp1_12, stp1_19);
+  col[13] = _mm_add_epi16(stp1_13, stp1_18);
+  col[14] = _mm_add_epi16(stp1_14, stp1_17);
+  col[15] = _mm_add_epi16(stp1_15, stp1_16);
+  col[16] = _mm_sub_epi16(stp1_15, stp1_16);
+  col[17] = _mm_sub_epi16(stp1_14, stp1_17);
+  col[18] = _mm_sub_epi16(stp1_13, stp1_18);
+  col[19] = _mm_sub_epi16(stp1_12, stp1_19);
+  col[20] = _mm_sub_epi16(stp1_11, stp1_20);
+  col[21] = _mm_sub_epi16(stp1_10, stp1_21);
+  col[22] = _mm_sub_epi16(stp1_9, stp1_22);
+  col[23] = _mm_sub_epi16(stp1_8, stp1_23);
+  col[24] = _mm_sub_epi16(stp1_7, stp1_24);
+  col[25] = _mm_sub_epi16(stp1_6, stp1_25);
+  col[26] = _mm_sub_epi16(stp1_5, stp1_26);
+  col[27] = _mm_sub_epi16(stp1_4, stp1_27);
+  col[28] = _mm_sub_epi16(stp1_3, stp1_28);
+  col[29] = _mm_sub_epi16(stp1_2, stp1_29);
+  col[30] = _mm_sub_epi16(stp1_1, stp1_30);
+  col[31] = _mm_sub_epi16(stp1_0, stp1_31);
+  for (i = 0; i < 4; i++) {
       const __m128i zero = _mm_setzero_si128();
+      // Transpose 32x8 block to 8x32 block
+      array_transpose_8x8(col+i*8, in);
+      IDCT32_1D_34
 
       // 2_D: Calculate the results and store them to destination.
-      in0 = _mm_add_epi16(stp1_0, stp1_31);
-      in1 = _mm_add_epi16(stp1_1, stp1_30);
-      in2 = _mm_add_epi16(stp1_2, stp1_29);
-      in3 = _mm_add_epi16(stp1_3, stp1_28);
-      in4 = _mm_add_epi16(stp1_4, stp1_27);
-      in5 = _mm_add_epi16(stp1_5, stp1_26);
-      in6 = _mm_add_epi16(stp1_6, stp1_25);
-      in7 = _mm_add_epi16(stp1_7, stp1_24);
-      in8 = _mm_add_epi16(stp1_8, stp1_23);
-      in9 = _mm_add_epi16(stp1_9, stp1_22);
-      in10 = _mm_add_epi16(stp1_10, stp1_21);
-      in11 = _mm_add_epi16(stp1_11, stp1_20);
-      in12 = _mm_add_epi16(stp1_12, stp1_19);
-      in13 = _mm_add_epi16(stp1_13, stp1_18);
-      in14 = _mm_add_epi16(stp1_14, stp1_17);
-      in15 = _mm_add_epi16(stp1_15, stp1_16);
-      in16 = _mm_sub_epi16(stp1_15, stp1_16);
-      in17 = _mm_sub_epi16(stp1_14, stp1_17);
-      in18 = _mm_sub_epi16(stp1_13, stp1_18);
-      in19 = _mm_sub_epi16(stp1_12, stp1_19);
-      in20 = _mm_sub_epi16(stp1_11, stp1_20);
-      in21 = _mm_sub_epi16(stp1_10, stp1_21);
-      in22 = _mm_sub_epi16(stp1_9, stp1_22);
-      in23 = _mm_sub_epi16(stp1_8, stp1_23);
-      in24 = _mm_sub_epi16(stp1_7, stp1_24);
-      in25 = _mm_sub_epi16(stp1_6, stp1_25);
-      in26 = _mm_sub_epi16(stp1_5, stp1_26);
-      in27 = _mm_sub_epi16(stp1_4, stp1_27);
-      in28 = _mm_sub_epi16(stp1_3, stp1_28);
-      in29 = _mm_sub_epi16(stp1_2, stp1_29);
-      in30 = _mm_sub_epi16(stp1_1, stp1_30);
-      in31 = _mm_sub_epi16(stp1_0, stp1_31);
+      in[0] = _mm_add_epi16(stp1_0, stp1_31);
+      in[1] = _mm_add_epi16(stp1_1, stp1_30);
+      in[2] = _mm_add_epi16(stp1_2, stp1_29);
+      in[3] = _mm_add_epi16(stp1_3, stp1_28);
+      in[4] = _mm_add_epi16(stp1_4, stp1_27);
+      in[5] = _mm_add_epi16(stp1_5, stp1_26);
+      in[6] = _mm_add_epi16(stp1_6, stp1_25);
+      in[7] = _mm_add_epi16(stp1_7, stp1_24);
+      in[8] = _mm_add_epi16(stp1_8, stp1_23);
+      in[9] = _mm_add_epi16(stp1_9, stp1_22);
+      in[10] = _mm_add_epi16(stp1_10, stp1_21);
+      in[11] = _mm_add_epi16(stp1_11, stp1_20);
+      in[12] = _mm_add_epi16(stp1_12, stp1_19);
+      in[13] = _mm_add_epi16(stp1_13, stp1_18);
+      in[14] = _mm_add_epi16(stp1_14, stp1_17);
+      in[15] = _mm_add_epi16(stp1_15, stp1_16);
+      in[16] = _mm_sub_epi16(stp1_15, stp1_16);
+      in[17] = _mm_sub_epi16(stp1_14, stp1_17);
+      in[18] = _mm_sub_epi16(stp1_13, stp1_18);
+      in[19] = _mm_sub_epi16(stp1_12, stp1_19);
+      in[20] = _mm_sub_epi16(stp1_11, stp1_20);
+      in[21] = _mm_sub_epi16(stp1_10, stp1_21);
+      in[22] = _mm_sub_epi16(stp1_9, stp1_22);
+      in[23] = _mm_sub_epi16(stp1_8, stp1_23);
+      in[24] = _mm_sub_epi16(stp1_7, stp1_24);
+      in[25] = _mm_sub_epi16(stp1_6, stp1_25);
+      in[26] = _mm_sub_epi16(stp1_5, stp1_26);
+      in[27] = _mm_sub_epi16(stp1_4, stp1_27);
+      in[28] = _mm_sub_epi16(stp1_3, stp1_28);
+      in[29] = _mm_sub_epi16(stp1_2, stp1_29);
+      in[30] = _mm_sub_epi16(stp1_1, stp1_30);
+      in[31] = _mm_sub_epi16(stp1_0, stp1_31);
 
       // Final rounding and shift
-      in0 = _mm_adds_epi16(in0, final_rounding);
-      in1 = _mm_adds_epi16(in1, final_rounding);
-      in2 = _mm_adds_epi16(in2, final_rounding);
-      in3 = _mm_adds_epi16(in3, final_rounding);
-      in4 = _mm_adds_epi16(in4, final_rounding);
-      in5 = _mm_adds_epi16(in5, final_rounding);
-      in6 = _mm_adds_epi16(in6, final_rounding);
-      in7 = _mm_adds_epi16(in7, final_rounding);
-      in8 = _mm_adds_epi16(in8, final_rounding);
-      in9 = _mm_adds_epi16(in9, final_rounding);
-      in10 = _mm_adds_epi16(in10, final_rounding);
-      in11 = _mm_adds_epi16(in11, final_rounding);
-      in12 = _mm_adds_epi16(in12, final_rounding);
-      in13 = _mm_adds_epi16(in13, final_rounding);
-      in14 = _mm_adds_epi16(in14, final_rounding);
-      in15 = _mm_adds_epi16(in15, final_rounding);
-      in16 = _mm_adds_epi16(in16, final_rounding);
-      in17 = _mm_adds_epi16(in17, final_rounding);
-      in18 = _mm_adds_epi16(in18, final_rounding);
-      in19 = _mm_adds_epi16(in19, final_rounding);
-      in20 = _mm_adds_epi16(in20, final_rounding);
-      in21 = _mm_adds_epi16(in21, final_rounding);
-      in22 = _mm_adds_epi16(in22, final_rounding);
-      in23 = _mm_adds_epi16(in23, final_rounding);
-      in24 = _mm_adds_epi16(in24, final_rounding);
-      in25 = _mm_adds_epi16(in25, final_rounding);
-      in26 = _mm_adds_epi16(in26, final_rounding);
-      in27 = _mm_adds_epi16(in27, final_rounding);
-      in28 = _mm_adds_epi16(in28, final_rounding);
-      in29 = _mm_adds_epi16(in29, final_rounding);
-      in30 = _mm_adds_epi16(in30, final_rounding);
-      in31 = _mm_adds_epi16(in31, final_rounding);
+      in[0] = _mm_adds_epi16(in[0], final_rounding);
+      in[1] = _mm_adds_epi16(in[1], final_rounding);
+      in[2] = _mm_adds_epi16(in[2], final_rounding);
+      in[3] = _mm_adds_epi16(in[3], final_rounding);
+      in[4] = _mm_adds_epi16(in[4], final_rounding);
+      in[5] = _mm_adds_epi16(in[5], final_rounding);
+      in[6] = _mm_adds_epi16(in[6], final_rounding);
+      in[7] = _mm_adds_epi16(in[7], final_rounding);
+      in[8] = _mm_adds_epi16(in[8], final_rounding);
+      in[9] = _mm_adds_epi16(in[9], final_rounding);
+      in[10] = _mm_adds_epi16(in[10], final_rounding);
+      in[11] = _mm_adds_epi16(in[11], final_rounding);
+      in[12] = _mm_adds_epi16(in[12], final_rounding);
+      in[13] = _mm_adds_epi16(in[13], final_rounding);
+      in[14] = _mm_adds_epi16(in[14], final_rounding);
+      in[15] = _mm_adds_epi16(in[15], final_rounding);
+      in[16] = _mm_adds_epi16(in[16], final_rounding);
+      in[17] = _mm_adds_epi16(in[17], final_rounding);
+      in[18] = _mm_adds_epi16(in[18], final_rounding);
+      in[19] = _mm_adds_epi16(in[19], final_rounding);
+      in[20] = _mm_adds_epi16(in[20], final_rounding);
+      in[21] = _mm_adds_epi16(in[21], final_rounding);
+      in[22] = _mm_adds_epi16(in[22], final_rounding);
+      in[23] = _mm_adds_epi16(in[23], final_rounding);
+      in[24] = _mm_adds_epi16(in[24], final_rounding);
+      in[25] = _mm_adds_epi16(in[25], final_rounding);
+      in[26] = _mm_adds_epi16(in[26], final_rounding);
+      in[27] = _mm_adds_epi16(in[27], final_rounding);
+      in[28] = _mm_adds_epi16(in[28], final_rounding);
+      in[29] = _mm_adds_epi16(in[29], final_rounding);
+      in[30] = _mm_adds_epi16(in[30], final_rounding);
+      in[31] = _mm_adds_epi16(in[31], final_rounding);
 
-      in0 = _mm_srai_epi16(in0, 6);
-      in1 = _mm_srai_epi16(in1, 6);
-      in2 = _mm_srai_epi16(in2, 6);
-      in3 = _mm_srai_epi16(in3, 6);
-      in4 = _mm_srai_epi16(in4, 6);
-      in5 = _mm_srai_epi16(in5, 6);
-      in6 = _mm_srai_epi16(in6, 6);
-      in7 = _mm_srai_epi16(in7, 6);
-      in8 = _mm_srai_epi16(in8, 6);
-      in9 = _mm_srai_epi16(in9, 6);
-      in10 = _mm_srai_epi16(in10, 6);
-      in11 = _mm_srai_epi16(in11, 6);
-      in12 = _mm_srai_epi16(in12, 6);
-      in13 = _mm_srai_epi16(in13, 6);
-      in14 = _mm_srai_epi16(in14, 6);
-      in15 = _mm_srai_epi16(in15, 6);
-      in16 = _mm_srai_epi16(in16, 6);
-      in17 = _mm_srai_epi16(in17, 6);
-      in18 = _mm_srai_epi16(in18, 6);
-      in19 = _mm_srai_epi16(in19, 6);
-      in20 = _mm_srai_epi16(in20, 6);
-      in21 = _mm_srai_epi16(in21, 6);
-      in22 = _mm_srai_epi16(in22, 6);
-      in23 = _mm_srai_epi16(in23, 6);
-      in24 = _mm_srai_epi16(in24, 6);
-      in25 = _mm_srai_epi16(in25, 6);
-      in26 = _mm_srai_epi16(in26, 6);
-      in27 = _mm_srai_epi16(in27, 6);
-      in28 = _mm_srai_epi16(in28, 6);
-      in29 = _mm_srai_epi16(in29, 6);
-      in30 = _mm_srai_epi16(in30, 6);
-      in31 = _mm_srai_epi16(in31, 6);
+      in[0] = _mm_srai_epi16(in[0], 6);
+      in[1] = _mm_srai_epi16(in[1], 6);
+      in[2] = _mm_srai_epi16(in[2], 6);
+      in[3] = _mm_srai_epi16(in[3], 6);
+      in[4] = _mm_srai_epi16(in[4], 6);
+      in[5] = _mm_srai_epi16(in[5], 6);
+      in[6] = _mm_srai_epi16(in[6], 6);
+      in[7] = _mm_srai_epi16(in[7], 6);
+      in[8] = _mm_srai_epi16(in[8], 6);
+      in[9] = _mm_srai_epi16(in[9], 6);
+      in[10] = _mm_srai_epi16(in[10], 6);
+      in[11] = _mm_srai_epi16(in[11], 6);
+      in[12] = _mm_srai_epi16(in[12], 6);
+      in[13] = _mm_srai_epi16(in[13], 6);
+      in[14] = _mm_srai_epi16(in[14], 6);
+      in[15] = _mm_srai_epi16(in[15], 6);
+      in[16] = _mm_srai_epi16(in[16], 6);
+      in[17] = _mm_srai_epi16(in[17], 6);
+      in[18] = _mm_srai_epi16(in[18], 6);
+      in[19] = _mm_srai_epi16(in[19], 6);
+      in[20] = _mm_srai_epi16(in[20], 6);
+      in[21] = _mm_srai_epi16(in[21], 6);
+      in[22] = _mm_srai_epi16(in[22], 6);
+      in[23] = _mm_srai_epi16(in[23], 6);
+      in[24] = _mm_srai_epi16(in[24], 6);
+      in[25] = _mm_srai_epi16(in[25], 6);
+      in[26] = _mm_srai_epi16(in[26], 6);
+      in[27] = _mm_srai_epi16(in[27], 6);
+      in[28] = _mm_srai_epi16(in[28], 6);
+      in[29] = _mm_srai_epi16(in[29], 6);
+      in[30] = _mm_srai_epi16(in[30], 6);
+      in[31] = _mm_srai_epi16(in[31], 6);
 
-      RECON_AND_STORE(dest, in0);
-      RECON_AND_STORE(dest, in1);
-      RECON_AND_STORE(dest, in2);
-      RECON_AND_STORE(dest, in3);
-      RECON_AND_STORE(dest, in4);
-      RECON_AND_STORE(dest, in5);
-      RECON_AND_STORE(dest, in6);
-      RECON_AND_STORE(dest, in7);
-      RECON_AND_STORE(dest, in8);
-      RECON_AND_STORE(dest, in9);
-      RECON_AND_STORE(dest, in10);
-      RECON_AND_STORE(dest, in11);
-      RECON_AND_STORE(dest, in12);
-      RECON_AND_STORE(dest, in13);
-      RECON_AND_STORE(dest, in14);
-      RECON_AND_STORE(dest, in15);
-      RECON_AND_STORE(dest, in16);
-      RECON_AND_STORE(dest, in17);
-      RECON_AND_STORE(dest, in18);
-      RECON_AND_STORE(dest, in19);
-      RECON_AND_STORE(dest, in20);
-      RECON_AND_STORE(dest, in21);
-      RECON_AND_STORE(dest, in22);
-      RECON_AND_STORE(dest, in23);
-      RECON_AND_STORE(dest, in24);
-      RECON_AND_STORE(dest, in25);
-      RECON_AND_STORE(dest, in26);
-      RECON_AND_STORE(dest, in27);
-      RECON_AND_STORE(dest, in28);
-      RECON_AND_STORE(dest, in29);
-      RECON_AND_STORE(dest, in30);
-      RECON_AND_STORE(dest, in31);
+      RECON_AND_STORE(dest, in[0]);
+      RECON_AND_STORE(dest, in[1]);
+      RECON_AND_STORE(dest, in[2]);
+      RECON_AND_STORE(dest, in[3]);
+      RECON_AND_STORE(dest, in[4]);
+      RECON_AND_STORE(dest, in[5]);
+      RECON_AND_STORE(dest, in[6]);
+      RECON_AND_STORE(dest, in[7]);
+      RECON_AND_STORE(dest, in[8]);
+      RECON_AND_STORE(dest, in[9]);
+      RECON_AND_STORE(dest, in[10]);
+      RECON_AND_STORE(dest, in[11]);
+      RECON_AND_STORE(dest, in[12]);
+      RECON_AND_STORE(dest, in[13]);
+      RECON_AND_STORE(dest, in[14]);
+      RECON_AND_STORE(dest, in[15]);
+      RECON_AND_STORE(dest, in[16]);
+      RECON_AND_STORE(dest, in[17]);
+      RECON_AND_STORE(dest, in[18]);
+      RECON_AND_STORE(dest, in[19]);
+      RECON_AND_STORE(dest, in[20]);
+      RECON_AND_STORE(dest, in[21]);
+      RECON_AND_STORE(dest, in[22]);
+      RECON_AND_STORE(dest, in[23]);
+      RECON_AND_STORE(dest, in[24]);
+      RECON_AND_STORE(dest, in[25]);
+      RECON_AND_STORE(dest, in[26]);
+      RECON_AND_STORE(dest, in[27]);
+      RECON_AND_STORE(dest, in[28]);
+      RECON_AND_STORE(dest, in[29]);
+      RECON_AND_STORE(dest, in[30]);
+      RECON_AND_STORE(dest, in[31]);
 
       dest += 8 - (stride * 32);
     }
   }
-}
 
 void vp9_idct32x32_1024_add_sse2(const int16_t *input, uint8_t *dest,
                                  int stride) {
@@ -3546,10 +3794,7 @@
 
   const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
 
-  __m128i in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11, in12,
-          in13, in14, in15, in16, in17, in18, in19, in20, in21, in22, in23,
-          in24, in25, in26, in27, in28, in29, in30, in31;
-  __m128i col[128];
+  __m128i in[32], col[128], zero_idx[16];
   __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7,
           stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
           stp1_16, stp1_17, stp1_18, stp1_19, stp1_20, stp1_21, stp1_22,
@@ -3562,66 +3807,63 @@
           stp2_30, stp2_31;
   __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
   int i, j, i32;
-  __m128i zero_idx[16];
   int zero_flag[2];
 
-  // We work on a 8x32 block each time, and loop 8 times for 2-D 32x32 idct.
-  for (i = 0; i < 8; i++) {
+  for (i = 0; i < 4; i++) {
     i32 = (i << 5);
-    if (i < 4) {
       // First 1-D idct
       // Load input data.
-      LOAD_DQCOEFF(in0, input);
-      LOAD_DQCOEFF(in8, input);
-      LOAD_DQCOEFF(in16, input);
-      LOAD_DQCOEFF(in24, input);
-      LOAD_DQCOEFF(in1, input);
-      LOAD_DQCOEFF(in9, input);
-      LOAD_DQCOEFF(in17, input);
-      LOAD_DQCOEFF(in25, input);
-      LOAD_DQCOEFF(in2, input);
-      LOAD_DQCOEFF(in10, input);
-      LOAD_DQCOEFF(in18, input);
-      LOAD_DQCOEFF(in26, input);
-      LOAD_DQCOEFF(in3, input);
-      LOAD_DQCOEFF(in11, input);
-      LOAD_DQCOEFF(in19, input);
-      LOAD_DQCOEFF(in27, input);
+      LOAD_DQCOEFF(in[0], input);
+      LOAD_DQCOEFF(in[8], input);
+      LOAD_DQCOEFF(in[16], input);
+      LOAD_DQCOEFF(in[24], input);
+      LOAD_DQCOEFF(in[1], input);
+      LOAD_DQCOEFF(in[9], input);
+      LOAD_DQCOEFF(in[17], input);
+      LOAD_DQCOEFF(in[25], input);
+      LOAD_DQCOEFF(in[2], input);
+      LOAD_DQCOEFF(in[10], input);
+      LOAD_DQCOEFF(in[18], input);
+      LOAD_DQCOEFF(in[26], input);
+      LOAD_DQCOEFF(in[3], input);
+      LOAD_DQCOEFF(in[11], input);
+      LOAD_DQCOEFF(in[19], input);
+      LOAD_DQCOEFF(in[27], input);
 
-      LOAD_DQCOEFF(in4, input);
-      LOAD_DQCOEFF(in12, input);
-      LOAD_DQCOEFF(in20, input);
-      LOAD_DQCOEFF(in28, input);
-      LOAD_DQCOEFF(in5, input);
-      LOAD_DQCOEFF(in13, input);
-      LOAD_DQCOEFF(in21, input);
-      LOAD_DQCOEFF(in29, input);
-      LOAD_DQCOEFF(in6, input);
-      LOAD_DQCOEFF(in14, input);
-      LOAD_DQCOEFF(in22, input);
-      LOAD_DQCOEFF(in30, input);
-      LOAD_DQCOEFF(in7, input);
-      LOAD_DQCOEFF(in15, input);
-      LOAD_DQCOEFF(in23, input);
-      LOAD_DQCOEFF(in31, input);
+      LOAD_DQCOEFF(in[4], input);
+      LOAD_DQCOEFF(in[12], input);
+      LOAD_DQCOEFF(in[20], input);
+      LOAD_DQCOEFF(in[28], input);
+      LOAD_DQCOEFF(in[5], input);
+      LOAD_DQCOEFF(in[13], input);
+      LOAD_DQCOEFF(in[21], input);
+      LOAD_DQCOEFF(in[29], input);
+      LOAD_DQCOEFF(in[6], input);
+      LOAD_DQCOEFF(in[14], input);
+      LOAD_DQCOEFF(in[22], input);
+      LOAD_DQCOEFF(in[30], input);
+      LOAD_DQCOEFF(in[7], input);
+      LOAD_DQCOEFF(in[15], input);
+      LOAD_DQCOEFF(in[23], input);
+      LOAD_DQCOEFF(in[31], input);
 
       // checking if all entries are zero
-      zero_idx[0] = _mm_or_si128(in0, in1);
-      zero_idx[1] = _mm_or_si128(in2, in3);
-      zero_idx[2] = _mm_or_si128(in4, in5);
-      zero_idx[3] = _mm_or_si128(in6, in7);
-      zero_idx[4] = _mm_or_si128(in8, in9);
-      zero_idx[5] = _mm_or_si128(in10, in11);
-      zero_idx[6] = _mm_or_si128(in12, in13);
-      zero_idx[7] = _mm_or_si128(in14, in15);
-      zero_idx[8] = _mm_or_si128(in16, in17);
-      zero_idx[9] = _mm_or_si128(in18, in19);
-      zero_idx[10] = _mm_or_si128(in20, in21);
-      zero_idx[11] = _mm_or_si128(in22, in23);
-      zero_idx[12] = _mm_or_si128(in24, in25);
-      zero_idx[13] = _mm_or_si128(in26, in27);
-      zero_idx[14] = _mm_or_si128(in28, in29);
-      zero_idx[15] = _mm_or_si128(in30, in31);
+      zero_idx[0] = _mm_or_si128(in[0], in[1]);
+      zero_idx[1] = _mm_or_si128(in[2], in[3]);
+      zero_idx[2] = _mm_or_si128(in[4], in[5]);
+      zero_idx[3] = _mm_or_si128(in[6], in[7]);
+      zero_idx[4] = _mm_or_si128(in[8], in[9]);
+      zero_idx[5] = _mm_or_si128(in[10], in[11]);
+      zero_idx[6] = _mm_or_si128(in[12], in[13]);
+      zero_idx[7] = _mm_or_si128(in[14], in[15]);
+      zero_idx[8] = _mm_or_si128(in[16], in[17]);
+      zero_idx[9] = _mm_or_si128(in[18], in[19]);
+      zero_idx[10] = _mm_or_si128(in[20], in[21]);
+      zero_idx[11] = _mm_or_si128(in[22], in[23]);
+      zero_idx[12] = _mm_or_si128(in[24], in[25]);
+      zero_idx[13] = _mm_or_si128(in[26], in[27]);
+      zero_idx[14] = _mm_or_si128(in[28], in[29]);
+      zero_idx[15] = _mm_or_si128(in[30], in[31]);
 
       zero_idx[0] = _mm_or_si128(zero_idx[0], zero_idx[1]);
       zero_idx[1] = _mm_or_si128(zero_idx[2], zero_idx[3]);
@@ -3683,44 +3925,13 @@
       }
 
       // Transpose 32x8 block to 8x32 block
-      TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
-                    in4, in5, in6, in7);
-      TRANSPOSE_8X8(in8, in9, in10, in11, in12, in13, in14, in15, in8, in9,
-                    in10, in11, in12, in13, in14, in15);
-      TRANSPOSE_8X8(in16, in17, in18, in19, in20, in21, in22, in23, in16, in17,
-                    in18, in19, in20, in21, in22, in23);
-      TRANSPOSE_8X8(in24, in25, in26, in27, in28, in29, in30, in31, in24, in25,
-                    in26, in27, in28, in29, in30, in31);
-    } else {
-      // Second 1-D idct
-      j = i - 4;
+      array_transpose_8x8(in, in);
+      array_transpose_8x8(in+8, in+8);
+      array_transpose_8x8(in+16, in+16);
+      array_transpose_8x8(in+24, in+24);
 
-      // Transpose 32x8 block to 8x32 block
-      TRANSPOSE_8X8(col[j * 8 + 0], col[j * 8 + 1], col[j * 8 + 2],
-                    col[j * 8 + 3], col[j * 8 + 4], col[j * 8 + 5],
-                    col[j * 8 + 6], col[j * 8 + 7], in0, in1, in2, in3, in4,
-                    in5, in6, in7);
-      j += 4;
-      TRANSPOSE_8X8(col[j * 8 + 0], col[j * 8 + 1], col[j * 8 + 2],
-                    col[j * 8 + 3], col[j * 8 + 4], col[j * 8 + 5],
-                    col[j * 8 + 6], col[j * 8 + 7], in8, in9, in10,
-                    in11, in12, in13, in14, in15);
-      j += 4;
-      TRANSPOSE_8X8(col[j * 8 + 0], col[j * 8 + 1], col[j * 8 + 2],
-                    col[j * 8 + 3], col[j * 8 + 4], col[j * 8 + 5],
-                    col[j * 8 + 6], col[j * 8 + 7], in16, in17, in18,
-                    in19, in20, in21, in22, in23);
-      j += 4;
-      TRANSPOSE_8X8(col[j * 8 + 0], col[j * 8 + 1], col[j * 8 + 2],
-                    col[j * 8 + 3], col[j * 8 + 4], col[j * 8 + 5],
-                    col[j * 8 + 6], col[j * 8 + 7], in24, in25, in26, in27,
-                    in28, in29, in30, in31);
-    }
+      IDCT32_1D
 
-    IDCT32_1D
-
-    // final stage
-    if (i < 4) {
       // 1_D: Store 32 intermediate results for each 8x32 block.
       col[i32 + 0] = _mm_add_epi16(stp1_0, stp1_31);
       col[i32 + 1] = _mm_add_epi16(stp1_1, stp1_30);
@@ -3754,146 +3965,156 @@
       col[i32 + 29] = _mm_sub_epi16(stp1_2, stp1_29);
       col[i32 + 30] = _mm_sub_epi16(stp1_1, stp1_30);
       col[i32 + 31] = _mm_sub_epi16(stp1_0, stp1_31);
-    } else {
+    }
+  for (i = 0; i < 4; i++) {
       const __m128i zero = _mm_setzero_si128();
+      // Second 1-D idct
+      j = i << 3;
+
+      // Transpose 32x8 block to 8x32 block
+      array_transpose_8x8(col+j, in);
+      array_transpose_8x8(col+j+32, in+8);
+      array_transpose_8x8(col+j+64, in+16);
+      array_transpose_8x8(col+j+96, in+24);
+
+      IDCT32_1D
 
       // 2_D: Calculate the results and store them to destination.
-      in0 = _mm_add_epi16(stp1_0, stp1_31);
-      in1 = _mm_add_epi16(stp1_1, stp1_30);
-      in2 = _mm_add_epi16(stp1_2, stp1_29);
-      in3 = _mm_add_epi16(stp1_3, stp1_28);
-      in4 = _mm_add_epi16(stp1_4, stp1_27);
-      in5 = _mm_add_epi16(stp1_5, stp1_26);
-      in6 = _mm_add_epi16(stp1_6, stp1_25);
-      in7 = _mm_add_epi16(stp1_7, stp1_24);
-      in8 = _mm_add_epi16(stp1_8, stp1_23);
-      in9 = _mm_add_epi16(stp1_9, stp1_22);
-      in10 = _mm_add_epi16(stp1_10, stp1_21);
-      in11 = _mm_add_epi16(stp1_11, stp1_20);
-      in12 = _mm_add_epi16(stp1_12, stp1_19);
-      in13 = _mm_add_epi16(stp1_13, stp1_18);
-      in14 = _mm_add_epi16(stp1_14, stp1_17);
-      in15 = _mm_add_epi16(stp1_15, stp1_16);
-      in16 = _mm_sub_epi16(stp1_15, stp1_16);
-      in17 = _mm_sub_epi16(stp1_14, stp1_17);
-      in18 = _mm_sub_epi16(stp1_13, stp1_18);
-      in19 = _mm_sub_epi16(stp1_12, stp1_19);
-      in20 = _mm_sub_epi16(stp1_11, stp1_20);
-      in21 = _mm_sub_epi16(stp1_10, stp1_21);
-      in22 = _mm_sub_epi16(stp1_9, stp1_22);
-      in23 = _mm_sub_epi16(stp1_8, stp1_23);
-      in24 = _mm_sub_epi16(stp1_7, stp1_24);
-      in25 = _mm_sub_epi16(stp1_6, stp1_25);
-      in26 = _mm_sub_epi16(stp1_5, stp1_26);
-      in27 = _mm_sub_epi16(stp1_4, stp1_27);
-      in28 = _mm_sub_epi16(stp1_3, stp1_28);
-      in29 = _mm_sub_epi16(stp1_2, stp1_29);
-      in30 = _mm_sub_epi16(stp1_1, stp1_30);
-      in31 = _mm_sub_epi16(stp1_0, stp1_31);
+      in[0] = _mm_add_epi16(stp1_0, stp1_31);
+      in[1] = _mm_add_epi16(stp1_1, stp1_30);
+      in[2] = _mm_add_epi16(stp1_2, stp1_29);
+      in[3] = _mm_add_epi16(stp1_3, stp1_28);
+      in[4] = _mm_add_epi16(stp1_4, stp1_27);
+      in[5] = _mm_add_epi16(stp1_5, stp1_26);
+      in[6] = _mm_add_epi16(stp1_6, stp1_25);
+      in[7] = _mm_add_epi16(stp1_7, stp1_24);
+      in[8] = _mm_add_epi16(stp1_8, stp1_23);
+      in[9] = _mm_add_epi16(stp1_9, stp1_22);
+      in[10] = _mm_add_epi16(stp1_10, stp1_21);
+      in[11] = _mm_add_epi16(stp1_11, stp1_20);
+      in[12] = _mm_add_epi16(stp1_12, stp1_19);
+      in[13] = _mm_add_epi16(stp1_13, stp1_18);
+      in[14] = _mm_add_epi16(stp1_14, stp1_17);
+      in[15] = _mm_add_epi16(stp1_15, stp1_16);
+      in[16] = _mm_sub_epi16(stp1_15, stp1_16);
+      in[17] = _mm_sub_epi16(stp1_14, stp1_17);
+      in[18] = _mm_sub_epi16(stp1_13, stp1_18);
+      in[19] = _mm_sub_epi16(stp1_12, stp1_19);
+      in[20] = _mm_sub_epi16(stp1_11, stp1_20);
+      in[21] = _mm_sub_epi16(stp1_10, stp1_21);
+      in[22] = _mm_sub_epi16(stp1_9, stp1_22);
+      in[23] = _mm_sub_epi16(stp1_8, stp1_23);
+      in[24] = _mm_sub_epi16(stp1_7, stp1_24);
+      in[25] = _mm_sub_epi16(stp1_6, stp1_25);
+      in[26] = _mm_sub_epi16(stp1_5, stp1_26);
+      in[27] = _mm_sub_epi16(stp1_4, stp1_27);
+      in[28] = _mm_sub_epi16(stp1_3, stp1_28);
+      in[29] = _mm_sub_epi16(stp1_2, stp1_29);
+      in[30] = _mm_sub_epi16(stp1_1, stp1_30);
+      in[31] = _mm_sub_epi16(stp1_0, stp1_31);
 
       // Final rounding and shift
-      in0 = _mm_adds_epi16(in0, final_rounding);
-      in1 = _mm_adds_epi16(in1, final_rounding);
-      in2 = _mm_adds_epi16(in2, final_rounding);
-      in3 = _mm_adds_epi16(in3, final_rounding);
-      in4 = _mm_adds_epi16(in4, final_rounding);
-      in5 = _mm_adds_epi16(in5, final_rounding);
-      in6 = _mm_adds_epi16(in6, final_rounding);
-      in7 = _mm_adds_epi16(in7, final_rounding);
-      in8 = _mm_adds_epi16(in8, final_rounding);
-      in9 = _mm_adds_epi16(in9, final_rounding);
-      in10 = _mm_adds_epi16(in10, final_rounding);
-      in11 = _mm_adds_epi16(in11, final_rounding);
-      in12 = _mm_adds_epi16(in12, final_rounding);
-      in13 = _mm_adds_epi16(in13, final_rounding);
-      in14 = _mm_adds_epi16(in14, final_rounding);
-      in15 = _mm_adds_epi16(in15, final_rounding);
-      in16 = _mm_adds_epi16(in16, final_rounding);
-      in17 = _mm_adds_epi16(in17, final_rounding);
-      in18 = _mm_adds_epi16(in18, final_rounding);
-      in19 = _mm_adds_epi16(in19, final_rounding);
-      in20 = _mm_adds_epi16(in20, final_rounding);
-      in21 = _mm_adds_epi16(in21, final_rounding);
-      in22 = _mm_adds_epi16(in22, final_rounding);
-      in23 = _mm_adds_epi16(in23, final_rounding);
-      in24 = _mm_adds_epi16(in24, final_rounding);
-      in25 = _mm_adds_epi16(in25, final_rounding);
-      in26 = _mm_adds_epi16(in26, final_rounding);
-      in27 = _mm_adds_epi16(in27, final_rounding);
-      in28 = _mm_adds_epi16(in28, final_rounding);
-      in29 = _mm_adds_epi16(in29, final_rounding);
-      in30 = _mm_adds_epi16(in30, final_rounding);
-      in31 = _mm_adds_epi16(in31, final_rounding);
+      in[0] = _mm_adds_epi16(in[0], final_rounding);
+      in[1] = _mm_adds_epi16(in[1], final_rounding);
+      in[2] = _mm_adds_epi16(in[2], final_rounding);
+      in[3] = _mm_adds_epi16(in[3], final_rounding);
+      in[4] = _mm_adds_epi16(in[4], final_rounding);
+      in[5] = _mm_adds_epi16(in[5], final_rounding);
+      in[6] = _mm_adds_epi16(in[6], final_rounding);
+      in[7] = _mm_adds_epi16(in[7], final_rounding);
+      in[8] = _mm_adds_epi16(in[8], final_rounding);
+      in[9] = _mm_adds_epi16(in[9], final_rounding);
+      in[10] = _mm_adds_epi16(in[10], final_rounding);
+      in[11] = _mm_adds_epi16(in[11], final_rounding);
+      in[12] = _mm_adds_epi16(in[12], final_rounding);
+      in[13] = _mm_adds_epi16(in[13], final_rounding);
+      in[14] = _mm_adds_epi16(in[14], final_rounding);
+      in[15] = _mm_adds_epi16(in[15], final_rounding);
+      in[16] = _mm_adds_epi16(in[16], final_rounding);
+      in[17] = _mm_adds_epi16(in[17], final_rounding);
+      in[18] = _mm_adds_epi16(in[18], final_rounding);
+      in[19] = _mm_adds_epi16(in[19], final_rounding);
+      in[20] = _mm_adds_epi16(in[20], final_rounding);
+      in[21] = _mm_adds_epi16(in[21], final_rounding);
+      in[22] = _mm_adds_epi16(in[22], final_rounding);
+      in[23] = _mm_adds_epi16(in[23], final_rounding);
+      in[24] = _mm_adds_epi16(in[24], final_rounding);
+      in[25] = _mm_adds_epi16(in[25], final_rounding);
+      in[26] = _mm_adds_epi16(in[26], final_rounding);
+      in[27] = _mm_adds_epi16(in[27], final_rounding);
+      in[28] = _mm_adds_epi16(in[28], final_rounding);
+      in[29] = _mm_adds_epi16(in[29], final_rounding);
+      in[30] = _mm_adds_epi16(in[30], final_rounding);
+      in[31] = _mm_adds_epi16(in[31], final_rounding);
 
-      in0 = _mm_srai_epi16(in0, 6);
-      in1 = _mm_srai_epi16(in1, 6);
-      in2 = _mm_srai_epi16(in2, 6);
-      in3 = _mm_srai_epi16(in3, 6);
-      in4 = _mm_srai_epi16(in4, 6);
-      in5 = _mm_srai_epi16(in5, 6);
-      in6 = _mm_srai_epi16(in6, 6);
-      in7 = _mm_srai_epi16(in7, 6);
-      in8 = _mm_srai_epi16(in8, 6);
-      in9 = _mm_srai_epi16(in9, 6);
-      in10 = _mm_srai_epi16(in10, 6);
-      in11 = _mm_srai_epi16(in11, 6);
-      in12 = _mm_srai_epi16(in12, 6);
-      in13 = _mm_srai_epi16(in13, 6);
-      in14 = _mm_srai_epi16(in14, 6);
-      in15 = _mm_srai_epi16(in15, 6);
-      in16 = _mm_srai_epi16(in16, 6);
-      in17 = _mm_srai_epi16(in17, 6);
-      in18 = _mm_srai_epi16(in18, 6);
-      in19 = _mm_srai_epi16(in19, 6);
-      in20 = _mm_srai_epi16(in20, 6);
-      in21 = _mm_srai_epi16(in21, 6);
-      in22 = _mm_srai_epi16(in22, 6);
-      in23 = _mm_srai_epi16(in23, 6);
-      in24 = _mm_srai_epi16(in24, 6);
-      in25 = _mm_srai_epi16(in25, 6);
-      in26 = _mm_srai_epi16(in26, 6);
-      in27 = _mm_srai_epi16(in27, 6);
-      in28 = _mm_srai_epi16(in28, 6);
-      in29 = _mm_srai_epi16(in29, 6);
-      in30 = _mm_srai_epi16(in30, 6);
-      in31 = _mm_srai_epi16(in31, 6);
+      in[0] = _mm_srai_epi16(in[0], 6);
+      in[1] = _mm_srai_epi16(in[1], 6);
+      in[2] = _mm_srai_epi16(in[2], 6);
+      in[3] = _mm_srai_epi16(in[3], 6);
+      in[4] = _mm_srai_epi16(in[4], 6);
+      in[5] = _mm_srai_epi16(in[5], 6);
+      in[6] = _mm_srai_epi16(in[6], 6);
+      in[7] = _mm_srai_epi16(in[7], 6);
+      in[8] = _mm_srai_epi16(in[8], 6);
+      in[9] = _mm_srai_epi16(in[9], 6);
+      in[10] = _mm_srai_epi16(in[10], 6);
+      in[11] = _mm_srai_epi16(in[11], 6);
+      in[12] = _mm_srai_epi16(in[12], 6);
+      in[13] = _mm_srai_epi16(in[13], 6);
+      in[14] = _mm_srai_epi16(in[14], 6);
+      in[15] = _mm_srai_epi16(in[15], 6);
+      in[16] = _mm_srai_epi16(in[16], 6);
+      in[17] = _mm_srai_epi16(in[17], 6);
+      in[18] = _mm_srai_epi16(in[18], 6);
+      in[19] = _mm_srai_epi16(in[19], 6);
+      in[20] = _mm_srai_epi16(in[20], 6);
+      in[21] = _mm_srai_epi16(in[21], 6);
+      in[22] = _mm_srai_epi16(in[22], 6);
+      in[23] = _mm_srai_epi16(in[23], 6);
+      in[24] = _mm_srai_epi16(in[24], 6);
+      in[25] = _mm_srai_epi16(in[25], 6);
+      in[26] = _mm_srai_epi16(in[26], 6);
+      in[27] = _mm_srai_epi16(in[27], 6);
+      in[28] = _mm_srai_epi16(in[28], 6);
+      in[29] = _mm_srai_epi16(in[29], 6);
+      in[30] = _mm_srai_epi16(in[30], 6);
+      in[31] = _mm_srai_epi16(in[31], 6);
 
-      RECON_AND_STORE(dest, in0);
-      RECON_AND_STORE(dest, in1);
-      RECON_AND_STORE(dest, in2);
-      RECON_AND_STORE(dest, in3);
-      RECON_AND_STORE(dest, in4);
-      RECON_AND_STORE(dest, in5);
-      RECON_AND_STORE(dest, in6);
-      RECON_AND_STORE(dest, in7);
-      RECON_AND_STORE(dest, in8);
-      RECON_AND_STORE(dest, in9);
-      RECON_AND_STORE(dest, in10);
-      RECON_AND_STORE(dest, in11);
-      RECON_AND_STORE(dest, in12);
-      RECON_AND_STORE(dest, in13);
-      RECON_AND_STORE(dest, in14);
-      RECON_AND_STORE(dest, in15);
-      RECON_AND_STORE(dest, in16);
-      RECON_AND_STORE(dest, in17);
-      RECON_AND_STORE(dest, in18);
-      RECON_AND_STORE(dest, in19);
-      RECON_AND_STORE(dest, in20);
-      RECON_AND_STORE(dest, in21);
-      RECON_AND_STORE(dest, in22);
-      RECON_AND_STORE(dest, in23);
-      RECON_AND_STORE(dest, in24);
-      RECON_AND_STORE(dest, in25);
-      RECON_AND_STORE(dest, in26);
-      RECON_AND_STORE(dest, in27);
-      RECON_AND_STORE(dest, in28);
-      RECON_AND_STORE(dest, in29);
-      RECON_AND_STORE(dest, in30);
-      RECON_AND_STORE(dest, in31);
+      RECON_AND_STORE(dest, in[0]);
+      RECON_AND_STORE(dest, in[1]);
+      RECON_AND_STORE(dest, in[2]);
+      RECON_AND_STORE(dest, in[3]);
+      RECON_AND_STORE(dest, in[4]);
+      RECON_AND_STORE(dest, in[5]);
+      RECON_AND_STORE(dest, in[6]);
+      RECON_AND_STORE(dest, in[7]);
+      RECON_AND_STORE(dest, in[8]);
+      RECON_AND_STORE(dest, in[9]);
+      RECON_AND_STORE(dest, in[10]);
+      RECON_AND_STORE(dest, in[11]);
+      RECON_AND_STORE(dest, in[12]);
+      RECON_AND_STORE(dest, in[13]);
+      RECON_AND_STORE(dest, in[14]);
+      RECON_AND_STORE(dest, in[15]);
+      RECON_AND_STORE(dest, in[16]);
+      RECON_AND_STORE(dest, in[17]);
+      RECON_AND_STORE(dest, in[18]);
+      RECON_AND_STORE(dest, in[19]);
+      RECON_AND_STORE(dest, in[20]);
+      RECON_AND_STORE(dest, in[21]);
+      RECON_AND_STORE(dest, in[22]);
+      RECON_AND_STORE(dest, in[23]);
+      RECON_AND_STORE(dest, in[24]);
+      RECON_AND_STORE(dest, in[25]);
+      RECON_AND_STORE(dest, in[26]);
+      RECON_AND_STORE(dest, in[27]);
+      RECON_AND_STORE(dest, in[28]);
+      RECON_AND_STORE(dest, in[29]);
+      RECON_AND_STORE(dest, in[30]);
+      RECON_AND_STORE(dest, in[31]);
 
       dest += 8 - (stride * 32);
     }
-  }
 }  //NOLINT
 
 void vp9_idct32x32_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) {

diff --git a/vp9/decoder/vp9_decodeframe.c b/vp9/decoder/vp9_decodeframe.c
index 9c94905..9b6740e 100644
--- a/vp9/decoder/vp9_decodeframe.c
+++ b/vp9/decoder/vp9_decodeframe.c

@@ -41,8 +41,6 @@
   VP9_COMMON *cm;
   vp9_reader bit_reader;
   DECLARE_ALIGNED(16, MACROBLOCKD, xd);
-  DECLARE_ALIGNED(16, unsigned char, token_cache[1024]);
-  DECLARE_ALIGNED(16, int16_t,  qcoeff[MAX_MB_PLANE][64 * 64]);
   DECLARE_ALIGNED(16, int16_t,  dqcoeff[MAX_MB_PLANE][64 * 64]);
   DECLARE_ALIGNED(16, uint16_t, eobs[MAX_MB_PLANE][256]);
 } TileWorkerData;
@@ -240,8 +238,7 @@
 }
 
 static void inverse_transform_block(MACROBLOCKD* xd, int plane, int block,
-                                    TX_SIZE tx_size, uint8_t *dst, int stride,
-                                    uint8_t *token_cache) {
+                                    TX_SIZE tx_size, uint8_t *dst, int stride) {
   struct macroblockd_plane *const pd = &xd->plane[plane];
   const int eob = pd->eobs[block];
   if (eob > 0) {
@@ -274,20 +271,13 @@
 
     if (eob == 1) {
       vpx_memset(dqcoeff, 0, 2 * sizeof(dqcoeff[0]));
-      vpx_memset(token_cache, 0, 2 * sizeof(token_cache[0]));
     } else {
-      if (tx_type == DCT_DCT && tx_size <= TX_16X16 && eob <= 10) {
+      if (tx_type == DCT_DCT && tx_size <= TX_16X16 && eob <= 10)
         vpx_memset(dqcoeff, 0, 4 * (4 << tx_size) * sizeof(dqcoeff[0]));
-        vpx_memset(token_cache, 0,
-                   4 * (4 << tx_size) * sizeof(token_cache[0]));
-      } else if (tx_size == TX_32X32 && eob <= 34) {
+      else if (tx_size == TX_32X32 && eob <= 34)
         vpx_memset(dqcoeff, 0, 256 * sizeof(dqcoeff[0]));
-        vpx_memset(token_cache, 0, 256 * sizeof(token_cache[0]));
-      } else {
+      else
         vpx_memset(dqcoeff, 0, (16 << (tx_size << 1)) * sizeof(dqcoeff[0]));
-        vpx_memset(token_cache, 0,
-                   (16 << (tx_size << 1)) * sizeof(token_cache[0]));
-      }
     }
   }
 }
@@ -296,7 +286,6 @@
   VP9_COMMON *cm;
   MACROBLOCKD *xd;
   vp9_reader *r;
-  uint8_t *token_cache;
 };
 
 static void predict_and_reconstruct_intra_block(int plane, int block,
@@ -325,9 +314,8 @@
 
   if (!mi->mbmi.skip_coeff) {
     vp9_decode_block_tokens(cm, xd, plane, block, plane_bsize, x, y, tx_size,
-                            args->r, args->token_cache);
-    inverse_transform_block(xd, plane, block, tx_size, dst, pd->dst.stride,
-                            args->token_cache);
+                            args->r);
+    inverse_transform_block(xd, plane, block, tx_size, dst, pd->dst.stride);
   }
 }
 
@@ -336,7 +324,6 @@
   MACROBLOCKD *xd;
   vp9_reader *r;
   int *eobtotal;
-  uint8_t *token_cache;
 };
 
 static void reconstruct_inter_block(int plane, int block,
@@ -350,10 +337,10 @@
   txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &x, &y);
   *args->eobtotal += vp9_decode_block_tokens(cm, xd, plane, block,
                                              plane_bsize, x, y, tx_size,
-                                             args->r, args->token_cache);
+                                             args->r);
   inverse_transform_block(xd, plane, block, tx_size,
                           &pd->dst.buf[4 * y * pd->dst.stride + 4 * x],
-                          pd->dst.stride, args->token_cache);
+                          pd->dst.stride);
 }
 
 static void set_offsets(VP9_COMMON *const cm, MACROBLOCKD *const xd,
@@ -403,8 +390,7 @@
 static void decode_modes_b(VP9_COMMON *const cm, MACROBLOCKD *const xd,
                            const TileInfo *const tile,
                            int mi_row, int mi_col,
-                           vp9_reader *r, BLOCK_SIZE bsize,
-                           uint8_t *token_cache) {
+                           vp9_reader *r, BLOCK_SIZE bsize) {
   const int less8x8 = bsize < BLOCK_8X8;
   MB_MODE_INFO *mbmi;
 
@@ -426,9 +412,7 @@
   }
 
   if (!is_inter_block(mbmi)) {
-    struct intra_args arg = {
-      cm, xd, r, token_cache
-    };
+    struct intra_args arg = { cm, xd, r };
     foreach_transformed_block(xd, bsize, predict_and_reconstruct_intra_block,
                               &arg);
   } else {
@@ -446,9 +430,7 @@
     // Reconstruction
     if (!mbmi->skip_coeff) {
       int eobtotal = 0;
-      struct inter_args arg = {
-        cm, xd, r, &eobtotal, token_cache
-      };
+      struct inter_args arg = { cm, xd, r, &eobtotal };
       foreach_transformed_block(xd, bsize, reconstruct_inter_block, &arg);
       if (!less8x8 && eobtotal == 0)
         mbmi->skip_coeff = 1;  // skip loopfilter
@@ -487,8 +469,7 @@
 static void decode_modes_sb(VP9_COMMON *const cm, MACROBLOCKD *const xd,
                             const TileInfo *const tile,
                             int mi_row, int mi_col,
-                            vp9_reader* r, BLOCK_SIZE bsize,
-                            uint8_t *token_cache) {
+                            vp9_reader* r, BLOCK_SIZE bsize) {
   const int hbs = num_8x8_blocks_wide_lookup[bsize] / 2;
   PARTITION_TYPE partition;
   BLOCK_SIZE subsize;
@@ -499,33 +480,27 @@
   partition = read_partition(cm, xd, hbs, mi_row, mi_col, bsize, r);
   subsize = get_subsize(bsize, partition);
   if (subsize < BLOCK_8X8) {
-    decode_modes_b(cm, xd, tile, mi_row, mi_col, r, subsize, token_cache);
+    decode_modes_b(cm, xd, tile, mi_row, mi_col, r, subsize);
   } else {
     switch (partition) {
       case PARTITION_NONE:
-        decode_modes_b(cm, xd, tile, mi_row, mi_col, r, subsize, token_cache);
+        decode_modes_b(cm, xd, tile, mi_row, mi_col, r, subsize);
         break;
       case PARTITION_HORZ:
-        decode_modes_b(cm, xd, tile, mi_row, mi_col, r, subsize, token_cache);
+        decode_modes_b(cm, xd, tile, mi_row, mi_col, r, subsize);
         if (mi_row + hbs < cm->mi_rows)
-          decode_modes_b(cm, xd, tile, mi_row + hbs, mi_col, r, subsize,
-                         token_cache);
+          decode_modes_b(cm, xd, tile, mi_row + hbs, mi_col, r, subsize);
         break;
       case PARTITION_VERT:
-        decode_modes_b(cm, xd, tile, mi_row, mi_col, r, subsize, token_cache);
+        decode_modes_b(cm, xd, tile, mi_row, mi_col, r, subsize);
         if (mi_col + hbs < cm->mi_cols)
-          decode_modes_b(cm, xd, tile, mi_row, mi_col + hbs, r, subsize,
-                         token_cache);
+          decode_modes_b(cm, xd, tile, mi_row, mi_col + hbs, r, subsize);
         break;
       case PARTITION_SPLIT:
-        decode_modes_sb(cm, xd, tile, mi_row, mi_col, r, subsize,
-                        token_cache);
-        decode_modes_sb(cm, xd, tile, mi_row, mi_col + hbs, r, subsize,
-                        token_cache);
-        decode_modes_sb(cm, xd, tile, mi_row + hbs, mi_col, r, subsize,
-                        token_cache);
-        decode_modes_sb(cm, xd, tile, mi_row + hbs, mi_col + hbs, r, subsize,
-                        token_cache);
+        decode_modes_sb(cm, xd, tile, mi_row, mi_col, r, subsize);
+        decode_modes_sb(cm, xd, tile, mi_row, mi_col + hbs, r, subsize);
+        decode_modes_sb(cm, xd, tile, mi_row + hbs, mi_col, r, subsize);
+        decode_modes_sb(cm, xd, tile, mi_row + hbs, mi_col + hbs, r, subsize);
         break;
       default:
         assert(!"Invalid partition type");
@@ -808,8 +783,7 @@
     vp9_zero(xd->left_seg_context);
     for (mi_col = tile->mi_col_start; mi_col < tile->mi_col_end;
          mi_col += MI_BLOCK_SIZE) {
-      decode_modes_sb(cm, xd, tile, mi_row, mi_col, r, BLOCK_64X64,
-                      pbi->token_cache);
+      decode_modes_sb(cm, xd, tile, mi_row, mi_col, r, BLOCK_64X64);
     }
 
     if (pbi->do_loopfilter_inline) {
@@ -950,11 +924,9 @@
   int i;
 
   for (i = 0; i < MAX_MB_PLANE; ++i) {
-    pd[i].qcoeff  = tile_data->qcoeff[i];
     pd[i].dqcoeff = tile_data->dqcoeff[i];
     pd[i].eobs    = tile_data->eobs[i];
     vpx_memset(xd->plane[i].dqcoeff, 0, 64 * 64 * sizeof(int16_t));
-    vpx_memset(tile_data->token_cache, 0, sizeof(tile_data->token_cache));
   }
 }
 
@@ -970,8 +942,7 @@
     for (mi_col = tile->mi_col_start; mi_col < tile->mi_col_end;
          mi_col += MI_BLOCK_SIZE) {
       decode_modes_sb(tile_data->cm, &tile_data->xd, tile,
-                      mi_row, mi_col, &tile_data->bit_reader, BLOCK_64X64,
-                      tile_data->token_cache);
+                      mi_row, mi_col, &tile_data->bit_reader, BLOCK_64X64);
     }
   }
   return !tile_data->xd.corrupted;

diff --git a/vp9/decoder/vp9_detokenize.c b/vp9/decoder/vp9_detokenize.c
index fb6e52b..bdbe67d 100644
--- a/vp9/decoder/vp9_detokenize.c
+++ b/vp9/decoder/vp9_detokenize.c

@@ -60,16 +60,10 @@
   254, 254, 254, 252, 249, 243, 230, 196, 177, 153, 140, 133, 130, 129, 0
 };
 
-static const int token_to_counttoken[MAX_ENTROPY_TOKENS] = {
-  ZERO_TOKEN, ONE_TOKEN, TWO_TOKEN, TWO_TOKEN,
-  TWO_TOKEN, TWO_TOKEN, TWO_TOKEN, TWO_TOKEN,
-  TWO_TOKEN, TWO_TOKEN, TWO_TOKEN, DCT_EOB_MODEL_TOKEN
-};
-
 #define INCREMENT_COUNT(token)                              \
   do {                                                      \
      if (!cm->frame_parallel_decoding_mode)                 \
-       ++coef_counts[band][pt][token_to_counttoken[token]]; \
+       ++coef_counts[band][pt][token];                      \
   } while (0)
 
 
@@ -77,7 +71,6 @@
   {                                                      \
     v = (val * dqv) >> dq_shift; \
     dqcoeff_ptr[scan[c]] = (vp9_read_bit(r) ? -v : v); \
-    INCREMENT_COUNT(token);                              \
     token_cache[scan[c]] = vp9_pt_energy_class[token];   \
     ++c;                                                 \
     pt = get_coef_context(nb, token_cache, c);           \
@@ -93,9 +86,8 @@
 
 static int decode_coefs(VP9_COMMON *cm, const MACROBLOCKD *xd,
                         vp9_reader *r, int block_idx,
-                        PLANE_TYPE type, int seg_eob, int16_t *dqcoeff_ptr,
-                        TX_SIZE tx_size, const int16_t *dq, int pt,
-                        uint8_t *token_cache) {
+                        PLANE_TYPE type, int max_eob, int16_t *dqcoeff_ptr,
+                        TX_SIZE tx_size, const int16_t *dq, int pt) {
   const FRAME_CONTEXT *const fc = &cm->fc;
   FRAME_COUNTS *const counts = &cm->counts;
   const int ref = is_inter_block(&xd->mi_8x8[0]->mbmi);
@@ -107,6 +99,7 @@
       counts->coef[tx_size][type][ref];
   unsigned int (*eob_branch_count)[PREV_COEF_CONTEXTS] =
       counts->eob_branch[tx_size][type][ref];
+  uint8_t token_cache[32 * 32];
   const uint8_t *cat6;
   const uint8_t *band_translate = get_band_translate(tx_size);
   const int dq_shift = (tx_size == TX_32X32);
@@ -116,38 +109,39 @@
   int v;
   int16_t dqv = dq[0];
 
-
-
-  while (c < seg_eob) {
+  while (c < max_eob) {
     int val;
     band = *band_translate++;
     prob = coef_probs[band][pt];
     if (!cm->frame_parallel_decoding_mode)
       ++eob_branch_count[band][pt];
-    if (!vp9_read(r, prob[EOB_CONTEXT_NODE]))
+    if (!vp9_read(r, prob[EOB_CONTEXT_NODE])) {
+      INCREMENT_COUNT(DCT_EOB_MODEL_TOKEN);
       break;
+    }
 
-  DECODE_ZERO:
-    if (!vp9_read(r, prob[ZERO_CONTEXT_NODE])) {
+    while (!vp9_read(r, prob[ZERO_CONTEXT_NODE])) {
       INCREMENT_COUNT(ZERO_TOKEN);
       dqv = dq[1];
+      token_cache[scan[c]] = 0;
       ++c;
-      if (c >= seg_eob)
-        break;
+      if (c >= max_eob)
+        return c;  // zero tokens at the end (no eob token)
       pt = get_coef_context(nb, token_cache, c);
       band = *band_translate++;
       prob = coef_probs[band][pt];
-      goto DECODE_ZERO;
     }
 
     // ONE_CONTEXT_NODE_0_
     if (!vp9_read(r, prob[ONE_CONTEXT_NODE])) {
+      INCREMENT_COUNT(ONE_TOKEN);
       WRITE_COEF_CONTINUE(1, ONE_TOKEN);
     }
 
-    prob = vp9_pareto8_full[coef_probs[band][pt][PIVOT_NODE]-1];
+    INCREMENT_COUNT(TWO_TOKEN);
 
-    // LOW_VAL_CONTEXT_NODE_0_
+    prob = vp9_pareto8_full[prob[PIVOT_NODE] - 1];
+
     if (!vp9_read(r, prob[LOW_VAL_CONTEXT_NODE])) {
       if (!vp9_read(r, prob[TWO_CONTEXT_NODE])) {
         WRITE_COEF_CONTINUE(2, TWO_TOKEN);
@@ -157,7 +151,7 @@
       }
       WRITE_COEF_CONTINUE(4, FOUR_TOKEN);
     }
-    // HIGH_LOW_CONTEXT_NODE_0_
+
     if (!vp9_read(r, prob[HIGH_LOW_CONTEXT_NODE])) {
       if (!vp9_read(r, prob[CAT_ONE_CONTEXT_NODE])) {
         val = CAT1_MIN_VAL;
@@ -169,7 +163,7 @@
       ADJUST_COEF(CAT2_PROB0, 0);
       WRITE_COEF_CONTINUE(val, DCT_VAL_CATEGORY2);
     }
-    // CAT_THREEFOUR_CONTEXT_NODE_0_
+
     if (!vp9_read(r, prob[CAT_THREEFOUR_CONTEXT_NODE])) {
       if (!vp9_read(r, prob[CAT_THREE_CONTEXT_NODE])) {
         val = CAT3_MIN_VAL;
@@ -185,7 +179,7 @@
       ADJUST_COEF(CAT4_PROB0, 0);
       WRITE_COEF_CONTINUE(val, DCT_VAL_CATEGORY4);
     }
-    // CAT_FIVE_CONTEXT_NODE_0_:
+
     if (!vp9_read(r, prob[CAT_FIVE_CONTEXT_NODE])) {
       val = CAT5_MIN_VAL;
       ADJUST_COEF(CAT5_PROB4, 4);
@@ -204,18 +198,12 @@
     WRITE_COEF_CONTINUE(val, DCT_VAL_CATEGORY6);
   }
 
-  if (c < seg_eob) {
-    if (!cm->frame_parallel_decoding_mode)
-      ++coef_counts[band][pt][DCT_EOB_MODEL_TOKEN];
-  }
-
   return c;
 }
 
 int vp9_decode_block_tokens(VP9_COMMON *cm, MACROBLOCKD *xd,
                             int plane, int block, BLOCK_SIZE plane_bsize,
-                            int x, int y, TX_SIZE tx_size, vp9_reader *r,
-                            uint8_t *token_cache) {
+                            int x, int y, TX_SIZE tx_size, vp9_reader *r) {
   struct macroblockd_plane *const pd = &xd->plane[plane];
   const int seg_eob = get_tx_eob(&cm->seg, xd->mi_8x8[0]->mbmi.segment_id,
                                  tx_size);
@@ -223,7 +211,7 @@
                                               pd->left_context + y);
   const int eob = decode_coefs(cm, xd, r, block, pd->plane_type, seg_eob,
                                BLOCK_OFFSET(pd->dqcoeff, block), tx_size,
-                               pd->dequant, pt, token_cache);
+                               pd->dequant, pt);
   set_contexts(xd, pd, plane_bsize, tx_size, eob > 0, x, y);
   pd->eobs[block] = eob;
   return eob;

diff --git a/vp9/decoder/vp9_detokenize.h b/vp9/decoder/vp9_detokenize.h
index e858a19..2a88073 100644
--- a/vp9/decoder/vp9_detokenize.h
+++ b/vp9/decoder/vp9_detokenize.h

@@ -17,7 +17,6 @@
 
 int vp9_decode_block_tokens(VP9_COMMON *cm, MACROBLOCKD *xd,
                             int plane, int block, BLOCK_SIZE plane_bsize,
-                            int x, int y, TX_SIZE tx_size, vp9_reader *r,
-                            uint8_t *token_cache);
+                            int x, int y, TX_SIZE tx_size, vp9_reader *r);
 
 #endif  // VP9_DECODER_VP9_DETOKENIZE_H_

diff --git a/vp9/decoder/vp9_onyxd_if.c b/vp9/decoder/vp9_onyxd_if.c
index 7c0f91d..740ad72 100644
--- a/vp9/decoder/vp9_onyxd_if.c
+++ b/vp9/decoder/vp9_onyxd_if.c

@@ -113,7 +113,6 @@
   int i;
 
   for (i = 0; i < MAX_MB_PLANE; ++i) {
-    pd[i].qcoeff  = pbi->qcoeff[i];
     pd[i].dqcoeff = pbi->dqcoeff[i];
     pd[i].eobs    = pbi->eobs[i];
   }

diff --git a/vp9/decoder/vp9_onyxd_int.h b/vp9/decoder/vp9_onyxd_int.h
index d3d29e9..038cd96 100644
--- a/vp9/decoder/vp9_onyxd_int.h
+++ b/vp9/decoder/vp9_onyxd_int.h

@@ -22,7 +22,6 @@
 
   DECLARE_ALIGNED(16, VP9_COMMON, common);
 
-  DECLARE_ALIGNED(16, int16_t,  qcoeff[MAX_MB_PLANE][64 * 64]);
   DECLARE_ALIGNED(16, int16_t,  dqcoeff[MAX_MB_PLANE][64 * 64]);
   DECLARE_ALIGNED(16, uint16_t, eobs[MAX_MB_PLANE][256]);
 
@@ -53,8 +52,6 @@
 
   ENTROPY_CONTEXT *above_context[MAX_MB_PLANE];
   PARTITION_CONTEXT *above_seg_context;
-
-  DECLARE_ALIGNED(16, uint8_t, token_cache[1024]);
 } VP9D_COMP;
 
 #endif  // VP9_DECODER_VP9_ONYXD_INT_H_

diff --git a/vp9/encoder/vp9_bitstream.c b/vp9/encoder/vp9_bitstream.c
index 1466364..9f79f8c 100644
--- a/vp9/encoder/vp9_bitstream.c
+++ b/vp9/encoder/vp9_bitstream.c

@@ -32,6 +32,7 @@
 #include "vp9/encoder/vp9_bitstream.h"
 #include "vp9/encoder/vp9_segmentation.h"
 #include "vp9/encoder/vp9_subexp.h"
+#include "vp9/encoder/vp9_tokenize.h"
 #include "vp9/encoder/vp9_write_bit_buffer.h"
 
 
@@ -151,6 +152,30 @@
 }
 #endif
 
+static struct vp9_token intra_mode_encodings[INTRA_MODES];
+static struct vp9_token switchable_interp_encodings[SWITCHABLE_FILTERS];
+static struct vp9_token partition_encodings[PARTITION_TYPES];
+static struct vp9_token inter_mode_encodings[INTER_MODES];
+
+void vp9_entropy_mode_init() {
+  vp9_tokens_from_tree(intra_mode_encodings, vp9_intra_mode_tree);
+  vp9_tokens_from_tree(switchable_interp_encodings, vp9_switchable_interp_tree);
+  vp9_tokens_from_tree(partition_encodings, vp9_partition_tree);
+  vp9_tokens_from_tree(inter_mode_encodings, vp9_inter_mode_tree);
+}
+
+static void write_intra_mode(vp9_writer *w, MB_PREDICTION_MODE mode,
+                             const vp9_prob *probs) {
+  write_token(w, vp9_intra_mode_tree, probs, &intra_mode_encodings[mode]);
+}
+
+static void write_inter_mode(vp9_writer *w, MB_PREDICTION_MODE mode,
+                             const vp9_prob *probs) {
+  assert(is_inter_mode(mode));
+  write_token(w, vp9_inter_mode_tree, probs,
+              &inter_mode_encodings[INTER_OFFSET(mode)]);
+}
+
 static INLINE void write_be32(uint8_t *p, int value) {
   p[0] = value >> 24;
   p[1] = value >> 16;
@@ -213,10 +238,6 @@
     vp9_cond_prob_diff_update(w, &cm->fc.mbskip_probs[k], cm->counts.mbskip[k]);
 }
 
-static void write_intra_mode(vp9_writer *bc, int m, const vp9_prob *p) {
-  write_token(bc, vp9_intra_mode_tree, p, vp9_intra_mode_encodings + m);
-}
-
 static void update_switchable_interp_probs(VP9_COMP *cpi, vp9_writer *w) {
   VP9_COMMON *const cm = &cpi->common;
   int j;
@@ -293,14 +314,6 @@
   *tp = p + (p->token == EOSB_TOKEN);
 }
 
-static void write_sb_mv_ref(vp9_writer *w, MB_PREDICTION_MODE mode,
-                            const vp9_prob *p) {
-  assert(is_inter_mode(mode));
-  write_token(w, vp9_inter_mode_tree, p,
-              &vp9_inter_mode_encodings[INTER_OFFSET(mode)]);
-}
-
-
 static void write_segment_id(vp9_writer *w, const struct segmentation *seg,
                              int segment_id) {
   if (seg->enabled && seg->update_map)
@@ -422,7 +435,7 @@
     // If segment skip is not enabled code the mode.
     if (!vp9_segfeature_active(seg, segment_id, SEG_LVL_SKIP)) {
       if (bsize >= BLOCK_8X8) {
-        write_sb_mv_ref(bc, mode, mv_ref_p);
+        write_inter_mode(bc, mode, mv_ref_p);
         ++cm->counts.inter_mode[mi->mode_context[rf]]
                                [INTER_OFFSET(mode)];
       }
@@ -432,7 +445,7 @@
       const int ctx = vp9_get_pred_context_switchable_interp(xd);
       write_token(bc, vp9_switchable_interp_tree,
                   cm->fc.switchable_interp_prob[ctx],
-                  &vp9_switchable_interp_encodings[mi->interp_filter]);
+                  &switchable_interp_encodings[mi->interp_filter]);
     } else {
       assert(mi->interp_filter == cm->mcomp_filter_type);
     }
@@ -445,7 +458,7 @@
         for (idx = 0; idx < 2; idx += num_4x4_blocks_wide) {
           const int j = idy * 2 + idx;
           const MB_PREDICTION_MODE blockmode = m->bmi[j].as_mode;
-          write_sb_mv_ref(bc, blockmode, mv_ref_p);
+          write_inter_mode(bc, blockmode, mv_ref_p);
           ++cm->counts.inter_mode[mi->mode_context[rf]]
                                  [INTER_OFFSET(blockmode)];
 
@@ -561,7 +574,7 @@
   const int has_cols = (mi_col + hbs) < cm->mi_cols;
 
   if (has_rows && has_cols) {
-    write_token(w, vp9_partition_tree, probs, &vp9_partition_encodings[p]);
+    write_token(w, vp9_partition_tree, probs, &partition_encodings[p]);
   } else if (!has_rows && has_cols) {
     assert(p == PARTITION_SPLIT || p == PARTITION_HORZ);
     vp9_write(w, p == PARTITION_SPLIT, probs[1]);

diff --git a/vp9/encoder/vp9_block.h b/vp9/encoder/vp9_block.h
index 4445970..71f7e7a 100644
--- a/vp9/encoder/vp9_block.h
+++ b/vp9/encoder/vp9_block.h

@@ -69,6 +69,7 @@
 
 struct macroblock_plane {
   DECLARE_ALIGNED(16, int16_t, src_diff[64 * 64]);
+  int16_t *qcoeff;
   int16_t *coeff;
   struct buf_2d src;
 

diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c
index 18cfddd..89da781 100644
--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c

@@ -439,14 +439,14 @@
   max_plane = is_inter_block(mbmi) ? MAX_MB_PLANE : 1;
   for (i = 0; i < max_plane; ++i) {
     p[i].coeff = ctx->coeff_pbuf[i][1];
-    pd[i].qcoeff = ctx->qcoeff_pbuf[i][1];
+    p[i].qcoeff = ctx->qcoeff_pbuf[i][1];
     pd[i].dqcoeff = ctx->dqcoeff_pbuf[i][1];
     pd[i].eobs = ctx->eobs_pbuf[i][1];
   }
 
   for (i = max_plane; i < MAX_MB_PLANE; ++i) {
     p[i].coeff = ctx->coeff_pbuf[i][2];
-    pd[i].qcoeff = ctx->qcoeff_pbuf[i][2];
+    p[i].qcoeff = ctx->qcoeff_pbuf[i][2];
     pd[i].dqcoeff = ctx->dqcoeff_pbuf[i][2];
     pd[i].eobs = ctx->eobs_pbuf[i][2];
   }
@@ -675,7 +675,7 @@
 
   for (i = 0; i < MAX_MB_PLANE; ++i) {
     p[i].coeff = ctx->coeff_pbuf[i][0];
-    pd[i].qcoeff = ctx->qcoeff_pbuf[i][0];
+    p[i].qcoeff = ctx->qcoeff_pbuf[i][0];
     pd[i].dqcoeff = ctx->dqcoeff_pbuf[i][0];
     pd[i].eobs = ctx->eobs_pbuf[i][0];
   }
@@ -1567,10 +1567,14 @@
   // Override skipping rectangular partition operations for edge blocks
   const int force_horz_split = (mi_row + ms >= cm->mi_rows);
   const int force_vert_split = (mi_col + ms >= cm->mi_cols);
+  const int xss = x->e_mbd.plane[1].subsampling_x;
+  const int yss = x->e_mbd.plane[1].subsampling_y;
 
   int partition_none_allowed = !force_horz_split && !force_vert_split;
-  int partition_horz_allowed = !force_vert_split && bsize >= BLOCK_8X8;
-  int partition_vert_allowed = !force_horz_split && bsize >= BLOCK_8X8;
+  int partition_horz_allowed = !force_vert_split && yss <= xss &&
+                               bsize >= BLOCK_8X8;
+  int partition_vert_allowed = !force_horz_split && xss <= yss &&
+                               bsize >= BLOCK_8X8;
 
   int partition_split_done = 0;
   (void) *tp_orig;

diff --git a/vp9/encoder/vp9_encodemb.c b/vp9/encoder/vp9_encodemb.c
index bd9678a..3691e7a 100644
--- a/vp9/encoder/vp9_encodemb.c
+++ b/vp9/encoder/vp9_encodemb.c

@@ -136,6 +136,7 @@
                        ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l,
                        TX_SIZE tx_size) {
   MACROBLOCKD *const xd = &mb->e_mbd;
+  struct macroblock_plane *p = &mb->plane[plane];
   struct macroblockd_plane *pd = &xd->plane[plane];
   const int ref = is_inter_block(&xd->mi_8x8[0]->mbmi);
   vp9_token_state tokens[1025][2];
@@ -163,7 +164,7 @@
 
   assert((!type && !plane) || (type && plane));
   dqcoeff_ptr = BLOCK_OFFSET(pd->dqcoeff, block);
-  qcoeff_ptr = BLOCK_OFFSET(pd->qcoeff, block);
+  qcoeff_ptr = BLOCK_OFFSET(p->qcoeff, block);
   assert(eob <= default_eob);
 
   /* Now set up a Viterbi trellis to evaluate alternative roundings. */
@@ -368,7 +369,7 @@
   struct macroblock_plane *const p = &x->plane[plane];
   struct macroblockd_plane *const pd = &xd->plane[plane];
   int16_t *coeff = BLOCK_OFFSET(p->coeff, block);
-  int16_t *qcoeff = BLOCK_OFFSET(pd->qcoeff, block);
+  int16_t *qcoeff = BLOCK_OFFSET(p->qcoeff, block);
   int16_t *dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
   const scan_order *so;
   uint16_t *eob = &pd->eobs[block];
@@ -530,7 +531,7 @@
   struct macroblock_plane *const p = &x->plane[plane];
   struct macroblockd_plane *const pd = &xd->plane[plane];
   int16_t *coeff = BLOCK_OFFSET(p->coeff, block);
-  int16_t *qcoeff = BLOCK_OFFSET(pd->qcoeff, block);
+  int16_t *qcoeff = BLOCK_OFFSET(p->qcoeff, block);
   int16_t *dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
   const scan_order *so;
   TX_TYPE tx_type;
@@ -558,8 +559,9 @@
       mode = plane == 0 ? mbmi->mode : mbmi->uv_mode;
       block >>= 6;
       vp9_predict_intra_block(xd, block, bwl, TX_32X32, mode,
-                              dst, pd->dst.stride, dst, pd->dst.stride);
-
+                              x->skip_encode ? src : dst,
+                              x->skip_encode ? p->src.stride : pd->dst.stride,
+                              dst, pd->dst.stride);
       if (!x->skip_recode) {
         vp9_subtract_block(32, 32, src_diff, diff_stride,
                            src, p->src.stride, dst, pd->dst.stride);
@@ -581,7 +583,9 @@
       mode = plane == 0 ? mbmi->mode : mbmi->uv_mode;
       block >>= 4;
       vp9_predict_intra_block(xd, block, bwl, TX_16X16, mode,
-                              dst, pd->dst.stride, dst, pd->dst.stride);
+                              x->skip_encode ? src : dst,
+                              x->skip_encode ? p->src.stride : pd->dst.stride,
+                              dst, pd->dst.stride);
       if (!x->skip_recode) {
         vp9_subtract_block(16, 16, src_diff, diff_stride,
                            src, p->src.stride, dst, pd->dst.stride);
@@ -599,7 +603,9 @@
       mode = plane == 0 ? mbmi->mode : mbmi->uv_mode;
       block >>= 2;
       vp9_predict_intra_block(xd, block, bwl, TX_8X8, mode,
-                              dst, pd->dst.stride, dst, pd->dst.stride);
+                              x->skip_encode ? src : dst,
+                              x->skip_encode ? p->src.stride : pd->dst.stride,
+                              dst, pd->dst.stride);
       if (!x->skip_recode) {
         vp9_subtract_block(8, 8, src_diff, diff_stride,
                            src, p->src.stride, dst, pd->dst.stride);
@@ -620,7 +626,9 @@
         mode = plane == 0 ? mbmi->mode : mbmi->uv_mode;
 
       vp9_predict_intra_block(xd, block, bwl, TX_4X4, mode,
-                              dst, pd->dst.stride, dst, pd->dst.stride);
+                              x->skip_encode ? src : dst,
+                              x->skip_encode ? p->src.stride : pd->dst.stride,
+                              dst, pd->dst.stride);
 
       if (!x->skip_recode) {
         vp9_subtract_block(4, 4, src_diff, diff_stride,

diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c
index 924f9f3..50d8036 100644
--- a/vp9/encoder/vp9_firstpass.c
+++ b/vp9/encoder/vp9_firstpass.c

@@ -76,6 +76,19 @@
   return ret_val;
 }
 
+static int gfboost_qadjust(int qindex) {
+  const double q = vp9_convert_qindex_to_q(qindex);
+  return (int)((0.00000828 * q * q * q) +
+               (-0.0055 * q * q) +
+               (1.32 * q) + 79.3);
+}
+
+static int kfboost_qadjust(int qindex) {
+  const double q = vp9_convert_qindex_to_q(qindex);
+  return (int)((0.00000973 * q * q * q) +
+               (-0.00613 * q * q) +
+               (1.316 * q) + 121.2);
+}
 
 // Resets the first pass file to the given position using a relative seek from
 // the current position.
@@ -530,7 +543,7 @@
 
   for (i = 0; i < MAX_MB_PLANE; ++i) {
     p[i].coeff = ctx->coeff_pbuf[i][1];
-    pd[i].qcoeff = ctx->qcoeff_pbuf[i][1];
+    p[i].qcoeff = ctx->qcoeff_pbuf[i][1];
     pd[i].dqcoeff = ctx->dqcoeff_pbuf[i][1];
     pd[i].eobs = ctx->eobs_pbuf[i][1];
   }
@@ -928,11 +941,11 @@
   intra_cost = bitcost(av_intra);
 
   // Estimate of extra bits per mv overhead for mbs
-  // << 9 is the normalization to the (bits * 512) used in vp9_bits_per_mb
+  // << 9 is the normalization to the (bits * 512) used in vp9_rc_bits_per_mb
   mv_cost = ((int)(fpstats->new_mv_count / fpstats->count) * 8) << 9;
 
   // Crude estimate of overhead cost from modes
-  // << 9 is the normalization to (bits * 512) used in vp9_bits_per_mb
+  // << 9 is the normalization to (bits * 512) used in vp9_rc_bits_per_mb
   mode_cost =
     (int)((((av_pct_inter - av_pct_motion) * zz_cost) +
            (av_pct_motion * motion_cost) +
@@ -1052,8 +1065,8 @@
                                 sr_correction * speed_correction *
                                 cpi->twopass.est_max_qcorrection_factor;
 
-    bits_per_mb_at_this_q = vp9_bits_per_mb(INTER_FRAME, q,
-                                            err_correction_factor);
+    bits_per_mb_at_this_q = vp9_rc_bits_per_mb(INTER_FRAME, q,
+                                               err_correction_factor);
 
     if (bits_per_mb_at_this_q <= target_norm_bits_per_mb)
       break;
@@ -1140,7 +1153,7 @@
       sr_correction * speed_correction * clip_iifactor;
 
     bits_per_mb_at_this_q =
-      vp9_bits_per_mb(INTER_FRAME, q, err_correction_factor);
+      vp9_rc_bits_per_mb(INTER_FRAME, q, err_correction_factor);
 
     if (bits_per_mb_at_this_q <= target_norm_bits_per_mb)
       break;
@@ -1936,7 +1949,7 @@
     int q = cpi->rc.last_q[INTER_FRAME];
     int gf_bits;
 
-    int boost = (cpi->rc.gfu_boost * vp9_gfboost_qadjust(q)) / 100;
+    int boost = (cpi->rc.gfu_boost * gfboost_qadjust(q)) / 100;
 
     // Set max and minimum boost and hence minimum allocation
     boost = clamp(boost, 125, (cpi->rc.baseline_gf_interval + 1) * 200);
@@ -2728,3 +2741,21 @@
   // sizes.
   cpi->twopass.modified_error_left -= kf_group_err;
 }
+
+void vp9_twopass_postencode_update(VP9_COMP *cpi, uint64_t bytes_used) {
+#ifdef DISABLE_RC_LONG_TERM_MEM
+  cpi->twopass.bits_left -=  cpi->rc.this_frame_target;
+#else
+  cpi->twopass.bits_left -= 8 * bytes_used;
+#endif
+  if (!cpi->refresh_alt_ref_frame) {
+    double lower_bounds_min_rate = FRAME_OVERHEAD_BITS * cpi->oxcf.framerate;
+    double two_pass_min_rate = (double)(cpi->oxcf.target_bandwidth *
+                                        cpi->oxcf.two_pass_vbrmin_section
+                                        / 100);
+    if (two_pass_min_rate < lower_bounds_min_rate)
+      two_pass_min_rate = lower_bounds_min_rate;
+    cpi->twopass.bits_left += (int64_t)(two_pass_min_rate /
+                                        cpi->oxcf.framerate);
+  }
+}

diff --git a/vp9/encoder/vp9_onyx_if.c b/vp9/encoder/vp9_onyx_if.c
index b289391..968cbfb 100644
--- a/vp9/encoder/vp9_onyx_if.c
+++ b/vp9/encoder/vp9_onyx_if.c

@@ -39,6 +39,9 @@
 
 #include "vpx_ports/vpx_timer.h"
 
+void vp9_entropy_mode_init();
+void vp9_coef_tree_initialize();
+
 static void set_default_lf_deltas(struct loopfilter *lf);
 
 #define DEFAULT_INTERP_FILTER SWITCHABLE
@@ -159,9 +162,11 @@
     vp9_tokenize_initialize();
     vp9_init_quant_tables();
     vp9_init_me_luts();
-    vp9_init_minq_luts();
+    vp9_rc_init_minq_luts();
     // init_base_skip_probs();
     vp9_entropy_mv_init();
+    vp9_entropy_mode_init();
+    vp9_coef_tree_initialize();
     init_done = 1;
   }
 }
@@ -265,8 +270,8 @@
   vp9_clear_system_state();
 
   // Look up the current projected bits per block for the base index
-  base_bits_per_mb = vp9_bits_per_mb(cpi->common.frame_type,
-                                     base_q_index, 1.0);
+  base_bits_per_mb = vp9_rc_bits_per_mb(cpi->common.frame_type,
+                                        base_q_index, 1.0);
 
   // Find the target bits per mb based on the base value and given ratio.
   target_bits_per_mb = rate_target_ratio * base_bits_per_mb;
@@ -274,8 +279,8 @@
   // Convert the q target to an index
   for (i = cpi->rc.best_quality; i < cpi->rc.worst_quality; i++) {
     target_index = i;
-    if (vp9_bits_per_mb(cpi->common.frame_type,
-                        i, 1.0) <= target_bits_per_mb )
+    if (vp9_rc_bits_per_mb(cpi->common.frame_type,
+                           i, 1.0) <= target_bits_per_mb )
       break;
   }
 
@@ -2683,7 +2688,7 @@
 static void encode_with_recode_loop(VP9_COMP *cpi,
                                     unsigned long *size,
                                     uint8_t *dest,
-                                    int q,
+                                    int *q,
                                     int bottom_index,
                                     int top_index,
                                     int frame_over_shoot_limit,
@@ -2697,7 +2702,7 @@
   do {
     vp9_clear_system_state();  // __asm emms;
 
-    vp9_set_quantizer(cpi, q);
+    vp9_set_quantizer(cpi, *q);
 
     if (loop_count == 0) {
       // Set up entropy context depending on frame type. The decoder mandates
@@ -2750,7 +2755,7 @@
     } else {
       // Special case handling for forced key frames
       if ((cm->frame_type == KEY_FRAME) && cpi->this_key_frame_forced) {
-        int last_q = q;
+        int last_q = *q;
         int kf_err = vp9_calc_ss_err(cpi->Source, get_frame_new_buffer(cm));
 
         int high_err_target = cpi->ambient_err;
@@ -2766,32 +2771,32 @@
             (kf_err > low_err_target &&
              cpi->rc.projected_frame_size <= frame_under_shoot_limit)) {
           // Lower q_high
-          q_high = q > q_low ? q - 1 : q_low;
+          q_high = *q > q_low ? *q - 1 : q_low;
 
           // Adjust Q
-          q = (q * high_err_target) / kf_err;
-          q = MIN(q, (q_high + q_low) >> 1);
+          *q = ((*q) * high_err_target) / kf_err;
+          *q = MIN((*q), (q_high + q_low) >> 1);
         } else if (kf_err < low_err_target &&
                    cpi->rc.projected_frame_size >= frame_under_shoot_limit) {
           // The key frame is much better than the previous frame
           // Raise q_low
-          q_low = q < q_high ? q + 1 : q_high;
+          q_low = *q < q_high ? *q + 1 : q_high;
 
           // Adjust Q
-          q = (q * low_err_target) / kf_err;
-          q = MIN(q, (q_high + q_low + 1) >> 1);
+          *q = ((*q) * low_err_target) / kf_err;
+          *q = MIN((*q), (q_high + q_low + 1) >> 1);
         }
 
         // Clamp Q to upper and lower limits:
-        q = clamp(q, q_low, q_high);
+        *q = clamp(*q, q_low, q_high);
 
-        loop = q != last_q;
+        loop = *q != last_q;
       } else if (recode_loop_test(
           cpi, frame_over_shoot_limit, frame_under_shoot_limit,
-          q, top_index, bottom_index)) {
+          *q, top_index, bottom_index)) {
         // Is the projected frame size out of range and are we allowed
         // to attempt to recode.
-        int last_q = q;
+        int last_q = *q;
         int retries = 0;
 
         // Frame size out of permitted range:
@@ -2800,22 +2805,22 @@
         // Frame is too large
         if (cpi->rc.projected_frame_size > cpi->rc.this_frame_target) {
           // Raise Qlow as to at least the current value
-          q_low = q < q_high ? q + 1 : q_high;
+          q_low = *q < q_high ? *q + 1 : q_high;
 
           if (undershoot_seen || loop_count > 1) {
             // Update rate_correction_factor unless
-            vp9_update_rate_correction_factors(cpi, 1);
+            vp9_rc_update_rate_correction_factors(cpi, 1);
 
-            q = (q_high + q_low + 1) / 2;
+            *q = (q_high + q_low + 1) / 2;
           } else {
             // Update rate_correction_factor unless
-            vp9_update_rate_correction_factors(cpi, 0);
+            vp9_rc_update_rate_correction_factors(cpi, 0);
 
-            q = vp9_regulate_q(cpi, cpi->rc.this_frame_target);
+            *q = vp9_rc_regulate_q(cpi, cpi->rc.this_frame_target);
 
-            while (q < q_low && retries < 10) {
-              vp9_update_rate_correction_factors(cpi, 0);
-              q = vp9_regulate_q(cpi, cpi->rc.this_frame_target);
+            while (*q < q_low && retries < 10) {
+              vp9_rc_update_rate_correction_factors(cpi, 0);
+              *q = vp9_rc_regulate_q(cpi, cpi->rc.this_frame_target);
               retries++;
             }
           }
@@ -2823,32 +2828,33 @@
           overshoot_seen = 1;
         } else {
           // Frame is too small
-          q_high = q > q_low ? q - 1 : q_low;
+          q_high = *q > q_low ? *q - 1 : q_low;
 
           if (overshoot_seen || loop_count > 1) {
             // Update rate_correction_factor unless
             // cpi->rc.active_worst_quality has changed.
-            vp9_update_rate_correction_factors(cpi, 1);
+            vp9_rc_update_rate_correction_factors(cpi, 1);
 
-            q = (q_high + q_low) / 2;
+            *q = (q_high + q_low) / 2;
           } else {
             // Update rate_correction_factor unless
             // cpi->rc.active_worst_quality has changed.
-            vp9_update_rate_correction_factors(cpi, 0);
+            vp9_rc_update_rate_correction_factors(cpi, 0);
 
-            q = vp9_regulate_q(cpi, cpi->rc.this_frame_target);
+            *q = vp9_rc_regulate_q(cpi, cpi->rc.this_frame_target);
 
             // Special case reset for qlow for constrained quality.
             // This should only trigger where there is very substantial
             // undershoot on a frame and the auto cq level is above
             // the user passsed in value.
-            if (cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY && q < q_low) {
-              q_low = q;
+            if (cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY &&
+                *q < q_low) {
+              q_low = *q;
             }
 
-            while (q > q_high && retries < 10) {
-              vp9_update_rate_correction_factors(cpi, 0);
-              q = vp9_regulate_q(cpi, cpi->rc.this_frame_target);
+            while (*q > q_high && retries < 10) {
+              vp9_rc_update_rate_correction_factors(cpi, 0);
+              *q = vp9_rc_regulate_q(cpi, cpi->rc.this_frame_target);
               retries++;
             }
           }
@@ -2857,9 +2863,9 @@
         }
 
         // Clamp Q to upper and lower limits:
-        q = clamp(q, q_low, q_high);
+        *q = clamp(*q, q_low, q_high);
 
-        loop = q != last_q;
+        loop = *q != last_q;
       } else {
         loop = 0;
       }
@@ -3000,15 +3006,16 @@
     configure_static_seg_features(cpi);
   }
 
-  // Decide how big to make the frame.
-  vp9_pick_frame_size(cpi);
-
   vp9_clear_system_state();
 
-  q = vp9_pick_q_and_adjust_q_bounds(cpi, &bottom_index, &top_index);
+  // Decide how big to make the frame.
+  vp9_rc_pick_frame_size_and_bounds(cpi,
+                                    &frame_under_shoot_limit,
+                                    &frame_over_shoot_limit);
 
-  vp9_compute_frame_size_bounds(cpi, &frame_under_shoot_limit,
-                                &frame_over_shoot_limit);
+  q = vp9_rc_pick_q_and_adjust_q_bounds(cpi,
+                                        &bottom_index,
+                                        &top_index);
 
 #if CONFIG_MULTIPLE_ARF
   // Force the quantizer determined by the coding order pattern.
@@ -3072,7 +3079,7 @@
   encode_with_recode_loop(cpi,
                           size,
                           dest,
-                          q,
+                          &q,
                           bottom_index,
                           top_index,
                           frame_over_shoot_limit,
@@ -3154,106 +3161,7 @@
    * needed in motion search besides loopfilter */
   cm->last_frame_type = cm->frame_type;
 
-  // Update rate control heuristics
-  cpi->rc.projected_frame_size = (*size) << 3;
-
-  // Post encode loop adjustment of Q prediction.
-  vp9_update_rate_correction_factors(
-      cpi, (cpi->sf.recode_loop ||
-            cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER) ? 2 : 0);
-
-
-  cpi->rc.last_q[cm->frame_type] = cm->base_qindex;
-
-  // Keep record of last boosted (KF/KF/ARF) Q value.
-  // If the current frame is coded at a lower Q then we also update it.
-  // If all mbs in this group are skipped only update if the Q value is
-  // better than that already stored.
-  // This is used to help set quality in forced key frames to reduce popping
-  if ((cm->base_qindex < cpi->rc.last_boosted_qindex) ||
-      ((cpi->static_mb_pct < 100) &&
-       ((cm->frame_type == KEY_FRAME) ||
-        cpi->refresh_alt_ref_frame ||
-        (cpi->refresh_golden_frame && !cpi->is_src_frame_alt_ref)))) {
-    cpi->rc.last_boosted_qindex = cm->base_qindex;
-  }
-
-  if (cm->frame_type == KEY_FRAME) {
-    vp9_adjust_key_frame_context(cpi);
-  }
-
-  // Keep a record of ambient average Q.
-  if (cm->frame_type != KEY_FRAME)
-    cpi->rc.avg_frame_qindex = (2 + 3 * cpi->rc.avg_frame_qindex +
-                            cm->base_qindex) >> 2;
-
-  // Keep a record from which we can calculate the average Q excluding GF
-  // updates and key frames.
-  if (cm->frame_type != KEY_FRAME &&
-      !cpi->refresh_golden_frame &&
-      !cpi->refresh_alt_ref_frame) {
-    cpi->rc.ni_frames++;
-    cpi->rc.tot_q += vp9_convert_qindex_to_q(q);
-    cpi->rc.avg_q = cpi->rc.tot_q / (double)cpi->rc.ni_frames;
-
-    // Calculate the average Q for normal inter frames (not key or GFU frames).
-    cpi->rc.ni_tot_qi += q;
-    cpi->rc.ni_av_qi = cpi->rc.ni_tot_qi / cpi->rc.ni_frames;
-  }
-
-  // Update the buffer level variable.
-  // Non-viewable frames are a special case and are treated as pure overhead.
-  if (!cm->show_frame)
-    cpi->rc.bits_off_target -= cpi->rc.projected_frame_size;
-  else
-    cpi->rc.bits_off_target += cpi->rc.av_per_frame_bandwidth -
-                               cpi->rc.projected_frame_size;
-
-  // Clip the buffer level at the maximum buffer size
-  if (cpi->rc.bits_off_target > cpi->oxcf.maximum_buffer_size)
-    cpi->rc.bits_off_target = cpi->oxcf.maximum_buffer_size;
-
-  // Rolling monitors of whether we are over or underspending used to help
-  // regulate min and Max Q in two pass.
-  if (cm->frame_type != KEY_FRAME) {
-    cpi->rc.rolling_target_bits =
-        ((cpi->rc.rolling_target_bits * 3) +
-         cpi->rc.this_frame_target + 2) / 4;
-    cpi->rc.rolling_actual_bits =
-        ((cpi->rc.rolling_actual_bits * 3) +
-         cpi->rc.projected_frame_size + 2) / 4;
-    cpi->rc.long_rolling_target_bits =
-        ((cpi->rc.long_rolling_target_bits * 31) +
-         cpi->rc.this_frame_target + 16) / 32;
-    cpi->rc.long_rolling_actual_bits =
-        ((cpi->rc.long_rolling_actual_bits * 31) +
-         cpi->rc.projected_frame_size + 16) / 32;
-  }
-
-  // Actual bits spent
-  cpi->rc.total_actual_bits += cpi->rc.projected_frame_size;
-
-  // Debug stats
-  cpi->rc.total_target_vs_actual += (cpi->rc.this_frame_target -
-                                     cpi->rc.projected_frame_size);
-
-  cpi->rc.buffer_level = cpi->rc.bits_off_target;
-
-#ifndef DISABLE_RC_LONG_TERM_MEM
-  // Update bits left to the kf and gf groups to account for overshoot or
-  // undershoot on these frames
-  if (cm->frame_type == KEY_FRAME) {
-    cpi->twopass.kf_group_bits += cpi->rc.this_frame_target -
-                                  cpi->rc.projected_frame_size;
-
-    cpi->twopass.kf_group_bits = MAX(cpi->twopass.kf_group_bits, 0);
-  } else if (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame) {
-    cpi->twopass.gf_group_bits += cpi->rc.this_frame_target -
-                                  cpi->rc.projected_frame_size;
-
-    cpi->twopass.gf_group_bits = MAX(cpi->twopass.gf_group_bits, 0);
-  }
-#endif
+  vp9_rc_postencode_update(cpi, *size, q);
 
 #if 0
   output_frame_level_debug_stats(cpi);
@@ -3383,6 +3291,10 @@
 
   encode_frame_to_data_rate(cpi, size, dest, frame_flags);
   // vp9_print_modes_and_motion_vectors(&cpi->common, "encode.stt");
+
+  vp9_twopass_postencode_update(cpi, *size);
+
+  /*
 #ifdef DISABLE_RC_LONG_TERM_MEM
   cpi->twopass.bits_left -=  cpi->rc.this_frame_target;
 #else
@@ -3401,6 +3313,7 @@
     cpi->twopass.bits_left += (int64_t)(two_pass_min_rate
                               / cpi->oxcf.framerate);
   }
+  */
 }
 
 static void check_initial_width(VP9_COMP *cpi, YV12_BUFFER_CONFIG *sd) {

diff --git a/vp9/encoder/vp9_onyx_int.h b/vp9/encoder/vp9_onyx_int.h
index 03002ef..b8602e0 100644
--- a/vp9/encoder/vp9_onyx_int.h
+++ b/vp9/encoder/vp9_onyx_int.h

@@ -30,7 +30,6 @@
 #include "vp9/encoder/vp9_lookahead.h"
 
 #define DISABLE_RC_LONG_TERM_MEM 0
-
 // #define MODE_TEST_HIT_STATS
 
 // #define SPEEDSTATS 1

diff --git a/vp9/encoder/vp9_quantize.c b/vp9/encoder/vp9_quantize.c
index d24be96..2591a57 100644
--- a/vp9/encoder/vp9_quantize.c
+++ b/vp9/encoder/vp9_quantize.c

@@ -137,45 +137,18 @@
   *eob_ptr = eob + 1;
 }
 
-struct plane_block_idx {
-  int plane;
-  int block;
-};
-
-// TODO(jkoleszar): returning a struct so it can be used in a const context,
-// expect to refactor this further later.
-static INLINE struct plane_block_idx plane_block_idx(int y_blocks,
-                                                     int b_idx) {
-  const int v_offset = y_blocks * 5 / 4;
-  struct plane_block_idx res;
-
-  if (b_idx < y_blocks) {
-    res.plane = 0;
-    res.block = b_idx;
-  } else if (b_idx < v_offset) {
-    res.plane = 1;
-    res.block = b_idx - y_blocks;
-  } else {
-    assert(b_idx < y_blocks * 3 / 2);
-    res.plane = 2;
-    res.block = b_idx - v_offset;
-  }
-  return res;
-}
-
-void vp9_regular_quantize_b_4x4(MACROBLOCK *x, int y_blocks, int b_idx,
+void vp9_regular_quantize_b_4x4(MACROBLOCK *x, int plane, int block,
                                 const int16_t *scan, const int16_t *iscan) {
   MACROBLOCKD *const xd = &x->e_mbd;
-  const struct plane_block_idx pb_idx = plane_block_idx(y_blocks, b_idx);
-  struct macroblock_plane* p = &x->plane[pb_idx.plane];
-  struct macroblockd_plane* pd = &xd->plane[pb_idx.plane];
+  struct macroblock_plane* p = &x->plane[plane];
+  struct macroblockd_plane* pd = &xd->plane[plane];
 
-  vp9_quantize_b(BLOCK_OFFSET(p->coeff, pb_idx.block),
+  vp9_quantize_b(BLOCK_OFFSET(p->coeff, block),
            16, x->skip_block,
            p->zbin, p->round, p->quant, p->quant_shift,
-           BLOCK_OFFSET(pd->qcoeff, pb_idx.block),
-           BLOCK_OFFSET(pd->dqcoeff, pb_idx.block),
-           pd->dequant, p->zbin_extra, &pd->eobs[pb_idx.block], scan, iscan);
+           BLOCK_OFFSET(p->qcoeff, block),
+           BLOCK_OFFSET(pd->dqcoeff, block),
+           pd->dequant, p->zbin_extra, &pd->eobs[block], scan, iscan);
 }
 
 static void invert_quant(int16_t *quant, int16_t *shift, int d) {

diff --git a/vp9/encoder/vp9_quantize.h b/vp9/encoder/vp9_quantize.h
index c078e1d..41cfa52 100644
--- a/vp9/encoder/vp9_quantize.h
+++ b/vp9/encoder/vp9_quantize.h

@@ -13,7 +13,7 @@
 
 #include "vp9/encoder/vp9_block.h"
 
-void vp9_regular_quantize_b_4x4(MACROBLOCK *x, int y_blocks, int b_idx,
+void vp9_regular_quantize_b_4x4(MACROBLOCK *x, int plane, int block,
                                 const int16_t *scan, const int16_t *iscan);
 
 struct VP9_COMP;

diff --git a/vp9/encoder/vp9_ratectrl.c b/vp9/encoder/vp9_ratectrl.c
index 42372e5..bf1fc4f 100644
--- a/vp9/encoder/vp9_ratectrl.c
+++ b/vp9/encoder/vp9_ratectrl.c

@@ -26,6 +26,8 @@
 #include "vp9/common/vp9_quant_common.h"
 #include "vp9/common/vp9_seg_common.h"
 
+#define LIMIT_QRANGE_FOR_ALTREF_AND_KEY 1
+
 #define MIN_BPB_FACTOR 0.005
 #define MAX_BPB_FACTOR 50
 
@@ -67,7 +69,7 @@
   return QINDEX_RANGE - 1;
 }
 
-void vp9_init_minq_luts(void) {
+void vp9_rc_init_minq_luts(void) {
   int i;
 
   for (i = 0; i < QINDEX_RANGE; i++) {
@@ -121,22 +123,8 @@
   return vp9_ac_quant(qindex, 0) / 4.0;
 }
 
-int vp9_gfboost_qadjust(int qindex) {
-  const double q = vp9_convert_qindex_to_q(qindex);
-  return (int)((0.00000828 * q * q * q) +
-               (-0.0055 * q * q) +
-               (1.32 * q) + 79.3);
-}
-
-static int kfboost_qadjust(int qindex) {
-  const double q = vp9_convert_qindex_to_q(qindex);
-  return (int)((0.00000973 * q * q * q) +
-               (-0.00613 * q * q) +
-               (1.316 * q) + 121.2);
-}
-
-int vp9_bits_per_mb(FRAME_TYPE frame_type, int qindex,
-                    double correction_factor) {
+int vp9_rc_bits_per_mb(FRAME_TYPE frame_type, int qindex,
+                       double correction_factor) {
   const double q = vp9_convert_qindex_to_q(qindex);
   int enumerator = frame_type == KEY_FRAME ? 3300000 : 2250000;
 
@@ -213,7 +201,7 @@
 
 static int estimate_bits_at_q(int frame_kind, int q, int mbs,
                               double correction_factor) {
-  const int bpm = (int)(vp9_bits_per_mb(frame_kind, q, correction_factor));
+  const int bpm = (int)(vp9_rc_bits_per_mb(frame_kind, q, correction_factor));
 
   // Attempt to retain reasonable accuracy without overflow. The cutoff is
   // chosen such that the maximum product of Bpm and MBs fits 31 bits. The
@@ -240,15 +228,9 @@
     if (target > max_rate)
       target = max_rate;
   }
-
   cpi->rc.this_frame_target = target;
-
-  // Target rate per SB64 (including partial SB64s.
-  cpi->rc.sb64_target_rate = ((int64_t)cpi->rc.this_frame_target * 64 * 64) /
-                             (cpi->common.width * cpi->common.height);
 }
 
-
 //  Do the best we can to define the parameters for the next GF based
 //  on what information we have available.
 //
@@ -273,11 +255,6 @@
     cpi->rc.this_frame_target = cpi->rc.per_frame_bandwidth;
   }
 
-  // Target rate per SB64 (including partial SB64s.
-  cpi->rc.sb64_target_rate = ((int64_t)cpi->rc.this_frame_target * 64 * 64) /
-                             (cpi->common.width * cpi->common.height);
-
-
   // Check that the total sum of adjustments is not above the maximum allowed.
   // That is, having allowed for the KF and GF penalties, we have not pushed
   // the current inter-frame target too low. If the adjustment we apply here is
@@ -309,7 +286,7 @@
 }
 
 
-void vp9_update_rate_correction_factors(VP9_COMP *cpi, int damp_var) {
+void vp9_rc_update_rate_correction_factors(VP9_COMP *cpi, int damp_var) {
   const int q = cpi->common.base_qindex;
   int correction_factor = 100;
   double rate_correction_factor;
@@ -390,7 +367,7 @@
 }
 
 
-int vp9_regulate_q(const VP9_COMP *cpi, int target_bits_per_frame) {
+int vp9_rc_regulate_q(const VP9_COMP *cpi, int target_bits_per_frame) {
   int q = cpi->rc.active_worst_quality;
 
   int i;
@@ -422,8 +399,8 @@
   i = cpi->rc.active_best_quality;
 
   do {
-    bits_per_mb_at_this_q = (int)vp9_bits_per_mb(cpi->common.frame_type, i,
-                                                 correction_factor);
+    bits_per_mb_at_this_q = (int)vp9_rc_bits_per_mb(cpi->common.frame_type, i,
+                                                    correction_factor);
 
     if (bits_per_mb_at_this_q <= target_bits_per_mb) {
       if ((target_bits_per_mb - bits_per_mb_at_this_q) <= last_error)
@@ -461,8 +438,9 @@
   return active_best_quality;
 }
 
-int vp9_pick_q_and_adjust_q_bounds(VP9_COMP *cpi,
-                                   int * bottom_index, int * top_index) {
+int vp9_rc_pick_q_and_adjust_q_bounds(VP9_COMP *cpi,
+                                      int *bottom_index,
+                                      int *top_index) {
   // Set an active best quality and if necessary active worst quality
   int q = cpi->rc.active_worst_quality;
   VP9_COMMON *const cm = &cpi->common;
@@ -481,7 +459,12 @@
                                         (last_boosted_q * 0.75));
 
       cpi->rc.active_best_quality = MAX(qindex + delta_qindex,
-                                     cpi->rc.best_quality);
+                                        cpi->rc.best_quality);
+    } else if (cpi->pass == 0 && cpi->common.current_video_frame == 0) {
+      // If this is the first (key) frame in 1-pass, active best/worst is
+      // the user best/worst-allowed, and leave the top_index to active_worst.
+      cpi->rc.active_best_quality = cpi->oxcf.best_allowed_q;
+      cpi->rc.active_worst_quality = cpi->oxcf.worst_allowed_q;
     } else {
       int high = 5000;
       int low = 400;
@@ -490,9 +473,9 @@
 
       // Baseline value derived from cpi->active_worst_quality and kf boost
       cpi->rc.active_best_quality = get_active_quality(q, cpi->rc.kf_boost,
-                                                    low, high,
-                                                    kf_low_motion_minq,
-                                                    kf_high_motion_minq);
+                                                       low, high,
+                                                       kf_low_motion_minq,
+                                                       kf_high_motion_minq);
 
       // Allow somewhat lower kf minq with small image formats.
       if ((cm->width * cm->height) <= (352 * 288)) {
@@ -533,14 +516,14 @@
         q = cpi->cq_target_quality;
       if (cpi->frames_since_key > 1) {
         cpi->rc.active_best_quality = get_active_quality(q, cpi->rc.gfu_boost,
-                                                      low, high,
-                                                      afq_low_motion_minq,
-                                                      afq_high_motion_minq);
+                                                         low, high,
+                                                         afq_low_motion_minq,
+                                                         afq_high_motion_minq);
       } else {
         cpi->rc.active_best_quality = get_active_quality(q, cpi->rc.gfu_boost,
-                                                      low, high,
-                                                      gf_low_motion_minq,
-                                                      gf_high_motion_minq);
+                                                         low, high,
+                                                         gf_low_motion_minq,
+                                                         gf_high_motion_minq);
       }
       // Constrained quality use slightly lower active best.
       cpi->rc.active_best_quality = cpi->rc.active_best_quality * 15 / 16;
@@ -550,22 +533,19 @@
         cpi->rc.active_best_quality = cpi->cq_target_quality;
       } else {
         if (cpi->frames_since_key > 1) {
-          cpi->rc.active_best_quality = get_active_quality(q, cpi->rc.gfu_boost,
-                                                        low, high,
-                                                        afq_low_motion_minq,
-                                                        afq_high_motion_minq);
+          cpi->rc.active_best_quality = get_active_quality(
+              q, cpi->rc.gfu_boost, low, high,
+              afq_low_motion_minq, afq_high_motion_minq);
         } else {
-          cpi->rc.active_best_quality = get_active_quality(q, cpi->rc.gfu_boost,
-                                                        low, high,
-                                                        gf_low_motion_minq,
-                                                        gf_high_motion_minq);
+          cpi->rc.active_best_quality = get_active_quality(
+              q, cpi->rc.gfu_boost, low, high,
+              gf_low_motion_minq, gf_high_motion_minq);
         }
       }
     } else {
-        cpi->rc.active_best_quality = get_active_quality(q, cpi->rc.gfu_boost,
-                                                         low, high,
-                                                         gf_low_motion_minq,
-                                                         gf_high_motion_minq);
+        cpi->rc.active_best_quality = get_active_quality(
+            q, cpi->rc.gfu_boost, low, high,
+            gf_low_motion_minq, gf_high_motion_minq);
     }
   } else {
     if (cpi->oxcf.end_usage == USAGE_CONSTANT_QUALITY) {
@@ -605,25 +585,23 @@
   if (cpi->rc.active_worst_quality < cpi->rc.active_best_quality)
     cpi->rc.active_worst_quality = cpi->rc.active_best_quality;
 
+  *top_index = cpi->rc.active_worst_quality;
+  *bottom_index = cpi->rc.active_best_quality;
+
+#if LIMIT_QRANGE_FOR_ALTREF_AND_KEY
   // Limit Q range for the adaptive loop.
   if (cm->frame_type == KEY_FRAME && !cpi->this_key_frame_forced) {
-    *top_index =
-      (cpi->rc.active_worst_quality + cpi->rc.active_best_quality * 3) / 4;
-    // If this is the first (key) frame in 1-pass, active best is the user
-    // best-allowed, and leave the top_index to active_worst.
-    if (cpi->pass == 0 && cpi->common.current_video_frame == 0) {
-      cpi->rc.active_best_quality = cpi->oxcf.best_allowed_q;
-      *top_index = cpi->oxcf.worst_allowed_q;
+    if (!(cpi->pass == 0 && cpi->common.current_video_frame == 0)) {
+      *top_index =
+          (cpi->rc.active_worst_quality + cpi->rc.active_best_quality * 3) / 4;
     }
   } else if (!cpi->is_src_frame_alt_ref &&
              (cpi->oxcf.end_usage != USAGE_STREAM_FROM_SERVER) &&
              (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame)) {
     *top_index =
       (cpi->rc.active_worst_quality + cpi->rc.active_best_quality) / 2;
-  } else {
-    *top_index = cpi->rc.active_worst_quality;
   }
-  *bottom_index = cpi->rc.active_best_quality;
+#endif
 
   if (cpi->oxcf.end_usage == USAGE_CONSTANT_QUALITY) {
     q = cpi->rc.active_best_quality;
@@ -636,14 +614,13 @@
       // 1-pass: for now, use per-frame-bw for target size of frame, scaled
       // by |x| for key frame.
       int scale = (cm->frame_type == KEY_FRAME) ? 5 : 1;
-      q = vp9_regulate_q(cpi, scale * cpi->rc.av_per_frame_bandwidth);
+      q = vp9_rc_regulate_q(cpi, scale * cpi->rc.av_per_frame_bandwidth);
     } else {
-      q = vp9_regulate_q(cpi, cpi->rc.this_frame_target);
+      q = vp9_rc_regulate_q(cpi, cpi->rc.this_frame_target);
     }
     if (q > *top_index)
       q = *top_index;
   }
-
   return q;
 }
 
@@ -695,7 +672,7 @@
 }
 
 
-void vp9_adjust_key_frame_context(VP9_COMP *cpi) {
+static void adjust_key_frame_context(VP9_COMP *cpi) {
   // Clear down mmx registers to allow floating point in what follows
   vp9_clear_system_state();
 
@@ -704,28 +681,30 @@
 }
 
 
-void vp9_compute_frame_size_bounds(VP9_COMP *cpi, int *frame_under_shoot_limit,
-                                   int *frame_over_shoot_limit) {
+static void compute_frame_size_bounds(const VP9_COMP *cpi,
+                                      int this_frame_target,
+                                      int *frame_under_shoot_limit,
+                                      int *frame_over_shoot_limit) {
   // Set-up bounds on acceptable frame size:
   if (cpi->oxcf.end_usage == USAGE_CONSTANT_QUALITY) {
     *frame_under_shoot_limit = 0;
     *frame_over_shoot_limit  = INT_MAX;
   } else {
     if (cpi->common.frame_type == KEY_FRAME) {
-      *frame_over_shoot_limit  = cpi->rc.this_frame_target * 9 / 8;
-      *frame_under_shoot_limit = cpi->rc.this_frame_target * 7 / 8;
+      *frame_over_shoot_limit  = this_frame_target * 9 / 8;
+      *frame_under_shoot_limit = this_frame_target * 7 / 8;
     } else {
       if (cpi->refresh_alt_ref_frame || cpi->refresh_golden_frame) {
-        *frame_over_shoot_limit  = cpi->rc.this_frame_target * 9 / 8;
-        *frame_under_shoot_limit = cpi->rc.this_frame_target * 7 / 8;
+        *frame_over_shoot_limit  = this_frame_target * 9 / 8;
+        *frame_under_shoot_limit = this_frame_target * 7 / 8;
       } else {
         // Stron overshoot limit for constrained quality
         if (cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY) {
-          *frame_over_shoot_limit  = cpi->rc.this_frame_target * 11 / 8;
-          *frame_under_shoot_limit = cpi->rc.this_frame_target * 2 / 8;
+          *frame_over_shoot_limit  = this_frame_target * 11 / 8;
+          *frame_under_shoot_limit = this_frame_target * 2 / 8;
         } else {
-          *frame_over_shoot_limit  = cpi->rc.this_frame_target * 11 / 8;
-          *frame_under_shoot_limit = cpi->rc.this_frame_target * 5 / 8;
+          *frame_over_shoot_limit  = this_frame_target * 11 / 8;
+          *frame_under_shoot_limit = this_frame_target * 5 / 8;
         }
       }
     }
@@ -740,9 +719,10 @@
   }
 }
 
-
 // return of 0 means drop frame
-int vp9_pick_frame_size(VP9_COMP *cpi) {
+int vp9_rc_pick_frame_size_and_bounds(VP9_COMP *cpi,
+                                      int *frame_under_shoot_limit,
+                                      int *frame_over_shoot_limit) {
   VP9_COMMON *cm = &cpi->common;
 
   if (cm->frame_type == KEY_FRAME)
@@ -750,5 +730,112 @@
   else
     calc_pframe_target_size(cpi);
 
+  // Target rate per SB64 (including partial SB64s.
+  cpi->rc.sb64_target_rate = ((int64_t)cpi->rc.this_frame_target * 64 * 64) /
+                             (cpi->common.width * cpi->common.height);
+  compute_frame_size_bounds(cpi, cpi->rc.this_frame_target,
+                            frame_under_shoot_limit, frame_over_shoot_limit);
+
   return 1;
 }
+
+void vp9_rc_postencode_update(VP9_COMP *cpi, uint64_t bytes_used, int q) {
+  VP9_COMMON *const cm = &cpi->common;
+  // Update rate control heuristics
+  cpi->rc.projected_frame_size = (bytes_used << 3);
+
+  // Post encode loop adjustment of Q prediction.
+  vp9_rc_update_rate_correction_factors(
+      cpi, (cpi->sf.recode_loop ||
+            cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER) ? 2 : 0);
+
+  cpi->rc.last_q[cm->frame_type] = cm->base_qindex;
+
+  // Keep record of last boosted (KF/KF/ARF) Q value.
+  // If the current frame is coded at a lower Q then we also update it.
+  // If all mbs in this group are skipped only update if the Q value is
+  // better than that already stored.
+  // This is used to help set quality in forced key frames to reduce popping
+  if ((cm->base_qindex < cpi->rc.last_boosted_qindex) ||
+      ((cpi->static_mb_pct < 100) &&
+       ((cm->frame_type == KEY_FRAME) || cpi->refresh_alt_ref_frame ||
+        (cpi->refresh_golden_frame && !cpi->is_src_frame_alt_ref)))) {
+    cpi->rc.last_boosted_qindex = cm->base_qindex;
+  }
+
+  if (cm->frame_type == KEY_FRAME) {
+    adjust_key_frame_context(cpi);
+  }
+
+  // Keep a record of ambient average Q.
+  if (cm->frame_type != KEY_FRAME)
+    cpi->rc.avg_frame_qindex = (2 + 3 * cpi->rc.avg_frame_qindex +
+                            cm->base_qindex) >> 2;
+
+  // Keep a record from which we can calculate the average Q excluding GF
+  // updates and key frames.
+  if (cm->frame_type != KEY_FRAME &&
+      !cpi->refresh_golden_frame && !cpi->refresh_alt_ref_frame) {
+    cpi->rc.ni_frames++;
+    cpi->rc.tot_q += vp9_convert_qindex_to_q(q);
+    cpi->rc.avg_q = cpi->rc.tot_q / (double)cpi->rc.ni_frames;
+
+    // Calculate the average Q for normal inter frames (not key or GFU frames).
+    cpi->rc.ni_tot_qi += q;
+    cpi->rc.ni_av_qi = cpi->rc.ni_tot_qi / cpi->rc.ni_frames;
+  }
+
+  // Update the buffer level variable.
+  // Non-viewable frames are a special case and are treated as pure overhead.
+  if (!cm->show_frame)
+    cpi->rc.bits_off_target -= cpi->rc.projected_frame_size;
+  else
+    cpi->rc.bits_off_target += cpi->rc.av_per_frame_bandwidth -
+                               cpi->rc.projected_frame_size;
+
+  // Clip the buffer level at the maximum buffer size
+  if (cpi->rc.bits_off_target > cpi->oxcf.maximum_buffer_size)
+    cpi->rc.bits_off_target = cpi->oxcf.maximum_buffer_size;
+
+  // Rolling monitors of whether we are over or underspending used to help
+  // regulate min and Max Q in two pass.
+  if (cm->frame_type != KEY_FRAME) {
+    cpi->rc.rolling_target_bits =
+        ((cpi->rc.rolling_target_bits * 3) +
+         cpi->rc.this_frame_target + 2) / 4;
+    cpi->rc.rolling_actual_bits =
+        ((cpi->rc.rolling_actual_bits * 3) +
+         cpi->rc.projected_frame_size + 2) / 4;
+    cpi->rc.long_rolling_target_bits =
+        ((cpi->rc.long_rolling_target_bits * 31) +
+         cpi->rc.this_frame_target + 16) / 32;
+    cpi->rc.long_rolling_actual_bits =
+        ((cpi->rc.long_rolling_actual_bits * 31) +
+         cpi->rc.projected_frame_size + 16) / 32;
+  }
+
+  // Actual bits spent
+  cpi->rc.total_actual_bits += cpi->rc.projected_frame_size;
+
+  // Debug stats
+  cpi->rc.total_target_vs_actual += (cpi->rc.this_frame_target -
+                                     cpi->rc.projected_frame_size);
+
+  cpi->rc.buffer_level = cpi->rc.bits_off_target;
+
+#ifndef DISABLE_RC_LONG_TERM_MEM
+  // Update bits left to the kf and gf groups to account for overshoot or
+  // undershoot on these frames
+  if (cm->frame_type == KEY_FRAME) {
+    cpi->twopass.kf_group_bits += cpi->rc.this_frame_target -
+                                  cpi->rc.projected_frame_size;
+
+    cpi->twopass.kf_group_bits = MAX(cpi->twopass.kf_group_bits, 0);
+  } else if (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame) {
+    cpi->twopass.gf_group_bits += cpi->rc.this_frame_target -
+                                  cpi->rc.projected_frame_size;
+
+    cpi->twopass.gf_group_bits = MAX(cpi->twopass.gf_group_bits, 0);
+  }
+#endif
+}

diff --git a/vp9/encoder/vp9_ratectrl.h b/vp9/encoder/vp9_ratectrl.h
index 1335744..f01d186 100644
--- a/vp9/encoder/vp9_ratectrl.h
+++ b/vp9/encoder/vp9_ratectrl.h

@@ -20,24 +20,41 @@
 void vp9_restore_coding_context(VP9_COMP *cpi);
 
 void vp9_setup_key_frame(VP9_COMP *cpi);
-void vp9_update_rate_correction_factors(VP9_COMP *cpi, int damp_var);
-int vp9_regulate_q(const VP9_COMP *cpi, int target_bits_per_frame);
-void vp9_adjust_key_frame_context(VP9_COMP *cpi);
-void vp9_compute_frame_size_bounds(VP9_COMP *cpi,
-                                   int *frame_under_shoot_limit,
-                                   int *frame_over_shoot_limit);
-
-void vp9_init_minq_luts(void);
-
-// return of 0 means drop frame
-int vp9_pick_frame_size(VP9_COMP *cpi);
+void vp9_setup_inter_frame(VP9_COMP *cpi);
 
 double vp9_convert_qindex_to_q(int qindex);
-int vp9_gfboost_qadjust(int qindex);
-int vp9_bits_per_mb(FRAME_TYPE frame_type, int qindex,
-                    double correction_factor);
-void vp9_setup_inter_frame(VP9_COMP *cpi);
-int vp9_pick_q_and_adjust_q_bounds(VP9_COMP *cpi,
-                                   int * bottom_index, int * top_index);
+
+// Updates rate correction factors
+void vp9_rc_update_rate_correction_factors(VP9_COMP *cpi, int damp_var);
+
+// initialize luts for minq
+void vp9_rc_init_minq_luts(void);
+
+// return of 0 means drop frame
+// Changes rc.this_frame_target and rc.sb64_rate_target
+int vp9_rc_pick_frame_size_and_bounds(VP9_COMP *cpi,
+                                      int *frame_under_shoot_limit,
+                                      int *frame_over_shoot_limit);
+// Picks q and q bounds given the target for bits
+int vp9_rc_pick_q_and_adjust_q_bounds(VP9_COMP *cpi,
+                                      int * bottom_index,
+                                      int * top_index);
+
+// Estimates q to achieve a target bits per frame
+int vp9_rc_regulate_q(const VP9_COMP *cpi, int target_bits_per_frame);
+
+// Post encode update of the rate control parameters based
+// on bytes used and q used for the frame
+void vp9_rc_postencode_update(VP9_COMP *cpi,
+                              uint64_t bytes_used,
+                              int q_used);
+
+// estimates bits per mb for a given qindex and correction factor
+int vp9_rc_bits_per_mb(FRAME_TYPE frame_type, int qindex,
+                       double correction_factor);
+
+// Post encode update of the rate control parameters for 2-pass
+void vp9_twopass_postencode_update(VP9_COMP *cpi,
+                                   uint64_t bytes_used);
 
 #endif  // VP9_ENCODER_VP9_RATECTRL_H_

diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c
index 43b7d6b..65cf5c7 100644
--- a/vp9/encoder/vp9_rdopt.c
+++ b/vp9/encoder/vp9_rdopt.c

@@ -113,6 +113,19 @@
 #define MV_COST_WEIGHT      108
 #define MV_COST_WEIGHT_SUB  120
 
+static int raster_block_offset(BLOCK_SIZE plane_bsize,
+                               int raster_block, int stride) {
+  const int bw = b_width_log2(plane_bsize);
+  const int y = 4 * (raster_block >> bw);
+  const int x = 4 * (raster_block & ((1 << bw) - 1));
+  return y * stride + x;
+}
+static int16_t* raster_block_offset_int16(BLOCK_SIZE plane_bsize,
+                                          int raster_block, int16_t *base) {
+  const int stride = 4 << b_width_log2(plane_bsize);
+  return base + raster_block_offset(plane_bsize, raster_block, stride);
+}
+
 static void fill_mode_costs(VP9_COMP *c) {
   VP9_COMMON *const cm = &c->common;
   int i, j;
@@ -270,7 +283,7 @@
 
   cpi->mb.select_txfm_size = (cpi->sf.tx_size_search_method == USE_LARGESTALL &&
                               cm->frame_type != KEY_FRAME) ?
-                             0 : 1;
+                              0 : 1;
 
   set_block_thresholds(cpi);
 
@@ -290,15 +303,9 @@
         &cm->fc.nmvc,
         cm->allow_high_precision_mv, 1, 1);
 
-    for (i = 0; i < INTER_MODE_CONTEXTS; i++) {
-      MB_PREDICTION_MODE m;
-
-      for (m = NEARESTMV; m < MB_MODE_COUNT; m++)
-        cpi->mb.inter_mode_cost[i][INTER_OFFSET(m)] =
-            cost_token(vp9_inter_mode_tree,
-                       cm->fc.inter_mode_probs[i],
-                       &vp9_inter_mode_encodings[INTER_OFFSET(m)]);
-    }
+    for (i = 0; i < INTER_MODE_CONTEXTS; ++i)
+      vp9_cost_tokens((int *)cpi->mb.inter_mode_cost[i],
+                      cm->fc.inter_mode_probs[i], vp9_inter_mode_tree);
   }
 }
 
@@ -514,11 +521,12 @@
                               const int16_t *scan, const int16_t *nb) {
   MACROBLOCKD *const xd = &x->e_mbd;
   MB_MODE_INFO *mbmi = &xd->mi_8x8[0]->mbmi;
+  struct macroblock_plane *p = &x->plane[plane];
   struct macroblockd_plane *pd = &xd->plane[plane];
   const PLANE_TYPE type = pd->plane_type;
   const int16_t *band_count = &band_counts[tx_size][1];
   const int eob = pd->eobs[block];
-  const int16_t *const qcoeff_ptr = BLOCK_OFFSET(pd->qcoeff, block);
+  const int16_t *const qcoeff_ptr = BLOCK_OFFSET(p->qcoeff, block);
   const int ref = mbmi->ref_frame[0] != INTRA_FRAME;
   unsigned int (*token_costs)[2][PREV_COEF_CONTEXTS][MAX_ENTROPY_TOKENS] =
                    x->token_costs[tx_size][type][ref];
@@ -754,32 +762,6 @@
   cpi->tx_stepdown_count[0]++;
 }
 
-static TX_SIZE select_tx_size(TX_MODE tx_mode, TX_SIZE max_tx_size,
-                              int64_t rd[][2]) {
-  if (max_tx_size == TX_32X32 &&
-      (tx_mode == ALLOW_32X32 ||
-       (tx_mode == TX_MODE_SELECT &&
-       rd[TX_32X32][1] < rd[TX_16X16][1] &&
-       rd[TX_32X32][1] < rd[TX_8X8][1] &&
-       rd[TX_32X32][1] < rd[TX_4X4][1]))) {
-    return TX_32X32;
-  } else if (max_tx_size >= TX_16X16 &&
-             (tx_mode == ALLOW_16X16 ||
-              tx_mode == ALLOW_32X32 ||
-              (tx_mode == TX_MODE_SELECT &&
-               rd[TX_16X16][1] < rd[TX_8X8][1] &&
-               rd[TX_16X16][1] < rd[TX_4X4][1]))) {
-    return TX_16X16;
-  } else if (tx_mode == ALLOW_8X8 ||
-             tx_mode == ALLOW_16X16 ||
-             tx_mode == ALLOW_32X32 ||
-             (tx_mode == TX_MODE_SELECT && rd[TX_8X8][1] < rd[TX_4X4][1])) {
-    return TX_8X8;
-  } else {
-    return TX_4X4;
-  }
-}
-
 static void choose_txfm_size_from_rd(VP9_COMP *cpi, MACROBLOCK *x,
                                      int (*r)[2], int *rate,
                                      int64_t *d, int64_t *distortion,
@@ -794,39 +776,42 @@
   int64_t rd[TX_SIZES][2];
   int n, m;
   int s0, s1;
+  const TX_SIZE max_mode_tx_size = tx_mode_to_biggest_tx_size[cm->tx_mode];
+  int64_t best_rd = INT64_MAX;
+  TX_SIZE best_tx = TX_4X4;
 
   const vp9_prob *tx_probs = get_tx_probs2(max_tx_size, xd, &cm->fc.tx_probs);
-
-  for (n = TX_4X4; n <= max_tx_size; n++) {
-    r[n][1] = r[n][0];
-    if (r[n][0] == INT_MAX)
-      continue;
-    for (m = 0; m <= n - (n == max_tx_size); m++) {
-      if (m == n)
-        r[n][1] += vp9_cost_zero(tx_probs[m]);
-      else
-        r[n][1] += vp9_cost_one(tx_probs[m]);
-    }
-  }
-
   assert(skip_prob > 0);
   s0 = vp9_cost_bit(skip_prob, 0);
   s1 = vp9_cost_bit(skip_prob, 1);
 
   for (n = TX_4X4; n <= max_tx_size; n++) {
+    r[n][1] = r[n][0];
+    if (r[n][0] < INT_MAX) {
+      for (m = 0; m <= n - (n == max_tx_size); m++) {
+        if (m == n)
+          r[n][1] += vp9_cost_zero(tx_probs[m]);
+        else
+          r[n][1] += vp9_cost_one(tx_probs[m]);
+      }
+    }
     if (d[n] == INT64_MAX) {
       rd[n][0] = rd[n][1] = INT64_MAX;
-      continue;
-    }
-    if (s[n]) {
+    } else if (s[n]) {
       rd[n][0] = rd[n][1] = RDCOST(x->rdmult, x->rddiv, s1, d[n]);
     } else {
       rd[n][0] = RDCOST(x->rdmult, x->rddiv, r[n][0] + s0, d[n]);
       rd[n][1] = RDCOST(x->rdmult, x->rddiv, r[n][1] + s0, d[n]);
     }
-  }
 
-  mbmi->tx_size = select_tx_size(cm->tx_mode, max_tx_size, rd);
+    if (rd[n][1] < best_rd) {
+      best_tx = n;
+      best_rd = rd[n][1];
+    }
+  }
+  mbmi->tx_size = cm->tx_mode == TX_MODE_SELECT ?
+                      best_tx : MIN(max_tx_size, max_mode_tx_size);
+
 
   *distortion = d[mbmi->tx_size];
   *rate       = r[mbmi->tx_size][cm->tx_mode == TX_MODE_SELECT];
@@ -836,29 +821,18 @@
   tx_cache[ALLOW_8X8] = rd[TX_8X8][0];
   tx_cache[ALLOW_16X16] = rd[MIN(max_tx_size, TX_16X16)][0];
   tx_cache[ALLOW_32X32] = rd[MIN(max_tx_size, TX_32X32)][0];
-  if (max_tx_size == TX_32X32 &&
-      rd[TX_32X32][1] < rd[TX_16X16][1] && rd[TX_32X32][1] < rd[TX_8X8][1] &&
-      rd[TX_32X32][1] < rd[TX_4X4][1])
-    tx_cache[TX_MODE_SELECT] = rd[TX_32X32][1];
-  else if (max_tx_size >= TX_16X16 &&
-           rd[TX_16X16][1] < rd[TX_8X8][1] && rd[TX_16X16][1] < rd[TX_4X4][1])
-    tx_cache[TX_MODE_SELECT] = rd[TX_16X16][1];
-  else
-    tx_cache[TX_MODE_SELECT] = rd[TX_4X4][1] < rd[TX_8X8][1] ?
-                                 rd[TX_4X4][1] : rd[TX_8X8][1];
 
-  if (max_tx_size == TX_32X32 &&
-      rd[TX_32X32][1] < rd[TX_16X16][1] &&
-      rd[TX_32X32][1] < rd[TX_8X8][1] &&
-      rd[TX_32X32][1] < rd[TX_4X4][1]) {
+  if (max_tx_size == TX_32X32 && best_tx == TX_32X32) {
+    tx_cache[TX_MODE_SELECT] = rd[TX_32X32][1];
     cpi->tx_stepdown_count[0]++;
-  } else if (max_tx_size >= TX_16X16 &&
-             rd[TX_16X16][1] < rd[TX_8X8][1] &&
-             rd[TX_16X16][1] < rd[TX_4X4][1]) {
+  } else if (max_tx_size >= TX_16X16 && best_tx == TX_16X16) {
+    tx_cache[TX_MODE_SELECT] = rd[TX_16X16][1];
     cpi->tx_stepdown_count[max_tx_size - TX_16X16]++;
   } else if (rd[TX_8X8][1] < rd[TX_4X4][1]) {
+    tx_cache[TX_MODE_SELECT] = rd[TX_8X8][1];
     cpi->tx_stepdown_count[max_tx_size - TX_8X8]++;
   } else {
+    tx_cache[TX_MODE_SELECT] = rd[TX_4X4][1];
     cpi->tx_stepdown_count[max_tx_size - TX_4X4]++;
   }
 }
@@ -878,14 +852,17 @@
   int n, m;
   int s0, s1;
   double scale_rd[TX_SIZES] = {1.73, 1.44, 1.20, 1.00};
-  // double scale_r[TX_SIZES] = {2.82, 2.00, 1.41, 1.00};
+  const TX_SIZE max_mode_tx_size = tx_mode_to_biggest_tx_size[cm->tx_mode];
+  int64_t best_rd = INT64_MAX;
+  TX_SIZE best_tx = TX_4X4;
 
   const vp9_prob *tx_probs = get_tx_probs2(max_tx_size, xd, &cm->fc.tx_probs);
-
-  // for (n = TX_4X4; n <= max_txfm_size; n++)
-  //   r[n][0] = (r[n][0] * scale_r[n]);
+  assert(skip_prob > 0);
+  s0 = vp9_cost_bit(skip_prob, 0);
+  s1 = vp9_cost_bit(skip_prob, 1);
 
   for (n = TX_4X4; n <= max_tx_size; n++) {
+    double scale = scale_rd[n];
     r[n][1] = r[n][0];
     for (m = 0; m <= n - (n == max_tx_size); m++) {
       if (m == n)
@@ -893,40 +870,29 @@
       else
         r[n][1] += vp9_cost_one(tx_probs[m]);
     }
-  }
-
-  assert(skip_prob > 0);
-  s0 = vp9_cost_bit(skip_prob, 0);
-  s1 = vp9_cost_bit(skip_prob, 1);
-
-  for (n = TX_4X4; n <= max_tx_size; n++) {
     if (s[n]) {
-      rd[n][0] = rd[n][1] = RDCOST(x->rdmult, x->rddiv, s1, d[n]);
+      rd[n][0] = rd[n][1] = RDCOST(x->rdmult, x->rddiv, s1, d[n]) * scale;
     } else {
-      rd[n][0] = RDCOST(x->rdmult, x->rddiv, r[n][0] + s0, d[n]);
-      rd[n][1] = RDCOST(x->rdmult, x->rddiv, r[n][1] + s0, d[n]);
+      rd[n][0] = RDCOST(x->rdmult, x->rddiv, r[n][0] + s0, d[n]) * scale;
+      rd[n][1] = RDCOST(x->rdmult, x->rddiv, r[n][1] + s0, d[n]) * scale;
+    }
+    if (rd[n][1] < best_rd) {
+      best_rd = rd[n][1];
+      best_tx = n;
     }
   }
-  for (n = TX_4X4; n <= max_tx_size; n++) {
-    rd[n][0] = (int64_t)(scale_rd[n] * rd[n][0]);
-    rd[n][1] = (int64_t)(scale_rd[n] * rd[n][1]);
-  }
 
-  mbmi->tx_size = select_tx_size(cm->tx_mode, max_tx_size, rd);
+  mbmi->tx_size = cm->tx_mode == TX_MODE_SELECT ?
+                      best_tx : MIN(max_tx_size, max_mode_tx_size);
 
   // Actually encode using the chosen mode if a model was used, but do not
   // update the r, d costs
   txfm_rd_in_plane(x, &cpi->rdcost_stack, rate, distortion, skip,
                    &sse[mbmi->tx_size], ref_best_rd, 0, bs, mbmi->tx_size);
 
-  if (max_tx_size == TX_32X32 &&
-      rd[TX_32X32][1] <= rd[TX_16X16][1] &&
-      rd[TX_32X32][1] <= rd[TX_8X8][1] &&
-      rd[TX_32X32][1] <= rd[TX_4X4][1]) {
+  if (max_tx_size == TX_32X32 && best_tx == TX_32X32) {
     cpi->tx_stepdown_count[0]++;
-  } else if (max_tx_size >= TX_16X16 &&
-             rd[TX_16X16][1] <= rd[TX_8X8][1] &&
-             rd[TX_16X16][1] <= rd[TX_4X4][1]) {
+  } else if (max_tx_size >= TX_16X16 &&  best_tx == TX_16X16) {
     cpi->tx_stepdown_count[max_tx_size - TX_16X16]++;
   } else if (rd[TX_8X8][1] <= rd[TX_4X4][1]) {
     cpi->tx_stepdown_count[max_tx_size - TX_8X8]++;
@@ -946,6 +912,9 @@
   MB_MODE_INFO *const mbmi = &xd->mi_8x8[0]->mbmi;
   struct rdcost_block_args *rdcost_stack = &cpi->rdcost_stack;
   const int b_inter_mode = is_inter_block(mbmi);
+  const TX_SIZE max_tx_size = max_txsize_lookup[bs];
+  TX_SIZE tx_size;
+
 
   assert(bs == mbmi->sb_type);
   if (b_inter_mode)
@@ -964,34 +933,16 @@
 
   if (cpi->sf.tx_size_search_method == USE_LARGESTINTRA_MODELINTER &&
       b_inter_mode) {
-    if (bs >= BLOCK_32X32)
-      model_rd_for_sb_y_tx(cpi, bs, TX_32X32, x, xd,
-                           &r[TX_32X32][0], &d[TX_32X32], &s[TX_32X32]);
-    if (bs >= BLOCK_16X16)
-      model_rd_for_sb_y_tx(cpi, bs, TX_16X16, x, xd,
-                           &r[TX_16X16][0], &d[TX_16X16], &s[TX_16X16]);
-
-    model_rd_for_sb_y_tx(cpi, bs, TX_8X8, x, xd,
-                         &r[TX_8X8][0], &d[TX_8X8], &s[TX_8X8]);
-
-    model_rd_for_sb_y_tx(cpi, bs, TX_4X4, x, xd,
-                         &r[TX_4X4][0], &d[TX_4X4], &s[TX_4X4]);
-
+    for (tx_size = TX_4X4; tx_size <= max_tx_size; ++tx_size)
+      model_rd_for_sb_y_tx(cpi, bs, tx_size, x, xd,
+                           &r[tx_size][0], &d[tx_size], &s[tx_size]);
     choose_txfm_size_from_modelrd(cpi, x, r, rate, d, distortion, s,
                                   skip, sse, ref_best_rd, bs);
   } else {
-    if (bs >= BLOCK_32X32)
-      txfm_rd_in_plane(x, rdcost_stack, &r[TX_32X32][0], &d[TX_32X32],
-                       &s[TX_32X32], &sse[TX_32X32],
-                       ref_best_rd, 0, bs, TX_32X32);
-    if (bs >= BLOCK_16X16)
-      txfm_rd_in_plane(x, rdcost_stack, &r[TX_16X16][0], &d[TX_16X16],
-                       &s[TX_16X16], &sse[TX_16X16],
-                       ref_best_rd, 0, bs, TX_16X16);
-    txfm_rd_in_plane(x, rdcost_stack, &r[TX_8X8][0], &d[TX_8X8], &s[TX_8X8],
-                     &sse[TX_8X8], ref_best_rd, 0, bs, TX_8X8);
-    txfm_rd_in_plane(x, rdcost_stack, &r[TX_4X4][0], &d[TX_4X4], &s[TX_4X4],
-                     &sse[TX_4X4], ref_best_rd, 0, bs, TX_4X4);
+    for (tx_size = TX_4X4; tx_size <= max_tx_size; ++tx_size)
+      txfm_rd_in_plane(x, rdcost_stack, &r[tx_size][0], &d[tx_size],
+                       &s[tx_size], &sse[tx_size],
+                       ref_best_rd, 0, bs, tx_size);
     choose_txfm_size_from_rd(cpi, x, r, rate, d, distortion, s,
                              skip, txfm_cache, bs);
   }
@@ -1104,7 +1055,7 @@
         else
           x->fwd_txm4x4(src_diff, coeff, 8);
 
-        vp9_regular_quantize_b_4x4(x, 4, block, so->scan, so->iscan);
+        vp9_regular_quantize_b_4x4(x, 0, block, so->scan, so->iscan);
 
         ratey += cost_coeffs(x, 0, block, tempa + idx, templ + idy, TX_4X4,
                              so->scan, so->neighbors);
@@ -1348,11 +1299,7 @@
   int this_rate_tokenonly, this_rate, s;
   int64_t this_distortion, this_sse;
 
-  // int mode_mask = (bsize <= BLOCK_8X8)
-  //                ? ALL_INTRA_MODES : cpi->sf.intra_uv_mode_mask;
-
-  for (mode = DC_PRED; mode <= TM_PRED; mode ++) {
-    // if (!(mode_mask & (1 << mode)))
+  for (mode = DC_PRED; mode <= TM_PRED; ++mode) {
     if (!(cpi->sf.intra_uv_mode_mask[max_uv_txsize_lookup[bsize]]
           & (1 << mode)))
       continue;
@@ -1380,7 +1327,7 @@
         struct macroblockd_plane *const pd = x->e_mbd.plane;
         for (i = 1; i < MAX_MB_PLANE; ++i) {
           p[i].coeff    = ctx->coeff_pbuf[i][2];
-          pd[i].qcoeff  = ctx->qcoeff_pbuf[i][2];
+          p[i].qcoeff   = ctx->qcoeff_pbuf[i][2];
           pd[i].dqcoeff = ctx->dqcoeff_pbuf[i][2];
           pd[i].eobs    = ctx->eobs_pbuf[i][2];
 
@@ -1390,7 +1337,7 @@
           ctx->eobs_pbuf[i][2]    = ctx->eobs_pbuf[i][0];
 
           ctx->coeff_pbuf[i][0]   = p[i].coeff;
-          ctx->qcoeff_pbuf[i][0]  = pd[i].qcoeff;
+          ctx->qcoeff_pbuf[i][0]  = p[i].qcoeff;
           ctx->dqcoeff_pbuf[i][0] = pd[i].dqcoeff;
           ctx->eobs_pbuf[i][0]    = pd[i].eobs;
         }
@@ -1399,7 +1346,6 @@
   }
 
   x->e_mbd.mi_8x8[0]->mbmi.uv_mode = mode_selected;
-
   return best_rd;
 }
 
@@ -1595,7 +1541,7 @@
       coeff = BLOCK_OFFSET(p->coeff, k);
       x->fwd_txm4x4(raster_block_offset_int16(BLOCK_8X8, k, p->src_diff),
                     coeff, 8);
-      vp9_regular_quantize_b_4x4(x, 4, k, so->scan, so->iscan);
+      vp9_regular_quantize_b_4x4(x, 0, k, so->scan, so->iscan);
       thisdistortion += vp9_block_error(coeff, BLOCK_OFFSET(pd->dqcoeff, k),
                                         16, &ssz);
       thissse += ssz;
@@ -2142,7 +2088,7 @@
   int best_index = 0;
   int best_sad = INT_MAX;
   int this_sad = INT_MAX;
-  unsigned int max_mv = 0;
+  int max_mv = 0;
 
   uint8_t *src_y_ptr = x->plane[0].src.buf;
   uint8_t *ref_y_ptr;
@@ -3057,7 +3003,7 @@
 
   for (i = 0; i < max_plane; ++i) {
     p[i].coeff    = ctx->coeff_pbuf[i][1];
-    pd[i].qcoeff  = ctx->qcoeff_pbuf[i][1];
+    p[i].qcoeff  = ctx->qcoeff_pbuf[i][1];
     pd[i].dqcoeff = ctx->dqcoeff_pbuf[i][1];
     pd[i].eobs    = ctx->eobs_pbuf[i][1];
 
@@ -3067,7 +3013,7 @@
     ctx->eobs_pbuf[i][1]    = ctx->eobs_pbuf[i][0];
 
     ctx->coeff_pbuf[i][0]   = p[i].coeff;
-    ctx->qcoeff_pbuf[i][0]  = pd[i].qcoeff;
+    ctx->qcoeff_pbuf[i][0]  = p[i].qcoeff;
     ctx->dqcoeff_pbuf[i][0] = pd[i].dqcoeff;
     ctx->eobs_pbuf[i][0]    = pd[i].eobs;
   }
@@ -3643,7 +3589,7 @@
 
     /* keep record of best compound/single-only prediction */
     if (!disable_skip && ref_frame != INTRA_FRAME) {
-      int single_rd, hybrid_rd, single_rate, hybrid_rate;
+      int64_t single_rd, hybrid_rd, single_rate, hybrid_rate;
 
       if (cm->comp_pred_mode == REFERENCE_MODE_SELECT) {
         single_rate = rate2 - compmode_cost;
@@ -3724,7 +3670,9 @@
   if (cpi->sf.use_uv_intra_rd_estimate) {
     // Do Intra UV best rd mode selection if best mode choice above was intra.
     if (vp9_mode_order[best_mode_index].ref_frame == INTRA_FRAME) {
-      TX_SIZE uv_tx_size = get_uv_tx_size(mbmi);
+      TX_SIZE uv_tx_size;
+      *mbmi = best_mbmode;
+      uv_tx_size = get_uv_tx_size(mbmi);
       rd_pick_intra_sbuv_mode(cpi, x, ctx, &rate_uv_intra[uv_tx_size],
                               &rate_uv_tokenonly[uv_tx_size],
                               &dist_uv[uv_tx_size],
@@ -4394,7 +4342,7 @@
 
     /* keep record of best compound/single-only prediction */
     if (!disable_skip && ref_frame != INTRA_FRAME) {
-      int single_rd, hybrid_rd, single_rate, hybrid_rate;
+      int64_t single_rd, hybrid_rd, single_rate, hybrid_rate;
 
       if (cpi->common.comp_pred_mode == REFERENCE_MODE_SELECT) {
         single_rate = rate2 - compmode_cost;
@@ -4472,7 +4420,9 @@
   if (cpi->sf.use_uv_intra_rd_estimate) {
     // Do Intra UV best rd mode selection if best mode choice above was intra.
     if (vp9_ref_order[best_mode_index].ref_frame == INTRA_FRAME) {
-      TX_SIZE uv_tx_size = get_uv_tx_size(mbmi);
+      TX_SIZE uv_tx_size;
+      *mbmi = best_mbmode;
+      uv_tx_size = get_uv_tx_size(mbmi);
       rd_pick_intra_sbuv_mode(cpi, x, ctx, &rate_uv_intra[uv_tx_size],
                               &rate_uv_tokenonly[uv_tx_size],
                               &dist_uv[uv_tx_size],

diff --git a/vp9/encoder/vp9_tokenize.c b/vp9/encoder/vp9_tokenize.c
index 3f1cc6f..389ec15 100644
--- a/vp9/encoder/vp9_tokenize.c
+++ b/vp9/encoder/vp9_tokenize.c

@@ -26,6 +26,86 @@
 static int dct_value_cost[DCT_MAX_VALUE * 2];
 const int *vp9_dct_value_cost_ptr;
 
+// Array indices are identical to previously-existing CONTEXT_NODE indices
+const vp9_tree_index vp9_coef_tree[TREE_SIZE(MAX_ENTROPY_TOKENS)] = {
+  -DCT_EOB_TOKEN, 2,                          /* 0 = EOB */
+  -ZERO_TOKEN, 4,                             /* 1 = ZERO */
+  -ONE_TOKEN, 6,                              /* 2 = ONE */
+  8, 12,                                      /* 3 = LOW_VAL */
+  -TWO_TOKEN, 10,                            /* 4 = TWO */
+  -THREE_TOKEN, -FOUR_TOKEN,                /* 5 = THREE */
+  14, 16,                                   /* 6 = HIGH_LOW */
+  -DCT_VAL_CATEGORY1, -DCT_VAL_CATEGORY2,   /* 7 = CAT_ONE */
+  18, 20,                                   /* 8 = CAT_THREEFOUR */
+  -DCT_VAL_CATEGORY3, -DCT_VAL_CATEGORY4,   /* 9 = CAT_THREE */
+  -DCT_VAL_CATEGORY5, -DCT_VAL_CATEGORY6    /* 10 = CAT_FIVE */
+};
+
+// Unconstrained Node Tree
+const vp9_tree_index vp9_coef_con_tree[TREE_SIZE(MAX_ENTROPY_TOKENS)] = {
+  2, 6,                                     /* 0 = LOW_VAL */
+  -TWO_TOKEN, 4,                            /* 1 = TWO */
+  -THREE_TOKEN, -FOUR_TOKEN,                /* 2 = THREE */
+  8, 10,                                    /* 3 = HIGH_LOW */
+  -DCT_VAL_CATEGORY1, -DCT_VAL_CATEGORY2,   /* 4 = CAT_ONE */
+  12, 14,                                   /* 5 = CAT_THREEFOUR */
+  -DCT_VAL_CATEGORY3, -DCT_VAL_CATEGORY4,   /* 6 = CAT_THREE */
+  -DCT_VAL_CATEGORY5, -DCT_VAL_CATEGORY6    /* 7 = CAT_FIVE */
+};
+
+static const vp9_prob Pcat1[] = { 159};
+static const vp9_prob Pcat2[] = { 165, 145};
+static const vp9_prob Pcat3[] = { 173, 148, 140};
+static const vp9_prob Pcat4[] = { 176, 155, 140, 135};
+static const vp9_prob Pcat5[] = { 180, 157, 141, 134, 130};
+static const vp9_prob Pcat6[] = {
+  254, 254, 254, 252, 249, 243, 230, 196, 177, 153, 140, 133, 130, 129
+};
+
+static vp9_tree_index cat1[2], cat2[4], cat3[6], cat4[8], cat5[10], cat6[28];
+
+static void init_bit_tree(vp9_tree_index *p, int n) {
+  int i = 0;
+
+  while (++i < n) {
+    p[0] = p[1] = i << 1;
+    p += 2;
+  }
+
+  p[0] = p[1] = 0;
+}
+
+static void init_bit_trees() {
+  init_bit_tree(cat1, 1);
+  init_bit_tree(cat2, 2);
+  init_bit_tree(cat3, 3);
+  init_bit_tree(cat4, 4);
+  init_bit_tree(cat5, 5);
+  init_bit_tree(cat6, 14);
+}
+
+const vp9_extra_bit vp9_extra_bits[MAX_ENTROPY_TOKENS] = {
+  {0, 0, 0, 0},           // ZERO_TOKEN
+  {0, 0, 0, 1},           // ONE_TOKEN
+  {0, 0, 0, 2},           // TWO_TOKEN
+  {0, 0, 0, 3},           // THREE_TOKEN
+  {0, 0, 0, 4},           // FOUR_TOKEN
+  {cat1, Pcat1, 1, 5},    // DCT_VAL_CATEGORY1
+  {cat2, Pcat2, 2, 7},    // DCT_VAL_CATEGORY2
+  {cat3, Pcat3, 3, 11},   // DCT_VAL_CATEGORY3
+  {cat4, Pcat4, 4, 19},   // DCT_VAL_CATEGORY4
+  {cat5, Pcat5, 5, 35},   // DCT_VAL_CATEGORY5
+  {cat6, Pcat6, 14, 67},  // DCT_VAL_CATEGORY6
+  {0, 0, 0, 0}            // DCT_EOB_TOKEN
+};
+
+struct vp9_token vp9_coef_encodings[MAX_ENTROPY_TOKENS];
+
+void vp9_coef_tree_initialize() {
+  init_bit_trees();
+  vp9_tokens_from_tree(vp9_coef_encodings, vp9_coef_tree);
+}
+
 static void fill_value_tokens() {
   TOKENVALUE *const t = dct_value_tokens + DCT_MAX_VALUE;
   const vp9_extra_bit *const e = vp9_extra_bits;
@@ -108,7 +188,8 @@
   TOKENEXTRA *t = *tp;        /* store tokens starting here */
   const int eob = pd->eobs[block];
   const PLANE_TYPE type = pd->plane_type;
-  const int16_t *qcoeff_ptr = BLOCK_OFFSET(pd->qcoeff, block);
+  struct macroblock_plane *p = &cpi->mb.plane[plane];
+  const int16_t *qcoeff_ptr = BLOCK_OFFSET(p->qcoeff, block);
 
   const int segment_id = mbmi->segment_id;
   const int16_t *scan, *nb;

diff --git a/vp9/encoder/vp9_tokenize.h b/vp9/encoder/vp9_tokenize.h
index e24e31b..2e3bf52 100644
--- a/vp9/encoder/vp9_tokenize.h
+++ b/vp9/encoder/vp9_tokenize.h

@@ -28,6 +28,10 @@
   uint8_t         skip_eob_node;
 } TOKENEXTRA;
 
+extern const vp9_tree_index vp9_coef_tree[];
+extern const vp9_tree_index vp9_coef_con_tree[];
+extern struct vp9_token vp9_coef_encodings[];
+
 int vp9_sb_is_skippable(MACROBLOCKD *xd, BLOCK_SIZE bsize);
 int vp9_is_skippable_in_plane(MACROBLOCKD *xd, BLOCK_SIZE bsize,
                               int plane);

diff --git a/vp9/encoder/vp9_treewriter.h b/vp9/encoder/vp9_treewriter.h
index c9bf4da..3245960a 100644
--- a/vp9/encoder/vp9_treewriter.h
+++ b/vp9/encoder/vp9_treewriter.h

@@ -64,11 +64,6 @@
   return cost;
 }
 
-static INLINE int cost_token(vp9_tree tree, const vp9_prob *probs,
-                             const struct vp9_token *token) {
-  return treed_cost(tree, probs, token->value, token->len);
-}
-
 void vp9_cost_tokens(int *costs, const vp9_prob *probs, vp9_tree tree);
 void vp9_cost_tokens_skip(int *costs, const vp9_prob *probs, vp9_tree tree);
 

diff --git a/vp9/encoder/x86/vp9_dct32x32_avx2.c b/vp9/encoder/x86/vp9_dct32x32_avx2.c
new file mode 100644
index 0000000..9ea22fe
--- /dev/null
+++ b/vp9/encoder/x86/vp9_dct32x32_avx2.c

@@ -0,0 +1,2710 @@
+/*
+ *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <immintrin.h>  // AVX2
+#include "vp9/common/vp9_idct.h"  // for cospi constants
+#include "vpx_ports/mem.h"
+
+#define pair256_set_epi16(a, b) \
+  _mm256_set_epi16(b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a)
+
+#define pair256_set_epi32(a, b) \
+  _mm256_set_epi32(b, a, b, a, b, a, b, a)
+
+
+
+
+#if FDCT32x32_HIGH_PRECISION
+static INLINE __m256i k_madd_epi32_avx2(__m256i a, __m256i b) {
+  __m256i buf0, buf1;
+  buf0 = _mm256_mul_epu32(a, b);
+  a = _mm256_srli_epi64(a, 32);
+  b = _mm256_srli_epi64(b, 32);
+  buf1 = _mm256_mul_epu32(a, b);
+  return _mm256_add_epi64(buf0, buf1);
+}
+
+static INLINE __m256i k_packs_epi64_avx2(__m256i a, __m256i b) {
+  __m256i buf0 = _mm256_shuffle_epi32(a, _MM_SHUFFLE(0, 0, 2, 0));
+  __m256i buf1 = _mm256_shuffle_epi32(b, _MM_SHUFFLE(0, 0, 2, 0));
+  return _mm256_unpacklo_epi64(buf0, buf1);
+}
+#endif
+
+void FDCT32x32_2D_AVX2(const int16_t *input,
+                  int16_t *output_org, int stride) {
+  // Calculate pre-multiplied strides
+  const int str1 = stride;
+  const int str2 = 2 * stride;
+  const int str3 = 2 * stride + str1;
+  // We need an intermediate buffer between passes.
+  DECLARE_ALIGNED(32, int16_t, intermediate[32 * 32]);
+  // Constants
+  //    When we use them, in one case, they are all the same. In all others
+  //    it's a pair of them that we need to repeat four times. This is done
+  //    by constructing the 32 bit constant corresponding to that pair.
+  const __m256i k__cospi_p16_p16 = _mm256_set1_epi16(+cospi_16_64);
+  const __m256i k__cospi_p16_m16 = pair256_set_epi16(+cospi_16_64, -cospi_16_64);
+  const __m256i k__cospi_m08_p24 = pair256_set_epi16(-cospi_8_64,   cospi_24_64);
+  const __m256i k__cospi_m24_m08 = pair256_set_epi16(-cospi_24_64, -cospi_8_64);
+  const __m256i k__cospi_p24_p08 = pair256_set_epi16(+cospi_24_64,  cospi_8_64);
+  const __m256i k__cospi_p12_p20 = pair256_set_epi16(+cospi_12_64,  cospi_20_64);
+  const __m256i k__cospi_m20_p12 = pair256_set_epi16(-cospi_20_64,  cospi_12_64);
+  const __m256i k__cospi_m04_p28 = pair256_set_epi16(-cospi_4_64,   cospi_28_64);
+  const __m256i k__cospi_p28_p04 = pair256_set_epi16(+cospi_28_64,  cospi_4_64);
+  const __m256i k__cospi_m28_m04 = pair256_set_epi16(-cospi_28_64, -cospi_4_64);
+  const __m256i k__cospi_m12_m20 = pair256_set_epi16(-cospi_12_64, -cospi_20_64);
+  const __m256i k__cospi_p30_p02 = pair256_set_epi16(+cospi_30_64,  cospi_2_64);
+  const __m256i k__cospi_p14_p18 = pair256_set_epi16(+cospi_14_64,  cospi_18_64);
+  const __m256i k__cospi_p22_p10 = pair256_set_epi16(+cospi_22_64,  cospi_10_64);
+  const __m256i k__cospi_p06_p26 = pair256_set_epi16(+cospi_6_64,   cospi_26_64);
+  const __m256i k__cospi_m26_p06 = pair256_set_epi16(-cospi_26_64,  cospi_6_64);
+  const __m256i k__cospi_m10_p22 = pair256_set_epi16(-cospi_10_64,  cospi_22_64);
+  const __m256i k__cospi_m18_p14 = pair256_set_epi16(-cospi_18_64,  cospi_14_64);
+  const __m256i k__cospi_m02_p30 = pair256_set_epi16(-cospi_2_64,   cospi_30_64);
+  const __m256i k__cospi_p31_p01 = pair256_set_epi16(+cospi_31_64,  cospi_1_64);
+  const __m256i k__cospi_p15_p17 = pair256_set_epi16(+cospi_15_64,  cospi_17_64);
+  const __m256i k__cospi_p23_p09 = pair256_set_epi16(+cospi_23_64,  cospi_9_64);
+  const __m256i k__cospi_p07_p25 = pair256_set_epi16(+cospi_7_64,   cospi_25_64);
+  const __m256i k__cospi_m25_p07 = pair256_set_epi16(-cospi_25_64,  cospi_7_64);
+  const __m256i k__cospi_m09_p23 = pair256_set_epi16(-cospi_9_64,   cospi_23_64);
+  const __m256i k__cospi_m17_p15 = pair256_set_epi16(-cospi_17_64,  cospi_15_64);
+  const __m256i k__cospi_m01_p31 = pair256_set_epi16(-cospi_1_64,   cospi_31_64);
+  const __m256i k__cospi_p27_p05 = pair256_set_epi16(+cospi_27_64,  cospi_5_64);
+  const __m256i k__cospi_p11_p21 = pair256_set_epi16(+cospi_11_64,  cospi_21_64);
+  const __m256i k__cospi_p19_p13 = pair256_set_epi16(+cospi_19_64,  cospi_13_64);
+  const __m256i k__cospi_p03_p29 = pair256_set_epi16(+cospi_3_64,   cospi_29_64);
+  const __m256i k__cospi_m29_p03 = pair256_set_epi16(-cospi_29_64,  cospi_3_64);
+  const __m256i k__cospi_m13_p19 = pair256_set_epi16(-cospi_13_64,  cospi_19_64);
+  const __m256i k__cospi_m21_p11 = pair256_set_epi16(-cospi_21_64,  cospi_11_64);
+  const __m256i k__cospi_m05_p27 = pair256_set_epi16(-cospi_5_64,   cospi_27_64);
+  const __m256i k__DCT_CONST_ROUNDING = _mm256_set1_epi32(DCT_CONST_ROUNDING);
+  const __m256i kZero = _mm256_set1_epi16(0);
+  const __m256i kOne  = _mm256_set1_epi16(1);
+  // Do the two transform/transpose passes
+  int pass;
+  for (pass = 0; pass < 2; ++pass) {
+    // We process sixteen columns (transposed rows in second pass) at a time.
+    int column_start;
+    for (column_start = 0; column_start < 32; column_start += 16) {
+      __m256i step1[32];
+      __m256i step2[32];
+      __m256i step3[32];
+      __m256i out[32];
+      // Stage 1
+      // Note: even though all the loads below are aligned, using the aligned
+      //       intrinsic make the code slightly slower.
+      if (0 == pass) {
+        const int16_t *in  = &input[column_start];
+        // step1[i] =  (in[ 0 * stride] + in[(32 -  1) * stride]) << 2;
+        // Note: the next four blocks could be in a loop. That would help the
+        //       instruction cache but is actually slower.
+        {
+          const int16_t *ina =  in +  0 * str1;
+          const int16_t *inb =  in + 31 * str1;
+          __m256i *step1a = &step1[ 0];
+          __m256i *step1b = &step1[31];
+          const __m256i ina0  = _mm256_loadu_si256((const __m256i *)(ina));
+          const __m256i ina1  = _mm256_loadu_si256((const __m256i *)(ina + str1));
+          const __m256i ina2  = _mm256_loadu_si256((const __m256i *)(ina + str2));
+          const __m256i ina3  = _mm256_loadu_si256((const __m256i *)(ina + str3));
+          const __m256i inb3  = _mm256_loadu_si256((const __m256i *)(inb - str3));
+          const __m256i inb2  = _mm256_loadu_si256((const __m256i *)(inb - str2));
+          const __m256i inb1  = _mm256_loadu_si256((const __m256i *)(inb - str1));
+          const __m256i inb0  = _mm256_loadu_si256((const __m256i *)(inb));
+          step1a[ 0] = _mm256_add_epi16(ina0, inb0);
+          step1a[ 1] = _mm256_add_epi16(ina1, inb1);
+          step1a[ 2] = _mm256_add_epi16(ina2, inb2);
+          step1a[ 3] = _mm256_add_epi16(ina3, inb3);
+          step1b[-3] = _mm256_sub_epi16(ina3, inb3);
+          step1b[-2] = _mm256_sub_epi16(ina2, inb2);
+          step1b[-1] = _mm256_sub_epi16(ina1, inb1);
+          step1b[-0] = _mm256_sub_epi16(ina0, inb0);
+          step1a[ 0] = _mm256_slli_epi16(step1a[ 0], 2);
+          step1a[ 1] = _mm256_slli_epi16(step1a[ 1], 2);
+          step1a[ 2] = _mm256_slli_epi16(step1a[ 2], 2);
+          step1a[ 3] = _mm256_slli_epi16(step1a[ 3], 2);
+          step1b[-3] = _mm256_slli_epi16(step1b[-3], 2);
+          step1b[-2] = _mm256_slli_epi16(step1b[-2], 2);
+          step1b[-1] = _mm256_slli_epi16(step1b[-1], 2);
+          step1b[-0] = _mm256_slli_epi16(step1b[-0], 2);
+        }
+        {
+          const int16_t *ina =  in +  4 * str1;
+          const int16_t *inb =  in + 27 * str1;
+          __m256i *step1a = &step1[ 4];
+          __m256i *step1b = &step1[27];
+          const __m256i ina0  = _mm256_loadu_si256((const __m256i *)(ina));
+          const __m256i ina1  = _mm256_loadu_si256((const __m256i *)(ina + str1));
+          const __m256i ina2  = _mm256_loadu_si256((const __m256i *)(ina + str2));
+          const __m256i ina3  = _mm256_loadu_si256((const __m256i *)(ina + str3));
+          const __m256i inb3  = _mm256_loadu_si256((const __m256i *)(inb - str3));
+          const __m256i inb2  = _mm256_loadu_si256((const __m256i *)(inb - str2));
+          const __m256i inb1  = _mm256_loadu_si256((const __m256i *)(inb - str1));
+          const __m256i inb0  = _mm256_loadu_si256((const __m256i *)(inb));
+          step1a[ 0] = _mm256_add_epi16(ina0, inb0);
+          step1a[ 1] = _mm256_add_epi16(ina1, inb1);
+          step1a[ 2] = _mm256_add_epi16(ina2, inb2);
+          step1a[ 3] = _mm256_add_epi16(ina3, inb3);
+          step1b[-3] = _mm256_sub_epi16(ina3, inb3);
+          step1b[-2] = _mm256_sub_epi16(ina2, inb2);
+          step1b[-1] = _mm256_sub_epi16(ina1, inb1);
+          step1b[-0] = _mm256_sub_epi16(ina0, inb0);
+          step1a[ 0] = _mm256_slli_epi16(step1a[ 0], 2);
+          step1a[ 1] = _mm256_slli_epi16(step1a[ 1], 2);
+          step1a[ 2] = _mm256_slli_epi16(step1a[ 2], 2);
+          step1a[ 3] = _mm256_slli_epi16(step1a[ 3], 2);
+          step1b[-3] = _mm256_slli_epi16(step1b[-3], 2);
+          step1b[-2] = _mm256_slli_epi16(step1b[-2], 2);
+          step1b[-1] = _mm256_slli_epi16(step1b[-1], 2);
+          step1b[-0] = _mm256_slli_epi16(step1b[-0], 2);
+        }
+        {
+          const int16_t *ina =  in +  8 * str1;
+          const int16_t *inb =  in + 23 * str1;
+          __m256i *step1a = &step1[ 8];
+          __m256i *step1b = &step1[23];
+          const __m256i ina0  = _mm256_loadu_si256((const __m256i *)(ina));
+          const __m256i ina1  = _mm256_loadu_si256((const __m256i *)(ina + str1));
+          const __m256i ina2  = _mm256_loadu_si256((const __m256i *)(ina + str2));
+          const __m256i ina3  = _mm256_loadu_si256((const __m256i *)(ina + str3));
+          const __m256i inb3  = _mm256_loadu_si256((const __m256i *)(inb - str3));
+          const __m256i inb2  = _mm256_loadu_si256((const __m256i *)(inb - str2));
+          const __m256i inb1  = _mm256_loadu_si256((const __m256i *)(inb - str1));
+          const __m256i inb0  = _mm256_loadu_si256((const __m256i *)(inb));
+          step1a[ 0] = _mm256_add_epi16(ina0, inb0);
+          step1a[ 1] = _mm256_add_epi16(ina1, inb1);
+          step1a[ 2] = _mm256_add_epi16(ina2, inb2);
+          step1a[ 3] = _mm256_add_epi16(ina3, inb3);
+          step1b[-3] = _mm256_sub_epi16(ina3, inb3);
+          step1b[-2] = _mm256_sub_epi16(ina2, inb2);
+          step1b[-1] = _mm256_sub_epi16(ina1, inb1);
+          step1b[-0] = _mm256_sub_epi16(ina0, inb0);
+          step1a[ 0] = _mm256_slli_epi16(step1a[ 0], 2);
+          step1a[ 1] = _mm256_slli_epi16(step1a[ 1], 2);
+          step1a[ 2] = _mm256_slli_epi16(step1a[ 2], 2);
+          step1a[ 3] = _mm256_slli_epi16(step1a[ 3], 2);
+          step1b[-3] = _mm256_slli_epi16(step1b[-3], 2);
+          step1b[-2] = _mm256_slli_epi16(step1b[-2], 2);
+          step1b[-1] = _mm256_slli_epi16(step1b[-1], 2);
+          step1b[-0] = _mm256_slli_epi16(step1b[-0], 2);
+        }
+        {
+          const int16_t *ina =  in + 12 * str1;
+          const int16_t *inb =  in + 19 * str1;
+          __m256i *step1a = &step1[12];
+          __m256i *step1b = &step1[19];
+          const __m256i ina0  = _mm256_loadu_si256((const __m256i *)(ina));
+          const __m256i ina1  = _mm256_loadu_si256((const __m256i *)(ina + str1));
+          const __m256i ina2  = _mm256_loadu_si256((const __m256i *)(ina + str2));
+          const __m256i ina3  = _mm256_loadu_si256((const __m256i *)(ina + str3));
+          const __m256i inb3  = _mm256_loadu_si256((const __m256i *)(inb - str3));
+          const __m256i inb2  = _mm256_loadu_si256((const __m256i *)(inb - str2));
+          const __m256i inb1  = _mm256_loadu_si256((const __m256i *)(inb - str1));
+          const __m256i inb0  = _mm256_loadu_si256((const __m256i *)(inb));
+          step1a[ 0] = _mm256_add_epi16(ina0, inb0);
+          step1a[ 1] = _mm256_add_epi16(ina1, inb1);
+          step1a[ 2] = _mm256_add_epi16(ina2, inb2);
+          step1a[ 3] = _mm256_add_epi16(ina3, inb3);
+          step1b[-3] = _mm256_sub_epi16(ina3, inb3);
+          step1b[-2] = _mm256_sub_epi16(ina2, inb2);
+          step1b[-1] = _mm256_sub_epi16(ina1, inb1);
+          step1b[-0] = _mm256_sub_epi16(ina0, inb0);
+          step1a[ 0] = _mm256_slli_epi16(step1a[ 0], 2);
+          step1a[ 1] = _mm256_slli_epi16(step1a[ 1], 2);
+          step1a[ 2] = _mm256_slli_epi16(step1a[ 2], 2);
+          step1a[ 3] = _mm256_slli_epi16(step1a[ 3], 2);
+          step1b[-3] = _mm256_slli_epi16(step1b[-3], 2);
+          step1b[-2] = _mm256_slli_epi16(step1b[-2], 2);
+          step1b[-1] = _mm256_slli_epi16(step1b[-1], 2);
+          step1b[-0] = _mm256_slli_epi16(step1b[-0], 2);
+        }
+      } else {
+        int16_t *in = &intermediate[column_start];
+        // step1[i] =  in[ 0 * 32] + in[(32 -  1) * 32];
+        // Note: using the same approach as above to have common offset is
+        //       counter-productive as all offsets can be calculated at compile
+        //       time.
+        // Note: the next four blocks could be in a loop. That would help the
+        //       instruction cache but is actually slower.
+        {
+          __m256i in00  = _mm256_loadu_si256((const __m256i *)(in +  0 * 32));
+          __m256i in01  = _mm256_loadu_si256((const __m256i *)(in +  1 * 32));
+          __m256i in02  = _mm256_loadu_si256((const __m256i *)(in +  2 * 32));
+          __m256i in03  = _mm256_loadu_si256((const __m256i *)(in +  3 * 32));
+          __m256i in28  = _mm256_loadu_si256((const __m256i *)(in + 28 * 32));
+          __m256i in29  = _mm256_loadu_si256((const __m256i *)(in + 29 * 32));
+          __m256i in30  = _mm256_loadu_si256((const __m256i *)(in + 30 * 32));
+          __m256i in31  = _mm256_loadu_si256((const __m256i *)(in + 31 * 32));
+          step1[ 0] = _mm256_add_epi16(in00, in31);
+          step1[ 1] = _mm256_add_epi16(in01, in30);
+          step1[ 2] = _mm256_add_epi16(in02, in29);
+          step1[ 3] = _mm256_add_epi16(in03, in28);
+          step1[28] = _mm256_sub_epi16(in03, in28);
+          step1[29] = _mm256_sub_epi16(in02, in29);
+          step1[30] = _mm256_sub_epi16(in01, in30);
+          step1[31] = _mm256_sub_epi16(in00, in31);
+        }
+        {
+          __m256i in04  = _mm256_loadu_si256((const __m256i *)(in +  4 * 32));
+          __m256i in05  = _mm256_loadu_si256((const __m256i *)(in +  5 * 32));
+          __m256i in06  = _mm256_loadu_si256((const __m256i *)(in +  6 * 32));
+          __m256i in07  = _mm256_loadu_si256((const __m256i *)(in +  7 * 32));
+          __m256i in24  = _mm256_loadu_si256((const __m256i *)(in + 24 * 32));
+          __m256i in25  = _mm256_loadu_si256((const __m256i *)(in + 25 * 32));
+          __m256i in26  = _mm256_loadu_si256((const __m256i *)(in + 26 * 32));
+          __m256i in27  = _mm256_loadu_si256((const __m256i *)(in + 27 * 32));
+          step1[ 4] = _mm256_add_epi16(in04, in27);
+          step1[ 5] = _mm256_add_epi16(in05, in26);
+          step1[ 6] = _mm256_add_epi16(in06, in25);
+          step1[ 7] = _mm256_add_epi16(in07, in24);
+          step1[24] = _mm256_sub_epi16(in07, in24);
+          step1[25] = _mm256_sub_epi16(in06, in25);
+          step1[26] = _mm256_sub_epi16(in05, in26);
+          step1[27] = _mm256_sub_epi16(in04, in27);
+        }
+        {
+          __m256i in08  = _mm256_loadu_si256((const __m256i *)(in +  8 * 32));
+          __m256i in09  = _mm256_loadu_si256((const __m256i *)(in +  9 * 32));
+          __m256i in10  = _mm256_loadu_si256((const __m256i *)(in + 10 * 32));
+          __m256i in11  = _mm256_loadu_si256((const __m256i *)(in + 11 * 32));
+          __m256i in20  = _mm256_loadu_si256((const __m256i *)(in + 20 * 32));
+          __m256i in21  = _mm256_loadu_si256((const __m256i *)(in + 21 * 32));
+          __m256i in22  = _mm256_loadu_si256((const __m256i *)(in + 22 * 32));
+          __m256i in23  = _mm256_loadu_si256((const __m256i *)(in + 23 * 32));
+          step1[ 8] = _mm256_add_epi16(in08, in23);
+          step1[ 9] = _mm256_add_epi16(in09, in22);
+          step1[10] = _mm256_add_epi16(in10, in21);
+          step1[11] = _mm256_add_epi16(in11, in20);
+          step1[20] = _mm256_sub_epi16(in11, in20);
+          step1[21] = _mm256_sub_epi16(in10, in21);
+          step1[22] = _mm256_sub_epi16(in09, in22);
+          step1[23] = _mm256_sub_epi16(in08, in23);
+        }
+        {
+          __m256i in12  = _mm256_loadu_si256((const __m256i *)(in + 12 * 32));
+          __m256i in13  = _mm256_loadu_si256((const __m256i *)(in + 13 * 32));
+          __m256i in14  = _mm256_loadu_si256((const __m256i *)(in + 14 * 32));
+          __m256i in15  = _mm256_loadu_si256((const __m256i *)(in + 15 * 32));
+          __m256i in16  = _mm256_loadu_si256((const __m256i *)(in + 16 * 32));
+          __m256i in17  = _mm256_loadu_si256((const __m256i *)(in + 17 * 32));
+          __m256i in18  = _mm256_loadu_si256((const __m256i *)(in + 18 * 32));
+          __m256i in19  = _mm256_loadu_si256((const __m256i *)(in + 19 * 32));
+          step1[12] = _mm256_add_epi16(in12, in19);
+          step1[13] = _mm256_add_epi16(in13, in18);
+          step1[14] = _mm256_add_epi16(in14, in17);
+          step1[15] = _mm256_add_epi16(in15, in16);
+          step1[16] = _mm256_sub_epi16(in15, in16);
+          step1[17] = _mm256_sub_epi16(in14, in17);
+          step1[18] = _mm256_sub_epi16(in13, in18);
+          step1[19] = _mm256_sub_epi16(in12, in19);
+        }
+      }
+      // Stage 2
+      {
+        step2[ 0] = _mm256_add_epi16(step1[0], step1[15]);
+        step2[ 1] = _mm256_add_epi16(step1[1], step1[14]);
+        step2[ 2] = _mm256_add_epi16(step1[2], step1[13]);
+        step2[ 3] = _mm256_add_epi16(step1[3], step1[12]);
+        step2[ 4] = _mm256_add_epi16(step1[4], step1[11]);
+        step2[ 5] = _mm256_add_epi16(step1[5], step1[10]);
+        step2[ 6] = _mm256_add_epi16(step1[6], step1[ 9]);
+        step2[ 7] = _mm256_add_epi16(step1[7], step1[ 8]);
+        step2[ 8] = _mm256_sub_epi16(step1[7], step1[ 8]);
+        step2[ 9] = _mm256_sub_epi16(step1[6], step1[ 9]);
+        step2[10] = _mm256_sub_epi16(step1[5], step1[10]);
+        step2[11] = _mm256_sub_epi16(step1[4], step1[11]);
+        step2[12] = _mm256_sub_epi16(step1[3], step1[12]);
+        step2[13] = _mm256_sub_epi16(step1[2], step1[13]);
+        step2[14] = _mm256_sub_epi16(step1[1], step1[14]);
+        step2[15] = _mm256_sub_epi16(step1[0], step1[15]);
+      }
+      {
+        const __m256i s2_20_0 = _mm256_unpacklo_epi16(step1[27], step1[20]);
+        const __m256i s2_20_1 = _mm256_unpackhi_epi16(step1[27], step1[20]);
+        const __m256i s2_21_0 = _mm256_unpacklo_epi16(step1[26], step1[21]);
+        const __m256i s2_21_1 = _mm256_unpackhi_epi16(step1[26], step1[21]);
+        const __m256i s2_22_0 = _mm256_unpacklo_epi16(step1[25], step1[22]);
+        const __m256i s2_22_1 = _mm256_unpackhi_epi16(step1[25], step1[22]);
+        const __m256i s2_23_0 = _mm256_unpacklo_epi16(step1[24], step1[23]);
+        const __m256i s2_23_1 = _mm256_unpackhi_epi16(step1[24], step1[23]);
+        const __m256i s2_20_2 = _mm256_madd_epi16(s2_20_0, k__cospi_p16_m16);
+        const __m256i s2_20_3 = _mm256_madd_epi16(s2_20_1, k__cospi_p16_m16);
+        const __m256i s2_21_2 = _mm256_madd_epi16(s2_21_0, k__cospi_p16_m16);
+        const __m256i s2_21_3 = _mm256_madd_epi16(s2_21_1, k__cospi_p16_m16);
+        const __m256i s2_22_2 = _mm256_madd_epi16(s2_22_0, k__cospi_p16_m16);
+        const __m256i s2_22_3 = _mm256_madd_epi16(s2_22_1, k__cospi_p16_m16);
+        const __m256i s2_23_2 = _mm256_madd_epi16(s2_23_0, k__cospi_p16_m16);
+        const __m256i s2_23_3 = _mm256_madd_epi16(s2_23_1, k__cospi_p16_m16);
+        const __m256i s2_24_2 = _mm256_madd_epi16(s2_23_0, k__cospi_p16_p16);
+        const __m256i s2_24_3 = _mm256_madd_epi16(s2_23_1, k__cospi_p16_p16);
+        const __m256i s2_25_2 = _mm256_madd_epi16(s2_22_0, k__cospi_p16_p16);
+        const __m256i s2_25_3 = _mm256_madd_epi16(s2_22_1, k__cospi_p16_p16);
+        const __m256i s2_26_2 = _mm256_madd_epi16(s2_21_0, k__cospi_p16_p16);
+        const __m256i s2_26_3 = _mm256_madd_epi16(s2_21_1, k__cospi_p16_p16);
+        const __m256i s2_27_2 = _mm256_madd_epi16(s2_20_0, k__cospi_p16_p16);
+        const __m256i s2_27_3 = _mm256_madd_epi16(s2_20_1, k__cospi_p16_p16);
+        // dct_const_round_shift
+        const __m256i s2_20_4 = _mm256_add_epi32(s2_20_2, k__DCT_CONST_ROUNDING);
+        const __m256i s2_20_5 = _mm256_add_epi32(s2_20_3, k__DCT_CONST_ROUNDING);
+        const __m256i s2_21_4 = _mm256_add_epi32(s2_21_2, k__DCT_CONST_ROUNDING);
+        const __m256i s2_21_5 = _mm256_add_epi32(s2_21_3, k__DCT_CONST_ROUNDING);
+        const __m256i s2_22_4 = _mm256_add_epi32(s2_22_2, k__DCT_CONST_ROUNDING);
+        const __m256i s2_22_5 = _mm256_add_epi32(s2_22_3, k__DCT_CONST_ROUNDING);
+        const __m256i s2_23_4 = _mm256_add_epi32(s2_23_2, k__DCT_CONST_ROUNDING);
+        const __m256i s2_23_5 = _mm256_add_epi32(s2_23_3, k__DCT_CONST_ROUNDING);
+        const __m256i s2_24_4 = _mm256_add_epi32(s2_24_2, k__DCT_CONST_ROUNDING);
+        const __m256i s2_24_5 = _mm256_add_epi32(s2_24_3, k__DCT_CONST_ROUNDING);
+        const __m256i s2_25_4 = _mm256_add_epi32(s2_25_2, k__DCT_CONST_ROUNDING);
+        const __m256i s2_25_5 = _mm256_add_epi32(s2_25_3, k__DCT_CONST_ROUNDING);
+        const __m256i s2_26_4 = _mm256_add_epi32(s2_26_2, k__DCT_CONST_ROUNDING);
+        const __m256i s2_26_5 = _mm256_add_epi32(s2_26_3, k__DCT_CONST_ROUNDING);
+        const __m256i s2_27_4 = _mm256_add_epi32(s2_27_2, k__DCT_CONST_ROUNDING);
+        const __m256i s2_27_5 = _mm256_add_epi32(s2_27_3, k__DCT_CONST_ROUNDING);
+        const __m256i s2_20_6 = _mm256_srai_epi32(s2_20_4, DCT_CONST_BITS);
+        const __m256i s2_20_7 = _mm256_srai_epi32(s2_20_5, DCT_CONST_BITS);
+        const __m256i s2_21_6 = _mm256_srai_epi32(s2_21_4, DCT_CONST_BITS);
+        const __m256i s2_21_7 = _mm256_srai_epi32(s2_21_5, DCT_CONST_BITS);
+        const __m256i s2_22_6 = _mm256_srai_epi32(s2_22_4, DCT_CONST_BITS);
+        const __m256i s2_22_7 = _mm256_srai_epi32(s2_22_5, DCT_CONST_BITS);
+        const __m256i s2_23_6 = _mm256_srai_epi32(s2_23_4, DCT_CONST_BITS);
+        const __m256i s2_23_7 = _mm256_srai_epi32(s2_23_5, DCT_CONST_BITS);
+        const __m256i s2_24_6 = _mm256_srai_epi32(s2_24_4, DCT_CONST_BITS);
+        const __m256i s2_24_7 = _mm256_srai_epi32(s2_24_5, DCT_CONST_BITS);
+        const __m256i s2_25_6 = _mm256_srai_epi32(s2_25_4, DCT_CONST_BITS);
+        const __m256i s2_25_7 = _mm256_srai_epi32(s2_25_5, DCT_CONST_BITS);
+        const __m256i s2_26_6 = _mm256_srai_epi32(s2_26_4, DCT_CONST_BITS);
+        const __m256i s2_26_7 = _mm256_srai_epi32(s2_26_5, DCT_CONST_BITS);
+        const __m256i s2_27_6 = _mm256_srai_epi32(s2_27_4, DCT_CONST_BITS);
+        const __m256i s2_27_7 = _mm256_srai_epi32(s2_27_5, DCT_CONST_BITS);
+        // Combine
+        step2[20] = _mm256_packs_epi32(s2_20_6, s2_20_7);
+        step2[21] = _mm256_packs_epi32(s2_21_6, s2_21_7);
+        step2[22] = _mm256_packs_epi32(s2_22_6, s2_22_7);
+        step2[23] = _mm256_packs_epi32(s2_23_6, s2_23_7);
+        step2[24] = _mm256_packs_epi32(s2_24_6, s2_24_7);
+        step2[25] = _mm256_packs_epi32(s2_25_6, s2_25_7);
+        step2[26] = _mm256_packs_epi32(s2_26_6, s2_26_7);
+        step2[27] = _mm256_packs_epi32(s2_27_6, s2_27_7);
+      }
+
+#if !FDCT32x32_HIGH_PRECISION
+      // dump the magnitude by half, hence the intermediate values are within
+      // the range of 16 bits.
+      if (1 == pass) {
+        __m256i s3_00_0 = _mm256_cmpgt_epi16(kZero,step2[ 0]);
+        __m256i s3_01_0 = _mm256_cmpgt_epi16(kZero,step2[ 1]);
+        __m256i s3_02_0 = _mm256_cmpgt_epi16(kZero,step2[ 2]);
+        __m256i s3_03_0 = _mm256_cmpgt_epi16(kZero,step2[ 3]);
+        __m256i s3_04_0 = _mm256_cmpgt_epi16(kZero,step2[ 4]);
+        __m256i s3_05_0 = _mm256_cmpgt_epi16(kZero,step2[ 5]);
+        __m256i s3_06_0 = _mm256_cmpgt_epi16(kZero,step2[ 6]);
+        __m256i s3_07_0 = _mm256_cmpgt_epi16(kZero,step2[ 7]);
+        __m256i s2_08_0 = _mm256_cmpgt_epi16(kZero,step2[ 8]);
+        __m256i s2_09_0 = _mm256_cmpgt_epi16(kZero,step2[ 9]);
+        __m256i s3_10_0 = _mm256_cmpgt_epi16(kZero,step2[10]);
+        __m256i s3_11_0 = _mm256_cmpgt_epi16(kZero,step2[11]);
+        __m256i s3_12_0 = _mm256_cmpgt_epi16(kZero,step2[12]);
+        __m256i s3_13_0 = _mm256_cmpgt_epi16(kZero,step2[13]);
+        __m256i s2_14_0 = _mm256_cmpgt_epi16(kZero,step2[14]);
+        __m256i s2_15_0 = _mm256_cmpgt_epi16(kZero,step2[15]);
+        __m256i s3_16_0 = _mm256_cmpgt_epi16(kZero,step1[16]);
+        __m256i s3_17_0 = _mm256_cmpgt_epi16(kZero,step1[17]);
+        __m256i s3_18_0 = _mm256_cmpgt_epi16(kZero,step1[18]);
+        __m256i s3_19_0 = _mm256_cmpgt_epi16(kZero,step1[19]);
+        __m256i s3_20_0 = _mm256_cmpgt_epi16(kZero,step2[20]);
+        __m256i s3_21_0 = _mm256_cmpgt_epi16(kZero,step2[21]);
+        __m256i s3_22_0 = _mm256_cmpgt_epi16(kZero,step2[22]);
+        __m256i s3_23_0 = _mm256_cmpgt_epi16(kZero,step2[23]);
+        __m256i s3_24_0 = _mm256_cmpgt_epi16(kZero,step2[24]);
+        __m256i s3_25_0 = _mm256_cmpgt_epi16(kZero,step2[25]);
+        __m256i s3_26_0 = _mm256_cmpgt_epi16(kZero,step2[26]);
+        __m256i s3_27_0 = _mm256_cmpgt_epi16(kZero,step2[27]);
+        __m256i s3_28_0 = _mm256_cmpgt_epi16(kZero,step1[28]);
+        __m256i s3_29_0 = _mm256_cmpgt_epi16(kZero,step1[29]);
+        __m256i s3_30_0 = _mm256_cmpgt_epi16(kZero,step1[30]);
+        __m256i s3_31_0 = _mm256_cmpgt_epi16(kZero,step1[31]);
+
+        step2[ 0] = _mm256_sub_epi16(step2[ 0], s3_00_0);
+        step2[ 1] = _mm256_sub_epi16(step2[ 1], s3_01_0);
+        step2[ 2] = _mm256_sub_epi16(step2[ 2], s3_02_0);
+        step2[ 3] = _mm256_sub_epi16(step2[ 3], s3_03_0);
+        step2[ 4] = _mm256_sub_epi16(step2[ 4], s3_04_0);
+        step2[ 5] = _mm256_sub_epi16(step2[ 5], s3_05_0);
+        step2[ 6] = _mm256_sub_epi16(step2[ 6], s3_06_0);
+        step2[ 7] = _mm256_sub_epi16(step2[ 7], s3_07_0);
+        step2[ 8] = _mm256_sub_epi16(step2[ 8], s2_08_0);
+        step2[ 9] = _mm256_sub_epi16(step2[ 9], s2_09_0);
+        step2[10] = _mm256_sub_epi16(step2[10], s3_10_0);
+        step2[11] = _mm256_sub_epi16(step2[11], s3_11_0);
+        step2[12] = _mm256_sub_epi16(step2[12], s3_12_0);
+        step2[13] = _mm256_sub_epi16(step2[13], s3_13_0);
+        step2[14] = _mm256_sub_epi16(step2[14], s2_14_0);
+        step2[15] = _mm256_sub_epi16(step2[15], s2_15_0);
+        step1[16] = _mm256_sub_epi16(step1[16], s3_16_0);
+        step1[17] = _mm256_sub_epi16(step1[17], s3_17_0);
+        step1[18] = _mm256_sub_epi16(step1[18], s3_18_0);
+        step1[19] = _mm256_sub_epi16(step1[19], s3_19_0);
+        step2[20] = _mm256_sub_epi16(step2[20], s3_20_0);
+        step2[21] = _mm256_sub_epi16(step2[21], s3_21_0);
+        step2[22] = _mm256_sub_epi16(step2[22], s3_22_0);
+        step2[23] = _mm256_sub_epi16(step2[23], s3_23_0);
+        step2[24] = _mm256_sub_epi16(step2[24], s3_24_0);
+        step2[25] = _mm256_sub_epi16(step2[25], s3_25_0);
+        step2[26] = _mm256_sub_epi16(step2[26], s3_26_0);
+        step2[27] = _mm256_sub_epi16(step2[27], s3_27_0);
+        step1[28] = _mm256_sub_epi16(step1[28], s3_28_0);
+        step1[29] = _mm256_sub_epi16(step1[29], s3_29_0);
+        step1[30] = _mm256_sub_epi16(step1[30], s3_30_0);
+        step1[31] = _mm256_sub_epi16(step1[31], s3_31_0);
+
+        step2[ 0] = _mm256_add_epi16(step2[ 0], kOne);
+        step2[ 1] = _mm256_add_epi16(step2[ 1], kOne);
+        step2[ 2] = _mm256_add_epi16(step2[ 2], kOne);
+        step2[ 3] = _mm256_add_epi16(step2[ 3], kOne);
+        step2[ 4] = _mm256_add_epi16(step2[ 4], kOne);
+        step2[ 5] = _mm256_add_epi16(step2[ 5], kOne);
+        step2[ 6] = _mm256_add_epi16(step2[ 6], kOne);
+        step2[ 7] = _mm256_add_epi16(step2[ 7], kOne);
+        step2[ 8] = _mm256_add_epi16(step2[ 8], kOne);
+        step2[ 9] = _mm256_add_epi16(step2[ 9], kOne);
+        step2[10] = _mm256_add_epi16(step2[10], kOne);
+        step2[11] = _mm256_add_epi16(step2[11], kOne);
+        step2[12] = _mm256_add_epi16(step2[12], kOne);
+        step2[13] = _mm256_add_epi16(step2[13], kOne);
+        step2[14] = _mm256_add_epi16(step2[14], kOne);
+        step2[15] = _mm256_add_epi16(step2[15], kOne);
+        step1[16] = _mm256_add_epi16(step1[16], kOne);
+        step1[17] = _mm256_add_epi16(step1[17], kOne);
+        step1[18] = _mm256_add_epi16(step1[18], kOne);
+        step1[19] = _mm256_add_epi16(step1[19], kOne);
+        step2[20] = _mm256_add_epi16(step2[20], kOne);
+        step2[21] = _mm256_add_epi16(step2[21], kOne);
+        step2[22] = _mm256_add_epi16(step2[22], kOne);
+        step2[23] = _mm256_add_epi16(step2[23], kOne);
+        step2[24] = _mm256_add_epi16(step2[24], kOne);
+        step2[25] = _mm256_add_epi16(step2[25], kOne);
+        step2[26] = _mm256_add_epi16(step2[26], kOne);
+        step2[27] = _mm256_add_epi16(step2[27], kOne);
+        step1[28] = _mm256_add_epi16(step1[28], kOne);
+        step1[29] = _mm256_add_epi16(step1[29], kOne);
+        step1[30] = _mm256_add_epi16(step1[30], kOne);
+        step1[31] = _mm256_add_epi16(step1[31], kOne);
+
+        step2[ 0] = _mm256_srai_epi16(step2[ 0], 2);
+        step2[ 1] = _mm256_srai_epi16(step2[ 1], 2);
+        step2[ 2] = _mm256_srai_epi16(step2[ 2], 2);
+        step2[ 3] = _mm256_srai_epi16(step2[ 3], 2);
+        step2[ 4] = _mm256_srai_epi16(step2[ 4], 2);
+        step2[ 5] = _mm256_srai_epi16(step2[ 5], 2);
+        step2[ 6] = _mm256_srai_epi16(step2[ 6], 2);
+        step2[ 7] = _mm256_srai_epi16(step2[ 7], 2);
+        step2[ 8] = _mm256_srai_epi16(step2[ 8], 2);
+        step2[ 9] = _mm256_srai_epi16(step2[ 9], 2);
+        step2[10] = _mm256_srai_epi16(step2[10], 2);
+        step2[11] = _mm256_srai_epi16(step2[11], 2);
+        step2[12] = _mm256_srai_epi16(step2[12], 2);
+        step2[13] = _mm256_srai_epi16(step2[13], 2);
+        step2[14] = _mm256_srai_epi16(step2[14], 2);
+        step2[15] = _mm256_srai_epi16(step2[15], 2);
+        step1[16] = _mm256_srai_epi16(step1[16], 2);
+        step1[17] = _mm256_srai_epi16(step1[17], 2);
+        step1[18] = _mm256_srai_epi16(step1[18], 2);
+        step1[19] = _mm256_srai_epi16(step1[19], 2);
+        step2[20] = _mm256_srai_epi16(step2[20], 2);
+        step2[21] = _mm256_srai_epi16(step2[21], 2);
+        step2[22] = _mm256_srai_epi16(step2[22], 2);
+        step2[23] = _mm256_srai_epi16(step2[23], 2);
+        step2[24] = _mm256_srai_epi16(step2[24], 2);
+        step2[25] = _mm256_srai_epi16(step2[25], 2);
+        step2[26] = _mm256_srai_epi16(step2[26], 2);
+        step2[27] = _mm256_srai_epi16(step2[27], 2);
+        step1[28] = _mm256_srai_epi16(step1[28], 2);
+        step1[29] = _mm256_srai_epi16(step1[29], 2);
+        step1[30] = _mm256_srai_epi16(step1[30], 2);
+        step1[31] = _mm256_srai_epi16(step1[31], 2);
+      }
+#endif
+
+#if FDCT32x32_HIGH_PRECISION
+      if (pass == 0) {
+#endif
+      // Stage 3
+      {
+        step3[0] = _mm256_add_epi16(step2[(8 - 1)], step2[0]);
+        step3[1] = _mm256_add_epi16(step2[(8 - 2)], step2[1]);
+        step3[2] = _mm256_add_epi16(step2[(8 - 3)], step2[2]);
+        step3[3] = _mm256_add_epi16(step2[(8 - 4)], step2[3]);
+        step3[4] = _mm256_sub_epi16(step2[(8 - 5)], step2[4]);
+        step3[5] = _mm256_sub_epi16(step2[(8 - 6)], step2[5]);
+        step3[6] = _mm256_sub_epi16(step2[(8 - 7)], step2[6]);
+        step3[7] = _mm256_sub_epi16(step2[(8 - 8)], step2[7]);
+      }
+      {
+        const __m256i s3_10_0 = _mm256_unpacklo_epi16(step2[13], step2[10]);
+        const __m256i s3_10_1 = _mm256_unpackhi_epi16(step2[13], step2[10]);
+        const __m256i s3_11_0 = _mm256_unpacklo_epi16(step2[12], step2[11]);
+        const __m256i s3_11_1 = _mm256_unpackhi_epi16(step2[12], step2[11]);
+        const __m256i s3_10_2 = _mm256_madd_epi16(s3_10_0, k__cospi_p16_m16);
+        const __m256i s3_10_3 = _mm256_madd_epi16(s3_10_1, k__cospi_p16_m16);
+        const __m256i s3_11_2 = _mm256_madd_epi16(s3_11_0, k__cospi_p16_m16);
+        const __m256i s3_11_3 = _mm256_madd_epi16(s3_11_1, k__cospi_p16_m16);
+        const __m256i s3_12_2 = _mm256_madd_epi16(s3_11_0, k__cospi_p16_p16);
+        const __m256i s3_12_3 = _mm256_madd_epi16(s3_11_1, k__cospi_p16_p16);
+        const __m256i s3_13_2 = _mm256_madd_epi16(s3_10_0, k__cospi_p16_p16);
+        const __m256i s3_13_3 = _mm256_madd_epi16(s3_10_1, k__cospi_p16_p16);
+        // dct_const_round_shift
+        const __m256i s3_10_4 = _mm256_add_epi32(s3_10_2, k__DCT_CONST_ROUNDING);
+        const __m256i s3_10_5 = _mm256_add_epi32(s3_10_3, k__DCT_CONST_ROUNDING);
+        const __m256i s3_11_4 = _mm256_add_epi32(s3_11_2, k__DCT_CONST_ROUNDING);
+        const __m256i s3_11_5 = _mm256_add_epi32(s3_11_3, k__DCT_CONST_ROUNDING);
+        const __m256i s3_12_4 = _mm256_add_epi32(s3_12_2, k__DCT_CONST_ROUNDING);
+        const __m256i s3_12_5 = _mm256_add_epi32(s3_12_3, k__DCT_CONST_ROUNDING);
+        const __m256i s3_13_4 = _mm256_add_epi32(s3_13_2, k__DCT_CONST_ROUNDING);
+        const __m256i s3_13_5 = _mm256_add_epi32(s3_13_3, k__DCT_CONST_ROUNDING);
+        const __m256i s3_10_6 = _mm256_srai_epi32(s3_10_4, DCT_CONST_BITS);
+        const __m256i s3_10_7 = _mm256_srai_epi32(s3_10_5, DCT_CONST_BITS);
+        const __m256i s3_11_6 = _mm256_srai_epi32(s3_11_4, DCT_CONST_BITS);
+        const __m256i s3_11_7 = _mm256_srai_epi32(s3_11_5, DCT_CONST_BITS);
+        const __m256i s3_12_6 = _mm256_srai_epi32(s3_12_4, DCT_CONST_BITS);
+        const __m256i s3_12_7 = _mm256_srai_epi32(s3_12_5, DCT_CONST_BITS);
+        const __m256i s3_13_6 = _mm256_srai_epi32(s3_13_4, DCT_CONST_BITS);
+        const __m256i s3_13_7 = _mm256_srai_epi32(s3_13_5, DCT_CONST_BITS);
+        // Combine
+        step3[10] = _mm256_packs_epi32(s3_10_6, s3_10_7);
+        step3[11] = _mm256_packs_epi32(s3_11_6, s3_11_7);
+        step3[12] = _mm256_packs_epi32(s3_12_6, s3_12_7);
+        step3[13] = _mm256_packs_epi32(s3_13_6, s3_13_7);
+      }
+      {
+        step3[16] = _mm256_add_epi16(step2[23], step1[16]);
+        step3[17] = _mm256_add_epi16(step2[22], step1[17]);
+        step3[18] = _mm256_add_epi16(step2[21], step1[18]);
+        step3[19] = _mm256_add_epi16(step2[20], step1[19]);
+        step3[20] = _mm256_sub_epi16(step1[19], step2[20]);
+        step3[21] = _mm256_sub_epi16(step1[18], step2[21]);
+        step3[22] = _mm256_sub_epi16(step1[17], step2[22]);
+        step3[23] = _mm256_sub_epi16(step1[16], step2[23]);
+        step3[24] = _mm256_sub_epi16(step1[31], step2[24]);
+        step3[25] = _mm256_sub_epi16(step1[30], step2[25]);
+        step3[26] = _mm256_sub_epi16(step1[29], step2[26]);
+        step3[27] = _mm256_sub_epi16(step1[28], step2[27]);
+        step3[28] = _mm256_add_epi16(step2[27], step1[28]);
+        step3[29] = _mm256_add_epi16(step2[26], step1[29]);
+        step3[30] = _mm256_add_epi16(step2[25], step1[30]);
+        step3[31] = _mm256_add_epi16(step2[24], step1[31]);
+      }
+
+      // Stage 4
+      {
+        step1[ 0] = _mm256_add_epi16(step3[ 3], step3[ 0]);
+        step1[ 1] = _mm256_add_epi16(step3[ 2], step3[ 1]);
+        step1[ 2] = _mm256_sub_epi16(step3[ 1], step3[ 2]);
+        step1[ 3] = _mm256_sub_epi16(step3[ 0], step3[ 3]);
+        step1[ 8] = _mm256_add_epi16(step3[11], step2[ 8]);
+        step1[ 9] = _mm256_add_epi16(step3[10], step2[ 9]);
+        step1[10] = _mm256_sub_epi16(step2[ 9], step3[10]);
+        step1[11] = _mm256_sub_epi16(step2[ 8], step3[11]);
+        step1[12] = _mm256_sub_epi16(step2[15], step3[12]);
+        step1[13] = _mm256_sub_epi16(step2[14], step3[13]);
+        step1[14] = _mm256_add_epi16(step3[13], step2[14]);
+        step1[15] = _mm256_add_epi16(step3[12], step2[15]);
+      }
+      {
+        const __m256i s1_05_0 = _mm256_unpacklo_epi16(step3[6], step3[5]);
+        const __m256i s1_05_1 = _mm256_unpackhi_epi16(step3[6], step3[5]);
+        const __m256i s1_05_2 = _mm256_madd_epi16(s1_05_0, k__cospi_p16_m16);
+        const __m256i s1_05_3 = _mm256_madd_epi16(s1_05_1, k__cospi_p16_m16);
+        const __m256i s1_06_2 = _mm256_madd_epi16(s1_05_0, k__cospi_p16_p16);
+        const __m256i s1_06_3 = _mm256_madd_epi16(s1_05_1, k__cospi_p16_p16);
+        // dct_const_round_shift
+        const __m256i s1_05_4 = _mm256_add_epi32(s1_05_2, k__DCT_CONST_ROUNDING);
+        const __m256i s1_05_5 = _mm256_add_epi32(s1_05_3, k__DCT_CONST_ROUNDING);
+        const __m256i s1_06_4 = _mm256_add_epi32(s1_06_2, k__DCT_CONST_ROUNDING);
+        const __m256i s1_06_5 = _mm256_add_epi32(s1_06_3, k__DCT_CONST_ROUNDING);
+        const __m256i s1_05_6 = _mm256_srai_epi32(s1_05_4, DCT_CONST_BITS);
+        const __m256i s1_05_7 = _mm256_srai_epi32(s1_05_5, DCT_CONST_BITS);
+        const __m256i s1_06_6 = _mm256_srai_epi32(s1_06_4, DCT_CONST_BITS);
+        const __m256i s1_06_7 = _mm256_srai_epi32(s1_06_5, DCT_CONST_BITS);
+        // Combine
+        step1[5] = _mm256_packs_epi32(s1_05_6, s1_05_7);
+        step1[6] = _mm256_packs_epi32(s1_06_6, s1_06_7);
+      }
+      {
+        const __m256i s1_18_0 = _mm256_unpacklo_epi16(step3[18], step3[29]);
+        const __m256i s1_18_1 = _mm256_unpackhi_epi16(step3[18], step3[29]);
+        const __m256i s1_19_0 = _mm256_unpacklo_epi16(step3[19], step3[28]);
+        const __m256i s1_19_1 = _mm256_unpackhi_epi16(step3[19], step3[28]);
+        const __m256i s1_20_0 = _mm256_unpacklo_epi16(step3[20], step3[27]);
+        const __m256i s1_20_1 = _mm256_unpackhi_epi16(step3[20], step3[27]);
+        const __m256i s1_21_0 = _mm256_unpacklo_epi16(step3[21], step3[26]);
+        const __m256i s1_21_1 = _mm256_unpackhi_epi16(step3[21], step3[26]);
+        const __m256i s1_18_2 = _mm256_madd_epi16(s1_18_0, k__cospi_m08_p24);
+        const __m256i s1_18_3 = _mm256_madd_epi16(s1_18_1, k__cospi_m08_p24);
+        const __m256i s1_19_2 = _mm256_madd_epi16(s1_19_0, k__cospi_m08_p24);
+        const __m256i s1_19_3 = _mm256_madd_epi16(s1_19_1, k__cospi_m08_p24);
+        const __m256i s1_20_2 = _mm256_madd_epi16(s1_20_0, k__cospi_m24_m08);
+        const __m256i s1_20_3 = _mm256_madd_epi16(s1_20_1, k__cospi_m24_m08);
+        const __m256i s1_21_2 = _mm256_madd_epi16(s1_21_0, k__cospi_m24_m08);
+        const __m256i s1_21_3 = _mm256_madd_epi16(s1_21_1, k__cospi_m24_m08);
+        const __m256i s1_26_2 = _mm256_madd_epi16(s1_21_0, k__cospi_m08_p24);
+        const __m256i s1_26_3 = _mm256_madd_epi16(s1_21_1, k__cospi_m08_p24);
+        const __m256i s1_27_2 = _mm256_madd_epi16(s1_20_0, k__cospi_m08_p24);
+        const __m256i s1_27_3 = _mm256_madd_epi16(s1_20_1, k__cospi_m08_p24);
+        const __m256i s1_28_2 = _mm256_madd_epi16(s1_19_0, k__cospi_p24_p08);
+        const __m256i s1_28_3 = _mm256_madd_epi16(s1_19_1, k__cospi_p24_p08);
+        const __m256i s1_29_2 = _mm256_madd_epi16(s1_18_0, k__cospi_p24_p08);
+        const __m256i s1_29_3 = _mm256_madd_epi16(s1_18_1, k__cospi_p24_p08);
+        // dct_const_round_shift
+        const __m256i s1_18_4 = _mm256_add_epi32(s1_18_2, k__DCT_CONST_ROUNDING);
+        const __m256i s1_18_5 = _mm256_add_epi32(s1_18_3, k__DCT_CONST_ROUNDING);
+        const __m256i s1_19_4 = _mm256_add_epi32(s1_19_2, k__DCT_CONST_ROUNDING);
+        const __m256i s1_19_5 = _mm256_add_epi32(s1_19_3, k__DCT_CONST_ROUNDING);
+        const __m256i s1_20_4 = _mm256_add_epi32(s1_20_2, k__DCT_CONST_ROUNDING);
+        const __m256i s1_20_5 = _mm256_add_epi32(s1_20_3, k__DCT_CONST_ROUNDING);
+        const __m256i s1_21_4 = _mm256_add_epi32(s1_21_2, k__DCT_CONST_ROUNDING);
+        const __m256i s1_21_5 = _mm256_add_epi32(s1_21_3, k__DCT_CONST_ROUNDING);
+        const __m256i s1_26_4 = _mm256_add_epi32(s1_26_2, k__DCT_CONST_ROUNDING);
+        const __m256i s1_26_5 = _mm256_add_epi32(s1_26_3, k__DCT_CONST_ROUNDING);
+        const __m256i s1_27_4 = _mm256_add_epi32(s1_27_2, k__DCT_CONST_ROUNDING);
+        const __m256i s1_27_5 = _mm256_add_epi32(s1_27_3, k__DCT_CONST_ROUNDING);
+        const __m256i s1_28_4 = _mm256_add_epi32(s1_28_2, k__DCT_CONST_ROUNDING);
+        const __m256i s1_28_5 = _mm256_add_epi32(s1_28_3, k__DCT_CONST_ROUNDING);
+        const __m256i s1_29_4 = _mm256_add_epi32(s1_29_2, k__DCT_CONST_ROUNDING);
+        const __m256i s1_29_5 = _mm256_add_epi32(s1_29_3, k__DCT_CONST_ROUNDING);
+        const __m256i s1_18_6 = _mm256_srai_epi32(s1_18_4, DCT_CONST_BITS);
+        const __m256i s1_18_7 = _mm256_srai_epi32(s1_18_5, DCT_CONST_BITS);
+        const __m256i s1_19_6 = _mm256_srai_epi32(s1_19_4, DCT_CONST_BITS);
+        const __m256i s1_19_7 = _mm256_srai_epi32(s1_19_5, DCT_CONST_BITS);
+        const __m256i s1_20_6 = _mm256_srai_epi32(s1_20_4, DCT_CONST_BITS);
+        const __m256i s1_20_7 = _mm256_srai_epi32(s1_20_5, DCT_CONST_BITS);
+        const __m256i s1_21_6 = _mm256_srai_epi32(s1_21_4, DCT_CONST_BITS);
+        const __m256i s1_21_7 = _mm256_srai_epi32(s1_21_5, DCT_CONST_BITS);
+        const __m256i s1_26_6 = _mm256_srai_epi32(s1_26_4, DCT_CONST_BITS);
+        const __m256i s1_26_7 = _mm256_srai_epi32(s1_26_5, DCT_CONST_BITS);
+        const __m256i s1_27_6 = _mm256_srai_epi32(s1_27_4, DCT_CONST_BITS);
+        const __m256i s1_27_7 = _mm256_srai_epi32(s1_27_5, DCT_CONST_BITS);
+        const __m256i s1_28_6 = _mm256_srai_epi32(s1_28_4, DCT_CONST_BITS);
+        const __m256i s1_28_7 = _mm256_srai_epi32(s1_28_5, DCT_CONST_BITS);
+        const __m256i s1_29_6 = _mm256_srai_epi32(s1_29_4, DCT_CONST_BITS);
+        const __m256i s1_29_7 = _mm256_srai_epi32(s1_29_5, DCT_CONST_BITS);
+        // Combine
+        step1[18] = _mm256_packs_epi32(s1_18_6, s1_18_7);
+        step1[19] = _mm256_packs_epi32(s1_19_6, s1_19_7);
+        step1[20] = _mm256_packs_epi32(s1_20_6, s1_20_7);
+        step1[21] = _mm256_packs_epi32(s1_21_6, s1_21_7);
+        step1[26] = _mm256_packs_epi32(s1_26_6, s1_26_7);
+        step1[27] = _mm256_packs_epi32(s1_27_6, s1_27_7);
+        step1[28] = _mm256_packs_epi32(s1_28_6, s1_28_7);
+        step1[29] = _mm256_packs_epi32(s1_29_6, s1_29_7);
+      }
+      // Stage 5
+      {
+        step2[4] = _mm256_add_epi16(step1[5], step3[4]);
+        step2[5] = _mm256_sub_epi16(step3[4], step1[5]);
+        step2[6] = _mm256_sub_epi16(step3[7], step1[6]);
+        step2[7] = _mm256_add_epi16(step1[6], step3[7]);
+      }
+      {
+        const __m256i out_00_0 = _mm256_unpacklo_epi16(step1[0], step1[1]);
+        const __m256i out_00_1 = _mm256_unpackhi_epi16(step1[0], step1[1]);
+        const __m256i out_08_0 = _mm256_unpacklo_epi16(step1[2], step1[3]);
+        const __m256i out_08_1 = _mm256_unpackhi_epi16(step1[2], step1[3]);
+        const __m256i out_00_2 = _mm256_madd_epi16(out_00_0, k__cospi_p16_p16);
+        const __m256i out_00_3 = _mm256_madd_epi16(out_00_1, k__cospi_p16_p16);
+        const __m256i out_16_2 = _mm256_madd_epi16(out_00_0, k__cospi_p16_m16);
+        const __m256i out_16_3 = _mm256_madd_epi16(out_00_1, k__cospi_p16_m16);
+        const __m256i out_08_2 = _mm256_madd_epi16(out_08_0, k__cospi_p24_p08);
+        const __m256i out_08_3 = _mm256_madd_epi16(out_08_1, k__cospi_p24_p08);
+        const __m256i out_24_2 = _mm256_madd_epi16(out_08_0, k__cospi_m08_p24);
+        const __m256i out_24_3 = _mm256_madd_epi16(out_08_1, k__cospi_m08_p24);
+        // dct_const_round_shift
+        const __m256i out_00_4 = _mm256_add_epi32(out_00_2, k__DCT_CONST_ROUNDING);
+        const __m256i out_00_5 = _mm256_add_epi32(out_00_3, k__DCT_CONST_ROUNDING);
+        const __m256i out_16_4 = _mm256_add_epi32(out_16_2, k__DCT_CONST_ROUNDING);
+        const __m256i out_16_5 = _mm256_add_epi32(out_16_3, k__DCT_CONST_ROUNDING);
+        const __m256i out_08_4 = _mm256_add_epi32(out_08_2, k__DCT_CONST_ROUNDING);
+        const __m256i out_08_5 = _mm256_add_epi32(out_08_3, k__DCT_CONST_ROUNDING);
+        const __m256i out_24_4 = _mm256_add_epi32(out_24_2, k__DCT_CONST_ROUNDING);
+        const __m256i out_24_5 = _mm256_add_epi32(out_24_3, k__DCT_CONST_ROUNDING);
+        const __m256i out_00_6 = _mm256_srai_epi32(out_00_4, DCT_CONST_BITS);
+        const __m256i out_00_7 = _mm256_srai_epi32(out_00_5, DCT_CONST_BITS);
+        const __m256i out_16_6 = _mm256_srai_epi32(out_16_4, DCT_CONST_BITS);
+        const __m256i out_16_7 = _mm256_srai_epi32(out_16_5, DCT_CONST_BITS);
+        const __m256i out_08_6 = _mm256_srai_epi32(out_08_4, DCT_CONST_BITS);
+        const __m256i out_08_7 = _mm256_srai_epi32(out_08_5, DCT_CONST_BITS);
+        const __m256i out_24_6 = _mm256_srai_epi32(out_24_4, DCT_CONST_BITS);
+        const __m256i out_24_7 = _mm256_srai_epi32(out_24_5, DCT_CONST_BITS);
+        // Combine
+        out[ 0] = _mm256_packs_epi32(out_00_6, out_00_7);
+        out[16] = _mm256_packs_epi32(out_16_6, out_16_7);
+        out[ 8] = _mm256_packs_epi32(out_08_6, out_08_7);
+        out[24] = _mm256_packs_epi32(out_24_6, out_24_7);
+      }
+      {
+        const __m256i s2_09_0 = _mm256_unpacklo_epi16(step1[ 9], step1[14]);
+        const __m256i s2_09_1 = _mm256_unpackhi_epi16(step1[ 9], step1[14]);
+        const __m256i s2_10_0 = _mm256_unpacklo_epi16(step1[10], step1[13]);
+        const __m256i s2_10_1 = _mm256_unpackhi_epi16(step1[10], step1[13]);
+        const __m256i s2_09_2 = _mm256_madd_epi16(s2_09_0, k__cospi_m08_p24);
+        const __m256i s2_09_3 = _mm256_madd_epi16(s2_09_1, k__cospi_m08_p24);
+        const __m256i s2_10_2 = _mm256_madd_epi16(s2_10_0, k__cospi_m24_m08);
+        const __m256i s2_10_3 = _mm256_madd_epi16(s2_10_1, k__cospi_m24_m08);
+        const __m256i s2_13_2 = _mm256_madd_epi16(s2_10_0, k__cospi_m08_p24);
+        const __m256i s2_13_3 = _mm256_madd_epi16(s2_10_1, k__cospi_m08_p24);
+        const __m256i s2_14_2 = _mm256_madd_epi16(s2_09_0, k__cospi_p24_p08);
+        const __m256i s2_14_3 = _mm256_madd_epi16(s2_09_1, k__cospi_p24_p08);
+        // dct_const_round_shift
+        const __m256i s2_09_4 = _mm256_add_epi32(s2_09_2, k__DCT_CONST_ROUNDING);
+        const __m256i s2_09_5 = _mm256_add_epi32(s2_09_3, k__DCT_CONST_ROUNDING);
+        const __m256i s2_10_4 = _mm256_add_epi32(s2_10_2, k__DCT_CONST_ROUNDING);
+        const __m256i s2_10_5 = _mm256_add_epi32(s2_10_3, k__DCT_CONST_ROUNDING);
+        const __m256i s2_13_4 = _mm256_add_epi32(s2_13_2, k__DCT_CONST_ROUNDING);
+        const __m256i s2_13_5 = _mm256_add_epi32(s2_13_3, k__DCT_CONST_ROUNDING);
+        const __m256i s2_14_4 = _mm256_add_epi32(s2_14_2, k__DCT_CONST_ROUNDING);
+        const __m256i s2_14_5 = _mm256_add_epi32(s2_14_3, k__DCT_CONST_ROUNDING);
+        const __m256i s2_09_6 = _mm256_srai_epi32(s2_09_4, DCT_CONST_BITS);
+        const __m256i s2_09_7 = _mm256_srai_epi32(s2_09_5, DCT_CONST_BITS);
+        const __m256i s2_10_6 = _mm256_srai_epi32(s2_10_4, DCT_CONST_BITS);
+        const __m256i s2_10_7 = _mm256_srai_epi32(s2_10_5, DCT_CONST_BITS);
+        const __m256i s2_13_6 = _mm256_srai_epi32(s2_13_4, DCT_CONST_BITS);
+        const __m256i s2_13_7 = _mm256_srai_epi32(s2_13_5, DCT_CONST_BITS);
+        const __m256i s2_14_6 = _mm256_srai_epi32(s2_14_4, DCT_CONST_BITS);
+        const __m256i s2_14_7 = _mm256_srai_epi32(s2_14_5, DCT_CONST_BITS);
+        // Combine
+        step2[ 9] = _mm256_packs_epi32(s2_09_6, s2_09_7);
+        step2[10] = _mm256_packs_epi32(s2_10_6, s2_10_7);
+        step2[13] = _mm256_packs_epi32(s2_13_6, s2_13_7);
+        step2[14] = _mm256_packs_epi32(s2_14_6, s2_14_7);
+      }
+      {
+        step2[16] = _mm256_add_epi16(step1[19], step3[16]);
+        step2[17] = _mm256_add_epi16(step1[18], step3[17]);
+        step2[18] = _mm256_sub_epi16(step3[17], step1[18]);
+        step2[19] = _mm256_sub_epi16(step3[16], step1[19]);
+        step2[20] = _mm256_sub_epi16(step3[23], step1[20]);
+        step2[21] = _mm256_sub_epi16(step3[22], step1[21]);
+        step2[22] = _mm256_add_epi16(step1[21], step3[22]);
+        step2[23] = _mm256_add_epi16(step1[20], step3[23]);
+        step2[24] = _mm256_add_epi16(step1[27], step3[24]);
+        step2[25] = _mm256_add_epi16(step1[26], step3[25]);
+        step2[26] = _mm256_sub_epi16(step3[25], step1[26]);
+        step2[27] = _mm256_sub_epi16(step3[24], step1[27]);
+        step2[28] = _mm256_sub_epi16(step3[31], step1[28]);
+        step2[29] = _mm256_sub_epi16(step3[30], step1[29]);
+        step2[30] = _mm256_add_epi16(step1[29], step3[30]);
+        step2[31] = _mm256_add_epi16(step1[28], step3[31]);
+      }
+      // Stage 6
+      {
+        const __m256i out_04_0 = _mm256_unpacklo_epi16(step2[4], step2[7]);
+        const __m256i out_04_1 = _mm256_unpackhi_epi16(step2[4], step2[7]);
+        const __m256i out_20_0 = _mm256_unpacklo_epi16(step2[5], step2[6]);
+        const __m256i out_20_1 = _mm256_unpackhi_epi16(step2[5], step2[6]);
+        const __m256i out_12_0 = _mm256_unpacklo_epi16(step2[5], step2[6]);
+        const __m256i out_12_1 = _mm256_unpackhi_epi16(step2[5], step2[6]);
+        const __m256i out_28_0 = _mm256_unpacklo_epi16(step2[4], step2[7]);
+        const __m256i out_28_1 = _mm256_unpackhi_epi16(step2[4], step2[7]);
+        const __m256i out_04_2 = _mm256_madd_epi16(out_04_0, k__cospi_p28_p04);
+        const __m256i out_04_3 = _mm256_madd_epi16(out_04_1, k__cospi_p28_p04);
+        const __m256i out_20_2 = _mm256_madd_epi16(out_20_0, k__cospi_p12_p20);
+        const __m256i out_20_3 = _mm256_madd_epi16(out_20_1, k__cospi_p12_p20);
+        const __m256i out_12_2 = _mm256_madd_epi16(out_12_0, k__cospi_m20_p12);
+        const __m256i out_12_3 = _mm256_madd_epi16(out_12_1, k__cospi_m20_p12);
+        const __m256i out_28_2 = _mm256_madd_epi16(out_28_0, k__cospi_m04_p28);
+        const __m256i out_28_3 = _mm256_madd_epi16(out_28_1, k__cospi_m04_p28);
+        // dct_const_round_shift
+        const __m256i out_04_4 = _mm256_add_epi32(out_04_2, k__DCT_CONST_ROUNDING);
+        const __m256i out_04_5 = _mm256_add_epi32(out_04_3, k__DCT_CONST_ROUNDING);
+        const __m256i out_20_4 = _mm256_add_epi32(out_20_2, k__DCT_CONST_ROUNDING);
+        const __m256i out_20_5 = _mm256_add_epi32(out_20_3, k__DCT_CONST_ROUNDING);
+        const __m256i out_12_4 = _mm256_add_epi32(out_12_2, k__DCT_CONST_ROUNDING);
+        const __m256i out_12_5 = _mm256_add_epi32(out_12_3, k__DCT_CONST_ROUNDING);
+        const __m256i out_28_4 = _mm256_add_epi32(out_28_2, k__DCT_CONST_ROUNDING);
+        const __m256i out_28_5 = _mm256_add_epi32(out_28_3, k__DCT_CONST_ROUNDING);
+        const __m256i out_04_6 = _mm256_srai_epi32(out_04_4, DCT_CONST_BITS);
+        const __m256i out_04_7 = _mm256_srai_epi32(out_04_5, DCT_CONST_BITS);
+        const __m256i out_20_6 = _mm256_srai_epi32(out_20_4, DCT_CONST_BITS);
+        const __m256i out_20_7 = _mm256_srai_epi32(out_20_5, DCT_CONST_BITS);
+        const __m256i out_12_6 = _mm256_srai_epi32(out_12_4, DCT_CONST_BITS);
+        const __m256i out_12_7 = _mm256_srai_epi32(out_12_5, DCT_CONST_BITS);
+        const __m256i out_28_6 = _mm256_srai_epi32(out_28_4, DCT_CONST_BITS);
+        const __m256i out_28_7 = _mm256_srai_epi32(out_28_5, DCT_CONST_BITS);
+        // Combine
+        out[ 4] = _mm256_packs_epi32(out_04_6, out_04_7);
+        out[20] = _mm256_packs_epi32(out_20_6, out_20_7);
+        out[12] = _mm256_packs_epi32(out_12_6, out_12_7);
+        out[28] = _mm256_packs_epi32(out_28_6, out_28_7);
+      }
+      {
+        step3[ 8] = _mm256_add_epi16(step2[ 9], step1[ 8]);
+        step3[ 9] = _mm256_sub_epi16(step1[ 8], step2[ 9]);
+        step3[10] = _mm256_sub_epi16(step1[11], step2[10]);
+        step3[11] = _mm256_add_epi16(step2[10], step1[11]);
+        step3[12] = _mm256_add_epi16(step2[13], step1[12]);
+        step3[13] = _mm256_sub_epi16(step1[12], step2[13]);
+        step3[14] = _mm256_sub_epi16(step1[15], step2[14]);
+        step3[15] = _mm256_add_epi16(step2[14], step1[15]);
+      }
+      {
+        const __m256i s3_17_0 = _mm256_unpacklo_epi16(step2[17], step2[30]);
+        const __m256i s3_17_1 = _mm256_unpackhi_epi16(step2[17], step2[30]);
+        const __m256i s3_18_0 = _mm256_unpacklo_epi16(step2[18], step2[29]);
+        const __m256i s3_18_1 = _mm256_unpackhi_epi16(step2[18], step2[29]);
+        const __m256i s3_21_0 = _mm256_unpacklo_epi16(step2[21], step2[26]);
+        const __m256i s3_21_1 = _mm256_unpackhi_epi16(step2[21], step2[26]);
+        const __m256i s3_22_0 = _mm256_unpacklo_epi16(step2[22], step2[25]);
+        const __m256i s3_22_1 = _mm256_unpackhi_epi16(step2[22], step2[25]);
+        const __m256i s3_17_2 = _mm256_madd_epi16(s3_17_0, k__cospi_m04_p28);
+        const __m256i s3_17_3 = _mm256_madd_epi16(s3_17_1, k__cospi_m04_p28);
+        const __m256i s3_18_2 = _mm256_madd_epi16(s3_18_0, k__cospi_m28_m04);
+        const __m256i s3_18_3 = _mm256_madd_epi16(s3_18_1, k__cospi_m28_m04);
+        const __m256i s3_21_2 = _mm256_madd_epi16(s3_21_0, k__cospi_m20_p12);
+        const __m256i s3_21_3 = _mm256_madd_epi16(s3_21_1, k__cospi_m20_p12);
+        const __m256i s3_22_2 = _mm256_madd_epi16(s3_22_0, k__cospi_m12_m20);
+        const __m256i s3_22_3 = _mm256_madd_epi16(s3_22_1, k__cospi_m12_m20);
+        const __m256i s3_25_2 = _mm256_madd_epi16(s3_22_0, k__cospi_m20_p12);
+        const __m256i s3_25_3 = _mm256_madd_epi16(s3_22_1, k__cospi_m20_p12);
+        const __m256i s3_26_2 = _mm256_madd_epi16(s3_21_0, k__cospi_p12_p20);
+        const __m256i s3_26_3 = _mm256_madd_epi16(s3_21_1, k__cospi_p12_p20);
+        const __m256i s3_29_2 = _mm256_madd_epi16(s3_18_0, k__cospi_m04_p28);
+        const __m256i s3_29_3 = _mm256_madd_epi16(s3_18_1, k__cospi_m04_p28);
+        const __m256i s3_30_2 = _mm256_madd_epi16(s3_17_0, k__cospi_p28_p04);
+        const __m256i s3_30_3 = _mm256_madd_epi16(s3_17_1, k__cospi_p28_p04);
+        // dct_const_round_shift
+        const __m256i s3_17_4 = _mm256_add_epi32(s3_17_2, k__DCT_CONST_ROUNDING);
+        const __m256i s3_17_5 = _mm256_add_epi32(s3_17_3, k__DCT_CONST_ROUNDING);
+        const __m256i s3_18_4 = _mm256_add_epi32(s3_18_2, k__DCT_CONST_ROUNDING);
+        const __m256i s3_18_5 = _mm256_add_epi32(s3_18_3, k__DCT_CONST_ROUNDING);
+        const __m256i s3_21_4 = _mm256_add_epi32(s3_21_2, k__DCT_CONST_ROUNDING);
+        const __m256i s3_21_5 = _mm256_add_epi32(s3_21_3, k__DCT_CONST_ROUNDING);
+        const __m256i s3_22_4 = _mm256_add_epi32(s3_22_2, k__DCT_CONST_ROUNDING);
+        const __m256i s3_22_5 = _mm256_add_epi32(s3_22_3, k__DCT_CONST_ROUNDING);
+        const __m256i s3_17_6 = _mm256_srai_epi32(s3_17_4, DCT_CONST_BITS);
+        const __m256i s3_17_7 = _mm256_srai_epi32(s3_17_5, DCT_CONST_BITS);
+        const __m256i s3_18_6 = _mm256_srai_epi32(s3_18_4, DCT_CONST_BITS);
+        const __m256i s3_18_7 = _mm256_srai_epi32(s3_18_5, DCT_CONST_BITS);
+        const __m256i s3_21_6 = _mm256_srai_epi32(s3_21_4, DCT_CONST_BITS);
+        const __m256i s3_21_7 = _mm256_srai_epi32(s3_21_5, DCT_CONST_BITS);
+        const __m256i s3_22_6 = _mm256_srai_epi32(s3_22_4, DCT_CONST_BITS);
+        const __m256i s3_22_7 = _mm256_srai_epi32(s3_22_5, DCT_CONST_BITS);
+        const __m256i s3_25_4 = _mm256_add_epi32(s3_25_2, k__DCT_CONST_ROUNDING);
+        const __m256i s3_25_5 = _mm256_add_epi32(s3_25_3, k__DCT_CONST_ROUNDING);
+        const __m256i s3_26_4 = _mm256_add_epi32(s3_26_2, k__DCT_CONST_ROUNDING);
+        const __m256i s3_26_5 = _mm256_add_epi32(s3_26_3, k__DCT_CONST_ROUNDING);
+        const __m256i s3_29_4 = _mm256_add_epi32(s3_29_2, k__DCT_CONST_ROUNDING);
+        const __m256i s3_29_5 = _mm256_add_epi32(s3_29_3, k__DCT_CONST_ROUNDING);
+        const __m256i s3_30_4 = _mm256_add_epi32(s3_30_2, k__DCT_CONST_ROUNDING);
+        const __m256i s3_30_5 = _mm256_add_epi32(s3_30_3, k__DCT_CONST_ROUNDING);
+        const __m256i s3_25_6 = _mm256_srai_epi32(s3_25_4, DCT_CONST_BITS);
+        const __m256i s3_25_7 = _mm256_srai_epi32(s3_25_5, DCT_CONST_BITS);
+        const __m256i s3_26_6 = _mm256_srai_epi32(s3_26_4, DCT_CONST_BITS);
+        const __m256i s3_26_7 = _mm256_srai_epi32(s3_26_5, DCT_CONST_BITS);
+        const __m256i s3_29_6 = _mm256_srai_epi32(s3_29_4, DCT_CONST_BITS);
+        const __m256i s3_29_7 = _mm256_srai_epi32(s3_29_5, DCT_CONST_BITS);
+        const __m256i s3_30_6 = _mm256_srai_epi32(s3_30_4, DCT_CONST_BITS);
+        const __m256i s3_30_7 = _mm256_srai_epi32(s3_30_5, DCT_CONST_BITS);
+        // Combine
+        step3[17] = _mm256_packs_epi32(s3_17_6, s3_17_7);
+        step3[18] = _mm256_packs_epi32(s3_18_6, s3_18_7);
+        step3[21] = _mm256_packs_epi32(s3_21_6, s3_21_7);
+        step3[22] = _mm256_packs_epi32(s3_22_6, s3_22_7);
+        // Combine
+        step3[25] = _mm256_packs_epi32(s3_25_6, s3_25_7);
+        step3[26] = _mm256_packs_epi32(s3_26_6, s3_26_7);
+        step3[29] = _mm256_packs_epi32(s3_29_6, s3_29_7);
+        step3[30] = _mm256_packs_epi32(s3_30_6, s3_30_7);
+      }
+      // Stage 7
+      {
+        const __m256i out_02_0 = _mm256_unpacklo_epi16(step3[ 8], step3[15]);
+        const __m256i out_02_1 = _mm256_unpackhi_epi16(step3[ 8], step3[15]);
+        const __m256i out_18_0 = _mm256_unpacklo_epi16(step3[ 9], step3[14]);
+        const __m256i out_18_1 = _mm256_unpackhi_epi16(step3[ 9], step3[14]);
+        const __m256i out_10_0 = _mm256_unpacklo_epi16(step3[10], step3[13]);
+        const __m256i out_10_1 = _mm256_unpackhi_epi16(step3[10], step3[13]);
+        const __m256i out_26_0 = _mm256_unpacklo_epi16(step3[11], step3[12]);
+        const __m256i out_26_1 = _mm256_unpackhi_epi16(step3[11], step3[12]);
+        const __m256i out_02_2 = _mm256_madd_epi16(out_02_0, k__cospi_p30_p02);
+        const __m256i out_02_3 = _mm256_madd_epi16(out_02_1, k__cospi_p30_p02);
+        const __m256i out_18_2 = _mm256_madd_epi16(out_18_0, k__cospi_p14_p18);
+        const __m256i out_18_3 = _mm256_madd_epi16(out_18_1, k__cospi_p14_p18);
+        const __m256i out_10_2 = _mm256_madd_epi16(out_10_0, k__cospi_p22_p10);
+        const __m256i out_10_3 = _mm256_madd_epi16(out_10_1, k__cospi_p22_p10);
+        const __m256i out_26_2 = _mm256_madd_epi16(out_26_0, k__cospi_p06_p26);
+        const __m256i out_26_3 = _mm256_madd_epi16(out_26_1, k__cospi_p06_p26);
+        const __m256i out_06_2 = _mm256_madd_epi16(out_26_0, k__cospi_m26_p06);
+        const __m256i out_06_3 = _mm256_madd_epi16(out_26_1, k__cospi_m26_p06);
+        const __m256i out_22_2 = _mm256_madd_epi16(out_10_0, k__cospi_m10_p22);
+        const __m256i out_22_3 = _mm256_madd_epi16(out_10_1, k__cospi_m10_p22);
+        const __m256i out_14_2 = _mm256_madd_epi16(out_18_0, k__cospi_m18_p14);
+        const __m256i out_14_3 = _mm256_madd_epi16(out_18_1, k__cospi_m18_p14);
+        const __m256i out_30_2 = _mm256_madd_epi16(out_02_0, k__cospi_m02_p30);
+        const __m256i out_30_3 = _mm256_madd_epi16(out_02_1, k__cospi_m02_p30);
+        // dct_const_round_shift
+        const __m256i out_02_4 = _mm256_add_epi32(out_02_2, k__DCT_CONST_ROUNDING);
+        const __m256i out_02_5 = _mm256_add_epi32(out_02_3, k__DCT_CONST_ROUNDING);
+        const __m256i out_18_4 = _mm256_add_epi32(out_18_2, k__DCT_CONST_ROUNDING);
+        const __m256i out_18_5 = _mm256_add_epi32(out_18_3, k__DCT_CONST_ROUNDING);
+        const __m256i out_10_4 = _mm256_add_epi32(out_10_2, k__DCT_CONST_ROUNDING);
+        const __m256i out_10_5 = _mm256_add_epi32(out_10_3, k__DCT_CONST_ROUNDING);
+        const __m256i out_26_4 = _mm256_add_epi32(out_26_2, k__DCT_CONST_ROUNDING);
+        const __m256i out_26_5 = _mm256_add_epi32(out_26_3, k__DCT_CONST_ROUNDING);
+        const __m256i out_06_4 = _mm256_add_epi32(out_06_2, k__DCT_CONST_ROUNDING);
+        const __m256i out_06_5 = _mm256_add_epi32(out_06_3, k__DCT_CONST_ROUNDING);
+        const __m256i out_22_4 = _mm256_add_epi32(out_22_2, k__DCT_CONST_ROUNDING);
+        const __m256i out_22_5 = _mm256_add_epi32(out_22_3, k__DCT_CONST_ROUNDING);
+        const __m256i out_14_4 = _mm256_add_epi32(out_14_2, k__DCT_CONST_ROUNDING);
+        const __m256i out_14_5 = _mm256_add_epi32(out_14_3, k__DCT_CONST_ROUNDING);
+        const __m256i out_30_4 = _mm256_add_epi32(out_30_2, k__DCT_CONST_ROUNDING);
+        const __m256i out_30_5 = _mm256_add_epi32(out_30_3, k__DCT_CONST_ROUNDING);
+        const __m256i out_02_6 = _mm256_srai_epi32(out_02_4, DCT_CONST_BITS);
+        const __m256i out_02_7 = _mm256_srai_epi32(out_02_5, DCT_CONST_BITS);
+        const __m256i out_18_6 = _mm256_srai_epi32(out_18_4, DCT_CONST_BITS);
+        const __m256i out_18_7 = _mm256_srai_epi32(out_18_5, DCT_CONST_BITS);
+        const __m256i out_10_6 = _mm256_srai_epi32(out_10_4, DCT_CONST_BITS);
+        const __m256i out_10_7 = _mm256_srai_epi32(out_10_5, DCT_CONST_BITS);
+        const __m256i out_26_6 = _mm256_srai_epi32(out_26_4, DCT_CONST_BITS);
+        const __m256i out_26_7 = _mm256_srai_epi32(out_26_5, DCT_CONST_BITS);
+        const __m256i out_06_6 = _mm256_srai_epi32(out_06_4, DCT_CONST_BITS);
+        const __m256i out_06_7 = _mm256_srai_epi32(out_06_5, DCT_CONST_BITS);
+        const __m256i out_22_6 = _mm256_srai_epi32(out_22_4, DCT_CONST_BITS);
+        const __m256i out_22_7 = _mm256_srai_epi32(out_22_5, DCT_CONST_BITS);
+        const __m256i out_14_6 = _mm256_srai_epi32(out_14_4, DCT_CONST_BITS);
+        const __m256i out_14_7 = _mm256_srai_epi32(out_14_5, DCT_CONST_BITS);
+        const __m256i out_30_6 = _mm256_srai_epi32(out_30_4, DCT_CONST_BITS);
+        const __m256i out_30_7 = _mm256_srai_epi32(out_30_5, DCT_CONST_BITS);
+        // Combine
+        out[ 2] = _mm256_packs_epi32(out_02_6, out_02_7);
+        out[18] = _mm256_packs_epi32(out_18_6, out_18_7);
+        out[10] = _mm256_packs_epi32(out_10_6, out_10_7);
+        out[26] = _mm256_packs_epi32(out_26_6, out_26_7);
+        out[ 6] = _mm256_packs_epi32(out_06_6, out_06_7);
+        out[22] = _mm256_packs_epi32(out_22_6, out_22_7);
+        out[14] = _mm256_packs_epi32(out_14_6, out_14_7);
+        out[30] = _mm256_packs_epi32(out_30_6, out_30_7);
+      }
+      {
+        step1[16] = _mm256_add_epi16(step3[17], step2[16]);
+        step1[17] = _mm256_sub_epi16(step2[16], step3[17]);
+        step1[18] = _mm256_sub_epi16(step2[19], step3[18]);
+        step1[19] = _mm256_add_epi16(step3[18], step2[19]);
+        step1[20] = _mm256_add_epi16(step3[21], step2[20]);
+        step1[21] = _mm256_sub_epi16(step2[20], step3[21]);
+        step1[22] = _mm256_sub_epi16(step2[23], step3[22]);
+        step1[23] = _mm256_add_epi16(step3[22], step2[23]);
+        step1[24] = _mm256_add_epi16(step3[25], step2[24]);
+        step1[25] = _mm256_sub_epi16(step2[24], step3[25]);
+        step1[26] = _mm256_sub_epi16(step2[27], step3[26]);
+        step1[27] = _mm256_add_epi16(step3[26], step2[27]);
+        step1[28] = _mm256_add_epi16(step3[29], step2[28]);
+        step1[29] = _mm256_sub_epi16(step2[28], step3[29]);
+        step1[30] = _mm256_sub_epi16(step2[31], step3[30]);
+        step1[31] = _mm256_add_epi16(step3[30], step2[31]);
+      }
+      // Final stage --- outputs indices are bit-reversed.
+      {
+        const __m256i out_01_0 = _mm256_unpacklo_epi16(step1[16], step1[31]);
+        const __m256i out_01_1 = _mm256_unpackhi_epi16(step1[16], step1[31]);
+        const __m256i out_17_0 = _mm256_unpacklo_epi16(step1[17], step1[30]);
+        const __m256i out_17_1 = _mm256_unpackhi_epi16(step1[17], step1[30]);
+        const __m256i out_09_0 = _mm256_unpacklo_epi16(step1[18], step1[29]);
+        const __m256i out_09_1 = _mm256_unpackhi_epi16(step1[18], step1[29]);
+        const __m256i out_25_0 = _mm256_unpacklo_epi16(step1[19], step1[28]);
+        const __m256i out_25_1 = _mm256_unpackhi_epi16(step1[19], step1[28]);
+        const __m256i out_01_2 = _mm256_madd_epi16(out_01_0, k__cospi_p31_p01);
+        const __m256i out_01_3 = _mm256_madd_epi16(out_01_1, k__cospi_p31_p01);
+        const __m256i out_17_2 = _mm256_madd_epi16(out_17_0, k__cospi_p15_p17);
+        const __m256i out_17_3 = _mm256_madd_epi16(out_17_1, k__cospi_p15_p17);
+        const __m256i out_09_2 = _mm256_madd_epi16(out_09_0, k__cospi_p23_p09);
+        const __m256i out_09_3 = _mm256_madd_epi16(out_09_1, k__cospi_p23_p09);
+        const __m256i out_25_2 = _mm256_madd_epi16(out_25_0, k__cospi_p07_p25);
+        const __m256i out_25_3 = _mm256_madd_epi16(out_25_1, k__cospi_p07_p25);
+        const __m256i out_07_2 = _mm256_madd_epi16(out_25_0, k__cospi_m25_p07);
+        const __m256i out_07_3 = _mm256_madd_epi16(out_25_1, k__cospi_m25_p07);
+        const __m256i out_23_2 = _mm256_madd_epi16(out_09_0, k__cospi_m09_p23);
+        const __m256i out_23_3 = _mm256_madd_epi16(out_09_1, k__cospi_m09_p23);
+        const __m256i out_15_2 = _mm256_madd_epi16(out_17_0, k__cospi_m17_p15);
+        const __m256i out_15_3 = _mm256_madd_epi16(out_17_1, k__cospi_m17_p15);
+        const __m256i out_31_2 = _mm256_madd_epi16(out_01_0, k__cospi_m01_p31);
+        const __m256i out_31_3 = _mm256_madd_epi16(out_01_1, k__cospi_m01_p31);
+        // dct_const_round_shift
+        const __m256i out_01_4 = _mm256_add_epi32(out_01_2, k__DCT_CONST_ROUNDING);
+        const __m256i out_01_5 = _mm256_add_epi32(out_01_3, k__DCT_CONST_ROUNDING);
+        const __m256i out_17_4 = _mm256_add_epi32(out_17_2, k__DCT_CONST_ROUNDING);
+        const __m256i out_17_5 = _mm256_add_epi32(out_17_3, k__DCT_CONST_ROUNDING);
+        const __m256i out_09_4 = _mm256_add_epi32(out_09_2, k__DCT_CONST_ROUNDING);
+        const __m256i out_09_5 = _mm256_add_epi32(out_09_3, k__DCT_CONST_ROUNDING);
+        const __m256i out_25_4 = _mm256_add_epi32(out_25_2, k__DCT_CONST_ROUNDING);
+        const __m256i out_25_5 = _mm256_add_epi32(out_25_3, k__DCT_CONST_ROUNDING);
+        const __m256i out_07_4 = _mm256_add_epi32(out_07_2, k__DCT_CONST_ROUNDING);
+        const __m256i out_07_5 = _mm256_add_epi32(out_07_3, k__DCT_CONST_ROUNDING);
+        const __m256i out_23_4 = _mm256_add_epi32(out_23_2, k__DCT_CONST_ROUNDING);
+        const __m256i out_23_5 = _mm256_add_epi32(out_23_3, k__DCT_CONST_ROUNDING);
+        const __m256i out_15_4 = _mm256_add_epi32(out_15_2, k__DCT_CONST_ROUNDING);
+        const __m256i out_15_5 = _mm256_add_epi32(out_15_3, k__DCT_CONST_ROUNDING);
+        const __m256i out_31_4 = _mm256_add_epi32(out_31_2, k__DCT_CONST_ROUNDING);
+        const __m256i out_31_5 = _mm256_add_epi32(out_31_3, k__DCT_CONST_ROUNDING);
+        const __m256i out_01_6 = _mm256_srai_epi32(out_01_4, DCT_CONST_BITS);
+        const __m256i out_01_7 = _mm256_srai_epi32(out_01_5, DCT_CONST_BITS);
+        const __m256i out_17_6 = _mm256_srai_epi32(out_17_4, DCT_CONST_BITS);
+        const __m256i out_17_7 = _mm256_srai_epi32(out_17_5, DCT_CONST_BITS);
+        const __m256i out_09_6 = _mm256_srai_epi32(out_09_4, DCT_CONST_BITS);
+        const __m256i out_09_7 = _mm256_srai_epi32(out_09_5, DCT_CONST_BITS);
+        const __m256i out_25_6 = _mm256_srai_epi32(out_25_4, DCT_CONST_BITS);
+        const __m256i out_25_7 = _mm256_srai_epi32(out_25_5, DCT_CONST_BITS);
+        const __m256i out_07_6 = _mm256_srai_epi32(out_07_4, DCT_CONST_BITS);
+        const __m256i out_07_7 = _mm256_srai_epi32(out_07_5, DCT_CONST_BITS);
+        const __m256i out_23_6 = _mm256_srai_epi32(out_23_4, DCT_CONST_BITS);
+        const __m256i out_23_7 = _mm256_srai_epi32(out_23_5, DCT_CONST_BITS);
+        const __m256i out_15_6 = _mm256_srai_epi32(out_15_4, DCT_CONST_BITS);
+        const __m256i out_15_7 = _mm256_srai_epi32(out_15_5, DCT_CONST_BITS);
+        const __m256i out_31_6 = _mm256_srai_epi32(out_31_4, DCT_CONST_BITS);
+        const __m256i out_31_7 = _mm256_srai_epi32(out_31_5, DCT_CONST_BITS);
+        // Combine
+        out[ 1] = _mm256_packs_epi32(out_01_6, out_01_7);
+        out[17] = _mm256_packs_epi32(out_17_6, out_17_7);
+        out[ 9] = _mm256_packs_epi32(out_09_6, out_09_7);
+        out[25] = _mm256_packs_epi32(out_25_6, out_25_7);
+        out[ 7] = _mm256_packs_epi32(out_07_6, out_07_7);
+        out[23] = _mm256_packs_epi32(out_23_6, out_23_7);
+        out[15] = _mm256_packs_epi32(out_15_6, out_15_7);
+        out[31] = _mm256_packs_epi32(out_31_6, out_31_7);
+      }
+      {
+        const __m256i out_05_0 = _mm256_unpacklo_epi16(step1[20], step1[27]);
+        const __m256i out_05_1 = _mm256_unpackhi_epi16(step1[20], step1[27]);
+        const __m256i out_21_0 = _mm256_unpacklo_epi16(step1[21], step1[26]);
+        const __m256i out_21_1 = _mm256_unpackhi_epi16(step1[21], step1[26]);
+        const __m256i out_13_0 = _mm256_unpacklo_epi16(step1[22], step1[25]);
+        const __m256i out_13_1 = _mm256_unpackhi_epi16(step1[22], step1[25]);
+        const __m256i out_29_0 = _mm256_unpacklo_epi16(step1[23], step1[24]);
+        const __m256i out_29_1 = _mm256_unpackhi_epi16(step1[23], step1[24]);
+        const __m256i out_05_2 = _mm256_madd_epi16(out_05_0, k__cospi_p27_p05);
+        const __m256i out_05_3 = _mm256_madd_epi16(out_05_1, k__cospi_p27_p05);
+        const __m256i out_21_2 = _mm256_madd_epi16(out_21_0, k__cospi_p11_p21);
+        const __m256i out_21_3 = _mm256_madd_epi16(out_21_1, k__cospi_p11_p21);
+        const __m256i out_13_2 = _mm256_madd_epi16(out_13_0, k__cospi_p19_p13);
+        const __m256i out_13_3 = _mm256_madd_epi16(out_13_1, k__cospi_p19_p13);
+        const __m256i out_29_2 = _mm256_madd_epi16(out_29_0, k__cospi_p03_p29);
+        const __m256i out_29_3 = _mm256_madd_epi16(out_29_1, k__cospi_p03_p29);
+        const __m256i out_03_2 = _mm256_madd_epi16(out_29_0, k__cospi_m29_p03);
+        const __m256i out_03_3 = _mm256_madd_epi16(out_29_1, k__cospi_m29_p03);
+        const __m256i out_19_2 = _mm256_madd_epi16(out_13_0, k__cospi_m13_p19);
+        const __m256i out_19_3 = _mm256_madd_epi16(out_13_1, k__cospi_m13_p19);
+        const __m256i out_11_2 = _mm256_madd_epi16(out_21_0, k__cospi_m21_p11);
+        const __m256i out_11_3 = _mm256_madd_epi16(out_21_1, k__cospi_m21_p11);
+        const __m256i out_27_2 = _mm256_madd_epi16(out_05_0, k__cospi_m05_p27);
+        const __m256i out_27_3 = _mm256_madd_epi16(out_05_1, k__cospi_m05_p27);
+        // dct_const_round_shift
+        const __m256i out_05_4 = _mm256_add_epi32(out_05_2, k__DCT_CONST_ROUNDING);
+        const __m256i out_05_5 = _mm256_add_epi32(out_05_3, k__DCT_CONST_ROUNDING);
+        const __m256i out_21_4 = _mm256_add_epi32(out_21_2, k__DCT_CONST_ROUNDING);
+        const __m256i out_21_5 = _mm256_add_epi32(out_21_3, k__DCT_CONST_ROUNDING);
+        const __m256i out_13_4 = _mm256_add_epi32(out_13_2, k__DCT_CONST_ROUNDING);
+        const __m256i out_13_5 = _mm256_add_epi32(out_13_3, k__DCT_CONST_ROUNDING);
+        const __m256i out_29_4 = _mm256_add_epi32(out_29_2, k__DCT_CONST_ROUNDING);
+        const __m256i out_29_5 = _mm256_add_epi32(out_29_3, k__DCT_CONST_ROUNDING);
+        const __m256i out_03_4 = _mm256_add_epi32(out_03_2, k__DCT_CONST_ROUNDING);
+        const __m256i out_03_5 = _mm256_add_epi32(out_03_3, k__DCT_CONST_ROUNDING);
+        const __m256i out_19_4 = _mm256_add_epi32(out_19_2, k__DCT_CONST_ROUNDING);
+        const __m256i out_19_5 = _mm256_add_epi32(out_19_3, k__DCT_CONST_ROUNDING);
+        const __m256i out_11_4 = _mm256_add_epi32(out_11_2, k__DCT_CONST_ROUNDING);
+        const __m256i out_11_5 = _mm256_add_epi32(out_11_3, k__DCT_CONST_ROUNDING);
+        const __m256i out_27_4 = _mm256_add_epi32(out_27_2, k__DCT_CONST_ROUNDING);
+        const __m256i out_27_5 = _mm256_add_epi32(out_27_3, k__DCT_CONST_ROUNDING);
+        const __m256i out_05_6 = _mm256_srai_epi32(out_05_4, DCT_CONST_BITS);
+        const __m256i out_05_7 = _mm256_srai_epi32(out_05_5, DCT_CONST_BITS);
+        const __m256i out_21_6 = _mm256_srai_epi32(out_21_4, DCT_CONST_BITS);
+        const __m256i out_21_7 = _mm256_srai_epi32(out_21_5, DCT_CONST_BITS);
+        const __m256i out_13_6 = _mm256_srai_epi32(out_13_4, DCT_CONST_BITS);
+        const __m256i out_13_7 = _mm256_srai_epi32(out_13_5, DCT_CONST_BITS);
+        const __m256i out_29_6 = _mm256_srai_epi32(out_29_4, DCT_CONST_BITS);
+        const __m256i out_29_7 = _mm256_srai_epi32(out_29_5, DCT_CONST_BITS);
+        const __m256i out_03_6 = _mm256_srai_epi32(out_03_4, DCT_CONST_BITS);
+        const __m256i out_03_7 = _mm256_srai_epi32(out_03_5, DCT_CONST_BITS);
+        const __m256i out_19_6 = _mm256_srai_epi32(out_19_4, DCT_CONST_BITS);
+        const __m256i out_19_7 = _mm256_srai_epi32(out_19_5, DCT_CONST_BITS);
+        const __m256i out_11_6 = _mm256_srai_epi32(out_11_4, DCT_CONST_BITS);
+        const __m256i out_11_7 = _mm256_srai_epi32(out_11_5, DCT_CONST_BITS);
+        const __m256i out_27_6 = _mm256_srai_epi32(out_27_4, DCT_CONST_BITS);
+        const __m256i out_27_7 = _mm256_srai_epi32(out_27_5, DCT_CONST_BITS);
+        // Combine
+        out[ 5] = _mm256_packs_epi32(out_05_6, out_05_7);
+        out[21] = _mm256_packs_epi32(out_21_6, out_21_7);
+        out[13] = _mm256_packs_epi32(out_13_6, out_13_7);
+        out[29] = _mm256_packs_epi32(out_29_6, out_29_7);
+        out[ 3] = _mm256_packs_epi32(out_03_6, out_03_7);
+        out[19] = _mm256_packs_epi32(out_19_6, out_19_7);
+        out[11] = _mm256_packs_epi32(out_11_6, out_11_7);
+        out[27] = _mm256_packs_epi32(out_27_6, out_27_7);
+      }
+#if FDCT32x32_HIGH_PRECISION
+      } else {
+        __m256i lstep1[64], lstep2[64], lstep3[64];
+        __m256i u[32], v[32], sign[16];
+        const __m256i K32One = _mm256_set_epi32(1, 1, 1, 1, 1, 1, 1, 1);
+        // start using 32-bit operations
+        // stage 3
+        {
+          // expanding to 32-bit length priori to addition operations
+          lstep2[ 0] = _mm256_unpacklo_epi16(step2[ 0], kZero);
+          lstep2[ 1] = _mm256_unpackhi_epi16(step2[ 0], kZero);
+          lstep2[ 2] = _mm256_unpacklo_epi16(step2[ 1], kZero);
+          lstep2[ 3] = _mm256_unpackhi_epi16(step2[ 1], kZero);
+          lstep2[ 4] = _mm256_unpacklo_epi16(step2[ 2], kZero);
+          lstep2[ 5] = _mm256_unpackhi_epi16(step2[ 2], kZero);
+          lstep2[ 6] = _mm256_unpacklo_epi16(step2[ 3], kZero);
+          lstep2[ 7] = _mm256_unpackhi_epi16(step2[ 3], kZero);
+          lstep2[ 8] = _mm256_unpacklo_epi16(step2[ 4], kZero);
+          lstep2[ 9] = _mm256_unpackhi_epi16(step2[ 4], kZero);
+          lstep2[10] = _mm256_unpacklo_epi16(step2[ 5], kZero);
+          lstep2[11] = _mm256_unpackhi_epi16(step2[ 5], kZero);
+          lstep2[12] = _mm256_unpacklo_epi16(step2[ 6], kZero);
+          lstep2[13] = _mm256_unpackhi_epi16(step2[ 6], kZero);
+          lstep2[14] = _mm256_unpacklo_epi16(step2[ 7], kZero);
+          lstep2[15] = _mm256_unpackhi_epi16(step2[ 7], kZero);
+          lstep2[ 0] = _mm256_madd_epi16(lstep2[ 0], kOne);
+          lstep2[ 1] = _mm256_madd_epi16(lstep2[ 1], kOne);
+          lstep2[ 2] = _mm256_madd_epi16(lstep2[ 2], kOne);
+          lstep2[ 3] = _mm256_madd_epi16(lstep2[ 3], kOne);
+          lstep2[ 4] = _mm256_madd_epi16(lstep2[ 4], kOne);
+          lstep2[ 5] = _mm256_madd_epi16(lstep2[ 5], kOne);
+          lstep2[ 6] = _mm256_madd_epi16(lstep2[ 6], kOne);
+          lstep2[ 7] = _mm256_madd_epi16(lstep2[ 7], kOne);
+          lstep2[ 8] = _mm256_madd_epi16(lstep2[ 8], kOne);
+          lstep2[ 9] = _mm256_madd_epi16(lstep2[ 9], kOne);
+          lstep2[10] = _mm256_madd_epi16(lstep2[10], kOne);
+          lstep2[11] = _mm256_madd_epi16(lstep2[11], kOne);
+          lstep2[12] = _mm256_madd_epi16(lstep2[12], kOne);
+          lstep2[13] = _mm256_madd_epi16(lstep2[13], kOne);
+          lstep2[14] = _mm256_madd_epi16(lstep2[14], kOne);
+          lstep2[15] = _mm256_madd_epi16(lstep2[15], kOne);
+
+          lstep3[ 0] = _mm256_add_epi32(lstep2[14], lstep2[ 0]);
+          lstep3[ 1] = _mm256_add_epi32(lstep2[15], lstep2[ 1]);
+          lstep3[ 2] = _mm256_add_epi32(lstep2[12], lstep2[ 2]);
+          lstep3[ 3] = _mm256_add_epi32(lstep2[13], lstep2[ 3]);
+          lstep3[ 4] = _mm256_add_epi32(lstep2[10], lstep2[ 4]);
+          lstep3[ 5] = _mm256_add_epi32(lstep2[11], lstep2[ 5]);
+          lstep3[ 6] = _mm256_add_epi32(lstep2[ 8], lstep2[ 6]);
+          lstep3[ 7] = _mm256_add_epi32(lstep2[ 9], lstep2[ 7]);
+          lstep3[ 8] = _mm256_sub_epi32(lstep2[ 6], lstep2[ 8]);
+          lstep3[ 9] = _mm256_sub_epi32(lstep2[ 7], lstep2[ 9]);
+          lstep3[10] = _mm256_sub_epi32(lstep2[ 4], lstep2[10]);
+          lstep3[11] = _mm256_sub_epi32(lstep2[ 5], lstep2[11]);
+          lstep3[12] = _mm256_sub_epi32(lstep2[ 2], lstep2[12]);
+          lstep3[13] = _mm256_sub_epi32(lstep2[ 3], lstep2[13]);
+          lstep3[14] = _mm256_sub_epi32(lstep2[ 0], lstep2[14]);
+          lstep3[15] = _mm256_sub_epi32(lstep2[ 1], lstep2[15]);
+        }
+        {
+          const __m256i s3_10_0 = _mm256_unpacklo_epi16(step2[13], step2[10]);
+          const __m256i s3_10_1 = _mm256_unpackhi_epi16(step2[13], step2[10]);
+          const __m256i s3_11_0 = _mm256_unpacklo_epi16(step2[12], step2[11]);
+          const __m256i s3_11_1 = _mm256_unpackhi_epi16(step2[12], step2[11]);
+          const __m256i s3_10_2 = _mm256_madd_epi16(s3_10_0, k__cospi_p16_m16);
+          const __m256i s3_10_3 = _mm256_madd_epi16(s3_10_1, k__cospi_p16_m16);
+          const __m256i s3_11_2 = _mm256_madd_epi16(s3_11_0, k__cospi_p16_m16);
+          const __m256i s3_11_3 = _mm256_madd_epi16(s3_11_1, k__cospi_p16_m16);
+          const __m256i s3_12_2 = _mm256_madd_epi16(s3_11_0, k__cospi_p16_p16);
+          const __m256i s3_12_3 = _mm256_madd_epi16(s3_11_1, k__cospi_p16_p16);
+          const __m256i s3_13_2 = _mm256_madd_epi16(s3_10_0, k__cospi_p16_p16);
+          const __m256i s3_13_3 = _mm256_madd_epi16(s3_10_1, k__cospi_p16_p16);
+          // dct_const_round_shift
+          const __m256i s3_10_4 = _mm256_add_epi32(s3_10_2, k__DCT_CONST_ROUNDING);
+          const __m256i s3_10_5 = _mm256_add_epi32(s3_10_3, k__DCT_CONST_ROUNDING);
+          const __m256i s3_11_4 = _mm256_add_epi32(s3_11_2, k__DCT_CONST_ROUNDING);
+          const __m256i s3_11_5 = _mm256_add_epi32(s3_11_3, k__DCT_CONST_ROUNDING);
+          const __m256i s3_12_4 = _mm256_add_epi32(s3_12_2, k__DCT_CONST_ROUNDING);
+          const __m256i s3_12_5 = _mm256_add_epi32(s3_12_3, k__DCT_CONST_ROUNDING);
+          const __m256i s3_13_4 = _mm256_add_epi32(s3_13_2, k__DCT_CONST_ROUNDING);
+          const __m256i s3_13_5 = _mm256_add_epi32(s3_13_3, k__DCT_CONST_ROUNDING);
+          lstep3[20] = _mm256_srai_epi32(s3_10_4, DCT_CONST_BITS);
+          lstep3[21] = _mm256_srai_epi32(s3_10_5, DCT_CONST_BITS);
+          lstep3[22] = _mm256_srai_epi32(s3_11_4, DCT_CONST_BITS);
+          lstep3[23] = _mm256_srai_epi32(s3_11_5, DCT_CONST_BITS);
+          lstep3[24] = _mm256_srai_epi32(s3_12_4, DCT_CONST_BITS);
+          lstep3[25] = _mm256_srai_epi32(s3_12_5, DCT_CONST_BITS);
+          lstep3[26] = _mm256_srai_epi32(s3_13_4, DCT_CONST_BITS);
+          lstep3[27] = _mm256_srai_epi32(s3_13_5, DCT_CONST_BITS);
+        }
+        {
+          lstep2[40] = _mm256_unpacklo_epi16(step2[20], kZero);
+          lstep2[41] = _mm256_unpackhi_epi16(step2[20], kZero);
+          lstep2[42] = _mm256_unpacklo_epi16(step2[21], kZero);
+          lstep2[43] = _mm256_unpackhi_epi16(step2[21], kZero);
+          lstep2[44] = _mm256_unpacklo_epi16(step2[22], kZero);
+          lstep2[45] = _mm256_unpackhi_epi16(step2[22], kZero);
+          lstep2[46] = _mm256_unpacklo_epi16(step2[23], kZero);
+          lstep2[47] = _mm256_unpackhi_epi16(step2[23], kZero);
+          lstep2[48] = _mm256_unpacklo_epi16(step2[24], kZero);
+          lstep2[49] = _mm256_unpackhi_epi16(step2[24], kZero);
+          lstep2[50] = _mm256_unpacklo_epi16(step2[25], kZero);
+          lstep2[51] = _mm256_unpackhi_epi16(step2[25], kZero);
+          lstep2[52] = _mm256_unpacklo_epi16(step2[26], kZero);
+          lstep2[53] = _mm256_unpackhi_epi16(step2[26], kZero);
+          lstep2[54] = _mm256_unpacklo_epi16(step2[27], kZero);
+          lstep2[55] = _mm256_unpackhi_epi16(step2[27], kZero);
+          lstep2[40] = _mm256_madd_epi16(lstep2[40], kOne);
+          lstep2[41] = _mm256_madd_epi16(lstep2[41], kOne);
+          lstep2[42] = _mm256_madd_epi16(lstep2[42], kOne);
+          lstep2[43] = _mm256_madd_epi16(lstep2[43], kOne);
+          lstep2[44] = _mm256_madd_epi16(lstep2[44], kOne);
+          lstep2[45] = _mm256_madd_epi16(lstep2[45], kOne);
+          lstep2[46] = _mm256_madd_epi16(lstep2[46], kOne);
+          lstep2[47] = _mm256_madd_epi16(lstep2[47], kOne);
+          lstep2[48] = _mm256_madd_epi16(lstep2[48], kOne);
+          lstep2[49] = _mm256_madd_epi16(lstep2[49], kOne);
+          lstep2[50] = _mm256_madd_epi16(lstep2[50], kOne);
+          lstep2[51] = _mm256_madd_epi16(lstep2[51], kOne);
+          lstep2[52] = _mm256_madd_epi16(lstep2[52], kOne);
+          lstep2[53] = _mm256_madd_epi16(lstep2[53], kOne);
+          lstep2[54] = _mm256_madd_epi16(lstep2[54], kOne);
+          lstep2[55] = _mm256_madd_epi16(lstep2[55], kOne);
+
+          lstep1[32] = _mm256_unpacklo_epi16(step1[16], kZero);
+          lstep1[33] = _mm256_unpackhi_epi16(step1[16], kZero);
+          lstep1[34] = _mm256_unpacklo_epi16(step1[17], kZero);
+          lstep1[35] = _mm256_unpackhi_epi16(step1[17], kZero);
+          lstep1[36] = _mm256_unpacklo_epi16(step1[18], kZero);
+          lstep1[37] = _mm256_unpackhi_epi16(step1[18], kZero);
+          lstep1[38] = _mm256_unpacklo_epi16(step1[19], kZero);
+          lstep1[39] = _mm256_unpackhi_epi16(step1[19], kZero);
+          lstep1[56] = _mm256_unpacklo_epi16(step1[28], kZero);
+          lstep1[57] = _mm256_unpackhi_epi16(step1[28], kZero);
+          lstep1[58] = _mm256_unpacklo_epi16(step1[29], kZero);
+          lstep1[59] = _mm256_unpackhi_epi16(step1[29], kZero);
+          lstep1[60] = _mm256_unpacklo_epi16(step1[30], kZero);
+          lstep1[61] = _mm256_unpackhi_epi16(step1[30], kZero);
+          lstep1[62] = _mm256_unpacklo_epi16(step1[31], kZero);
+          lstep1[63] = _mm256_unpackhi_epi16(step1[31], kZero);
+          lstep1[32] = _mm256_madd_epi16(lstep1[32], kOne);
+          lstep1[33] = _mm256_madd_epi16(lstep1[33], kOne);
+          lstep1[34] = _mm256_madd_epi16(lstep1[34], kOne);
+          lstep1[35] = _mm256_madd_epi16(lstep1[35], kOne);
+          lstep1[36] = _mm256_madd_epi16(lstep1[36], kOne);
+          lstep1[37] = _mm256_madd_epi16(lstep1[37], kOne);
+          lstep1[38] = _mm256_madd_epi16(lstep1[38], kOne);
+          lstep1[39] = _mm256_madd_epi16(lstep1[39], kOne);
+          lstep1[56] = _mm256_madd_epi16(lstep1[56], kOne);
+          lstep1[57] = _mm256_madd_epi16(lstep1[57], kOne);
+          lstep1[58] = _mm256_madd_epi16(lstep1[58], kOne);
+          lstep1[59] = _mm256_madd_epi16(lstep1[59], kOne);
+          lstep1[60] = _mm256_madd_epi16(lstep1[60], kOne);
+          lstep1[61] = _mm256_madd_epi16(lstep1[61], kOne);
+          lstep1[62] = _mm256_madd_epi16(lstep1[62], kOne);
+          lstep1[63] = _mm256_madd_epi16(lstep1[63], kOne);
+
+          lstep3[32] = _mm256_add_epi32(lstep2[46], lstep1[32]);
+          lstep3[33] = _mm256_add_epi32(lstep2[47], lstep1[33]);
+
+          lstep3[34] = _mm256_add_epi32(lstep2[44], lstep1[34]);
+          lstep3[35] = _mm256_add_epi32(lstep2[45], lstep1[35]);
+          lstep3[36] = _mm256_add_epi32(lstep2[42], lstep1[36]);
+          lstep3[37] = _mm256_add_epi32(lstep2[43], lstep1[37]);
+          lstep3[38] = _mm256_add_epi32(lstep2[40], lstep1[38]);
+          lstep3[39] = _mm256_add_epi32(lstep2[41], lstep1[39]);
+          lstep3[40] = _mm256_sub_epi32(lstep1[38], lstep2[40]);
+          lstep3[41] = _mm256_sub_epi32(lstep1[39], lstep2[41]);
+          lstep3[42] = _mm256_sub_epi32(lstep1[36], lstep2[42]);
+          lstep3[43] = _mm256_sub_epi32(lstep1[37], lstep2[43]);
+          lstep3[44] = _mm256_sub_epi32(lstep1[34], lstep2[44]);
+          lstep3[45] = _mm256_sub_epi32(lstep1[35], lstep2[45]);
+          lstep3[46] = _mm256_sub_epi32(lstep1[32], lstep2[46]);
+          lstep3[47] = _mm256_sub_epi32(lstep1[33], lstep2[47]);
+          lstep3[48] = _mm256_sub_epi32(lstep1[62], lstep2[48]);
+          lstep3[49] = _mm256_sub_epi32(lstep1[63], lstep2[49]);
+          lstep3[50] = _mm256_sub_epi32(lstep1[60], lstep2[50]);
+          lstep3[51] = _mm256_sub_epi32(lstep1[61], lstep2[51]);
+          lstep3[52] = _mm256_sub_epi32(lstep1[58], lstep2[52]);
+          lstep3[53] = _mm256_sub_epi32(lstep1[59], lstep2[53]);
+          lstep3[54] = _mm256_sub_epi32(lstep1[56], lstep2[54]);
+          lstep3[55] = _mm256_sub_epi32(lstep1[57], lstep2[55]);
+          lstep3[56] = _mm256_add_epi32(lstep2[54], lstep1[56]);
+          lstep3[57] = _mm256_add_epi32(lstep2[55], lstep1[57]);
+          lstep3[58] = _mm256_add_epi32(lstep2[52], lstep1[58]);
+          lstep3[59] = _mm256_add_epi32(lstep2[53], lstep1[59]);
+          lstep3[60] = _mm256_add_epi32(lstep2[50], lstep1[60]);
+          lstep3[61] = _mm256_add_epi32(lstep2[51], lstep1[61]);
+          lstep3[62] = _mm256_add_epi32(lstep2[48], lstep1[62]);
+          lstep3[63] = _mm256_add_epi32(lstep2[49], lstep1[63]);
+        }
+
+        // stage 4
+        {
+          // expanding to 32-bit length priori to addition operations
+          lstep2[16] = _mm256_unpacklo_epi16(step2[ 8], kZero);
+          lstep2[17] = _mm256_unpackhi_epi16(step2[ 8], kZero);
+          lstep2[18] = _mm256_unpacklo_epi16(step2[ 9], kZero);
+          lstep2[19] = _mm256_unpackhi_epi16(step2[ 9], kZero);
+          lstep2[28] = _mm256_unpacklo_epi16(step2[14], kZero);
+          lstep2[29] = _mm256_unpackhi_epi16(step2[14], kZero);
+          lstep2[30] = _mm256_unpacklo_epi16(step2[15], kZero);
+          lstep2[31] = _mm256_unpackhi_epi16(step2[15], kZero);
+          lstep2[16] = _mm256_madd_epi16(lstep2[16], kOne);
+          lstep2[17] = _mm256_madd_epi16(lstep2[17], kOne);
+          lstep2[18] = _mm256_madd_epi16(lstep2[18], kOne);
+          lstep2[19] = _mm256_madd_epi16(lstep2[19], kOne);
+          lstep2[28] = _mm256_madd_epi16(lstep2[28], kOne);
+          lstep2[29] = _mm256_madd_epi16(lstep2[29], kOne);
+          lstep2[30] = _mm256_madd_epi16(lstep2[30], kOne);
+          lstep2[31] = _mm256_madd_epi16(lstep2[31], kOne);
+
+          lstep1[ 0] = _mm256_add_epi32(lstep3[ 6], lstep3[ 0]);
+          lstep1[ 1] = _mm256_add_epi32(lstep3[ 7], lstep3[ 1]);
+          lstep1[ 2] = _mm256_add_epi32(lstep3[ 4], lstep3[ 2]);
+          lstep1[ 3] = _mm256_add_epi32(lstep3[ 5], lstep3[ 3]);
+          lstep1[ 4] = _mm256_sub_epi32(lstep3[ 2], lstep3[ 4]);
+          lstep1[ 5] = _mm256_sub_epi32(lstep3[ 3], lstep3[ 5]);
+          lstep1[ 6] = _mm256_sub_epi32(lstep3[ 0], lstep3[ 6]);
+          lstep1[ 7] = _mm256_sub_epi32(lstep3[ 1], lstep3[ 7]);
+          lstep1[16] = _mm256_add_epi32(lstep3[22], lstep2[16]);
+          lstep1[17] = _mm256_add_epi32(lstep3[23], lstep2[17]);
+          lstep1[18] = _mm256_add_epi32(lstep3[20], lstep2[18]);
+          lstep1[19] = _mm256_add_epi32(lstep3[21], lstep2[19]);
+          lstep1[20] = _mm256_sub_epi32(lstep2[18], lstep3[20]);
+          lstep1[21] = _mm256_sub_epi32(lstep2[19], lstep3[21]);
+          lstep1[22] = _mm256_sub_epi32(lstep2[16], lstep3[22]);
+          lstep1[23] = _mm256_sub_epi32(lstep2[17], lstep3[23]);
+          lstep1[24] = _mm256_sub_epi32(lstep2[30], lstep3[24]);
+          lstep1[25] = _mm256_sub_epi32(lstep2[31], lstep3[25]);
+          lstep1[26] = _mm256_sub_epi32(lstep2[28], lstep3[26]);
+          lstep1[27] = _mm256_sub_epi32(lstep2[29], lstep3[27]);
+          lstep1[28] = _mm256_add_epi32(lstep3[26], lstep2[28]);
+          lstep1[29] = _mm256_add_epi32(lstep3[27], lstep2[29]);
+          lstep1[30] = _mm256_add_epi32(lstep3[24], lstep2[30]);
+          lstep1[31] = _mm256_add_epi32(lstep3[25], lstep2[31]);
+        }
+        {
+        // to be continued...
+        //
+        const __m256i k32_p16_p16 = pair256_set_epi32(cospi_16_64, cospi_16_64);
+        const __m256i k32_p16_m16 = pair256_set_epi32(cospi_16_64, -cospi_16_64);
+
+        u[0] = _mm256_unpacklo_epi32(lstep3[12], lstep3[10]);
+        u[1] = _mm256_unpackhi_epi32(lstep3[12], lstep3[10]);
+        u[2] = _mm256_unpacklo_epi32(lstep3[13], lstep3[11]);
+        u[3] = _mm256_unpackhi_epi32(lstep3[13], lstep3[11]);
+
+        // TODO(jingning): manually inline k_madd_epi32_avx2_ to further hide
+        // instruction latency.
+        v[ 0] = k_madd_epi32_avx2(u[0], k32_p16_m16);
+        v[ 1] = k_madd_epi32_avx2(u[1], k32_p16_m16);
+        v[ 2] = k_madd_epi32_avx2(u[2], k32_p16_m16);
+        v[ 3] = k_madd_epi32_avx2(u[3], k32_p16_m16);
+        v[ 4] = k_madd_epi32_avx2(u[0], k32_p16_p16);
+        v[ 5] = k_madd_epi32_avx2(u[1], k32_p16_p16);
+        v[ 6] = k_madd_epi32_avx2(u[2], k32_p16_p16);
+        v[ 7] = k_madd_epi32_avx2(u[3], k32_p16_p16);
+
+        u[0] = k_packs_epi64_avx2(v[0], v[1]);
+        u[1] = k_packs_epi64_avx2(v[2], v[3]);
+        u[2] = k_packs_epi64_avx2(v[4], v[5]);
+        u[3] = k_packs_epi64_avx2(v[6], v[7]);
+
+        v[0] = _mm256_add_epi32(u[0], k__DCT_CONST_ROUNDING);
+        v[1] = _mm256_add_epi32(u[1], k__DCT_CONST_ROUNDING);
+        v[2] = _mm256_add_epi32(u[2], k__DCT_CONST_ROUNDING);
+        v[3] = _mm256_add_epi32(u[3], k__DCT_CONST_ROUNDING);
+
+        lstep1[10] = _mm256_srai_epi32(v[0], DCT_CONST_BITS);
+        lstep1[11] = _mm256_srai_epi32(v[1], DCT_CONST_BITS);
+        lstep1[12] = _mm256_srai_epi32(v[2], DCT_CONST_BITS);
+        lstep1[13] = _mm256_srai_epi32(v[3], DCT_CONST_BITS);
+        }
+        {
+          const __m256i k32_m08_p24 = pair256_set_epi32(-cospi_8_64, cospi_24_64);
+          const __m256i k32_m24_m08 = pair256_set_epi32(-cospi_24_64, -cospi_8_64);
+          const __m256i k32_p24_p08 = pair256_set_epi32(cospi_24_64, cospi_8_64);
+
+          u[ 0] = _mm256_unpacklo_epi32(lstep3[36], lstep3[58]);
+          u[ 1] = _mm256_unpackhi_epi32(lstep3[36], lstep3[58]);
+          u[ 2] = _mm256_unpacklo_epi32(lstep3[37], lstep3[59]);
+          u[ 3] = _mm256_unpackhi_epi32(lstep3[37], lstep3[59]);
+          u[ 4] = _mm256_unpacklo_epi32(lstep3[38], lstep3[56]);
+          u[ 5] = _mm256_unpackhi_epi32(lstep3[38], lstep3[56]);
+          u[ 6] = _mm256_unpacklo_epi32(lstep3[39], lstep3[57]);
+          u[ 7] = _mm256_unpackhi_epi32(lstep3[39], lstep3[57]);
+          u[ 8] = _mm256_unpacklo_epi32(lstep3[40], lstep3[54]);
+          u[ 9] = _mm256_unpackhi_epi32(lstep3[40], lstep3[54]);
+          u[10] = _mm256_unpacklo_epi32(lstep3[41], lstep3[55]);
+          u[11] = _mm256_unpackhi_epi32(lstep3[41], lstep3[55]);
+          u[12] = _mm256_unpacklo_epi32(lstep3[42], lstep3[52]);
+          u[13] = _mm256_unpackhi_epi32(lstep3[42], lstep3[52]);
+          u[14] = _mm256_unpacklo_epi32(lstep3[43], lstep3[53]);
+          u[15] = _mm256_unpackhi_epi32(lstep3[43], lstep3[53]);
+
+          v[ 0] = k_madd_epi32_avx2(u[ 0], k32_m08_p24);
+          v[ 1] = k_madd_epi32_avx2(u[ 1], k32_m08_p24);
+          v[ 2] = k_madd_epi32_avx2(u[ 2], k32_m08_p24);
+          v[ 3] = k_madd_epi32_avx2(u[ 3], k32_m08_p24);
+          v[ 4] = k_madd_epi32_avx2(u[ 4], k32_m08_p24);
+          v[ 5] = k_madd_epi32_avx2(u[ 5], k32_m08_p24);
+          v[ 6] = k_madd_epi32_avx2(u[ 6], k32_m08_p24);
+          v[ 7] = k_madd_epi32_avx2(u[ 7], k32_m08_p24);
+          v[ 8] = k_madd_epi32_avx2(u[ 8], k32_m24_m08);
+          v[ 9] = k_madd_epi32_avx2(u[ 9], k32_m24_m08);
+          v[10] = k_madd_epi32_avx2(u[10], k32_m24_m08);
+          v[11] = k_madd_epi32_avx2(u[11], k32_m24_m08);
+          v[12] = k_madd_epi32_avx2(u[12], k32_m24_m08);
+          v[13] = k_madd_epi32_avx2(u[13], k32_m24_m08);
+          v[14] = k_madd_epi32_avx2(u[14], k32_m24_m08);
+          v[15] = k_madd_epi32_avx2(u[15], k32_m24_m08);
+          v[16] = k_madd_epi32_avx2(u[12], k32_m08_p24);
+          v[17] = k_madd_epi32_avx2(u[13], k32_m08_p24);
+          v[18] = k_madd_epi32_avx2(u[14], k32_m08_p24);
+          v[19] = k_madd_epi32_avx2(u[15], k32_m08_p24);
+          v[20] = k_madd_epi32_avx2(u[ 8], k32_m08_p24);
+          v[21] = k_madd_epi32_avx2(u[ 9], k32_m08_p24);
+          v[22] = k_madd_epi32_avx2(u[10], k32_m08_p24);
+          v[23] = k_madd_epi32_avx2(u[11], k32_m08_p24);
+          v[24] = k_madd_epi32_avx2(u[ 4], k32_p24_p08);
+          v[25] = k_madd_epi32_avx2(u[ 5], k32_p24_p08);
+          v[26] = k_madd_epi32_avx2(u[ 6], k32_p24_p08);
+          v[27] = k_madd_epi32_avx2(u[ 7], k32_p24_p08);
+          v[28] = k_madd_epi32_avx2(u[ 0], k32_p24_p08);
+          v[29] = k_madd_epi32_avx2(u[ 1], k32_p24_p08);
+          v[30] = k_madd_epi32_avx2(u[ 2], k32_p24_p08);
+          v[31] = k_madd_epi32_avx2(u[ 3], k32_p24_p08);
+
+          u[ 0] = k_packs_epi64_avx2(v[ 0], v[ 1]);
+          u[ 1] = k_packs_epi64_avx2(v[ 2], v[ 3]);
+          u[ 2] = k_packs_epi64_avx2(v[ 4], v[ 5]);
+          u[ 3] = k_packs_epi64_avx2(v[ 6], v[ 7]);
+          u[ 4] = k_packs_epi64_avx2(v[ 8], v[ 9]);
+          u[ 5] = k_packs_epi64_avx2(v[10], v[11]);
+          u[ 6] = k_packs_epi64_avx2(v[12], v[13]);
+          u[ 7] = k_packs_epi64_avx2(v[14], v[15]);
+          u[ 8] = k_packs_epi64_avx2(v[16], v[17]);
+          u[ 9] = k_packs_epi64_avx2(v[18], v[19]);
+          u[10] = k_packs_epi64_avx2(v[20], v[21]);
+          u[11] = k_packs_epi64_avx2(v[22], v[23]);
+          u[12] = k_packs_epi64_avx2(v[24], v[25]);
+          u[13] = k_packs_epi64_avx2(v[26], v[27]);
+          u[14] = k_packs_epi64_avx2(v[28], v[29]);
+          u[15] = k_packs_epi64_avx2(v[30], v[31]);
+
+          v[ 0] = _mm256_add_epi32(u[ 0], k__DCT_CONST_ROUNDING);
+          v[ 1] = _mm256_add_epi32(u[ 1], k__DCT_CONST_ROUNDING);
+          v[ 2] = _mm256_add_epi32(u[ 2], k__DCT_CONST_ROUNDING);
+          v[ 3] = _mm256_add_epi32(u[ 3], k__DCT_CONST_ROUNDING);
+          v[ 4] = _mm256_add_epi32(u[ 4], k__DCT_CONST_ROUNDING);
+          v[ 5] = _mm256_add_epi32(u[ 5], k__DCT_CONST_ROUNDING);
+          v[ 6] = _mm256_add_epi32(u[ 6], k__DCT_CONST_ROUNDING);
+          v[ 7] = _mm256_add_epi32(u[ 7], k__DCT_CONST_ROUNDING);
+          v[ 8] = _mm256_add_epi32(u[ 8], k__DCT_CONST_ROUNDING);
+          v[ 9] = _mm256_add_epi32(u[ 9], k__DCT_CONST_ROUNDING);
+          v[10] = _mm256_add_epi32(u[10], k__DCT_CONST_ROUNDING);
+          v[11] = _mm256_add_epi32(u[11], k__DCT_CONST_ROUNDING);
+          v[12] = _mm256_add_epi32(u[12], k__DCT_CONST_ROUNDING);
+          v[13] = _mm256_add_epi32(u[13], k__DCT_CONST_ROUNDING);
+          v[14] = _mm256_add_epi32(u[14], k__DCT_CONST_ROUNDING);
+          v[15] = _mm256_add_epi32(u[15], k__DCT_CONST_ROUNDING);
+
+          lstep1[36] = _mm256_srai_epi32(v[ 0], DCT_CONST_BITS);
+          lstep1[37] = _mm256_srai_epi32(v[ 1], DCT_CONST_BITS);
+          lstep1[38] = _mm256_srai_epi32(v[ 2], DCT_CONST_BITS);
+          lstep1[39] = _mm256_srai_epi32(v[ 3], DCT_CONST_BITS);
+          lstep1[40] = _mm256_srai_epi32(v[ 4], DCT_CONST_BITS);
+          lstep1[41] = _mm256_srai_epi32(v[ 5], DCT_CONST_BITS);
+          lstep1[42] = _mm256_srai_epi32(v[ 6], DCT_CONST_BITS);
+          lstep1[43] = _mm256_srai_epi32(v[ 7], DCT_CONST_BITS);
+          lstep1[52] = _mm256_srai_epi32(v[ 8], DCT_CONST_BITS);
+          lstep1[53] = _mm256_srai_epi32(v[ 9], DCT_CONST_BITS);
+          lstep1[54] = _mm256_srai_epi32(v[10], DCT_CONST_BITS);
+          lstep1[55] = _mm256_srai_epi32(v[11], DCT_CONST_BITS);
+          lstep1[56] = _mm256_srai_epi32(v[12], DCT_CONST_BITS);
+          lstep1[57] = _mm256_srai_epi32(v[13], DCT_CONST_BITS);
+          lstep1[58] = _mm256_srai_epi32(v[14], DCT_CONST_BITS);
+          lstep1[59] = _mm256_srai_epi32(v[15], DCT_CONST_BITS);
+        }
+        // stage 5
+        {
+          lstep2[ 8] = _mm256_add_epi32(lstep1[10], lstep3[ 8]);
+          lstep2[ 9] = _mm256_add_epi32(lstep1[11], lstep3[ 9]);
+          lstep2[10] = _mm256_sub_epi32(lstep3[ 8], lstep1[10]);
+          lstep2[11] = _mm256_sub_epi32(lstep3[ 9], lstep1[11]);
+          lstep2[12] = _mm256_sub_epi32(lstep3[14], lstep1[12]);
+          lstep2[13] = _mm256_sub_epi32(lstep3[15], lstep1[13]);
+          lstep2[14] = _mm256_add_epi32(lstep1[12], lstep3[14]);
+          lstep2[15] = _mm256_add_epi32(lstep1[13], lstep3[15]);
+        }
+        {
+          const __m256i k32_p16_p16 = pair256_set_epi32(cospi_16_64, cospi_16_64);
+          const __m256i k32_p16_m16 = pair256_set_epi32(cospi_16_64, -cospi_16_64);
+          const __m256i k32_p24_p08 = pair256_set_epi32(cospi_24_64, cospi_8_64);
+          const __m256i k32_m08_p24 = pair256_set_epi32(-cospi_8_64, cospi_24_64);
+
+          u[0] = _mm256_unpacklo_epi32(lstep1[0], lstep1[2]);
+          u[1] = _mm256_unpackhi_epi32(lstep1[0], lstep1[2]);
+          u[2] = _mm256_unpacklo_epi32(lstep1[1], lstep1[3]);
+          u[3] = _mm256_unpackhi_epi32(lstep1[1], lstep1[3]);
+          u[4] = _mm256_unpacklo_epi32(lstep1[4], lstep1[6]);
+          u[5] = _mm256_unpackhi_epi32(lstep1[4], lstep1[6]);
+          u[6] = _mm256_unpacklo_epi32(lstep1[5], lstep1[7]);
+          u[7] = _mm256_unpackhi_epi32(lstep1[5], lstep1[7]);
+
+          // TODO(jingning): manually inline k_madd_epi32_avx2_ to further hide
+          // instruction latency.
+          v[ 0] = k_madd_epi32_avx2(u[0], k32_p16_p16);
+          v[ 1] = k_madd_epi32_avx2(u[1], k32_p16_p16);
+          v[ 2] = k_madd_epi32_avx2(u[2], k32_p16_p16);
+          v[ 3] = k_madd_epi32_avx2(u[3], k32_p16_p16);
+          v[ 4] = k_madd_epi32_avx2(u[0], k32_p16_m16);
+          v[ 5] = k_madd_epi32_avx2(u[1], k32_p16_m16);
+          v[ 6] = k_madd_epi32_avx2(u[2], k32_p16_m16);
+          v[ 7] = k_madd_epi32_avx2(u[3], k32_p16_m16);
+          v[ 8] = k_madd_epi32_avx2(u[4], k32_p24_p08);
+          v[ 9] = k_madd_epi32_avx2(u[5], k32_p24_p08);
+          v[10] = k_madd_epi32_avx2(u[6], k32_p24_p08);
+          v[11] = k_madd_epi32_avx2(u[7], k32_p24_p08);
+          v[12] = k_madd_epi32_avx2(u[4], k32_m08_p24);
+          v[13] = k_madd_epi32_avx2(u[5], k32_m08_p24);
+          v[14] = k_madd_epi32_avx2(u[6], k32_m08_p24);
+          v[15] = k_madd_epi32_avx2(u[7], k32_m08_p24);
+
+          u[0] = k_packs_epi64_avx2(v[0], v[1]);
+          u[1] = k_packs_epi64_avx2(v[2], v[3]);
+          u[2] = k_packs_epi64_avx2(v[4], v[5]);
+          u[3] = k_packs_epi64_avx2(v[6], v[7]);
+          u[4] = k_packs_epi64_avx2(v[8], v[9]);
+          u[5] = k_packs_epi64_avx2(v[10], v[11]);
+          u[6] = k_packs_epi64_avx2(v[12], v[13]);
+          u[7] = k_packs_epi64_avx2(v[14], v[15]);
+
+          v[0] = _mm256_add_epi32(u[0], k__DCT_CONST_ROUNDING);
+          v[1] = _mm256_add_epi32(u[1], k__DCT_CONST_ROUNDING);
+          v[2] = _mm256_add_epi32(u[2], k__DCT_CONST_ROUNDING);
+          v[3] = _mm256_add_epi32(u[3], k__DCT_CONST_ROUNDING);
+          v[4] = _mm256_add_epi32(u[4], k__DCT_CONST_ROUNDING);
+          v[5] = _mm256_add_epi32(u[5], k__DCT_CONST_ROUNDING);
+          v[6] = _mm256_add_epi32(u[6], k__DCT_CONST_ROUNDING);
+          v[7] = _mm256_add_epi32(u[7], k__DCT_CONST_ROUNDING);
+
+          u[0] = _mm256_srai_epi32(v[0], DCT_CONST_BITS);
+          u[1] = _mm256_srai_epi32(v[1], DCT_CONST_BITS);
+          u[2] = _mm256_srai_epi32(v[2], DCT_CONST_BITS);
+          u[3] = _mm256_srai_epi32(v[3], DCT_CONST_BITS);
+          u[4] = _mm256_srai_epi32(v[4], DCT_CONST_BITS);
+          u[5] = _mm256_srai_epi32(v[5], DCT_CONST_BITS);
+          u[6] = _mm256_srai_epi32(v[6], DCT_CONST_BITS);
+          u[7] = _mm256_srai_epi32(v[7], DCT_CONST_BITS);
+
+          sign[0] = _mm256_cmpgt_epi32(kZero,u[0]);
+          sign[1] = _mm256_cmpgt_epi32(kZero,u[1]);
+          sign[2] = _mm256_cmpgt_epi32(kZero,u[2]);
+          sign[3] = _mm256_cmpgt_epi32(kZero,u[3]);
+          sign[4] = _mm256_cmpgt_epi32(kZero,u[4]);
+          sign[5] = _mm256_cmpgt_epi32(kZero,u[5]);
+          sign[6] = _mm256_cmpgt_epi32(kZero,u[6]);
+          sign[7] = _mm256_cmpgt_epi32(kZero,u[7]);
+
+          u[0] = _mm256_sub_epi32(u[0], sign[0]);
+          u[1] = _mm256_sub_epi32(u[1], sign[1]);
+          u[2] = _mm256_sub_epi32(u[2], sign[2]);
+          u[3] = _mm256_sub_epi32(u[3], sign[3]);
+          u[4] = _mm256_sub_epi32(u[4], sign[4]);
+          u[5] = _mm256_sub_epi32(u[5], sign[5]);
+          u[6] = _mm256_sub_epi32(u[6], sign[6]);
+          u[7] = _mm256_sub_epi32(u[7], sign[7]);
+
+          u[0] = _mm256_add_epi32(u[0], K32One);
+          u[1] = _mm256_add_epi32(u[1], K32One);
+          u[2] = _mm256_add_epi32(u[2], K32One);
+          u[3] = _mm256_add_epi32(u[3], K32One);
+          u[4] = _mm256_add_epi32(u[4], K32One);
+          u[5] = _mm256_add_epi32(u[5], K32One);
+          u[6] = _mm256_add_epi32(u[6], K32One);
+          u[7] = _mm256_add_epi32(u[7], K32One);
+
+          u[0] = _mm256_srai_epi32(u[0], 2);
+          u[1] = _mm256_srai_epi32(u[1], 2);
+          u[2] = _mm256_srai_epi32(u[2], 2);
+          u[3] = _mm256_srai_epi32(u[3], 2);
+          u[4] = _mm256_srai_epi32(u[4], 2);
+          u[5] = _mm256_srai_epi32(u[5], 2);
+          u[6] = _mm256_srai_epi32(u[6], 2);
+          u[7] = _mm256_srai_epi32(u[7], 2);
+
+          // Combine
+          out[ 0] = _mm256_packs_epi32(u[0], u[1]);
+          out[16] = _mm256_packs_epi32(u[2], u[3]);
+          out[ 8] = _mm256_packs_epi32(u[4], u[5]);
+          out[24] = _mm256_packs_epi32(u[6], u[7]);
+        }
+        {
+          const __m256i k32_m08_p24 = pair256_set_epi32(-cospi_8_64, cospi_24_64);
+          const __m256i k32_m24_m08 = pair256_set_epi32(-cospi_24_64, -cospi_8_64);
+          const __m256i k32_p24_p08 = pair256_set_epi32(cospi_24_64, cospi_8_64);
+
+          u[0] = _mm256_unpacklo_epi32(lstep1[18], lstep1[28]);
+          u[1] = _mm256_unpackhi_epi32(lstep1[18], lstep1[28]);
+          u[2] = _mm256_unpacklo_epi32(lstep1[19], lstep1[29]);
+          u[3] = _mm256_unpackhi_epi32(lstep1[19], lstep1[29]);
+          u[4] = _mm256_unpacklo_epi32(lstep1[20], lstep1[26]);
+          u[5] = _mm256_unpackhi_epi32(lstep1[20], lstep1[26]);
+          u[6] = _mm256_unpacklo_epi32(lstep1[21], lstep1[27]);
+          u[7] = _mm256_unpackhi_epi32(lstep1[21], lstep1[27]);
+
+          v[0] = k_madd_epi32_avx2(u[0], k32_m08_p24);
+          v[1] = k_madd_epi32_avx2(u[1], k32_m08_p24);
+          v[2] = k_madd_epi32_avx2(u[2], k32_m08_p24);
+          v[3] = k_madd_epi32_avx2(u[3], k32_m08_p24);
+          v[4] = k_madd_epi32_avx2(u[4], k32_m24_m08);
+          v[5] = k_madd_epi32_avx2(u[5], k32_m24_m08);
+          v[6] = k_madd_epi32_avx2(u[6], k32_m24_m08);
+          v[7] = k_madd_epi32_avx2(u[7], k32_m24_m08);
+          v[ 8] = k_madd_epi32_avx2(u[4], k32_m08_p24);
+          v[ 9] = k_madd_epi32_avx2(u[5], k32_m08_p24);
+          v[10] = k_madd_epi32_avx2(u[6], k32_m08_p24);
+          v[11] = k_madd_epi32_avx2(u[7], k32_m08_p24);
+          v[12] = k_madd_epi32_avx2(u[0], k32_p24_p08);
+          v[13] = k_madd_epi32_avx2(u[1], k32_p24_p08);
+          v[14] = k_madd_epi32_avx2(u[2], k32_p24_p08);
+          v[15] = k_madd_epi32_avx2(u[3], k32_p24_p08);
+
+          u[0] = k_packs_epi64_avx2(v[0], v[1]);
+          u[1] = k_packs_epi64_avx2(v[2], v[3]);
+          u[2] = k_packs_epi64_avx2(v[4], v[5]);
+          u[3] = k_packs_epi64_avx2(v[6], v[7]);
+          u[4] = k_packs_epi64_avx2(v[8], v[9]);
+          u[5] = k_packs_epi64_avx2(v[10], v[11]);
+          u[6] = k_packs_epi64_avx2(v[12], v[13]);
+          u[7] = k_packs_epi64_avx2(v[14], v[15]);
+
+          u[0] = _mm256_add_epi32(u[0], k__DCT_CONST_ROUNDING);
+          u[1] = _mm256_add_epi32(u[1], k__DCT_CONST_ROUNDING);
+          u[2] = _mm256_add_epi32(u[2], k__DCT_CONST_ROUNDING);
+          u[3] = _mm256_add_epi32(u[3], k__DCT_CONST_ROUNDING);
+          u[4] = _mm256_add_epi32(u[4], k__DCT_CONST_ROUNDING);
+          u[5] = _mm256_add_epi32(u[5], k__DCT_CONST_ROUNDING);
+          u[6] = _mm256_add_epi32(u[6], k__DCT_CONST_ROUNDING);
+          u[7] = _mm256_add_epi32(u[7], k__DCT_CONST_ROUNDING);
+
+          lstep2[18] = _mm256_srai_epi32(u[0], DCT_CONST_BITS);
+          lstep2[19] = _mm256_srai_epi32(u[1], DCT_CONST_BITS);
+          lstep2[20] = _mm256_srai_epi32(u[2], DCT_CONST_BITS);
+          lstep2[21] = _mm256_srai_epi32(u[3], DCT_CONST_BITS);
+          lstep2[26] = _mm256_srai_epi32(u[4], DCT_CONST_BITS);
+          lstep2[27] = _mm256_srai_epi32(u[5], DCT_CONST_BITS);
+          lstep2[28] = _mm256_srai_epi32(u[6], DCT_CONST_BITS);
+          lstep2[29] = _mm256_srai_epi32(u[7], DCT_CONST_BITS);
+        }
+        {
+          lstep2[32] = _mm256_add_epi32(lstep1[38], lstep3[32]);
+          lstep2[33] = _mm256_add_epi32(lstep1[39], lstep3[33]);
+          lstep2[34] = _mm256_add_epi32(lstep1[36], lstep3[34]);
+          lstep2[35] = _mm256_add_epi32(lstep1[37], lstep3[35]);
+          lstep2[36] = _mm256_sub_epi32(lstep3[34], lstep1[36]);
+          lstep2[37] = _mm256_sub_epi32(lstep3[35], lstep1[37]);
+          lstep2[38] = _mm256_sub_epi32(lstep3[32], lstep1[38]);
+          lstep2[39] = _mm256_sub_epi32(lstep3[33], lstep1[39]);
+          lstep2[40] = _mm256_sub_epi32(lstep3[46], lstep1[40]);
+          lstep2[41] = _mm256_sub_epi32(lstep3[47], lstep1[41]);
+          lstep2[42] = _mm256_sub_epi32(lstep3[44], lstep1[42]);
+          lstep2[43] = _mm256_sub_epi32(lstep3[45], lstep1[43]);
+          lstep2[44] = _mm256_add_epi32(lstep1[42], lstep3[44]);
+          lstep2[45] = _mm256_add_epi32(lstep1[43], lstep3[45]);
+          lstep2[46] = _mm256_add_epi32(lstep1[40], lstep3[46]);
+          lstep2[47] = _mm256_add_epi32(lstep1[41], lstep3[47]);
+          lstep2[48] = _mm256_add_epi32(lstep1[54], lstep3[48]);
+          lstep2[49] = _mm256_add_epi32(lstep1[55], lstep3[49]);
+          lstep2[50] = _mm256_add_epi32(lstep1[52], lstep3[50]);
+          lstep2[51] = _mm256_add_epi32(lstep1[53], lstep3[51]);
+          lstep2[52] = _mm256_sub_epi32(lstep3[50], lstep1[52]);
+          lstep2[53] = _mm256_sub_epi32(lstep3[51], lstep1[53]);
+          lstep2[54] = _mm256_sub_epi32(lstep3[48], lstep1[54]);
+          lstep2[55] = _mm256_sub_epi32(lstep3[49], lstep1[55]);
+          lstep2[56] = _mm256_sub_epi32(lstep3[62], lstep1[56]);
+          lstep2[57] = _mm256_sub_epi32(lstep3[63], lstep1[57]);
+          lstep2[58] = _mm256_sub_epi32(lstep3[60], lstep1[58]);
+          lstep2[59] = _mm256_sub_epi32(lstep3[61], lstep1[59]);
+          lstep2[60] = _mm256_add_epi32(lstep1[58], lstep3[60]);
+          lstep2[61] = _mm256_add_epi32(lstep1[59], lstep3[61]);
+          lstep2[62] = _mm256_add_epi32(lstep1[56], lstep3[62]);
+          lstep2[63] = _mm256_add_epi32(lstep1[57], lstep3[63]);
+        }
+        // stage 6
+        {
+          const __m256i k32_p28_p04 = pair256_set_epi32(cospi_28_64, cospi_4_64);
+          const __m256i k32_p12_p20 = pair256_set_epi32(cospi_12_64, cospi_20_64);
+          const __m256i k32_m20_p12 = pair256_set_epi32(-cospi_20_64, cospi_12_64);
+          const __m256i k32_m04_p28 = pair256_set_epi32(-cospi_4_64, cospi_28_64);
+
+          u[0] = _mm256_unpacklo_epi32(lstep2[ 8], lstep2[14]);
+          u[1] = _mm256_unpackhi_epi32(lstep2[ 8], lstep2[14]);
+          u[2] = _mm256_unpacklo_epi32(lstep2[ 9], lstep2[15]);
+          u[3] = _mm256_unpackhi_epi32(lstep2[ 9], lstep2[15]);
+          u[4] = _mm256_unpacklo_epi32(lstep2[10], lstep2[12]);
+          u[5] = _mm256_unpackhi_epi32(lstep2[10], lstep2[12]);
+          u[6] = _mm256_unpacklo_epi32(lstep2[11], lstep2[13]);
+          u[7] = _mm256_unpackhi_epi32(lstep2[11], lstep2[13]);
+          u[8] = _mm256_unpacklo_epi32(lstep2[10], lstep2[12]);
+          u[9] = _mm256_unpackhi_epi32(lstep2[10], lstep2[12]);
+          u[10] = _mm256_unpacklo_epi32(lstep2[11], lstep2[13]);
+          u[11] = _mm256_unpackhi_epi32(lstep2[11], lstep2[13]);
+          u[12] = _mm256_unpacklo_epi32(lstep2[ 8], lstep2[14]);
+          u[13] = _mm256_unpackhi_epi32(lstep2[ 8], lstep2[14]);
+          u[14] = _mm256_unpacklo_epi32(lstep2[ 9], lstep2[15]);
+          u[15] = _mm256_unpackhi_epi32(lstep2[ 9], lstep2[15]);
+
+          v[0] = k_madd_epi32_avx2(u[0], k32_p28_p04);
+          v[1] = k_madd_epi32_avx2(u[1], k32_p28_p04);
+          v[2] = k_madd_epi32_avx2(u[2], k32_p28_p04);
+          v[3] = k_madd_epi32_avx2(u[3], k32_p28_p04);
+          v[4] = k_madd_epi32_avx2(u[4], k32_p12_p20);
+          v[5] = k_madd_epi32_avx2(u[5], k32_p12_p20);
+          v[6] = k_madd_epi32_avx2(u[6], k32_p12_p20);
+          v[7] = k_madd_epi32_avx2(u[7], k32_p12_p20);
+          v[ 8] = k_madd_epi32_avx2(u[ 8], k32_m20_p12);
+          v[ 9] = k_madd_epi32_avx2(u[ 9], k32_m20_p12);
+          v[10] = k_madd_epi32_avx2(u[10], k32_m20_p12);
+          v[11] = k_madd_epi32_avx2(u[11], k32_m20_p12);
+          v[12] = k_madd_epi32_avx2(u[12], k32_m04_p28);
+          v[13] = k_madd_epi32_avx2(u[13], k32_m04_p28);
+          v[14] = k_madd_epi32_avx2(u[14], k32_m04_p28);
+          v[15] = k_madd_epi32_avx2(u[15], k32_m04_p28);
+
+          u[0] = k_packs_epi64_avx2(v[0], v[1]);
+          u[1] = k_packs_epi64_avx2(v[2], v[3]);
+          u[2] = k_packs_epi64_avx2(v[4], v[5]);
+          u[3] = k_packs_epi64_avx2(v[6], v[7]);
+          u[4] = k_packs_epi64_avx2(v[8], v[9]);
+          u[5] = k_packs_epi64_avx2(v[10], v[11]);
+          u[6] = k_packs_epi64_avx2(v[12], v[13]);
+          u[7] = k_packs_epi64_avx2(v[14], v[15]);
+
+          v[0] = _mm256_add_epi32(u[0], k__DCT_CONST_ROUNDING);
+          v[1] = _mm256_add_epi32(u[1], k__DCT_CONST_ROUNDING);
+          v[2] = _mm256_add_epi32(u[2], k__DCT_CONST_ROUNDING);
+          v[3] = _mm256_add_epi32(u[3], k__DCT_CONST_ROUNDING);
+          v[4] = _mm256_add_epi32(u[4], k__DCT_CONST_ROUNDING);
+          v[5] = _mm256_add_epi32(u[5], k__DCT_CONST_ROUNDING);
+          v[6] = _mm256_add_epi32(u[6], k__DCT_CONST_ROUNDING);
+          v[7] = _mm256_add_epi32(u[7], k__DCT_CONST_ROUNDING);
+
+          u[0] = _mm256_srai_epi32(v[0], DCT_CONST_BITS);
+          u[1] = _mm256_srai_epi32(v[1], DCT_CONST_BITS);
+          u[2] = _mm256_srai_epi32(v[2], DCT_CONST_BITS);
+          u[3] = _mm256_srai_epi32(v[3], DCT_CONST_BITS);
+          u[4] = _mm256_srai_epi32(v[4], DCT_CONST_BITS);
+          u[5] = _mm256_srai_epi32(v[5], DCT_CONST_BITS);
+          u[6] = _mm256_srai_epi32(v[6], DCT_CONST_BITS);
+          u[7] = _mm256_srai_epi32(v[7], DCT_CONST_BITS);
+
+          sign[0] = _mm256_cmpgt_epi32(kZero,u[0]);
+          sign[1] = _mm256_cmpgt_epi32(kZero,u[1]);
+          sign[2] = _mm256_cmpgt_epi32(kZero,u[2]);
+          sign[3] = _mm256_cmpgt_epi32(kZero,u[3]);
+          sign[4] = _mm256_cmpgt_epi32(kZero,u[4]);
+          sign[5] = _mm256_cmpgt_epi32(kZero,u[5]);
+          sign[6] = _mm256_cmpgt_epi32(kZero,u[6]);
+          sign[7] = _mm256_cmpgt_epi32(kZero,u[7]);
+
+          u[0] = _mm256_sub_epi32(u[0], sign[0]);
+          u[1] = _mm256_sub_epi32(u[1], sign[1]);
+          u[2] = _mm256_sub_epi32(u[2], sign[2]);
+          u[3] = _mm256_sub_epi32(u[3], sign[3]);
+          u[4] = _mm256_sub_epi32(u[4], sign[4]);
+          u[5] = _mm256_sub_epi32(u[5], sign[5]);
+          u[6] = _mm256_sub_epi32(u[6], sign[6]);
+          u[7] = _mm256_sub_epi32(u[7], sign[7]);
+
+          u[0] = _mm256_add_epi32(u[0], K32One);
+          u[1] = _mm256_add_epi32(u[1], K32One);
+          u[2] = _mm256_add_epi32(u[2], K32One);
+          u[3] = _mm256_add_epi32(u[3], K32One);
+          u[4] = _mm256_add_epi32(u[4], K32One);
+          u[5] = _mm256_add_epi32(u[5], K32One);
+          u[6] = _mm256_add_epi32(u[6], K32One);
+          u[7] = _mm256_add_epi32(u[7], K32One);
+
+          u[0] = _mm256_srai_epi32(u[0], 2);
+          u[1] = _mm256_srai_epi32(u[1], 2);
+          u[2] = _mm256_srai_epi32(u[2], 2);
+          u[3] = _mm256_srai_epi32(u[3], 2);
+          u[4] = _mm256_srai_epi32(u[4], 2);
+          u[5] = _mm256_srai_epi32(u[5], 2);
+          u[6] = _mm256_srai_epi32(u[6], 2);
+          u[7] = _mm256_srai_epi32(u[7], 2);
+
+          out[ 4] = _mm256_packs_epi32(u[0], u[1]);
+          out[20] = _mm256_packs_epi32(u[2], u[3]);
+          out[12] = _mm256_packs_epi32(u[4], u[5]);
+          out[28] = _mm256_packs_epi32(u[6], u[7]);
+        }
+        {
+          lstep3[16] = _mm256_add_epi32(lstep2[18], lstep1[16]);
+          lstep3[17] = _mm256_add_epi32(lstep2[19], lstep1[17]);
+          lstep3[18] = _mm256_sub_epi32(lstep1[16], lstep2[18]);
+          lstep3[19] = _mm256_sub_epi32(lstep1[17], lstep2[19]);
+          lstep3[20] = _mm256_sub_epi32(lstep1[22], lstep2[20]);
+          lstep3[21] = _mm256_sub_epi32(lstep1[23], lstep2[21]);
+          lstep3[22] = _mm256_add_epi32(lstep2[20], lstep1[22]);
+          lstep3[23] = _mm256_add_epi32(lstep2[21], lstep1[23]);
+          lstep3[24] = _mm256_add_epi32(lstep2[26], lstep1[24]);
+          lstep3[25] = _mm256_add_epi32(lstep2[27], lstep1[25]);
+          lstep3[26] = _mm256_sub_epi32(lstep1[24], lstep2[26]);
+          lstep3[27] = _mm256_sub_epi32(lstep1[25], lstep2[27]);
+          lstep3[28] = _mm256_sub_epi32(lstep1[30], lstep2[28]);
+          lstep3[29] = _mm256_sub_epi32(lstep1[31], lstep2[29]);
+          lstep3[30] = _mm256_add_epi32(lstep2[28], lstep1[30]);
+          lstep3[31] = _mm256_add_epi32(lstep2[29], lstep1[31]);
+        }
+        {
+          const __m256i k32_m04_p28 = pair256_set_epi32(-cospi_4_64, cospi_28_64);
+          const __m256i k32_m28_m04 = pair256_set_epi32(-cospi_28_64, -cospi_4_64);
+          const __m256i k32_m20_p12 = pair256_set_epi32(-cospi_20_64, cospi_12_64);
+          const __m256i k32_m12_m20 = pair256_set_epi32(-cospi_12_64,
+                                                     -cospi_20_64);
+          const __m256i k32_p12_p20 = pair256_set_epi32(cospi_12_64, cospi_20_64);
+          const __m256i k32_p28_p04 = pair256_set_epi32(cospi_28_64, cospi_4_64);
+
+          u[ 0] = _mm256_unpacklo_epi32(lstep2[34], lstep2[60]);
+          u[ 1] = _mm256_unpackhi_epi32(lstep2[34], lstep2[60]);
+          u[ 2] = _mm256_unpacklo_epi32(lstep2[35], lstep2[61]);
+          u[ 3] = _mm256_unpackhi_epi32(lstep2[35], lstep2[61]);
+          u[ 4] = _mm256_unpacklo_epi32(lstep2[36], lstep2[58]);
+          u[ 5] = _mm256_unpackhi_epi32(lstep2[36], lstep2[58]);
+          u[ 6] = _mm256_unpacklo_epi32(lstep2[37], lstep2[59]);
+          u[ 7] = _mm256_unpackhi_epi32(lstep2[37], lstep2[59]);
+          u[ 8] = _mm256_unpacklo_epi32(lstep2[42], lstep2[52]);
+          u[ 9] = _mm256_unpackhi_epi32(lstep2[42], lstep2[52]);
+          u[10] = _mm256_unpacklo_epi32(lstep2[43], lstep2[53]);
+          u[11] = _mm256_unpackhi_epi32(lstep2[43], lstep2[53]);
+          u[12] = _mm256_unpacklo_epi32(lstep2[44], lstep2[50]);
+          u[13] = _mm256_unpackhi_epi32(lstep2[44], lstep2[50]);
+          u[14] = _mm256_unpacklo_epi32(lstep2[45], lstep2[51]);
+          u[15] = _mm256_unpackhi_epi32(lstep2[45], lstep2[51]);
+
+          v[ 0] = k_madd_epi32_avx2(u[ 0], k32_m04_p28);
+          v[ 1] = k_madd_epi32_avx2(u[ 1], k32_m04_p28);
+          v[ 2] = k_madd_epi32_avx2(u[ 2], k32_m04_p28);
+          v[ 3] = k_madd_epi32_avx2(u[ 3], k32_m04_p28);
+          v[ 4] = k_madd_epi32_avx2(u[ 4], k32_m28_m04);
+          v[ 5] = k_madd_epi32_avx2(u[ 5], k32_m28_m04);
+          v[ 6] = k_madd_epi32_avx2(u[ 6], k32_m28_m04);
+          v[ 7] = k_madd_epi32_avx2(u[ 7], k32_m28_m04);
+          v[ 8] = k_madd_epi32_avx2(u[ 8], k32_m20_p12);
+          v[ 9] = k_madd_epi32_avx2(u[ 9], k32_m20_p12);
+          v[10] = k_madd_epi32_avx2(u[10], k32_m20_p12);
+          v[11] = k_madd_epi32_avx2(u[11], k32_m20_p12);
+          v[12] = k_madd_epi32_avx2(u[12], k32_m12_m20);
+          v[13] = k_madd_epi32_avx2(u[13], k32_m12_m20);
+          v[14] = k_madd_epi32_avx2(u[14], k32_m12_m20);
+          v[15] = k_madd_epi32_avx2(u[15], k32_m12_m20);
+          v[16] = k_madd_epi32_avx2(u[12], k32_m20_p12);
+          v[17] = k_madd_epi32_avx2(u[13], k32_m20_p12);
+          v[18] = k_madd_epi32_avx2(u[14], k32_m20_p12);
+          v[19] = k_madd_epi32_avx2(u[15], k32_m20_p12);
+          v[20] = k_madd_epi32_avx2(u[ 8], k32_p12_p20);
+          v[21] = k_madd_epi32_avx2(u[ 9], k32_p12_p20);
+          v[22] = k_madd_epi32_avx2(u[10], k32_p12_p20);
+          v[23] = k_madd_epi32_avx2(u[11], k32_p12_p20);
+          v[24] = k_madd_epi32_avx2(u[ 4], k32_m04_p28);
+          v[25] = k_madd_epi32_avx2(u[ 5], k32_m04_p28);
+          v[26] = k_madd_epi32_avx2(u[ 6], k32_m04_p28);
+          v[27] = k_madd_epi32_avx2(u[ 7], k32_m04_p28);
+          v[28] = k_madd_epi32_avx2(u[ 0], k32_p28_p04);
+          v[29] = k_madd_epi32_avx2(u[ 1], k32_p28_p04);
+          v[30] = k_madd_epi32_avx2(u[ 2], k32_p28_p04);
+          v[31] = k_madd_epi32_avx2(u[ 3], k32_p28_p04);
+
+          u[ 0] = k_packs_epi64_avx2(v[ 0], v[ 1]);
+          u[ 1] = k_packs_epi64_avx2(v[ 2], v[ 3]);
+          u[ 2] = k_packs_epi64_avx2(v[ 4], v[ 5]);
+          u[ 3] = k_packs_epi64_avx2(v[ 6], v[ 7]);
+          u[ 4] = k_packs_epi64_avx2(v[ 8], v[ 9]);
+          u[ 5] = k_packs_epi64_avx2(v[10], v[11]);
+          u[ 6] = k_packs_epi64_avx2(v[12], v[13]);
+          u[ 7] = k_packs_epi64_avx2(v[14], v[15]);
+          u[ 8] = k_packs_epi64_avx2(v[16], v[17]);
+          u[ 9] = k_packs_epi64_avx2(v[18], v[19]);
+          u[10] = k_packs_epi64_avx2(v[20], v[21]);
+          u[11] = k_packs_epi64_avx2(v[22], v[23]);
+          u[12] = k_packs_epi64_avx2(v[24], v[25]);
+          u[13] = k_packs_epi64_avx2(v[26], v[27]);
+          u[14] = k_packs_epi64_avx2(v[28], v[29]);
+          u[15] = k_packs_epi64_avx2(v[30], v[31]);
+
+          v[ 0] = _mm256_add_epi32(u[ 0], k__DCT_CONST_ROUNDING);
+          v[ 1] = _mm256_add_epi32(u[ 1], k__DCT_CONST_ROUNDING);
+          v[ 2] = _mm256_add_epi32(u[ 2], k__DCT_CONST_ROUNDING);
+          v[ 3] = _mm256_add_epi32(u[ 3], k__DCT_CONST_ROUNDING);
+          v[ 4] = _mm256_add_epi32(u[ 4], k__DCT_CONST_ROUNDING);
+          v[ 5] = _mm256_add_epi32(u[ 5], k__DCT_CONST_ROUNDING);
+          v[ 6] = _mm256_add_epi32(u[ 6], k__DCT_CONST_ROUNDING);
+          v[ 7] = _mm256_add_epi32(u[ 7], k__DCT_CONST_ROUNDING);
+          v[ 8] = _mm256_add_epi32(u[ 8], k__DCT_CONST_ROUNDING);
+          v[ 9] = _mm256_add_epi32(u[ 9], k__DCT_CONST_ROUNDING);
+          v[10] = _mm256_add_epi32(u[10], k__DCT_CONST_ROUNDING);
+          v[11] = _mm256_add_epi32(u[11], k__DCT_CONST_ROUNDING);
+          v[12] = _mm256_add_epi32(u[12], k__DCT_CONST_ROUNDING);
+          v[13] = _mm256_add_epi32(u[13], k__DCT_CONST_ROUNDING);
+          v[14] = _mm256_add_epi32(u[14], k__DCT_CONST_ROUNDING);
+          v[15] = _mm256_add_epi32(u[15], k__DCT_CONST_ROUNDING);
+
+          lstep3[34] = _mm256_srai_epi32(v[ 0], DCT_CONST_BITS);
+          lstep3[35] = _mm256_srai_epi32(v[ 1], DCT_CONST_BITS);
+          lstep3[36] = _mm256_srai_epi32(v[ 2], DCT_CONST_BITS);
+          lstep3[37] = _mm256_srai_epi32(v[ 3], DCT_CONST_BITS);
+          lstep3[42] = _mm256_srai_epi32(v[ 4], DCT_CONST_BITS);
+          lstep3[43] = _mm256_srai_epi32(v[ 5], DCT_CONST_BITS);
+          lstep3[44] = _mm256_srai_epi32(v[ 6], DCT_CONST_BITS);
+          lstep3[45] = _mm256_srai_epi32(v[ 7], DCT_CONST_BITS);
+          lstep3[50] = _mm256_srai_epi32(v[ 8], DCT_CONST_BITS);
+          lstep3[51] = _mm256_srai_epi32(v[ 9], DCT_CONST_BITS);
+          lstep3[52] = _mm256_srai_epi32(v[10], DCT_CONST_BITS);
+          lstep3[53] = _mm256_srai_epi32(v[11], DCT_CONST_BITS);
+          lstep3[58] = _mm256_srai_epi32(v[12], DCT_CONST_BITS);
+          lstep3[59] = _mm256_srai_epi32(v[13], DCT_CONST_BITS);
+          lstep3[60] = _mm256_srai_epi32(v[14], DCT_CONST_BITS);
+          lstep3[61] = _mm256_srai_epi32(v[15], DCT_CONST_BITS);
+        }
+        // stage 7
+        {
+          const __m256i k32_p30_p02 = pair256_set_epi32(cospi_30_64, cospi_2_64);
+          const __m256i k32_p14_p18 = pair256_set_epi32(cospi_14_64, cospi_18_64);
+          const __m256i k32_p22_p10 = pair256_set_epi32(cospi_22_64, cospi_10_64);
+          const __m256i k32_p06_p26 = pair256_set_epi32(cospi_6_64,  cospi_26_64);
+          const __m256i k32_m26_p06 = pair256_set_epi32(-cospi_26_64, cospi_6_64);
+          const __m256i k32_m10_p22 = pair256_set_epi32(-cospi_10_64, cospi_22_64);
+          const __m256i k32_m18_p14 = pair256_set_epi32(-cospi_18_64, cospi_14_64);
+          const __m256i k32_m02_p30 = pair256_set_epi32(-cospi_2_64, cospi_30_64);
+
+          u[ 0] = _mm256_unpacklo_epi32(lstep3[16], lstep3[30]);
+          u[ 1] = _mm256_unpackhi_epi32(lstep3[16], lstep3[30]);
+          u[ 2] = _mm256_unpacklo_epi32(lstep3[17], lstep3[31]);
+          u[ 3] = _mm256_unpackhi_epi32(lstep3[17], lstep3[31]);
+          u[ 4] = _mm256_unpacklo_epi32(lstep3[18], lstep3[28]);
+          u[ 5] = _mm256_unpackhi_epi32(lstep3[18], lstep3[28]);
+          u[ 6] = _mm256_unpacklo_epi32(lstep3[19], lstep3[29]);
+          u[ 7] = _mm256_unpackhi_epi32(lstep3[19], lstep3[29]);
+          u[ 8] = _mm256_unpacklo_epi32(lstep3[20], lstep3[26]);
+          u[ 9] = _mm256_unpackhi_epi32(lstep3[20], lstep3[26]);
+          u[10] = _mm256_unpacklo_epi32(lstep3[21], lstep3[27]);
+          u[11] = _mm256_unpackhi_epi32(lstep3[21], lstep3[27]);
+          u[12] = _mm256_unpacklo_epi32(lstep3[22], lstep3[24]);
+          u[13] = _mm256_unpackhi_epi32(lstep3[22], lstep3[24]);
+          u[14] = _mm256_unpacklo_epi32(lstep3[23], lstep3[25]);
+          u[15] = _mm256_unpackhi_epi32(lstep3[23], lstep3[25]);
+
+          v[ 0] = k_madd_epi32_avx2(u[ 0], k32_p30_p02);
+          v[ 1] = k_madd_epi32_avx2(u[ 1], k32_p30_p02);
+          v[ 2] = k_madd_epi32_avx2(u[ 2], k32_p30_p02);
+          v[ 3] = k_madd_epi32_avx2(u[ 3], k32_p30_p02);
+          v[ 4] = k_madd_epi32_avx2(u[ 4], k32_p14_p18);
+          v[ 5] = k_madd_epi32_avx2(u[ 5], k32_p14_p18);
+          v[ 6] = k_madd_epi32_avx2(u[ 6], k32_p14_p18);
+          v[ 7] = k_madd_epi32_avx2(u[ 7], k32_p14_p18);
+          v[ 8] = k_madd_epi32_avx2(u[ 8], k32_p22_p10);
+          v[ 9] = k_madd_epi32_avx2(u[ 9], k32_p22_p10);
+          v[10] = k_madd_epi32_avx2(u[10], k32_p22_p10);
+          v[11] = k_madd_epi32_avx2(u[11], k32_p22_p10);
+          v[12] = k_madd_epi32_avx2(u[12], k32_p06_p26);
+          v[13] = k_madd_epi32_avx2(u[13], k32_p06_p26);
+          v[14] = k_madd_epi32_avx2(u[14], k32_p06_p26);
+          v[15] = k_madd_epi32_avx2(u[15], k32_p06_p26);
+          v[16] = k_madd_epi32_avx2(u[12], k32_m26_p06);
+          v[17] = k_madd_epi32_avx2(u[13], k32_m26_p06);
+          v[18] = k_madd_epi32_avx2(u[14], k32_m26_p06);
+          v[19] = k_madd_epi32_avx2(u[15], k32_m26_p06);
+          v[20] = k_madd_epi32_avx2(u[ 8], k32_m10_p22);
+          v[21] = k_madd_epi32_avx2(u[ 9], k32_m10_p22);
+          v[22] = k_madd_epi32_avx2(u[10], k32_m10_p22);
+          v[23] = k_madd_epi32_avx2(u[11], k32_m10_p22);
+          v[24] = k_madd_epi32_avx2(u[ 4], k32_m18_p14);
+          v[25] = k_madd_epi32_avx2(u[ 5], k32_m18_p14);
+          v[26] = k_madd_epi32_avx2(u[ 6], k32_m18_p14);
+          v[27] = k_madd_epi32_avx2(u[ 7], k32_m18_p14);
+          v[28] = k_madd_epi32_avx2(u[ 0], k32_m02_p30);
+          v[29] = k_madd_epi32_avx2(u[ 1], k32_m02_p30);
+          v[30] = k_madd_epi32_avx2(u[ 2], k32_m02_p30);
+          v[31] = k_madd_epi32_avx2(u[ 3], k32_m02_p30);
+
+          u[ 0] = k_packs_epi64_avx2(v[ 0], v[ 1]);
+          u[ 1] = k_packs_epi64_avx2(v[ 2], v[ 3]);
+          u[ 2] = k_packs_epi64_avx2(v[ 4], v[ 5]);
+          u[ 3] = k_packs_epi64_avx2(v[ 6], v[ 7]);
+          u[ 4] = k_packs_epi64_avx2(v[ 8], v[ 9]);
+          u[ 5] = k_packs_epi64_avx2(v[10], v[11]);
+          u[ 6] = k_packs_epi64_avx2(v[12], v[13]);
+          u[ 7] = k_packs_epi64_avx2(v[14], v[15]);
+          u[ 8] = k_packs_epi64_avx2(v[16], v[17]);
+          u[ 9] = k_packs_epi64_avx2(v[18], v[19]);
+          u[10] = k_packs_epi64_avx2(v[20], v[21]);
+          u[11] = k_packs_epi64_avx2(v[22], v[23]);
+          u[12] = k_packs_epi64_avx2(v[24], v[25]);
+          u[13] = k_packs_epi64_avx2(v[26], v[27]);
+          u[14] = k_packs_epi64_avx2(v[28], v[29]);
+          u[15] = k_packs_epi64_avx2(v[30], v[31]);
+
+          v[ 0] = _mm256_add_epi32(u[ 0], k__DCT_CONST_ROUNDING);
+          v[ 1] = _mm256_add_epi32(u[ 1], k__DCT_CONST_ROUNDING);
+          v[ 2] = _mm256_add_epi32(u[ 2], k__DCT_CONST_ROUNDING);
+          v[ 3] = _mm256_add_epi32(u[ 3], k__DCT_CONST_ROUNDING);
+          v[ 4] = _mm256_add_epi32(u[ 4], k__DCT_CONST_ROUNDING);
+          v[ 5] = _mm256_add_epi32(u[ 5], k__DCT_CONST_ROUNDING);
+          v[ 6] = _mm256_add_epi32(u[ 6], k__DCT_CONST_ROUNDING);
+          v[ 7] = _mm256_add_epi32(u[ 7], k__DCT_CONST_ROUNDING);
+          v[ 8] = _mm256_add_epi32(u[ 8], k__DCT_CONST_ROUNDING);
+          v[ 9] = _mm256_add_epi32(u[ 9], k__DCT_CONST_ROUNDING);
+          v[10] = _mm256_add_epi32(u[10], k__DCT_CONST_ROUNDING);
+          v[11] = _mm256_add_epi32(u[11], k__DCT_CONST_ROUNDING);
+          v[12] = _mm256_add_epi32(u[12], k__DCT_CONST_ROUNDING);
+          v[13] = _mm256_add_epi32(u[13], k__DCT_CONST_ROUNDING);
+          v[14] = _mm256_add_epi32(u[14], k__DCT_CONST_ROUNDING);
+          v[15] = _mm256_add_epi32(u[15], k__DCT_CONST_ROUNDING);
+
+          u[ 0] = _mm256_srai_epi32(v[ 0], DCT_CONST_BITS);
+          u[ 1] = _mm256_srai_epi32(v[ 1], DCT_CONST_BITS);
+          u[ 2] = _mm256_srai_epi32(v[ 2], DCT_CONST_BITS);
+          u[ 3] = _mm256_srai_epi32(v[ 3], DCT_CONST_BITS);
+          u[ 4] = _mm256_srai_epi32(v[ 4], DCT_CONST_BITS);
+          u[ 5] = _mm256_srai_epi32(v[ 5], DCT_CONST_BITS);
+          u[ 6] = _mm256_srai_epi32(v[ 6], DCT_CONST_BITS);
+          u[ 7] = _mm256_srai_epi32(v[ 7], DCT_CONST_BITS);
+          u[ 8] = _mm256_srai_epi32(v[ 8], DCT_CONST_BITS);
+          u[ 9] = _mm256_srai_epi32(v[ 9], DCT_CONST_BITS);
+          u[10] = _mm256_srai_epi32(v[10], DCT_CONST_BITS);
+          u[11] = _mm256_srai_epi32(v[11], DCT_CONST_BITS);
+          u[12] = _mm256_srai_epi32(v[12], DCT_CONST_BITS);
+          u[13] = _mm256_srai_epi32(v[13], DCT_CONST_BITS);
+          u[14] = _mm256_srai_epi32(v[14], DCT_CONST_BITS);
+          u[15] = _mm256_srai_epi32(v[15], DCT_CONST_BITS);
+
+          v[ 0] = _mm256_cmpgt_epi32(kZero,u[ 0]);
+          v[ 1] = _mm256_cmpgt_epi32(kZero,u[ 1]);
+          v[ 2] = _mm256_cmpgt_epi32(kZero,u[ 2]);
+          v[ 3] = _mm256_cmpgt_epi32(kZero,u[ 3]);
+          v[ 4] = _mm256_cmpgt_epi32(kZero,u[ 4]);
+          v[ 5] = _mm256_cmpgt_epi32(kZero,u[ 5]);
+          v[ 6] = _mm256_cmpgt_epi32(kZero,u[ 6]);
+          v[ 7] = _mm256_cmpgt_epi32(kZero,u[ 7]);
+          v[ 8] = _mm256_cmpgt_epi32(kZero,u[ 8]);
+          v[ 9] = _mm256_cmpgt_epi32(kZero,u[ 9]);
+          v[10] = _mm256_cmpgt_epi32(kZero,u[10]);
+          v[11] = _mm256_cmpgt_epi32(kZero,u[11]);
+          v[12] = _mm256_cmpgt_epi32(kZero,u[12]);
+          v[13] = _mm256_cmpgt_epi32(kZero,u[13]);
+          v[14] = _mm256_cmpgt_epi32(kZero,u[14]);
+          v[15] = _mm256_cmpgt_epi32(kZero,u[15]);
+
+          u[ 0] = _mm256_sub_epi32(u[ 0], v[ 0]);
+          u[ 1] = _mm256_sub_epi32(u[ 1], v[ 1]);
+          u[ 2] = _mm256_sub_epi32(u[ 2], v[ 2]);
+          u[ 3] = _mm256_sub_epi32(u[ 3], v[ 3]);
+          u[ 4] = _mm256_sub_epi32(u[ 4], v[ 4]);
+          u[ 5] = _mm256_sub_epi32(u[ 5], v[ 5]);
+          u[ 6] = _mm256_sub_epi32(u[ 6], v[ 6]);
+          u[ 7] = _mm256_sub_epi32(u[ 7], v[ 7]);
+          u[ 8] = _mm256_sub_epi32(u[ 8], v[ 8]);
+          u[ 9] = _mm256_sub_epi32(u[ 9], v[ 9]);
+          u[10] = _mm256_sub_epi32(u[10], v[10]);
+          u[11] = _mm256_sub_epi32(u[11], v[11]);
+          u[12] = _mm256_sub_epi32(u[12], v[12]);
+          u[13] = _mm256_sub_epi32(u[13], v[13]);
+          u[14] = _mm256_sub_epi32(u[14], v[14]);
+          u[15] = _mm256_sub_epi32(u[15], v[15]);
+
+          v[ 0] = _mm256_add_epi32(u[ 0], K32One);
+          v[ 1] = _mm256_add_epi32(u[ 1], K32One);
+          v[ 2] = _mm256_add_epi32(u[ 2], K32One);
+          v[ 3] = _mm256_add_epi32(u[ 3], K32One);
+          v[ 4] = _mm256_add_epi32(u[ 4], K32One);
+          v[ 5] = _mm256_add_epi32(u[ 5], K32One);
+          v[ 6] = _mm256_add_epi32(u[ 6], K32One);
+          v[ 7] = _mm256_add_epi32(u[ 7], K32One);
+          v[ 8] = _mm256_add_epi32(u[ 8], K32One);
+          v[ 9] = _mm256_add_epi32(u[ 9], K32One);
+          v[10] = _mm256_add_epi32(u[10], K32One);
+          v[11] = _mm256_add_epi32(u[11], K32One);
+          v[12] = _mm256_add_epi32(u[12], K32One);
+          v[13] = _mm256_add_epi32(u[13], K32One);
+          v[14] = _mm256_add_epi32(u[14], K32One);
+          v[15] = _mm256_add_epi32(u[15], K32One);
+
+          u[ 0] = _mm256_srai_epi32(v[ 0], 2);
+          u[ 1] = _mm256_srai_epi32(v[ 1], 2);
+          u[ 2] = _mm256_srai_epi32(v[ 2], 2);
+          u[ 3] = _mm256_srai_epi32(v[ 3], 2);
+          u[ 4] = _mm256_srai_epi32(v[ 4], 2);
+          u[ 5] = _mm256_srai_epi32(v[ 5], 2);
+          u[ 6] = _mm256_srai_epi32(v[ 6], 2);
+          u[ 7] = _mm256_srai_epi32(v[ 7], 2);
+          u[ 8] = _mm256_srai_epi32(v[ 8], 2);
+          u[ 9] = _mm256_srai_epi32(v[ 9], 2);
+          u[10] = _mm256_srai_epi32(v[10], 2);
+          u[11] = _mm256_srai_epi32(v[11], 2);
+          u[12] = _mm256_srai_epi32(v[12], 2);
+          u[13] = _mm256_srai_epi32(v[13], 2);
+          u[14] = _mm256_srai_epi32(v[14], 2);
+          u[15] = _mm256_srai_epi32(v[15], 2);
+
+          out[ 2] = _mm256_packs_epi32(u[0], u[1]);
+          out[18] = _mm256_packs_epi32(u[2], u[3]);
+          out[10] = _mm256_packs_epi32(u[4], u[5]);
+          out[26] = _mm256_packs_epi32(u[6], u[7]);
+          out[ 6] = _mm256_packs_epi32(u[8], u[9]);
+          out[22] = _mm256_packs_epi32(u[10], u[11]);
+          out[14] = _mm256_packs_epi32(u[12], u[13]);
+          out[30] = _mm256_packs_epi32(u[14], u[15]);
+        }
+        {
+          lstep1[32] = _mm256_add_epi32(lstep3[34], lstep2[32]);
+          lstep1[33] = _mm256_add_epi32(lstep3[35], lstep2[33]);
+          lstep1[34] = _mm256_sub_epi32(lstep2[32], lstep3[34]);
+          lstep1[35] = _mm256_sub_epi32(lstep2[33], lstep3[35]);
+          lstep1[36] = _mm256_sub_epi32(lstep2[38], lstep3[36]);
+          lstep1[37] = _mm256_sub_epi32(lstep2[39], lstep3[37]);
+          lstep1[38] = _mm256_add_epi32(lstep3[36], lstep2[38]);
+          lstep1[39] = _mm256_add_epi32(lstep3[37], lstep2[39]);
+          lstep1[40] = _mm256_add_epi32(lstep3[42], lstep2[40]);
+          lstep1[41] = _mm256_add_epi32(lstep3[43], lstep2[41]);
+          lstep1[42] = _mm256_sub_epi32(lstep2[40], lstep3[42]);
+          lstep1[43] = _mm256_sub_epi32(lstep2[41], lstep3[43]);
+          lstep1[44] = _mm256_sub_epi32(lstep2[46], lstep3[44]);
+          lstep1[45] = _mm256_sub_epi32(lstep2[47], lstep3[45]);
+          lstep1[46] = _mm256_add_epi32(lstep3[44], lstep2[46]);
+          lstep1[47] = _mm256_add_epi32(lstep3[45], lstep2[47]);
+          lstep1[48] = _mm256_add_epi32(lstep3[50], lstep2[48]);
+          lstep1[49] = _mm256_add_epi32(lstep3[51], lstep2[49]);
+          lstep1[50] = _mm256_sub_epi32(lstep2[48], lstep3[50]);
+          lstep1[51] = _mm256_sub_epi32(lstep2[49], lstep3[51]);
+          lstep1[52] = _mm256_sub_epi32(lstep2[54], lstep3[52]);
+          lstep1[53] = _mm256_sub_epi32(lstep2[55], lstep3[53]);
+          lstep1[54] = _mm256_add_epi32(lstep3[52], lstep2[54]);
+          lstep1[55] = _mm256_add_epi32(lstep3[53], lstep2[55]);
+          lstep1[56] = _mm256_add_epi32(lstep3[58], lstep2[56]);
+          lstep1[57] = _mm256_add_epi32(lstep3[59], lstep2[57]);
+          lstep1[58] = _mm256_sub_epi32(lstep2[56], lstep3[58]);
+          lstep1[59] = _mm256_sub_epi32(lstep2[57], lstep3[59]);
+          lstep1[60] = _mm256_sub_epi32(lstep2[62], lstep3[60]);
+          lstep1[61] = _mm256_sub_epi32(lstep2[63], lstep3[61]);
+          lstep1[62] = _mm256_add_epi32(lstep3[60], lstep2[62]);
+          lstep1[63] = _mm256_add_epi32(lstep3[61], lstep2[63]);
+        }
+        // stage 8
+        {
+          const __m256i k32_p31_p01 = pair256_set_epi32(cospi_31_64, cospi_1_64);
+          const __m256i k32_p15_p17 = pair256_set_epi32(cospi_15_64, cospi_17_64);
+          const __m256i k32_p23_p09 = pair256_set_epi32(cospi_23_64, cospi_9_64);
+          const __m256i k32_p07_p25 = pair256_set_epi32(cospi_7_64, cospi_25_64);
+          const __m256i k32_m25_p07 = pair256_set_epi32(-cospi_25_64, cospi_7_64);
+          const __m256i k32_m09_p23 = pair256_set_epi32(-cospi_9_64, cospi_23_64);
+          const __m256i k32_m17_p15 = pair256_set_epi32(-cospi_17_64, cospi_15_64);
+          const __m256i k32_m01_p31 = pair256_set_epi32(-cospi_1_64, cospi_31_64);
+
+          u[ 0] = _mm256_unpacklo_epi32(lstep1[32], lstep1[62]);
+          u[ 1] = _mm256_unpackhi_epi32(lstep1[32], lstep1[62]);
+          u[ 2] = _mm256_unpacklo_epi32(lstep1[33], lstep1[63]);
+          u[ 3] = _mm256_unpackhi_epi32(lstep1[33], lstep1[63]);
+          u[ 4] = _mm256_unpacklo_epi32(lstep1[34], lstep1[60]);
+          u[ 5] = _mm256_unpackhi_epi32(lstep1[34], lstep1[60]);
+          u[ 6] = _mm256_unpacklo_epi32(lstep1[35], lstep1[61]);
+          u[ 7] = _mm256_unpackhi_epi32(lstep1[35], lstep1[61]);
+          u[ 8] = _mm256_unpacklo_epi32(lstep1[36], lstep1[58]);
+          u[ 9] = _mm256_unpackhi_epi32(lstep1[36], lstep1[58]);
+          u[10] = _mm256_unpacklo_epi32(lstep1[37], lstep1[59]);
+          u[11] = _mm256_unpackhi_epi32(lstep1[37], lstep1[59]);
+          u[12] = _mm256_unpacklo_epi32(lstep1[38], lstep1[56]);
+          u[13] = _mm256_unpackhi_epi32(lstep1[38], lstep1[56]);
+          u[14] = _mm256_unpacklo_epi32(lstep1[39], lstep1[57]);
+          u[15] = _mm256_unpackhi_epi32(lstep1[39], lstep1[57]);
+
+          v[ 0] = k_madd_epi32_avx2(u[ 0], k32_p31_p01);
+          v[ 1] = k_madd_epi32_avx2(u[ 1], k32_p31_p01);
+          v[ 2] = k_madd_epi32_avx2(u[ 2], k32_p31_p01);
+          v[ 3] = k_madd_epi32_avx2(u[ 3], k32_p31_p01);
+          v[ 4] = k_madd_epi32_avx2(u[ 4], k32_p15_p17);
+          v[ 5] = k_madd_epi32_avx2(u[ 5], k32_p15_p17);
+          v[ 6] = k_madd_epi32_avx2(u[ 6], k32_p15_p17);
+          v[ 7] = k_madd_epi32_avx2(u[ 7], k32_p15_p17);
+          v[ 8] = k_madd_epi32_avx2(u[ 8], k32_p23_p09);
+          v[ 9] = k_madd_epi32_avx2(u[ 9], k32_p23_p09);
+          v[10] = k_madd_epi32_avx2(u[10], k32_p23_p09);
+          v[11] = k_madd_epi32_avx2(u[11], k32_p23_p09);
+          v[12] = k_madd_epi32_avx2(u[12], k32_p07_p25);
+          v[13] = k_madd_epi32_avx2(u[13], k32_p07_p25);
+          v[14] = k_madd_epi32_avx2(u[14], k32_p07_p25);
+          v[15] = k_madd_epi32_avx2(u[15], k32_p07_p25);
+          v[16] = k_madd_epi32_avx2(u[12], k32_m25_p07);
+          v[17] = k_madd_epi32_avx2(u[13], k32_m25_p07);
+          v[18] = k_madd_epi32_avx2(u[14], k32_m25_p07);
+          v[19] = k_madd_epi32_avx2(u[15], k32_m25_p07);
+          v[20] = k_madd_epi32_avx2(u[ 8], k32_m09_p23);
+          v[21] = k_madd_epi32_avx2(u[ 9], k32_m09_p23);
+          v[22] = k_madd_epi32_avx2(u[10], k32_m09_p23);
+          v[23] = k_madd_epi32_avx2(u[11], k32_m09_p23);
+          v[24] = k_madd_epi32_avx2(u[ 4], k32_m17_p15);
+          v[25] = k_madd_epi32_avx2(u[ 5], k32_m17_p15);
+          v[26] = k_madd_epi32_avx2(u[ 6], k32_m17_p15);
+          v[27] = k_madd_epi32_avx2(u[ 7], k32_m17_p15);
+          v[28] = k_madd_epi32_avx2(u[ 0], k32_m01_p31);
+          v[29] = k_madd_epi32_avx2(u[ 1], k32_m01_p31);
+          v[30] = k_madd_epi32_avx2(u[ 2], k32_m01_p31);
+          v[31] = k_madd_epi32_avx2(u[ 3], k32_m01_p31);
+
+          u[ 0] = k_packs_epi64_avx2(v[ 0], v[ 1]);
+          u[ 1] = k_packs_epi64_avx2(v[ 2], v[ 3]);
+          u[ 2] = k_packs_epi64_avx2(v[ 4], v[ 5]);
+          u[ 3] = k_packs_epi64_avx2(v[ 6], v[ 7]);
+          u[ 4] = k_packs_epi64_avx2(v[ 8], v[ 9]);
+          u[ 5] = k_packs_epi64_avx2(v[10], v[11]);
+          u[ 6] = k_packs_epi64_avx2(v[12], v[13]);
+          u[ 7] = k_packs_epi64_avx2(v[14], v[15]);
+          u[ 8] = k_packs_epi64_avx2(v[16], v[17]);
+          u[ 9] = k_packs_epi64_avx2(v[18], v[19]);
+          u[10] = k_packs_epi64_avx2(v[20], v[21]);
+          u[11] = k_packs_epi64_avx2(v[22], v[23]);
+          u[12] = k_packs_epi64_avx2(v[24], v[25]);
+          u[13] = k_packs_epi64_avx2(v[26], v[27]);
+          u[14] = k_packs_epi64_avx2(v[28], v[29]);
+          u[15] = k_packs_epi64_avx2(v[30], v[31]);
+
+          v[ 0] = _mm256_add_epi32(u[ 0], k__DCT_CONST_ROUNDING);
+          v[ 1] = _mm256_add_epi32(u[ 1], k__DCT_CONST_ROUNDING);
+          v[ 2] = _mm256_add_epi32(u[ 2], k__DCT_CONST_ROUNDING);
+          v[ 3] = _mm256_add_epi32(u[ 3], k__DCT_CONST_ROUNDING);
+          v[ 4] = _mm256_add_epi32(u[ 4], k__DCT_CONST_ROUNDING);
+          v[ 5] = _mm256_add_epi32(u[ 5], k__DCT_CONST_ROUNDING);
+          v[ 6] = _mm256_add_epi32(u[ 6], k__DCT_CONST_ROUNDING);
+          v[ 7] = _mm256_add_epi32(u[ 7], k__DCT_CONST_ROUNDING);
+          v[ 8] = _mm256_add_epi32(u[ 8], k__DCT_CONST_ROUNDING);
+          v[ 9] = _mm256_add_epi32(u[ 9], k__DCT_CONST_ROUNDING);
+          v[10] = _mm256_add_epi32(u[10], k__DCT_CONST_ROUNDING);
+          v[11] = _mm256_add_epi32(u[11], k__DCT_CONST_ROUNDING);
+          v[12] = _mm256_add_epi32(u[12], k__DCT_CONST_ROUNDING);
+          v[13] = _mm256_add_epi32(u[13], k__DCT_CONST_ROUNDING);
+          v[14] = _mm256_add_epi32(u[14], k__DCT_CONST_ROUNDING);
+          v[15] = _mm256_add_epi32(u[15], k__DCT_CONST_ROUNDING);
+
+          u[ 0] = _mm256_srai_epi32(v[ 0], DCT_CONST_BITS);
+          u[ 1] = _mm256_srai_epi32(v[ 1], DCT_CONST_BITS);
+          u[ 2] = _mm256_srai_epi32(v[ 2], DCT_CONST_BITS);
+          u[ 3] = _mm256_srai_epi32(v[ 3], DCT_CONST_BITS);
+          u[ 4] = _mm256_srai_epi32(v[ 4], DCT_CONST_BITS);
+          u[ 5] = _mm256_srai_epi32(v[ 5], DCT_CONST_BITS);
+          u[ 6] = _mm256_srai_epi32(v[ 6], DCT_CONST_BITS);
+          u[ 7] = _mm256_srai_epi32(v[ 7], DCT_CONST_BITS);
+          u[ 8] = _mm256_srai_epi32(v[ 8], DCT_CONST_BITS);
+          u[ 9] = _mm256_srai_epi32(v[ 9], DCT_CONST_BITS);
+          u[10] = _mm256_srai_epi32(v[10], DCT_CONST_BITS);
+          u[11] = _mm256_srai_epi32(v[11], DCT_CONST_BITS);
+          u[12] = _mm256_srai_epi32(v[12], DCT_CONST_BITS);
+          u[13] = _mm256_srai_epi32(v[13], DCT_CONST_BITS);
+          u[14] = _mm256_srai_epi32(v[14], DCT_CONST_BITS);
+          u[15] = _mm256_srai_epi32(v[15], DCT_CONST_BITS);
+
+          v[ 0] = _mm256_cmpgt_epi32(kZero,u[ 0]);
+          v[ 1] = _mm256_cmpgt_epi32(kZero,u[ 1]);
+          v[ 2] = _mm256_cmpgt_epi32(kZero,u[ 2]);
+          v[ 3] = _mm256_cmpgt_epi32(kZero,u[ 3]);
+          v[ 4] = _mm256_cmpgt_epi32(kZero,u[ 4]);
+          v[ 5] = _mm256_cmpgt_epi32(kZero,u[ 5]);
+          v[ 6] = _mm256_cmpgt_epi32(kZero,u[ 6]);
+          v[ 7] = _mm256_cmpgt_epi32(kZero,u[ 7]);
+          v[ 8] = _mm256_cmpgt_epi32(kZero,u[ 8]);
+          v[ 9] = _mm256_cmpgt_epi32(kZero,u[ 9]);
+          v[10] = _mm256_cmpgt_epi32(kZero,u[10]);
+          v[11] = _mm256_cmpgt_epi32(kZero,u[11]);
+          v[12] = _mm256_cmpgt_epi32(kZero,u[12]);
+          v[13] = _mm256_cmpgt_epi32(kZero,u[13]);
+          v[14] = _mm256_cmpgt_epi32(kZero,u[14]);
+          v[15] = _mm256_cmpgt_epi32(kZero,u[15]);
+
+          u[ 0] = _mm256_sub_epi32(u[ 0], v[ 0]);
+          u[ 1] = _mm256_sub_epi32(u[ 1], v[ 1]);
+          u[ 2] = _mm256_sub_epi32(u[ 2], v[ 2]);
+          u[ 3] = _mm256_sub_epi32(u[ 3], v[ 3]);
+          u[ 4] = _mm256_sub_epi32(u[ 4], v[ 4]);
+          u[ 5] = _mm256_sub_epi32(u[ 5], v[ 5]);
+          u[ 6] = _mm256_sub_epi32(u[ 6], v[ 6]);
+          u[ 7] = _mm256_sub_epi32(u[ 7], v[ 7]);
+          u[ 8] = _mm256_sub_epi32(u[ 8], v[ 8]);
+          u[ 9] = _mm256_sub_epi32(u[ 9], v[ 9]);
+          u[10] = _mm256_sub_epi32(u[10], v[10]);
+          u[11] = _mm256_sub_epi32(u[11], v[11]);
+          u[12] = _mm256_sub_epi32(u[12], v[12]);
+          u[13] = _mm256_sub_epi32(u[13], v[13]);
+          u[14] = _mm256_sub_epi32(u[14], v[14]);
+          u[15] = _mm256_sub_epi32(u[15], v[15]);
+
+          v[0] = _mm256_add_epi32(u[0], K32One);
+          v[1] = _mm256_add_epi32(u[1], K32One);
+          v[2] = _mm256_add_epi32(u[2], K32One);
+          v[3] = _mm256_add_epi32(u[3], K32One);
+          v[4] = _mm256_add_epi32(u[4], K32One);
+          v[5] = _mm256_add_epi32(u[5], K32One);
+          v[6] = _mm256_add_epi32(u[6], K32One);
+          v[7] = _mm256_add_epi32(u[7], K32One);
+          v[8] = _mm256_add_epi32(u[8], K32One);
+          v[9] = _mm256_add_epi32(u[9], K32One);
+          v[10] = _mm256_add_epi32(u[10], K32One);
+          v[11] = _mm256_add_epi32(u[11], K32One);
+          v[12] = _mm256_add_epi32(u[12], K32One);
+          v[13] = _mm256_add_epi32(u[13], K32One);
+          v[14] = _mm256_add_epi32(u[14], K32One);
+          v[15] = _mm256_add_epi32(u[15], K32One);
+
+          u[0] = _mm256_srai_epi32(v[0], 2);
+          u[1] = _mm256_srai_epi32(v[1], 2);
+          u[2] = _mm256_srai_epi32(v[2], 2);
+          u[3] = _mm256_srai_epi32(v[3], 2);
+          u[4] = _mm256_srai_epi32(v[4], 2);
+          u[5] = _mm256_srai_epi32(v[5], 2);
+          u[6] = _mm256_srai_epi32(v[6], 2);
+          u[7] = _mm256_srai_epi32(v[7], 2);
+          u[8] = _mm256_srai_epi32(v[8], 2);
+          u[9] = _mm256_srai_epi32(v[9], 2);
+          u[10] = _mm256_srai_epi32(v[10], 2);
+          u[11] = _mm256_srai_epi32(v[11], 2);
+          u[12] = _mm256_srai_epi32(v[12], 2);
+          u[13] = _mm256_srai_epi32(v[13], 2);
+          u[14] = _mm256_srai_epi32(v[14], 2);
+          u[15] = _mm256_srai_epi32(v[15], 2);
+
+          out[ 1] = _mm256_packs_epi32(u[0], u[1]);
+          out[17] = _mm256_packs_epi32(u[2], u[3]);
+          out[ 9] = _mm256_packs_epi32(u[4], u[5]);
+          out[25] = _mm256_packs_epi32(u[6], u[7]);
+          out[ 7] = _mm256_packs_epi32(u[8], u[9]);
+          out[23] = _mm256_packs_epi32(u[10], u[11]);
+          out[15] = _mm256_packs_epi32(u[12], u[13]);
+          out[31] = _mm256_packs_epi32(u[14], u[15]);
+        }
+        {
+          const __m256i k32_p27_p05 = pair256_set_epi32(cospi_27_64, cospi_5_64);
+          const __m256i k32_p11_p21 = pair256_set_epi32(cospi_11_64, cospi_21_64);
+          const __m256i k32_p19_p13 = pair256_set_epi32(cospi_19_64, cospi_13_64);
+          const __m256i k32_p03_p29 = pair256_set_epi32(cospi_3_64, cospi_29_64);
+          const __m256i k32_m29_p03 = pair256_set_epi32(-cospi_29_64, cospi_3_64);
+          const __m256i k32_m13_p19 = pair256_set_epi32(-cospi_13_64, cospi_19_64);
+          const __m256i k32_m21_p11 = pair256_set_epi32(-cospi_21_64, cospi_11_64);
+          const __m256i k32_m05_p27 = pair256_set_epi32(-cospi_5_64, cospi_27_64);
+
+          u[ 0] = _mm256_unpacklo_epi32(lstep1[40], lstep1[54]);
+          u[ 1] = _mm256_unpackhi_epi32(lstep1[40], lstep1[54]);
+          u[ 2] = _mm256_unpacklo_epi32(lstep1[41], lstep1[55]);
+          u[ 3] = _mm256_unpackhi_epi32(lstep1[41], lstep1[55]);
+          u[ 4] = _mm256_unpacklo_epi32(lstep1[42], lstep1[52]);
+          u[ 5] = _mm256_unpackhi_epi32(lstep1[42], lstep1[52]);
+          u[ 6] = _mm256_unpacklo_epi32(lstep1[43], lstep1[53]);
+          u[ 7] = _mm256_unpackhi_epi32(lstep1[43], lstep1[53]);
+          u[ 8] = _mm256_unpacklo_epi32(lstep1[44], lstep1[50]);
+          u[ 9] = _mm256_unpackhi_epi32(lstep1[44], lstep1[50]);
+          u[10] = _mm256_unpacklo_epi32(lstep1[45], lstep1[51]);
+          u[11] = _mm256_unpackhi_epi32(lstep1[45], lstep1[51]);
+          u[12] = _mm256_unpacklo_epi32(lstep1[46], lstep1[48]);
+          u[13] = _mm256_unpackhi_epi32(lstep1[46], lstep1[48]);
+          u[14] = _mm256_unpacklo_epi32(lstep1[47], lstep1[49]);
+          u[15] = _mm256_unpackhi_epi32(lstep1[47], lstep1[49]);
+
+          v[ 0] = k_madd_epi32_avx2(u[ 0], k32_p27_p05);
+          v[ 1] = k_madd_epi32_avx2(u[ 1], k32_p27_p05);
+          v[ 2] = k_madd_epi32_avx2(u[ 2], k32_p27_p05);
+          v[ 3] = k_madd_epi32_avx2(u[ 3], k32_p27_p05);
+          v[ 4] = k_madd_epi32_avx2(u[ 4], k32_p11_p21);
+          v[ 5] = k_madd_epi32_avx2(u[ 5], k32_p11_p21);
+          v[ 6] = k_madd_epi32_avx2(u[ 6], k32_p11_p21);
+          v[ 7] = k_madd_epi32_avx2(u[ 7], k32_p11_p21);
+          v[ 8] = k_madd_epi32_avx2(u[ 8], k32_p19_p13);
+          v[ 9] = k_madd_epi32_avx2(u[ 9], k32_p19_p13);
+          v[10] = k_madd_epi32_avx2(u[10], k32_p19_p13);
+          v[11] = k_madd_epi32_avx2(u[11], k32_p19_p13);
+          v[12] = k_madd_epi32_avx2(u[12], k32_p03_p29);
+          v[13] = k_madd_epi32_avx2(u[13], k32_p03_p29);
+          v[14] = k_madd_epi32_avx2(u[14], k32_p03_p29);
+          v[15] = k_madd_epi32_avx2(u[15], k32_p03_p29);
+          v[16] = k_madd_epi32_avx2(u[12], k32_m29_p03);
+          v[17] = k_madd_epi32_avx2(u[13], k32_m29_p03);
+          v[18] = k_madd_epi32_avx2(u[14], k32_m29_p03);
+          v[19] = k_madd_epi32_avx2(u[15], k32_m29_p03);
+          v[20] = k_madd_epi32_avx2(u[ 8], k32_m13_p19);
+          v[21] = k_madd_epi32_avx2(u[ 9], k32_m13_p19);
+          v[22] = k_madd_epi32_avx2(u[10], k32_m13_p19);
+          v[23] = k_madd_epi32_avx2(u[11], k32_m13_p19);
+          v[24] = k_madd_epi32_avx2(u[ 4], k32_m21_p11);
+          v[25] = k_madd_epi32_avx2(u[ 5], k32_m21_p11);
+          v[26] = k_madd_epi32_avx2(u[ 6], k32_m21_p11);
+          v[27] = k_madd_epi32_avx2(u[ 7], k32_m21_p11);
+          v[28] = k_madd_epi32_avx2(u[ 0], k32_m05_p27);
+          v[29] = k_madd_epi32_avx2(u[ 1], k32_m05_p27);
+          v[30] = k_madd_epi32_avx2(u[ 2], k32_m05_p27);
+          v[31] = k_madd_epi32_avx2(u[ 3], k32_m05_p27);
+
+          u[ 0] = k_packs_epi64_avx2(v[ 0], v[ 1]);
+          u[ 1] = k_packs_epi64_avx2(v[ 2], v[ 3]);
+          u[ 2] = k_packs_epi64_avx2(v[ 4], v[ 5]);
+          u[ 3] = k_packs_epi64_avx2(v[ 6], v[ 7]);
+          u[ 4] = k_packs_epi64_avx2(v[ 8], v[ 9]);
+          u[ 5] = k_packs_epi64_avx2(v[10], v[11]);
+          u[ 6] = k_packs_epi64_avx2(v[12], v[13]);
+          u[ 7] = k_packs_epi64_avx2(v[14], v[15]);
+          u[ 8] = k_packs_epi64_avx2(v[16], v[17]);
+          u[ 9] = k_packs_epi64_avx2(v[18], v[19]);
+          u[10] = k_packs_epi64_avx2(v[20], v[21]);
+          u[11] = k_packs_epi64_avx2(v[22], v[23]);
+          u[12] = k_packs_epi64_avx2(v[24], v[25]);
+          u[13] = k_packs_epi64_avx2(v[26], v[27]);
+          u[14] = k_packs_epi64_avx2(v[28], v[29]);
+          u[15] = k_packs_epi64_avx2(v[30], v[31]);
+
+          v[ 0] = _mm256_add_epi32(u[ 0], k__DCT_CONST_ROUNDING);
+          v[ 1] = _mm256_add_epi32(u[ 1], k__DCT_CONST_ROUNDING);
+          v[ 2] = _mm256_add_epi32(u[ 2], k__DCT_CONST_ROUNDING);
+          v[ 3] = _mm256_add_epi32(u[ 3], k__DCT_CONST_ROUNDING);
+          v[ 4] = _mm256_add_epi32(u[ 4], k__DCT_CONST_ROUNDING);
+          v[ 5] = _mm256_add_epi32(u[ 5], k__DCT_CONST_ROUNDING);
+          v[ 6] = _mm256_add_epi32(u[ 6], k__DCT_CONST_ROUNDING);
+          v[ 7] = _mm256_add_epi32(u[ 7], k__DCT_CONST_ROUNDING);
+          v[ 8] = _mm256_add_epi32(u[ 8], k__DCT_CONST_ROUNDING);
+          v[ 9] = _mm256_add_epi32(u[ 9], k__DCT_CONST_ROUNDING);
+          v[10] = _mm256_add_epi32(u[10], k__DCT_CONST_ROUNDING);
+          v[11] = _mm256_add_epi32(u[11], k__DCT_CONST_ROUNDING);
+          v[12] = _mm256_add_epi32(u[12], k__DCT_CONST_ROUNDING);
+          v[13] = _mm256_add_epi32(u[13], k__DCT_CONST_ROUNDING);
+          v[14] = _mm256_add_epi32(u[14], k__DCT_CONST_ROUNDING);
+          v[15] = _mm256_add_epi32(u[15], k__DCT_CONST_ROUNDING);
+
+          u[ 0] = _mm256_srai_epi32(v[ 0], DCT_CONST_BITS);
+          u[ 1] = _mm256_srai_epi32(v[ 1], DCT_CONST_BITS);
+          u[ 2] = _mm256_srai_epi32(v[ 2], DCT_CONST_BITS);
+          u[ 3] = _mm256_srai_epi32(v[ 3], DCT_CONST_BITS);
+          u[ 4] = _mm256_srai_epi32(v[ 4], DCT_CONST_BITS);
+          u[ 5] = _mm256_srai_epi32(v[ 5], DCT_CONST_BITS);
+          u[ 6] = _mm256_srai_epi32(v[ 6], DCT_CONST_BITS);
+          u[ 7] = _mm256_srai_epi32(v[ 7], DCT_CONST_BITS);
+          u[ 8] = _mm256_srai_epi32(v[ 8], DCT_CONST_BITS);
+          u[ 9] = _mm256_srai_epi32(v[ 9], DCT_CONST_BITS);
+          u[10] = _mm256_srai_epi32(v[10], DCT_CONST_BITS);
+          u[11] = _mm256_srai_epi32(v[11], DCT_CONST_BITS);
+          u[12] = _mm256_srai_epi32(v[12], DCT_CONST_BITS);
+          u[13] = _mm256_srai_epi32(v[13], DCT_CONST_BITS);
+          u[14] = _mm256_srai_epi32(v[14], DCT_CONST_BITS);
+          u[15] = _mm256_srai_epi32(v[15], DCT_CONST_BITS);
+
+          v[ 0] = _mm256_cmpgt_epi32(kZero,u[ 0]);
+          v[ 1] = _mm256_cmpgt_epi32(kZero,u[ 1]);
+          v[ 2] = _mm256_cmpgt_epi32(kZero,u[ 2]);
+          v[ 3] = _mm256_cmpgt_epi32(kZero,u[ 3]);
+          v[ 4] = _mm256_cmpgt_epi32(kZero,u[ 4]);
+          v[ 5] = _mm256_cmpgt_epi32(kZero,u[ 5]);
+          v[ 6] = _mm256_cmpgt_epi32(kZero,u[ 6]);
+          v[ 7] = _mm256_cmpgt_epi32(kZero,u[ 7]);
+          v[ 8] = _mm256_cmpgt_epi32(kZero,u[ 8]);
+          v[ 9] = _mm256_cmpgt_epi32(kZero,u[ 9]);
+          v[10] = _mm256_cmpgt_epi32(kZero,u[10]);
+          v[11] = _mm256_cmpgt_epi32(kZero,u[11]);
+          v[12] = _mm256_cmpgt_epi32(kZero,u[12]);
+          v[13] = _mm256_cmpgt_epi32(kZero,u[13]);
+          v[14] = _mm256_cmpgt_epi32(kZero,u[14]);
+          v[15] = _mm256_cmpgt_epi32(kZero,u[15]);
+
+          u[ 0] = _mm256_sub_epi32(u[ 0], v[ 0]);
+          u[ 1] = _mm256_sub_epi32(u[ 1], v[ 1]);
+          u[ 2] = _mm256_sub_epi32(u[ 2], v[ 2]);
+          u[ 3] = _mm256_sub_epi32(u[ 3], v[ 3]);
+          u[ 4] = _mm256_sub_epi32(u[ 4], v[ 4]);
+          u[ 5] = _mm256_sub_epi32(u[ 5], v[ 5]);
+          u[ 6] = _mm256_sub_epi32(u[ 6], v[ 6]);
+          u[ 7] = _mm256_sub_epi32(u[ 7], v[ 7]);
+          u[ 8] = _mm256_sub_epi32(u[ 8], v[ 8]);
+          u[ 9] = _mm256_sub_epi32(u[ 9], v[ 9]);
+          u[10] = _mm256_sub_epi32(u[10], v[10]);
+          u[11] = _mm256_sub_epi32(u[11], v[11]);
+          u[12] = _mm256_sub_epi32(u[12], v[12]);
+          u[13] = _mm256_sub_epi32(u[13], v[13]);
+          u[14] = _mm256_sub_epi32(u[14], v[14]);
+          u[15] = _mm256_sub_epi32(u[15], v[15]);
+
+          v[0] = _mm256_add_epi32(u[0], K32One);
+          v[1] = _mm256_add_epi32(u[1], K32One);
+          v[2] = _mm256_add_epi32(u[2], K32One);
+          v[3] = _mm256_add_epi32(u[3], K32One);
+          v[4] = _mm256_add_epi32(u[4], K32One);
+          v[5] = _mm256_add_epi32(u[5], K32One);
+          v[6] = _mm256_add_epi32(u[6], K32One);
+          v[7] = _mm256_add_epi32(u[7], K32One);
+          v[8] = _mm256_add_epi32(u[8], K32One);
+          v[9] = _mm256_add_epi32(u[9], K32One);
+          v[10] = _mm256_add_epi32(u[10], K32One);
+          v[11] = _mm256_add_epi32(u[11], K32One);
+          v[12] = _mm256_add_epi32(u[12], K32One);
+          v[13] = _mm256_add_epi32(u[13], K32One);
+          v[14] = _mm256_add_epi32(u[14], K32One);
+          v[15] = _mm256_add_epi32(u[15], K32One);
+
+          u[0] = _mm256_srai_epi32(v[0], 2);
+          u[1] = _mm256_srai_epi32(v[1], 2);
+          u[2] = _mm256_srai_epi32(v[2], 2);
+          u[3] = _mm256_srai_epi32(v[3], 2);
+          u[4] = _mm256_srai_epi32(v[4], 2);
+          u[5] = _mm256_srai_epi32(v[5], 2);
+          u[6] = _mm256_srai_epi32(v[6], 2);
+          u[7] = _mm256_srai_epi32(v[7], 2);
+          u[8] = _mm256_srai_epi32(v[8], 2);
+          u[9] = _mm256_srai_epi32(v[9], 2);
+          u[10] = _mm256_srai_epi32(v[10], 2);
+          u[11] = _mm256_srai_epi32(v[11], 2);
+          u[12] = _mm256_srai_epi32(v[12], 2);
+          u[13] = _mm256_srai_epi32(v[13], 2);
+          u[14] = _mm256_srai_epi32(v[14], 2);
+          u[15] = _mm256_srai_epi32(v[15], 2);
+
+          out[ 5] = _mm256_packs_epi32(u[0], u[1]);
+          out[21] = _mm256_packs_epi32(u[2], u[3]);
+          out[13] = _mm256_packs_epi32(u[4], u[5]);
+          out[29] = _mm256_packs_epi32(u[6], u[7]);
+          out[ 3] = _mm256_packs_epi32(u[8], u[9]);
+          out[19] = _mm256_packs_epi32(u[10], u[11]);
+          out[11] = _mm256_packs_epi32(u[12], u[13]);
+          out[27] = _mm256_packs_epi32(u[14], u[15]);
+        }
+      }
+#endif
+      // Transpose the results, do it as four 8x8 transposes.
+      {
+        int transpose_block;
+        int16_t *output_currStep,*output_nextStep;
+        if (0 == pass){
+                 output_currStep = &intermediate[column_start * 32];
+                 output_nextStep = &intermediate[(column_start + 8) * 32];
+        } else{
+                 output_currStep = &output_org[column_start * 32];
+                 output_nextStep = &output_org[(column_start + 8) * 32];
+        }
+        for (transpose_block = 0; transpose_block < 4; ++transpose_block) {
+          __m256i *this_out = &out[8 * transpose_block];
+          // 00  01  02  03  04  05  06  07  08  09  10  11  12  13  14  15
+          // 20  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35
+          // 40  41  42  43  44  45  46  47  48  49  50  51  52  53  54  55
+          // 60  61  62  63  64  65  66  67  68  69  70  71  72  73  74  75
+          // 80  81  82  83  84  85  86  87  88  89  90  91  92  93  94  95
+          // 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115
+          // 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135
+          // 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155
+          const __m256i tr0_0 = _mm256_unpacklo_epi16(this_out[0], this_out[1]);
+          const __m256i tr0_1 = _mm256_unpacklo_epi16(this_out[2], this_out[3]);
+          const __m256i tr0_2 = _mm256_unpackhi_epi16(this_out[0], this_out[1]);
+          const __m256i tr0_3 = _mm256_unpackhi_epi16(this_out[2], this_out[3]);
+          const __m256i tr0_4 = _mm256_unpacklo_epi16(this_out[4], this_out[5]);
+          const __m256i tr0_5 = _mm256_unpacklo_epi16(this_out[6], this_out[7]);
+          const __m256i tr0_6 = _mm256_unpackhi_epi16(this_out[4], this_out[5]);
+          const __m256i tr0_7 = _mm256_unpackhi_epi16(this_out[6], this_out[7]);
+          // 00  20  01  21  02  22  03  23  08  28  09  29  10  30  11  31
+          // 40  60  41  61  42  62  43  63  48  68  49  69  50  70  51  71
+          // 04  24  05  25  06  26  07  27  12  32  13  33  14  34  15  35
+          // 44  64  45  65  46  66  47  67  52  72  53  73  54  74  55  75
+          // 80  100 81  101 82  102 83  103 88  108 89  109 90  110 91  101
+          // 120 140 121 141 122 142 123 143 128 148 129 149 130 150 131 151
+          // 84  104 85  105 86  106 87  107 92  112 93  113 94  114 95  115
+          // 124 144 125 145 126 146 127 147 132 152 133 153 134 154 135 155
+
+          const __m256i tr1_0 = _mm256_unpacklo_epi32(tr0_0, tr0_1);
+          const __m256i tr1_1 = _mm256_unpacklo_epi32(tr0_2, tr0_3);
+          const __m256i tr1_2 = _mm256_unpackhi_epi32(tr0_0, tr0_1);
+          const __m256i tr1_3 = _mm256_unpackhi_epi32(tr0_2, tr0_3);
+          const __m256i tr1_4 = _mm256_unpacklo_epi32(tr0_4, tr0_5);
+          const __m256i tr1_5 = _mm256_unpacklo_epi32(tr0_6, tr0_7);
+          const __m256i tr1_6 = _mm256_unpackhi_epi32(tr0_4, tr0_5);
+          const __m256i tr1_7 = _mm256_unpackhi_epi32(tr0_6, tr0_7);
+          // 00 20  40  60  01 21  41  61  08 28  48  68  09 29  49  69
+          // 04 24  44  64  05 25  45  65  12 32  52  72  13 33  53  73
+          // 02 22  42  62  03 23  43  63  10 30  50  70  11 31  51  71
+          // 06 26  46  66  07 27  47  67  14 34  54  74  15 35  55  75
+          // 80 100 120 140 81 101 121 141 88 108 128 148 89 109 129 149
+          // 84 104 124 144 85 105 125 145 92 112 132 152 93 113 133 153
+          // 82 102 122 142 83 103 123 143 90 110 130 150 91 101 131 151
+          // 86 106 126 146 87 107 127 147 94 114 134 154 95 115 135 155
+          __m256i tr2_0 = _mm256_unpacklo_epi64(tr1_0, tr1_4);
+          __m256i tr2_1 = _mm256_unpackhi_epi64(tr1_0, tr1_4);
+          __m256i tr2_2 = _mm256_unpacklo_epi64(tr1_2, tr1_6);
+          __m256i tr2_3 = _mm256_unpackhi_epi64(tr1_2, tr1_6);
+          __m256i tr2_4 = _mm256_unpacklo_epi64(tr1_1, tr1_5);
+          __m256i tr2_5 = _mm256_unpackhi_epi64(tr1_1, tr1_5);
+          __m256i tr2_6 = _mm256_unpacklo_epi64(tr1_3, tr1_7);
+          __m256i tr2_7 = _mm256_unpackhi_epi64(tr1_3, tr1_7);
+          // 00 20 40 60 80 100 120 140 08 28 48 68 88 108 128 148
+          // 01 21 41 61 81 101 121 141 09 29 49 69 89 109 129 149
+          // 02 22 42 62 82 102 122 142 10 30 50 70 90 110 130 150
+          // 03 23 43 63 83 103 123 143 11 31 51 71 91 101 131 151
+          // 04 24 44 64 84 104 124 144 12 32 52 72 92 112 132 152
+          // 05 25 45 65 85 105 125 145 13 33 53 73 93 113 133 153
+          // 06 26 46 66 86 106 126 146 14 34 54 74 94 114 134 154
+          // 07 27 47 67 87 107 127 147 15 35 55 75 95 115 135 155
+          if (0 == pass) {
+            // output[j] = (output[j] + 1 + (output[j] > 0)) >> 2;
+            // TODO(cd): see quality impact of only doing
+            //           output[j] = (output[j] + 1) >> 2;
+            //           which would remove the code between here ...
+            __m256i tr2_0_0 = _mm256_cmpgt_epi16(tr2_0, kZero);
+            __m256i tr2_1_0 = _mm256_cmpgt_epi16(tr2_1, kZero);
+            __m256i tr2_2_0 = _mm256_cmpgt_epi16(tr2_2, kZero);
+            __m256i tr2_3_0 = _mm256_cmpgt_epi16(tr2_3, kZero);
+            __m256i tr2_4_0 = _mm256_cmpgt_epi16(tr2_4, kZero);
+            __m256i tr2_5_0 = _mm256_cmpgt_epi16(tr2_5, kZero);
+            __m256i tr2_6_0 = _mm256_cmpgt_epi16(tr2_6, kZero);
+            __m256i tr2_7_0 = _mm256_cmpgt_epi16(tr2_7, kZero);
+            tr2_0 = _mm256_sub_epi16(tr2_0, tr2_0_0);
+            tr2_1 = _mm256_sub_epi16(tr2_1, tr2_1_0);
+            tr2_2 = _mm256_sub_epi16(tr2_2, tr2_2_0);
+            tr2_3 = _mm256_sub_epi16(tr2_3, tr2_3_0);
+            tr2_4 = _mm256_sub_epi16(tr2_4, tr2_4_0);
+            tr2_5 = _mm256_sub_epi16(tr2_5, tr2_5_0);
+            tr2_6 = _mm256_sub_epi16(tr2_6, tr2_6_0);
+            tr2_7 = _mm256_sub_epi16(tr2_7, tr2_7_0);
+            //           ... and here.
+            //           PS: also change code in vp9/encoder/vp9_dct.c
+            tr2_0 = _mm256_add_epi16(tr2_0, kOne);
+            tr2_1 = _mm256_add_epi16(tr2_1, kOne);
+            tr2_2 = _mm256_add_epi16(tr2_2, kOne);
+            tr2_3 = _mm256_add_epi16(tr2_3, kOne);
+            tr2_4 = _mm256_add_epi16(tr2_4, kOne);
+            tr2_5 = _mm256_add_epi16(tr2_5, kOne);
+            tr2_6 = _mm256_add_epi16(tr2_6, kOne);
+            tr2_7 = _mm256_add_epi16(tr2_7, kOne);
+            tr2_0 = _mm256_srai_epi16(tr2_0, 2);
+            tr2_1 = _mm256_srai_epi16(tr2_1, 2);
+            tr2_2 = _mm256_srai_epi16(tr2_2, 2);
+            tr2_3 = _mm256_srai_epi16(tr2_3, 2);
+            tr2_4 = _mm256_srai_epi16(tr2_4, 2);
+            tr2_5 = _mm256_srai_epi16(tr2_5, 2);
+            tr2_6 = _mm256_srai_epi16(tr2_6, 2);
+            tr2_7 = _mm256_srai_epi16(tr2_7, 2);
+          }
+          // Note: even though all these stores are aligned, using the aligned
+          //       intrinsic make the code slightly slower.
+          _mm_storeu_si128((__m128i *)(output_currStep + 0 * 32), _mm256_castsi256_si128(tr2_0));
+          _mm_storeu_si128((__m128i *)(output_currStep + 1 * 32), _mm256_castsi256_si128(tr2_1));
+          _mm_storeu_si128((__m128i *)(output_currStep + 2 * 32), _mm256_castsi256_si128(tr2_2));
+          _mm_storeu_si128((__m128i *)(output_currStep + 3 * 32), _mm256_castsi256_si128(tr2_3));
+          _mm_storeu_si128((__m128i *)(output_currStep + 4 * 32), _mm256_castsi256_si128(tr2_4));
+          _mm_storeu_si128((__m128i *)(output_currStep + 5 * 32), _mm256_castsi256_si128(tr2_5));
+          _mm_storeu_si128((__m128i *)(output_currStep + 6 * 32), _mm256_castsi256_si128(tr2_6));
+          _mm_storeu_si128((__m128i *)(output_currStep + 7 * 32), _mm256_castsi256_si128(tr2_7));
+
+          _mm_storeu_si128((__m128i *)(output_nextStep + 0 * 32), _mm256_extractf128_si256(tr2_0,1));
+          _mm_storeu_si128((__m128i *)(output_nextStep + 1 * 32), _mm256_extractf128_si256(tr2_1,1));
+          _mm_storeu_si128((__m128i *)(output_nextStep + 2 * 32), _mm256_extractf128_si256(tr2_2,1));
+          _mm_storeu_si128((__m128i *)(output_nextStep + 3 * 32), _mm256_extractf128_si256(tr2_3,1));
+          _mm_storeu_si128((__m128i *)(output_nextStep + 4 * 32), _mm256_extractf128_si256(tr2_4,1));
+          _mm_storeu_si128((__m128i *)(output_nextStep + 5 * 32), _mm256_extractf128_si256(tr2_5,1));
+          _mm_storeu_si128((__m128i *)(output_nextStep + 6 * 32), _mm256_extractf128_si256(tr2_6,1));
+          _mm_storeu_si128((__m128i *)(output_nextStep + 7 * 32), _mm256_extractf128_si256(tr2_7,1));
+          // Process next 8x8
+          output_currStep += 8;
+          output_nextStep += 8;
+        }
+      }
+    }
+  }
+}  // NOLINT

diff --git a/vp9/encoder/x86/vp9_dct_avx2.c b/vp9/encoder/x86/vp9_dct_avx2.c
new file mode 100644
index 0000000..d81b72b
--- /dev/null
+++ b/vp9/encoder/x86/vp9_dct_avx2.c

@@ -0,0 +1,2579 @@
+/*
+ *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <immintrin.h>  // AVX2
+#include "vp9/common/vp9_idct.h"  // for cospi constants
+#include "vpx_ports/mem.h"
+
+void vp9_fdct4x4_avx2(const int16_t *input, int16_t *output, int stride) {
+  // The 2D transform is done with two passes which are actually pretty
+  // similar. In the first one, we transform the columns and transpose
+  // the results. In the second one, we transform the rows. To achieve that,
+  // as the first pass results are transposed, we tranpose the columns (that
+  // is the transposed rows) and transpose the results (so that it goes back
+  // in normal/row positions).
+  int pass;
+  // Constants
+  //    When we use them, in one case, they are all the same. In all others
+  //    it's a pair of them that we need to repeat four times. This is done
+  //    by constructing the 32 bit constant corresponding to that pair.
+  const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
+  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+  const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
+  const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
+  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
+  const __m128i k__nonzero_bias_a = _mm_setr_epi16(0, 1, 1, 1, 1, 1, 1, 1);
+  const __m128i k__nonzero_bias_b = _mm_setr_epi16(1, 0, 0, 0, 0, 0, 0, 0);
+  const __m128i kOne = _mm_set1_epi16(1);
+  __m128i in0, in1, in2, in3;
+  // Load inputs.
+  {
+    in0  = _mm_loadl_epi64((const __m128i *)(input +  0 * stride));
+    in1  = _mm_loadl_epi64((const __m128i *)(input +  1 * stride));
+    in2  = _mm_loadl_epi64((const __m128i *)(input +  2 * stride));
+    in3  = _mm_loadl_epi64((const __m128i *)(input +  3 * stride));
+    // x = x << 4
+    in0 = _mm_slli_epi16(in0, 4);
+    in1 = _mm_slli_epi16(in1, 4);
+    in2 = _mm_slli_epi16(in2, 4);
+    in3 = _mm_slli_epi16(in3, 4);
+    // if (i == 0 && input[0]) input[0] += 1;
+    {
+      // The mask will only contain wether the first value is zero, all
+      // other comparison will fail as something shifted by 4 (above << 4)
+      // can never be equal to one. To increment in the non-zero case, we
+      // add the mask and one for the first element:
+      //   - if zero, mask = -1, v = v - 1 + 1 = v
+      //   - if non-zero, mask = 0, v = v + 0 + 1 = v + 1
+      __m128i mask = _mm_cmpeq_epi16(in0, k__nonzero_bias_a);
+      in0 = _mm_add_epi16(in0, mask);
+      in0 = _mm_add_epi16(in0, k__nonzero_bias_b);
+    }
+  }
+  // Do the two transform/transpose passes
+  for (pass = 0; pass < 2; ++pass) {
+    // Transform 1/2: Add/substract
+    const __m128i r0 = _mm_add_epi16(in0, in3);
+    const __m128i r1 = _mm_add_epi16(in1, in2);
+    const __m128i r2 = _mm_sub_epi16(in1, in2);
+    const __m128i r3 = _mm_sub_epi16(in0, in3);
+    // Transform 1/2: Interleave to do the multiply by constants which gets us
+    //                into 32 bits.
+    const __m128i t0 = _mm_unpacklo_epi16(r0, r1);
+    const __m128i t2 = _mm_unpacklo_epi16(r2, r3);
+    const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_p16);
+    const __m128i u2 = _mm_madd_epi16(t0, k__cospi_p16_m16);
+    const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p24_p08);
+    const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m08_p24);
+    const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
+    const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
+    const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
+    const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
+    const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
+    const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
+    const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
+    const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
+    // Combine and transpose
+    const __m128i res0 = _mm_packs_epi32(w0, w2);
+    const __m128i res1 = _mm_packs_epi32(w4, w6);
+    // 00 01 02 03 20 21 22 23
+    // 10 11 12 13 30 31 32 33
+    const __m128i tr0_0 = _mm_unpacklo_epi16(res0, res1);
+    const __m128i tr0_1 = _mm_unpackhi_epi16(res0, res1);
+    // 00 10 01 11 02 12 03 13
+    // 20 30 21 31 22 32 23 33
+    in0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
+    in2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
+    // 00 10 20 30 01 11 21 31      in0 contains 0 followed by 1
+    // 02 12 22 32 03 13 23 33      in2 contains 2 followed by 3
+    if (0 == pass) {
+      // Extract values in the high part for second pass as transform code
+      // only uses the first four values.
+      in1 = _mm_unpackhi_epi64(in0, in0);
+      in3 = _mm_unpackhi_epi64(in2, in2);
+    } else {
+      // Post-condition output and store it (v + 1) >> 2, taking advantage
+      // of the fact 1/3 are stored just after 0/2.
+      __m128i out01 = _mm_add_epi16(in0, kOne);
+      __m128i out23 = _mm_add_epi16(in2, kOne);
+      out01 = _mm_srai_epi16(out01, 2);
+      out23 = _mm_srai_epi16(out23, 2);
+      _mm_storeu_si128((__m128i *)(output + 0 * 4), out01);
+      _mm_storeu_si128((__m128i *)(output + 2 * 4), out23);
+    }
+  }
+}
+
+static INLINE void load_buffer_4x4_avx2(const int16_t *input, __m128i *in,
+                                   int stride) {
+  const __m128i k__nonzero_bias_a = _mm_setr_epi16(0, 1, 1, 1, 1, 1, 1, 1);
+  const __m128i k__nonzero_bias_b = _mm_setr_epi16(1, 0, 0, 0, 0, 0, 0, 0);
+  __m128i mask;
+
+  in[0] = _mm_loadl_epi64((const __m128i *)(input + 0 * stride));
+  in[1] = _mm_loadl_epi64((const __m128i *)(input + 1 * stride));
+  in[2] = _mm_loadl_epi64((const __m128i *)(input + 2 * stride));
+  in[3] = _mm_loadl_epi64((const __m128i *)(input + 3 * stride));
+
+  in[0] = _mm_slli_epi16(in[0], 4);
+  in[1] = _mm_slli_epi16(in[1], 4);
+  in[2] = _mm_slli_epi16(in[2], 4);
+  in[3] = _mm_slli_epi16(in[3], 4);
+
+  mask = _mm_cmpeq_epi16(in[0], k__nonzero_bias_a);
+  in[0] = _mm_add_epi16(in[0], mask);
+  in[0] = _mm_add_epi16(in[0], k__nonzero_bias_b);
+}
+
+static INLINE void write_buffer_4x4_avx2(int16_t *output, __m128i *res) {
+  const __m128i kOne = _mm_set1_epi16(1);
+  __m128i in01 = _mm_unpacklo_epi64(res[0], res[1]);
+  __m128i in23 = _mm_unpacklo_epi64(res[2], res[3]);
+  __m128i out01 = _mm_add_epi16(in01, kOne);
+  __m128i out23 = _mm_add_epi16(in23, kOne);
+  out01 = _mm_srai_epi16(out01, 2);
+  out23 = _mm_srai_epi16(out23, 2);
+  _mm_store_si128((__m128i *)(output + 0 * 8), out01);
+  _mm_store_si128((__m128i *)(output + 1 * 8), out23);
+}
+
+static INLINE void transpose_4x4_avx2(__m128i *res) {
+  // Combine and transpose
+  // 00 01 02 03 20 21 22 23
+  // 10 11 12 13 30 31 32 33
+  const __m128i tr0_0 = _mm_unpacklo_epi16(res[0], res[1]);
+  const __m128i tr0_1 = _mm_unpackhi_epi16(res[0], res[1]);
+
+  // 00 10 01 11 02 12 03 13
+  // 20 30 21 31 22 32 23 33
+  res[0] = _mm_unpacklo_epi32(tr0_0, tr0_1);
+  res[2] = _mm_unpackhi_epi32(tr0_0, tr0_1);
+
+  // 00 10 20 30 01 11 21 31
+  // 02 12 22 32 03 13 23 33
+  // only use the first 4 16-bit integers
+  res[1] = _mm_unpackhi_epi64(res[0], res[0]);
+  res[3] = _mm_unpackhi_epi64(res[2], res[2]);
+}
+
+void fdct4_1d_avx2(__m128i *in) {
+  const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
+  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+  const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
+  const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
+  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
+
+  __m128i u[4], v[4];
+  u[0]=_mm_unpacklo_epi16(in[0], in[1]);
+  u[1]=_mm_unpacklo_epi16(in[3], in[2]);
+
+  v[0] = _mm_add_epi16(u[0], u[1]);
+  v[1] = _mm_sub_epi16(u[0], u[1]);
+
+  u[0] = _mm_madd_epi16(v[0], k__cospi_p16_p16);  // 0
+  u[1] = _mm_madd_epi16(v[0], k__cospi_p16_m16);  // 2
+  u[2] = _mm_madd_epi16(v[1], k__cospi_p08_p24);  // 1
+  u[3] = _mm_madd_epi16(v[1], k__cospi_p24_m08);  // 3
+
+  v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
+  v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
+  v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
+  v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
+  u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
+  u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
+  u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
+  u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
+
+  in[0] = _mm_packs_epi32(u[0], u[1]);
+  in[1] = _mm_packs_epi32(u[2], u[3]);
+  transpose_4x4_avx2(in);
+}
+
+void fadst4_1d_avx2(__m128i *in) {
+  const __m128i k__sinpi_p01_p02 = pair_set_epi16(sinpi_1_9, sinpi_2_9);
+  const __m128i k__sinpi_p04_m01 = pair_set_epi16(sinpi_4_9, -sinpi_1_9);
+  const __m128i k__sinpi_p03_p04 = pair_set_epi16(sinpi_3_9, sinpi_4_9);
+  const __m128i k__sinpi_m03_p02 = pair_set_epi16(-sinpi_3_9, sinpi_2_9);
+  const __m128i k__sinpi_p03_p03 = _mm_set1_epi16(sinpi_3_9);
+  const __m128i kZero = _mm_set1_epi16(0);
+  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
+  __m128i u[8], v[8];
+  __m128i in7 = _mm_add_epi16(in[0], in[1]);
+
+  u[0] = _mm_unpacklo_epi16(in[0], in[1]);
+  u[1] = _mm_unpacklo_epi16(in[2], in[3]);
+  u[2] = _mm_unpacklo_epi16(in7, kZero);
+  u[3] = _mm_unpacklo_epi16(in[2], kZero);
+  u[4] = _mm_unpacklo_epi16(in[3], kZero);
+
+  v[0] = _mm_madd_epi16(u[0], k__sinpi_p01_p02);  // s0 + s2
+  v[1] = _mm_madd_epi16(u[1], k__sinpi_p03_p04);  // s4 + s5
+  v[2] = _mm_madd_epi16(u[2], k__sinpi_p03_p03);  // x1
+  v[3] = _mm_madd_epi16(u[0], k__sinpi_p04_m01);  // s1 - s3
+  v[4] = _mm_madd_epi16(u[1], k__sinpi_m03_p02);  // -s4 + s6
+  v[5] = _mm_madd_epi16(u[3], k__sinpi_p03_p03);  // s4
+  v[6] = _mm_madd_epi16(u[4], k__sinpi_p03_p03);
+
+  u[0] = _mm_add_epi32(v[0], v[1]);
+  u[1] = _mm_sub_epi32(v[2], v[6]);
+  u[2] = _mm_add_epi32(v[3], v[4]);
+  u[3] = _mm_sub_epi32(u[2], u[0]);
+  u[4] = _mm_slli_epi32(v[5], 2);
+  u[5] = _mm_sub_epi32(u[4], v[5]);
+  u[6] = _mm_add_epi32(u[3], u[5]);
+
+  v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
+  v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
+  v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
+  v[3] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
+
+  u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
+  u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
+  u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
+  u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
+
+  in[0] = _mm_packs_epi32(u[0], u[2]);
+  in[1] = _mm_packs_epi32(u[1], u[3]);
+  transpose_4x4_avx2(in);
+}
+
+void vp9_short_fht4x4_avx2(const int16_t *input, int16_t *output,
+                           int stride, int tx_type) {
+  __m128i in[4];
+  load_buffer_4x4_avx2(input, in, stride);
+  switch (tx_type) {
+    case 0:  // DCT_DCT
+      fdct4_1d_avx2(in);
+      fdct4_1d_avx2(in);
+      break;
+    case 1:  // ADST_DCT
+      fadst4_1d_avx2(in);
+      fdct4_1d_avx2(in);
+      break;
+    case 2:  // DCT_ADST
+      fdct4_1d_avx2(in);
+      fadst4_1d_avx2(in);
+      break;
+    case 3:  // ADST_ADST
+      fadst4_1d_avx2(in);
+      fadst4_1d_avx2(in);
+      break;
+    default:
+      assert(0);
+      break;
+  }
+  write_buffer_4x4_avx2(output, in);
+}
+
+void vp9_fdct8x8_avx2(const int16_t *input, int16_t *output, int stride) {
+  int pass;
+  // Constants
+  //    When we use them, in one case, they are all the same. In all others
+  //    it's a pair of them that we need to repeat four times. This is done
+  //    by constructing the 32 bit constant corresponding to that pair.
+  const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
+  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+  const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
+  const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
+  const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64);
+  const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64);
+  const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64);
+  const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64);
+  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
+  // Load input
+  __m128i in0  = _mm_load_si128((const __m128i *)(input + 0 * stride));
+  __m128i in1  = _mm_load_si128((const __m128i *)(input + 1 * stride));
+  __m128i in2  = _mm_load_si128((const __m128i *)(input + 2 * stride));
+  __m128i in3  = _mm_load_si128((const __m128i *)(input + 3 * stride));
+  __m128i in4  = _mm_load_si128((const __m128i *)(input + 4 * stride));
+  __m128i in5  = _mm_load_si128((const __m128i *)(input + 5 * stride));
+  __m128i in6  = _mm_load_si128((const __m128i *)(input + 6 * stride));
+  __m128i in7  = _mm_load_si128((const __m128i *)(input + 7 * stride));
+  // Pre-condition input (shift by two)
+  in0 = _mm_slli_epi16(in0, 2);
+  in1 = _mm_slli_epi16(in1, 2);
+  in2 = _mm_slli_epi16(in2, 2);
+  in3 = _mm_slli_epi16(in3, 2);
+  in4 = _mm_slli_epi16(in4, 2);
+  in5 = _mm_slli_epi16(in5, 2);
+  in6 = _mm_slli_epi16(in6, 2);
+  in7 = _mm_slli_epi16(in7, 2);
+
+  // We do two passes, first the columns, then the rows. The results of the
+  // first pass are transposed so that the same column code can be reused. The
+  // results of the second pass are also transposed so that the rows (processed
+  // as columns) are put back in row positions.
+  for (pass = 0; pass < 2; pass++) {
+    // To store results of each pass before the transpose.
+    __m128i res0, res1, res2, res3, res4, res5, res6, res7;
+    // Add/substract
+    const __m128i q0 = _mm_add_epi16(in0, in7);
+    const __m128i q1 = _mm_add_epi16(in1, in6);
+    const __m128i q2 = _mm_add_epi16(in2, in5);
+    const __m128i q3 = _mm_add_epi16(in3, in4);
+    const __m128i q4 = _mm_sub_epi16(in3, in4);
+    const __m128i q5 = _mm_sub_epi16(in2, in5);
+    const __m128i q6 = _mm_sub_epi16(in1, in6);
+    const __m128i q7 = _mm_sub_epi16(in0, in7);
+    // Work on first four results
+    {
+      // Add/substract
+      const __m128i r0 = _mm_add_epi16(q0, q3);
+      const __m128i r1 = _mm_add_epi16(q1, q2);
+      const __m128i r2 = _mm_sub_epi16(q1, q2);
+      const __m128i r3 = _mm_sub_epi16(q0, q3);
+      // Interleave to do the multiply by constants which gets us into 32bits
+      const __m128i t0 = _mm_unpacklo_epi16(r0, r1);
+      const __m128i t1 = _mm_unpackhi_epi16(r0, r1);
+      const __m128i t2 = _mm_unpacklo_epi16(r2, r3);
+      const __m128i t3 = _mm_unpackhi_epi16(r2, r3);
+      const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_p16);
+      const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p16_p16);
+      const __m128i u2 = _mm_madd_epi16(t0, k__cospi_p16_m16);
+      const __m128i u3 = _mm_madd_epi16(t1, k__cospi_p16_m16);
+      const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p24_p08);
+      const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p24_p08);
+      const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m08_p24);
+      const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m08_p24);
+      // dct_const_round_shift
+      const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
+      const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
+      const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
+      const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
+      const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
+      const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
+      const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
+      const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);
+      const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
+      const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
+      const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
+      const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
+      const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
+      const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
+      const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
+      const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
+      // Combine
+      res0 = _mm_packs_epi32(w0, w1);
+      res4 = _mm_packs_epi32(w2, w3);
+      res2 = _mm_packs_epi32(w4, w5);
+      res6 = _mm_packs_epi32(w6, w7);
+    }
+    // Work on next four results
+    {
+      // Interleave to do the multiply by constants which gets us into 32bits
+      const __m128i d0 = _mm_unpacklo_epi16(q6, q5);
+      const __m128i d1 = _mm_unpackhi_epi16(q6, q5);
+      const __m128i e0 = _mm_madd_epi16(d0, k__cospi_p16_m16);
+      const __m128i e1 = _mm_madd_epi16(d1, k__cospi_p16_m16);
+      const __m128i e2 = _mm_madd_epi16(d0, k__cospi_p16_p16);
+      const __m128i e3 = _mm_madd_epi16(d1, k__cospi_p16_p16);
+      // dct_const_round_shift
+      const __m128i f0 = _mm_add_epi32(e0, k__DCT_CONST_ROUNDING);
+      const __m128i f1 = _mm_add_epi32(e1, k__DCT_CONST_ROUNDING);
+      const __m128i f2 = _mm_add_epi32(e2, k__DCT_CONST_ROUNDING);
+      const __m128i f3 = _mm_add_epi32(e3, k__DCT_CONST_ROUNDING);
+      const __m128i s0 = _mm_srai_epi32(f0, DCT_CONST_BITS);
+      const __m128i s1 = _mm_srai_epi32(f1, DCT_CONST_BITS);
+      const __m128i s2 = _mm_srai_epi32(f2, DCT_CONST_BITS);
+      const __m128i s3 = _mm_srai_epi32(f3, DCT_CONST_BITS);
+      // Combine
+      const __m128i r0 = _mm_packs_epi32(s0, s1);
+      const __m128i r1 = _mm_packs_epi32(s2, s3);
+      // Add/substract
+      const __m128i x0 = _mm_add_epi16(q4, r0);
+      const __m128i x1 = _mm_sub_epi16(q4, r0);
+      const __m128i x2 = _mm_sub_epi16(q7, r1);
+      const __m128i x3 = _mm_add_epi16(q7, r1);
+      // Interleave to do the multiply by constants which gets us into 32bits
+      const __m128i t0 = _mm_unpacklo_epi16(x0, x3);
+      const __m128i t1 = _mm_unpackhi_epi16(x0, x3);
+      const __m128i t2 = _mm_unpacklo_epi16(x1, x2);
+      const __m128i t3 = _mm_unpackhi_epi16(x1, x2);
+      const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p28_p04);
+      const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p28_p04);
+      const __m128i u2 = _mm_madd_epi16(t0, k__cospi_m04_p28);
+      const __m128i u3 = _mm_madd_epi16(t1, k__cospi_m04_p28);
+      const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p12_p20);
+      const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p12_p20);
+      const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m20_p12);
+      const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m20_p12);
+      // dct_const_round_shift
+      const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
+      const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
+      const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
+      const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
+      const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
+      const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
+      const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
+      const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);
+      const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
+      const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
+      const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
+      const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
+      const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
+      const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
+      const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
+      const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
+      // Combine
+      res1 = _mm_packs_epi32(w0, w1);
+      res7 = _mm_packs_epi32(w2, w3);
+      res5 = _mm_packs_epi32(w4, w5);
+      res3 = _mm_packs_epi32(w6, w7);
+    }
+    // Transpose the 8x8.
+    {
+      // 00 01 02 03 04 05 06 07
+      // 10 11 12 13 14 15 16 17
+      // 20 21 22 23 24 25 26 27
+      // 30 31 32 33 34 35 36 37
+      // 40 41 42 43 44 45 46 47
+      // 50 51 52 53 54 55 56 57
+      // 60 61 62 63 64 65 66 67
+      // 70 71 72 73 74 75 76 77
+      const __m128i tr0_0 = _mm_unpacklo_epi16(res0, res1);
+      const __m128i tr0_1 = _mm_unpacklo_epi16(res2, res3);
+      const __m128i tr0_2 = _mm_unpackhi_epi16(res0, res1);
+      const __m128i tr0_3 = _mm_unpackhi_epi16(res2, res3);
+      const __m128i tr0_4 = _mm_unpacklo_epi16(res4, res5);
+      const __m128i tr0_5 = _mm_unpacklo_epi16(res6, res7);
+      const __m128i tr0_6 = _mm_unpackhi_epi16(res4, res5);
+      const __m128i tr0_7 = _mm_unpackhi_epi16(res6, res7);
+      // 00 10 01 11 02 12 03 13
+      // 20 30 21 31 22 32 23 33
+      // 04 14 05 15 06 16 07 17
+      // 24 34 25 35 26 36 27 37
+      // 40 50 41 51 42 52 43 53
+      // 60 70 61 71 62 72 63 73
+      // 54 54 55 55 56 56 57 57
+      // 64 74 65 75 66 76 67 77
+      const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
+      const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3);
+      const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
+      const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3);
+      const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5);
+      const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);
+      const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5);
+      const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);
+      // 00 10 20 30 01 11 21 31
+      // 40 50 60 70 41 51 61 71
+      // 02 12 22 32 03 13 23 33
+      // 42 52 62 72 43 53 63 73
+      // 04 14 24 34 05 15 21 36
+      // 44 54 64 74 45 55 61 76
+      // 06 16 26 36 07 17 27 37
+      // 46 56 66 76 47 57 67 77
+      in0 = _mm_unpacklo_epi64(tr1_0, tr1_4);
+      in1 = _mm_unpackhi_epi64(tr1_0, tr1_4);
+      in2 = _mm_unpacklo_epi64(tr1_2, tr1_6);
+      in3 = _mm_unpackhi_epi64(tr1_2, tr1_6);
+      in4 = _mm_unpacklo_epi64(tr1_1, tr1_5);
+      in5 = _mm_unpackhi_epi64(tr1_1, tr1_5);
+      in6 = _mm_unpacklo_epi64(tr1_3, tr1_7);
+      in7 = _mm_unpackhi_epi64(tr1_3, tr1_7);
+      // 00 10 20 30 40 50 60 70
+      // 01 11 21 31 41 51 61 71
+      // 02 12 22 32 42 52 62 72
+      // 03 13 23 33 43 53 63 73
+      // 04 14 24 34 44 54 64 74
+      // 05 15 25 35 45 55 65 75
+      // 06 16 26 36 46 56 66 76
+      // 07 17 27 37 47 57 67 77
+    }
+  }
+  // Post-condition output and store it
+  {
+    // Post-condition (division by two)
+    //    division of two 16 bits signed numbers using shifts
+    //    n / 2 = (n - (n >> 15)) >> 1
+    const __m128i sign_in0 = _mm_srai_epi16(in0, 15);
+    const __m128i sign_in1 = _mm_srai_epi16(in1, 15);
+    const __m128i sign_in2 = _mm_srai_epi16(in2, 15);
+    const __m128i sign_in3 = _mm_srai_epi16(in3, 15);
+    const __m128i sign_in4 = _mm_srai_epi16(in4, 15);
+    const __m128i sign_in5 = _mm_srai_epi16(in5, 15);
+    const __m128i sign_in6 = _mm_srai_epi16(in6, 15);
+    const __m128i sign_in7 = _mm_srai_epi16(in7, 15);
+    in0 = _mm_sub_epi16(in0, sign_in0);
+    in1 = _mm_sub_epi16(in1, sign_in1);
+    in2 = _mm_sub_epi16(in2, sign_in2);
+    in3 = _mm_sub_epi16(in3, sign_in3);
+    in4 = _mm_sub_epi16(in4, sign_in4);
+    in5 = _mm_sub_epi16(in5, sign_in5);
+    in6 = _mm_sub_epi16(in6, sign_in6);
+    in7 = _mm_sub_epi16(in7, sign_in7);
+    in0 = _mm_srai_epi16(in0, 1);
+    in1 = _mm_srai_epi16(in1, 1);
+    in2 = _mm_srai_epi16(in2, 1);
+    in3 = _mm_srai_epi16(in3, 1);
+    in4 = _mm_srai_epi16(in4, 1);
+    in5 = _mm_srai_epi16(in5, 1);
+    in6 = _mm_srai_epi16(in6, 1);
+    in7 = _mm_srai_epi16(in7, 1);
+    // store results
+    _mm_store_si128((__m128i *)(output + 0 * 8), in0);
+    _mm_store_si128((__m128i *)(output + 1 * 8), in1);
+    _mm_store_si128((__m128i *)(output + 2 * 8), in2);
+    _mm_store_si128((__m128i *)(output + 3 * 8), in3);
+    _mm_store_si128((__m128i *)(output + 4 * 8), in4);
+    _mm_store_si128((__m128i *)(output + 5 * 8), in5);
+    _mm_store_si128((__m128i *)(output + 6 * 8), in6);
+    _mm_store_si128((__m128i *)(output + 7 * 8), in7);
+  }
+}
+
+// load 8x8 array
+static INLINE void load_buffer_8x8_avx2(const int16_t *input, __m128i *in,
+                                   int stride) {
+  in[0]  = _mm_load_si128((const __m128i *)(input + 0 * stride));
+  in[1]  = _mm_load_si128((const __m128i *)(input + 1 * stride));
+  in[2]  = _mm_load_si128((const __m128i *)(input + 2 * stride));
+  in[3]  = _mm_load_si128((const __m128i *)(input + 3 * stride));
+  in[4]  = _mm_load_si128((const __m128i *)(input + 4 * stride));
+  in[5]  = _mm_load_si128((const __m128i *)(input + 5 * stride));
+  in[6]  = _mm_load_si128((const __m128i *)(input + 6 * stride));
+  in[7]  = _mm_load_si128((const __m128i *)(input + 7 * stride));
+
+  in[0] = _mm_slli_epi16(in[0], 2);
+  in[1] = _mm_slli_epi16(in[1], 2);
+  in[2] = _mm_slli_epi16(in[2], 2);
+  in[3] = _mm_slli_epi16(in[3], 2);
+  in[4] = _mm_slli_epi16(in[4], 2);
+  in[5] = _mm_slli_epi16(in[5], 2);
+  in[6] = _mm_slli_epi16(in[6], 2);
+  in[7] = _mm_slli_epi16(in[7], 2);
+}
+
+// right shift and rounding
+static INLINE void right_shift_8x8_avx2(__m128i *res, int const bit) {
+  const __m128i kOne = _mm_set1_epi16(1);
+  const int bit_m02 = bit - 2;
+  __m128i sign0 = _mm_srai_epi16(res[0], 15);
+  __m128i sign1 = _mm_srai_epi16(res[1], 15);
+  __m128i sign2 = _mm_srai_epi16(res[2], 15);
+  __m128i sign3 = _mm_srai_epi16(res[3], 15);
+  __m128i sign4 = _mm_srai_epi16(res[4], 15);
+  __m128i sign5 = _mm_srai_epi16(res[5], 15);
+  __m128i sign6 = _mm_srai_epi16(res[6], 15);
+  __m128i sign7 = _mm_srai_epi16(res[7], 15);
+
+  if (bit_m02 >= 0) {
+    __m128i k_const_rounding = _mm_slli_epi16(kOne, bit_m02);
+    res[0] = _mm_add_epi16(res[0], k_const_rounding);
+    res[1] = _mm_add_epi16(res[1], k_const_rounding);
+    res[2] = _mm_add_epi16(res[2], k_const_rounding);
+    res[3] = _mm_add_epi16(res[3], k_const_rounding);
+    res[4] = _mm_add_epi16(res[4], k_const_rounding);
+    res[5] = _mm_add_epi16(res[5], k_const_rounding);
+    res[6] = _mm_add_epi16(res[6], k_const_rounding);
+    res[7] = _mm_add_epi16(res[7], k_const_rounding);
+  }
+
+  res[0] = _mm_sub_epi16(res[0], sign0);
+  res[1] = _mm_sub_epi16(res[1], sign1);
+  res[2] = _mm_sub_epi16(res[2], sign2);
+  res[3] = _mm_sub_epi16(res[3], sign3);
+  res[4] = _mm_sub_epi16(res[4], sign4);
+  res[5] = _mm_sub_epi16(res[5], sign5);
+  res[6] = _mm_sub_epi16(res[6], sign6);
+  res[7] = _mm_sub_epi16(res[7], sign7);
+
+  res[0] = _mm_srai_epi16(res[0], bit);
+  res[1] = _mm_srai_epi16(res[1], bit);
+  res[2] = _mm_srai_epi16(res[2], bit);
+  res[3] = _mm_srai_epi16(res[3], bit);
+  res[4] = _mm_srai_epi16(res[4], bit);
+  res[5] = _mm_srai_epi16(res[5], bit);
+  res[6] = _mm_srai_epi16(res[6], bit);
+  res[7] = _mm_srai_epi16(res[7], bit);
+}
+
+// write 8x8 array
+static INLINE void write_buffer_8x8_avx2(int16_t *output, __m128i *res, int stride) {
+  _mm_store_si128((__m128i *)(output + 0 * stride), res[0]);
+  _mm_store_si128((__m128i *)(output + 1 * stride), res[1]);
+  _mm_store_si128((__m128i *)(output + 2 * stride), res[2]);
+  _mm_store_si128((__m128i *)(output + 3 * stride), res[3]);
+  _mm_store_si128((__m128i *)(output + 4 * stride), res[4]);
+  _mm_store_si128((__m128i *)(output + 5 * stride), res[5]);
+  _mm_store_si128((__m128i *)(output + 6 * stride), res[6]);
+  _mm_store_si128((__m128i *)(output + 7 * stride), res[7]);
+}
+
+// perform in-place transpose
+static INLINE void array_transpose_8x8_avx2(__m128i *in, __m128i *res) {
+  const __m128i tr0_0 = _mm_unpacklo_epi16(in[0], in[1]);
+  const __m128i tr0_1 = _mm_unpacklo_epi16(in[2], in[3]);
+  const __m128i tr0_2 = _mm_unpackhi_epi16(in[0], in[1]);
+  const __m128i tr0_3 = _mm_unpackhi_epi16(in[2], in[3]);
+  const __m128i tr0_4 = _mm_unpacklo_epi16(in[4], in[5]);
+  const __m128i tr0_5 = _mm_unpacklo_epi16(in[6], in[7]);
+  const __m128i tr0_6 = _mm_unpackhi_epi16(in[4], in[5]);
+  const __m128i tr0_7 = _mm_unpackhi_epi16(in[6], in[7]);
+  // 00 10 01 11 02 12 03 13
+  // 20 30 21 31 22 32 23 33
+  // 04 14 05 15 06 16 07 17
+  // 24 34 25 35 26 36 27 37
+  // 40 50 41 51 42 52 43 53
+  // 60 70 61 71 62 72 63 73
+  // 44 54 45 55 46 56 47 57
+  // 64 74 65 75 66 76 67 77
+  const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
+  const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_4, tr0_5);
+  const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
+  const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_4, tr0_5);
+  const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_2, tr0_3);
+  const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);
+  const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_2, tr0_3);
+  const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);
+  // 00 10 20 30 01 11 21 31
+  // 40 50 60 70 41 51 61 71
+  // 02 12 22 32 03 13 23 33
+  // 42 52 62 72 43 53 63 73
+  // 04 14 24 34 05 15 25 35
+  // 44 54 64 74 45 55 65 75
+  // 06 16 26 36 07 17 27 37
+  // 46 56 66 76 47 57 67 77
+  res[0] = _mm_unpacklo_epi64(tr1_0, tr1_1);
+  res[1] = _mm_unpackhi_epi64(tr1_0, tr1_1);
+  res[2] = _mm_unpacklo_epi64(tr1_2, tr1_3);
+  res[3] = _mm_unpackhi_epi64(tr1_2, tr1_3);
+  res[4] = _mm_unpacklo_epi64(tr1_4, tr1_5);
+  res[5] = _mm_unpackhi_epi64(tr1_4, tr1_5);
+  res[6] = _mm_unpacklo_epi64(tr1_6, tr1_7);
+  res[7] = _mm_unpackhi_epi64(tr1_6, tr1_7);
+  // 00 10 20 30 40 50 60 70
+  // 01 11 21 31 41 51 61 71
+  // 02 12 22 32 42 52 62 72
+  // 03 13 23 33 43 53 63 73
+  // 04 14 24 34 44 54 64 74
+  // 05 15 25 35 45 55 65 75
+  // 06 16 26 36 46 56 66 76
+  // 07 17 27 37 47 57 67 77
+}
+
+void fdct8_1d_avx2(__m128i *in) {
+  // constants
+  const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
+  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+  const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
+  const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
+  const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64);
+  const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64);
+  const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64);
+  const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64);
+  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
+  __m128i u0, u1, u2, u3, u4, u5, u6, u7;
+  __m128i v0, v1, v2, v3, v4, v5, v6, v7;
+  __m128i s0, s1, s2, s3, s4, s5, s6, s7;
+
+  // stage 1
+  s0 = _mm_add_epi16(in[0], in[7]);
+  s1 = _mm_add_epi16(in[1], in[6]);
+  s2 = _mm_add_epi16(in[2], in[5]);
+  s3 = _mm_add_epi16(in[3], in[4]);
+  s4 = _mm_sub_epi16(in[3], in[4]);
+  s5 = _mm_sub_epi16(in[2], in[5]);
+  s6 = _mm_sub_epi16(in[1], in[6]);
+  s7 = _mm_sub_epi16(in[0], in[7]);
+
+  u0 = _mm_add_epi16(s0, s3);
+  u1 = _mm_add_epi16(s1, s2);
+  u2 = _mm_sub_epi16(s1, s2);
+  u3 = _mm_sub_epi16(s0, s3);
+  // interleave and perform butterfly multiplication/addition
+  v0 = _mm_unpacklo_epi16(u0, u1);
+  v1 = _mm_unpackhi_epi16(u0, u1);
+  v2 = _mm_unpacklo_epi16(u2, u3);
+  v3 = _mm_unpackhi_epi16(u2, u3);
+
+  u0 = _mm_madd_epi16(v0, k__cospi_p16_p16);
+  u1 = _mm_madd_epi16(v1, k__cospi_p16_p16);
+  u2 = _mm_madd_epi16(v0, k__cospi_p16_m16);
+  u3 = _mm_madd_epi16(v1, k__cospi_p16_m16);
+  u4 = _mm_madd_epi16(v2, k__cospi_p24_p08);
+  u5 = _mm_madd_epi16(v3, k__cospi_p24_p08);
+  u6 = _mm_madd_epi16(v2, k__cospi_m08_p24);
+  u7 = _mm_madd_epi16(v3, k__cospi_m08_p24);
+
+  // shift and rounding
+  v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
+  v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
+  v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
+  v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
+  v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
+  v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
+  v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
+  v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);
+
+  u0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
+  u1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
+  u2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
+  u3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
+  u4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
+  u5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
+  u6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
+  u7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
+
+  in[0] = _mm_packs_epi32(u0, u1);
+  in[2] = _mm_packs_epi32(u4, u5);
+  in[4] = _mm_packs_epi32(u2, u3);
+  in[6] = _mm_packs_epi32(u6, u7);
+
+  // stage 2
+  // interleave and perform butterfly multiplication/addition
+  u0 = _mm_unpacklo_epi16(s6, s5);
+  u1 = _mm_unpackhi_epi16(s6, s5);
+  v0 = _mm_madd_epi16(u0, k__cospi_p16_m16);
+  v1 = _mm_madd_epi16(u1, k__cospi_p16_m16);
+  v2 = _mm_madd_epi16(u0, k__cospi_p16_p16);
+  v3 = _mm_madd_epi16(u1, k__cospi_p16_p16);
+
+  // shift and rounding
+  u0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING);
+  u1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING);
+  u2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING);
+  u3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING);
+
+  v0 = _mm_srai_epi32(u0, DCT_CONST_BITS);
+  v1 = _mm_srai_epi32(u1, DCT_CONST_BITS);
+  v2 = _mm_srai_epi32(u2, DCT_CONST_BITS);
+  v3 = _mm_srai_epi32(u3, DCT_CONST_BITS);
+
+  u0 = _mm_packs_epi32(v0, v1);
+  u1 = _mm_packs_epi32(v2, v3);
+
+  // stage 3
+  s0 = _mm_add_epi16(s4, u0);
+  s1 = _mm_sub_epi16(s4, u0);
+  s2 = _mm_sub_epi16(s7, u1);
+  s3 = _mm_add_epi16(s7, u1);
+
+  // stage 4
+  u0 = _mm_unpacklo_epi16(s0, s3);
+  u1 = _mm_unpackhi_epi16(s0, s3);
+  u2 = _mm_unpacklo_epi16(s1, s2);
+  u3 = _mm_unpackhi_epi16(s1, s2);
+
+  v0 = _mm_madd_epi16(u0, k__cospi_p28_p04);
+  v1 = _mm_madd_epi16(u1, k__cospi_p28_p04);
+  v2 = _mm_madd_epi16(u2, k__cospi_p12_p20);
+  v3 = _mm_madd_epi16(u3, k__cospi_p12_p20);
+  v4 = _mm_madd_epi16(u2, k__cospi_m20_p12);
+  v5 = _mm_madd_epi16(u3, k__cospi_m20_p12);
+  v6 = _mm_madd_epi16(u0, k__cospi_m04_p28);
+  v7 = _mm_madd_epi16(u1, k__cospi_m04_p28);
+
+  // shift and rounding
+  u0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING);
+  u1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING);
+  u2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING);
+  u3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING);
+  u4 = _mm_add_epi32(v4, k__DCT_CONST_ROUNDING);
+  u5 = _mm_add_epi32(v5, k__DCT_CONST_ROUNDING);
+  u6 = _mm_add_epi32(v6, k__DCT_CONST_ROUNDING);
+  u7 = _mm_add_epi32(v7, k__DCT_CONST_ROUNDING);
+
+  v0 = _mm_srai_epi32(u0, DCT_CONST_BITS);
+  v1 = _mm_srai_epi32(u1, DCT_CONST_BITS);
+  v2 = _mm_srai_epi32(u2, DCT_CONST_BITS);
+  v3 = _mm_srai_epi32(u3, DCT_CONST_BITS);
+  v4 = _mm_srai_epi32(u4, DCT_CONST_BITS);
+  v5 = _mm_srai_epi32(u5, DCT_CONST_BITS);
+  v6 = _mm_srai_epi32(u6, DCT_CONST_BITS);
+  v7 = _mm_srai_epi32(u7, DCT_CONST_BITS);
+
+  in[1] = _mm_packs_epi32(v0, v1);
+  in[3] = _mm_packs_epi32(v4, v5);
+  in[5] = _mm_packs_epi32(v2, v3);
+  in[7] = _mm_packs_epi32(v6, v7);
+
+  // transpose
+  array_transpose_8x8_avx2(in, in);
+}
+
+void fadst8_1d_avx2(__m128i *in) {
+  // Constants
+  const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64);
+  const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64);
+  const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64);
+  const __m128i k__cospi_p22_m10 = pair_set_epi16(cospi_22_64, -cospi_10_64);
+  const __m128i k__cospi_p18_p14 = pair_set_epi16(cospi_18_64, cospi_14_64);
+  const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64);
+  const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64);
+  const __m128i k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64);
+  const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
+  const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
+  const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64);
+  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+  const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
+  const __m128i k__const_0 = _mm_set1_epi16(0);
+  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
+
+  __m128i u0, u1, u2, u3, u4, u5, u6, u7, u8, u9, u10, u11, u12, u13, u14, u15;
+  __m128i v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15;
+  __m128i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9, w10, w11, w12, w13, w14, w15;
+  __m128i s0, s1, s2, s3, s4, s5, s6, s7;
+  __m128i in0, in1, in2, in3, in4, in5, in6, in7;
+
+  // properly aligned for butterfly input
+  in0  = in[7];
+  in1  = in[0];
+  in2  = in[5];
+  in3  = in[2];
+  in4  = in[3];
+  in5  = in[4];
+  in6  = in[1];
+  in7  = in[6];
+
+  // column transformation
+  // stage 1
+  // interleave and multiply/add into 32-bit integer
+  s0 = _mm_unpacklo_epi16(in0, in1);
+  s1 = _mm_unpackhi_epi16(in0, in1);
+  s2 = _mm_unpacklo_epi16(in2, in3);
+  s3 = _mm_unpackhi_epi16(in2, in3);
+  s4 = _mm_unpacklo_epi16(in4, in5);
+  s5 = _mm_unpackhi_epi16(in4, in5);
+  s6 = _mm_unpacklo_epi16(in6, in7);
+  s7 = _mm_unpackhi_epi16(in6, in7);
+
+  u0 = _mm_madd_epi16(s0, k__cospi_p02_p30);
+  u1 = _mm_madd_epi16(s1, k__cospi_p02_p30);
+  u2 = _mm_madd_epi16(s0, k__cospi_p30_m02);
+  u3 = _mm_madd_epi16(s1, k__cospi_p30_m02);
+  u4 = _mm_madd_epi16(s2, k__cospi_p10_p22);
+  u5 = _mm_madd_epi16(s3, k__cospi_p10_p22);
+  u6 = _mm_madd_epi16(s2, k__cospi_p22_m10);
+  u7 = _mm_madd_epi16(s3, k__cospi_p22_m10);
+  u8 = _mm_madd_epi16(s4, k__cospi_p18_p14);
+  u9 = _mm_madd_epi16(s5, k__cospi_p18_p14);
+  u10 = _mm_madd_epi16(s4, k__cospi_p14_m18);
+  u11 = _mm_madd_epi16(s5, k__cospi_p14_m18);
+  u12 = _mm_madd_epi16(s6, k__cospi_p26_p06);
+  u13 = _mm_madd_epi16(s7, k__cospi_p26_p06);
+  u14 = _mm_madd_epi16(s6, k__cospi_p06_m26);
+  u15 = _mm_madd_epi16(s7, k__cospi_p06_m26);
+
+  // addition
+  w0 = _mm_add_epi32(u0, u8);
+  w1 = _mm_add_epi32(u1, u9);
+  w2 = _mm_add_epi32(u2, u10);
+  w3 = _mm_add_epi32(u3, u11);
+  w4 = _mm_add_epi32(u4, u12);
+  w5 = _mm_add_epi32(u5, u13);
+  w6 = _mm_add_epi32(u6, u14);
+  w7 = _mm_add_epi32(u7, u15);
+  w8 = _mm_sub_epi32(u0, u8);
+  w9 = _mm_sub_epi32(u1, u9);
+  w10 = _mm_sub_epi32(u2, u10);
+  w11 = _mm_sub_epi32(u3, u11);
+  w12 = _mm_sub_epi32(u4, u12);
+  w13 = _mm_sub_epi32(u5, u13);
+  w14 = _mm_sub_epi32(u6, u14);
+  w15 = _mm_sub_epi32(u7, u15);
+
+  // shift and rounding
+  v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING);
+  v1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING);
+  v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING);
+  v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING);
+  v4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING);
+  v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING);
+  v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING);
+  v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING);
+  v8 = _mm_add_epi32(w8, k__DCT_CONST_ROUNDING);
+  v9 = _mm_add_epi32(w9, k__DCT_CONST_ROUNDING);
+  v10 = _mm_add_epi32(w10, k__DCT_CONST_ROUNDING);
+  v11 = _mm_add_epi32(w11, k__DCT_CONST_ROUNDING);
+  v12 = _mm_add_epi32(w12, k__DCT_CONST_ROUNDING);
+  v13 = _mm_add_epi32(w13, k__DCT_CONST_ROUNDING);
+  v14 = _mm_add_epi32(w14, k__DCT_CONST_ROUNDING);
+  v15 = _mm_add_epi32(w15, k__DCT_CONST_ROUNDING);
+
+  u0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
+  u1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
+  u2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
+  u3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
+  u4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
+  u5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
+  u6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
+  u7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
+  u8 = _mm_srai_epi32(v8, DCT_CONST_BITS);
+  u9 = _mm_srai_epi32(v9, DCT_CONST_BITS);
+  u10 = _mm_srai_epi32(v10, DCT_CONST_BITS);
+  u11 = _mm_srai_epi32(v11, DCT_CONST_BITS);
+  u12 = _mm_srai_epi32(v12, DCT_CONST_BITS);
+  u13 = _mm_srai_epi32(v13, DCT_CONST_BITS);
+  u14 = _mm_srai_epi32(v14, DCT_CONST_BITS);
+  u15 = _mm_srai_epi32(v15, DCT_CONST_BITS);
+
+  // back to 16-bit and pack 8 integers into __m128i
+  in[0] = _mm_packs_epi32(u0, u1);
+  in[1] = _mm_packs_epi32(u2, u3);
+  in[2] = _mm_packs_epi32(u4, u5);
+  in[3] = _mm_packs_epi32(u6, u7);
+  in[4] = _mm_packs_epi32(u8, u9);
+  in[5] = _mm_packs_epi32(u10, u11);
+  in[6] = _mm_packs_epi32(u12, u13);
+  in[7] = _mm_packs_epi32(u14, u15);
+
+  // stage 2
+  s0 = _mm_add_epi16(in[0], in[2]);
+  s1 = _mm_add_epi16(in[1], in[3]);
+  s2 = _mm_sub_epi16(in[0], in[2]);
+  s3 = _mm_sub_epi16(in[1], in[3]);
+  u0 = _mm_unpacklo_epi16(in[4], in[5]);
+  u1 = _mm_unpackhi_epi16(in[4], in[5]);
+  u2 = _mm_unpacklo_epi16(in[6], in[7]);
+  u3 = _mm_unpackhi_epi16(in[6], in[7]);
+
+  v0 = _mm_madd_epi16(u0, k__cospi_p08_p24);
+  v1 = _mm_madd_epi16(u1, k__cospi_p08_p24);
+  v2 = _mm_madd_epi16(u0, k__cospi_p24_m08);
+  v3 = _mm_madd_epi16(u1, k__cospi_p24_m08);
+  v4 = _mm_madd_epi16(u2, k__cospi_m24_p08);
+  v5 = _mm_madd_epi16(u3, k__cospi_m24_p08);
+  v6 = _mm_madd_epi16(u2, k__cospi_p08_p24);
+  v7 = _mm_madd_epi16(u3, k__cospi_p08_p24);
+
+  w0 = _mm_add_epi32(v0, v4);
+  w1 = _mm_add_epi32(v1, v5);
+  w2 = _mm_add_epi32(v2, v6);
+  w3 = _mm_add_epi32(v3, v7);
+  w4 = _mm_sub_epi32(v0, v4);
+  w5 = _mm_sub_epi32(v1, v5);
+  w6 = _mm_sub_epi32(v2, v6);
+  w7 = _mm_sub_epi32(v3, v7);
+
+  v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING);
+  v1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING);
+  v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING);
+  v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING);
+  v4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING);
+  v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING);
+  v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING);
+  v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING);
+
+  u0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
+  u1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
+  u2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
+  u3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
+  u4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
+  u5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
+  u6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
+  u7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
+
+  // back to 16-bit intergers
+  s4 = _mm_packs_epi32(u0, u1);
+  s5 = _mm_packs_epi32(u2, u3);
+  s6 = _mm_packs_epi32(u4, u5);
+  s7 = _mm_packs_epi32(u6, u7);
+
+  // stage 3
+  u0 = _mm_unpacklo_epi16(s2, s3);
+  u1 = _mm_unpackhi_epi16(s2, s3);
+  u2 = _mm_unpacklo_epi16(s6, s7);
+  u3 = _mm_unpackhi_epi16(s6, s7);
+
+  v0 = _mm_madd_epi16(u0, k__cospi_p16_p16);
+  v1 = _mm_madd_epi16(u1, k__cospi_p16_p16);
+  v2 = _mm_madd_epi16(u0, k__cospi_p16_m16);
+  v3 = _mm_madd_epi16(u1, k__cospi_p16_m16);
+  v4 = _mm_madd_epi16(u2, k__cospi_p16_p16);
+  v5 = _mm_madd_epi16(u3, k__cospi_p16_p16);
+  v6 = _mm_madd_epi16(u2, k__cospi_p16_m16);
+  v7 = _mm_madd_epi16(u3, k__cospi_p16_m16);
+
+  u0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING);
+  u1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING);
+  u2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING);
+  u3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING);
+  u4 = _mm_add_epi32(v4, k__DCT_CONST_ROUNDING);
+  u5 = _mm_add_epi32(v5, k__DCT_CONST_ROUNDING);
+  u6 = _mm_add_epi32(v6, k__DCT_CONST_ROUNDING);
+  u7 = _mm_add_epi32(v7, k__DCT_CONST_ROUNDING);
+
+  v0 = _mm_srai_epi32(u0, DCT_CONST_BITS);
+  v1 = _mm_srai_epi32(u1, DCT_CONST_BITS);
+  v2 = _mm_srai_epi32(u2, DCT_CONST_BITS);
+  v3 = _mm_srai_epi32(u3, DCT_CONST_BITS);
+  v4 = _mm_srai_epi32(u4, DCT_CONST_BITS);
+  v5 = _mm_srai_epi32(u5, DCT_CONST_BITS);
+  v6 = _mm_srai_epi32(u6, DCT_CONST_BITS);
+  v7 = _mm_srai_epi32(u7, DCT_CONST_BITS);
+
+  s2 = _mm_packs_epi32(v0, v1);
+  s3 = _mm_packs_epi32(v2, v3);
+  s6 = _mm_packs_epi32(v4, v5);
+  s7 = _mm_packs_epi32(v6, v7);
+
+  // FIXME(jingning): do subtract using bit inversion?
+  in[0] = s0;
+  in[1] = _mm_sub_epi16(k__const_0, s4);
+  in[2] = s6;
+  in[3] = _mm_sub_epi16(k__const_0, s2);
+  in[4] = s3;
+  in[5] = _mm_sub_epi16(k__const_0, s7);
+  in[6] = s5;
+  in[7] = _mm_sub_epi16(k__const_0, s1);
+
+  // transpose
+  array_transpose_8x8_avx2(in, in);
+}
+
+void vp9_short_fht8x8_avx2(const int16_t *input, int16_t *output,
+                           int stride, int tx_type) {
+  __m128i in[8];
+  load_buffer_8x8_avx2(input, in, stride);
+  switch (tx_type) {
+    case 0:  // DCT_DCT
+      fdct8_1d_avx2(in);
+      fdct8_1d_avx2(in);
+      break;
+    case 1:  // ADST_DCT
+      fadst8_1d_avx2(in);
+      fdct8_1d_avx2(in);
+      break;
+    case 2:  // DCT_ADST
+      fdct8_1d_avx2(in);
+      fadst8_1d_avx2(in);
+      break;
+    case 3:  // ADST_ADST
+      fadst8_1d_avx2(in);
+      fadst8_1d_avx2(in);
+      break;
+    default:
+      assert(0);
+      break;
+  }
+  right_shift_8x8_avx2(in, 1);
+  write_buffer_8x8_avx2(output, in, 8);
+}
+
+void vp9_fdct16x16_avx2(const int16_t *input, int16_t *output, int stride) {
+  // The 2D transform is done with two passes which are actually pretty
+  // similar. In the first one, we transform the columns and transpose
+  // the results. In the second one, we transform the rows. To achieve that,
+  // as the first pass results are transposed, we tranpose the columns (that
+  // is the transposed rows) and transpose the results (so that it goes back
+  // in normal/row positions).
+  int pass;
+  // We need an intermediate buffer between passes.
+  DECLARE_ALIGNED_ARRAY(16, int16_t, intermediate, 256);
+  const int16_t *in = input;
+  int16_t *out = intermediate;
+  // Constants
+  //    When we use them, in one case, they are all the same. In all others
+  //    it's a pair of them that we need to repeat four times. This is done
+  //    by constructing the 32 bit constant corresponding to that pair.
+  const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
+  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+  const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
+  const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
+  const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
+  const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64);
+  const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64);
+  const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64);
+  const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64);
+  const __m128i k__cospi_p30_p02 = pair_set_epi16(cospi_30_64, cospi_2_64);
+  const __m128i k__cospi_p14_p18 = pair_set_epi16(cospi_14_64, cospi_18_64);
+  const __m128i k__cospi_m02_p30 = pair_set_epi16(-cospi_2_64, cospi_30_64);
+  const __m128i k__cospi_m18_p14 = pair_set_epi16(-cospi_18_64, cospi_14_64);
+  const __m128i k__cospi_p22_p10 = pair_set_epi16(cospi_22_64, cospi_10_64);
+  const __m128i k__cospi_p06_p26 = pair_set_epi16(cospi_6_64, cospi_26_64);
+  const __m128i k__cospi_m10_p22 = pair_set_epi16(-cospi_10_64, cospi_22_64);
+  const __m128i k__cospi_m26_p06 = pair_set_epi16(-cospi_26_64, cospi_6_64);
+  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
+  const __m128i kOne = _mm_set1_epi16(1);
+  // Do the two transform/transpose passes
+  for (pass = 0; pass < 2; ++pass) {
+    // We process eight columns (transposed rows in second pass) at a time.
+    int column_start;
+    for (column_start = 0; column_start < 16; column_start += 8) {
+      __m128i in00, in01, in02, in03, in04, in05, in06, in07;
+      __m128i in08, in09, in10, in11, in12, in13, in14, in15;
+      __m128i input0, input1, input2, input3, input4, input5, input6, input7;
+      __m128i step1_0, step1_1, step1_2, step1_3;
+      __m128i step1_4, step1_5, step1_6, step1_7;
+      __m128i step2_1, step2_2, step2_3, step2_4, step2_5, step2_6;
+      __m128i step3_0, step3_1, step3_2, step3_3;
+      __m128i step3_4, step3_5, step3_6, step3_7;
+      __m128i res00, res01, res02, res03, res04, res05, res06, res07;
+      __m128i res08, res09, res10, res11, res12, res13, res14, res15;
+      // Load and pre-condition input.
+      if (0 == pass) {
+        in00  = _mm_load_si128((const __m128i *)(in +  0 * stride));
+        in01  = _mm_load_si128((const __m128i *)(in +  1 * stride));
+        in02  = _mm_load_si128((const __m128i *)(in +  2 * stride));
+        in03  = _mm_load_si128((const __m128i *)(in +  3 * stride));
+        in04  = _mm_load_si128((const __m128i *)(in +  4 * stride));
+        in05  = _mm_load_si128((const __m128i *)(in +  5 * stride));
+        in06  = _mm_load_si128((const __m128i *)(in +  6 * stride));
+        in07  = _mm_load_si128((const __m128i *)(in +  7 * stride));
+        in08  = _mm_load_si128((const __m128i *)(in +  8 * stride));
+        in09  = _mm_load_si128((const __m128i *)(in +  9 * stride));
+        in10  = _mm_load_si128((const __m128i *)(in + 10 * stride));
+        in11  = _mm_load_si128((const __m128i *)(in + 11 * stride));
+        in12  = _mm_load_si128((const __m128i *)(in + 12 * stride));
+        in13  = _mm_load_si128((const __m128i *)(in + 13 * stride));
+        in14  = _mm_load_si128((const __m128i *)(in + 14 * stride));
+        in15  = _mm_load_si128((const __m128i *)(in + 15 * stride));
+        // x = x << 2
+        in00 = _mm_slli_epi16(in00, 2);
+        in01 = _mm_slli_epi16(in01, 2);
+        in02 = _mm_slli_epi16(in02, 2);
+        in03 = _mm_slli_epi16(in03, 2);
+        in04 = _mm_slli_epi16(in04, 2);
+        in05 = _mm_slli_epi16(in05, 2);
+        in06 = _mm_slli_epi16(in06, 2);
+        in07 = _mm_slli_epi16(in07, 2);
+        in08 = _mm_slli_epi16(in08, 2);
+        in09 = _mm_slli_epi16(in09, 2);
+        in10 = _mm_slli_epi16(in10, 2);
+        in11 = _mm_slli_epi16(in11, 2);
+        in12 = _mm_slli_epi16(in12, 2);
+        in13 = _mm_slli_epi16(in13, 2);
+        in14 = _mm_slli_epi16(in14, 2);
+        in15 = _mm_slli_epi16(in15, 2);
+      } else {
+        in00  = _mm_load_si128((const __m128i *)(in +  0 * 16));
+        in01  = _mm_load_si128((const __m128i *)(in +  1 * 16));
+        in02  = _mm_load_si128((const __m128i *)(in +  2 * 16));
+        in03  = _mm_load_si128((const __m128i *)(in +  3 * 16));
+        in04  = _mm_load_si128((const __m128i *)(in +  4 * 16));
+        in05  = _mm_load_si128((const __m128i *)(in +  5 * 16));
+        in06  = _mm_load_si128((const __m128i *)(in +  6 * 16));
+        in07  = _mm_load_si128((const __m128i *)(in +  7 * 16));
+        in08  = _mm_load_si128((const __m128i *)(in +  8 * 16));
+        in09  = _mm_load_si128((const __m128i *)(in +  9 * 16));
+        in10  = _mm_load_si128((const __m128i *)(in + 10 * 16));
+        in11  = _mm_load_si128((const __m128i *)(in + 11 * 16));
+        in12  = _mm_load_si128((const __m128i *)(in + 12 * 16));
+        in13  = _mm_load_si128((const __m128i *)(in + 13 * 16));
+        in14  = _mm_load_si128((const __m128i *)(in + 14 * 16));
+        in15  = _mm_load_si128((const __m128i *)(in + 15 * 16));
+        // x = (x + 1) >> 2
+        in00 = _mm_add_epi16(in00, kOne);
+        in01 = _mm_add_epi16(in01, kOne);
+        in02 = _mm_add_epi16(in02, kOne);
+        in03 = _mm_add_epi16(in03, kOne);
+        in04 = _mm_add_epi16(in04, kOne);
+        in05 = _mm_add_epi16(in05, kOne);
+        in06 = _mm_add_epi16(in06, kOne);
+        in07 = _mm_add_epi16(in07, kOne);
+        in08 = _mm_add_epi16(in08, kOne);
+        in09 = _mm_add_epi16(in09, kOne);
+        in10 = _mm_add_epi16(in10, kOne);
+        in11 = _mm_add_epi16(in11, kOne);
+        in12 = _mm_add_epi16(in12, kOne);
+        in13 = _mm_add_epi16(in13, kOne);
+        in14 = _mm_add_epi16(in14, kOne);
+        in15 = _mm_add_epi16(in15, kOne);
+        in00 = _mm_srai_epi16(in00, 2);
+        in01 = _mm_srai_epi16(in01, 2);
+        in02 = _mm_srai_epi16(in02, 2);
+        in03 = _mm_srai_epi16(in03, 2);
+        in04 = _mm_srai_epi16(in04, 2);
+        in05 = _mm_srai_epi16(in05, 2);
+        in06 = _mm_srai_epi16(in06, 2);
+        in07 = _mm_srai_epi16(in07, 2);
+        in08 = _mm_srai_epi16(in08, 2);
+        in09 = _mm_srai_epi16(in09, 2);
+        in10 = _mm_srai_epi16(in10, 2);
+        in11 = _mm_srai_epi16(in11, 2);
+        in12 = _mm_srai_epi16(in12, 2);
+        in13 = _mm_srai_epi16(in13, 2);
+        in14 = _mm_srai_epi16(in14, 2);
+        in15 = _mm_srai_epi16(in15, 2);
+      }
+      in += 8;
+      // Calculate input for the first 8 results.
+      {
+        input0 = _mm_add_epi16(in00, in15);
+        input1 = _mm_add_epi16(in01, in14);
+        input2 = _mm_add_epi16(in02, in13);
+        input3 = _mm_add_epi16(in03, in12);
+        input4 = _mm_add_epi16(in04, in11);
+        input5 = _mm_add_epi16(in05, in10);
+        input6 = _mm_add_epi16(in06, in09);
+        input7 = _mm_add_epi16(in07, in08);
+      }
+      // Calculate input for the next 8 results.
+      {
+        step1_0 = _mm_sub_epi16(in07, in08);
+        step1_1 = _mm_sub_epi16(in06, in09);
+        step1_2 = _mm_sub_epi16(in05, in10);
+        step1_3 = _mm_sub_epi16(in04, in11);
+        step1_4 = _mm_sub_epi16(in03, in12);
+        step1_5 = _mm_sub_epi16(in02, in13);
+        step1_6 = _mm_sub_epi16(in01, in14);
+        step1_7 = _mm_sub_epi16(in00, in15);
+      }
+      // Work on the first eight values; fdct8_1d(input, even_results);
+      {
+        // Add/substract
+        const __m128i q0 = _mm_add_epi16(input0, input7);
+        const __m128i q1 = _mm_add_epi16(input1, input6);
+        const __m128i q2 = _mm_add_epi16(input2, input5);
+        const __m128i q3 = _mm_add_epi16(input3, input4);
+        const __m128i q4 = _mm_sub_epi16(input3, input4);
+        const __m128i q5 = _mm_sub_epi16(input2, input5);
+        const __m128i q6 = _mm_sub_epi16(input1, input6);
+        const __m128i q7 = _mm_sub_epi16(input0, input7);
+        // Work on first four results
+        {
+          // Add/substract
+          const __m128i r0 = _mm_add_epi16(q0, q3);
+          const __m128i r1 = _mm_add_epi16(q1, q2);
+          const __m128i r2 = _mm_sub_epi16(q1, q2);
+          const __m128i r3 = _mm_sub_epi16(q0, q3);
+          // Interleave to do the multiply by constants which gets us
+          // into 32 bits.
+          const __m128i t0 = _mm_unpacklo_epi16(r0, r1);
+          const __m128i t1 = _mm_unpackhi_epi16(r0, r1);
+          const __m128i t2 = _mm_unpacklo_epi16(r2, r3);
+          const __m128i t3 = _mm_unpackhi_epi16(r2, r3);
+          const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_p16);
+          const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p16_p16);
+          const __m128i u2 = _mm_madd_epi16(t0, k__cospi_p16_m16);
+          const __m128i u3 = _mm_madd_epi16(t1, k__cospi_p16_m16);
+          const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p24_p08);
+          const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p24_p08);
+          const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m08_p24);
+          const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m08_p24);
+          // dct_const_round_shift
+          const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
+          const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
+          const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
+          const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
+          const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
+          const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
+          const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
+          const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);
+          const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
+          const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
+          const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
+          const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
+          const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
+          const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
+          const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
+          const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
+          // Combine
+          res00 = _mm_packs_epi32(w0, w1);
+          res08 = _mm_packs_epi32(w2, w3);
+          res04 = _mm_packs_epi32(w4, w5);
+          res12 = _mm_packs_epi32(w6, w7);
+        }
+        // Work on next four results
+        {
+          // Interleave to do the multiply by constants which gets us
+          // into 32 bits.
+          const __m128i d0 = _mm_unpacklo_epi16(q6, q5);
+          const __m128i d1 = _mm_unpackhi_epi16(q6, q5);
+          const __m128i e0 = _mm_madd_epi16(d0, k__cospi_p16_m16);
+          const __m128i e1 = _mm_madd_epi16(d1, k__cospi_p16_m16);
+          const __m128i e2 = _mm_madd_epi16(d0, k__cospi_p16_p16);
+          const __m128i e3 = _mm_madd_epi16(d1, k__cospi_p16_p16);
+          // dct_const_round_shift
+          const __m128i f0 = _mm_add_epi32(e0, k__DCT_CONST_ROUNDING);
+          const __m128i f1 = _mm_add_epi32(e1, k__DCT_CONST_ROUNDING);
+          const __m128i f2 = _mm_add_epi32(e2, k__DCT_CONST_ROUNDING);
+          const __m128i f3 = _mm_add_epi32(e3, k__DCT_CONST_ROUNDING);
+          const __m128i s0 = _mm_srai_epi32(f0, DCT_CONST_BITS);
+          const __m128i s1 = _mm_srai_epi32(f1, DCT_CONST_BITS);
+          const __m128i s2 = _mm_srai_epi32(f2, DCT_CONST_BITS);
+          const __m128i s3 = _mm_srai_epi32(f3, DCT_CONST_BITS);
+          // Combine
+          const __m128i r0 = _mm_packs_epi32(s0, s1);
+          const __m128i r1 = _mm_packs_epi32(s2, s3);
+          // Add/substract
+          const __m128i x0 = _mm_add_epi16(q4, r0);
+          const __m128i x1 = _mm_sub_epi16(q4, r0);
+          const __m128i x2 = _mm_sub_epi16(q7, r1);
+          const __m128i x3 = _mm_add_epi16(q7, r1);
+          // Interleave to do the multiply by constants which gets us
+          // into 32 bits.
+          const __m128i t0 = _mm_unpacklo_epi16(x0, x3);
+          const __m128i t1 = _mm_unpackhi_epi16(x0, x3);
+          const __m128i t2 = _mm_unpacklo_epi16(x1, x2);
+          const __m128i t3 = _mm_unpackhi_epi16(x1, x2);
+          const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p28_p04);
+          const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p28_p04);
+          const __m128i u2 = _mm_madd_epi16(t0, k__cospi_m04_p28);
+          const __m128i u3 = _mm_madd_epi16(t1, k__cospi_m04_p28);
+          const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p12_p20);
+          const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p12_p20);
+          const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m20_p12);
+          const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m20_p12);
+          // dct_const_round_shift
+          const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
+          const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
+          const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
+          const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
+          const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
+          const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
+          const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
+          const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);
+          const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
+          const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
+          const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
+          const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
+          const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
+          const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
+          const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
+          const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
+          // Combine
+          res02 = _mm_packs_epi32(w0, w1);
+          res14 = _mm_packs_epi32(w2, w3);
+          res10 = _mm_packs_epi32(w4, w5);
+          res06 = _mm_packs_epi32(w6, w7);
+        }
+      }
+      // Work on the next eight values; step1 -> odd_results
+      {
+        // step 2
+        {
+          const __m128i t0 = _mm_unpacklo_epi16(step1_5, step1_2);
+          const __m128i t1 = _mm_unpackhi_epi16(step1_5, step1_2);
+          const __m128i t2 = _mm_unpacklo_epi16(step1_4, step1_3);
+          const __m128i t3 = _mm_unpackhi_epi16(step1_4, step1_3);
+          const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_m16);
+          const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p16_m16);
+          const __m128i u2 = _mm_madd_epi16(t2, k__cospi_p16_m16);
+          const __m128i u3 = _mm_madd_epi16(t3, k__cospi_p16_m16);
+          // dct_const_round_shift
+          const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
+          const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
+          const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
+          const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
+          const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
+          const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
+          const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
+          const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
+          // Combine
+          step2_2 = _mm_packs_epi32(w0, w1);
+          step2_3 = _mm_packs_epi32(w2, w3);
+        }
+        {
+          const __m128i t0 = _mm_unpacklo_epi16(step1_5, step1_2);
+          const __m128i t1 = _mm_unpackhi_epi16(step1_5, step1_2);
+          const __m128i t2 = _mm_unpacklo_epi16(step1_4, step1_3);
+          const __m128i t3 = _mm_unpackhi_epi16(step1_4, step1_3);
+          const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_p16);
+          const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p16_p16);
+          const __m128i u2 = _mm_madd_epi16(t2, k__cospi_p16_p16);
+          const __m128i u3 = _mm_madd_epi16(t3, k__cospi_p16_p16);
+          // dct_const_round_shift
+          const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
+          const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
+          const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
+          const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
+          const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
+          const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
+          const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
+          const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
+          // Combine
+          step2_5 = _mm_packs_epi32(w0, w1);
+          step2_4 = _mm_packs_epi32(w2, w3);
+        }
+        // step 3
+        {
+          step3_0 = _mm_add_epi16(step1_0, step2_3);
+          step3_1 = _mm_add_epi16(step1_1, step2_2);
+          step3_2 = _mm_sub_epi16(step1_1, step2_2);
+          step3_3 = _mm_sub_epi16(step1_0, step2_3);
+          step3_4 = _mm_sub_epi16(step1_7, step2_4);
+          step3_5 = _mm_sub_epi16(step1_6, step2_5);
+          step3_6 = _mm_add_epi16(step1_6, step2_5);
+          step3_7 = _mm_add_epi16(step1_7, step2_4);
+        }
+        // step 4
+        {
+          const __m128i t0 = _mm_unpacklo_epi16(step3_1, step3_6);
+          const __m128i t1 = _mm_unpackhi_epi16(step3_1, step3_6);
+          const __m128i t2 = _mm_unpacklo_epi16(step3_2, step3_5);
+          const __m128i t3 = _mm_unpackhi_epi16(step3_2, step3_5);
+          const __m128i u0 = _mm_madd_epi16(t0, k__cospi_m08_p24);
+          const __m128i u1 = _mm_madd_epi16(t1, k__cospi_m08_p24);
+          const __m128i u2 = _mm_madd_epi16(t2, k__cospi_m24_m08);
+          const __m128i u3 = _mm_madd_epi16(t3, k__cospi_m24_m08);
+          // dct_const_round_shift
+          const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
+          const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
+          const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
+          const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
+          const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
+          const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
+          const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
+          const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
+          // Combine
+          step2_1 = _mm_packs_epi32(w0, w1);
+          step2_2 = _mm_packs_epi32(w2, w3);
+        }
+        {
+          const __m128i t0 = _mm_unpacklo_epi16(step3_1, step3_6);
+          const __m128i t1 = _mm_unpackhi_epi16(step3_1, step3_6);
+          const __m128i t2 = _mm_unpacklo_epi16(step3_2, step3_5);
+          const __m128i t3 = _mm_unpackhi_epi16(step3_2, step3_5);
+          const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p24_p08);
+          const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p24_p08);
+          const __m128i u2 = _mm_madd_epi16(t2, k__cospi_m08_p24);
+          const __m128i u3 = _mm_madd_epi16(t3, k__cospi_m08_p24);
+          // dct_const_round_shift
+          const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
+          const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
+          const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
+          const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
+          const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
+          const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
+          const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
+          const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
+          // Combine
+          step2_6 = _mm_packs_epi32(w0, w1);
+          step2_5 = _mm_packs_epi32(w2, w3);
+        }
+        // step 5
+        {
+          step1_0 = _mm_add_epi16(step3_0, step2_1);
+          step1_1 = _mm_sub_epi16(step3_0, step2_1);
+          step1_2 = _mm_sub_epi16(step3_3, step2_2);
+          step1_3 = _mm_add_epi16(step3_3, step2_2);
+          step1_4 = _mm_add_epi16(step3_4, step2_5);
+          step1_5 = _mm_sub_epi16(step3_4, step2_5);
+          step1_6 = _mm_sub_epi16(step3_7, step2_6);
+          step1_7 = _mm_add_epi16(step3_7, step2_6);
+        }
+        // step 6
+        {
+          const __m128i t0 = _mm_unpacklo_epi16(step1_0, step1_7);
+          const __m128i t1 = _mm_unpackhi_epi16(step1_0, step1_7);
+          const __m128i t2 = _mm_unpacklo_epi16(step1_1, step1_6);
+          const __m128i t3 = _mm_unpackhi_epi16(step1_1, step1_6);
+          const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p30_p02);
+          const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p30_p02);
+          const __m128i u2 = _mm_madd_epi16(t2, k__cospi_p14_p18);
+          const __m128i u3 = _mm_madd_epi16(t3, k__cospi_p14_p18);
+          // dct_const_round_shift
+          const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
+          const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
+          const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
+          const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
+          const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
+          const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
+          const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
+          const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
+          // Combine
+          res01 = _mm_packs_epi32(w0, w1);
+          res09 = _mm_packs_epi32(w2, w3);
+        }
+        {
+          const __m128i t0 = _mm_unpacklo_epi16(step1_2, step1_5);
+          const __m128i t1 = _mm_unpackhi_epi16(step1_2, step1_5);
+          const __m128i t2 = _mm_unpacklo_epi16(step1_3, step1_4);
+          const __m128i t3 = _mm_unpackhi_epi16(step1_3, step1_4);
+          const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p22_p10);
+          const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p22_p10);
+          const __m128i u2 = _mm_madd_epi16(t2, k__cospi_p06_p26);
+          const __m128i u3 = _mm_madd_epi16(t3, k__cospi_p06_p26);
+          // dct_const_round_shift
+          const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
+          const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
+          const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
+          const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
+          const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
+          const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
+          const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
+          const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
+          // Combine
+          res05 = _mm_packs_epi32(w0, w1);
+          res13 = _mm_packs_epi32(w2, w3);
+        }
+        {
+          const __m128i t0 = _mm_unpacklo_epi16(step1_2, step1_5);
+          const __m128i t1 = _mm_unpackhi_epi16(step1_2, step1_5);
+          const __m128i t2 = _mm_unpacklo_epi16(step1_3, step1_4);
+          const __m128i t3 = _mm_unpackhi_epi16(step1_3, step1_4);
+          const __m128i u0 = _mm_madd_epi16(t0, k__cospi_m10_p22);
+          const __m128i u1 = _mm_madd_epi16(t1, k__cospi_m10_p22);
+          const __m128i u2 = _mm_madd_epi16(t2, k__cospi_m26_p06);
+          const __m128i u3 = _mm_madd_epi16(t3, k__cospi_m26_p06);
+          // dct_const_round_shift
+          const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
+          const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
+          const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
+          const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
+          const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
+          const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
+          const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
+          const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
+          // Combine
+          res11 = _mm_packs_epi32(w0, w1);
+          res03 = _mm_packs_epi32(w2, w3);
+        }
+        {
+          const __m128i t0 = _mm_unpacklo_epi16(step1_0, step1_7);
+          const __m128i t1 = _mm_unpackhi_epi16(step1_0, step1_7);
+          const __m128i t2 = _mm_unpacklo_epi16(step1_1, step1_6);
+          const __m128i t3 = _mm_unpackhi_epi16(step1_1, step1_6);
+          const __m128i u0 = _mm_madd_epi16(t0, k__cospi_m02_p30);
+          const __m128i u1 = _mm_madd_epi16(t1, k__cospi_m02_p30);
+          const __m128i u2 = _mm_madd_epi16(t2, k__cospi_m18_p14);
+          const __m128i u3 = _mm_madd_epi16(t3, k__cospi_m18_p14);
+          // dct_const_round_shift
+          const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
+          const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
+          const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
+          const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
+          const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
+          const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
+          const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
+          const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
+          // Combine
+          res15 = _mm_packs_epi32(w0, w1);
+          res07 = _mm_packs_epi32(w2, w3);
+        }
+      }
+      // Transpose the results, do it as two 8x8 transposes.
+      {
+        // 00 01 02 03 04 05 06 07
+        // 10 11 12 13 14 15 16 17
+        // 20 21 22 23 24 25 26 27
+        // 30 31 32 33 34 35 36 37
+        // 40 41 42 43 44 45 46 47
+        // 50 51 52 53 54 55 56 57
+        // 60 61 62 63 64 65 66 67
+        // 70 71 72 73 74 75 76 77
+        const __m128i tr0_0 = _mm_unpacklo_epi16(res00, res01);
+        const __m128i tr0_1 = _mm_unpacklo_epi16(res02, res03);
+        const __m128i tr0_2 = _mm_unpackhi_epi16(res00, res01);
+        const __m128i tr0_3 = _mm_unpackhi_epi16(res02, res03);
+        const __m128i tr0_4 = _mm_unpacklo_epi16(res04, res05);
+        const __m128i tr0_5 = _mm_unpacklo_epi16(res06, res07);
+        const __m128i tr0_6 = _mm_unpackhi_epi16(res04, res05);
+        const __m128i tr0_7 = _mm_unpackhi_epi16(res06, res07);
+        // 00 10 01 11 02 12 03 13
+        // 20 30 21 31 22 32 23 33
+        // 04 14 05 15 06 16 07 17
+        // 24 34 25 35 26 36 27 37
+        // 40 50 41 51 42 52 43 53
+        // 60 70 61 71 62 72 63 73
+        // 54 54 55 55 56 56 57 57
+        // 64 74 65 75 66 76 67 77
+        const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
+        const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3);
+        const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
+        const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3);
+        const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5);
+        const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);
+        const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5);
+        const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);
+        // 00 10 20 30 01 11 21 31
+        // 40 50 60 70 41 51 61 71
+        // 02 12 22 32 03 13 23 33
+        // 42 52 62 72 43 53 63 73
+        // 04 14 24 34 05 15 21 36
+        // 44 54 64 74 45 55 61 76
+        // 06 16 26 36 07 17 27 37
+        // 46 56 66 76 47 57 67 77
+        const __m128i tr2_0 = _mm_unpacklo_epi64(tr1_0, tr1_4);
+        const __m128i tr2_1 = _mm_unpackhi_epi64(tr1_0, tr1_4);
+        const __m128i tr2_2 = _mm_unpacklo_epi64(tr1_2, tr1_6);
+        const __m128i tr2_3 = _mm_unpackhi_epi64(tr1_2, tr1_6);
+        const __m128i tr2_4 = _mm_unpacklo_epi64(tr1_1, tr1_5);
+        const __m128i tr2_5 = _mm_unpackhi_epi64(tr1_1, tr1_5);
+        const __m128i tr2_6 = _mm_unpacklo_epi64(tr1_3, tr1_7);
+        const __m128i tr2_7 = _mm_unpackhi_epi64(tr1_3, tr1_7);
+        // 00 10 20 30 40 50 60 70
+        // 01 11 21 31 41 51 61 71
+        // 02 12 22 32 42 52 62 72
+        // 03 13 23 33 43 53 63 73
+        // 04 14 24 34 44 54 64 74
+        // 05 15 25 35 45 55 65 75
+        // 06 16 26 36 46 56 66 76
+        // 07 17 27 37 47 57 67 77
+        _mm_storeu_si128((__m128i *)(out + 0 * 16), tr2_0);
+        _mm_storeu_si128((__m128i *)(out + 1 * 16), tr2_1);
+        _mm_storeu_si128((__m128i *)(out + 2 * 16), tr2_2);
+        _mm_storeu_si128((__m128i *)(out + 3 * 16), tr2_3);
+        _mm_storeu_si128((__m128i *)(out + 4 * 16), tr2_4);
+        _mm_storeu_si128((__m128i *)(out + 5 * 16), tr2_5);
+        _mm_storeu_si128((__m128i *)(out + 6 * 16), tr2_6);
+        _mm_storeu_si128((__m128i *)(out + 7 * 16), tr2_7);
+      }
+      {
+        // 00 01 02 03 04 05 06 07
+        // 10 11 12 13 14 15 16 17
+        // 20 21 22 23 24 25 26 27
+        // 30 31 32 33 34 35 36 37
+        // 40 41 42 43 44 45 46 47
+        // 50 51 52 53 54 55 56 57
+        // 60 61 62 63 64 65 66 67
+        // 70 71 72 73 74 75 76 77
+        const __m128i tr0_0 = _mm_unpacklo_epi16(res08, res09);
+        const __m128i tr0_1 = _mm_unpacklo_epi16(res10, res11);
+        const __m128i tr0_2 = _mm_unpackhi_epi16(res08, res09);
+        const __m128i tr0_3 = _mm_unpackhi_epi16(res10, res11);
+        const __m128i tr0_4 = _mm_unpacklo_epi16(res12, res13);
+        const __m128i tr0_5 = _mm_unpacklo_epi16(res14, res15);
+        const __m128i tr0_6 = _mm_unpackhi_epi16(res12, res13);
+        const __m128i tr0_7 = _mm_unpackhi_epi16(res14, res15);
+        // 00 10 01 11 02 12 03 13
+        // 20 30 21 31 22 32 23 33
+        // 04 14 05 15 06 16 07 17
+        // 24 34 25 35 26 36 27 37
+        // 40 50 41 51 42 52 43 53
+        // 60 70 61 71 62 72 63 73
+        // 54 54 55 55 56 56 57 57
+        // 64 74 65 75 66 76 67 77
+        const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
+        const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3);
+        const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
+        const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3);
+        const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5);
+        const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);
+        const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5);
+        const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);
+        // 00 10 20 30 01 11 21 31
+        // 40 50 60 70 41 51 61 71
+        // 02 12 22 32 03 13 23 33
+        // 42 52 62 72 43 53 63 73
+        // 04 14 24 34 05 15 21 36
+        // 44 54 64 74 45 55 61 76
+        // 06 16 26 36 07 17 27 37
+        // 46 56 66 76 47 57 67 77
+        const __m128i tr2_0 = _mm_unpacklo_epi64(tr1_0, tr1_4);
+        const __m128i tr2_1 = _mm_unpackhi_epi64(tr1_0, tr1_4);
+        const __m128i tr2_2 = _mm_unpacklo_epi64(tr1_2, tr1_6);
+        const __m128i tr2_3 = _mm_unpackhi_epi64(tr1_2, tr1_6);
+        const __m128i tr2_4 = _mm_unpacklo_epi64(tr1_1, tr1_5);
+        const __m128i tr2_5 = _mm_unpackhi_epi64(tr1_1, tr1_5);
+        const __m128i tr2_6 = _mm_unpacklo_epi64(tr1_3, tr1_7);
+        const __m128i tr2_7 = _mm_unpackhi_epi64(tr1_3, tr1_7);
+        // 00 10 20 30 40 50 60 70
+        // 01 11 21 31 41 51 61 71
+        // 02 12 22 32 42 52 62 72
+        // 03 13 23 33 43 53 63 73
+        // 04 14 24 34 44 54 64 74
+        // 05 15 25 35 45 55 65 75
+        // 06 16 26 36 46 56 66 76
+        // 07 17 27 37 47 57 67 77
+        // Store results
+        _mm_store_si128((__m128i *)(out + 8 + 0 * 16), tr2_0);
+        _mm_store_si128((__m128i *)(out + 8 + 1 * 16), tr2_1);
+        _mm_store_si128((__m128i *)(out + 8 + 2 * 16), tr2_2);
+        _mm_store_si128((__m128i *)(out + 8 + 3 * 16), tr2_3);
+        _mm_store_si128((__m128i *)(out + 8 + 4 * 16), tr2_4);
+        _mm_store_si128((__m128i *)(out + 8 + 5 * 16), tr2_5);
+        _mm_store_si128((__m128i *)(out + 8 + 6 * 16), tr2_6);
+        _mm_store_si128((__m128i *)(out + 8 + 7 * 16), tr2_7);
+      }
+      out += 8*16;
+    }
+    // Setup in/out for next pass.
+    in = intermediate;
+    out = output;
+  }
+}
+
+static INLINE void load_buffer_16x16_avx2(const int16_t* input, __m128i *in0,
+                                     __m128i *in1, int stride) {
+  // load first 8 columns
+  load_buffer_8x8_avx2(input, in0, stride);
+  load_buffer_8x8_avx2(input + 8 * stride, in0 + 8, stride);
+
+  input += 8;
+  // load second 8 columns
+  load_buffer_8x8_avx2(input, in1, stride);
+  load_buffer_8x8_avx2(input + 8 * stride, in1 + 8, stride);
+}
+
+static INLINE void write_buffer_16x16_avx2(int16_t *output, __m128i *in0,
+                                      __m128i *in1, int stride) {
+  // write first 8 columns
+  write_buffer_8x8_avx2(output, in0, stride);
+  write_buffer_8x8_avx2(output + 8 * stride, in0 + 8, stride);
+  // write second 8 columns
+  output += 8;
+  write_buffer_8x8_avx2(output, in1, stride);
+  write_buffer_8x8_avx2(output + 8 * stride, in1 + 8, stride);
+}
+
+static INLINE void array_transpose_16x16_avx2(__m128i *res0, __m128i *res1) {
+  __m128i tbuf[8];
+  array_transpose_8x8_avx2(res0, res0);
+  array_transpose_8x8_avx2(res1, tbuf);
+  array_transpose_8x8_avx2(res0 + 8, res1);
+  array_transpose_8x8_avx2(res1 + 8, res1 + 8);
+
+  res0[8] = tbuf[0];
+  res0[9] = tbuf[1];
+  res0[10] = tbuf[2];
+  res0[11] = tbuf[3];
+  res0[12] = tbuf[4];
+  res0[13] = tbuf[5];
+  res0[14] = tbuf[6];
+  res0[15] = tbuf[7];
+}
+
+static INLINE void right_shift_16x16_avx2(__m128i *res0, __m128i *res1) {
+  // perform rounding operations
+  right_shift_8x8_avx2(res0, 2);
+  right_shift_8x8_avx2(res0 + 8, 2);
+  right_shift_8x8_avx2(res1, 2);
+  right_shift_8x8_avx2(res1 + 8, 2);
+}
+
+void fdct16_1d_8col_avx2(__m128i *in) {
+  // perform 16x16 1-D DCT for 8 columns
+  __m128i i[8], s[8], p[8], t[8], u[16], v[16];
+  const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
+  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+  const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64);
+  const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
+  const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
+  const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
+  const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64);
+  const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64);
+  const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64);
+  const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64);
+  const __m128i k__cospi_p30_p02 = pair_set_epi16(cospi_30_64, cospi_2_64);
+  const __m128i k__cospi_p14_p18 = pair_set_epi16(cospi_14_64, cospi_18_64);
+  const __m128i k__cospi_m02_p30 = pair_set_epi16(-cospi_2_64, cospi_30_64);
+  const __m128i k__cospi_m18_p14 = pair_set_epi16(-cospi_18_64, cospi_14_64);
+  const __m128i k__cospi_p22_p10 = pair_set_epi16(cospi_22_64, cospi_10_64);
+  const __m128i k__cospi_p06_p26 = pair_set_epi16(cospi_6_64, cospi_26_64);
+  const __m128i k__cospi_m10_p22 = pair_set_epi16(-cospi_10_64, cospi_22_64);
+  const __m128i k__cospi_m26_p06 = pair_set_epi16(-cospi_26_64, cospi_6_64);
+  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
+
+  // stage 1
+  i[0] = _mm_add_epi16(in[0], in[15]);
+  i[1] = _mm_add_epi16(in[1], in[14]);
+  i[2] = _mm_add_epi16(in[2], in[13]);
+  i[3] = _mm_add_epi16(in[3], in[12]);
+  i[4] = _mm_add_epi16(in[4], in[11]);
+  i[5] = _mm_add_epi16(in[5], in[10]);
+  i[6] = _mm_add_epi16(in[6], in[9]);
+  i[7] = _mm_add_epi16(in[7], in[8]);
+
+  s[0] = _mm_sub_epi16(in[7], in[8]);
+  s[1] = _mm_sub_epi16(in[6], in[9]);
+  s[2] = _mm_sub_epi16(in[5], in[10]);
+  s[3] = _mm_sub_epi16(in[4], in[11]);
+  s[4] = _mm_sub_epi16(in[3], in[12]);
+  s[5] = _mm_sub_epi16(in[2], in[13]);
+  s[6] = _mm_sub_epi16(in[1], in[14]);
+  s[7] = _mm_sub_epi16(in[0], in[15]);
+
+  p[0] = _mm_add_epi16(i[0], i[7]);
+  p[1] = _mm_add_epi16(i[1], i[6]);
+  p[2] = _mm_add_epi16(i[2], i[5]);
+  p[3] = _mm_add_epi16(i[3], i[4]);
+  p[4] = _mm_sub_epi16(i[3], i[4]);
+  p[5] = _mm_sub_epi16(i[2], i[5]);
+  p[6] = _mm_sub_epi16(i[1], i[6]);
+  p[7] = _mm_sub_epi16(i[0], i[7]);
+
+  u[0] = _mm_add_epi16(p[0], p[3]);
+  u[1] = _mm_add_epi16(p[1], p[2]);
+  u[2] = _mm_sub_epi16(p[1], p[2]);
+  u[3] = _mm_sub_epi16(p[0], p[3]);
+
+  v[0] = _mm_unpacklo_epi16(u[0], u[1]);
+  v[1] = _mm_unpackhi_epi16(u[0], u[1]);
+  v[2] = _mm_unpacklo_epi16(u[2], u[3]);
+  v[3] = _mm_unpackhi_epi16(u[2], u[3]);
+
+  u[0] = _mm_madd_epi16(v[0], k__cospi_p16_p16);
+  u[1] = _mm_madd_epi16(v[1], k__cospi_p16_p16);
+  u[2] = _mm_madd_epi16(v[0], k__cospi_p16_m16);
+  u[3] = _mm_madd_epi16(v[1], k__cospi_p16_m16);
+  u[4] = _mm_madd_epi16(v[2], k__cospi_p24_p08);
+  u[5] = _mm_madd_epi16(v[3], k__cospi_p24_p08);
+  u[6] = _mm_madd_epi16(v[2], k__cospi_m08_p24);
+  u[7] = _mm_madd_epi16(v[3], k__cospi_m08_p24);
+
+  v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
+  v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
+  v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
+  v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
+  v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
+  v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
+  v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
+  v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
+
+  u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
+  u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
+  u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
+  u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
+  u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
+  u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
+  u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
+  u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
+
+  in[0] = _mm_packs_epi32(u[0], u[1]);
+  in[4] = _mm_packs_epi32(u[4], u[5]);
+  in[8] = _mm_packs_epi32(u[2], u[3]);
+  in[12] = _mm_packs_epi32(u[6], u[7]);
+
+  u[0] = _mm_unpacklo_epi16(p[5], p[6]);
+  u[1] = _mm_unpackhi_epi16(p[5], p[6]);
+  v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16);
+  v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16);
+  v[2] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
+  v[3] = _mm_madd_epi16(u[1], k__cospi_p16_p16);
+
+  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
+  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
+  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
+  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
+
+  v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
+  v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
+  v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
+  v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
+
+  u[0] = _mm_packs_epi32(v[0], v[1]);
+  u[1] = _mm_packs_epi32(v[2], v[3]);
+
+  t[0] = _mm_add_epi16(p[4], u[0]);
+  t[1] = _mm_sub_epi16(p[4], u[0]);
+  t[2] = _mm_sub_epi16(p[7], u[1]);
+  t[3] = _mm_add_epi16(p[7], u[1]);
+
+  u[0] = _mm_unpacklo_epi16(t[0], t[3]);
+  u[1] = _mm_unpackhi_epi16(t[0], t[3]);
+  u[2] = _mm_unpacklo_epi16(t[1], t[2]);
+  u[3] = _mm_unpackhi_epi16(t[1], t[2]);
+
+  v[0] = _mm_madd_epi16(u[0], k__cospi_p28_p04);
+  v[1] = _mm_madd_epi16(u[1], k__cospi_p28_p04);
+  v[2] = _mm_madd_epi16(u[2], k__cospi_p12_p20);
+  v[3] = _mm_madd_epi16(u[3], k__cospi_p12_p20);
+  v[4] = _mm_madd_epi16(u[2], k__cospi_m20_p12);
+  v[5] = _mm_madd_epi16(u[3], k__cospi_m20_p12);
+  v[6] = _mm_madd_epi16(u[0], k__cospi_m04_p28);
+  v[7] = _mm_madd_epi16(u[1], k__cospi_m04_p28);
+
+  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
+  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
+  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
+  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
+  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
+  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
+  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
+  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
+
+  v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
+  v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
+  v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
+  v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
+  v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
+  v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
+  v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
+  v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
+
+  in[2] = _mm_packs_epi32(v[0], v[1]);
+  in[6] = _mm_packs_epi32(v[4], v[5]);
+  in[10] = _mm_packs_epi32(v[2], v[3]);
+  in[14] = _mm_packs_epi32(v[6], v[7]);
+
+  // stage 2
+  u[0] = _mm_unpacklo_epi16(s[2], s[5]);
+  u[1] = _mm_unpackhi_epi16(s[2], s[5]);
+  u[2] = _mm_unpacklo_epi16(s[3], s[4]);
+  u[3] = _mm_unpackhi_epi16(s[3], s[4]);
+
+  v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16);
+  v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16);
+  v[2] = _mm_madd_epi16(u[2], k__cospi_m16_p16);
+  v[3] = _mm_madd_epi16(u[3], k__cospi_m16_p16);
+  v[4] = _mm_madd_epi16(u[2], k__cospi_p16_p16);
+  v[5] = _mm_madd_epi16(u[3], k__cospi_p16_p16);
+  v[6] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
+  v[7] = _mm_madd_epi16(u[1], k__cospi_p16_p16);
+
+  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
+  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
+  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
+  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
+  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
+  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
+  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
+  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
+
+  v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
+  v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
+  v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
+  v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
+  v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
+  v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
+  v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
+  v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
+
+  t[2] = _mm_packs_epi32(v[0], v[1]);
+  t[3] = _mm_packs_epi32(v[2], v[3]);
+  t[4] = _mm_packs_epi32(v[4], v[5]);
+  t[5] = _mm_packs_epi32(v[6], v[7]);
+
+  // stage 3
+  p[0] = _mm_add_epi16(s[0], t[3]);
+  p[1] = _mm_add_epi16(s[1], t[2]);
+  p[2] = _mm_sub_epi16(s[1], t[2]);
+  p[3] = _mm_sub_epi16(s[0], t[3]);
+  p[4] = _mm_sub_epi16(s[7], t[4]);
+  p[5] = _mm_sub_epi16(s[6], t[5]);
+  p[6] = _mm_add_epi16(s[6], t[5]);
+  p[7] = _mm_add_epi16(s[7], t[4]);
+
+  // stage 4
+  u[0] = _mm_unpacklo_epi16(p[1], p[6]);
+  u[1] = _mm_unpackhi_epi16(p[1], p[6]);
+  u[2] = _mm_unpacklo_epi16(p[2], p[5]);
+  u[3] = _mm_unpackhi_epi16(p[2], p[5]);
+
+  v[0] = _mm_madd_epi16(u[0], k__cospi_m08_p24);
+  v[1] = _mm_madd_epi16(u[1], k__cospi_m08_p24);
+  v[2] = _mm_madd_epi16(u[2], k__cospi_m24_m08);
+  v[3] = _mm_madd_epi16(u[3], k__cospi_m24_m08);
+  v[4] = _mm_madd_epi16(u[2], k__cospi_m08_p24);
+  v[5] = _mm_madd_epi16(u[3], k__cospi_m08_p24);
+  v[6] = _mm_madd_epi16(u[0], k__cospi_p24_p08);
+  v[7] = _mm_madd_epi16(u[1], k__cospi_p24_p08);
+
+  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
+  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
+  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
+  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
+  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
+  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
+  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
+  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
+
+  v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
+  v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
+  v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
+  v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
+  v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
+  v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
+  v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
+  v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
+
+  t[1] = _mm_packs_epi32(v[0], v[1]);
+  t[2] = _mm_packs_epi32(v[2], v[3]);
+  t[5] = _mm_packs_epi32(v[4], v[5]);
+  t[6] = _mm_packs_epi32(v[6], v[7]);
+
+  // stage 5
+  s[0] = _mm_add_epi16(p[0], t[1]);
+  s[1] = _mm_sub_epi16(p[0], t[1]);
+  s[2] = _mm_sub_epi16(p[3], t[2]);
+  s[3] = _mm_add_epi16(p[3], t[2]);
+  s[4] = _mm_add_epi16(p[4], t[5]);
+  s[5] = _mm_sub_epi16(p[4], t[5]);
+  s[6] = _mm_sub_epi16(p[7], t[6]);
+  s[7] = _mm_add_epi16(p[7], t[6]);
+
+  // stage 6
+  u[0] = _mm_unpacklo_epi16(s[0], s[7]);
+  u[1] = _mm_unpackhi_epi16(s[0], s[7]);
+  u[2] = _mm_unpacklo_epi16(s[1], s[6]);
+  u[3] = _mm_unpackhi_epi16(s[1], s[6]);
+  u[4] = _mm_unpacklo_epi16(s[2], s[5]);
+  u[5] = _mm_unpackhi_epi16(s[2], s[5]);
+  u[6] = _mm_unpacklo_epi16(s[3], s[4]);
+  u[7] = _mm_unpackhi_epi16(s[3], s[4]);
+
+  v[0] = _mm_madd_epi16(u[0], k__cospi_p30_p02);
+  v[1] = _mm_madd_epi16(u[1], k__cospi_p30_p02);
+  v[2] = _mm_madd_epi16(u[2], k__cospi_p14_p18);
+  v[3] = _mm_madd_epi16(u[3], k__cospi_p14_p18);
+  v[4] = _mm_madd_epi16(u[4], k__cospi_p22_p10);
+  v[5] = _mm_madd_epi16(u[5], k__cospi_p22_p10);
+  v[6] = _mm_madd_epi16(u[6], k__cospi_p06_p26);
+  v[7] = _mm_madd_epi16(u[7], k__cospi_p06_p26);
+  v[8] = _mm_madd_epi16(u[6], k__cospi_m26_p06);
+  v[9] = _mm_madd_epi16(u[7], k__cospi_m26_p06);
+  v[10] = _mm_madd_epi16(u[4], k__cospi_m10_p22);
+  v[11] = _mm_madd_epi16(u[5], k__cospi_m10_p22);
+  v[12] = _mm_madd_epi16(u[2], k__cospi_m18_p14);
+  v[13] = _mm_madd_epi16(u[3], k__cospi_m18_p14);
+  v[14] = _mm_madd_epi16(u[0], k__cospi_m02_p30);
+  v[15] = _mm_madd_epi16(u[1], k__cospi_m02_p30);
+
+  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
+  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
+  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
+  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
+  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
+  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
+  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
+  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
+  u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
+  u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
+  u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
+  u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
+  u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
+  u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
+  u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
+  u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);
+
+  v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
+  v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
+  v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
+  v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
+  v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
+  v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
+  v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
+  v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
+  v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
+  v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
+  v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
+  v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
+  v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
+  v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
+  v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
+  v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
+
+  in[1]  = _mm_packs_epi32(v[0], v[1]);
+  in[9]  = _mm_packs_epi32(v[2], v[3]);
+  in[5]  = _mm_packs_epi32(v[4], v[5]);
+  in[13] = _mm_packs_epi32(v[6], v[7]);
+  in[3]  = _mm_packs_epi32(v[8], v[9]);
+  in[11] = _mm_packs_epi32(v[10], v[11]);
+  in[7]  = _mm_packs_epi32(v[12], v[13]);
+  in[15] = _mm_packs_epi32(v[14], v[15]);
+}
+
+void fadst16_1d_8col_avx2(__m128i *in) {
+  // perform 16x16 1-D ADST for 8 columns
+  __m128i s[16], x[16], u[32], v[32];
+  const __m128i k__cospi_p01_p31 = pair_set_epi16(cospi_1_64, cospi_31_64);
+  const __m128i k__cospi_p31_m01 = pair_set_epi16(cospi_31_64, -cospi_1_64);
+  const __m128i k__cospi_p05_p27 = pair_set_epi16(cospi_5_64, cospi_27_64);
+  const __m128i k__cospi_p27_m05 = pair_set_epi16(cospi_27_64, -cospi_5_64);
+  const __m128i k__cospi_p09_p23 = pair_set_epi16(cospi_9_64, cospi_23_64);
+  const __m128i k__cospi_p23_m09 = pair_set_epi16(cospi_23_64, -cospi_9_64);
+  const __m128i k__cospi_p13_p19 = pair_set_epi16(cospi_13_64, cospi_19_64);
+  const __m128i k__cospi_p19_m13 = pair_set_epi16(cospi_19_64, -cospi_13_64);
+  const __m128i k__cospi_p17_p15 = pair_set_epi16(cospi_17_64, cospi_15_64);
+  const __m128i k__cospi_p15_m17 = pair_set_epi16(cospi_15_64, -cospi_17_64);
+  const __m128i k__cospi_p21_p11 = pair_set_epi16(cospi_21_64, cospi_11_64);
+  const __m128i k__cospi_p11_m21 = pair_set_epi16(cospi_11_64, -cospi_21_64);
+  const __m128i k__cospi_p25_p07 = pair_set_epi16(cospi_25_64, cospi_7_64);
+  const __m128i k__cospi_p07_m25 = pair_set_epi16(cospi_7_64, -cospi_25_64);
+  const __m128i k__cospi_p29_p03 = pair_set_epi16(cospi_29_64, cospi_3_64);
+  const __m128i k__cospi_p03_m29 = pair_set_epi16(cospi_3_64, -cospi_29_64);
+  const __m128i k__cospi_p04_p28 = pair_set_epi16(cospi_4_64, cospi_28_64);
+  const __m128i k__cospi_p28_m04 = pair_set_epi16(cospi_28_64, -cospi_4_64);
+  const __m128i k__cospi_p20_p12 = pair_set_epi16(cospi_20_64, cospi_12_64);
+  const __m128i k__cospi_p12_m20 = pair_set_epi16(cospi_12_64, -cospi_20_64);
+  const __m128i k__cospi_m28_p04 = pair_set_epi16(-cospi_28_64, cospi_4_64);
+  const __m128i k__cospi_m12_p20 = pair_set_epi16(-cospi_12_64, cospi_20_64);
+  const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
+  const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
+  const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64);
+  const __m128i k__cospi_m16_m16 = _mm_set1_epi16(-cospi_16_64);
+  const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
+  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+  const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64);
+  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
+  const __m128i kZero = _mm_set1_epi16(0);
+
+  u[0] = _mm_unpacklo_epi16(in[15], in[0]);
+  u[1] = _mm_unpackhi_epi16(in[15], in[0]);
+  u[2] = _mm_unpacklo_epi16(in[13], in[2]);
+  u[3] = _mm_unpackhi_epi16(in[13], in[2]);
+  u[4] = _mm_unpacklo_epi16(in[11], in[4]);
+  u[5] = _mm_unpackhi_epi16(in[11], in[4]);
+  u[6] = _mm_unpacklo_epi16(in[9], in[6]);
+  u[7] = _mm_unpackhi_epi16(in[9], in[6]);
+  u[8] = _mm_unpacklo_epi16(in[7], in[8]);
+  u[9] = _mm_unpackhi_epi16(in[7], in[8]);
+  u[10] = _mm_unpacklo_epi16(in[5], in[10]);
+  u[11] = _mm_unpackhi_epi16(in[5], in[10]);
+  u[12] = _mm_unpacklo_epi16(in[3], in[12]);
+  u[13] = _mm_unpackhi_epi16(in[3], in[12]);
+  u[14] = _mm_unpacklo_epi16(in[1], in[14]);
+  u[15] = _mm_unpackhi_epi16(in[1], in[14]);
+
+  v[0] = _mm_madd_epi16(u[0], k__cospi_p01_p31);
+  v[1] = _mm_madd_epi16(u[1], k__cospi_p01_p31);
+  v[2] = _mm_madd_epi16(u[0], k__cospi_p31_m01);
+  v[3] = _mm_madd_epi16(u[1], k__cospi_p31_m01);
+  v[4] = _mm_madd_epi16(u[2], k__cospi_p05_p27);
+  v[5] = _mm_madd_epi16(u[3], k__cospi_p05_p27);
+  v[6] = _mm_madd_epi16(u[2], k__cospi_p27_m05);
+  v[7] = _mm_madd_epi16(u[3], k__cospi_p27_m05);
+  v[8] = _mm_madd_epi16(u[4], k__cospi_p09_p23);
+  v[9] = _mm_madd_epi16(u[5], k__cospi_p09_p23);
+  v[10] = _mm_madd_epi16(u[4], k__cospi_p23_m09);
+  v[11] = _mm_madd_epi16(u[5], k__cospi_p23_m09);
+  v[12] = _mm_madd_epi16(u[6], k__cospi_p13_p19);
+  v[13] = _mm_madd_epi16(u[7], k__cospi_p13_p19);
+  v[14] = _mm_madd_epi16(u[6], k__cospi_p19_m13);
+  v[15] = _mm_madd_epi16(u[7], k__cospi_p19_m13);
+  v[16] = _mm_madd_epi16(u[8], k__cospi_p17_p15);
+  v[17] = _mm_madd_epi16(u[9], k__cospi_p17_p15);
+  v[18] = _mm_madd_epi16(u[8], k__cospi_p15_m17);
+  v[19] = _mm_madd_epi16(u[9], k__cospi_p15_m17);
+  v[20] = _mm_madd_epi16(u[10], k__cospi_p21_p11);
+  v[21] = _mm_madd_epi16(u[11], k__cospi_p21_p11);
+  v[22] = _mm_madd_epi16(u[10], k__cospi_p11_m21);
+  v[23] = _mm_madd_epi16(u[11], k__cospi_p11_m21);
+  v[24] = _mm_madd_epi16(u[12], k__cospi_p25_p07);
+  v[25] = _mm_madd_epi16(u[13], k__cospi_p25_p07);
+  v[26] = _mm_madd_epi16(u[12], k__cospi_p07_m25);
+  v[27] = _mm_madd_epi16(u[13], k__cospi_p07_m25);
+  v[28] = _mm_madd_epi16(u[14], k__cospi_p29_p03);
+  v[29] = _mm_madd_epi16(u[15], k__cospi_p29_p03);
+  v[30] = _mm_madd_epi16(u[14], k__cospi_p03_m29);
+  v[31] = _mm_madd_epi16(u[15], k__cospi_p03_m29);
+
+  u[0] = _mm_add_epi32(v[0], v[16]);
+  u[1] = _mm_add_epi32(v[1], v[17]);
+  u[2] = _mm_add_epi32(v[2], v[18]);
+  u[3] = _mm_add_epi32(v[3], v[19]);
+  u[4] = _mm_add_epi32(v[4], v[20]);
+  u[5] = _mm_add_epi32(v[5], v[21]);
+  u[6] = _mm_add_epi32(v[6], v[22]);
+  u[7] = _mm_add_epi32(v[7], v[23]);
+  u[8] = _mm_add_epi32(v[8], v[24]);
+  u[9] = _mm_add_epi32(v[9], v[25]);
+  u[10] = _mm_add_epi32(v[10], v[26]);
+  u[11] = _mm_add_epi32(v[11], v[27]);
+  u[12] = _mm_add_epi32(v[12], v[28]);
+  u[13] = _mm_add_epi32(v[13], v[29]);
+  u[14] = _mm_add_epi32(v[14], v[30]);
+  u[15] = _mm_add_epi32(v[15], v[31]);
+  u[16] = _mm_sub_epi32(v[0], v[16]);
+  u[17] = _mm_sub_epi32(v[1], v[17]);
+  u[18] = _mm_sub_epi32(v[2], v[18]);
+  u[19] = _mm_sub_epi32(v[3], v[19]);
+  u[20] = _mm_sub_epi32(v[4], v[20]);
+  u[21] = _mm_sub_epi32(v[5], v[21]);
+  u[22] = _mm_sub_epi32(v[6], v[22]);
+  u[23] = _mm_sub_epi32(v[7], v[23]);
+  u[24] = _mm_sub_epi32(v[8], v[24]);
+  u[25] = _mm_sub_epi32(v[9], v[25]);
+  u[26] = _mm_sub_epi32(v[10], v[26]);
+  u[27] = _mm_sub_epi32(v[11], v[27]);
+  u[28] = _mm_sub_epi32(v[12], v[28]);
+  u[29] = _mm_sub_epi32(v[13], v[29]);
+  u[30] = _mm_sub_epi32(v[14], v[30]);
+  u[31] = _mm_sub_epi32(v[15], v[31]);
+
+  v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
+  v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
+  v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
+  v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
+  v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
+  v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
+  v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
+  v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
+  v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
+  v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
+  v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
+  v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
+  v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
+  v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
+  v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
+  v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
+  v[16] = _mm_add_epi32(u[16], k__DCT_CONST_ROUNDING);
+  v[17] = _mm_add_epi32(u[17], k__DCT_CONST_ROUNDING);
+  v[18] = _mm_add_epi32(u[18], k__DCT_CONST_ROUNDING);
+  v[19] = _mm_add_epi32(u[19], k__DCT_CONST_ROUNDING);
+  v[20] = _mm_add_epi32(u[20], k__DCT_CONST_ROUNDING);
+  v[21] = _mm_add_epi32(u[21], k__DCT_CONST_ROUNDING);
+  v[22] = _mm_add_epi32(u[22], k__DCT_CONST_ROUNDING);
+  v[23] = _mm_add_epi32(u[23], k__DCT_CONST_ROUNDING);
+  v[24] = _mm_add_epi32(u[24], k__DCT_CONST_ROUNDING);
+  v[25] = _mm_add_epi32(u[25], k__DCT_CONST_ROUNDING);
+  v[26] = _mm_add_epi32(u[26], k__DCT_CONST_ROUNDING);
+  v[27] = _mm_add_epi32(u[27], k__DCT_CONST_ROUNDING);
+  v[28] = _mm_add_epi32(u[28], k__DCT_CONST_ROUNDING);
+  v[29] = _mm_add_epi32(u[29], k__DCT_CONST_ROUNDING);
+  v[30] = _mm_add_epi32(u[30], k__DCT_CONST_ROUNDING);
+  v[31] = _mm_add_epi32(u[31], k__DCT_CONST_ROUNDING);
+
+  u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
+  u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
+  u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
+  u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
+  u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
+  u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
+  u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
+  u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
+  u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
+  u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
+  u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
+  u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
+  u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
+  u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
+  u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
+  u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
+  u[16] = _mm_srai_epi32(v[16], DCT_CONST_BITS);
+  u[17] = _mm_srai_epi32(v[17], DCT_CONST_BITS);
+  u[18] = _mm_srai_epi32(v[18], DCT_CONST_BITS);
+  u[19] = _mm_srai_epi32(v[19], DCT_CONST_BITS);
+  u[20] = _mm_srai_epi32(v[20], DCT_CONST_BITS);
+  u[21] = _mm_srai_epi32(v[21], DCT_CONST_BITS);
+  u[22] = _mm_srai_epi32(v[22], DCT_CONST_BITS);
+  u[23] = _mm_srai_epi32(v[23], DCT_CONST_BITS);
+  u[24] = _mm_srai_epi32(v[24], DCT_CONST_BITS);
+  u[25] = _mm_srai_epi32(v[25], DCT_CONST_BITS);
+  u[26] = _mm_srai_epi32(v[26], DCT_CONST_BITS);
+  u[27] = _mm_srai_epi32(v[27], DCT_CONST_BITS);
+  u[28] = _mm_srai_epi32(v[28], DCT_CONST_BITS);
+  u[29] = _mm_srai_epi32(v[29], DCT_CONST_BITS);
+  u[30] = _mm_srai_epi32(v[30], DCT_CONST_BITS);
+  u[31] = _mm_srai_epi32(v[31], DCT_CONST_BITS);
+
+  s[0] = _mm_packs_epi32(u[0], u[1]);
+  s[1] = _mm_packs_epi32(u[2], u[3]);
+  s[2] = _mm_packs_epi32(u[4], u[5]);
+  s[3] = _mm_packs_epi32(u[6], u[7]);
+  s[4] = _mm_packs_epi32(u[8], u[9]);
+  s[5] = _mm_packs_epi32(u[10], u[11]);
+  s[6] = _mm_packs_epi32(u[12], u[13]);
+  s[7] = _mm_packs_epi32(u[14], u[15]);
+  s[8] = _mm_packs_epi32(u[16], u[17]);
+  s[9] = _mm_packs_epi32(u[18], u[19]);
+  s[10] = _mm_packs_epi32(u[20], u[21]);
+  s[11] = _mm_packs_epi32(u[22], u[23]);
+  s[12] = _mm_packs_epi32(u[24], u[25]);
+  s[13] = _mm_packs_epi32(u[26], u[27]);
+  s[14] = _mm_packs_epi32(u[28], u[29]);
+  s[15] = _mm_packs_epi32(u[30], u[31]);
+
+  // stage 2
+  u[0] = _mm_unpacklo_epi16(s[8], s[9]);
+  u[1] = _mm_unpackhi_epi16(s[8], s[9]);
+  u[2] = _mm_unpacklo_epi16(s[10], s[11]);
+  u[3] = _mm_unpackhi_epi16(s[10], s[11]);
+  u[4] = _mm_unpacklo_epi16(s[12], s[13]);
+  u[5] = _mm_unpackhi_epi16(s[12], s[13]);
+  u[6] = _mm_unpacklo_epi16(s[14], s[15]);
+  u[7] = _mm_unpackhi_epi16(s[14], s[15]);
+
+  v[0] = _mm_madd_epi16(u[0], k__cospi_p04_p28);
+  v[1] = _mm_madd_epi16(u[1], k__cospi_p04_p28);
+  v[2] = _mm_madd_epi16(u[0], k__cospi_p28_m04);
+  v[3] = _mm_madd_epi16(u[1], k__cospi_p28_m04);
+  v[4] = _mm_madd_epi16(u[2], k__cospi_p20_p12);
+  v[5] = _mm_madd_epi16(u[3], k__cospi_p20_p12);
+  v[6] = _mm_madd_epi16(u[2], k__cospi_p12_m20);
+  v[7] = _mm_madd_epi16(u[3], k__cospi_p12_m20);
+  v[8] = _mm_madd_epi16(u[4], k__cospi_m28_p04);
+  v[9] = _mm_madd_epi16(u[5], k__cospi_m28_p04);
+  v[10] = _mm_madd_epi16(u[4], k__cospi_p04_p28);
+  v[11] = _mm_madd_epi16(u[5], k__cospi_p04_p28);
+  v[12] = _mm_madd_epi16(u[6], k__cospi_m12_p20);
+  v[13] = _mm_madd_epi16(u[7], k__cospi_m12_p20);
+  v[14] = _mm_madd_epi16(u[6], k__cospi_p20_p12);
+  v[15] = _mm_madd_epi16(u[7], k__cospi_p20_p12);
+
+  u[0] = _mm_add_epi32(v[0], v[8]);
+  u[1] = _mm_add_epi32(v[1], v[9]);
+  u[2] = _mm_add_epi32(v[2], v[10]);
+  u[3] = _mm_add_epi32(v[3], v[11]);
+  u[4] = _mm_add_epi32(v[4], v[12]);
+  u[5] = _mm_add_epi32(v[5], v[13]);
+  u[6] = _mm_add_epi32(v[6], v[14]);
+  u[7] = _mm_add_epi32(v[7], v[15]);
+  u[8] = _mm_sub_epi32(v[0], v[8]);
+  u[9] = _mm_sub_epi32(v[1], v[9]);
+  u[10] = _mm_sub_epi32(v[2], v[10]);
+  u[11] = _mm_sub_epi32(v[3], v[11]);
+  u[12] = _mm_sub_epi32(v[4], v[12]);
+  u[13] = _mm_sub_epi32(v[5], v[13]);
+  u[14] = _mm_sub_epi32(v[6], v[14]);
+  u[15] = _mm_sub_epi32(v[7], v[15]);
+
+  v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
+  v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
+  v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
+  v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
+  v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
+  v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
+  v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
+  v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
+  v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
+  v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
+  v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
+  v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
+  v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
+  v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
+  v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
+  v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
+
+  u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
+  u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
+  u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
+  u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
+  u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
+  u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
+  u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
+  u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
+  u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
+  u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
+  u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
+  u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
+  u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
+  u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
+  u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
+  u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
+
+  x[0] = _mm_add_epi16(s[0], s[4]);
+  x[1] = _mm_add_epi16(s[1], s[5]);
+  x[2] = _mm_add_epi16(s[2], s[6]);
+  x[3] = _mm_add_epi16(s[3], s[7]);
+  x[4] = _mm_sub_epi16(s[0], s[4]);
+  x[5] = _mm_sub_epi16(s[1], s[5]);
+  x[6] = _mm_sub_epi16(s[2], s[6]);
+  x[7] = _mm_sub_epi16(s[3], s[7]);
+  x[8] = _mm_packs_epi32(u[0], u[1]);
+  x[9] = _mm_packs_epi32(u[2], u[3]);
+  x[10] = _mm_packs_epi32(u[4], u[5]);
+  x[11] = _mm_packs_epi32(u[6], u[7]);
+  x[12] = _mm_packs_epi32(u[8], u[9]);
+  x[13] = _mm_packs_epi32(u[10], u[11]);
+  x[14] = _mm_packs_epi32(u[12], u[13]);
+  x[15] = _mm_packs_epi32(u[14], u[15]);
+
+  // stage 3
+  u[0] = _mm_unpacklo_epi16(x[4], x[5]);
+  u[1] = _mm_unpackhi_epi16(x[4], x[5]);
+  u[2] = _mm_unpacklo_epi16(x[6], x[7]);
+  u[3] = _mm_unpackhi_epi16(x[6], x[7]);
+  u[4] = _mm_unpacklo_epi16(x[12], x[13]);
+  u[5] = _mm_unpackhi_epi16(x[12], x[13]);
+  u[6] = _mm_unpacklo_epi16(x[14], x[15]);
+  u[7] = _mm_unpackhi_epi16(x[14], x[15]);
+
+  v[0] = _mm_madd_epi16(u[0], k__cospi_p08_p24);
+  v[1] = _mm_madd_epi16(u[1], k__cospi_p08_p24);
+  v[2] = _mm_madd_epi16(u[0], k__cospi_p24_m08);
+  v[3] = _mm_madd_epi16(u[1], k__cospi_p24_m08);
+  v[4] = _mm_madd_epi16(u[2], k__cospi_m24_p08);
+  v[5] = _mm_madd_epi16(u[3], k__cospi_m24_p08);
+  v[6] = _mm_madd_epi16(u[2], k__cospi_p08_p24);
+  v[7] = _mm_madd_epi16(u[3], k__cospi_p08_p24);
+  v[8] = _mm_madd_epi16(u[4], k__cospi_p08_p24);
+  v[9] = _mm_madd_epi16(u[5], k__cospi_p08_p24);
+  v[10] = _mm_madd_epi16(u[4], k__cospi_p24_m08);
+  v[11] = _mm_madd_epi16(u[5], k__cospi_p24_m08);
+  v[12] = _mm_madd_epi16(u[6], k__cospi_m24_p08);
+  v[13] = _mm_madd_epi16(u[7], k__cospi_m24_p08);
+  v[14] = _mm_madd_epi16(u[6], k__cospi_p08_p24);
+  v[15] = _mm_madd_epi16(u[7], k__cospi_p08_p24);
+
+  u[0] = _mm_add_epi32(v[0], v[4]);
+  u[1] = _mm_add_epi32(v[1], v[5]);
+  u[2] = _mm_add_epi32(v[2], v[6]);
+  u[3] = _mm_add_epi32(v[3], v[7]);
+  u[4] = _mm_sub_epi32(v[0], v[4]);
+  u[5] = _mm_sub_epi32(v[1], v[5]);
+  u[6] = _mm_sub_epi32(v[2], v[6]);
+  u[7] = _mm_sub_epi32(v[3], v[7]);
+  u[8] = _mm_add_epi32(v[8], v[12]);
+  u[9] = _mm_add_epi32(v[9], v[13]);
+  u[10] = _mm_add_epi32(v[10], v[14]);
+  u[11] = _mm_add_epi32(v[11], v[15]);
+  u[12] = _mm_sub_epi32(v[8], v[12]);
+  u[13] = _mm_sub_epi32(v[9], v[13]);
+  u[14] = _mm_sub_epi32(v[10], v[14]);
+  u[15] = _mm_sub_epi32(v[11], v[15]);
+
+  u[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
+  u[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
+  u[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
+  u[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
+  u[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
+  u[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
+  u[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
+  u[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
+  u[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
+  u[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
+  u[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
+  u[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
+  u[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
+  u[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
+  u[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
+  u[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
+
+  v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
+  v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
+  v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
+  v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
+  v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
+  v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
+  v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
+  v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
+  v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
+  v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
+  v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
+  v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
+  v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
+  v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
+  v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
+  v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
+
+  s[0] = _mm_add_epi16(x[0], x[2]);
+  s[1] = _mm_add_epi16(x[1], x[3]);
+  s[2] = _mm_sub_epi16(x[0], x[2]);
+  s[3] = _mm_sub_epi16(x[1], x[3]);
+  s[4] = _mm_packs_epi32(v[0], v[1]);
+  s[5] = _mm_packs_epi32(v[2], v[3]);
+  s[6] = _mm_packs_epi32(v[4], v[5]);
+  s[7] = _mm_packs_epi32(v[6], v[7]);
+  s[8] = _mm_add_epi16(x[8], x[10]);
+  s[9] = _mm_add_epi16(x[9], x[11]);
+  s[10] = _mm_sub_epi16(x[8], x[10]);
+  s[11] = _mm_sub_epi16(x[9], x[11]);
+  s[12] = _mm_packs_epi32(v[8], v[9]);
+  s[13] = _mm_packs_epi32(v[10], v[11]);
+  s[14] = _mm_packs_epi32(v[12], v[13]);
+  s[15] = _mm_packs_epi32(v[14], v[15]);
+
+  // stage 4
+  u[0] = _mm_unpacklo_epi16(s[2], s[3]);
+  u[1] = _mm_unpackhi_epi16(s[2], s[3]);
+  u[2] = _mm_unpacklo_epi16(s[6], s[7]);
+  u[3] = _mm_unpackhi_epi16(s[6], s[7]);
+  u[4] = _mm_unpacklo_epi16(s[10], s[11]);
+  u[5] = _mm_unpackhi_epi16(s[10], s[11]);
+  u[6] = _mm_unpacklo_epi16(s[14], s[15]);
+  u[7] = _mm_unpackhi_epi16(s[14], s[15]);
+
+  v[0] = _mm_madd_epi16(u[0], k__cospi_m16_m16);
+  v[1] = _mm_madd_epi16(u[1], k__cospi_m16_m16);
+  v[2] = _mm_madd_epi16(u[0], k__cospi_p16_m16);
+  v[3] = _mm_madd_epi16(u[1], k__cospi_p16_m16);
+  v[4] = _mm_madd_epi16(u[2], k__cospi_p16_p16);
+  v[5] = _mm_madd_epi16(u[3], k__cospi_p16_p16);
+  v[6] = _mm_madd_epi16(u[2], k__cospi_m16_p16);
+  v[7] = _mm_madd_epi16(u[3], k__cospi_m16_p16);
+  v[8] = _mm_madd_epi16(u[4], k__cospi_p16_p16);
+  v[9] = _mm_madd_epi16(u[5], k__cospi_p16_p16);
+  v[10] = _mm_madd_epi16(u[4], k__cospi_m16_p16);
+  v[11] = _mm_madd_epi16(u[5], k__cospi_m16_p16);
+  v[12] = _mm_madd_epi16(u[6], k__cospi_m16_m16);
+  v[13] = _mm_madd_epi16(u[7], k__cospi_m16_m16);
+  v[14] = _mm_madd_epi16(u[6], k__cospi_p16_m16);
+  v[15] = _mm_madd_epi16(u[7], k__cospi_p16_m16);
+
+  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
+  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
+  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
+  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
+  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
+  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
+  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
+  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
+  u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
+  u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
+  u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
+  u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
+  u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
+  u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
+  u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
+  u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);
+
+  v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
+  v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
+  v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
+  v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
+  v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
+  v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
+  v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
+  v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
+  v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
+  v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
+  v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
+  v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
+  v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
+  v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
+  v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
+  v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
+
+  in[0] = s[0];
+  in[1] = _mm_sub_epi16(kZero, s[8]);
+  in[2] = s[12];
+  in[3] = _mm_sub_epi16(kZero, s[4]);
+  in[4] = _mm_packs_epi32(v[4], v[5]);
+  in[5] = _mm_packs_epi32(v[12], v[13]);
+  in[6] = _mm_packs_epi32(v[8], v[9]);
+  in[7] = _mm_packs_epi32(v[0], v[1]);
+  in[8] = _mm_packs_epi32(v[2], v[3]);
+  in[9] = _mm_packs_epi32(v[10], v[11]);
+  in[10] = _mm_packs_epi32(v[14], v[15]);
+  in[11] = _mm_packs_epi32(v[6], v[7]);
+  in[12] = s[5];
+  in[13] = _mm_sub_epi16(kZero, s[13]);
+  in[14] = s[9];
+  in[15] = _mm_sub_epi16(kZero, s[1]);
+}
+
+void fdct16_1d_avx2(__m128i *in0, __m128i *in1) {
+  fdct16_1d_8col_avx2(in0);
+  fdct16_1d_8col_avx2(in1);
+  array_transpose_16x16_avx2(in0, in1);
+}
+
+void fadst16_1d_avx2(__m128i *in0, __m128i *in1) {
+  fadst16_1d_8col_avx2(in0);
+  fadst16_1d_8col_avx2(in1);
+  array_transpose_16x16_avx2(in0, in1);
+}
+
+void vp9_short_fht16x16_avx2(const int16_t *input, int16_t *output,
+                             int stride, int tx_type) {
+  __m128i in0[16], in1[16];
+  load_buffer_16x16_avx2(input, in0, in1, stride);
+  switch (tx_type) {
+    case 0:  // DCT_DCT
+      fdct16_1d_avx2(in0, in1);
+      right_shift_16x16_avx2(in0, in1);
+      fdct16_1d_avx2(in0, in1);
+      break;
+    case 1:  // ADST_DCT
+      fadst16_1d_avx2(in0, in1);
+      right_shift_16x16_avx2(in0, in1);
+      fdct16_1d_avx2(in0, in1);
+      break;
+    case 2:  // DCT_ADST
+      fdct16_1d_avx2(in0, in1);
+      right_shift_16x16_avx2(in0, in1);
+      fadst16_1d_avx2(in0, in1);
+      break;
+    case 3:  // ADST_ADST
+      fadst16_1d_avx2(in0, in1);
+      right_shift_16x16_avx2(in0, in1);
+      fadst16_1d_avx2(in0, in1);
+      break;
+    default:
+      assert(0);
+      break;
+  }
+  write_buffer_16x16_avx2(output, in0, in1, 16);
+}
+
+#define FDCT32x32_2D_AVX2 vp9_fdct32x32_rd_avx2
+#define FDCT32x32_HIGH_PRECISION 0
+#include "vp9/encoder/x86/vp9_dct32x32_avx2.c"
+#undef  FDCT32x32_2D_AVX2
+#undef  FDCT32x32_HIGH_PRECISION
+
+#define FDCT32x32_2D_AVX2 vp9_fdct32x32_avx2
+#define FDCT32x32_HIGH_PRECISION 1
+#include "vp9/encoder/x86/vp9_dct32x32_avx2.c" // NOLINT
+#undef  FDCT32x32_2D_AVX2
+#undef  FDCT32x32_HIGH_PRECISION

diff --git a/vp9/vp9cx.mk b/vp9/vp9cx.mk
index 7472784..ce83a67 100644
--- a/vp9/vp9cx.mk
+++ b/vp9/vp9cx.mk

@@ -104,4 +104,7 @@
 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_dct_sse2.c
 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_dct32x32_sse2.c
 
+VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_dct_avx2.c
+VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_dct32x32_avx2.c
+
 VP9_CX_SRCS-yes := $(filter-out $(VP9_CX_SRCS_REMOVE-yes),$(VP9_CX_SRCS-yes))