Merge "SSE2 speed up of  h_predictor_4x4"
diff --git a/build/make/configure.sh b/build/make/configure.sh
index 37ed86f..98248b0 100644
--- a/build/make/configure.sh
+++ b/build/make/configure.sh
@@ -1208,14 +1208,20 @@
       soft_enable runtime_cpu_detect
       # We can't use 'check_cflags' until the compiler is configured and CC is
       # populated.
-      check_gcc_machine_option mmx
-      check_gcc_machine_option sse
-      check_gcc_machine_option sse2
-      check_gcc_machine_option sse3
-      check_gcc_machine_option ssse3
-      check_gcc_machine_option sse4 sse4_1
-      check_gcc_machine_option avx
-      check_gcc_machine_option avx2
+      for ext in ${ARCH_EXT_LIST_X86}; do
+        # disable higher order extensions to simplify asm dependencies
+        if [ "$disable_exts" = "yes" ]; then
+          if ! disabled $ext; then
+            RTCD_OPTIONS="${RTCD_OPTIONS}--disable-${ext} "
+            disable_feature $ext
+          fi
+        elif disabled $ext; then
+          disable_exts="yes"
+        else
+          # use the shortened version for the flag: sse4_1 -> sse4
+          check_gcc_machine_option ${ext%_*} $ext
+        fi
+      done
 
       if enabled external_build; then
         log_echo "  skipping assembler detection"
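
For reference, the loop above walks ARCH_EXT_LIST_X86 in order from oldest to newest extension; the first entry found disabled latches disable_exts, so every later extension is also disabled and forwarded to rtcd via --disable-<ext>. A C model of that latch (illustrative only; the real logic is the shell loop above):

/* Once one entry in the ordered extension list is disabled, every later
 * one is disabled too, keeping assembly inter-dependencies consistent. */
static void propagate_disables(int enabled[], int n) {
  int latch = 0, i;
  for (i = 0; i < n; ++i) {
    if (latch)
      enabled[i] = 0;      /* mirrors disable_feature $ext */
    else if (!enabled[i])
      latch = 1;           /* mirrors disable_exts="yes" */
  }
}
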
diff --git a/configure b/configure
index 24992c4..f12779c 100755
--- a/configure
+++ b/configure
@@ -234,6 +234,16 @@
     x86
     x86_64
 "
+ARCH_EXT_LIST_X86="
+    mmx
+    sse
+    sse2
+    sse3
+    ssse3
+    sse4_1
+    avx
+    avx2
+"
 ARCH_EXT_LIST="
     edsp
     media
@@ -245,14 +255,7 @@
     msa
     mips64
 
-    mmx
-    sse
-    sse2
-    sse3
-    ssse3
-    sse4_1
-    avx
-    avx2
+    ${ARCH_EXT_LIST_X86}
 "
 HAVE_LIST="
     ${ARCH_EXT_LIST}
diff --git a/test/vp9_avg_test.cc b/test/vp9_avg_test.cc
index d383131..290bdc7 100644
--- a/test/vp9_avg_test.cc
+++ b/test/vp9_avg_test.cc
@@ -194,6 +194,48 @@
   int16_t sum_c_;
 };
 
+typedef int (*SatdFunc)(const int16_t *coeffs, int length);
+typedef std::tr1::tuple<int, SatdFunc> SatdTestParam;
+
+class SatdTest
+    : public ::testing::Test,
+      public ::testing::WithParamInterface<SatdTestParam> {
+ protected:
+  virtual void SetUp() {
+    satd_size_ = GET_PARAM(0);
+    satd_func_ = GET_PARAM(1);
+    rnd_.Reset(ACMRandom::DeterministicSeed());
+    src_ = reinterpret_cast<int16_t*>(
+        vpx_memalign(16, sizeof(*src_) * satd_size_));
+    ASSERT_TRUE(src_ != NULL);
+  }
+
+  virtual void TearDown() {
+    libvpx_test::ClearSystemState();
+    vpx_free(src_);
+  }
+
+  void FillConstant(const int16_t val) {
+    for (int i = 0; i < satd_size_; ++i) src_[i] = val;
+  }
+
+  void FillRandom() {
+    for (int i = 0; i < satd_size_; ++i) src_[i] = rnd_.Rand16();
+  }
+
+  void Check(const int expected) {
+    int total;
+    ASM_REGISTER_STATE_CHECK(total = satd_func_(src_, satd_size_));
+    EXPECT_EQ(expected, total);
+  }
+
+  int satd_size_;
+
+ private:
+  int16_t *src_;
+  SatdFunc satd_func_;
+  ACMRandom rnd_;
+};
 
 uint8_t* AverageTestBase::source_data_ = NULL;
 
@@ -246,6 +288,36 @@
   RunComparison();
 }
 
+
+TEST_P(SatdTest, MinValue) {
+  const int kMin = -32640;
+  const int expected = -kMin * satd_size_;
+  FillConstant(kMin);
+  Check(expected);
+}
+
+TEST_P(SatdTest, MaxValue) {
+  const int kMax = 32640;
+  const int expected = kMax * satd_size_;
+  FillConstant(kMax);
+  Check(expected);
+}
+
+TEST_P(SatdTest, Random) {
+  int expected;
+  switch (satd_size_) {
+    case 16: expected = 205298; break;
+    case 64: expected = 1113950; break;
+    case 256: expected = 4268415; break;
+    case 1024: expected = 16954082; break;
+    default:
+      FAIL() << "Invalid satd size (" << satd_size_
+             << ") valid: 16/64/256/1024";
+  }
+  FillRandom();
+  Check(expected);
+}
+
 using std::tr1::make_tuple;
 
 INSTANTIATE_TEST_CASE_P(
@@ -254,6 +326,14 @@
         make_tuple(16, 16, 1, 8, &vp9_avg_8x8_c),
         make_tuple(16, 16, 1, 4, &vp9_avg_4x4_c)));
 
+INSTANTIATE_TEST_CASE_P(
+    C, SatdTest,
+    ::testing::Values(
+        make_tuple(16, &vp9_satd_c),
+        make_tuple(64, &vp9_satd_c),
+        make_tuple(256, &vp9_satd_c),
+        make_tuple(1024, &vp9_satd_c)));
+
 #if HAVE_SSE2
 INSTANTIATE_TEST_CASE_P(
     SSE2, AverageTest,
@@ -276,6 +356,14 @@
         make_tuple(16, &vp9_int_pro_col_sse2, &vp9_int_pro_col_c),
         make_tuple(32, &vp9_int_pro_col_sse2, &vp9_int_pro_col_c),
         make_tuple(64, &vp9_int_pro_col_sse2, &vp9_int_pro_col_c)));
+
+INSTANTIATE_TEST_CASE_P(
+    SSE2, SatdTest,
+    ::testing::Values(
+        make_tuple(16, &vp9_satd_sse2),
+        make_tuple(64, &vp9_satd_sse2),
+        make_tuple(256, &vp9_satd_sse2),
+        make_tuple(1024, &vp9_satd_sse2)));
 #endif
 
 #if HAVE_NEON
@@ -297,6 +385,14 @@
         make_tuple(16, &vp9_int_pro_col_neon, &vp9_int_pro_col_c),
         make_tuple(32, &vp9_int_pro_col_neon, &vp9_int_pro_col_c),
         make_tuple(64, &vp9_int_pro_col_neon, &vp9_int_pro_col_c)));
+
+INSTANTIATE_TEST_CASE_P(
+    NEON, SatdTest,
+    ::testing::Values(
+        make_tuple(16, &vp9_satd_neon),
+        make_tuple(64, &vp9_satd_neon),
+        make_tuple(256, &vp9_satd_neon),
+        make_tuple(1024, &vp9_satd_neon)));
 #endif
 
 #if HAVE_MSA
diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl
index 890b638..8fe6503 100644
--- a/vp9/common/vp9_rtcd_defs.pl
+++ b/vp9/common/vp9_rtcd_defs.pl
@@ -209,8 +209,8 @@
 add_proto qw/void vp9_hadamard_16x16/, "int16_t const *src_diff, int src_stride, int16_t *coeff";
 specialize qw/vp9_hadamard_16x16 sse2/;
 
-add_proto qw/int16_t vp9_satd/, "const int16_t *coeff, int length";
-specialize qw/vp9_satd sse2/;
+add_proto qw/int vp9_satd/, "const int16_t *coeff, int length";
+specialize qw/vp9_satd sse2 neon/;
 
 add_proto qw/void vp9_int_pro_row/, "int16_t *hbuf, uint8_t const *ref, const int ref_stride, const int height";
 specialize qw/vp9_int_pro_row sse2 neon/;
diff --git a/vp9/encoder/arm/neon/vp9_avg_neon.c b/vp9/encoder/arm/neon/vp9_avg_neon.c
index d569ec9..5996bd4 100644
--- a/vp9/encoder/arm/neon/vp9_avg_neon.c
+++ b/vp9/encoder/arm/neon/vp9_avg_neon.c
@@ -50,6 +50,33 @@
   return (horizontal_add_u16x8(v_sum) + 32) >> 6;
 }
 
+// coeff: 16 bits, dynamic range [-32640, 32640].
+// length: value range {16, 64, 256, 1024}.
+int vp9_satd_neon(const int16_t *coeff, int length) {
+  const int16x4_t zero = vdup_n_s16(0);
+  int32x4_t accum = vdupq_n_s32(0);
+
+  do {
+    const int16x8_t src0 = vld1q_s16(coeff);
+    const int16x8_t src8 = vld1q_s16(coeff + 8);
+    accum = vabal_s16(accum, vget_low_s16(src0), zero);
+    accum = vabal_s16(accum, vget_high_s16(src0), zero);
+    accum = vabal_s16(accum, vget_low_s16(src8), zero);
+    accum = vabal_s16(accum, vget_high_s16(src8), zero);
+    length -= 16;
+    coeff += 16;
+  } while (length != 0);
+
+  {
+    // satd: 26 bits, dynamic range [0, 32640 * 1024] (sum of absolute values)
+    const int64x2_t s0 = vpaddlq_s32(accum);  // cascading summation of 'accum'.
+    const int32x2_t s1 = vadd_s32(vreinterpret_s32_s64(vget_low_s64(s0)),
+                                  vreinterpret_s32_s64(vget_high_s64(s0)));
+    const int satd = vget_lane_s32(s1, 0);
+    return satd;
+  }
+}
+
 void vp9_int_pro_row_neon(int16_t hbuf[16], uint8_t const *ref,
                           const int ref_stride, const int height) {
   int i;
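
For reference, vp9_satd_neon keeps four 32-bit lane accumulators (vabal_s16 widens |x - 0| into int32 lanes) and folds them once at the end with vpaddlq_s32 plus vadd_s32. A scalar model of the lane arithmetic, assuming the 26-bit bound noted in the comments:

#include <stdint.h>
#include <stdlib.h>

/* Scalar model of the NEON reduction: element i lands in lane i % 4, and
 * the four lanes are folded pairwise at the end. The total stays within
 * 26 bits (32640 * 1024), so 32-bit lanes cannot overflow. */
static int satd_model(const int16_t *coeff, int length) {
  int32_t lane[4] = { 0, 0, 0, 0 };
  int i;
  for (i = 0; i < length; ++i)
    lane[i & 3] += abs(coeff[i]);    /* vabal_s16 against a zero vector */
  return (lane[0] + lane[1]) + (lane[2] + lane[3]);  /* vpaddlq + vadd */
}
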
diff --git a/vp9/encoder/vp9_avg.c b/vp9/encoder/vp9_avg.c
index a9a4c30..7baa09a 100644
--- a/vp9/encoder/vp9_avg.c
+++ b/vp9/encoder/vp9_avg.c
@@ -117,14 +117,14 @@
 
 // coeff: 16 bits, dynamic range [-32640, 32640].
 // length: value range {16, 64, 256, 1024}.
-int16_t vp9_satd_c(const int16_t *coeff, int length) {
+int vp9_satd_c(const int16_t *coeff, int length) {
   int i;
   int satd = 0;
   for (i = 0; i < length; ++i)
     satd += abs(coeff[i]);
 
   // satd: 26 bits, dynamic range [0, 32640 * 1024] (sum of absolute values)
-  return (int16_t)satd;
+  return satd;
 }
 
 // Integer projection onto row vectors.
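
The widening of the return type above is not cosmetic: with length up to 1024 the worst-case sum is 32640 * 1024 = 33423360, which needs 26 bits, so the old (int16_t) cast silently truncated. A standalone check (illustrative):

#include <assert.h>
#include <stdint.h>

int main(void) {
  const int64_t max_sum = (int64_t)32640 * 1024;  /* worst case for satd */
  assert(max_sum == 33423360);                    /* 26 bits */
  assert(max_sum > INT16_MAX);                    /* old cast truncated */
  return 0;
}
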
diff --git a/vp9/encoder/vp9_denoiser.c b/vp9/encoder/vp9_denoiser.c
index e87a12e..fc76c11 100644
--- a/vp9/encoder/vp9_denoiser.c
+++ b/vp9/encoder/vp9_denoiser.c
@@ -316,7 +316,8 @@
 
 void vp9_denoiser_denoise(VP9_DENOISER *denoiser, MACROBLOCK *mb,
                           int mi_row, int mi_col, BLOCK_SIZE bs,
-                          PICK_MODE_CONTEXT *ctx) {
+                          PICK_MODE_CONTEXT *ctx,
+                          VP9_DENOISER_DECISION *denoiser_decision) {
   int mv_col, mv_row;
   int motion_magnitude = 0;
   VP9_DENOISER_DECISION decision = COPY_BLOCK;
@@ -380,6 +381,7 @@
                       num_4x4_blocks_wide_lookup[bs] << 2,
                       num_4x4_blocks_high_lookup[bs] << 2);
   }
+  *denoiser_decision = decision;
 }
 
 static void copy_frame(YV12_BUFFER_CONFIG * const dest,
@@ -458,6 +460,7 @@
 void vp9_denoiser_reset_frame_stats(PICK_MODE_CONTEXT *ctx) {
   ctx->zeromv_sse = UINT_MAX;
   ctx->newmv_sse = UINT_MAX;
+  ctx->zeromv_lastref_sse = UINT_MAX;
 }
 
 void vp9_denoiser_update_frame_stats(MB_MODE_INFO *mbmi, unsigned int sse,
diff --git a/vp9/encoder/vp9_denoiser.h b/vp9/encoder/vp9_denoiser.h
index bc676e9..c8c9352 100644
--- a/vp9/encoder/vp9_denoiser.h
+++ b/vp9/encoder/vp9_denoiser.h
@@ -54,7 +54,8 @@
 
 void vp9_denoiser_denoise(VP9_DENOISER *denoiser, MACROBLOCK *mb,
                           int mi_row, int mi_col, BLOCK_SIZE bs,
-                          PICK_MODE_CONTEXT *ctx);
+                          PICK_MODE_CONTEXT *ctx,
+                          VP9_DENOISER_DECISION *denoiser_decision);
 
 void vp9_denoiser_reset_frame_stats(PICK_MODE_CONTEXT *ctx);
 
diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c
index f9c28f6..e9b62bb 100644
--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c
@@ -843,7 +843,8 @@
           force_split[split_index] = 1;
           force_split[i + 1] = 1;
           force_split[0] = 1;
-        } else if (vt.split[i].split[j].part_variances.none.variance >
+        } else if (cpi->oxcf.speed < 8 &&
+                   vt.split[i].split[j].part_variances.none.variance >
                    thresholds[1] &&
                    !cyclic_refresh_segment_id_boosted(segment_id)) {
           // We have some nominal amount of 16x16 variance (based on average),
@@ -1746,16 +1747,6 @@
   set_offsets(cpi, tile, x, mi_row, mi_col, bsize);
   update_state_rt(cpi, td, ctx, mi_row, mi_col, bsize);
 
-#if CONFIG_VP9_TEMPORAL_DENOISING
-  if (cpi->oxcf.noise_sensitivity > 0 &&
-      output_enabled &&
-      cpi->common.frame_type != KEY_FRAME &&
-      cpi->resize_pending == 0) {
-    vp9_denoiser_denoise(&cpi->denoiser, x, mi_row, mi_col,
-                         VPXMAX(BLOCK_8X8, bsize), ctx);
-  }
-#endif
-
   encode_superblock(cpi, td, tp, output_enabled, mi_row, mi_col, bsize, ctx);
   update_stats(&cpi->common, td);
 
@@ -2432,8 +2423,15 @@
 
   if (cpi->sf.use_square_partition_only &&
       bsize > cpi->sf.use_square_only_threshold) {
+    if (cpi->use_svc) {
+      if (!vp9_active_h_edge(cpi, mi_row, mi_step) || x->e_mbd.lossless)
+        partition_horz_allowed &= force_horz_split;
+      if (!vp9_active_v_edge(cpi, mi_col, mi_step) || x->e_mbd.lossless)
+        partition_vert_allowed &= force_vert_split;
+    } else {
       partition_horz_allowed &= force_horz_split;
       partition_vert_allowed &= force_vert_split;
+    }
   }
 
   save_context(x, mi_row, mi_col, a, l, sa, sl, bsize);
diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c
index a57cf87..72fa828 100644
--- a/vp9/encoder/vp9_encoder.c
+++ b/vp9/encoder/vp9_encoder.c
@@ -3682,12 +3682,16 @@
   if (cm->mi_cols * MI_SIZE != unscaled->y_width ||
       cm->mi_rows * MI_SIZE != unscaled->y_height) {
 #if CONFIG_VP9_HIGHBITDEPTH
-    if (use_normative_scaler)
+    if (use_normative_scaler &&
+        unscaled->y_width <= (scaled->y_width << 1) &&
+        unscaled->y_height <= (scaled->y_height << 1))
       scale_and_extend_frame(unscaled, scaled, (int)cm->bit_depth);
     else
       scale_and_extend_frame_nonnormative(unscaled, scaled, (int)cm->bit_depth);
 #else
-    if (use_normative_scaler)
+    if (use_normative_scaler &&
+        unscaled->y_width <= (scaled->y_width << 1) &&
+        unscaled->y_height <= (scaled->y_height << 1))
       scale_and_extend_frame(unscaled, scaled);
     else
       scale_and_extend_frame_nonnormative(unscaled, scaled);
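
The added condition limits the normative scaler to at most a 2x downscale in each dimension; anything larger falls back to the non-normative path. As a predicate (a sketch restating the guard above):

/* Use the normative scaler only when downscaling by at most 2x in each
 * dimension; otherwise take the non-normative fallback. */
static int can_use_normative_scaler(int src_w, int src_h,
                                    int dst_w, int dst_h) {
  return src_w <= (dst_w << 1) && src_h <= (dst_h << 1);
}
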
diff --git a/vp9/encoder/vp9_noise_estimate.c b/vp9/encoder/vp9_noise_estimate.c
index b41ffd0..b26f6f2 100644
--- a/vp9/encoder/vp9_noise_estimate.c
+++ b/vp9/encoder/vp9_noise_estimate.c
@@ -25,7 +25,7 @@
                              int width,
                              int height) {
   ne->enabled = 0;
-  ne->level = kLow;
+  ne->level = kLowLow;
   ne->value = 0;
   ne->count = 0;
   ne->thresh = 90;
@@ -220,22 +220,25 @@
         // Reset counter and check noise level condition.
         ne->num_frames_estimate = 30;
         ne->count = 0;
-        if (ne->value > (ne->thresh << 1))
+        if (ne->value > (ne->thresh << 1)) {
           ne->level = kHigh;
-        else
+        } else {
           if (ne->value > ne->thresh)
             ne->level = kMedium;
           else if (ne->value > (ne->thresh >> 1))
             ne->level = kLow;
           else
             ne->level = kLowLow;
+        }
+#if CONFIG_VP9_TEMPORAL_DENOISING
+        if (cpi->oxcf.noise_sensitivity > 0)
+          vp9_denoiser_set_noise_level(&cpi->denoiser, ne->level);
+#endif
       }
     }
   }
 #if CONFIG_VP9_TEMPORAL_DENOISING
-  if (cpi->oxcf.noise_sensitivity > 0) {
+  if (cpi->oxcf.noise_sensitivity > 0)
     copy_frame(&cpi->denoiser.last_source, cpi->Source);
-    vp9_denoiser_set_noise_level(&cpi->denoiser, ne->level);
-  }
 #endif
 }
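
The classification above maps the running noise value against fixed fractions of ne->thresh (initialized to 90). A minimal model of the mapping, with a local enum standing in for the real NOISE_LEVEL type:

/* Assuming thresh = 90: value > 180 -> kHigh, > 90 -> kMedium,
 * > 45 -> kLow, else kLowLow. */
typedef enum { kLowLow, kLow, kMedium, kHigh } NOISE_LEVEL_MODEL;

static NOISE_LEVEL_MODEL classify_noise(int value, int thresh) {
  if (value > (thresh << 1)) return kHigh;
  if (value > thresh) return kMedium;
  if (value > (thresh >> 1)) return kLow;
  return kLowLow;
}
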
diff --git a/vp9/encoder/vp9_pickmode.c b/vp9/encoder/vp9_pickmode.c
index 8aafae1..095847a 100644
--- a/vp9/encoder/vp9_pickmode.c
+++ b/vp9/encoder/vp9_pickmode.c
@@ -673,7 +673,7 @@
         if (*eob == 1)
           *rate += (int)abs(qcoeff[0]);
         else if (*eob > 1)
-          *rate += (int)vp9_satd((const int16_t *)qcoeff, step << 4);
+          *rate += vp9_satd((const int16_t *)qcoeff, step << 4);
 
         *dist += vp9_block_error_fp(coeff, dqcoeff, step << 4) >> shift;
       }
@@ -1143,6 +1143,9 @@
   int best_pred_sad = INT_MAX;
   int best_early_term = 0;
   int ref_frame_cost[MAX_REF_FRAMES];
+#if CONFIG_VP9_TEMPORAL_DENOISING
+  int64_t zero_last_cost_orig = INT64_MAX;
+#endif
 
   init_ref_frame_cost(cm, xd, ref_frame_cost);
 
@@ -1524,8 +1527,12 @@
     }
 
 #if CONFIG_VP9_TEMPORAL_DENOISING
-    if (cpi->oxcf.noise_sensitivity > 0)
+    if (cpi->oxcf.noise_sensitivity > 0) {
       vp9_denoiser_update_frame_stats(mbmi, sse_y, this_mode, ctx);
+      // Keep track of zero_last cost.
+      if (ref_frame == LAST_FRAME && frame_mv[this_mode][ref_frame].as_int == 0)
+        zero_last_cost_orig = this_rdc.rdcost;
+    }
 #else
     (void)ctx;
 #endif
@@ -1683,6 +1690,54 @@
     }
   }
 
+#if CONFIG_VP9_TEMPORAL_DENOISING
+  if (cpi->oxcf.noise_sensitivity > 0 &&
+      cpi->resize_pending == 0) {
+    VP9_DENOISER_DECISION decision = COPY_BLOCK;
+    vp9_denoiser_denoise(&cpi->denoiser, x, mi_row, mi_col,
+                         VPXMAX(BLOCK_8X8, bsize), ctx, &decision);
+    // If INTRA mode was selected, re-evaluate ZEROMV on denoised result.
+    // Only do this under noise conditions, and if rdcost of ZEROMV on
+    // original source is not significantly higher than rdcost of INTRA MODE.
+    if (best_ref_frame == INTRA_FRAME &&
+        decision == FILTER_BLOCK &&
+        cpi->noise_estimate.enabled &&
+        cpi->noise_estimate.level > kLow &&
+        zero_last_cost_orig < (best_rdc.rdcost << 2)) {
+      // Check if we should pick ZEROMV on denoised signal.
+      int rate = 0;
+      int64_t dist = 0;
+      mbmi->mode = ZEROMV;
+      mbmi->ref_frame[0] = LAST_FRAME;
+      mbmi->ref_frame[1] = NONE;
+      mbmi->mv[0].as_int = 0;
+      mbmi->interp_filter = EIGHTTAP;
+      xd->plane[0].pre[0] = yv12_mb[LAST_FRAME][0];
+      vp9_build_inter_predictors_sby(xd, mi_row, mi_col, bsize);
+      model_rd_for_sb_y(cpi, bsize, x, xd, &rate, &dist, &var_y, &sse_y);
+      this_rdc.rate = rate + ref_frame_cost[LAST_FRAME] +
+          cpi->inter_mode_cost[x->mbmi_ext->mode_context[LAST_FRAME]]
+                              [INTER_OFFSET(ZEROMV)];
+      this_rdc.dist = dist;
+      this_rdc.rdcost = RDCOST(x->rdmult, x->rddiv, rate, dist);
+      // Switch to ZEROMV if the rdcost for ZEROMV on denoised source
+      // is lower than INTRA (on original source).
+      if (this_rdc.rdcost > best_rdc.rdcost) {
+        this_rdc = best_rdc;
+        mbmi->mode = best_mode;
+        mbmi->ref_frame[0] = best_ref_frame;
+        mbmi->mv[0].as_int = INVALID_MV;
+        mbmi->interp_filter = best_pred_filter;
+        mbmi->tx_size = best_tx_size;
+        x->skip_txfm[0] = best_mode_skip_txfm;
+      } else {
+        best_ref_frame = LAST_FRAME;
+        best_rdc = this_rdc;
+      }
+    }
+  }
+#endif
+
   if (cpi->sf.adaptive_rd_thresh) {
     THR_MODES best_mode_idx = mode_idx[best_ref_frame][mode_offset(mbmi->mode)];
 
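
The ZEROMV re-evaluation above hinges on RDCOST comparisons between the denoised ZEROMV candidate and the intra winner. For orientation, a sketch of the usual VP9 Lagrangian form (the authoritative macro lives in vp9/encoder/vp9_rd.h; constants here are illustrative):

#include <stdint.h>

/* rate is scaled by the Q8 multiplier rdmult, distortion by rddiv;
 * lower cost wins. */
static int64_t rd_cost_sketch(int rdmult, int rddiv, int rate, int64_t dist) {
  return ((128 + (int64_t)rate * rdmult) >> 8) + (dist << rddiv);
}
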
diff --git a/vp9/encoder/vp9_svc_layercontext.c b/vp9/encoder/vp9_svc_layercontext.c
index 13da155..b0617c1 100644
--- a/vp9/encoder/vp9_svc_layercontext.c
+++ b/vp9/encoder/vp9_svc_layercontext.c
@@ -279,7 +279,7 @@
   // Reset the frames_since_key and frames_to_key counters to their values
   // before the layer restore. Keep these defined for the stream (not layer).
   if (cpi->svc.number_temporal_layers > 1 ||
-      cpi->svc.number_spatial_layers > 1) {
+      (cpi->svc.number_spatial_layers > 1 && !is_two_pass_svc(cpi))) {
     cpi->rc.frames_since_key = old_frame_since_key;
     cpi->rc.frames_to_key = old_frame_to_key;
   }
diff --git a/vp9/encoder/x86/vp9_avg_intrin_sse2.c b/vp9/encoder/x86/vp9_avg_intrin_sse2.c
index 4531d79..4414871 100644
--- a/vp9/encoder/x86/vp9_avg_intrin_sse2.c
+++ b/vp9/encoder/x86/vp9_avg_intrin_sse2.c
@@ -283,31 +283,30 @@
   }
 }
 
-int16_t vp9_satd_sse2(const int16_t *coeff, int length) {
+int vp9_satd_sse2(const int16_t *coeff, int length) {
   int i;
-  __m128i sum = _mm_load_si128((const __m128i *)coeff);
-  __m128i sign = _mm_srai_epi16(sum, 15);
-  __m128i val = _mm_xor_si128(sum, sign);
-  sum = _mm_sub_epi16(val, sign);
-  coeff += 8;
+  const __m128i zero = _mm_setzero_si128();
+  __m128i accum = zero;
 
-  for (i = 8; i < length; i += 8) {
-    __m128i src_line = _mm_load_si128((const __m128i *)coeff);
-    sign = _mm_srai_epi16(src_line, 15);
-    val = _mm_xor_si128(src_line, sign);
-    val = _mm_sub_epi16(val, sign);
-    sum = _mm_add_epi16(sum, val);
+  for (i = 0; i < length; i += 8) {
+    const __m128i src_line = _mm_load_si128((const __m128i *)coeff);
+    const __m128i inv = _mm_sub_epi16(zero, src_line);
+    const __m128i abs = _mm_max_epi16(src_line, inv);  // abs(src_line)
+    const __m128i abs_lo = _mm_unpacklo_epi16(abs, zero);
+    const __m128i abs_hi = _mm_unpackhi_epi16(abs, zero);
+    const __m128i sum = _mm_add_epi32(abs_lo, abs_hi);
+    accum = _mm_add_epi32(accum, sum);
     coeff += 8;
   }
 
-  val = _mm_srli_si128(sum, 8);
-  sum = _mm_add_epi16(sum, val);
-  val = _mm_srli_epi64(sum, 32);
-  sum = _mm_add_epi16(sum, val);
-  val = _mm_srli_epi32(sum, 16);
-  sum = _mm_add_epi16(sum, val);
+  {  // cascading summation of accum
+    __m128i hi = _mm_srli_si128(accum, 8);
+    accum = _mm_add_epi32(accum, hi);
+    hi = _mm_srli_epi64(accum, 32);
+    accum = _mm_add_epi32(accum, hi);
+  }
 
-  return _mm_extract_epi16(sum, 0);
+  return _mm_cvtsi128_si32(accum);
 }
 
 void vp9_int_pro_row_sse2(int16_t *hbuf, uint8_t const*ref,
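
Two notes on the SSE2 rewrite: abs is computed as max(x, 0 - x), which is exact for every int16 value except -32768, a value the +/-32640 coeff bound rules out; and the accumulator is widened to 32-bit lanes before summing, which is what fixes the 16-bit overflow the old epi16 accumulation had for large length. A scalar model of the abs idiom:

#include <stdint.h>

/* Model of _mm_max_epi16(x, _mm_sub_epi16(zero, x)): equals |x| for all
 * int16 inputs except -32768, which cannot occur here (|coeff| <= 32640). */
static int16_t abs16_model(int16_t x) {
  const int16_t neg = (int16_t)(0 - x);
  return (x > neg) ? x : neg;
}
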
diff --git a/vpx/src/svc_encodeframe.c b/vpx/src/svc_encodeframe.c
index 68d1d8d..5c3fe93 100644
--- a/vpx/src/svc_encodeframe.c
+++ b/vpx/src/svc_encodeframe.c
@@ -423,13 +423,15 @@
     svc_ctx->temporal_layers = 2;
   }
 
+  for (sl = 0; sl < VPX_SS_MAX_LAYERS; ++sl) {
+    si->svc_params.scaling_factor_num[sl] = DEFAULT_SCALE_FACTORS_NUM[sl];
+    si->svc_params.scaling_factor_den[sl] = DEFAULT_SCALE_FACTORS_DEN[sl];
+  }
   for (tl = 0; tl < svc_ctx->temporal_layers; ++tl) {
     for (sl = 0; sl < svc_ctx->spatial_layers; ++sl) {
       i = sl * svc_ctx->temporal_layers + tl;
       si->svc_params.max_quantizers[i] = MAX_QUANTIZER;
       si->svc_params.min_quantizers[i] = 0;
-      si->svc_params.scaling_factor_num[sl] = DEFAULT_SCALE_FACTORS_NUM[sl];
-      si->svc_params.scaling_factor_den[sl] = DEFAULT_SCALE_FACTORS_DEN[sl];
     }
   }
 
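
Moving the scaling-factor defaults out of the quantizer loop means every slot up to VPX_SS_MAX_LAYERS gets a default, not just the sl < spatial_layers entries the old placement covered. A sketch of the new initialization shape (MAX_LAYERS here is a stand-in for VPX_SS_MAX_LAYERS):

#define MAX_LAYERS 5  /* stand-in for VPX_SS_MAX_LAYERS */

static void set_scale_defaults(int num[], int den[], const int dnum[],
                               const int dden[]) {
  int sl;
  for (sl = 0; sl < MAX_LAYERS; ++sl) {  /* all slots, not just configured */
    num[sl] = dnum[sl];
    den[sl] = dden[sl];
  }
}
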
diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl
index d11b327..1352c34 100644
--- a/vpx_dsp/vpx_dsp_rtcd_defs.pl
+++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl
@@ -890,7 +890,7 @@
     specialize qw/vpx_idct16x16_10_add sse2 neon dspr2 msa/;
 
     add_proto qw/void vpx_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
-    specialize qw/vpx_idct32x32_1024_add sse2 neon dspr2 msa/;
+    specialize qw/vpx_idct32x32_1024_add sse2 neon dspr2 msa/, "$ssse3_x86_64_x86inc";
 
     add_proto qw/void vpx_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
     specialize qw/vpx_idct32x32_34_add sse2 neon_asm dspr2 msa/, "$ssse3_x86_64_x86inc";
diff --git a/vpx_dsp/x86/intrapred_sse2.asm b/vpx_dsp/x86/intrapred_sse2.asm
index 5ef7ae3..0189f32 100644
--- a/vpx_dsp/x86/intrapred_sse2.asm
+++ b/vpx_dsp/x86/intrapred_sse2.asm
@@ -561,33 +561,31 @@
   RET
 
 INIT_XMM sse2
-cglobal tm_predictor_8x8, 4, 4, 4, dst, stride, above, left
+cglobal tm_predictor_8x8, 4, 4, 5, dst, stride, above, left
   pxor                  m1, m1
   movd                  m2, [aboveq-1]
   movq                  m0, [aboveq]
   punpcklbw             m2, m1
-  punpcklbw             m0, m1
-  pshuflw               m2, m2, 0x0
+  punpcklbw             m0, m1        ; t1 t2 t3 t4 t5 t6 t7 t8 [word]
+  pshuflw               m2, m2, 0x0   ; [63:0] tl tl tl tl [word]
   DEFINE_ARGS dst, stride, line, left
   mov                lineq, -4
-  punpcklqdq            m2, m2
-  add                leftq, 8
-  psubw                 m0, m2
-.loop:
-  movd                  m2, [leftq+lineq*2]
-  movd                  m3, [leftq+lineq*2+1]
-  punpcklbw             m2, m1
-  punpcklbw             m3, m1
-  pshuflw               m2, m2, 0x0
-  pshuflw               m3, m3, 0x0
-  punpcklqdq            m2, m2
-  punpcklqdq            m3, m3
-  paddw                 m2, m0
+  punpcklqdq            m2, m2        ; tl tl tl tl tl tl tl tl [word]
+  psubw                 m0, m2        ; t1-tl t2-tl ... t8-tl [word]
+  movq                  m2, [leftq]
+  punpcklbw             m2, m1        ; l1 l2 l3 l4 l5 l6 l7 l8 [word]
+.loop:
+  pshuflw               m4, m2, 0x0   ; [63:0] l1 l1 l1 l1 [word]
+  pshuflw               m3, m2, 0x55  ; [63:0] l2 l2 l2 l2 [word]
+  punpcklqdq            m4, m4        ; l1 l1 l1 l1 l1 l1 l1 l1 [word]
+  punpcklqdq            m3, m3        ; l2 l2 l2 l2 l2 l2 l2 l2 [word]
+  paddw                 m4, m0
   paddw                 m3, m0
-  packuswb              m2, m3
-  movq      [dstq        ], m2
-  movhps    [dstq+strideq], m2
+  packuswb              m4, m3
+  movq      [dstq        ], m4
+  movhps    [dstq+strideq], m4
   lea                 dstq, [dstq+strideq*2]
+  psrldq                m2, 4
   inc                lineq
   jnz .loop
   REP_RET
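
The rewritten loop computes the standard TM (TrueMotion) predictor: loading all eight left pixels once and shifting them down with psrldq replaces the per-iteration movd/punpcklbw/pshuflw chains of the old code. In scalar form the output is clamp(left[r] + above[c] - top_left, 0, 255), with packuswb supplying the clamp:

#include <stddef.h>
#include <stdint.h>

/* Scalar model of tm_predictor_8x8; as in the asm, 'above' points just
 * past the top-left pixel, so above[-1] is the corner. */
static void tm_predictor_8x8_model(uint8_t *dst, ptrdiff_t stride,
                                   const uint8_t *above,
                                   const uint8_t *left) {
  const int top_left = above[-1];
  int r, c;
  for (r = 0; r < 8; ++r) {
    for (c = 0; c < 8; ++c) {
      const int v = left[r] + above[c] - top_left;
      dst[c] = (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
    }
    dst += stride;
  }
}
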
diff --git a/vpx_dsp/x86/inv_txfm_ssse3_x86_64.asm b/vpx_dsp/x86/inv_txfm_ssse3_x86_64.asm
index 2675eab..43c7ad8 100644
--- a/vpx_dsp/x86/inv_txfm_ssse3_x86_64.asm
+++ b/vpx_dsp/x86/inv_txfm_ssse3_x86_64.asm
@@ -57,6 +57,14 @@
 TRANSFORM_COEFFS    15679,  4756
 TRANSFORM_COEFFS    11585, 11585
 
+; constants for 32x32_1024
+TRANSFORM_COEFFS    12140, 11003
+TRANSFORM_COEFFS     7005, 14811
+TRANSFORM_COEFFS    14053,  8423
+TRANSFORM_COEFFS     9760, 13160
+TRANSFORM_COEFFS    12665, 10394
+TRANSFORM_COEFFS     7723, 14449
+
 %macro PAIR_PP_COEFFS 2
 dpw_%1_%2:   dw  %1,  %1,  %1,  %1,  %2,  %2,  %2,  %2
 %endmacro
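
The six new TRANSFORM_COEFFS pairs are the Q14 butterfly constants the 1024-coefficient path needs: each pair (a, b) is round(2^14 * cos(k*pi/64)), round(2^14 * sin(k*pi/64)) for k = 15, 23, 11, 19, 14, 22 in declaration order. A standalone generator to double-check the values (illustrative; build with -lm):

#include <math.h>
#include <stdio.h>

int main(void) {
  const int k[] = { 15, 23, 11, 19, 14, 22 };  /* order of the new pairs */
  const double kPi = 3.14159265358979323846;
  int i;
  for (i = 0; i < 6; ++i)
    printf("TRANSFORM_COEFFS %5.0f, %5.0f\n",
           floor(16384.0 * cos(k[i] * kPi / 64) + 0.5),
           floor(16384.0 * sin(k[i] * kPi / 64) + 0.5));
  return 0;
}
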
@@ -368,23 +376,24 @@
 %define idx30 16 * 6
 %define idx31 16 * 7
 
+; FROM idct32x32_add_neon.asm
+;
+; Instead of doing the transforms stage by stage, it is done by loading
+; some input values and doing as many stages as possible to minimize the
+; storing/loading of intermediate results. To fit within registers, the
+; final coefficients are cut into four blocks:
+; BLOCK A: 16-19,28-31
+; BLOCK B: 20-23,24-27
+; BLOCK C: 8-11,12-15
+; BLOCK D: 0-3,4-7
+; Blocks A and C are straight calculation through the various stages. In
+; block B, further calculations are performed using the results from
+; block A. In block D, further calculations are performed using the results
+; from block C and then the final calculations are done using results from
+; block A and B which have been combined at the end of block B.
+;
+
 %macro IDCT32X32_34 4
-  ; FROM idct32x32_add_neon.asm
-  ;
-  ; Instead of doing the transforms stage by stage, it is done by loading
-  ; some input values and doing as many stages as possible to minimize the
-  ; storing/loading of intermediate results. To fit within registers, the
-  ; final coefficients are cut into four blocks:
-  ; BLOCK A: 16-19,28-31
-  ; BLOCK B: 20-23,24-27
-  ; BLOCK C: 8-11,12-15
-  ; BLOCK D: 0-3,4-7
-  ; Blocks A and C are straight calculation through the various stages. In
-  ; block B, further calculations are performed using the results from
-  ; block A. In block D, further calculations are performed using the results
-  ; from block C and then the final calculations are done using results from
-  ; block A and B which have been combined at the end of block B.
-  ;
   ; BLOCK A STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
   mova                m11, m1
   pmulhrsw             m1, [pw___804x2] ; stp1_16
@@ -475,7 +484,7 @@
   ; BLOCK B STAGE 7 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 %if 0 ; overflow occurs in SUM_SUB when using test streams
   mova                m10, [pw_11585x2]
-  SUM_SUB               6,    5,  9
+  SUM_SUB               6,  5, 9
   pmulhrsw             m6, m10  ; stp1_27
   pmulhrsw             m5, m10  ; stp1_20
   SUM_SUB              13, 14,  9
@@ -539,10 +548,10 @@
   ; BLOCK C STAGE 6 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 %if 0 ; overflow occurs in SUM_SUB when using test streams
   mova                m10, [pw_11585x2]
-  SUM_SUB               5,    4,  9
+  SUM_SUB               5,  4, 9
   pmulhrsw             m5, m10  ; stp1_13
   pmulhrsw             m4, m10  ; stp1_10
-  SUM_SUB               6,    7,  9
+  SUM_SUB               6,  7, 9
   pmulhrsw             m6, m10  ; stp1_12
   pmulhrsw             m7, m10  ; stp1_11
 %else
@@ -783,4 +792,443 @@
   RECON_AND_STORE pass_two_start
 
   RET
+
+%macro IDCT32X32_1024 4
+  ; BLOCK A STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  mova                 m1, [rsp + transposed_in + 16 *  1]
+  mova                m11, [rsp + transposed_in + 16 * 31]
+  BUTTERFLY_4X          1,    11,    804, 16364,  m8,  9,  10 ; stp1_16, stp1_31
+
+  mova                 m0, [rsp + transposed_in + 16 * 15]
+  mova                 m2, [rsp + transposed_in + 16 * 17]
+  BUTTERFLY_4X          2,     0,  12140, 11003,  m8,  9,  10 ; stp1_17, stp1_30
+
+  mova                 m7, [rsp + transposed_in + 16 *  7]
+  mova                m12, [rsp + transposed_in + 16 * 25]
+  BUTTERFLY_4X         12,     7,  15426,  5520,  m8,  9,  10 ; stp1_19, stp1_28
+
+  mova                 m3, [rsp + transposed_in + 16 *  9]
+  mova                 m4, [rsp + transposed_in + 16 * 23]
+  BUTTERFLY_4X          3,     4,   7005, 14811,  m8,  9,  10 ; stp1_18, stp1_29
+
+  ; BLOCK A STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  SUM_SUB               1,  2, 9 ; stp2_16, stp2_17
+  SUM_SUB              12,  3, 9 ; stp2_19, stp2_18
+  SUM_SUB               7,  4, 9 ; stp2_28, stp2_29
+  SUM_SUB              11,  0, 9 ; stp2_31, stp2_30
+
+  ; BLOCK A STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  BUTTERFLY_4X          0,     2,   3196, 16069,  m8,  9,  10 ; stp1_17, stp1_30
+  BUTTERFLY_4Xmm        4,     3,   3196, 16069,  m8,  9,  10 ; stp1_29, stp1_18
+
+  ; BLOCK A STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  SUM_SUB               1, 12, 9 ; stp2_16, stp2_19
+  SUM_SUB               0,  3, 9 ; stp2_17, stp2_18
+  SUM_SUB              11,  7, 9 ; stp2_31, stp2_28
+  SUM_SUB               2,  4, 9 ; stp2_30, stp2_29
+
+  ; BLOCK A STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  BUTTERFLY_4X          4,     3,   6270, 15137,  m8,  9,  10 ; stp1_18, stp1_29
+  BUTTERFLY_4X          7,    12,   6270, 15137,  m8,  9,  10 ; stp1_19, stp1_28
+
+  mova [stp + %3 + idx16], m1
+  mova [stp + %3 + idx17], m0
+  mova [stp + %3 + idx18], m4
+  mova [stp + %3 + idx19], m7
+  mova [stp + %4 + idx28], m12
+  mova [stp + %4 + idx29], m3
+  mova [stp + %4 + idx30], m2
+  mova [stp + %4 + idx31], m11
+
+  ; BLOCK B STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  mova                 m5, [rsp + transposed_in + 16 *  5]
+  mova                 m6, [rsp + transposed_in + 16 * 27]
+  BUTTERFLY_4X          5,     6,   3981, 15893,  m8,  9,  10 ; stp1_20, stp1_27
+
+  mova                m13, [rsp + transposed_in + 16 * 21]
+  mova                m14, [rsp + transposed_in + 16 * 11]
+  BUTTERFLY_4X         13,    14,  14053,  8423,  m8,  9,  10 ; stp1_21, stp1_26
+
+  mova                 m0, [rsp + transposed_in + 16 * 13]
+  mova                 m1, [rsp + transposed_in + 16 * 19]
+  BUTTERFLY_4X          0,     1,   9760, 13160,  m8,  9,  10 ; stp1_22, stp1_25
+
+  mova                 m2, [rsp + transposed_in + 16 *  3]
+  mova                 m3, [rsp + transposed_in + 16 * 29]
+  BUTTERFLY_4X          3,     2,  16207,  2404,  m8,  9,  10 ; stp1_23, stp1_24
+
+  ; BLOCK B STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  SUM_SUB               5, 13, 9 ; stp2_20, stp2_21
+  SUM_SUB               3,  0, 9 ; stp2_23, stp2_22
+  SUM_SUB               2,  1, 9 ; stp2_24, stp2_25
+  SUM_SUB               6, 14, 9 ; stp2_27, stp2_26
+
+  ; BLOCK B STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  BUTTERFLY_4X         14,    13,  13623,  9102,  m8,  9,  10 ; stp1_21, stp1_26
+  BUTTERFLY_4Xmm        1,     0,  13623,  9102,  m8,  9,  10 ; stp1_25, stp1_22
+
+  ; BLOCK B STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  SUM_SUB               3,  5, 9 ; stp2_23, stp2_20
+  SUM_SUB               0, 14, 9 ; stp2_22, stp2_21
+  SUM_SUB               2,  6, 9 ; stp2_24, stp2_27
+  SUM_SUB               1, 13, 9 ; stp2_25, stp2_26
+
+  ; BLOCK B STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  BUTTERFLY_4Xmm        6,     5,   6270, 15137,  m8,  9,  10 ; stp1_27, stp1_20
+  BUTTERFLY_4Xmm       13,    14,   6270, 15137,  m8,  9,  10 ; stp1_26, stp1_21
+
+  ; BLOCK B STAGE 6 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  mova                 m4, [stp + %3 + idx16]
+  mova                 m7, [stp + %3 + idx17]
+  mova                m11, [stp + %3 + idx18]
+  mova                m12, [stp + %3 + idx19]
+  SUM_SUB               4,  3, 9 ; stp2_16, stp2_23
+  SUM_SUB               7,  0, 9 ; stp2_17, stp2_22
+  SUM_SUB              11, 14, 9 ; stp2_18, stp2_21
+  SUM_SUB              12,  5, 9 ; stp2_19, stp2_20
+  mova [stp + %3 + idx16], m4
+  mova [stp + %3 + idx17], m7
+  mova [stp + %3 + idx18], m11
+  mova [stp + %3 + idx19], m12
+
+  mova                 m4, [stp + %4 + idx28]
+  mova                 m7, [stp + %4 + idx29]
+  mova                m11, [stp + %4 + idx30]
+  mova                m12, [stp + %4 + idx31]
+  SUM_SUB               4,  6, 9 ; stp2_28, stp2_27
+  SUM_SUB               7, 13, 9 ; stp2_29, stp2_26
+  SUM_SUB              11,  1, 9 ; stp2_30, stp2_25
+  SUM_SUB              12,  2, 9 ; stp2_31, stp2_24
+  mova [stp + %4 + idx28], m4
+  mova [stp + %4 + idx29], m7
+  mova [stp + %4 + idx30], m11
+  mova [stp + %4 + idx31], m12
+
+  ; BLOCK B STAGE 7 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+%if 0 ; overflow occurs in SUM_SUB when using test streams
+  mova                m10, [pw_11585x2]
+  SUM_SUB               6,  5,  9
+  pmulhrsw             m6, m10  ; stp1_27
+  pmulhrsw             m5, m10  ; stp1_20
+  SUM_SUB              13, 14,  9
+  pmulhrsw            m13, m10  ; stp1_26
+  pmulhrsw            m14, m10  ; stp1_21
+  SUM_SUB               1,  0,  9
+  pmulhrsw             m1, m10  ; stp1_25
+  pmulhrsw             m0, m10  ; stp1_22
+  SUM_SUB               2,  3,  9
+  pmulhrsw             m2, m10  ; stp1_24
+  pmulhrsw             m3, m10  ; stp1_23
+%else
+  BUTTERFLY_4X          6,     5,  11585, 11585,  m8,  9,  10 ; stp1_20, stp1_27
+  SWAP  6, 5
+  BUTTERFLY_4X         13,    14,  11585, 11585,  m8,  9,  10 ; stp1_21, stp1_26
+  SWAP 13, 14
+  BUTTERFLY_4X          1,     0,  11585, 11585,  m8,  9,  10 ; stp1_22, stp1_25
+  SWAP  1, 0
+  BUTTERFLY_4X          2,     3,  11585, 11585,  m8,  9,  10 ; stp1_23, stp1_24
+  SWAP  2, 3
+%endif
+  mova [stp + %3 + idx20], m5
+  mova [stp + %3 + idx21], m14
+  mova [stp + %3 + idx22], m0
+  mova [stp + %3 + idx23], m3
+  mova [stp + %4 + idx24], m2
+  mova [stp + %4 + idx25], m1
+  mova [stp + %4 + idx26], m13
+  mova [stp + %4 + idx27], m6
+
+  ; BLOCK C STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  ;
+  ; BLOCK C STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  mova                 m0, [rsp + transposed_in + 16 *  2]
+  mova                 m1, [rsp + transposed_in + 16 * 30]
+  BUTTERFLY_4X          0,     1,   1606, 16305,  m8,  9,  10 ; stp1_8, stp1_15
+
+  mova                 m2, [rsp + transposed_in + 16 * 14]
+  mova                 m3, [rsp + transposed_in + 16 * 18]
+  BUTTERFLY_4X          3,     2,  12665, 10394,  m8,  9,  10 ; stp1_9, stp1_14
+
+  mova                 m4, [rsp + transposed_in + 16 * 10]
+  mova                 m5, [rsp + transposed_in + 16 * 22]
+  BUTTERFLY_4X          4,     5,   7723, 14449,  m8,  9,  10 ; stp1_10, stp1_13
+
+  mova                 m6, [rsp + transposed_in + 16 *  6]
+  mova                 m7, [rsp + transposed_in + 16 * 26]
+  BUTTERFLY_4X          7,     6,  15679,  4756,  m8,  9,  10 ; stp1_11, stp1_12
+
+  ; BLOCK C STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  SUM_SUB               0,  3, 9 ;  stp1_8, stp1_9
+  SUM_SUB               7,  4, 9 ; stp1_11, stp1_10
+  SUM_SUB               6,  5, 9 ; stp1_12, stp1_13
+  SUM_SUB               1,  2, 9 ; stp1_15, stp1_14
+
+  ; BLOCK C STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  BUTTERFLY_4X          2,     3,   6270, 15137,  m8,  9,  10 ;  stp1_9, stp1_14
+  BUTTERFLY_4Xmm        5,     4,   6270, 15137,  m8,  9,  10 ; stp1_13, stp1_10
+
+  ; BLOCK C STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  SUM_SUB               0,  7, 9 ;  stp1_8, stp1_11
+  SUM_SUB               2,  4, 9 ;  stp1_9, stp1_10
+  SUM_SUB               1,  6, 9 ;  stp1_15, stp1_12
+  SUM_SUB               3,  5, 9 ;  stp1_14, stp1_13
+
+  ; BLOCK C STAGE 6 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+%if 0 ; overflow occurs in SUM_SUB when using test streams
+  mova                m10, [pw_11585x2]
+  SUM_SUB               5,    4,  9
+  pmulhrsw             m5, m10  ; stp1_13
+  pmulhrsw             m4, m10  ; stp1_10
+  SUM_SUB               6,    7,  9
+  pmulhrsw             m6, m10  ; stp1_12
+  pmulhrsw             m7, m10  ; stp1_11
+%else
+  BUTTERFLY_4X          5,     4,  11585, 11585,  m8,  9,  10 ; stp1_10, stp1_13
+  SWAP  5, 4
+  BUTTERFLY_4X          6,     7,  11585, 11585,  m8,  9,  10 ; stp1_11, stp1_12
+  SWAP  6, 7
+%endif
+  ; BLOCK C STAGE 7 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  mova [stp + %2 +  idx8], m0
+  mova [stp + %2 +  idx9], m2
+  mova [stp + %2 + idx10], m4
+  mova [stp + %2 + idx11], m7
+  mova [stp + %2 + idx12], m6
+  mova [stp + %2 + idx13], m5
+  mova [stp + %2 + idx14], m3
+  mova [stp + %2 + idx15], m1
+
+  ; BLOCK D STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  ;
+  ; BLOCK D STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  ;
+  ; BLOCK D STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  mova                m11, [rsp + transposed_in + 16 *  4]
+  mova                m12, [rsp + transposed_in + 16 * 28]
+  BUTTERFLY_4X         11,    12,   3196, 16069,  m8,  9,  10 ; stp1_4, stp1_7
+
+  mova                m13, [rsp + transposed_in + 16 * 12]
+  mova                m14, [rsp + transposed_in + 16 * 20]
+  BUTTERFLY_4X         14,    13,  13623,  9102,  m8,  9,  10 ; stp1_5, stp1_6
+
+  ; BLOCK D STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  mova                 m0, [rsp + transposed_in + 16 *  0]
+  mova                 m1, [rsp + transposed_in + 16 * 16]
+
+%if 0 ; overflow occurs in SUM_SUB when using test streams
+  mova                m10, [pw_11585x2]
+  SUM_SUB               0,    1,  9
+  pmulhrsw             m0, m10  ; stp1_1
+  pmulhrsw             m1, m10  ; stp1_0
+%else
+  BUTTERFLY_4X          0,     1,  11585, 11585,  m8,  9,  10 ; stp1_1, stp1_0
+  SWAP  0, 1
+%endif
+  mova                 m2, [rsp + transposed_in + 16 *  8]
+  mova                 m3, [rsp + transposed_in + 16 * 24]
+  BUTTERFLY_4X          2,     3,   6270, 15137,  m8,  9,  10 ;  stp1_2, stp1_3
+
+  mova                m10, [pw_11585x2]
+  SUM_SUB              11, 14, 9 ;  stp1_4, stp1_5
+  SUM_SUB              12, 13, 9 ;  stp1_7, stp1_6
+
+  ; BLOCK D STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+%if 0 ; overflow occurs in SUM_SUB when using test streams
+  SUM_SUB              13,   14,  9
+  pmulhrsw            m13, m10  ; stp1_6
+  pmulhrsw            m14, m10  ; stp1_5
+%else
+  BUTTERFLY_4X         13,    14,  11585, 11585,  m8,  9,  10 ; stp1_5, stp1_6
+  SWAP 13, 14
+%endif
+  SUM_SUB               0,  3, 9 ;  stp1_0, stp1_3
+  SUM_SUB               1,  2, 9 ;  stp1_1, stp1_2
+
+  ; BLOCK D STAGE 6 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  SUM_SUB               0, 12, 9 ;  stp1_0, stp1_7
+  SUM_SUB               1, 13, 9 ;  stp1_1, stp1_6
+  SUM_SUB               2, 14, 9 ;  stp1_2, stp1_5
+  SUM_SUB               3, 11, 9 ;  stp1_3, stp1_4
+
+  ; BLOCK D STAGE 7 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  mova                 m4, [stp + %2 + idx12]
+  mova                 m5, [stp + %2 + idx13]
+  mova                 m6, [stp + %2 + idx14]
+  mova                 m7, [stp + %2 + idx15]
+  SUM_SUB               0,  7, 9 ;  stp1_0, stp1_15
+  SUM_SUB               1,  6, 9 ;  stp1_1, stp1_14
+  SUM_SUB               2,  5, 9 ;  stp1_2, stp1_13
+  SUM_SUB               3,  4, 9 ;  stp1_3, stp1_12
+
+  ; 0-3, 28-31 final stage
+  mova                m10, [stp + %4 + idx31]
+  mova                m15, [stp + %4 + idx30]
+  SUM_SUB               0, 10, 9 ;  stp1_0, stp1_31
+  SUM_SUB               1, 15, 9 ;  stp1_1, stp1_30
+  mova [stp + %1 +  idx0], m0
+  mova [stp + %1 +  idx1], m1
+  mova [stp + %4 + idx31], m10
+  mova [stp + %4 + idx30], m15
+  mova                 m0, [stp + %4 + idx29]
+  mova                 m1, [stp + %4 + idx28]
+  SUM_SUB               2,  0, 9 ;  stp1_2, stp1_29
+  SUM_SUB               3,  1, 9 ;  stp1_3, stp1_28
+  mova [stp + %1 +  idx2], m2
+  mova [stp + %1 +  idx3], m3
+  mova [stp + %4 + idx29], m0
+  mova [stp + %4 + idx28], m1
+
+  ; 12-15, 16-19 final stage
+  mova                 m0, [stp + %3 + idx16]
+  mova                 m1, [stp + %3 + idx17]
+  mova                 m2, [stp + %3 + idx18]
+  mova                 m3, [stp + %3 + idx19]
+  SUM_SUB               7,  0, 9 ;  stp1_15, stp1_16
+  SUM_SUB               6,  1, 9 ;  stp1_14, stp1_17
+  SUM_SUB               5,  2, 9 ;  stp1_13, stp1_18
+  SUM_SUB               4,  3, 9 ;  stp1_12, stp1_19
+  mova [stp + %2 + idx12], m4
+  mova [stp + %2 + idx13], m5
+  mova [stp + %2 + idx14], m6
+  mova [stp + %2 + idx15], m7
+  mova [stp + %3 + idx16], m0
+  mova [stp + %3 + idx17], m1
+  mova [stp + %3 + idx18], m2
+  mova [stp + %3 + idx19], m3
+
+  mova                 m4, [stp + %2 +  idx8]
+  mova                 m5, [stp + %2 +  idx9]
+  mova                 m6, [stp + %2 + idx10]
+  mova                 m7, [stp + %2 + idx11]
+  SUM_SUB              11,  7, 9 ;  stp1_4, stp1_11
+  SUM_SUB              14,  6, 9 ;  stp1_5, stp1_10
+  SUM_SUB              13,  5, 9 ;  stp1_6, stp1_9
+  SUM_SUB              12,  4, 9 ;  stp1_7, stp1_8
+
+  ; 4-7, 24-27 final stage
+  mova                 m3, [stp + %4 + idx24]
+  mova                 m2, [stp + %4 + idx25]
+  mova                 m1, [stp + %4 + idx26]
+  mova                 m0, [stp + %4 + idx27]
+  SUM_SUB              12,  3, 9 ;  stp1_7, stp1_24
+  SUM_SUB              13,  2, 9 ;  stp1_6, stp1_25
+  SUM_SUB              14,  1, 9 ;  stp1_5, stp1_26
+  SUM_SUB              11,  0, 9 ;  stp1_4, stp1_27
+  mova [stp + %4 + idx24], m3
+  mova [stp + %4 + idx25], m2
+  mova [stp + %4 + idx26], m1
+  mova [stp + %4 + idx27], m0
+  mova [stp + %1 +  idx4], m11
+  mova [stp + %1 +  idx5], m14
+  mova [stp + %1 +  idx6], m13
+  mova [stp + %1 +  idx7], m12
+
+  ; 8-11, 20-23 final stage
+  mova                 m0, [stp + %3 + idx20]
+  mova                 m1, [stp + %3 + idx21]
+  mova                 m2, [stp + %3 + idx22]
+  mova                 m3, [stp + %3 + idx23]
+  SUM_SUB               7,  0, 9 ;  stp1_11, stp1_20
+  SUM_SUB               6,  1, 9 ;  stp1_10, stp1_21
+  SUM_SUB               5,  2, 9 ;   stp1_9, stp1_22
+  SUM_SUB               4,  3, 9 ;   stp1_8, stp1_23
+  mova [stp + %2 +  idx8], m4
+  mova [stp + %2 +  idx9], m5
+  mova [stp + %2 + idx10], m6
+  mova [stp + %2 + idx11], m7
+  mova [stp + %3 + idx20], m0
+  mova [stp + %3 + idx21], m1
+  mova [stp + %3 + idx22], m2
+  mova [stp + %3 + idx23], m3
+%endmacro
+
+INIT_XMM ssse3
+cglobal idct32x32_1024_add, 3, 11, 16, i32x32_size, input, output, stride
+  mova            m8, [pd_8192]
+  mov             r6, 4
+  lea            stp, [rsp + pass_one_start]
+
+idct32x32_1024:
+  mov             r3, inputq
+  lea             r4, [rsp + transposed_in]
+  mov             r7, 4
+
+idct32x32_1024_transpose:
+  mova            m0, [r3 +       0]
+  mova            m1, [r3 + 16 *  4]
+  mova            m2, [r3 + 16 *  8]
+  mova            m3, [r3 + 16 * 12]
+  mova            m4, [r3 + 16 * 16]
+  mova            m5, [r3 + 16 * 20]
+  mova            m6, [r3 + 16 * 24]
+  mova            m7, [r3 + 16 * 28]
+
+  TRANSPOSE8X8  0, 1, 2, 3, 4, 5, 6, 7, 9
+
+  mova [r4 +      0], m0
+  mova [r4 + 16 * 1], m1
+  mova [r4 + 16 * 2], m2
+  mova [r4 + 16 * 3], m3
+  mova [r4 + 16 * 4], m4
+  mova [r4 + 16 * 5], m5
+  mova [r4 + 16 * 6], m6
+  mova [r4 + 16 * 7], m7
+
+  add             r3, 16
+  add             r4, 16 * 8
+  dec             r7
+  jne idct32x32_1024_transpose
+
+  IDCT32X32_1024 16*0, 16*32, 16*64, 16*96
+
+  lea            stp, [stp + 16 * 8]
+  lea         inputq, [inputq + 16 * 32]
+  dec             r6
+  jnz idct32x32_1024
+
+  mov             r6, 4
+  lea            stp, [rsp + pass_one_start]
+  lea             r9, [rsp + pass_one_start]
+
+idct32x32_1024_2:
+  lea             r4, [rsp + transposed_in]
+  mov             r3, r9
+  mov             r7, 4
+
+idct32x32_1024_transpose_2:
+  mova            m0, [r3 +      0]
+  mova            m1, [r3 + 16 * 1]
+  mova            m2, [r3 + 16 * 2]
+  mova            m3, [r3 + 16 * 3]
+  mova            m4, [r3 + 16 * 4]
+  mova            m5, [r3 + 16 * 5]
+  mova            m6, [r3 + 16 * 6]
+  mova            m7, [r3 + 16 * 7]
+
+  TRANSPOSE8X8  0, 1, 2, 3, 4, 5, 6, 7, 9
+
+  mova [r4 +      0], m0
+  mova [r4 + 16 * 1], m1
+  mova [r4 + 16 * 2], m2
+  mova [r4 + 16 * 3], m3
+  mova [r4 + 16 * 4], m4
+  mova [r4 + 16 * 5], m5
+  mova [r4 + 16 * 6], m6
+  mova [r4 + 16 * 7], m7
+
+  add             r3, 16 * 8
+  add             r4, 16 * 8
+  dec             r7
+  jne idct32x32_1024_transpose_2
+
+  IDCT32X32_1024 16*0, 16*8, 16*16, 16*24
+
+  lea            stp, [stp + 16 * 32]
+  add             r9, 16 * 32
+  dec             r6
+  jnz idct32x32_1024_2
+
+  RECON_AND_STORE pass_two_start
+
+  RET
 %endif