Merge "Speed-up for ext-intra" into nextgenv2
diff --git a/test/avg_test.cc b/test/avg_test.cc
index 7d5380f..44d8dd7 100644
--- a/test/avg_test.cc
+++ b/test/avg_test.cc
@@ -55,19 +55,19 @@
   }
 
   // Sum Pixels
-  unsigned int ReferenceAverage8x8(const uint8_t* source, int pitch ) {
+  unsigned int ReferenceAverage8x8(const uint8_t* source, int pitch) {
     unsigned int average = 0;
     for (int h = 0; h < 8; ++h)
       for (int w = 0; w < 8; ++w)
-        average += source[h * source_stride_ + w];
+        average += source[h * pitch + w];
     return ((average + 32) >> 6);
   }
 
-  unsigned int ReferenceAverage4x4(const uint8_t* source, int pitch ) {
+  unsigned int ReferenceAverage4x4(const uint8_t* source, int pitch) {
     unsigned int average = 0;
     for (int h = 0; h < 4; ++h)
       for (int w = 0; w < 4; ++w)
-        average += source[h * source_stride_ + w];
+        average += source[h * pitch + w];
     return ((average + 8) >> 4);
   }
 
diff --git a/test/convolve_test.cc b/test/convolve_test.cc
index 0826788..12022be 100644
--- a/test/convolve_test.cc
+++ b/test/convolve_test.cc
@@ -279,8 +279,7 @@
                               uint16_t *output_ptr,
                               unsigned int output_stride,
                               unsigned int output_width,
-                              unsigned int output_height,
-                              int bd) {
+                              unsigned int output_height) {
   unsigned int i, j;
   for (i = 0; i < output_height; ++i) {
     for (j = 0; j < output_width; ++j) {
@@ -306,7 +305,7 @@
   highbd_filter_block2d_8_c(src_ptr, src_stride, HFilter, VFilter, tmp, 64,
                             output_width, output_height, bd);
   highbd_block2d_average_c(tmp, 64, dst_ptr, dst_stride,
-                           output_width, output_height, bd);
+                           output_width, output_height);
 }
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
diff --git a/test/datarate_test.cc b/test/datarate_test.cc
index b98f8c8..9d5074e 100644
--- a/test/datarate_test.cc
+++ b/test/datarate_test.cc
@@ -850,8 +850,7 @@
     const vpx_svc_extra_cfg_t *svc_params,
     int spatial_layers,
     int temporal_layers,
-    int temporal_layering_mode,
-    unsigned int total_rate) {
+    int temporal_layering_mode) {
   int sl, spatial_layer_target;
   float total = 0;
   float alloc_ratio[VPX_MAX_LAYERS] = {0};
@@ -914,8 +913,7 @@
     cfg_.rc_target_bitrate = i;
     ResetModel();
     assign_layer_bitrates(&cfg_, &svc_params_, cfg_.ss_number_layers,
-        cfg_.ts_number_layers, cfg_.temporal_layering_mode,
-        cfg_.rc_target_bitrate);
+        cfg_.ts_number_layers, cfg_.temporal_layering_mode);
     ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
     ASSERT_GE(cfg_.rc_target_bitrate, effective_datarate_ * 0.85)
             << " The datarate for the file exceeds the target by too much!";
@@ -953,8 +951,7 @@
   cfg_.rc_target_bitrate = 800;
   ResetModel();
   assign_layer_bitrates(&cfg_, &svc_params_, cfg_.ss_number_layers,
-      cfg_.ts_number_layers, cfg_.temporal_layering_mode,
-      cfg_.rc_target_bitrate);
+      cfg_.ts_number_layers, cfg_.temporal_layering_mode);
   ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
   ASSERT_GE(cfg_.rc_target_bitrate, effective_datarate_ * 0.85)
           << " The datarate for the file exceeds the target by too much!";
diff --git a/test/dct16x16_test.cc b/test/dct16x16_test.cc
index 0449b52..d6cc5e4 100644
--- a/test/dct16x16_test.cc
+++ b/test/dct16x16_test.cc
@@ -276,12 +276,12 @@
 }
 
 void idct16x16_10_ref(const tran_low_t *in, uint8_t *out, int stride,
-                      int tx_type) {
+                      int /*tx_type*/) {
   idct16x16_10(in, out, stride);
 }
 
 void idct16x16_12_ref(const tran_low_t *in, uint8_t *out, int stride,
-                      int tx_type) {
+                      int /*tx_type*/) {
   idct16x16_12(in, out, stride);
 }
 
@@ -778,7 +778,7 @@
   virtual void TearDown() { libvpx_test::ClearSystemState(); }
 
  protected:
-  void RunFwdTxfm(int16_t *in, tran_low_t *out, int stride) {}
+  void RunFwdTxfm(int16_t * /*in*/, tran_low_t * /*out*/, int /*stride*/) {}
   void RunInvTxfm(tran_low_t *out, uint8_t *dst, int stride) {
     inv_txfm_(out, dst, stride);
   }
diff --git a/test/error_resilience_test.cc b/test/error_resilience_test.cc
index 9a2ad2f..cd0dca2 100644
--- a/test/error_resilience_test.cc
+++ b/test/error_resilience_test.cc
@@ -100,7 +100,7 @@
   }
 
   virtual void PreEncodeFrameHook(libvpx_test::VideoSource *video,
-                                  ::libvpx_test::Encoder *encoder) {
+                                  ::libvpx_test::Encoder * /*encoder*/) {
     frame_flags_ &= ~(VP8_EFLAG_NO_UPD_LAST |
                       VP8_EFLAG_NO_UPD_GF |
                       VP8_EFLAG_NO_UPD_ARF);
diff --git a/test/fdct4x4_test.cc b/test/fdct4x4_test.cc
index 3f6b738..0c91aee 100644
--- a/test/fdct4x4_test.cc
+++ b/test/fdct4x4_test.cc
@@ -40,7 +40,7 @@
 typedef std::tr1::tuple<FhtFunc, IhtFunc, int, vpx_bit_depth_t> Ht4x4Param;
 
 void fdct4x4_ref(const int16_t *in, tran_low_t *out, int stride,
-                 int tx_type) {
+                 int /*tx_type*/) {
   vpx_fdct4x4_c(in, out, stride);
 }
 
@@ -49,7 +49,7 @@
 }
 
 void fwht4x4_ref(const int16_t *in, tran_low_t *out, int stride,
-                 int tx_type) {
+                 int /*tx_type*/) {
   vp9_fwht4x4_c(in, out, stride);
 }
 
diff --git a/test/fdct8x8_test.cc b/test/fdct8x8_test.cc
index 72d2aed..edf4682 100644
--- a/test/fdct8x8_test.cc
+++ b/test/fdct8x8_test.cc
@@ -47,7 +47,7 @@
 typedef std::tr1::tuple<FhtFunc, IhtFunc, int, vpx_bit_depth_t> Ht8x8Param;
 typedef std::tr1::tuple<IdctFunc, IdctFunc, int, vpx_bit_depth_t> Idct8x8Param;
 
-void reference_8x8_dct_1d(const double in[8], double out[8], int stride) {
+void reference_8x8_dct_1d(const double in[8], double out[8]) {
   const double kInvSqrt2 = 0.707106781186547524400844362104;
   for (int k = 0; k < 8; k++) {
     out[k] = 0.0;
@@ -65,7 +65,7 @@
     double temp_in[8], temp_out[8];
     for (int j = 0; j < 8; ++j)
       temp_in[j] = input[j*8 + i];
-    reference_8x8_dct_1d(temp_in, temp_out, 1);
+    reference_8x8_dct_1d(temp_in, temp_out);
     for (int j = 0; j < 8; ++j)
       output[j * 8 + i] = temp_out[j];
   }
@@ -74,7 +74,7 @@
     double temp_in[8], temp_out[8];
     for (int j = 0; j < 8; ++j)
       temp_in[j] = output[j + i*8];
-    reference_8x8_dct_1d(temp_in, temp_out, 1);
+    reference_8x8_dct_1d(temp_in, temp_out);
     // Scale by some magic number
     for (int j = 0; j < 8; ++j)
       output[j + i * 8] = temp_out[j] * 2;
@@ -82,7 +82,8 @@
 }
 
 
-void fdct8x8_ref(const int16_t *in, tran_low_t *out, int stride, int tx_type) {
+void fdct8x8_ref(const int16_t *in, tran_low_t *out, int stride,
+                 int /*tx_type*/) {
   vpx_fdct8x8_c(in, out, stride);
 }
 
@@ -642,7 +643,7 @@
   void RunInvTxfm(tran_low_t *out, uint8_t *dst, int stride) {
     inv_txfm_(out, dst, stride);
   }
-  void RunFwdTxfm(int16_t *out, tran_low_t *dst, int stride) {}
+  void RunFwdTxfm(int16_t * /*out*/, tran_low_t * /*dst*/, int /*stride*/) {}
 
   IdctFunc ref_txfm_;
   IdctFunc inv_txfm_;
diff --git a/test/hbd_metrics_test.cc b/test/hbd_metrics_test.cc
index bf75a29..dac001f 100644
--- a/test/hbd_metrics_test.cc
+++ b/test/hbd_metrics_test.cc
@@ -16,6 +16,7 @@
 #include "test/acm_random.h"
 #include "test/util.h"
 #include "./vpx_config.h"
+#include "vpx_dsp/psnr.h"
 #include "vpx_dsp/ssim.h"
 #include "vpx_ports/mem.h"
 #include "vpx_ports/msvc.h"
@@ -32,6 +33,19 @@
                                 const YV12_BUFFER_CONFIG *dest,
                                 uint32_t bd);
 
+double compute_hbd_psnr(const YV12_BUFFER_CONFIG *source,
+  const YV12_BUFFER_CONFIG *dest, uint32_t bit_depth) {
+  PSNR_STATS psnr;
+  calc_highbd_psnr(source, dest, &psnr, bit_depth, bit_depth);
+  return psnr.psnr[0];
+}
+
+double compute_psnr(const YV12_BUFFER_CONFIG *source,
+  const YV12_BUFFER_CONFIG *dest) {
+  PSNR_STATS psnr;
+  calc_psnr(source, dest, &psnr);
+  return psnr.psnr[0];
+}
 
 double compute_hbd_psnrhvs(const YV12_BUFFER_CONFIG *source,
   const YV12_BUFFER_CONFIG *dest,
@@ -208,5 +222,13 @@
         MetricTestTParam(&compute_psnrhvs, &compute_hbd_psnrhvs, 12,
                          kPhvs_thresh)));
 
+INSTANTIATE_TEST_CASE_P(
+    PSNR, HBDMetricsTest,
+    ::testing::Values(
+        MetricTestTParam(&compute_psnr, &compute_hbd_psnr, 10,
+                         kPhvs_thresh),
+        MetricTestTParam(&compute_psnr, &compute_hbd_psnr, 12,
+                         kPhvs_thresh)));
+
 }  // namespace
 
diff --git a/test/resize_test.cc b/test/resize_test.cc
index c5f05f3..0177308 100644
--- a/test/resize_test.cc
+++ b/test/resize_test.cc
@@ -90,74 +90,178 @@
   unsigned int h;
 };
 
-unsigned int ScaleForFrameNumber(unsigned int frame, unsigned int val) {
-  if (frame < 10)
-    return val;
-  if (frame < 20)
-    return val * 3 / 4;
-  if (frame < 30)
-    return val / 2;
-  if (frame < 40)
-    return val;
-  if (frame < 50)
-    return val * 3 / 4;
-  if (frame < 60)
-    return val / 2;
-  if (frame < 70)
-    return val * 3 / 4;
-  if (frame < 80)
-    return val;
-  if (frame < 90)
-    return val * 3 / 4;
-  if (frame < 100)
-    return val / 2;
-  if (frame < 110)
-    return val * 3 / 4;
-  if (frame < 120)
-    return val;
-  if (frame < 130)
-    return val * 3 / 4;
-  if (frame < 140)
-    return val / 2;
-  if (frame < 150)
-    return val * 3 / 4;
-  if (frame < 160)
-    return val;
-  if (frame < 170)
-    return val / 2;
-  if (frame < 180)
-    return val * 3 / 4;
-  if (frame < 190)
-    return val;
-  if (frame < 200)
-    return val * 3 / 4;
-  if (frame < 210)
-    return val / 2;
-  if (frame < 220)
-    return val * 3 / 4;
-  if (frame < 230)
-    return val;
-  if (frame < 240)
-    return val / 2;
-  if (frame < 250)
-    return val * 3 / 4;
-  return val;
+void ScaleForFrameNumber(unsigned int frame,
+                         unsigned int initial_w,
+                         unsigned int initial_h,
+                         unsigned int *w,
+                         unsigned int *h,
+                         int flag_codec) {
+  if (frame < 10) {
+    *w = initial_w;
+    *h = initial_h;
+    return;
+  }
+  if (frame < 20) {
+    *w = initial_w * 3 / 4;
+    *h = initial_h * 3 / 4;
+    return;
+  }
+  if (frame < 30) {
+    *w = initial_w / 2;
+    *h = initial_h / 2;
+    return;
+  }
+  if (frame < 40) {
+    *w = initial_w;
+    *h = initial_h;
+    return;
+  }
+  if (frame < 50) {
+    *w = initial_w * 3 / 4;
+    *h = initial_h * 3 / 4;
+    return;
+  }
+  if (frame < 60) {
+    *w = initial_w / 2;
+    *h = initial_h / 2;
+    return;
+  }
+  if (frame < 70) {
+    *w = initial_w;
+    *h = initial_h;
+    return;
+  }
+  if (frame < 80) {
+    *w = initial_w * 3 / 4;
+    *h = initial_h * 3 / 4;
+    return;
+  }
+  if (frame < 90) {
+    *w = initial_w / 2;
+    *h = initial_h / 2;
+    return;
+  }
+  if (frame < 100) {
+    *w = initial_w * 3 / 4;
+    *h = initial_h * 3 / 4;
+    return;
+  }
+  if (frame < 110) {
+    *w = initial_w;
+    *h = initial_h;
+    return;
+  }
+  if (frame < 120) {
+    *w = initial_w * 3 / 4;
+    *h = initial_h * 3 / 4;
+    return;
+  }
+  if (frame < 130) {
+    *w = initial_w / 2;
+    *h = initial_h / 2;
+    return;
+  }
+  if (frame < 140) {
+    *w = initial_w * 3 / 4;
+    *h = initial_h * 3 / 4;
+    return;
+  }
+  if (frame < 150) {
+    *w = initial_w;
+    *h = initial_h;
+    return;
+  }
+  if (frame < 160) {
+    *w = initial_w * 3 / 4;
+    *h = initial_h * 3 / 4;
+    return;
+  }
+  if (frame < 170) {
+    *w = initial_w / 2;
+    *h = initial_h / 2;
+    return;
+  }
+  if (frame < 180) {
+    *w = initial_w * 3 / 4;
+    *h = initial_h * 3 / 4;
+    return;
+  }
+  if (frame < 190) {
+    *w = initial_w;
+    *h = initial_h;
+    return;
+  }
+  if (frame < 200) {
+    *w = initial_w * 3 / 4;
+    *h = initial_h * 3 / 4;
+    return;
+  }
+  if (frame < 210) {
+    *w = initial_w / 2;
+    *h = initial_h / 2;
+    return;
+  }
+  if (frame < 220) {
+    *w = initial_w * 3 / 4;
+    *h = initial_h * 3 / 4;
+    return;
+  }
+  if (frame < 230) {
+    *w = initial_w;
+    *h = initial_h;
+    return;
+  }
+  if (frame < 240) {
+    *w = initial_w * 3 / 4;
+    *h = initial_h * 3 / 4;
+    return;
+  }
+  if (frame < 250) {
+    *w = initial_w  / 2;
+    *h = initial_h / 2;
+    return;
+  }
+  if (frame < 260) {
+    *w = initial_w;
+    *h = initial_h;
+    return;
+  }
+  // Go down very low.
+  if (frame < 270) {
+    *w = initial_w / 4;
+    *h = initial_h / 4;
+    return;
+  }
+  if (flag_codec == 1) {
+    // Cases that only work for VP9.
+    // For VP9: Swap width and height of original.
+    if (frame < 320) {
+      *w = initial_h;
+      *h = initial_w;
+      return;
+    }
+  }
+  *w = initial_w;
+  *h = initial_h;
 }
 
 class ResizingVideoSource : public ::libvpx_test::DummyVideoSource {
  public:
   ResizingVideoSource() {
     SetSize(kInitialWidth, kInitialHeight);
-    limit_ = 300;
+    limit_ = 350;
   }
-
+  int flag_codec_;
   virtual ~ResizingVideoSource() {}
 
  protected:
   virtual void Next() {
     ++frame_;
-    SetSize(ScaleForFrameNumber(frame_, kInitialWidth),
-            ScaleForFrameNumber(frame_, kInitialHeight));
+    unsigned int width;
+    unsigned int height;
+    ScaleForFrameNumber(frame_, kInitialWidth, kInitialHeight, &width, &height,
+                        flag_codec_);
+    SetSize(width, height);
     FillFrame();
   }
 };
@@ -184,15 +288,17 @@
 
 TEST_P(ResizeTest, TestExternalResizeWorks) {
   ResizingVideoSource video;
+  video.flag_codec_ = 0;
   cfg_.g_lag_in_frames = 0;
   ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
 
   for (std::vector<FrameInfo>::const_iterator info = frame_info_list_.begin();
        info != frame_info_list_.end(); ++info) {
     const unsigned int frame = static_cast<unsigned>(info->pts);
-    const unsigned int expected_w = ScaleForFrameNumber(frame, kInitialWidth);
-    const unsigned int expected_h = ScaleForFrameNumber(frame, kInitialHeight);
-
+    unsigned int expected_w;
+    unsigned int expected_h;
+    ScaleForFrameNumber(frame, kInitialWidth, kInitialHeight,
+                        &expected_w, &expected_h, 0);
     EXPECT_EQ(expected_w, info->w)
         << "Frame " << frame << " had unexpected width";
     EXPECT_EQ(expected_h, info->h)
@@ -386,6 +492,7 @@
 
 TEST_P(ResizeRealtimeTest, TestExternalResizeWorks) {
   ResizingVideoSource video;
+  video.flag_codec_ = 1;
   DefaultConfig();
   // Disable internal resize for this test.
   cfg_.rc_resize_allowed = 0;
@@ -395,9 +502,10 @@
   for (std::vector<FrameInfo>::const_iterator info = frame_info_list_.begin();
        info != frame_info_list_.end(); ++info) {
     const unsigned int frame = static_cast<unsigned>(info->pts);
-    const unsigned int expected_w = ScaleForFrameNumber(frame, kInitialWidth);
-    const unsigned int expected_h = ScaleForFrameNumber(frame, kInitialHeight);
-
+    unsigned int expected_w;
+    unsigned int expected_h;
+    ScaleForFrameNumber(frame, kInitialWidth, kInitialHeight,
+                        &expected_w, &expected_h, 1);
     EXPECT_EQ(expected_w, info->w)
         << "Frame " << frame << " had unexpected width";
     EXPECT_EQ(expected_h, info->h)
diff --git a/test/test_vector_test.cc b/test/test_vector_test.cc
index 6f0cbdf..f1aa4d7 100644
--- a/test/test_vector_test.cc
+++ b/test/test_vector_test.cc
@@ -103,7 +103,7 @@
   const int mode = std::tr1::get<kDecodeMode>(input);
   libvpx_test::CompressedVideoSource *video = NULL;
   vpx_codec_flags_t flags = 0;
-  vpx_codec_dec_cfg_t cfg = {0};
+  vpx_codec_dec_cfg_t cfg = vpx_codec_dec_cfg_t();
   char str[256];
 
   if (mode == kFrameParallelMode) {
diff --git a/test/vp9_encoder_parms_get_to_decoder.cc b/test/vp9_encoder_parms_get_to_decoder.cc
index 3ef6022..bd84098 100644
--- a/test/vp9_encoder_parms_get_to_decoder.cc
+++ b/test/vp9_encoder_parms_get_to_decoder.cc
@@ -45,9 +45,9 @@
 };
 
 const EncodeParameters kVP9EncodeParameterSet[] = {
-  {0, 0, 0, 1, 0, VPX_CR_STUDIO_RANGE, VPX_CS_BT_601},
-  {0, 0, 0, 0, 0, VPX_CR_FULL_RANGE, VPX_CS_BT_709},
-  {0, 0, 1, 0, 0, VPX_CR_FULL_RANGE, VPX_CS_BT_2020},
+  {0, 0, 0, 1, 0, VPX_CR_STUDIO_RANGE, VPX_CS_BT_601, { 0, 0 }},
+  {0, 0, 0, 0, 0, VPX_CR_FULL_RANGE, VPX_CS_BT_709, { 0, 0 }},
+  {0, 0, 1, 0, 0, VPX_CR_FULL_RANGE, VPX_CS_BT_2020, { 0, 0 }},
   {0, 2, 0, 0, 1, VPX_CR_STUDIO_RANGE, VPX_CS_UNKNOWN, { 640, 480 }},
   // TODO(JBB): Test profiles (requires more work).
 };
@@ -93,7 +93,7 @@
   }
 
   virtual bool HandleDecodeResult(const vpx_codec_err_t res_dec,
-                                  const libvpx_test::VideoSource &video,
+                                  const libvpx_test::VideoSource & /*video*/,
                                   libvpx_test::Decoder *decoder) {
     vpx_codec_ctx_t *const vp9_decoder = decoder->GetDecoder();
     vpx_codec_alg_priv_t *const priv =
diff --git a/test/vp9_error_block_test.cc b/test/vp9_error_block_test.cc
index 77b12ea..23a249e 100644
--- a/test/vp9_error_block_test.cc
+++ b/test/vp9_error_block_test.cc
@@ -164,7 +164,7 @@
                                            const tran_low_t *dqcoeff,
                                            intptr_t block_size,
                                            int64_t *ssz, int bps) {
-  assert(bps == 8);
+  EXPECT_EQ(8, bps);
   return vp9_highbd_block_error_8bit_c(coeff, dqcoeff, block_size, ssz);
 }
 
@@ -173,7 +173,7 @@
                                               const tran_low_t *dqcoeff,
                                               intptr_t block_size,
                                               int64_t *ssz, int bps) {
-  assert(bps == 8);
+  EXPECT_EQ(8, bps);
   return vp9_highbd_block_error_8bit_sse2(coeff, dqcoeff, block_size, ssz);
 }
 
@@ -195,7 +195,7 @@
                                               const tran_low_t *dqcoeff,
                                               intptr_t block_size,
                                               int64_t *ssz, int bps) {
-  assert(bps == 8);
+  EXPECT_EQ(8, bps);
   return vp9_highbd_block_error_8bit_avx(coeff, dqcoeff, block_size, ssz);
 }
 
diff --git a/test/vp9_ethread_test.cc b/test/vp9_ethread_test.cc
index 8ac5c33..29a653f 100644
--- a/test/vp9_ethread_test.cc
+++ b/test/vp9_ethread_test.cc
@@ -62,7 +62,7 @@
     encoder_initialized_ = false;
   }
 
-  virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
+  virtual void PreEncodeFrameHook(::libvpx_test::VideoSource * /*video*/,
                                   ::libvpx_test::Encoder *encoder) {
     if (!encoder_initialized_) {
       // Encode 4 column tiles.
diff --git a/test/vp9_intrapred_test.cc b/test/vp9_intrapred_test.cc
index e6198af..416f3c3 100644
--- a/test/vp9_intrapred_test.cc
+++ b/test/vp9_intrapred_test.cc
@@ -34,7 +34,7 @@
   virtual ~VP9IntraPredBase() { libvpx_test::ClearSystemState(); }
 
  protected:
-  virtual void Predict(PREDICTION_MODE mode) = 0;
+  virtual void Predict() = 0;
 
   void CheckPrediction(int test_case_number, int *error_count) const {
     // For each pixel ensure that the calculated value is the same as reference.
@@ -73,7 +73,7 @@
           left_col_[y] = rnd.Rand16() & mask_;
         }
       }
-      Predict(DC_PRED);
+      Predict();
       CheckPrediction(i, &error_count);
     }
     ASSERT_EQ(0, error_count);
@@ -106,7 +106,7 @@
     mask_       = (1 << bit_depth_) - 1;
   }
 
-  virtual void Predict(PREDICTION_MODE mode) {
+  virtual void Predict() {
     const uint16_t *const_above_row = above_row_;
     const uint16_t *const_left_col = left_col_;
     ref_fn_(ref_dst_, stride_, const_above_row, const_left_col, bit_depth_);
diff --git a/vp10/common/enums.h b/vp10/common/enums.h
index f41b8d9..af6ef36 100644
--- a/vp10/common/enums.h
+++ b/vp10/common/enums.h
@@ -115,7 +115,6 @@
 #define EXT_TX_SIZES       3  // number of sizes that use extended transforms
 
 #if CONFIG_EXT_TX
-#define USE_DST2           1
 #define EXT_TX_SETS_INTER  4  // Sets of transform selections for INTER
 #define EXT_TX_SETS_INTRA  3  // Sets of transform selections for INTRA
 #endif  // CONFIG_EXT_TX
diff --git a/vp10/common/idct.c b/vp10/common/idct.c
index d42f5f5..dbb50fb 100644
--- a/vp10/common/idct.c
+++ b/vp10/common/idct.c
@@ -20,7 +20,6 @@
 
 #if CONFIG_EXT_TX
 void idst4_c(const tran_low_t *input, tran_low_t *output) {
-#if USE_DST2
   tran_low_t step[4];
   tran_high_t temp1, temp2;
   // stage 1
@@ -38,29 +37,9 @@
   output[1] = WRAPLOW(-step[1] - step[2], 8);
   output[2] = WRAPLOW(step[1] - step[2], 8);
   output[3] = WRAPLOW(step[3] - step[0], 8);
-#else
-  // {sin(pi/5), sin(pi*2/5)} * sqrt(2/5) * sqrt(2)
-  static const int32_t sinvalue_lookup[] = {
-    141124871, 228344838,
-  };
-  int64_t sum;
-  int64_t s03 = (input[0] + input[3]);
-  int64_t d03 = (input[0] - input[3]);
-  int64_t s12 = (input[1] + input[2]);
-  int64_t d12 = (input[1] - input[2]);
-  sum = s03 * sinvalue_lookup[0] + s12 * sinvalue_lookup[1];
-  output[0] = WRAPLOW(ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS)), 8);
-  sum = d03 * sinvalue_lookup[1] + d12 * sinvalue_lookup[0];
-  output[1] = WRAPLOW(ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS)), 8);
-  sum = s03 * sinvalue_lookup[1] - s12 * sinvalue_lookup[0];
-  output[2] = WRAPLOW(ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS)), 8);
-  sum = d03 * sinvalue_lookup[0] - d12 * sinvalue_lookup[1];
-  output[3] = WRAPLOW(ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS)), 8);
-#endif  // USE_DST2
 }
 
 void idst8_c(const tran_low_t *input, tran_low_t *output) {
-#if USE_DST2
   // vp9_igentx8(input, output, Tx8);
   tran_low_t step1[8], step2[8];
   tran_high_t temp1, temp2;
@@ -113,47 +92,9 @@
   output[5] = WRAPLOW(-step1[2] + step1[5], 8);
   output[6] = WRAPLOW(step1[1] - step1[6], 8);
   output[7] = WRAPLOW(-step1[0] + step1[7], 8);
-#else
-  // {sin(pi/9), sin(pi*2/9), ..., sin(pi*4/9)} * sqrt(2/9) * 2
-  static const int32_t sinvalue_lookup[] = {
-    86559612, 162678858, 219176632, 249238470
-  };
-  int64_t sum;
-  int64_t s07 = (input[0] + input[7]);
-  int64_t d07 = (input[0] - input[7]);
-  int64_t s16 = (input[1] + input[6]);
-  int64_t d16 = (input[1] - input[6]);
-  int64_t s25 = (input[2] + input[5]);
-  int64_t d25 = (input[2] - input[5]);
-  int64_t s34 = (input[3] + input[4]);
-  int64_t d34 = (input[3] - input[4]);
-  sum = s07 * sinvalue_lookup[0] + s16 * sinvalue_lookup[1] +
-        s25 * sinvalue_lookup[2] + s34 * sinvalue_lookup[3];
-  output[0] = WRAPLOW(ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS)), 8);
-  sum = d07 * sinvalue_lookup[1] + d16 * sinvalue_lookup[3] +
-        d25 * sinvalue_lookup[2] + d34 * sinvalue_lookup[0];
-  output[1] = WRAPLOW(ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS)), 8);
-  sum = (s07 + s16 - s34)* sinvalue_lookup[2];
-  output[2] = WRAPLOW(ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS)), 8);
-  sum = d07 * sinvalue_lookup[3] + d16 * sinvalue_lookup[0] -
-        d25 * sinvalue_lookup[2] - d34 * sinvalue_lookup[1];
-  output[3] = WRAPLOW(ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS)), 8);
-  sum = s07 * sinvalue_lookup[3] - s16 * sinvalue_lookup[0] -
-        s25 * sinvalue_lookup[2] + s34 * sinvalue_lookup[1];
-  output[4] = WRAPLOW(ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS)), 8);
-  sum = (d07 - d16 + d34)* sinvalue_lookup[2];
-  output[5] = WRAPLOW(ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS)), 8);
-  sum = s07 * sinvalue_lookup[1] - s16 * sinvalue_lookup[3] +
-        s25 * sinvalue_lookup[2] - s34 * sinvalue_lookup[0];
-  output[6] = WRAPLOW(ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS)), 8);
-  sum = d07 * sinvalue_lookup[0] - d16 * sinvalue_lookup[1] +
-        d25 * sinvalue_lookup[2] - d34 * sinvalue_lookup[3];
-  output[7] = WRAPLOW(ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS)), 8);
-#endif  // USE_DST2
 }
 
 void idst16_c(const tran_low_t *input, tran_low_t *output) {
-#if USE_DST2
   tran_low_t step1[16], step2[16];
   tran_high_t temp1, temp2;
 
@@ -316,112 +257,75 @@
   output[13] = WRAPLOW(-step2[2] + step2[13], 8);
   output[14] = WRAPLOW(step2[1] - step2[14], 8);
   output[15] = WRAPLOW(-step2[0] + step2[15], 8);
-#else
-  // {sin(pi/17), sin(pi*2/17, ..., sin(pi*8/17)} * sqrt(2/17) * 2 * sqrt(2)
-  static const int32_t sinvalue_lookup[] = {
-    47852167, 94074787, 137093803, 175444254,
-    207820161, 233119001, 250479254, 259309736
-  };
-  int64_t sum;
-  int64_t s015 = (input[0] + input[15]);
-  int64_t d015 = (input[0] - input[15]);
-  int64_t s114 = (input[1] + input[14]);
-  int64_t d114 = (input[1] - input[14]);
-  int64_t s213 = (input[2] + input[13]);
-  int64_t d213 = (input[2] - input[13]);
-  int64_t s312 = (input[3] + input[12]);
-  int64_t d312 = (input[3] - input[12]);
-  int64_t s411 = (input[4] + input[11]);
-  int64_t d411 = (input[4] - input[11]);
-  int64_t s510 = (input[5] + input[10]);
-  int64_t d510 = (input[5] - input[10]);
-  int64_t s69  = (input[6] + input[9]);
-  int64_t d69  = (input[6] - input[9]);
-  int64_t s78  = (input[7] + input[8]);
-  int64_t d78  = (input[7] - input[8]);
-  sum = s015 * sinvalue_lookup[0] + s114 * sinvalue_lookup[1] +
-        s213 * sinvalue_lookup[2] + s312 * sinvalue_lookup[3] +
-        s411 * sinvalue_lookup[4] + s510 * sinvalue_lookup[5] +
-        s69  * sinvalue_lookup[6] + s78  * sinvalue_lookup[7];
-  output[0]  = WRAPLOW(ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS)), 8);
-  sum = d015 * sinvalue_lookup[1] + d114 * sinvalue_lookup[3] +
-        d213 * sinvalue_lookup[5] + d312 * sinvalue_lookup[7] +
-        d411 * sinvalue_lookup[6] + d510 * sinvalue_lookup[4] +
-        d69  * sinvalue_lookup[2] + d78  * sinvalue_lookup[0];
-  output[1]  = WRAPLOW(ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS)), 8);
-  sum = s015 * sinvalue_lookup[2] + s114 * sinvalue_lookup[5] +
-        s213 * sinvalue_lookup[7] + s312 * sinvalue_lookup[4] +
-        s411 * sinvalue_lookup[1] - s510 * sinvalue_lookup[0] -
-        s69  * sinvalue_lookup[3] - s78  * sinvalue_lookup[6];
-  output[2]  = WRAPLOW(ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS)), 8);
-  sum = d015 * sinvalue_lookup[3] + d114 * sinvalue_lookup[7] +
-        d213 * sinvalue_lookup[4] + d312 * sinvalue_lookup[0] -
-        d411 * sinvalue_lookup[2] - d510 * sinvalue_lookup[6] -
-        d69  * sinvalue_lookup[5] - d78  * sinvalue_lookup[1];
-  output[3]  = WRAPLOW(ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS)), 8);
-  sum = s015 * sinvalue_lookup[4] + s114 * sinvalue_lookup[6] +
-        s213 * sinvalue_lookup[1] - s312 * sinvalue_lookup[2] -
-        s411 * sinvalue_lookup[7] - s510 * sinvalue_lookup[3] +
-        s69  * sinvalue_lookup[0] + s78  * sinvalue_lookup[5];
-  output[4]  = WRAPLOW(ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS)), 8);
-  sum = d015 * sinvalue_lookup[5] + d114 * sinvalue_lookup[4] -
-        d213 * sinvalue_lookup[0] - d312 * sinvalue_lookup[6] -
-        d411 * sinvalue_lookup[3] + d510 * sinvalue_lookup[1] +
-        d69  * sinvalue_lookup[7] + d78  * sinvalue_lookup[2];
-  output[5]  = WRAPLOW(ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS)), 8);
-  sum = s015 * sinvalue_lookup[6] + s114 * sinvalue_lookup[2] -
-        s213 * sinvalue_lookup[3] - s312 * sinvalue_lookup[5] +
-        s411 * sinvalue_lookup[0] + s510 * sinvalue_lookup[7] +
-        s69  * sinvalue_lookup[1] - s78  * sinvalue_lookup[4];
-  output[6]  = WRAPLOW(ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS)), 8);
-  sum = d015 * sinvalue_lookup[7] + d114 * sinvalue_lookup[0] -
-        d213 * sinvalue_lookup[6] - d312 * sinvalue_lookup[1] +
-        d411 * sinvalue_lookup[5] + d510 * sinvalue_lookup[2] -
-        d69  * sinvalue_lookup[4] - d78  * sinvalue_lookup[3];
-  output[7]  = WRAPLOW(ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS)), 8);
-  sum = s015 * sinvalue_lookup[7] - s114 * sinvalue_lookup[0] -
-        s213 * sinvalue_lookup[6] + s312 * sinvalue_lookup[1] +
-        s411 * sinvalue_lookup[5] - s510 * sinvalue_lookup[2] -
-        s69  * sinvalue_lookup[4] + s78  * sinvalue_lookup[3];
-  output[8]  = WRAPLOW(ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS)), 8);
-  sum = d015 * sinvalue_lookup[6] - d114 * sinvalue_lookup[2] -
-        d213 * sinvalue_lookup[3] + d312 * sinvalue_lookup[5] +
-        d411 * sinvalue_lookup[0] - d510 * sinvalue_lookup[7] +
-        d69  * sinvalue_lookup[1] + d78  * sinvalue_lookup[4];
-  output[9]  = WRAPLOW(ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS)), 8);
-  sum = s015 * sinvalue_lookup[5] - s114 * sinvalue_lookup[4] -
-        s213 * sinvalue_lookup[0] + s312 * sinvalue_lookup[6] -
-        s411 * sinvalue_lookup[3] - s510 * sinvalue_lookup[1] +
-        s69  * sinvalue_lookup[7] - s78  * sinvalue_lookup[2];
-  output[10] = WRAPLOW(ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS)), 8);
-  sum = d015 * sinvalue_lookup[4] - d114 * sinvalue_lookup[6] +
-        d213 * sinvalue_lookup[1] + d312 * sinvalue_lookup[2] -
-        d411 * sinvalue_lookup[7] + d510 * sinvalue_lookup[3] +
-        d69  * sinvalue_lookup[0] - d78  * sinvalue_lookup[5];
-  output[11] = WRAPLOW(ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS)), 8);
-  sum = s015 * sinvalue_lookup[3] - s114 * sinvalue_lookup[7] +
-        s213 * sinvalue_lookup[4] - s312 * sinvalue_lookup[0] -
-        s411 * sinvalue_lookup[2] + s510 * sinvalue_lookup[6] -
-        s69  * sinvalue_lookup[5] + s78  * sinvalue_lookup[1];
-  output[12] = WRAPLOW(ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS)), 8);
-  sum = d015 * sinvalue_lookup[2] - d114 * sinvalue_lookup[5] +
-        d213 * sinvalue_lookup[7] - d312 * sinvalue_lookup[4] +
-        d411 * sinvalue_lookup[1] + d510 * sinvalue_lookup[0] -
-        d69  * sinvalue_lookup[3] + d78  * sinvalue_lookup[6];
-  output[13] = WRAPLOW(ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS)), 8);
-  sum = s015 * sinvalue_lookup[1] - s114 * sinvalue_lookup[3] +
-        s213 * sinvalue_lookup[5] - s312 * sinvalue_lookup[7] +
-        s411 * sinvalue_lookup[6] - s510 * sinvalue_lookup[4] +
-        s69  * sinvalue_lookup[2] - s78  * sinvalue_lookup[0];
-  output[14] = WRAPLOW(ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS)), 8);
-  sum = d015 * sinvalue_lookup[0] - d114 * sinvalue_lookup[1] +
-        d213 * sinvalue_lookup[2] - d312 * sinvalue_lookup[3] +
-        d411 * sinvalue_lookup[4] - d510 * sinvalue_lookup[5] +
-        d69  * sinvalue_lookup[6] - d78  * sinvalue_lookup[7];
-  output[15] = WRAPLOW(ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS)), 8);
-#endif  // USE_DST2
 }
 
+#if CONFIG_EXT_TX
+// For use in lieu of DST
+static void ihalfcenter32_c(const tran_low_t *input, tran_low_t *output) {
+  int i;
+  tran_low_t inputhalf[16];
+  for (i = 0; i < 8; ++i) {
+    output[i] = input[16 + i] * 4;
+    output[24 + i] = input[24 + i] * 4;
+  }
+  // Multiply input by sqrt(2)
+  for (i = 0; i < 16; ++i) {
+    inputhalf[i] = (tran_low_t)dct_const_round_shift(input[i] * Sqrt2);
+  }
+  idct16_c(inputhalf, output + 8);
+  // Note overall scaling factor is 4 times orthogonal
+}
+
+static void ihalfright32_c(const tran_low_t *input, tran_low_t *output) {
+  int i;
+  tran_low_t inputhalf[16];
+  for (i = 0; i < 16; ++i) {
+    output[i] = input[16 + i] * 4;
+  }
+  // Multiply input by sqrt(2)
+  for (i = 0; i < 16; ++i) {
+    inputhalf[i] = (tran_low_t)dct_const_round_shift(input[i] * Sqrt2);
+  }
+  idct16_c(inputhalf, output + 16);
+  // Note overall scaling factor is 4 times orthogonal
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+static void highbd_ihalfcenter32_c(const tran_low_t *input, tran_low_t *output,
+                                   int bd) {
+  int i;
+  tran_low_t inputhalf[16];
+  for (i = 0; i < 8; ++i) {
+    output[i] = input[16 + i] * 4;
+    output[24 + i] = input[24 + i] * 4;
+  }
+  // Multiply input by sqrt(2)
+  for (i = 0; i < 16; ++i) {
+    inputhalf[i] = (tran_low_t)highbd_dct_const_round_shift(
+        input[i] * Sqrt2, bd);
+  }
+  vpx_highbd_idct16_c(inputhalf, output + 8, bd);
+  // Note overall scaling factor is 4 times orthogonal
+}
+
+static void highbd_ihalfright32_c(const tran_low_t *input, tran_low_t *output,
+                                  int bd) {
+  int i;
+  tran_low_t inputhalf[16];
+  for (i = 0; i < 16; ++i) {
+    output[i] = input[16 + i] * 4;
+  }
+  // Multiply input by sqrt(2)
+  for (i = 0; i < 16; ++i) {
+    inputhalf[i] = (tran_low_t)highbd_dct_const_round_shift(
+        input[i] * Sqrt2, bd);
+  }
+  vpx_highbd_idct16_c(inputhalf, output + 16, bd);
+  // Note overall scaling factor is 4 times orthogonal
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+#endif  // CONFIG_EXT_TX
+
 // Inverse identiy transform and add.
 static void inv_idtx_add_c(const tran_low_t *input, uint8_t *dest, int stride,
                            int bs) {
@@ -483,7 +387,6 @@
 
 #if CONFIG_VP9_HIGHBITDEPTH
 void highbd_idst4_c(const tran_low_t *input, tran_low_t *output, int bd) {
-#if USE_DST2
   tran_low_t step[4];
   tran_high_t temp1, temp2;
   (void) bd;
@@ -502,34 +405,9 @@
   output[1] = WRAPLOW(-step[1] - step[2], bd);
   output[2] = WRAPLOW(step[1] - step[2], bd);
   output[3] = WRAPLOW(step[3] - step[0], bd);
-#else
-  // {sin(pi/5), sin(pi*2/5)} * sqrt(2/5) * sqrt(2)
-  static const int32_t sinvalue_lookup[] = {
-    141124871, 228344838,
-  };
-  int64_t sum;
-  int64_t s03 = (input[0] + input[3]);
-  int64_t d03 = (input[0] - input[3]);
-  int64_t s12 = (input[1] + input[2]);
-  int64_t d12 = (input[1] - input[2]);
-
-#if !CONFIG_EMULATE_HARDWARE
-  (void)bd;
-#endif
-
-  sum = s03 * sinvalue_lookup[0] + s12 * sinvalue_lookup[1];
-  output[0] = WRAPLOW(ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS)), bd);
-  sum = d03 * sinvalue_lookup[1] + d12 * sinvalue_lookup[0];
-  output[1] = WRAPLOW(ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS)), bd);
-  sum = s03 * sinvalue_lookup[1] - s12 * sinvalue_lookup[0];
-  output[2] = WRAPLOW(ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS)), bd);
-  sum = d03 * sinvalue_lookup[0] - d12 * sinvalue_lookup[1];
-  output[3] = WRAPLOW(ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS)), bd);
-#endif  // USE_DST2
 }
 
 void highbd_idst8_c(const tran_low_t *input, tran_low_t *output, int bd) {
-#if USE_DST2
   tran_low_t step1[8], step2[8];
   tran_high_t temp1, temp2;
   (void) bd;
@@ -582,52 +460,9 @@
   output[5] = WRAPLOW(-step1[2] + step1[5], bd);
   output[6] = WRAPLOW(step1[1] - step1[6], bd);
   output[7] = WRAPLOW(-step1[0] + step1[7], bd);
-#else
-  // {sin(pi/9), sin(pi*2/9), ..., sin(pi*4/9)} * sqrt(2/9) * 2
-  static const int32_t sinvalue_lookup[] = {
-    86559612, 162678858, 219176632, 249238470
-  };
-  int64_t sum;
-  int64_t s07 = (input[0] + input[7]);
-  int64_t d07 = (input[0] - input[7]);
-  int64_t s16 = (input[1] + input[6]);
-  int64_t d16 = (input[1] - input[6]);
-  int64_t s25 = (input[2] + input[5]);
-  int64_t d25 = (input[2] - input[5]);
-  int64_t s34 = (input[3] + input[4]);
-  int64_t d34 = (input[3] - input[4]);
-
-#if !CONFIG_EMULATE_HARDWARE
-  (void)bd;
-#endif
-
-  sum = s07 * sinvalue_lookup[0] + s16 * sinvalue_lookup[1] +
-        s25 * sinvalue_lookup[2] + s34 * sinvalue_lookup[3];
-  output[0] = WRAPLOW(ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS)), bd);
-  sum = d07 * sinvalue_lookup[1] + d16 * sinvalue_lookup[3] +
-        d25 * sinvalue_lookup[2] + d34 * sinvalue_lookup[0];
-  output[1] = WRAPLOW(ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS)), bd);
-  sum = (s07 + s16 - s34)* sinvalue_lookup[2];
-  output[2] = WRAPLOW(ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS)), bd);
-  sum = d07 * sinvalue_lookup[3] + d16 * sinvalue_lookup[0] -
-        d25 * sinvalue_lookup[2] - d34 * sinvalue_lookup[1];
-  output[3] = WRAPLOW(ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS)), bd);
-  sum = s07 * sinvalue_lookup[3] - s16 * sinvalue_lookup[0] -
-        s25 * sinvalue_lookup[2] + s34 * sinvalue_lookup[1];
-  output[4] = WRAPLOW(ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS)), bd);
-  sum = (d07 - d16 + d34)* sinvalue_lookup[2];
-  output[5] = WRAPLOW(ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS)), bd);
-  sum = s07 * sinvalue_lookup[1] - s16 * sinvalue_lookup[3] +
-        s25 * sinvalue_lookup[2] - s34 * sinvalue_lookup[0];
-  output[6] = WRAPLOW(ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS)), bd);
-  sum = d07 * sinvalue_lookup[0] - d16 * sinvalue_lookup[1] +
-        d25 * sinvalue_lookup[2] - d34 * sinvalue_lookup[3];
-  output[7] = WRAPLOW(ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS)), bd);
-#endif  // USE_DST2
 }
 
 void highbd_idst16_c(const tran_low_t *input, tran_low_t *output, int bd) {
-#if USE_DST2
   // vp9_highbd_igentx16(input, output, bd, Tx16);
   tran_low_t step1[16], step2[16];
   tran_high_t temp1, temp2;
@@ -792,115 +627,6 @@
   output[13] = WRAPLOW(-step2[2] + step2[13], bd);
   output[14] = WRAPLOW(step2[1] - step2[14], bd);
   output[15] = WRAPLOW(-step2[0] + step2[15], bd);
-#else
-  // {sin(pi/17), sin(pi*2/17, ..., sin(pi*8/17)} * sqrt(2/17) * 2 * sqrt(2)
-  static const int32_t sinvalue_lookup[] = {
-    47852167, 94074787, 137093803, 175444254,
-    207820161, 233119001, 250479254, 259309736
-  };
-  int64_t sum;
-  int64_t s015 = (input[0] + input[15]);
-  int64_t d015 = (input[0] - input[15]);
-  int64_t s114 = (input[1] + input[14]);
-  int64_t d114 = (input[1] - input[14]);
-  int64_t s213 = (input[2] + input[13]);
-  int64_t d213 = (input[2] - input[13]);
-  int64_t s312 = (input[3] + input[12]);
-  int64_t d312 = (input[3] - input[12]);
-  int64_t s411 = (input[4] + input[11]);
-  int64_t d411 = (input[4] - input[11]);
-  int64_t s510 = (input[5] + input[10]);
-  int64_t d510 = (input[5] - input[10]);
-  int64_t s69  = (input[6] + input[9]);
-  int64_t d69  = (input[6] - input[9]);
-  int64_t s78  = (input[7] + input[8]);
-  int64_t d78  = (input[7] - input[8]);
-
-#if !CONFIG_EMULATE_HARDWARE
-  (void)bd;
-#endif
-
-  sum = s015 * sinvalue_lookup[0] + s114 * sinvalue_lookup[1] +
-        s213 * sinvalue_lookup[2] + s312 * sinvalue_lookup[3] +
-        s411 * sinvalue_lookup[4] + s510 * sinvalue_lookup[5] +
-        s69  * sinvalue_lookup[6] + s78  * sinvalue_lookup[7];
-  output[0]  = WRAPLOW(ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS)), bd);
-  sum = d015 * sinvalue_lookup[1] + d114 * sinvalue_lookup[3] +
-        d213 * sinvalue_lookup[5] + d312 * sinvalue_lookup[7] +
-        d411 * sinvalue_lookup[6] + d510 * sinvalue_lookup[4] +
-        d69  * sinvalue_lookup[2] + d78  * sinvalue_lookup[0];
-  output[1]  = WRAPLOW(ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS)), bd);
-  sum = s015 * sinvalue_lookup[2] + s114 * sinvalue_lookup[5] +
-        s213 * sinvalue_lookup[7] + s312 * sinvalue_lookup[4] +
-        s411 * sinvalue_lookup[1] - s510 * sinvalue_lookup[0] -
-        s69  * sinvalue_lookup[3] - s78  * sinvalue_lookup[6];
-  output[2]  = WRAPLOW(ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS)), bd);
-  sum = d015 * sinvalue_lookup[3] + d114 * sinvalue_lookup[7] +
-        d213 * sinvalue_lookup[4] + d312 * sinvalue_lookup[0] -
-        d411 * sinvalue_lookup[2] - d510 * sinvalue_lookup[6] -
-        d69  * sinvalue_lookup[5] - d78  * sinvalue_lookup[1];
-  output[3]  = WRAPLOW(ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS)), bd);
-  sum = s015 * sinvalue_lookup[4] + s114 * sinvalue_lookup[6] +
-        s213 * sinvalue_lookup[1] - s312 * sinvalue_lookup[2] -
-        s411 * sinvalue_lookup[7] - s510 * sinvalue_lookup[3] +
-        s69  * sinvalue_lookup[0] + s78  * sinvalue_lookup[5];
-  output[4]  = WRAPLOW(ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS)), bd);
-  sum = d015 * sinvalue_lookup[5] + d114 * sinvalue_lookup[4] -
-        d213 * sinvalue_lookup[0] - d312 * sinvalue_lookup[6] -
-        d411 * sinvalue_lookup[3] + d510 * sinvalue_lookup[1] +
-        d69  * sinvalue_lookup[7] + d78  * sinvalue_lookup[2];
-  output[5]  = WRAPLOW(ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS)), bd);
-  sum = s015 * sinvalue_lookup[6] + s114 * sinvalue_lookup[2] -
-        s213 * sinvalue_lookup[3] - s312 * sinvalue_lookup[5] +
-        s411 * sinvalue_lookup[0] + s510 * sinvalue_lookup[7] +
-        s69  * sinvalue_lookup[1] - s78  * sinvalue_lookup[4];
-  output[6]  = WRAPLOW(ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS)), bd);
-  sum = d015 * sinvalue_lookup[7] + d114 * sinvalue_lookup[0] -
-        d213 * sinvalue_lookup[6] - d312 * sinvalue_lookup[1] +
-        d411 * sinvalue_lookup[5] + d510 * sinvalue_lookup[2] -
-        d69  * sinvalue_lookup[4] - d78  * sinvalue_lookup[3];
-  output[7]  = WRAPLOW(ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS)), bd);
-  sum = s015 * sinvalue_lookup[7] - s114 * sinvalue_lookup[0] -
-        s213 * sinvalue_lookup[6] + s312 * sinvalue_lookup[1] +
-        s411 * sinvalue_lookup[5] - s510 * sinvalue_lookup[2] -
-        s69  * sinvalue_lookup[4] + s78  * sinvalue_lookup[3];
-  output[8]  = WRAPLOW(ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS)), bd);
-  sum = d015 * sinvalue_lookup[6] - d114 * sinvalue_lookup[2] -
-        d213 * sinvalue_lookup[3] + d312 * sinvalue_lookup[5] +
-        d411 * sinvalue_lookup[0] - d510 * sinvalue_lookup[7] +
-        d69  * sinvalue_lookup[1] + d78  * sinvalue_lookup[4];
-  output[9]  = WRAPLOW(ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS)), bd);
-  sum = s015 * sinvalue_lookup[5] - s114 * sinvalue_lookup[4] -
-        s213 * sinvalue_lookup[0] + s312 * sinvalue_lookup[6] -
-        s411 * sinvalue_lookup[3] - s510 * sinvalue_lookup[1] +
-        s69  * sinvalue_lookup[7] - s78  * sinvalue_lookup[2];
-  output[10] = WRAPLOW(ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS)), bd);
-  sum = d015 * sinvalue_lookup[4] - d114 * sinvalue_lookup[6] +
-        d213 * sinvalue_lookup[1] + d312 * sinvalue_lookup[2] -
-        d411 * sinvalue_lookup[7] + d510 * sinvalue_lookup[3] +
-        d69  * sinvalue_lookup[0] - d78  * sinvalue_lookup[5];
-  output[11] = WRAPLOW(ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS)), bd);
-  sum = s015 * sinvalue_lookup[3] - s114 * sinvalue_lookup[7] +
-        s213 * sinvalue_lookup[4] - s312 * sinvalue_lookup[0] -
-        s411 * sinvalue_lookup[2] + s510 * sinvalue_lookup[6] -
-        s69  * sinvalue_lookup[5] + s78  * sinvalue_lookup[1];
-  output[12] = WRAPLOW(ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS)), bd);
-  sum = d015 * sinvalue_lookup[2] - d114 * sinvalue_lookup[5] +
-        d213 * sinvalue_lookup[7] - d312 * sinvalue_lookup[4] +
-        d411 * sinvalue_lookup[1] + d510 * sinvalue_lookup[0] -
-        d69  * sinvalue_lookup[3] + d78  * sinvalue_lookup[6];
-  output[13] = WRAPLOW(ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS)), bd);
-  sum = s015 * sinvalue_lookup[1] - s114 * sinvalue_lookup[3] +
-        s213 * sinvalue_lookup[5] - s312 * sinvalue_lookup[7] +
-        s411 * sinvalue_lookup[6] - s510 * sinvalue_lookup[4] +
-        s69  * sinvalue_lookup[2] - s78  * sinvalue_lookup[0];
-  output[14] = WRAPLOW(ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS)), bd);
-  sum = d015 * sinvalue_lookup[0] - d114 * sinvalue_lookup[1] +
-        d213 * sinvalue_lookup[2] - d312 * sinvalue_lookup[3] +
-        d411 * sinvalue_lookup[4] - d510 * sinvalue_lookup[5] +
-        d69  * sinvalue_lookup[6] - d78  * sinvalue_lookup[7];
-  output[15] = WRAPLOW(ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS)), bd);
-#endif  // USE_DST2
 }
 
 static void highbd_inv_idtx_add_c(const tran_low_t *input, uint8_t *dest8,
@@ -1149,6 +875,67 @@
   }
 }
 
+#if CONFIG_EXT_TX
+// Inverse 32x32 hybrid transform (CONFIG_EXT_TX): runs the 1-D kernels chosen
+// by tx_type (an index into IHT_32) over the rows, then the columns, of
+// `input`, and adds ROUND_POWER_OF_TWO(result, 6), clipped, into `dest`.
+// `stride` is the distance, in elements, between successive rows of `dest`.
+void vp10_iht32x32_1024_add_c(const tran_low_t *input, uint8_t *dest,
+                              int stride, int tx_type) {
+  static const transform_2d IHT_32[] = {
+    { idct32_c,  idct32_c  },                // DCT_DCT           = 0,
+    { ihalfright32_c, idct32_c  },           // ADST_DCT          = 1,
+    { idct32_c,  ihalfright32_c },           // DCT_ADST          = 2,
+    { ihalfright32_c, ihalfright32_c },      // ADST_ADST         = 3,
+    { ihalfright32_c, idct32_c  },           // FLIPADST_DCT      = 4,
+    { idct32_c,  ihalfright32_c },           // DCT_FLIPADST      = 5,
+    { ihalfright32_c, ihalfright32_c },      // FLIPADST_FLIPADST = 6,
+    { ihalfright32_c, ihalfright32_c },      // ADST_FLIPADST     = 7,
+    { ihalfright32_c, ihalfright32_c },      // FLIPADST_ADST     = 8,
+    { ihalfcenter32_c,  idct32_c  },         // DST_DCT           = 9,
+    { idct32_c,  ihalfcenter32_c  },         // DCT_DST           = 10,
+    { ihalfcenter32_c,  ihalfright32_c },    // DST_ADST          = 11,
+    { ihalfright32_c, ihalfcenter32_c  },    // ADST_DST          = 12,
+    { ihalfcenter32_c,  ihalfright32_c },    // DST_FLIPADST      = 13,
+    { ihalfright32_c, ihalfcenter32_c  },    // FLIPADST_DST      = 14,
+    { ihalfcenter32_c,  ihalfcenter32_c  },  // DST_DST           = 15
+  };
+  int i, j;
+  tran_low_t tmp, col[32];
+  tran_low_t out[32][32];
+  tran_low_t *outp = &out[0][0];
+  int outstride = 32;
+
+  // inverse transform row vectors
+  for (i = 0; i < 32; ++i, input += 32)
+    IHT_32[tx_type].rows(input, out[i]);
+  // transpose
+  for (i = 1; i < 32; ++i) {
+    for (j = 0; j < i; ++j) {
+      tmp = out[i][j];
+      out[i][j] = out[j][i];
+      out[j][i] = tmp;
+    }
+  }
+  // inverse transform column vectors; each column is staged in col[] because
+  // the half-length kernels store to output before they finish reading input
+  for (i = 0; i < 32; ++i) {
+    for (j = 0; j < 32; ++j) col[j] = out[i][j];
+    IHT_32[tx_type].cols(col, out[i]);
+  }
+
+  maybe_flip_strides(&dest, &stride, &outp, &outstride, tx_type, 32);
+  // Sum with the destination
+  for (i = 0; i < 32; ++i) {
+    for (j = 0; j < 32; ++j) {
+      const int d = i * stride + j;
+      const int s = j * outstride + i;
+      dest[d] = clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 6));
+    }
+  }
+}
+#endif  // CONFIG_EXT_TX
+
 // idct
 void vp10_idct4x4_add(const tran_low_t *input, uint8_t *dest, int stride,
                      int eob) {
@@ -1339,15 +1126,27 @@
       vp10_idct32x32_add(input, dest, stride, eob);
       break;
 #if CONFIG_EXT_TX
+    case ADST_DCT:
+    case DCT_ADST:
+    case ADST_ADST:
+    case FLIPADST_DCT:
+    case DCT_FLIPADST:
+    case FLIPADST_FLIPADST:
+    case ADST_FLIPADST:
+    case FLIPADST_ADST:
+    case DST_DST:
+    case DST_DCT:
+    case DCT_DST:
+    case DST_ADST:
+    case ADST_DST:
+    case FLIPADST_DST:
+    case DST_FLIPADST:
+      vp10_iht32x32_1024_add_c(input, dest, stride, tx_type);
+      break;
     case IDTX:
       inv_idtx_add_c(input, dest, stride, 32);
       break;
 #endif  // CONFIG_EXT_TX
-    case ADST_DCT:
-    case DCT_ADST:
-    case ADST_ADST:
-      assert(0);
-      break;
     default:
       assert(0);
       break;
@@ -1553,6 +1352,70 @@
   }
 }
 
+#if CONFIG_EXT_TX
+// High-bitdepth inverse 32x32 hybrid transform (CONFIG_EXT_TX): runs the 1-D
+// kernels chosen by tx_type (an index into HIGH_IHT_32) over the rows, then
+// the columns, of `input`, and adds ROUND_POWER_OF_TWO(result, 6), clipped for
+// bit depth `bd`, into the 16-bit buffer obtained from `dest8`.
+void vp10_highbd_iht32x32_1024_add_c(const tran_low_t *input, uint8_t *dest8,
+                                     int stride, int tx_type, int bd) {
+  static const highbd_transform_2d HIGH_IHT_32[] = {
+    { vpx_highbd_idct32_c, vpx_highbd_idct32_c  },        // DCT_DCT
+    { highbd_ihalfright32_c, vpx_highbd_idct32_c  },      // ADST_DCT
+    { vpx_highbd_idct32_c, highbd_ihalfright32_c },       // DCT_ADST
+    { highbd_ihalfright32_c, highbd_ihalfright32_c },     // ADST_ADST
+    { highbd_ihalfright32_c, vpx_highbd_idct32_c  },      // FLIPADST_DCT
+    { vpx_highbd_idct32_c, highbd_ihalfright32_c },       // DCT_FLIPADST
+    { highbd_ihalfright32_c, highbd_ihalfright32_c },     // FLIPADST_FLIPADST
+    { highbd_ihalfright32_c, highbd_ihalfright32_c },     // ADST_FLIPADST
+    { highbd_ihalfright32_c, highbd_ihalfright32_c },     // FLIPADST_ADST
+    { highbd_ihalfcenter32_c, vpx_highbd_idct32_c  },     // DST_DCT
+    { vpx_highbd_idct32_c, highbd_ihalfcenter32_c  },     // DCT_DST
+    { highbd_ihalfcenter32_c, highbd_ihalfright32_c },    // DST_ADST
+    { highbd_ihalfright32_c, highbd_ihalfcenter32_c  },   // ADST_DST
+    { highbd_ihalfcenter32_c, highbd_ihalfright32_c },    // DST_FLIPADST
+    { highbd_ihalfright32_c, highbd_ihalfcenter32_c  },   // FLIPADST_DST
+    { highbd_ihalfcenter32_c, highbd_ihalfcenter32_c  },  // DST_DST
+  };
+  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+
+  int i, j;
+  tran_low_t tmp, col[32];
+  tran_low_t out[32][32];
+  tran_low_t *outp = &out[0][0];
+  int outstride = 32;
+
+  // inverse transform row vectors
+  for (i = 0; i < 32; ++i, input += 32)
+    HIGH_IHT_32[tx_type].rows(input, out[i], bd);
+  // transpose
+  for (i = 1; i < 32; ++i) {
+    for (j = 0; j < i; ++j) {
+      tmp = out[i][j];
+      out[i][j] = out[j][i];
+      out[j][i] = tmp;
+    }
+  }
+  // inverse transform column vectors; each column is staged in col[] because
+  // the half-length kernels store to output before they finish reading input
+  for (i = 0; i < 32; ++i) {
+    for (j = 0; j < 32; ++j) col[j] = out[i][j];
+    HIGH_IHT_32[tx_type].cols(col, out[i], bd);
+  }
+
+  maybe_flip_strides16(&dest, &stride, &outp, &outstride, tx_type, 32);
+  // Sum with the destination
+  for (i = 0; i < 32; ++i) {
+    for (j = 0; j < 32; ++j) {
+      const int d = i * stride + j;
+      const int s = j * outstride + i;
+      dest[d] = highbd_clip_pixel_add(dest[d],
+                                      ROUND_POWER_OF_TWO(outp[s], 6), bd);
+    }
+  }
+}
+#endif  // CONFIG_EXT_TX
+
 // idct
 void vp10_highbd_idct4x4_add(const tran_low_t *input, uint8_t *dest, int stride,
                             int eob, int bd) {
@@ -1750,15 +1613,27 @@
       vp10_highbd_idct32x32_add(input, dest, stride, eob, bd);
       break;
 #if CONFIG_EXT_TX
+    case ADST_DCT:
+    case DCT_ADST:
+    case ADST_ADST:
+    case FLIPADST_DCT:
+    case DCT_FLIPADST:
+    case FLIPADST_FLIPADST:
+    case ADST_FLIPADST:
+    case FLIPADST_ADST:
+    case DST_DST:
+    case DST_DCT:
+    case DCT_DST:
+    case DST_ADST:
+    case ADST_DST:
+    case FLIPADST_DST:
+    case DST_FLIPADST:
+      vp10_highbd_iht32x32_1024_add_c(input, dest, stride, tx_type, bd);
+      break;
     case IDTX:
       highbd_inv_idtx_add_c(input, dest, stride, 32, bd);
       break;
 #endif  // CONFIG_EXT_TX
-    case ADST_DCT:
-    case DCT_ADST:
-    case ADST_ADST:
-      assert(0);
-      break;
     default:
       assert(0);
       break;
diff --git a/vp10/common/reconinter.c b/vp10/common/reconinter.c
index efc1ee1..c1cb69d 100644
--- a/vp10/common/reconinter.c
+++ b/vp10/common/reconinter.c
@@ -654,8 +654,6 @@
       uint8_t *dst = use_tmp_dst_buf ?
           &final_buf[plane][(i * 8) >> pd->subsampling_x] :
           &pd->dst.buf[(i * 8) >> pd->subsampling_x];
-      int bmc_stride = pd->dst.stride;
-      uint8_t *bmc = &pd->dst.buf[(i * 8) >> pd->subsampling_x];
       int tmp_stride = tmp_stride1[plane];
       uint8_t *tmp = &tmp_buf1[plane][(i * 8) >> pd->subsampling_x];
       const uint8_t *mask[2];
@@ -665,27 +663,22 @@
 #if CONFIG_VP9_HIGHBITDEPTH
       if (is_hbd) {
         uint16_t *dst16 = CONVERT_TO_SHORTPTR(dst);
-        uint16_t *bmc16 = CONVERT_TO_SHORTPTR(bmc);
         uint16_t *tmp16 = CONVERT_TO_SHORTPTR(tmp);
 
         for (row = 0; row < bh; ++row) {
-          for (col = 0; col < bw; ++col) {
-            dst16[col] = (mask[0][row] * bmc16[col] + mask[1][row] * tmp16[col]
+          for (col = 0; col < bw; ++col)
+            dst16[col] = (mask[0][row] * dst16[col] + mask[1][row] * tmp16[col]
                           + 32) >> 6;
-          }
           dst16 += dst_stride;
-          bmc16 += bmc_stride;
           tmp16 += tmp_stride;
         }
       } else {
 #endif  // CONFIG_VP9_HIGHBITDEPTH
       for (row = 0; row < bh; ++row) {
-        for (col = 0; col < bw; ++col) {
-          dst[col] = (mask[0][row] * bmc[col] + mask[1][row] * tmp[col] + 32)
+        for (col = 0; col < bw; ++col)
+          dst[col] = (mask[0][row] * dst[col] + mask[1][row] * tmp[col] + 32)
                      >> 6;
-        }
         dst += dst_stride;
-        bmc += bmc_stride;
         tmp += tmp_stride;
       }
 #if CONFIG_VP9_HIGHBITDEPTH
@@ -727,8 +720,6 @@
       uint8_t *dst = use_tmp_dst_buf ?
           &final_buf[plane][(i * 8 * dst_stride) >> pd->subsampling_y] :
           &pd->dst.buf[(i * 8 * dst_stride) >> pd->subsampling_y];
-      int bmc_stride = pd->dst.stride;
-      uint8_t *bmc = &pd->dst.buf[(i * 8 * bmc_stride) >> pd->subsampling_y];
       int tmp_stride = tmp_stride2[plane];
       uint8_t *tmp = &tmp_buf2[plane]
                               [(i * 8 * tmp_stride) >> pd->subsampling_y];
@@ -739,27 +730,22 @@
 #if CONFIG_VP9_HIGHBITDEPTH
       if (is_hbd) {
         uint16_t *dst16 = CONVERT_TO_SHORTPTR(dst);
-        uint16_t *bmc16 = CONVERT_TO_SHORTPTR(bmc);
         uint16_t *tmp16 = CONVERT_TO_SHORTPTR(tmp);
 
         for (row = 0; row < bh; ++row) {
-          for (col = 0; col < bw; ++col) {
-            dst16[col] = (mask[0][row] * bmc16[col] + mask[1][row] * tmp16[col]
+          for (col = 0; col < bw; ++col)
+            dst16[col] = (mask[0][row] * dst16[col] + mask[1][row] * tmp16[col]
                           + 32) >> 6;
-          }
           dst16 += dst_stride;
-          bmc16 += bmc_stride;
           tmp16 += tmp_stride;
         }
       } else {
 #endif  // CONFIG_VP9_HIGHBITDEPTH
       for (row = 0; row < bh; ++row) {
-        for (col = 0; col < bw; ++col) {
-          dst[col] = (mask[0][col] * bmc[col] + mask[1][col] * tmp[col] + 32)
+        for (col = 0; col < bw; ++col)
+          dst[col] = (mask[0][col] * dst[col] + mask[1][col] * tmp[col] + 32)
                      >> 6;
-        }
         dst += dst_stride;
-        bmc += bmc_stride;
         tmp += tmp_stride;
       }
 #if CONFIG_VP9_HIGHBITDEPTH
diff --git a/vp10/common/vp10_rtcd_defs.pl b/vp10/common/vp10_rtcd_defs.pl
index 9860bae..c9f0295 100644
--- a/vp10/common/vp10_rtcd_defs.pl
+++ b/vp10/common/vp10_rtcd_defs.pl
@@ -404,6 +404,9 @@
   add_proto qw/void vp10_fht16x16/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
   specialize qw/vp10_fht16x16 sse2/;
 
+  add_proto qw/void vp10_fht32x32/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
+  specialize qw/vp10_fht32x32/;
+
   add_proto qw/void vp10_fwht4x4/, "const int16_t *input, tran_low_t *output, int stride";
   specialize qw/vp10_fwht4x4/, "$mmx_x86inc";
 } else {
@@ -416,6 +419,9 @@
   add_proto qw/void vp10_fht16x16/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
   specialize qw/vp10_fht16x16 sse2 msa/;
 
+  add_proto qw/void vp10_fht32x32/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
+  specialize qw/vp10_fht32x32/;
+
   add_proto qw/void vp10_fwht4x4/, "const int16_t *input, tran_low_t *output, int stride";
   specialize qw/vp10_fwht4x4 msa/, "$mmx_x86inc";
 }
@@ -642,6 +648,9 @@
   add_proto qw/void vp10_highbd_fht16x16/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
   specialize qw/vp10_highbd_fht16x16/;
 
+  add_proto qw/void vp10_highbd_fht32x32/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
+  specialize qw/vp10_highbd_fht32x32/;
+
   add_proto qw/void vp10_highbd_fwht4x4/, "const int16_t *input, tran_low_t *output, int stride";
   specialize qw/vp10_highbd_fwht4x4/;
 
diff --git a/vp10/encoder/dct.c b/vp10/encoder/dct.c
index 5602753..333adbb 100644
--- a/vp10/encoder/dct.c
+++ b/vp10/encoder/dct.c
@@ -14,7 +14,6 @@
 #include "./vp10_rtcd.h"
 #include "./vpx_config.h"
 #include "./vpx_dsp_rtcd.h"
-
 #include "vp10/common/blockd.h"
 #include "vp10/common/idct.h"
 #include "vpx_dsp/fwd_txfm.h"
@@ -39,7 +38,6 @@
 
 #if CONFIG_EXT_TX
 void fdst4(const tran_low_t *input, tran_low_t *output) {
-#if USE_DST2
   tran_high_t step[4];
   tran_high_t temp1, temp2;
 
@@ -56,29 +54,9 @@
   temp2 = -step[2] * cospi_8_64 + step[3] * cospi_24_64;
   output[2] = fdct_round_shift(temp1);
   output[0] = fdct_round_shift(temp2);
-#else
-  // {sin(pi/5), sin(pi*2/5)} * sqrt(2/5) * sqrt(2)
-  static const int32_t sinvalue_lookup[] = {
-    141124871, 228344838,
-  };
-  int64_t sum;
-  int64_t s03 = (input[0] + input[3]);
-  int64_t d03 = (input[0] - input[3]);
-  int64_t s12 = (input[1] + input[2]);
-  int64_t d12 = (input[1] - input[2]);
-  sum = s03 * sinvalue_lookup[0] + s12 * sinvalue_lookup[1];
-  output[0] = ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS));
-  sum = d03 * sinvalue_lookup[1] + d12 * sinvalue_lookup[0];
-  output[1] = ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS));
-  sum = s03 * sinvalue_lookup[1] - s12 * sinvalue_lookup[0];
-  output[2] = ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS));
-  sum = d03 * sinvalue_lookup[0] - d12 * sinvalue_lookup[1];
-  output[3] = ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS));
-#endif  // USE_DST2
 }
 
 void fdst8(const tran_low_t *input, tran_low_t *output) {
-#if USE_DST2
   tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;  // canbe16
   tran_high_t t0, t1, t2, t3;                  // needs32
   tran_high_t x0, x1, x2, x3;                  // canbe16
@@ -127,47 +105,9 @@
   output[4] = fdct_round_shift(t2);
   output[2] = fdct_round_shift(t1);
   output[0] = fdct_round_shift(t3);
-#else
-  // {sin(pi/9), sin(pi*2/9), ..., sin(pi*4/9)} * sqrt(2/9) * 2
-  static const int sinvalue_lookup[] = {
-    86559612, 162678858, 219176632, 249238470
-  };
-  int64_t sum;
-  int64_t s07 = (input[0] + input[7]);
-  int64_t d07 = (input[0] - input[7]);
-  int64_t s16 = (input[1] + input[6]);
-  int64_t d16 = (input[1] - input[6]);
-  int64_t s25 = (input[2] + input[5]);
-  int64_t d25 = (input[2] - input[5]);
-  int64_t s34 = (input[3] + input[4]);
-  int64_t d34 = (input[3] - input[4]);
-  sum = s07 * sinvalue_lookup[0] + s16 * sinvalue_lookup[1] +
-        s25 * sinvalue_lookup[2] + s34 * sinvalue_lookup[3];
-  output[0] = ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS));
-  sum = d07 * sinvalue_lookup[1] + d16 * sinvalue_lookup[3] +
-        d25 * sinvalue_lookup[2] + d34 * sinvalue_lookup[0];
-  output[1] = ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS));
-  sum = (s07 + s16 - s34)* sinvalue_lookup[2];
-  output[2] = ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS));
-  sum = d07 * sinvalue_lookup[3] + d16 * sinvalue_lookup[0] -
-        d25 * sinvalue_lookup[2] - d34 * sinvalue_lookup[1];
-  output[3] = ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS));
-  sum = s07 * sinvalue_lookup[3] - s16 * sinvalue_lookup[0] -
-        s25 * sinvalue_lookup[2] + s34 * sinvalue_lookup[1];
-  output[4] = ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS));
-  sum = (d07 - d16 + d34)* sinvalue_lookup[2];
-  output[5] = ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS));
-  sum = s07 * sinvalue_lookup[1] - s16 * sinvalue_lookup[3] +
-        s25 * sinvalue_lookup[2] - s34 * sinvalue_lookup[0];
-  output[6] = ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS));
-  sum = d07 * sinvalue_lookup[0] - d16 * sinvalue_lookup[1] +
-        d25 * sinvalue_lookup[2] - d34 * sinvalue_lookup[3];
-  output[7] = ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS));
-#endif  // USE_DST2
 }
 
 void fdst16(const tran_low_t *input, tran_low_t *output) {
-#if USE_DST2
   tran_high_t step1[8];      // canbe16
   tran_high_t step2[8];      // canbe16
   tran_high_t step3[8];      // canbe16
@@ -306,110 +246,6 @@
   temp2 = step1[0] *  -cospi_2_64 + step1[7] * cospi_30_64;
   output[8] = fdct_round_shift(temp1);
   output[0] = fdct_round_shift(temp2);
-#else
-  // {sin(pi/17), sin(pi*2/17, ..., sin(pi*8/17)} * sqrt(2/17) * 2 * sqrt(2)
-  static const int sinvalue_lookup[] = {
-    47852167, 94074787, 137093803, 175444254,
-    207820161, 233119001, 250479254, 259309736
-  };
-  int64_t sum;
-  int64_t s015 = (input[0] + input[15]);
-  int64_t d015 = (input[0] - input[15]);
-  int64_t s114 = (input[1] + input[14]);
-  int64_t d114 = (input[1] - input[14]);
-  int64_t s213 = (input[2] + input[13]);
-  int64_t d213 = (input[2] - input[13]);
-  int64_t s312 = (input[3] + input[12]);
-  int64_t d312 = (input[3] - input[12]);
-  int64_t s411 = (input[4] + input[11]);
-  int64_t d411 = (input[4] - input[11]);
-  int64_t s510 = (input[5] + input[10]);
-  int64_t d510 = (input[5] - input[10]);
-  int64_t s69  = (input[6] + input[9]);
-  int64_t d69  = (input[6] - input[9]);
-  int64_t s78  = (input[7] + input[8]);
-  int64_t d78  = (input[7] - input[8]);
-  sum = s015 * sinvalue_lookup[0] + s114 * sinvalue_lookup[1] +
-        s213 * sinvalue_lookup[2] + s312 * sinvalue_lookup[3] +
-        s411 * sinvalue_lookup[4] + s510 * sinvalue_lookup[5] +
-        s69  * sinvalue_lookup[6] + s78  * sinvalue_lookup[7];
-  output[0]  = ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS));
-  sum = d015 * sinvalue_lookup[1] + d114 * sinvalue_lookup[3] +
-        d213 * sinvalue_lookup[5] + d312 * sinvalue_lookup[7] +
-        d411 * sinvalue_lookup[6] + d510 * sinvalue_lookup[4] +
-        d69  * sinvalue_lookup[2] + d78  * sinvalue_lookup[0];
-  output[1]  = ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS));
-  sum = s015 * sinvalue_lookup[2] + s114 * sinvalue_lookup[5] +
-        s213 * sinvalue_lookup[7] + s312 * sinvalue_lookup[4] +
-        s411 * sinvalue_lookup[1] - s510 * sinvalue_lookup[0] -
-        s69  * sinvalue_lookup[3] - s78  * sinvalue_lookup[6];
-  output[2]  = ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS));
-  sum = d015 * sinvalue_lookup[3] + d114 * sinvalue_lookup[7] +
-        d213 * sinvalue_lookup[4] + d312 * sinvalue_lookup[0] -
-        d411 * sinvalue_lookup[2] - d510 * sinvalue_lookup[6] -
-        d69  * sinvalue_lookup[5] - d78  * sinvalue_lookup[1];
-  output[3]  = ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS));
-  sum = s015 * sinvalue_lookup[4] + s114 * sinvalue_lookup[6] +
-        s213 * sinvalue_lookup[1] - s312 * sinvalue_lookup[2] -
-        s411 * sinvalue_lookup[7] - s510 * sinvalue_lookup[3] +
-        s69  * sinvalue_lookup[0] + s78  * sinvalue_lookup[5];
-  output[4]  = ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS));
-  sum = d015 * sinvalue_lookup[5] + d114 * sinvalue_lookup[4] -
-        d213 * sinvalue_lookup[0] - d312 * sinvalue_lookup[6] -
-        d411 * sinvalue_lookup[3] + d510 * sinvalue_lookup[1] +
-        d69  * sinvalue_lookup[7] + d78  * sinvalue_lookup[2];
-  output[5]  = ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS));
-  sum = s015 * sinvalue_lookup[6] + s114 * sinvalue_lookup[2] -
-        s213 * sinvalue_lookup[3] - s312 * sinvalue_lookup[5] +
-        s411 * sinvalue_lookup[0] + s510 * sinvalue_lookup[7] +
-        s69  * sinvalue_lookup[1] - s78  * sinvalue_lookup[4];
-  output[6]  = ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS));
-  sum = d015 * sinvalue_lookup[7] + d114 * sinvalue_lookup[0] -
-        d213 * sinvalue_lookup[6] - d312 * sinvalue_lookup[1] +
-        d411 * sinvalue_lookup[5] + d510 * sinvalue_lookup[2] -
-        d69  * sinvalue_lookup[4] - d78  * sinvalue_lookup[3];
-  output[7]  = ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS));
-  sum = s015 * sinvalue_lookup[7] - s114 * sinvalue_lookup[0] -
-        s213 * sinvalue_lookup[6] + s312 * sinvalue_lookup[1] +
-        s411 * sinvalue_lookup[5] - s510 * sinvalue_lookup[2] -
-        s69  * sinvalue_lookup[4] + s78  * sinvalue_lookup[3];
-  output[8]  = ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS));
-  sum = d015 * sinvalue_lookup[6] - d114 * sinvalue_lookup[2] -
-        d213 * sinvalue_lookup[3] + d312 * sinvalue_lookup[5] +
-        d411 * sinvalue_lookup[0] - d510 * sinvalue_lookup[7] +
-        d69  * sinvalue_lookup[1] + d78  * sinvalue_lookup[4];
-  output[9]  = ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS));
-  sum = s015 * sinvalue_lookup[5] - s114 * sinvalue_lookup[4] -
-        s213 * sinvalue_lookup[0] + s312 * sinvalue_lookup[6] -
-        s411 * sinvalue_lookup[3] - s510 * sinvalue_lookup[1] +
-        s69  * sinvalue_lookup[7] - s78  * sinvalue_lookup[2];
-  output[10] = ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS));
-  sum = d015 * sinvalue_lookup[4] - d114 * sinvalue_lookup[6] +
-        d213 * sinvalue_lookup[1] + d312 * sinvalue_lookup[2] -
-        d411 * sinvalue_lookup[7] + d510 * sinvalue_lookup[3] +
-        d69  * sinvalue_lookup[0] - d78  * sinvalue_lookup[5];
-  output[11] = ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS));
-  sum = s015 * sinvalue_lookup[3] - s114 * sinvalue_lookup[7] +
-        s213 * sinvalue_lookup[4] - s312 * sinvalue_lookup[0] -
-        s411 * sinvalue_lookup[2] + s510 * sinvalue_lookup[6] -
-        s69  * sinvalue_lookup[5] + s78  * sinvalue_lookup[1];
-  output[12] = ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS));
-  sum = d015 * sinvalue_lookup[2] - d114 * sinvalue_lookup[5] +
-        d213 * sinvalue_lookup[7] - d312 * sinvalue_lookup[4] +
-        d411 * sinvalue_lookup[1] + d510 * sinvalue_lookup[0] -
-        d69  * sinvalue_lookup[3] + d78  * sinvalue_lookup[6];
-  output[13] = ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS));
-  sum = s015 * sinvalue_lookup[1] - s114 * sinvalue_lookup[3] +
-        s213 * sinvalue_lookup[5] - s312 * sinvalue_lookup[7] +
-        s411 * sinvalue_lookup[6] - s510 * sinvalue_lookup[4] +
-        s69  * sinvalue_lookup[2] - s78  * sinvalue_lookup[0];
-  output[14] = ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS));
-  sum = d015 * sinvalue_lookup[0] - d114 * sinvalue_lookup[1] +
-        d213 * sinvalue_lookup[2] - d312 * sinvalue_lookup[3] +
-        d411 * sinvalue_lookup[4] - d510 * sinvalue_lookup[5] +
-        d69  * sinvalue_lookup[6] - d78  * sinvalue_lookup[7];
-  output[15] = ROUND_POWER_OF_TWO(sum, (2 * DCT_CONST_BITS));
-#endif  // USE_DST2
 }
 #endif  // CONFIG_EXT_TX
 
@@ -701,7 +537,7 @@
   range_check(output, 16, 16);
 }
 
-/* TODO(angiebird): Unify this with vp10_fwd_txfm.c: vp10_fdct32
+#if CONFIG_EXT_TX
 static void fdct32(const tran_low_t *input, tran_low_t *output) {
   tran_high_t temp;
   tran_low_t step[32];
@@ -1099,7 +935,7 @@
 
   range_check(output, 32, 18);
 }
-*/
+#endif  // CONFIG_EXT_TX
 
 static void fadst4(const tran_low_t *input, tran_low_t *output) {
   tran_high_t x0, x1, x2, x3;
@@ -1376,6 +1212,37 @@
 }
 
 #if CONFIG_EXT_TX
+// For use in lieu of DST: fdct16 on the middle 16 inputs, rest scaled by 4
+static void fhalfcenter32(const tran_low_t *input, tran_low_t *output) {
+  int i;
+  tran_low_t inputhalf[16];  // scaled copy of input[8..23]
+  for (i = 0; i < 8; ++i) {
+    output[16 + i] = input[i] * 4;  // input[0..7] -> output[16..23], x4
+    output[24 + i] = input[24 + i] * 4;  // input[24..31] kept in place, x4
+  }
+  // Multiply input by sqrt(2)
+  for (i = 0; i < 16; ++i) {
+    inputhalf[i] = (tran_low_t)fdct_round_shift(input[i + 8] * Sqrt2);
+  }
+  fdct16(inputhalf, output);  // presumably fills output[0..15] - confirm
+  // Note overall scaling factor is 4 times orthogonal
+}
+
+// For use in lieu of ADST: fdct16 on the last 16 inputs, first 16 scaled by 4
+static void fhalfright32(const tran_low_t *input, tran_low_t *output) {
+  int i;
+  tran_low_t inputhalf[16];  // scaled copy of input[16..31]
+  for (i = 0; i < 16; ++i) {
+    output[16 + i] = input[i] * 4;  // input[0..15] -> output[16..31], x4
+  }
+  // Multiply input by sqrt(2)
+  for (i = 0; i < 16; ++i) {
+    inputhalf[i] = (tran_low_t)fdct_round_shift(input[i + 16] * Sqrt2);
+  }
+  fdct16(inputhalf, output);  // presumably fills output[0..15] - confirm
+  // Note overall scaling factor is 4 times orthogonal
+}
+
 static void copy_block(const int16_t *src, int src_stride, int l,
                        int16_t *dest, int dest_stride) {
   int i;
@@ -1538,6 +1405,27 @@
 #endif  // CONFIG_EXT_TX
 };
 
+#if CONFIG_EXT_TX
+static const transform_2d FHT_32[] = {  // 32-point 1-D pairs, by tx_type
+  { fdct32,  fdct32  },                // DCT_DCT           = 0,
+  { fhalfright32, fdct32  },           // ADST_DCT          = 1,
+  { fdct32,  fhalfright32 },           // DCT_ADST          = 2,
+  { fhalfright32, fhalfright32 },      // ADST_ADST         = 3,
+  { fhalfright32, fdct32  },           // FLIPADST_DCT      = 4,
+  { fdct32,  fhalfright32 },           // DCT_FLIPADST      = 5,
+  { fhalfright32, fhalfright32 },      // FLIPADST_FLIPADST = 6,
+  { fhalfright32, fhalfright32 },      // ADST_FLIPADST     = 7,
+  { fhalfright32, fhalfright32 },      // FLIPADST_ADST     = 8,
+  { fhalfcenter32,  fdct32  },         // DST_DCT           = 9,
+  { fdct32,  fhalfcenter32  },         // DCT_DST           = 10,
+  { fhalfcenter32,  fhalfright32 },    // DST_ADST          = 11,
+  { fhalfright32, fhalfcenter32  },    // ADST_DST          = 12,
+  { fhalfcenter32,  fhalfright32 },    // DST_FLIPADST      = 13,
+  { fhalfright32, fhalfcenter32  },    // FLIPADST_DST      = 14,
+  { fhalfcenter32,  fhalfcenter32  },  // DST_DST           = 15
+};
+#endif  // CONFIG_EXT_TX
+
 void vp10_fht4x4_c(const int16_t *input, tran_low_t *output,
                    int stride, int tx_type) {
   if (tx_type == DCT_DCT) {
@@ -1834,3 +1722,46 @@
   vp10_fht16x16_c(input, output, stride, tx_type);
 }
 #endif  // CONFIG_VP9_HIGHBITDEPTH
+
+#if CONFIG_EXT_TX
+void vp10_fht32x32_c(const int16_t *input, tran_low_t *output,
+                     int stride, int tx_type) {
+  if (tx_type == DCT_DCT) {
+    vpx_fdct32x32_c(input, output, stride);  // DCT_DCT is delegated
+  } else {
+    tran_low_t out[1024];  // 32x32 result of the column pass
+    int i, j;
+    tran_low_t temp_in[32], temp_out[32];  // one line in, one line out
+    const transform_2d ht = FHT_32[tx_type];  // column/row 1-D pair
+
+    int16_t flipped_input[32 * 32];  // scratch for maybe_flip_input
+    maybe_flip_input(&input, &stride, 32, flipped_input, tx_type);
+
+    // Columns: transform each input column, round, store into out
+    for (i = 0; i < 32; ++i) {
+      for (j = 0; j < 32; ++j)
+        temp_in[j] = input[j * stride + i] * 4;  // column i, scaled by 4
+      ht.cols(temp_in, temp_out);
+      for (j = 0; j < 32; ++j)
+        out[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2;
+    }
+
+    // Rows: transform each row of out, round, store into output
+    for (i = 0; i < 32; ++i) {
+      for (j = 0; j < 32; ++j)
+        temp_in[j] = out[j + i * 32];  // row i of the column result
+      ht.rows(temp_in, temp_out);
+      for (j = 0; j < 32; ++j)
+        output[j + i * 32] =
+            (tran_low_t)((temp_out[j] + 1 + (temp_out[j] < 0)) >> 2);
+    }
+  }
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+void vp10_highbd_fht32x32_c(const int16_t *input, tran_low_t *output,
+                            int stride, int tx_type) {
+  vp10_fht32x32_c(input, output, stride, tx_type);  // forwards unchanged
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+#endif  // CONFIG_EXT_TX
diff --git a/vp10/encoder/encodeframe.c b/vp10/encoder/encodeframe.c
index 755c33b..2b96a86 100644
--- a/vp10/encoder/encodeframe.c
+++ b/vp10/encoder/encodeframe.c
@@ -2065,6 +2065,11 @@
       if (!x->skip) {
         // TODO(geza.lore): Investigate if this can be relaxed
         x->skip_recode = 0;
+        memset(x->skip_txfm, 0, sizeof(x->skip_txfm));
+
+        x->skip_optimize = 0;
+        x->use_lp32x32fdct = cpi->sf.use_lp32x32fdct;
+
         vp10_encode_sb_supertx(x, bsize);
         vp10_tokenize_sb_supertx(cpi, td, tp, !output_enabled, bsize);
       } else {
diff --git a/vp10/encoder/encoder.c b/vp10/encoder/encoder.c
index 4c4261e..55ec9c1 100644
--- a/vp10/encoder/encoder.c
+++ b/vp10/encoder/encoder.c
@@ -50,7 +50,7 @@
 #include "./vp10_rtcd.h"
 #include "./vpx_dsp_rtcd.h"
 #include "./vpx_scale_rtcd.h"
-#include "vpx/internal/vpx_psnr.h"
+#include "vpx_dsp/psnr.h"
 #if CONFIG_INTERNAL_STATS
 #include "vpx_dsp/ssim.h"
 #endif
@@ -2033,261 +2033,6 @@
 #endif
 }
 
-/* TODO(yaowu): The block_variance calls the unoptimized versions of variance()
- * and highbd_8_variance(). It should not.
- */
-static void encoder_variance(const uint8_t *a, int  a_stride,
-                             const uint8_t *b, int  b_stride,
-                             int  w, int  h, unsigned int *sse, int *sum) {
-  int i, j;
-
-  *sum = 0;
-  *sse = 0;
-
-  for (i = 0; i < h; i++) {
-    for (j = 0; j < w; j++) {
-      const int diff = a[j] - b[j];
-      *sum += diff;
-      *sse += diff * diff;
-    }
-
-    a += a_stride;
-    b += b_stride;
-  }
-}
-
-#if CONFIG_VP9_HIGHBITDEPTH
-static void encoder_highbd_variance64(const uint8_t *a8, int  a_stride,
-                                      const uint8_t *b8, int  b_stride,
-                                      int w, int h, uint64_t *sse,
-                                      uint64_t *sum) {
-  int i, j;
-
-  uint16_t *a = CONVERT_TO_SHORTPTR(a8);
-  uint16_t *b = CONVERT_TO_SHORTPTR(b8);
-  *sum = 0;
-  *sse = 0;
-
-  for (i = 0; i < h; i++) {
-    for (j = 0; j < w; j++) {
-      const int diff = a[j] - b[j];
-      *sum += diff;
-      *sse += diff * diff;
-    }
-    a += a_stride;
-    b += b_stride;
-  }
-}
-
-static void encoder_highbd_8_variance(const uint8_t *a8, int  a_stride,
-                                      const uint8_t *b8, int  b_stride,
-                                      int w, int h,
-                                      unsigned int *sse, int *sum) {
-  uint64_t sse_long = 0;
-  uint64_t sum_long = 0;
-  encoder_highbd_variance64(a8, a_stride, b8, b_stride, w, h,
-                            &sse_long, &sum_long);
-  *sse = (unsigned int)sse_long;
-  *sum = (int)sum_long;
-}
-#endif  // CONFIG_VP9_HIGHBITDEPTH
-
-static int64_t get_sse(const uint8_t *a, int a_stride,
-                       const uint8_t *b, int b_stride,
-                       int width, int height) {
-  const int dw = width % 16;
-  const int dh = height % 16;
-  int64_t total_sse = 0;
-  unsigned int sse = 0;
-  int sum = 0;
-  int x, y;
-
-  if (dw > 0) {
-    encoder_variance(&a[width - dw], a_stride, &b[width - dw], b_stride,
-                     dw, height, &sse, &sum);
-    total_sse += sse;
-  }
-
-  if (dh > 0) {
-    encoder_variance(&a[(height - dh) * a_stride], a_stride,
-                     &b[(height - dh) * b_stride], b_stride,
-                     width - dw, dh, &sse, &sum);
-    total_sse += sse;
-  }
-
-  for (y = 0; y < height / 16; ++y) {
-    const uint8_t *pa = a;
-    const uint8_t *pb = b;
-    for (x = 0; x < width / 16; ++x) {
-      vpx_mse16x16(pa, a_stride, pb, b_stride, &sse);
-      total_sse += sse;
-
-      pa += 16;
-      pb += 16;
-    }
-
-    a += 16 * a_stride;
-    b += 16 * b_stride;
-  }
-
-  return total_sse;
-}
-
-#if CONFIG_VP9_HIGHBITDEPTH
-static int64_t highbd_get_sse_shift(const uint8_t *a8, int a_stride,
-                                    const uint8_t *b8, int b_stride,
-                                    int width, int height,
-                                    unsigned int input_shift) {
-  const uint16_t *a = CONVERT_TO_SHORTPTR(a8);
-  const uint16_t *b = CONVERT_TO_SHORTPTR(b8);
-  int64_t total_sse = 0;
-  int x, y;
-  for (y = 0; y < height; ++y) {
-    for (x = 0; x < width; ++x) {
-      int64_t diff;
-      diff = (a[x] >> input_shift) - (b[x] >> input_shift);
-      total_sse += diff * diff;
-    }
-    a += a_stride;
-    b += b_stride;
-  }
-  return total_sse;
-}
-
-static int64_t highbd_get_sse(const uint8_t *a, int a_stride,
-                              const uint8_t *b, int b_stride,
-                              int width, int height) {
-  int64_t total_sse = 0;
-  int x, y;
-  const int dw = width % 16;
-  const int dh = height % 16;
-  unsigned int sse = 0;
-  int sum = 0;
-  if (dw > 0) {
-    encoder_highbd_8_variance(&a[width - dw], a_stride,
-                              &b[width - dw], b_stride,
-                              dw, height, &sse, &sum);
-    total_sse += sse;
-  }
-  if (dh > 0) {
-    encoder_highbd_8_variance(&a[(height - dh) * a_stride], a_stride,
-                              &b[(height - dh) * b_stride], b_stride,
-                              width - dw, dh, &sse, &sum);
-    total_sse += sse;
-  }
-  for (y = 0; y < height / 16; ++y) {
-    const uint8_t *pa = a;
-    const uint8_t *pb = b;
-    for (x = 0; x < width / 16; ++x) {
-      vpx_highbd_8_mse16x16(pa, a_stride, pb, b_stride, &sse);
-      total_sse += sse;
-      pa += 16;
-      pb += 16;
-    }
-    a += 16 * a_stride;
-    b += 16 * b_stride;
-  }
-  return total_sse;
-}
-#endif  // CONFIG_VP9_HIGHBITDEPTH
-
-typedef struct {
-  double psnr[4];       // total/y/u/v
-  uint64_t sse[4];      // total/y/u/v
-  uint32_t samples[4];  // total/y/u/v
-} PSNR_STATS;
-
-#if CONFIG_VP9_HIGHBITDEPTH
-static void calc_highbd_psnr(const YV12_BUFFER_CONFIG *a,
-                             const YV12_BUFFER_CONFIG *b,
-                             PSNR_STATS *psnr,
-                             unsigned int bit_depth,
-                             unsigned int in_bit_depth) {
-  const int widths[3] =
-      {a->y_crop_width,  a->uv_crop_width,  a->uv_crop_width };
-  const int heights[3] =
-      {a->y_crop_height, a->uv_crop_height, a->uv_crop_height};
-  const uint8_t *a_planes[3] = {a->y_buffer, a->u_buffer,  a->v_buffer };
-  const int a_strides[3] = {a->y_stride, a->uv_stride, a->uv_stride};
-  const uint8_t *b_planes[3] = {b->y_buffer, b->u_buffer,  b->v_buffer };
-  const int b_strides[3] = {b->y_stride, b->uv_stride, b->uv_stride};
-  int i;
-  uint64_t total_sse = 0;
-  uint32_t total_samples = 0;
-  const double peak = (double)((1 << in_bit_depth) - 1);
-  const unsigned int input_shift = bit_depth - in_bit_depth;
-
-  for (i = 0; i < 3; ++i) {
-    const int w = widths[i];
-    const int h = heights[i];
-    const uint32_t samples = w * h;
-    uint64_t sse;
-    if (a->flags & YV12_FLAG_HIGHBITDEPTH) {
-      if (input_shift) {
-        sse = highbd_get_sse_shift(a_planes[i], a_strides[i],
-                                   b_planes[i], b_strides[i], w, h,
-                                   input_shift);
-      } else {
-        sse = highbd_get_sse(a_planes[i], a_strides[i],
-                             b_planes[i], b_strides[i], w, h);
-      }
-    } else {
-      sse = get_sse(a_planes[i], a_strides[i],
-                    b_planes[i], b_strides[i],
-                    w, h);
-    }
-    psnr->sse[1 + i] = sse;
-    psnr->samples[1 + i] = samples;
-    psnr->psnr[1 + i] = vpx_sse_to_psnr(samples, peak, (double)sse);
-
-    total_sse += sse;
-    total_samples += samples;
-  }
-
-  psnr->sse[0] = total_sse;
-  psnr->samples[0] = total_samples;
-  psnr->psnr[0] = vpx_sse_to_psnr((double)total_samples, peak,
-                                  (double)total_sse);
-}
-
-#else  // !CONFIG_VP9_HIGHBITDEPTH
-
-static void calc_psnr(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b,
-                      PSNR_STATS *psnr) {
-  static const double peak = 255.0;
-  const int widths[3]        = {
-      a->y_crop_width, a->uv_crop_width, a->uv_crop_width};
-  const int heights[3]       = {
-      a->y_crop_height, a->uv_crop_height, a->uv_crop_height};
-  const uint8_t *a_planes[3] = {a->y_buffer, a->u_buffer, a->v_buffer};
-  const int a_strides[3]     = {a->y_stride, a->uv_stride, a->uv_stride};
-  const uint8_t *b_planes[3] = {b->y_buffer, b->u_buffer, b->v_buffer};
-  const int b_strides[3]     = {b->y_stride, b->uv_stride, b->uv_stride};
-  int i;
-  uint64_t total_sse = 0;
-  uint32_t total_samples = 0;
-
-  for (i = 0; i < 3; ++i) {
-    const int w = widths[i];
-    const int h = heights[i];
-    const uint32_t samples = w * h;
-    const uint64_t sse = get_sse(a_planes[i], a_strides[i],
-                                 b_planes[i], b_strides[i],
-                                 w, h);
-    psnr->sse[1 + i] = sse;
-    psnr->samples[1 + i] = samples;
-    psnr->psnr[1 + i] = vpx_sse_to_psnr(samples, peak, (double)sse);
-
-    total_sse += sse;
-    total_samples += samples;
-  }
-
-  psnr->sse[0] = total_sse;
-  psnr->samples[0] = total_samples;
-  psnr->psnr[0] = vpx_sse_to_psnr((double)total_samples, peak,
-                                  (double)total_sse);
-}
-#endif  // CONFIG_VP9_HIGHBITDEPTH
 
 static void generate_psnr_packet(VP10_COMP *cpi) {
   struct vpx_codec_cx_pkt pkt;
@@ -2955,7 +2700,7 @@
 
   vpx_clear_system_state();
 
-  recon_err = vp10_get_y_sse(cpi->Source, get_frame_new_buffer(cm));
+  recon_err = vpx_get_y_sse(cpi->Source, get_frame_new_buffer(cm));
 
   if (cpi->twopass.total_left_stats.coded_error != 0.0)
     fprintf(f, "%10u %dx%d  %10d %10d %d %d %10d %10d %10d %10d"
@@ -3380,12 +3125,12 @@
 
 #if CONFIG_VP9_HIGHBITDEPTH
         if (cm->use_highbitdepth) {
-          kf_err = vp10_highbd_get_y_sse(cpi->Source, get_frame_new_buffer(cm));
+          kf_err = vpx_highbd_get_y_sse(cpi->Source, get_frame_new_buffer(cm));
         } else {
-          kf_err = vp10_get_y_sse(cpi->Source, get_frame_new_buffer(cm));
+          kf_err = vpx_get_y_sse(cpi->Source, get_frame_new_buffer(cm));
         }
 #else
-        kf_err = vp10_get_y_sse(cpi->Source, get_frame_new_buffer(cm));
+        kf_err = vpx_get_y_sse(cpi->Source, get_frame_new_buffer(cm));
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
         // Prevent possible divide by zero error below for perfect KF
@@ -3804,13 +3549,13 @@
   if (cpi->rc.next_key_frame_forced && cpi->rc.frames_to_key == 1) {
 #if CONFIG_VP9_HIGHBITDEPTH
     if (cm->use_highbitdepth) {
-      cpi->ambient_err = vp10_highbd_get_y_sse(cpi->Source,
+      cpi->ambient_err = vpx_highbd_get_y_sse(cpi->Source,
                                               get_frame_new_buffer(cm));
     } else {
-      cpi->ambient_err = vp10_get_y_sse(cpi->Source, get_frame_new_buffer(cm));
+      cpi->ambient_err = vpx_get_y_sse(cpi->Source, get_frame_new_buffer(cm));
     }
 #else
-    cpi->ambient_err = vp10_get_y_sse(cpi->Source, get_frame_new_buffer(cm));
+    cpi->ambient_err = vpx_get_y_sse(cpi->Source, get_frame_new_buffer(cm));
 #endif  // CONFIG_VP9_HIGHBITDEPTH
   }
 
@@ -3993,13 +3738,22 @@
 int vp10_receive_raw_frame(VP10_COMP *cpi, unsigned int frame_flags,
                           YV12_BUFFER_CONFIG *sd, int64_t time_stamp,
                           int64_t end_time) {
-  VP10_COMMON *cm = &cpi->common;
+  VP10_COMMON *volatile const cm = &cpi->common;
   struct vpx_usec_timer timer;
-  int res = 0;
+  volatile int res = 0;
   const int subsampling_x = sd->subsampling_x;
   const int subsampling_y = sd->subsampling_y;
 #if CONFIG_VP9_HIGHBITDEPTH
   const int use_highbitdepth = (sd->flags & YV12_FLAG_HIGHBITDEPTH) != 0;
+#endif
+
+  if (setjmp(cm->error.jmp)) {
+    cm->error.setjmp = 0;
+    return -1;
+  }
+  cm->error.setjmp = 1;
+
+#if CONFIG_VP9_HIGHBITDEPTH
   check_initial_width(cpi, use_highbitdepth, subsampling_x, subsampling_y);
 #else
   check_initial_width(cpi, subsampling_x, subsampling_y);
@@ -4032,6 +3786,7 @@
     res = -1;
   }
 
+  cm->error.setjmp = 0;
   return res;
 }
 
@@ -4547,28 +4302,6 @@
   return 0;
 }
 
-int64_t vp10_get_y_sse(const YV12_BUFFER_CONFIG *a,
-                      const YV12_BUFFER_CONFIG *b) {
-  assert(a->y_crop_width == b->y_crop_width);
-  assert(a->y_crop_height == b->y_crop_height);
-
-  return get_sse(a->y_buffer, a->y_stride, b->y_buffer, b->y_stride,
-                 a->y_crop_width, a->y_crop_height);
-}
-
-#if CONFIG_VP9_HIGHBITDEPTH
-int64_t vp10_highbd_get_y_sse(const YV12_BUFFER_CONFIG *a,
-                             const YV12_BUFFER_CONFIG *b) {
-  assert(a->y_crop_width == b->y_crop_width);
-  assert(a->y_crop_height == b->y_crop_height);
-  assert((a->flags & YV12_FLAG_HIGHBITDEPTH) != 0);
-  assert((b->flags & YV12_FLAG_HIGHBITDEPTH) != 0);
-
-  return highbd_get_sse(a->y_buffer, a->y_stride, b->y_buffer, b->y_stride,
-                        a->y_crop_width, a->y_crop_height);
-}
-#endif  // CONFIG_VP9_HIGHBITDEPTH
-
 int vp10_get_quantizer(VP10_COMP *cpi) {
   return cpi->common.base_qindex;
 }
diff --git a/vp10/encoder/encoder.h b/vp10/encoder/encoder.h
index cc20765..59c7682 100644
--- a/vp10/encoder/encoder.h
+++ b/vp10/encoder/encoder.h
@@ -633,12 +633,6 @@
   return get_token_alloc(tile_mb_rows, tile_mb_cols);
 }
 
-int64_t vp10_get_y_sse(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b);
-#if CONFIG_VP9_HIGHBITDEPTH
-int64_t vp10_highbd_get_y_sse(const YV12_BUFFER_CONFIG *a,
-                             const YV12_BUFFER_CONFIG *b);
-#endif  // CONFIG_VP9_HIGHBITDEPTH
-
 void vp10_alloc_compressor_data(VP10_COMP *cpi);
 
 void vp10_scale_references(VP10_COMP *cpi);
diff --git a/vp10/encoder/picklpf.c b/vp10/encoder/picklpf.c
index f116c00..cb2c1c7 100644
--- a/vp10/encoder/picklpf.c
+++ b/vp10/encoder/picklpf.c
@@ -13,6 +13,7 @@
 
 #include "./vpx_scale_rtcd.h"
 
+#include "vpx_dsp/psnr.h"
 #include "vpx_dsp/vpx_dsp_common.h"
 #include "vpx_mem/vpx_mem.h"
 #include "vpx_ports/mem.h"
@@ -56,12 +57,12 @@
 
 #if CONFIG_VP9_HIGHBITDEPTH
   if (cm->use_highbitdepth) {
-    filt_err = vp10_highbd_get_y_sse(sd, cm->frame_to_show);
+    filt_err = vpx_highbd_get_y_sse(sd, cm->frame_to_show);
   } else {
-    filt_err = vp10_get_y_sse(sd, cm->frame_to_show);
+    filt_err = vpx_get_y_sse(sd, cm->frame_to_show);
   }
 #else
-  filt_err = vp10_get_y_sse(sd, cm->frame_to_show);
+  filt_err = vpx_get_y_sse(sd, cm->frame_to_show);
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
   // Re-instate the unfiltered frame
diff --git a/vp10/encoder/pickrst.c b/vp10/encoder/pickrst.c
index 79cda43..9982836 100644
--- a/vp10/encoder/pickrst.c
+++ b/vp10/encoder/pickrst.c
@@ -14,6 +14,7 @@
 
 #include "./vpx_scale_rtcd.h"
 
+#include "vpx_dsp/psnr.h"
 #include "vpx_dsp/vpx_dsp_common.h"
 #include "vpx_mem/vpx_mem.h"
 #include "vpx_ports/mem.h"
@@ -36,12 +37,12 @@
                               rsi, 1, partial_frame);
 #if CONFIG_VP9_HIGHBITDEPTH
   if (cm->use_highbitdepth) {
-    filt_err = vp10_highbd_get_y_sse(sd, cm->frame_to_show);
+    filt_err = vpx_highbd_get_y_sse(sd, cm->frame_to_show);
   } else {
-    filt_err = vp10_get_y_sse(sd, cm->frame_to_show);
+    filt_err = vpx_get_y_sse(sd, cm->frame_to_show);
   }
 #else
-  filt_err = vp10_get_y_sse(sd, cm->frame_to_show);
+  filt_err = vpx_get_y_sse(sd, cm->frame_to_show);
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
   // Re-instate the unfiltered frame
diff --git a/vp10/encoder/rdopt.c b/vp10/encoder/rdopt.c
index 30b8406..d9be29d 100644
--- a/vp10/encoder/rdopt.c
+++ b/vp10/encoder/rdopt.c
@@ -539,47 +539,70 @@
     cost = token_costs[0][0][pt][EOB_TOKEN];
     c = 0;
   } else {
-    int band_left = *band_count++;
+    if (use_fast_coef_costing) {
+      int band_left = *band_count++;
 
-    // dc token
-    int v = qcoeff[0];
-    int16_t prev_t;
-    EXTRABIT e;
-    vp10_get_token_extra(v, &prev_t, &e);
-    cost = (*token_costs)[0][pt][prev_t] +
-        vp10_get_cost(prev_t, e, cat6_high_cost);
+      // dc token
+      int v = qcoeff[0];
+      int16_t prev_t;
+      cost = vp10_get_token_cost(v, &prev_t, cat6_high_cost);
+      cost += (*token_costs)[0][pt][prev_t];
 
-    token_cache[0] = vp10_pt_energy_class[prev_t];
-    ++token_costs;
+      token_cache[0] = vp10_pt_energy_class[prev_t];
+      ++token_costs;
 
-    // ac tokens
-    for (c = 1; c < eob; c++) {
-      const int rc = scan[c];
-      int16_t t;
+      // ac tokens
+      for (c = 1; c < eob; c++) {
+        const int rc = scan[c];
+        int16_t t;
 
-      v = qcoeff[rc];
-      vp10_get_token_extra(v, &t, &e);
-      if (use_fast_coef_costing) {
-        cost += (*token_costs)[!prev_t][!prev_t][t] +
-            vp10_get_cost(t, e, cat6_high_cost);
-      } else {
-        pt = get_coef_context(nb, token_cache, c);
-        cost += (*token_costs)[!prev_t][pt][t] +
-            vp10_get_cost(t, e, cat6_high_cost);
-        token_cache[rc] = vp10_pt_energy_class[t];
+        v = qcoeff[rc];
+        cost += vp10_get_token_cost(v, &t, cat6_high_cost);
+        cost += (*token_costs)[!prev_t][!prev_t][t];
+        prev_t = t;
+        if (!--band_left) {
+          band_left = *band_count++;
+          ++token_costs;
+        }
       }
-      prev_t = t;
-      if (!--band_left) {
-        band_left = *band_count++;
-        ++token_costs;
-      }
-    }
 
-    // eob token
-    if (band_left) {
-      if (use_fast_coef_costing) {
+      // eob token
+      if (band_left)
         cost += (*token_costs)[0][!prev_t][EOB_TOKEN];
-      } else {
+
+    } else {  // !use_fast_coef_costing
+      int band_left = *band_count++;
+
+      // dc token
+      int v = qcoeff[0];
+      int16_t tok;
+      unsigned int (*tok_cost_ptr)[COEFF_CONTEXTS][ENTROPY_TOKENS];
+      cost = vp10_get_token_cost(v, &tok, cat6_high_cost);
+      cost += (*token_costs)[0][pt][tok];
+
+      token_cache[0] = vp10_pt_energy_class[tok];
+      ++token_costs;
+
+      tok_cost_ptr = &((*token_costs)[!tok]);
+
+      // ac tokens
+      for (c = 1; c < eob; c++) {
+        const int rc = scan[c];
+
+        v = qcoeff[rc];
+        cost += vp10_get_token_cost(v, &tok, cat6_high_cost);
+        pt = get_coef_context(nb, token_cache, c);
+        cost += (*tok_cost_ptr)[pt][tok];
+        token_cache[rc] = vp10_pt_energy_class[tok];
+        if (!--band_left) {
+          band_left = *band_count++;
+          ++token_costs;
+        }
+        tok_cost_ptr = &((*token_costs)[!tok]);
+      }
+
+      // eob token
+      if (band_left) {
         pt = get_coef_context(nb, token_cache, c);
         cost += (*token_costs)[0][pt][EOB_TOKEN];
       }
@@ -5089,6 +5112,8 @@
   DECLARE_ALIGNED(16, uint8_t, tmp_buf[MAX_MB_PLANE * 64 * 64]);
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 #if CONFIG_OBMC
+  int allow_obmc = is_obmc_allowed(mbmi);
+  int best_obmc_flag = 0;
 #if CONFIG_VP9_HIGHBITDEPTH
   DECLARE_ALIGNED(16, uint16_t, tmp_buf1_16[MAX_MB_PLANE * 64 * 64]);
   uint8_t *tmp_buf1;
@@ -5098,13 +5123,11 @@
   uint8_t *obmc_tmp_buf[3] = {tmp_buf1, tmp_buf1 + 4096, tmp_buf1 + 8192};
 #endif  // CONFIG_VP9_HIGHBITDEPTH
   int obmc_tmp_stride[3] = {64, 64, 64};
-  int best_obmc_flag = 0;
   uint8_t tmp_skip_txfm[MAX_MB_PLANE << 2] = {0};
   int64_t tmp_bsse[MAX_MB_PLANE << 2] = {0};
   int64_t rdobmc;
   int skip_txfm_sb_obmc = 0;
   int64_t skip_sse_sb_obmc = INT64_MAX;
-  int allow_obmc = is_obmc_allowed(mbmi);
 #endif  // CONFIG_OBMC
   int pred_exists = 0;
   int intpel_mv;
@@ -5334,8 +5357,9 @@
   if (this_mode == NEARMV && is_comp_pred) {
     uint8_t ref_frame_type = vp10_ref_frame_type(mbmi->ref_frame);
     if (mbmi_ext->ref_mv_count[ref_frame_type] > 1) {
-      cur_mv[0] = mbmi_ext->ref_mv_stack[ref_frame_type][1].this_mv;
-      cur_mv[1] = mbmi_ext->ref_mv_stack[ref_frame_type][1].comp_mv;
+      int ref_mv_idx = mbmi->ref_mv_idx + 1;
+      cur_mv[0] = mbmi_ext->ref_mv_stack[ref_frame_type][ref_mv_idx].this_mv;
+      cur_mv[1] = mbmi_ext->ref_mv_stack[ref_frame_type][ref_mv_idx].comp_mv;
 
       for (i = 0; i < 2; ++i) {
         lower_mv_precision(&cur_mv[i].as_mv, cm->allow_high_precision_mv);
@@ -5587,6 +5611,7 @@
 #if CONFIG_OBMC
     int tmp_rate_obmc;
     int64_t tmp_dist_obmc;
+    restore_dst_buf(xd, orig_dst, orig_dst_stride);
 #endif  // CONFIG_OBMC
     // Handles the special case when a filter that is not in the
     // switchable list (ex. bilinear) is indicated at the frame level, or
@@ -5594,19 +5619,14 @@
     vp10_build_inter_predictors_sb(xd, mi_row, mi_col, bsize);
 #if CONFIG_OBMC
     if (mbmi->obmc) {
-      vp10_build_obmc_inter_prediction(cm, xd, mi_row, mi_col, 1,
-                                       obmc_tmp_buf, obmc_tmp_stride,
+      vp10_build_obmc_inter_prediction(cm, xd, mi_row, mi_col, 0,
+                                       NULL, NULL,
                                        dst_buf1, dst_stride1,
                                        dst_buf2, dst_stride2);
-      for (i = 0; i < MAX_MB_PLANE; ++i) {
-        xd->plane[i].dst.buf = obmc_tmp_buf[i];
-        xd->plane[i].dst.stride = obmc_tmp_stride[i];
-      }
       model_rd_for_sb(cpi, bsize, x, xd, &tmp_rate, &tmp_dist,
                       &skip_txfm_sb, &skip_sse_sb);
       rd = RDCOST(x->rdmult, x->rddiv,
-                  rs + tmp_rate + cpi->obmc_cost[bsize][1],
-                  tmp_dist);
+                  rs + tmp_rate + cpi->obmc_cost[bsize][1], tmp_dist);
     } else {
 #endif  // CONFIG_OBMC
     model_rd_for_sb(cpi, bsize, x, xd, &tmp_rate, &tmp_dist,
diff --git a/vp10/encoder/tokenize.c b/vp10/encoder/tokenize.c
index 0aaeb2a..5cae8e3 100644
--- a/vp10/encoder/tokenize.c
+++ b/vp10/encoder/tokenize.c
@@ -50,6 +50,35 @@
 const TOKENVALUE *vp10_dct_cat_lt_10_value_tokens = dct_cat_lt_10_value_tokens +
     (sizeof(dct_cat_lt_10_value_tokens) / sizeof(*dct_cat_lt_10_value_tokens))
     / 2;
+// The corresponding costs of the extrabits for the tokens in the above table
+// are stored in the table below. The values are obtained from looking up the
+// entry for the specified extrabits in the table corresponding to the token
+// (as defined in cost element vp10_extra_bits)
+// e.g. {9, 63} maps to cat5_cost[63 >> 1], {1, 1} maps to sign_cost[1 >> 1]
+static const int dct_cat_lt_10_value_cost[] = {
+  3773, 3750, 3704, 3681, 3623, 3600, 3554, 3531,
+  3432, 3409, 3363, 3340, 3282, 3259, 3213, 3190,
+  3136, 3113, 3067, 3044, 2986, 2963, 2917, 2894,
+  2795, 2772, 2726, 2703, 2645, 2622, 2576, 2553,
+  3197, 3116, 3058, 2977, 2881, 2800,
+  2742, 2661, 2615, 2534, 2476, 2395,
+  2299, 2218, 2160, 2079,
+  2566, 2427, 2334, 2195, 2023, 1884, 1791, 1652,
+  1893, 1696, 1453, 1256, 1229, 864,
+  512, 512, 512, 512, 0,  // the 0 is the centre entry (index 66 of 133)
+  512, 512, 512, 512,
+  864, 1229, 1256, 1453, 1696, 1893,
+  1652, 1791, 1884, 2023, 2195, 2334, 2427, 2566,
+  2079, 2160, 2218, 2299, 2395, 2476, 2534, 2615,
+  2661, 2742, 2800, 2881, 2977, 3058, 3116, 3197,
+  2553, 2576, 2622, 2645, 2703, 2726, 2772, 2795,
+  2894, 2917, 2963, 2986, 3044, 3067, 3113, 3136,
+  3190, 3213, 3259, 3282, 3340, 3363, 3409, 3432,
+  3531, 3554, 3600, 3623, 3681, 3704, 3750, 3773,
+};
+const int *vp10_dct_cat_lt_10_value_cost = dct_cat_lt_10_value_cost +
+    (sizeof(dct_cat_lt_10_value_cost) / sizeof(*dct_cat_lt_10_value_cost))
+    / 2;  // points at the table midpoint so negative indices are valid
 
 // Array indices are identical to previously-existing CONTEXT_NODE indices
 const vpx_tree_index vp10_coef_tree[TREE_SIZE(ENTROPY_TOKENS)] = {
diff --git a/vp10/encoder/tokenize.h b/vp10/encoder/tokenize.h
index 12f5f1f..46b7f3f 100644
--- a/vp10/encoder/tokenize.h
+++ b/vp10/encoder/tokenize.h
@@ -76,6 +76,7 @@
  */
 extern const TOKENVALUE *vp10_dct_value_tokens_ptr;
 extern const TOKENVALUE *vp10_dct_cat_lt_10_value_tokens;
+extern const int *vp10_dct_cat_lt_10_value_cost;
 extern const int16_t vp10_cat6_low_cost[256];
 extern const int vp10_cat6_high_cost[64];
 extern const int vp10_cat6_high10_high_cost[256];
@@ -119,6 +120,18 @@
   return vp10_dct_cat_lt_10_value_tokens[v].token;
 }
 
+static INLINE int vp10_get_token_cost(int v, int16_t *token,  // sets *token
+                                          const int *cat6_high_table) {
+  if (v >= CAT6_MIN_VAL || v <= -CAT6_MIN_VAL) {  // magnitude in cat6 range
+    EXTRABIT extrabits;
+    *token = CATEGORY6_TOKEN;
+    extrabits = abs(v) - CAT6_MIN_VAL;  // |v| minus the cat6 minimum
+    return vp10_cat6_low_cost[extrabits & 0xff]  // cost keyed by low 8 bits
+        + cat6_high_table[extrabits >> 8];  // plus cost keyed by higher bits
+  }
+  *token = vp10_dct_cat_lt_10_value_tokens[v].token;  // v may be negative
+  return vp10_dct_cat_lt_10_value_cost[v];  // table-lookup cost for small |v|
+}
 
 #ifdef __cplusplus
 }  // extern "C"
diff --git a/vp8/encoder/onyx_if.c b/vp8/encoder/onyx_if.c
index b1c2e11..edfd60c 100644
--- a/vp8/encoder/onyx_if.c
+++ b/vp8/encoder/onyx_if.c
@@ -21,7 +21,7 @@
 #include "vp8/common/alloccommon.h"
 #include "mcomp.h"
 #include "firstpass.h"
-#include "vpx/internal/vpx_psnr.h"
+#include "vpx_dsp/psnr.h"
 #include "vpx_scale/vpx_scale.h"
 #include "vp8/common/extend.h"
 #include "ratectrl.h"
diff --git a/vp9/common/vp9_alloccommon.c b/vp9/common/vp9_alloccommon.c
index 24c6c54..7dd1005 100644
--- a/vp9/common/vp9_alloccommon.c
+++ b/vp9/common/vp9_alloccommon.c
@@ -119,6 +119,20 @@
   cm->lf.lfm = NULL;
 }
 
+
+int vp9_alloc_loop_filter(VP9_COMMON *cm) {  // 0 = ok, 1 = alloc failed
+  vpx_free(cm->lf.lfm);  // release any previous mask array first
+  // Each lfm holds bit masks for all the 8x8 blocks in a 64x64 region.  The
+  // stride and rows are rounded up / truncated to a multiple of 8.
+  cm->lf.lfm_stride = (cm->mi_cols + (MI_BLOCK_SIZE - 1)) >> 3;
+  cm->lf.lfm = (LOOP_FILTER_MASK *)vpx_calloc(
+      ((cm->mi_rows + (MI_BLOCK_SIZE - 1)) >> 3) * cm->lf.lfm_stride,
+      sizeof(*cm->lf.lfm));
+  if (!cm->lf.lfm)
+    return 1;  // allocation failed; cm->lf.lfm is left NULL
+  return 0;
+}
+
 int vp9_alloc_context_buffers(VP9_COMMON *cm, int width, int height) {
   int new_mi_size;
 
@@ -151,15 +165,8 @@
     cm->above_context_alloc_cols = cm->mi_cols;
   }
 
-  vpx_free(cm->lf.lfm);
-
-  // Each lfm holds bit masks for all the 8x8 blocks in a 64x64 region.  The
-  // stride and rows are rounded up / truncated to a multiple of 8.
-  cm->lf.lfm_stride = (cm->mi_cols + (MI_BLOCK_SIZE - 1)) >> 3;
-  cm->lf.lfm = (LOOP_FILTER_MASK *)vpx_calloc(
-      ((cm->mi_rows + (MI_BLOCK_SIZE - 1)) >> 3) * cm->lf.lfm_stride,
-      sizeof(*cm->lf.lfm));
-  if (!cm->lf.lfm) goto fail;
+  if (vp9_alloc_loop_filter(cm))
+    goto fail;
 
   return 0;
 
diff --git a/vp9/common/vp9_alloccommon.h b/vp9/common/vp9_alloccommon.h
index c0e51a6..e53955b 100644
--- a/vp9/common/vp9_alloccommon.h
+++ b/vp9/common/vp9_alloccommon.h
@@ -23,6 +23,7 @@
 
 void vp9_remove_common(struct VP9Common *cm);
 
+int vp9_alloc_loop_filter(struct VP9Common *cm);
 int vp9_alloc_context_buffers(struct VP9Common *cm, int width, int height);
 void vp9_init_context_buffers(struct VP9Common *cm);
 void vp9_free_context_buffers(struct VP9Common *cm);
diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c
index a2445b0..cf1fe81 100644
--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c
@@ -3031,10 +3031,24 @@
   TileInfo *const tile_info = &tile_data->tile_info;
   MACROBLOCKD *const xd = &x->e_mbd;
   MODE_INFO *mi;
+  ENTROPY_CONTEXT l[16 * MAX_MB_PLANE], a[16 * MAX_MB_PLANE];
+  BLOCK_SIZE bs = VPXMAX(bsize, BLOCK_8X8);  // processing unit block size
+  const int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[bs];
+  const int num_4x4_blocks_high = num_4x4_blocks_high_lookup[bs];
+  int plane;
+
   set_offsets(cpi, tile_info, x, mi_row, mi_col, bsize);
   mi = xd->mi[0];
   mi->sb_type = bsize;
 
+  for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
+    struct macroblockd_plane *pd = &xd->plane[plane];
+    memcpy(a + num_4x4_blocks_wide * plane, pd->above_context,
+           (sizeof(a[0]) * num_4x4_blocks_wide) >> pd->subsampling_x);
+    memcpy(l + num_4x4_blocks_high * plane, pd->left_context,
+           (sizeof(l[0]) * num_4x4_blocks_high) >> pd->subsampling_y);
+  }
+
   if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && cm->seg.enabled)
     if (cyclic_refresh_segment_id_boosted(mi->segment_id))
       x->rdmult = vp9_cyclic_refresh_get_rdmult(cpi->cyclic_refresh);
@@ -3052,6 +3066,14 @@
 
   duplicate_mode_info_in_sb(cm, xd, mi_row, mi_col, bsize);
 
+  for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
+    struct macroblockd_plane *pd = &xd->plane[plane];
+    memcpy(pd->above_context, a + num_4x4_blocks_wide * plane,
+           (sizeof(a[0]) * num_4x4_blocks_wide) >> pd->subsampling_x);
+    memcpy(pd->left_context, l + num_4x4_blocks_high * plane,
+           (sizeof(l[0]) * num_4x4_blocks_high) >> pd->subsampling_y);
+  }
+
   if (rd_cost->rate == INT_MAX)
     vp9_rd_cost_reset(rd_cost);
 
diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c
index f3147e9..713b5f7 100644
--- a/vp9/encoder/vp9_encoder.c
+++ b/vp9/encoder/vp9_encoder.c
@@ -16,7 +16,7 @@
 #include "./vpx_config.h"
 #include "./vpx_dsp_rtcd.h"
 #include "./vpx_scale_rtcd.h"
-#include "vpx/internal/vpx_psnr.h"
+#include "vpx_dsp/psnr.h"
 #include "vpx_dsp/vpx_dsp_common.h"
 #include "vpx_dsp/vpx_filter.h"
 #if CONFIG_INTERNAL_STATS
@@ -1538,8 +1538,12 @@
       realloc_segmentation_maps(cpi);
       cpi->initial_width = cpi->initial_height = 0;
       cpi->external_resize = 0;
+    } else if (cm->mi_alloc_size == new_mi_size &&
+             (cpi->oxcf.width > last_w || cpi->oxcf.height > last_h)) {
+        vp9_alloc_loop_filter(cm);
     }
   }
+
   update_frame_size(cpi);
 
   if ((last_w != cpi->oxcf.width || last_h != cpi->oxcf.height) &&
@@ -2136,262 +2140,6 @@
 #endif
 }
 
-/* TODO(yaowu): The block_variance calls the unoptimized versions of variance()
- * and highbd_8_variance(). It should not.
- */
-static void encoder_variance(const uint8_t *a, int  a_stride,
-                             const uint8_t *b, int  b_stride,
-                             int  w, int  h, unsigned int *sse, int *sum) {
-  int i, j;
-
-  *sum = 0;
-  *sse = 0;
-
-  for (i = 0; i < h; i++) {
-    for (j = 0; j < w; j++) {
-      const int diff = a[j] - b[j];
-      *sum += diff;
-      *sse += diff * diff;
-    }
-
-    a += a_stride;
-    b += b_stride;
-  }
-}
-
-#if CONFIG_VP9_HIGHBITDEPTH
-static void encoder_highbd_variance64(const uint8_t *a8, int  a_stride,
-                                      const uint8_t *b8, int  b_stride,
-                                      int w, int h, uint64_t *sse,
-                                      uint64_t *sum) {
-  int i, j;
-
-  uint16_t *a = CONVERT_TO_SHORTPTR(a8);
-  uint16_t *b = CONVERT_TO_SHORTPTR(b8);
-  *sum = 0;
-  *sse = 0;
-
-  for (i = 0; i < h; i++) {
-    for (j = 0; j < w; j++) {
-      const int diff = a[j] - b[j];
-      *sum += diff;
-      *sse += diff * diff;
-    }
-    a += a_stride;
-    b += b_stride;
-  }
-}
-
-static void encoder_highbd_8_variance(const uint8_t *a8, int  a_stride,
-                                      const uint8_t *b8, int  b_stride,
-                                      int w, int h,
-                                      unsigned int *sse, int *sum) {
-  uint64_t sse_long = 0;
-  uint64_t sum_long = 0;
-  encoder_highbd_variance64(a8, a_stride, b8, b_stride, w, h,
-                            &sse_long, &sum_long);
-  *sse = (unsigned int)sse_long;
-  *sum = (int)sum_long;
-}
-#endif  // CONFIG_VP9_HIGHBITDEPTH
-
-static int64_t get_sse(const uint8_t *a, int a_stride,
-                       const uint8_t *b, int b_stride,
-                       int width, int height) {
-  const int dw = width % 16;
-  const int dh = height % 16;
-  int64_t total_sse = 0;
-  unsigned int sse = 0;
-  int sum = 0;
-  int x, y;
-
-  if (dw > 0) {
-    encoder_variance(&a[width - dw], a_stride, &b[width - dw], b_stride,
-                     dw, height, &sse, &sum);
-    total_sse += sse;
-  }
-
-  if (dh > 0) {
-    encoder_variance(&a[(height - dh) * a_stride], a_stride,
-                     &b[(height - dh) * b_stride], b_stride,
-                     width - dw, dh, &sse, &sum);
-    total_sse += sse;
-  }
-
-  for (y = 0; y < height / 16; ++y) {
-    const uint8_t *pa = a;
-    const uint8_t *pb = b;
-    for (x = 0; x < width / 16; ++x) {
-      vpx_mse16x16(pa, a_stride, pb, b_stride, &sse);
-      total_sse += sse;
-
-      pa += 16;
-      pb += 16;
-    }
-
-    a += 16 * a_stride;
-    b += 16 * b_stride;
-  }
-
-  return total_sse;
-}
-
-#if CONFIG_VP9_HIGHBITDEPTH
-static int64_t highbd_get_sse_shift(const uint8_t *a8, int a_stride,
-                                    const uint8_t *b8, int b_stride,
-                                    int width, int height,
-                                    unsigned int input_shift) {
-  const uint16_t *a = CONVERT_TO_SHORTPTR(a8);
-  const uint16_t *b = CONVERT_TO_SHORTPTR(b8);
-  int64_t total_sse = 0;
-  int x, y;
-  for (y = 0; y < height; ++y) {
-    for (x = 0; x < width; ++x) {
-      int64_t diff;
-      diff = (a[x] >> input_shift) - (b[x] >> input_shift);
-      total_sse += diff * diff;
-    }
-    a += a_stride;
-    b += b_stride;
-  }
-  return total_sse;
-}
-
-static int64_t highbd_get_sse(const uint8_t *a, int a_stride,
-                              const uint8_t *b, int b_stride,
-                              int width, int height) {
-  int64_t total_sse = 0;
-  int x, y;
-  const int dw = width % 16;
-  const int dh = height % 16;
-  unsigned int sse = 0;
-  int sum = 0;
-  if (dw > 0) {
-    encoder_highbd_8_variance(&a[width - dw], a_stride,
-                              &b[width - dw], b_stride,
-                              dw, height, &sse, &sum);
-    total_sse += sse;
-  }
-  if (dh > 0) {
-    encoder_highbd_8_variance(&a[(height - dh) * a_stride], a_stride,
-                              &b[(height - dh) * b_stride], b_stride,
-                              width - dw, dh, &sse, &sum);
-    total_sse += sse;
-  }
-  for (y = 0; y < height / 16; ++y) {
-    const uint8_t *pa = a;
-    const uint8_t *pb = b;
-    for (x = 0; x < width / 16; ++x) {
-      vpx_highbd_8_mse16x16(pa, a_stride, pb, b_stride, &sse);
-      total_sse += sse;
-      pa += 16;
-      pb += 16;
-    }
-    a += 16 * a_stride;
-    b += 16 * b_stride;
-  }
-  return total_sse;
-}
-#endif  // CONFIG_VP9_HIGHBITDEPTH
-
-typedef struct {
-  double psnr[4];       // total/y/u/v
-  uint64_t sse[4];      // total/y/u/v
-  uint32_t samples[4];  // total/y/u/v
-} PSNR_STATS;
-
-#if CONFIG_VP9_HIGHBITDEPTH
-static void calc_highbd_psnr(const YV12_BUFFER_CONFIG *a,
-                             const YV12_BUFFER_CONFIG *b,
-                             PSNR_STATS *psnr,
-                             unsigned int bit_depth,
-                             unsigned int in_bit_depth) {
-  const int widths[3] =
-      {a->y_crop_width,  a->uv_crop_width,  a->uv_crop_width };
-  const int heights[3] =
-      {a->y_crop_height, a->uv_crop_height, a->uv_crop_height};
-  const uint8_t *a_planes[3] = {a->y_buffer, a->u_buffer,  a->v_buffer };
-  const int a_strides[3] = {a->y_stride, a->uv_stride, a->uv_stride};
-  const uint8_t *b_planes[3] = {b->y_buffer, b->u_buffer,  b->v_buffer };
-  const int b_strides[3] = {b->y_stride, b->uv_stride, b->uv_stride};
-  int i;
-  uint64_t total_sse = 0;
-  uint32_t total_samples = 0;
-  const double peak = (double)((1 << in_bit_depth) - 1);
-  const unsigned int input_shift = bit_depth - in_bit_depth;
-
-  for (i = 0; i < 3; ++i) {
-    const int w = widths[i];
-    const int h = heights[i];
-    const uint32_t samples = w * h;
-    uint64_t sse;
-    if (a->flags & YV12_FLAG_HIGHBITDEPTH) {
-      if (input_shift) {
-        sse = highbd_get_sse_shift(a_planes[i], a_strides[i],
-                                   b_planes[i], b_strides[i], w, h,
-                                   input_shift);
-      } else {
-        sse = highbd_get_sse(a_planes[i], a_strides[i],
-                             b_planes[i], b_strides[i], w, h);
-      }
-    } else {
-      sse = get_sse(a_planes[i], a_strides[i],
-                    b_planes[i], b_strides[i],
-                    w, h);
-    }
-    psnr->sse[1 + i] = sse;
-    psnr->samples[1 + i] = samples;
-    psnr->psnr[1 + i] = vpx_sse_to_psnr(samples, peak, (double)sse);
-
-    total_sse += sse;
-    total_samples += samples;
-  }
-
-  psnr->sse[0] = total_sse;
-  psnr->samples[0] = total_samples;
-  psnr->psnr[0] = vpx_sse_to_psnr((double)total_samples, peak,
-                                  (double)total_sse);
-}
-
-#else  // !CONFIG_VP9_HIGHBITDEPTH
-
-static void calc_psnr(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b,
-                      PSNR_STATS *psnr) {
-  static const double peak = 255.0;
-  const int widths[3]        = {
-      a->y_crop_width, a->uv_crop_width, a->uv_crop_width};
-  const int heights[3]       = {
-      a->y_crop_height, a->uv_crop_height, a->uv_crop_height};
-  const uint8_t *a_planes[3] = {a->y_buffer, a->u_buffer, a->v_buffer};
-  const int a_strides[3]     = {a->y_stride, a->uv_stride, a->uv_stride};
-  const uint8_t *b_planes[3] = {b->y_buffer, b->u_buffer, b->v_buffer};
-  const int b_strides[3]     = {b->y_stride, b->uv_stride, b->uv_stride};
-  int i;
-  uint64_t total_sse = 0;
-  uint32_t total_samples = 0;
-
-  for (i = 0; i < 3; ++i) {
-    const int w = widths[i];
-    const int h = heights[i];
-    const uint32_t samples = w * h;
-    const uint64_t sse = get_sse(a_planes[i], a_strides[i],
-                                 b_planes[i], b_strides[i],
-                                 w, h);
-    psnr->sse[1 + i] = sse;
-    psnr->samples[1 + i] = samples;
-    psnr->psnr[1 + i] = vpx_sse_to_psnr(samples, peak, (double)sse);
-
-    total_sse += sse;
-    total_samples += samples;
-  }
-
-  psnr->sse[0] = total_sse;
-  psnr->samples[0] = total_samples;
-  psnr->psnr[0] = vpx_sse_to_psnr((double)total_samples, peak,
-                                  (double)total_sse);
-}
-#endif  // CONFIG_VP9_HIGHBITDEPTH
-
 static void generate_psnr_packet(VP9_COMP *cpi) {
   struct vpx_codec_cx_pkt pkt;
   int i;
@@ -3057,7 +2805,7 @@
 
   vpx_clear_system_state();
 
-  recon_err = vp9_get_y_sse(cpi->Source, get_frame_new_buffer(cm));
+  recon_err = vpx_get_y_sse(cpi->Source, get_frame_new_buffer(cm));
 
   if (cpi->twopass.total_left_stats.coded_error != 0.0)
     fprintf(f, "%10u %dx%d %10d %10d %d %d %10d %10d %10d %10d"
@@ -3567,12 +3315,12 @@
 
 #if CONFIG_VP9_HIGHBITDEPTH
         if (cm->use_highbitdepth) {
-          kf_err = vp9_highbd_get_y_sse(cpi->Source, get_frame_new_buffer(cm));
+          kf_err = vpx_highbd_get_y_sse(cpi->Source, get_frame_new_buffer(cm));
         } else {
-          kf_err = vp9_get_y_sse(cpi->Source, get_frame_new_buffer(cm));
+          kf_err = vpx_get_y_sse(cpi->Source, get_frame_new_buffer(cm));
         }
 #else
-        kf_err = vp9_get_y_sse(cpi->Source, get_frame_new_buffer(cm));
+        kf_err = vpx_get_y_sse(cpi->Source, get_frame_new_buffer(cm));
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
         // Prevent possible divide by zero error below for perfect KF
@@ -3963,13 +3711,13 @@
   if (cpi->rc.next_key_frame_forced && cpi->rc.frames_to_key == 1) {
 #if CONFIG_VP9_HIGHBITDEPTH
     if (cm->use_highbitdepth) {
-      cpi->ambient_err = vp9_highbd_get_y_sse(cpi->Source,
+      cpi->ambient_err = vpx_highbd_get_y_sse(cpi->Source,
                                               get_frame_new_buffer(cm));
     } else {
-      cpi->ambient_err = vp9_get_y_sse(cpi->Source, get_frame_new_buffer(cm));
+      cpi->ambient_err = vpx_get_y_sse(cpi->Source, get_frame_new_buffer(cm));
     }
 #else
-    cpi->ambient_err = vp9_get_y_sse(cpi->Source, get_frame_new_buffer(cm));
+    cpi->ambient_err = vpx_get_y_sse(cpi->Source, get_frame_new_buffer(cm));
 #endif  // CONFIG_VP9_HIGHBITDEPTH
   }
 
@@ -4141,13 +3889,22 @@
 int vp9_receive_raw_frame(VP9_COMP *cpi, unsigned int frame_flags,
                           YV12_BUFFER_CONFIG *sd, int64_t time_stamp,
                           int64_t end_time) {
-  VP9_COMMON *cm = &cpi->common;
+  VP9_COMMON *volatile const cm = &cpi->common;
   struct vpx_usec_timer timer;
-  int res = 0;
+  volatile int res = 0;
   const int subsampling_x = sd->subsampling_x;
   const int subsampling_y = sd->subsampling_y;
 #if CONFIG_VP9_HIGHBITDEPTH
   const int use_highbitdepth = (sd->flags & YV12_FLAG_HIGHBITDEPTH) != 0;
+#endif
+
+  if (setjmp(cm->error.jmp)) {
+    cm->error.setjmp = 0;
+    return -1;
+  }
+  cm->error.setjmp = 1;
+
+#if CONFIG_VP9_HIGHBITDEPTH
   check_initial_width(cpi, use_highbitdepth, subsampling_x, subsampling_y);
 #else
   check_initial_width(cpi, subsampling_x, subsampling_y);
@@ -4180,6 +3937,7 @@
     res = -1;
   }
 
+  cm->error.setjmp = 0;
   return res;
 }
 
@@ -4830,28 +4588,6 @@
   return;
 }
 
-int64_t vp9_get_y_sse(const YV12_BUFFER_CONFIG *a,
-                      const YV12_BUFFER_CONFIG *b) {
-  assert(a->y_crop_width == b->y_crop_width);
-  assert(a->y_crop_height == b->y_crop_height);
-
-  return get_sse(a->y_buffer, a->y_stride, b->y_buffer, b->y_stride,
-                 a->y_crop_width, a->y_crop_height);
-}
-
-#if CONFIG_VP9_HIGHBITDEPTH
-int64_t vp9_highbd_get_y_sse(const YV12_BUFFER_CONFIG *a,
-                             const YV12_BUFFER_CONFIG *b) {
-  assert(a->y_crop_width == b->y_crop_width);
-  assert(a->y_crop_height == b->y_crop_height);
-  assert((a->flags & YV12_FLAG_HIGHBITDEPTH) != 0);
-  assert((b->flags & YV12_FLAG_HIGHBITDEPTH) != 0);
-
-  return highbd_get_sse(a->y_buffer, a->y_stride, b->y_buffer, b->y_stride,
-                        a->y_crop_width, a->y_crop_height);
-}
-#endif  // CONFIG_VP9_HIGHBITDEPTH
-
 int vp9_get_quantizer(VP9_COMP *cpi) {
   return cpi->common.base_qindex;
 }
diff --git a/vp9/encoder/vp9_encoder.h b/vp9/encoder/vp9_encoder.h
index 8759cbe..017fa61 100644
--- a/vp9/encoder/vp9_encoder.h
+++ b/vp9/encoder/vp9_encoder.h
@@ -608,12 +608,6 @@
   return get_token_alloc(tile_mb_rows, tile_mb_cols);
 }
 
-int64_t vp9_get_y_sse(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b);
-#if CONFIG_VP9_HIGHBITDEPTH
-int64_t vp9_highbd_get_y_sse(const YV12_BUFFER_CONFIG *a,
-                             const YV12_BUFFER_CONFIG *b);
-#endif  // CONFIG_VP9_HIGHBITDEPTH
-
 void vp9_scale_references(VP9_COMP *cpi);
 
 void vp9_update_reference_frames(VP9_COMP *cpi);
diff --git a/vp9/encoder/vp9_picklpf.c b/vp9/encoder/vp9_picklpf.c
index f6b1dfc..80ab238 100644
--- a/vp9/encoder/vp9_picklpf.c
+++ b/vp9/encoder/vp9_picklpf.c
@@ -12,7 +12,7 @@
 #include <limits.h>
 
 #include "./vpx_scale_rtcd.h"
-
+#include "vpx_dsp/psnr.h"
 #include "vpx_mem/vpx_mem.h"
 #include "vpx_ports/mem.h"
 
@@ -52,12 +52,12 @@
 
 #if CONFIG_VP9_HIGHBITDEPTH
   if (cm->use_highbitdepth) {
-    filt_err = vp9_highbd_get_y_sse(sd, cm->frame_to_show);
+    filt_err = vpx_highbd_get_y_sse(sd, cm->frame_to_show);
   } else {
-    filt_err = vp9_get_y_sse(sd, cm->frame_to_show);
+    filt_err = vpx_get_y_sse(sd, cm->frame_to_show);
   }
 #else
-  filt_err = vp9_get_y_sse(sd, cm->frame_to_show);
+  filt_err = vpx_get_y_sse(sd, cm->frame_to_show);
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
   // Re-instate the unfiltered frame
diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c
index 1480ea4..193c9d3 100644
--- a/vp9/encoder/vp9_rdopt.c
+++ b/vp9/encoder/vp9_rdopt.c
@@ -787,9 +787,9 @@
 #if CONFIG_VP9_HIGHBITDEPTH
   uint16_t best_dst16[8 * 8];
 #endif
+  memcpy(ta, a, num_4x4_blocks_wide * sizeof(a[0]));
+  memcpy(tl, l, num_4x4_blocks_high * sizeof(l[0]));
 
-  memcpy(ta, a, sizeof(ta));
-  memcpy(tl, l, sizeof(tl));
   xd->mi[0]->tx_size = TX_4X4;
 
 #if CONFIG_VP9_HIGHBITDEPTH
@@ -810,8 +810,8 @@
             continue;
       }
 
-      memcpy(tempa, ta, sizeof(ta));
-      memcpy(templ, tl, sizeof(tl));
+      memcpy(tempa, ta, num_4x4_blocks_wide * sizeof(ta[0]));
+      memcpy(templ, tl, num_4x4_blocks_high * sizeof(tl[0]));
 
       for (idy = 0; idy < num_4x4_blocks_high; ++idy) {
         for (idx = 0; idx < num_4x4_blocks_wide; ++idx) {
@@ -874,8 +874,8 @@
         *bestdistortion = distortion;
         best_rd = this_rd;
         *best_mode = mode;
-        memcpy(a, tempa, sizeof(tempa));
-        memcpy(l, templ, sizeof(templ));
+        memcpy(a, tempa, num_4x4_blocks_wide * sizeof(tempa[0]));
+        memcpy(l, templ, num_4x4_blocks_high * sizeof(templ[0]));
         for (idy = 0; idy < num_4x4_blocks_high * 4; ++idy) {
           memcpy(best_dst16 + idy * 8,
                  CONVERT_TO_SHORTPTR(dst_init + idy * dst_stride),
@@ -914,8 +914,8 @@
           continue;
     }
 
-    memcpy(tempa, ta, sizeof(ta));
-    memcpy(templ, tl, sizeof(tl));
+    memcpy(tempa, ta, num_4x4_blocks_wide * sizeof(ta[0]));
+    memcpy(templ, tl, num_4x4_blocks_high * sizeof(tl[0]));
 
     for (idy = 0; idy < num_4x4_blocks_high; ++idy) {
       for (idx = 0; idx < num_4x4_blocks_wide; ++idx) {
@@ -976,8 +976,8 @@
       *bestdistortion = distortion;
       best_rd = this_rd;
       *best_mode = mode;
-      memcpy(a, tempa, sizeof(tempa));
-      memcpy(l, templ, sizeof(templ));
+      memcpy(a, tempa, num_4x4_blocks_wide * sizeof(tempa[0]));
+      memcpy(l, templ, num_4x4_blocks_high * sizeof(templ[0]));
       for (idy = 0; idy < num_4x4_blocks_high * 4; ++idy)
         memcpy(best_dst + idy * 8, dst_init + idy * dst_stride,
                num_4x4_blocks_wide * 4);
@@ -1013,12 +1013,8 @@
   int64_t total_distortion = 0;
   int tot_rate_y = 0;
   int64_t total_rd = 0;
-  ENTROPY_CONTEXT t_above[4], t_left[4];
   const int *bmode_costs = cpi->mbmode_cost;
 
-  memcpy(t_above, xd->plane[0].above_context, sizeof(t_above));
-  memcpy(t_left, xd->plane[0].left_context, sizeof(t_left));
-
   // Pick modes for each sub-block (of size 4x4, 4x8, or 8x4) in an 8x8 block.
   for (idy = 0; idy < 2; idy += num_4x4_blocks_high) {
     for (idx = 0; idx < 2; idx += num_4x4_blocks_wide) {
@@ -1034,8 +1030,11 @@
       }
 
       this_rd = rd_pick_intra4x4block(cpi, mb, idy, idx, &best_mode,
-                                      bmode_costs, t_above + idx, t_left + idy,
+                                      bmode_costs,
+                                      xd->plane[0].above_context + idx,
+                                      xd->plane[0].left_context + idy,
                                       &r, &ry, &d, bsize, best_rd - total_rd);
+
       if (this_rd >= best_rd - total_rd)
         return INT64_MAX;
 
diff --git a/vp9/encoder/vp9_speed_features.c b/vp9/encoder/vp9_speed_features.c
index 8a34fd9..f684507 100644
--- a/vp9/encoder/vp9_speed_features.c
+++ b/vp9/encoder/vp9_speed_features.c
@@ -303,14 +303,26 @@
                                  FLAG_SKIP_INTRA_LOWVAR;
     sf->adaptive_pred_interp_filter = 2;
 
-    // Disable reference masking if using spatial scaling or for dynamic
-    // resizing (internal or external) since pred_mv_sad will not be set
-    // (since vp9_mv_pred will not be called).
-    // TODO(marpan): Fix this condition to allow reference masking for when
-    // all references have same resolution as source frame.
-    sf->reference_masking = (cpi->external_resize == 0 &&
-                             cpi->oxcf.resize_mode != RESIZE_DYNAMIC &&
-                             cpi->svc.number_spatial_layers == 1) ? 1 : 0;
+    // Reference masking only enabled for 1 spatial layer, and if none of the
+    // references have been scaled. The latter condition needs to be checked
+    // for external or internal dynamic resize.
+    sf->reference_masking = (cpi->svc.number_spatial_layers == 1);
+    if (sf->reference_masking == 1 &&
+        (cpi->external_resize == 1 ||
+         cpi->oxcf.resize_mode == RESIZE_DYNAMIC)) {
+      MV_REFERENCE_FRAME ref_frame;
+      static const int flag_list[4] =
+          {0, VP9_LAST_FLAG, VP9_GOLD_FLAG, VP9_ALT_FLAG};
+      for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
+        const YV12_BUFFER_CONFIG *yv12 = get_ref_frame_buffer(cpi, ref_frame);
+        if (yv12 != NULL && (cpi->ref_frame_flags & flag_list[ref_frame])) {
+          const struct scale_factors *const scale_fac =
+              &cm->frame_refs[ref_frame - 1].sf;
+          if (vp9_is_scaled(scale_fac))
+            sf->reference_masking = 0;
+        }
+      }
+    }
 
     sf->disable_filter_search_var_thresh = 50;
     sf->comp_inter_joint_search_thresh = BLOCK_SIZES;
diff --git a/vpx/internal/vpx_psnr.h b/vpx/internal/vpx_psnr.h
deleted file mode 100644
index 0e90085..0000000
--- a/vpx/internal/vpx_psnr.h
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#ifndef VPX_INTERNAL_VPX_PSNR_H_
-#define VPX_INTERNAL_VPX_PSNR_H_
-
-#define MAX_PSNR 100.0
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-// TODO(dkovalev) change vpx_sse_to_psnr signature: double -> int64_t
-
-/*!\brief Converts SSE to PSNR
- *
- * Converts sum of squared errros (SSE) to peak signal-to-noise ratio (PNSR).
- *
- * \param[in]    samples       Number of samples
- * \param[in]    peak          Max sample value
- * \param[in]    sse           Sum of squared errors
- */
-double vpx_sse_to_psnr(double samples, double peak, double sse);
-
-#ifdef __cplusplus
-}  // extern "C"
-#endif
-
-#endif  // VPX_INTERNAL_VPX_PSNR_H_
diff --git a/vpx/src/vpx_psnr.c b/vpx/src/vpx_psnr.c
deleted file mode 100644
index 27a6180..0000000
--- a/vpx/src/vpx_psnr.c
+++ /dev/null
@@ -1,23 +0,0 @@
-/*
- *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <math.h>
-
-#include "vpx/internal/vpx_psnr.h"
-
-
-double vpx_sse_to_psnr(double samples, double peak, double sse) {
-  if (sse > 0.0) {
-    const double psnr = 10.0 * log10(samples * peak * peak / sse);
-    return psnr > MAX_PSNR ? MAX_PSNR : psnr;
-  } else {
-    return MAX_PSNR;
-  }
-}
diff --git a/vpx/vpx_codec.mk b/vpx/vpx_codec.mk
index ccdef04..b77f458 100644
--- a/vpx/vpx_codec.mk
+++ b/vpx/vpx_codec.mk
@@ -36,10 +36,8 @@
 API_SRCS-yes += src/vpx_encoder.c
 API_SRCS-yes += vpx_encoder.h
 API_SRCS-yes += internal/vpx_codec_internal.h
-API_SRCS-yes += internal/vpx_psnr.h
 API_SRCS-yes += src/vpx_codec.c
 API_SRCS-yes += src/vpx_image.c
-API_SRCS-yes += src/vpx_psnr.c
 API_SRCS-yes += vpx_codec.h
 API_SRCS-yes += vpx_codec.mk
 API_SRCS-yes += vpx_frame_buffer.h
diff --git a/vpx_dsp/intrapred.c b/vpx_dsp/intrapred.c
index dcc9b30..b1076f8 100644
--- a/vpx_dsp/intrapred.c
+++ b/vpx_dsp/intrapred.c
@@ -320,6 +320,7 @@
   const int K = above[2];
   const int L = above[3];
   const int M = above[4];
+  (void)left;
 
   dst[0] = AVG3(H, I, J);
   dst[1] = AVG3(I, J, K);
diff --git a/vpx_dsp/inv_txfm.c b/vpx_dsp/inv_txfm.c
index a0f59bf..402fd9a 100644
--- a/vpx_dsp/inv_txfm.c
+++ b/vpx_dsp/inv_txfm.c
@@ -2057,8 +2057,8 @@
   }
 }
 
-static void highbd_idct32_c(const tran_low_t *input,
-                            tran_low_t *output, int bd) {
+void vpx_highbd_idct32_c(const tran_low_t *input,
+                         tran_low_t *output, int bd) {
   tran_low_t step1[32], step2[32];
   tran_high_t temp1, temp2;
   (void) bd;
@@ -2447,7 +2447,7 @@
       zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
 
     if (zero_coeff[0] | zero_coeff[1])
-      highbd_idct32_c(input, outptr, bd);
+      vpx_highbd_idct32_c(input, outptr, bd);
     else
       memset(outptr, 0, sizeof(tran_low_t) * 32);
     input += 32;
@@ -2458,7 +2458,7 @@
   for (i = 0; i < 32; ++i) {
     for (j = 0; j < 32; ++j)
       temp_in[j] = out[j * 32 + i];
-    highbd_idct32_c(temp_in, temp_out, bd);
+    vpx_highbd_idct32_c(temp_in, temp_out, bd);
     for (j = 0; j < 32; ++j) {
       dest[j * stride + i] = highbd_clip_pixel_add(
           dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
@@ -2477,7 +2477,7 @@
   // Rows
   // Only upper-left 8x8 has non-zero coeff.
   for (i = 0; i < 8; ++i) {
-    highbd_idct32_c(input, outptr, bd);
+    vpx_highbd_idct32_c(input, outptr, bd);
     input += 32;
     outptr += 32;
   }
@@ -2485,7 +2485,7 @@
   for (i = 0; i < 32; ++i) {
     for (j = 0; j < 32; ++j)
       temp_in[j] = out[j * 32 + i];
-    highbd_idct32_c(temp_in, temp_out, bd);
+    vpx_highbd_idct32_c(temp_in, temp_out, bd);
     for (j = 0; j < 32; ++j) {
       dest[j * stride + i] = highbd_clip_pixel_add(
           dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
diff --git a/vpx_dsp/inv_txfm.h b/vpx_dsp/inv_txfm.h
index 2358813..adbb838 100644
--- a/vpx_dsp/inv_txfm.h
+++ b/vpx_dsp/inv_txfm.h
@@ -100,6 +100,7 @@
 void vpx_highbd_idct4_c(const tran_low_t *input, tran_low_t *output, int bd);
 void vpx_highbd_idct8_c(const tran_low_t *input, tran_low_t *output, int bd);
 void vpx_highbd_idct16_c(const tran_low_t *input, tran_low_t *output, int bd);
+void vpx_highbd_idct32_c(const tran_low_t *input, tran_low_t *output, int bd);
 
 void vpx_highbd_iadst4_c(const tran_low_t *input, tran_low_t *output, int bd);
 void vpx_highbd_iadst8_c(const tran_low_t *input, tran_low_t *output, int bd);
diff --git a/vpx_dsp/psnr.c b/vpx_dsp/psnr.c
new file mode 100644
index 0000000..1b92e2a
--- /dev/null
+++ b/vpx_dsp/psnr.c
@@ -0,0 +1,297 @@
+/*
+*  Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+*
+*  Use of this source code is governed by a BSD-style license
+*  that can be found in the LICENSE file in the root of the source
+*  tree. An additional intellectual property rights grant can be found
+*  in the file PATENTS.  All contributing project authors may
+*  be found in the AUTHORS file in the root of the source tree.
+*/
+
+#include <math.h>
+#include <assert.h>
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/psnr.h"
+#include "vpx_scale/yv12config.h"
+
+
+double vpx_sse_to_psnr(double samples, double peak, double sse) {
+  if (sse > 0.0) {
+    const double psnr = 10.0 * log10(samples * peak * peak / sse);
+    return psnr > MAX_PSNR ? MAX_PSNR : psnr;
+  } else {
+    return MAX_PSNR;
+  }
+}
+
+/* TODO(yaowu): The block_variance calls the unoptimized versions of variance()
+* and highbd_8_variance(). It should not.
+*/
+static void encoder_variance(const uint8_t *a, int  a_stride,
+  const uint8_t *b, int  b_stride,
+  int  w, int  h, unsigned int *sse, int *sum) {
+  int i, j;
+
+  *sum = 0;
+  *sse = 0;
+
+  for (i = 0; i < h; i++) {
+    for (j = 0; j < w; j++) {
+      const int diff = a[j] - b[j];
+      *sum += diff;
+      *sse += diff * diff;
+    }
+
+    a += a_stride;
+    b += b_stride;
+  }
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+static void encoder_highbd_variance64(const uint8_t *a8, int  a_stride,
+  const uint8_t *b8, int  b_stride,
+  int w, int h, uint64_t *sse,
+  uint64_t *sum) {
+  int i, j;
+
+  uint16_t *a = CONVERT_TO_SHORTPTR(a8);
+  uint16_t *b = CONVERT_TO_SHORTPTR(b8);
+  *sum = 0;
+  *sse = 0;
+
+  for (i = 0; i < h; i++) {
+    for (j = 0; j < w; j++) {
+      const int diff = a[j] - b[j];
+      *sum += diff;
+      *sse += diff * diff;
+    }
+    a += a_stride;
+    b += b_stride;
+  }
+}
+
+static void encoder_highbd_8_variance(const uint8_t *a8, int  a_stride,
+  const uint8_t *b8, int  b_stride,
+  int w, int h,
+  unsigned int *sse, int *sum) {
+  uint64_t sse_long = 0;
+  uint64_t sum_long = 0;
+  encoder_highbd_variance64(a8, a_stride, b8, b_stride, w, h,
+    &sse_long, &sum_long);
+  *sse = (unsigned int)sse_long;
+  *sum = (int)sum_long;
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+static int64_t get_sse(const uint8_t *a, int a_stride,
+  const uint8_t *b, int b_stride,
+  int width, int height) {
+  const int dw = width % 16;
+  const int dh = height % 16;
+  int64_t total_sse = 0;
+  unsigned int sse = 0;
+  int sum = 0;
+  int x, y;
+
+  if (dw > 0) {
+    encoder_variance(&a[width - dw], a_stride, &b[width - dw], b_stride,
+      dw, height, &sse, &sum);
+    total_sse += sse;
+  }
+
+  if (dh > 0) {
+    encoder_variance(&a[(height - dh) * a_stride], a_stride,
+      &b[(height - dh) * b_stride], b_stride,
+      width - dw, dh, &sse, &sum);
+    total_sse += sse;
+  }
+
+  for (y = 0; y < height / 16; ++y) {
+    const uint8_t *pa = a;
+    const uint8_t *pb = b;
+    for (x = 0; x < width / 16; ++x) {
+      vpx_mse16x16(pa, a_stride, pb, b_stride, &sse);
+      total_sse += sse;
+
+      pa += 16;
+      pb += 16;
+    }
+
+    a += 16 * a_stride;
+    b += 16 * b_stride;
+  }
+
+  return total_sse;
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+int64_t highbd_get_sse_shift(const uint8_t *a8, int a_stride,
+  const uint8_t *b8, int b_stride,
+  int width, int height,
+  unsigned int input_shift) {
+  const uint16_t *a = CONVERT_TO_SHORTPTR(a8);
+  const uint16_t *b = CONVERT_TO_SHORTPTR(b8);
+  int64_t total_sse = 0;
+  int x, y;
+  for (y = 0; y < height; ++y) {
+    for (x = 0; x < width; ++x) {
+      int64_t diff;
+      diff = (a[x] >> input_shift) - (b[x] >> input_shift);
+      total_sse += diff * diff;
+    }
+    a += a_stride;
+    b += b_stride;
+  }
+  return total_sse;
+}
+
+int64_t highbd_get_sse(const uint8_t *a, int a_stride,
+  const uint8_t *b, int b_stride,
+  int width, int height) {
+  int64_t total_sse = 0;
+  int x, y;
+  const int dw = width % 16;
+  const int dh = height % 16;
+  unsigned int sse = 0;
+  int sum = 0;
+  if (dw > 0) {
+    encoder_highbd_8_variance(&a[width - dw], a_stride,
+      &b[width - dw], b_stride,
+      dw, height, &sse, &sum);
+    total_sse += sse;
+  }
+  if (dh > 0) {
+    encoder_highbd_8_variance(&a[(height - dh) * a_stride], a_stride,
+      &b[(height - dh) * b_stride], b_stride,
+      width - dw, dh, &sse, &sum);
+    total_sse += sse;
+  }
+  for (y = 0; y < height / 16; ++y) {
+    const uint8_t *pa = a;
+    const uint8_t *pb = b;
+    for (x = 0; x < width / 16; ++x) {
+      vpx_highbd_8_mse16x16(pa, a_stride, pb, b_stride, &sse);
+      total_sse += sse;
+      pa += 16;
+      pb += 16;
+    }
+    a += 16 * a_stride;
+    b += 16 * b_stride;
+  }
+  return total_sse;
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+
+int64_t vpx_get_y_sse(const YV12_BUFFER_CONFIG *a,
+  const YV12_BUFFER_CONFIG *b) {
+  assert(a->y_crop_width == b->y_crop_width);
+  assert(a->y_crop_height == b->y_crop_height);
+
+  return get_sse(a->y_buffer, a->y_stride, b->y_buffer, b->y_stride,
+    a->y_crop_width, a->y_crop_height);
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+int64_t vpx_highbd_get_y_sse(const YV12_BUFFER_CONFIG *a,
+  const YV12_BUFFER_CONFIG *b) {
+  assert(a->y_crop_width == b->y_crop_width);
+  assert(a->y_crop_height == b->y_crop_height);
+  assert((a->flags & YV12_FLAG_HIGHBITDEPTH) != 0);
+  assert((b->flags & YV12_FLAG_HIGHBITDEPTH) != 0);
+
+  return highbd_get_sse(a->y_buffer, a->y_stride, b->y_buffer, b->y_stride,
+    a->y_crop_width, a->y_crop_height);
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+#if CONFIG_VP9_HIGHBITDEPTH
+void calc_highbd_psnr(const YV12_BUFFER_CONFIG *a,
+  const YV12_BUFFER_CONFIG *b,
+  PSNR_STATS *psnr,
+  unsigned int bit_depth,
+  unsigned int in_bit_depth) {
+  const int widths[3] =
+  { a->y_crop_width, a->uv_crop_width, a->uv_crop_width };
+  const int heights[3] =
+  { a->y_crop_height, a->uv_crop_height, a->uv_crop_height };
+  const uint8_t *a_planes[3] = { a->y_buffer, a->u_buffer, a->v_buffer };
+  const int a_strides[3] = { a->y_stride, a->uv_stride, a->uv_stride };
+  const uint8_t *b_planes[3] = { b->y_buffer, b->u_buffer, b->v_buffer };
+  const int b_strides[3] = { b->y_stride, b->uv_stride, b->uv_stride };
+  int i;
+  uint64_t total_sse = 0;
+  uint32_t total_samples = 0;
+  const double peak = (double)((1 << in_bit_depth) - 1);
+  const unsigned int input_shift = bit_depth - in_bit_depth;
+
+  for (i = 0; i < 3; ++i) {
+    const int w = widths[i];
+    const int h = heights[i];
+    const uint32_t samples = w * h;
+    uint64_t sse;
+    if (a->flags & YV12_FLAG_HIGHBITDEPTH) {
+      if (input_shift) {
+        sse = highbd_get_sse_shift(a_planes[i], a_strides[i],
+          b_planes[i], b_strides[i], w, h,
+          input_shift);
+      } else {
+        sse = highbd_get_sse(a_planes[i], a_strides[i],
+          b_planes[i], b_strides[i], w, h);
+      }
+    } else {
+      sse = get_sse(a_planes[i], a_strides[i],
+        b_planes[i], b_strides[i],
+        w, h);
+    }
+    psnr->sse[1 + i] = sse;
+    psnr->samples[1 + i] = samples;
+    psnr->psnr[1 + i] = vpx_sse_to_psnr(samples, peak, (double)sse);
+
+    total_sse += sse;
+    total_samples += samples;
+  }
+
+  psnr->sse[0] = total_sse;
+  psnr->samples[0] = total_samples;
+  psnr->psnr[0] = vpx_sse_to_psnr((double)total_samples, peak,
+    (double)total_sse);
+}
+
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+void calc_psnr(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b,
+  PSNR_STATS *psnr) {
+  static const double peak = 255.0;
+  const int widths[3] = {
+    a->y_crop_width, a->uv_crop_width, a->uv_crop_width };
+  const int heights[3] = {
+    a->y_crop_height, a->uv_crop_height, a->uv_crop_height };
+  const uint8_t *a_planes[3] = { a->y_buffer, a->u_buffer, a->v_buffer };
+  const int a_strides[3] = { a->y_stride, a->uv_stride, a->uv_stride };
+  const uint8_t *b_planes[3] = { b->y_buffer, b->u_buffer, b->v_buffer };
+  const int b_strides[3] = { b->y_stride, b->uv_stride, b->uv_stride };
+  int i;
+  uint64_t total_sse = 0;
+  uint32_t total_samples = 0;
+
+  for (i = 0; i < 3; ++i) {
+    const int w = widths[i];
+    const int h = heights[i];
+    const uint32_t samples = w * h;
+    const uint64_t sse = get_sse(a_planes[i], a_strides[i],
+      b_planes[i], b_strides[i],
+      w, h);
+    psnr->sse[1 + i] = sse;
+    psnr->samples[1 + i] = samples;
+    psnr->psnr[1 + i] = vpx_sse_to_psnr(samples, peak, (double)sse);
+
+    total_sse += sse;
+    total_samples += samples;
+  }
+
+  psnr->sse[0] = total_sse;
+  psnr->samples[0] = total_samples;
+  psnr->psnr[0] = vpx_sse_to_psnr((double)total_samples, peak,
+    (double)total_sse);
+}
diff --git a/vpx_dsp/psnr.h b/vpx_dsp/psnr.h
new file mode 100644
index 0000000..c8da94f
--- /dev/null
+++ b/vpx_dsp/psnr.h
@@ -0,0 +1,65 @@
+/*
+*  Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+*
+*  Use of this source code is governed by a BSD-style license
+*  that can be found in the LICENSE file in the root of the source
+*  tree. An additional intellectual property rights grant can be found
+*  in the file PATENTS.  All contributing project authors may
+*  be found in the AUTHORS file in the root of the source tree.
+*/
+
+#ifndef VPX_DSP_PSNR_H_
+#define VPX_DSP_PSNR_H_
+
+
+#include "vpx_scale/yv12config.h"
+
+#define MAX_PSNR 100.0
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct {
+  double psnr[4];       // total/y/u/v
+  uint64_t sse[4];      // total/y/u/v
+  uint32_t samples[4];  // total/y/u/v
+} PSNR_STATS;
+
+// TODO(dkovalev) change vpx_sse_to_psnr signature: double -> int64_t
+
+/*!\brief Converts SSE to PSNR
+*
+* Converts sum of squared errors (SSE) to peak signal-to-noise ratio (PSNR).
+*
+* \param[in]    samples       Number of samples
+* \param[in]    peak          Max sample value
+* \param[in]    sse           Sum of squared errors
+*/
+double vpx_sse_to_psnr(double samples, double peak, double sse);
+int64_t vpx_get_y_sse(const YV12_BUFFER_CONFIG *a,
+                      const YV12_BUFFER_CONFIG *b);
+#if CONFIG_VP9_HIGHBITDEPTH
+int64_t vpx_highbd_get_y_sse(const YV12_BUFFER_CONFIG *a,
+                             const YV12_BUFFER_CONFIG *b);
+void calc_highbd_psnr(const YV12_BUFFER_CONFIG *a,
+                      const YV12_BUFFER_CONFIG *b,
+                      PSNR_STATS *psnr,
+                      unsigned int bit_depth,
+                      unsigned int in_bit_depth);
+int64_t highbd_get_sse_shift(const uint8_t *a8, int a_stride,
+                             const uint8_t *b8, int b_stride,
+                             int width, int height,
+                             unsigned int input_shift);
+#endif
+void calc_psnr(const YV12_BUFFER_CONFIG *a,
+               const YV12_BUFFER_CONFIG *b,
+               PSNR_STATS *psnr);
+
+int64_t highbd_get_sse(const uint8_t *a, int a_stride,
+                       const uint8_t *b, int b_stride,
+                       int width, int height);
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+#endif  // VPX_DSP_PSNR_H_
diff --git a/vpx_dsp/psnrhvs.c b/vpx_dsp/psnrhvs.c
index 4d3d6ee..9b70c6a 100644
--- a/vpx_dsp/psnrhvs.c
+++ b/vpx_dsp/psnrhvs.c
@@ -19,7 +19,7 @@
 #include "./vpx_dsp_rtcd.h"
 #include "vpx_dsp/ssim.h"
 #include "vpx_ports/system_state.h"
-#include "vpx/internal/vpx_psnr.h"
+#include "vpx_dsp/psnr.h"
 
 #if !defined(M_PI)
 # define M_PI (3.141592653589793238462643)
diff --git a/vpx_dsp/txfm_common.h b/vpx_dsp/txfm_common.h
index 442e6a5..9b0e990 100644
--- a/vpx_dsp/txfm_common.h
+++ b/vpx_dsp/txfm_common.h
@@ -57,10 +57,13 @@
 static const tran_high_t cospi_30_64 = 1606;
 static const tran_high_t cospi_31_64 = 804;
 
-//  16384 * sqrt(2) * sin(kPi/9) * 2 / 3
+// 16384 * sqrt(2) * sin(kPi/9) * 2 / 3
 static const tran_high_t sinpi_1_9 = 5283;
 static const tran_high_t sinpi_2_9 = 9929;
 static const tran_high_t sinpi_3_9 = 13377;
 static const tran_high_t sinpi_4_9 = 15212;
 
+// 16384 * sqrt(2)
+static const tran_high_t Sqrt2 = 23170;
+
 #endif  // VPX_DSP_TXFM_COMMON_H_
diff --git a/vpx_dsp/vpx_dsp.mk b/vpx_dsp/vpx_dsp.mk
index a44f948..dbb41aa 100644
--- a/vpx_dsp/vpx_dsp.mk
+++ b/vpx_dsp/vpx_dsp.mk
@@ -22,6 +22,8 @@
 DSP_SRCS-yes += bitwriter.c
 DSP_SRCS-yes += bitwriter_buffer.c
 DSP_SRCS-yes += bitwriter_buffer.h
+DSP_SRCS-yes += psnr.c
+DSP_SRCS-yes += psnr.h
 DSP_SRCS-$(CONFIG_INTERNAL_STATS) += ssim.c
 DSP_SRCS-$(CONFIG_INTERNAL_STATS) += ssim.h
 DSP_SRCS-$(CONFIG_INTERNAL_STATS) += psnrhvs.c
diff --git a/vpxenc.c b/vpxenc.c
index c61d060..f14470a 100644
--- a/vpxenc.c
+++ b/vpxenc.c
@@ -1470,6 +1470,8 @@
                            global->codec->fourcc,
                            pixel_aspect_ratio);
   }
+#else
+  (void)pixel_aspect_ratio;
 #endif
 
   if (!stream->config.write_webm) {