Merge "Add static syntax to total_adj_strong_thresh"
diff --git a/test/consistency_test.cc b/test/consistency_test.cc
index 6f5f452..9c2fd55 100644
--- a/test/consistency_test.cc
+++ b/test/consistency_test.cc
@@ -23,11 +23,11 @@
 #include "test/clear_system_state.h"
 #include "test/register_state_check.h"
 #include "test/util.h"
-#include "vp9/encoder/vp9_ssim.h"
+#include "vpx_dsp/ssim.h"
 #include "vpx_mem/vpx_mem.h"
 
 extern "C"
-double vp9_get_ssim_metrics(uint8_t *img1, int img1_pitch,
+double vpx_get_ssim_metrics(uint8_t *img1, int img1_pitch,
                             uint8_t *img2, int img2_pitch,
                             int width, int height,
                             Ssimv *sv2, Metrics *m,
@@ -144,7 +144,7 @@
   double CheckConsistency(int frame) {
     EXPECT_LT(frame, 2)<< "Frame to check has to be less than 2.";
     return
-        vp9_get_ssim_metrics(source_data_[frame], source_stride_,
+        vpx_get_ssim_metrics(source_data_[frame], source_stride_,
                              reference_data_[frame], reference_stride_,
                              width_, height_, ssim_array_, &metrics_, 1);
   }
diff --git a/vp8/common/rtcd_defs.pl b/vp8/common/rtcd_defs.pl
index d65ca60..7924ae7 100644
--- a/vp8/common/rtcd_defs.pl
+++ b/vp8/common/rtcd_defs.pl
@@ -255,19 +255,6 @@
 }
 
 #
-# Structured Similarity (SSIM)
-#
-if (vpx_config("CONFIG_INTERNAL_STATS") eq "yes") {
-    $opts{arch} eq "x86_64" and $sse2_on_x86_64 = "sse2";
-
-    add_proto qw/void vp8_ssim_parms_8x8/, "unsigned char *s, int sp, unsigned char *r, int rp, unsigned long *sum_s, unsigned long *sum_r, unsigned long *sum_sq_s, unsigned long *sum_sq_r, unsigned long *sum_sxr";
-    specialize qw/vp8_ssim_parms_8x8/, "$sse2_on_x86_64";
-
-    add_proto qw/void vp8_ssim_parms_16x16/, "unsigned char *s, int sp, unsigned char *r, int rp, unsigned long *sum_s, unsigned long *sum_r, unsigned long *sum_sq_s, unsigned long *sum_sq_r, unsigned long *sum_sxr";
-    specialize qw/vp8_ssim_parms_16x16/, "$sse2_on_x86_64";
-}
-
-#
 # Forward DCT
 #
 add_proto qw/void vp8_short_fdct4x4/, "short *input, short *output, int pitch";
diff --git a/vp8/encoder/onyx_if.c b/vp8/encoder/onyx_if.c
index c2a7ac4..d2fb05a 100644
--- a/vp8/encoder/onyx_if.c
+++ b/vp8/encoder/onyx_if.c
@@ -74,26 +74,7 @@
 
 #if CONFIG_INTERNAL_STATS
 #include "math.h"
-
-extern double vp8_calc_ssim
-(
-    YV12_BUFFER_CONFIG *source,
-    YV12_BUFFER_CONFIG *dest,
-    int lumamask,
-    double *weight
-);
-
-
-extern double vp8_calc_ssimg
-(
-    YV12_BUFFER_CONFIG *source,
-    YV12_BUFFER_CONFIG *dest,
-    double *ssim_y,
-    double *ssim_u,
-    double *ssim_v
-);
-
-
+#include "vpx_dsp/ssim.h"
 #endif
 
 
@@ -5741,8 +5722,8 @@
                     cpi->total_sq_error2 += sq_error2;
                     cpi->totalp  += frame_psnr2;
 
-                    frame_ssim2 = vp8_calc_ssim(cpi->Source,
-                      &cm->post_proc_buffer, 1, &weight);
+                    frame_ssim2 = vpx_calc_ssim(cpi->Source,
+                      &cm->post_proc_buffer, &weight);
 
                     cpi->summed_quality += frame_ssim2 * weight;
                     cpi->summed_weights += weight;
@@ -5772,7 +5753,7 @@
             if (cpi->b_calculate_ssimg)
             {
                 double y, u, v, frame_all;
-                frame_all =  vp8_calc_ssimg(cpi->Source, cm->frame_to_show,
+                frame_all = vpx_calc_ssimg(cpi->Source, cm->frame_to_show,
                     &y, &u, &v);
 
                 if (cpi->oxcf.number_of_layers > 1)
diff --git a/vp8/encoder/ssim.c b/vp8/encoder/ssim.c
deleted file mode 100644
index e751608..0000000
--- a/vp8/encoder/ssim.c
+++ /dev/null
@@ -1,233 +0,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "onyx_int.h"
-
-void vp8_ssim_parms_16x16_c
-(
-    unsigned char *s,
-    int sp,
-    unsigned char *r,
-    int rp,
-    unsigned long *sum_s,
-    unsigned long *sum_r,
-    unsigned long *sum_sq_s,
-    unsigned long *sum_sq_r,
-    unsigned long *sum_sxr
-)
-{
-    int i,j;
-    for(i=0;i<16;i++,s+=sp,r+=rp)
-     {
-         for(j=0;j<16;j++)
-         {
-             *sum_s += s[j];
-             *sum_r += r[j];
-             *sum_sq_s += s[j] * s[j];
-             *sum_sq_r += r[j] * r[j];
-             *sum_sxr += s[j] * r[j];
-         }
-     }
-}
-void vp8_ssim_parms_8x8_c
-(
-    unsigned char *s,
-    int sp,
-    unsigned char *r,
-    int rp,
-    unsigned long *sum_s,
-    unsigned long *sum_r,
-    unsigned long *sum_sq_s,
-    unsigned long *sum_sq_r,
-    unsigned long *sum_sxr
-)
-{
-    int i,j;
-    for(i=0;i<8;i++,s+=sp,r+=rp)
-     {
-         for(j=0;j<8;j++)
-         {
-             *sum_s += s[j];
-             *sum_r += r[j];
-             *sum_sq_s += s[j] * s[j];
-             *sum_sq_r += r[j] * r[j];
-             *sum_sxr += s[j] * r[j];
-         }
-     }
-}
-
-const static int64_t cc1 =  26634; // (64^2*(.01*255)^2
-const static int64_t cc2 = 239708; // (64^2*(.03*255)^2
-
-static double similarity
-(
-    unsigned long sum_s,
-    unsigned long sum_r,
-    unsigned long sum_sq_s,
-    unsigned long sum_sq_r,
-    unsigned long sum_sxr,
-    int count
-)
-{
-    int64_t ssim_n, ssim_d;
-    int64_t c1, c2;
-
-    //scale the constants by number of pixels
-    c1 = (cc1*count*count)>>12;
-    c2 = (cc2*count*count)>>12;
-
-    ssim_n = (2*sum_s*sum_r+ c1)*((int64_t) 2*count*sum_sxr-
-          (int64_t) 2*sum_s*sum_r+c2);
-
-    ssim_d = (sum_s*sum_s +sum_r*sum_r+c1)*
-        ((int64_t)count*sum_sq_s-(int64_t)sum_s*sum_s +
-        (int64_t)count*sum_sq_r-(int64_t) sum_r*sum_r +c2) ;
-
-    return ssim_n * 1.0 / ssim_d;
-}
-
-static double ssim_16x16(unsigned char *s,int sp, unsigned char *r,int rp)
-{
-    unsigned long sum_s=0,sum_r=0,sum_sq_s=0,sum_sq_r=0,sum_sxr=0;
-    vp8_ssim_parms_16x16(s, sp, r, rp, &sum_s, &sum_r, &sum_sq_s, &sum_sq_r, &sum_sxr);
-    return similarity(sum_s, sum_r, sum_sq_s, sum_sq_r, sum_sxr, 256);
-}
-static double ssim_8x8(unsigned char *s,int sp, unsigned char *r,int rp)
-{
-    unsigned long sum_s=0,sum_r=0,sum_sq_s=0,sum_sq_r=0,sum_sxr=0;
-    vp8_ssim_parms_8x8(s, sp, r, rp, &sum_s, &sum_r, &sum_sq_s, &sum_sq_r, &sum_sxr);
-    return similarity(sum_s, sum_r, sum_sq_s, sum_sq_r, sum_sxr, 64);
-}
-
-// TODO: (jbb) tried to scale this function such that we may be able to use it
-// for distortion metric in mode selection code ( provided we do a reconstruction)
-long dssim(unsigned char *s,int sp, unsigned char *r,int rp)
-{
-    unsigned long sum_s=0,sum_r=0,sum_sq_s=0,sum_sq_r=0,sum_sxr=0;
-    int64_t ssim3;
-    int64_t ssim_n1,ssim_n2;
-    int64_t ssim_d1,ssim_d2;
-    int64_t ssim_t1,ssim_t2;
-    int64_t c1, c2;
-
-    // normalize by 256/64
-    c1 = cc1*16;
-    c2 = cc2*16;
-
-    vp8_ssim_parms_16x16(s, sp, r, rp, &sum_s, &sum_r, &sum_sq_s, &sum_sq_r, &sum_sxr);
-    ssim_n1 = (2*sum_s*sum_r+ c1);
-
-    ssim_n2 =((int64_t) 2*256*sum_sxr-(int64_t) 2*sum_s*sum_r+c2);
-
-    ssim_d1 =((int64_t)sum_s*sum_s +(int64_t)sum_r*sum_r+c1);
-
-    ssim_d2 = (256 * (int64_t) sum_sq_s-(int64_t) sum_s*sum_s +
-                    (int64_t) 256*sum_sq_r-(int64_t) sum_r*sum_r +c2) ;
-
-    ssim_t1 = 256 - 256 * ssim_n1 / ssim_d1;
-    ssim_t2 = 256 - 256 * ssim_n2 / ssim_d2;
-
-    ssim3 = 256 *ssim_t1 * ssim_t2;
-    if(ssim3 <0 )
-        ssim3=0;
-    return (long)( ssim3  );
-}
-
-// We are using a 8x8 moving window with starting location of each 8x8 window
-// on the 4x4 pixel grid. Such arrangement allows the windows to overlap
-// block boundaries to penalize blocking artifacts.
-double vp8_ssim2
-(
-    unsigned char *img1,
-    unsigned char *img2,
-    int stride_img1,
-    int stride_img2,
-    int width,
-    int height
-)
-{
-    int i,j;
-    int samples =0;
-    double ssim_total=0;
-
-    // sample point start with each 4x4 location
-    for(i=0; i < height-8; i+=4, img1 += stride_img1*4, img2 += stride_img2*4)
-    {
-        for(j=0; j < width-8; j+=4 )
-        {
-            double v = ssim_8x8(img1+j, stride_img1, img2+j, stride_img2);
-            ssim_total += v;
-            samples++;
-        }
-    }
-    ssim_total /= samples;
-    return ssim_total;
-}
-double vp8_calc_ssim
-(
-    YV12_BUFFER_CONFIG *source,
-    YV12_BUFFER_CONFIG *dest,
-    int lumamask,
-    double *weight
-)
-{
-    double a, b, c;
-    double ssimv;
-
-    a = vp8_ssim2(source->y_buffer, dest->y_buffer,
-                 source->y_stride, dest->y_stride, source->y_width,
-                 source->y_height);
-
-    b = vp8_ssim2(source->u_buffer, dest->u_buffer,
-                 source->uv_stride, dest->uv_stride, source->uv_width,
-                 source->uv_height);
-
-    c = vp8_ssim2(source->v_buffer, dest->v_buffer,
-                 source->uv_stride, dest->uv_stride, source->uv_width,
-                 source->uv_height);
-
-    ssimv = a * .8 + .1 * (b + c);
-
-    *weight = 1;
-
-    return ssimv;
-}
-
-double vp8_calc_ssimg
-(
-    YV12_BUFFER_CONFIG *source,
-    YV12_BUFFER_CONFIG *dest,
-    double *ssim_y,
-    double *ssim_u,
-    double *ssim_v
-)
-{
-    double ssim_all = 0;
-    double a, b, c;
-
-    a = vp8_ssim2(source->y_buffer, dest->y_buffer,
-                 source->y_stride, dest->y_stride, source->y_width,
-                 source->y_height);
-
-    b = vp8_ssim2(source->u_buffer, dest->u_buffer,
-                 source->uv_stride, dest->uv_stride, source->uv_width,
-                 source->uv_height);
-
-    c = vp8_ssim2(source->v_buffer, dest->v_buffer,
-                 source->uv_stride, dest->uv_stride, source->uv_width,
-                 source->uv_height);
-    *ssim_y = a;
-    *ssim_u = b;
-    *ssim_v = c;
-    ssim_all = (a * 4 + b + c) /6;
-
-    return ssim_all;
-}
diff --git a/vp8/vp8cx.mk b/vp8/vp8cx.mk
index 59f9fe8..ea7d472 100644
--- a/vp8/vp8cx.mk
+++ b/vp8/vp8cx.mk
@@ -65,7 +65,6 @@
 VP8_CX_SRCS-yes += encoder/rdopt.c
 VP8_CX_SRCS-yes += encoder/segmentation.c
 VP8_CX_SRCS-yes += encoder/segmentation.h
-VP8_CX_SRCS-$(CONFIG_INTERNAL_STATS) += encoder/ssim.c
 VP8_CX_SRCS-yes += encoder/tokenize.c
 VP8_CX_SRCS-yes += encoder/dct_value_cost.h
 VP8_CX_SRCS-yes += encoder/dct_value_tokens.h
@@ -97,7 +96,6 @@
 VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp8_enc_stubs_sse2.c
 VP8_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/quantize_mmx.asm
 VP8_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/encodeopt.asm
-VP8_CX_SRCS-$(ARCH_X86_64) += encoder/x86/ssim_opt_x86_64.asm
 
 ifeq ($(CONFIG_REALTIME_ONLY),yes)
 VP8_CX_SRCS_REMOVE-$(HAVE_SSE2) += encoder/x86/temporal_filter_apply_sse2.asm
diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl
index f029bbe..737fc56 100644
--- a/vp9/common/vp9_rtcd_defs.pl
+++ b/vp9/common/vp9_rtcd_defs.pl
@@ -261,17 +261,6 @@
   specialize qw/vp9_fdct8x8_quant sse2 ssse3 neon/;
 }
 
-#
-# Structured Similarity (SSIM)
-#
-if (vpx_config("CONFIG_INTERNAL_STATS") eq "yes") {
-    add_proto qw/void vp9_ssim_parms_8x8/, "uint8_t *s, int sp, uint8_t *r, int rp, unsigned long *sum_s, unsigned long *sum_r, unsigned long *sum_sq_s, unsigned long *sum_sq_r, unsigned long *sum_sxr";
-    specialize qw/vp9_ssim_parms_8x8/, "$sse2_x86_64";
-
-    add_proto qw/void vp9_ssim_parms_16x16/, "uint8_t *s, int sp, uint8_t *r, int rp, unsigned long *sum_s, unsigned long *sum_r, unsigned long *sum_sq_s, unsigned long *sum_sq_r, unsigned long *sum_sxr";
-    specialize qw/vp9_ssim_parms_16x16/, "$sse2_x86_64";
-}
-
 # fdct functions
 
 if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
@@ -330,14 +319,6 @@
   add_proto qw/void vp9_highbd_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
   specialize qw/vp9_highbd_quantize_fp_32x32/;
 
-  #
-  # Structured Similarity (SSIM)
-  #
-  if (vpx_config("CONFIG_INTERNAL_STATS") eq "yes") {
-    add_proto qw/void vp9_highbd_ssim_parms_8x8/, "uint16_t *s, int sp, uint16_t *r, int rp, uint32_t *sum_s, uint32_t *sum_r, uint32_t *sum_sq_s, uint32_t *sum_sq_r, uint32_t *sum_sxr";
-    specialize qw/vp9_highbd_ssim_parms_8x8/;
-  }
-
   # fdct functions
   add_proto qw/void vp9_highbd_fht4x4/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
   specialize qw/vp9_highbd_fht4x4/;
diff --git a/vp9/common/vp9_systemdependent.h b/vp9/common/vp9_systemdependent.h
index fc77762..e4178b2 100644
--- a/vp9/common/vp9_systemdependent.h
+++ b/vp9/common/vp9_systemdependent.h
@@ -33,16 +33,6 @@
 #define vp9_clear_system_state()
 #endif
 
-#if defined(_MSC_VER) && _MSC_VER < 1800
-// round is not defined in MSVC before VS2013.
-static INLINE int round(double x) {
-  if (x < 0)
-    return (int)ceil(x - 0.5);
-  else
-    return (int)floor(x + 0.5);
-}
-#endif
-
 // use GNU builtins where available.
 #if defined(__GNUC__) && \
     ((__GNUC__ == 3 && __GNUC_MINOR__ >= 4) || __GNUC__ >= 4)
diff --git a/vp9/encoder/vp9_aq_cyclicrefresh.c b/vp9/encoder/vp9_aq_cyclicrefresh.c
index b619063..a13f0c0 100644
--- a/vp9/encoder/vp9_aq_cyclicrefresh.c
+++ b/vp9/encoder/vp9_aq_cyclicrefresh.c
@@ -459,7 +459,10 @@
   cr->time_for_refresh = 0;
   // Use larger delta-qp (increase rate_ratio_qdelta) for first few (~4)
   // periods of the refresh cycle, after a key frame.
-  if (rc->frames_since_key <  4 * cr->percent_refresh)
+  // Account for larger interval on base layer for temporal layers.
+  if (cr->percent_refresh > 0 &&
+      rc->frames_since_key <  (4 * cpi->svc.number_temporal_layers) *
+      (100 / cr->percent_refresh))
     cr->rate_ratio_qdelta = 3.0;
   else
     cr->rate_ratio_qdelta = 2.0;
diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c
index 7184801..b4e07a6 100644
--- a/vp9/encoder/vp9_encoder.c
+++ b/vp9/encoder/vp9_encoder.c
@@ -18,6 +18,9 @@
 #include "./vpx_scale_rtcd.h"
 #include "vpx/internal/vpx_psnr.h"
 #include "vpx_dsp/vpx_filter.h"
+#if CONFIG_INTERNAL_STATS
+#include "vpx_dsp/ssim.h"
+#endif
 #include "vpx_ports/mem.h"
 #include "vpx_ports/vpx_timer.h"
 #include "vpx_scale/vpx_scale.h"
@@ -51,9 +54,6 @@
 #include "vp9/encoder/vp9_segmentation.h"
 #include "vp9/encoder/vp9_skin_detection.h"
 #include "vp9/encoder/vp9_speed_features.h"
-#if CONFIG_INTERNAL_STATS
-#include "vp9/encoder/vp9_ssim.h"
-#endif
 #include "vp9/encoder/vp9_svc_layercontext.h"
 #include "vp9/encoder/vp9_temporal_filter.h"
 
@@ -4416,13 +4416,13 @@
 
 #if CONFIG_VP9_HIGHBITDEPTH
           if (cm->use_highbitdepth) {
-            frame_ssim2 = vp9_highbd_calc_ssim(orig, recon, &weight,
+            frame_ssim2 = vpx_highbd_calc_ssim(orig, recon, &weight,
                                                (int)cm->bit_depth);
           } else {
-            frame_ssim2 = vp9_calc_ssim(orig, recon, &weight);
+            frame_ssim2 = vpx_calc_ssim(orig, recon, &weight);
           }
 #else
-          frame_ssim2 = vp9_calc_ssim(orig, recon, &weight);
+          frame_ssim2 = vpx_calc_ssim(orig, recon, &weight);
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
           cpi->worst_ssim= MIN(cpi->worst_ssim, frame_ssim2);
@@ -4431,13 +4431,13 @@
 
 #if CONFIG_VP9_HIGHBITDEPTH
           if (cm->use_highbitdepth) {
-            frame_ssim2 = vp9_highbd_calc_ssim(
+            frame_ssim2 = vpx_highbd_calc_ssim(
                 orig, &cm->post_proc_buffer, &weight, (int)cm->bit_depth);
           } else {
-            frame_ssim2 = vp9_calc_ssim(orig, &cm->post_proc_buffer, &weight);
+            frame_ssim2 = vpx_calc_ssim(orig, &cm->post_proc_buffer, &weight);
           }
 #else
-          frame_ssim2 = vp9_calc_ssim(orig, &cm->post_proc_buffer, &weight);
+          frame_ssim2 = vpx_calc_ssim(orig, &cm->post_proc_buffer, &weight);
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
           cpi->summedp_quality += frame_ssim2 * weight;
@@ -4472,7 +4472,7 @@
         if (!cm->use_highbitdepth)
 #endif
         {
-          double this_inconsistency = vp9_get_ssim_metrics(
+          double this_inconsistency = vpx_get_ssim_metrics(
               cpi->Source->y_buffer, cpi->Source->y_stride,
               cm->frame_to_show->y_buffer, cm->frame_to_show->y_stride,
               cpi->Source->y_width, cpi->Source->y_height, cpi->ssim_vars,
@@ -4492,14 +4492,14 @@
         double y, u, v, frame_all;
 #if CONFIG_VP9_HIGHBITDEPTH
         if (cm->use_highbitdepth) {
-          frame_all = vp9_highbd_calc_ssimg(cpi->Source, cm->frame_to_show, &y,
+          frame_all = vpx_highbd_calc_ssimg(cpi->Source, cm->frame_to_show, &y,
                                             &u, &v, (int)cm->bit_depth);
         } else {
-          frame_all = vp9_calc_ssimg(cpi->Source, cm->frame_to_show, &y, &u,
+          frame_all = vpx_calc_ssimg(cpi->Source, cm->frame_to_show, &y, &u,
                                      &v);
         }
 #else
-        frame_all = vp9_calc_ssimg(cpi->Source, cm->frame_to_show, &y, &u, &v);
+        frame_all = vpx_calc_ssimg(cpi->Source, cm->frame_to_show, &y, &u, &v);
 #endif  // CONFIG_VP9_HIGHBITDEPTH
         adjust_image_stat(y, u, v, frame_all, &cpi->ssimg);
       }
@@ -4508,7 +4508,7 @@
 #endif
       {
         double y, u, v, frame_all;
-        frame_all = vp9_calc_fastssim(cpi->Source, cm->frame_to_show, &y, &u,
+        frame_all = vpx_calc_fastssim(cpi->Source, cm->frame_to_show, &y, &u,
                                       &v);
         adjust_image_stat(y, u, v, frame_all, &cpi->fastssim);
         /* TODO(JBB): add 10/12 bit support */
@@ -4518,7 +4518,7 @@
 #endif
       {
         double y, u, v, frame_all;
-        frame_all = vp9_psnrhvs(cpi->Source, cm->frame_to_show, &y, &u, &v);
+        frame_all = vpx_psnrhvs(cpi->Source, cm->frame_to_show, &y, &u, &v);
         adjust_image_stat(y, u, v, frame_all, &cpi->psnrhvs);
       }
     }
diff --git a/vp9/encoder/vp9_encoder.h b/vp9/encoder/vp9_encoder.h
index 4d4da92..78d55e1 100644
--- a/vp9/encoder/vp9_encoder.h
+++ b/vp9/encoder/vp9_encoder.h
@@ -16,6 +16,10 @@
 #include "./vpx_config.h"
 #include "vpx/internal/vpx_codec_internal.h"
 #include "vpx/vp8cx.h"
+#if CONFIG_INTERNAL_STATS
+#include "vpx_dsp/ssim.h"
+#endif
+#include "vpx_dsp/variance.h"
 #include "vpx_util/vpx_thread.h"
 
 #include "vp9/common/vp9_alloccommon.h"
@@ -34,13 +38,9 @@
 #include "vp9/encoder/vp9_quantize.h"
 #include "vp9/encoder/vp9_ratectrl.h"
 #include "vp9/encoder/vp9_rd.h"
-#if CONFIG_INTERNAL_STATS
-#include "vp9/encoder/vp9_ssim.h"
-#endif
 #include "vp9/encoder/vp9_speed_features.h"
 #include "vp9/encoder/vp9_svc_layercontext.h"
 #include "vp9/encoder/vp9_tokenize.h"
-#include "vpx_dsp/variance.h"
 
 #if CONFIG_VP9_TEMPORAL_DENOISING
 #include "vp9/encoder/vp9_denoiser.h"
diff --git a/vp9/encoder/vp9_mcomp.c b/vp9/encoder/vp9_mcomp.c
index 5d3bc5d..aa3e51c 100644
--- a/vp9/encoder/vp9_mcomp.c
+++ b/vp9/encoder/vp9_mcomp.c
@@ -256,6 +256,27 @@
     }                                                   \
   }
 
+// TODO(yunqingwang): SECOND_LEVEL_CHECKS_BEST was a rewrote of
+// SECOND_LEVEL_CHECKS, and SECOND_LEVEL_CHECKS should be rewritten
+// later in the same way.
+#define SECOND_LEVEL_CHECKS_BEST                        \
+  {                                                     \
+    unsigned int second;                                \
+    int br0 = br;                                       \
+    int bc0 = bc;                                       \
+    assert(tr == br || tc == bc);                       \
+    if (tr == br && tc != bc) {                         \
+      kc = bc - tc;                                     \
+    } else if (tr != br && tc == bc) {                  \
+      kr = br - tr;                                     \
+    }                                                   \
+    CHECK_BETTER(second, br0 + kr, bc0);                \
+    CHECK_BETTER(second, br0, bc0 + kc);                \
+    if (br0 != br || bc0 != bc) {                       \
+      CHECK_BETTER(second, br0 + kr, bc0 + kc);         \
+    }                                                   \
+  }
+
 #define SETUP_SUBPEL_SEARCH                                                \
   const uint8_t *const z = x->plane[0].src.buf;                            \
   const int src_stride = x->plane[0].src.stride;                           \
@@ -636,7 +657,6 @@
   const MACROBLOCKD *xd = &x->e_mbd;
   unsigned int besterr = INT_MAX;
   unsigned int sse;
-  unsigned int whichdir = 0;
   int thismse;
   const int y_stride = xd->plane[0].pre[0].stride;
   const int offset = bestmv->row * y_stride + bestmv->col;
@@ -657,6 +677,7 @@
   const MV *search_step = search_step_table;
   int idx, best_idx = -1;
   unsigned int cost_array[5];
+  int kr, kc;
 
   if (!(allow_hp && vp9_use_mv_hp(ref_mv)))
     if (round == 3)
@@ -703,8 +724,11 @@
     }
 
     // Check diagonal sub-pixel position
-    tc = bc + (cost_array[0] <= cost_array[1] ? -hstep : hstep);
-    tr = br + (cost_array[2] <= cost_array[3] ? -hstep : hstep);
+    kc = (cost_array[0] <= cost_array[1] ? -hstep : hstep);
+    kr = (cost_array[2] <= cost_array[3] ? -hstep : hstep);
+
+    tc = bc + kc;
+    tr = br + kr;
     if (tc >= minc && tc <= maxc && tr >= minr && tr <= maxr) {
       const uint8_t *const pre_address = y + (tr >> 3) * y_stride + (tc >> 3);
       MV this_mv = {tr, tc};
@@ -736,7 +760,7 @@
     }
 
     if (iters_per_step > 1 && best_idx != -1)
-      SECOND_LEVEL_CHECKS;
+      SECOND_LEVEL_CHECKS_BEST;
 
     tr = br;
     tc = bc;
diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c
index da80a99..0e12d8c 100644
--- a/vp9/encoder/vp9_rdopt.c
+++ b/vp9/encoder/vp9_rdopt.c
@@ -2488,9 +2488,8 @@
   }
 
   // We don't include the cost of the second reference here, because there
-  // are only three options: Last/Golden, ARF/Last or Golden/ARF, or in other
-  // words if you present them in that order, the second one is always known
-  // if the first is known.
+  // are only two options: Last/ARF or Golden/ARF; The second one is always
+  // known, which is ARF.
   //
   // Under some circumstances we discount the cost of new mv mode to encourage
   // initiation of a motion field.
diff --git a/vp9/encoder/x86/vp9_ssim_opt_x86_64.asm b/vp9/encoder/x86/vp9_ssim_opt_x86_64.asm
deleted file mode 100644
index 455d10d..0000000
--- a/vp9/encoder/x86/vp9_ssim_opt_x86_64.asm
+++ /dev/null
@@ -1,216 +0,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-%include "vpx_ports/x86_abi_support.asm"
-
-; tabulate_ssim - sums sum_s,sum_r,sum_sq_s,sum_sq_r, sum_sxr
-%macro TABULATE_SSIM 0
-        paddusw         xmm15, xmm3  ; sum_s
-        paddusw         xmm14, xmm4  ; sum_r
-        movdqa          xmm1, xmm3
-        pmaddwd         xmm1, xmm1
-        paddd           xmm13, xmm1 ; sum_sq_s
-        movdqa          xmm2, xmm4
-        pmaddwd         xmm2, xmm2
-        paddd           xmm12, xmm2 ; sum_sq_r
-        pmaddwd         xmm3, xmm4
-        paddd           xmm11, xmm3  ; sum_sxr
-%endmacro
-
-; Sum across the register %1 starting with q words
-%macro SUM_ACROSS_Q 1
-        movdqa          xmm2,%1
-        punpckldq       %1,xmm0
-        punpckhdq       xmm2,xmm0
-        paddq           %1,xmm2
-        movdqa          xmm2,%1
-        punpcklqdq      %1,xmm0
-        punpckhqdq      xmm2,xmm0
-        paddq           %1,xmm2
-%endmacro
-
-; Sum across the register %1 starting with q words
-%macro SUM_ACROSS_W 1
-        movdqa          xmm1, %1
-        punpcklwd       %1,xmm0
-        punpckhwd       xmm1,xmm0
-        paddd           %1, xmm1
-        SUM_ACROSS_Q    %1
-%endmacro
-;void ssim_parms_sse2(
-;    unsigned char *s,
-;    int sp,
-;    unsigned char *r,
-;    int rp
-;    unsigned long *sum_s,
-;    unsigned long *sum_r,
-;    unsigned long *sum_sq_s,
-;    unsigned long *sum_sq_r,
-;    unsigned long *sum_sxr);
-;
-; TODO: Use parm passing through structure, probably don't need the pxors
-; ( calling app will initialize to 0 ) could easily fit everything in sse2
-; without too much hastle, and can probably do better estimates with psadw
-; or pavgb At this point this is just meant to be first pass for calculating
-; all the parms needed for 16x16 ssim so we can play with dssim as distortion
-; in mode selection code.
-global sym(vp9_ssim_parms_16x16_sse2) PRIVATE
-sym(vp9_ssim_parms_16x16_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 9
-    SAVE_XMM 15
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    mov             rsi,        arg(0) ;s
-    mov             rcx,        arg(1) ;sp
-    mov             rdi,        arg(2) ;r
-    mov             rax,        arg(3) ;rp
-
-    pxor            xmm0, xmm0
-    pxor            xmm15,xmm15  ;sum_s
-    pxor            xmm14,xmm14  ;sum_r
-    pxor            xmm13,xmm13  ;sum_sq_s
-    pxor            xmm12,xmm12  ;sum_sq_r
-    pxor            xmm11,xmm11  ;sum_sxr
-
-    mov             rdx, 16      ;row counter
-.NextRow:
-
-    ;grab source and reference pixels
-    movdqu          xmm5, [rsi]
-    movdqu          xmm6, [rdi]
-    movdqa          xmm3, xmm5
-    movdqa          xmm4, xmm6
-    punpckhbw       xmm3, xmm0 ; high_s
-    punpckhbw       xmm4, xmm0 ; high_r
-
-    TABULATE_SSIM
-
-    movdqa          xmm3, xmm5
-    movdqa          xmm4, xmm6
-    punpcklbw       xmm3, xmm0 ; low_s
-    punpcklbw       xmm4, xmm0 ; low_r
-
-    TABULATE_SSIM
-
-    add             rsi, rcx   ; next s row
-    add             rdi, rax   ; next r row
-
-    dec             rdx        ; counter
-    jnz .NextRow
-
-    SUM_ACROSS_W    xmm15
-    SUM_ACROSS_W    xmm14
-    SUM_ACROSS_Q    xmm13
-    SUM_ACROSS_Q    xmm12
-    SUM_ACROSS_Q    xmm11
-
-    mov             rdi,arg(4)
-    movd            [rdi], xmm15;
-    mov             rdi,arg(5)
-    movd            [rdi], xmm14;
-    mov             rdi,arg(6)
-    movd            [rdi], xmm13;
-    mov             rdi,arg(7)
-    movd            [rdi], xmm12;
-    mov             rdi,arg(8)
-    movd            [rdi], xmm11;
-
-    ; begin epilog
-    pop         rdi
-    pop         rsi
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-;void ssim_parms_sse2(
-;    unsigned char *s,
-;    int sp,
-;    unsigned char *r,
-;    int rp
-;    unsigned long *sum_s,
-;    unsigned long *sum_r,
-;    unsigned long *sum_sq_s,
-;    unsigned long *sum_sq_r,
-;    unsigned long *sum_sxr);
-;
-; TODO: Use parm passing through structure, probably don't need the pxors
-; ( calling app will initialize to 0 ) could easily fit everything in sse2
-; without too much hastle, and can probably do better estimates with psadw
-; or pavgb At this point this is just meant to be first pass for calculating
-; all the parms needed for 16x16 ssim so we can play with dssim as distortion
-; in mode selection code.
-global sym(vp9_ssim_parms_8x8_sse2) PRIVATE
-sym(vp9_ssim_parms_8x8_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 9
-    SAVE_XMM 15
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    mov             rsi,        arg(0) ;s
-    mov             rcx,        arg(1) ;sp
-    mov             rdi,        arg(2) ;r
-    mov             rax,        arg(3) ;rp
-
-    pxor            xmm0, xmm0
-    pxor            xmm15,xmm15  ;sum_s
-    pxor            xmm14,xmm14  ;sum_r
-    pxor            xmm13,xmm13  ;sum_sq_s
-    pxor            xmm12,xmm12  ;sum_sq_r
-    pxor            xmm11,xmm11  ;sum_sxr
-
-    mov             rdx, 8      ;row counter
-.NextRow:
-
-    ;grab source and reference pixels
-    movq            xmm3, [rsi]
-    movq            xmm4, [rdi]
-    punpcklbw       xmm3, xmm0 ; low_s
-    punpcklbw       xmm4, xmm0 ; low_r
-
-    TABULATE_SSIM
-
-    add             rsi, rcx   ; next s row
-    add             rdi, rax   ; next r row
-
-    dec             rdx        ; counter
-    jnz .NextRow
-
-    SUM_ACROSS_W    xmm15
-    SUM_ACROSS_W    xmm14
-    SUM_ACROSS_Q    xmm13
-    SUM_ACROSS_Q    xmm12
-    SUM_ACROSS_Q    xmm11
-
-    mov             rdi,arg(4)
-    movd            [rdi], xmm15;
-    mov             rdi,arg(5)
-    movd            [rdi], xmm14;
-    mov             rdi,arg(6)
-    movd            [rdi], xmm13;
-    mov             rdi,arg(7)
-    movd            [rdi], xmm12;
-    mov             rdi,arg(8)
-    movd            [rdi], xmm11;
-
-    ; begin epilog
-    pop         rdi
-    pop         rsi
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
diff --git a/vp9/vp9cx.mk b/vp9/vp9cx.mk
index 186ce11..84b12d7 100644
--- a/vp9/vp9cx.mk
+++ b/vp9/vp9cx.mk
@@ -33,7 +33,6 @@
 VP9_CX_SRCS-yes += encoder/vp9_ethread.h
 VP9_CX_SRCS-yes += encoder/vp9_ethread.c
 VP9_CX_SRCS-yes += encoder/vp9_extend.c
-VP9_CX_SRCS-$(CONFIG_INTERNAL_STATS) += encoder/vp9_fastssim.c
 VP9_CX_SRCS-yes += encoder/vp9_firstpass.c
 VP9_CX_SRCS-yes += encoder/vp9_block.h
 VP9_CX_SRCS-yes += encoder/vp9_bitstream.h
@@ -57,7 +56,6 @@
 VP9_CX_SRCS-yes += encoder/vp9_encoder.c
 VP9_CX_SRCS-yes += encoder/vp9_picklpf.c
 VP9_CX_SRCS-yes += encoder/vp9_picklpf.h
-VP9_CX_SRCS-$(CONFIG_INTERNAL_STATS) += encoder/vp9_psnrhvs.c
 VP9_CX_SRCS-yes += encoder/vp9_quantize.c
 VP9_CX_SRCS-yes += encoder/vp9_ratectrl.c
 VP9_CX_SRCS-yes += encoder/vp9_rd.c
@@ -72,8 +70,6 @@
 VP9_CX_SRCS-yes += encoder/vp9_svc_layercontext.c
 VP9_CX_SRCS-yes += encoder/vp9_resize.c
 VP9_CX_SRCS-yes += encoder/vp9_resize.h
-VP9_CX_SRCS-$(CONFIG_INTERNAL_STATS) += encoder/vp9_ssim.c
-VP9_CX_SRCS-$(CONFIG_INTERNAL_STATS) += encoder/vp9_ssim.h
 VP9_CX_SRCS-$(CONFIG_INTERNAL_STATS) += encoder/vp9_blockiness.c
 
 VP9_CX_SRCS-yes += encoder/vp9_tokenize.c
@@ -113,7 +109,6 @@
 VP9_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/vp9_dct_ssse3_x86_64.asm
 endif
 endif
-VP9_CX_SRCS-$(ARCH_X86_64) += encoder/x86/vp9_ssim_opt_x86_64.asm
 
 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_dct_sse2.c
 VP9_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/vp9_dct_ssse3.c
diff --git a/vp9/encoder/vp9_fastssim.c b/vpx_dsp/fastssim.c
similarity index 98%
rename from vp9/encoder/vp9_fastssim.c
rename to vpx_dsp/fastssim.c
index f1d408c..25f01e5 100644
--- a/vp9/encoder/vp9_fastssim.c
+++ b/vpx_dsp/fastssim.c
@@ -11,10 +11,11 @@
  *  project.
  */
 #include <math.h>
+#include <stdlib.h>
 #include <string.h>
 #include "./vpx_config.h"
-#include "./vp9_rtcd.h"
-#include "vp9/encoder/vp9_ssim.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/ssim.h"
 /* TODO(jbb): High bit depth version of this code needed */
 typedef struct fs_level fs_level;
 typedef struct fs_ctx fs_ctx;
@@ -443,10 +444,10 @@
   return 10 * (log10(_weight) - log10(_weight - _ssim));
 }
 
-double vp9_calc_fastssim(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *dest,
+double vpx_calc_fastssim(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *dest,
                          double *ssim_y, double *ssim_u, double *ssim_v) {
   double ssimv;
-  vp9_clear_system_state();
+  vpx_clear_system_state();
 
   *ssim_y = calc_ssim(source->y_buffer, source->y_stride, dest->y_buffer,
                       dest->y_stride, source->y_crop_width,
diff --git a/vp9/encoder/vp9_psnrhvs.c b/vpx_dsp/psnrhvs.c
similarity index 97%
rename from vp9/encoder/vp9_psnrhvs.c
rename to vpx_dsp/psnrhvs.c
index 5104b9a..8aa30f2 100644
--- a/vp9/encoder/vp9_psnrhvs.c
+++ b/vpx_dsp/psnrhvs.c
@@ -15,9 +15,8 @@
 #include <math.h>
 
 #include "./vpx_config.h"
-#include "./vp9_rtcd.h"
 #include "./vpx_dsp_rtcd.h"
-#include "vp9/encoder/vp9_ssim.h"
+#include "vpx_dsp/ssim.h"
 
 #if !defined(M_PI)
 # define M_PI (3.141592653589793238462643)
@@ -201,12 +200,12 @@
   ret /= pixels;
   return ret;
 }
-double vp9_psnrhvs(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *dest,
+double vpx_psnrhvs(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *dest,
                    double *y_psnrhvs, double *u_psnrhvs, double *v_psnrhvs) {
   double psnrhvs;
   double par = 1.0;
   int step = 7;
-  vp9_clear_system_state();
+  vpx_clear_system_state();
   *y_psnrhvs = calc_psnrhvs(source->y_buffer, source->y_stride, dest->y_buffer,
                             dest->y_stride, par, source->y_crop_width,
                             source->y_crop_height, step, csf_y);
diff --git a/vp9/encoder/vp9_ssim.c b/vpx_dsp/ssim.c
similarity index 91%
rename from vp9/encoder/vp9_ssim.c
rename to vpx_dsp/ssim.c
index 172de5d..991906f 100644
--- a/vp9/encoder/vp9_ssim.c
+++ b/vpx_dsp/ssim.c
@@ -9,11 +9,11 @@
  */
 
 #include <math.h>
-#include "./vp9_rtcd.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/ssim.h"
 #include "vpx_ports/mem.h"
-#include "vp9/encoder/vp9_ssim.h"
 
-void vp9_ssim_parms_16x16_c(uint8_t *s, int sp, uint8_t *r,
+void vpx_ssim_parms_16x16_c(uint8_t *s, int sp, uint8_t *r,
                             int rp, unsigned long *sum_s, unsigned long *sum_r,
                             unsigned long *sum_sq_s, unsigned long *sum_sq_r,
                             unsigned long *sum_sxr) {
@@ -28,7 +28,7 @@
     }
   }
 }
-void vp9_ssim_parms_8x8_c(uint8_t *s, int sp, uint8_t *r, int rp,
+void vpx_ssim_parms_8x8_c(uint8_t *s, int sp, uint8_t *r, int rp,
                           unsigned long *sum_s, unsigned long *sum_r,
                           unsigned long *sum_sq_s, unsigned long *sum_sq_r,
                           unsigned long *sum_sxr) {
@@ -45,7 +45,7 @@
 }
 
 #if CONFIG_VP9_HIGHBITDEPTH
-void vp9_highbd_ssim_parms_8x8_c(uint16_t *s, int sp, uint16_t *r, int rp,
+void vpx_highbd_ssim_parms_8x8_c(uint16_t *s, int sp, uint16_t *r, int rp,
                                  uint32_t *sum_s, uint32_t *sum_r,
                                  uint32_t *sum_sq_s, uint32_t *sum_sq_r,
                                  uint32_t *sum_sxr) {
@@ -87,7 +87,7 @@
 
 static double ssim_8x8(uint8_t *s, int sp, uint8_t *r, int rp) {
   unsigned long sum_s = 0, sum_r = 0, sum_sq_s = 0, sum_sq_r = 0, sum_sxr = 0;
-  vp9_ssim_parms_8x8(s, sp, r, rp, &sum_s, &sum_r, &sum_sq_s, &sum_sq_r,
+  vpx_ssim_parms_8x8(s, sp, r, rp, &sum_s, &sum_r, &sum_sq_s, &sum_sq_r,
                      &sum_sxr);
   return similarity(sum_s, sum_r, sum_sq_s, sum_sq_r, sum_sxr, 64);
 }
@@ -97,7 +97,7 @@
                               unsigned int bd) {
   uint32_t sum_s = 0, sum_r = 0, sum_sq_s = 0, sum_sq_r = 0, sum_sxr = 0;
   const int oshift = bd - 8;
-  vp9_highbd_ssim_parms_8x8(s, sp, r, rp, &sum_s, &sum_r, &sum_sq_s, &sum_sq_r,
+  vpx_highbd_ssim_parms_8x8(s, sp, r, rp, &sum_s, &sum_r, &sum_sq_s, &sum_sq_r,
                             &sum_sxr);
   return similarity(sum_s >> oshift,
                     sum_r >> oshift,
@@ -111,7 +111,7 @@
 // We are using a 8x8 moving window with starting location of each 8x8 window
 // on the 4x4 pixel grid. Such arrangement allows the windows to overlap
 // block boundaries to penalize blocking artifacts.
-double vp9_ssim2(uint8_t *img1, uint8_t *img2, int stride_img1,
+double vpx_ssim2(uint8_t *img1, uint8_t *img2, int stride_img1,
                  int stride_img2, int width, int height) {
   int i, j;
   int samples = 0;
@@ -131,7 +131,7 @@
 }
 
 #if CONFIG_VP9_HIGHBITDEPTH
-double vp9_highbd_ssim2(uint8_t *img1, uint8_t *img2, int stride_img1,
+double vpx_highbd_ssim2(uint8_t *img1, uint8_t *img2, int stride_img1,
                         int stride_img2, int width, int height,
                         unsigned int bd) {
   int i, j;
@@ -154,20 +154,20 @@
 }
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
-double vp9_calc_ssim(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *dest,
+double vpx_calc_ssim(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *dest,
                      double *weight) {
   double a, b, c;
   double ssimv;
 
-  a = vp9_ssim2(source->y_buffer, dest->y_buffer,
+  a = vpx_ssim2(source->y_buffer, dest->y_buffer,
                 source->y_stride, dest->y_stride,
                 source->y_crop_width, source->y_crop_height);
 
-  b = vp9_ssim2(source->u_buffer, dest->u_buffer,
+  b = vpx_ssim2(source->u_buffer, dest->u_buffer,
                 source->uv_stride, dest->uv_stride,
                 source->uv_crop_width, source->uv_crop_height);
 
-  c = vp9_ssim2(source->v_buffer, dest->v_buffer,
+  c = vpx_ssim2(source->v_buffer, dest->v_buffer,
                 source->uv_stride, dest->uv_stride,
                 source->uv_crop_width, source->uv_crop_height);
 
@@ -178,20 +178,20 @@
   return ssimv;
 }
 
-double vp9_calc_ssimg(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *dest,
+double vpx_calc_ssimg(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *dest,
                       double *ssim_y, double *ssim_u, double *ssim_v) {
   double ssim_all = 0;
   double a, b, c;
 
-  a = vp9_ssim2(source->y_buffer, dest->y_buffer,
+  a = vpx_ssim2(source->y_buffer, dest->y_buffer,
                 source->y_stride, dest->y_stride,
                 source->y_crop_width, source->y_crop_height);
 
-  b = vp9_ssim2(source->u_buffer, dest->u_buffer,
+  b = vpx_ssim2(source->u_buffer, dest->u_buffer,
                 source->uv_stride, dest->uv_stride,
                 source->uv_crop_width, source->uv_crop_height);
 
-  c = vp9_ssim2(source->v_buffer, dest->v_buffer,
+  c = vpx_ssim2(source->v_buffer, dest->v_buffer,
                 source->uv_stride, dest->uv_stride,
                 source->uv_crop_width, source->uv_crop_height);
   *ssim_y = a;
@@ -280,12 +280,12 @@
 }
 void ssimv_parms(uint8_t *img1, int img1_pitch, uint8_t *img2, int img2_pitch,
                  Ssimv *sv) {
-  vp9_ssim_parms_8x8(img1, img1_pitch, img2, img2_pitch,
+  vpx_ssim_parms_8x8(img1, img1_pitch, img2, img2_pitch,
                      &sv->sum_s, &sv->sum_r, &sv->sum_sq_s, &sv->sum_sq_r,
                      &sv->sum_sxr);
 }
 
-double vp9_get_ssim_metrics(uint8_t *img1, int img1_pitch,
+double vpx_get_ssim_metrics(uint8_t *img1, int img1_pitch,
                             uint8_t *img2, int img2_pitch,
                             int width, int height,
                             Ssimv *sv2, Metrics *m,
@@ -298,7 +298,7 @@
   int c = 0;
   double norm;
   double old_ssim_total = 0;
-  vp9_clear_system_state();
+  vpx_clear_system_state();
   // We can sample points as frequently as we like start with 1 per 4x4.
   for (i = 0; i < height; i += 4,
        img1 += img1_pitch * 4, img2 += img2_pitch * 4) {
@@ -448,21 +448,21 @@
 
 
 #if CONFIG_VP9_HIGHBITDEPTH
-double vp9_highbd_calc_ssim(YV12_BUFFER_CONFIG *source,
+double vpx_highbd_calc_ssim(YV12_BUFFER_CONFIG *source,
                             YV12_BUFFER_CONFIG *dest,
                             double *weight, unsigned int bd) {
   double a, b, c;
   double ssimv;
 
-  a = vp9_highbd_ssim2(source->y_buffer, dest->y_buffer,
+  a = vpx_highbd_ssim2(source->y_buffer, dest->y_buffer,
                        source->y_stride, dest->y_stride,
                        source->y_crop_width, source->y_crop_height, bd);
 
-  b = vp9_highbd_ssim2(source->u_buffer, dest->u_buffer,
+  b = vpx_highbd_ssim2(source->u_buffer, dest->u_buffer,
                        source->uv_stride, dest->uv_stride,
                        source->uv_crop_width, source->uv_crop_height, bd);
 
-  c = vp9_highbd_ssim2(source->v_buffer, dest->v_buffer,
+  c = vpx_highbd_ssim2(source->v_buffer, dest->v_buffer,
                        source->uv_stride, dest->uv_stride,
                        source->uv_crop_width, source->uv_crop_height, bd);
 
@@ -473,21 +473,21 @@
   return ssimv;
 }
 
-double vp9_highbd_calc_ssimg(YV12_BUFFER_CONFIG *source,
+double vpx_highbd_calc_ssimg(YV12_BUFFER_CONFIG *source,
                              YV12_BUFFER_CONFIG *dest, double *ssim_y,
                              double *ssim_u, double *ssim_v, unsigned int bd) {
   double ssim_all = 0;
   double a, b, c;
 
-  a = vp9_highbd_ssim2(source->y_buffer, dest->y_buffer,
+  a = vpx_highbd_ssim2(source->y_buffer, dest->y_buffer,
                        source->y_stride, dest->y_stride,
                        source->y_crop_width, source->y_crop_height, bd);
 
-  b = vp9_highbd_ssim2(source->u_buffer, dest->u_buffer,
+  b = vpx_highbd_ssim2(source->u_buffer, dest->u_buffer,
                        source->uv_stride, dest->uv_stride,
                        source->uv_crop_width, source->uv_crop_height, bd);
 
-  c = vp9_highbd_ssim2(source->v_buffer, dest->v_buffer,
+  c = vpx_highbd_ssim2(source->v_buffer, dest->v_buffer,
                        source->uv_stride, dest->uv_stride,
                        source->uv_crop_width, source->uv_crop_height, bd);
   *ssim_y = a;
diff --git a/vp9/encoder/vp9_ssim.h b/vpx_dsp/ssim.h
similarity index 74%
rename from vp9/encoder/vp9_ssim.h
rename to vpx_dsp/ssim.h
index 10f14c4..b1579f7 100644
--- a/vp9/encoder/vp9_ssim.h
+++ b/vpx_dsp/ssim.h
@@ -8,15 +8,24 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef VP9_ENCODER_VP9_SSIM_H_
-#define VP9_ENCODER_VP9_SSIM_H_
+#ifndef VPX_ENCODER_VP9_SSIM_H_
+#define VPX_ENCODER_VP9_SSIM_H_
 
 #ifdef __cplusplus
 extern "C" {
 #endif
 
+#include "./vpx_config.h"
 #include "vpx_scale/yv12config.h"
 
+// TODO(aconverse): Unify vp8/vp9_clear_system_state
+#if ARCH_X86 || ARCH_X86_64
+void vpx_reset_mmx_state(void);
+#define vpx_clear_system_state() vpx_reset_mmx_state()
+#else
+#define vpx_clear_system_state()
+#endif
+
 // metrics used for calculating ssim, ssim2, dssim, and ssimc
 typedef struct {
   // source sum ( over 8x8 region )
@@ -59,29 +68,29 @@
   double ssimcd;
 } Metrics;
 
-double vp9_get_ssim_metrics(uint8_t *img1, int img1_pitch, uint8_t *img2,
+double vpx_get_ssim_metrics(uint8_t *img1, int img1_pitch, uint8_t *img2,
                       int img2_pitch, int width, int height, Ssimv *sv2,
                       Metrics *m, int do_inconsistency);
 
-double vp9_calc_ssim(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *dest,
+double vpx_calc_ssim(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *dest,
                      double *weight);
 
-double vp9_calc_ssimg(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *dest,
+double vpx_calc_ssimg(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *dest,
                       double *ssim_y, double *ssim_u, double *ssim_v);
 
-double vp9_calc_fastssim(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *dest,
+double vpx_calc_fastssim(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *dest,
                          double *ssim_y, double *ssim_u, double *ssim_v);
 
-double vp9_psnrhvs(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *dest,
+double vpx_psnrhvs(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *dest,
                    double *ssim_y, double *ssim_u, double *ssim_v);
 
 #if CONFIG_VP9_HIGHBITDEPTH
-double vp9_highbd_calc_ssim(YV12_BUFFER_CONFIG *source,
+double vpx_highbd_calc_ssim(YV12_BUFFER_CONFIG *source,
                             YV12_BUFFER_CONFIG *dest,
                             double *weight,
                             unsigned int bd);
 
-double vp9_highbd_calc_ssimg(YV12_BUFFER_CONFIG *source,
+double vpx_highbd_calc_ssimg(YV12_BUFFER_CONFIG *source,
                              YV12_BUFFER_CONFIG *dest,
                              double *ssim_y,
                              double *ssim_u,
@@ -93,4 +102,4 @@
 }  // extern "C"
 #endif
 
-#endif  // VP9_ENCODER_VP9_SSIM_H_
+#endif  // VPX_ENCODER_VP9_SSIM_H_
diff --git a/vpx_dsp/vpx_dsp.mk b/vpx_dsp/vpx_dsp.mk
index 468e4c3..5d4ec3e 100644
--- a/vpx_dsp/vpx_dsp.mk
+++ b/vpx_dsp/vpx_dsp.mk
@@ -22,6 +22,10 @@
 DSP_SRCS-yes += bitwriter.c
 DSP_SRCS-yes += bitwriter_buffer.c
 DSP_SRCS-yes += bitwriter_buffer.h
+DSP_SRCS-$(CONFIG_INTERNAL_STATS) += ssim.c
+DSP_SRCS-$(CONFIG_INTERNAL_STATS) += ssim.h
+DSP_SRCS-$(CONFIG_INTERNAL_STATS) += psnrhvs.c
+DSP_SRCS-$(CONFIG_INTERNAL_STATS) += fastssim.c
 endif
 
 ifeq ($(CONFIG_DECODERS),yes)
@@ -295,6 +299,10 @@
 DSP_SRCS-$(HAVE_AVX2)   += x86/variance_avx2.c
 DSP_SRCS-$(HAVE_AVX2)   += x86/variance_impl_avx2.c
 
+ifeq ($(ARCH_X86_64),yes)
+DSP_SRCS-$(HAVE_SSE2)   += x86/ssim_opt_x86_64.asm
+endif  # ARCH_X86_64
+
 ifeq ($(CONFIG_USE_X86INC),yes)
 DSP_SRCS-$(HAVE_SSE2)   += x86/subpel_variance_sse2.asm  # Contains SSE2 and SSSE3
 endif  # CONFIG_USE_X86INC
diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl
index 7326adf..3260227 100644
--- a/vpx_dsp/vpx_dsp_rtcd_defs.pl
+++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl
@@ -990,6 +990,17 @@
 add_proto qw/void vpx_sad4x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
 specialize qw/vpx_sad4x4x4d msa/, "$sse_x86inc";
 
+#
+# Structured Similarity (SSIM)
+#
+if (vpx_config("CONFIG_INTERNAL_STATS") eq "yes") {
+    add_proto qw/void vpx_ssim_parms_8x8/, "uint8_t *s, int sp, uint8_t *r, int rp, unsigned long *sum_s, unsigned long *sum_r, unsigned long *sum_sq_s, unsigned long *sum_sq_r, unsigned long *sum_sxr";
+    specialize qw/vpx_ssim_parms_8x8/, "$sse2_x86_64";
+
+    add_proto qw/void vpx_ssim_parms_16x16/, "uint8_t *s, int sp, uint8_t *r, int rp, unsigned long *sum_s, unsigned long *sum_r, unsigned long *sum_sq_s, unsigned long *sum_sq_r, unsigned long *sum_sxr";
+    specialize qw/vpx_ssim_parms_16x16/, "$sse2_x86_64";
+}
+
 if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
   #
   # Block subtraction
@@ -1176,6 +1187,13 @@
   add_proto qw/void vpx_highbd_sad4x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array";
   specialize qw/vpx_highbd_sad4x4x4d/, "$sse2_x86inc";
 
+  #
+  # Structured Similarity (SSIM)
+  #
+  if (vpx_config("CONFIG_INTERNAL_STATS") eq "yes") {
+    add_proto qw/void vpx_highbd_ssim_parms_8x8/, "uint16_t *s, int sp, uint16_t *r, int rp, uint32_t *sum_s, uint32_t *sum_r, uint32_t *sum_sq_s, uint32_t *sum_sq_r, uint32_t *sum_sxr";
+    specialize qw/vpx_highbd_ssim_parms_8x8/;
+  }
 }  # CONFIG_VP9_HIGHBITDEPTH
 }  # CONFIG_ENCODERS
 
diff --git a/vp8/encoder/x86/ssim_opt_x86_64.asm b/vpx_dsp/x86/ssim_opt_x86_64.asm
similarity index 97%
rename from vp8/encoder/x86/ssim_opt_x86_64.asm
rename to vpx_dsp/x86/ssim_opt_x86_64.asm
index 5964a85..5d05d4f 100644
--- a/vp8/encoder/x86/ssim_opt_x86_64.asm
+++ b/vpx_dsp/x86/ssim_opt_x86_64.asm
@@ -61,8 +61,8 @@
 ; or pavgb At this point this is just meant to be first pass for calculating
 ; all the parms needed for 16x16 ssim so we can play with dssim as distortion
 ; in mode selection code.
-global sym(vp8_ssim_parms_16x16_sse2) PRIVATE
-sym(vp8_ssim_parms_16x16_sse2):
+global sym(vpx_ssim_parms_16x16_sse2) PRIVATE
+sym(vpx_ssim_parms_16x16_sse2):
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 9
@@ -151,8 +151,8 @@
 ; or pavgb At this point this is just meant to be first pass for calculating
 ; all the parms needed for 16x16 ssim so we can play with dssim as distortion
 ; in mode selection code.
-global sym(vp8_ssim_parms_8x8_sse2) PRIVATE
-sym(vp8_ssim_parms_8x8_sse2):
+global sym(vpx_ssim_parms_8x8_sse2) PRIVATE
+sym(vpx_ssim_parms_8x8_sse2):
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 9
diff --git a/vpx_ports/msvc.h b/vpx_ports/msvc.h
index 43a36e7..cab7740 100644
--- a/vpx_ports/msvc.h
+++ b/vpx_ports/msvc.h
@@ -18,5 +18,15 @@
 #  define snprintf _snprintf
 # endif  // _MSC_VER < 1900
 
+#if _MSC_VER < 1800  // VS2013 provides round
+#include <math.h>
+static INLINE double round(double x) {
+  if (x < 0)
+    return ceil(x - 0.5);
+  else
+    return floor(x + 0.5);
+}
+#endif  // _MSC_VER < 1800
+
 #endif  // _MSC_VER
 #endif  // VPX_PORTS_MSVC_H_
diff --git a/vpx_ports/x86.h b/vpx_ports/x86.h
index 0fef6a5..5da346e 100644
--- a/vpx_ports/x86.h
+++ b/vpx_ports/x86.h
@@ -136,7 +136,7 @@
 #define xgetbv() 0U  // no AVX for older x64 or unrecognized toolchains.
 #endif
 
-#if defined(_MSC_VER)
+#if defined(_MSC_VER) && _MSC_VER >= 1700
 #include <windows.h>
 #if WINAPI_FAMILY_PARTITION(WINAPI_FAMILY_APP)
 #define getenv(x) NULL