Merge "Change to use correct check for halfpel"
diff --git a/examples/simple_encoder.c b/examples/simple_encoder.c
index a307729..64f0a01 100644
--- a/examples/simple_encoder.c
+++ b/examples/simple_encoder.c
@@ -109,8 +109,8 @@
 void usage_exit(void) {
   fprintf(stderr,
           "Usage: %s <codec> <width> <height> <infile> <outfile> "
-              "<keyframe-interval> [<error-resilient>]\nSee comments in "
-              "simple_encoder.c for more information.\n",
+              "<keyframe-interval> <error-resilient> <frames to encode>\n"
+              "See comments in simple_encoder.c for more information.\n",
           exec_name);
   exit(EXIT_FAILURE);
 }
@@ -147,6 +147,7 @@
   return got_pkts;
 }
 
+// TODO(tomfinegan): Improve command line parsing and add args for bitrate/fps.
 int main(int argc, char **argv) {
   FILE *infile = NULL;
   vpx_codec_ctx_t codec;
@@ -157,12 +158,11 @@
   VpxVideoInfo info = {0};
   VpxVideoWriter *writer = NULL;
   const VpxInterface *encoder = NULL;
-  const int fps = 30;        // TODO(dkovalev) add command line argument
-  const int bitrate = 200;   // kbit/s TODO(dkovalev) add command line argument
+  const int fps = 30;
+  const int bitrate = 200;
   int keyframe_interval = 0;
-
-  // TODO(dkovalev): Add some simple command line parsing code to make the
-  // command line more flexible.
+  int max_frames = 0;
+  int frames_encoded = 0;
   const char *codec_arg = NULL;
   const char *width_arg = NULL;
   const char *height_arg = NULL;
@@ -172,7 +172,7 @@
 
   exec_name = argv[0];
 
-  if (argc < 7)
+  if (argc != 9)
     die("Invalid number of arguments");
 
   codec_arg = argv[1];
@@ -181,6 +181,7 @@
   infile_arg = argv[4];
   outfile_arg = argv[5];
   keyframe_interval_arg = argv[6];
+  max_frames = strtol(argv[8], NULL, 0);
 
   encoder = get_vpx_encoder_by_name(codec_arg);
   if (!encoder)
@@ -219,7 +220,7 @@
   cfg.g_timebase.num = info.time_base.numerator;
   cfg.g_timebase.den = info.time_base.denominator;
   cfg.rc_target_bitrate = bitrate;
-  cfg.g_error_resilient = argc > 7 ? strtol(argv[7], NULL, 0) : 0;
+  cfg.g_error_resilient = strtol(argv[7], NULL, 0);
 
   writer = vpx_video_writer_open(outfile_arg, kContainerIVF, &info);
   if (!writer)
@@ -237,6 +238,9 @@
     if (keyframe_interval > 0 && frame_count % keyframe_interval == 0)
       flags |= VPX_EFLAG_FORCE_KF;
     encode_frame(&codec, &raw, frame_count++, flags, writer);
+    frames_encoded++;
+    if (max_frames > 0 && frames_encoded >= max_frames)
+      break;
   }
 
   // Flush encoder.
diff --git a/examples/twopass_encoder.c b/examples/twopass_encoder.c
index aecc11d..15a6617 100644
--- a/examples/twopass_encoder.c
+++ b/examples/twopass_encoder.c
@@ -59,7 +59,9 @@
 static const char *exec_name;
 
 void usage_exit(void) {
-  fprintf(stderr, "Usage: %s <codec> <width> <height> <infile> <outfile>\n",
+  fprintf(stderr,
+          "Usage: %s <codec> <width> <height> <infile> <outfile> "
+              "<frame limit>\n",
           exec_name);
   exit(EXIT_FAILURE);
 }
@@ -129,7 +131,8 @@
 static vpx_fixed_buf_t pass0(vpx_image_t *raw,
                              FILE *infile,
                              const VpxInterface *encoder,
-                             const vpx_codec_enc_cfg_t *cfg) {
+                             const vpx_codec_enc_cfg_t *cfg,
+                             int max_frames) {
   vpx_codec_ctx_t codec;
   int frame_count = 0;
   vpx_fixed_buf_t stats = {NULL, 0};
@@ -142,6 +145,8 @@
     ++frame_count;
     get_frame_stats(&codec, raw, frame_count, 1, 0, VPX_DL_GOOD_QUALITY,
                     &stats);
+    if (max_frames > 0 && frame_count >= max_frames)
+      break;
   }
 
   // Flush encoder.
@@ -159,7 +164,8 @@
                   FILE *infile,
                   const char *outfile_name,
                   const VpxInterface *encoder,
-                  const vpx_codec_enc_cfg_t *cfg) {
+                  const vpx_codec_enc_cfg_t *cfg,
+                  int max_frames) {
   VpxVideoInfo info = {
     encoder->fourcc,
     cfg->g_w,
@@ -181,6 +187,9 @@
   while (vpx_img_read(raw, infile)) {
     ++frame_count;
     encode_frame(&codec, raw, frame_count, 1, 0, VPX_DL_GOOD_QUALITY, writer);
+
+    if (max_frames > 0 && frame_count >= max_frames)
+      break;
   }
 
   // Flush encoder.
@@ -213,11 +222,14 @@
   const char *const height_arg = argv[3];
   const char *const infile_arg = argv[4];
   const char *const outfile_arg = argv[5];
+  int max_frames = 0;
   exec_name = argv[0];
 
-  if (argc != 6)
+  if (argc != 7)
     die("Invalid number of arguments.");
 
+  max_frames = strtol(argv[6], NULL, 0);
+
   encoder = get_vpx_encoder_by_name(codec_arg);
   if (!encoder)
     die("Unsupported codec.");
@@ -249,13 +261,13 @@
 
   // Pass 0
   cfg.g_pass = VPX_RC_FIRST_PASS;
-  stats = pass0(&raw, infile, encoder, &cfg);
+  stats = pass0(&raw, infile, encoder, &cfg, max_frames);
 
   // Pass 1
   rewind(infile);
   cfg.g_pass = VPX_RC_LAST_PASS;
   cfg.rc_twopass_stats_in = stats;
-  pass1(&raw, infile, outfile_arg, encoder, &cfg);
+  pass1(&raw, infile, outfile_arg, encoder, &cfg, max_frames);
   free(stats.buf);
 
   vpx_img_free(&raw);
diff --git a/test/sad_test.cc b/test/sad_test.cc
index 3f0f74c..e6bd0d7 100644
--- a/test/sad_test.cc
+++ b/test/sad_test.cc
@@ -689,17 +689,6 @@
 
 //------------------------------------------------------------------------------
 // x86 functions
-#if HAVE_MMX
-const SadMxNParam mmx_tests[] = {
-  make_tuple(16, 16, &vpx_sad16x16_mmx, -1),
-  make_tuple(16, 8, &vpx_sad16x8_mmx, -1),
-  make_tuple(8, 16, &vpx_sad8x16_mmx, -1),
-  make_tuple(8, 8, &vpx_sad8x8_mmx, -1),
-  make_tuple(4, 4, &vpx_sad4x4_mmx, -1),
-};
-INSTANTIATE_TEST_CASE_P(MMX, SADTest, ::testing::ValuesIn(mmx_tests));
-#endif  // HAVE_MMX
-
 #if HAVE_SSE2
 #if CONFIG_USE_X86INC
 const SadMxNParam sse2_tests[] = {
diff --git a/test/simple_encoder.sh b/test/simple_encoder.sh
index c4a6280..ee633ae 100755
--- a/test/simple_encoder.sh
+++ b/test/simple_encoder.sh
@@ -23,7 +23,7 @@
   fi
 }
 
-# Runs simple_encoder using the codec specified by $1.
+# Runs simple_encoder using the codec specified by $1 with a frame limit of 100.
 simple_encoder() {
   local encoder="${LIBVPX_BIN_PATH}/simple_encoder${VPX_TEST_EXE_SUFFIX}"
   local codec="$1"
@@ -35,7 +35,7 @@
   fi
 
   eval "${VPX_TEST_PREFIX}" "${encoder}" "${codec}" "${YUV_RAW_INPUT_WIDTH}" \
-      "${YUV_RAW_INPUT_HEIGHT}" "${YUV_RAW_INPUT}" "${output_file}" 9999 \
+      "${YUV_RAW_INPUT_HEIGHT}" "${YUV_RAW_INPUT}" "${output_file}" 9999 0 100 \
       ${devnull}
 
   [ -e "${output_file}" ] || return 1
@@ -47,16 +47,13 @@
   fi
 }
 
-# TODO(tomfinegan): Add a frame limit param to simple_encoder and enable this
-# test. VP9 is just too slow right now: This test takes 4m30s+ on a fast
-# machine.
-DISABLED_simple_encoder_vp9() {
+simple_encoder_vp9() {
   if [ "$(vp9_encode_available)" = "yes" ]; then
     simple_encoder vp9 || return 1
   fi
 }
 
 simple_encoder_tests="simple_encoder_vp8
-                      DISABLED_simple_encoder_vp9"
+                      simple_encoder_vp9"
 
 run_tests simple_encoder_verify_environment "${simple_encoder_tests}"
diff --git a/test/twopass_encoder.sh b/test/twopass_encoder.sh
index 1189e51..7a223f2 100755
--- a/test/twopass_encoder.sh
+++ b/test/twopass_encoder.sh
@@ -23,7 +23,8 @@
   fi
 }
 
-# Runs twopass_encoder using the codec specified by $1.
+# Runs twopass_encoder using the codec specified by $1 with a frame limit of
+# 100.
 twopass_encoder() {
   local encoder="${LIBVPX_BIN_PATH}/twopass_encoder${VPX_TEST_EXE_SUFFIX}"
   local codec="$1"
@@ -35,7 +36,7 @@
   fi
 
   eval "${VPX_TEST_PREFIX}" "${encoder}" "${codec}" "${YUV_RAW_INPUT_WIDTH}" \
-      "${YUV_RAW_INPUT_HEIGHT}" "${YUV_RAW_INPUT}" "${output_file}" \
+      "${YUV_RAW_INPUT_HEIGHT}" "${YUV_RAW_INPUT}" "${output_file}" 100 \
       ${devnull}
 
   [ -e "${output_file}" ] || return 1
@@ -47,16 +48,13 @@
   fi
 }
 
-# TODO(tomfinegan): Add a frame limit param to twopass_encoder and enable this
-# test. VP9 is just too slow right now: This test takes 31m16s+ on a fast
-# machine.
-DISABLED_twopass_encoder_vp9() {
+twopass_encoder_vp9() {
   if [ "$(vp9_encode_available)" = "yes" ]; then
     twopass_encoder vp9 || return 1
   fi
 }
 
 twopass_encoder_tests="twopass_encoder_vp8
-                       DISABLED_twopass_encoder_vp9"
+                       twopass_encoder_vp9"
 
 run_tests twopass_encoder_verify_environment "${twopass_encoder_tests}"
diff --git a/test/variance_test.cc b/test/variance_test.cc
index a9ca07c..a6efc92 100644
--- a/test/variance_test.cc
+++ b/test/variance_test.cc
@@ -977,20 +977,6 @@
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
 #if HAVE_MMX
-INSTANTIATE_TEST_CASE_P(MMX, VpxMseTest,
-                        ::testing::Values(make_tuple(4, 4, &vpx_mse16x16_mmx)));
-
-INSTANTIATE_TEST_CASE_P(MMX, SumOfSquaresTest,
-                        ::testing::Values(vpx_get_mb_ss_mmx));
-
-INSTANTIATE_TEST_CASE_P(
-    MMX, VpxVarianceTest,
-    ::testing::Values(make_tuple(4, 4, &vpx_variance16x16_mmx, 0),
-                      make_tuple(4, 3, &vpx_variance16x8_mmx, 0),
-                      make_tuple(3, 4, &vpx_variance8x16_mmx, 0),
-                      make_tuple(3, 3, &vpx_variance8x8_mmx, 0),
-                      make_tuple(2, 2, &vpx_variance4x4_mmx, 0)));
-
 INSTANTIATE_TEST_CASE_P(
     MMX, VpxSubpelVarianceTest,
     ::testing::Values(make_tuple(4, 4, &vpx_sub_pixel_variance16x16_mmx, 0),
diff --git a/vp10/encoder/mcomp.c b/vp10/encoder/mcomp.c
index 2c1c591..1ba2e2f 100644
--- a/vp10/encoder/mcomp.c
+++ b/vp10/encoder/mcomp.c
@@ -145,16 +145,6 @@
   cfg->searches_per_step = 8;
 }
 
-/*
- * To avoid the penalty for crossing cache-line read, preload the reference
- * area in a small buffer, which is aligned to make sure there won't be crossing
- * cache-line read while reading from this buffer. This reduced the cpu
- * cycles spent on reading ref data in sub-pixel filter functions.
- * TODO: Currently, since sub-pixel search range here is -3 ~ 3, copy 22 rows x
- * 32 cols area that is enough for 16x16 macroblock. Later, for SPLITMV, we
- * could reduce the area.
- */
-
 /* estimated cost of a motion vector (r,c) */
 #define MVC(r, c)                                       \
     (mvcost ?                                           \
@@ -790,7 +780,6 @@
 }
 
 #undef MVC
-#undef PRE
 #undef CHECK_BETTER
 
 static INLINE int check_bounds(const MACROBLOCK *x, int row, int col,
diff --git a/vp9/encoder/vp9_mcomp.c b/vp9/encoder/vp9_mcomp.c
index 4669145..f3ffe35 100644
--- a/vp9/encoder/vp9_mcomp.c
+++ b/vp9/encoder/vp9_mcomp.c
@@ -144,16 +144,6 @@
   cfg->total_steps = ss_count / cfg->searches_per_step;
 }
 
-/*
- * To avoid the penalty for crossing cache-line read, preload the reference
- * area in a small buffer, which is aligned to make sure there won't be crossing
- * cache-line read while reading from this buffer. This reduced the cpu
- * cycles spent on reading ref data in sub-pixel filter functions.
- * TODO: Currently, since sub-pixel search range here is -3 ~ 3, copy 22 rows x
- * 32 cols area that is enough for 16x16 macroblock. Later, for SPLITMV, we
- * could reduce the area.
- */
-
 /* Estimated (square) error cost of a motion vector (r,c). The 14 scale comes
  * from the same math as in mv_err_cost(). */
 #define MVC(r, c)                                              \
@@ -835,7 +825,6 @@
 }
 
 #undef MVC
-#undef PRE
 #undef CHECK_BETTER
 
 static INLINE int check_bounds(const MACROBLOCK *x, int row, int col,
diff --git a/vp9/encoder/vp9_ratectrl.c b/vp9/encoder/vp9_ratectrl.c
index 1e6c152..0ec93a9 100644
--- a/vp9/encoder/vp9_ratectrl.c
+++ b/vp9/encoder/vp9_ratectrl.c
@@ -953,11 +953,10 @@
                              FIXED_GF_INTERVAL], cm->bit_depth);
       active_best_quality = VPXMAX(qindex + delta_qindex, rc->best_quality);
     } else {
-      // Use the min of the average Q (with some increase) and
-      // active_worst_quality as basis for active_best.
+      // Use the min of the average Q and active_worst_quality as basis for
+      // active_best.
       if (cm->current_video_frame > 1) {
-        q = VPXMIN(((17 * rc->avg_frame_qindex[INTER_FRAME]) >> 4),
-                    active_worst_quality);
+        q = VPXMIN(rc->avg_frame_qindex[INTER_FRAME], active_worst_quality);
         active_best_quality = inter_minq[q];
       } else {
         active_best_quality = inter_minq[rc->avg_frame_qindex[KEY_FRAME]];
diff --git a/vp9/vp9_cx_iface.c b/vp9/vp9_cx_iface.c
index 5e44ffd..1a11a6d 100644
--- a/vp9/vp9_cx_iface.c
+++ b/vp9/vp9_cx_iface.c
@@ -803,7 +803,7 @@
 static vpx_codec_err_t ctrl_set_target_level(vpx_codec_alg_priv_t *ctx,
                                              va_list args) {
   struct vp9_extracfg extra_cfg = ctx->extra_cfg;
-  extra_cfg.target_level = CAST(VP9E_SET_LEVEL_STATS, args);
+  extra_cfg.target_level = CAST(VP9E_SET_TARGET_LEVEL, args);
   return update_extra_cfg(ctx, &extra_cfg);
 }
 
diff --git a/vpx/vp8cx.h b/vpx/vp8cx.h
index 109306f..b059d47 100644
--- a/vpx/vp8cx.h
+++ b/vpx/vp8cx.h
@@ -562,7 +562,7 @@
    *
    * Supported in codecs: VP9
    */
-  VP9E_SET_TARGET_LEVEL,
+  VP9E_SET_TARGET_LEVEL
 };
 
 /*!\brief vpx 1-D scaling mode
@@ -818,8 +818,8 @@
 VPX_CTRL_USE_TYPE(VP9E_SET_RENDER_SIZE, int *)
 #define VPX_CTRL_VP9E_SET_RENDER_SIZE
 
-VPX_CTRL_USE_TYPE(VP9E_SET_LEVEL_STATS,  unsigned int)
-#define VPX_CTRL_VP9E_SET_LEVEL_STATS
+VPX_CTRL_USE_TYPE(VP9E_SET_TARGET_LEVEL,  unsigned int)
+#define VPX_CTRL_VP9E_SET_TARGET_LEVEL
 
 /*!\endcond */
 /*! @} - end defgroup vp8_encoder */
diff --git a/vpx_dsp/vpx_dsp.mk b/vpx_dsp/vpx_dsp.mk
index dd8c6e8..43802d7 100644
--- a/vpx_dsp/vpx_dsp.mk
+++ b/vpx_dsp/vpx_dsp.mk
@@ -285,7 +285,6 @@
 DSP_SRCS-$(HAVE_MSA)    += mips/sad_msa.c
 DSP_SRCS-$(HAVE_MSA)    += mips/subtract_msa.c
 
-DSP_SRCS-$(HAVE_MMX)    += x86/sad_mmx.asm
 DSP_SRCS-$(HAVE_SSE3)   += x86/sad_sse3.asm
 DSP_SRCS-$(HAVE_SSSE3)  += x86/sad_ssse3.asm
 DSP_SRCS-$(HAVE_SSE4_1) += x86/sad_sse4.asm
diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl
index f883ce5..aeadbaf 100644
--- a/vpx_dsp/vpx_dsp_rtcd_defs.pl
+++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl
@@ -983,16 +983,16 @@
 specialize qw/vpx_sad16x32 msa/, "$sse2_x86inc";
 
 add_proto qw/unsigned int vpx_sad16x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
-specialize qw/vpx_sad16x16 mmx media neon msa/, "$sse2_x86inc";
+specialize qw/vpx_sad16x16 media neon msa/, "$sse2_x86inc";
 
 add_proto qw/unsigned int vpx_sad16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
-specialize qw/vpx_sad16x8 mmx neon msa/, "$sse2_x86inc";
+specialize qw/vpx_sad16x8 neon msa/, "$sse2_x86inc";
 
 add_proto qw/unsigned int vpx_sad8x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
-specialize qw/vpx_sad8x16 mmx neon msa/, "$sse2_x86inc";
+specialize qw/vpx_sad8x16 neon msa/, "$sse2_x86inc";
 
 add_proto qw/unsigned int vpx_sad8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
-specialize qw/vpx_sad8x8 mmx neon msa/, "$sse2_x86inc";
+specialize qw/vpx_sad8x8 neon msa/, "$sse2_x86inc";
 
 add_proto qw/unsigned int vpx_sad8x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
 specialize qw/vpx_sad8x4 msa/, "$sse2_x86inc";
@@ -1001,7 +1001,7 @@
 specialize qw/vpx_sad4x8 msa/, "$sse2_x86inc";
 
 add_proto qw/unsigned int vpx_sad4x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
-specialize qw/vpx_sad4x4 mmx neon msa/, "$sse2_x86inc";
+specialize qw/vpx_sad4x4 neon msa/, "$sse2_x86inc";
 
 #
 # Avg
@@ -1407,16 +1407,16 @@
   specialize qw/vpx_variance16x32 sse2 msa/;
 
 add_proto qw/unsigned int vpx_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_variance16x16 mmx sse2 avx2 media neon msa/;
+  specialize qw/vpx_variance16x16 sse2 avx2 media neon msa/;
 
 add_proto qw/unsigned int vpx_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_variance16x8 mmx sse2 neon msa/;
+  specialize qw/vpx_variance16x8 sse2 neon msa/;
 
 add_proto qw/unsigned int vpx_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_variance8x16 mmx sse2 neon msa/;
+  specialize qw/vpx_variance8x16 sse2 neon msa/;
 
 add_proto qw/unsigned int vpx_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_variance8x8 mmx sse2 media neon msa/;
+  specialize qw/vpx_variance8x8 sse2 media neon msa/;
 
 add_proto qw/unsigned int vpx_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
   specialize qw/vpx_variance8x4 sse2 msa/;
@@ -1425,7 +1425,7 @@
   specialize qw/vpx_variance4x8 sse2 msa/;
 
 add_proto qw/unsigned int vpx_variance4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_variance4x4 mmx sse2 msa/;
+  specialize qw/vpx_variance4x4 sse2 msa/;
 
 #
 # Specialty Variance
@@ -1434,10 +1434,10 @@
   specialize qw/vpx_get16x16var sse2 avx2 neon msa/;
 
 add_proto qw/void vpx_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
-  specialize qw/vpx_get8x8var mmx sse2 neon msa/;
+  specialize qw/vpx_get8x8var sse2 neon msa/;
 
 add_proto qw/unsigned int vpx_mse16x16/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
-  specialize qw/vpx_mse16x16 mmx sse2 avx2 media neon msa/;
+  specialize qw/vpx_mse16x16 sse2 avx2 media neon msa/;
 
 add_proto qw/unsigned int vpx_mse16x8/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
   specialize qw/vpx_mse16x8 sse2 msa/;
@@ -1449,7 +1449,7 @@
   specialize qw/vpx_mse8x8 sse2 msa/;
 
 add_proto qw/unsigned int vpx_get_mb_ss/, "const int16_t *";
-  specialize qw/vpx_get_mb_ss mmx sse2 msa/;
+  specialize qw/vpx_get_mb_ss sse2 msa/;
 
 add_proto qw/unsigned int vpx_get4x4sse_cs/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride";
   specialize qw/vpx_get4x4sse_cs neon msa/;
diff --git a/vpx_dsp/x86/sad_mmx.asm b/vpx_dsp/x86/sad_mmx.asm
deleted file mode 100644
index 9968992..0000000
--- a/vpx_dsp/x86/sad_mmx.asm
+++ /dev/null
@@ -1,427 +0,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-
-global sym(vpx_sad16x16_mmx) PRIVATE
-global sym(vpx_sad8x16_mmx) PRIVATE
-global sym(vpx_sad8x8_mmx) PRIVATE
-global sym(vpx_sad4x4_mmx) PRIVATE
-global sym(vpx_sad16x8_mmx) PRIVATE
-
-;unsigned int vpx_sad16x16_mmx(
-;    unsigned char *src_ptr,
-;    int  src_stride,
-;    unsigned char *ref_ptr,
-;    int  ref_stride)
-sym(vpx_sad16x16_mmx):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 4
-    push rsi
-    push rdi
-    ; end prolog
-
-        mov             rsi,        arg(0) ;src_ptr
-        mov             rdi,        arg(2) ;ref_ptr
-
-        movsxd          rax,        dword ptr arg(1) ;src_stride
-        movsxd          rdx,        dword ptr arg(3) ;ref_stride
-
-        lea             rcx,        [rsi+rax*8]
-
-        lea             rcx,        [rcx+rax*8]
-        pxor            mm7,        mm7
-
-        pxor            mm6,        mm6
-
-.x16x16sad_mmx_loop:
-
-        movq            mm0,        QWORD PTR [rsi]
-        movq            mm2,        QWORD PTR [rsi+8]
-
-        movq            mm1,        QWORD PTR [rdi]
-        movq            mm3,        QWORD PTR [rdi+8]
-
-        movq            mm4,        mm0
-        movq            mm5,        mm2
-
-        psubusb         mm0,        mm1
-        psubusb         mm1,        mm4
-
-        psubusb         mm2,        mm3
-        psubusb         mm3,        mm5
-
-        por             mm0,        mm1
-        por             mm2,        mm3
-
-        movq            mm1,        mm0
-        movq            mm3,        mm2
-
-        punpcklbw       mm0,        mm6
-        punpcklbw       mm2,        mm6
-
-        punpckhbw       mm1,        mm6
-        punpckhbw       mm3,        mm6
-
-        paddw           mm0,        mm2
-        paddw           mm1,        mm3
-
-
-        lea             rsi,        [rsi+rax]
-        add             rdi,        rdx
-
-        paddw           mm7,        mm0
-        paddw           mm7,        mm1
-
-        cmp             rsi,        rcx
-        jne             .x16x16sad_mmx_loop
-
-
-        movq            mm0,        mm7
-
-        punpcklwd       mm0,        mm6
-        punpckhwd       mm7,        mm6
-
-        paddw           mm0,        mm7
-        movq            mm7,        mm0
-
-
-        psrlq           mm0,        32
-        paddw           mm7,        mm0
-
-        movq            rax,        mm7
-
-    pop rdi
-    pop rsi
-    mov rsp, rbp
-    ; begin epilog
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-;unsigned int vpx_sad8x16_mmx(
-;    unsigned char *src_ptr,
-;    int  src_stride,
-;    unsigned char *ref_ptr,
-;    int  ref_stride)
-sym(vpx_sad8x16_mmx):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 4
-    push rsi
-    push rdi
-    ; end prolog
-
-        mov             rsi,        arg(0) ;src_ptr
-        mov             rdi,        arg(2) ;ref_ptr
-
-        movsxd          rax,        dword ptr arg(1) ;src_stride
-        movsxd          rdx,        dword ptr arg(3) ;ref_stride
-
-        lea             rcx,        [rsi+rax*8]
-
-        lea             rcx,        [rcx+rax*8]
-        pxor            mm7,        mm7
-
-        pxor            mm6,        mm6
-
-.x8x16sad_mmx_loop:
-
-        movq            mm0,        QWORD PTR [rsi]
-        movq            mm1,        QWORD PTR [rdi]
-
-        movq            mm2,        mm0
-        psubusb         mm0,        mm1
-
-        psubusb         mm1,        mm2
-        por             mm0,        mm1
-
-        movq            mm2,        mm0
-        punpcklbw       mm0,        mm6
-
-        punpckhbw       mm2,        mm6
-        lea             rsi,        [rsi+rax]
-
-        add             rdi,        rdx
-        paddw           mm7,        mm0
-
-        paddw           mm7,        mm2
-        cmp             rsi,        rcx
-
-        jne             .x8x16sad_mmx_loop
-
-        movq            mm0,        mm7
-        punpcklwd       mm0,        mm6
-
-        punpckhwd       mm7,        mm6
-        paddw           mm0,        mm7
-
-        movq            mm7,        mm0
-        psrlq           mm0,        32
-
-        paddw           mm7,        mm0
-        movq            rax,        mm7
-
-    pop rdi
-    pop rsi
-    mov rsp, rbp
-    ; begin epilog
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-;unsigned int vpx_sad8x8_mmx(
-;    unsigned char *src_ptr,
-;    int  src_stride,
-;    unsigned char *ref_ptr,
-;    int  ref_stride)
-sym(vpx_sad8x8_mmx):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 4
-    push rsi
-    push rdi
-    ; end prolog
-
-        mov             rsi,        arg(0) ;src_ptr
-        mov             rdi,        arg(2) ;ref_ptr
-
-        movsxd          rax,        dword ptr arg(1) ;src_stride
-        movsxd          rdx,        dword ptr arg(3) ;ref_stride
-
-        lea             rcx,        [rsi+rax*8]
-        pxor            mm7,        mm7
-
-        pxor            mm6,        mm6
-
-.x8x8sad_mmx_loop:
-
-        movq            mm0,        QWORD PTR [rsi]
-        movq            mm1,        QWORD PTR [rdi]
-
-        movq            mm2,        mm0
-        psubusb         mm0,        mm1
-
-        psubusb         mm1,        mm2
-        por             mm0,        mm1
-
-        movq            mm2,        mm0
-        punpcklbw       mm0,        mm6
-
-        punpckhbw       mm2,        mm6
-        paddw           mm0,        mm2
-
-        lea             rsi,       [rsi+rax]
-        add             rdi,        rdx
-
-        paddw           mm7,       mm0
-        cmp             rsi,        rcx
-
-        jne             .x8x8sad_mmx_loop
-
-        movq            mm0,        mm7
-        punpcklwd       mm0,        mm6
-
-        punpckhwd       mm7,        mm6
-        paddw           mm0,        mm7
-
-        movq            mm7,        mm0
-        psrlq           mm0,        32
-
-        paddw           mm7,        mm0
-        movq            rax,        mm7
-
-    pop rdi
-    pop rsi
-    mov rsp, rbp
-    ; begin epilog
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-;unsigned int vpx_sad4x4_mmx(
-;    unsigned char *src_ptr,
-;    int  src_stride,
-;    unsigned char *ref_ptr,
-;    int  ref_stride)
-sym(vpx_sad4x4_mmx):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 4
-    push rsi
-    push rdi
-    ; end prolog
-
-        mov             rsi,        arg(0) ;src_ptr
-        mov             rdi,        arg(2) ;ref_ptr
-
-        movsxd          rax,        dword ptr arg(1) ;src_stride
-        movsxd          rdx,        dword ptr arg(3) ;ref_stride
-
-        movd            mm0,        DWORD PTR [rsi]
-        movd            mm1,        DWORD PTR [rdi]
-
-        movd            mm2,        DWORD PTR [rsi+rax]
-        movd            mm3,        DWORD PTR [rdi+rdx]
-
-        punpcklbw       mm0,        mm2
-        punpcklbw       mm1,        mm3
-
-        movq            mm2,        mm0
-        psubusb         mm0,        mm1
-
-        psubusb         mm1,        mm2
-        por             mm0,        mm1
-
-        movq            mm2,        mm0
-        pxor            mm3,        mm3
-
-        punpcklbw       mm0,        mm3
-        punpckhbw       mm2,        mm3
-
-        paddw           mm0,        mm2
-
-        lea             rsi,        [rsi+rax*2]
-        lea             rdi,        [rdi+rdx*2]
-
-        movd            mm4,        DWORD PTR [rsi]
-        movd            mm5,        DWORD PTR [rdi]
-
-        movd            mm6,        DWORD PTR [rsi+rax]
-        movd            mm7,        DWORD PTR [rdi+rdx]
-
-        punpcklbw       mm4,        mm6
-        punpcklbw       mm5,        mm7
-
-        movq            mm6,        mm4
-        psubusb         mm4,        mm5
-
-        psubusb         mm5,        mm6
-        por             mm4,        mm5
-
-        movq            mm5,        mm4
-        punpcklbw       mm4,        mm3
-
-        punpckhbw       mm5,        mm3
-        paddw           mm4,        mm5
-
-        paddw           mm0,        mm4
-        movq            mm1,        mm0
-
-        punpcklwd       mm0,        mm3
-        punpckhwd       mm1,        mm3
-
-        paddw           mm0,        mm1
-        movq            mm1,        mm0
-
-        psrlq           mm0,        32
-        paddw           mm0,        mm1
-
-        movq            rax,        mm0
-
-    pop rdi
-    pop rsi
-    mov rsp, rbp
-    ; begin epilog
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-;unsigned int vpx_sad16x8_mmx(
-;    unsigned char *src_ptr,
-;    int  src_stride,
-;    unsigned char *ref_ptr,
-;    int  ref_stride)
-sym(vpx_sad16x8_mmx):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 4
-    push rsi
-    push rdi
-    ; end prolog
-
-        mov             rsi,        arg(0) ;src_ptr
-        mov             rdi,        arg(2) ;ref_ptr
-
-        movsxd          rax,        dword ptr arg(1) ;src_stride
-        movsxd          rdx,        dword ptr arg(3) ;ref_stride
-
-        lea             rcx,        [rsi+rax*8]
-        pxor            mm7,        mm7
-
-        pxor            mm6,        mm6
-
-.x16x8sad_mmx_loop:
-
-        movq            mm0,       [rsi]
-        movq            mm1,       [rdi]
-
-        movq            mm2,        [rsi+8]
-        movq            mm3,        [rdi+8]
-
-        movq            mm4,        mm0
-        movq            mm5,        mm2
-
-        psubusb         mm0,        mm1
-        psubusb         mm1,        mm4
-
-        psubusb         mm2,        mm3
-        psubusb         mm3,        mm5
-
-        por             mm0,        mm1
-        por             mm2,        mm3
-
-        movq            mm1,        mm0
-        movq            mm3,        mm2
-
-        punpcklbw       mm0,        mm6
-        punpckhbw       mm1,        mm6
-
-        punpcklbw       mm2,        mm6
-        punpckhbw       mm3,        mm6
-
-
-        paddw           mm0,        mm2
-        paddw           mm1,        mm3
-
-        paddw           mm0,        mm1
-        lea             rsi,        [rsi+rax]
-
-        add             rdi,        rdx
-        paddw           mm7,        mm0
-
-        cmp             rsi,        rcx
-        jne             .x16x8sad_mmx_loop
-
-        movq            mm0,        mm7
-        punpcklwd       mm0,        mm6
-
-        punpckhwd       mm7,        mm6
-        paddw           mm0,        mm7
-
-        movq            mm7,        mm0
-        psrlq           mm0,        32
-
-        paddw           mm7,        mm0
-        movq            rax,        mm7
-
-    pop rdi
-    pop rsi
-    mov rsp, rbp
-    ; begin epilog
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
diff --git a/vpx_dsp/x86/variance_impl_mmx.asm b/vpx_dsp/x86/variance_impl_mmx.asm
index b8ba79b..f4de42a 100644
--- a/vpx_dsp/x86/variance_impl_mmx.asm
+++ b/vpx_dsp/x86/variance_impl_mmx.asm
@@ -13,407 +13,6 @@
 
 %define mmx_filter_shift            7
 
-;unsigned int vpx_get_mb_ss_mmx( short *src_ptr )
-global sym(vpx_get_mb_ss_mmx) PRIVATE
-sym(vpx_get_mb_ss_mmx):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 7
-    GET_GOT     rbx
-    push rsi
-    push rdi
-    sub         rsp, 8
-    ; end prolog
-
-        mov         rax, arg(0) ;src_ptr
-        mov         rcx, 16
-        pxor        mm4, mm4
-
-.NEXTROW:
-        movq        mm0, [rax]
-        movq        mm1, [rax+8]
-        movq        mm2, [rax+16]
-        movq        mm3, [rax+24]
-        pmaddwd     mm0, mm0
-        pmaddwd     mm1, mm1
-        pmaddwd     mm2, mm2
-        pmaddwd     mm3, mm3
-
-        paddd       mm4, mm0
-        paddd       mm4, mm1
-        paddd       mm4, mm2
-        paddd       mm4, mm3
-
-        add         rax, 32
-        dec         rcx
-        ja          .NEXTROW
-        movq        QWORD PTR [rsp], mm4
-
-        ;return sum[0]+sum[1];
-        movsxd      rax, dword ptr [rsp]
-        movsxd      rcx, dword ptr [rsp+4]
-        add         rax, rcx
-
-    ; begin epilog
-    add rsp, 8
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-;void vpx_get8x8var_mmx
-;(
-;    unsigned char *src_ptr,
-;    int  source_stride,
-;    unsigned char *ref_ptr,
-;    int  recon_stride,
-;    unsigned int *SSE,
-;    int *Sum
-;)
-global sym(vpx_get8x8var_mmx) PRIVATE
-sym(vpx_get8x8var_mmx):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    push rsi
-    push rdi
-    push rbx
-    sub         rsp, 16
-    ; end prolog
-
-        pxor        mm5, mm5                    ; Blank mmx6
-        pxor        mm6, mm6                    ; Blank mmx7
-        pxor        mm7, mm7                    ; Blank mmx7
-
-        mov         rax, arg(0) ;[src_ptr]  ; Load base addresses
-        mov         rbx, arg(2) ;[ref_ptr]
-        movsxd      rcx, dword ptr arg(1) ;[source_stride]
-        movsxd      rdx, dword ptr arg(3) ;[recon_stride]
-
-        ; Row 1
-        movq        mm0, [rax]                  ; Copy eight bytes to mm0
-        movq        mm1, [rbx]                  ; Copy eight bytes to mm1
-        movq        mm2, mm0                    ; Take copies
-        movq        mm3, mm1                    ; Take copies
-
-        punpcklbw   mm0, mm6                    ; unpack to higher prrcision
-        punpcklbw   mm1, mm6
-        punpckhbw   mm2, mm6                    ; unpack to higher prrcision
-        punpckhbw   mm3, mm6
-        psubsw      mm0, mm1                    ; A-B (low order) to MM0
-        psubsw      mm2, mm3                    ; A-B (high order) to MM2
-
-        paddw       mm5, mm0                    ; accumulate differences in mm5
-        paddw       mm5, mm2                    ; accumulate differences in mm5
-
-        pmaddwd     mm0, mm0                    ; square and accumulate
-        pmaddwd     mm2, mm2                    ; square and accumulate
-        add         rbx,rdx                     ; Inc pointer into ref data
-        add         rax,rcx                     ; Inc pointer into the new data
-        movq        mm1, [rbx]                  ; Copy eight bytes to mm1
-        paddd       mm7, mm0                    ; accumulate in mm7
-        paddd       mm7, mm2                    ; accumulate in mm7
-
-        ; Row 2
-        movq        mm0, [rax]                  ; Copy eight bytes to mm0
-        movq        mm2, mm0                    ; Take copies
-        movq        mm3, mm1                    ; Take copies
-
-        punpcklbw   mm0, mm6                    ; unpack to higher prrcision
-        punpcklbw   mm1, mm6
-        punpckhbw   mm2, mm6                    ; unpack to higher prrcision
-        punpckhbw   mm3, mm6
-        psubsw      mm0, mm1                    ; A-B (low order) to MM0
-        psubsw      mm2, mm3                    ; A-B (high order) to MM2
-
-        paddw       mm5, mm0                    ; accumulate differences in mm5
-        paddw       mm5, mm2                    ; accumulate differences in mm5
-
-        pmaddwd     mm0, mm0                    ; square and accumulate
-        pmaddwd     mm2, mm2                    ; square and accumulate
-        add         rbx,rdx                     ; Inc pointer into ref data
-        add         rax,rcx                     ; Inc pointer into the new data
-        movq        mm1, [rbx]                  ; Copy eight bytes to mm1
-        paddd       mm7, mm0                    ; accumulate in mm7
-        paddd       mm7, mm2                    ; accumulate in mm7
-
-        ; Row 3
-        movq        mm0, [rax]                  ; Copy eight bytes to mm0
-        movq        mm2, mm0                    ; Take copies
-        movq        mm3, mm1                    ; Take copies
-
-        punpcklbw   mm0, mm6                    ; unpack to higher prrcision
-        punpcklbw   mm1, mm6
-        punpckhbw   mm2, mm6                    ; unpack to higher prrcision
-        punpckhbw   mm3, mm6
-        psubsw      mm0, mm1                    ; A-B (low order) to MM0
-        psubsw      mm2, mm3                    ; A-B (high order) to MM2
-
-        paddw       mm5, mm0                    ; accumulate differences in mm5
-        paddw       mm5, mm2                    ; accumulate differences in mm5
-
-        pmaddwd     mm0, mm0                    ; square and accumulate
-        pmaddwd     mm2, mm2                    ; square and accumulate
-        add         rbx,rdx                     ; Inc pointer into ref data
-        add         rax,rcx                     ; Inc pointer into the new data
-        movq        mm1, [rbx]                  ; Copy eight bytes to mm1
-        paddd       mm7, mm0                    ; accumulate in mm7
-        paddd       mm7, mm2                    ; accumulate in mm7
-
-        ; Row 4
-        movq        mm0, [rax]                  ; Copy eight bytes to mm0
-        movq        mm2, mm0                    ; Take copies
-        movq        mm3, mm1                    ; Take copies
-
-        punpcklbw   mm0, mm6                    ; unpack to higher prrcision
-        punpcklbw   mm1, mm6
-        punpckhbw   mm2, mm6                    ; unpack to higher prrcision
-        punpckhbw   mm3, mm6
-        psubsw      mm0, mm1                    ; A-B (low order) to MM0
-        psubsw      mm2, mm3                    ; A-B (high order) to MM2
-
-        paddw       mm5, mm0                    ; accumulate differences in mm5
-        paddw       mm5, mm2                    ; accumulate differences in mm5
-
-        pmaddwd     mm0, mm0                    ; square and accumulate
-        pmaddwd     mm2, mm2                    ; square and accumulate
-        add         rbx,rdx                     ; Inc pointer into ref data
-        add         rax,rcx                     ; Inc pointer into the new data
-        movq        mm1, [rbx]                  ; Copy eight bytes to mm1
-        paddd       mm7, mm0                    ; accumulate in mm7
-        paddd       mm7, mm2                    ; accumulate in mm7
-
-        ; Row 5
-        movq        mm0, [rax]                  ; Copy eight bytes to mm0
-        movq        mm2, mm0                    ; Take copies
-        movq        mm3, mm1                    ; Take copies
-
-        punpcklbw   mm0, mm6                    ; unpack to higher prrcision
-        punpcklbw   mm1, mm6
-        punpckhbw   mm2, mm6                    ; unpack to higher prrcision
-        punpckhbw   mm3, mm6
-        psubsw      mm0, mm1                    ; A-B (low order) to MM0
-        psubsw      mm2, mm3                    ; A-B (high order) to MM2
-
-        paddw       mm5, mm0                    ; accumulate differences in mm5
-        paddw       mm5, mm2                    ; accumulate differences in mm5
-
-        pmaddwd     mm0, mm0                    ; square and accumulate
-        pmaddwd     mm2, mm2                    ; square and accumulate
-        add         rbx,rdx                     ; Inc pointer into ref data
-        add         rax,rcx                     ; Inc pointer into the new data
-        movq        mm1, [rbx]                  ; Copy eight bytes to mm1
-        ;              movq        mm4, [rbx + rdx]
-        paddd       mm7, mm0                    ; accumulate in mm7
-        paddd       mm7, mm2                    ; accumulate in mm7
-
-        ; Row 6
-        movq        mm0, [rax]                  ; Copy eight bytes to mm0
-        movq        mm2, mm0                    ; Take copies
-        movq        mm3, mm1                    ; Take copies
-
-        punpcklbw   mm0, mm6                    ; unpack to higher prrcision
-        punpcklbw   mm1, mm6
-        punpckhbw   mm2, mm6                    ; unpack to higher prrcision
-        punpckhbw   mm3, mm6
-        psubsw      mm0, mm1                    ; A-B (low order) to MM0
-        psubsw      mm2, mm3                    ; A-B (high order) to MM2
-
-        paddw       mm5, mm0                    ; accumulate differences in mm5
-        paddw       mm5, mm2                    ; accumulate differences in mm5
-
-        pmaddwd     mm0, mm0                    ; square and accumulate
-        pmaddwd     mm2, mm2                    ; square and accumulate
-        add         rbx,rdx                     ; Inc pointer into ref data
-        add         rax,rcx                     ; Inc pointer into the new data
-        movq        mm1, [rbx]                  ; Copy eight bytes to mm1
-        paddd       mm7, mm0                    ; accumulate in mm7
-        paddd       mm7, mm2                    ; accumulate in mm7
-
-        ; Row 7
-        movq        mm0, [rax]                  ; Copy eight bytes to mm0
-        movq        mm2, mm0                    ; Take copies
-        movq        mm3, mm1                    ; Take copies
-
-        punpcklbw   mm0, mm6                    ; unpack to higher prrcision
-        punpcklbw   mm1, mm6
-        punpckhbw   mm2, mm6                    ; unpack to higher prrcision
-        punpckhbw   mm3, mm6
-        psubsw      mm0, mm1                    ; A-B (low order) to MM0
-        psubsw      mm2, mm3                    ; A-B (high order) to MM2
-
-        paddw       mm5, mm0                    ; accumulate differences in mm5
-        paddw       mm5, mm2                    ; accumulate differences in mm5
-
-        pmaddwd     mm0, mm0                    ; square and accumulate
-        pmaddwd     mm2, mm2                    ; square and accumulate
-        add         rbx,rdx                     ; Inc pointer into ref data
-        add         rax,rcx                     ; Inc pointer into the new data
-        movq        mm1, [rbx]                  ; Copy eight bytes to mm1
-        paddd       mm7, mm0                    ; accumulate in mm7
-        paddd       mm7, mm2                    ; accumulate in mm7
-
-        ; Row 8
-        movq        mm0, [rax]                  ; Copy eight bytes to mm0
-        movq        mm2, mm0                    ; Take copies
-        movq        mm3, mm1                    ; Take copies
-
-        punpcklbw   mm0, mm6                    ; unpack to higher prrcision
-        punpcklbw   mm1, mm6
-        punpckhbw   mm2, mm6                    ; unpack to higher prrcision
-        punpckhbw   mm3, mm6
-        psubsw      mm0, mm1                    ; A-B (low order) to MM0
-        psubsw      mm2, mm3                    ; A-B (high order) to MM2
-
-        paddw       mm5, mm0                    ; accumulate differences in mm5
-        paddw       mm5, mm2                    ; accumulate differences in mm5
-
-        pmaddwd     mm0, mm0                    ; square and accumulate
-        pmaddwd     mm2, mm2                    ; square and accumulate
-        add         rbx,rdx                     ; Inc pointer into ref data
-        add         rax,rcx                     ; Inc pointer into the new data
-        paddd       mm7, mm0                    ; accumulate in mm7
-        paddd       mm7, mm2                    ; accumulate in mm7
-
-        ; Now accumulate the final results.
-        movq        QWORD PTR [rsp+8], mm5      ; copy back accumulated results into normal memory
-        movq        QWORD PTR [rsp], mm7        ; copy back accumulated results into normal memory
-        movsx       rdx, WORD PTR [rsp+8]
-        movsx       rcx, WORD PTR [rsp+10]
-        movsx       rbx, WORD PTR [rsp+12]
-        movsx       rax, WORD PTR [rsp+14]
-        add         rdx, rcx
-        add         rbx, rax
-        add         rdx, rbx    ;XSum
-        movsxd      rax, DWORD PTR [rsp]
-        movsxd      rcx, DWORD PTR [rsp+4]
-        add         rax, rcx    ;XXSum
-        mov         rsi, arg(4) ;SSE
-        mov         rdi, arg(5) ;Sum
-        mov         dword ptr [rsi], eax
-        mov         dword ptr [rdi], edx
-        xor         rax, rax    ; return 0
-
-    ; begin epilog
-    add rsp, 16
-    pop rbx
-    pop rdi
-    pop rsi
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-;void
-;vpx_get4x4var_mmx
-;(
-;    unsigned char *src_ptr,
-;    int  source_stride,
-;    unsigned char *ref_ptr,
-;    int  recon_stride,
-;    unsigned int *SSE,
-;    int *Sum
-;)
-global sym(vpx_get4x4var_mmx) PRIVATE
-sym(vpx_get4x4var_mmx):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    push rsi
-    push rdi
-    push rbx
-    sub         rsp, 16
-    ; end prolog
-
-        pxor        mm5, mm5                    ; Blank mmx6
-        pxor        mm6, mm6                    ; Blank mmx7
-        pxor        mm7, mm7                    ; Blank mmx7
-
-        mov         rax, arg(0) ;[src_ptr]  ; Load base addresses
-        mov         rbx, arg(2) ;[ref_ptr]
-        movsxd      rcx, dword ptr arg(1) ;[source_stride]
-        movsxd      rdx, dword ptr arg(3) ;[recon_stride]
-
-        ; Row 1
-        movd        mm0, [rax]                  ; Copy four bytes to mm0
-        movd        mm1, [rbx]                  ; Copy four bytes to mm1
-        punpcklbw   mm0, mm6                    ; unpack to higher prrcision
-        punpcklbw   mm1, mm6
-        psubsw      mm0, mm1                    ; A-B (low order) to MM0
-        paddw       mm5, mm0                    ; accumulate differences in mm5
-        pmaddwd     mm0, mm0                    ; square and accumulate
-        add         rbx,rdx                     ; Inc pointer into ref data
-        add         rax,rcx                     ; Inc pointer into the new data
-        movd        mm1, [rbx]                  ; Copy four bytes to mm1
-        paddd       mm7, mm0                    ; accumulate in mm7
-
-        ; Row 2
-        movd        mm0, [rax]                  ; Copy four bytes to mm0
-        punpcklbw   mm0, mm6                    ; unpack to higher prrcision
-        punpcklbw   mm1, mm6
-        psubsw      mm0, mm1                    ; A-B (low order) to MM0
-        paddw       mm5, mm0                    ; accumulate differences in mm5
-
-        pmaddwd     mm0, mm0                    ; square and accumulate
-        add         rbx,rdx                     ; Inc pointer into ref data
-        add         rax,rcx                     ; Inc pointer into the new data
-        movd        mm1, [rbx]                  ; Copy four bytes to mm1
-        paddd       mm7, mm0                    ; accumulate in mm7
-
-        ; Row 3
-        movd        mm0, [rax]                  ; Copy four bytes to mm0
-        punpcklbw   mm0, mm6                    ; unpack to higher precision
-        punpcklbw   mm1, mm6
-        psubsw      mm0, mm1                    ; A-B (low order) to MM0
-        paddw       mm5, mm0                    ; accumulate differences in mm5
-
-        pmaddwd     mm0, mm0                    ; square and accumulate
-        add         rbx,rdx                     ; Inc pointer into ref data
-        add         rax,rcx                     ; Inc pointer into the new data
-        movd        mm1, [rbx]                  ; Copy four bytes to mm1
-        paddd       mm7, mm0                    ; accumulate in mm7
-
-        ; Row 4
-        movd        mm0, [rax]                  ; Copy four bytes to mm0
-
-        punpcklbw   mm0, mm6                    ; unpack to higher prrcision
-        punpcklbw   mm1, mm6
-        psubsw      mm0, mm1                    ; A-B (low order) to MM0
-
-        paddw       mm5, mm0                    ; accumulate differences in mm5
-
-        pmaddwd     mm0, mm0                    ; square and accumulate
-        paddd       mm7, mm0                    ; accumulate in mm7
-
-        ; Now accumulate the final results.
-        movq        QWORD PTR [rsp+8], mm5      ; copy back accumulated results into normal memory
-        movq        QWORD PTR [rsp], mm7        ; copy back accumulated results into normal memory
-        movsx       rdx, WORD PTR [rsp+8]
-        movsx       rcx, WORD PTR [rsp+10]
-        movsx       rbx, WORD PTR [rsp+12]
-        movsx       rax, WORD PTR [rsp+14]
-        add         rdx, rcx
-        add         rbx, rax
-        add         rdx, rbx    ;XSum
-        movsxd      rax, DWORD PTR [rsp]
-        movsxd      rcx, DWORD PTR [rsp+4]
-        add         rax, rcx    ;XXSum
-        mov         rsi, arg(4) ;SSE
-        mov         rdi, arg(5) ;Sum
-        mov         dword ptr [rsi], eax
-        mov         dword ptr [rdi], edx
-        xor         rax, rax    ; return 0
-
-    ; begin epilog
-    add rsp, 16
-    pop rbx
-    pop rdi
-    pop rsi
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
 ;void vpx_filter_block2d_bil4x4_var_mmx
 ;(
 ;    unsigned char *ref_ptr,
diff --git a/vpx_dsp/x86/variance_mmx.c b/vpx_dsp/x86/variance_mmx.c
index f04f4e2..636231d 100644
--- a/vpx_dsp/x86/variance_mmx.c
+++ b/vpx_dsp/x86/variance_mmx.c
@@ -23,10 +23,6 @@
   {  16,  16,  16,  16, 112, 112, 112, 112 }
 };
 
-extern void vpx_get4x4var_mmx(const uint8_t *a, int a_stride,
-                              const uint8_t *b, int b_stride,
-                              unsigned int *sse, int *sum);
-
 extern void vpx_filter_block2d_bil4x4_var_mmx(const unsigned char *ref_ptr,
                                               int ref_pixels_per_line,
                                               const unsigned char *src_ptr,
@@ -47,98 +43,6 @@
                                            unsigned int *sumsquared);
 
 
-unsigned int vpx_variance4x4_mmx(const unsigned char *a, int a_stride,
-                                 const unsigned char *b, int b_stride,
-                                 unsigned int *sse) {
-    unsigned int var;
-    int avg;
-
-    vpx_get4x4var_mmx(a, a_stride, b, b_stride, &var, &avg);
-    *sse = var;
-    return (var - (((unsigned int)avg * avg) >> 4));
-}
-
-unsigned int vpx_variance8x8_mmx(const unsigned char *a, int a_stride,
-                                 const unsigned char *b, int b_stride,
-                                 unsigned int *sse) {
-    unsigned int var;
-    int avg;
-
-    vpx_get8x8var_mmx(a, a_stride, b, b_stride, &var, &avg);
-    *sse = var;
-
-    return (var - (((unsigned int)avg * avg) >> 6));
-}
-
-unsigned int vpx_mse16x16_mmx(const unsigned char *a, int a_stride,
-                              const unsigned char *b, int b_stride,
-                              unsigned int *sse) {
-    unsigned int sse0, sse1, sse2, sse3, var;
-    int sum0, sum1, sum2, sum3;
-
-    vpx_get8x8var_mmx(a, a_stride, b, b_stride, &sse0, &sum0);
-    vpx_get8x8var_mmx(a + 8, a_stride, b + 8, b_stride, &sse1, &sum1);
-    vpx_get8x8var_mmx(a + 8 * a_stride, a_stride,
-                      b + 8 * b_stride, b_stride, &sse2, &sum2);
-    vpx_get8x8var_mmx(a + 8 * a_stride + 8, a_stride,
-                      b + 8 * b_stride + 8, b_stride, &sse3, &sum3);
-
-    var = sse0 + sse1 + sse2 + sse3;
-    *sse = var;
-    return var;
-}
-
-unsigned int vpx_variance16x16_mmx(const unsigned char *a, int a_stride,
-                                   const unsigned char *b, int b_stride,
-                                   unsigned int *sse) {
-    unsigned int sse0, sse1, sse2, sse3, var;
-    int sum0, sum1, sum2, sum3, avg;
-
-    vpx_get8x8var_mmx(a, a_stride, b, b_stride, &sse0, &sum0);
-    vpx_get8x8var_mmx(a + 8, a_stride, b + 8, b_stride, &sse1, &sum1);
-    vpx_get8x8var_mmx(a + 8 * a_stride, a_stride,
-                      b + 8 * b_stride, b_stride, &sse2, &sum2);
-    vpx_get8x8var_mmx(a + 8 * a_stride + 8, a_stride,
-                      b + 8 * b_stride + 8, b_stride, &sse3, &sum3);
-
-    var = sse0 + sse1 + sse2 + sse3;
-    avg = sum0 + sum1 + sum2 + sum3;
-    *sse = var;
-    return (var - (((unsigned int)avg * avg) >> 8));
-}
-
-unsigned int vpx_variance16x8_mmx(const unsigned char *a, int a_stride,
-                                  const unsigned char *b, int b_stride,
-                                  unsigned int *sse) {
-    unsigned int sse0, sse1, var;
-    int sum0, sum1, avg;
-
-    vpx_get8x8var_mmx(a, a_stride, b, b_stride, &sse0, &sum0);
-    vpx_get8x8var_mmx(a + 8, a_stride, b + 8, b_stride, &sse1, &sum1);
-
-    var = sse0 + sse1;
-    avg = sum0 + sum1;
-    *sse = var;
-    return (var - (((unsigned int)avg * avg) >> 7));
-}
-
-unsigned int vpx_variance8x16_mmx(const unsigned char *a, int a_stride,
-                                  const unsigned char *b, int b_stride,
-                                  unsigned int *sse) {
-    unsigned int sse0, sse1, var;
-    int sum0, sum1, avg;
-
-    vpx_get8x8var_mmx(a, a_stride, b, b_stride, &sse0, &sum0);
-    vpx_get8x8var_mmx(a + 8 * a_stride, a_stride,
-                      b + 8 * b_stride, b_stride, &sse1, &sum1);
-
-    var = sse0 + sse1;
-    avg = sum0 + sum1;
-    *sse = var;
-
-    return (var - (((unsigned int)avg * avg) >> 7));
-}
-
 uint32_t vpx_sub_pixel_variance4x4_mmx(const uint8_t *a, int a_stride,
                                        int xoffset, int yoffset,
                                        const uint8_t *b, int b_stride,