Merge "Wide loopfilter 16 pix at a time"

diff --git a/test/convolve_test.cc b/test/convolve_test.cc
index fd2bd36..6e5002f 100644
--- a/test/convolve_test.cc
+++ b/test/convolve_test.cc

@@ -22,8 +22,8 @@
 }
 
 namespace {
-typedef void (*convolve_fn_t)(const uint8_t *src, int src_stride,
-                              uint8_t *dst, int dst_stride,
+typedef void (*convolve_fn_t)(const uint8_t *src, ptrdiff_t src_stride,
+                              uint8_t *dst, ptrdiff_t dst_stride,
                               const int16_t *filter_x, int filter_x_stride,
                               const int16_t *filter_y, int filter_y_stride,
                               int w, int h);

diff --git a/vp9/common/vp9_convolve.c b/vp9/common/vp9_convolve.c
index 914afa7..6f1e418 100644
--- a/vp9/common/vp9_convolve.c
+++ b/vp9/common/vp9_convolve.c

@@ -38,8 +38,8 @@
  */
  #define ALIGN_FILTERS_256 1
 
-static void convolve_horiz_c(const uint8_t *src, int src_stride,
-                             uint8_t *dst, int dst_stride,
+static void convolve_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
+                             uint8_t *dst, ptrdiff_t dst_stride,
                              const int16_t *filter_x0, int x_step_q4,
                              const int16_t *filter_y, int y_step_q4,
                              int w, int h, int taps) {
@@ -80,8 +80,8 @@
   }
 }
 
-static void convolve_avg_horiz_c(const uint8_t *src, int src_stride,
-                                 uint8_t *dst, int dst_stride,
+static void convolve_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
+                                 uint8_t *dst, ptrdiff_t dst_stride,
                                  const int16_t *filter_x0, int x_step_q4,
                                  const int16_t *filter_y, int y_step_q4,
                                  int w, int h, int taps) {
@@ -122,8 +122,8 @@
   }
 }
 
-static void convolve_vert_c(const uint8_t *src, int src_stride,
-                            uint8_t *dst, int dst_stride,
+static void convolve_vert_c(const uint8_t *src, ptrdiff_t src_stride,
+                            uint8_t *dst, ptrdiff_t dst_stride,
                             const int16_t *filter_x, int x_step_q4,
                             const int16_t *filter_y0, int y_step_q4,
                             int w, int h, int taps) {
@@ -164,8 +164,8 @@
   }
 }
 
-static void convolve_avg_vert_c(const uint8_t *src, int src_stride,
-                                uint8_t *dst, int dst_stride,
+static void convolve_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride,
+                                uint8_t *dst, ptrdiff_t dst_stride,
                                 const int16_t *filter_x, int x_step_q4,
                                 const int16_t *filter_y0, int y_step_q4,
                                 int w, int h, int taps) {
@@ -207,8 +207,8 @@
   }
 }
 
-static void convolve_c(const uint8_t *src, int src_stride,
-                       uint8_t *dst, int dst_stride,
+static void convolve_c(const uint8_t *src, ptrdiff_t src_stride,
+                       uint8_t *dst, ptrdiff_t dst_stride,
                        const int16_t *filter_x, int x_step_q4,
                        const int16_t *filter_y, int y_step_q4,
                        int w, int h, int taps) {
@@ -237,8 +237,8 @@
                   w, h, taps);
 }
 
-static void convolve_avg_c(const uint8_t *src, int src_stride,
-                           uint8_t *dst, int dst_stride,
+static void convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride,
+                           uint8_t *dst, ptrdiff_t dst_stride,
                            const int16_t *filter_x, int x_step_q4,
                            const int16_t *filter_y, int y_step_q4,
                            int w, int h, int taps) {
@@ -267,8 +267,8 @@
                       w, h, taps);
 }
 
-void vp9_convolve8_horiz_c(const uint8_t *src, int src_stride,
-                           uint8_t *dst, int dst_stride,
+void vp9_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
+                           uint8_t *dst, ptrdiff_t dst_stride,
                            const int16_t *filter_x, int x_step_q4,
                            const int16_t *filter_y, int y_step_q4,
                            int w, int h) {
@@ -277,8 +277,8 @@
                    w, h, 8);
 }
 
-void vp9_convolve8_avg_horiz_c(const uint8_t *src, int src_stride,
-                               uint8_t *dst, int dst_stride,
+void vp9_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
+                               uint8_t *dst, ptrdiff_t dst_stride,
                                const int16_t *filter_x, int x_step_q4,
                                const int16_t *filter_y, int y_step_q4,
                                int w, int h) {
@@ -287,8 +287,8 @@
                        w, h, 8);
 }
 
-void vp9_convolve8_vert_c(const uint8_t *src, int src_stride,
-                          uint8_t *dst, int dst_stride,
+void vp9_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride,
+                          uint8_t *dst, ptrdiff_t dst_stride,
                           const int16_t *filter_x, int x_step_q4,
                           const int16_t *filter_y, int y_step_q4,
                           int w, int h) {
@@ -297,8 +297,8 @@
                   w, h, 8);
 }
 
-void vp9_convolve8_avg_vert_c(const uint8_t *src, int src_stride,
-                              uint8_t *dst, int dst_stride,
+void vp9_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride,
+                              uint8_t *dst, ptrdiff_t dst_stride,
                               const int16_t *filter_x, int x_step_q4,
                               const int16_t *filter_y, int y_step_q4,
                               int w, int h) {
@@ -307,8 +307,8 @@
                       w, h, 8);
 }
 
-void vp9_convolve8_c(const uint8_t *src, int src_stride,
-                     uint8_t *dst, int dst_stride,
+void vp9_convolve8_c(const uint8_t *src, ptrdiff_t src_stride,
+                     uint8_t *dst, ptrdiff_t dst_stride,
                      const int16_t *filter_x, int x_step_q4,
                      const int16_t *filter_y, int y_step_q4,
                      int w, int h) {
@@ -317,8 +317,8 @@
              w, h, 8);
 }
 
-void vp9_convolve8_avg_c(const uint8_t *src, int src_stride,
-                         uint8_t *dst, int dst_stride,
+void vp9_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride,
+                         uint8_t *dst, ptrdiff_t dst_stride,
                          const int16_t *filter_x, int x_step_q4,
                          const int16_t *filter_y, int y_step_q4,
                          int w, int h) {
@@ -339,33 +339,31 @@
                    w, h);
 }
 
-void vp9_convolve_copy(const uint8_t *src, int src_stride,
-                       uint8_t *dst, int dst_stride,
-                       const int16_t *filter_x, int filter_x_stride,
-                       const int16_t *filter_y, int filter_y_stride,
-                       int w, int h) {
-  if (w == 16 && h == 16) {
-    vp9_copy_mem16x16(src, src_stride, dst, dst_stride);
-  } else if (w == 8 && h == 8) {
-    vp9_copy_mem8x8(src, src_stride, dst, dst_stride);
-  } else if (w == 8 && h == 4) {
-    vp9_copy_mem8x4(src, src_stride, dst, dst_stride);
-  } else {
-    int r;
+// Not a convolution: a block copy conforming to the convolution prototype
+// (convolve_fn_t). The filter and step arguments exist only to match that
+// prototype and are ignored; w bytes are copied for each of h rows, with
+// src and dst advanced by their respective strides after every row.
+void vp9_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride,
+                         uint8_t *dst, ptrdiff_t dst_stride,
+                         const int16_t *filter_x, int x_step_q4,
+                         const int16_t *filter_y, int y_step_q4,
+                         int w, int h) {
+  int r;
 
-    for (r = h; r > 0; --r) {
-      memcpy(dst, src, w);
-      src += src_stride;
-      dst += dst_stride;
-    }
+  for (r = h; r > 0; --r) {
+    memcpy(dst, src, w);
+    src += src_stride;
+    dst += dst_stride;
   }
 }
 
-void vp9_convolve_avg(const uint8_t *src, int src_stride,
-                      uint8_t *dst, int dst_stride,
-                      const int16_t *filter_x, int filter_x_stride,
-                      const int16_t *filter_y, int filter_y_stride,
-                      int w, int h) {
+// Not a convolution: a block average conforming to the convolution
+// prototype (convolve_fn_t).
+void vp9_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride,
+                        uint8_t *dst, ptrdiff_t dst_stride,
+                        const int16_t *filter_x, int filter_x_stride,
+                        const int16_t *filter_y, int filter_y_stride,
+                        int w, int h) {
   int x, y;
 
   for (y = 0; y < h; ++y) {

diff --git a/vp9/common/vp9_convolve.h b/vp9/common/vp9_convolve.h
index 0596080..3de8111 100644
--- a/vp9/common/vp9_convolve.h
+++ b/vp9/common/vp9_convolve.h

@@ -13,26 +13,12 @@
 #include "./vpx_config.h"
 #include "vpx/vpx_integer.h"
 
-typedef void (*convolve_fn_t)(const uint8_t *src, int src_stride,
-                              uint8_t *dst, int dst_stride,
+typedef void (*convolve_fn_t)(const uint8_t *src, ptrdiff_t src_stride,
+                              uint8_t *dst, ptrdiff_t dst_stride,
                               const int16_t *filter_x, int x_step_q4,
                               const int16_t *filter_y, int y_step_q4,
                               int w, int h);
 
-// Not a convolution, a block copy conforming to the convolution prototype
-void vp9_convolve_copy(const uint8_t *src, int src_stride,
-                       uint8_t *dst, int dst_stride,
-                       const int16_t *filter_x, int x_step_q4,
-                       const int16_t *filter_y, int y_step_q4,
-                       int w, int h);
-
-// Not a convolution, a block average conforming to the convolution prototype
-void vp9_convolve_avg(const uint8_t *src, int src_stride,
-                      uint8_t *dst, int dst_stride,
-                      const int16_t *filter_x, int x_step_q4,
-                      const int16_t *filter_y, int y_step_q4,
-                      int w, int h);
-
 struct subpix_fn_table {
   const int16_t (*filter_x)[8];
   const int16_t (*filter_y)[8];

diff --git a/vp9/common/vp9_reconinter.c b/vp9/common/vp9_reconinter.c
index 265a19a..c29fd14 100644
--- a/vp9/common/vp9_reconinter.c
+++ b/vp9/common/vp9_reconinter.c

@@ -194,93 +194,6 @@
   assert(((intptr_t)xd->subpix.filter_x & 0xff) == 0);
 }
 
-void vp9_copy_mem16x16_c(const uint8_t *src,
-                         int src_stride,
-                         uint8_t *dst,
-                         int dst_stride) {
-  int r;
-
-  for (r = 0; r < 16; r++) {
-#if !(CONFIG_FAST_UNALIGNED)
-    dst[0] = src[0];
-    dst[1] = src[1];
-    dst[2] = src[2];
-    dst[3] = src[3];
-    dst[4] = src[4];
-    dst[5] = src[5];
-    dst[6] = src[6];
-    dst[7] = src[7];
-    dst[8] = src[8];
-    dst[9] = src[9];
-    dst[10] = src[10];
-    dst[11] = src[11];
-    dst[12] = src[12];
-    dst[13] = src[13];
-    dst[14] = src[14];
-    dst[15] = src[15];
-
-#else
-    ((uint32_t *)dst)[0] = ((const uint32_t *)src)[0];
-    ((uint32_t *)dst)[1] = ((const uint32_t *)src)[1];
-    ((uint32_t *)dst)[2] = ((const uint32_t *)src)[2];
-    ((uint32_t *)dst)[3] = ((const uint32_t *)src)[3];
-
-#endif
-    src += src_stride;
-    dst += dst_stride;
-  }
-}
-
-void vp9_copy_mem8x8_c(const uint8_t *src,
-                       int src_stride,
-                       uint8_t *dst,
-                       int dst_stride) {
-  int r;
-
-  for (r = 0; r < 8; r++) {
-#if !(CONFIG_FAST_UNALIGNED)
-    dst[0] = src[0];
-    dst[1] = src[1];
-    dst[2] = src[2];
-    dst[3] = src[3];
-    dst[4] = src[4];
-    dst[5] = src[5];
-    dst[6] = src[6];
-    dst[7] = src[7];
-#else
-    ((uint32_t *)dst)[0] = ((const uint32_t *)src)[0];
-    ((uint32_t *)dst)[1] = ((const uint32_t *)src)[1];
-#endif
-    src += src_stride;
-    dst += dst_stride;
-  }
-}
-
-void vp9_copy_mem8x4_c(const uint8_t *src,
-                       int src_stride,
-                       uint8_t *dst,
-                       int dst_stride) {
-  int r;
-
-  for (r = 0; r < 4; r++) {
-#if !(CONFIG_FAST_UNALIGNED)
-    dst[0] = src[0];
-    dst[1] = src[1];
-    dst[2] = src[2];
-    dst[3] = src[3];
-    dst[4] = src[4];
-    dst[5] = src[5];
-    dst[6] = src[6];
-    dst[7] = src[7];
-#else
-    ((uint32_t *)dst)[0] = ((const uint32_t *)src)[0];
-    ((uint32_t *)dst)[1] = ((const uint32_t *)src)[1];
-#endif
-    src += src_stride;
-    dst += dst_stride;
-  }
-}
-
 void vp9_build_inter_predictor(const uint8_t *src, int src_stride,
                                uint8_t *dst, int dst_stride,
                                const int_mv *src_mv,

diff --git a/vp9/common/vp9_rtcd_defs.sh b/vp9/common/vp9_rtcd_defs.sh
index ab5e2df..56a2284 100644
--- a/vp9/common/vp9_rtcd_defs.sh
+++ b/vp9/common/vp9_rtcd_defs.sh

@@ -43,17 +43,6 @@
 #
 # RECON
 #
-prototype void vp9_copy_mem16x16 "const uint8_t *src, int src_pitch, uint8_t *dst, int dst_pitch"
-specialize vp9_copy_mem16x16 mmx sse2 dspr2
-vp9_copy_mem16x16_dspr2=vp9_copy_mem16x16_dspr2
-
-prototype void vp9_copy_mem8x8 "const uint8_t *src, int src_pitch, uint8_t *dst, int dst_pitch"
-specialize vp9_copy_mem8x8 mmx dspr2
-vp9_copy_mem8x8_dspr2=vp9_copy_mem8x8_dspr2
-
-prototype void vp9_copy_mem8x4 "const uint8_t *src, int src_pitch, uint8_t *dst, int dst_pitch"
-specialize vp9_copy_mem8x4 mmx
-
 prototype void vp9_d27_predictor_4x4 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col"
 specialize vp9_d27_predictor_4x4
 
@@ -275,22 +264,28 @@
 #
 # Sub Pixel Filters
 #
-prototype void vp9_convolve8 "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
+prototype void vp9_convolve_copy "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
+specialize vp9_convolve_copy sse2
+
+prototype void vp9_convolve_avg "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
+specialize vp9_convolve_avg sse2
+
+prototype void vp9_convolve8 "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
 specialize vp9_convolve8 ssse3
 
-prototype void vp9_convolve8_horiz "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
+prototype void vp9_convolve8_horiz "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
 specialize vp9_convolve8_horiz ssse3
 
-prototype void vp9_convolve8_vert "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
+prototype void vp9_convolve8_vert "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
 specialize vp9_convolve8_vert ssse3
 
-prototype void vp9_convolve8_avg "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
+prototype void vp9_convolve8_avg "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
 specialize vp9_convolve8_avg ssse3
 
-prototype void vp9_convolve8_avg_horiz "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
+prototype void vp9_convolve8_avg_horiz "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
 specialize vp9_convolve8_avg_horiz ssse3
 
-prototype void vp9_convolve8_avg_vert "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
+prototype void vp9_convolve8_avg_vert "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
 specialize vp9_convolve8_avg_vert ssse3
 
 #

diff --git a/vp9/common/x86/vp9_asm_stubs.c b/vp9/common/x86/vp9_asm_stubs.c
index 2b66834..98fc4dc 100644
--- a/vp9/common/x86/vp9_asm_stubs.c
+++ b/vp9/common/x86/vp9_asm_stubs.c

@@ -121,8 +121,8 @@
                                      unsigned int output_height,
                                      const short *filter);
 
-void vp9_convolve8_horiz_ssse3(const uint8_t *src, int src_stride,
-                               uint8_t *dst, int dst_stride,
+void vp9_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride,
+                               uint8_t *dst, ptrdiff_t dst_stride,
                                const int16_t *filter_x, int x_step_q4,
                                const int16_t *filter_y, int y_step_q4,
                                int w, int h) {
@@ -159,8 +159,8 @@
   }
 }
 
-void vp9_convolve8_vert_ssse3(const uint8_t *src, int src_stride,
-                              uint8_t *dst, int dst_stride,
+void vp9_convolve8_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride,
+                              uint8_t *dst, ptrdiff_t dst_stride,
                               const int16_t *filter_x, int x_step_q4,
                               const int16_t *filter_y, int y_step_q4,
                               int w, int h) {
@@ -197,8 +197,8 @@
   }
 }
 
-void vp9_convolve8_avg_horiz_ssse3(const uint8_t *src, int src_stride,
-                               uint8_t *dst, int dst_stride,
+void vp9_convolve8_avg_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride,
+                               uint8_t *dst, ptrdiff_t dst_stride,
                                const int16_t *filter_x, int x_step_q4,
                                const int16_t *filter_y, int y_step_q4,
                                int w, int h) {
@@ -235,8 +235,8 @@
   }
 }
 
-void vp9_convolve8_avg_vert_ssse3(const uint8_t *src, int src_stride,
-                              uint8_t *dst, int dst_stride,
+void vp9_convolve8_avg_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride,
+                              uint8_t *dst, ptrdiff_t dst_stride,
                               const int16_t *filter_x, int x_step_q4,
                               const int16_t *filter_y, int y_step_q4,
                               int w, int h) {
@@ -273,8 +273,8 @@
   }
 }
 
-void vp9_convolve8_ssse3(const uint8_t *src, int src_stride,
-                         uint8_t *dst, int dst_stride,
+void vp9_convolve8_ssse3(const uint8_t *src, ptrdiff_t src_stride,
+                         uint8_t *dst, ptrdiff_t dst_stride,
                          const int16_t *filter_x, int x_step_q4,
                          const int16_t *filter_y, int y_step_q4,
                          int w, int h) {
@@ -294,8 +294,8 @@
   }
 }
 
-void vp9_convolve8_avg_ssse3(const uint8_t *src, int src_stride,
-                         uint8_t *dst, int dst_stride,
+void vp9_convolve8_avg_ssse3(const uint8_t *src, ptrdiff_t src_stride,
+                         uint8_t *dst, ptrdiff_t dst_stride,
                          const int16_t *filter_x, int x_step_q4,
                          const int16_t *filter_y, int y_step_q4,
                          int w, int h) {

diff --git a/vp9/common/x86/vp9_copy_sse2.asm b/vp9/common/x86/vp9_copy_sse2.asm
new file mode 100644
index 0000000..dd522c6
--- /dev/null
+++ b/vp9/common/x86/vp9_copy_sse2.asm

@@ -0,0 +1,175 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+%include "third_party/x86inc/x86inc.asm"
+
+SECTION .text
+
+; convolve_fn <copy|avg>
+;
+; Emits convolve_%1 (convolve_copy / convolve_avg) for the convolution
+; prototype: (src, src_stride, dst, dst_stride, fx, fxs, fy, fys, w, h).
+; The filter/step arguments are never read. "copy" stores src to dst;
+; "avg" stores pavgb(src, dst), the rounded byte-wise average.
+;
+; The width selects the loop: 4, 8, 16 or 32; any other value runs the
+; 64-byte loop. Rows handled per iteration are 1 (w64), 2 (w32) and 4
+; (w16/w8/w4); the counter is only tested for zero, so h must be a positive
+; multiple of that number. The XMM paths (w >= 16) use aligned accesses to
+; dst, so each dst row must be 16-byte aligned there.
+;
+%macro convolve_fn 1
+INIT_XMM sse2
+cglobal convolve_%1, 4, 7, 4, src, src_stride, dst, dst_stride, \
+                              fx, fxs, fy, fys, w, h
+  mov r4d, dword wm
+  cmp r4d, 4
+  je .w4
+  cmp r4d, 8
+  je .w8
+  cmp r4d, 16
+  je .w16
+  cmp r4d, 32
+  je .w32
+
+  ; Any width other than 4/8/16/32 falls through to the 64-byte loop.
+  mov                    r4d, dword hm
+.loop64:
+  movu                    m0, [srcq]
+  movu                    m1, [srcq+16]
+  movu                    m2, [srcq+32]
+  movu                    m3, [srcq+48]
+  add                   srcq, src_strideq
+%ifidn %1, avg
+  pavgb                   m0, [dstq]
+  pavgb                   m1, [dstq+16]
+  pavgb                   m2, [dstq+32]
+  pavgb                   m3, [dstq+48]
+%endif
+  mova             [dstq   ], m0
+  mova             [dstq+16], m1
+  mova             [dstq+32], m2
+  mova             [dstq+48], m3
+  add                   dstq, dst_strideq
+  dec                    r4d
+  jnz .loop64
+  RET
+
+.w32:
+  mov                    r4d, dword hm
+.loop32:
+  movu                    m0, [srcq]
+  movu                    m1, [srcq+16]
+  movu                    m2, [srcq+src_strideq]
+  movu                    m3, [srcq+src_strideq+16]
+  lea                   srcq, [srcq+src_strideq*2]
+%ifidn %1, avg
+  pavgb                   m0, [dstq]
+  pavgb                   m1, [dstq            +16]
+  pavgb                   m2, [dstq+dst_strideq]
+  pavgb                   m3, [dstq+dst_strideq+16]
+%endif
+  mova [dstq               ], m0
+  mova [dstq            +16], m1
+  mova [dstq+dst_strideq   ], m2
+  mova [dstq+dst_strideq+16], m3
+  lea                   dstq, [dstq+dst_strideq*2]
+  sub                    r4d, 2
+  jnz .loop32
+  RET
+
+.w16:
+  mov                    r4d, dword hm
+  lea                    r5q, [src_strideq*3]
+  lea                    r6q, [dst_strideq*3]
+.loop16:
+  movu                    m0, [srcq]
+  movu                    m1, [srcq+src_strideq]
+  movu                    m2, [srcq+src_strideq*2]
+  movu                    m3, [srcq+r5q]
+  lea                   srcq, [srcq+src_strideq*4]
+%ifidn %1, avg
+  pavgb                   m0, [dstq]
+  pavgb                   m1, [dstq+dst_strideq]
+  pavgb                   m2, [dstq+dst_strideq*2]
+  pavgb                   m3, [dstq+r6q]
+%endif
+  mova  [dstq              ], m0
+  mova  [dstq+dst_strideq  ], m1
+  mova  [dstq+dst_strideq*2], m2
+  mova  [dstq+r6q          ], m3
+  lea                   dstq, [dstq+dst_strideq*4]
+  sub                    r4d, 4
+  jnz .loop16
+  RET
+
+; Widths 8 and 4 switch to 64-bit MMX registers from here on.
+; NOTE(review): no emms is issued before RET on these paths -- confirm that
+; callers reset the x87/MMX state before using floating point.
+INIT_MMX sse
+.w8:
+  mov                    r4d, dword hm
+  lea                    r5q, [src_strideq*3]
+  lea                    r6q, [dst_strideq*3]
+.loop8:
+  movu                    m0, [srcq]
+  movu                    m1, [srcq+src_strideq]
+  movu                    m2, [srcq+src_strideq*2]
+  movu                    m3, [srcq+r5q]
+  lea                   srcq, [srcq+src_strideq*4]
+%ifidn %1, avg
+  pavgb                   m0, [dstq]
+  pavgb                   m1, [dstq+dst_strideq]
+  pavgb                   m2, [dstq+dst_strideq*2]
+  pavgb                   m3, [dstq+r6q]
+%endif
+  mova  [dstq              ], m0
+  mova  [dstq+dst_strideq  ], m1
+  mova  [dstq+dst_strideq*2], m2
+  mova  [dstq+r6q          ], m3
+  lea                   dstq, [dstq+dst_strideq*4]
+  sub                    r4d, 4
+  jnz .loop8
+  RET
+
+.w4:
+  mov                    r4d, dword hm
+  lea                    r5q, [src_strideq*3]
+  lea                    r6q, [dst_strideq*3]
+.loop4:
+  movh                    m0, [srcq]
+  movh                    m1, [srcq+src_strideq]
+  movh                    m2, [srcq+src_strideq*2]
+  movh                    m3, [srcq+r5q]
+  lea                   srcq, [srcq+src_strideq*4]
+%ifidn %1, avg
+  ; Load only 4 bytes per dst row: a memory-operand pavgb on MMX registers
+  ; would read 8 bytes and run past the end of the 4-pixel block.
+  movh                    m4, [dstq]
+  movh                    m5, [dstq+dst_strideq]
+  movh                    m6, [dstq+dst_strideq*2]
+  movh                    m7, [dstq+r6q]
+  pavgb                   m0, m4
+  pavgb                   m1, m5
+  pavgb                   m2, m6
+  pavgb                   m3, m7
+%endif
+  movh  [dstq              ], m0
+  movh  [dstq+dst_strideq  ], m1
+  movh  [dstq+dst_strideq*2], m2
+  movh  [dstq+r6q          ], m3
+  lea                   dstq, [dstq+dst_strideq*4]
+  sub                    r4d, 4
+  jnz .loop4
+  RET
+%endmacro
+
+convolve_fn copy
+convolve_fn avg

diff --git a/vp9/common/x86/vp9_iwalsh_mmx.asm b/vp9/common/x86/vp9_iwalsh_mmx.asm
deleted file mode 100644
index 1af2521..0000000
--- a/vp9/common/x86/vp9_iwalsh_mmx.asm
+++ /dev/null

@@ -1,173 +0,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-
-;void vp9_short_inv_walsh4x4_1_mmx(short *input, short *output)
-global sym(vp9_short_inv_walsh4x4_1_mmx) PRIVATE
-sym(vp9_short_inv_walsh4x4_1_mmx):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 2
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    mov     rsi, arg(0)
-    mov     rax, 3
-
-    mov     rdi, arg(1)
-    add     rax, [rsi]          ;input[0] + 3
-
-    movd    mm0, eax
-
-    punpcklwd mm0, mm0          ;x x val val
-
-    punpckldq mm0, mm0          ;val val val val
-
-    psraw   mm0, 3            ;(input[0] + 3) >> 3
-
-    movq  [rdi + 0], mm0
-    movq  [rdi + 8], mm0
-    movq  [rdi + 16], mm0
-    movq  [rdi + 24], mm0
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-;void vp9_short_inv_walsh4x4_mmx(short *input, short *output)
-global sym(vp9_short_inv_walsh4x4_mmx) PRIVATE
-sym(vp9_short_inv_walsh4x4_mmx):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 2
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    mov     rax, 3
-    mov     rsi, arg(0)
-    mov     rdi, arg(1)
-    shl     rax, 16
-
-    movq    mm0, [rsi + 0]        ;ip[0]
-    movq    mm1, [rsi + 8]        ;ip[4]
-    or      rax, 3            ;00030003h
-
-    movq    mm2, [rsi + 16]       ;ip[8]
-    movq    mm3, [rsi + 24]       ;ip[12]
-
-    movq    mm7, rax
-    movq    mm4, mm0
-
-    punpcklwd mm7, mm7          ;0003000300030003h
-    movq    mm5, mm1
-
-    paddw   mm4, mm3          ;ip[0] + ip[12] aka al
-    paddw   mm5, mm2          ;ip[4] + ip[8] aka bl
-
-    movq    mm6, mm4          ;temp al
-
-    paddw   mm4, mm5          ;al + bl
-    psubw   mm6, mm5          ;al - bl
-
-    psubw   mm0, mm3          ;ip[0] - ip[12] aka d1
-    psubw   mm1, mm2          ;ip[4] - ip[8] aka c1
-
-    movq    mm5, mm0          ;temp dl
-
-    paddw   mm0, mm1          ;dl + cl
-    psubw   mm5, mm1          ;dl - cl
-
-    ; 03 02 01 00
-    ; 13 12 11 10
-    ; 23 22 21 20
-    ; 33 32 31 30
-
-    movq    mm3, mm4          ; 03 02 01 00
-    punpcklwd mm4, mm0          ; 11 01 10 00
-    punpckhwd mm3, mm0          ; 13 03 12 02
-
-    movq    mm1, mm6          ; 23 22 21 20
-    punpcklwd mm6, mm5          ; 31 21 30 20
-    punpckhwd mm1, mm5          ; 33 23 32 22
-
-    movq    mm0, mm4          ; 11 01 10 00
-    movq    mm2, mm3          ; 13 03 12 02
-
-    punpckldq mm0, mm6          ; 30 20 10 00 aka ip[0]
-    punpckhdq mm4, mm6          ; 31 21 11 01 aka ip[4]
-
-    punpckldq mm2, mm1          ; 32 22 12 02 aka ip[8]
-    punpckhdq mm3, mm1          ; 33 23 13 03 aka ip[12]
-;~~~~~~~~~~~~~~~~~~~~~
-    movq    mm1, mm0
-    movq    mm5, mm4
-
-    paddw   mm1, mm3          ;ip[0] + ip[12] aka al
-    paddw   mm5, mm2          ;ip[4] + ip[8] aka bl
-
-    movq    mm6, mm1          ;temp al
-
-    paddw   mm1, mm5          ;al + bl
-    psubw   mm6, mm5          ;al - bl
-
-    psubw   mm0, mm3          ;ip[0] - ip[12] aka d1
-    psubw   mm4, mm2          ;ip[4] - ip[8] aka c1
-
-    movq    mm5, mm0          ;temp dl
-
-    paddw   mm0, mm4          ;dl + cl
-    psubw   mm5, mm4          ;dl - cl
-;~~~~~~~~~~~~~~~~~~~~~
-    movq    mm3, mm1          ; 03 02 01 00
-    punpcklwd mm1, mm0          ; 11 01 10 00
-    punpckhwd mm3, mm0          ; 13 03 12 02
-
-    movq    mm4, mm6          ; 23 22 21 20
-    punpcklwd mm6, mm5          ; 31 21 30 20
-    punpckhwd mm4, mm5          ; 33 23 32 22
-
-    movq    mm0, mm1          ; 11 01 10 00
-    movq    mm2, mm3          ; 13 03 12 02
-
-    punpckldq mm0, mm6          ; 30 20 10 00 aka ip[0]
-    punpckhdq mm1, mm6          ; 31 21 11 01 aka ip[4]
-
-    punpckldq mm2, mm4          ; 32 22 12 02 aka ip[8]
-    punpckhdq mm3, mm4          ; 33 23 13 03 aka ip[12]
-
-    paddw   mm0, mm7
-    paddw   mm1, mm7
-    paddw   mm2, mm7
-    paddw   mm3, mm7
-
-    psraw   mm0, 3
-    psraw   mm1, 3
-    psraw   mm2, 3
-    psraw   mm3, 3
-
-    movq  [rdi + 0], mm0
-    movq  [rdi + 8], mm1
-    movq  [rdi + 16], mm2
-    movq  [rdi + 24], mm3
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-

diff --git a/vp9/common/x86/vp9_iwalsh_sse2.asm b/vp9/common/x86/vp9_iwalsh_sse2.asm
deleted file mode 100644
index 84fa2fe..0000000
--- a/vp9/common/x86/vp9_iwalsh_sse2.asm
+++ /dev/null

@@ -1,119 +0,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-
-;void vp9_short_inv_walsh4x4_sse2(short *input, short *output)
-global sym(vp9_short_inv_walsh4x4_sse2) PRIVATE
-sym(vp9_short_inv_walsh4x4_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 2
-    SAVE_XMM 6
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    mov     rsi, arg(0)
-    mov     rdi, arg(1)
-    mov     rax, 3
-
-    movdqa    xmm0, [rsi + 0]       ;ip[4] ip[0]
-    movdqa    xmm1, [rsi + 16]      ;ip[12] ip[8]
-
-    shl     rax, 16
-    or      rax, 3            ;00030003h
-
-    pshufd    xmm2, xmm1, 4eh       ;ip[8] ip[12]
-    movdqa    xmm3, xmm0          ;ip[4] ip[0]
-
-    paddw   xmm0, xmm2          ;ip[4]+ip[8] ip[0]+ip[12] aka b1 a1
-    psubw   xmm3, xmm2          ;ip[4]-ip[8] ip[0]-ip[12] aka c1 d1
-
-    movdqa    xmm4, xmm0
-    punpcklqdq  xmm0, xmm3          ;d1 a1
-    punpckhqdq  xmm4, xmm3          ;c1 b1
-    movd    xmm6, eax
-
-    movdqa    xmm1, xmm4          ;c1 b1
-    paddw   xmm4, xmm0          ;dl+cl a1+b1 aka op[4] op[0]
-    psubw   xmm0, xmm1          ;d1-c1 a1-b1 aka op[12] op[8]
-
-;;;temp output
-;;  movdqu  [rdi + 0], xmm4
-;;  movdqu  [rdi + 16], xmm3
-
-;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-    ; 13 12 11 10 03 02 01 00
-    ;
-    ; 33 32 31 30 23 22 21 20
-    ;
-    movdqa    xmm3, xmm4          ; 13 12 11 10 03 02 01 00
-    punpcklwd xmm4, xmm0          ; 23 03 22 02 21 01 20 00
-    punpckhwd xmm3, xmm0          ; 33 13 32 12 31 11 30 10
-    movdqa    xmm1, xmm4          ; 23 03 22 02 21 01 20 00
-    punpcklwd xmm4, xmm3          ; 31 21 11 01 30 20 10 00
-    punpckhwd xmm1, xmm3          ; 33 23 13 03 32 22 12 02
-    ;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-    pshufd    xmm2, xmm1, 4eh       ;ip[8] ip[12]
-    movdqa    xmm3, xmm4          ;ip[4] ip[0]
-
-    pshufd    xmm6, xmm6, 0       ;03 03 03 03 03 03 03 03
-
-    paddw   xmm4, xmm2          ;ip[4]+ip[8] ip[0]+ip[12] aka b1 a1
-    psubw   xmm3, xmm2          ;ip[4]-ip[8] ip[0]-ip[12] aka c1 d1
-
-    movdqa    xmm5, xmm4
-    punpcklqdq  xmm4, xmm3          ;d1 a1
-    punpckhqdq  xmm5, xmm3          ;c1 b1
-
-    movdqa    xmm1, xmm5          ;c1 b1
-    paddw   xmm5, xmm4          ;dl+cl a1+b1 aka op[4] op[0]
-    psubw   xmm4, xmm1          ;d1-c1 a1-b1 aka op[12] op[8]
-;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-    ; 13 12 11 10 03 02 01 00
-    ;
-    ; 33 32 31 30 23 22 21 20
-    ;
-    movdqa    xmm0, xmm5          ; 13 12 11 10 03 02 01 00
-    punpcklwd xmm5, xmm4          ; 23 03 22 02 21 01 20 00
-    punpckhwd xmm0, xmm4          ; 33 13 32 12 31 11 30 10
-    movdqa    xmm1, xmm5          ; 23 03 22 02 21 01 20 00
-    punpcklwd xmm5, xmm0          ; 31 21 11 01 30 20 10 00
-    punpckhwd xmm1, xmm0          ; 33 23 13 03 32 22 12 02
-;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-    paddw   xmm5, xmm6
-    paddw   xmm1, xmm6
-
-    psraw   xmm5, 3
-    psraw   xmm1, 3
-
-    movdqa  [rdi + 0], xmm5
-    movdqa  [rdi + 16], xmm1
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-SECTION_RODATA
-align 16
-x_s1sqr2:
-    times 4 dw 0x8A8C
-align 16
-x_c1sqr2less1:
-    times 4 dw 0x4E7B
-align 16
-fours:
-    times 4 dw 0x0004

diff --git a/vp9/common/x86/vp9_recon_mmx.asm b/vp9/common/x86/vp9_recon_mmx.asm
deleted file mode 100644
index 6fbbe48..0000000
--- a/vp9/common/x86/vp9_recon_mmx.asm
+++ /dev/null

@@ -1,272 +0,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-;void copy_mem8x8_mmx(
-;    unsigned char *src,
-;    int src_stride,
-;    unsigned char *dst,
-;    int dst_stride
-;    )
-global sym(vp9_copy_mem8x8_mmx) PRIVATE
-sym(vp9_copy_mem8x8_mmx):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 4
-    push        rsi
-    push        rdi
-    ; end prolog
-
-        mov         rsi,        arg(0) ;src;
-        movq        mm0,        [rsi]
-
-        movsxd      rax,        dword ptr arg(1) ;src_stride;
-        mov         rdi,        arg(2) ;dst;
-
-        movq        mm1,        [rsi+rax]
-        movq        mm2,        [rsi+rax*2]
-
-        movsxd      rcx,        dword ptr arg(3) ;dst_stride
-        lea         rsi,        [rsi+rax*2]
-
-        movq        [rdi],      mm0
-        add         rsi,        rax
-
-        movq        [rdi+rcx],      mm1
-        movq        [rdi+rcx*2],    mm2
-
-
-        lea         rdi,        [rdi+rcx*2]
-        movq        mm3,        [rsi]
-
-        add         rdi,        rcx
-        movq        mm4,        [rsi+rax]
-
-        movq        mm5,        [rsi+rax*2]
-        movq        [rdi],      mm3
-
-        lea         rsi,        [rsi+rax*2]
-        movq        [rdi+rcx],  mm4
-
-        movq        [rdi+rcx*2],    mm5
-        lea         rdi,        [rdi+rcx*2]
-
-        movq        mm0,        [rsi+rax]
-        movq        mm1,        [rsi+rax*2]
-
-        movq        [rdi+rcx],  mm0
-        movq        [rdi+rcx*2],mm1
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-;void copy_mem8x4_mmx(
-;    unsigned char *src,
-;    int src_stride,
-;    unsigned char *dst,
-;    int dst_stride
-;    )
-global sym(vp9_copy_mem8x4_mmx) PRIVATE
-sym(vp9_copy_mem8x4_mmx):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 4
-    push        rsi
-    push        rdi
-    ; end prolog
-
-        mov         rsi,        arg(0) ;src;
-        movq        mm0,        [rsi]
-
-        movsxd      rax,        dword ptr arg(1) ;src_stride;
-        mov         rdi,        arg(2) ;dst;
-
-        movq        mm1,        [rsi+rax]
-        movq        mm2,        [rsi+rax*2]
-
-        movsxd      rcx,        dword ptr arg(3) ;dst_stride
-        lea         rsi,        [rsi+rax*2]
-
-        movq        [rdi],      mm0
-        movq        [rdi+rcx],      mm1
-
-        movq        [rdi+rcx*2],    mm2
-        lea         rdi,        [rdi+rcx*2]
-
-        movq        mm3,        [rsi+rax]
-        movq        [rdi+rcx],      mm3
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-;void copy_mem16x16_mmx(
-;    unsigned char *src,
-;    int src_stride,
-;    unsigned char *dst,
-;    int dst_stride
-;    )
-global sym(vp9_copy_mem16x16_mmx) PRIVATE
-sym(vp9_copy_mem16x16_mmx):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 4
-    push        rsi
-    push        rdi
-    ; end prolog
-
-        mov         rsi,        arg(0) ;src;
-        movsxd      rax,        dword ptr arg(1) ;src_stride;
-
-        mov         rdi,        arg(2) ;dst;
-        movsxd      rcx,        dword ptr arg(3) ;dst_stride
-
-        movq        mm0,            [rsi]
-        movq        mm3,            [rsi+8];
-
-        movq        mm1,            [rsi+rax]
-        movq        mm4,            [rsi+rax+8]
-
-        movq        mm2,            [rsi+rax*2]
-        movq        mm5,            [rsi+rax*2+8]
-
-        lea         rsi,            [rsi+rax*2]
-        add         rsi,            rax
-
-        movq        [rdi],          mm0
-        movq        [rdi+8],        mm3
-
-        movq        [rdi+rcx],      mm1
-        movq        [rdi+rcx+8],    mm4
-
-        movq        [rdi+rcx*2],    mm2
-        movq        [rdi+rcx*2+8],  mm5
-
-        lea         rdi,            [rdi+rcx*2]
-        add         rdi,            rcx
-
-        movq        mm0,            [rsi]
-        movq        mm3,            [rsi+8];
-
-        movq        mm1,            [rsi+rax]
-        movq        mm4,            [rsi+rax+8]
-
-        movq        mm2,            [rsi+rax*2]
-        movq        mm5,            [rsi+rax*2+8]
-
-        lea         rsi,            [rsi+rax*2]
-        add         rsi,            rax
-
-        movq        [rdi],          mm0
-        movq        [rdi+8],        mm3
-
-        movq        [rdi+rcx],      mm1
-        movq        [rdi+rcx+8],    mm4
-
-        movq        [rdi+rcx*2],    mm2
-        movq        [rdi+rcx*2+8],  mm5
-
-        lea         rdi,            [rdi+rcx*2]
-        add         rdi,            rcx
-
-        movq        mm0,            [rsi]
-        movq        mm3,            [rsi+8];
-
-        movq        mm1,            [rsi+rax]
-        movq        mm4,            [rsi+rax+8]
-
-        movq        mm2,            [rsi+rax*2]
-        movq        mm5,            [rsi+rax*2+8]
-
-        lea         rsi,            [rsi+rax*2]
-        add         rsi,            rax
-
-        movq        [rdi],          mm0
-        movq        [rdi+8],        mm3
-
-        movq        [rdi+rcx],      mm1
-        movq        [rdi+rcx+8],    mm4
-
-        movq        [rdi+rcx*2],    mm2
-        movq        [rdi+rcx*2+8],  mm5
-
-        lea         rdi,            [rdi+rcx*2]
-        add         rdi,            rcx
-
-        movq        mm0,            [rsi]
-        movq        mm3,            [rsi+8];
-
-        movq        mm1,            [rsi+rax]
-        movq        mm4,            [rsi+rax+8]
-
-        movq        mm2,            [rsi+rax*2]
-        movq        mm5,            [rsi+rax*2+8]
-
-        lea         rsi,            [rsi+rax*2]
-        add         rsi,            rax
-
-        movq        [rdi],          mm0
-        movq        [rdi+8],        mm3
-
-        movq        [rdi+rcx],      mm1
-        movq        [rdi+rcx+8],    mm4
-
-        movq        [rdi+rcx*2],    mm2
-        movq        [rdi+rcx*2+8],  mm5
-
-        lea         rdi,            [rdi+rcx*2]
-        add         rdi,            rcx
-
-        movq        mm0,            [rsi]
-        movq        mm3,            [rsi+8];
-
-        movq        mm1,            [rsi+rax]
-        movq        mm4,            [rsi+rax+8]
-
-        movq        mm2,            [rsi+rax*2]
-        movq        mm5,            [rsi+rax*2+8]
-
-        lea         rsi,            [rsi+rax*2]
-        add         rsi,            rax
-
-        movq        [rdi],          mm0
-        movq        [rdi+8],        mm3
-
-        movq        [rdi+rcx],      mm1
-        movq        [rdi+rcx+8],    mm4
-
-        movq        [rdi+rcx*2],    mm2
-        movq        [rdi+rcx*2+8],  mm5
-
-        lea         rdi,            [rdi+rcx*2]
-        add         rdi,            rcx
-
-        movq        mm0,            [rsi]
-        movq        mm3,            [rsi+8];
-
-        movq        [rdi],          mm0
-        movq        [rdi+8],        mm3
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    UNSHADOW_ARGS
-    pop         rbp
-    ret

diff --git a/vp9/common/x86/vp9_recon_sse2.asm b/vp9/common/x86/vp9_recon_sse2.asm
deleted file mode 100644
index f7cc611..0000000
--- a/vp9/common/x86/vp9_recon_sse2.asm
+++ /dev/null

@@ -1,115 +0,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-;void copy_mem16x16_sse2(
-;    unsigned char *src,
-;    int src_stride,
-;    unsigned char *dst,
-;    int dst_stride
-;    )
-global sym(vp9_copy_mem16x16_sse2) PRIVATE
-sym(vp9_copy_mem16x16_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 4
-    push        rsi
-    push        rdi
-    ; end prolog
-
-        mov         rsi,        arg(0) ;src;
-        movdqu      xmm0,       [rsi]
-
-        movsxd      rax,        dword ptr arg(1) ;src_stride;
-        mov         rdi,        arg(2) ;dst;
-
-        movdqu      xmm1,       [rsi+rax]
-        movdqu      xmm2,       [rsi+rax*2]
-
-        movsxd      rcx,        dword ptr arg(3) ;dst_stride
-        lea         rsi,        [rsi+rax*2]
-
-        movdqa      [rdi],      xmm0
-        add         rsi,        rax
-
-        movdqa      [rdi+rcx],  xmm1
-        movdqa      [rdi+rcx*2],xmm2
-
-        lea         rdi,        [rdi+rcx*2]
-        movdqu      xmm3,       [rsi]
-
-        add         rdi,        rcx
-        movdqu      xmm4,       [rsi+rax]
-
-        movdqu      xmm5,       [rsi+rax*2]
-        lea         rsi,        [rsi+rax*2]
-
-        movdqa      [rdi],  xmm3
-        add         rsi,        rax
-
-        movdqa      [rdi+rcx],  xmm4
-        movdqa      [rdi+rcx*2],xmm5
-
-        lea         rdi,        [rdi+rcx*2]
-        movdqu      xmm0,       [rsi]
-
-        add         rdi,        rcx
-        movdqu      xmm1,       [rsi+rax]
-
-        movdqu      xmm2,       [rsi+rax*2]
-        lea         rsi,        [rsi+rax*2]
-
-        movdqa      [rdi],      xmm0
-        add         rsi,        rax
-
-        movdqa      [rdi+rcx],  xmm1
-
-        movdqa      [rdi+rcx*2],    xmm2
-        movdqu      xmm3,       [rsi]
-
-        movdqu      xmm4,       [rsi+rax]
-        lea         rdi,        [rdi+rcx*2]
-
-        add         rdi,        rcx
-        movdqu      xmm5,       [rsi+rax*2]
-
-        lea         rsi,        [rsi+rax*2]
-        movdqa      [rdi],  xmm3
-
-        add         rsi,        rax
-        movdqa      [rdi+rcx],  xmm4
-
-        movdqa      [rdi+rcx*2],xmm5
-        movdqu      xmm0,       [rsi]
-
-        lea         rdi,        [rdi+rcx*2]
-        movdqu      xmm1,       [rsi+rax]
-
-        add         rdi,        rcx
-        movdqu      xmm2,       [rsi+rax*2]
-
-        lea         rsi,        [rsi+rax*2]
-        movdqa      [rdi],      xmm0
-
-        movdqa      [rdi+rcx],  xmm1
-        movdqa      [rdi+rcx*2],xmm2
-
-        movdqu      xmm3,       [rsi+rax]
-        lea         rdi,        [rdi+rcx*2]
-
-        movdqa      [rdi+rcx],  xmm3
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    UNSHADOW_ARGS
-    pop         rbp
-    ret

diff --git a/vp9/common/x86/vp9_sadmxn_sse2.c b/vp9/common/x86/vp9_sadmxn_sse2.c
deleted file mode 100644
index ed873a5..0000000
--- a/vp9/common/x86/vp9_sadmxn_sse2.c
+++ /dev/null

@@ -1,95 +0,0 @@
-/*
- *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <emmintrin.h>  /* SSE2 */
-#include "vpx/vpx_integer.h"
-#include "vpx_ports/emmintrin_compat.h"
-
-unsigned int vp9_sad16x3_sse2(
-  const unsigned char *src_ptr,
-  int  src_stride,
-  const unsigned char *ref_ptr,
-  int  ref_stride) {
-  __m128i s0, s1, s2;
-  __m128i r0, r1, r2;
-  __m128i sad;
-
-  s0 = _mm_loadu_si128((const __m128i *)(src_ptr + 0 * src_stride));
-  s1 = _mm_loadu_si128((const __m128i *)(src_ptr + 1 * src_stride));
-  s2 = _mm_loadu_si128((const __m128i *)(src_ptr + 2 * src_stride));
-
-  r0 = _mm_loadu_si128((const __m128i *)(ref_ptr + 0 * ref_stride));
-  r1 = _mm_loadu_si128((const __m128i *)(ref_ptr + 1 * ref_stride));
-  r2 = _mm_loadu_si128((const __m128i *)(ref_ptr + 2 * ref_stride));
-
-  sad = _mm_sad_epu8(s0, r0);
-  sad = _mm_add_epi16(sad,  _mm_sad_epu8(s1, r1));
-  sad = _mm_add_epi16(sad,  _mm_sad_epu8(s2, r2));
-  sad = _mm_add_epi16(sad,  _mm_srli_si128(sad, 8));
-
-  return _mm_cvtsi128_si32(sad);
-}
-
-unsigned int vp9_sad3x16_sse2(
-  const unsigned char *src_ptr,
-  int  src_stride,
-  const unsigned char *ref_ptr,
-  int  ref_stride) {
-  int r;
-  __m128i s0, s1, s2, s3;
-  __m128i r0, r1, r2, r3;
-  __m128i sad = _mm_setzero_si128();
-  __m128i mask;
-  const int offset = (uintptr_t)src_ptr & 3;
-
-  /* In current use case, the offset is 1 if CONFIG_SUBPELREFMV is off.
-   * Here, for offset=1, we adjust src_ptr to be 4-byte aligned. Then, movd
-   * takes much less time.
-   */
-  if (offset == 1)
-    src_ptr -= 1;
-
-  /* mask = 0xffffffffffff0000ffffffffffff0000 */
-  mask = _mm_cmpeq_epi32(sad, sad);
-  mask = _mm_slli_epi64(mask, 16);
-
-  for (r = 0; r < 16; r += 4) {
-    s0 = _mm_cvtsi32_si128 (*(const int *)(src_ptr + 0 * src_stride));
-    s1 = _mm_cvtsi32_si128 (*(const int *)(src_ptr + 1 * src_stride));
-    s2 = _mm_cvtsi32_si128 (*(const int *)(src_ptr + 2 * src_stride));
-    s3 = _mm_cvtsi32_si128 (*(const int *)(src_ptr + 3 * src_stride));
-    r0 = _mm_cvtsi32_si128 (*(const int *)(ref_ptr + 0 * ref_stride));
-    r1 = _mm_cvtsi32_si128 (*(const int *)(ref_ptr + 1 * ref_stride));
-    r2 = _mm_cvtsi32_si128 (*(const int *)(ref_ptr + 2 * ref_stride));
-    r3 = _mm_cvtsi32_si128 (*(const int *)(ref_ptr + 3 * ref_stride));
-
-    s0 = _mm_unpacklo_epi8(s0, s1);
-    r0 = _mm_unpacklo_epi8(r0, r1);
-    s2 = _mm_unpacklo_epi8(s2, s3);
-    r2 = _mm_unpacklo_epi8(r2, r3);
-    s0 = _mm_unpacklo_epi64(s0, s2);
-    r0 = _mm_unpacklo_epi64(r0, r2);
-
-    // throw out extra byte
-    if (offset == 1)
-      s0 = _mm_and_si128(s0, mask);
-    else
-      s0 = _mm_slli_epi64(s0, 16);
-    r0 = _mm_slli_epi64(r0, 16);
-
-    sad = _mm_add_epi16(sad, _mm_sad_epu8(s0, r0));
-
-    src_ptr += src_stride*4;
-    ref_ptr += ref_stride*4;
-  }
-
-  sad = _mm_add_epi16(sad,  _mm_srli_si128(sad, 8));
-  return _mm_cvtsi128_si32(sad);
-}

diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c
index 4cb38f7..f424679 100644
--- a/vp9/encoder/vp9_rdopt.c
+++ b/vp9/encoder/vp9_rdopt.c

@@ -1288,9 +1288,9 @@
                                                           block, 16),
                                       16, &ssz) >> 2;
 
-        if (best_tx_type != DCT_DCT)
+        if (tx_type != DCT_DCT)
           vp9_short_iht4x4_add(BLOCK_OFFSET(pd->dqcoeff, block, 16),
-                               dst, pd->dst.stride, best_tx_type);
+                               dst, pd->dst.stride, tx_type);
         else
           xd->inv_txm4x4_add(BLOCK_OFFSET(pd->dqcoeff, block, 16),
                              dst, pd->dst.stride);

diff --git a/vp9/encoder/x86/vp9_dct_mmx.asm b/vp9/encoder/x86/vp9_dct_mmx.asm
deleted file mode 100644
index 54766d8..0000000
--- a/vp9/encoder/x86/vp9_dct_mmx.asm
+++ /dev/null

@@ -1,241 +0,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-
-;void vp9_short_fdct4x4_mmx(short *input, short *output, int pitch)
-global sym(vp9_short_fdct4x4_mmx) PRIVATE
-sym(vp9_short_fdct4x4_mmx):
-    push        rbp
-    mov         rbp,        rsp
-    SHADOW_ARGS_TO_STACK 3
-    GET_GOT     rbx
-    push        rsi
-    push        rdi
-    ; end prolog
-
-        mov         rsi,        arg(0)      ; input
-        mov         rdi,        arg(1)      ; output
-
-        movsxd      rax,        dword ptr arg(2) ;pitch
-
-        lea         rcx,        [rsi + rax*2]
-        ; read the input data
-        movq        mm0,        [rsi]
-        movq        mm1,        [rsi + rax]
-
-        movq        mm2,        [rcx]
-        movq        mm4,        [rcx + rax]
-
-        ; transpose for the first stage
-        movq        mm3,        mm0         ; 00 01 02 03
-        movq        mm5,        mm2         ; 20 21 22 23
-
-        punpcklwd   mm0,        mm1         ; 00 10 01 11
-        punpckhwd   mm3,        mm1         ; 02 12 03 13
-
-        punpcklwd   mm2,        mm4         ; 20 30 21 31
-        punpckhwd   mm5,        mm4         ; 22 32 23 33
-
-        movq        mm1,        mm0         ; 00 10 01 11
-        punpckldq   mm0,        mm2         ; 00 10 20 30
-
-        punpckhdq   mm1,        mm2         ; 01 11 21 31
-
-        movq        mm2,        mm3         ; 02 12 03 13
-        punpckldq   mm2,        mm5         ; 02 12 22 32
-
-        punpckhdq   mm3,        mm5         ; 03 13 23 33
-
-        ; mm0 0
-        ; mm1 1
-        ; mm2 2
-        ; mm3 3
-
-        ; first stage
-        movq        mm5,        mm0
-        movq        mm4,        mm1
-
-        paddw       mm0,        mm3         ; a1 = 0 + 3
-        paddw       mm1,        mm2         ; b1 = 1 + 2
-
-        psubw       mm4,        mm2         ; c1 = 1 - 2
-        psubw       mm5,        mm3         ; d1 = 0 - 3
-
-        psllw       mm5,        3
-        psllw       mm4,        3
-
-        psllw       mm0,        3
-        psllw       mm1,        3
-
-        ; output 0 and 2
-        movq        mm2,        mm0         ; a1
-
-        paddw       mm0,        mm1         ; op[0] = a1 + b1
-        psubw       mm2,        mm1         ; op[2] = a1 - b1
-
-        ; output 1 and 3
-        ; interleave c1, d1
-        movq        mm1,        mm5         ; d1
-        punpcklwd   mm1,        mm4         ; c1 d1
-        punpckhwd   mm5,        mm4         ; c1 d1
-
-        movq        mm3,        mm1
-        movq        mm4,        mm5
-
-        pmaddwd     mm1,        MMWORD PTR[GLOBAL (_5352_2217)]    ; c1*2217 + d1*5352
-        pmaddwd     mm4,        MMWORD PTR[GLOBAL (_5352_2217)]    ; c1*2217 + d1*5352
-
-        pmaddwd     mm3,        MMWORD PTR[GLOBAL(_2217_neg5352)]  ; d1*2217 - c1*5352
-        pmaddwd     mm5,        MMWORD PTR[GLOBAL(_2217_neg5352)]  ; d1*2217 - c1*5352
-
-        paddd       mm1,        MMWORD PTR[GLOBAL(_14500)]
-        paddd       mm4,        MMWORD PTR[GLOBAL(_14500)]
-        paddd       mm3,        MMWORD PTR[GLOBAL(_7500)]
-        paddd       mm5,        MMWORD PTR[GLOBAL(_7500)]
-
-        psrad       mm1,        12          ; (c1 * 2217 + d1 * 5352 +  14500)>>12
-        psrad       mm4,        12          ; (c1 * 2217 + d1 * 5352 +  14500)>>12
-        psrad       mm3,        12          ; (d1 * 2217 - c1 * 5352 +   7500)>>12
-        psrad       mm5,        12          ; (d1 * 2217 - c1 * 5352 +   7500)>>12
-
-        packssdw    mm1,        mm4         ; op[1]
-        packssdw    mm3,        mm5         ; op[3]
-
-        ; done with vertical
-        ; transpose for the second stage
-        movq        mm4,        mm0         ; 00 10 20 30
-        movq        mm5,        mm2         ; 02 12 22 32
-
-        punpcklwd   mm0,        mm1         ; 00 01 10 11
-        punpckhwd   mm4,        mm1         ; 20 21 30 31
-
-        punpcklwd   mm2,        mm3         ; 02 03 12 13
-        punpckhwd   mm5,        mm3         ; 22 23 32 33
-
-        movq        mm1,        mm0         ; 00 01 10 11
-        punpckldq   mm0,        mm2         ; 00 01 02 03
-
-        punpckhdq   mm1,        mm2         ; 01 22 12 13
-
-        movq        mm2,        mm4         ; 20 31 30 31
-        punpckldq   mm2,        mm5         ; 20 21 22 23
-
-        punpckhdq   mm4,        mm5         ; 30 31 32 33
-
-        ; mm0 0
-        ; mm1 1
-        ; mm2 2
-        ; mm3 4
-
-        movq        mm5,        mm0
-        movq        mm3,        mm1
-
-        paddw       mm0,        mm4         ; a1 = 0 + 3
-        paddw       mm1,        mm2         ; b1 = 1 + 2
-
-        psubw       mm3,        mm2         ; c1 = 1 - 2
-        psubw       mm5,        mm4         ; d1 = 0 - 3
-
-        pxor        mm6,        mm6         ; zero out for compare
-
-        pcmpeqw     mm6,        mm5         ; d1 != 0
-
-        pandn       mm6,        MMWORD PTR[GLOBAL(_cmp_mask)]   ; clear upper,
-                                                                ; and keep bit 0 of lower
-
-        ; output 0 and 2
-        movq        mm2,        mm0         ; a1
-
-        paddw       mm0,        mm1         ; a1 + b1
-        psubw       mm2,        mm1         ; a1 - b1
-
-        paddw       mm0,        MMWORD PTR[GLOBAL(_7w)]
-        paddw       mm2,        MMWORD PTR[GLOBAL(_7w)]
-
-        psraw       mm0,        4           ; op[0] = (a1 + b1 + 7)>>4
-        psraw       mm2,        4           ; op[8] = (a1 - b1 + 7)>>4
-
-        movq        MMWORD PTR[rdi + 0 ],  mm0
-        movq        MMWORD PTR[rdi + 16],  mm2
-
-        ; output 1 and 3
-        ; interleave c1, d1
-        movq        mm1,        mm5         ; d1
-        punpcklwd   mm1,        mm3         ; c1 d1
-        punpckhwd   mm5,        mm3         ; c1 d1
-
-        movq        mm3,        mm1
-        movq        mm4,        mm5
-
-        pmaddwd     mm1,        MMWORD PTR[GLOBAL (_5352_2217)]    ; c1*2217 + d1*5352
-        pmaddwd     mm4,        MMWORD PTR[GLOBAL (_5352_2217)]    ; c1*2217 + d1*5352
-
-        pmaddwd     mm3,        MMWORD PTR[GLOBAL(_2217_neg5352)]  ; d1*2217 - c1*5352
-        pmaddwd     mm5,        MMWORD PTR[GLOBAL(_2217_neg5352)]  ; d1*2217 - c1*5352
-
-        paddd       mm1,        MMWORD PTR[GLOBAL(_12000)]
-        paddd       mm4,        MMWORD PTR[GLOBAL(_12000)]
-        paddd       mm3,        MMWORD PTR[GLOBAL(_51000)]
-        paddd       mm5,        MMWORD PTR[GLOBAL(_51000)]
-
-        psrad       mm1,        16          ; (c1 * 2217 + d1 * 5352 +  14500)>>16
-        psrad       mm4,        16          ; (c1 * 2217 + d1 * 5352 +  14500)>>16
-        psrad       mm3,        16          ; (d1 * 2217 - c1 * 5352 +   7500)>>16
-        psrad       mm5,        16          ; (d1 * 2217 - c1 * 5352 +   7500)>>16
-
-        packssdw    mm1,        mm4         ; op[4]
-        packssdw    mm3,        mm5         ; op[12]
-
-        paddw       mm1,        mm6         ; op[4] += (d1!=0)
-
-        movq        MMWORD PTR[rdi + 8 ],  mm1
-        movq        MMWORD PTR[rdi + 24],  mm3
-
-     ; begin epilog
-    pop         rdi
-    pop         rsi
-    RESTORE_GOT
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-SECTION_RODATA
-align 8
-_5352_2217:
-    dw 5352
-    dw 2217
-    dw 5352
-    dw 2217
-align 8
-_2217_neg5352:
-    dw 2217
-    dw -5352
-    dw 2217
-    dw -5352
-align 8
-_cmp_mask:
-    times 4 dw 1
-align 8
-_7w:
-    times 4 dw 7
-align 8
-_14500:
-    times 2 dd 14500
-align 8
-_7500:
-    times 2 dd 7500
-align 8
-_12000:
-    times 2 dd 12000
-align 8
-_51000:
-    times 2 dd 51000

diff --git a/vp9/encoder/x86/vp9_dct_mmx.h b/vp9/encoder/x86/vp9_dct_mmx.h
deleted file mode 100644
index 3bac7c8..0000000
--- a/vp9/encoder/x86/vp9_dct_mmx.h
+++ /dev/null

@@ -1,17 +0,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#ifndef VP9_ENCODER_X86_VP9_DCT_MMX_H_
-#define VP9_ENCODER_X86_VP9_DCT_MMX_H_
-
-extern void vp9_short_fdct4x4_mmx(short *input, short *output, int pitch);
-
-
-#endif /* VP9_ENCODER_X86_VP9_DCT_MMX_H_ */

diff --git a/vp9/encoder/x86/vp9_fwalsh_sse2.asm b/vp9/encoder/x86/vp9_fwalsh_sse2.asm
deleted file mode 100644
index 7bee9ef..0000000
--- a/vp9/encoder/x86/vp9_fwalsh_sse2.asm
+++ /dev/null

@@ -1,164 +0,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-
-;void vp9_short_walsh4x4_sse2(short *input, short *output, int pitch)
-global sym(vp9_short_walsh4x4_sse2) PRIVATE
-sym(vp9_short_walsh4x4_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 3
-    SAVE_XMM 7
-    GET_GOT     rbx
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    mov     rsi, arg(0)           ; input
-    mov     rdi, arg(1)           ; output
-    movsxd  rdx, dword ptr arg(2) ; pitch
-
-    ; first for loop
-    movq    xmm0, MMWORD PTR [rsi]           ; load input
-    movq    xmm1, MMWORD PTR [rsi + rdx]
-    lea     rsi,  [rsi + rdx*2]
-    movq    xmm2, MMWORD PTR [rsi]
-    movq    xmm3, MMWORD PTR [rsi + rdx]
-
-    punpcklwd xmm0,  xmm1
-    punpcklwd xmm2,  xmm3
-
-    movdqa    xmm1, xmm0
-    punpckldq xmm0, xmm2           ; ip[1] ip[0]
-    punpckhdq xmm1, xmm2           ; ip[3] ip[2]
-
-    movdqa    xmm2, xmm0
-    paddw     xmm0, xmm1
-    psubw     xmm2, xmm1
-
-    psllw     xmm0, 2              ; d1  a1
-    psllw     xmm2, 2              ; c1  b1
-
-    movdqa    xmm1, xmm0
-    punpcklqdq xmm0, xmm2          ; b1  a1
-    punpckhqdq xmm1, xmm2          ; c1  d1
-
-    pxor      xmm6, xmm6
-    movq      xmm6, xmm0
-    pxor      xmm7, xmm7
-    pcmpeqw   xmm7, xmm6
-    paddw     xmm7, [GLOBAL(c1)]
-
-    movdqa    xmm2, xmm0
-    paddw     xmm0, xmm1           ; b1+c1  a1+d1
-    psubw     xmm2, xmm1           ; b1-c1  a1-d1
-    paddw     xmm0, xmm7           ; b1+c1  a1+d1+(a1!=0)
-
-    ; second for loop
-    ; input: 13  9  5  1 12  8  4  0 (xmm0)
-    ;        14 10  6  2 15 11  7  3 (xmm2)
-    ; after shuffle:
-    ;        13  5  9  1 12  4  8  0 (xmm0)
-    ;        14  6 10  2 15  7 11  3 (xmm1)
-    pshuflw   xmm3, xmm0, 0xd8
-    pshufhw   xmm0, xmm3, 0xd8
-    pshuflw   xmm3, xmm2, 0xd8
-    pshufhw   xmm1, xmm3, 0xd8
-
-    movdqa    xmm2, xmm0
-    pmaddwd   xmm0, [GLOBAL(c1)]    ; d11 a11 d10 a10
-    pmaddwd   xmm2, [GLOBAL(cn1)]   ; c11 b11 c10 b10
-    movdqa    xmm3, xmm1
-    pmaddwd   xmm1, [GLOBAL(c1)]    ; d12 a12 d13 a13
-    pmaddwd   xmm3, [GLOBAL(cn1)]   ; c12 b12 c13 b13
-
-    pshufd    xmm4, xmm0, 0xd8      ; d11 d10 a11 a10
-    pshufd    xmm5, xmm2, 0xd8      ; c11 c10 b11 b10
-    pshufd    xmm6, xmm1, 0x72      ; d13 d12 a13 a12
-    pshufd    xmm7, xmm3, 0x72      ; c13 c12 b13 b12
-
-    movdqa    xmm0, xmm4
-    punpcklqdq xmm0, xmm5           ; b11 b10 a11 a10
-    punpckhqdq xmm4, xmm5           ; c11 c10 d11 d10
-    movdqa    xmm1, xmm6
-    punpcklqdq xmm1, xmm7           ; b13 b12 a13 a12
-    punpckhqdq xmm6, xmm7           ; c13 c12 d13 d12
-
-    movdqa    xmm2, xmm0
-    paddd     xmm0, xmm4            ; b21 b20 a21 a20
-    psubd     xmm2, xmm4            ; c21 c20 d21 d20
-    movdqa    xmm3, xmm1
-    paddd     xmm1, xmm6            ; b23 b22 a23 a22
-    psubd     xmm3, xmm6            ; c23 c22 d23 d22
-
-    pxor      xmm4, xmm4
-    movdqa    xmm5, xmm4
-    pcmpgtd   xmm4, xmm0
-    pcmpgtd   xmm5, xmm2
-    pand      xmm4, [GLOBAL(cd1)]
-    pand      xmm5, [GLOBAL(cd1)]
-
-    pxor      xmm6, xmm6
-    movdqa    xmm7, xmm6
-    pcmpgtd   xmm6, xmm1
-    pcmpgtd   xmm7, xmm3
-    pand      xmm6, [GLOBAL(cd1)]
-    pand      xmm7, [GLOBAL(cd1)]
-
-    paddd     xmm0, xmm4
-    paddd     xmm2, xmm5
-    paddd     xmm0, [GLOBAL(cd3)]
-    paddd     xmm2, [GLOBAL(cd3)]
-    paddd     xmm1, xmm6
-    paddd     xmm3, xmm7
-    paddd     xmm1, [GLOBAL(cd3)]
-    paddd     xmm3, [GLOBAL(cd3)]
-
-    psrad     xmm0, 3
-    psrad     xmm1, 3
-    psrad     xmm2, 3
-    psrad     xmm3, 3
-    movdqa    xmm4, xmm0
-    punpcklqdq xmm0, xmm1           ; a23 a22 a21 a20
-    punpckhqdq xmm4, xmm1           ; b23 b22 b21 b20
-    movdqa    xmm5, xmm2
-    punpckhqdq xmm2, xmm3           ; c23 c22 c21 c20
-    punpcklqdq xmm5, xmm3           ; d23 d22 d21 d20
-
-    packssdw  xmm0, xmm4            ; b23 b22 b21 b20 a23 a22 a21 a20
-    packssdw  xmm2, xmm5            ; d23 d22 d21 d20 c23 c22 c21 c20
-
-    movdqa  XMMWORD PTR [rdi], xmm0
-    movdqa  XMMWORD PTR [rdi + 16], xmm2
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-SECTION_RODATA
-align 16
-c1:
-    dw 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001
-align 16
-cn1:
-    dw 0x0001, 0xffff, 0x0001, 0xffff, 0x0001, 0xffff, 0x0001, 0xffff
-align 16
-cd1:
-    dd 0x00000001, 0x00000001, 0x00000001, 0x00000001
-align 16
-cd3:
-    dd 0x00000003, 0x00000003, 0x00000003, 0x00000003

diff --git a/vp9/vp9_common.mk b/vp9/vp9_common.mk
index 95ea60b..ee744d5 100644
--- a/vp9/vp9_common.mk
+++ b/vp9/vp9_common.mk

@@ -75,12 +75,9 @@
 VP9_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/vp9_loopfilter_intrin_sse2.c
 VP9_COMMON_SRCS-$(CONFIG_POSTPROC) += common/vp9_postproc.h
 VP9_COMMON_SRCS-$(CONFIG_POSTPROC) += common/vp9_postproc.c
-VP9_COMMON_SRCS-$(HAVE_MMX) += common/x86/vp9_iwalsh_mmx.asm
-VP9_COMMON_SRCS-$(HAVE_MMX) += common/x86/vp9_recon_mmx.asm
 VP9_COMMON_SRCS-$(HAVE_MMX) += common/x86/vp9_loopfilter_mmx.asm
-VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_iwalsh_sse2.asm
+VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_copy_sse2.asm
 VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_loopfilter_sse2.asm
-VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_recon_sse2.asm
 VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_intrapred_sse2.asm
 VP9_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/vp9_intrapred_ssse3.asm
 VP9_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/vp9_subpixel_8t_ssse3.asm
@@ -90,7 +87,6 @@
 endif
 
 VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_idct_intrin_sse2.c
-VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_sadmxn_sse2.c
 
 VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_loopfilter_neon$(ASM)
 

diff --git a/vp9/vp9cx.mk b/vp9/vp9cx.mk
index 12a49f8..dee83c9 100644
--- a/vp9/vp9cx.mk
+++ b/vp9/vp9cx.mk

@@ -78,13 +78,10 @@
 VP9_CX_SRCS-$(HAVE_MMX) += encoder/x86/vp9_variance_mmx.c
 VP9_CX_SRCS-$(HAVE_MMX) += encoder/x86/vp9_variance_impl_mmx.asm
 VP9_CX_SRCS-$(HAVE_MMX) += encoder/x86/vp9_sad_mmx.asm
-VP9_CX_SRCS-$(HAVE_MMX) += encoder/x86/vp9_dct_mmx.asm
-VP9_CX_SRCS-$(HAVE_MMX) += encoder/x86/vp9_dct_mmx.h
 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_variance_sse2.c
 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_variance_impl_sse2.asm
 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_sad_sse2.asm
 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_sad4d_sse2.asm
-VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_fwalsh_sse2.asm
 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_subpel_variance.asm
 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_subtract_sse2.asm
 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_error_sse2.asm