Merge "Wide loopfilter 16 pix at a time"
diff --git a/test/convolve_test.cc b/test/convolve_test.cc
index fd2bd36..6e5002f 100644
--- a/test/convolve_test.cc
+++ b/test/convolve_test.cc
@@ -22,8 +22,8 @@
}
namespace {
-typedef void (*convolve_fn_t)(const uint8_t *src, int src_stride,
- uint8_t *dst, int dst_stride,
+typedef void (*convolve_fn_t)(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int filter_x_stride,
const int16_t *filter_y, int filter_y_stride,
int w, int h);
diff --git a/vp9/common/vp9_convolve.c b/vp9/common/vp9_convolve.c
index 914afa7..6f1e418 100644
--- a/vp9/common/vp9_convolve.c
+++ b/vp9/common/vp9_convolve.c
@@ -38,8 +38,8 @@
*/
#define ALIGN_FILTERS_256 1
-static void convolve_horiz_c(const uint8_t *src, int src_stride,
- uint8_t *dst, int dst_stride,
+static void convolve_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x0, int x_step_q4,
const int16_t *filter_y, int y_step_q4,
int w, int h, int taps) {
@@ -80,8 +80,8 @@
}
}
-static void convolve_avg_horiz_c(const uint8_t *src, int src_stride,
- uint8_t *dst, int dst_stride,
+static void convolve_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x0, int x_step_q4,
const int16_t *filter_y, int y_step_q4,
int w, int h, int taps) {
@@ -122,8 +122,8 @@
}
}
-static void convolve_vert_c(const uint8_t *src, int src_stride,
- uint8_t *dst, int dst_stride,
+static void convolve_vert_c(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int x_step_q4,
const int16_t *filter_y0, int y_step_q4,
int w, int h, int taps) {
@@ -164,8 +164,8 @@
}
}
-static void convolve_avg_vert_c(const uint8_t *src, int src_stride,
- uint8_t *dst, int dst_stride,
+static void convolve_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int x_step_q4,
const int16_t *filter_y0, int y_step_q4,
int w, int h, int taps) {
@@ -207,8 +207,8 @@
}
}
-static void convolve_c(const uint8_t *src, int src_stride,
- uint8_t *dst, int dst_stride,
+static void convolve_c(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int x_step_q4,
const int16_t *filter_y, int y_step_q4,
int w, int h, int taps) {
@@ -237,8 +237,8 @@
w, h, taps);
}
-static void convolve_avg_c(const uint8_t *src, int src_stride,
- uint8_t *dst, int dst_stride,
+static void convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int x_step_q4,
const int16_t *filter_y, int y_step_q4,
int w, int h, int taps) {
@@ -267,8 +267,8 @@
w, h, taps);
}
-void vp9_convolve8_horiz_c(const uint8_t *src, int src_stride,
- uint8_t *dst, int dst_stride,
+void vp9_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int x_step_q4,
const int16_t *filter_y, int y_step_q4,
int w, int h) {
@@ -277,8 +277,8 @@
w, h, 8);
}
-void vp9_convolve8_avg_horiz_c(const uint8_t *src, int src_stride,
- uint8_t *dst, int dst_stride,
+void vp9_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int x_step_q4,
const int16_t *filter_y, int y_step_q4,
int w, int h) {
@@ -287,8 +287,8 @@
w, h, 8);
}
-void vp9_convolve8_vert_c(const uint8_t *src, int src_stride,
- uint8_t *dst, int dst_stride,
+void vp9_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int x_step_q4,
const int16_t *filter_y, int y_step_q4,
int w, int h) {
@@ -297,8 +297,8 @@
w, h, 8);
}
-void vp9_convolve8_avg_vert_c(const uint8_t *src, int src_stride,
- uint8_t *dst, int dst_stride,
+void vp9_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int x_step_q4,
const int16_t *filter_y, int y_step_q4,
int w, int h) {
@@ -307,8 +307,8 @@
w, h, 8);
}
-void vp9_convolve8_c(const uint8_t *src, int src_stride,
- uint8_t *dst, int dst_stride,
+void vp9_convolve8_c(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int x_step_q4,
const int16_t *filter_y, int y_step_q4,
int w, int h) {
@@ -317,8 +317,8 @@
w, h, 8);
}
-void vp9_convolve8_avg_c(const uint8_t *src, int src_stride,
- uint8_t *dst, int dst_stride,
+void vp9_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int x_step_q4,
const int16_t *filter_y, int y_step_q4,
int w, int h) {
@@ -339,33 +339,25 @@
w, h);
}
-void vp9_convolve_copy(const uint8_t *src, int src_stride,
- uint8_t *dst, int dst_stride,
- const int16_t *filter_x, int filter_x_stride,
- const int16_t *filter_y, int filter_y_stride,
- int w, int h) {
- if (w == 16 && h == 16) {
- vp9_copy_mem16x16(src, src_stride, dst, dst_stride);
- } else if (w == 8 && h == 8) {
- vp9_copy_mem8x8(src, src_stride, dst, dst_stride);
- } else if (w == 8 && h == 4) {
- vp9_copy_mem8x4(src, src_stride, dst, dst_stride);
- } else {
- int r;
+void vp9_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int filter_x_stride,
+ const int16_t *filter_y, int filter_y_stride,
+ int w, int h) {
+ int r;
- for (r = h; r > 0; --r) {
- memcpy(dst, src, w);
- src += src_stride;
- dst += dst_stride;
- }
+ for (r = h; r > 0; --r) {
+ memcpy(dst, src, w);
+ src += src_stride;
+ dst += dst_stride;
}
}
-void vp9_convolve_avg(const uint8_t *src, int src_stride,
- uint8_t *dst, int dst_stride,
- const int16_t *filter_x, int filter_x_stride,
- const int16_t *filter_y, int filter_y_stride,
- int w, int h) {
+void vp9_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int filter_x_stride,
+ const int16_t *filter_y, int filter_y_stride,
+ int w, int h) {
int x, y;
for (y = 0; y < h; ++y) {
diff --git a/vp9/common/vp9_convolve.h b/vp9/common/vp9_convolve.h
index 0596080..3de8111 100644
--- a/vp9/common/vp9_convolve.h
+++ b/vp9/common/vp9_convolve.h
@@ -13,26 +13,12 @@
#include "./vpx_config.h"
#include "vpx/vpx_integer.h"
-typedef void (*convolve_fn_t)(const uint8_t *src, int src_stride,
- uint8_t *dst, int dst_stride,
+typedef void (*convolve_fn_t)(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int x_step_q4,
const int16_t *filter_y, int y_step_q4,
int w, int h);
-// Not a convolution, a block copy conforming to the convolution prototype
-void vp9_convolve_copy(const uint8_t *src, int src_stride,
- uint8_t *dst, int dst_stride,
- const int16_t *filter_x, int x_step_q4,
- const int16_t *filter_y, int y_step_q4,
- int w, int h);
-
-// Not a convolution, a block average conforming to the convolution prototype
-void vp9_convolve_avg(const uint8_t *src, int src_stride,
- uint8_t *dst, int dst_stride,
- const int16_t *filter_x, int x_step_q4,
- const int16_t *filter_y, int y_step_q4,
- int w, int h);
-
struct subpix_fn_table {
const int16_t (*filter_x)[8];
const int16_t (*filter_y)[8];
diff --git a/vp9/common/vp9_reconinter.c b/vp9/common/vp9_reconinter.c
index 265a19a..c29fd14 100644
--- a/vp9/common/vp9_reconinter.c
+++ b/vp9/common/vp9_reconinter.c
@@ -194,93 +194,6 @@
assert(((intptr_t)xd->subpix.filter_x & 0xff) == 0);
}
-void vp9_copy_mem16x16_c(const uint8_t *src,
- int src_stride,
- uint8_t *dst,
- int dst_stride) {
- int r;
-
- for (r = 0; r < 16; r++) {
-#if !(CONFIG_FAST_UNALIGNED)
- dst[0] = src[0];
- dst[1] = src[1];
- dst[2] = src[2];
- dst[3] = src[3];
- dst[4] = src[4];
- dst[5] = src[5];
- dst[6] = src[6];
- dst[7] = src[7];
- dst[8] = src[8];
- dst[9] = src[9];
- dst[10] = src[10];
- dst[11] = src[11];
- dst[12] = src[12];
- dst[13] = src[13];
- dst[14] = src[14];
- dst[15] = src[15];
-
-#else
- ((uint32_t *)dst)[0] = ((const uint32_t *)src)[0];
- ((uint32_t *)dst)[1] = ((const uint32_t *)src)[1];
- ((uint32_t *)dst)[2] = ((const uint32_t *)src)[2];
- ((uint32_t *)dst)[3] = ((const uint32_t *)src)[3];
-
-#endif
- src += src_stride;
- dst += dst_stride;
- }
-}
-
-void vp9_copy_mem8x8_c(const uint8_t *src,
- int src_stride,
- uint8_t *dst,
- int dst_stride) {
- int r;
-
- for (r = 0; r < 8; r++) {
-#if !(CONFIG_FAST_UNALIGNED)
- dst[0] = src[0];
- dst[1] = src[1];
- dst[2] = src[2];
- dst[3] = src[3];
- dst[4] = src[4];
- dst[5] = src[5];
- dst[6] = src[6];
- dst[7] = src[7];
-#else
- ((uint32_t *)dst)[0] = ((const uint32_t *)src)[0];
- ((uint32_t *)dst)[1] = ((const uint32_t *)src)[1];
-#endif
- src += src_stride;
- dst += dst_stride;
- }
-}
-
-void vp9_copy_mem8x4_c(const uint8_t *src,
- int src_stride,
- uint8_t *dst,
- int dst_stride) {
- int r;
-
- for (r = 0; r < 4; r++) {
-#if !(CONFIG_FAST_UNALIGNED)
- dst[0] = src[0];
- dst[1] = src[1];
- dst[2] = src[2];
- dst[3] = src[3];
- dst[4] = src[4];
- dst[5] = src[5];
- dst[6] = src[6];
- dst[7] = src[7];
-#else
- ((uint32_t *)dst)[0] = ((const uint32_t *)src)[0];
- ((uint32_t *)dst)[1] = ((const uint32_t *)src)[1];
-#endif
- src += src_stride;
- dst += dst_stride;
- }
-}
-
void vp9_build_inter_predictor(const uint8_t *src, int src_stride,
uint8_t *dst, int dst_stride,
const int_mv *src_mv,
diff --git a/vp9/common/vp9_rtcd_defs.sh b/vp9/common/vp9_rtcd_defs.sh
index ab5e2df..56a2284 100644
--- a/vp9/common/vp9_rtcd_defs.sh
+++ b/vp9/common/vp9_rtcd_defs.sh
@@ -43,17 +43,6 @@
#
# RECON
#
-prototype void vp9_copy_mem16x16 "const uint8_t *src, int src_pitch, uint8_t *dst, int dst_pitch"
-specialize vp9_copy_mem16x16 mmx sse2 dspr2
-vp9_copy_mem16x16_dspr2=vp9_copy_mem16x16_dspr2
-
-prototype void vp9_copy_mem8x8 "const uint8_t *src, int src_pitch, uint8_t *dst, int dst_pitch"
-specialize vp9_copy_mem8x8 mmx dspr2
-vp9_copy_mem8x8_dspr2=vp9_copy_mem8x8_dspr2
-
-prototype void vp9_copy_mem8x4 "const uint8_t *src, int src_pitch, uint8_t *dst, int dst_pitch"
-specialize vp9_copy_mem8x4 mmx
-
prototype void vp9_d27_predictor_4x4 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col"
specialize vp9_d27_predictor_4x4
@@ -275,22 +264,28 @@
#
# Sub Pixel Filters
#
-prototype void vp9_convolve8 "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
+prototype void vp9_convolve_copy "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
+specialize vp9_convolve_copy sse2
+
+prototype void vp9_convolve_avg "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
+specialize vp9_convolve_avg sse2
+
+prototype void vp9_convolve8 "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
specialize vp9_convolve8 ssse3
-prototype void vp9_convolve8_horiz "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
+prototype void vp9_convolve8_horiz "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
specialize vp9_convolve8_horiz ssse3
-prototype void vp9_convolve8_vert "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
+prototype void vp9_convolve8_vert "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
specialize vp9_convolve8_vert ssse3
-prototype void vp9_convolve8_avg "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
+prototype void vp9_convolve8_avg "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
specialize vp9_convolve8_avg ssse3
-prototype void vp9_convolve8_avg_horiz "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
+prototype void vp9_convolve8_avg_horiz "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
specialize vp9_convolve8_avg_horiz ssse3
-prototype void vp9_convolve8_avg_vert "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
+prototype void vp9_convolve8_avg_vert "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
specialize vp9_convolve8_avg_vert ssse3
#
diff --git a/vp9/common/x86/vp9_asm_stubs.c b/vp9/common/x86/vp9_asm_stubs.c
index 2b66834..98fc4dc 100644
--- a/vp9/common/x86/vp9_asm_stubs.c
+++ b/vp9/common/x86/vp9_asm_stubs.c
@@ -121,8 +121,8 @@
unsigned int output_height,
const short *filter);
-void vp9_convolve8_horiz_ssse3(const uint8_t *src, int src_stride,
- uint8_t *dst, int dst_stride,
+void vp9_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int x_step_q4,
const int16_t *filter_y, int y_step_q4,
int w, int h) {
@@ -159,8 +159,8 @@
}
}
-void vp9_convolve8_vert_ssse3(const uint8_t *src, int src_stride,
- uint8_t *dst, int dst_stride,
+void vp9_convolve8_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int x_step_q4,
const int16_t *filter_y, int y_step_q4,
int w, int h) {
@@ -197,8 +197,8 @@
}
}
-void vp9_convolve8_avg_horiz_ssse3(const uint8_t *src, int src_stride,
- uint8_t *dst, int dst_stride,
+void vp9_convolve8_avg_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int x_step_q4,
const int16_t *filter_y, int y_step_q4,
int w, int h) {
@@ -235,8 +235,8 @@
}
}
-void vp9_convolve8_avg_vert_ssse3(const uint8_t *src, int src_stride,
- uint8_t *dst, int dst_stride,
+void vp9_convolve8_avg_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int x_step_q4,
const int16_t *filter_y, int y_step_q4,
int w, int h) {
@@ -273,8 +273,8 @@
}
}
-void vp9_convolve8_ssse3(const uint8_t *src, int src_stride,
- uint8_t *dst, int dst_stride,
+void vp9_convolve8_ssse3(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int x_step_q4,
const int16_t *filter_y, int y_step_q4,
int w, int h) {
@@ -294,8 +294,8 @@
}
}
-void vp9_convolve8_avg_ssse3(const uint8_t *src, int src_stride,
- uint8_t *dst, int dst_stride,
+void vp9_convolve8_avg_ssse3(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int x_step_q4,
const int16_t *filter_y, int y_step_q4,
int w, int h) {
diff --git a/vp9/common/x86/vp9_copy_sse2.asm b/vp9/common/x86/vp9_copy_sse2.asm
new file mode 100644
index 0000000..dd522c6
--- /dev/null
+++ b/vp9/common/x86/vp9_copy_sse2.asm
@@ -0,0 +1,172 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+%include "third_party/x86inc/x86inc.asm"
+
+SECTION .text
+
+; convolve_fn <copy|avg>
+;
+; Emits one function (convolve_copy or convolve_avg, as named by cglobal)
+; taking (src, src_stride, dst, dst_stride, fx, fxs, fy, fys, w, h). Only
+; src, src_stride, dst, dst_stride, w and h are read; the four filter
+; arguments are never referenced. "copy" stores the source rows to dst;
+; "avg" first merges them with the bytes already in dst using pavgb.
+;
+; Dispatch is on w: 4, 8, 16 and 32 have dedicated loops and every other
+; value falls through to the 64-wide loop. The 32-wide loop consumes 2 rows
+; per iteration and the 16/8/4-wide loops consume 4, exiting only when the
+; row counter reaches exactly zero, so h must be a multiple of that step.
+;
+; The 4-wide avg path loads dst with movh before pavgb: a pavgb memory
+; operand on an MMX register is 8 bytes wide, which would read 4 bytes past
+; the end of every 4-pixel dst row.
+%macro convolve_fn 1
+INIT_XMM sse2
+cglobal convolve_%1, 4, 7, 4, src, src_stride, dst, dst_stride, \
+                              fx, fxs, fy, fys, w, h
+  mov r4d, dword wm                  ; r4d = w, used only to pick a path
+  cmp r4d, 4
+  je .w4
+  cmp r4d, 8
+  je .w8
+  cmp r4d, 16
+  je .w16
+  cmp r4d, 32
+  je .w32
+
+  mov r4d, dword hm                  ; r4d = rows remaining (64-wide path)
+.loop64:
+  movu m0, [srcq]
+  movu m1, [srcq+16]
+  movu m2, [srcq+32]
+  movu m3, [srcq+48]
+  add srcq, src_strideq
+%ifidn %1, avg
+  pavgb m0, [dstq]
+  pavgb m1, [dstq+16]
+  pavgb m2, [dstq+32]
+  pavgb m3, [dstq+48]
+%endif
+  mova [dstq], m0
+  mova [dstq+16], m1
+  mova [dstq+32], m2
+  mova [dstq+48], m3
+  add dstq, dst_strideq
+  dec r4d                            ; one row per iteration
+  jnz .loop64
+  RET
+
+.w32:
+  mov r4d, dword hm                  ; r4d = rows remaining
+.loop32:
+  movu m0, [srcq]
+  movu m1, [srcq+16]
+  movu m2, [srcq+src_strideq]
+  movu m3, [srcq+src_strideq+16]
+  lea srcq, [srcq+src_strideq*2]
+%ifidn %1, avg
+  pavgb m0, [dstq]
+  pavgb m1, [dstq+16]
+  pavgb m2, [dstq+dst_strideq]
+  pavgb m3, [dstq+dst_strideq+16]
+%endif
+  mova [dstq], m0
+  mova [dstq+16], m1
+  mova [dstq+dst_strideq], m2
+  mova [dstq+dst_strideq+16], m3
+  lea dstq, [dstq+dst_strideq*2]
+  sub r4d, 2                         ; two rows per iteration
+  jnz .loop32
+  RET
+
+.w16:
+  mov r4d, dword hm                  ; r4d = rows remaining
+  lea r5q, [src_strideq*3]           ; r5q = 3 * src_stride
+  lea r6q, [dst_strideq*3]           ; r6q = 3 * dst_stride
+.loop16:
+  movu m0, [srcq]
+  movu m1, [srcq+src_strideq]
+  movu m2, [srcq+src_strideq*2]
+  movu m3, [srcq+r5q]
+  lea srcq, [srcq+src_strideq*4]
+%ifidn %1, avg
+  pavgb m0, [dstq]
+  pavgb m1, [dstq+dst_strideq]
+  pavgb m2, [dstq+dst_strideq*2]
+  pavgb m3, [dstq+r6q]
+%endif
+  mova [dstq], m0
+  mova [dstq+dst_strideq], m1
+  mova [dstq+dst_strideq*2], m2
+  mova [dstq+r6q], m3
+  lea dstq, [dstq+dst_strideq*4]
+  sub r4d, 4                         ; four rows per iteration
+  jnz .loop16
+  RET
+
+INIT_MMX sse                         ; m0..m7 are 64-bit MMX registers from here on
+.w8:
+  mov r4d, dword hm                  ; r4d = rows remaining
+  lea r5q, [src_strideq*3]           ; r5q = 3 * src_stride
+  lea r6q, [dst_strideq*3]           ; r6q = 3 * dst_stride
+.loop8:
+  movu m0, [srcq]
+  movu m1, [srcq+src_strideq]
+  movu m2, [srcq+src_strideq*2]
+  movu m3, [srcq+r5q]
+  lea srcq, [srcq+src_strideq*4]
+%ifidn %1, avg
+  pavgb m0, [dstq]
+  pavgb m1, [dstq+dst_strideq]
+  pavgb m2, [dstq+dst_strideq*2]
+  pavgb m3, [dstq+r6q]
+%endif
+  mova [dstq], m0
+  mova [dstq+dst_strideq], m1
+  mova [dstq+dst_strideq*2], m2
+  mova [dstq+r6q], m3
+  lea dstq, [dstq+dst_strideq*4]
+  sub r4d, 4                         ; four rows per iteration
+  jnz .loop8
+  RET
+
+.w4:
+  mov r4d, dword hm                  ; r4d = rows remaining
+  lea r5q, [src_strideq*3]           ; r5q = 3 * src_stride
+  lea r6q, [dst_strideq*3]           ; r6q = 3 * dst_stride
+.loop4:
+  movh m0, [srcq]
+  movh m1, [srcq+src_strideq]
+  movh m2, [srcq+src_strideq*2]
+  movh m3, [srcq+r5q]
+  lea srcq, [srcq+src_strideq*4]
+%ifidn %1, avg
+  movh m4, [dstq]                    ; 4-byte loads: stay inside the dst row
+  movh m5, [dstq+dst_strideq]
+  movh m6, [dstq+dst_strideq*2]
+  movh m7, [dstq+r6q]
+  pavgb m0, m4
+  pavgb m1, m5
+  pavgb m2, m6
+  pavgb m3, m7
+%endif
+  movh [dstq], m0
+  movh [dstq+dst_strideq], m1
+  movh [dstq+dst_strideq*2], m2
+  movh [dstq+r6q], m3
+  lea dstq, [dstq+dst_strideq*4]
+  sub r4d, 4                         ; four rows per iteration
+  jnz .loop4
+  RET
+%endmacro
+
+convolve_fn copy
+convolve_fn avg
diff --git a/vp9/common/x86/vp9_iwalsh_mmx.asm b/vp9/common/x86/vp9_iwalsh_mmx.asm
deleted file mode 100644
index 1af2521..0000000
--- a/vp9/common/x86/vp9_iwalsh_mmx.asm
+++ /dev/null
@@ -1,173 +0,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-
-;void vp9_short_inv_walsh4x4_1_mmx(short *input, short *output)
-global sym(vp9_short_inv_walsh4x4_1_mmx) PRIVATE
-sym(vp9_short_inv_walsh4x4_1_mmx):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 2
- push rsi
- push rdi
- ; end prolog
-
- mov rsi, arg(0)
- mov rax, 3
-
- mov rdi, arg(1)
- add rax, [rsi] ;input[0] + 3
-
- movd mm0, eax
-
- punpcklwd mm0, mm0 ;x x val val
-
- punpckldq mm0, mm0 ;val val val val
-
- psraw mm0, 3 ;(input[0] + 3) >> 3
-
- movq [rdi + 0], mm0
- movq [rdi + 8], mm0
- movq [rdi + 16], mm0
- movq [rdi + 24], mm0
-
- ; begin epilog
- pop rdi
- pop rsi
- UNSHADOW_ARGS
- pop rbp
- ret
-
-;void vp9_short_inv_walsh4x4_mmx(short *input, short *output)
-global sym(vp9_short_inv_walsh4x4_mmx) PRIVATE
-sym(vp9_short_inv_walsh4x4_mmx):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 2
- push rsi
- push rdi
- ; end prolog
-
- mov rax, 3
- mov rsi, arg(0)
- mov rdi, arg(1)
- shl rax, 16
-
- movq mm0, [rsi + 0] ;ip[0]
- movq mm1, [rsi + 8] ;ip[4]
- or rax, 3 ;00030003h
-
- movq mm2, [rsi + 16] ;ip[8]
- movq mm3, [rsi + 24] ;ip[12]
-
- movq mm7, rax
- movq mm4, mm0
-
- punpcklwd mm7, mm7 ;0003000300030003h
- movq mm5, mm1
-
- paddw mm4, mm3 ;ip[0] + ip[12] aka al
- paddw mm5, mm2 ;ip[4] + ip[8] aka bl
-
- movq mm6, mm4 ;temp al
-
- paddw mm4, mm5 ;al + bl
- psubw mm6, mm5 ;al - bl
-
- psubw mm0, mm3 ;ip[0] - ip[12] aka d1
- psubw mm1, mm2 ;ip[4] - ip[8] aka c1
-
- movq mm5, mm0 ;temp dl
-
- paddw mm0, mm1 ;dl + cl
- psubw mm5, mm1 ;dl - cl
-
- ; 03 02 01 00
- ; 13 12 11 10
- ; 23 22 21 20
- ; 33 32 31 30
-
- movq mm3, mm4 ; 03 02 01 00
- punpcklwd mm4, mm0 ; 11 01 10 00
- punpckhwd mm3, mm0 ; 13 03 12 02
-
- movq mm1, mm6 ; 23 22 21 20
- punpcklwd mm6, mm5 ; 31 21 30 20
- punpckhwd mm1, mm5 ; 33 23 32 22
-
- movq mm0, mm4 ; 11 01 10 00
- movq mm2, mm3 ; 13 03 12 02
-
- punpckldq mm0, mm6 ; 30 20 10 00 aka ip[0]
- punpckhdq mm4, mm6 ; 31 21 11 01 aka ip[4]
-
- punpckldq mm2, mm1 ; 32 22 12 02 aka ip[8]
- punpckhdq mm3, mm1 ; 33 23 13 03 aka ip[12]
-;~~~~~~~~~~~~~~~~~~~~~
- movq mm1, mm0
- movq mm5, mm4
-
- paddw mm1, mm3 ;ip[0] + ip[12] aka al
- paddw mm5, mm2 ;ip[4] + ip[8] aka bl
-
- movq mm6, mm1 ;temp al
-
- paddw mm1, mm5 ;al + bl
- psubw mm6, mm5 ;al - bl
-
- psubw mm0, mm3 ;ip[0] - ip[12] aka d1
- psubw mm4, mm2 ;ip[4] - ip[8] aka c1
-
- movq mm5, mm0 ;temp dl
-
- paddw mm0, mm4 ;dl + cl
- psubw mm5, mm4 ;dl - cl
-;~~~~~~~~~~~~~~~~~~~~~
- movq mm3, mm1 ; 03 02 01 00
- punpcklwd mm1, mm0 ; 11 01 10 00
- punpckhwd mm3, mm0 ; 13 03 12 02
-
- movq mm4, mm6 ; 23 22 21 20
- punpcklwd mm6, mm5 ; 31 21 30 20
- punpckhwd mm4, mm5 ; 33 23 32 22
-
- movq mm0, mm1 ; 11 01 10 00
- movq mm2, mm3 ; 13 03 12 02
-
- punpckldq mm0, mm6 ; 30 20 10 00 aka ip[0]
- punpckhdq mm1, mm6 ; 31 21 11 01 aka ip[4]
-
- punpckldq mm2, mm4 ; 32 22 12 02 aka ip[8]
- punpckhdq mm3, mm4 ; 33 23 13 03 aka ip[12]
-
- paddw mm0, mm7
- paddw mm1, mm7
- paddw mm2, mm7
- paddw mm3, mm7
-
- psraw mm0, 3
- psraw mm1, 3
- psraw mm2, 3
- psraw mm3, 3
-
- movq [rdi + 0], mm0
- movq [rdi + 8], mm1
- movq [rdi + 16], mm2
- movq [rdi + 24], mm3
-
- ; begin epilog
- pop rdi
- pop rsi
- UNSHADOW_ARGS
- pop rbp
- ret
-
diff --git a/vp9/common/x86/vp9_iwalsh_sse2.asm b/vp9/common/x86/vp9_iwalsh_sse2.asm
deleted file mode 100644
index 84fa2fe..0000000
--- a/vp9/common/x86/vp9_iwalsh_sse2.asm
+++ /dev/null
@@ -1,119 +0,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-
-;void vp9_short_inv_walsh4x4_sse2(short *input, short *output)
-global sym(vp9_short_inv_walsh4x4_sse2) PRIVATE
-sym(vp9_short_inv_walsh4x4_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 2
- SAVE_XMM 6
- push rsi
- push rdi
- ; end prolog
-
- mov rsi, arg(0)
- mov rdi, arg(1)
- mov rax, 3
-
- movdqa xmm0, [rsi + 0] ;ip[4] ip[0]
- movdqa xmm1, [rsi + 16] ;ip[12] ip[8]
-
- shl rax, 16
- or rax, 3 ;00030003h
-
- pshufd xmm2, xmm1, 4eh ;ip[8] ip[12]
- movdqa xmm3, xmm0 ;ip[4] ip[0]
-
- paddw xmm0, xmm2 ;ip[4]+ip[8] ip[0]+ip[12] aka b1 a1
- psubw xmm3, xmm2 ;ip[4]-ip[8] ip[0]-ip[12] aka c1 d1
-
- movdqa xmm4, xmm0
- punpcklqdq xmm0, xmm3 ;d1 a1
- punpckhqdq xmm4, xmm3 ;c1 b1
- movd xmm6, eax
-
- movdqa xmm1, xmm4 ;c1 b1
- paddw xmm4, xmm0 ;dl+cl a1+b1 aka op[4] op[0]
- psubw xmm0, xmm1 ;d1-c1 a1-b1 aka op[12] op[8]
-
-;;;temp output
-;; movdqu [rdi + 0], xmm4
-;; movdqu [rdi + 16], xmm3
-
-;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- ; 13 12 11 10 03 02 01 00
- ;
- ; 33 32 31 30 23 22 21 20
- ;
- movdqa xmm3, xmm4 ; 13 12 11 10 03 02 01 00
- punpcklwd xmm4, xmm0 ; 23 03 22 02 21 01 20 00
- punpckhwd xmm3, xmm0 ; 33 13 32 12 31 11 30 10
- movdqa xmm1, xmm4 ; 23 03 22 02 21 01 20 00
- punpcklwd xmm4, xmm3 ; 31 21 11 01 30 20 10 00
- punpckhwd xmm1, xmm3 ; 33 23 13 03 32 22 12 02
- ;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- pshufd xmm2, xmm1, 4eh ;ip[8] ip[12]
- movdqa xmm3, xmm4 ;ip[4] ip[0]
-
- pshufd xmm6, xmm6, 0 ;03 03 03 03 03 03 03 03
-
- paddw xmm4, xmm2 ;ip[4]+ip[8] ip[0]+ip[12] aka b1 a1
- psubw xmm3, xmm2 ;ip[4]-ip[8] ip[0]-ip[12] aka c1 d1
-
- movdqa xmm5, xmm4
- punpcklqdq xmm4, xmm3 ;d1 a1
- punpckhqdq xmm5, xmm3 ;c1 b1
-
- movdqa xmm1, xmm5 ;c1 b1
- paddw xmm5, xmm4 ;dl+cl a1+b1 aka op[4] op[0]
- psubw xmm4, xmm1 ;d1-c1 a1-b1 aka op[12] op[8]
-;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- ; 13 12 11 10 03 02 01 00
- ;
- ; 33 32 31 30 23 22 21 20
- ;
- movdqa xmm0, xmm5 ; 13 12 11 10 03 02 01 00
- punpcklwd xmm5, xmm4 ; 23 03 22 02 21 01 20 00
- punpckhwd xmm0, xmm4 ; 33 13 32 12 31 11 30 10
- movdqa xmm1, xmm5 ; 23 03 22 02 21 01 20 00
- punpcklwd xmm5, xmm0 ; 31 21 11 01 30 20 10 00
- punpckhwd xmm1, xmm0 ; 33 23 13 03 32 22 12 02
-;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- paddw xmm5, xmm6
- paddw xmm1, xmm6
-
- psraw xmm5, 3
- psraw xmm1, 3
-
- movdqa [rdi + 0], xmm5
- movdqa [rdi + 16], xmm1
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-SECTION_RODATA
-align 16
-x_s1sqr2:
- times 4 dw 0x8A8C
-align 16
-x_c1sqr2less1:
- times 4 dw 0x4E7B
-align 16
-fours:
- times 4 dw 0x0004
diff --git a/vp9/common/x86/vp9_recon_mmx.asm b/vp9/common/x86/vp9_recon_mmx.asm
deleted file mode 100644
index 6fbbe48..0000000
--- a/vp9/common/x86/vp9_recon_mmx.asm
+++ /dev/null
@@ -1,272 +0,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-;void copy_mem8x8_mmx(
-; unsigned char *src,
-; int src_stride,
-; unsigned char *dst,
-; int dst_stride
-; )
-global sym(vp9_copy_mem8x8_mmx) PRIVATE
-sym(vp9_copy_mem8x8_mmx):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 4
- push rsi
- push rdi
- ; end prolog
-
- mov rsi, arg(0) ;src;
- movq mm0, [rsi]
-
- movsxd rax, dword ptr arg(1) ;src_stride;
- mov rdi, arg(2) ;dst;
-
- movq mm1, [rsi+rax]
- movq mm2, [rsi+rax*2]
-
- movsxd rcx, dword ptr arg(3) ;dst_stride
- lea rsi, [rsi+rax*2]
-
- movq [rdi], mm0
- add rsi, rax
-
- movq [rdi+rcx], mm1
- movq [rdi+rcx*2], mm2
-
-
- lea rdi, [rdi+rcx*2]
- movq mm3, [rsi]
-
- add rdi, rcx
- movq mm4, [rsi+rax]
-
- movq mm5, [rsi+rax*2]
- movq [rdi], mm3
-
- lea rsi, [rsi+rax*2]
- movq [rdi+rcx], mm4
-
- movq [rdi+rcx*2], mm5
- lea rdi, [rdi+rcx*2]
-
- movq mm0, [rsi+rax]
- movq mm1, [rsi+rax*2]
-
- movq [rdi+rcx], mm0
- movq [rdi+rcx*2],mm1
-
- ; begin epilog
- pop rdi
- pop rsi
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
-;void copy_mem8x4_mmx(
-; unsigned char *src,
-; int src_stride,
-; unsigned char *dst,
-; int dst_stride
-; )
-global sym(vp9_copy_mem8x4_mmx) PRIVATE
-sym(vp9_copy_mem8x4_mmx):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 4
- push rsi
- push rdi
- ; end prolog
-
- mov rsi, arg(0) ;src;
- movq mm0, [rsi]
-
- movsxd rax, dword ptr arg(1) ;src_stride;
- mov rdi, arg(2) ;dst;
-
- movq mm1, [rsi+rax]
- movq mm2, [rsi+rax*2]
-
- movsxd rcx, dword ptr arg(3) ;dst_stride
- lea rsi, [rsi+rax*2]
-
- movq [rdi], mm0
- movq [rdi+rcx], mm1
-
- movq [rdi+rcx*2], mm2
- lea rdi, [rdi+rcx*2]
-
- movq mm3, [rsi+rax]
- movq [rdi+rcx], mm3
-
- ; begin epilog
- pop rdi
- pop rsi
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
-;void copy_mem16x16_mmx(
-; unsigned char *src,
-; int src_stride,
-; unsigned char *dst,
-; int dst_stride
-; )
-global sym(vp9_copy_mem16x16_mmx) PRIVATE
-sym(vp9_copy_mem16x16_mmx):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 4
- push rsi
- push rdi
- ; end prolog
-
- mov rsi, arg(0) ;src;
- movsxd rax, dword ptr arg(1) ;src_stride;
-
- mov rdi, arg(2) ;dst;
- movsxd rcx, dword ptr arg(3) ;dst_stride
-
- movq mm0, [rsi]
- movq mm3, [rsi+8];
-
- movq mm1, [rsi+rax]
- movq mm4, [rsi+rax+8]
-
- movq mm2, [rsi+rax*2]
- movq mm5, [rsi+rax*2+8]
-
- lea rsi, [rsi+rax*2]
- add rsi, rax
-
- movq [rdi], mm0
- movq [rdi+8], mm3
-
- movq [rdi+rcx], mm1
- movq [rdi+rcx+8], mm4
-
- movq [rdi+rcx*2], mm2
- movq [rdi+rcx*2+8], mm5
-
- lea rdi, [rdi+rcx*2]
- add rdi, rcx
-
- movq mm0, [rsi]
- movq mm3, [rsi+8];
-
- movq mm1, [rsi+rax]
- movq mm4, [rsi+rax+8]
-
- movq mm2, [rsi+rax*2]
- movq mm5, [rsi+rax*2+8]
-
- lea rsi, [rsi+rax*2]
- add rsi, rax
-
- movq [rdi], mm0
- movq [rdi+8], mm3
-
- movq [rdi+rcx], mm1
- movq [rdi+rcx+8], mm4
-
- movq [rdi+rcx*2], mm2
- movq [rdi+rcx*2+8], mm5
-
- lea rdi, [rdi+rcx*2]
- add rdi, rcx
-
- movq mm0, [rsi]
- movq mm3, [rsi+8];
-
- movq mm1, [rsi+rax]
- movq mm4, [rsi+rax+8]
-
- movq mm2, [rsi+rax*2]
- movq mm5, [rsi+rax*2+8]
-
- lea rsi, [rsi+rax*2]
- add rsi, rax
-
- movq [rdi], mm0
- movq [rdi+8], mm3
-
- movq [rdi+rcx], mm1
- movq [rdi+rcx+8], mm4
-
- movq [rdi+rcx*2], mm2
- movq [rdi+rcx*2+8], mm5
-
- lea rdi, [rdi+rcx*2]
- add rdi, rcx
-
- movq mm0, [rsi]
- movq mm3, [rsi+8];
-
- movq mm1, [rsi+rax]
- movq mm4, [rsi+rax+8]
-
- movq mm2, [rsi+rax*2]
- movq mm5, [rsi+rax*2+8]
-
- lea rsi, [rsi+rax*2]
- add rsi, rax
-
- movq [rdi], mm0
- movq [rdi+8], mm3
-
- movq [rdi+rcx], mm1
- movq [rdi+rcx+8], mm4
-
- movq [rdi+rcx*2], mm2
- movq [rdi+rcx*2+8], mm5
-
- lea rdi, [rdi+rcx*2]
- add rdi, rcx
-
- movq mm0, [rsi]
- movq mm3, [rsi+8];
-
- movq mm1, [rsi+rax]
- movq mm4, [rsi+rax+8]
-
- movq mm2, [rsi+rax*2]
- movq mm5, [rsi+rax*2+8]
-
- lea rsi, [rsi+rax*2]
- add rsi, rax
-
- movq [rdi], mm0
- movq [rdi+8], mm3
-
- movq [rdi+rcx], mm1
- movq [rdi+rcx+8], mm4
-
- movq [rdi+rcx*2], mm2
- movq [rdi+rcx*2+8], mm5
-
- lea rdi, [rdi+rcx*2]
- add rdi, rcx
-
- movq mm0, [rsi]
- movq mm3, [rsi+8];
-
- movq [rdi], mm0
- movq [rdi+8], mm3
-
- ; begin epilog
- pop rdi
- pop rsi
- UNSHADOW_ARGS
- pop rbp
- ret
diff --git a/vp9/common/x86/vp9_recon_sse2.asm b/vp9/common/x86/vp9_recon_sse2.asm
deleted file mode 100644
index f7cc611..0000000
--- a/vp9/common/x86/vp9_recon_sse2.asm
+++ /dev/null
@@ -1,115 +0,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-;void copy_mem16x16_sse2(
-; unsigned char *src,
-; int src_stride,
-; unsigned char *dst,
-; int dst_stride
-; )
-global sym(vp9_copy_mem16x16_sse2) PRIVATE
-sym(vp9_copy_mem16x16_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 4
- push rsi
- push rdi
- ; end prolog
-
- mov rsi, arg(0) ;src;
- movdqu xmm0, [rsi]
-
- movsxd rax, dword ptr arg(1) ;src_stride;
- mov rdi, arg(2) ;dst;
-
- movdqu xmm1, [rsi+rax]
- movdqu xmm2, [rsi+rax*2]
-
- movsxd rcx, dword ptr arg(3) ;dst_stride
- lea rsi, [rsi+rax*2]
-
- movdqa [rdi], xmm0
- add rsi, rax
-
- movdqa [rdi+rcx], xmm1
- movdqa [rdi+rcx*2],xmm2
-
- lea rdi, [rdi+rcx*2]
- movdqu xmm3, [rsi]
-
- add rdi, rcx
- movdqu xmm4, [rsi+rax]
-
- movdqu xmm5, [rsi+rax*2]
- lea rsi, [rsi+rax*2]
-
- movdqa [rdi], xmm3
- add rsi, rax
-
- movdqa [rdi+rcx], xmm4
- movdqa [rdi+rcx*2],xmm5
-
- lea rdi, [rdi+rcx*2]
- movdqu xmm0, [rsi]
-
- add rdi, rcx
- movdqu xmm1, [rsi+rax]
-
- movdqu xmm2, [rsi+rax*2]
- lea rsi, [rsi+rax*2]
-
- movdqa [rdi], xmm0
- add rsi, rax
-
- movdqa [rdi+rcx], xmm1
-
- movdqa [rdi+rcx*2], xmm2
- movdqu xmm3, [rsi]
-
- movdqu xmm4, [rsi+rax]
- lea rdi, [rdi+rcx*2]
-
- add rdi, rcx
- movdqu xmm5, [rsi+rax*2]
-
- lea rsi, [rsi+rax*2]
- movdqa [rdi], xmm3
-
- add rsi, rax
- movdqa [rdi+rcx], xmm4
-
- movdqa [rdi+rcx*2],xmm5
- movdqu xmm0, [rsi]
-
- lea rdi, [rdi+rcx*2]
- movdqu xmm1, [rsi+rax]
-
- add rdi, rcx
- movdqu xmm2, [rsi+rax*2]
-
- lea rsi, [rsi+rax*2]
- movdqa [rdi], xmm0
-
- movdqa [rdi+rcx], xmm1
- movdqa [rdi+rcx*2],xmm2
-
- movdqu xmm3, [rsi+rax]
- lea rdi, [rdi+rcx*2]
-
- movdqa [rdi+rcx], xmm3
-
- ; begin epilog
- pop rdi
- pop rsi
- UNSHADOW_ARGS
- pop rbp
- ret
diff --git a/vp9/common/x86/vp9_sadmxn_sse2.c b/vp9/common/x86/vp9_sadmxn_sse2.c
deleted file mode 100644
index ed873a5..0000000
--- a/vp9/common/x86/vp9_sadmxn_sse2.c
+++ /dev/null
@@ -1,95 +0,0 @@
-/*
- * Copyright (c) 2012 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <emmintrin.h> /* SSE2 */
-#include "vpx/vpx_integer.h"
-#include "vpx_ports/emmintrin_compat.h"
-
-unsigned int vp9_sad16x3_sse2(
- const unsigned char *src_ptr,
- int src_stride,
- const unsigned char *ref_ptr,
- int ref_stride) {
- __m128i s0, s1, s2;
- __m128i r0, r1, r2;
- __m128i sad;
-
- s0 = _mm_loadu_si128((const __m128i *)(src_ptr + 0 * src_stride));
- s1 = _mm_loadu_si128((const __m128i *)(src_ptr + 1 * src_stride));
- s2 = _mm_loadu_si128((const __m128i *)(src_ptr + 2 * src_stride));
-
- r0 = _mm_loadu_si128((const __m128i *)(ref_ptr + 0 * ref_stride));
- r1 = _mm_loadu_si128((const __m128i *)(ref_ptr + 1 * ref_stride));
- r2 = _mm_loadu_si128((const __m128i *)(ref_ptr + 2 * ref_stride));
-
- sad = _mm_sad_epu8(s0, r0);
- sad = _mm_add_epi16(sad, _mm_sad_epu8(s1, r1));
- sad = _mm_add_epi16(sad, _mm_sad_epu8(s2, r2));
- sad = _mm_add_epi16(sad, _mm_srli_si128(sad, 8));
-
- return _mm_cvtsi128_si32(sad);
-}
-
-unsigned int vp9_sad3x16_sse2(
- const unsigned char *src_ptr,
- int src_stride,
- const unsigned char *ref_ptr,
- int ref_stride) {
- int r;
- __m128i s0, s1, s2, s3;
- __m128i r0, r1, r2, r3;
- __m128i sad = _mm_setzero_si128();
- __m128i mask;
- const int offset = (uintptr_t)src_ptr & 3;
-
- /* In current use case, the offset is 1 if CONFIG_SUBPELREFMV is off.
- * Here, for offset=1, we adjust src_ptr to be 4-byte aligned. Then, movd
- * takes much less time.
- */
- if (offset == 1)
- src_ptr -= 1;
-
- /* mask = 0xffffffffffff0000ffffffffffff0000 */
- mask = _mm_cmpeq_epi32(sad, sad);
- mask = _mm_slli_epi64(mask, 16);
-
- for (r = 0; r < 16; r += 4) {
- s0 = _mm_cvtsi32_si128 (*(const int *)(src_ptr + 0 * src_stride));
- s1 = _mm_cvtsi32_si128 (*(const int *)(src_ptr + 1 * src_stride));
- s2 = _mm_cvtsi32_si128 (*(const int *)(src_ptr + 2 * src_stride));
- s3 = _mm_cvtsi32_si128 (*(const int *)(src_ptr + 3 * src_stride));
- r0 = _mm_cvtsi32_si128 (*(const int *)(ref_ptr + 0 * ref_stride));
- r1 = _mm_cvtsi32_si128 (*(const int *)(ref_ptr + 1 * ref_stride));
- r2 = _mm_cvtsi32_si128 (*(const int *)(ref_ptr + 2 * ref_stride));
- r3 = _mm_cvtsi32_si128 (*(const int *)(ref_ptr + 3 * ref_stride));
-
- s0 = _mm_unpacklo_epi8(s0, s1);
- r0 = _mm_unpacklo_epi8(r0, r1);
- s2 = _mm_unpacklo_epi8(s2, s3);
- r2 = _mm_unpacklo_epi8(r2, r3);
- s0 = _mm_unpacklo_epi64(s0, s2);
- r0 = _mm_unpacklo_epi64(r0, r2);
-
- // throw out extra byte
- if (offset == 1)
- s0 = _mm_and_si128(s0, mask);
- else
- s0 = _mm_slli_epi64(s0, 16);
- r0 = _mm_slli_epi64(r0, 16);
-
- sad = _mm_add_epi16(sad, _mm_sad_epu8(s0, r0));
-
- src_ptr += src_stride*4;
- ref_ptr += ref_stride*4;
- }
-
- sad = _mm_add_epi16(sad, _mm_srli_si128(sad, 8));
- return _mm_cvtsi128_si32(sad);
-}
diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c
index 4cb38f7..f424679 100644
--- a/vp9/encoder/vp9_rdopt.c
+++ b/vp9/encoder/vp9_rdopt.c
@@ -1288,9 +1288,9 @@
block, 16),
16, &ssz) >> 2;
- if (best_tx_type != DCT_DCT)
+ if (tx_type != DCT_DCT)
vp9_short_iht4x4_add(BLOCK_OFFSET(pd->dqcoeff, block, 16),
- dst, pd->dst.stride, best_tx_type);
+ dst, pd->dst.stride, tx_type);
else
xd->inv_txm4x4_add(BLOCK_OFFSET(pd->dqcoeff, block, 16),
dst, pd->dst.stride);
diff --git a/vp9/encoder/x86/vp9_dct_mmx.asm b/vp9/encoder/x86/vp9_dct_mmx.asm
deleted file mode 100644
index 54766d8..0000000
--- a/vp9/encoder/x86/vp9_dct_mmx.asm
+++ /dev/null
@@ -1,241 +0,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-
-;void vp9_short_fdct4x4_mmx(short *input, short *output, int pitch)
-global sym(vp9_short_fdct4x4_mmx) PRIVATE
-sym(vp9_short_fdct4x4_mmx):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 3
- GET_GOT rbx
- push rsi
- push rdi
- ; end prolog
-
- mov rsi, arg(0) ; input
- mov rdi, arg(1) ; output
-
- movsxd rax, dword ptr arg(2) ;pitch
-
- lea rcx, [rsi + rax*2]
- ; read the input data
- movq mm0, [rsi]
- movq mm1, [rsi + rax]
-
- movq mm2, [rcx]
- movq mm4, [rcx + rax]
-
- ; transpose for the first stage
- movq mm3, mm0 ; 00 01 02 03
- movq mm5, mm2 ; 20 21 22 23
-
- punpcklwd mm0, mm1 ; 00 10 01 11
- punpckhwd mm3, mm1 ; 02 12 03 13
-
- punpcklwd mm2, mm4 ; 20 30 21 31
- punpckhwd mm5, mm4 ; 22 32 23 33
-
- movq mm1, mm0 ; 00 10 01 11
- punpckldq mm0, mm2 ; 00 10 20 30
-
- punpckhdq mm1, mm2 ; 01 11 21 31
-
- movq mm2, mm3 ; 02 12 03 13
- punpckldq mm2, mm5 ; 02 12 22 32
-
- punpckhdq mm3, mm5 ; 03 13 23 33
-
- ; mm0 0
- ; mm1 1
- ; mm2 2
- ; mm3 3
-
- ; first stage
- movq mm5, mm0
- movq mm4, mm1
-
- paddw mm0, mm3 ; a1 = 0 + 3
- paddw mm1, mm2 ; b1 = 1 + 2
-
- psubw mm4, mm2 ; c1 = 1 - 2
- psubw mm5, mm3 ; d1 = 0 - 3
-
- psllw mm5, 3
- psllw mm4, 3
-
- psllw mm0, 3
- psllw mm1, 3
-
- ; output 0 and 2
- movq mm2, mm0 ; a1
-
- paddw mm0, mm1 ; op[0] = a1 + b1
- psubw mm2, mm1 ; op[2] = a1 - b1
-
- ; output 1 and 3
- ; interleave c1, d1
- movq mm1, mm5 ; d1
- punpcklwd mm1, mm4 ; c1 d1
- punpckhwd mm5, mm4 ; c1 d1
-
- movq mm3, mm1
- movq mm4, mm5
-
- pmaddwd mm1, MMWORD PTR[GLOBAL (_5352_2217)] ; c1*2217 + d1*5352
- pmaddwd mm4, MMWORD PTR[GLOBAL (_5352_2217)] ; c1*2217 + d1*5352
-
- pmaddwd mm3, MMWORD PTR[GLOBAL(_2217_neg5352)] ; d1*2217 - c1*5352
- pmaddwd mm5, MMWORD PTR[GLOBAL(_2217_neg5352)] ; d1*2217 - c1*5352
-
- paddd mm1, MMWORD PTR[GLOBAL(_14500)]
- paddd mm4, MMWORD PTR[GLOBAL(_14500)]
- paddd mm3, MMWORD PTR[GLOBAL(_7500)]
- paddd mm5, MMWORD PTR[GLOBAL(_7500)]
-
- psrad mm1, 12 ; (c1 * 2217 + d1 * 5352 + 14500)>>12
- psrad mm4, 12 ; (c1 * 2217 + d1 * 5352 + 14500)>>12
- psrad mm3, 12 ; (d1 * 2217 - c1 * 5352 + 7500)>>12
- psrad mm5, 12 ; (d1 * 2217 - c1 * 5352 + 7500)>>12
-
- packssdw mm1, mm4 ; op[1]
- packssdw mm3, mm5 ; op[3]
-
- ; done with vertical
- ; transpose for the second stage
- movq mm4, mm0 ; 00 10 20 30
- movq mm5, mm2 ; 02 12 22 32
-
- punpcklwd mm0, mm1 ; 00 01 10 11
- punpckhwd mm4, mm1 ; 20 21 30 31
-
- punpcklwd mm2, mm3 ; 02 03 12 13
- punpckhwd mm5, mm3 ; 22 23 32 33
-
- movq mm1, mm0 ; 00 01 10 11
- punpckldq mm0, mm2 ; 00 01 02 03
-
- punpckhdq mm1, mm2 ; 01 22 12 13
-
- movq mm2, mm4 ; 20 31 30 31
- punpckldq mm2, mm5 ; 20 21 22 23
-
- punpckhdq mm4, mm5 ; 30 31 32 33
-
- ; mm0 0
- ; mm1 1
- ; mm2 2
- ; mm3 4
-
- movq mm5, mm0
- movq mm3, mm1
-
- paddw mm0, mm4 ; a1 = 0 + 3
- paddw mm1, mm2 ; b1 = 1 + 2
-
- psubw mm3, mm2 ; c1 = 1 - 2
- psubw mm5, mm4 ; d1 = 0 - 3
-
- pxor mm6, mm6 ; zero out for compare
-
- pcmpeqw mm6, mm5 ; d1 != 0
-
- pandn mm6, MMWORD PTR[GLOBAL(_cmp_mask)] ; clear upper,
- ; and keep bit 0 of lower
-
- ; output 0 and 2
- movq mm2, mm0 ; a1
-
- paddw mm0, mm1 ; a1 + b1
- psubw mm2, mm1 ; a1 - b1
-
- paddw mm0, MMWORD PTR[GLOBAL(_7w)]
- paddw mm2, MMWORD PTR[GLOBAL(_7w)]
-
- psraw mm0, 4 ; op[0] = (a1 + b1 + 7)>>4
- psraw mm2, 4 ; op[8] = (a1 - b1 + 7)>>4
-
- movq MMWORD PTR[rdi + 0 ], mm0
- movq MMWORD PTR[rdi + 16], mm2
-
- ; output 1 and 3
- ; interleave c1, d1
- movq mm1, mm5 ; d1
- punpcklwd mm1, mm3 ; c1 d1
- punpckhwd mm5, mm3 ; c1 d1
-
- movq mm3, mm1
- movq mm4, mm5
-
- pmaddwd mm1, MMWORD PTR[GLOBAL (_5352_2217)] ; c1*2217 + d1*5352
- pmaddwd mm4, MMWORD PTR[GLOBAL (_5352_2217)] ; c1*2217 + d1*5352
-
- pmaddwd mm3, MMWORD PTR[GLOBAL(_2217_neg5352)] ; d1*2217 - c1*5352
- pmaddwd mm5, MMWORD PTR[GLOBAL(_2217_neg5352)] ; d1*2217 - c1*5352
-
- paddd mm1, MMWORD PTR[GLOBAL(_12000)]
- paddd mm4, MMWORD PTR[GLOBAL(_12000)]
- paddd mm3, MMWORD PTR[GLOBAL(_51000)]
- paddd mm5, MMWORD PTR[GLOBAL(_51000)]
-
- psrad mm1, 16 ; (c1 * 2217 + d1 * 5352 + 14500)>>16
- psrad mm4, 16 ; (c1 * 2217 + d1 * 5352 + 14500)>>16
- psrad mm3, 16 ; (d1 * 2217 - c1 * 5352 + 7500)>>16
- psrad mm5, 16 ; (d1 * 2217 - c1 * 5352 + 7500)>>16
-
- packssdw mm1, mm4 ; op[4]
- packssdw mm3, mm5 ; op[12]
-
- paddw mm1, mm6 ; op[4] += (d1!=0)
-
- movq MMWORD PTR[rdi + 8 ], mm1
- movq MMWORD PTR[rdi + 24], mm3
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- UNSHADOW_ARGS
- pop rbp
- ret
-
-SECTION_RODATA
-align 8
-_5352_2217:
- dw 5352
- dw 2217
- dw 5352
- dw 2217
-align 8
-_2217_neg5352:
- dw 2217
- dw -5352
- dw 2217
- dw -5352
-align 8
-_cmp_mask:
- times 4 dw 1
-align 8
-_7w:
- times 4 dw 7
-align 8
-_14500:
- times 2 dd 14500
-align 8
-_7500:
- times 2 dd 7500
-align 8
-_12000:
- times 2 dd 12000
-align 8
-_51000:
- times 2 dd 51000
diff --git a/vp9/encoder/x86/vp9_dct_mmx.h b/vp9/encoder/x86/vp9_dct_mmx.h
deleted file mode 100644
index 3bac7c8..0000000
--- a/vp9/encoder/x86/vp9_dct_mmx.h
+++ /dev/null
@@ -1,17 +0,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#ifndef VP9_ENCODER_X86_VP9_DCT_MMX_H_
-#define VP9_ENCODER_X86_VP9_DCT_MMX_H_
-
-extern void vp9_short_fdct4x4_mmx(short *input, short *output, int pitch);
-
-
-#endif /* VP9_ENCODER_X86_VP9_DCT_MMX_H_ */
diff --git a/vp9/encoder/x86/vp9_fwalsh_sse2.asm b/vp9/encoder/x86/vp9_fwalsh_sse2.asm
deleted file mode 100644
index 7bee9ef..0000000
--- a/vp9/encoder/x86/vp9_fwalsh_sse2.asm
+++ /dev/null
@@ -1,164 +0,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-
-;void vp9_short_walsh4x4_sse2(short *input, short *output, int pitch)
-global sym(vp9_short_walsh4x4_sse2) PRIVATE
-sym(vp9_short_walsh4x4_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 3
- SAVE_XMM 7
- GET_GOT rbx
- push rsi
- push rdi
- ; end prolog
-
- mov rsi, arg(0) ; input
- mov rdi, arg(1) ; output
- movsxd rdx, dword ptr arg(2) ; pitch
-
- ; first for loop
- movq xmm0, MMWORD PTR [rsi] ; load input
- movq xmm1, MMWORD PTR [rsi + rdx]
- lea rsi, [rsi + rdx*2]
- movq xmm2, MMWORD PTR [rsi]
- movq xmm3, MMWORD PTR [rsi + rdx]
-
- punpcklwd xmm0, xmm1
- punpcklwd xmm2, xmm3
-
- movdqa xmm1, xmm0
- punpckldq xmm0, xmm2 ; ip[1] ip[0]
- punpckhdq xmm1, xmm2 ; ip[3] ip[2]
-
- movdqa xmm2, xmm0
- paddw xmm0, xmm1
- psubw xmm2, xmm1
-
- psllw xmm0, 2 ; d1 a1
- psllw xmm2, 2 ; c1 b1
-
- movdqa xmm1, xmm0
- punpcklqdq xmm0, xmm2 ; b1 a1
- punpckhqdq xmm1, xmm2 ; c1 d1
-
- pxor xmm6, xmm6
- movq xmm6, xmm0
- pxor xmm7, xmm7
- pcmpeqw xmm7, xmm6
- paddw xmm7, [GLOBAL(c1)]
-
- movdqa xmm2, xmm0
- paddw xmm0, xmm1 ; b1+c1 a1+d1
- psubw xmm2, xmm1 ; b1-c1 a1-d1
- paddw xmm0, xmm7 ; b1+c1 a1+d1+(a1!=0)
-
- ; second for loop
- ; input: 13 9 5 1 12 8 4 0 (xmm0)
- ; 14 10 6 2 15 11 7 3 (xmm2)
- ; after shuffle:
- ; 13 5 9 1 12 4 8 0 (xmm0)
- ; 14 6 10 2 15 7 11 3 (xmm1)
- pshuflw xmm3, xmm0, 0xd8
- pshufhw xmm0, xmm3, 0xd8
- pshuflw xmm3, xmm2, 0xd8
- pshufhw xmm1, xmm3, 0xd8
-
- movdqa xmm2, xmm0
- pmaddwd xmm0, [GLOBAL(c1)] ; d11 a11 d10 a10
- pmaddwd xmm2, [GLOBAL(cn1)] ; c11 b11 c10 b10
- movdqa xmm3, xmm1
- pmaddwd xmm1, [GLOBAL(c1)] ; d12 a12 d13 a13
- pmaddwd xmm3, [GLOBAL(cn1)] ; c12 b12 c13 b13
-
- pshufd xmm4, xmm0, 0xd8 ; d11 d10 a11 a10
- pshufd xmm5, xmm2, 0xd8 ; c11 c10 b11 b10
- pshufd xmm6, xmm1, 0x72 ; d13 d12 a13 a12
- pshufd xmm7, xmm3, 0x72 ; c13 c12 b13 b12
-
- movdqa xmm0, xmm4
- punpcklqdq xmm0, xmm5 ; b11 b10 a11 a10
- punpckhqdq xmm4, xmm5 ; c11 c10 d11 d10
- movdqa xmm1, xmm6
- punpcklqdq xmm1, xmm7 ; b13 b12 a13 a12
- punpckhqdq xmm6, xmm7 ; c13 c12 d13 d12
-
- movdqa xmm2, xmm0
- paddd xmm0, xmm4 ; b21 b20 a21 a20
- psubd xmm2, xmm4 ; c21 c20 d21 d20
- movdqa xmm3, xmm1
- paddd xmm1, xmm6 ; b23 b22 a23 a22
- psubd xmm3, xmm6 ; c23 c22 d23 d22
-
- pxor xmm4, xmm4
- movdqa xmm5, xmm4
- pcmpgtd xmm4, xmm0
- pcmpgtd xmm5, xmm2
- pand xmm4, [GLOBAL(cd1)]
- pand xmm5, [GLOBAL(cd1)]
-
- pxor xmm6, xmm6
- movdqa xmm7, xmm6
- pcmpgtd xmm6, xmm1
- pcmpgtd xmm7, xmm3
- pand xmm6, [GLOBAL(cd1)]
- pand xmm7, [GLOBAL(cd1)]
-
- paddd xmm0, xmm4
- paddd xmm2, xmm5
- paddd xmm0, [GLOBAL(cd3)]
- paddd xmm2, [GLOBAL(cd3)]
- paddd xmm1, xmm6
- paddd xmm3, xmm7
- paddd xmm1, [GLOBAL(cd3)]
- paddd xmm3, [GLOBAL(cd3)]
-
- psrad xmm0, 3
- psrad xmm1, 3
- psrad xmm2, 3
- psrad xmm3, 3
- movdqa xmm4, xmm0
- punpcklqdq xmm0, xmm1 ; a23 a22 a21 a20
- punpckhqdq xmm4, xmm1 ; b23 b22 b21 b20
- movdqa xmm5, xmm2
- punpckhqdq xmm2, xmm3 ; c23 c22 c21 c20
- punpcklqdq xmm5, xmm3 ; d23 d22 d21 d20
-
- packssdw xmm0, xmm4 ; b23 b22 b21 b20 a23 a22 a21 a20
- packssdw xmm2, xmm5 ; d23 d22 d21 d20 c23 c22 c21 c20
-
- movdqa XMMWORD PTR [rdi], xmm0
- movdqa XMMWORD PTR [rdi + 16], xmm2
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-SECTION_RODATA
-align 16
-c1:
- dw 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001
-align 16
-cn1:
- dw 0x0001, 0xffff, 0x0001, 0xffff, 0x0001, 0xffff, 0x0001, 0xffff
-align 16
-cd1:
- dd 0x00000001, 0x00000001, 0x00000001, 0x00000001
-align 16
-cd3:
- dd 0x00000003, 0x00000003, 0x00000003, 0x00000003
diff --git a/vp9/vp9_common.mk b/vp9/vp9_common.mk
index 95ea60b..ee744d5 100644
--- a/vp9/vp9_common.mk
+++ b/vp9/vp9_common.mk
@@ -75,12 +75,9 @@
VP9_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/vp9_loopfilter_intrin_sse2.c
VP9_COMMON_SRCS-$(CONFIG_POSTPROC) += common/vp9_postproc.h
VP9_COMMON_SRCS-$(CONFIG_POSTPROC) += common/vp9_postproc.c
-VP9_COMMON_SRCS-$(HAVE_MMX) += common/x86/vp9_iwalsh_mmx.asm
-VP9_COMMON_SRCS-$(HAVE_MMX) += common/x86/vp9_recon_mmx.asm
VP9_COMMON_SRCS-$(HAVE_MMX) += common/x86/vp9_loopfilter_mmx.asm
-VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_iwalsh_sse2.asm
+VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_copy_sse2.asm
VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_loopfilter_sse2.asm
-VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_recon_sse2.asm
VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_intrapred_sse2.asm
VP9_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/vp9_intrapred_ssse3.asm
VP9_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/vp9_subpixel_8t_ssse3.asm
@@ -90,7 +87,6 @@
endif
VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_idct_intrin_sse2.c
-VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_sadmxn_sse2.c
VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_loopfilter_neon$(ASM)
diff --git a/vp9/vp9cx.mk b/vp9/vp9cx.mk
index 12a49f8..dee83c9 100644
--- a/vp9/vp9cx.mk
+++ b/vp9/vp9cx.mk
@@ -78,13 +78,10 @@
VP9_CX_SRCS-$(HAVE_MMX) += encoder/x86/vp9_variance_mmx.c
VP9_CX_SRCS-$(HAVE_MMX) += encoder/x86/vp9_variance_impl_mmx.asm
VP9_CX_SRCS-$(HAVE_MMX) += encoder/x86/vp9_sad_mmx.asm
-VP9_CX_SRCS-$(HAVE_MMX) += encoder/x86/vp9_dct_mmx.asm
-VP9_CX_SRCS-$(HAVE_MMX) += encoder/x86/vp9_dct_mmx.h
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_variance_sse2.c
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_variance_impl_sse2.asm
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_sad_sse2.asm
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_sad4d_sse2.asm
-VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_fwalsh_sse2.asm
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_subpel_variance.asm
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_subtract_sse2.asm
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_error_sse2.asm