Merge "Change to use correct check for halfpel"
diff --git a/examples/simple_encoder.c b/examples/simple_encoder.c
index a307729..64f0a01 100644
--- a/examples/simple_encoder.c
+++ b/examples/simple_encoder.c
@@ -109,8 +109,8 @@
void usage_exit(void) {
fprintf(stderr,
"Usage: %s <codec> <width> <height> <infile> <outfile> "
- "<keyframe-interval> [<error-resilient>]\nSee comments in "
- "simple_encoder.c for more information.\n",
+ "<keyframe-interval> <error-resilient> <frames to encode>\n"
+ "See comments in simple_encoder.c for more information.\n",
exec_name);
exit(EXIT_FAILURE);
}
@@ -147,6 +147,7 @@
return got_pkts;
}
+// TODO(tomfinegan): Improve command line parsing and add args for bitrate/fps.
int main(int argc, char **argv) {
FILE *infile = NULL;
vpx_codec_ctx_t codec;
@@ -157,12 +158,11 @@
VpxVideoInfo info = {0};
VpxVideoWriter *writer = NULL;
const VpxInterface *encoder = NULL;
- const int fps = 30; // TODO(dkovalev) add command line argument
- const int bitrate = 200; // kbit/s TODO(dkovalev) add command line argument
+ const int fps = 30;
+ const int bitrate = 200;
int keyframe_interval = 0;
-
- // TODO(dkovalev): Add some simple command line parsing code to make the
- // command line more flexible.
+ int max_frames = 0;
+ int frames_encoded = 0;
const char *codec_arg = NULL;
const char *width_arg = NULL;
const char *height_arg = NULL;
@@ -172,7 +172,7 @@
exec_name = argv[0];
- if (argc < 7)
+ if (argc != 9)
die("Invalid number of arguments");
codec_arg = argv[1];
@@ -181,6 +181,7 @@
infile_arg = argv[4];
outfile_arg = argv[5];
keyframe_interval_arg = argv[6];
+ max_frames = strtol(argv[8], NULL, 0);
encoder = get_vpx_encoder_by_name(codec_arg);
if (!encoder)
@@ -219,7 +220,7 @@
cfg.g_timebase.num = info.time_base.numerator;
cfg.g_timebase.den = info.time_base.denominator;
cfg.rc_target_bitrate = bitrate;
- cfg.g_error_resilient = argc > 7 ? strtol(argv[7], NULL, 0) : 0;
+ cfg.g_error_resilient = strtol(argv[7], NULL, 0);
writer = vpx_video_writer_open(outfile_arg, kContainerIVF, &info);
if (!writer)
@@ -237,6 +238,9 @@
if (keyframe_interval > 0 && frame_count % keyframe_interval == 0)
flags |= VPX_EFLAG_FORCE_KF;
encode_frame(&codec, &raw, frame_count++, flags, writer);
+ frames_encoded++;
+ if (max_frames > 0 && frames_encoded >= max_frames)
+ break;
}
// Flush encoder.
diff --git a/examples/twopass_encoder.c b/examples/twopass_encoder.c
index aecc11d..15a6617 100644
--- a/examples/twopass_encoder.c
+++ b/examples/twopass_encoder.c
@@ -59,7 +59,9 @@
static const char *exec_name;
void usage_exit(void) {
- fprintf(stderr, "Usage: %s <codec> <width> <height> <infile> <outfile>\n",
+ fprintf(stderr,
+ "Usage: %s <codec> <width> <height> <infile> <outfile> "
+ "<frame limit>\n",
exec_name);
exit(EXIT_FAILURE);
}
@@ -129,7 +131,8 @@
static vpx_fixed_buf_t pass0(vpx_image_t *raw,
FILE *infile,
const VpxInterface *encoder,
- const vpx_codec_enc_cfg_t *cfg) {
+ const vpx_codec_enc_cfg_t *cfg,
+ int max_frames) {
vpx_codec_ctx_t codec;
int frame_count = 0;
vpx_fixed_buf_t stats = {NULL, 0};
@@ -142,6 +145,8 @@
++frame_count;
get_frame_stats(&codec, raw, frame_count, 1, 0, VPX_DL_GOOD_QUALITY,
&stats);
+ if (max_frames > 0 && frame_count >= max_frames)
+ break;
}
// Flush encoder.
@@ -159,7 +164,8 @@
FILE *infile,
const char *outfile_name,
const VpxInterface *encoder,
- const vpx_codec_enc_cfg_t *cfg) {
+ const vpx_codec_enc_cfg_t *cfg,
+ int max_frames) {
VpxVideoInfo info = {
encoder->fourcc,
cfg->g_w,
@@ -181,6 +187,9 @@
while (vpx_img_read(raw, infile)) {
++frame_count;
encode_frame(&codec, raw, frame_count, 1, 0, VPX_DL_GOOD_QUALITY, writer);
+
+ if (max_frames > 0 && frame_count >= max_frames)
+ break;
}
// Flush encoder.
@@ -213,11 +222,14 @@
const char *const height_arg = argv[3];
const char *const infile_arg = argv[4];
const char *const outfile_arg = argv[5];
+ int max_frames = 0;
exec_name = argv[0];
- if (argc != 6)
+ if (argc != 7)
die("Invalid number of arguments.");
+ max_frames = strtol(argv[6], NULL, 0);
+
encoder = get_vpx_encoder_by_name(codec_arg);
if (!encoder)
die("Unsupported codec.");
@@ -249,13 +261,13 @@
// Pass 0
cfg.g_pass = VPX_RC_FIRST_PASS;
- stats = pass0(&raw, infile, encoder, &cfg);
+ stats = pass0(&raw, infile, encoder, &cfg, max_frames);
// Pass 1
rewind(infile);
cfg.g_pass = VPX_RC_LAST_PASS;
cfg.rc_twopass_stats_in = stats;
- pass1(&raw, infile, outfile_arg, encoder, &cfg);
+ pass1(&raw, infile, outfile_arg, encoder, &cfg, max_frames);
free(stats.buf);
vpx_img_free(&raw);
diff --git a/test/sad_test.cc b/test/sad_test.cc
index 3f0f74c..e6bd0d7 100644
--- a/test/sad_test.cc
+++ b/test/sad_test.cc
@@ -689,17 +689,6 @@
//------------------------------------------------------------------------------
// x86 functions
-#if HAVE_MMX
-const SadMxNParam mmx_tests[] = {
- make_tuple(16, 16, &vpx_sad16x16_mmx, -1),
- make_tuple(16, 8, &vpx_sad16x8_mmx, -1),
- make_tuple(8, 16, &vpx_sad8x16_mmx, -1),
- make_tuple(8, 8, &vpx_sad8x8_mmx, -1),
- make_tuple(4, 4, &vpx_sad4x4_mmx, -1),
-};
-INSTANTIATE_TEST_CASE_P(MMX, SADTest, ::testing::ValuesIn(mmx_tests));
-#endif // HAVE_MMX
-
#if HAVE_SSE2
#if CONFIG_USE_X86INC
const SadMxNParam sse2_tests[] = {
diff --git a/test/simple_encoder.sh b/test/simple_encoder.sh
index c4a6280..ee633ae 100755
--- a/test/simple_encoder.sh
+++ b/test/simple_encoder.sh
@@ -23,7 +23,7 @@
fi
}
-# Runs simple_encoder using the codec specified by $1.
+# Runs simple_encoder using the codec specified by $1 with a frame limit of 100.
simple_encoder() {
local encoder="${LIBVPX_BIN_PATH}/simple_encoder${VPX_TEST_EXE_SUFFIX}"
local codec="$1"
@@ -35,7 +35,7 @@
fi
eval "${VPX_TEST_PREFIX}" "${encoder}" "${codec}" "${YUV_RAW_INPUT_WIDTH}" \
- "${YUV_RAW_INPUT_HEIGHT}" "${YUV_RAW_INPUT}" "${output_file}" 9999 \
+ "${YUV_RAW_INPUT_HEIGHT}" "${YUV_RAW_INPUT}" "${output_file}" 9999 0 100 \
${devnull}
[ -e "${output_file}" ] || return 1
@@ -47,16 +47,13 @@
fi
}
-# TODO(tomfinegan): Add a frame limit param to simple_encoder and enable this
-# test. VP9 is just too slow right now: This test takes 4m30s+ on a fast
-# machine.
-DISABLED_simple_encoder_vp9() {
+simple_encoder_vp9() {
if [ "$(vp9_encode_available)" = "yes" ]; then
simple_encoder vp9 || return 1
fi
}
simple_encoder_tests="simple_encoder_vp8
- DISABLED_simple_encoder_vp9"
+ simple_encoder_vp9"
run_tests simple_encoder_verify_environment "${simple_encoder_tests}"
diff --git a/test/twopass_encoder.sh b/test/twopass_encoder.sh
index 1189e51..7a223f2 100755
--- a/test/twopass_encoder.sh
+++ b/test/twopass_encoder.sh
@@ -23,7 +23,8 @@
fi
}
-# Runs twopass_encoder using the codec specified by $1.
+# Runs twopass_encoder using the codec specified by $1 with a frame limit of
+# 100.
twopass_encoder() {
local encoder="${LIBVPX_BIN_PATH}/twopass_encoder${VPX_TEST_EXE_SUFFIX}"
local codec="$1"
@@ -35,7 +36,7 @@
fi
eval "${VPX_TEST_PREFIX}" "${encoder}" "${codec}" "${YUV_RAW_INPUT_WIDTH}" \
- "${YUV_RAW_INPUT_HEIGHT}" "${YUV_RAW_INPUT}" "${output_file}" \
+ "${YUV_RAW_INPUT_HEIGHT}" "${YUV_RAW_INPUT}" "${output_file}" 100 \
${devnull}
[ -e "${output_file}" ] || return 1
@@ -47,16 +48,13 @@
fi
}
-# TODO(tomfinegan): Add a frame limit param to twopass_encoder and enable this
-# test. VP9 is just too slow right now: This test takes 31m16s+ on a fast
-# machine.
-DISABLED_twopass_encoder_vp9() {
+twopass_encoder_vp9() {
if [ "$(vp9_encode_available)" = "yes" ]; then
twopass_encoder vp9 || return 1
fi
}
twopass_encoder_tests="twopass_encoder_vp8
- DISABLED_twopass_encoder_vp9"
+ twopass_encoder_vp9"
run_tests twopass_encoder_verify_environment "${twopass_encoder_tests}"
diff --git a/test/variance_test.cc b/test/variance_test.cc
index a9ca07c..a6efc92 100644
--- a/test/variance_test.cc
+++ b/test/variance_test.cc
@@ -977,20 +977,6 @@
#endif // CONFIG_VP9_HIGHBITDEPTH
#if HAVE_MMX
-INSTANTIATE_TEST_CASE_P(MMX, VpxMseTest,
- ::testing::Values(make_tuple(4, 4, &vpx_mse16x16_mmx)));
-
-INSTANTIATE_TEST_CASE_P(MMX, SumOfSquaresTest,
- ::testing::Values(vpx_get_mb_ss_mmx));
-
-INSTANTIATE_TEST_CASE_P(
- MMX, VpxVarianceTest,
- ::testing::Values(make_tuple(4, 4, &vpx_variance16x16_mmx, 0),
- make_tuple(4, 3, &vpx_variance16x8_mmx, 0),
- make_tuple(3, 4, &vpx_variance8x16_mmx, 0),
- make_tuple(3, 3, &vpx_variance8x8_mmx, 0),
- make_tuple(2, 2, &vpx_variance4x4_mmx, 0)));
-
INSTANTIATE_TEST_CASE_P(
MMX, VpxSubpelVarianceTest,
::testing::Values(make_tuple(4, 4, &vpx_sub_pixel_variance16x16_mmx, 0),
diff --git a/vp10/encoder/mcomp.c b/vp10/encoder/mcomp.c
index 2c1c591..1ba2e2f 100644
--- a/vp10/encoder/mcomp.c
+++ b/vp10/encoder/mcomp.c
@@ -145,16 +145,6 @@
cfg->searches_per_step = 8;
}
-/*
- * To avoid the penalty for crossing cache-line read, preload the reference
- * area in a small buffer, which is aligned to make sure there won't be crossing
- * cache-line read while reading from this buffer. This reduced the cpu
- * cycles spent on reading ref data in sub-pixel filter functions.
- * TODO: Currently, since sub-pixel search range here is -3 ~ 3, copy 22 rows x
- * 32 cols area that is enough for 16x16 macroblock. Later, for SPLITMV, we
- * could reduce the area.
- */
-
/* estimated cost of a motion vector (r,c) */
#define MVC(r, c) \
(mvcost ? \
@@ -790,7 +780,6 @@
}
#undef MVC
-#undef PRE
#undef CHECK_BETTER
static INLINE int check_bounds(const MACROBLOCK *x, int row, int col,
diff --git a/vp9/encoder/vp9_mcomp.c b/vp9/encoder/vp9_mcomp.c
index 4669145..f3ffe35 100644
--- a/vp9/encoder/vp9_mcomp.c
+++ b/vp9/encoder/vp9_mcomp.c
@@ -144,16 +144,6 @@
cfg->total_steps = ss_count / cfg->searches_per_step;
}
-/*
- * To avoid the penalty for crossing cache-line read, preload the reference
- * area in a small buffer, which is aligned to make sure there won't be crossing
- * cache-line read while reading from this buffer. This reduced the cpu
- * cycles spent on reading ref data in sub-pixel filter functions.
- * TODO: Currently, since sub-pixel search range here is -3 ~ 3, copy 22 rows x
- * 32 cols area that is enough for 16x16 macroblock. Later, for SPLITMV, we
- * could reduce the area.
- */
-
/* Estimated (square) error cost of a motion vector (r,c). The 14 scale comes
* from the same math as in mv_err_cost(). */
#define MVC(r, c) \
@@ -835,7 +825,6 @@
}
#undef MVC
-#undef PRE
#undef CHECK_BETTER
static INLINE int check_bounds(const MACROBLOCK *x, int row, int col,
diff --git a/vp9/encoder/vp9_ratectrl.c b/vp9/encoder/vp9_ratectrl.c
index 1e6c152..0ec93a9 100644
--- a/vp9/encoder/vp9_ratectrl.c
+++ b/vp9/encoder/vp9_ratectrl.c
@@ -953,11 +953,10 @@
FIXED_GF_INTERVAL], cm->bit_depth);
active_best_quality = VPXMAX(qindex + delta_qindex, rc->best_quality);
} else {
- // Use the min of the average Q (with some increase) and
- // active_worst_quality as basis for active_best.
+ // Use the min of the average Q and active_worst_quality as basis for
+ // active_best.
if (cm->current_video_frame > 1) {
- q = VPXMIN(((17 * rc->avg_frame_qindex[INTER_FRAME]) >> 4),
- active_worst_quality);
+ q = VPXMIN(rc->avg_frame_qindex[INTER_FRAME], active_worst_quality);
active_best_quality = inter_minq[q];
} else {
active_best_quality = inter_minq[rc->avg_frame_qindex[KEY_FRAME]];
diff --git a/vp9/vp9_cx_iface.c b/vp9/vp9_cx_iface.c
index 5e44ffd..1a11a6d 100644
--- a/vp9/vp9_cx_iface.c
+++ b/vp9/vp9_cx_iface.c
@@ -803,7 +803,7 @@
static vpx_codec_err_t ctrl_set_target_level(vpx_codec_alg_priv_t *ctx,
va_list args) {
struct vp9_extracfg extra_cfg = ctx->extra_cfg;
- extra_cfg.target_level = CAST(VP9E_SET_LEVEL_STATS, args);
+ extra_cfg.target_level = CAST(VP9E_SET_TARGET_LEVEL, args);
return update_extra_cfg(ctx, &extra_cfg);
}
diff --git a/vpx/vp8cx.h b/vpx/vp8cx.h
index 109306f..b059d47 100644
--- a/vpx/vp8cx.h
+++ b/vpx/vp8cx.h
@@ -562,7 +562,7 @@
*
* Supported in codecs: VP9
*/
- VP9E_SET_TARGET_LEVEL,
+ VP9E_SET_TARGET_LEVEL
};
/*!\brief vpx 1-D scaling mode
@@ -818,8 +818,8 @@
VPX_CTRL_USE_TYPE(VP9E_SET_RENDER_SIZE, int *)
#define VPX_CTRL_VP9E_SET_RENDER_SIZE
-VPX_CTRL_USE_TYPE(VP9E_SET_LEVEL_STATS, unsigned int)
-#define VPX_CTRL_VP9E_SET_LEVEL_STATS
+VPX_CTRL_USE_TYPE(VP9E_SET_TARGET_LEVEL, unsigned int)
+#define VPX_CTRL_VP9E_SET_TARGET_LEVEL
/*!\endcond */
/*! @} - end defgroup vp8_encoder */
diff --git a/vpx_dsp/vpx_dsp.mk b/vpx_dsp/vpx_dsp.mk
index dd8c6e8..43802d7 100644
--- a/vpx_dsp/vpx_dsp.mk
+++ b/vpx_dsp/vpx_dsp.mk
@@ -285,7 +285,6 @@
DSP_SRCS-$(HAVE_MSA) += mips/sad_msa.c
DSP_SRCS-$(HAVE_MSA) += mips/subtract_msa.c
-DSP_SRCS-$(HAVE_MMX) += x86/sad_mmx.asm
DSP_SRCS-$(HAVE_SSE3) += x86/sad_sse3.asm
DSP_SRCS-$(HAVE_SSSE3) += x86/sad_ssse3.asm
DSP_SRCS-$(HAVE_SSE4_1) += x86/sad_sse4.asm
diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl
index f883ce5..aeadbaf 100644
--- a/vpx_dsp/vpx_dsp_rtcd_defs.pl
+++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl
@@ -983,16 +983,16 @@
specialize qw/vpx_sad16x32 msa/, "$sse2_x86inc";
add_proto qw/unsigned int vpx_sad16x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
-specialize qw/vpx_sad16x16 mmx media neon msa/, "$sse2_x86inc";
+specialize qw/vpx_sad16x16 media neon msa/, "$sse2_x86inc";
add_proto qw/unsigned int vpx_sad16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
-specialize qw/vpx_sad16x8 mmx neon msa/, "$sse2_x86inc";
+specialize qw/vpx_sad16x8 neon msa/, "$sse2_x86inc";
add_proto qw/unsigned int vpx_sad8x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
-specialize qw/vpx_sad8x16 mmx neon msa/, "$sse2_x86inc";
+specialize qw/vpx_sad8x16 neon msa/, "$sse2_x86inc";
add_proto qw/unsigned int vpx_sad8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
-specialize qw/vpx_sad8x8 mmx neon msa/, "$sse2_x86inc";
+specialize qw/vpx_sad8x8 neon msa/, "$sse2_x86inc";
add_proto qw/unsigned int vpx_sad8x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
specialize qw/vpx_sad8x4 msa/, "$sse2_x86inc";
@@ -1001,7 +1001,7 @@
specialize qw/vpx_sad4x8 msa/, "$sse2_x86inc";
add_proto qw/unsigned int vpx_sad4x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
-specialize qw/vpx_sad4x4 mmx neon msa/, "$sse2_x86inc";
+specialize qw/vpx_sad4x4 neon msa/, "$sse2_x86inc";
#
# Avg
@@ -1407,16 +1407,16 @@
specialize qw/vpx_variance16x32 sse2 msa/;
add_proto qw/unsigned int vpx_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_variance16x16 mmx sse2 avx2 media neon msa/;
+ specialize qw/vpx_variance16x16 sse2 avx2 media neon msa/;
add_proto qw/unsigned int vpx_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_variance16x8 mmx sse2 neon msa/;
+ specialize qw/vpx_variance16x8 sse2 neon msa/;
add_proto qw/unsigned int vpx_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_variance8x16 mmx sse2 neon msa/;
+ specialize qw/vpx_variance8x16 sse2 neon msa/;
add_proto qw/unsigned int vpx_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_variance8x8 mmx sse2 media neon msa/;
+ specialize qw/vpx_variance8x8 sse2 media neon msa/;
add_proto qw/unsigned int vpx_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vpx_variance8x4 sse2 msa/;
@@ -1425,7 +1425,7 @@
specialize qw/vpx_variance4x8 sse2 msa/;
add_proto qw/unsigned int vpx_variance4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_variance4x4 mmx sse2 msa/;
+ specialize qw/vpx_variance4x4 sse2 msa/;
#
# Specialty Variance
@@ -1434,10 +1434,10 @@
specialize qw/vpx_get16x16var sse2 avx2 neon msa/;
add_proto qw/void vpx_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
- specialize qw/vpx_get8x8var mmx sse2 neon msa/;
+ specialize qw/vpx_get8x8var sse2 neon msa/;
add_proto qw/unsigned int vpx_mse16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
- specialize qw/vpx_mse16x16 mmx sse2 avx2 media neon msa/;
+ specialize qw/vpx_mse16x16 sse2 avx2 media neon msa/;
add_proto qw/unsigned int vpx_mse16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
specialize qw/vpx_mse16x8 sse2 msa/;
@@ -1449,7 +1449,7 @@
specialize qw/vpx_mse8x8 sse2 msa/;
add_proto qw/unsigned int vpx_get_mb_ss/, "const int16_t *";
- specialize qw/vpx_get_mb_ss mmx sse2 msa/;
+ specialize qw/vpx_get_mb_ss sse2 msa/;
add_proto qw/unsigned int vpx_get4x4sse_cs/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride";
specialize qw/vpx_get4x4sse_cs neon msa/;
diff --git a/vpx_dsp/x86/sad_mmx.asm b/vpx_dsp/x86/sad_mmx.asm
deleted file mode 100644
index 9968992..0000000
--- a/vpx_dsp/x86/sad_mmx.asm
+++ /dev/null
@@ -1,427 +0,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-
-global sym(vpx_sad16x16_mmx) PRIVATE
-global sym(vpx_sad8x16_mmx) PRIVATE
-global sym(vpx_sad8x8_mmx) PRIVATE
-global sym(vpx_sad4x4_mmx) PRIVATE
-global sym(vpx_sad16x8_mmx) PRIVATE
-
-;unsigned int vpx_sad16x16_mmx(
-; unsigned char *src_ptr,
-; int src_stride,
-; unsigned char *ref_ptr,
-; int ref_stride)
-sym(vpx_sad16x16_mmx):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 4
- push rsi
- push rdi
- ; end prolog
-
- mov rsi, arg(0) ;src_ptr
- mov rdi, arg(2) ;ref_ptr
-
- movsxd rax, dword ptr arg(1) ;src_stride
- movsxd rdx, dword ptr arg(3) ;ref_stride
-
- lea rcx, [rsi+rax*8]
-
- lea rcx, [rcx+rax*8]
- pxor mm7, mm7
-
- pxor mm6, mm6
-
-.x16x16sad_mmx_loop:
-
- movq mm0, QWORD PTR [rsi]
- movq mm2, QWORD PTR [rsi+8]
-
- movq mm1, QWORD PTR [rdi]
- movq mm3, QWORD PTR [rdi+8]
-
- movq mm4, mm0
- movq mm5, mm2
-
- psubusb mm0, mm1
- psubusb mm1, mm4
-
- psubusb mm2, mm3
- psubusb mm3, mm5
-
- por mm0, mm1
- por mm2, mm3
-
- movq mm1, mm0
- movq mm3, mm2
-
- punpcklbw mm0, mm6
- punpcklbw mm2, mm6
-
- punpckhbw mm1, mm6
- punpckhbw mm3, mm6
-
- paddw mm0, mm2
- paddw mm1, mm3
-
-
- lea rsi, [rsi+rax]
- add rdi, rdx
-
- paddw mm7, mm0
- paddw mm7, mm1
-
- cmp rsi, rcx
- jne .x16x16sad_mmx_loop
-
-
- movq mm0, mm7
-
- punpcklwd mm0, mm6
- punpckhwd mm7, mm6
-
- paddw mm0, mm7
- movq mm7, mm0
-
-
- psrlq mm0, 32
- paddw mm7, mm0
-
- movq rax, mm7
-
- pop rdi
- pop rsi
- mov rsp, rbp
- ; begin epilog
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
-;unsigned int vpx_sad8x16_mmx(
-; unsigned char *src_ptr,
-; int src_stride,
-; unsigned char *ref_ptr,
-; int ref_stride)
-sym(vpx_sad8x16_mmx):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 4
- push rsi
- push rdi
- ; end prolog
-
- mov rsi, arg(0) ;src_ptr
- mov rdi, arg(2) ;ref_ptr
-
- movsxd rax, dword ptr arg(1) ;src_stride
- movsxd rdx, dword ptr arg(3) ;ref_stride
-
- lea rcx, [rsi+rax*8]
-
- lea rcx, [rcx+rax*8]
- pxor mm7, mm7
-
- pxor mm6, mm6
-
-.x8x16sad_mmx_loop:
-
- movq mm0, QWORD PTR [rsi]
- movq mm1, QWORD PTR [rdi]
-
- movq mm2, mm0
- psubusb mm0, mm1
-
- psubusb mm1, mm2
- por mm0, mm1
-
- movq mm2, mm0
- punpcklbw mm0, mm6
-
- punpckhbw mm2, mm6
- lea rsi, [rsi+rax]
-
- add rdi, rdx
- paddw mm7, mm0
-
- paddw mm7, mm2
- cmp rsi, rcx
-
- jne .x8x16sad_mmx_loop
-
- movq mm0, mm7
- punpcklwd mm0, mm6
-
- punpckhwd mm7, mm6
- paddw mm0, mm7
-
- movq mm7, mm0
- psrlq mm0, 32
-
- paddw mm7, mm0
- movq rax, mm7
-
- pop rdi
- pop rsi
- mov rsp, rbp
- ; begin epilog
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
-;unsigned int vpx_sad8x8_mmx(
-; unsigned char *src_ptr,
-; int src_stride,
-; unsigned char *ref_ptr,
-; int ref_stride)
-sym(vpx_sad8x8_mmx):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 4
- push rsi
- push rdi
- ; end prolog
-
- mov rsi, arg(0) ;src_ptr
- mov rdi, arg(2) ;ref_ptr
-
- movsxd rax, dword ptr arg(1) ;src_stride
- movsxd rdx, dword ptr arg(3) ;ref_stride
-
- lea rcx, [rsi+rax*8]
- pxor mm7, mm7
-
- pxor mm6, mm6
-
-.x8x8sad_mmx_loop:
-
- movq mm0, QWORD PTR [rsi]
- movq mm1, QWORD PTR [rdi]
-
- movq mm2, mm0
- psubusb mm0, mm1
-
- psubusb mm1, mm2
- por mm0, mm1
-
- movq mm2, mm0
- punpcklbw mm0, mm6
-
- punpckhbw mm2, mm6
- paddw mm0, mm2
-
- lea rsi, [rsi+rax]
- add rdi, rdx
-
- paddw mm7, mm0
- cmp rsi, rcx
-
- jne .x8x8sad_mmx_loop
-
- movq mm0, mm7
- punpcklwd mm0, mm6
-
- punpckhwd mm7, mm6
- paddw mm0, mm7
-
- movq mm7, mm0
- psrlq mm0, 32
-
- paddw mm7, mm0
- movq rax, mm7
-
- pop rdi
- pop rsi
- mov rsp, rbp
- ; begin epilog
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
-;unsigned int vpx_sad4x4_mmx(
-; unsigned char *src_ptr,
-; int src_stride,
-; unsigned char *ref_ptr,
-; int ref_stride)
-sym(vpx_sad4x4_mmx):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 4
- push rsi
- push rdi
- ; end prolog
-
- mov rsi, arg(0) ;src_ptr
- mov rdi, arg(2) ;ref_ptr
-
- movsxd rax, dword ptr arg(1) ;src_stride
- movsxd rdx, dword ptr arg(3) ;ref_stride
-
- movd mm0, DWORD PTR [rsi]
- movd mm1, DWORD PTR [rdi]
-
- movd mm2, DWORD PTR [rsi+rax]
- movd mm3, DWORD PTR [rdi+rdx]
-
- punpcklbw mm0, mm2
- punpcklbw mm1, mm3
-
- movq mm2, mm0
- psubusb mm0, mm1
-
- psubusb mm1, mm2
- por mm0, mm1
-
- movq mm2, mm0
- pxor mm3, mm3
-
- punpcklbw mm0, mm3
- punpckhbw mm2, mm3
-
- paddw mm0, mm2
-
- lea rsi, [rsi+rax*2]
- lea rdi, [rdi+rdx*2]
-
- movd mm4, DWORD PTR [rsi]
- movd mm5, DWORD PTR [rdi]
-
- movd mm6, DWORD PTR [rsi+rax]
- movd mm7, DWORD PTR [rdi+rdx]
-
- punpcklbw mm4, mm6
- punpcklbw mm5, mm7
-
- movq mm6, mm4
- psubusb mm4, mm5
-
- psubusb mm5, mm6
- por mm4, mm5
-
- movq mm5, mm4
- punpcklbw mm4, mm3
-
- punpckhbw mm5, mm3
- paddw mm4, mm5
-
- paddw mm0, mm4
- movq mm1, mm0
-
- punpcklwd mm0, mm3
- punpckhwd mm1, mm3
-
- paddw mm0, mm1
- movq mm1, mm0
-
- psrlq mm0, 32
- paddw mm0, mm1
-
- movq rax, mm0
-
- pop rdi
- pop rsi
- mov rsp, rbp
- ; begin epilog
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
-;unsigned int vpx_sad16x8_mmx(
-; unsigned char *src_ptr,
-; int src_stride,
-; unsigned char *ref_ptr,
-; int ref_stride)
-sym(vpx_sad16x8_mmx):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 4
- push rsi
- push rdi
- ; end prolog
-
- mov rsi, arg(0) ;src_ptr
- mov rdi, arg(2) ;ref_ptr
-
- movsxd rax, dword ptr arg(1) ;src_stride
- movsxd rdx, dword ptr arg(3) ;ref_stride
-
- lea rcx, [rsi+rax*8]
- pxor mm7, mm7
-
- pxor mm6, mm6
-
-.x16x8sad_mmx_loop:
-
- movq mm0, [rsi]
- movq mm1, [rdi]
-
- movq mm2, [rsi+8]
- movq mm3, [rdi+8]
-
- movq mm4, mm0
- movq mm5, mm2
-
- psubusb mm0, mm1
- psubusb mm1, mm4
-
- psubusb mm2, mm3
- psubusb mm3, mm5
-
- por mm0, mm1
- por mm2, mm3
-
- movq mm1, mm0
- movq mm3, mm2
-
- punpcklbw mm0, mm6
- punpckhbw mm1, mm6
-
- punpcklbw mm2, mm6
- punpckhbw mm3, mm6
-
-
- paddw mm0, mm2
- paddw mm1, mm3
-
- paddw mm0, mm1
- lea rsi, [rsi+rax]
-
- add rdi, rdx
- paddw mm7, mm0
-
- cmp rsi, rcx
- jne .x16x8sad_mmx_loop
-
- movq mm0, mm7
- punpcklwd mm0, mm6
-
- punpckhwd mm7, mm6
- paddw mm0, mm7
-
- movq mm7, mm0
- psrlq mm0, 32
-
- paddw mm7, mm0
- movq rax, mm7
-
- pop rdi
- pop rsi
- mov rsp, rbp
- ; begin epilog
- UNSHADOW_ARGS
- pop rbp
- ret
diff --git a/vpx_dsp/x86/variance_impl_mmx.asm b/vpx_dsp/x86/variance_impl_mmx.asm
index b8ba79b..f4de42a 100644
--- a/vpx_dsp/x86/variance_impl_mmx.asm
+++ b/vpx_dsp/x86/variance_impl_mmx.asm
@@ -13,407 +13,6 @@
%define mmx_filter_shift 7
-;unsigned int vpx_get_mb_ss_mmx( short *src_ptr )
-global sym(vpx_get_mb_ss_mmx) PRIVATE
-sym(vpx_get_mb_ss_mmx):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 7
- GET_GOT rbx
- push rsi
- push rdi
- sub rsp, 8
- ; end prolog
-
- mov rax, arg(0) ;src_ptr
- mov rcx, 16
- pxor mm4, mm4
-
-.NEXTROW:
- movq mm0, [rax]
- movq mm1, [rax+8]
- movq mm2, [rax+16]
- movq mm3, [rax+24]
- pmaddwd mm0, mm0
- pmaddwd mm1, mm1
- pmaddwd mm2, mm2
- pmaddwd mm3, mm3
-
- paddd mm4, mm0
- paddd mm4, mm1
- paddd mm4, mm2
- paddd mm4, mm3
-
- add rax, 32
- dec rcx
- ja .NEXTROW
- movq QWORD PTR [rsp], mm4
-
- ;return sum[0]+sum[1];
- movsxd rax, dword ptr [rsp]
- movsxd rcx, dword ptr [rsp+4]
- add rax, rcx
-
- ; begin epilog
- add rsp, 8
- pop rdi
- pop rsi
- RESTORE_GOT
- UNSHADOW_ARGS
- pop rbp
- ret
-
-;void vpx_get8x8var_mmx
-;(
-; unsigned char *src_ptr,
-; int source_stride,
-; unsigned char *ref_ptr,
-; int recon_stride,
-; unsigned int *SSE,
-; int *Sum
-;)
-global sym(vpx_get8x8var_mmx) PRIVATE
-sym(vpx_get8x8var_mmx):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
- push rsi
- push rdi
- push rbx
- sub rsp, 16
- ; end prolog
-
- pxor mm5, mm5 ; Blank mmx6
- pxor mm6, mm6 ; Blank mmx7
- pxor mm7, mm7 ; Blank mmx7
-
- mov rax, arg(0) ;[src_ptr] ; Load base addresses
- mov rbx, arg(2) ;[ref_ptr]
- movsxd rcx, dword ptr arg(1) ;[source_stride]
- movsxd rdx, dword ptr arg(3) ;[recon_stride]
-
- ; Row 1
- movq mm0, [rax] ; Copy eight bytes to mm0
- movq mm1, [rbx] ; Copy eight bytes to mm1
- movq mm2, mm0 ; Take copies
- movq mm3, mm1 ; Take copies
-
- punpcklbw mm0, mm6 ; unpack to higher prrcision
- punpcklbw mm1, mm6
- punpckhbw mm2, mm6 ; unpack to higher prrcision
- punpckhbw mm3, mm6
- psubsw mm0, mm1 ; A-B (low order) to MM0
- psubsw mm2, mm3 ; A-B (high order) to MM2
-
- paddw mm5, mm0 ; accumulate differences in mm5
- paddw mm5, mm2 ; accumulate differences in mm5
-
- pmaddwd mm0, mm0 ; square and accumulate
- pmaddwd mm2, mm2 ; square and accumulate
- add rbx,rdx ; Inc pointer into ref data
- add rax,rcx ; Inc pointer into the new data
- movq mm1, [rbx] ; Copy eight bytes to mm1
- paddd mm7, mm0 ; accumulate in mm7
- paddd mm7, mm2 ; accumulate in mm7
-
- ; Row 2
- movq mm0, [rax] ; Copy eight bytes to mm0
- movq mm2, mm0 ; Take copies
- movq mm3, mm1 ; Take copies
-
- punpcklbw mm0, mm6 ; unpack to higher prrcision
- punpcklbw mm1, mm6
- punpckhbw mm2, mm6 ; unpack to higher prrcision
- punpckhbw mm3, mm6
- psubsw mm0, mm1 ; A-B (low order) to MM0
- psubsw mm2, mm3 ; A-B (high order) to MM2
-
- paddw mm5, mm0 ; accumulate differences in mm5
- paddw mm5, mm2 ; accumulate differences in mm5
-
- pmaddwd mm0, mm0 ; square and accumulate
- pmaddwd mm2, mm2 ; square and accumulate
- add rbx,rdx ; Inc pointer into ref data
- add rax,rcx ; Inc pointer into the new data
- movq mm1, [rbx] ; Copy eight bytes to mm1
- paddd mm7, mm0 ; accumulate in mm7
- paddd mm7, mm2 ; accumulate in mm7
-
- ; Row 3
- movq mm0, [rax] ; Copy eight bytes to mm0
- movq mm2, mm0 ; Take copies
- movq mm3, mm1 ; Take copies
-
- punpcklbw mm0, mm6 ; unpack to higher prrcision
- punpcklbw mm1, mm6
- punpckhbw mm2, mm6 ; unpack to higher prrcision
- punpckhbw mm3, mm6
- psubsw mm0, mm1 ; A-B (low order) to MM0
- psubsw mm2, mm3 ; A-B (high order) to MM2
-
- paddw mm5, mm0 ; accumulate differences in mm5
- paddw mm5, mm2 ; accumulate differences in mm5
-
- pmaddwd mm0, mm0 ; square and accumulate
- pmaddwd mm2, mm2 ; square and accumulate
- add rbx,rdx ; Inc pointer into ref data
- add rax,rcx ; Inc pointer into the new data
- movq mm1, [rbx] ; Copy eight bytes to mm1
- paddd mm7, mm0 ; accumulate in mm7
- paddd mm7, mm2 ; accumulate in mm7
-
- ; Row 4
- movq mm0, [rax] ; Copy eight bytes to mm0
- movq mm2, mm0 ; Take copies
- movq mm3, mm1 ; Take copies
-
- punpcklbw mm0, mm6 ; unpack to higher prrcision
- punpcklbw mm1, mm6
- punpckhbw mm2, mm6 ; unpack to higher prrcision
- punpckhbw mm3, mm6
- psubsw mm0, mm1 ; A-B (low order) to MM0
- psubsw mm2, mm3 ; A-B (high order) to MM2
-
- paddw mm5, mm0 ; accumulate differences in mm5
- paddw mm5, mm2 ; accumulate differences in mm5
-
- pmaddwd mm0, mm0 ; square and accumulate
- pmaddwd mm2, mm2 ; square and accumulate
- add rbx,rdx ; Inc pointer into ref data
- add rax,rcx ; Inc pointer into the new data
- movq mm1, [rbx] ; Copy eight bytes to mm1
- paddd mm7, mm0 ; accumulate in mm7
- paddd mm7, mm2 ; accumulate in mm7
-
- ; Row 5
- movq mm0, [rax] ; Copy eight bytes to mm0
- movq mm2, mm0 ; Take copies
- movq mm3, mm1 ; Take copies
-
- punpcklbw mm0, mm6 ; unpack to higher prrcision
- punpcklbw mm1, mm6
- punpckhbw mm2, mm6 ; unpack to higher prrcision
- punpckhbw mm3, mm6
- psubsw mm0, mm1 ; A-B (low order) to MM0
- psubsw mm2, mm3 ; A-B (high order) to MM2
-
- paddw mm5, mm0 ; accumulate differences in mm5
- paddw mm5, mm2 ; accumulate differences in mm5
-
- pmaddwd mm0, mm0 ; square and accumulate
- pmaddwd mm2, mm2 ; square and accumulate
- add rbx,rdx ; Inc pointer into ref data
- add rax,rcx ; Inc pointer into the new data
- movq mm1, [rbx] ; Copy eight bytes to mm1
- ; movq mm4, [rbx + rdx]
- paddd mm7, mm0 ; accumulate in mm7
- paddd mm7, mm2 ; accumulate in mm7
-
- ; Row 6
- movq mm0, [rax] ; Copy eight bytes to mm0
- movq mm2, mm0 ; Take copies
- movq mm3, mm1 ; Take copies
-
- punpcklbw mm0, mm6 ; unpack to higher prrcision
- punpcklbw mm1, mm6
- punpckhbw mm2, mm6 ; unpack to higher prrcision
- punpckhbw mm3, mm6
- psubsw mm0, mm1 ; A-B (low order) to MM0
- psubsw mm2, mm3 ; A-B (high order) to MM2
-
- paddw mm5, mm0 ; accumulate differences in mm5
- paddw mm5, mm2 ; accumulate differences in mm5
-
- pmaddwd mm0, mm0 ; square and accumulate
- pmaddwd mm2, mm2 ; square and accumulate
- add rbx,rdx ; Inc pointer into ref data
- add rax,rcx ; Inc pointer into the new data
- movq mm1, [rbx] ; Copy eight bytes to mm1
- paddd mm7, mm0 ; accumulate in mm7
- paddd mm7, mm2 ; accumulate in mm7
-
- ; Row 7
- movq mm0, [rax] ; Copy eight bytes to mm0
- movq mm2, mm0 ; Take copies
- movq mm3, mm1 ; Take copies
-
- punpcklbw mm0, mm6 ; unpack to higher prrcision
- punpcklbw mm1, mm6
- punpckhbw mm2, mm6 ; unpack to higher prrcision
- punpckhbw mm3, mm6
- psubsw mm0, mm1 ; A-B (low order) to MM0
- psubsw mm2, mm3 ; A-B (high order) to MM2
-
- paddw mm5, mm0 ; accumulate differences in mm5
- paddw mm5, mm2 ; accumulate differences in mm5
-
- pmaddwd mm0, mm0 ; square and accumulate
- pmaddwd mm2, mm2 ; square and accumulate
- add rbx,rdx ; Inc pointer into ref data
- add rax,rcx ; Inc pointer into the new data
- movq mm1, [rbx] ; Copy eight bytes to mm1
- paddd mm7, mm0 ; accumulate in mm7
- paddd mm7, mm2 ; accumulate in mm7
-
- ; Row 8
- movq mm0, [rax] ; Copy eight bytes to mm0
- movq mm2, mm0 ; Take copies
- movq mm3, mm1 ; Take copies
-
- punpcklbw mm0, mm6 ; unpack to higher prrcision
- punpcklbw mm1, mm6
- punpckhbw mm2, mm6 ; unpack to higher prrcision
- punpckhbw mm3, mm6
- psubsw mm0, mm1 ; A-B (low order) to MM0
- psubsw mm2, mm3 ; A-B (high order) to MM2
-
- paddw mm5, mm0 ; accumulate differences in mm5
- paddw mm5, mm2 ; accumulate differences in mm5
-
- pmaddwd mm0, mm0 ; square and accumulate
- pmaddwd mm2, mm2 ; square and accumulate
- add rbx,rdx ; Inc pointer into ref data
- add rax,rcx ; Inc pointer into the new data
- paddd mm7, mm0 ; accumulate in mm7
- paddd mm7, mm2 ; accumulate in mm7
-
- ; Now accumulate the final results.
- movq QWORD PTR [rsp+8], mm5 ; copy back accumulated results into normal memory
- movq QWORD PTR [rsp], mm7 ; copy back accumulated results into normal memory
- movsx rdx, WORD PTR [rsp+8]
- movsx rcx, WORD PTR [rsp+10]
- movsx rbx, WORD PTR [rsp+12]
- movsx rax, WORD PTR [rsp+14]
- add rdx, rcx
- add rbx, rax
- add rdx, rbx ;XSum
- movsxd rax, DWORD PTR [rsp]
- movsxd rcx, DWORD PTR [rsp+4]
- add rax, rcx ;XXSum
- mov rsi, arg(4) ;SSE
- mov rdi, arg(5) ;Sum
- mov dword ptr [rsi], eax
- mov dword ptr [rdi], edx
- xor rax, rax ; return 0
-
- ; begin epilog
- add rsp, 16
- pop rbx
- pop rdi
- pop rsi
- UNSHADOW_ARGS
- pop rbp
- ret
-
-;void
-;vpx_get4x4var_mmx
-;(
-; unsigned char *src_ptr,
-; int source_stride,
-; unsigned char *ref_ptr,
-; int recon_stride,
-; unsigned int *SSE,
-; int *Sum
-;)
-global sym(vpx_get4x4var_mmx) PRIVATE
-sym(vpx_get4x4var_mmx):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
- push rsi
- push rdi
- push rbx
- sub rsp, 16
- ; end prolog
-
- pxor mm5, mm5 ; Blank mmx6
- pxor mm6, mm6 ; Blank mmx7
- pxor mm7, mm7 ; Blank mmx7
-
- mov rax, arg(0) ;[src_ptr] ; Load base addresses
- mov rbx, arg(2) ;[ref_ptr]
- movsxd rcx, dword ptr arg(1) ;[source_stride]
- movsxd rdx, dword ptr arg(3) ;[recon_stride]
-
- ; Row 1
- movd mm0, [rax] ; Copy four bytes to mm0
- movd mm1, [rbx] ; Copy four bytes to mm1
- punpcklbw mm0, mm6 ; unpack to higher prrcision
- punpcklbw mm1, mm6
- psubsw mm0, mm1 ; A-B (low order) to MM0
- paddw mm5, mm0 ; accumulate differences in mm5
- pmaddwd mm0, mm0 ; square and accumulate
- add rbx,rdx ; Inc pointer into ref data
- add rax,rcx ; Inc pointer into the new data
- movd mm1, [rbx] ; Copy four bytes to mm1
- paddd mm7, mm0 ; accumulate in mm7
-
- ; Row 2
- movd mm0, [rax] ; Copy four bytes to mm0
- punpcklbw mm0, mm6 ; unpack to higher prrcision
- punpcklbw mm1, mm6
- psubsw mm0, mm1 ; A-B (low order) to MM0
- paddw mm5, mm0 ; accumulate differences in mm5
-
- pmaddwd mm0, mm0 ; square and accumulate
- add rbx,rdx ; Inc pointer into ref data
- add rax,rcx ; Inc pointer into the new data
- movd mm1, [rbx] ; Copy four bytes to mm1
- paddd mm7, mm0 ; accumulate in mm7
-
- ; Row 3
- movd mm0, [rax] ; Copy four bytes to mm0
- punpcklbw mm0, mm6 ; unpack to higher precision
- punpcklbw mm1, mm6
- psubsw mm0, mm1 ; A-B (low order) to MM0
- paddw mm5, mm0 ; accumulate differences in mm5
-
- pmaddwd mm0, mm0 ; square and accumulate
- add rbx,rdx ; Inc pointer into ref data
- add rax,rcx ; Inc pointer into the new data
- movd mm1, [rbx] ; Copy four bytes to mm1
- paddd mm7, mm0 ; accumulate in mm7
-
- ; Row 4
- movd mm0, [rax] ; Copy four bytes to mm0
-
- punpcklbw mm0, mm6 ; unpack to higher prrcision
- punpcklbw mm1, mm6
- psubsw mm0, mm1 ; A-B (low order) to MM0
-
- paddw mm5, mm0 ; accumulate differences in mm5
-
- pmaddwd mm0, mm0 ; square and accumulate
- paddd mm7, mm0 ; accumulate in mm7
-
- ; Now accumulate the final results.
- movq QWORD PTR [rsp+8], mm5 ; copy back accumulated results into normal memory
- movq QWORD PTR [rsp], mm7 ; copy back accumulated results into normal memory
- movsx rdx, WORD PTR [rsp+8]
- movsx rcx, WORD PTR [rsp+10]
- movsx rbx, WORD PTR [rsp+12]
- movsx rax, WORD PTR [rsp+14]
- add rdx, rcx
- add rbx, rax
- add rdx, rbx ;XSum
- movsxd rax, DWORD PTR [rsp]
- movsxd rcx, DWORD PTR [rsp+4]
- add rax, rcx ;XXSum
- mov rsi, arg(4) ;SSE
- mov rdi, arg(5) ;Sum
- mov dword ptr [rsi], eax
- mov dword ptr [rdi], edx
- xor rax, rax ; return 0
-
- ; begin epilog
- add rsp, 16
- pop rbx
- pop rdi
- pop rsi
- UNSHADOW_ARGS
- pop rbp
- ret
-
;void vpx_filter_block2d_bil4x4_var_mmx
;(
; unsigned char *ref_ptr,
diff --git a/vpx_dsp/x86/variance_mmx.c b/vpx_dsp/x86/variance_mmx.c
index f04f4e2..636231d 100644
--- a/vpx_dsp/x86/variance_mmx.c
+++ b/vpx_dsp/x86/variance_mmx.c
@@ -23,10 +23,6 @@
{ 16, 16, 16, 16, 112, 112, 112, 112 }
};
-extern void vpx_get4x4var_mmx(const uint8_t *a, int a_stride,
- const uint8_t *b, int b_stride,
- unsigned int *sse, int *sum);
-
extern void vpx_filter_block2d_bil4x4_var_mmx(const unsigned char *ref_ptr,
int ref_pixels_per_line,
const unsigned char *src_ptr,
@@ -47,98 +43,6 @@
unsigned int *sumsquared);
-unsigned int vpx_variance4x4_mmx(const unsigned char *a, int a_stride,
- const unsigned char *b, int b_stride,
- unsigned int *sse) {
- unsigned int var;
- int avg;
-
- vpx_get4x4var_mmx(a, a_stride, b, b_stride, &var, &avg);
- *sse = var;
- return (var - (((unsigned int)avg * avg) >> 4));
-}
-
-unsigned int vpx_variance8x8_mmx(const unsigned char *a, int a_stride,
- const unsigned char *b, int b_stride,
- unsigned int *sse) {
- unsigned int var;
- int avg;
-
- vpx_get8x8var_mmx(a, a_stride, b, b_stride, &var, &avg);
- *sse = var;
-
- return (var - (((unsigned int)avg * avg) >> 6));
-}
-
-unsigned int vpx_mse16x16_mmx(const unsigned char *a, int a_stride,
- const unsigned char *b, int b_stride,
- unsigned int *sse) {
- unsigned int sse0, sse1, sse2, sse3, var;
- int sum0, sum1, sum2, sum3;
-
- vpx_get8x8var_mmx(a, a_stride, b, b_stride, &sse0, &sum0);
- vpx_get8x8var_mmx(a + 8, a_stride, b + 8, b_stride, &sse1, &sum1);
- vpx_get8x8var_mmx(a + 8 * a_stride, a_stride,
- b + 8 * b_stride, b_stride, &sse2, &sum2);
- vpx_get8x8var_mmx(a + 8 * a_stride + 8, a_stride,
- b + 8 * b_stride + 8, b_stride, &sse3, &sum3);
-
- var = sse0 + sse1 + sse2 + sse3;
- *sse = var;
- return var;
-}
-
-unsigned int vpx_variance16x16_mmx(const unsigned char *a, int a_stride,
- const unsigned char *b, int b_stride,
- unsigned int *sse) {
- unsigned int sse0, sse1, sse2, sse3, var;
- int sum0, sum1, sum2, sum3, avg;
-
- vpx_get8x8var_mmx(a, a_stride, b, b_stride, &sse0, &sum0);
- vpx_get8x8var_mmx(a + 8, a_stride, b + 8, b_stride, &sse1, &sum1);
- vpx_get8x8var_mmx(a + 8 * a_stride, a_stride,
- b + 8 * b_stride, b_stride, &sse2, &sum2);
- vpx_get8x8var_mmx(a + 8 * a_stride + 8, a_stride,
- b + 8 * b_stride + 8, b_stride, &sse3, &sum3);
-
- var = sse0 + sse1 + sse2 + sse3;
- avg = sum0 + sum1 + sum2 + sum3;
- *sse = var;
- return (var - (((unsigned int)avg * avg) >> 8));
-}
-
-unsigned int vpx_variance16x8_mmx(const unsigned char *a, int a_stride,
- const unsigned char *b, int b_stride,
- unsigned int *sse) {
- unsigned int sse0, sse1, var;
- int sum0, sum1, avg;
-
- vpx_get8x8var_mmx(a, a_stride, b, b_stride, &sse0, &sum0);
- vpx_get8x8var_mmx(a + 8, a_stride, b + 8, b_stride, &sse1, &sum1);
-
- var = sse0 + sse1;
- avg = sum0 + sum1;
- *sse = var;
- return (var - (((unsigned int)avg * avg) >> 7));
-}
-
-unsigned int vpx_variance8x16_mmx(const unsigned char *a, int a_stride,
- const unsigned char *b, int b_stride,
- unsigned int *sse) {
- unsigned int sse0, sse1, var;
- int sum0, sum1, avg;
-
- vpx_get8x8var_mmx(a, a_stride, b, b_stride, &sse0, &sum0);
- vpx_get8x8var_mmx(a + 8 * a_stride, a_stride,
- b + 8 * b_stride, b_stride, &sse1, &sum1);
-
- var = sse0 + sse1;
- avg = sum0 + sum1;
- *sse = var;
-
- return (var - (((unsigned int)avg * avg) >> 7));
-}
-
uint32_t vpx_sub_pixel_variance4x4_mmx(const uint8_t *a, int a_stride,
int xoffset, int yoffset,
const uint8_t *b, int b_stride,