Merge "Proper fix of a msvc compiler warning"
diff --git a/build/make/iosbuild.sh b/build/make/iosbuild.sh
index 6f7180d..ae5ba18 100755
--- a/build/make/iosbuild.sh
+++ b/build/make/iosbuild.sh
@@ -29,11 +29,14 @@
LIBVPX_SOURCE_DIR=$(cd ${SCRIPT_DIR}/../..; pwd)
LIPO=$(xcrun -sdk iphoneos${SDK} -find lipo)
ORIG_PWD="$(pwd)"
-TARGETS="arm64-darwin-gcc
- armv7-darwin-gcc
- armv7s-darwin-gcc
- x86-iphonesimulator-gcc
- x86_64-iphonesimulator-gcc"
+ARM_TARGETS="arm64-darwin-gcc
+ armv7-darwin-gcc
+ armv7s-darwin-gcc"
+SIM_TARGETS="x86-iphonesimulator-gcc
+ x86_64-iphonesimulator-gcc"
+OSX_TARGETS="x86-darwin15-gcc
+ x86_64-darwin15-gcc"
+TARGETS="${ARM_TARGETS} ${SIM_TARGETS}"
# Configures for the target specified by $1, and invokes make with the dist
# target using $DIST_DIR as the distribution output directory.
@@ -197,15 +200,27 @@
fi
}
+print_list() {
+ local indent="$1"
+ shift
+ local list="$@"
+ for entry in ${list}; do
+ echo "${indent}${entry}"
+ done
+}
+
iosbuild_usage() {
cat << EOF
Usage: ${0##*/} [arguments]
--help: Display this message and exit.
--extra-configure-args <args>: Extra args to pass when configuring libvpx.
+ --macosx: Uses darwin15 targets instead of iphonesimulator targets for x86
+ and x86_64. Allows linking to framework when builds target MacOSX
+ instead of iOS.
--preserve-build-output: Do not delete the build directory.
--show-build-output: Show output from each library build.
--targets <targets>: Override default target list. Defaults:
- ${TARGETS}
+$(print_list " " ${TARGETS})
--test-link: Confirms all targets can be linked. Functionally identical to
passing --enable-examples via --extra-configure-args.
--verbose: Output information about the environment and each stage of the
@@ -249,6 +264,9 @@
TARGETS="$2"
shift
;;
+ --macosx)
+ TARGETS="${ARM_TARGETS} ${OSX_TARGETS}"
+ ;;
--verbose)
VERBOSE=yes
;;
@@ -273,10 +291,12 @@
MAKEFLAGS=${MAKEFLAGS}
ORIG_PWD=${ORIG_PWD}
PRESERVE_BUILD_OUTPUT=${PRESERVE_BUILD_OUTPUT}
- TARGETS="${TARGETS}"
+ TARGETS="$(print_list "" ${TARGETS})"
+ OSX_TARGETS="${OSX_TARGETS}"
+ SIM_TARGETS="${SIM_TARGETS}"
EOF
fi
build_framework "${TARGETS}"
echo "Successfully built '${FRAMEWORK_DIR}' for:"
-echo " ${TARGETS}"
+print_list "" ${TARGETS}
diff --git a/test/test.mk b/test/test.mk
index 8d66244..2487bd2 100644
--- a/test/test.mk
+++ b/test/test.mk
@@ -92,10 +92,9 @@
## shared library builds don't make these functions accessible.
##
ifeq ($(CONFIG_SHARED),)
-LIBVPX_TEST_SRCS-$(CONFIG_VP9) += lpf_8_test.cc
## VP8
-ifneq ($(CONFIG_VP8_ENCODER)$(CONFIG_VP8_DECODER),)
+ifeq ($(CONFIG_VP8),yes)
# These tests require both the encoder and decoder to be built.
ifeq ($(CONFIG_VP8_ENCODER)$(CONFIG_VP8_DECODER),yesyes)
@@ -105,10 +104,10 @@
LIBVPX_TEST_SRCS-$(CONFIG_POSTPROC) += pp_filter_test.cc
LIBVPX_TEST_SRCS-$(CONFIG_VP8_DECODER) += vp8_decrypt_test.cc
+LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += quantize_test.cc
LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += set_roi.cc
LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += variance_test.cc
LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += vp8_fdct4x4_test.cc
-LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += quantize_test.cc
LIBVPX_TEST_SRCS-yes += idct_test.cc
LIBVPX_TEST_SRCS-yes += sixtap_predict_test.cc
@@ -121,7 +120,7 @@
endif # VP8
## VP9
-ifneq ($(CONFIG_VP9_ENCODER)$(CONFIG_VP9_DECODER),)
+ifeq ($(CONFIG_VP9),yes)
# These tests require both the encoder and decoder to be built.
ifeq ($(CONFIG_VP9_ENCODER)$(CONFIG_VP9_DECODER),yesyes)
@@ -134,25 +133,25 @@
LIBVPX_TEST_SRCS-yes += vp9_encoder_parms_get_to_decoder.cc
endif
-LIBVPX_TEST_SRCS-$(CONFIG_VP9) += convolve_test.cc
-LIBVPX_TEST_SRCS-$(CONFIG_VP9_DECODER) += vp9_thread_test.cc
+LIBVPX_TEST_SRCS-yes += convolve_test.cc
+LIBVPX_TEST_SRCS-yes += lpf_8_test.cc
+LIBVPX_TEST_SRCS-yes += vp9_intrapred_test.cc
LIBVPX_TEST_SRCS-$(CONFIG_VP9_DECODER) += vp9_decrypt_test.cc
+LIBVPX_TEST_SRCS-$(CONFIG_VP9_DECODER) += vp9_thread_test.cc
LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += dct16x16_test.cc
LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += dct32x32_test.cc
LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += fdct4x4_test.cc
LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += fdct8x8_test.cc
LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += variance_test.cc
-LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_subtract_test.cc
LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_avg_test.cc
LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_error_block_test.cc
LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_quantize_test.cc
-LIBVPX_TEST_SRCS-$(CONFIG_VP9) += vp9_intrapred_test.cc
+LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_subtract_test.cc
ifeq ($(CONFIG_VP9_ENCODER),yes)
LIBVPX_TEST_SRCS-$(CONFIG_SPATIAL_SVC) += svc_test.cc
LIBVPX_TEST_SRCS-$(CONFIG_INTERNAL_STATS) += blockiness_test.cc
LIBVPX_TEST_SRCS-$(CONFIG_INTERNAL_STATS) += consistency_test.cc
-
endif
ifeq ($(CONFIG_VP9_ENCODER)$(CONFIG_VP9_TEMPORAL_DENOISING),yesyes)
@@ -162,14 +161,19 @@
endif # VP9
-LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS) += sad_test.cc
-
-TEST_INTRA_PRED_SPEED_SRCS-$(CONFIG_VP9) := test_intra_pred_speed.cc
-TEST_INTRA_PRED_SPEED_SRCS-$(CONFIG_VP9) += ../md5_utils.h ../md5_utils.c
-
## VP10
+ifeq ($(CONFIG_VP10),yes)
+
+LIBVPX_TEST_SRCS-yes += vp10_inv_txfm_test.cc
LIBVPX_TEST_SRCS-$(CONFIG_VP10_ENCODER) += vp10_dct_test.cc
-LIBVPX_TEST_SRCS-$(CONFIG_VP10) += vp10_inv_txfm_test.cc
+
+endif # VP10
+
+## Multi-codec / unconditional whitebox tests.
+LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS) += sad_test.cc
+
+TEST_INTRA_PRED_SPEED_SRCS-yes := test_intra_pred_speed.cc
+TEST_INTRA_PRED_SPEED_SRCS-yes += ../md5_utils.h ../md5_utils.c
endif # CONFIG_SHARED
diff --git a/test/test_intra_pred_speed.cc b/test/test_intra_pred_speed.cc
index 5f4c48f..c270072 100644
--- a/test/test_intra_pred_speed.cc
+++ b/test/test_intra_pred_speed.cc
@@ -187,15 +187,10 @@
vpx_d153_predictor_4x4_c, vpx_d207_predictor_4x4_c,
vpx_d63_predictor_4x4_c, vpx_tm_predictor_4x4_c)
-#if HAVE_SSE && CONFIG_USE_X86INC
-INTRA_PRED_TEST(SSE, TestIntraPred4, vpx_dc_predictor_4x4_sse,
- vpx_dc_left_predictor_4x4_sse, vpx_dc_top_predictor_4x4_sse,
- vpx_dc_128_predictor_4x4_sse, vpx_v_predictor_4x4_sse, NULL,
- NULL, NULL, NULL, NULL, NULL, NULL, NULL)
-#endif // HAVE_SSE && CONFIG_USE_X86INC
-
#if HAVE_SSE2 && CONFIG_USE_X86INC
-INTRA_PRED_TEST(SSE2, TestIntraPred4, NULL, NULL, NULL, NULL, NULL,
+INTRA_PRED_TEST(SSE2, TestIntraPred4, vpx_dc_predictor_4x4_sse2,
+ vpx_dc_left_predictor_4x4_sse2, vpx_dc_top_predictor_4x4_sse2,
+ vpx_dc_128_predictor_4x4_sse2, vpx_v_predictor_4x4_sse2,
vpx_h_predictor_4x4_sse2, NULL, NULL, NULL, NULL, NULL, NULL,
vpx_tm_predictor_4x4_sse2)
#endif // HAVE_SSE2 && CONFIG_USE_X86INC
diff --git a/test/vp9_avg_test.cc b/test/vp9_avg_test.cc
index 290bdc7..cbc667e 100644
--- a/test/vp9_avg_test.cc
+++ b/test/vp9_avg_test.cc
@@ -372,7 +372,10 @@
::testing::Values(
make_tuple(16, 16, 0, 8, &vp9_avg_8x8_neon),
make_tuple(16, 16, 5, 8, &vp9_avg_8x8_neon),
- make_tuple(32, 32, 15, 8, &vp9_avg_8x8_neon)));
+ make_tuple(32, 32, 15, 8, &vp9_avg_8x8_neon),
+ make_tuple(16, 16, 0, 4, &vp9_avg_4x4_neon),
+ make_tuple(16, 16, 5, 4, &vp9_avg_4x4_neon),
+ make_tuple(32, 32, 15, 4, &vp9_avg_4x4_neon)));
INSTANTIATE_TEST_CASE_P(
NEON, IntProRowTest, ::testing::Values(
diff --git a/third_party/x86inc/x86inc.asm b/third_party/x86inc/x86inc.asm
index be59de3..e7d3fa5 100644
--- a/third_party/x86inc/x86inc.asm
+++ b/third_party/x86inc/x86inc.asm
@@ -119,7 +119,7 @@
%if ABI_IS_32BIT
%if CONFIG_PIC=1
%ifidn __OUTPUT_FORMAT__,elf32
- %define GET_GOT_SAVE_ARG 1
+ %define GET_GOT_DEFINED 1
%define WRT_PLT wrt ..plt
%macro GET_GOT 1
extern _GLOBAL_OFFSET_TABLE_
@@ -138,7 +138,7 @@
%define RESTORE_GOT pop %1
%endmacro
%elifidn __OUTPUT_FORMAT__,macho32
- %define GET_GOT_SAVE_ARG 1
+ %define GET_GOT_DEFINED 1
%macro GET_GOT 1
push %1
call %%get_got
@@ -149,6 +149,8 @@
%undef RESTORE_GOT
%define RESTORE_GOT pop %1
%endmacro
+ %else
+ %define GET_GOT_DEFINED 0
%endif
%endif
diff --git a/vp10/encoder/block.h b/vp10/encoder/block.h
index cb2a234..ab0252b 100644
--- a/vp10/encoder/block.h
+++ b/vp10/encoder/block.h
@@ -70,6 +70,8 @@
int rddiv;
int rdmult;
int mb_energy;
+ int * m_search_count_ptr;
+ int * ex_search_count_ptr;
// These are set to their default values at the beginning, and then adjusted
// further in the encoding process.
diff --git a/vp10/encoder/encodeframe.c b/vp10/encoder/encodeframe.c
index f1814a9..44ca276 100644
--- a/vp10/encoder/encodeframe.c
+++ b/vp10/encoder/encodeframe.c
@@ -2642,6 +2642,10 @@
TOKENEXTRA *tok = cpi->tile_tok[tile_row][tile_col];
int mi_row;
+ // Set up pointers to per thread motion search counters.
+ td->mb.m_search_count_ptr = &td->rd_counts.m_search_count;
+ td->mb.ex_search_count_ptr = &td->rd_counts.ex_search_count;
+
for (mi_row = tile_info->mi_row_start; mi_row < tile_info->mi_row_end;
mi_row += MI_BLOCK_SIZE) {
encode_rd_sb_row(cpi, td, this_tile, mi_row, &tok);
@@ -2695,6 +2699,8 @@
vp10_zero(rdc->coef_counts);
vp10_zero(rdc->comp_pred_diff);
vp10_zero(rdc->filter_diff);
+ rdc->m_search_count = 0; // Count of motion search hits.
+ rdc->ex_search_count = 0; // Exhaustive mesh search hits.
for (i = 0; i < MAX_SEGMENTS; ++i) {
const int qindex = CONFIG_MISC_FIXES && cm->seg.enabled ?
diff --git a/vp10/encoder/encoder.c b/vp10/encoder/encoder.c
index 630f35e..6bba848 100644
--- a/vp10/encoder/encoder.c
+++ b/vp10/encoder/encoder.c
@@ -2843,7 +2843,7 @@
recon_err = vp10_get_y_sse(cpi->Source, get_frame_new_buffer(cm));
if (cpi->twopass.total_left_stats.coded_error != 0.0)
- fprintf(f, "%10u %dx%d %d %d %10d %10d %10d %10d"
+ fprintf(f, "%10u %dx%d %10d %10d %d %d %10d %10d %10d %10d"
"%10"PRId64" %10"PRId64" %5d %5d %10"PRId64" "
"%10"PRId64" %10"PRId64" %10d "
"%7.2lf %7.2lf %7.2lf %7.2lf %7.2lf"
@@ -2852,6 +2852,8 @@
"%10lf %8u %10"PRId64" %10d %10d %10d\n",
cpi->common.current_video_frame,
cm->width, cm->height,
+ cpi->td.rd_counts.m_search_count,
+ cpi->td.rd_counts.ex_search_count,
cpi->rc.source_alt_ref_pending,
cpi->rc.source_alt_ref_active,
cpi->rc.this_frame_target,
diff --git a/vp10/encoder/encoder.h b/vp10/encoder/encoder.h
index 8519c27..2a44e47 100644
--- a/vp10/encoder/encoder.h
+++ b/vp10/encoder/encoder.h
@@ -250,6 +250,8 @@
vp10_coeff_count coef_counts[TX_SIZES][PLANE_TYPES];
int64_t comp_pred_diff[REFERENCE_MODES];
int64_t filter_diff[SWITCHABLE_FILTER_CONTEXTS];
+ int m_search_count;
+ int ex_search_count;
} RD_COUNTS;
typedef struct ThreadData {
diff --git a/vp10/encoder/ethread.c b/vp10/encoder/ethread.c
index ea49105..ad47ccf 100644
--- a/vp10/encoder/ethread.c
+++ b/vp10/encoder/ethread.c
@@ -30,6 +30,11 @@
for (n = 0; n < ENTROPY_TOKENS; n++)
td->rd_counts.coef_counts[i][j][k][l][m][n] +=
td_t->rd_counts.coef_counts[i][j][k][l][m][n];
+
+
+ // Counts of all motion searches and exhaustive mesh searches.
+ td->rd_counts.m_search_count += td_t->rd_counts.m_search_count;
+ td->rd_counts.ex_search_count += td_t->rd_counts.ex_search_count;
}
static int enc_worker_hook(EncWorkerData *const thread_data, void *unused) {
diff --git a/vp10/encoder/mcomp.c b/vp10/encoder/mcomp.c
index d6ab00f..04e1daf 100644
--- a/vp10/encoder/mcomp.c
+++ b/vp10/encoder/mcomp.c
@@ -1523,69 +1523,83 @@
#undef CHECK_BETTER
-int vp10_full_range_search_c(const MACROBLOCK *x,
- const search_site_config *cfg,
- MV *ref_mv, MV *best_mv,
- int search_param, int sad_per_bit, int *num00,
- const vp9_variance_fn_ptr_t *fn_ptr,
- const MV *center_mv) {
+// Exhaustive motion search around a given centre position with a given
+// step size.
+static int exhuastive_mesh_search(const MACROBLOCK *x,
+ MV *ref_mv, MV *best_mv,
+ int range, int step, int sad_per_bit,
+ const vp9_variance_fn_ptr_t *fn_ptr,
+ const MV *center_mv) {
const MACROBLOCKD *const xd = &x->e_mbd;
const struct buf_2d *const what = &x->plane[0].src;
const struct buf_2d *const in_what = &xd->plane[0].pre[0];
- const int range = 64;
- const MV fcenter_mv = {center_mv->row >> 3, center_mv->col >> 3};
+ MV fcenter_mv = {center_mv->row, center_mv->col};
unsigned int best_sad = INT_MAX;
int r, c, i;
int start_col, end_col, start_row, end_row;
+ int col_step = (step > 1) ? step : 4;
- // The cfg and search_param parameters are not used in this search variant
- (void)cfg;
- (void)search_param;
+ assert(step >= 1);
- clamp_mv(ref_mv, x->mv_col_min, x->mv_col_max, x->mv_row_min, x->mv_row_max);
- *best_mv = *ref_mv;
- *num00 = 11;
+ clamp_mv(&fcenter_mv, x->mv_col_min, x->mv_col_max,
+ x->mv_row_min, x->mv_row_max);
+ *best_mv = fcenter_mv;
best_sad = fn_ptr->sdf(what->buf, what->stride,
- get_buf_from_mv(in_what, ref_mv), in_what->stride) +
- mvsad_err_cost(x, ref_mv, &fcenter_mv, sad_per_bit);
- start_row = VPXMAX(-range, x->mv_row_min - ref_mv->row);
- start_col = VPXMAX(-range, x->mv_col_min - ref_mv->col);
- end_row = VPXMIN(range, x->mv_row_max - ref_mv->row);
- end_col = VPXMIN(range, x->mv_col_max - ref_mv->col);
+ get_buf_from_mv(in_what, &fcenter_mv), in_what->stride) +
+ mvsad_err_cost(x, &fcenter_mv, ref_mv, sad_per_bit);
+ start_row = VPXMAX(-range, x->mv_row_min - fcenter_mv.row);
+ start_col = VPXMAX(-range, x->mv_col_min - fcenter_mv.col);
+ end_row = VPXMIN(range, x->mv_row_max - fcenter_mv.row);
+ end_col = VPXMIN(range, x->mv_col_max - fcenter_mv.col);
- for (r = start_row; r <= end_row; ++r) {
- for (c = start_col; c <= end_col; c += 4) {
- if (c + 3 <= end_col) {
- unsigned int sads[4];
- const uint8_t *addrs[4];
- for (i = 0; i < 4; ++i) {
- const MV mv = {ref_mv->row + r, ref_mv->col + c + i};
- addrs[i] = get_buf_from_mv(in_what, &mv);
- }
-
- fn_ptr->sdx4df(what->buf, what->stride, addrs, in_what->stride, sads);
-
- for (i = 0; i < 4; ++i) {
- if (sads[i] < best_sad) {
- const MV mv = {ref_mv->row + r, ref_mv->col + c + i};
- const unsigned int sad = sads[i] +
- mvsad_err_cost(x, &mv, &fcenter_mv, sad_per_bit);
- if (sad < best_sad) {
- best_sad = sad;
- *best_mv = mv;
- }
+ for (r = start_row; r <= end_row; r += step) {
+ for (c = start_col; c <= end_col; c += col_step) {
+ // Step > 1 means we are not checking every location in this pass.
+ if (step > 1) {
+ const MV mv = {fcenter_mv.row + r, fcenter_mv.col + c};
+ unsigned int sad = fn_ptr->sdf(what->buf, what->stride,
+ get_buf_from_mv(in_what, &mv), in_what->stride);
+ if (sad < best_sad) {
+ sad += mvsad_err_cost(x, &mv, ref_mv, sad_per_bit);
+ if (sad < best_sad) {
+ best_sad = sad;
+ *best_mv = mv;
}
}
} else {
- for (i = 0; i < end_col - c; ++i) {
- const MV mv = {ref_mv->row + r, ref_mv->col + c + i};
- unsigned int sad = fn_ptr->sdf(what->buf, what->stride,
- get_buf_from_mv(in_what, &mv), in_what->stride);
- if (sad < best_sad) {
- sad += mvsad_err_cost(x, &mv, &fcenter_mv, sad_per_bit);
+ // 4 sads in a single call if we are checking every location
+ if (c + 3 <= end_col) {
+ unsigned int sads[4];
+ const uint8_t *addrs[4];
+ for (i = 0; i < 4; ++i) {
+ const MV mv = {fcenter_mv.row + r, fcenter_mv.col + c + i};
+ addrs[i] = get_buf_from_mv(in_what, &mv);
+ }
+ fn_ptr->sdx4df(what->buf, what->stride, addrs,
+ in_what->stride, sads);
+
+ for (i = 0; i < 4; ++i) {
+ if (sads[i] < best_sad) {
+ const MV mv = {fcenter_mv.row + r, fcenter_mv.col + c + i};
+ const unsigned int sad = sads[i] +
+ mvsad_err_cost(x, &mv, ref_mv, sad_per_bit);
+ if (sad < best_sad) {
+ best_sad = sad;
+ *best_mv = mv;
+ }
+ }
+ }
+ } else {
+ for (i = 0; i < end_col - c; ++i) {
+ const MV mv = {fcenter_mv.row + r, fcenter_mv.col + c + i};
+ unsigned int sad = fn_ptr->sdf(what->buf, what->stride,
+ get_buf_from_mv(in_what, &mv), in_what->stride);
if (sad < best_sad) {
- best_sad = sad;
- *best_mv = mv;
+ sad += mvsad_err_cost(x, &mv, ref_mv, sad_per_bit);
+ if (sad < best_sad) {
+ best_sad = sad;
+ *best_mv = mv;
+ }
}
}
}
@@ -2014,6 +2028,70 @@
return bestsme;
}
+#define MIN_RANGE 7
+#define MAX_RANGE 256
+#define MIN_INTERVAL 1
+// Runs a limited range exhaustive mesh search using a pattern set
+// according to the encode speed profile.
+static int full_pixel_exhaustive(VP10_COMP *cpi, MACROBLOCK *x,
+ MV *centre_mv_full, int sadpb, int *cost_list,
+ const vp9_variance_fn_ptr_t *fn_ptr,
+ const MV *ref_mv, MV *dst_mv) {
+ const SPEED_FEATURES *const sf = &cpi->sf;
+ MV temp_mv = {centre_mv_full->row, centre_mv_full->col};
+ MV f_ref_mv = {ref_mv->row >> 3, ref_mv->col >> 3};
+ int bestsme;
+ int i;
+ int interval = sf->mesh_patterns[0].interval;
+ int range = sf->mesh_patterns[0].range;
+ int baseline_interval_divisor;
+
+ // Keep track of number of exhaustive calls (this frame in this thread).
+ ++(*x->ex_search_count_ptr);
+
+ // Trap illegal values for interval and range for this function.
+ if ((range < MIN_RANGE) || (range > MAX_RANGE) ||
+ (interval < MIN_INTERVAL) || (interval > range))
+ return INT_MAX;
+
+ baseline_interval_divisor = range / interval;
+
+ // Check size of proposed first range against magnitude of the centre
+ // value used as a starting point.
+ range = VPXMAX(range, (5 * VPXMAX(abs(temp_mv.row), abs(temp_mv.col))) / 4);
+ range = VPXMIN(range, MAX_RANGE);
+ interval = VPXMAX(interval, range / baseline_interval_divisor);
+
+ // initial search
+ bestsme = exhuastive_mesh_search(x, &f_ref_mv, &temp_mv, range,
+ interval, sadpb, fn_ptr, &temp_mv);
+
+ if ((interval > MIN_INTERVAL) && (range > MIN_RANGE)) {
+ // Progressive searches with range and step size decreasing each time
+ // till we reach a step size of 1. Then break out.
+ for (i = 1; i < MAX_MESH_STEP; ++i) {
+ // Next pass, using the i-th mesh pattern's range and interval.
+ bestsme = exhuastive_mesh_search(x, &f_ref_mv, &temp_mv,
+ sf->mesh_patterns[i].range,
+ sf->mesh_patterns[i].interval,
+ sadpb, fn_ptr, &temp_mv);
+
+ if (sf->mesh_patterns[i].interval == 1)
+ break;
+ }
+ }
+
+ if (bestsme < INT_MAX)
+ bestsme = vp10_get_mvpred_var(x, &temp_mv, ref_mv, fn_ptr, 1);
+ *dst_mv = temp_mv;
+
+ // Return cost list.
+ if (cost_list) {
+ calc_int_cost_list(x, ref_mv, sadpb, fn_ptr, dst_mv, cost_list);
+ }
+ return bestsme;
+}
+
int vp10_full_search_sad_c(const MACROBLOCK *x, const MV *ref_mv,
int sad_per_bit, int distance,
const vp9_variance_fn_ptr_t *fn_ptr,
@@ -2327,6 +2405,18 @@
return best_sad;
}
+#define MIN_EX_SEARCH_LIMIT 128
+static int is_exhaustive_allowed(VP10_COMP *cpi, MACROBLOCK *x) {
+ const SPEED_FEATURES *const sf = &cpi->sf;
+ const int max_ex = VPXMAX(MIN_EX_SEARCH_LIMIT,
+ (*x->m_search_count_ptr * sf->max_exaustive_pct) / 100);
+
+ return sf->allow_exhaustive_searches &&
+ (sf->exhaustive_searches_thresh < INT_MAX) &&
+ (*x->ex_search_count_ptr <= max_ex) &&
+ !cpi->rc.is_src_frame_alt_ref;
+}
+
int vp10_full_pixel_search(VP10_COMP *cpi, MACROBLOCK *x,
BLOCK_SIZE bsize, MV *mvp_full,
int step_param, int error_per_bit,
@@ -2345,6 +2435,9 @@
cost_list[4] = INT_MAX;
}
+ // Keep track of number of searches (this frame in this thread).
+ ++(*x->m_search_count_ptr);
+
switch (method) {
case FAST_DIAMOND:
var = vp10_fast_dia_search(x, mvp_full, step_param, error_per_bit, 0,
@@ -2370,6 +2463,29 @@
var = vp10_full_pixel_diamond(cpi, x, mvp_full, step_param, error_per_bit,
MAX_MVSEARCH_STEPS - 1 - step_param,
1, cost_list, fn_ptr, ref_mv, tmp_mv);
+
+ // Should we allow a follow on exhaustive search?
+ if (is_exhaustive_allowed(cpi, x)) {
+ int64_t exhuastive_thr = sf->exhaustive_searches_thresh;
+ exhuastive_thr >>= 8 - (b_width_log2_lookup[bsize] +
+ b_height_log2_lookup[bsize]);
+
+ // Threshold variance for an exhaustive full search.
+ if (var > exhuastive_thr) {
+ int var_ex;
+ MV tmp_mv_ex;
+ var_ex = full_pixel_exhaustive(cpi, x, tmp_mv,
+ error_per_bit, cost_list, fn_ptr,
+ ref_mv, &tmp_mv_ex);
+
+ if (var_ex < var) {
+ var = var_ex;
+ *tmp_mv = tmp_mv_ex;
+ }
+ }
+ }
+ break;
+
break;
default:
assert(0 && "Invalid search method.");
diff --git a/vp10/encoder/rdopt.c b/vp10/encoder/rdopt.c
index 974700e..bba2171 100644
--- a/vp10/encoder/rdopt.c
+++ b/vp10/encoder/rdopt.c
@@ -1783,7 +1783,7 @@
seg_mvs[i][mbmi->ref_frame[0]].as_int == INVALID_MV) {
MV *const new_mv = &mode_mv[NEWMV][0].as_mv;
int step_param = 0;
- int thissme, bestsme = INT_MAX;
+ int bestsme = INT_MAX;
int sadpb = x->sadperbit4;
MV mvp_full;
int max_mv;
@@ -1838,27 +1838,6 @@
&bsi->ref_mv[0]->as_mv, new_mv,
INT_MAX, 1);
- // Should we do a full search (best quality only)
- if (cpi->oxcf.mode == BEST) {
- int_mv *const best_mv = &mi->bmi[i].as_mv[0];
- /* Check if mvp_full is within the range. */
- clamp_mv(&mvp_full, x->mv_col_min, x->mv_col_max,
- x->mv_row_min, x->mv_row_max);
- thissme = cpi->full_search_sad(x, &mvp_full,
- sadpb, 16, &cpi->fn_ptr[bsize],
- &bsi->ref_mv[0]->as_mv,
- &best_mv->as_mv);
- cost_list[1] = cost_list[2] = cost_list[3] = cost_list[4] = INT_MAX;
- if (thissme < bestsme) {
- bestsme = thissme;
- *new_mv = best_mv->as_mv;
- } else {
- // The full search result is actually worse so re-instate the
- // previous best vector
- best_mv->as_mv = *new_mv;
- }
- }
-
if (bestsme < INT_MAX) {
int distortion;
cpi->find_fractional_mv_step(
diff --git a/vp10/encoder/speed_features.c b/vp10/encoder/speed_features.c
index d40383f..ce0aebe 100644
--- a/vp10/encoder/speed_features.c
+++ b/vp10/encoder/speed_features.c
@@ -16,6 +16,23 @@
#include "vpx_dsp/vpx_dsp_common.h"
+// Mesh search patterns for various speed settings
+static MESH_PATTERN best_quality_mesh_pattern[MAX_MESH_STEP] =
+ {{64, 4}, {28, 2}, {15, 1}, {7, 1}};
+
+#define MAX_MESH_SPEED 5 // Max speed setting for mesh motion method
+static MESH_PATTERN good_quality_mesh_patterns[MAX_MESH_SPEED + 1]
+ [MAX_MESH_STEP] =
+ {{{64, 8}, {28, 4}, {15, 1}, {7, 1}},
+ {{64, 8}, {28, 4}, {15, 1}, {7, 1}},
+ {{64, 8}, {14, 2}, {7, 1}, {7, 1}},
+ {{64, 16}, {24, 8}, {12, 4}, {7, 1}},
+ {{64, 16}, {24, 8}, {12, 4}, {7, 1}},
+ {{64, 16}, {24, 8}, {12, 4}, {7, 1}},
+ };
+static unsigned char good_quality_max_mesh_pct[MAX_MESH_SPEED + 1] =
+ {50, 25, 15, 5, 1, 1};
+
// Intra only frames, golden frames (except alt ref overlays) and
// alt ref frames tend to be coded at a higher than ambient quality
static int frame_is_boosted(const VP10_COMP *cpi) {
@@ -251,6 +268,8 @@
sf->static_segmentation = 0;
sf->adaptive_rd_thresh = 1;
sf->use_fast_coef_costing = 1;
+ sf->allow_exhaustive_searches = 0;
+ sf->exhaustive_searches_thresh = INT_MAX;
if (speed >= 1) {
sf->use_square_partition_only = !frame_is_intra_only(cm);
@@ -498,8 +517,36 @@
set_good_speed_feature(cpi, cm, sf, oxcf->speed);
cpi->full_search_sad = vp10_full_search_sad;
- cpi->diamond_search_sad = oxcf->mode == BEST ? vp10_full_range_search
- : vp10_diamond_search_sad;
+ cpi->diamond_search_sad = vp10_diamond_search_sad;
+
+ sf->allow_exhaustive_searches = 1;
+ if (oxcf->mode == BEST) {
+ if (cpi->twopass.fr_content_type == FC_GRAPHICS_ANIMATION)
+ sf->exhaustive_searches_thresh = (1 << 20);
+ else
+ sf->exhaustive_searches_thresh = (1 << 21);
+ sf->max_exaustive_pct = 100;
+ for (i = 0; i < MAX_MESH_STEP; ++i) {
+ sf->mesh_patterns[i].range = best_quality_mesh_pattern[i].range;
+ sf->mesh_patterns[i].interval = best_quality_mesh_pattern[i].interval;
+ }
+ } else {
+ int speed = (oxcf->speed > MAX_MESH_SPEED) ? MAX_MESH_SPEED : oxcf->speed;
+ if (cpi->twopass.fr_content_type == FC_GRAPHICS_ANIMATION)
+ sf->exhaustive_searches_thresh = (1 << 22);
+ else
+ sf->exhaustive_searches_thresh = (1 << 23);
+ sf->max_exaustive_pct = good_quality_max_mesh_pct[speed];
+ if (speed > 0)
+ sf->exhaustive_searches_thresh = sf->exhaustive_searches_thresh << 1;
+
+ for (i = 0; i < MAX_MESH_STEP; ++i) {
+ sf->mesh_patterns[i].range =
+ good_quality_mesh_patterns[speed][i].range;
+ sf->mesh_patterns[i].interval =
+ good_quality_mesh_patterns[speed][i].interval;
+ }
+ }
// Slow quant, dct and trellis not worthwhile for first pass
// so make sure they are always turned off.
diff --git a/vp10/encoder/speed_features.h b/vp10/encoder/speed_features.h
index 3969a2f..3b91999 100644
--- a/vp10/encoder/speed_features.h
+++ b/vp10/encoder/speed_features.h
@@ -195,6 +195,13 @@
int fullpel_search_step_param;
} MV_SPEED_FEATURES;
+#define MAX_MESH_STEP 4
+
+typedef struct MESH_PATTERN {
+ int range;
+ int interval;
+} MESH_PATTERN;
+
typedef struct SPEED_FEATURES {
MV_SPEED_FEATURES mv;
@@ -290,6 +297,18 @@
// point for this motion search and limits the search range around it.
int adaptive_motion_search;
+ // Flag for allowing some use of exhaustive searches.
+ int allow_exhaustive_searches;
+
+ // Threshold for allowing exhaustive motion search.
+ int exhaustive_searches_thresh;
+
+ // Max exhaustive searches per frame, as a percentage of motion searches.
+ int max_exaustive_pct;
+
+ // Pattern to be used for any exhaustive mesh searches.
+ MESH_PATTERN mesh_patterns[MAX_MESH_STEP];
+
int schedule_mode_search;
// Allows sub 8x8 modes to use the prediction filter that was determined
diff --git a/vp8/common/vp8_loopfilter.c b/vp8/common/vp8_loopfilter.c
index 8b55dff..756ad48 100644
--- a/vp8/common/vp8_loopfilter.c
+++ b/vp8/common/vp8_loopfilter.c
@@ -141,8 +141,8 @@
else /* Delta Value */
{
lvl_seg += mbd->segment_feature_data[MB_LVL_ALT_LF][seg];
- lvl_seg = (lvl_seg > 0) ? ((lvl_seg > 63) ? 63: lvl_seg) : 0;
}
+ lvl_seg = (lvl_seg > 0) ? ((lvl_seg > 63) ? 63: lvl_seg) : 0;
}
if (!mbd->mode_ref_lf_delta_enabled)
diff --git a/vp8/decoder/decodeframe.c b/vp8/decoder/decodeframe.c
index f0d7603..4bc87eb 100644
--- a/vp8/decoder/decodeframe.c
+++ b/vp8/decoder/decodeframe.c
@@ -73,10 +73,9 @@
/* Delta Value */
else
- {
QIndex = pc->base_qindex + xd->segment_feature_data[MB_LVL_ALT_Q][mbmi->segment_id];
- QIndex = (QIndex >= 0) ? ((QIndex <= MAXQ) ? QIndex : MAXQ) : 0; /* Clamp to valid range */
- }
+
+ QIndex = (QIndex >= 0) ? ((QIndex <= MAXQ) ? QIndex : MAXQ) : 0; /* Clamp to valid range */
}
else
QIndex = pc->base_qindex;
diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl
index 8fe6503..d166bbf 100644
--- a/vp9/common/vp9_rtcd_defs.pl
+++ b/vp9/common/vp9_rtcd_defs.pl
@@ -198,7 +198,7 @@
specialize qw/vp9_avg_8x8 sse2 neon msa/;
add_proto qw/unsigned int vp9_avg_4x4/, "const uint8_t *, int p";
-specialize qw/vp9_avg_4x4 sse2 msa/;
+specialize qw/vp9_avg_4x4 sse2 neon msa/;
add_proto qw/void vp9_minmax_8x8/, "const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max";
specialize qw/vp9_minmax_8x8 sse2/;
diff --git a/vp9/encoder/arm/neon/vp9_avg_neon.c b/vp9/encoder/arm/neon/vp9_avg_neon.c
index 5996bd4..78467ce 100644
--- a/vp9/encoder/arm/neon/vp9_avg_neon.c
+++ b/vp9/encoder/arm/neon/vp9_avg_neon.c
@@ -24,6 +24,18 @@
return vget_lane_u32(c, 0);
}
+unsigned int vp9_avg_4x4_neon(const uint8_t *s, int p) {
+ uint16x8_t v_sum;
+ uint32x2_t v_s0 = vdup_n_u32(0);
+ uint32x2_t v_s1 = vdup_n_u32(0);
+ v_s0 = vld1_lane_u32((const uint32_t *)s, v_s0, 0);
+ v_s0 = vld1_lane_u32((const uint32_t *)(s + p), v_s0, 1);
+ v_s1 = vld1_lane_u32((const uint32_t *)(s + 2 * p), v_s1, 0);
+ v_s1 = vld1_lane_u32((const uint32_t *)(s + 3 * p), v_s1, 1);
+ v_sum = vaddl_u8(vreinterpret_u8_u32(v_s0), vreinterpret_u8_u32(v_s1));
+ return (horizontal_add_u16x8(v_sum) + 8) >> 4;
+}
+
unsigned int vp9_avg_8x8_neon(const uint8_t *s, int p) {
uint8x8_t v_s0 = vld1_u8(s);
const uint8x8_t v_s1 = vld1_u8(s + p);
diff --git a/vp9/encoder/vp9_denoiser.c b/vp9/encoder/vp9_denoiser.c
index fc76c11..93aa40a 100644
--- a/vp9/encoder/vp9_denoiser.c
+++ b/vp9/encoder/vp9_denoiser.c
@@ -194,7 +194,8 @@
int mi_col,
PICK_MODE_CONTEXT *ctx,
int motion_magnitude,
- int is_skin) {
+ int is_skin,
+ int *zeromv_filter) {
int mv_col, mv_row;
int sse_diff = ctx->zeromv_sse - ctx->newmv_sse;
MV_REFERENCE_FRAME frame;
@@ -237,6 +238,7 @@
mbmi->mv[0].as_int = 0;
ctx->best_sse_inter_mode = ZEROMV;
ctx->best_sse_mv.as_int = 0;
+ *zeromv_filter = 1;
}
if (ctx->newmv_sse > sse_thresh(bs, increase_denoising)) {
@@ -320,6 +322,7 @@
VP9_DENOISER_DECISION *denoiser_decision) {
int mv_col, mv_row;
int motion_magnitude = 0;
+ int zeromv_filter = 0;
VP9_DENOISER_DECISION decision = COPY_BLOCK;
YV12_BUFFER_CONFIG avg = denoiser->running_avg_y[INTRA_FRAME];
YV12_BUFFER_CONFIG mc_avg = denoiser->mc_running_avg_y;
@@ -360,7 +363,8 @@
denoiser->increase_denoising,
mi_row, mi_col, ctx,
motion_magnitude,
- is_skin);
+ is_skin,
+ &zeromv_filter);
if (decision == FILTER_BLOCK) {
decision = vp9_denoiser_filter(src.buf, src.stride,
@@ -382,6 +386,8 @@
num_4x4_blocks_high_lookup[bs] << 2);
}
*denoiser_decision = decision;
+ if (decision == FILTER_BLOCK && zeromv_filter == 1)
+ *denoiser_decision = FILTER_ZEROMV_BLOCK;
}
static void copy_frame(YV12_BUFFER_CONFIG * const dest,
diff --git a/vp9/encoder/vp9_denoiser.h b/vp9/encoder/vp9_denoiser.h
index c8c9352..d07056b 100644
--- a/vp9/encoder/vp9_denoiser.h
+++ b/vp9/encoder/vp9_denoiser.h
@@ -23,7 +23,8 @@
typedef enum vp9_denoiser_decision {
COPY_BLOCK,
- FILTER_BLOCK
+ FILTER_BLOCK,
+ FILTER_ZEROMV_BLOCK
} VP9_DENOISER_DECISION;
typedef enum vp9_denoiser_level {
diff --git a/vp9/encoder/vp9_pickmode.c b/vp9/encoder/vp9_pickmode.c
index 1727201..90650db 100644
--- a/vp9/encoder/vp9_pickmode.c
+++ b/vp9/encoder/vp9_pickmode.c
@@ -1696,11 +1696,11 @@
VP9_DENOISER_DECISION decision = COPY_BLOCK;
vp9_denoiser_denoise(&cpi->denoiser, x, mi_row, mi_col,
VPXMAX(BLOCK_8X8, bsize), ctx, &decision);
- // If INTRA mode was selected, re-evaluate ZEROMV on denoised result.
- // Only do this under noise conditions, and if rdcost of ZEROMV on
- // original source is not significantly higher than rdcost of INTRA MODE.
- if (best_ref_frame == INTRA_FRAME &&
- decision == FILTER_BLOCK &&
+ // If INTRA or GOLDEN reference was selected, re-evaluate ZEROMV on denoised
+ // result. Only do this under noise conditions, and if rdcost of ZEROMV on
+ // original source is not significantly higher than rdcost of best mode.
+ if (((best_ref_frame == INTRA_FRAME && decision >= FILTER_BLOCK) ||
+ (best_ref_frame == GOLDEN_FRAME && decision == FILTER_ZEROMV_BLOCK)) &&
cpi->noise_estimate.enabled &&
cpi->noise_estimate.level > kLow &&
zero_last_cost_orig < (best_rdc.rdcost << 3)) {
@@ -1721,13 +1721,21 @@
this_rdc.dist = dist;
this_rdc.rdcost = RDCOST(x->rdmult, x->rddiv, rate, dist);
// Switch to ZEROMV if the rdcost for ZEROMV on denoised source
- // is lower than INTRA (on original source).
+ // is lower than best_ref mode (on original source).
if (this_rdc.rdcost > best_rdc.rdcost) {
this_rdc = best_rdc;
mbmi->mode = best_mode;
mbmi->ref_frame[0] = best_ref_frame;
- mbmi->mv[0].as_int = INVALID_MV;
mbmi->interp_filter = best_pred_filter;
+ if (best_ref_frame == INTRA_FRAME)
+ mbmi->mv[0].as_int = INVALID_MV;
+ else if (best_ref_frame == GOLDEN_FRAME) {
+ mbmi->mv[0].as_int = frame_mv[best_mode][best_ref_frame].as_int;
+ if (reuse_inter_pred) {
+ xd->plane[0].pre[0] = yv12_mb[GOLDEN_FRAME][0];
+ vp9_build_inter_predictors_sby(xd, mi_row, mi_col, bsize);
+ }
+ }
mbmi->tx_size = best_tx_size;
x->skip_txfm[0] = best_mode_skip_txfm;
} else {
diff --git a/vp9/encoder/vp9_ratectrl.c b/vp9/encoder/vp9_ratectrl.c
index 45445df..8ab51cd 100644
--- a/vp9/encoder/vp9_ratectrl.c
+++ b/vp9/encoder/vp9_ratectrl.c
@@ -1075,7 +1075,7 @@
if (!cpi->refresh_alt_ref_frame) {
active_best_quality = cq_level;
} else {
- active_best_quality = get_gf_active_quality(rc, q, cm->bit_depth);
+ active_best_quality = get_gf_active_quality(rc, q, cm->bit_depth);
// Modify best quality for second level arfs. For mode VPX_Q this
// becomes the baseline frame q.
diff --git a/vp9/encoder/vp9_temporal_filter.c b/vp9/encoder/vp9_temporal_filter.c
index 16f9c85..015dbc0 100644
--- a/vp9/encoder/vp9_temporal_filter.c
+++ b/vp9/encoder/vp9_temporal_filter.c
@@ -135,15 +135,38 @@
for (i = 0, k = 0; i < block_height; i++) {
for (j = 0; j < block_width; j++, k++) {
- int src_byte = frame1[byte];
- int pixel_value = *frame2++;
+ int pixel_value = *frame2;
- modifier = src_byte - pixel_value;
- // This is an integer approximation of:
- // float coeff = (3.0 * modifer * modifier) / pow(2, strength);
- // modifier = (int)roundf(coeff > 16 ? 0 : 16-coeff);
- modifier *= modifier;
- modifier *= 3;
+ // Non-local means approach: use squared diffs from the in-block 3x3 window.
+ int diff_sse[9] = { 0 };
+ int idx, idy, index = 0;
+
+ for (idy = -1; idy <= 1; ++idy) {
+ for (idx = -1; idx <= 1; ++idx) {
+ int row = i + idy;
+ int col = j + idx;
+
+ if (row >= 0 && row < (int)block_height &&
+ col >= 0 && col < (int)block_width) {
+ int diff = frame1[byte + idy * (int)stride + idx] -
+ frame2[idy * (int)block_width + idx];
+ diff_sse[index] = diff * diff;
+ ++index;
+ }
+ }
+ }
+
+ assert(index > 0);
+
+ modifier = 0;
+ for (idx = 0; idx < 9; ++idx)
+ modifier += diff_sse[idx];
+
+ modifier *= 3;
+ modifier /= index;
+
+ ++frame2;
+
modifier += rounding;
modifier >>= strength;
@@ -182,15 +205,34 @@
for (i = 0, k = 0; i < block_height; i++) {
for (j = 0; j < block_width; j++, k++) {
- int src_byte = frame1[byte];
- int pixel_value = *frame2++;
+ int pixel_value = *frame2;
+ int diff_sse[9] = { 0 };
+ int idx, idy, index = 0;
- modifier = src_byte - pixel_value;
- // This is an integer approximation of:
- // float coeff = (3.0 * modifer * modifier) / pow(2, strength);
- // modifier = (int)roundf(coeff > 16 ? 0 : 16-coeff);
- modifier *= modifier;
+ for (idy = -1; idy <= 1; ++idy) {
+ for (idx = -1; idx <= 1; ++idx) {
+ int row = i + idy;
+ int col = j + idx;
+
+ if (row >= 0 && row < (int)block_height &&
+ col >= 0 && col < (int)block_width) {
+ int diff = frame1[byte + idy * (int)stride + idx] -
+ frame2[idy * (int)block_width + idx];
+ diff_sse[index] = diff * diff;
+ ++index;
+ }
+ }
+ }
+ assert(index > 0);
+
+ modifier = 0;
+ for (idx = 0; idx < 9; ++idx)
+ modifier += diff_sse[idx];
+
modifier *= 3;
+ modifier /= index;
+
+ ++frame2;
modifier += rounding;
modifier >>= strength;
@@ -383,55 +425,58 @@
if (mbd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
int adj_strength = strength + 2 * (mbd->bd - 8);
// Apply the filter (YUV)
- vp9_highbd_temporal_filter_apply(f->y_buffer + mb_y_offset,
- f->y_stride,
- predictor, 16, 16, adj_strength,
- filter_weight,
- accumulator, count);
- vp9_highbd_temporal_filter_apply(f->u_buffer + mb_uv_offset,
- f->uv_stride, predictor + 256,
- mb_uv_width, mb_uv_height,
- adj_strength,
- filter_weight, accumulator + 256,
- count + 256);
- vp9_highbd_temporal_filter_apply(f->v_buffer + mb_uv_offset,
- f->uv_stride, predictor + 512,
- mb_uv_width, mb_uv_height,
- adj_strength, filter_weight,
- accumulator + 512, count + 512);
+ vp9_highbd_temporal_filter_apply_c(f->y_buffer + mb_y_offset,
+ f->y_stride,
+ predictor, 16, 16, adj_strength,
+ filter_weight,
+ accumulator, count);
+ vp9_highbd_temporal_filter_apply_c(f->u_buffer + mb_uv_offset,
+ f->uv_stride, predictor + 256,
+ mb_uv_width, mb_uv_height,
+ adj_strength,
+ filter_weight, accumulator + 256,
+ count + 256);
+ vp9_highbd_temporal_filter_apply_c(f->v_buffer + mb_uv_offset,
+ f->uv_stride, predictor + 512,
+ mb_uv_width, mb_uv_height,
+ adj_strength, filter_weight,
+ accumulator + 512, count + 512);
} else {
// Apply the filter (YUV)
- vp9_temporal_filter_apply(f->y_buffer + mb_y_offset, f->y_stride,
+ vp9_temporal_filter_apply_c(f->y_buffer + mb_y_offset, f->y_stride,
+ predictor, 16, 16,
+ strength, filter_weight,
+ accumulator, count);
+ vp9_temporal_filter_apply_c(f->u_buffer + mb_uv_offset,
+ f->uv_stride,
+ predictor + 256,
+ mb_uv_width, mb_uv_height, strength,
+ filter_weight, accumulator + 256,
+ count + 256);
+ vp9_temporal_filter_apply_c(f->v_buffer + mb_uv_offset,
+ f->uv_stride,
+ predictor + 512,
+ mb_uv_width, mb_uv_height, strength,
+ filter_weight, accumulator + 512,
+ count + 512);
+ }
+#else
+ // Apply the filter (YUV)
+ // TODO(jingning): Need SIMD optimization for this.
+ vp9_temporal_filter_apply_c(f->y_buffer + mb_y_offset, f->y_stride,
predictor, 16, 16,
strength, filter_weight,
accumulator, count);
- vp9_temporal_filter_apply(f->u_buffer + mb_uv_offset, f->uv_stride,
+ vp9_temporal_filter_apply_c(f->u_buffer + mb_uv_offset, f->uv_stride,
predictor + 256,
mb_uv_width, mb_uv_height, strength,
filter_weight, accumulator + 256,
count + 256);
- vp9_temporal_filter_apply(f->v_buffer + mb_uv_offset, f->uv_stride,
+ vp9_temporal_filter_apply_c(f->v_buffer + mb_uv_offset, f->uv_stride,
predictor + 512,
mb_uv_width, mb_uv_height, strength,
filter_weight, accumulator + 512,
count + 512);
- }
-#else
- // Apply the filter (YUV)
- vp9_temporal_filter_apply(f->y_buffer + mb_y_offset, f->y_stride,
- predictor, 16, 16,
- strength, filter_weight,
- accumulator, count);
- vp9_temporal_filter_apply(f->u_buffer + mb_uv_offset, f->uv_stride,
- predictor + 256,
- mb_uv_width, mb_uv_height, strength,
- filter_weight, accumulator + 256,
- count + 256);
- vp9_temporal_filter_apply(f->v_buffer + mb_uv_offset, f->uv_stride,
- predictor + 512,
- mb_uv_width, mb_uv_height, strength,
- filter_weight, accumulator + 512,
- count + 512);
#endif // CONFIG_VP9_HIGHBITDEPTH
}
}
diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl
index bb9981b..5ce7134 100644
--- a/vpx_dsp/vpx_dsp_rtcd_defs.pl
+++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl
@@ -91,7 +91,7 @@
specialize qw/vpx_d153_predictor_4x4/, "$ssse3_x86inc";
add_proto qw/void vpx_v_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vpx_v_predictor_4x4 neon msa/, "$sse_x86inc";
+specialize qw/vpx_v_predictor_4x4 neon msa/, "$sse2_x86inc";
add_proto qw/void vpx_ve_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
specialize qw/vpx_ve_predictor_4x4/;
@@ -100,16 +100,16 @@
specialize qw/vpx_tm_predictor_4x4 neon dspr2 msa/, "$sse2_x86inc";
add_proto qw/void vpx_dc_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vpx_dc_predictor_4x4 dspr2 msa neon/, "$sse_x86inc";
+specialize qw/vpx_dc_predictor_4x4 dspr2 msa neon/, "$sse2_x86inc";
add_proto qw/void vpx_dc_top_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vpx_dc_top_predictor_4x4 msa neon/, "$sse_x86inc";
+specialize qw/vpx_dc_top_predictor_4x4 msa neon/, "$sse2_x86inc";
add_proto qw/void vpx_dc_left_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vpx_dc_left_predictor_4x4 msa neon/, "$sse_x86inc";
+specialize qw/vpx_dc_left_predictor_4x4 msa neon/, "$sse2_x86inc";
add_proto qw/void vpx_dc_128_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vpx_dc_128_predictor_4x4 msa neon/, "$sse_x86inc";
+specialize qw/vpx_dc_128_predictor_4x4 msa neon/, "$sse2_x86inc";
add_proto qw/void vpx_d207_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
specialize qw/vpx_d207_predictor_8x8/, "$ssse3_x86inc";
diff --git a/vpx_dsp/x86/highbd_subpel_variance_impl_sse2.asm b/vpx_dsp/x86/highbd_subpel_variance_impl_sse2.asm
index 93df92a..22d52a2 100644
--- a/vpx_dsp/x86/highbd_subpel_variance_impl_sse2.asm
+++ b/vpx_dsp/x86/highbd_subpel_variance_impl_sse2.asm
@@ -123,8 +123,10 @@
%define sec_str sec_stridemp
; Store bilin_filter and pw_8 location in stack
- GET_GOT eax
- add esp, 4 ; restore esp
+ %if GET_GOT_DEFINED == 1
+ GET_GOT eax
+ add esp, 4 ; restore esp
+ %endif
lea ecx, [GLOBAL(bilin_filter_m)]
mov g_bilin_filterm, ecx
@@ -140,8 +142,10 @@
%define block_height heightd
; Store bilin_filter and pw_8 location in stack
- GET_GOT eax
- add esp, 4 ; restore esp
+ %if GET_GOT_DEFINED == 1
+ GET_GOT eax
+ add esp, 4 ; restore esp
+ %endif
lea ecx, [GLOBAL(bilin_filter_m)]
mov g_bilin_filterm, ecx
diff --git a/vpx_dsp/x86/intrapred_sse2.asm b/vpx_dsp/x86/intrapred_sse2.asm
index 13fe140..edbf05e 100644
--- a/vpx_dsp/x86/intrapred_sse2.asm
+++ b/vpx_dsp/x86/intrapred_sse2.asm
@@ -23,17 +23,18 @@
SECTION .text
-INIT_MMX sse
-cglobal dc_predictor_4x4, 4, 5, 2, dst, stride, above, left, goffset
+INIT_XMM sse2
+cglobal dc_predictor_4x4, 4, 5, 3, dst, stride, above, left, goffset
GET_GOT goffsetq
- pxor m1, m1
+ movd m2, [leftq]
movd m0, [aboveq]
- punpckldq m0, [leftq]
+ pxor m1, m1
+ punpckldq m0, m2
psadbw m0, m1
paddw m0, [GLOBAL(pw_4)]
psraw m0, 3
- pshufw m0, m0, 0x0
+ pshuflw m0, m0, 0x0
packuswb m0, m0
movd [dstq ], m0
movd [dstq+strideq], m0
@@ -44,16 +45,17 @@
RESTORE_GOT
RET
-INIT_MMX sse
-cglobal dc_left_predictor_4x4, 4, 5, 2, dst, stride, above, left, goffset
+INIT_XMM sse2
+cglobal dc_left_predictor_4x4, 2, 5, 2, dst, stride, above, left, goffset
GET_GOT goffsetq
+ movifnidn leftq, leftmp
pxor m1, m1
movd m0, [leftq]
psadbw m0, m1
paddw m0, [GLOBAL(pw2_4)]
psraw m0, 2
- pshufw m0, m0, 0x0
+ pshuflw m0, m0, 0x0
packuswb m0, m0
movd [dstq ], m0
movd [dstq+strideq], m0
@@ -64,8 +66,8 @@
RESTORE_GOT
RET
-INIT_MMX sse
-cglobal dc_top_predictor_4x4, 4, 5, 2, dst, stride, above, left, goffset
+INIT_XMM sse2
+cglobal dc_top_predictor_4x4, 3, 5, 2, dst, stride, above, left, goffset
GET_GOT goffsetq
pxor m1, m1
@@ -73,7 +75,7 @@
psadbw m0, m1
paddw m0, [GLOBAL(pw2_4)]
psraw m0, 2
- pshufw m0, m0, 0x0
+ pshuflw m0, m0, 0x0
packuswb m0, m0
movd [dstq ], m0
movd [dstq+strideq], m0
@@ -166,8 +168,8 @@
RESTORE_GOT
RET
-INIT_MMX sse
-cglobal dc_128_predictor_4x4, 4, 5, 3, dst, stride, above, left, goffset
+INIT_XMM sse2
+cglobal dc_128_predictor_4x4, 2, 5, 1, dst, stride, above, left, goffset
GET_GOT goffsetq
DEFINE_ARGS dst, stride, stride3
@@ -453,7 +455,7 @@
RESTORE_GOT
RET
-INIT_MMX sse
+INIT_XMM sse2
cglobal v_predictor_4x4, 3, 3, 1, dst, stride, above
movd m0, [aboveq]
movd [dstq ], m0
diff --git a/vpx_dsp/x86/subpel_variance_sse2.asm b/vpx_dsp/x86/subpel_variance_sse2.asm
index 05dcff7..c655e4b 100644
--- a/vpx_dsp/x86/subpel_variance_sse2.asm
+++ b/vpx_dsp/x86/subpel_variance_sse2.asm
@@ -139,8 +139,10 @@
%define sec_str sec_stridemp
;Store bilin_filter and pw_8 location in stack
- GET_GOT eax
- add esp, 4 ; restore esp
+ %if GET_GOT_DEFINED == 1
+ GET_GOT eax
+ add esp, 4 ; restore esp
+ %endif
lea ecx, [GLOBAL(bilin_filter_m)]
mov g_bilin_filterm, ecx
@@ -156,8 +158,10 @@
%define block_height heightd
;Store bilin_filter and pw_8 location in stack
- GET_GOT eax
- add esp, 4 ; restore esp
+ %if GET_GOT_DEFINED == 1
+ GET_GOT eax
+ add esp, 4 ; restore esp
+ %endif
lea ecx, [GLOBAL(bilin_filter_m)]
mov g_bilin_filterm, ecx
diff --git a/vpx_ports/x86_abi_support.asm b/vpx_ports/x86_abi_support.asm
index c94b76a..708fa10 100644
--- a/vpx_ports/x86_abi_support.asm
+++ b/vpx_ports/x86_abi_support.asm
@@ -189,7 +189,6 @@
%if ABI_IS_32BIT
%if CONFIG_PIC=1
%ifidn __OUTPUT_FORMAT__,elf32
- %define GET_GOT_SAVE_ARG 1
%define WRT_PLT wrt ..plt
%macro GET_GOT 1
extern _GLOBAL_OFFSET_TABLE_
@@ -208,7 +207,6 @@
%define RESTORE_GOT pop %1
%endmacro
%elifidn __OUTPUT_FORMAT__,macho32
- %define GET_GOT_SAVE_ARG 1
%macro GET_GOT 1
push %1
call %%get_got