Merge "Format fixes in vp9_rd_pick_inter_mode_sb/sub8x8"
diff --git a/build/make/configure.sh b/build/make/configure.sh
index c527cd5..932dd8e 100644
--- a/build/make/configure.sh
+++ b/build/make/configure.sh
@@ -201,7 +201,7 @@
soft_enable() {
for var in $*; do
if ! disabled $var; then
- log_echo " enabling $var"
+ enabled $var || log_echo " enabling $var"
enable_feature $var
fi
done
@@ -210,7 +210,7 @@
soft_disable() {
for var in $*; do
if ! enabled $var; then
- log_echo " disabling $var"
+ disabled $var || log_echo " disabling $var"
disable_feature $var
fi
done
@@ -508,9 +508,11 @@
elif [ $action = "disable" ] && ! disabled $option ; then
echo "${CMDLINE_SELECT}" | grep "^ *$option\$" >/dev/null ||
die_unknown $opt
+ log_echo " disabling $option"
elif [ $action = "enable" ] && ! enabled $option ; then
echo "${CMDLINE_SELECT}" | grep "^ *$option\$" >/dev/null ||
die_unknown $opt
+ log_echo " enabling $option"
fi
${action}_feature $option
;;
@@ -606,6 +608,13 @@
EXE_SFX=
}
+# Reliably find the newest available Darwin SDKs. (Older versions of
+# xcrun don't support --show-sdk-path.)
+show_darwin_sdk_path() {
+ xcrun --sdk $1 --show-sdk-path 2>/dev/null ||
+ xcodebuild -sdk $1 -version Path 2>/dev/null
+}
+
process_common_toolchain() {
if [ -z "$toolchain" ]; then
gcctarget="${CHOST:-$(gcc -dumpmachine 2> /dev/null)}"
@@ -729,31 +738,17 @@
IOS_VERSION_MIN="6.0"
# Handle darwin variants. Newer SDKs allow targeting older
- # platforms, so find the newest SDK available.
+ # platforms, so use the newest one available.
case ${toolchain} in
*-darwin*)
- if [ -z "${DEVELOPER_DIR}" ]; then
- DEVELOPER_DIR=`xcode-select -print-path 2> /dev/null`
- [ $? -ne 0 ] && OSX_SKIP_DIR_CHECK=1
- fi
- if [ -z "${OSX_SKIP_DIR_CHECK}" ]; then
- OSX_SDK_ROOTS="${DEVELOPER_DIR}/SDKs"
- OSX_SDK_VERSIONS="MacOSX10.4u.sdk MacOSX10.5.sdk MacOSX10.6.sdk"
- OSX_SDK_VERSIONS="${OSX_SDK_VERSIONS} MacOSX10.7.sdk"
- for v in ${OSX_SDK_VERSIONS}; do
- if [ -d "${OSX_SDK_ROOTS}/${v}" ]; then
- osx_sdk_dir="${OSX_SDK_ROOTS}/${v}"
- fi
- done
+ osx_sdk_dir="$(show_darwin_sdk_path macosx)"
+ if [ -d "${osx_sdk_dir}" ]; then
+ add_cflags "-isysroot ${osx_sdk_dir}"
+ add_ldflags "-isysroot ${osx_sdk_dir}"
fi
;;
esac
- if [ -d "${osx_sdk_dir}" ]; then
- add_cflags "-isysroot ${osx_sdk_dir}"
- add_ldflags "-isysroot ${osx_sdk_dir}"
- fi
-
case ${toolchain} in
*-darwin8-*)
add_cflags "-mmacosx-version-min=10.4"
@@ -786,9 +781,11 @@
*-iphonesimulator-*)
add_cflags "-miphoneos-version-min=${IOS_VERSION_MIN}"
add_ldflags "-miphoneos-version-min=${IOS_VERSION_MIN}"
- osx_sdk_dir="$(xcrun --sdk iphonesimulator --show-sdk-path)"
- add_cflags "-isysroot ${osx_sdk_dir}"
- add_ldflags "-isysroot ${osx_sdk_dir}"
+ iossim_sdk_dir="$(show_darwin_sdk_path iphonesimulator)"
+ if [ -d "${iossim_sdk_dir}" ]; then
+ add_cflags "-isysroot ${iossim_sdk_dir}"
+ add_ldflags "-isysroot ${iossim_sdk_dir}"
+ fi
;;
esac
@@ -960,7 +957,7 @@
;;
darwin*)
- XCRUN_FIND="xcrun --sdk iphoneos -find"
+ XCRUN_FIND="xcrun --sdk iphoneos --find"
CXX="$(${XCRUN_FIND} clang++)"
CC="$(${XCRUN_FIND} clang)"
AR="$(${XCRUN_FIND} ar)"
@@ -987,10 +984,14 @@
# options that were put in above
ASFLAGS="-arch ${tgt_isa} -g"
- alt_libc="$(xcrun --sdk iphoneos --show-sdk-path)"
- add_cflags -arch ${tgt_isa} -isysroot ${alt_libc}
+ add_cflags -arch ${tgt_isa}
add_ldflags -arch ${tgt_isa}
+ alt_libc="$(show_darwin_sdk_path iphoneos)"
+ if [ -d "${alt_libc}" ]; then
+ add_cflags -isysroot ${alt_libc}
+ fi
+
if [ "${LD}" = "${CXX}" ]; then
add_ldflags -miphoneos-version-min="${IOS_VERSION_MIN}"
else
@@ -1251,8 +1252,7 @@
fi
tgt_os_no_version=$(echo "${tgt_os}" | tr -d "[0-9]")
- if [ "${tgt_os_no_version}" = "darwin" ] || \
- [ "${tgt_os_no_version}" = "openbsd" ] || [ "`uname`" = "OpenBSD" ]; then
+ if [ "${tgt_os_no_version}" = "openbsd" ] || [ "`uname`" = "OpenBSD" ]; then
openbsd_like=yes
fi
# Default use_x86inc to yes when we are 64 bit, non-pic, or on any
diff --git a/build/make/iosbuild.sh b/build/make/iosbuild.sh
index bdd6690..89fa681 100755
--- a/build/make/iosbuild.sh
+++ b/build/make/iosbuild.sh
@@ -18,6 +18,10 @@
devnull='> /dev/null 2>&1'
BUILD_ROOT="_iosbuild"
+CONFIGURE_ARGS="--disable-docs
+ --disable-examples
+ --disable-libyuv
+ --disable-unit-tests"
DIST_DIR="_dist"
FRAMEWORK_DIR="VPX.framework"
HEADER_DIR="${FRAMEWORK_DIR}/Headers/vpx"
@@ -43,7 +47,7 @@
mkdir "${target}"
cd "${target}"
eval "${LIBVPX_SOURCE_DIR}/configure" --target="${target}" \
- --disable-docs ${EXTRA_CONFIGURE_ARGS} ${devnull}
+ ${CONFIGURE_ARGS} ${EXTRA_CONFIGURE_ARGS} ${devnull}
export DIST_DIR
eval make -j ${MAKE_JOBS} dist ${devnull}
cd "${old_pwd}"
@@ -252,6 +256,7 @@
cat << EOF
BUILD_ROOT=${BUILD_ROOT}
DIST_DIR=${DIST_DIR}
+ CONFIGURE_ARGS=${CONFIGURE_ARGS}
EXTRA_CONFIGURE_ARGS=${EXTRA_CONFIGURE_ARGS}
FRAMEWORK_DIR=${FRAMEWORK_DIR}
HEADER_DIR=${HEADER_DIR}
diff --git a/test/sad_test.cc b/test/sad_test.cc
index eef8c75..65e9561 100644
--- a/test/sad_test.cc
+++ b/test/sad_test.cc
@@ -1234,14 +1234,24 @@
#endif // CONFIG_USE_X86INC
#endif // HAVE_SSSE3
-#if HAVE_AVX2
#if CONFIG_VP9_ENCODER
+#if HAVE_AVX2
const SadMxNx4Func sad_64x64x4d_avx2 = vp9_sad64x64x4d_avx2;
const SadMxNx4Func sad_32x32x4d_avx2 = vp9_sad32x32x4d_avx2;
INSTANTIATE_TEST_CASE_P(AVX2, SADx4Test, ::testing::Values(
make_tuple(32, 32, sad_32x32x4d_avx2, -1),
make_tuple(64, 64, sad_64x64x4d_avx2, -1)));
-#endif // CONFIG_VP9_ENCODER
#endif // HAVE_AVX2
+#if HAVE_NEON
+const SadMxNx4Func sad_16x16x4d_neon = vp9_sad16x16x4d_neon;
+const SadMxNx4Func sad_32x32x4d_neon = vp9_sad32x32x4d_neon;
+const SadMxNx4Func sad_64x64x4d_neon = vp9_sad64x64x4d_neon;
+INSTANTIATE_TEST_CASE_P(NEON, SADx4Test, ::testing::Values(
+ make_tuple(16, 16, sad_16x16x4d_neon, -1),
+ make_tuple(32, 32, sad_32x32x4d_neon, -1),
+ make_tuple(64, 64, sad_64x64x4d_neon, -1)));
+#endif // HAVE_NEON
+#endif // CONFIG_VP9_ENCODER
+
} // namespace
diff --git a/third_party/x86inc/x86inc.asm b/third_party/x86inc/x86inc.asm
index 8e75a4b..9273fc9 100644
--- a/third_party/x86inc/x86inc.asm
+++ b/third_party/x86inc/x86inc.asm
@@ -78,6 +78,9 @@
%macro SECTION_RODATA 0-1 16
%ifidn __OUTPUT_FORMAT__,macho64
SECTION .text align=%1
+ %elifidn __OUTPUT_FORMAT__,macho32
+ SECTION .text align=%1
+ fakegot:
%elifidn __OUTPUT_FORMAT__,macho
SECTION .text align=%1
fakegot:
diff --git a/vp8/encoder/onyx_if.c b/vp8/encoder/onyx_if.c
index 41b3066..53d9fbb 100644
--- a/vp8/encoder/onyx_if.c
+++ b/vp8/encoder/onyx_if.c
@@ -11,6 +11,7 @@
#include "vpx_config.h"
#include "./vpx_scale_rtcd.h"
+#include "./vp8_rtcd.h"
#include "vp8/common/onyxc_int.h"
#include "vp8/common/blockd.h"
#include "onyx_int.h"
@@ -1760,8 +1761,16 @@
reset_temporal_layer_change(cpi, oxcf, prev_number_of_layers);
}
+ if (!cpi->initial_width)
+ {
+ cpi->initial_width = cpi->oxcf.Width;
+ cpi->initial_height = cpi->oxcf.Height;
+ }
+
cm->Width = cpi->oxcf.Width;
cm->Height = cpi->oxcf.Height;
+ assert(cm->Width <= cpi->initial_width);
+ assert(cm->Height <= cpi->initial_height);
/* TODO(jkoleszar): if an internal spatial resampling is active,
* and we downsize the input image, maybe we should clear the
diff --git a/vp8/encoder/onyx_int.h b/vp8/encoder/onyx_int.h
index b1a749c..82d7453 100644
--- a/vp8/encoder/onyx_int.h
+++ b/vp8/encoder/onyx_int.h
@@ -665,6 +665,9 @@
int droppable;
+ int initial_width;
+ int initial_height;
+
#if CONFIG_TEMPORAL_DENOISING
VP8_DENOISER denoiser;
#endif
diff --git a/vp8/vp8_cx_iface.c b/vp8/vp8_cx_iface.c
index f81f078..96b4cb5 100644
--- a/vp8/vp8_cx_iface.c
+++ b/vp8/vp8_cx_iface.c
@@ -447,9 +447,14 @@
{
vpx_codec_err_t res;
- if (((cfg->g_w != ctx->cfg.g_w) || (cfg->g_h != ctx->cfg.g_h))
- && (cfg->g_lag_in_frames > 1 || cfg->g_pass != VPX_RC_ONE_PASS))
- ERROR("Cannot change width or height after initialization");
+ if (cfg->g_w != ctx->cfg.g_w || cfg->g_h != ctx->cfg.g_h)
+ {
+ if (cfg->g_lag_in_frames > 1 || cfg->g_pass != VPX_RC_ONE_PASS)
+ ERROR("Cannot change width or height after initialization");
+ if ((ctx->cpi->initial_width && (int)cfg->g_w > ctx->cpi->initial_width) ||
+ (ctx->cpi->initial_height && (int)cfg->g_h > ctx->cpi->initial_height))
+ ERROR("Cannot increast width or height larger than their initial values");
+ }
/* Prevent increasing lag_in_frames. This check is stricter than it needs
* to be -- the limit is not increasing past the first lag_in_frames
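
Together with the initial_width/initial_height fields added to VP8_COMP above, the user-visible effect is that a one-pass, zero-lag VP8 encoder can still be resized downward at runtime but can no longer grow past the dimensions it was created with. A hedged sketch of that behavior against the stock vpx_encoder.h API (function name ours, error handling trimmed):

#include "vpx/vpx_encoder.h"
#include "vpx/vp8cx.h"

static int resize_demo(void) {
  vpx_codec_ctx_t ctx;
  vpx_codec_enc_cfg_t cfg;
  vpx_codec_enc_config_default(vpx_codec_vp8_cx(), &cfg, 0);
  cfg.g_w = 640;
  cfg.g_h = 480;
  cfg.g_lag_in_frames = 0;
  cfg.g_pass = VPX_RC_ONE_PASS;
  if (vpx_codec_enc_init(&ctx, vpx_codec_vp8_cx(), &cfg, 0)) return -1;

  cfg.g_w = 320;   /* shrink below the initial 640x480: still accepted */
  cfg.g_h = 240;
  if (vpx_codec_enc_config_set(&ctx, &cfg) != VPX_CODEC_OK) return -1;

  cfg.g_w = 1280;  /* grow past the initial 640x480: now rejected */
  cfg.g_h = 720;
  if (vpx_codec_enc_config_set(&ctx, &cfg) == VPX_CODEC_OK) return -1;

  vpx_codec_destroy(&ctx);
  return 0;
}
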
diff --git a/vp9/common/vp9_entropymode.c b/vp9/common/vp9_entropymode.c
index 4557e19..47e5164 100644
--- a/vp9/common/vp9_entropymode.c
+++ b/vp9/common/vp9_entropymode.c
@@ -334,20 +334,6 @@
-EIGHTTAP_SMOOTH, -EIGHTTAP_SHARP
};
-#define COUNT_SAT 20
-#define MAX_UPDATE_FACTOR 128
-
-static int adapt_prob(vp9_prob pre_prob, const unsigned int ct[2]) {
- return merge_probs(pre_prob, ct, COUNT_SAT, MAX_UPDATE_FACTOR);
-}
-
-static void adapt_probs(const vp9_tree_index *tree,
- const vp9_prob *pre_probs, const unsigned int *counts,
- vp9_prob *probs) {
- vp9_tree_merge_probs(tree, pre_probs, counts, COUNT_SAT, MAX_UPDATE_FACTOR,
- probs);
-}
-
void vp9_adapt_mode_probs(VP9_COMMON *cm) {
int i, j;
FRAME_CONTEXT *fc = cm->fc;
@@ -355,39 +341,41 @@
const FRAME_COUNTS *counts = &cm->counts;
for (i = 0; i < INTRA_INTER_CONTEXTS; i++)
- fc->intra_inter_prob[i] = adapt_prob(pre_fc->intra_inter_prob[i],
- counts->intra_inter[i]);
+ fc->intra_inter_prob[i] = mode_mv_merge_probs(pre_fc->intra_inter_prob[i],
+ counts->intra_inter[i]);
for (i = 0; i < COMP_INTER_CONTEXTS; i++)
- fc->comp_inter_prob[i] = adapt_prob(pre_fc->comp_inter_prob[i],
- counts->comp_inter[i]);
+ fc->comp_inter_prob[i] = mode_mv_merge_probs(pre_fc->comp_inter_prob[i],
+ counts->comp_inter[i]);
for (i = 0; i < REF_CONTEXTS; i++)
- fc->comp_ref_prob[i] = adapt_prob(pre_fc->comp_ref_prob[i],
- counts->comp_ref[i]);
+ fc->comp_ref_prob[i] = mode_mv_merge_probs(pre_fc->comp_ref_prob[i],
+ counts->comp_ref[i]);
for (i = 0; i < REF_CONTEXTS; i++)
for (j = 0; j < 2; j++)
- fc->single_ref_prob[i][j] = adapt_prob(pre_fc->single_ref_prob[i][j],
- counts->single_ref[i][j]);
+ fc->single_ref_prob[i][j] = mode_mv_merge_probs(
+ pre_fc->single_ref_prob[i][j], counts->single_ref[i][j]);
for (i = 0; i < INTER_MODE_CONTEXTS; i++)
- adapt_probs(vp9_inter_mode_tree, pre_fc->inter_mode_probs[i],
+ vp9_tree_merge_probs(vp9_inter_mode_tree, pre_fc->inter_mode_probs[i],
counts->inter_mode[i], fc->inter_mode_probs[i]);
for (i = 0; i < BLOCK_SIZE_GROUPS; i++)
- adapt_probs(vp9_intra_mode_tree, pre_fc->y_mode_prob[i],
+ vp9_tree_merge_probs(vp9_intra_mode_tree, pre_fc->y_mode_prob[i],
counts->y_mode[i], fc->y_mode_prob[i]);
for (i = 0; i < INTRA_MODES; ++i)
- adapt_probs(vp9_intra_mode_tree, pre_fc->uv_mode_prob[i],
- counts->uv_mode[i], fc->uv_mode_prob[i]);
+ vp9_tree_merge_probs(vp9_intra_mode_tree, pre_fc->uv_mode_prob[i],
+ counts->uv_mode[i], fc->uv_mode_prob[i]);
for (i = 0; i < PARTITION_CONTEXTS; i++)
- adapt_probs(vp9_partition_tree, pre_fc->partition_prob[i],
- counts->partition[i], fc->partition_prob[i]);
+ vp9_tree_merge_probs(vp9_partition_tree, pre_fc->partition_prob[i],
+ counts->partition[i], fc->partition_prob[i]);
if (cm->interp_filter == SWITCHABLE) {
for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++)
- adapt_probs(vp9_switchable_interp_tree, pre_fc->switchable_interp_prob[i],
- counts->switchable_interp[i], fc->switchable_interp_prob[i]);
+ vp9_tree_merge_probs(vp9_switchable_interp_tree,
+ pre_fc->switchable_interp_prob[i],
+ counts->switchable_interp[i],
+ fc->switchable_interp_prob[i]);
}
if (cm->tx_mode == TX_MODE_SELECT) {
@@ -399,23 +387,24 @@
for (i = 0; i < TX_SIZE_CONTEXTS; ++i) {
tx_counts_to_branch_counts_8x8(counts->tx.p8x8[i], branch_ct_8x8p);
for (j = 0; j < TX_SIZES - 3; ++j)
- fc->tx_probs.p8x8[i][j] = adapt_prob(pre_fc->tx_probs.p8x8[i][j],
- branch_ct_8x8p[j]);
+ fc->tx_probs.p8x8[i][j] = mode_mv_merge_probs(
+ pre_fc->tx_probs.p8x8[i][j], branch_ct_8x8p[j]);
tx_counts_to_branch_counts_16x16(counts->tx.p16x16[i], branch_ct_16x16p);
for (j = 0; j < TX_SIZES - 2; ++j)
- fc->tx_probs.p16x16[i][j] = adapt_prob(pre_fc->tx_probs.p16x16[i][j],
- branch_ct_16x16p[j]);
+ fc->tx_probs.p16x16[i][j] = mode_mv_merge_probs(
+ pre_fc->tx_probs.p16x16[i][j], branch_ct_16x16p[j]);
tx_counts_to_branch_counts_32x32(counts->tx.p32x32[i], branch_ct_32x32p);
for (j = 0; j < TX_SIZES - 1; ++j)
- fc->tx_probs.p32x32[i][j] = adapt_prob(pre_fc->tx_probs.p32x32[i][j],
- branch_ct_32x32p[j]);
+ fc->tx_probs.p32x32[i][j] = mode_mv_merge_probs(
+ pre_fc->tx_probs.p32x32[i][j], branch_ct_32x32p[j]);
}
}
for (i = 0; i < SKIP_CONTEXTS; ++i)
- fc->skip_probs[i] = adapt_prob(pre_fc->skip_probs[i], counts->skip[i]);
+ fc->skip_probs[i] = mode_mv_merge_probs(
+ pre_fc->skip_probs[i], counts->skip[i]);
}
static void set_default_lf_deltas(struct loopfilter *lf) {
diff --git a/vp9/common/vp9_entropymv.c b/vp9/common/vp9_entropymv.c
index 922c039..2477e6e 100644
--- a/vp9/common/vp9_entropymv.c
+++ b/vp9/common/vp9_entropymv.c
@@ -11,9 +11,6 @@
#include "vp9/common/vp9_onyxc_int.h"
#include "vp9/common/vp9_entropymv.h"
-#define MV_COUNT_SAT 20
-#define MV_MAX_UPDATE_FACTOR 128
-
// Integer pel reference mv threshold for use of high-precision 1/8 mv
#define COMPANDED_MVREF_THRESH 8
@@ -183,16 +180,6 @@
}
}
-static vp9_prob adapt_prob(vp9_prob prep, const unsigned int ct[2]) {
- return merge_probs(prep, ct, MV_COUNT_SAT, MV_MAX_UPDATE_FACTOR);
-}
-
-static void adapt_probs(const vp9_tree_index *tree, const vp9_prob *pre_probs,
- const unsigned int *counts, vp9_prob *probs) {
- vp9_tree_merge_probs(tree, pre_probs, counts, MV_COUNT_SAT,
- MV_MAX_UPDATE_FACTOR, probs);
-}
-
void vp9_adapt_mv_probs(VP9_COMMON *cm, int allow_hp) {
int i, j;
@@ -200,30 +187,32 @@
const nmv_context *pre_fc = &cm->frame_contexts[cm->frame_context_idx].nmvc;
const nmv_context_counts *counts = &cm->counts.mv;
- adapt_probs(vp9_mv_joint_tree, pre_fc->joints, counts->joints, fc->joints);
+ vp9_tree_merge_probs(vp9_mv_joint_tree, pre_fc->joints, counts->joints,
+ fc->joints);
for (i = 0; i < 2; ++i) {
nmv_component *comp = &fc->comps[i];
const nmv_component *pre_comp = &pre_fc->comps[i];
const nmv_component_counts *c = &counts->comps[i];
- comp->sign = adapt_prob(pre_comp->sign, c->sign);
- adapt_probs(vp9_mv_class_tree, pre_comp->classes, c->classes,
- comp->classes);
- adapt_probs(vp9_mv_class0_tree, pre_comp->class0, c->class0, comp->class0);
+ comp->sign = mode_mv_merge_probs(pre_comp->sign, c->sign);
+ vp9_tree_merge_probs(vp9_mv_class_tree, pre_comp->classes, c->classes,
+ comp->classes);
+ vp9_tree_merge_probs(vp9_mv_class0_tree, pre_comp->class0, c->class0,
+ comp->class0);
for (j = 0; j < MV_OFFSET_BITS; ++j)
- comp->bits[j] = adapt_prob(pre_comp->bits[j], c->bits[j]);
+ comp->bits[j] = mode_mv_merge_probs(pre_comp->bits[j], c->bits[j]);
for (j = 0; j < CLASS0_SIZE; ++j)
- adapt_probs(vp9_mv_fp_tree, pre_comp->class0_fp[j], c->class0_fp[j],
- comp->class0_fp[j]);
+ vp9_tree_merge_probs(vp9_mv_fp_tree, pre_comp->class0_fp[j],
+ c->class0_fp[j], comp->class0_fp[j]);
- adapt_probs(vp9_mv_fp_tree, pre_comp->fp, c->fp, comp->fp);
+ vp9_tree_merge_probs(vp9_mv_fp_tree, pre_comp->fp, c->fp, comp->fp);
if (allow_hp) {
- comp->class0_hp = adapt_prob(pre_comp->class0_hp, c->class0_hp);
- comp->hp = adapt_prob(pre_comp->hp, c->hp);
+ comp->class0_hp = mode_mv_merge_probs(pre_comp->class0_hp, c->class0_hp);
+ comp->hp = mode_mv_merge_probs(pre_comp->hp, c->hp);
}
}
}
diff --git a/vp9/common/vp9_mfqe.c b/vp9/common/vp9_mfqe.c
index e7ee903..1494c3f 100644
--- a/vp9/common/vp9_mfqe.c
+++ b/vp9/common/vp9_mfqe.c
@@ -35,14 +35,26 @@
}
}
+void vp9_filter_by_weight8x8_c(const uint8_t *src, int src_stride,
+ uint8_t *dst, int dst_stride, int src_weight) {
+ filter_by_weight(src, src_stride, dst, dst_stride, 8, src_weight);
+}
+
+void vp9_filter_by_weight16x16_c(const uint8_t *src, int src_stride,
+ uint8_t *dst, int dst_stride,
+ int src_weight) {
+ filter_by_weight(src, src_stride, dst, dst_stride, 16, src_weight);
+}
+
static void filter_by_weight32x32(const uint8_t *src, int src_stride,
uint8_t *dst, int dst_stride, int weight) {
- filter_by_weight(src, src_stride, dst, dst_stride, 16, weight);
- filter_by_weight(src + 16, src_stride, dst + 16, dst_stride, 16, weight);
- filter_by_weight(src + src_stride * 16, src_stride, dst + dst_stride * 16,
- dst_stride, 16, weight);
- filter_by_weight(src + src_stride * 16 + 16, src_stride,
- dst + dst_stride * 16 + 16, dst_stride, 16, weight);
+ vp9_filter_by_weight16x16(src, src_stride, dst, dst_stride, weight);
+ vp9_filter_by_weight16x16(src + 16, src_stride, dst + 16, dst_stride,
+ weight);
+ vp9_filter_by_weight16x16(src + src_stride * 16, src_stride,
+ dst + dst_stride * 16, dst_stride, weight);
+ vp9_filter_by_weight16x16(src + src_stride * 16 + 16, src_stride,
+ dst + dst_stride * 16 + 16, dst_stride, weight);
}
static void filter_by_weight64x64(const uint8_t *src, int src_stride,
@@ -62,13 +74,13 @@
int uvd_stride, BLOCK_SIZE block_size,
int weight) {
if (block_size == BLOCK_16X16) {
- filter_by_weight(y, y_stride, yd, yd_stride, 16, weight);
- filter_by_weight(u, uv_stride, ud, uvd_stride, 8, weight);
- filter_by_weight(v, uv_stride, vd, uvd_stride, 8, weight);
+ vp9_filter_by_weight16x16(y, y_stride, yd, yd_stride, weight);
+ vp9_filter_by_weight8x8(u, uv_stride, ud, uvd_stride, weight);
+ vp9_filter_by_weight8x8(v, uv_stride, vd, uvd_stride, weight);
} else if (block_size == BLOCK_32X32) {
filter_by_weight32x32(y, y_stride, yd, yd_stride, weight);
- filter_by_weight(u, uv_stride, ud, uvd_stride, 16, weight);
- filter_by_weight(v, uv_stride, vd, uvd_stride, 16, weight);
+ vp9_filter_by_weight16x16(u, uv_stride, ud, uvd_stride, weight);
+ vp9_filter_by_weight16x16(v, uv_stride, vd, uvd_stride, weight);
} else if (block_size == BLOCK_64X64) {
filter_by_weight64x64(y, y_stride, yd, yd_stride, weight);
filter_by_weight32x32(u, uv_stride, ud, uvd_stride, weight);
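
All of the vp9_filter_by_weight* sizes reduce to the same scalar blend as the static filter_by_weight() earlier in this file: each destination pixel becomes a weighted average of src and dst in sixteenths (MFQE_PRECISION is 4, matching the tMFQE constants in the SSE2 version below). A minimal scalar model, with the name chosen here for illustration:

#include <stdint.h>

static void filter_by_weight_model(const uint8_t *src, int src_stride,
                                   uint8_t *dst, int dst_stride,
                                   int block_size, int src_weight) {
  const int dst_weight = 16 - src_weight;  /* weights sum to 1 << 4 */
  int r, c;
  for (r = 0; r < block_size; ++r) {
    for (c = 0; c < block_size; ++c)
      dst[c] = (src[c] * src_weight + dst[c] * dst_weight + 8) >> 4;
    src += src_stride;
    dst += dst_stride;
  }
}
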
diff --git a/vp9/common/vp9_prob.c b/vp9/common/vp9_prob.c
index a1befc6..3b7b9bf 100644
--- a/vp9/common/vp9_prob.c
+++ b/vp9/common/vp9_prob.c
@@ -29,33 +29,25 @@
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
-
static unsigned int tree_merge_probs_impl(unsigned int i,
const vp9_tree_index *tree,
const vp9_prob *pre_probs,
const unsigned int *counts,
- unsigned int count_sat,
- unsigned int max_update,
vp9_prob *probs) {
const int l = tree[i];
const unsigned int left_count = (l <= 0)
? counts[-l]
- : tree_merge_probs_impl(l, tree, pre_probs, counts,
- count_sat, max_update, probs);
+ : tree_merge_probs_impl(l, tree, pre_probs, counts, probs);
const int r = tree[i + 1];
const unsigned int right_count = (r <= 0)
? counts[-r]
- : tree_merge_probs_impl(r, tree, pre_probs, counts,
- count_sat, max_update, probs);
+ : tree_merge_probs_impl(r, tree, pre_probs, counts, probs);
const unsigned int ct[2] = { left_count, right_count };
- probs[i >> 1] = merge_probs(pre_probs[i >> 1], ct,
- count_sat, max_update);
+ probs[i >> 1] = mode_mv_merge_probs(pre_probs[i >> 1], ct);
return left_count + right_count;
}
void vp9_tree_merge_probs(const vp9_tree_index *tree, const vp9_prob *pre_probs,
- const unsigned int *counts, unsigned int count_sat,
- unsigned int max_update_factor, vp9_prob *probs) {
- tree_merge_probs_impl(0, tree, pre_probs, counts, count_sat,
- max_update_factor, probs);
+ const unsigned int *counts, vp9_prob *probs) {
+ tree_merge_probs_impl(0, tree, pre_probs, counts, probs);
}
diff --git a/vp9/common/vp9_prob.h b/vp9/common/vp9_prob.h
index bc1511a..c69c62c 100644
--- a/vp9/common/vp9_prob.h
+++ b/vp9/common/vp9_prob.h
@@ -33,6 +33,8 @@
#define vp9_complement(x) (255 - x)
+#define MODE_MV_COUNT_SAT 20
+
/* We build coding trees compactly in arrays.
Each node of the tree is a pair of vp9_tree_indices.
Array index often references a corresponding probability table.
@@ -69,9 +71,28 @@
return weighted_prob(pre_prob, prob, factor);
}
+// MODE_MV_MAX_UPDATE_FACTOR (128) * count / MODE_MV_COUNT_SAT;
+static const int count_to_update_factor[MODE_MV_COUNT_SAT + 1] = {
+ 0, 6, 12, 19, 25, 32, 38, 44, 51, 57, 64,
+ 70, 76, 83, 89, 96, 102, 108, 115, 121, 128
+};
+
+static INLINE vp9_prob mode_mv_merge_probs(vp9_prob pre_prob,
+ const unsigned int ct[2]) {
+ const unsigned int den = ct[0] + ct[1];
+ if (den == 0) {
+ return pre_prob;
+ } else {
+ const unsigned int count = MIN(den, MODE_MV_COUNT_SAT);
+ const unsigned int factor = count_to_update_factor[count];
+ const vp9_prob prob =
+ clip_prob(((int64_t)(ct[0]) * 256 + (den >> 1)) / den);
+ return weighted_prob(pre_prob, prob, factor);
+ }
+}
+
void vp9_tree_merge_probs(const vp9_tree_index *tree, const vp9_prob *pre_probs,
- const unsigned int *counts, unsigned int count_sat,
- unsigned int max_update_factor, vp9_prob *probs);
+ const unsigned int *counts, vp9_prob *probs);
DECLARE_ALIGNED(16, extern const uint8_t, vp9_norm[256]);
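
The table above just folds the old merge_probs() tuning constants into data: for den = ct[0] + ct[1] branch counts, the factor is MODE_MV_MAX_UPDATE_FACTOR (128) scaled by MIN(den, MODE_MV_COUNT_SAT) / MODE_MV_COUNT_SAT with truncating integer division. A minimal sketch that regenerates it (the main() harness is illustrative, not part of the patch):

#include <stdio.h>

#define MODE_MV_COUNT_SAT 20
#define MODE_MV_MAX_UPDATE_FACTOR 128

int main(void) {
  int count;
  /* Prints 0 6 12 19 ... 121 128 -- matching
   * count_to_update_factor[] in vp9_prob.h. */
  for (count = 0; count <= MODE_MV_COUNT_SAT; ++count)
    printf("%d%c", MODE_MV_MAX_UPDATE_FACTOR * count / MODE_MV_COUNT_SAT,
           count < MODE_MV_COUNT_SAT ? ' ' : '\n');
  return 0;
}
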
diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl
index 4e9ec0f..a1b15e8 100644
--- a/vp9/common/vp9_rtcd_defs.pl
+++ b/vp9/common/vp9_rtcd_defs.pl
@@ -274,6 +274,12 @@
add_proto qw/void vp9_plane_add_noise/, "uint8_t *Start, char *noise, char blackclamp[16], char whiteclamp[16], char bothclamp[16], unsigned int Width, unsigned int Height, int Pitch";
specialize qw/vp9_plane_add_noise sse2/;
$vp9_plane_add_noise_sse2=vp9_plane_add_noise_wmt;
+
+add_proto qw/void vp9_filter_by_weight16x16/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int src_weight";
+specialize qw/vp9_filter_by_weight16x16 sse2/;
+
+add_proto qw/void vp9_filter_by_weight8x8/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int src_weight";
+specialize qw/vp9_filter_by_weight8x8 sse2/;
}
#
@@ -1043,7 +1049,7 @@
specialize qw/vp9_sad4x4x8 sse4/;
add_proto qw/void vp9_sad64x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array";
-specialize qw/vp9_sad64x64x4d sse2 avx2/;
+specialize qw/vp9_sad64x64x4d sse2 avx2 neon/;
add_proto qw/void vp9_sad32x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array";
specialize qw/vp9_sad32x64x4d sse2/;
@@ -1058,10 +1064,10 @@
specialize qw/vp9_sad16x32x4d sse2/;
add_proto qw/void vp9_sad32x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array";
-specialize qw/vp9_sad32x32x4d sse2 avx2/;
+specialize qw/vp9_sad32x32x4d sse2 avx2 neon/;
add_proto qw/void vp9_sad16x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array";
-specialize qw/vp9_sad16x16x4d sse2/;
+specialize qw/vp9_sad16x16x4d sse2 neon/;
add_proto qw/void vp9_sad16x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array";
specialize qw/vp9_sad16x8x4d sse2/;
@@ -1160,7 +1166,7 @@
specialize qw/vp9_quantize_b_32x32/, "$ssse3_x86_64";
add_proto qw/void vp9_fdct8x8_quant/, "const int16_t *input, int stride, tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
- specialize qw/vp9_fdct8x8_quant sse2 ssse3/;
+ specialize qw/vp9_fdct8x8_quant sse2 ssse3 neon/;
}
#
diff --git a/vp9/common/x86/vp9_mfqe_sse2.asm b/vp9/common/x86/vp9_mfqe_sse2.asm
new file mode 100644
index 0000000..6029420
--- /dev/null
+++ b/vp9/common/x86/vp9_mfqe_sse2.asm
@@ -0,0 +1,287 @@
+;
+; Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+; This file is a duplicate of mfqe_sse2.asm in VP8.
+; TODO(jackychen): Find a way to fix the duplicate.
+%include "vpx_ports/x86_abi_support.asm"
+
+;void vp9_filter_by_weight16x16_sse2
+;(
+; unsigned char *src,
+; int src_stride,
+; unsigned char *dst,
+; int dst_stride,
+; int src_weight
+;)
+global sym(vp9_filter_by_weight16x16_sse2) PRIVATE
+sym(vp9_filter_by_weight16x16_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ SAVE_XMM 6
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ movd xmm0, arg(4) ; src_weight
+ pshuflw xmm0, xmm0, 0x0 ; replicate to all low words
+ punpcklqdq xmm0, xmm0 ; replicate to all hi words
+
+ movdqa xmm1, [GLOBAL(tMFQE)]
+ psubw xmm1, xmm0 ; dst_weight
+
+ mov rax, arg(0) ; src
+ mov rsi, arg(1) ; src_stride
+ mov rdx, arg(2) ; dst
+ mov rdi, arg(3) ; dst_stride
+
+ mov rcx, 16 ; loop count
+ pxor xmm6, xmm6
+
+.combine
+ movdqa xmm2, [rax]
+ movdqa xmm4, [rdx]
+ add rax, rsi
+
+ ; src * src_weight
+ movdqa xmm3, xmm2
+ punpcklbw xmm2, xmm6
+ punpckhbw xmm3, xmm6
+ pmullw xmm2, xmm0
+ pmullw xmm3, xmm0
+
+ ; dst * dst_weight
+ movdqa xmm5, xmm4
+ punpcklbw xmm4, xmm6
+ punpckhbw xmm5, xmm6
+ pmullw xmm4, xmm1
+ pmullw xmm5, xmm1
+
+ ; sum, round and shift
+ paddw xmm2, xmm4
+ paddw xmm3, xmm5
+ paddw xmm2, [GLOBAL(tMFQE_round)]
+ paddw xmm3, [GLOBAL(tMFQE_round)]
+ psrlw xmm2, 4
+ psrlw xmm3, 4
+
+ packuswb xmm2, xmm3
+ movdqa [rdx], xmm2
+ add rdx, rdi
+
+ dec rcx
+ jnz .combine
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+
+ ret
+
+;void vp9_filter_by_weight8x8_sse2
+;(
+; unsigned char *src,
+; int src_stride,
+; unsigned char *dst,
+; int dst_stride,
+; int src_weight
+;)
+global sym(vp9_filter_by_weight8x8_sse2) PRIVATE
+sym(vp9_filter_by_weight8x8_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ movd xmm0, arg(4) ; src_weight
+ pshuflw xmm0, xmm0, 0x0 ; replicate to all low words
+ punpcklqdq xmm0, xmm0 ; replicate to all hi words
+
+ movdqa xmm1, [GLOBAL(tMFQE)]
+ psubw xmm1, xmm0 ; dst_weight
+
+ mov rax, arg(0) ; src
+ mov rsi, arg(1) ; src_stride
+ mov rdx, arg(2) ; dst
+ mov rdi, arg(3) ; dst_stride
+
+ mov rcx, 8 ; loop count
+ pxor xmm4, xmm4
+
+.combine
+ movq xmm2, [rax]
+ movq xmm3, [rdx]
+ add rax, rsi
+
+ ; src * src_weight
+ punpcklbw xmm2, xmm4
+ pmullw xmm2, xmm0
+
+ ; dst * dst_weight
+ punpcklbw xmm3, xmm4
+ pmullw xmm3, xmm1
+
+ ; sum, round and shift
+ paddw xmm2, xmm3
+ paddw xmm2, [GLOBAL(tMFQE_round)]
+ psrlw xmm2, 4
+
+ packuswb xmm2, xmm4
+ movq [rdx], xmm2
+ add rdx, rdi
+
+ dec rcx
+ jnz .combine
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+
+ ret
+
+;void vp9_variance_and_sad_16x16_sse2 | arg
+;(
+; unsigned char *src1, 0
+; int stride1, 1
+; unsigned char *src2, 2
+; int stride2, 3
+; unsigned int *variance, 4
+; unsigned int *sad, 5
+;)
+global sym(vp9_variance_and_sad_16x16_sse2) PRIVATE
+sym(vp9_variance_and_sad_16x16_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rax, arg(0) ; src1
+ mov rcx, arg(1) ; stride1
+ mov rdx, arg(2) ; src2
+ mov rdi, arg(3) ; stride2
+
+ mov rsi, 16 ; block height
+
+ ; Prep accumulator registers
+ pxor xmm3, xmm3 ; SAD
+ pxor xmm4, xmm4 ; sum of src2
+ pxor xmm5, xmm5 ; sum of src2^2
+
+ ; Because we're working with the actual output frames
+ ; we can't depend on any kind of data alignment.
+.accumulate
+ movdqa xmm0, [rax] ; src1
+ movdqa xmm1, [rdx] ; src2
+ add rax, rcx ; src1 + stride1
+ add rdx, rdi ; src2 + stride2
+
+ ; SAD(src1, src2)
+ psadbw xmm0, xmm1
+ paddusw xmm3, xmm0
+
+ ; SUM(src2)
+ pxor xmm2, xmm2
+ psadbw xmm2, xmm1 ; sum src2 by misusing SAD against 0
+ paddusw xmm4, xmm2
+
+ ; pmaddubsw would be ideal if it took two unsigned values. instead,
+ ; it expects a signed and an unsigned value. so instead we zero extend
+ ; and operate on words.
+ pxor xmm2, xmm2
+ movdqa xmm0, xmm1
+ punpcklbw xmm0, xmm2
+ punpckhbw xmm1, xmm2
+ pmaddwd xmm0, xmm0
+ pmaddwd xmm1, xmm1
+ paddd xmm5, xmm0
+ paddd xmm5, xmm1
+
+ sub rsi, 1
+ jnz .accumulate
+
+ ; phaddd only operates on adjacent double words.
+ ; Finalize SAD and store
+ movdqa xmm0, xmm3
+ psrldq xmm0, 8
+ paddusw xmm0, xmm3
+ paddd xmm0, [GLOBAL(t128)]
+ psrld xmm0, 8
+
+ mov rax, arg(5)
+ movd [rax], xmm0
+
+ ; Accumulate sum of src2
+ movdqa xmm0, xmm4
+ psrldq xmm0, 8
+ paddusw xmm0, xmm4
+ ; Square src2. Ignore high value
+ pmuludq xmm0, xmm0
+ psrld xmm0, 8
+
+ ; phaddw could be used to sum adjacent values but we want
+ ; all the values summed. promote to doubles, accumulate,
+ ; shift and sum
+ pxor xmm2, xmm2
+ movdqa xmm1, xmm5
+ punpckldq xmm1, xmm2
+ punpckhdq xmm5, xmm2
+ paddd xmm1, xmm5
+ movdqa xmm2, xmm1
+ psrldq xmm1, 8
+ paddd xmm1, xmm2
+
+ psubd xmm1, xmm0
+
+ ; (variance + 128) >> 8
+ paddd xmm1, [GLOBAL(t128)]
+ psrld xmm1, 8
+ mov rax, arg(4)
+
+ movd [rax], xmm1
+
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+SECTION_RODATA
+align 16
+t128:
+%ifndef __NASM_VER__
+ ddq 128
+%elif CONFIG_BIG_ENDIAN
+ dq 0, 128
+%else
+ dq 128, 0
+%endif
+align 16
+tMFQE: ; 1 << MFQE_PRECISION
+ times 8 dw 0x10
+align 16
+tMFQE_round: ; 1 << (MFQE_PRECISION - 1)
+ times 8 dw 0x08
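
A hedged scalar reading of vp9_variance_and_sad_16x16_sse2 above, for cross-checking: over the 256 pixels it produces a rounded per-pixel SAD, (SAD(src1, src2) + 128) >> 8, and a rounded per-pixel variance of src2, (sum_sq - sum*sum/256 + 128) >> 8:

#include <stdint.h>

static void variance_and_sad_16x16_model(const uint8_t *src1, int stride1,
                                         const uint8_t *src2, int stride2,
                                         unsigned int *variance,
                                         unsigned int *sad) {
  unsigned int sad_acc = 0, sum = 0, sse = 0;
  int r, c;
  for (r = 0; r < 16; ++r) {
    for (c = 0; c < 16; ++c) {
      const int a = src1[c], b = src2[c];
      sad_acc += a > b ? a - b : b - a;   /* psadbw accumulation */
      sum += b;                           /* psadbw against zero */
      sse += b * b;                       /* pmaddwd accumulation */
    }
    src1 += stride1;
    src2 += stride2;
  }
  *sad = (sad_acc + 128) >> 8;
  *variance = (sse - ((sum * sum) >> 8) + 128) >> 8;
}
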
diff --git a/vp9/encoder/arm/neon/vp9_dct_neon.c b/vp9/encoder/arm/neon/vp9_dct_neon.c
index 6c66f5d..a6d4797 100644
--- a/vp9/encoder/arm/neon/vp9_dct_neon.c
+++ b/vp9/encoder/arm/neon/vp9_dct_neon.c
@@ -32,6 +32,24 @@
}
}
+void vp9_fdct8x8_quant_neon(const int16_t *input, int stride,
+ int16_t* coeff_ptr, intptr_t n_coeffs,
+ int skip_block, const int16_t* zbin_ptr,
+ const int16_t* round_ptr, const int16_t* quant_ptr,
+ const int16_t* quant_shift_ptr,
+ int16_t* qcoeff_ptr, int16_t* dqcoeff_ptr,
+ const int16_t* dequant_ptr, uint16_t* eob_ptr,
+ const int16_t* scan_ptr,
+ const int16_t* iscan_ptr) {
+ int16_t temp_buffer[64];
+ (void)coeff_ptr;
+
+ vp9_fdct8x8_neon(input, temp_buffer, stride);
+ vp9_quantize_fp_neon(temp_buffer, n_coeffs, skip_block, zbin_ptr, round_ptr,
+ quant_ptr, quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr,
+ dequant_ptr, eob_ptr, scan_ptr, iscan_ptr);
+}
+
void vp9_fdct8x8_neon(const int16_t *input, int16_t *final_output, int stride) {
int i;
// stage 1
diff --git a/vp9/encoder/arm/neon/vp9_sad4d_neon.c b/vp9/encoder/arm/neon/vp9_sad4d_neon.c
new file mode 100644
index 0000000..cec1689
--- /dev/null
+++ b/vp9/encoder/arm/neon/vp9_sad4d_neon.c
@@ -0,0 +1,226 @@
+/*
+ * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include "./vp9_rtcd.h"
+#include "./vpx_config.h"
+
+#include "vpx/vpx_integer.h"
+
+static INLINE unsigned int horizontal_long_add_16x8(const uint16x8_t vec_lo,
+ const uint16x8_t vec_hi) {
+ const uint32x4_t vec_l_lo = vaddl_u16(vget_low_u16(vec_lo),
+ vget_high_u16(vec_lo));
+ const uint32x4_t vec_l_hi = vaddl_u16(vget_low_u16(vec_hi),
+ vget_high_u16(vec_hi));
+ const uint32x4_t a = vaddq_u32(vec_l_lo, vec_l_hi);
+ const uint64x2_t b = vpaddlq_u32(a);
+ const uint32x2_t c = vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)),
+ vreinterpret_u32_u64(vget_high_u64(b)));
+ return vget_lane_u32(c, 0);
+}
+
+// Calculate the absolute difference of 64 bytes from vec_src_00, vec_src_16,
+// vec_src_32, vec_src_48 and ref. Accumulate partial sums in vec_sum_ref_lo
+// and vec_sum_ref_hi.
+static void sad_neon_64(const uint8x16_t vec_src_00,
+ const uint8x16_t vec_src_16,
+ const uint8x16_t vec_src_32,
+ const uint8x16_t vec_src_48,
+ const uint8_t *ref,
+ uint16x8_t *vec_sum_ref_lo,
+ uint16x8_t *vec_sum_ref_hi) {
+ const uint8x16_t vec_ref_00 = vld1q_u8(ref);
+ const uint8x16_t vec_ref_16 = vld1q_u8(ref + 16);
+ const uint8x16_t vec_ref_32 = vld1q_u8(ref + 32);
+ const uint8x16_t vec_ref_48 = vld1q_u8(ref + 48);
+
+ *vec_sum_ref_lo = vabal_u8(*vec_sum_ref_lo, vget_low_u8(vec_src_00),
+ vget_low_u8(vec_ref_00));
+ *vec_sum_ref_hi = vabal_u8(*vec_sum_ref_hi, vget_high_u8(vec_src_00),
+ vget_high_u8(vec_ref_00));
+ *vec_sum_ref_lo = vabal_u8(*vec_sum_ref_lo, vget_low_u8(vec_src_16),
+ vget_low_u8(vec_ref_16));
+ *vec_sum_ref_hi = vabal_u8(*vec_sum_ref_hi, vget_high_u8(vec_src_16),
+ vget_high_u8(vec_ref_16));
+ *vec_sum_ref_lo = vabal_u8(*vec_sum_ref_lo, vget_low_u8(vec_src_32),
+ vget_low_u8(vec_ref_32));
+ *vec_sum_ref_hi = vabal_u8(*vec_sum_ref_hi, vget_high_u8(vec_src_32),
+ vget_high_u8(vec_ref_32));
+ *vec_sum_ref_lo = vabal_u8(*vec_sum_ref_lo, vget_low_u8(vec_src_48),
+ vget_low_u8(vec_ref_48));
+ *vec_sum_ref_hi = vabal_u8(*vec_sum_ref_hi, vget_high_u8(vec_src_48),
+ vget_high_u8(vec_ref_48));
+}
+
+// Calculate the absolute difference of 32 bytes from vec_src_00, vec_src_16,
+// and ref. Accumulate partial sums in vec_sum_ref_lo and vec_sum_ref_hi.
+static void sad_neon_32(const uint8x16_t vec_src_00,
+ const uint8x16_t vec_src_16,
+ const uint8_t *ref,
+ uint16x8_t *vec_sum_ref_lo,
+ uint16x8_t *vec_sum_ref_hi) {
+ const uint8x16_t vec_ref_00 = vld1q_u8(ref);
+ const uint8x16_t vec_ref_16 = vld1q_u8(ref + 16);
+
+ *vec_sum_ref_lo = vabal_u8(*vec_sum_ref_lo, vget_low_u8(vec_src_00),
+ vget_low_u8(vec_ref_00));
+ *vec_sum_ref_hi = vabal_u8(*vec_sum_ref_hi, vget_high_u8(vec_src_00),
+ vget_high_u8(vec_ref_00));
+ *vec_sum_ref_lo = vabal_u8(*vec_sum_ref_lo, vget_low_u8(vec_src_16),
+ vget_low_u8(vec_ref_16));
+ *vec_sum_ref_hi = vabal_u8(*vec_sum_ref_hi, vget_high_u8(vec_src_16),
+ vget_high_u8(vec_ref_16));
+}
+
+void vp9_sad64x64x4d_neon(const uint8_t *src, int src_stride,
+ const uint8_t* const ref[4], int ref_stride,
+ unsigned int *res) {
+ int i;
+ uint16x8_t vec_sum_ref0_lo = vdupq_n_u16(0);
+ uint16x8_t vec_sum_ref0_hi = vdupq_n_u16(0);
+ uint16x8_t vec_sum_ref1_lo = vdupq_n_u16(0);
+ uint16x8_t vec_sum_ref1_hi = vdupq_n_u16(0);
+ uint16x8_t vec_sum_ref2_lo = vdupq_n_u16(0);
+ uint16x8_t vec_sum_ref2_hi = vdupq_n_u16(0);
+ uint16x8_t vec_sum_ref3_lo = vdupq_n_u16(0);
+ uint16x8_t vec_sum_ref3_hi = vdupq_n_u16(0);
+ const uint8_t *ref0, *ref1, *ref2, *ref3;
+ ref0 = ref[0];
+ ref1 = ref[1];
+ ref2 = ref[2];
+ ref3 = ref[3];
+
+ for (i = 0; i < 64; ++i) {
+ const uint8x16_t vec_src_00 = vld1q_u8(src);
+ const uint8x16_t vec_src_16 = vld1q_u8(src + 16);
+ const uint8x16_t vec_src_32 = vld1q_u8(src + 32);
+ const uint8x16_t vec_src_48 = vld1q_u8(src + 48);
+
+ sad_neon_64(vec_src_00, vec_src_16, vec_src_32, vec_src_48, ref0,
+ &vec_sum_ref0_lo, &vec_sum_ref0_hi);
+ sad_neon_64(vec_src_00, vec_src_16, vec_src_32, vec_src_48, ref1,
+ &vec_sum_ref1_lo, &vec_sum_ref1_hi);
+ sad_neon_64(vec_src_00, vec_src_16, vec_src_32, vec_src_48, ref2,
+ &vec_sum_ref2_lo, &vec_sum_ref2_hi);
+ sad_neon_64(vec_src_00, vec_src_16, vec_src_32, vec_src_48, ref3,
+ &vec_sum_ref3_lo, &vec_sum_ref3_hi);
+
+ src += src_stride;
+ ref0 += ref_stride;
+ ref1 += ref_stride;
+ ref2 += ref_stride;
+ ref3 += ref_stride;
+ }
+
+ res[0] = horizontal_long_add_16x8(vec_sum_ref0_lo, vec_sum_ref0_hi);
+ res[1] = horizontal_long_add_16x8(vec_sum_ref1_lo, vec_sum_ref1_hi);
+ res[2] = horizontal_long_add_16x8(vec_sum_ref2_lo, vec_sum_ref2_hi);
+ res[3] = horizontal_long_add_16x8(vec_sum_ref3_lo, vec_sum_ref3_hi);
+}
+
+void vp9_sad32x32x4d_neon(const uint8_t *src, int src_stride,
+ const uint8_t* const ref[4], int ref_stride,
+ unsigned int *res) {
+ int i;
+ uint16x8_t vec_sum_ref0_lo = vdupq_n_u16(0);
+ uint16x8_t vec_sum_ref0_hi = vdupq_n_u16(0);
+ uint16x8_t vec_sum_ref1_lo = vdupq_n_u16(0);
+ uint16x8_t vec_sum_ref1_hi = vdupq_n_u16(0);
+ uint16x8_t vec_sum_ref2_lo = vdupq_n_u16(0);
+ uint16x8_t vec_sum_ref2_hi = vdupq_n_u16(0);
+ uint16x8_t vec_sum_ref3_lo = vdupq_n_u16(0);
+ uint16x8_t vec_sum_ref3_hi = vdupq_n_u16(0);
+ const uint8_t *ref0, *ref1, *ref2, *ref3;
+ ref0 = ref[0];
+ ref1 = ref[1];
+ ref2 = ref[2];
+ ref3 = ref[3];
+
+ for (i = 0; i < 32; ++i) {
+ const uint8x16_t vec_src_00 = vld1q_u8(src);
+ const uint8x16_t vec_src_16 = vld1q_u8(src + 16);
+
+ sad_neon_32(vec_src_00, vec_src_16, ref0,
+ &vec_sum_ref0_lo, &vec_sum_ref0_hi);
+ sad_neon_32(vec_src_00, vec_src_16, ref1,
+ &vec_sum_ref1_lo, &vec_sum_ref1_hi);
+ sad_neon_32(vec_src_00, vec_src_16, ref2,
+ &vec_sum_ref2_lo, &vec_sum_ref2_hi);
+ sad_neon_32(vec_src_00, vec_src_16, ref3,
+ &vec_sum_ref3_lo, &vec_sum_ref3_hi);
+
+ src += src_stride;
+ ref0 += ref_stride;
+ ref1 += ref_stride;
+ ref2 += ref_stride;
+ ref3 += ref_stride;
+ }
+
+ res[0] = horizontal_long_add_16x8(vec_sum_ref0_lo, vec_sum_ref0_hi);
+ res[1] = horizontal_long_add_16x8(vec_sum_ref1_lo, vec_sum_ref1_hi);
+ res[2] = horizontal_long_add_16x8(vec_sum_ref2_lo, vec_sum_ref2_hi);
+ res[3] = horizontal_long_add_16x8(vec_sum_ref3_lo, vec_sum_ref3_hi);
+}
+
+void vp9_sad16x16x4d_neon(const uint8_t *src, int src_stride,
+ const uint8_t* const ref[4], int ref_stride,
+ unsigned int *res) {
+ int i;
+ uint16x8_t vec_sum_ref0_lo = vdupq_n_u16(0);
+ uint16x8_t vec_sum_ref0_hi = vdupq_n_u16(0);
+ uint16x8_t vec_sum_ref1_lo = vdupq_n_u16(0);
+ uint16x8_t vec_sum_ref1_hi = vdupq_n_u16(0);
+ uint16x8_t vec_sum_ref2_lo = vdupq_n_u16(0);
+ uint16x8_t vec_sum_ref2_hi = vdupq_n_u16(0);
+ uint16x8_t vec_sum_ref3_lo = vdupq_n_u16(0);
+ uint16x8_t vec_sum_ref3_hi = vdupq_n_u16(0);
+ const uint8_t *ref0, *ref1, *ref2, *ref3;
+ ref0 = ref[0];
+ ref1 = ref[1];
+ ref2 = ref[2];
+ ref3 = ref[3];
+
+ for (i = 0; i < 16; ++i) {
+ const uint8x16_t vec_src = vld1q_u8(src);
+ const uint8x16_t vec_ref0 = vld1q_u8(ref0);
+ const uint8x16_t vec_ref1 = vld1q_u8(ref1);
+ const uint8x16_t vec_ref2 = vld1q_u8(ref2);
+ const uint8x16_t vec_ref3 = vld1q_u8(ref3);
+
+ vec_sum_ref0_lo = vabal_u8(vec_sum_ref0_lo, vget_low_u8(vec_src),
+ vget_low_u8(vec_ref0));
+ vec_sum_ref0_hi = vabal_u8(vec_sum_ref0_hi, vget_high_u8(vec_src),
+ vget_high_u8(vec_ref0));
+ vec_sum_ref1_lo = vabal_u8(vec_sum_ref1_lo, vget_low_u8(vec_src),
+ vget_low_u8(vec_ref1));
+ vec_sum_ref1_hi = vabal_u8(vec_sum_ref1_hi, vget_high_u8(vec_src),
+ vget_high_u8(vec_ref1));
+ vec_sum_ref2_lo = vabal_u8(vec_sum_ref2_lo, vget_low_u8(vec_src),
+ vget_low_u8(vec_ref2));
+ vec_sum_ref2_hi = vabal_u8(vec_sum_ref2_hi, vget_high_u8(vec_src),
+ vget_high_u8(vec_ref2));
+ vec_sum_ref3_lo = vabal_u8(vec_sum_ref3_lo, vget_low_u8(vec_src),
+ vget_low_u8(vec_ref3));
+ vec_sum_ref3_hi = vabal_u8(vec_sum_ref3_hi, vget_high_u8(vec_src),
+ vget_high_u8(vec_ref3));
+
+ src += src_stride;
+ ref0 += ref_stride;
+ ref1 += ref_stride;
+ ref2 += ref_stride;
+ ref3 += ref_stride;
+ }
+
+ res[0] = horizontal_long_add_16x8(vec_sum_ref0_lo, vec_sum_ref0_hi);
+ res[1] = horizontal_long_add_16x8(vec_sum_ref1_lo, vec_sum_ref1_hi);
+ res[2] = horizontal_long_add_16x8(vec_sum_ref2_lo, vec_sum_ref2_hi);
+ res[3] = horizontal_long_add_16x8(vec_sum_ref3_lo, vec_sum_ref3_hi);
+}
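
The three NEON kernels above share one contract with their C and SSE2 counterparts: one source block is compared against four candidate references in a single pass, writing four SADs. A hedged scalar model of that contract (generic over block size, unlike the specialized kernels; the name is ours):

#include <stdint.h>

static void sad_x4_model(const uint8_t *src, int src_stride,
                         const uint8_t *const ref[4], int ref_stride,
                         int width, int height, unsigned int *res) {
  int i, r, c;
  for (i = 0; i < 4; ++i) {
    const uint8_t *s = src;
    const uint8_t *p = ref[i];
    unsigned int sad = 0;
    for (r = 0; r < height; ++r) {
      for (c = 0; c < width; ++c)
        sad += s[c] > p[c] ? s[c] - p[c] : p[c] - s[c];
      s += src_stride;
      p += ref_stride;
    }
    res[i] = sad;
  }
}
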
diff --git a/vp9/encoder/vp9_aq_cyclicrefresh.c b/vp9/encoder/vp9_aq_cyclicrefresh.c
index 5f1c8ce..5a3671d 100644
--- a/vp9/encoder/vp9_aq_cyclicrefresh.c
+++ b/vp9/encoder/vp9_aq_cyclicrefresh.c
@@ -24,8 +24,6 @@
int percent_refresh;
// Maximum q-delta as percentage of base q.
int max_qdelta_perc;
- // Block size below which we don't apply cyclic refresh.
- BLOCK_SIZE min_block_size;
// Superblock starting index for cycling through the frame.
int sb_index;
// Controls how long block will need to wait to be refreshed again, in
@@ -40,9 +38,13 @@
int rdmult;
// Cyclic refresh map.
signed char *map;
- // Thresholds applied to projected rate/distortion of the superblock.
+ // Thresholds applied to the projected rate/distortion of the coding block,
+ // when deciding whether the block should be refreshed.
int64_t thresh_rate_sb;
int64_t thresh_dist_sb;
+ // Threshold applied to the motion vector (in units of 1/8 pel) of the
+ // coding block, when deciding whether the block should be refreshed.
+ int16_t motion_thresh;
// Rate target ratio to set q delta.
double rate_ratio_qdelta;
};
@@ -93,32 +95,23 @@
// mode, and rate/distortion.
static int candidate_refresh_aq(const CYCLIC_REFRESH *cr,
const MB_MODE_INFO *mbmi,
- BLOCK_SIZE bsize, int use_rd,
- int64_t rate_sb) {
- if (use_rd) {
- MV mv = mbmi->mv[0].as_mv;
- // If projected rate is below the thresh_rate (well below target,
- // so undershoot expected), accept it for lower-qp coding.
- if (rate_sb < cr->thresh_rate_sb)
- return 1;
- // Otherwise, reject the block for lower-qp coding if any of the following:
- // 1) mode uses large mv
- // 2) mode is an intra-mode (we may want to allow some of this under
- // another thresh_dist)
- else if (mv.row > 32 || mv.row < -32 ||
- mv.col > 32 || mv.col < -32 || !is_inter_block(mbmi))
- return 0;
- else
- return 1;
- } else {
- // Rate/distortion not used for update.
- if (bsize < cr->min_block_size ||
- mbmi->mv[0].as_int != 0 ||
- !is_inter_block(mbmi))
- return 0;
- else
- return 1;
- }
+ int64_t rate,
+ int64_t dist) {
+ MV mv = mbmi->mv[0].as_mv;
+ // If projected rate is below the thresh_rate, accept it for lower-qp coding.
+ // Otherwise, reject the block for lower-qp coding if projected distortion
+ // is above the threshold, and any of the following is true:
+ // 1) mode uses large mv
+ // 2) mode is an intra-mode
+ if (rate < cr->thresh_rate_sb)
+ return 1;
+ else if (dist > cr->thresh_dist_sb &&
+ (mv.row > cr->motion_thresh || mv.row < -cr->motion_thresh ||
+ mv.col > cr->motion_thresh || mv.col < -cr->motion_thresh ||
+ !is_inter_block(mbmi)))
+ return 0;
+ else
+ return 1;
}
// Compute delta-q for the segment.
@@ -194,8 +187,9 @@
void vp9_cyclic_refresh_update_segment(VP9_COMP *const cpi,
MB_MODE_INFO *const mbmi,
int mi_row, int mi_col,
- BLOCK_SIZE bsize, int use_rd,
- int64_t rate_sb) {
+ BLOCK_SIZE bsize,
+ int64_t rate,
+ int64_t dist) {
const VP9_COMMON *const cm = &cpi->common;
CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
const int bw = num_8x8_blocks_wide_lookup[bsize];
@@ -203,8 +197,7 @@
const int xmis = MIN(cm->mi_cols - mi_col, bw);
const int ymis = MIN(cm->mi_rows - mi_row, bh);
const int block_index = mi_row * cm->mi_cols + mi_col;
- const int refresh_this_block = candidate_refresh_aq(cr, mbmi, bsize, use_rd,
- rate_sb);
+ const int refresh_this_block = candidate_refresh_aq(cr, mbmi, rate, dist);
// Default is to not update the refresh map.
int new_map_value = cr->map[block_index];
int x = 0; int y = 0;
@@ -358,19 +351,13 @@
const double q = vp9_convert_qindex_to_q(cm->base_qindex, cm->bit_depth);
vp9_clear_system_state();
cr->max_qdelta_perc = 50;
- cr->min_block_size = BLOCK_8X8;
cr->time_for_refresh = 0;
- // Set rate threshold to some fraction of target (and scaled by 256).
- cr->thresh_rate_sb = (rc->sb64_target_rate * 256) >> 2;
+ // Set rate threshold to some fraction (set to 1 for now) of the target
+ // rate (target is given by sb64_target_rate and scaled by 256).
+ cr->thresh_rate_sb = (rc->sb64_target_rate << 8);
// Distortion threshold, quadratic in Q, scale factor to be adjusted.
- cr->thresh_dist_sb = 8 * (int)(q * q);
- if (cpi->sf.use_nonrd_pick_mode) {
- // May want to be more conservative with thresholds in non-rd mode for now
- // as rate/distortion are derived from model based on prediction residual.
- cr->thresh_rate_sb = (rc->sb64_target_rate * 256);
- cr->thresh_dist_sb = 16 * (int)(q * q);
- }
-
+ cr->thresh_dist_sb = (int)(q * q) << 5;
+ cr->motion_thresh = 32;
// Set up segmentation.
// Clear down the segment map.
vp9_enable_segmentation(&cm->seg);
diff --git a/vp9/encoder/vp9_aq_cyclicrefresh.h b/vp9/encoder/vp9_aq_cyclicrefresh.h
index 656d760..1ed07c2 100644
--- a/vp9/encoder/vp9_aq_cyclicrefresh.h
+++ b/vp9/encoder/vp9_aq_cyclicrefresh.h
@@ -42,9 +42,8 @@
// and segmentation map.
void vp9_cyclic_refresh_update_segment(struct VP9_COMP *const cpi,
MB_MODE_INFO *const mbmi,
- int mi_row, int mi_col,
- BLOCK_SIZE bsize, int use_rd,
- int64_t rate_sb);
+ int mi_row, int mi_col, BLOCK_SIZE bsize,
+ int64_t rate, int64_t dist);
// Update the segmentation map, and related quantities: cyclic refresh map,
// refresh sb_index, and target number of blocks to be refreshed.
diff --git a/vp9/encoder/vp9_bitstream.c b/vp9/encoder/vp9_bitstream.c
index 3f4ed94..a72856d 100644
--- a/vp9/encoder/vp9_bitstream.c
+++ b/vp9/encoder/vp9_bitstream.c
@@ -611,12 +611,10 @@
case ONE_LOOP_REDUCED: {
int updates = 0;
int noupdates_before_first = 0;
-
if (tx_size >= TX_16X16 && cpi->sf.tx_size_search_method == USE_TX_8X8) {
vp9_write_bit(bc, 0);
return;
}
-
for (i = 0; i < PLANE_TYPES; ++i) {
for (j = 0; j < REF_TYPES; ++j) {
for (k = 0; k < COEF_BANDS; ++k) {
@@ -668,7 +666,6 @@
}
return;
}
-
default:
assert(0);
}
@@ -678,16 +675,14 @@
const TX_MODE tx_mode = cpi->common.tx_mode;
const TX_SIZE max_tx_size = tx_mode_to_biggest_tx_size[tx_mode];
TX_SIZE tx_size;
- vp9_coeff_stats frame_branch_ct[TX_SIZES][PLANE_TYPES];
- vp9_coeff_probs_model frame_coef_probs[TX_SIZES][PLANE_TYPES];
-
- for (tx_size = TX_4X4; tx_size <= TX_32X32; ++tx_size)
- build_tree_distribution(cpi, tx_size, frame_branch_ct[tx_size],
- frame_coef_probs[tx_size]);
-
- for (tx_size = TX_4X4; tx_size <= max_tx_size; ++tx_size)
- update_coef_probs_common(w, cpi, tx_size, frame_branch_ct[tx_size],
- frame_coef_probs[tx_size]);
+ for (tx_size = TX_4X4; tx_size <= max_tx_size; ++tx_size) {
+ vp9_coeff_stats frame_branch_ct[PLANE_TYPES];
+ vp9_coeff_probs_model frame_coef_probs[PLANE_TYPES];
+ build_tree_distribution(cpi, tx_size, frame_branch_ct,
+ frame_coef_probs);
+ update_coef_probs_common(w, cpi, tx_size, frame_branch_ct,
+ frame_coef_probs);
+ }
}
static void encode_loopfilter(struct loopfilter *lf,
diff --git a/vp9/encoder/vp9_denoiser.c b/vp9/encoder/vp9_denoiser.c
index 7d4e26a..4f245e2 100644
--- a/vp9/encoder/vp9_denoiser.c
+++ b/vp9/encoder/vp9_denoiser.c
@@ -403,10 +403,7 @@
void vp9_denoiser_reset_frame_stats(PICK_MODE_CONTEXT *ctx) {
ctx->zeromv_sse = UINT_MAX;
- // This should be initialized as zero since mode search stage might skip
- // NEWMV mode if inferred motion vector modes provide sufficiently good
- // prediction quality.
- ctx->newmv_sse = 0;
+ ctx->newmv_sse = UINT_MAX;
}
void vp9_denoiser_update_frame_stats(MB_MODE_INFO *mbmi, unsigned int sse,
@@ -418,7 +415,7 @@
ctx->best_zeromv_reference_frame = mbmi->ref_frame[0];
}
- if (mode == NEWMV) {
+ if (mbmi->mv[0].as_int != 0 && sse < ctx->newmv_sse) {
ctx->newmv_sse = sse;
ctx->best_sse_inter_mode = mode;
ctx->best_sse_mv = mbmi->mv[0];
diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c
index e142a31..730a229 100644
--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c
@@ -402,7 +402,7 @@
BLOCK_SIZE bsize,
int mi_row,
int mi_col,
- int threshold,
+ int64_t threshold,
BLOCK_SIZE bsize_min) {
VP9_COMMON * const cm = &cpi->common;
variance_node vt;
@@ -490,18 +490,23 @@
int pixels_wide = 64, pixels_high = 64;
const YV12_BUFFER_CONFIG *yv12 = get_ref_frame_buffer(cpi, LAST_FRAME);
const struct scale_factors *const sf = &cm->frame_refs[LAST_FRAME - 1].sf;
-
// Always use 4x4 partition for key frame.
int use_4x4_partition = (cm->frame_type == KEY_FRAME);
-
int variance4x4downsample[16];
int low_res = (cm->width <= 352 && cm->height <= 288) ? 1 : 0;
const int threshold_multiplier = cm->frame_type == KEY_FRAME ? 80 : 4;
- int64_t threshold_base = (int64_t)(threshold_multiplier *
+ int64_t threshold_base;
+ int64_t threshold;
+ int64_t threshold_bsize_min;
+ int64_t threshold_bsize_max;
+
+ vp9_clear_system_state();
+ threshold_base = (int64_t)(threshold_multiplier *
vp9_convert_qindex_to_q(cm->base_qindex, cm->bit_depth));
- int64_t threshold = threshold_base;
- int64_t threshold_bsize_min = threshold_base << 6;
- int64_t threshold_bsize_max = threshold_base;
+ threshold = threshold_base;
+ threshold_bsize_min = threshold_base << 6;
+ threshold_bsize_max = threshold_base;
+
// Modify thresholds for key frame and for low-resolutions (set lower
// thresholds to favor split).
if (cm->frame_type == KEY_FRAME) {
@@ -512,7 +517,6 @@
threshold_bsize_max = threshold_base >> 2;
}
- vp9_clear_system_state();
set_offsets(cpi, tile, x, mi_row, mi_col, BLOCK_64X64);
if (xd->mb_to_right_edge < 0)
@@ -770,8 +774,8 @@
// Else for cyclic refresh mode update the segment map, set the segment id
// and then update the quantizer.
if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ) {
- vp9_cyclic_refresh_update_segment(cpi, &xd->mi[0].src_mi->mbmi,
- mi_row, mi_col, bsize, 1, ctx->rate);
+ vp9_cyclic_refresh_update_segment(cpi, &xd->mi[0].src_mi->mbmi, mi_row,
+ mi_col, bsize, ctx->rate, ctx->dist);
}
}
@@ -1458,9 +1462,9 @@
: cm->last_frame_seg_map;
mbmi->segment_id = vp9_get_segment_id(cm, map, bsize, mi_row, mi_col);
} else {
- // Setting segmentation map for cyclic_refresh
- vp9_cyclic_refresh_update_segment(cpi, mbmi, mi_row, mi_col, bsize, 1,
- ctx->rate);
+ // Setting segmentation map for cyclic_refresh.
+ vp9_cyclic_refresh_update_segment(cpi, mbmi, mi_row, mi_col, bsize,
+ ctx->rate, ctx->dist);
}
vp9_init_plane_quantizers(cpi, x);
}
diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c
index c85bf2a..e938427 100644
--- a/vp9/encoder/vp9_encoder.c
+++ b/vp9/encoder/vp9_encoder.c
@@ -2955,18 +2955,18 @@
if (undershoot_seen || loop_count > 1) {
// Update rate_correction_factor unless
- vp9_rc_update_rate_correction_factors(cpi, 1);
+ vp9_rc_update_rate_correction_factors(cpi);
q = (q_high + q_low + 1) / 2;
} else {
// Update rate_correction_factor unless
- vp9_rc_update_rate_correction_factors(cpi, 0);
+ vp9_rc_update_rate_correction_factors(cpi);
q = vp9_rc_regulate_q(cpi, rc->this_frame_target,
bottom_index, MAX(q_high, top_index));
while (q < q_low && retries < 10) {
- vp9_rc_update_rate_correction_factors(cpi, 0);
+ vp9_rc_update_rate_correction_factors(cpi);
q = vp9_rc_regulate_q(cpi, rc->this_frame_target,
bottom_index, MAX(q_high, top_index));
retries++;
@@ -2979,10 +2979,10 @@
q_high = q > q_low ? q - 1 : q_low;
if (overshoot_seen || loop_count > 1) {
- vp9_rc_update_rate_correction_factors(cpi, 1);
+ vp9_rc_update_rate_correction_factors(cpi);
q = (q_high + q_low) / 2;
} else {
- vp9_rc_update_rate_correction_factors(cpi, 0);
+ vp9_rc_update_rate_correction_factors(cpi);
q = vp9_rc_regulate_q(cpi, rc->this_frame_target,
bottom_index, top_index);
// Special case reset for qlow for constrained quality.
@@ -2995,7 +2995,7 @@
}
while (q > q_high && retries < 10) {
- vp9_rc_update_rate_correction_factors(cpi, 0);
+ vp9_rc_update_rate_correction_factors(cpi);
q = vp9_rc_regulate_q(cpi, rc->this_frame_target,
bottom_index, top_index);
retries++;
diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c
index 3b0f2f0..9fc63e3 100644
--- a/vp9/encoder/vp9_firstpass.c
+++ b/vp9/encoder/vp9_firstpass.c
@@ -38,6 +38,8 @@
#define OUTPUT_FPF 0
#define ARF_STATS_OUTPUT 0
+#define GROUP_ADAPTIVE_MAXQ 0
+
#define BOOST_BREAKOUT 12.5
#define BOOST_FACTOR 12.5
#define ERR_DIVISOR 128.0
@@ -54,6 +56,7 @@
#define NEW_MV_MODE_PENALTY 32
#define SVC_FACTOR_PT_LOW 0.45
#define DARK_THRESH 64
+#define DEFAULT_GRP_WEIGHT 1.0
#define DOUBLE_DIVIDE_CHECK(x) ((x) < 0 ? (x) - 0.000001 : (x) + 0.000001)
@@ -1082,8 +1085,9 @@
#define EDIV_SIZE_FACTOR 800
static int get_twopass_worst_quality(const VP9_COMP *cpi,
- const FIRSTPASS_STATS *stats,
- int section_target_bandwidth) {
+ const double section_err,
+ int section_target_bandwidth,
+ double group_weight_factor) {
const RATE_CONTROL *const rc = &cpi->rc;
const VP9EncoderConfig *const oxcf = &cpi->oxcf;
@@ -1092,7 +1096,6 @@
} else {
const int num_mbs = (cpi->oxcf.resize_mode != RESIZE_NONE)
? cpi->initial_mbs : cpi->common.MBs;
- const double section_err = stats->coded_error / stats->count;
const double err_per_mb = section_err / num_mbs;
const double speed_term = 1.0 + 0.04 * oxcf->speed;
const double ediv_size_correction = num_mbs / EDIV_SIZE_FACTOR;
@@ -1101,9 +1104,11 @@
int q;
int is_svc_upper_layer = 0;
+
if (is_two_pass_svc(cpi) && cpi->svc.spatial_layer_id > 0)
is_svc_upper_layer = 1;
+
// Try and pick a max Q that will be high enough to encode the
// content at the given rate.
for (q = rc->best_quality; q < rc->worst_quality; ++q) {
@@ -1113,9 +1118,10 @@
is_svc_upper_layer ? SVC_FACTOR_PT_LOW :
FACTOR_PT_LOW, FACTOR_PT_HIGH, q,
cpi->common.bit_depth);
- const int bits_per_mb = vp9_rc_bits_per_mb(INTER_FRAME, q,
- factor * speed_term,
- cpi->common.bit_depth);
+ const int bits_per_mb =
+ vp9_rc_bits_per_mb(INTER_FRAME, q,
+ factor * speed_term * group_weight_factor,
+ cpi->common.bit_depth);
if (bits_per_mb <= target_norm_bits_per_mb)
break;
}
@@ -1699,6 +1705,9 @@
double boost_score = 0.0;
double old_boost_score = 0.0;
double gf_group_err = 0.0;
+#if GROUP_ADAPTIVE_MAXQ
+ double gf_group_raw_error = 0.0;
+#endif
double gf_first_frame_err = 0.0;
double mod_frame_err = 0.0;
@@ -1742,8 +1751,12 @@
// If this is a key frame or the overlay from a previous arf then
// the error score / cost of this frame has already been accounted for.
- if (cpi->common.frame_type == KEY_FRAME || rc->source_alt_ref_active)
+ if (cpi->common.frame_type == KEY_FRAME || rc->source_alt_ref_active) {
gf_group_err -= gf_first_frame_err;
+#if GROUP_ADAPTIVE_MAXQ
+ gf_group_raw_error -= this_frame->coded_error;
+#endif
+ }
// Motion breakout threshold for loop below depends on image size.
mv_ratio_accumulator_thresh =
@@ -1782,6 +1795,9 @@
// Accumulate error score of frames in this gf group.
mod_frame_err = calculate_modified_err(twopass, oxcf, this_frame);
gf_group_err += mod_frame_err;
+#if GROUP_ADAPTIVE_MAXQ
+ gf_group_raw_error += this_frame->coded_error;
+#endif
if (EOF == input_stats(twopass, &next_frame))
break;
@@ -1863,6 +1879,9 @@
if (EOF == input_stats(twopass, this_frame))
break;
gf_group_err += calculate_modified_err(twopass, oxcf, this_frame);
+#if GROUP_ADAPTIVE_MAXQ
+ gf_group_raw_error += this_frame->coded_error;
+#endif
}
rc->baseline_gf_interval = new_gf_interval;
}
@@ -1893,6 +1912,29 @@
// Calculate the bits to be allocated to the gf/arf group as a whole
gf_group_bits = calculate_total_gf_group_bits(cpi, gf_group_err);
+#if GROUP_ADAPTIVE_MAXQ
+ // Calculate an estimate of the maxq needed for the group.
+ // We are more agressive about correcting for sections
+ // where there could be significant overshoot than for easier
+ // sections where we do not wish to risk creating an overshoot
+ // of the allocated bit budget.
+ if ((cpi->oxcf.rc_mode != VPX_Q) && (rc->baseline_gf_interval > 1)) {
+ const int vbr_group_bits_per_frame =
+ (int)(gf_group_bits / rc->baseline_gf_interval);
+ const double group_av_err = gf_group_raw_error / rc->baseline_gf_interval;
+ const int tmp_q =
+ get_twopass_worst_quality(cpi, group_av_err, vbr_group_bits_per_frame,
+ twopass->kfgroup_inter_fraction);
+
+ if (tmp_q < twopass->baseline_worst_quality) {
+ twopass->active_worst_quality =
+ (tmp_q + twopass->baseline_worst_quality + 1) / 2;
+ } else {
+ twopass->active_worst_quality = tmp_q;
+ }
+ }
+#endif
+
// Calculate the extra bits to be used for boosted frame(s)
gf_arf_bits = calculate_boost_bits(rc->baseline_gf_interval,
rc->gfu_boost, gf_group_bits);
@@ -2116,7 +2158,7 @@
// Reset to the start of the group.
reset_fpf_position(twopass, start_position);
- kf_group_err = 0;
+ kf_group_err = 0.0;
// Rescan to get the correct error data for the forced kf group.
for (i = 0; i < rc->frames_to_key; ++i) {
@@ -2226,6 +2268,16 @@
kf_bits = calculate_boost_bits((rc->frames_to_key - 1),
rc->kf_boost, twopass->kf_group_bits);
+ // Work out the fraction of the kf group bits reserved for the inter frames
+ // within the group after discounting the bits for the kf itself.
+ if (twopass->kf_group_bits) {
+ twopass->kfgroup_inter_fraction =
+ (double)(twopass->kf_group_bits - kf_bits) /
+ (double)twopass->kf_group_bits;
+ } else {
+ twopass->kfgroup_inter_fraction = 1.0;
+ }
+
twopass->kf_group_bits -= kf_bits;
// Save the bits to spend on the key frame.
@@ -2316,7 +2368,6 @@
GF_GROUP *const gf_group = &twopass->gf_group;
int frames_left;
FIRSTPASS_STATS this_frame;
- FIRSTPASS_STATS this_frame_copy;
int target_rate;
LAYER_CONTEXT *const lc = is_two_pass_svc(cpi) ?
@@ -2374,9 +2425,14 @@
// Special case code for first frame.
const int section_target_bandwidth = (int)(twopass->bits_left /
frames_left);
- const int tmp_q = get_twopass_worst_quality(cpi, &twopass->total_left_stats,
- section_target_bandwidth);
+ const double section_error =
+ twopass->total_left_stats.coded_error / twopass->total_left_stats.count;
+ const int tmp_q =
+ get_twopass_worst_quality(cpi, section_error,
+ section_target_bandwidth, DEFAULT_GRP_WEIGHT);
+
twopass->active_worst_quality = tmp_q;
+ twopass->baseline_worst_quality = tmp_q;
rc->ni_av_qi = tmp_q;
rc->last_q[INTER_FRAME] = tmp_q;
rc->avg_q = vp9_convert_qindex_to_q(tmp_q, cm->bit_depth);
@@ -2388,13 +2444,13 @@
if (EOF == input_stats(twopass, &this_frame))
return;
- // Local copy of the current frame's first pass stats.
- this_frame_copy = this_frame;
-
// Keyframe and section processing.
if (rc->frames_to_key == 0 || (cpi->frame_flags & FRAMEFLAGS_KEY)) {
+ FIRSTPASS_STATS this_frame_copy;
+ this_frame_copy = this_frame;
// Define next KF group and assign bits to it.
- find_next_key_frame(cpi, &this_frame_copy);
+ find_next_key_frame(cpi, &this_frame);
+ this_frame = this_frame_copy;
} else {
cm->frame_type = INTER_FRAME;
}
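
The hunk above confines the save/restore idiom to the keyframe branch: find_next_key_frame() scans ahead through the first-pass stats and modifies its argument, so the caller snapshots the stats first and restores them afterwards for the later define_gf_group() call. A minimal sketch of the pattern (STATS type and function names are hypothetical stand-ins):

    typedef struct { double coded_error; } STATS;

    /* Stands in for find_next_key_frame(), which mutates its argument. */
    static void scan_ahead(STATS *s) {
      s->coded_error = 0.0;
    }

    static void process_frame(STATS *this_frame) {
      STATS this_frame_copy = *this_frame;  /* save */
      scan_ahead(this_frame);               /* mutates this_frame */
      *this_frame = this_frame_copy;        /* restore for later passes */
    }
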
@@ -2423,7 +2479,7 @@
// Define a new GF/ARF group. (Should always enter here for key frames).
if (rc->frames_till_gf_update_due == 0) {
- define_gf_group(cpi, &this_frame_copy);
+ define_gf_group(cpi, &this_frame);
rc->frames_till_gf_update_due = rc->baseline_gf_interval;
if (lc != NULL)
diff --git a/vp9/encoder/vp9_firstpass.h b/vp9/encoder/vp9_firstpass.h
index a8e4987..3476770 100644
--- a/vp9/encoder/vp9_firstpass.h
+++ b/vp9/encoder/vp9_firstpass.h
@@ -109,11 +109,16 @@
// Error score of frames still to be coded in kf group
int64_t kf_group_error_left;
+
+ // The fraction of a kf group's total bits allocated to the inter frames
+ double kfgroup_inter_fraction;
+
int sr_update_lag;
int kf_zeromotion_pct;
int last_kfgroup_zeromotion_pct;
int gf_zeromotion_pct;
+ int baseline_worst_quality;
int active_worst_quality;
int extend_minq;
int extend_maxq;
diff --git a/vp9/encoder/vp9_ratectrl.c b/vp9/encoder/vp9_ratectrl.c
index 21f4cce..11da367 100644
--- a/vp9/encoder/vp9_ratectrl.c
+++ b/vp9/encoder/vp9_ratectrl.c
@@ -396,7 +396,7 @@
}
}
-void vp9_rc_update_rate_correction_factors(VP9_COMP *cpi, int damp_var) {
+void vp9_rc_update_rate_correction_factors(VP9_COMP *cpi) {
const VP9_COMMON *const cm = &cpi->common;
int correction_factor = 100;
double rate_correction_factor = get_rate_correction_factor(cpi);
@@ -431,19 +431,8 @@
// More heavily damped adjustment used if we have been oscillating either side
// of target.
- switch (damp_var) {
- case 0:
- adjustment_limit = 0.75;
- break;
- case 1:
- adjustment_limit = 0.25 +
- 0.5 * MIN(1, fabs(log10(0.01 * correction_factor)));
- break;
- case 2:
- default:
- adjustment_limit = 0.25;
- break;
- }
+ adjustment_limit = 0.25 +
+ 0.5 * MIN(1, fabs(log10(0.01 * correction_factor)));
cpi->rc.q_2_frame = cpi->rc.q_1_frame;
cpi->rc.q_1_frame = cm->base_qindex;
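
The unified adjustment_limit formula above scales the damping with how far the rate correction is from 100%. A self-contained sketch showing the range (helper name hypothetical): a correction_factor of 100 (last frame hit its target exactly) gives 0.25, the hardest damping, while 10 or 1000 (a 10x rate miss in either direction) saturates at 0.75.

    #include <math.h>

    #define MIN(a, b) ((a) < (b) ? (a) : (b))

    /* correction_factor is a percentage; 100 means on target. */
    static double adjustment_limit(int correction_factor) {
      return 0.25 + 0.5 * MIN(1, fabs(log10(0.01 * correction_factor)));
    }

For instance, adjustment_limit(50) is about 0.40, sitting between the two extremes.
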
@@ -1222,9 +1211,7 @@
rc->projected_frame_size = (int)(bytes_used << 3);
// Post encode loop adjustment of Q prediction.
- vp9_rc_update_rate_correction_factors(
- cpi, (cpi->sf.recode_loop >= ALLOW_RECODE_KFARFGF) ? 2 :
- ((oxcf->rc_mode == VPX_CBR) ? 1 : 0));
+ vp9_rc_update_rate_correction_factors(cpi);
// Keep a record of last Q and ambient average Q.
if (cm->frame_type == KEY_FRAME) {
diff --git a/vp9/encoder/vp9_ratectrl.h b/vp9/encoder/vp9_ratectrl.h
index a53f4e0..9774127 100644
--- a/vp9/encoder/vp9_ratectrl.h
+++ b/vp9/encoder/vp9_ratectrl.h
@@ -160,7 +160,7 @@
// Updates rate correction factors
// Changes only the rate correction factors in the rate control structure.
-void vp9_rc_update_rate_correction_factors(struct VP9_COMP *cpi, int damp_var);
+void vp9_rc_update_rate_correction_factors(struct VP9_COMP *cpi);
// Decide if we should drop this frame: For 1-pass CBR.
// Changes only the decimation count in the rate control structure
diff --git a/vp9/encoder/vp9_rd.c b/vp9/encoder/vp9_rd.c
index 375407d..adbe024 100644
--- a/vp9/encoder/vp9_rd.c
+++ b/vp9/encoder/vp9_rd.c
@@ -570,10 +570,6 @@
rd->thresh_mult[THR_NEWA] += 1000;
rd->thresh_mult[THR_NEWG] += 1000;
- // Adjust threshold only in real time mode, which only uses last
- // reference frame.
- rd->thresh_mult[THR_NEWMV] += sf->elevate_newmv_thresh;
-
rd->thresh_mult[THR_NEARMV] += 1000;
rd->thresh_mult[THR_NEARA] += 1000;
rd->thresh_mult[THR_COMP_NEARESTLA] += 1000;
diff --git a/vp9/encoder/vp9_speed_features.c b/vp9/encoder/vp9_speed_features.c
index 15831fb..81f3195 100644
--- a/vp9/encoder/vp9_speed_features.c
+++ b/vp9/encoder/vp9_speed_features.c
@@ -446,7 +446,6 @@
sf->always_this_block_size = BLOCK_16X16;
sf->search_type_check_frequency = 50;
sf->encode_breakout_thresh = 0;
- sf->elevate_newmv_thresh = 0;
// Recode loop tolerance %.
sf->recode_tolerance = 25;
sf->default_interp_filter = SWITCHABLE;
diff --git a/vp9/encoder/vp9_speed_features.h b/vp9/encoder/vp9_speed_features.h
index c2cfd62..eaa0acc 100644
--- a/vp9/encoder/vp9_speed_features.h
+++ b/vp9/encoder/vp9_speed_features.h
@@ -390,9 +390,6 @@
// enabled in real time mode.
int encode_breakout_thresh;
- // In real time encoding, increase the threshold for NEWMV.
- int elevate_newmv_thresh;
-
// default interp filter choice
INTERP_FILTER default_interp_filter;
diff --git a/vp9/encoder/x86/vp9_variance_sse2.c b/vp9/encoder/x86/vp9_variance_sse2.c
index b4d2b0a..8490bbb 100644
--- a/vp9/encoder/x86/vp9_variance_sse2.c
+++ b/vp9/encoder/x86/vp9_variance_sse2.c
@@ -314,13 +314,15 @@
return *sse;
}
+// The 2 unused parameters are placeholders for the PIC-enabled build.
#define DECL(w, opt) \
int vp9_sub_pixel_variance##w##xh_##opt(const uint8_t *src, \
ptrdiff_t src_stride, \
int x_offset, int y_offset, \
const uint8_t *dst, \
ptrdiff_t dst_stride, \
- int height, unsigned int *sse)
+ int height, unsigned int *sse, \
+ void *unused0, void *unused)
#define DECLS(opt1, opt2) \
DECL(4, opt2); \
DECL(8, opt1); \
@@ -342,26 +344,26 @@
unsigned int sse; \
int se = vp9_sub_pixel_variance##wf##xh_##opt(src, src_stride, x_offset, \
y_offset, dst, dst_stride, \
- h, &sse); \
+ h, &sse, NULL, NULL); \
if (w > wf) { \
unsigned int sse2; \
int se2 = vp9_sub_pixel_variance##wf##xh_##opt(src + 16, src_stride, \
x_offset, y_offset, \
dst + 16, dst_stride, \
- h, &sse2); \
+ h, &sse2, NULL, NULL); \
se += se2; \
sse += sse2; \
if (w > wf * 2) { \
se2 = vp9_sub_pixel_variance##wf##xh_##opt(src + 32, src_stride, \
x_offset, y_offset, \
dst + 32, dst_stride, \
- h, &sse2); \
+ h, &sse2, NULL, NULL); \
se += se2; \
sse += sse2; \
se2 = vp9_sub_pixel_variance##wf##xh_##opt(src + 48, src_stride, \
x_offset, y_offset, \
dst + 48, dst_stride, \
- h, &sse2); \
+ h, &sse2, NULL, NULL); \
se += se2; \
sse += sse2; \
} \
@@ -391,6 +393,7 @@
#undef FNS
#undef FN
+// The 2 unused parameters are placeholders for the PIC-enabled build.
#define DECL(w, opt) \
int vp9_sub_pixel_avg_variance##w##xh_##opt(const uint8_t *src, \
ptrdiff_t src_stride, \
@@ -399,7 +402,8 @@
ptrdiff_t dst_stride, \
const uint8_t *sec, \
ptrdiff_t sec_stride, \
- int height, unsigned int *sse)
+ int height, unsigned int *sse, \
+ void *unused0, void *unused)
#define DECLS(opt1, opt2) \
DECL(4, opt2); \
DECL(8, opt1); \
@@ -422,26 +426,30 @@
unsigned int sse; \
int se = vp9_sub_pixel_avg_variance##wf##xh_##opt(src, src_stride, x_offset, \
y_offset, dst, dst_stride, \
- sec, w, h, &sse); \
+ sec, w, h, &sse, NULL, \
+ NULL); \
if (w > wf) { \
unsigned int sse2; \
int se2 = vp9_sub_pixel_avg_variance##wf##xh_##opt(src + 16, src_stride, \
x_offset, y_offset, \
dst + 16, dst_stride, \
- sec + 16, w, h, &sse2); \
+ sec + 16, w, h, &sse2, \
+ NULL, NULL); \
se += se2; \
sse += sse2; \
if (w > wf * 2) { \
se2 = vp9_sub_pixel_avg_variance##wf##xh_##opt(src + 32, src_stride, \
x_offset, y_offset, \
dst + 32, dst_stride, \
- sec + 32, w, h, &sse2); \
+ sec + 32, w, h, &sse2, \
+ NULL, NULL); \
se += se2; \
sse += sse2; \
se2 = vp9_sub_pixel_avg_variance##wf##xh_##opt(src + 48, src_stride, \
x_offset, y_offset, \
dst + 48, dst_stride, \
- sec + 48, w, h, &sse2); \
+ sec + 48, w, h, &sse2, \
+ NULL, NULL); \
se += se2; \
sse += sse2; \
} \
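
The FN macros above compose wide variances from a 16-pixel-wide column kernel, threading the new NULL placeholder arguments through every call. A macro-free sketch of the same composition for a 64-wide block (the stub kernel and function names are hypothetical; the SIMD body is elided):

    #include <stddef.h>
    #include <stdint.h>

    /* Stub standing in for vp9_sub_pixel_variance16xh_<opt>; the two
     * trailing void pointers mirror the PIC placeholder parameters. */
    static int subpel_var16xh(const uint8_t *src, ptrdiff_t src_stride,
                              int x_offset, int y_offset,
                              const uint8_t *dst, ptrdiff_t dst_stride,
                              int height, unsigned int *sse,
                              void *unused0, void *unused) {
      (void)src; (void)src_stride; (void)x_offset; (void)y_offset;
      (void)dst; (void)dst_stride; (void)height;
      (void)unused0; (void)unused;
      *sse = 0;
      return 0;
    }

    /* What the FN expansion does for a 64-wide block: run the 16-wide
     * kernel on each column and accumulate se/sse. */
    static unsigned int var64xh(const uint8_t *src, ptrdiff_t src_stride,
                                int x_offset, int y_offset,
                                const uint8_t *dst, ptrdiff_t dst_stride,
                                int height, unsigned int *sse_out) {
      unsigned int sse = 0, sse_col;
      int se = 0, col;
      for (col = 0; col < 64; col += 16) {
        se += subpel_var16xh(src + col, src_stride, x_offset, y_offset,
                             dst + col, dst_stride, height, &sse_col,
                             NULL, NULL);
        sse += sse_col;
      }
      *sse_out = sse;
      /* variance = sse - se^2 / (w * h), as in the surrounding macro. */
      return sse - (unsigned int)(((int64_t)se * se) / (64 * height));
    }
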
diff --git a/vp9/vp9_common.mk b/vp9/vp9_common.mk
index 06096a6..58920e2 100644
--- a/vp9/vp9_common.mk
+++ b/vp9/vp9_common.mk
@@ -84,6 +84,7 @@
VP9_COMMON_SRCS-$(HAVE_AVX2) += common/x86/vp9_subpixel_8t_intrin_avx2.c
VP9_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/vp9_subpixel_8t_intrin_ssse3.c
ifeq ($(CONFIG_VP9_POSTPROC),yes)
+VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_mfqe_sse2.asm
VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_postproc_sse2.asm
endif
diff --git a/vp9/vp9cx.mk b/vp9/vp9cx.mk
index 33a1e67..fbdd4ba 100644
--- a/vp9/vp9cx.mk
+++ b/vp9/vp9cx.mk
@@ -152,6 +152,7 @@
VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_avg_neon.c
VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_sad_neon.c
+VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_sad4d_neon.c
VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_dct_neon.c
VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_variance_neon.c
VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_quantize_neon.c
diff --git a/vpx_ports/x86.h b/vpx_ports/x86.h
index 2a3ebbe..ae349fb 100644
--- a/vpx_ports/x86.h
+++ b/vpx_ports/x86.h
@@ -189,7 +189,7 @@
if (reg_ecx & BIT(19)) flags |= HAS_SSE4_1;
// bits 27 (OSXSAVE) & 28 (256-bit AVX)
- if (reg_ecx & (BIT(27) | BIT(28))) {
+ if ((reg_ecx & (BIT(27) | BIT(28))) == (BIT(27) | BIT(28))) {
if ((xgetbv() & 0x6) == 0x6) {
flags |= HAS_AVX;
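
The fix above requires both CPUID bits, OSXSAVE (27) and AVX (28), before consulting xgetbv; the old `&` test fired when either bit was set. A runnable sketch of the difference (the sample reg_ecx value is hypothetical):

    #include <stdio.h>

    #define BIT(n) (1u << (n))

    int main(void) {
      unsigned int reg_ecx = BIT(28);  /* AVX advertised, OSXSAVE clear */
      /* Old test: nonzero if EITHER bit is set; wrongly claims AVX here. */
      int old_check = (reg_ecx & (BIT(27) | BIT(28))) != 0;
      /* Fixed test: nonzero only if BOTH bits are set. */
      int new_check = (reg_ecx & (BIT(27) | BIT(28))) == (BIT(27) | BIT(28));
      printf("old: %d  new: %d\n", old_check, new_check);  /* old: 1  new: 0 */
      return 0;
    }
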